diff --git a/sycl/ReleaseNotes.md b/sycl/ReleaseNotes.md index bdc0daffbbf04..9960159f80281 100644 --- a/sycl/ReleaseNotes.md +++ b/sycl/ReleaseNotes.md @@ -10,10 +10,10 @@ Release notes for commit range 4fc5ebe..bd68232 - Added [sRGBA support](doc/extenstions/SRGB/srgb_support.asciidoc) [e488327][191efdd] - Added a preview feature implementation for the DPC++ experimental - [matrix extension]((doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc) + [matrix extension](doc/extensions/experimental/SYCL_EXT_ONEAPI_MATRIX.asciidoc) [7f218531] [a95f46d] - Added support for SYCL 2020 exceptions [5c0f748][eef07606][5af8c43d] - - Added support for [SYCL_EXT_INTEL_BF16_CONVERSION extension](doc/extensions/Bf16Conversion/SYCL_INTEL_bf16_conversion.asciidoc) + - Added support for [SYCL_EXT_INTEL_BF16_CONVERSION extension](doc/extensions/experimental/SYCL_EXT_INTEL_BF16_CONVERSION.asciidoc) [8075463] - Added support for fallback implementation of [assert feature](doc/Assert.md) [56c9ec4] @@ -22,14 +22,14 @@ Release notes for commit range 4fc5ebe..bd68232 ### Documentation - Added design document for [optional kernel features](doc/OptionalDeviceFeatures.md) [88cfe16] - - Added [SYCL_INTEL_bf16_conversion extension document](doc/extensions/Bf16Conversion/SYCL_INTEL_bf16_conversion.asciidoc) + - Added [SYCL_INTEL_bf16_conversion extension document](doc/extensions/experimental/SYCL_EXT_INTEL_BF16_CONVERSION.asciidoc) [9f8cc3af] - Align [SYCL_EXT_ONEAPI_GROUP_MASK extension](doc/extensions/GroupMask/GroupMask.asciidoc) with SYCL 2020 specification [a06bd1fb] - Added [documentation](doc/SYCLInstrumentationUsingXPTI.md) of XPTI related tracing in SYCL [1308fe7b] - Align `SYCL_EXT_ONEAPI_LOCAL_MEMORY` extension - [document](doc/extensions/LocalMemory/LocalMemory.asciidoc) with SYCL 2020 + [document](doc/extensions/supported/SYCL_EXT_ONEAPI_LOCAL_MEMORY.asciidoc) with SYCL 2020 specification [6ed6565] ## Improvements @@ -94,7 +94,7 @@ Release notes for commit range 4fc5ebe..bd68232 - Optimized Cuda plugin work with small kernels [07189af0] - Optimized submission of kernels [441dc3b2][33432df6] - Aligned implementation of `SYCL_EXT_ONEAPI_LOCAL_MEMORY` extension - [document](doc/extensions/LocalMemory/LocalMemory.asciidoc) with updated + [document](doc/extensions/supported/SYCL_EXT_ONEAPI_LOCAL_MEMORY.asciidoc) with updated document [b3db5e5] - Improved `sycl::accessor` initialization performance on device [a10199d] - Added support `sycl::get_kernel_ids` and cache for `sycl::kernel_id` objects @@ -123,7 +123,7 @@ Release notes for commit range 4fc5ebe..bd68232 ### Tools - Added support for ROCm devices in `get_device_count_by_type` [03155e7] ### Documentation - - Extended group [sort algorithms extension](doc/extensions/GroupAlgorithms/SYCL_INTEL_group_sort.asciidoc) + - Extended group [sort algorithms extension](doc/extensions/experimental/SYCL_EXT_ONEAPI_GROUP_SORT.asciidoc) with interfaces to scratchpad memory [f57091d] - Updated several extension documents to follow SYCL 2020 extension rules [7fb56cf] @@ -256,7 +256,7 @@ Release notes for commit range 6a49170027fb..962909fe9e78 - Added support for AMD GPU devices [ec612228] - Implemented SYCL 2020 `sycl::is_device_copyable` type trait [44c1cbcd] - Implemented SYCL 2020 USM features [1df6873d] - - Implemented support for Device UUID from [Intel's Extensions for Device Information](doc/extensions/IntelGPU/IntelGPUDeviceInfo.md) [25aee287] + - Implemented support for Device UUID from [Intel's Extensions for Device Information](doc/extensions/supported/SYCL_EXT_INTEL_DEVICE_INFO.md) [25aee287] - Implemented SYCL 2020 `sycl::atomic_fence` [dcd59547] - Implemented `intel::loop_count_max`, `intel::loop_count_max`, `intel::loop_count_avg` attributes that allow to specify number of loop @@ -267,13 +267,13 @@ Release notes for commit range 6a49170027fb..962909fe9e78 from [sub-group extension](doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc#attributes) [347e41c] - Implemented SYCL 2020 interoperability API [e6733e4] - - Added [group sorting algorithm](doc/extensions/GroupAlgorithms/SYCL_INTEL_group_sort.asciidoc) + - Added [group sorting algorithm](doc/extensions/experimental/SYCL_EXT_ONEAPI_GROUP_SORT.asciidoc) extension specification [edaee9b] - - Added [initial draft](doc/extensions/LevelZeroBackend/LevelZeroBackend.md) + - Added [initial draft](doc/extensions/supported/SYCL_EXT_ONEAPI_BACKEND_LEVEL_ZERO.md) for querying of free device memory in LevelZero backend extension [fa428bf] - Added [InvokeSIMD](doc/extensions/proposed/SYCL_EXT_ONEAPI_INVOKE_SIMD.asciidoc) and [Uniform](doc/extensions/proposed/SYCL_EXT_ONEAPI_UNIFORM.asciidoc) extensions [72e1611] - - Added [Matrix Programming Extension for DPC++ document](doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc) [ace4c733] + - Added [Matrix Programming Extension for DPC++ document](doc/extensions/experimental/SYCL_EXT_ONEAPI_MATRIX.asciidoc) [ace4c733] - Implemented SYCL 2020 `sycl::span` [9356d53] - Added [device-if](doc/extensions/proposed/SYCL_EXT_ONEAPI_DEVICE_IF.asciidoc) extension [4fb95fc] @@ -355,7 +355,7 @@ Release notes for commit range 6a49170027fb..962909fe9e78 - Deprecated `sycl::buffer::get_count()`, please use `sycl::buffer::size()` instead [baf2ed9d] - Implemented `sycl::group_barrier` free function [48363902] - - Added support of [SYCL_INTEL_enqueue_barrier extension](doc/extensions/EnqueueBarrier/enqueue_barrier.asciidoc) for CUDA backend [2e978482] + - Added support of [SYCL_INTEL_enqueue_barrier extension](doc/extensions/supported/SYCL_EXT_ONEAPI_ENQUEUE_BARRIER.asciidoc) for CUDA backend [2e978482] - Deprecated `has_extension` method of `sycl::device` and `sycl::platform` classes, please use `has` method with aspects APIs instead [51c747da] - Deprecated `sycl::*_class` types, please use STL classes instead [51c747da] @@ -387,9 +387,9 @@ Release notes for commit range 6a49170027fb..962909fe9e78 - Updated [sub-group algoritms](doc/extensions/SubGroupAlgorithms/SYCL_INTEL_sub_group_algorithms.asciidoc) extension to use `marray` instead of `vec` [98715ae] - Updated data flow pipes extension to be based on SYCL 2020 [f22f2e0] - - Updated [ESIMD documentation](doc/extensions/ExplicitSIMD/dpcpp-explicit-simd.md) + - Updated [ESIMD documentation](doc/extensions/experimental/SYCL_EXT_INTEL_ESIMD/SYCL_EXT_INTEL_ESIMD.md) reflecting recent API changes [1e0bd1ed] - - Updated [devicelib](doc/extensions/C-CXX-StandardLibrary/C-CXX-StandardLibrary.rst) + - Updated [devicelib](doc/extensions/supported/C-CXX-StandardLibrary.rst) extension document with `scalnbn`, `abs` and `div` (and their variants) as supported [febfb5a] - Addressed renaming of TBB dll to `tbb12.dll` in the @@ -419,7 +419,7 @@ Release notes for commit range 6a49170027fb..962909fe9e78 - Fixed build issue when CUDA 11 is used [f7224f1] - Fixed caching of sub-devices in Level Zero backend[4c34f93] - Fixed requesting of USM memory allocation info on CUDA [691f842] - - Fixed [`joint_matrix_mad`](doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc) + - Fixed [`joint_matrix_mad`](doc/extensions/experimental/SYCL_EXT_ONEAPI_MATRIX.asciidoc) behaviour to return `A*B+C` instead of assigning the result to `C` [ea59c2b] - Workaround an issue in Level Zero backend when event isn't waited upon its completion but is queried for its status in an infinite loop [bfef316] @@ -512,9 +512,9 @@ Release notes for commit range 2ffafb95f887..6a49170027fb - Added an initial AOT implementation of the experimental matrix extension on the CPU device to target AMX hardware. Base features are supported [35db973] - Added support for - [SYCL_INTEL_local_memory extension](doc/extensions/LocalMemory/SYCL_INTEL_local_memory.asciidoc) + [SYCL_INTEL_local_memory extension](doc/extensions/supported/SYCL_EXT_ONEAPI_LOCAL_MEMORY.asciidoc) [5a66fcb] [9a734f6] - - Documented [Level Zero backend](doc/extensions/LevelZeroBackend/LevelZeroBackend.md) + - Documented [Level Zero backend](doc/extensions/supported/SYCL_EXT_ONEAPI_BACKEND_LEVEL_ZERO.md) [8994e6d] ## Improvements @@ -697,7 +697,7 @@ Release notes for commit range 5eebd1e4bfce..2ffafb95f887 default one [184d258b902a] - Enabled support for multiple AOCX device binaries for FPGA [6ea38f0f1f7a] ### SYCL Library - - Implemented [`online_compiler`](doc/extensions/OnlineCompilation/OnlineCompilation.asciidoc) + - Implemented [`online_compiler`](doc/extensions/experimental/SYCL_EXT_INTEL_ONLINE_COMPILER.asciidoc) feature [91122526e74d] ### Documentation - Added specification for [set kernel cache configuration extension](doc/extensions/IntelGPU/IntelGPUKernelCache.md) @@ -807,9 +807,9 @@ Release notes for commit range 5d7e0925..5eebd1e4bfce [c70b0477aa8a, cf0d0538d162] - Add online compilation API interface [70ac47d23264] ### Documentation - - Proposal for [new device descriptors extension](doc/extensions/IntelGPU/IntelGPUDeviceInfo.md) + - Proposal for [new device descriptors extension](doc/extensions/supported/SYCL_EXT_INTEL_DEVICE_INFO.md) was added [1ad813ba133e] - - Added [online compilation extension](doc/extensions/OnlineCompilation/OnlineCompilation.asciidoc) + - Added [online compilation extension](doc/extensions/experimental/SYCL_EXT_INTEL_ONLINE_COMPILER.asciidoc) specification [e05a19c8d303] ## Improvements @@ -1188,8 +1188,8 @@ Release notes for commit range 5976ff0..1fc0e4f - Added documentation for [`SPV_INTEL_usm_storage_classes`](doc/extensions/SPIRV/SPV_INTEL_usm_storage_classes.asciidoc) and [SYCL_INTEL_usm_address_spaces](doc/extensions/USMAddressSpaces/usm_address_spaces.asciidoc) [781fbfc] - Fixed SPIR-V format name spelling [6e9bf3b] - - Added extension [LocalMemory](doc/extensions/LocalMemory/SYCL_INTEL_local_memory.asciidoc) draft specification [4b5308a] - - Added extension [free functions queries](doc/extensions/FreeFunctionQueries/SYCL_INTEL_free_function_queries.asciidoc) draft specification [8953bfd] + - Added extension [LocalMemory](doc/extensions/supported/SYCL_EXT_ONEAPI_LOCAL_MEMORY.asciidoc) draft specification [4b5308a] + - Added extension [free functions queries](doc/extensions/experimental/SYCL_EXT_ONEAPI_FREE_FUNCTION_QUERIES.asciidoc) draft specification [8953bfd] - Removed documentation for implicit attribute `buffer_location` [71a56e7] ## Bug fixes @@ -1264,10 +1264,10 @@ Release notes for commit range 5976ff0..1fc0e4f Release notes for the commit range 75b3dc2..5976ff0 ## New features - - Implemented basic support for the [Explicit SIMD extension](doc/extensions/ExplicitSIMD/dpcpp-explicit-simd.md) + - Implemented basic support for the [Explicit SIMD extension](doc/extensions/experimental/SYCL_EXT_INTEL_ESIMD/SYCL_EXT_INTEL_ESIMD.md) for low-level GPU performance tuning [84bf234] [32bf607] [a lot of others] - Implemented support for the [SYCL_INTEL_usm_address_spaces extension](https://github.com/intel/llvm/pull/1840) - - Implemented support for the [Use Pinned Host Memory Property extension](doc/extensions/UsePinnedMemoryProperty/UsePinnedMemoryPropery.adoc) [e5ea144][aee2d6c][396759d] + - Implemented support for the [Use Pinned Host Memory Property extension](doc/extensions/supported/SYCL_EXT_ONEAPI_USE_PINNED_HOST_MEMORY_PROPERTY.asciidoc) [e5ea144][aee2d6c][396759d] - Implemented aspects feature from the SYCL 2020 provisional Specification [89804af] @@ -1348,19 +1348,19 @@ Release notes for the commit range 75b3dc2..5976ff0 query native handles of SYCL objects and to create SYCL objects by providing a native handle: platform, device, queue, program. The feature is described in the SYCL 2020 provisional specification [a51c333] - - Added support for `sycl::intel::atomic_ref` from [SYCL_INTEL_extended_atomics extension](doc/extensions/ExtendedAtomics/SYCL_INTEL_extended_atomics.asciidoc) + - Added support for `sycl::intel::atomic_ref` from [SYCL_INTEL_extended_atomics extension](doc/extensions/supported/SYCL_EXT_ONEAPI_EXTENDED_ATOMICS.asciidoc) ### Documentation - Added [SYCL_INTEL_accessor_properties](doc/extensions/accessor_properties/SYCL_INTEL_accessor_properties.asciidoc) extension specification [58fc414] - The documentation for the CUDA BE has been improved [928b815] - The [Queue Shortcuts extension](sycl/doc/extensions/QueueShortcuts/QueueShortcuts.adoc) document has been updated [defac3c2] - - Added [Use Pinned Host Memory Property extension](doc/extensions/UsePinnedMemoryProperty/UsePinnedMemoryPropery.adoc) specification [e5ea144] - - Updated the [SYCL_INTEL_extended_atomics extension](doc/extensions/ExtendedAtomics/SYCL_INTEL_extended_atomics.asciidoc) + - Added [Use Pinned Host Memory Property extension](doc/extensions/supported/SYCL_EXT_ONEAPI_USE_PINNED_HOST_MEMORY_PROPERTY.asciidoc) specification [e5ea144] + - Updated the [SYCL_INTEL_extended_atomics extension](doc/extensions/supported/SYCL_EXT_ONEAPI_EXTENDED_ATOMICS.asciidoc) to describe `sycl::intel::atomic_accessor` [4968e7c] - The [SYCL_INTEL_sub_group extension](doc/extensions/SubGroup/SYCL_INTEL_sub_group.asciidoc) document has been updated [067536e] - - Added [FPGA lsu extension](sycl/doc/extensions/IntelFPGA/FPGALsu.md) + - Added [FPGA lsu extension](sycl/doc/extensions/supported/SYCL_EXT_INTEL_FPGA_LSU.md) document [2c2b5f2] ## Bug fixes @@ -1454,7 +1454,7 @@ Release notes for the commit range ba404be..24726df - Introduced the Level Zero plugin which enables SYCL working on top of Level0 API. Interoperability is not supportet yet [d32da99] - Implemented [parallel_for simplification extension](doc/extensions/ParallelForSimpification) [13fe9fb] - - Implemented [SYCL_INTEL_enqueue_barrier extension](doc/extensions/EnqueueBarrier/enqueue_barrier.asciidoc) [da6bfd0] + - Implemented [SYCL_INTEL_enqueue_barrier extension](doc/extensions/supported/SYCL_EXT_ONEAPI_ENQUEUE_BARRIER.asciidoc) [da6bfd0] - Implemented [SYCL_INTEL_accessor_simplification extension](https://github.com/intel/llvm/pull/1498) [1f76efc] - Implemented OpenCL interoperability API following [SYCL Generalization proposal](https://github.com/KhronosGroup/SYCL-Shared/blob/master/proposals/sycl_generalization.md) [bae0639] @@ -1681,7 +1681,7 @@ Release notes for the commit range ba404be..67d3d9e [reduction extension proposal](doc/extensions/Reduction/Reduction.md) [f695479] - Published [parallel_for simplification extension](doc/extensions/ParallelForSimpification/SYCL_INTEL_parallel_for_simplification.asciidoc) [856a777] - - Added memory scope to [ExtendedAtomics extension](doc/extensions/ExtendedAtomics/SYCL_INTEL_extended_atomics.asciidoc) [f8e11e0] + - Added memory scope to [ExtendedAtomics extension](doc/extensions/supported/SYCL_EXT_ONEAPI_EXTENDED_ATOMICS.asciidoc) [f8e11e0] - Published [math array extension](doc/extensions/MathArray/SYCL_INTEL_math_array.asciidoc) [36c5041] - Added more comments that describe Scheduler design [ad441a0] - Published [extension mechanism proposal](doc/extensions/ExtensionMechanism/SYCL_INTEL_extension_api.asciidoc) [cf65794] @@ -1819,7 +1819,7 @@ Release notes for the commit range e8f1f29..ba404be [29d9cc2] - More details have been added about the `-fintelfpga` option in the [Compiler User Manual](doc/SYCLCompilerUserManual.md) [4b03ddb] - - Added [SYCL_INTEL_enqueue_barrier extension document](doc/extensions/EnqueueBarrier/enqueue_barrier.asciidoc) + - Added [SYCL_INTEL_enqueue_barrier extension document](doc/extensions/supported/SYCL_EXT_ONEAPI_ENQUEUE_BARRIER.asciidoc) [6cfd2cb] - Added [standard layout relaxation extension](doc/extensions/RelaxStdLayout/SYCL_INTEL_relax_standard_layout.asciidoc) [ce53521] @@ -1983,7 +1983,7 @@ Release notes for commit e8f1f29 [proposal](doc/extensions/OrderedQueue/OrderedQueue_v2.adoc) [9fa878f] - Added device code split options documentation to the [user's manual](doc/UsersManual.md) [1355aa6] - - Added documentation for [ExtendedAtomics extension](doc/extensions/ExtendedAtomics/SYCL_INTEL_extended_atomics.asciidoc) [4445462] + - Added documentation for [ExtendedAtomics extension](doc/extensions/supported/SYCL_EXT_ONEAPI_EXTENDED_ATOMICS.asciidoc) [4445462] - Removed old Ordered Queue proposal and make a note of deprecation [e8f1f29] ## Bug fixes @@ -2095,7 +2095,7 @@ Release notes for commit 78d80a1cc628af76f09c53673ada906a3d2f0131 ### Documentation - Added support for generation of SYCL documentation with Doxygen [de418d6] - - [Design document](doc/extensions/C-CXX-StandardLibrary/C-CXX-StandardLibrary.rst) + - [Design document](doc/extensions/supported/C-CXX-StandardLibrary.rst) which describes design of C/C++ standard library support has been added ## Bug fixes @@ -2216,9 +2216,9 @@ Release notes for commit e0a62df4e20eaf4bdff5c7dd46cbde566fbaee90 to use proper names of AOT related options [b3ee6a2] - Added [unnamed lambda extension](doc/extensions/UnnamedKernelLambda/SYCL_INTEL_unnamed_kernel_lambda.asciidoc) draft [47c4c71] - - Added [kernel restrict all extension](doc/extensions/KernelRestrictAll/SYCL_INTEL_kernel_restrict_all.asciidoc) + - Added [kernel restrict all extension](doc/extensions/supported/SYCL_EXT_INTEL_KERNEL_ARGS_RESTRICT.asciidoc) draft [47c4c71] - - Added initial draft of [data flow pipes extension](doc/extensions/DataFlowPipes/data_flow_pipes.asciidoc) + - Added initial draft of [data flow pipes extension](doc/extensions/supported/SYCL_EXT_INTEL_DATAFLOW_PIPES.asciidoc) proposal [ee2b482] - [USM doc](doc/extensions/USM/USM.adoc) was updated with new version of allocation functions [0c32410] @@ -2508,8 +2508,8 @@ Release notes for commit d4efd2ae3a708fc995e61b7da9c7419dac900372 See [SYCL ENV VARIABLES](doc/SYCLEnvironmentVariables.md) for information how to enable it. [c615566] - Added support for - [`cl::sycl::intel::fpga_reg`](doc/extensions/IntelFPGA/FPGAReg.md) and - [`cl::sycl::intel::fpga_selector`](doc/extensions/IntelFPGA/FPGASelector.md) + [`cl::sycl::intel::fpga_reg`](doc/extensions/supported/SYCL_EXT_INTEL_FPGA_REG.md) and + [`cl::sycl::intel::fpga_selector`](doc/extensions/supported/SYCL_EXT_INTEL_FPGA_DEVICE_SELECTOR.md) extensions. [e438d2b] ## Improvements diff --git a/sycl/doc/CompileTimeProperties.md b/sycl/doc/CompileTimeProperties.md index caf9a29672157..45a00d7643a40 100644 --- a/sycl/doc/CompileTimeProperties.md +++ b/sycl/doc/CompileTimeProperties.md @@ -148,7 +148,7 @@ kernel arguments. For example, the [SYCL\_ONEAPI\_accessor\_properties][6] extension could be redesigned to use compile-time properties. Such a redesign might look like: -[6]: +[6]: ``` namespace sycl { diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index 89c9341b772ca..be6dd7ffc8e54 100644 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -20,7 +20,7 @@ compiler and runtime. | `SYCL_CACHE_THRESHOLD` | Positive integer | Cache eviction threshold in days (default value is 7 for 1 week). Set to 0 for disabling time-based cache eviction. | | `SYCL_CACHE_MIN_DEVICE_IMAGE_SIZE` | Positive integer | Minimum size of device code image in bytes which is reasonable to cache on disk because disk access operation may take more time than do JIT compilation for it. Default value is 0 to cache all images. | | `SYCL_CACHE_MAX_DEVICE_IMAGE_SIZE` | Positive integer | Maximum size of device image in bytes which is cached. Too big kernels may overload disk too fast. Default value is 1 GB. | -| `SYCL_ENABLE_DEFAULT_CONTEXTS` | '1' or '0' | Enable ('1') or disable ('0') creation of default platform contexts in SYCL runtime. The default context for each platform contains all devices in the platform. Refer to [Platform Default Contexts](extensions/PlatformContext/PlatformContext.adoc) extension to learn more. Enabled by default on Linux and disabled on Windows. | +| `SYCL_ENABLE_DEFAULT_CONTEXTS` | '1' or '0' | Enable ('1') or disable ('0') creation of default platform contexts in SYCL runtime. The default context for each platform contains all devices in the platform. Refer to [Platform Default Contexts](extensions/supported/SYCL_EXT_ONEAPI_DEFAULT_CONTEXT.asciidoc) extension to learn more. Enabled by default on Linux and disabled on Windows. | | `SYCL_USM_HOSTPTR_IMPORT` | Integer | Enable by specifying non-zero value. Buffers created with a host pointer will result in host data promotion to USM, improving data transfer performance. To use this feature, also set SYCL_HOST_UNIFIED_MEMORY=1. | `(*) Note: Any means this environment variable is effective when set to any non-null value.` diff --git a/sycl/doc/GetStartedGuide.md b/sycl/doc/GetStartedGuide.md index 5e561780db1dd..1d6e1ad1d6754 100644 --- a/sycl/doc/GetStartedGuide.md +++ b/sycl/doc/GetStartedGuide.md @@ -705,7 +705,7 @@ SYCL_BE=PI_CUDA ./simple-sycl-app-cuda.exe **NOTE**: DPC++/SYCL developers can specify SYCL device for execution using device selectors (e.g. `cl::sycl::cpu_selector`, `cl::sycl::gpu_selector`, -[Intel FPGA selector(s)](extensions/IntelFPGA/FPGASelector.md)) as +[Intel FPGA selector(s)](extensions/supported/SYCL_EXT_INTEL_FPGA_DEVICE_SELECTOR.md)) as explained in following section [Code the program for a specific GPU](#code-the-program-for-a-specific-gpu). diff --git a/sycl/doc/MultiTileCardWithLevelZero.md b/sycl/doc/MultiTileCardWithLevelZero.md index 1eacda97406d3..6a76cf63a823b 100644 --- a/sycl/doc/MultiTileCardWithLevelZero.md +++ b/sycl/doc/MultiTileCardWithLevelZero.md @@ -24,7 +24,7 @@ One is using environment variable SYCL_DEVICE_FILTER described in [EnvironmentVa $ SYCL_DEVICE_FILTER=level_zero sycl-ls [level_zero:0] GPU : Intel(R) Level-Zero 1.1 [1.1.19792] ``` -Another way is to use similar SYCL API described in [FilterSelector.md](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/FilterSelector/FilterSelector.adoc) +Another way is to use similar SYCL API described in [SYCL\_EXT\_ONEAPI\_FILTER\_SELECTOR](extensions/supported/SYCL_EXT_ONEAPI_FILTER_SELECTOR.asciidoc) E.g. `filter_selector("level_zero")` will only see Level-Zero operated devices. If there are multiple GPUs in a system then they will be seen as multiple different root-devices. diff --git a/sycl/doc/extensions/Bitcast/SYCL_INTEL_bitcast.asciidoc b/sycl/doc/extensions/Bitcast/SYCL_INTEL_bitcast.asciidoc deleted file mode 100644 index 42add70ae4cd7..0000000000000 --- a/sycl/doc/extensions/Bitcast/SYCL_INTEL_bitcast.asciidoc +++ /dev/null @@ -1,140 +0,0 @@ -= SYCL_INTEL_bitcast - -:source-highlighter: coderay -:coderay-linenums-mode: table - -// This section needs to be after the document title. -:doctype: book -:toc2: -:toc: left -:encoding: utf-8 -:lang: en - -:blank: pass:[ +] - -// Set the default source code type in this document to C++, -// for syntax highlighting purposes. This is needed because -// docbook uses c++ and html5 uses cpp. -:language: {basebackend@docbook:c++:cpp} - -// This is necessary for asciidoc, but not for asciidoctor -:cpp: C++ - -== Introduction -IMPORTANT: This specification is a draft. - -NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are -trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. -used by permission by Khronos. - -NOTE: This document is better viewed when rendered as html with asciidoctor. -GitHub does not render image icons. - -This document describes an extension that allows reinterpreting bits in a data -type as another data type. - -== Name Strings - -+SYCL_INTEL_bitcast+ - -== Notice - -Copyright (c) 2020 Intel Corporation. All rights reserved. - -== Status - -Working Draft - -This is a preview extension specification, intended to provide early access to -a feature for review and community feedback. When the feature matures, this -specification may be released as a formal extension. - -Because the interfaces defined by this specification are not final and are -subject to change they are not intended to be used by shipping software -products. - -== Version - -Built On: {docdate} + -Revision: 1 - -== Contact -Felipe de Azevedo Piovezan, Intel (felipe 'dot' de 'dot' azevedo 'dot' piovezan 'at' intel 'dot' com) - -== Dependencies - -This extension is written against the SYCL 1.2.1 specification, Revision 6. - -== Overview - -It is frequently necessary to reinterpret the bits of an object as an object of -a different data type. Several methods to achieve this conversion exist but -most result in undefined behavior according to the {cpp} language. This -extension defines the function `sycl::bit_cast` to convert between data types -of the same size. The semantics of `sycl::bit_cast` are aligned with those of -`std::bit_cast`, which will be introduced by the {cpp}20 language (p0476r2). - -== Modifications of SYCL 1.2.1 Specification - -=== Add Section 4.14 Conversions and Type Casting - -==== Add Section 4.14.1 Reinterpreting Data As Another Type - -Reinterpreting the bits of an object as an object of a different data type can -be accomplished with the `sycl::bit_cast` function. This function has the same -specification as `std::bit_cast`, defined by {cpp}20. - -[source,c++,`sycl::bit_cast`,linenums] ----- -namespace cl { -namespace sycl { - template - constexpr To bit_cast(const From& from) noexcept; -} -} ----- - -|======================================== -|Function|Description -|+template constexpr To bit_cast(const From& from) noexcept+ -|Reinterprets the bits of an object of type `From` as an object of type `To`. Data types `To` and `From` must have the same size and must be trivially copyable. -|======================================== - -== Issues - -None. - -== Feature test macro - -This extension provides a feature-test macro as described in the core SYCL -specification section 6.3.3 "Feature test macros". Therefore, an implementation -supporting this extension must predefine the macro `SYCL_EXT_INTEL_BITCAST` to -one of the values defined in the table below. Applications can test for the -existence of this macro to determine if the implementation supports this -feature, or applications can test the macro's value to determine which of the -extension's APIs the implementation supports. - -[%header,cols="1,5"] -|=== -|Value |Description -|1 |Initial extension version. Base features are supported. -|=== - -== Revision History - -[cols="5,15,15,70"] -[grid="rows"] -[options="header"] -|======================================== -|Rev|Date|Author|Changes -|1|2020-04-14|Felipe de Azevedo Piovezan|*Initial public working draft* -|======================================== - -//************************************************************************ -//Other formatting suggestions: -// -//* Use *bold* text for host APIs, or [source] syntax highlighting. -//* Use +mono+ text for device APIs, or [source] syntax highlighting. -//* Use +mono+ text for extension names, types, or enum values. -//* Use _italics_ for parameters. -//************************************************************************ diff --git a/sycl/doc/extensions/C-CXX-StandardLibrary/DeviceLibExtensions.rst b/sycl/doc/extensions/C-CXX-StandardLibrary/DeviceLibExtensions.rst index 76487b31358be..500aab981c39e 100644 --- a/sycl/doc/extensions/C-CXX-StandardLibrary/DeviceLibExtensions.rst +++ b/sycl/doc/extensions/C-CXX-StandardLibrary/DeviceLibExtensions.rst @@ -48,7 +48,7 @@ Invoking `__devicelib_assert_read` after a kernel doesn't imply the kernel has assertion failed. See also: assert_extension_. -.. _assert_extension: ../Assert/SYCL_ONEAPI_ASSERT.asciidoc) +.. _assert_extension: ../supported/SYCL_EXT_ONEAPI_ASSERT.asciidoc) cl_intel_devicelib_math ========================== diff --git a/sycl/doc/extensions/DeviceSpecificKernelQueries/SYCL_INTEL_device_specific_kernel_queries.asciidoc b/sycl/doc/extensions/DeviceSpecificKernelQueries/SYCL_INTEL_device_specific_kernel_queries.asciidoc deleted file mode 100644 index 54a361c8e2dcd..0000000000000 --- a/sycl/doc/extensions/DeviceSpecificKernelQueries/SYCL_INTEL_device_specific_kernel_queries.asciidoc +++ /dev/null @@ -1,160 +0,0 @@ -= SYCL_INTEL_device_specific_kernel_queries - -:source-highlighter: coderay -:coderay-linenums-mode: table - -// This section needs to be after the document title. -:doctype: book -:toc2: -:toc: left -:encoding: utf-8 -:lang: en - -:blank: pass:[ +] - -// Set the default source code type in this document to C++, -// for syntax highlighting purposes. This is needed because -// docbook uses c++ and html5 uses cpp. -:language: {basebackend@docbook:c++:cpp} - -// This is necessary for asciidoc, but not for asciidoctor -:cpp: C++ - -== Introduction -IMPORTANT: This specification is a draft. - -NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are -trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. -used by permission by Khronos. - -NOTE: This document is better viewed when rendered as html with asciidoctor. -GitHub does not render image icons. - -This document describes an extension to rename device-specific kernel queries -to better describe the operations performed. - -== Name Strings - -+SYCL_INTEL_device_specific_kernel_queries+ - -== Notice - -Copyright (c) 2020 Intel Corporation. All rights reserved. - -== Status - -Working Draft - -This is a preview extension specification, intended to provide early access to -a feature for review and community feedback. When the feature matures, this -specification may be released as a formal extension. - -Because the interfaces defined by this specification are not final and are -subject to change they are not intended to be used by shipping software -products. - -== Version - -Built On: {docdate} + -Revision: 1 - -== Contact -Felipe de Azevedo Piovezan, Intel (felipe 'dot' de 'dot' azevedo 'dot' piovezan 'at' intel 'dot' com) - -== Dependencies - -This extension is written against the SYCL 1.2.1 specification, Revision 6. - -== Overview - -OpenCL provides two functions for querying properties of a kernel: -`clGetKernelInfo` is used for kernel properties that are device agnostic, -whereas `clGetKernelWorkGroupInfo` is used for kernel and work-group properties -that depend on a specific device. The name `clGetKernelWorkGroupInfo` doesn't -convey the API's intended use explicitly, that is, queries for -*device-specific* properties. In this sense, `clGetKernelWorkGroupInfo` is a -misnomer for some uses of the API. - -SYCL inherited these names in the form of the queries `kernel::get_info` and -`kernel::get_work_group_info`. This extension renames the latter (and its -template arguments as appropriate) in order to align the query name with its -functionality. - -== Modifications of SYCL 1.2.1 Specification - -=== Change Section 4.8.7 Kernel class - -==== Change `kernel` class interface - -Remove lines 29-31 (inclusive) from the `kernel` class interface: - -[source,c++,`sycl::kernel`,linenums] ----- -template -typename info::param_traits::return_type -get_work_group_info(const device &dev) const; ----- - -Add function overload `get_info` to the `kernel` class interface: - -[source,c++,`sycl::kernel`,linenums] ----- -template -typename info::param_traits::return_type -get_info(const device &dev) const; ----- - -==== Change table 4.83 Member functions of the Kernel class - -Remove row `get_work_group_info(const device &dev)`: - -[width="40%",frame="topbot",options="header,footer"] -|====================== -|Member functions |Description -|`template typename info::param_traits::return_type get_work_group_info(const device &dev)const` | -Query information from the work-group from a kernel using the info::kernel_work_group descriptor for a specific device -|====================== - -Add row `get_info(const device &dev)`: - -[width="40%",frame="topbot",options="header,footer"] -|====================== -|Member functions |Description -|`template typename info::param_traits::return_type get_info(const device &dev)const` | -Query information from a kernel using the info::kernel_device_specific descriptor for a specific device. -|====================== - -==== Change table 4.85 Kernel work-group information descriptors - -Rename table to: +Kernel device-specific information descriptors+. - -Replace all references to `info::kernel_work_group` with -`info::kernel_device_specific`. - -==== Change A.5 Kernel Information Descriptors - -Rename `enum class kernel_work_group: int` to `enum class -kernel_device_specific: int`. - -== Issues - -None. - -== Revision History - -[cols="5,15,15,70"] -[grid="rows"] -[options="header"] -|======================================== -|Rev|Date|Author|Changes -|1|2020-04-15|Felipe de Azevedo Piovezan|*Initial public working draft* -|======================================== - -//************************************************************************ -//Other formatting suggestions: -// -//* Use *bold* text for host APIs, or [source] syntax highlighting. -//* Use +mono+ text for device APIs, or [source] syntax highlighting. -//* Use +mono+ text for extension names, types, or enum values. -//* Use _italics_ for parameters. -//************************************************************************ diff --git a/sycl/doc/extensions/DiscardQueueEvents/SYCL_EXT_ONEAPI_DISCARD_QUEUE_EVENTS.asciidoc b/sycl/doc/extensions/DiscardQueueEvents/SYCL_EXT_ONEAPI_DISCARD_QUEUE_EVENTS.asciidoc index 6ff89aec9fb98..fc0d62f3d1a81 100644 --- a/sycl/doc/extensions/DiscardQueueEvents/SYCL_EXT_ONEAPI_DISCARD_QUEUE_EVENTS.asciidoc +++ b/sycl/doc/extensions/DiscardQueueEvents/SYCL_EXT_ONEAPI_DISCARD_QUEUE_EVENTS.asciidoc @@ -168,7 +168,7 @@ enum class event_command_status : int { This non-normative section describes the conditions when the DPC++ implementation provides an optimization benefit* for the `discard_events` property. - The queue must be constructed with the `in_order` property. - - A kernel submitted to the queue must not use the https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/Assert/SYCL_ONEAPI_ASSERT.asciidoc[fallback assert feature]. + - A kernel submitted to the queue must not use the link:../supported/SYCL_EXT_ONEAPI_ASSERT.asciidoc[fallback assert feature]. - A queue operation submitted to the queue must not use streams or buffer / image accessors. However, local accessors do not inhibit optimization. - Any queue operations using Level Zero backend temporarily work without optimization. diff --git a/sycl/doc/extensions/ExtendedAtomics/README.md b/sycl/doc/extensions/ExtendedAtomics/README.md deleted file mode 100644 index 66431c1c16dc1..0000000000000 --- a/sycl/doc/extensions/ExtendedAtomics/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# SYCL_INTEL_extended_atomics - -Introduces the `cl::sycl::intel::atomic_ref` class, which exposes additional functionality aligned with the `std::atomic_ref` class from C++20. diff --git a/sycl/doc/extensions/KernelRHSAttributes/README.md b/sycl/doc/extensions/KernelRHSAttributes/README.md deleted file mode 100644 index f647a761f40d7..0000000000000 --- a/sycl/doc/extensions/KernelRHSAttributes/README.md +++ /dev/null @@ -1,6 +0,0 @@ -# SYCL_INTEL_attribute_style - -Extension that deprecates use of function attributes (left-sided) applied to -device functions for kernel attributes, and introduces instead function-type -attributes (right-sided) for kernel attributes that apply directly to kernel functions. - diff --git a/sycl/doc/extensions/KernelRHSAttributes/SYCL_INTEL_attribute_style.asciidoc b/sycl/doc/extensions/KernelRHSAttributes/SYCL_INTEL_attribute_style.asciidoc deleted file mode 100755 index b9f7555a54332..0000000000000 --- a/sycl/doc/extensions/KernelRHSAttributes/SYCL_INTEL_attribute_style.asciidoc +++ /dev/null @@ -1,219 +0,0 @@ -= SYCL_INTEL_attribute_style - -:source-highlighter: coderay -:coderay-linenums-mode: table - -// This section needs to be after the document title. -:doctype: book -:toc2: -:toc: left -:encoding: utf-8 -:lang: en - -:blank: pass:[ +] - -// Set the default source code type in this document to C++, -// for syntax highlighting purposes. This is needed because -// docbook uses c++ and html5 uses cpp. -:language: {basebackend@docbook:c++:cpp} - -// This is necessary for asciidoc, but not for asciidoctor -:cpp: C++ - -== Introduction -IMPORTANT: This specification is a draft. - -NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) -are trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark -of Apple Inc. used by permission by Khronos. - -NOTE: This document is better viewed when rendered as html with asciidoctor. -GitHub does not render image icons. - -This document describes an extension that deprecates use of function attributes -(left-sided) for kernel attributes, and introduces use of function-type -attributes (right-sided) for kernel attributes. This allows SYCL kernel -attributes to be applied directly to kernels defined as lambdas, and no longer requires -propagation of attributes across call trees (which left-sided function attributes require). - -== Name Strings - -+SYCL_INTEL_attribute_style+ - -== Notice - -Copyright (c) 2020 Intel Corporation. All rights reserved. - -== Status - -Working Draft - -This is a preview extension specification, intended to provide early access -to a feature for review and community feedback. When the feature matures, -this specification may be released as a formal extension. - -Because the interfaces defined by this specification are not final and are -subject to change they are not intended to be used by shipping software products. - -== Version - -Built On: {docdate} + -Revision: 1 - -== Contact -Michael Kinsner, Intel (michael 'dot' kinsner 'at' intel 'dot' com) - -== Dependencies - -This extension is written against the SYCL 1.2.1 specification, Revision 6. - -== Overview - -SYCL 1.2.1 defines kernel attributes as applying to device functions (functions called -by kernels), and describes a call tree-based propagation scheme in which the attributes would -propagate to calling kernels. This extension instead enables attributes to be applied -directly to kernel functions, avoiding complex and error prone call tree propagation, and -making it clear to which kernel an attribute applies. - -A kernel attribute applied to the function as required by SYCL 1.2.1 looks like: - -[source,c++] ----- -[[attrib]] void foo1() {}; - -class f { - [[attrib]] void foo2() {}; -}; ----- - -where `attrib` is a placeholder for any of the kernel attributes defined by the SYCL specification or extensions. - -This extension deprecates the SYCL 1.2.1 attribute style (attribute applied to -a device function) and instead defines kernel attributes as attributes that apply to the -function type. The location of the resulting attributes looks like: - -[source,c++] ----- -void bar1() [[attrib]] {}; - -class f { - void bar2() [[attrib]] {}; -}; - -class KernelFunctor { - public: - void operator()(sycl::item<1> item) [[attrib]] {}; -}; - -auto bar3 = []()[[attrib]]{}; // Works on lambdas. operator() type ----- - -The function type attributes have an effect when applied to a kernel function, -do not propagate up or down call trees unless specified by a specific attribute, -and the effect when applied to non-kernel functions or non-functions is implementation defined. - -== Modifications of SYCL 1.2.1 Specification - -=== Modify Section 6.7 (Attributes) - -==== Rename the section - -Rename Section 6.7 from "Attributes" to "Kernel attributes". - -==== Replace the entire section with: - -The SYCL programming interface defines attributes that augment the -information available while generating the device code for a particular platform. -The attributes in Table 1 are defined in the `sycl::` namespace -and are applied to the function-type of kernel function declarations using -{cpp}11 attribute specifier syntax. - -A given attribute-token shall appear at most once in each attribute-list. The -first declaration of a function shall specify an attribute if any declaration -of that function specifies the same attribute. If a function is declared with -an attribute in one translation unit and the same function is declared without -the same attribute in another translation unit, the program is ill-formed and -no diagnostic is required. - -If there are any conflicts between different kernel attributes, then the behavior -is undefined. The attributes have an effect when applied to a device function and no -effect otherwise (i.e. no effect on non-device functions and on anything other than a -function). If an attribute is applied to a device function that is not a kernel function -(but that is potentially called from a kernel function), then the effect is implementation defined. -It is implementation defined whether any diagnostic is produced when an attribute is applied -to anything other than the function-type of a kernel function declaration. - -.Attributes supported by the SYCL programming interface -[cols="2*"] -|=== -|`reqd_work_group_size(dim0)` -`reqd_work_group_size(dim0, dim1)` -`reqd_work_group_size(dim0, dim1, dim2)` -|Indicates that the kernel must be launched with the specified work-group size. The sizes -are written in row-major format. Each argument to the attribute must be an integral -constant expression. The dimensionality of the attribute variant used must match the -dimensionality of the work-group used to invoke the kernel. - -SYCL device compilers should give a compilation error if the required work-group size -is unsupported. If the kernel is submitted for execution using an incompatible -work-group size, the SYCL runtime must throw an `nd_range_error`. - -|`work_group_size_hint(dim0)` -`work_group_size_hint(dim0, dim1)` -`work_group_size_hint(dim0, dim1, dim2)` -|Hint to the compiler on the work-group size most likely to be used when launching the kernel -at runtime. Each argument must be an integral constant expression, and the number of dimensional -values defined provide additional information to the compiler on the dimensionality most likely -to be used when launching the kernel at runtime. The effect of this attribute, if any, is -implementation defined. - -|`vec_type_hint()` -|Hint to the compiler on the vector computational width of of the kernel. The argument must be -one of the vector types defined in section 4.10.2. This attribute is deprecated by this -extension (available for use, but will be removed in the future and is not recommended for -use in new code). -|=== - - - -==== Add new sub-section 6.7.1: Deprecated attribute syntax -The SYCL 1.2.1 specification defined two mechanisms for kernel attributes to be specified, -which are deprecated by this extension. Deprecation means that the syntaxes are supported, -but will be removed in the future, and are therefore not recommended for use. Specifically, -the following two attribute syntaxes defined by the SYCL 1.2.1 specification are deprecated: - -1. The `attribute` syntax defined by the OpenCL C specification within device -code (`__attribute__\((attrib))`). -2. {cpp}11 attribute specifier syntax (`\[[attrib]]`) applied to device functions -(not the function-type), including automatic propagation of the attribute to any -caller of those device functions. - - -== Issues - -None. - -//. asd -//+ -//-- -//*RESOLUTION*: Not resolved. -//-- - -== Revision History - -[cols="5,15,15,70"] -[grid="rows"] -[options="header"] -|======================================== -|Rev|Date|Author|Changes -|1|2020-04-08|Michael Kinsner|*Initial public working draft* -|======================================== - -//************************************************************************ -//Other formatting suggestions: -// -//* Use *bold* text for host APIs, or [source] syntax highlighting. -//* Use +mono+ text for device APIs, or [source] syntax highlighting. -//* Use +mono+ text for extension names, types, or enum values. -//* Use _italics_ for parameters. -//************************************************************************ diff --git a/sycl/doc/extensions/KernelRestrictAll/README.md b/sycl/doc/extensions/KernelRestrictAll/README.md deleted file mode 100644 index 7c32b0918e82c..0000000000000 --- a/sycl/doc/extensions/KernelRestrictAll/README.md +++ /dev/null @@ -1,4 +0,0 @@ -# SYCL_INTEL_kernel_restrict_all - -Optimization hint attribute asserting that no kernel arguments will alias with each other. - diff --git a/sycl/doc/extensions/Matrix/README.md b/sycl/doc/extensions/Matrix/README.md deleted file mode 100644 index a930e3598ca88..0000000000000 --- a/sycl/doc/extensions/Matrix/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# Matrix Programming Extension for DPC++ -`matrix` is a new experimental DPC++ extension to provide unified matrix programming on different tensor hardware. The current implementation provides support of the matrix interface using Intel(R) Advanced Matrix Extensions (AMX). \ No newline at end of file diff --git a/sycl/doc/extensions/OrderedQueue/OrderedQueue_v2.adoc b/sycl/doc/extensions/OrderedQueue/OrderedQueue_v2.adoc deleted file mode 100644 index 0b66b2ac17da1..0000000000000 --- a/sycl/doc/extensions/OrderedQueue/OrderedQueue_v2.adoc +++ /dev/null @@ -1,33 +0,0 @@ -= SYCL Proposals: Queue Order Properties -James Brodman -v0.2 -:source-highlighter: pygments -:icons: font -== Introduction -This document presents an addition proposed for a future version of the SYCL Specification. The goal of this proposal is to reduce the complexity and verbosity of using SYCL for programmers. - -NOTE: This proposal replaces the previous one that was based on a separate class for in-order queues. - -== Queue Order Properties -Queues in SYCL are out-of-order by default. SYCL constructs directed acyclic graphs to ensure tasks are properly ordered based on their data dependences. However, many programs only require linear DAGs. The overheads of constructing and managing DAGs are unnecessary for this class of program. The `in_order` queue property is proposed to serve this class of programs by providing programmer-specified in-order semantics. This property is used with the existing `property_list` mechanism that is part of the normal SYCL `queue`. - -.Proposed Queue Property -[cols="^50,50",options="header"] -|=== - -|Property |Description -|`property::queue::in_order` -| The `in_order` property adds the requirement that the SYCL queue provides in-order semantics where tasks are executed in the order in which they are submitted. Tasks submitted in this fashion can be viewed as having an implicit dependence on the previously submitted operation. -|=== - -.Proposed Queue Method Addition -[cols="^25,^25,50",options="header"] -|=== - -|Method |Return Type |Description -|`is_in_order()` -|`bool` -| Returns `true` if a SYCL `queue` was created with the `in_order` property. Equivalent to `has_property()`. -|=== - - diff --git a/sycl/doc/extensions/ParallelForSimplification/SYCL_INTEL_parallel_for_simplification.asciidoc b/sycl/doc/extensions/ParallelForSimplification/SYCL_INTEL_parallel_for_simplification.asciidoc deleted file mode 100644 index 5cc4bbd2966c1..0000000000000 --- a/sycl/doc/extensions/ParallelForSimplification/SYCL_INTEL_parallel_for_simplification.asciidoc +++ /dev/null @@ -1,471 +0,0 @@ -= SYCL_INTEL_parallel_for_simplification -:source-highlighter: coderay -:coderay-linenums-mode: table - -// This section needs to be after the document title. -:doctype: book -:toc2: -:toc: left -:encoding: utf-8 -:lang: en - - -:blank: pass:[ +] - -// Set the default source code type in this document to C++, -// for syntax highlighting purposes. This is needed because -// docbook uses c++ and html5 uses cpp. -:language: {basebackend@docbook:c++:cpp} - -== Introduction -IMPORTANT: This specification is a draft. - -NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by permission by Khronos. - -NOTE: This document is better viewed when rendered as html with asciidoctor. GitHub does not render image icons. - -This document describes an extension that adds features for SYCL handler parallel_for API simplification. - - -== Name Strings - -+SYCL_INTEL_parallel_for_simplification+ - -== Notice - -Copyright (c) 2020 Intel Corporation. All rights reserved. - -== Status - -Working Draft - -This is a preview extension specification, intended to provide early access to a feature for review and community feedback. When the feature matures, this specification may be released as a formal extension. - -Because the interfaces defined by this specification are not final and are subject to change they are not intended to be used by shipping software products. - -== Version - -Built On: {docdate} + -Revision: 1 - -== Contact -Ruslan Arutyunyan, Intel (ruslan 'dot' arutyunyan 'at' intel 'dot' com) - -== Dependencies - -This extension is written against the SYCL 1.2.1 specification, Revision 6. - -== Overview - -SYCL 1.2.1 is based on {cpp}11. This extension describes the following features necessary to allow `parallel_for` simplification: - -* Allow parallel_for call with number or *braced-init-list* as the first argument, i.e. + -`parallel_for(5, _some_kernel_)` is equivalent to `parallel_for(range<1>{5}, _some_kernel_)` + -`parallel_for({5}, _some_kernel_)` is equivalent to `parallel_for(range<1>{5}, _some_kernel_)` + -`parallel_for({5, 5}, _some_kernel_)` is equivalent to `parallel_for(range<2>{5, 5}, _some_kernel_)` + -`parallel_for({5, 5, 5}, _some_kernel_)` is equivalent to `parallel_for(range<3>{5, 5, 5}, _some_kernel_)` - -* Allow {cpp}14 *Generic lambda expression* as the kernel for `parallel_for`, i.e `[](auto){}` - -* Allow {cpp}14 *Generic lambda expression* as the kernel for `parallel_for_work_group` - -* Allow integral type as the kernel argument for `parallel_for` called with `range<1>`, e.g. `[](int i){}` - -* Allow `item<1>` to `size_t` implicit conversion - -* Resolve ambiguity for `accessor::operator[]` when the argument for it is an `item` - -* `parallel_for` kernel shall always take `item` as an argument (not `item` or `id`) - -SYCL 1.2.1 example: - -[source,c++,UsageFrom,linenums] ----- -int main() { - constexpr int N = 32; - sycl::buffer B(N); - sycl::queue{}.submit([&](auto &h) { - auto a = B.get_access(h); - h.parallel_for(sycl::range(N), [=](cl::sycl::id<1> i) { - a[i] = i[0]; - }); - }); -} ----- - -Same example, but with this extension applied: - -[source,c++,UsageTo,linenums] ----- -int main() { - constexpr int N = 32; - sycl::buffer B(N); - sycl::queue{}.submit([&](auto &h) { - auto a = B.get_access(h); - h.parallel_for(N, [=](auto i) { - a[i] = i; - }); - }); -} ----- - -== Enabling the extension - -This extension is enabled for any {cpp}11 compliant compiler except *generic lambda expressions* simplification that requires {cpp}14. The {cpp}14 compilation mode flag is implementation defined (e.g. `-std=c{plus}{plus}14` or `/std:c{plus}{plus}14`). - -== Modifications of SYCL 1.2.1 specification - -=== Changes in 4.8.5 (SYCL functions for invoking kernels) - -==== Table 4.78 - -===== From - -|=== -|Member function | Description -a| -[source,c++,multiptr,linenums] ----- -template -void parallel_for(range numWorkItems, - KernelType kernelFunc) ----- | -Defines and invokes a SYCL kernel function as a lambda function or a named function object type, for the specified range and given an id or item for indexing in the indexing space defined by range. If it is a named function object and the function object type is globally visible there is no need for the developer to provide a kernel name (`typename KernelName`) for it, as described in 4.8.5. - -a| -[source,c++,multiptr,linenums] ----- -template -void parallel_for(range numWorkItems, - id workItemOffset, - KernelType kernelFunc) ----- | -Defines and invokes a SYCL kernel function as a lambda function or a named function object type, for the specified range and offset and given an id or item for indexing in the indexing space defined by range. If it is a named function object and the function object type is globally visible there is no need for the developer to provide a kernel name (`typename KernelName`) for it, as described in 4.8.5. - -a| -[source,c++,multiptr,linenums] ----- -template -void parallel_for(nd_range executionRange, - KernelType kernelFunc) ----- | -Defines and invokes a SYCL kernel functionas a lambda function or a named function -object type, for the specified `nd-range` and given an `nd-item` for indexing in the indexing space defined by the `nd-range`. If it is a named function object and the function object type is globally visible there is no need for the developer to provide a kernel name (`typename KernelName`) for it, as described in 4.8.5. - -a| -[source,c++,multiptr,linenums] ----- -template -void parallel_for_work_group(range numWorkGroups, - WorkgroupFunctionType kernelFunc) ----- | -Hierarchical kernel invocation method of a kernel defined as a lambda encoding the body of each work-group to launch. May contain multiple calls to `parallel_for_work_item(..)` methods representing the execution on each workitem. Launches num_work_groups workgroups of runtime-defined size. Described in detail in 4.8.5. - -a| -[source,c++,multiptr,linenums] ----- -template -void parallel_for_work_group(range numWorkGroups, - range workGroupSize, - WorkgroupFunctionType kernelFunc) ----- | -Hierarchical kernel invocation method of a kernel defined as a lambda encoding the body of each work-group to launch. May contain multiple calls to `parallel_for_work_item` methods representing the execution on each work-item. -Launches num_work_groups work-groups of `work_group_size` work-items each. Described in detail in 4.8.5. - -a| -[source,c++,multiptr,linenums] ----- -template -void parallel_for(range numWorkItems, - kernel syclKernel) ----- | -Kernel invocation method of a pre-compiled kernel defined by SYCL `sycl-kernel-function` instance, for the specified range and given an id or item for indexing in the indexing space defined by range, described in detail in 4.8.5 - -a| -[source,c++,multiptr,linenums] ----- -template -void parallel_for(range numWorkItems, - id workItemOffset, - kernel syclKernel) ----- | -Kernel invocation method of a pre-compiled kernel defined by SYCL `sycl-kernel-function` instance, for the specified range and offset and given an id or item for indexing in the indexing space defined by range, described in detail in 4.8.5 - -a| -[source,c++,multiptr,linenums] ----- -template -void parallel_for(nd_range ndRange, - kernel syclKernel) ----- | -Kernel invocation method of a pre-compiled kernel defined by SYCL kernel instance, -for the specified `nd-range` and given an `nd_item` for indexing in the indexing space -defined by the `nd_range`, described in detail in 4.8.5 -|=== - - -===== To - -|=== -|Member function | Description -a| -[source,c++,multiptr,linenums] ----- -template -void parallel_for(range numWorkItems, - KernelType kernelFunc) ----- | -Defines and invokes a SYCL kernel function as a lambda function or a named function object type, for the specified range and given an `item` or integral type (e.g `int`, `size_t`), if range is 1-dimensional, for indexing in the indexing space defined by range. Generic kernel functions are permitted, in that case the argument type is an `item`. If it is a named function object and the function object type is globally visible there is no need for the developer to provide a kernel name (`typename KernelName`) for it, as described in 4.8.5. - -a| -[source,c++,multiptr,linenums] ----- -template -void parallel_for(range numWorkItems, - id workItemOffset, - KernelType kernelFunc) ----- | -Defines and invokes a SYCL kernel function as a lambda function or a named function object type, for the specified range and offset and given an `item` or integral type (e.g `int`, `size_t`), if range is 1-dimensional, for indexing in the indexing space defined by range. Generic kernel functions are permitted, in that case the argument type is an `item`. If it is a named function object and the function object type is globally visible there is no need for the developer to provide a kernel name (`typename KernelName`) for it, as described in 4.8.5. - -a| -[source,c++,multiptr,linenums] ----- -template -void parallel_for(nd_range executionRange, - KernelType kernelFunc) ----- | -Defines and invokes a SYCL kernel functionas a lambda function or a named function -object type, for the specified `nd-range` and given an `nd-item` for indexing in the indexing space defined by the `nd-range` Generic kernel functions are permitted, in that case the argument type is an `nd_item`. If it is a named function object and the function object type is globally visible there is no need for the developer to provide a kernel name (`typename KernelName`) for it, as described in 4.8.5. - -a| -[source,c++,multiptr,linenums] ----- -template -void parallel_for_work_group(range numWorkGroups, - WorkgroupFunctionType kernelFunc) ----- | -Hierarchical kernel invocation method of a kernel defined as a lambda encoding the body of each work-group to launch. Generic kernel functions are permitted, in that case the argument type is a `group`. May contain multiple calls to `parallel_for_work_item(..)` methods representing the execution on each workitem. Launches num_work_groups workgroups of runtime-defined size. Described in detail in 4.8.5. - -a| -[source,c++,multiptr,linenums] ----- -template -void parallel_for_work_group(range numWorkGroups, - range workGroupSize, - WorkgroupFunctionType kernelFunc) ----- | -Hierarchical kernel invocation method of a kernel defined as a lambda encoding the body of each work-group to launch. Generic kernel functions are permitted, in that case the argument type is a `group`. May contain multiple calls to `parallel_for_work_item` methods representing the execution on each work-item. -Launches num_work_groups work-groups of `work_group_size` work-items each. Described in detail in 4.8.5. - -a| -[source,c++,multiptr,linenums] ----- -template -void parallel_for(range numWorkItems, - kernel syclKernel) ----- | -Kernel invocation method of a pre-compiled kernel defined by SYCL `sycl-kernel-function` instance for the specified range and given an `item` or integral type (e.g `int`, `size_t`), if range is 1-dimensional, for indexing in the indexing space defined by range. Generic kernel functions are permitted, in that case the argument type is an `item`. Described in detail in 4.8.5 - -a| -[source,c++,multiptr,linenums] ----- -template -void parallel_for(range numWorkItems, - id workItemOffset, - kernel syclKernel) ----- | -Kernel invocation method of a pre-compiled kernel defined by SYCL `sycl-kernel-function` instance for the specified range and offset and given an `item` or integral type (e.g `int`, `size_t`), if range is 1-dimensional, for indexing in the indexing space defined by range. Generic kernel functions are permitted, in that case the argument type is an `item`. Described in detail in 4.8.5 - -a| -[source,c++,multiptr,linenums] ----- -template -void parallel_for(nd_range ndRange, - kernel syclKernel) ----- | -Kernel invocation method of a pre-compiled kernel defined by SYCL kernel instance -for the specified `nd-range` and given an `nd_item` for indexing in the indexing space -defined by the `nd_range`. Generic kernel functions are permitted, in that case the argument type is an `nd_item`. Described in detail in 4.8.5 -|=== - -=== Changes in 4.8.5.2 (parallel_for invoke) - -==== The following paragraph changes - -===== From - -For the simplest case, users need only provide the global range (the total number of work-items in the index space) via a SYCL `range` parameter, and the SYCL runtime will select a local range (the number of work-items in each work-group). The local range chosen by the SYCL runtime is entirely implementation defined. In this case the function object that represents the SYCL kernel function must take either a single SYCL `id` parameter, or a single SYCL `item` parameter, representing the currently executing work-item within the range specified by the range parameter. - -===== To - -For the simplest case, users need only provide the global range (the total number of work-items in the index space) via a SYCL `range` parameter, and the SYCL runtime will select a local range (the number of work-items in each work-group). The local range chosen by the SYCL runtime is entirely implementation defined. In this case the function object that represents the SYCL kernel function must take one of: 1) a single SYCL `item` parameter, 2) single generic parameter (template parameter or `auto`), treated as `item` 3) single integral parameter (e.g. `int`, `size_t`) if `range` is 1-dimensional, representing the currently executing work-item within the range specified by the range parameter. - -==== Remove the following paragraph with example - -Below is an example of invoking a SYCL kernel function with `parallel_for` using a lambda function, and passing a SYCL `id` parameter. In this case only the global id is available. This variant of `parallel_for` is designed for when it is not necessary to query the global range of the index space being executed across, or the local (workgroup) size chosen by the implementation. - -[source,c++,multiptr,linenums] ----- -myQueue.submit([&](handler & cgh) { - auto acc = myBuffer.get_access(cgh); - cgh.parallel_for(range<1>(numWorkItems), [=] (id<1> index) { - acc[index] = 42.0f; - }); -}); ----- - -==== Add the following paragraph with example - -Below is an example of invoking a SYCL kernel function with `parallel_for` using a lambda function and passing `auto` parameter, treated as `item`. In this case both the global id and global range are queryable. The same effect can be achieved using `class` with template `operator()`. This variant of `parallel_for` is designed for when it is necessary to query the global range within which the global id will vary. -No information is queryable on the local (work-group) size chosen by the implementation. -[source,c++,multiptr,linenums] ----- -myQueue.submit([&](handler & cgh) { - auto acc = myBuffer.get_access(cgh); - cgh.parallel_for(range<1>(numWorkItems), [=] (auto item) { - size_t index = item.get_linear_id(); - acc[index] = 42.0f; - }); -}); ----- - -==== Add the following paragraph with example - -Below is an example of invoking a SYCL kernel function with `parallel_for` using a lambda function and passing integral type (e.g. `int`, `size_t`) parameter. This example is only valid when calling `parallel_for` with `range<1>`. In this case only the global id is available. This variant of `parallel_for` is designed for -when it is not necessary to query the global range of the index space being executed across, or the local (workgroup) size chosen by the implementation. -[source,c++,multiptr,linenums] ----- -myQueue.submit([&](handler & cgh) { - auto acc = myBuffer.get_access(cgh); - cgh.parallel_for(range<1>(numWorkItems), [=] (size_t index) { - acc[index] = 42.0f; - }); -}); ----- - -==== Add the following paragraph with example - -The `parallel_for` overload without offset can be called with either number or `braced-init-list` with 1-3 elements. If the case the following calls are equivalent: - -* `parallel_for(N, _some_kernel_)` has same effect as `parallel_for(range<1>(N), _some_kernel_)` - -* `parallel_for({N}, _some_kernel_)` has same effect as `parallel_for(range<1>(N), _some_kernel_)` - -* `parallel_for({N1, N2}, _some_kernel_)` has same effect as `parallel_for(range<2>(N1, N2), _some_kernel_)` - -* `parallel_for({N1, N2, N3}, _some_kernel_)` has same effect as `parallel_for(range<3>(N1, N2, N3), _some_kernel_)` - -[source,c++,multiptr,linenums] ----- -myQueue.submit([&](handler & cgh) { - auto acc = myBuffer.get_access(cgh); - cgh.parallel_for(numWorkItems, [=] (auto item) { - size_t index = item.get_linear_id(); - acc[index] = 42.0f; - }); -}); ----- - -== Changes in 4.8.1.5 (Item interface) - -=== Changes in synopsis - -==== Add the following public method with description - -[source,c++,multiptr,linenums] ----- -// only available if dimensions == 1 -operator size_t() const; ----- - -=== Changes in Table 4.70 - -==== Add the following row - -|=== -a| -[source,c++,multiptr,linenums] ----- -operator size_t() const ----- | -Returns the index representing the work-item position in the iteration space. + -This member function is only available if `dimensions` is equal to `1` -|=== - -== Changes in 4.7.6.6 (Buffer accessor interface) - -=== Changes in synopsis - -==== Remove the following public method with description - -[source,c++,multiptr,linenums] ----- -/* Available only when: (accessMode == access::mode::write || accessMode == -access::mode::read_write || accessMode == access::mode::discard_write || -accessMode == access::mode::discard_read_write) && dimensions == 1) */ -dataT &operator[](size_t index) const; ----- - -=== Changes in Table 4.45 - -==== Delete the following row - -|=== -|`dataT &operator[](size_t index) const` | -Available only when: `(accessMode == access::mode::write \|\| accessMode == access::mode::read_write \|\| accessMode == access::mode::discard_write \|\| accessMode == access::mode::discard_read_write) && dimensions == 1)``. + -Returns a reference to the element stored within the SYCL buffer this SYCL -accessor is accessing at the index specified by index. -|=== - -== Prototype implementation - -https://github.com/otcshare/llvm/pull/1054 - -== Issues - -None. - -//. asd -//+ -//-- -//*RESOLUTION*: Not resolved. -//-- - -== Revision History - -[cols="5,15,15,70"] -[grid="rows"] -[options="header"] -|======================================== -|Rev|Date|Author|Changes -|1|2020-17-04|Ruslan Arutyunyan|*Initial public working draft* -|======================================== - -//************************************************************************ -//Other formatting suggestions: -// -//* Use *bold* text for host APIs, or [source] syntax highlighting. -//* Use +mono+ text for device APIs, or [source] syntax highlighting. -//* Use +mono+ text for extension names, types, or enum values. -//* Use _italics_ for parameters. -//************************************************************************ diff --git a/sycl/doc/extensions/QueueShortcuts/QueueShortcuts.adoc b/sycl/doc/extensions/QueueShortcuts/QueueShortcuts.adoc deleted file mode 100644 index d4b68ec6365f3..0000000000000 --- a/sycl/doc/extensions/QueueShortcuts/QueueShortcuts.adoc +++ /dev/null @@ -1,24 +0,0 @@ -= SYCL Proposals: Queue Shortcuts -James Brodman -v0.1 -:source-highlighter: pygments -:icons: font -== Introduction -This document presents an addition proposed for a future version of the SYCL Specification. The goal of this proposal is to reduce the complexity and verbosity of using SYCL for programmers. - -== Queue Simplifications -Tasks are submitted to queues for execution in SYCL. This is normally done by invoking the `submit` method of a `queue` and passing a lambda that specifies the operation to perform and its dependences. A task's dependences have traditionally been specified through the creation of `accessor` objects that tell the SYCL runtime how data in a `buffer` or `image` is used. However, new proposals for data management in SYCL, such as Unified Shared Memory, provide alternatives to the buffer and accessor model. The USM proposal specifies dependences between kernels using `event` objects. The Queue Properties proposal specifies how to create a `queue` that has in-order semantics where each operation is performed after the previous operation has finished. - -It makes sense, in a SYCL with those proposals, to provide programmers with shortcuts to eliminate unnecessary extra code. When using in-order queues, for example, the lambda passed to `submit` does nothing except invoke `parallel_for` or `single_task`. This proposal adds the kernel specification methods directly to the `queue` class. These additional methods have two flavors. The first handles the "empty lambda" case. The second handles the USM dependence case by adding methods that take an `event` or vector of `events` that specify the dependences that must be satisfied before the kernel executes. Both flavors could be implemented in a header file by specifying the `submit` lambda for the programmer. - -Note: These simplifications do not depend on queue order properties. They apply both for in-order and out-of-order queues. - -.Queue Shortcuts -[source,cpp] ----- -include::queue.hpp[] ----- - -Overloads 4-12 shall support generic lambda as the kernel argument. For overloads 4-9 the generic argument is the `item` with the same dimensions that `range` argument has. For overloads 10-12 the generic argument is the `nd_item` with the same dimensions that `nd_range` argument has. - -Overloads 4-6 shall support number or `braced-init-list` as the `range` argument. diff --git a/sycl/doc/extensions/QueueShortcuts/queue.hpp b/sycl/doc/extensions/QueueShortcuts/queue.hpp deleted file mode 100644 index 5639a0bf20b5c..0000000000000 --- a/sycl/doc/extensions/QueueShortcuts/queue.hpp +++ /dev/null @@ -1,50 +0,0 @@ -class queue { -public: - // ... - template - event single_task(KernelType KernelFunc); // (1) - - template - event single_task(event DepEvent, KernelType KernelFunc); // (2) - - template - event single_task(const std::vector &DepEvents, - KernelType KernelFunc); // (3) - - template - event parallel_for(range NumWorkItems, KernelType KernelFunc); // (4) - - template - event parallel_for(range NumWorkItems, event DepEvent, - KernelType KernelFunc); // (5) - - template - event parallel_for(range NumWorkItems, - const std::vector &DepEvents, - KernelType KernelFunc); // (6) - - template - event parallel_for(range NumWorkItems, id WorkItemOffset, - KernelType KernelFunc); // (7) - - template - event parallel_for(range NumWorkItems, id WorkItemOffset, - event DepEvent, KernelType KernelFunc); // (8) - - template - event parallel_for(range NumWorkItems, id WorkItemOffset, - const std::vector &DepEvents, - KernelType KernelFunc); // (9) - - template - event parallel_for(nd_range ExecutionRange, KernelType KernelFunc); // (10) - - template - event parallel_for(nd_range ExecutionRange, event DepEvent, - KernelType KernelFunc); // (11) - - template - event parallel_for(nd_range ExecutionRange, - const std::vector &DepEvents, - KernelType KernelFunc); // (12) -}; diff --git a/sycl/doc/extensions/README.md b/sycl/doc/extensions/README.md index a473b5d328fc8..704deb8103dab 100755 --- a/sycl/doc/extensions/README.md +++ b/sycl/doc/extensions/README.md @@ -7,40 +7,14 @@ DPC++ extensions status: | Extension | Status | Comment | |-------------|:------------|:------------| -| [SYCL_INTEL_bitcast](Bitcast/SYCL_INTEL_bitcast.asciidoc) | Supported | As sycl::detail::bit_cast | -| [C and C++ Standard libraries support](C-CXX-StandardLibrary/C-CXX-StandardLibrary.rst) | Partially supported(OpenCL: CPU, GPU) | | -| [SYCL_INTEL_data_flow_pipes](DataFlowPipes/data_flow_pipes.asciidoc) | Partially supported(OpenCL: ACCELERATOR) | kernel_host_pipe_support part is not implemented | -| [SYCL_INTEL_deduction_guides](deduction_guides/SYCL_INTEL_deduction_guides.asciidoc) | Supported | | -| [SYCL_INTEL_device_specific_kernel_queries](DeviceSpecificKernelQueries/SYCL_INTEL_device_specific_kernel_queries.asciidoc) | Proposal | | -| [SYCL_INTEL_enqueue_barrier](EnqueueBarrier/enqueue_barrier.asciidoc) | Supported(OpenCL, Level Zero) | | -| [SYCL_INTEL_extended_atomics](ExtendedAtomics/SYCL_INTEL_extended_atomics.asciidoc) | Supported(OpenCL: CPU, GPU) | | | [SYCL_INTEL_group_algorithms](GroupAlgorithms/SYCL_INTEL_group_algorithms.asciidoc) | Deprecated | | | [GroupMask](GroupMask/GroupMask.asciidoc) | Proposal | | -| [FPGA selector](IntelFPGA/FPGASelector.md) | Supported | | -| [FPGA reg](IntelFPGA/FPGAReg.md) | Supported(OpenCL: ACCELERATOR) | | -| [SYCL_INTEL_kernel_restrict_all](KernelRestrictAll/SYCL_INTEL_kernel_restrict_all.asciidoc) | Supported(OpenCL) | | -| [SYCL_INTEL_attribute_style](KernelRHSAttributes/SYCL_INTEL_attribute_style.asciidoc) | Proposal | | -| [Queue Order Properties](OrderedQueue/OrderedQueue_v2.adoc) | Supported | | -| [Queue Shortcuts](QueueShortcuts/QueueShortcuts.adoc) | Supported | | | [Reductions for ND-Range Parallelism](Reduction/Reduction.md) | Partially supported(OpenCL: CPU, GPU; CUDA) | Not supported: multiple reduction vars, multi-dimensional reduction vars | -| [SYCL_INTEL_relax_standard_layout](RelaxStdLayout/SYCL_INTEL_relax_standard_layout.asciidoc) | Supported | | | [SPV_INTEL_function_pointers](SPIRV/SPV_INTEL_function_pointers.asciidoc) | Supported(OpenCL: CPU, GPU; HOST) | | | [SPV_INTEL_inline_assembly](SPIRV/SPV_INTEL_inline_assembly.asciidoc) | Supported(OpenCL: GPU) | | -| [LocalMemory](LocalMemory/LocalMemory.asciidoc) | Supported(OpenCL; CUDA) | Revision 1 of the spec is fully supported, future revisions are expected to expand the functionality | | [SYCL_INTEL_static_local_memory_query](StaticLocalMemoryQuery/SYCL_INTEL_static_local_memory_query.asciidoc) | Proposal | | | [Sub-groups for NDRange Parallelism](SubGroupNDRange/SubGroupNDRange.md) | Deprecated(OpenCL: CPU, GPU) | | | [Sub-groups](SubGroup/SYCL_INTEL_sub_group.asciidoc) | Deprecated | | -| [SYCL_INTEL_unnamed_kernel_lambda](UnnamedKernelLambda/SYCL_INTEL_unnamed_kernel_lambda.asciidoc) | Supported(OpenCL) | | -| [Unified Shared Memory](USM/USM.adoc) | Supported(OpenCL) | | -| [Use Pinned Memory Property](UsePinnedMemoryProperty/UsePinnedMemoryPropery.adoc) | Supported | | -| [Level-Zero backend specification](LevelZeroBackend/LevelZeroBackend.md) | Supported | | -| [Platform Context](PlatformContext/PlatformContext.adoc) | Proposal | | -| [SYCL_INTEL_group_sort](GroupAlgorithms/SYCL_INTEL_group_sort.asciidoc) | Experimental. Partially supported | | -| [Assert](Assert/SYCL_ONEAPI_ASSERT.asciidoc) | Proposal | | -| [Matrix](Matrix/dpcpp-joint-matrix.asciidoc) | Partially supported(AMX AOT) | Not supported: dynamic-extent, wg and wi scopes, layouts other than packed| -| [SYCL_INTEL_free_function_queries](FreeFunctionQueries/SYCL_INTEL_free_function_queries.asciidoc) | Supported (experimental) | | -| [EXT_ONEAPI_max_work_groups](MaxWorkGroupQueries/max_work_group_query.md) | Supported | | -| [SYCL_INTEL_bf16_conversion](Bf16Conversion/SYCL_INTEL_bf16_conversion.asciidoc) | Partially supported (Level Zero: GPU) | Currently available only on Xe HP GPU. ext_intel_bf16_conversion aspect is not supported. | | [Property List](PropertyList/SYCL_EXT_ONEAPI_property_list.asciidoc) | Proposal | | | [DiscardQueueEvents](DiscardQueueEvents/SYCL_EXT_ONEAPI_DISCARD_QUEUE_EVENTS.asciidoc) | Proposal | | diff --git a/sycl/doc/extensions/RelaxStdLayout/SYCL_INTEL_relax_standard_layout.asciidoc b/sycl/doc/extensions/RelaxStdLayout/SYCL_INTEL_relax_standard_layout.asciidoc deleted file mode 100755 index 3c5bce70f2740..0000000000000 --- a/sycl/doc/extensions/RelaxStdLayout/SYCL_INTEL_relax_standard_layout.asciidoc +++ /dev/null @@ -1,161 +0,0 @@ -= SYCL_INTEL_relax_standard_layout - -:source-highlighter: coderay -:coderay-linenums-mode: table - -// This section needs to be after the document title. -:doctype: book -:toc2: -:toc: left -:encoding: utf-8 -:lang: en - -:blank: pass:[ +] - -// Set the default source code type in this document to C++, -// for syntax highlighting purposes. This is needed because -// docbook uses c++ and html5 uses cpp. -:language: {basebackend@docbook:c++:cpp} - -// This is necessary for asciidoc, but not for asciidoctor -:cpp: C++ - -== Introduction -IMPORTANT: This specification is a draft. - -NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by permission by Khronos. - -NOTE: This document is better viewed when rendered as html with asciidoctor. GitHub does not render image icons. - -This document describes an extension that removes the requirement for data accessible on a device to be standard layout. The trivially copyable requirement remains unchanged. - -== Name Strings - -+SYCL_INTEL_relax_standard_layout+ - -== Notice - -Copyright (c) 2020 Intel Corporation. All rights reserved. - -== Status - -Working Draft - -This is a preview extension specification, intended to provide early access to a feature for review and community feedback. When the feature matures, this specification may be released as a formal extension. - -Because the interfaces defined by this specification are not final and are subject to change they are not intended to be used by shipping software products. - -== Version - -Built On: {docdate} + -Revision: 2 - -== Contact -Michael Kinsner, Intel (michael 'dot' kinsner 'at' intel 'dot' com) - -== Dependencies - -This extension is written against the SYCL 1.2.1 specification, Revision 6. - -== Overview - -SYCL 1.2.1 requires data stored into a buffer or passed as a kernel argument to be standard layout. This is in addition to the data also being trivially copyable. This extension relaxes the standard layout requirement while leaving the trivially copyable requirement intact. - -Standard layout does not guarantee ABI compatibility across devices or between a device and the host, but can help. To avoid introducing incompatible data layouts, this extension also requires device compilers to validate that their layout of data is compatible with that used by the host. There are multiple implementation approaches that can achieve this, although they are beyond the scope of this extension which describes the behavior only. - -== Modifications of SYCL 1.2.1 Specification - -=== Modify Sentence in Section 3.10 (Language restrictions in kernels) - -==== From: - -Sharing data structures between host and device code imposes certain restrictions, such as use of only user defined classes that are {cpp}11 standard layout classes for the data structures, classes that are {cpp}11 trivially copyable classes for the data structures, and in general, no pointers initialized for the host can be used on the device. - -==== To: - -Sharing data structures between host and device code imposes certain restrictions, such as use of only user defined classes that are {cpp}11 trivially copyable classes for the data structures, and in general, no pointers initialized for the host can be used on the device. - -=== Modify Sentence in Section 4.8.5 (SYCL function for invoking kernels, +set_arg+ description) - -==== From: - -The argument can be either a SYCL accessor, a SYCL sampler or a trivially copyable and standard-layout C++ type. - -==== To: - -The argument can be either a SYCL accessor, a SYCL sampler or a trivially copyable C++ type. - -=== Modify Sentence in Section 4.8.9 (Defining kernels) - -==== From: - -These function objects provide the same functionality as any C++ function object, with the restriction that they need to follow {cpp}11 standard layout rules. - -==== To: - -These function objects provide the same functionality as any C++ function object, with the restriction that they need to follow {cpp}11 rules to be trivially copyable. - - -=== Add new bullet point in Section 4.8.11 (Rules for parameter passing to kernels) - -The device compiler(s) must validate that the layout of any data shared between the host and the device(s) (e.g. value kernel arguments or data accessed through an accessor or USM) matches the layout of that data on the host. If there is a layout mismatch, realized or potential, the device compiler must issue an error and compilation must fail. - -=== Modify bullet point in Section 4.8.11 (Rules for parameter passing to kernels) - -==== From: - -{cpp} standard layout values must be passed by value to the kernel. - -==== To: - -{cpp} trivially copyable types must be passed by value to the kernel. - -=== Modify bullet point in Section 4.8.11 (Rules for parameter passing to kernels) - -==== From: - -{cpp} non-standard layout values must not be passed as arguments to a kernel that is compiled for a device. - -==== To: - -{cpp} non-trivially copyable types must not be passed as arguments to a kernel that is compiled for a device. - -=== Modify sentence in glossary entry for "SYCL kernel function" - -==== From: - -The function object can be a named standard layout type or lambda function. - -==== To: - -The function object can be a named trivially copyable type or lambda function. - -== Issues - -None. - -//. asd -//+ -//-- -//*RESOLUTION*: Not resolved. -//-- - -== Revision History - -[cols="5,15,15,70"] -[grid="rows"] -[options="header"] -|======================================== -|Rev|Date|Author|Changes -|1|2020-03-17|Michael Kinsner|*Initial public working draft* -|2|2020-03-24|Michael Kinsner|Remove repeated sentence and fix typo -|======================================== - -//************************************************************************ -//Other formatting suggestions: -// -//* Use *bold* text for host APIs, or [source] syntax highlighting. -//* Use +mono+ text for device APIs, or [source] syntax highlighting. -//* Use +mono+ text for extension names, types, or enum values. -//* Use _italics_ for parameters. -//************************************************************************ diff --git a/sycl/doc/extensions/USM/USM.adoc b/sycl/doc/extensions/USM/USM.adoc deleted file mode 100644 index 6a001d771568e..0000000000000 --- a/sycl/doc/extensions/USM/USM.adoc +++ /dev/null @@ -1,12 +0,0 @@ -= SYCL(TM) Proposals: Unified Shared Memory -James Brodman ; Ben Ashbaugh ; Michael Kinsner -v0.999 -:source-highlighter: pygments -:icons: font -:y: icon:check[role="green"] -:n: icon:times[role="red"] - -== Please Refer to SYCL 2020 - -The Unified Shared Memory (USM) extension is now part of the provisional SYCL 2020 specification. -Please refer to that document for the latest definition of USM in SYCL at https://www.khronos.org/sycl/[SYCL @ Khronos]. \ No newline at end of file diff --git a/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc b/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc deleted file mode 100644 index aff8eb116b82c..0000000000000 --- a/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc +++ /dev/null @@ -1,1264 +0,0 @@ -= cl_intel_unified_shared_memory - -// This section needs to be after the document title. -:doctype: book -:toc2: -:toc: left -:encoding: utf-8 -:lang: en - -:blank: pass:[ +] - -// Set the default source code type in this document to C, -// for syntax highlighting purposes. -:language: c - -// This is what is needed for C++, since docbook uses c++ -// and everything else uses cpp. This doesn't work when -// source blocks are in table cells, though, so don't use -// C++ unless it is required. -//:language: {basebackend@docbook:c++:cpp} - -[float] -== XXX - Not complete yet!!! - -== Name Strings - -`cl_intel_unified_shared_memory` - -== Contact - -Ben Ashbaugh, Intel (ben 'dot' ashbaugh 'at' intel 'dot' com) - -== Contributors - -// spell-checker: disable -Ben Ashbaugh, Intel + -James Brodman, Intel + -Wenju He, Intel + -Kris Kang, Intel + -Michael Kinsner, Intel + -Michal Mrozek, Intel + -Lukasz Towarek, Intel + -TODO: many more... -// spell-checker: enable - -== Notice - -Copyright (c) 2020 Intel Corporation. All rights reserved. - -== Status - -Working Draft - -This is a preview extension specification, intended to provide early access to a feature for review and community feedback. -When the feature matures, this specification may be released as a formal extension. - -Because the interfaces defined by this specification are not final and are subject to change they are not intended to be used by shipping software products. -If you are interested in using this feature in your software product, please let us know! - -== Version - -Built On: {docdate} + -Revision: R - -== Dependencies - -This extension is written against the OpenCL API Specification Version 2.2, Revision v2.2-11. -This extension extends the `clSetKernelExecInfo` API from OpenCL 2.0 and hence requires an OpenCL 2.0 platform, however it is intended to be implementable by devices supporting many diverse OpenCL versions. - -== Overview - -This extension adds "Unified Shared Memory" (USM) to OpenCL. -Unified Shared Memory provides: - -* Easier integration into existing code bases by representing OpenCL allocations as pointers rather than handles (`cl_mems`), with full support for pointer arithmetic into allocations. -* Fine-grain control over ownership and accessibility of OpenCL allocations, to optimally choose between performance and programmer convenience. -* A simpler programming model, by automatically migrating some allocations between OpenCL devices and the host. - -While Unified Shared Memory (USM) shares many features with Shared Virtual Memory (SVM), Unified Shared Memory provides a different mix of capabilities and control. -Specifically: - -* The matrix of USM capabilities supports combinations of features beyond the SVM capability queries. - -* USM provides explicit control over memory placement and migration by supporting host allocations with wide visibility, devices allocations for best performance, and shared allocations that may migrate between devices and the host. - -* USM allocations may be associated with both a device and a context. -The USM allocation APIs support additional memory flags and optional properties to affect how memory is allocated and migrated. - -* There is no need for APIs to map or unmap USM allocations, because host accessible USM allocations do not need to be mapped or unmapped to access the contents of a USM allocation on the host. - -* An application may indicate that a kernel may access categories of USM allocations indirectly, without passing a set of all indirectly accessed USM allocations to the kernel, improving usability and reducing driver overhead for kernels that access many USM allocations. - -* USM adds API functions to query properties of a USM allocation and to provide memory advice for an allocation. - -Unified Shared Memory and Shared Virtual Memory can and will coexist for many implementations. -All implementations that support Shared Virtual Memory may support at least some types of Unified Shared Memory. - -== New API Functions - -[source] ----- -void* clHostMemAllocINTEL( - cl_context context, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret); - -void* clDeviceMemAllocINTEL( - cl_context context, - cl_device_id device, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret); - -void* clSharedMemAllocINTEL( - cl_context context, - cl_device_id device, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret); - -cl_int clMemFreeINTEL( - cl_context context, - void* ptr); - -cl_int clMemBlockingFreeINTEL( - cl_context context, - void* ptr); - -cl_int clGetMemAllocInfoINTEL( - cl_context context, - const void* ptr, - cl_mem_info_intel param_name, - size_t param_value_size, - void* param_value, - size_t* param_value_size_ret); - -cl_int clSetKernelArgMemPointerINTEL( - cl_kernel kernel, - cl_uint arg_index, - const void* arg_value); - -cl_int clEnqueueMemFillINTEL( - cl_command_queue command_queue, - void* dst_ptr, - const void* pattern, - size_t pattern_size, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); - -cl_int clEnqueueMemcpyINTEL( - cl_command_queue command_queue, - cl_bool blocking, - void* dst_ptr, - const void* src_ptr, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); - -cl_int clEnqueueMigrateMemINTEL( - cl_command_queue command_queue, - const void* ptr, - size_t size, - cl_mem_migration_flags_intel flags, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); - -cl_int clEnqueueMemAdviseINTEL( - cl_command_queue command_queue, - const void* ptr, - size_t size, - cl_mem_advice_intel advice, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); ----- - -== New API Enums - -Accepted value for the _param_name_ parameter to *clGetDeviceInfo* to query the Unified Shared Memory capabilities of an OpenCL device: - -[source] ----- -#define CL_DEVICE_HOST_MEM_CAPABILITIES_INTEL 0x4190 -#define CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL 0x4191 -#define CL_DEVICE_SINGLE_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4192 -#define CL_DEVICE_CROSS_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4193 -#define CL_DEVICE_SHARED_SYSTEM_MEM_CAPABILITIES_INTEL 0x4194 ----- - -Bitfield type and bits describing the Unified Shared Memory capabilities of an OpenCL device: - -[source] ----- -typedef cl_bitfield cl_device_unified_shared_memory_capabilities_intel; - -#define CL_UNIFIED_SHARED_MEMORY_ACCESS_INTEL (1 << 0) -#define CL_UNIFIED_SHARED_MEMORY_ATOMIC_ACCESS_INTEL (1 << 1) -#define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ACCESS_INTEL (1 << 2) -#define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ATOMIC_ACCESS_INTEL (1 << 3) ----- - -Type to describe optional Unified Shared Memory allocation properties: - -[source] ----- -typedef cl_bitfield cl_mem_properties_intel; ----- - -Enumerant value requesting optional allocation properties for a Unified Shared Memory allocation: - -[source] ----- -#define CL_MEM_ALLOC_FLAGS_INTEL 0x4195 ----- - -Bitfield type and bits describing optional allocation properties for a Unified Shared Memory allocation: - -[source] ----- -typedef cl_bitfield cl_mem_alloc_flags_intel; - -#define CL_MEM_ALLOC_WRITE_COMBINED_INTEL (1 << 0) ----- - -Enumeration type and values for the _param_name_ parameter to *clGetMemAllocInfoINTEL* to query information about a Unified Shared Memory allocation. -Optional allocation properties may also be queried using *clGetMemAllocInfoINTEL*: - -[source] ----- -typedef cl_uint cl_mem_info_intel; - -#define CL_MEM_ALLOC_TYPE_INTEL 0x419A -#define CL_MEM_ALLOC_BASE_PTR_INTEL 0x419B -#define CL_MEM_ALLOC_SIZE_INTEL 0x419C -#define CL_MEM_ALLOC_DEVICE_INTEL 0x419D -/* CL_MEM_ALLOC_FLAGS_INTEL - defined above */ -/* Enum values 0x419E-0x419F are reserved for future queries. */ ----- - -Enumeration type and values describing the type of Unified Shared Memory allocation. -Returned by *clGetMemAllocInfoINTEL* when _param_name_ is `CL_MEM_ALLOC_TYPE_INTEL`: - -[source] ----- -typedef cl_uint cl_unified_shared_memory_type_intel; - -#define CL_MEM_TYPE_UNKNOWN_INTEL 0x4196 -#define CL_MEM_TYPE_HOST_INTEL 0x4197 -#define CL_MEM_TYPE_DEVICE_INTEL 0x4198 -#define CL_MEM_TYPE_SHARED_INTEL 0x4199 ----- - -Bitfield type and bits used by *clEnqueueMigrateMemINTEL* to describe how to migrate a Unified Shared Memory allocation. -Note, this bitfield type and bits are aliases of the `cl_mem_migration_flags` bitfield type and bits: - -[source] ----- -typedef cl_bitfield cl_mem_migration_flags_intel; - -#define CL_MIGRATE_MEM_OBJECT_HOST_INTEL (1 << 0) -#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED_INTEL (1 << 1) ----- - -Enumeration type and values for the _advice_ parameter to *clEnqueueMemAdviseINTEL* to provide memory advice for a Unified Shared Memory allocation: - -[source] ----- -typedef cl_uint cl_mem_advice_intel; -/* Enum values 0x4208-0x420F are reserved for future memory advices. */ ----- - -Accepted value for the _param_name_ parameter to *clSetKernelExecInfo* to specify that the kernel may indirectly access Unified Shared Memory allocations of the specified type: - -[source] ----- -#define CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL 0x4200 -#define CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL 0x4201 -#define CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL 0x4202 ----- - -Accepted value for the _param_name_ parameter to *clSetKernelExecInfo* to specify a set of Unified Shared Memory allocations that the kernel may indirectly access: - -[source] ----- -#define CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL 0x4203 ----- - -New return values from *clGetEventInfo* when _param_name_ is `CL_EVENT_COMMAND_TYPE`: - -[source] ----- -#define CL_COMMAND_MEMFILL_INTEL 0x4204 -#define CL_COMMAND_MEMCPY_INTEL 0x4205 -#define CL_COMMAND_MIGRATEMEM_INTEL 0x4206 -#define CL_COMMAND_MEMADVISE_INTEL 0x4207 ----- - -== Modifications to the OpenCL API Specification - -=== Section 3.3 - Memory Model - -TODO - WIP - -=== Section 4.2 - Querying Devices: - -Add to Table 5 - OpenCL Device Queries: - -[caption="Table 5. "] -.OpenCL Device Queries -[width="100%",cols="<30%,<20%,<50%",options="header"] -|==== -| *cl_device_info* | Return Type | Description -| `CL_DEVICE_HOST_{zwsp}MEM_CAPABILITIES_INTEL` + - {blank} - `CL_DEVICE_DEVICE_{zwsp}MEM_CAPABILITIES_INTEL` + - {blank} - `CL_DEVICE_SINGLE_DEVICE_SHARED_{zwsp}MEM_CAPABILITIES_INTEL` + - {blank} - `CL_DEVICE_CROSS_DEVICE_SHARED_{zwsp}MEM_CAPABILITIES_INTEL` + - {blank} - `CL_DEVICE_SHARED_SYSTEM_{zwsp}MEM_CAPABILITIES_INTEL` - | `cl_device_unified_shared_{zwsp}memory_capabilities_intel` - | Describes the ability for a device to access Unified Shared Memory allocations of the specified type. - - The host memory access capabilities apply to any host allocation. - - The device memory access capabilities apply to any device allocation associated with this device. - - The single device shared memory access capabilities apply to any shared allocation associated with this device. - - The cross-device shared memory access capabilities apply to any shared allocation associated with this device, or to any shared memory allocation on another device that also supports the same cross-device shared memory access capability. - - The shared system memory access capabilities apply to any allocations made by a system allocator, such as `malloc` or `new`. - - The access capabilities are encoded as bits in a bitfield. - Supported capabilities are: - - `CL_UNIFIED_SHARED_MEMORY_ACCESS_INTEL`: - The device may access (read or write) Unified Shared Memory allocations of this type. - - `CL_UNIFIED_SHARED_MEMORY_ATOMIC_ACCESS_INTEL`: - The device may perform atomic operations on Unified Shared Memory allocations of this type. - - `CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ACCESS_INTEL`: - The device supports concurrent access to Unified Shared Memory allocations of this type. - Concurrent access may be from the host, or from other OpenCL devices, where applicable. - - `CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ATOMIC_ACCESS_INTEL`: - The device supports concurrent atomic access to Unified Shared Memory allocations of this type. -|==== - -=== New Section 5.X - Unified Shared Memory - -This section describes _Unified Shared Memory_, abbreviated _USM_. -Unified Shared Memory allocations are represented as pointers in the host application, rather than as handles (specifically, `cl_mems`). -Unified Shared Memory additionally provides fine-grain control over placement and accessibility of an allocation, allowing many tradeoffs between programmer convenience and performance. - -Three types of Unified Shared Memory allocations are supported. -The type describes the _ownership_ of the allocation: - -. **Host** allocations are owned by the host and are intended to be allocated out of system memory. -Host allocations are accessible by the host and one or more devices. -The same pointer to a host allocation may be used on the host and all supported devices; they have _address equivalence_. -Host allocations are not expected to migrate between system memory and device local memory. -Host allocations trade off wide accessibility and transfer benefits for potentially higher per-access costs, such as over PCI express. - -. **Device** allocations are owned by a specific device and are intended to be allocated out of device local memory, if present. -Device allocations generally trade off access limitations for higher performance. -With very few exceptions, device allocations may only be accessed by the specific device they are allocated on, or copied to a host or another device allocation. -The same pointer to a device allocation may be used on any supported device. - -. **Shared** allocations share ownership and are intended to migrate between the host and one or more devices. -Shared allocations are accessible by at least the host and an associated device. -Shared allocations may be accessed by other devices in some cases. -Shared allocations trade off transfer costs for per-access benefits. -The same pointer to a shared allocation may be used on the host and all supported devices. - -A **Shared System** allocation is a sub-class of a **Shared** allocation, where the memory is allocated by a _system allocator_ - such as `malloc` or `new` - rather than by a USM allocation API. -Shared system allocations have no associated device - they are inherently cross-device. -Like other shared allocations, shared system allocations are intended to migrate between the host and supported devices, and the same pointer to a shared system allocation may be used on the host and all supported devices. - -.Summary of Unified Shared Memory Capabilities -[width="100%",options="header"] -|==== -| Name | Initial Location 2+| Accessible By 2+| Migratable To - -.2+| **Host** .2+| Host -| Host | Yes | Host | N/A -| Any Device | Yes (perhaps over PCIe) | Device | No - -.3+| **Device** .3+| Specific Device -| Host | No | Host | No -| Specific Device | Yes | Device | N/A -| Another Device | Optional | Another Device | No - -.3+| **Shared** .3+| Host, or Specific Device, Or Unspecified -| Host | Yes | Host | Yes -| Specific Device | Yes | Device | Yes -| Another Device | Optional | Another Device | Optional - -.2+| **Shared System** .2+| Host -| Host | Yes | Host | Yes -| Device | Yes | Device | Yes - -|==== - -OpenCL devices may support different capabilities for each type of Unified Shared Memory allocation. -Supported capabilities are: - -* `CL_UNIFIED_SHARED_MEMORY_ACCESS_INTEL`: -The device may access (read or write) Unified Shared Memory allocations of this type. - -* `CL_UNIFIED_SHARED_MEMORY_ATOMIC_ACCESS_INTEL`: -The device may perform atomic operations on Unified Shared Memory allocations of this type. - -* `CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ACCESS_INTEL`: -The device supports concurrent access to Unified Shared Memory allocations of this type. -Concurrent access may be from the host, or from other OpenCL devices, where applicable. - -* `CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ATOMIC_ACCESS_INTEL`: -The device supports concurrent atomic access to Unified Shared Memory allocations of this type. - -Some devices may _oversubscribe_ some shared allocations. -When and how such oversubscription occurs, including which allocations are evicted when the working set changes, are considered implementation details. - -The minimum set of capabilities are: - -.Minimum Unified Shared Memory Capabilities -[width="100%",cols="^h,^,^,^,^",options="header"] -|==== -| Allocation Type | Access | Atomic Access | Concurrent Access | Concurrent Atomic Access -| Host | Optional | Optional | Optional | Optional -| Device | Required | Optional | Optional | Optional -| Shared | Optional | Optional | Optional | Optional -| Shared (Cross-Device) | Optional | Optional | Optional | Optional -| Shared System (Cross-Device) | Optional | Optional | Optional | Optional -|==== - -==== Allocating and Freeing Unified Shared Memory - -===== Host Allocations - -The function - -[source] ----- -void* clHostMemAllocINTEL( - cl_context context, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret); ----- - -allocates host Unified Shared Memory. - -_context_ is a valid OpenCL context used to allocate the host memory. - -_properties_ is an optional list of allocation properties and their corresponding values. -The list is terminated with the special property `0`. -If no allocation properties are required, _properties_ may be `NULL`. -Please refer to the <> for valid property values and their description. - -_size_ is the size in bytes of the requested host allocation. - -_alignment_ is the minimum alignment in bytes for the requested host allocation. -It must be a power of two and must be equal to or smaller than the size of the largest data type supported by any OpenCL device in _context_. -If _alignment_ is `0`, a default alignment will be used that is equal to the size of the largest data type supported by any OpenCL device in _context_. - -_errcode_ret_ may return an appropriate error code. -If _errcode_ret_ is `NULL` then no error code will be returned. - -*clHostMemAllocINTEL* will return a valid non-`NULL` address and `CL_SUCCESS` will be returned in _errcode_ret_ if the host Unified Shared Memory is allocated successfully. -Otherwise, `NULL` will be returned, and _errcode_ret_ will be set to one of the following error values: - -* `CL_INVALID_CONTEXT` if _context_ is not a valid context. -* `CL_INVALID_OPERATION` if `CL_DEVICE_HOST_MEM_CAPABILITIES_INTEL` is zero for all devices in _context_, indicating that no devices in _context_ support host Unified Shared Memory allocations. -* `CL_INVALID_VALUE` if _alignment_ is not zero or a power of two. -* `CL_INVALID_VALUE` if _alignment_ is greater than the size of the largest data type supported by any OpenCL device in _context_ that supports host Unified Shared Memory allocations. -* `CL_INVALID_PROPERTY` if a memory property name in _properties_ is not a supported property name, if the value specified for a supported property name is not valid, or if the same property name is specified more than once. -* `CL_INVALID_BUFFER_SIZE` if _size_ is zero or greater than `CL_DEVICE_MAX_MEM_ALLOC_SIZE` for any OpenCL device in _context_ that supports host Unified Shared Memory allocations. -* `CL_OUT_OF_RESOURCES` if there is a failure to allocate resources required by the OpenCL implementation on the device. -* `CL_OUT_OF_HOST_MEMORY` if there is a failure to allocate resources required by the OpenCL implementation on the host. - - -===== Device Allocations - -The function - -[source] ----- -void* clDeviceMemAllocINTEL( - cl_context context, - cl_device_id device, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret); ----- - -allocates Unified Shared Memory specific to an OpenCL device. - -_context_ is a valid OpenCL context used to allocate the device memory. - -_device_ is a valid OpenCL device ID to associate with the allocation. - -_properties_ is an optional list of allocation properties and their corresponding values. -The list is terminated with the special property `0`. -If no allocation properties are required, _properties_ may be `NULL`. -Please refer to the <> for valid property values and their description. - -_size_ is the size in bytes of the requested device allocation. - -_alignment_ is the minimum alignment in bytes for the requested device allocation. -It must be a power of two and must be equal to or smaller than the size of the largest data type supported by _device_. -If _alignment_ is `0`, a default alignment will be used that is equal to the size of largest data type supported by _device_. - -_errcode_ret_ may return an appropriate error code. -If _errcode_ret_ is `NULL` then no error code will be returned. - -*clDeviceMemAllocINTEL* will return a valid non-`NULL` address and `CL_SUCCESS` will be returned in _errcode_ret_ if the device Unified Shared Memory is allocated successfully. -Otherwise, `NULL` will be returned, and _errcode_ret_ will be set to one of the following error values: - -* `CL_INVALID_CONTEXT` if _context_ is not a valid context. -* `CL_INVALID_DEVICE` if _device_ is not a valid device or is not associated with _context_. -* `CL_INVALID_OPERATION` if `CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL` is zero for _device_, indicating that _device_ does not support device Unified Shared Memory allocations. -* `CL_INVALID_VALUE` if _alignment_ is not zero or a power of two. -* `CL_INVALID_VALUE` if _alignment_ is greater than the size of the largest data type supported by _device_. -* `CL_INVALID_PROPERTY` if a memory property name in _properties_ is not a supported property name, if the value specified for a supported property name is not valid, or if the same property name is specified more than once. -* `CL_INVALID_BUFFER_SIZE` if _size_ is zero or greater than `CL_DEVICE_MAX_MEM_ALLOC_SIZE` for _device_. -* `CL_OUT_OF_RESOURCES` if there is a failure to allocate resources required by the OpenCL implementation on the device. -* `CL_OUT_OF_HOST_MEMORY` if there is a failure to allocate resources required by the OpenCL implementation on the host. - -===== Shared Allocations - -The function - -[source] ----- -void* clSharedMemAllocINTEL( - cl_context context, - cl_device_id device, - const cl_mem_properties_intel* properties, - size_t size, - cl_uint alignment, - cl_int* errcode_ret); ----- - -allocates Unified Shared Memory with shared ownership between the host and the specified OpenCL device. -If the specified OpenCL device supports cross-device access capabilities, the allocation is also accessible by other OpenCL devices in the context that have the same cross-device access capabilities. - -_context_ is a valid OpenCL context used to allocate the Unified Shared Memory. - -_device_ is an optional OpenCL device ID to associate with the allocation. -If _device_ is `NULL` then the allocation is not associated with any device. -Allocations with no associated device are accessible by the host and OpenCL devices in the context that have cross-device access capabilities. - -_properties_ is an optional list of allocation properties and their corresponding values. -The list is terminated with the special property `0`. -If no allocation properties are required, _properties_ may be `NULL`. -Please refer to the <> for valid property values and their description. - -_size_ is the size in bytes of the requested shared allocation. - -_alignment_ is the minimum alignment in bytes for the requested shared allocation. -It must be a power of two and must be equal to or smaller than the size of the largest data type supported by _device_. -If _alignment_ is `0`, a default alignment will be used that is equal to the size of largest data type supported by _device_. -If _device_ is `NULL`, _alignment_ must be a power of two equal to or smaller than the size of the largest data type supported by any OpenCL device in _context_, and the default alignment will be equal to the size of the largest data type supported by any OpenCL device in _context_. - -_errcode_ret_ may return an appropriate error code. -If _errcode_ret_ is `NULL` then no error code will be returned. - -*clSharedMemAllocINTEL* will return a valid non-`NULL` address and `CL_SUCCESS` will be returned in _errcode_ret_ if the shared Unified Shared Memory is allocated successfully. -Otherwise, `NULL` will be returned, and _errcode_ret_ will be set to one of the following error values: - -* `CL_INVALID_CONTEXT` if _context_ is not a valid context. -* `CL_INVALID_DEVICE` if _device_ is not `NULL` and is either not a valid device or is not associated with _context_. -* `CL_INVALID_OPERATION` if _device_ is not `NULL` and `CL_DEVICE_SINGLE_DEVICE_SHARED_MEM_CAPABILITIES_INTEL` and `CL_DEVICE_CROSS_DEVICE_SHARED_MEM_CAPABILITIES_INTEL` are both zero, indicating that _device_ does not support shared Unified Shared Memory allocations, or if _device_ is `NULL` and no devices in _context_ support shared Unified Shared Memory allocations. -* `CL_INVALID_VALUE` if _alignment_ is not zero or a power of two. -* `CL_INVALID_VALUE` if _device_ is not `NULL` and _alignment_ is greater than the size of the largest data type supported by _device_, or if _device_ is `NULL` and _alignment_ is greater than the size of the largest data type supported by any OpenCL device in _context_ that supports shared Unified Shared Memory allocations. -* `CL_INVALID_PROPERTY` if a memory property name in _properties_ is not a supported property name, if the value specified for a supported property name is not valid, or if the same property name is specified more than once. -* `CL_INVALID_BUFFER_SIZE` if _size_ is zero, or if _device_ is not `NULL` and _size_ is greater than `CL_DEVICE_MAX_MEM_ALLOC_SIZE` for _device_, or if _device_ is `NULL` and _size_ is greater than `CL_DEVICE_MAX_MEM_ALLOC_SIZE` for any device in _context_ that supports shared Unified Shared Memory allocations. -* `CL_OUT_OF_RESOURCES` if there is a failure to allocate resources required by the OpenCL implementation on the device. -* `CL_OUT_OF_HOST_MEMORY` if there is a failure to allocate resources required by the OpenCL implementation on the host. - -===== Freeing Allocations - -The functions - -[source] ----- -cl_int clMemFreeINTEL( - cl_context context, - void* ptr); - -cl_int clMemBlockingFreeINTEL( - cl_context context, - void* ptr); ----- - -free a Unified Shared Memory allocation. - -_context_ is a valid OpenCL context used to free the Unified Shared Memory allocation. - -_ptr_ is the Unified Shared Memory allocation to free. -It must be a value returned by *clHostMemAllocINTEL*, *clDeviceMemAllocINTEL*, or *clSharedMemAllocINTEL*, or a `NULL` pointer. -If _ptr_ is `NULL` then no action occurs. - -Note that *clMemFreeINTEL* may not wait for previously enqueued commands that may be using _ptr_ to finish before freeing _ptr_. -It is the responsibility of the application to make sure enqueued commands that use _ptr_ are complete before freeing _ptr_. -Applications should take particular care freeing memory allocations with kernels that may access memory indirectly, since a kernel with indirect memory access counts as using all memory allocations of the specified type or types. -To wait for previously enqueued commands to finish that may be using _ptr_ before freeing _ptr_, use the *clMemBlockingFreeINTEL* function instead. - -*clMemFreeINTEL* and *clMemBlockingFreeINTEL* will return `CL_SUCCESS` if the function executes successfully. -Otherwise, they will return one of the following error values: - -* `CL_INVALID_CONTEXT` if _context_ is not a valid context. -* `CL_INVALID_VALUE` if _ptr_ is not a value returned by *clHostMemAllocINTEL*, *clDeviceMemAllocINTEL*, *clSharedMemAllocINTEL*, or a `NULL` pointer. -* `CL_OUT_OF_RESOURCES` if there is a failure to allocate resources required by the OpenCL implementation on the device. -* `CL_OUT_OF_HOST_MEMORY` if there is a failure to allocate resources required by the OpenCL implementation on the host. - -===== Controlling Allocations - -The table below describes allocation properties that may be passed to control allocation behavior. - -[[cl_mem_properties_intel]] -.List of Supported `cl_mem_properties_intel` Properties -[width="100%",cols="1,1,2",options="header"] -|==== -| Property -| Property Type -| Description - -| `CL_MEM_ALLOC_FLAGS_INTEL` - | cl_mem_alloc_flags_intel - | Flags specifying allocation and usage information. - This is a bitfield type that may be set to any combination of the following values: - - `CL_MEM_ALLOC_WRITE_COMBINED_INTEL`: - Request write combined (WC) memory. - Write combined memory may improve performance in some cases, however write combined memory must be used with care since it may hurt performance in other cases or use different coherency protocols than non-write combined memory. - -|==== - -==== Unified Shared Memory Queries - -The function - -[source] ----- -cl_int clGetMemAllocInfoINTEL( - cl_context context, - const void* ptr, - cl_mem_info_intel param_name, - size_t param_value_size, - void* param_value, - size_t* param_value_size_ret); ----- - -queries information about a Unified Shared Memory allocation. - -_context_ is a valid OpenCL context to query for information about the Unified Shared Memory allocation. - -_ptr_ is a pointer into a Unified Shared Memory allocation to query. -_ptr_ need not be a value returned by *clHostMemAllocINTEL*, *clDeviceMemAllocINTEL*, or *clSharedMemAllocINTEL*, but the query may be faster if it is. - -_param_name_ specifies the information to query. -The list of supported _param_name_ values and the information returned in _param_value_ is described in the <> table. - -_param_value_ is a pointer to memory where the appropriate result being queried is returned. -If _param_value_ is `NULL`, it is ignored. - -_param_value_size_ is used to specify the size in bytes of memory pointed to by _param_value_. -This size must be greater than or equal to the size of return type as described in the <> table. -If _param_value_ is `NULL`, it is ignored. - -_param_value_size_ret_ returns the actual size in bytes of data being queried by _param_name_. -If _param_value_size_ret_ is `NULL`, it is ignored. - -*clGetMemAllocInfoINTEL* returns `CL_SUCCESS` if the function is executed successfully. -Otherwise, it will return one of the following error values: - -* `CL_INVALID_CONTEXT` if _context_ is not a valid context. -* `CL_INVALID_VALUE` if _param_name_ is not a valid Unified Shared Memory allocation query. -* `CL_INVALID_VALUE` if _param_value_ is not `NULL` and _param_value_size_ is smaller than the size of the query return type. -* `CL_OUT_OF_RESOURCES` if there is a failure to allocate resources required by the OpenCL implementation on the device. -* `CL_OUT_OF_HOST_MEMORY` if there is a failure to allocate resources required by the OpenCL implementation on the host. - -[[cl_mem_info_intel]] -.List of supported param_names by clGetMemAllocInfoINTEL -[width="100%",cols="<34%,<33%,<33%",options="header"] -|==== -| *cl_mem_info_intel* | Return type | Info. returned in _param_value_ -| `CL_MEM_ALLOC_TYPE_INTEL` - | cl_unified_shared_memory_type_intel - | Returns the type of the Unified Shared Memory allocation. - - Returns `CL_MEM_TYPE_HOST_INTEL` for allocations made by *clHostMemAllocINTEL* . - Returns `CL_MEM_TYPE_DEVICE_INTEL` for allocations made by *clDeviceMemAllocINTEL*. - Returns `CL_MEM_TYPE_SHARED_INTEL` for allocations made by *clSharedMemAllocINTEL*. - Returns `CL_MEM_TYPE_UNKNOWN_INTEL` if the type of the Unified Shared Memory allocation cannot be determined or if _ptr_ does not point into a Unified Shared Memory allocation. -| `CL_MEM_ALLOC_BASE_PTR_INTEL` - | void* - | Returns the base address of the Unified Shared Memory allocation. - - Returns `NULL` for `CL_MEM_TYPE_UNKNOWN_INTEL` allocations. -| `CL_MEM_ALLOC_SIZE_INTEL` - | size_t - | Returns the size in bytes of the Unified Shared Memory allocation. - - Returns `0` for `CL_MEM_TYPE_UNKNOWN_INTEL` allocations. -| `CL_MEM_ALLOC_DEVICE_INTEL` - | cl_device_id - | Returns the device associated with the Unified Shared Memory allocation. - - Returns `NULL` for `CL_MEM_TYPE_HOST_INTEL` allocations, for `CL_MEM_TYPE_SHARED_INTEL` allocations with no associated device, and for `CL_MEM_TYPE_UNKNOWN_INTEL` allocations. -| `CL_MEM_ALLOC_FLAGS_INTEL` - | cl_mem_alloc_flags_intel - | Returns allocation flags for the Unified Shared Memory allocation. - - Returns `0` if no allocation flags were specified for the Unified Shared Memory allocation and for `CL_MEM_TYPE_UNKNOWN_INTEL` allocations. - -|==== - -==== Using Unified Shared Memory with Kernels - -The function - -[source] ----- -cl_int clSetKernelArgMemPointerINTEL( - cl_kernel kernel, - cl_uint arg_index, - const void* arg_value); ----- - -is used to set a pointer into a Unified Shared Memory allocation as an argument to a kernel. - -_kernel_ is a valid kernel object. - -_arg_index_ is the argument index to set. -Arguments to the kernel are referred to by indices that go from 0 for the leftmost argument to _n_ - 1, where _n_ is the total number of arguments declared by a kernel. - -_arg_value_ is the pointer value that should be used as the argument specified by _arg_index_. -The pointer value will be used as the argument by all API calls that enqueue a kernel until the argument value is set to a different pointer value by a subsequent call. -A pointer into Unified Shared Memory allocation may only be set as an argument value for an argument declared to be a pointer to `global` or `constant` memory. -For devices supporting shared system allocations, any pointer value is valid. -Otherwise, the pointer value must be `NULL` or must point into a Unified Shared Memory allocation returned by *clHostMemAllocINTEL*, *clDeviceMemAllocINTEL*, or *clSharedMemAllocINTEL*. - -*clSetKernelArgMemPointerINTEL* returns `CL_SUCCESS` if the function is executed successfully. -Otherwise, it will return one of the following errors: - -* `CL_INVALID_KERNEL` if _kernel_ is not a valid kernel object. -* `CL_INVALID_ARG_INDEX` if _arg_index_ is not a valid argument index. -* `CL_INVALID_ARG_VALUE` if _arg_value_ is not a valid argument value. -* `CL_OUT_OF_RESOURCES` if there is a failure to allocate resources required by the OpenCL implementation on the device. -* `CL_OUT_OF_HOST_MEMORY` if there is a failure to allocate resources required by the OpenCL implementation on the host. - -In addition to direct use of a Unified Shared Memory allocation as a kernel argument, Unified Shared Memory allocations may be accessed by kernels indirectly. -The new _param_name_ values described below may be used with the existing *clSetKernelExecInfo* function to describe how Unified Shared Memory allocations are accessed indirectly by a kernel: - -[caption="Table 28. "] -.List of supported param_names by clSetKernelExecInfo -[width="100%",cols="<34%,<33%,<33%",options="header"] -|==== -| *cl_kernel_exec_info* | Type | Description -| `CL_KERNEL_EXEC_INFO_{zwsp}USM_PTRS_INTEL` - | void*[] - | Specifies an explicit set of Unified Shared Memory allocations accessed indirectly by the kernel. - The new set replaces any previously specified set of Unified Shared Memory allocations. - - Initially, the set of Unified Shared Memory allocations accessed indirectly by the kernel is the empty set. -| `CL_KERNEL_EXEC_INFO_{zwsp}INDIRECT_HOST_ACCESS_INTEL` - | cl_bool - | Specifies that the kernel may access any host Unified Shared Memory allocation indirectly. - - By default, the value for this flag is `CL_FALSE`, indicating that the kernel will only access explicitly specified host Unified Shared Memory allocations. -| `CL_KERNEL_EXEC_INFO_{zwsp}INDIRECT_DEVICE_ACCESS_INTEL` - | cl_bool - | Specifies that the kernel may access any device Unified Shared Memory allocation indirectly. - - By default, the value for this flag is `CL_FALSE`, indicating that the kernel will only access explicitly specified device Unified Shared Memory allocations. -| `CL_KERNEL_EXEC_INFO_{zwsp}INDIRECT_SHARED_ACCESS_INTEL` - | cl_bool - | Specifies that the kernel may access any shared Unified Shared Memory allocation indirectly. - - By default, the value for this flag is `CL_FALSE`, indicating that the kernel will only access explicitly specified shared Unified Shared Memory allocations. - -|==== - -==== Filling and Copying Unified Shared Memory - -The function - -[source] ----- -cl_int clEnqueueMemFillINTEL( - cl_command_queue command_queue, - void* dst_ptr, - const void* pattern, - size_t pattern_size, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); ----- - -fills a region of a memory with the specified pattern. - -_command_queue_ is a valid host command queue. -The memory fill command will be queued for execution on the device associated with _command_queue_. - -_dst_ptr_ is a pointer to the start of the memory region to fill. -The Unified Shared Memory allocation pointed to by _dst_ptr_ must be valid for the context associated with _command_queue_, must be accessible by the device associated with _command_queue_, and must be aligned to _pattern_size_ bytes. - -_pattern_ is a pointer to the value to write to the Unified Shared Memory region. -The memory associated with _pattern_ can be reused or freed after the function returns. - -_pattern_size_ describes the size of of the value to write to the Unified Shared Memory region, in bytes. -_pattern_size_ must be a power of two and must be less than or equal to the size of the largest integer or floating-point vector data type supported by the device. - -_size_ describes the size of the memory region to set, in bytes. -_size_ must be a multiple of _pattern_size_. - -_event_wait_list_ and _num_events_in_wait_list_ specify events that need to complete before this command can be executed. -If _event_wait_list_ is `NULL`, then this command does not wait on any event to complete. -If _event_wait_list_ is `NULL`, _num_events_in_wait_list_ must be 0. -If _event_wait_list_ is not `NULL`, the list of events pointed to by _event_wait_list_ must be valid and _num_events_in_wait_list_ must be greater than 0. -The events specified in _event_wait_list_ act as synchronization points. -The context associated with events in _event_wait_list_ and _command_queue_ must be the same. -The memory associated with _event_wait_list_ can be reused or freed after the function returns. - -_event_ returns a unique event object that identifies this command. -If _event_ is `NULL`, no event will be created and therefore it will not be possible to query or wait for this command. -If the _event_wait_list_ and the _event_ arguments are not `NULL`, the _event_ argument must not refer to an element of the _event_wait_list_ array. - -*clEnqueueMemFillINTEL* returns CL_SUCCESS if the command is queued successfully. -Otherwise, it will return one of the following errors: - -* `CL_INVALID_COMMAND_QUEUE` if _command_queue_ is not a valid host command-queue. -* `CL_INVALID_CONTEXT` if the context associated with _command_queue_ and events in _event_wait_list_ are not the same. -* `CL_INVALID_VALUE` if _dst_ptr_ is `NULL`, or if _dst_ptr_ is not aligned to _pattern_size_ bytes. -* `CL_INVALID_VALUE` if _pattern_ is `NULL`. -* `CL_INVALID_VALUE` if _pattern_size_ is not a power of two or is greater than the size of the largest integer or floating-point vector data type supported by the device associated with _command_queue_. -* `CL_INVALID_VALUE` if _size_ is not a multiple of _pattern_size_. -* `CL_INVALID_EVENT_WAIT_LIST` if _event_wait_list_ is `NULL` and _num_events_in_wait_list_ is greater than zero, or if _event_wait_list_ is not `NULL` and _num_events_in_wait_list_ is zero, or if event objects in _event_wait_list_ are not valid events. -* `CL_OUT_OF_RESOURCES` if there is a failure to allocate resources required by the OpenCL implementation on the device. -* `CL_OUT_OF_HOST_MEMORY` if there is a failure to allocate resources required by the OpenCL implementation on the host. - -The function - -[source] ----- -cl_int clEnqueueMemcpyINTEL( - cl_command_queue command_queue, - cl_bool blocking, - void* dst_ptr, - const void* src_ptr, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); ----- - -copies a region of memory from one location to another. - -_command_queue_ is a valid host command queue. -The memory copy command will be queued for execution on the device associated with _command_queue_. - -_blocking_ indicates if the copy operation is blocking or non-blocking. -If _blocking_ is `CL_TRUE`, the copy command is blocking, and the function will not return until the copy command is complete. -Otherwise, if _blocking_ is `CL_FALSE`, the copy command is non-blocking, and the contents of the _dst_ptr_ cannot be used nor can the contents of the _src_ptr_ be overwritten until the copy command is complete. - -_dst_ptr_ is a pointer to the start of the memory region to copy to. -If _dst_ptr_ is a pointer into a Unified Shared Memory allocation it must be valid for the context associated with _command_queue_. - -_src_ptr_ is a pointer to the start of the memory region to copy from. -If _src_ptr_ is a pointer into a Unified Shared Memory allocation it must be valid for the context associated with _command_queue_. - -_size_ describes the size of the memory region to copy, in bytes. - -_event_wait_list_ and _num_events_in_wait_list_ specify events that need to complete before this command can be executed. -If _event_wait_list_ is `NULL`, then this command does not wait on any event to complete. -If _event_wait_list_ is `NULL`, _num_events_in_wait_list_ must be 0. -If _event_wait_list_ is not `NULL`, the list of events pointed to by _event_wait_list_ must be valid and _num_events_in_wait_list_ must be greater than 0. -The events specified in _event_wait_list_ act as synchronization points. -The context associated with events in _event_wait_list_ and _command_queue_ must be the same. -The memory associated with _event_wait_list_ can be reused or freed after the function returns. - -_event_ returns a unique event object that identifies this command. -If _event_ is `NULL`, no event will be created and therefore it will not be possible to query or wait for this command. -If the _event_wait_list_ and the _event_ arguments are not `NULL`, the _event_ argument must not refer to an element of the _event_wait_list_ array. - -*clEnqueueMemcpyINTEL* returns CL_SUCCESS if the command is queued successfully. -Otherwise, it will return one of the following errors: - -* `CL_INVALID_COMMAND_QUEUE` if _command_queue_ is not a valid host command-queue. -* `CL_INVALID_CONTEXT` if the context associated with _command_queue_ and events in _event_wait_list_ are not the same. -* `CL_INVALID_VALUE` if either _dst_ptr_ or _src_ptr_ are `NULL`. -* `CL_INVALID_EVENT_WAIT_LIST` if _event_wait_list_ is `NULL` and _num_events_in_wait_list_ is greater than zero, or if _event_wait_list_ is not `NULL` and _num_events_in_wait_list_ is zero, or if event objects in _event_wait_list_ are not valid events. -* `CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST` if the copy operation is blocking and the execution status of any of the events in _event_wait_list_ is a negative integer value. -* `CL_MEM_COPY_OVERLAP` if the values specified for _dst_ptr_, _src_ptr_ and _size_ result in an overlapping copy. -* `CL_OUT_OF_RESOURCES` if there is a failure to allocate resources required by the OpenCL implementation on the device. -* `CL_OUT_OF_HOST_MEMORY` if there is a failure to allocate resources required by the OpenCL implementation on the host. - -==== Unified Shared Memory Hints - -The function - -[source] ----- -cl_int clEnqueueMigrateMemINTEL( - cl_command_queue command_queue, - const void* ptr, - size_t size, - cl_mem_migration_flags_intel flags, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); ----- - -explicitly migrates a region of a shared Unified Shared Memory allocation to the device associated with _command_queue_. -This is a hint that may improve performance and is not required for correctness. -Memory migration may not be supported for all allocation types for all devices. -If memory migration is not supported for the specified memory range then the migration hint may be ignored. -Memory migration may only be supported at a device-specific granularity, such as a page boundary. -In this case, the memory range may be expanded such that the start and end of the range satisfy the granularity requirements. - -_command_queue_ is a valid host command queue. -The memory migration command will be queued for execution on the device associated with _command_queue_. - -_ptr_ is a pointer to the start of the shared Unified Shared Memory allocation to migrate. - -_size_ describes the size of the memory region to migrate. - -_flags_ is a bit-field that is used to specify memory migration options. - -_event_wait_list_ and _num_events_in_wait_list_ specify events that need to complete before this command can be executed. -If _event_wait_list_ is `NULL`, then this command does not wait on any event to complete. -If _event_wait_list_ is `NULL`, _num_events_in_wait_list_ must be 0. -If _event_wait_list_ is not `NULL`, the list of events pointed to by _event_wait_list_ must be valid and _num_events_in_wait_list_ must be greater than 0. -The events specified in _event_wait_list_ act as synchronization points. -The context associated with events in _event_wait_list_ and _command_queue_ must be the same. -The memory associated with _event_wait_list_ can be reused or freed after the function returns. - -_event_ returns a unique event object that identifies this command. -If _event_ is `NULL`, no event will be created and therefore it will not be possible to query or wait for this command. -If the _event_wait_list_ and the _event_ arguments are not `NULL`, the _event_ argument must not refer to an element of the _event_wait_list_ array. - -*clEnqueueMigrateMemINTEL* returns CL_SUCCESS if the command is queued successfully. -Otherwise, it will return one of the following errors: - -* `CL_INVALID_COMMAND_QUEUE` if _command_queue_ is not a valid host command-queue. -* `CL_INVALID_CONTEXT` if the context associated with _command_queue_ and events in _event_wait_list_ are not the same. -* `CL_INVALID_VALUE` **TODO**, are any values of _ptr_ and _size_ considered invalid? -* `CL_INVALID_VALUE` if _flags_ is zero or is not a supported combination of memory migration flags. -* `CL_INVALID_EVENT_WAIT_LIST` if _event_wait_list_ is `NULL` and _num_events_in_wait_list_ is greater than zero, or if _event_wait_list_ is not `NULL` and _num_events_in_wait_list_ is zero, or if event objects in _event_wait_list_ are not valid events. -* `CL_OUT_OF_RESOURCES` if there is a failure to allocate resources required by the OpenCL implementation on the device. -* `CL_OUT_OF_HOST_MEMORY` if there is a failure to allocate resources required by the OpenCL implementation on the host. - -The function - -[source] ----- -cl_int clEnqueueMemAdviseINTEL( - cl_command_queue command_queue, - const void* ptr, - size_t size, - cl_mem_advice_intel advice, - cl_uint num_events_in_wait_list, - const cl_event* event_wait_list, - cl_event* event); ----- - -provides advice about a region of a shared Unified Shared Memory allocation. -Memory advice is a performance hint only and is not required for correctness. -Providing memory advice hints may override driver heuristics that control shared memory behavior. -Not all memory advice hints may be supported for all allocation types for all devices. -If a memory advice hint is not supported by the device it will be ignored. -Memory advice hints may only be supported at a device-specific granularity, such as at a page boundary. -In this case, the memory range may be expanded such that the start and end of the range satisfy the granularity requirements. - -_command_queue_ is a valid host command queue. -The memory advice hints will be queued for the device associated with _command_queue_. - -_ptr_ is a pointer to the start of the shared Unified Shared Memory allocation. - -_size_ describes the size of the memory region. - -_advice_ is a bit-field describing the memory advice hints for the region. - -_event_wait_list_ and _num_events_in_wait_list_ specify events that need to complete before this command can be executed. -If _event_wait_list_ is `NULL`, then this command does not wait on any event to complete. -If _event_wait_list_ is `NULL`, _num_events_in_wait_list_ must be 0. -If _event_wait_list_ is not `NULL`, the list of events pointed to by _event_wait_list_ must be valid and _num_events_in_wait_list_ must be greater than 0. -The events specified in _event_wait_list_ act as synchronization points. -The context associated with events in _event_wait_list_ and _command_queue_ must be the same. -The memory associated with _event_wait_list_ can be reused or freed after the function returns. - -_event_ returns a unique event object that identifies this command. -If _event_ is `NULL`, no event will be created and therefore it will not be possible to query or wait for this command. -If the _event_wait_list_ and the _event_ arguments are not `NULL`, the _event_ argument must not refer to an element of the _event_wait_list_ array. - -*clEnqueueMemAdviseINTEL* returns CL_SUCCESS if the command is queued successfully. -Otherwise, it will return one of the following errors: - -* `CL_INVALID_COMMAND_QUEUE` if _command_queue_ is not a valid host command-queue. -* `CL_INVALID_CONTEXT` if the context associated with _command_queue_ and events in _event_wait_list_ are not the same. -* `CL_INVALID_VALUE` **TODO**, are any values of _ptr_ and _size_ considered invalid? -* `CL_INVALID_VALUE` if _advice_ is not supported advice for the device associated with _command_queue_. -* `CL_INVALID_EVENT_WAIT_LIST` if _event_wait_list_ is `NULL` and _num_events_in_wait_list_ is greater than zero, or if _event_wait_list_ is not `NULL` and _num_events_in_wait_list_ is zero, or if event objects in _event_wait_list_ are not valid events. -* `CL_OUT_OF_RESOURCES` if there is a failure to allocate resources required by the OpenCL implementation on the device. -* `CL_OUT_OF_HOST_MEMORY` if there is a failure to allocate resources required by the OpenCL implementation on the host. - -== Issues - -. Is there a minimum supported granularity for concurrent access? For example, might it be possible to concurrently access different pages of an allocation, but not different bytes within the same page? -+ --- -*UNRESOLVED*: --- - -. What other Unified Shared Memory allocation properties should we support? -+ --- -*UNRESOLVED*: -The proposed Unified Shared Memory allocation APIs accept `cl_mem_alloc_flags_intel`. -We could also accept (some? all?) `cl_mem_flags`, for example. --- - -. Do we need separate "concurrent access" capabilities for host access vs. device access? -+ --- -*UNRESOLVED*: -We don't differentiate right now, but we could differentiate between concurrent host access vs. concurrent access from another device. --- - -. What would we need to add to support system allocations? -+ --- -`RESOLVED`: -Added `CL_DEVICE_SHARED_SYSTEM_MEM_CAPABILITIES_INTEL`. --- - -. Do we need the ability to "register" or "use" an existing host allocations? -+ --- -*UNRESOLVED*: -Currently, only the ability to "allocate" host memory is supported. -If we did support this then there may be alignment and size granularity requirements for "registering" a host allocation. --- - -. Do we want to support both a _flags_ argument and a _properties_ argument to the USM allocation APIs? -+ --- -`RESOLVED`: -The _flags_ argument was folded into the _properties_ in revision C. --- - -. What should behavior be for `clGetMemAllocInfoINTEL` if the passed-in _ptr_ is `NULL` or doesn't point into a USM allocation? -+ --- -`RESOLVED`: -The behavior was defined for all queries for this case in revision G. --- - -. Do we want separate "memset" APIs to set to different sized "value", such as 8-bits, 16-bits?, 32-bits, or others? Do we want to go back to a "fill" API? -+ --- -`RESOLVED`: -Switched to a "fill" API in revision I. - -Discussion: The host "memset" only sets to an 8-bit value. -Switching back to a "fill" API is very flexible, but perhaps overkill, since it supports any supported integer or floating-point scalar or vector type. --- - -. What are the restrictions for the _dst_ptr_ values that can be passed to the "fill" API? -+ --- -*UNRESOLVED*: -Need to close on: - -* Can a device "fill" another device's allocation? (Recommendation: Yes, if accessible.) -* Can a device "fill" arbitrary host memory? (Recommendation: Maybe?) -* Can a device "fill" a USM allocation from another context? (Recommendation: No.) --- - -. What are the restrictions for the _src_ptr_ and _dst_ptr_ values that can be passed to the "memcpy" API? -+ --- -*UNRESOLVED*: -Need to close on: - -* Can a device "memcpy" from another device's allocation? -* Can a device "memcpy" to another device's allocation? -* Can a device "memcpy" to or from a USM allocation in another context? (Recommendation: No?) -* Can a device "memcpy" to arbitrary host memory? (Recommendation: Yes.) -* Can a device "memcpy" from arbitrary host memory? (Recommendation: Yes.) -* Can a device "memcpy" from arbitrary host memory to arbitrary host memory? (Recommendation: Yes.) -* Can the memory region to copy to overlap the memory region to copy from? (Recommendation: No?) --- - -. Do we want to support migrating to devices other than the device associated with _command_queue_? -+ --- -*UNRESOLVED*: -We could add an explicit _dst_device_ argument if desired, which could be `NULL` when migrating to the device associated with the _command_queue_. -We could also add a mechanism to allow migrating to the host. --- - -. Should *clEnqueueMigrateMemINTEL* support migrating an array of pointers with one API call, similar to *clEnqueueSVMMigrateMem*? -+ --- -*UNRESOLVED*: -This depends how frequently the migrate APIs are called. --- - -. Could the _device_ argument to *clSharedMemAllocINTEL* be `NULL` if there is no need to associate the shared allocation to a specific device? -+ --- -`RESOLVED`: -Yes, this case is documented in revision G. --- - -. Should we allow querying the associated device for a USM allocation using *clGetMemAllocInfoINTEL*? -+ --- -`RESOLVED`: -This query was added in revision G. --- - -. Should we add explicit mem alloc flags for `CACHED` and `UNCACHED`? -+ --- -*UNRESOLVED*: --- - -. At least for HOST and SHARED allocations, should have separate mem alloc flags for the host and the device? -+ --- -*UNRESOLVED*: --- - -. What are invalid values for `ptr` and `size` for *clEnqueueMigrateMemINTEL* and *clEnqueueMemAdviseINTEL*? -How about *clEnqueueMemFillINTEL* and *clEnqueueMemcpyINTEL*? -Specifically, is `NULL` a valid value for `ptr`? -Is `size` equal to zero valid? -+ --- -*UNRESOLVED*: --- - -. Should we add a device query for a maximum supported USM alignment, or should the maximum supported alignment implicitly be defined by the size of the largest data type supported by the device? -+ --- -*UNRESOLVED*: -A device query would allow for larger supported alignments, such as page alignment. -Note that supported alignments should always be a power of two. --- - -. Should we add a device query for a maximum supported USM fill pattern size, or should the maximum supported fill pattern size implicitly be defined by the size of the largest data type supported by the device? -+ --- -*UNRESOLVED*: -A device query would allow for larger fill patterns. -Note that the fill pattern size should always be a power of two. --- - -. Can a pointer to a device, host, or shared USM allocation be used to create a `cl_mem` using `CL_MEM_USE_HOST_PTR`? -+ --- -*UNRESOLVED*: -Trending "no" in all cases. -If the USM allocation is from the same context this could be an error, such as `CL_INVALID_HOST_PTR`. -If the USM allocation is from a different context then behavior could be undefined. --- - -. Can a pointer to a device, host, or shared USM allocation be used to create a `cl_mem` buffer using `CL_MEM_COPY_HOST_PTR`? -+ --- -*UNRESOLVED*: -Trending "no" for device and shared USM allocations. -If the USM allocation is from the same context this could be an error, such as `CL_INVALID_HOST_PTR`. -If the USM allocation is from a different context then behavior could be undefined. - -Trending "yes" for host USM allocations, both when the host USM allocation is from this context and from another context. --- - -. Can a pointer to a device, host, or shared USM allocation be passed to API functions to read from or write to `cl_mem` objects, such as `clEnqueueReadBuffer` or `clEnqueueWriteImage`? -+ --- -*UNRESOLVED*: -Trending "yes" for device USM allocations, so long as the device USM allocation is accessible by the device associated with the command queue, and the device allocation was made against the context associated with the command queue. - -Trending "yes" for host USM allocations, both when the host USM allocation is from this context and from another context. - -Trending "no" for shared USM allocations. -If the shared USM allocation is from the same context this could be an error, such as `CL_INVALID_HOST_PTR`. -If the shared USM allocation is from a different context then behavior could be undefined. --- - -. Can a pointer to a device, host, or shared USM allocation be passed to API functions to fill a `cl_mem`, SVM allocation, or USM allocation, such as `clEnqueueFillBuffer`? -+ --- -*UNRESOLVED*: -Trending "no" for device and shared allocations. -If the USM allocation is from the same context this could be an error, such as `CL_INVALID_HOST_PTR`. -If the USM allocation is from a different context then behavior could be undefined. - -Trending "yes" for host USM allocations, both when the host USM allocation is from this context and from another context. --- - -. Should we support passing traditional `cl_mem_flags` via the USM allocation properties? -+ --- -*UNRESOLVED*: -Trending "yes", by allowing `CL_MEM_FLAGS` as a property and `cl_mem_flags` as the property value. - -Note that some flags will not be valid, such as `CL_MEM_USE_HOST_PTR`. --- - -== Revision History - -[cols="5,15,15,70"] -[grid="rows"] -[options="header"] -|======================================== -|Rev|Date|Author|Changes -|A|2019-01-18|Ben Ashbaugh|*Initial revision* -|B|2019-03-25|Ben Ashbaugh|Minor name changes. -|C|2019-06-18|Ben Ashbaugh|Moved flags argument into properties. -|D|2019-07-19|Ben Ashbaugh|Editorial fixes. -|E|2019-07-22|Ben Ashbaugh|Allocation properties should be const. -|F|2019-07-26|Ben Ashbaugh|Removed DEFAULT mem alloc flag. -|G|2019-08-23|Ben Ashbaugh|Added mem alloc query for associated device. -|H|2019-10-11|Ben Ashbaugh|Added initial list and description of error codes. -|I|2019-11-14|Ben Ashbaugh|Switched from a memset to a memfill API. -|J|2019-11-18|Ben Ashbaugh|Updated a few more error conditions. -|K|2019-12-18|Krzysztof Gibala|Updated write combine description. -|L|2020-01-15|Ben Ashbaugh|Added invalid arg case to setkernelarg API. -|M|2020-01-17|Ben Ashbaugh|Minor name changes, removed const from memfree API. -|N|2020-01-22|Ben Ashbaugh|Updated write combine description. -|O|2020-01-23|Ben Ashbaugh|Added aliases for USM migration flags. -|P|2020-02-28|Ben Ashbaugh|Added blocking memfree API. -|Q|2020-03-12|Ben Ashbaugh|Name tweak for blocking memfree API, added comparison to SVM, allow zero memory advice. -|R|2020-08-21|Ben Ashbaugh|Fixed enum name typo in table. -|======================================== - -//************************************************************************ -//Other formatting suggestions: -// -//* Use *bold* text for host APIs, or [source] syntax highlighting. -//* Use `mono` text for device APIs, or [source] syntax highlighting. -//* Use `mono` text for extension names, types, or enum values. -//* Use _italics_ for parameters. -//************************************************************************ diff --git a/sycl/doc/extensions/UnnamedKernelLambda/README.md b/sycl/doc/extensions/UnnamedKernelLambda/README.md deleted file mode 100644 index fbc465aa1ff9f..0000000000000 --- a/sycl/doc/extensions/UnnamedKernelLambda/README.md +++ /dev/null @@ -1,4 +0,0 @@ -# SYCL_INTEL_unnamed_kernel_lambda - -Make kernel name class for lambda-defined kernels optional. Simplifies invocation interface. - diff --git a/sycl/doc/extensions/UnnamedKernelLambda/SYCL_INTEL_unnamed_kernel_lambda.asciidoc b/sycl/doc/extensions/UnnamedKernelLambda/SYCL_INTEL_unnamed_kernel_lambda.asciidoc deleted file mode 100755 index d9df6e5eaf88b..0000000000000 --- a/sycl/doc/extensions/UnnamedKernelLambda/SYCL_INTEL_unnamed_kernel_lambda.asciidoc +++ /dev/null @@ -1,184 +0,0 @@ -= SYCL_INTEL_unnamed_kernel_lambda -:source-highlighter: coderay -:coderay-linenums-mode: table - -// This section needs to be after the document title. -:doctype: book -:toc2: -:toc: left -:encoding: utf-8 -:lang: en - -:blank: pass:[ +] - -// Set the default source code type in this document to C++, -// for syntax highlighting purposes. This is needed because -// docbook uses c++ and html5 uses cpp. -:language: {basebackend@docbook:c++:cpp} - -== Introduction -IMPORTANT: This specification is a draft. - -NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by permission by Khronos. - -NOTE: This document is better viewed when rendered as html with asciidoctor. GitHub does not render image icons. - -This document describes an extension that makes it optional, instead of required, to name (through template parameter of invocation function) kernels that are defined as lambdas. - - -== Name Strings - -+SYCL_INTEL_unnamed_kernel_lambda+ - -== Notice - -Copyright (c) 2019 Intel Corporation. All rights reserved. - -== Status - -Working Draft - -This is a preview extension specification, intended to provide early access to a feature for review and community feedback. When the feature matures, this specification may be released as a formal extension. - -Because the interfaces defined by this specification are not final and are subject to change they are not intended to be used by shipping software products. - -== Version - -Built On: {docdate} + -Revision: 1 - -== Contact -Michael Kinsner, Intel (michael 'dot' kinsner 'at' intel 'dot' com) - -== Dependencies - -This extension is written against the SYCL 1.2.1 specification, Revision v1.2.1-5. - -== Overview - -SYCL allows device kernels to be defined in a variety of ways, one of which is as a lambda. Separate compilation means that the host-side invocation of a kernel lambda must be matched to the device-compiled variant of the lambda from a potentially different compilation. Lambda internal compiler naming is an implementation detail and not required to match across different compilers. Differences in macro expansion and other details between host and device compilations can further lead to divergence of compiled lambda name, for the same lambda in the original source code. - -SYCL 1.2.1 requires that a kernel name be specified as a template parameter of the invocation function when a kernel has been defined as a lambda. This kernel name enables correlation of the lambda name between multiple compilations. - -The extension described by this document makes naming of lambdas at invocation time optional in SYCL 1.2.1. Launch of a kernel is simplified from, for example: - -[source,c++,LambdaNamed,linenums] ----- -... -cgh.parallel_for(range<1>{ 1024 }, [=](id<1> idx) { - writeResult[idx] = idx[0]; -}); -... ----- - -To: - -[source,c++,NoName,linenums] ----- -... -cgh.parallel_for(range<1>{ 1024 }, [=](id<1> idx) { - writeResult[idx] = idx[0]; -}); -... ----- - -In this example, `` is optional and does not need to be included. - -== Enabling the extension - -In the Intel prototype implementation of this extension at https://github.com/intel/llvm[GitHub link], the extension is enabled through the compiler flag `-fsycl-unnamed-lambda`. When enabled, the extension defines the macro `$$__SYCL_INTEL_UNNAMED_LAMBDA__$$`. - -== Modifications of SYCL 1.2.1 specification - -=== Modify paragraph in Section 3.2 - -*Change from:* - -The `parallel_for` function is templated with a class, in this case called `class simple_test`. This class is used only as a name to enable the kernel (compiled with a device compiler) and the host code (possibly compiled with a different host compiler) to be linked. This is required because C++ lambda functions have no name that a linker could use to link the kernel to the host code. - -*To:* - -The `parallel_for` function is templated with a class, in this case called `class simple_test`. This class is used to manually name the kernel when desired, such as to avoid a compiler-generated name when debugging a kernel defined through a lambda. - -=== Remove sentences from Section 3.4.1.1 - -[line-through]#In SYCL, all kernels must have a kernel name, which must be a globally-accessible {cpp} type name. This is required to enable kernels compiled with one compiler to be linked to host code compiled with a different compiler.# - -[line-through]#For named function objects, the type name of the function object is sufficient as the kernel name, but for {cpp}11 lambda functions, the user must provide a user-defined type name as the kernel name.# - -=== Modify part of Section 3.9.2 - -For a lambda function there is no globally-visible name, so the user may optionally provide one for debugging or code style reasons. In SYCL, this optional name is provided as a template parameter to the kernel invocation function, such as `` in `parallel_for(...` - -=== Modify part of Section 4.8.5 - -*Change from:* - -Each function takes a kernel name template parameter. The kernel name must be a datatype that is unique for each kernel invocation. If a kernel is a named function object, and its type is globally visible, then the kernel's function object type will be automatically used as the kernel name and so the user does not need to supply a name. If the kernel function is a {cpp}11 lambda function, then the user must manually provide a kernel name to enable linking between host and device code to occur. - -*To:* - -Each function takes an optional kernel name template parameter. The user may optionally provide a kernel name, otherwise an implementation defined name will be generated for the kernel. - - -=== Modify Table 4.78 - -*Replace each instance of:* - -If it is a named function object and the function object type is globally visible there is no need for the developer to provide a kernel name (`typename KernelName`) for it, as described in 4.8.5. - -*With:* - -Specification of a kernel name (`typename KernelName`), as described in 4.8.5, is optional. - -=== Modify part of Section 4.8.9.2 - -*Change from:* - -We allow lambda functions to define kernels in SYCL, but we have an extra requirement to name lambda functions in order to enable the linking of the SYCL device kernels with the host code to invoke them. The name of a lambda function in SYCL is a {cpp} class. If the lambda function relies on template arguments, then the name of the lambda function must contain those template arguments. The class used for the name of a lambda function is only used for naming purposes and is not required to be defined. For details on restrictions for kernel naming, please refer to 6.2. - -To invoke a {cpp}11 lambda, the kernel name must be included explicitly by the user as a template parameter to the kernel invoke function. - -*To:* - -Kernels may be defined as lambda functions in SYCL. The name of a lambda function in SYCL may optionally be specified by passing it as a template parameter to the invoking method, and in that case, the lambda name is a {cpp} class. If the lambda function relies on template arguments, then if specified, the name of the lambda function must contain those template arguments. The class used for the name of a lambda function is only used for naming purposes and is not required to be defined. For details on restrictions for kernel naming, please refer to 6.2. - - -=== Modify part of Section 6.2 - -Lambdas do not have a globally-visible or unique name. A typename can optionally be provided to the kernel invoking interface, as described in 4.8.5, so that the developer can control the kernel name for purposes such as debugging. - - -== Proof of concept implementation references - -. https://github.com/intel/llvm/pull/387/files -. https://github.com/intel/llvm/pull/250/files - -== Issues - -None. - -//. asd -//+ -//-- -//*RESOLUTION*: Not resolved. -//-- - -== Revision History - -[cols="5,15,15,70"] -[grid="rows"] -[options="header"] -|======================================== -|Rev|Date|Author|Changes -|1|2019-11-11|Michael Kinsner|*Initial public working draft* -|======================================== - -//************************************************************************ -//Other formatting suggestions: -// -//* Use *bold* text for host APIs, or [source] syntax highlighting. -//* Use +mono+ text for device APIs, or [source] syntax highlighting. -//* Use +mono+ text for extension names, types, or enum values. -//* Use _italics_ for parameters. -//************************************************************************ diff --git a/sycl/doc/extensions/deduction_guides/SYCL_INTEL_deduction_guides.asciidoc b/sycl/doc/extensions/deduction_guides/SYCL_INTEL_deduction_guides.asciidoc deleted file mode 100755 index 56f419e78eaf3..0000000000000 --- a/sycl/doc/extensions/deduction_guides/SYCL_INTEL_deduction_guides.asciidoc +++ /dev/null @@ -1,185 +0,0 @@ -= SYCL_INTEL_deduction_guides -:source-highlighter: coderay -:coderay-linenums-mode: table - -// This section needs to be after the document title. -:doctype: book -:toc2: -:toc: left -:encoding: utf-8 -:lang: en - -:blank: pass:[ +] - -// Set the default source code type in this document to C++, -// for syntax highlighting purposes. This is needed because -// docbook uses c++ and html5 uses cpp. -:language: {basebackend@docbook:c++:cpp} - -== Introduction -IMPORTANT: This specification is a draft. - -NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by permission by Khronos. - -NOTE: This document is better viewed when rendered as html with asciidoctor. GitHub does not render image icons. - -This document describes an extension that adds deduction guides for {cpp}17 compilers supporting class template argument deduction. - - -== Name Strings - -+SYCL_INTEL_deduction_guides+ - -== Notice - -Copyright (c) 2019 Intel Corporation. All rights reserved. - -== Status - -Working Draft - -This is a preview extension specification, intended to provide early access to a feature for review and community feedback. When the feature matures, this specification may be released as a formal extension. - -Because the interfaces defined by this specification are not final and are subject to change they are not intended to be used by shipping software products. - -== Version - -Built On: {docdate} + -Revision: 1 - -== Contact -Roland Schulz, Intel (roland 'dot' schulz 'at' intel 'dot' com) - -== Dependencies - -This extension is written against the SYCL 1.2.1 specification, Revision v1.2.1-5. - -== Overview - -SYCL 1.2.1 is based on {cpp}11. {cpp}17 adds **class template argument deduction** (CTAD). This extension adds {cpp}17 deduction guides for users which use SYCL with a {cpp}17 compiler. It simplifies usage of, e.g.: - -[source,c++,UsageFrom,linenums] -buffer b(ptr, range<2>(5, 5)); - -to: - -[source,c++,UsageTo,linenums] -buffer b(ptr, range(5, 5)); - -This extension only specifies explicit deduction guides. For many classes CTAD works implicitly without requiring deduction guides. A {cpp}17 compiler is required to support CTAD for all types according to the {cpp}17 standard. An example for a type without deduction guides but with CTAD support is `nd_range`: - -[source,c++,ndrange,linenums] -nd_range r(range(4), range(2)); - -== Enabling the extension - -This extension is enabled for any {cpp}17 compiler with CTAD support. The {cpp}17 compilation mode flag is implementation defined (e.g. `-std=c{plus}{plus}17` or `/std:c{plus}{plus}17`). - -== Modifications of SYCL 1.2.1 specification - -=== Add to the end of the header in 4.7.2.1 - -[source,c++,buffer,linenums] -#ifdef __cpp_deduction_guides -template -buffer(InputIterator, InputIterator, AllocatorT, const property_list & = {}) - ->buffer::value_type, 1, - AllocatorT>; -template -buffer(InputIterator, InputIterator, const property_list & = {}) - ->buffer::value_type, 1>; -template -buffer(const T *, const range &, AllocatorT, - const property_list & = {}) - ->buffer; -template -buffer(const T *, const range &, const property_list & = {}) - ->buffer; -#endif - -=== Add to the end of the header in 4.7.7.1 - -[source,c++,multiptr,linenums] -#ifdef __cpp_deduction_guides -template -multi_ptr( - accessor) - ->multi_ptr; -template -multi_ptr(accessor) - ->multi_ptr; -template -multi_ptr(accessor) - ->multi_ptr; -#endif - -=== Add to the end of the header in 4.8.1.1 - -[source,c++,range,linenums] -#ifdef __cpp_deduction_guides -range(size_t)->range<1>; -range(size_t, size_t)->range<2>; -range(size_t, size_t, size_t)->range<3>; -#endif - -=== Add to the end of the header in 4.8.1.3 - -[source,c++,id,linenums] -#ifdef __cpp_deduction_guides -id(size_t)->id<1>; -id(size_t, size_t)->id<2>; -id(size_t, size_t, size_t)->id<3>; -#endif - -=== Add to the end of the header in 4.20.2.1 - -[source,c++,vec,linenums] -#ifdef __cpp_deduction_guides -// Available only when: (std::is_same_v && ...) -template -vec(T, U...)->vec; -#endif - -=== Add to end of the last paragraph in 6.4 - -The SYCL specification defines some features requiring newer versions for {cpp} (e.g. {cpp}17 deduction guides). -These are available if the {cpp} compilers supports these features. - -== Proof of concept implementation references - -. https://github.com/intel/llvm/pull/772 -. https://github.com/intel/llvm/pull/773 -. https://github.com/intel/llvm/pull/834 - -== Issues - -None. - -//. asd -//+ -//-- -//*RESOLUTION*: Not resolved. -//-- - -== Revision History - -[cols="5,15,15,70"] -[grid="rows"] -[options="header"] -|======================================== -|Rev|Date|Author|Changes -|1|2019-11-05|Roland Schulz|*Initial public working draft* -|======================================== - -//************************************************************************ -//Other formatting suggestions: -// -//* Use *bold* text for host APIs, or [source] syntax highlighting. -//* Use +mono+ text for device APIs, or [source] syntax highlighting. -//* Use +mono+ text for extension names, types, or enum values. -//* Use _italics_ for parameters. -//************************************************************************ diff --git a/sycl/doc/extensions/experimental/README.md b/sycl/doc/extensions/experimental/README.md new file mode 100644 index 0000000000000..fcbe58d14b6b4 --- /dev/null +++ b/sycl/doc/extensions/experimental/README.md @@ -0,0 +1,12 @@ +This directory contains the specifications for SYCL extensions that are +considered experimental in the DPC++ implementation. The APIs in these +extensions are not stable. They may be changed or even removed in subsequent +releases of DPC++ without prior notice. As a result, they are not recommended +for use in production code. + +Experimental extensions may eventually be promoted to "supported". When this +happens, a new specification is added to the "supported" directory, which may +not exactly match the experimental version. (In particular, the namespace +containing the APIs is often changed from `sycl::ext::oneapi::experimental` to +`sycl::ext::oneapi`.) The original experimental specification may be retained +for a time, or it may be removed. diff --git a/sycl/doc/extensions/Bf16Conversion/SYCL_INTEL_bf16_conversion.asciidoc b/sycl/doc/extensions/experimental/SYCL_EXT_INTEL_BF16_CONVERSION.asciidoc similarity index 98% rename from sycl/doc/extensions/Bf16Conversion/SYCL_INTEL_bf16_conversion.asciidoc rename to sycl/doc/extensions/experimental/SYCL_EXT_INTEL_BF16_CONVERSION.asciidoc index 125a2c3d203d1..9b1018ced0b34 100644 --- a/sycl/doc/extensions/Bf16Conversion/SYCL_INTEL_bf16_conversion.asciidoc +++ b/sycl/doc/extensions/experimental/SYCL_EXT_INTEL_BF16_CONVERSION.asciidoc @@ -102,6 +102,9 @@ supports conversion of values of `float` type to `bfloat16` and back. If the device doesn't have the aspect, objects of `bfloat16` class must not be used in the device code. +**NOTE**: The `ext_intel_bf16_conversion` aspect is not yet supported. The +`bfloat16` class is currently supported only on Xe HP GPU. + == New `bfloat16` class The `bfloat16` class below provides the conversion functionality. Conversion diff --git a/sycl/doc/extensions/ExplicitSIMD/ESIMD-TODO-list.md b/sycl/doc/extensions/experimental/SYCL_EXT_INTEL_ESIMD/ESIMD-TODO-list.md similarity index 100% rename from sycl/doc/extensions/ExplicitSIMD/ESIMD-TODO-list.md rename to sycl/doc/extensions/experimental/SYCL_EXT_INTEL_ESIMD/ESIMD-TODO-list.md diff --git a/sycl/doc/extensions/ExplicitSIMD/README.md b/sycl/doc/extensions/experimental/SYCL_EXT_INTEL_ESIMD/README.md similarity index 90% rename from sycl/doc/extensions/ExplicitSIMD/README.md rename to sycl/doc/extensions/experimental/SYCL_EXT_INTEL_ESIMD/README.md index 42f7d8cc5eb65..a0b84d2cc6619 100644 --- a/sycl/doc/extensions/ExplicitSIMD/README.md +++ b/sycl/doc/extensions/experimental/SYCL_EXT_INTEL_ESIMD/README.md @@ -4,8 +4,8 @@ OneAPI provides the "Explicit SIMD" SYCL extension (or simply "ESIMD") for lower-level Intel GPU programming. It provides APIs closely matching Intel GPU ISA yet allows to write explicitly vectorized device code. This helps programmer to have more control over the generated code and depend less on compiler -optimizations. The [specification](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/ExplicitSIMD/dpcpp-explicit-simd.md), -[documented ESIMD APIs headers](https://github.com/intel/llvm/tree/sycl/sycl/include/sycl/ext/intel/experimental/esimd) and +optimizations. The [specification](SYCL_EXT_INTEL_ESIMD.md), +[documented ESIMD APIs headers](../../../../include/sycl/ext/intel/experimental/esimd) and [working code examples](https://github.com/intel/llvm-test-suite/tree/intel/SYCL/ESIMD) are available on the Intel DPC++ project's github. **_NOTE:_** _This extension is under active development and lots of APIs are @@ -81,7 +81,7 @@ program behavior if violated. ##### Features not supported with ESIMD extension: - Ahead-of-time compilation -- The [C and C++ Standard libraries support](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/C-CXX-StandardLibrary/C-CXX-StandardLibrary.rst) +- The [C and C++ Standard libraries support](../supported/C-CXX-StandardLibrary.rst) - The [Device library extensions](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/C-CXX-StandardLibrary/DeviceLibExtensions.rst) - Host device (in some cases) diff --git a/sycl/doc/extensions/ExplicitSIMD/dpcpp-explicit-simd.md b/sycl/doc/extensions/experimental/SYCL_EXT_INTEL_ESIMD/SYCL_EXT_INTEL_ESIMD.md similarity index 100% rename from sycl/doc/extensions/ExplicitSIMD/dpcpp-explicit-simd.md rename to sycl/doc/extensions/experimental/SYCL_EXT_INTEL_ESIMD/SYCL_EXT_INTEL_ESIMD.md diff --git a/sycl/doc/extensions/ExplicitSIMD/images/Matrix_2_2_2_4__1_2.svg b/sycl/doc/extensions/experimental/SYCL_EXT_INTEL_ESIMD/images/Matrix_2_2_2_4__1_2.svg similarity index 100% rename from sycl/doc/extensions/ExplicitSIMD/images/Matrix_2_2_2_4__1_2.svg rename to sycl/doc/extensions/experimental/SYCL_EXT_INTEL_ESIMD/images/Matrix_2_2_2_4__1_2.svg diff --git a/sycl/doc/extensions/ExplicitSIMD/images/Matrix_4_1_4_2__0_0.svg b/sycl/doc/extensions/experimental/SYCL_EXT_INTEL_ESIMD/images/Matrix_4_1_4_2__0_0.svg similarity index 100% rename from sycl/doc/extensions/ExplicitSIMD/images/Matrix_4_1_4_2__0_0.svg rename to sycl/doc/extensions/experimental/SYCL_EXT_INTEL_ESIMD/images/Matrix_4_1_4_2__0_0.svg diff --git a/sycl/doc/extensions/ExplicitSIMD/images/VectorEven.svg b/sycl/doc/extensions/experimental/SYCL_EXT_INTEL_ESIMD/images/VectorEven.svg similarity index 100% rename from sycl/doc/extensions/ExplicitSIMD/images/VectorEven.svg rename to sycl/doc/extensions/experimental/SYCL_EXT_INTEL_ESIMD/images/VectorEven.svg diff --git a/sycl/doc/extensions/ExplicitSIMD/images/VectorOdd.svg b/sycl/doc/extensions/experimental/SYCL_EXT_INTEL_ESIMD/images/VectorOdd.svg similarity index 100% rename from sycl/doc/extensions/ExplicitSIMD/images/VectorOdd.svg rename to sycl/doc/extensions/experimental/SYCL_EXT_INTEL_ESIMD/images/VectorOdd.svg diff --git a/sycl/doc/extensions/ExplicitSIMD/images/simd_view.svg b/sycl/doc/extensions/experimental/SYCL_EXT_INTEL_ESIMD/images/simd_view.svg old mode 100755 new mode 100644 similarity index 100% rename from sycl/doc/extensions/ExplicitSIMD/images/simd_view.svg rename to sycl/doc/extensions/experimental/SYCL_EXT_INTEL_ESIMD/images/simd_view.svg diff --git a/sycl/doc/extensions/OnlineCompilation/OnlineCompilation.asciidoc b/sycl/doc/extensions/experimental/SYCL_EXT_INTEL_ONLINE_COMPILER.asciidoc similarity index 100% rename from sycl/doc/extensions/OnlineCompilation/OnlineCompilation.asciidoc rename to sycl/doc/extensions/experimental/SYCL_EXT_INTEL_ONLINE_COMPILER.asciidoc diff --git a/sycl/doc/extensions/FreeFunctionQueries/SYCL_INTEL_free_function_queries.asciidoc b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_FREE_FUNCTION_QUERIES.asciidoc similarity index 100% rename from sycl/doc/extensions/FreeFunctionQueries/SYCL_INTEL_free_function_queries.asciidoc rename to sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_FREE_FUNCTION_QUERIES.asciidoc diff --git a/sycl/doc/extensions/GroupAlgorithms/SYCL_INTEL_group_sort.asciidoc b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GROUP_SORT.asciidoc old mode 100755 new mode 100644 similarity index 100% rename from sycl/doc/extensions/GroupAlgorithms/SYCL_INTEL_group_sort.asciidoc rename to sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GROUP_SORT.asciidoc diff --git a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_MATRIX.asciidoc similarity index 99% rename from sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc rename to sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_MATRIX.asciidoc index f3b96a8827ea0..141d5a14b39d4 100644 --- a/sycl/doc/extensions/Matrix/dpcpp-joint-matrix.asciidoc +++ b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_MATRIX.asciidoc @@ -567,7 +567,7 @@ for (int i = 0; i < msize; i++) ### Memory scope -The current experimental API uses `joint_` semantics to define the memory scope of the matrix. The long term solution is to use the proposed https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/LocalMemory/SYCL_INTEL_local_memory.asciidoc[`group_local_memory` extension] to allocate the matrix in local memory associated with a SYCL group as shown in the example below. +The current experimental API uses `joint_` semantics to define the memory scope of the matrix. The long term solution is to use the proposed link:../supported/SYCL_EXT_ONEAPI_LOCAL_MEMORY.asciidoc[`group_local_memory` extension] to allocate the matrix in local memory associated with a SYCL group as shown in the example below. ```c++ diff --git a/sycl/doc/extensions/MaxWorkGroupQueries/max_work_group_query.md b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_MAX_WORK_GROUP_QUERY.md similarity index 100% rename from sycl/doc/extensions/MaxWorkGroupQueries/max_work_group_query.md rename to sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_MAX_WORK_GROUP_QUERY.md diff --git a/sycl/doc/extensions/fpga_io_pipes_design.rst b/sycl/doc/extensions/fpga_io_pipes_design.rst index ee51216bb015f..d562eeb44896e 100644 --- a/sycl/doc/extensions/fpga_io_pipes_design.rst +++ b/sycl/doc/extensions/fpga_io_pipes_design.rst @@ -22,7 +22,7 @@ Requirements Links ----- -.. _Spec: https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/DataFlowPipes/data_flow_pipes.asciidoc +.. _Spec: https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/supported/SYCL_EXT_INTEL_DATAFLOW_PIPES.asciidoc .. _Interesting comment from Ronan: https://github.com/intel/llvm/pull/635#discussion_r325851766 Summary diff --git a/sycl/doc/extensions/removed/README.md b/sycl/doc/extensions/removed/README.md new file mode 100644 index 0000000000000..be47a7b5be0ea --- /dev/null +++ b/sycl/doc/extensions/removed/README.md @@ -0,0 +1,33 @@ +This directory contains an archive of old DPC++ extensions which are no longer +implemented. + +Normally, a supported extension is first marked "deprecated", and the compiler +raises a warning for applications that use it. After the deprecation period, +support for the extension may be removed, and the specification for the +extension is moved to this directory for reference. + +Experimental extensions may change or be removed without any deprecation +period. Since we do not expect production code to use experimental extensions, +we do not archive their specifications when they are changed or removed. +Likewise, we do not archive "proposed" extension specifications if we later +decide not to implement them. + +Note that the following extension specifications have been removed because +their features have been incorporated into the core +[SYCL 2020 specification][1]. Please see that document for the most accurate +description of these features. + +[1]: + +| Extension | Description | +|-----------------------------------------------|-------------------------------------------------------------| +|SYCL\_INTEL\_bitcast | Adds `sycl::bit_cast` | +|SYCL\_INTEL\_device\_specific\_kernel\_queries | Adds `info::kernel_device_specific` queries | +|SYCL\_INTEL\_attribute\_style | Changes position of C++ attributes | +|Queue Order Properties | Adds `property::queue::in_order` | +|SYCL\_INTEL\_parallel\_for\_simplification | Makes calls to `parallel_for` less verbose | +|Queue Shortcuts | Adds shortcut functions to `queue` | +|SYCL\_INTEL\_relax\_standard\_layout | Drops standard layout requirement for data in buffers, etc. | +|Unified Shared Memory | Adds new unified shared memory APIs | +|SYCL\_INTEL\_unnamed\_kernel\_lambda | Makes kernel type-names optional | +|SYCL\_INTEL\_deduction\_guides | Simplifies SYCL object construction by using C++ CTAD | diff --git a/sycl/doc/extensions/C-CXX-StandardLibrary/C-CXX-StandardLibrary.rst b/sycl/doc/extensions/supported/C-CXX-StandardLibrary.rst similarity index 100% rename from sycl/doc/extensions/C-CXX-StandardLibrary/C-CXX-StandardLibrary.rst rename to sycl/doc/extensions/supported/C-CXX-StandardLibrary.rst diff --git a/sycl/doc/extensions/supported/README.md b/sycl/doc/extensions/supported/README.md new file mode 100644 index 0000000000000..fc8e36782ddbb --- /dev/null +++ b/sycl/doc/extensions/supported/README.md @@ -0,0 +1,11 @@ +This directory contains the specifications for SYCL extensions that are fully +supported by the DPC++ implementation. The APIs in these extensions are +generally stable in future releases of DPC++, retaining backward compatibility +with application code. + +If support is dropped for one of these extensions, it goes through a +deprecation process. The APIs in the extension are first marked "deprecated", +so that the compiler issues a warning when they are used, but the extension +remains supported during this time. Once the deprecation period elapses, the +support for the extension may be dropped, and the extension specification +document is moved to the "../removed" directory. diff --git a/sycl/doc/extensions/accessor_properties/SYCL_INTEL_buffer_location.asciidoc b/sycl/doc/extensions/supported/SYCL_EXT_INTEL_BUFFER_LOCATION.asciidoc similarity index 100% rename from sycl/doc/extensions/accessor_properties/SYCL_INTEL_buffer_location.asciidoc rename to sycl/doc/extensions/supported/SYCL_EXT_INTEL_BUFFER_LOCATION.asciidoc diff --git a/sycl/doc/extensions/DataFlowPipes/data_flow_pipes.asciidoc b/sycl/doc/extensions/supported/SYCL_EXT_INTEL_DATAFLOW_PIPES.asciidoc old mode 100755 new mode 100644 similarity index 100% rename from sycl/doc/extensions/DataFlowPipes/data_flow_pipes.asciidoc rename to sycl/doc/extensions/supported/SYCL_EXT_INTEL_DATAFLOW_PIPES.asciidoc diff --git a/sycl/doc/extensions/IntelGPU/IntelGPUDeviceInfo.md b/sycl/doc/extensions/supported/SYCL_EXT_INTEL_DEVICE_INFO.md similarity index 100% rename from sycl/doc/extensions/IntelGPU/IntelGPUDeviceInfo.md rename to sycl/doc/extensions/supported/SYCL_EXT_INTEL_DEVICE_INFO.md diff --git a/sycl/doc/extensions/IntelFPGA/FPGASelector.md b/sycl/doc/extensions/supported/SYCL_EXT_INTEL_FPGA_DEVICE_SELECTOR.md similarity index 100% rename from sycl/doc/extensions/IntelFPGA/FPGASelector.md rename to sycl/doc/extensions/supported/SYCL_EXT_INTEL_FPGA_DEVICE_SELECTOR.md diff --git a/sycl/doc/extensions/IntelFPGA/FPGALsu.md b/sycl/doc/extensions/supported/SYCL_EXT_INTEL_FPGA_LSU.md similarity index 100% rename from sycl/doc/extensions/IntelFPGA/FPGALsu.md rename to sycl/doc/extensions/supported/SYCL_EXT_INTEL_FPGA_LSU.md diff --git a/sycl/doc/extensions/IntelFPGA/FPGAReg.md b/sycl/doc/extensions/supported/SYCL_EXT_INTEL_FPGA_REG.md similarity index 100% rename from sycl/doc/extensions/IntelFPGA/FPGAReg.md rename to sycl/doc/extensions/supported/SYCL_EXT_INTEL_FPGA_REG.md diff --git a/sycl/doc/extensions/KernelRestrictAll/SYCL_INTEL_kernel_restrict_all.asciidoc b/sycl/doc/extensions/supported/SYCL_EXT_INTEL_KERNEL_ARGS_RESTRICT.asciidoc old mode 100755 new mode 100644 similarity index 100% rename from sycl/doc/extensions/KernelRestrictAll/SYCL_INTEL_kernel_restrict_all.asciidoc rename to sycl/doc/extensions/supported/SYCL_EXT_INTEL_KERNEL_ARGS_RESTRICT.asciidoc diff --git a/sycl/doc/extensions/accessor_properties/SYCL_ONEAPI_accessor_properties.asciidoc b/sycl/doc/extensions/supported/SYCL_EXT_ONEAPI_ACCESSOR_PROPERTIES.asciidoc similarity index 100% rename from sycl/doc/extensions/accessor_properties/SYCL_ONEAPI_accessor_properties.asciidoc rename to sycl/doc/extensions/supported/SYCL_EXT_ONEAPI_ACCESSOR_PROPERTIES.asciidoc diff --git a/sycl/doc/extensions/Assert/SYCL_ONEAPI_ASSERT.asciidoc b/sycl/doc/extensions/supported/SYCL_EXT_ONEAPI_ASSERT.asciidoc similarity index 100% rename from sycl/doc/extensions/Assert/SYCL_ONEAPI_ASSERT.asciidoc rename to sycl/doc/extensions/supported/SYCL_EXT_ONEAPI_ASSERT.asciidoc diff --git a/sycl/doc/extensions/LevelZeroBackend/LevelZeroBackend.md b/sycl/doc/extensions/supported/SYCL_EXT_ONEAPI_BACKEND_LEVEL_ZERO.md similarity index 97% rename from sycl/doc/extensions/LevelZeroBackend/LevelZeroBackend.md rename to sycl/doc/extensions/supported/SYCL_EXT_ONEAPI_BACKEND_LEVEL_ZERO.md index 43a251c280a03..869098bf1c154 100644 --- a/sycl/doc/extensions/LevelZeroBackend/LevelZeroBackend.md +++ b/sycl/doc/extensions/supported/SYCL_EXT_ONEAPI_BACKEND_LEVEL_ZERO.md @@ -45,7 +45,8 @@ For further details see here: . +[SYCL\_EXT\_ONEAPI\_FILTER\_SELECTOR](../supported/SYCL_EXT_ONEAPI_FILTER_SELECTOR.asciidoc). + Similar to how SYCL_DEVICE_FILTER applies filtering to the entire process this device selector can be used to programmatically select the Level-Zero backend. diff --git a/sycl/doc/extensions/PlatformContext/PlatformContext.adoc b/sycl/doc/extensions/supported/SYCL_EXT_ONEAPI_DEFAULT_CONTEXT.asciidoc similarity index 100% rename from sycl/doc/extensions/PlatformContext/PlatformContext.adoc rename to sycl/doc/extensions/supported/SYCL_EXT_ONEAPI_DEFAULT_CONTEXT.asciidoc diff --git a/sycl/doc/extensions/SYCL_ONEAPI_dot_accumulate.asciidoc b/sycl/doc/extensions/supported/SYCL_EXT_ONEAPI_DOT_ACCUMULATE.asciidoc old mode 100755 new mode 100644 similarity index 100% rename from sycl/doc/extensions/SYCL_ONEAPI_dot_accumulate.asciidoc rename to sycl/doc/extensions/supported/SYCL_EXT_ONEAPI_DOT_ACCUMULATE.asciidoc diff --git a/sycl/doc/extensions/EnqueueBarrier/enqueue_barrier.asciidoc b/sycl/doc/extensions/supported/SYCL_EXT_ONEAPI_ENQUEUE_BARRIER.asciidoc similarity index 100% rename from sycl/doc/extensions/EnqueueBarrier/enqueue_barrier.asciidoc rename to sycl/doc/extensions/supported/SYCL_EXT_ONEAPI_ENQUEUE_BARRIER.asciidoc diff --git a/sycl/doc/extensions/ExtendedAtomics/SYCL_INTEL_extended_atomics.asciidoc b/sycl/doc/extensions/supported/SYCL_EXT_ONEAPI_EXTENDED_ATOMICS.asciidoc similarity index 100% rename from sycl/doc/extensions/ExtendedAtomics/SYCL_INTEL_extended_atomics.asciidoc rename to sycl/doc/extensions/supported/SYCL_EXT_ONEAPI_EXTENDED_ATOMICS.asciidoc diff --git a/sycl/doc/extensions/FilterSelector/FilterSelector.adoc b/sycl/doc/extensions/supported/SYCL_EXT_ONEAPI_FILTER_SELECTOR.asciidoc similarity index 100% rename from sycl/doc/extensions/FilterSelector/FilterSelector.adoc rename to sycl/doc/extensions/supported/SYCL_EXT_ONEAPI_FILTER_SELECTOR.asciidoc diff --git a/sycl/doc/extensions/LocalMemory/LocalMemory.asciidoc b/sycl/doc/extensions/supported/SYCL_EXT_ONEAPI_LOCAL_MEMORY.asciidoc similarity index 100% rename from sycl/doc/extensions/LocalMemory/LocalMemory.asciidoc rename to sycl/doc/extensions/supported/SYCL_EXT_ONEAPI_LOCAL_MEMORY.asciidoc diff --git a/sycl/doc/extensions/SubGroupMask/SubGroupMask.asciidoc b/sycl/doc/extensions/supported/SYCL_EXT_ONEAPI_SUB_GROUP_MASK.asciidoc old mode 100755 new mode 100644 similarity index 100% rename from sycl/doc/extensions/SubGroupMask/SubGroupMask.asciidoc rename to sycl/doc/extensions/supported/SYCL_EXT_ONEAPI_SUB_GROUP_MASK.asciidoc diff --git a/sycl/doc/extensions/UsePinnedMemoryProperty/UsePinnedMemoryPropery.adoc b/sycl/doc/extensions/supported/SYCL_EXT_ONEAPI_USE_PINNED_HOST_MEMORY_PROPERTY.asciidoc similarity index 100% rename from sycl/doc/extensions/UsePinnedMemoryProperty/UsePinnedMemoryPropery.adoc rename to sycl/doc/extensions/supported/SYCL_EXT_ONEAPI_USE_PINNED_HOST_MEMORY_PROPERTY.asciidoc diff --git a/sycl/include/CL/sycl/access/access.hpp b/sycl/include/CL/sycl/access/access.hpp index 5b3f75429bd67..35b5f635c8603 100644 --- a/sycl/include/CL/sycl/access/access.hpp +++ b/sycl/include/CL/sycl/access/access.hpp @@ -72,7 +72,7 @@ template struct mode_target_tag_t { explicit mode_target_tag_t() = default; }; -#if __cplusplus > 201402L +#if __cplusplus >= 201703L inline constexpr mode_tag_t read_only{}; inline constexpr mode_tag_t read_write{}; diff --git a/sycl/include/CL/sycl/accessor.hpp b/sycl/include/CL/sycl/accessor.hpp index cd0b69a2b457f..ba2f5e15f373a 100644 --- a/sycl/include/CL/sycl/accessor.hpp +++ b/sycl/include/CL/sycl/accessor.hpp @@ -876,7 +876,7 @@ class __SYCL_SPECIAL_CLASS accessor : return AdjustedMode; } -#if __cplusplus > 201402L +#if __cplusplus >= 201703L template static constexpr bool IsValidTag() { return std::is_same>::value || @@ -1187,7 +1187,7 @@ class __SYCL_SPECIAL_CLASS accessor : } #endif -#if __cplusplus > 201402L +#if __cplusplus >= 201703L template 201402L +#if __cplusplus >= 201703L template 201402L +#if __cplusplus >= 201703L template 201402L +#if __cplusplus >= 201703L template 201402L +#if __cplusplus >= 201703L template 201402L +#if __cplusplus >= 201703L template 201402L +#if __cplusplus >= 201703L template accessor(buffer) @@ -2302,7 +2302,7 @@ class host_accessor return std::is_same::value && (Dims > 0) && (Dims == Dimensions); } -#if __cplusplus > 201402L +#if __cplusplus >= 201703L template static constexpr bool IsValidTag() { return std::is_same>::value; @@ -2361,7 +2361,7 @@ class host_accessor const detail::code_location CodeLoc = detail::code_location::current()) : AccessorT(BufferRef, PropertyList, CodeLoc) {} -#if __cplusplus > 201402L +#if __cplusplus >= 201703L template ()>> @@ -2381,7 +2381,7 @@ class host_accessor const detail::code_location CodeLoc = detail::code_location::current()) : AccessorT(BufferRef, CommandGroupHandler, PropertyList, CodeLoc) {} -#if __cplusplus > 201402L +#if __cplusplus >= 201703L template ()>> @@ -2402,7 +2402,7 @@ class host_accessor const detail::code_location CodeLoc = detail::code_location::current()) : AccessorT(BufferRef, AccessRange, {}, PropertyList, CodeLoc) {} -#if __cplusplus > 201402L +#if __cplusplus >= 201703L template ()>> @@ -2425,7 +2425,7 @@ class host_accessor : AccessorT(BufferRef, CommandGroupHandler, AccessRange, {}, PropertyList, CodeLoc) {} -#if __cplusplus > 201402L +#if __cplusplus >= 201703L template ()>> @@ -2449,7 +2449,7 @@ class host_accessor : AccessorT(BufferRef, AccessRange, AccessOffset, PropertyList, CodeLoc) { } -#if __cplusplus > 201402L +#if __cplusplus >= 201703L template ()>> @@ -2473,7 +2473,7 @@ class host_accessor : AccessorT(BufferRef, CommandGroupHandler, AccessRange, AccessOffset, PropertyList, CodeLoc) {} -#if __cplusplus > 201402L +#if __cplusplus >= 201703L template ()>> @@ -2489,7 +2489,7 @@ class host_accessor #endif }; -#if __cplusplus > 201402L +#if __cplusplus >= 201703L template host_accessor(buffer) diff --git a/sycl/include/CL/sycl/buffer.hpp b/sycl/include/CL/sycl/buffer.hpp index 4e603da0f4811..4c8dea672b6a6 100644 --- a/sycl/include/CL/sycl/buffer.hpp +++ b/sycl/include/CL/sycl/buffer.hpp @@ -72,6 +72,12 @@ class buffer { using EnableIfSameNonConstIterators = typename detail::enable_if_t< std::is_same::value && !std::is_const::value, ItA>; + std::array rangeToArray(range<3> &r) { return {r[0], r[1], r[2]}; } + + std::array rangeToArray(range<2> &r) { return {r[0], r[1], 0}; } + + std::array rangeToArray(range<1> &r) { return {r[0], 0, 0}; } + buffer(const range &bufferRange, const property_list &propList = {}, const detail::code_location CodeLoc = detail::code_location::current()) @@ -79,7 +85,9 @@ class buffer { impl = std::make_shared( size() * sizeof(T), detail::getNextPowerOfTwo(sizeof(T)), propList, make_unique_ptr>()); - impl->constructorNotification(CodeLoc, (void *)impl.get()); + impl->constructorNotification(CodeLoc, (void *)impl.get(), nullptr, + (const void *)typeid(T).name(), dimensions, + sizeof(T), rangeToArray(Range).data()); } buffer(const range &bufferRange, AllocatorT allocator, @@ -90,7 +98,9 @@ class buffer { size() * sizeof(T), detail::getNextPowerOfTwo(sizeof(T)), propList, make_unique_ptr>( allocator)); - impl->constructorNotification(CodeLoc, (void *)impl.get()); + impl->constructorNotification(CodeLoc, (void *)impl.get(), nullptr, + (const void *)typeid(T).name(), dimensions, + sizeof(T), rangeToArray(Range).data()); } buffer(T *hostData, const range &bufferRange, @@ -101,7 +111,9 @@ class buffer { hostData, size() * sizeof(T), detail::getNextPowerOfTwo(sizeof(T)), propList, make_unique_ptr>()); - impl->constructorNotification(CodeLoc, (void *)impl.get()); + impl->constructorNotification(CodeLoc, (void *)impl.get(), hostData, + (const void *)typeid(T).name(), dimensions, + sizeof(T), rangeToArray(Range).data()); } buffer(T *hostData, const range &bufferRange, @@ -113,7 +125,9 @@ class buffer { propList, make_unique_ptr>( allocator)); - impl->constructorNotification(CodeLoc, (void *)impl.get()); + impl->constructorNotification(CodeLoc, (void *)impl.get(), hostData, + (const void *)typeid(T).name(), dimensions, + sizeof(T), rangeToArray(Range).data()); } template @@ -126,7 +140,9 @@ class buffer { hostData, size() * sizeof(T), detail::getNextPowerOfTwo(sizeof(T)), propList, make_unique_ptr>()); - impl->constructorNotification(CodeLoc, (void *)impl.get()); + impl->constructorNotification(CodeLoc, (void *)impl.get(), hostData, + (const void *)typeid(T).name(), dimensions, + sizeof(T), rangeToArray(Range).data()); } template @@ -140,7 +156,9 @@ class buffer { propList, make_unique_ptr>( allocator)); - impl->constructorNotification(CodeLoc, (void *)impl.get()); + impl->constructorNotification(CodeLoc, (void *)impl.get(), hostData, + (const void *)typeid(T).name(), dimensions, + sizeof(T), rangeToArray(Range).data()); } buffer(const std::shared_ptr &hostData, @@ -153,7 +171,10 @@ class buffer { propList, make_unique_ptr>( allocator)); - impl->constructorNotification(CodeLoc, (void *)impl.get()); + impl->constructorNotification(CodeLoc, (void *)impl.get(), + (void *)hostData.get(), + (const void *)typeid(T).name(), dimensions, + sizeof(T), rangeToArray(Range).data()); } buffer(const std::shared_ptr &hostData, @@ -166,7 +187,10 @@ class buffer { propList, make_unique_ptr>( allocator)); - impl->constructorNotification(CodeLoc, (void *)impl.get()); + impl->constructorNotification(CodeLoc, (void *)impl.get(), + (void *)hostData.get(), + (const void *)typeid(T).name(), dimensions, + sizeof(T), rangeToArray(Range).data()); } buffer(const std::shared_ptr &hostData, @@ -178,7 +202,10 @@ class buffer { hostData, size() * sizeof(T), detail::getNextPowerOfTwo(sizeof(T)), propList, make_unique_ptr>()); - impl->constructorNotification(CodeLoc, (void *)impl.get()); + impl->constructorNotification(CodeLoc, (void *)impl.get(), + (void *)hostData.get(), + (const void *)typeid(T).name(), dimensions, + sizeof(T), rangeToArray(Range).data()); } buffer(const std::shared_ptr &hostData, @@ -190,7 +217,10 @@ class buffer { hostData, size() * sizeof(T), detail::getNextPowerOfTwo(sizeof(T)), propList, make_unique_ptr>()); - impl->constructorNotification(CodeLoc, (void *)impl.get()); + impl->constructorNotification(CodeLoc, (void *)impl.get(), + (void *)hostData.get(), + (const void *)typeid(T).name(), dimensions, + sizeof(T), rangeToArray(Range).data()); } template >( allocator)); - impl->constructorNotification(CodeLoc, (void *)impl.get()); + impl->constructorNotification(CodeLoc, (void *)impl.get(), &*first, + (const void *)typeid(T).name(), dimensions, + sizeof(T), {Range[0], 0, 0}); } template >()); - impl->constructorNotification(CodeLoc, (void *)impl.get()); + size_t r[3] = {Range[0], 0, 0}; + impl->constructorNotification(CodeLoc, (void *)impl.get(), &*first, + (const void *)typeid(T).name(), dimensions, + sizeof(T), r); } // This constructor is a prototype for a future SYCL specification @@ -235,7 +270,10 @@ class buffer { detail::getNextPowerOfTwo(sizeof(T)), propList, make_unique_ptr>( allocator)); - impl->constructorNotification(CodeLoc, (void *)impl.get()); + size_t r[3] = {Range[0], 0, 0}; + impl->constructorNotification(CodeLoc, (void *)impl.get(), container.data(), + (const void *)typeid(T).name(), dimensions, + sizeof(T), r); } // This constructor is a prototype for a future SYCL specification @@ -252,7 +290,9 @@ class buffer { : impl(b.impl), Range(subRange), OffsetInBytes(getOffsetInBytes(baseIndex, b.Range)), IsSubBuffer(true) { - impl->constructorNotification(CodeLoc, (void *)impl.get()); + impl->constructorNotification(CodeLoc, (void *)impl.get(), impl.get(), + (const void *)typeid(T).name(), dimensions, + sizeof(T), rangeToArray(Range).data()); if (b.is_sub_buffer()) throw cl::sycl::invalid_object_error( @@ -281,7 +321,9 @@ class buffer { detail::pi::cast(MemObject), SyclContext, BufSize, make_unique_ptr>(), AvailableEvent); - impl->constructorNotification(CodeLoc, (void *)impl.get()); + impl->constructorNotification(CodeLoc, (void *)impl.get(), &MemObject, + (const void *)typeid(T).name(), dimensions, + sizeof(T), rangeToArray(Range).data()); } #endif @@ -289,14 +331,18 @@ class buffer { const detail::code_location CodeLoc = detail::code_location::current()) : impl(rhs.impl), Range(rhs.Range), OffsetInBytes(rhs.OffsetInBytes), IsSubBuffer(rhs.IsSubBuffer) { - impl->constructorNotification(CodeLoc, (void *)impl.get()); + impl->constructorNotification(CodeLoc, (void *)impl.get(), impl.get(), + (const void *)typeid(T).name(), dimensions, + sizeof(T), rangeToArray(Range).data()); } buffer(buffer &&rhs, const detail::code_location CodeLoc = detail::code_location::current()) : impl(std::move(rhs.impl)), Range(rhs.Range), OffsetInBytes(rhs.OffsetInBytes), IsSubBuffer(rhs.IsSubBuffer) { - impl->constructorNotification(CodeLoc, (void *)impl.get()); + impl->constructorNotification(CodeLoc, (void *)impl.get(), impl.get(), + (const void *)typeid(T).name(), dimensions, + sizeof(T), rangeToArray(Range).data()); } buffer &operator=(const buffer &rhs) = default; @@ -373,7 +419,7 @@ class buffer { *this, accessRange, accessOffset, {}, CodeLoc); } -#if __cplusplus > 201402L +#if __cplusplus >= 201703L template auto get_access(Ts... args) { return accessor{*this, args...}; @@ -485,7 +531,9 @@ class buffer { MemObject, SyclContext, BufSize, make_unique_ptr>(), AvailableEvent); - impl->constructorNotification(CodeLoc, (void *)impl.get()); + impl->constructorNotification(CodeLoc, (void *)impl.get(), &MemObject, + (const void *)typeid(T).name(), dimensions, + sizeof(T), rangeToArray(Range).data()); } // Reinterpret contructor @@ -495,7 +543,9 @@ class buffer { const detail::code_location CodeLoc = detail::code_location::current()) : impl(Impl), Range(reinterpretRange), OffsetInBytes(reinterpretOffset), IsSubBuffer(isSubBuffer) { - impl->constructorNotification(CodeLoc, (void *)impl.get()); + impl->constructorNotification(CodeLoc, (void *)impl.get(), Impl.get(), + (const void *)typeid(T).name(), dimensions, + sizeof(T), rangeToArray(Range).data()); } template diff --git a/sycl/include/CL/sycl/detail/accessor_impl.hpp b/sycl/include/CL/sycl/detail/accessor_impl.hpp index 745854f0aa9ed..7c231b1ad27dc 100644 --- a/sycl/include/CL/sycl/detail/accessor_impl.hpp +++ b/sycl/include/CL/sycl/detail/accessor_impl.hpp @@ -209,7 +209,7 @@ using Requirement = AccessorImplHost; void __SYCL_EXPORT addHostAccessorAndWait(Requirement *Req); -#if __cplusplus > 201402L +#if __cplusplus >= 201703L template constexpr access::mode deduceAccessMode() { diff --git a/sycl/include/CL/sycl/detail/buffer_impl.hpp b/sycl/include/CL/sycl/detail/buffer_impl.hpp index 4bd84207a1d0d..09595d31bae52 100644 --- a/sycl/include/CL/sycl/detail/buffer_impl.hpp +++ b/sycl/include/CL/sycl/detail/buffer_impl.hpp @@ -155,6 +155,11 @@ class __SYCL_EXPORT buffer_impl final : public SYCLMemObjT { void *allocateMem(ContextImplPtr Context, bool InitFromUserData, void *HostPtr, RT::PiEvent &OutEventToWait) override; + void constructorNotification(const detail::code_location &CodeLoc, + void *UserObj, const void *HostObj, + const void *Type, uint32_t Dim, + uint32_t ElemType, size_t Range[3]); + // TODO: remove once ABI break is allowed void constructorNotification(const detail::code_location &CodeLoc, void *UserObj); void destructorNotification(void *UserObj); diff --git a/sycl/include/CL/sycl/detail/defines_elementary.hpp b/sycl/include/CL/sycl/detail/defines_elementary.hpp index ba3160f05a0ce..5865498a384ca 100644 --- a/sycl/include/CL/sycl/detail/defines_elementary.hpp +++ b/sycl/include/CL/sycl/detail/defines_elementary.hpp @@ -75,7 +75,7 @@ #endif #ifndef __SYCL_FALLTHROUGH -#if defined(__cplusplus) && __cplusplus > 201402L && \ +#if defined(__cplusplus) && __cplusplus >= 201703L && \ __SYCL_HAS_CPP_ATTRIBUTE(fallthrough) #define __SYCL_FALLTHROUGH [[fallthrough]] #elif __SYCL_HAS_CPP_ATTRIBUTE(gnu::fallthrough) diff --git a/sycl/include/CL/sycl/detail/type_traits.hpp b/sycl/include/CL/sycl/detail/type_traits.hpp index 9e079561868f2..f81e77749724f 100644 --- a/sycl/include/CL/sycl/detail/type_traits.hpp +++ b/sycl/include/CL/sycl/detail/type_traits.hpp @@ -58,7 +58,7 @@ __SYCL_INLINE_CONSTEXPR bool is_group_v = namespace detail { // Type for Intel device UUID extension. // For details about this extension, see -// sycl/doc/extensions/IntelGPU/IntelGPUDeviceInfo.md +// sycl/doc/extensions/supported/SYCL_EXT_INTEL_DEVICE_INFO.md using uuid_type = std::array; template struct copy_cv_qualifiers; diff --git a/sycl/include/CL/sycl/feature_test.hpp.in b/sycl/include/CL/sycl/feature_test.hpp.in index e6053ebf4ff1c..9e1dc5d4f9621 100644 --- a/sycl/include/CL/sycl/feature_test.hpp.in +++ b/sycl/include/CL/sycl/feature_test.hpp.in @@ -47,7 +47,6 @@ namespace sycl { #define SYCL_EXT_ONEAPI_SRGB 1 #define SYCL_EXT_ONEAPI_SUB_GROUP 1 #define SYCL_EXT_INTEL_BF16_CONVERSION 1 -#define SYCL_EXT_INTEL_BITCAST 1 #define SYCL_EXT_INTEL_DATAFLOW_PIPES 1 #ifdef __clang__ #if __has_extension(sycl_extended_atomics) diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index bc2be4de427b7..1fb2062bcebe6 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -1298,7 +1298,7 @@ class __SYCL_EXPORT handler { handler &operator=(const handler &) = delete; handler &operator=(handler &&) = delete; -#if __cplusplus > 201402L +#if __cplusplus >= 201703L template void set_specialization_constant( typename std::remove_reference_t::value_type Value) { diff --git a/sycl/include/CL/sycl/kernel_bundle.hpp b/sycl/include/CL/sycl/kernel_bundle.hpp index 8cd76f5fd64e2..4244d3b18fb1e 100644 --- a/sycl/include/CL/sycl/kernel_bundle.hpp +++ b/sycl/include/CL/sycl/kernel_bundle.hpp @@ -259,7 +259,7 @@ class kernel_bundle : public detail::kernel_bundle_plain { // This guard is needed because the libsycl.so can compiled with C++ <=14 // while the code requires C++17. This code is not supposed to be used by the // libsycl.so so it should not be a problem. -#if __cplusplus > 201402L +#if __cplusplus >= 201703L /// \returns true if any device image in the kernel_bundle uses specialization /// constant whose address is SpecName template bool has_specialization_constant() const noexcept { diff --git a/sycl/include/CL/sycl/kernel_handler.hpp b/sycl/include/CL/sycl/kernel_handler.hpp index 4fcbd2bc45cfd..e18925b793720 100644 --- a/sycl/include/CL/sycl/kernel_handler.hpp +++ b/sycl/include/CL/sycl/kernel_handler.hpp @@ -21,7 +21,7 @@ namespace sycl { /// \ingroup sycl_api class kernel_handler { public: -#if __cplusplus > 201402L +#if __cplusplus >= 201703L template typename std::remove_reference_t::value_type get_specialization_constant() { @@ -35,7 +35,7 @@ class kernel_handler { PI_INVALID_OPERATION); #endif // __SYCL_DEVICE_ONLY__ } -#endif // __cplusplus > 201402L +#endif // __cplusplus >= 201703L private: void __init_specialization_constants_buffer( diff --git a/sycl/include/CL/sycl/properties/accessor_properties.hpp b/sycl/include/CL/sycl/properties/accessor_properties.hpp index 547d93374e23d..4b608ff9c2fce 100644 --- a/sycl/include/CL/sycl/properties/accessor_properties.hpp +++ b/sycl/include/CL/sycl/properties/accessor_properties.hpp @@ -25,7 +25,7 @@ class __SYCL2020_DEPRECATED("spelling is now: no_init") noinit } // namespace property -#if __cplusplus > 201402L +#if __cplusplus >= 201703L __SYCL_INLINE_CONSTEXPR property::no_init no_init; @@ -61,7 +61,7 @@ struct buffer_location { }; }; } // namespace property -#if __cplusplus > 201402L +#if __cplusplus >= 201703L template inline constexpr property::buffer_location::instance buffer_location{}; #endif @@ -93,7 +93,7 @@ struct no_alias { }; } // namespace property -#if __cplusplus > 201402L +#if __cplusplus >= 201703L inline constexpr property::no_offset::instance no_offset; inline constexpr property::no_alias::instance no_alias; diff --git a/sycl/include/sycl/ext/intel/experimental/esimd.hpp b/sycl/include/sycl/ext/intel/experimental/esimd.hpp index 6459b47789cbb..d20ade95c1db4 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd.hpp @@ -13,7 +13,7 @@ /// @defgroup sycl_esimd DPC++ Explicit SIMD API /// This is a low-level API providing direct access to Intel GPU hardware /// features. ESIMD overview can be found -/// [here](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/ExplicitSIMD/dpcpp-explicit-simd.md). +/// [here](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/SYCL_EXT_INTEL_ESIMD/SYCL_EXT_INTEL_ESIMD.md). ///@{ /// @ingroup sycl_esimd diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_mask_impl.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_mask_impl.hpp index b85bb4685de1f..7e59085d0836e 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_mask_impl.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_mask_impl.hpp @@ -49,7 +49,7 @@ class simd_mask_impl /// Construct from an array. To allow e.g. simd_mask m({1,0,0,1,...}). template > simd_mask_impl(const raw_element_type (&&Arr)[N1]) { - base_type::template init_from_array(std::move(Arr)); + base_type::init_from_array(std::move(Arr)); } /// Implicit conversion from simd. diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp index 2d02bdc79a96d..6feabf9e553d3 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp @@ -77,8 +77,36 @@ struct is_simd_flag_type> : std::true_type {}; template static inline constexpr bool is_simd_flag_type_v = is_simd_flag_type::value; +/// @cond ESIMD_DETAIL + namespace detail { +// Functions to support efficient simd constructors - avoiding internal loop +// over elements. +template +constexpr vector_type_t make_vector_impl(const T (&&Arr)[N], + std::index_sequence) { + return vector_type_t{Arr[Is]...}; +} + +template +constexpr vector_type_t make_vector(const T (&&Arr)[N]) { + return make_vector_impl(std::move(Arr), std::make_index_sequence{}); +} + +template +constexpr vector_type_t make_vector_impl(T Base, T Stride, + std::index_sequence) { + return vector_type_t{(T)(Base + ((T)Is) * Stride)...}; +} + +template +constexpr vector_type_t make_vector(T Base, T Stride) { + return make_vector_impl(Base, Stride, std::make_index_sequence{}); +} + +/// @endcond ESIMD_DETAIL + /// This is a base class for all ESIMD simd classes with real storage (simd, /// simd_mask_impl). It wraps a clang vector as the storage for the elements. /// Additionally this class supports region operations that map to Intel GPU @@ -120,10 +148,13 @@ class simd_obj_impl { static constexpr int length = N; protected: - template > - void init_from_array(const RawTy (&&Arr)[N1]) noexcept { - for (auto I = 0; I < N; ++I) { - M_data[I] = Arr[I]; + void init_from_array(const Ty (&&Arr)[N]) noexcept { + if constexpr (is_wrapper_elem_type_v) { + for (auto I = 0; I < N; ++I) { + M_data[I] = bitcast_to_raw_type(Arr[I]); + } + } else { + M_data = make_vector(std::move(Arr)); } } @@ -158,10 +189,13 @@ class simd_obj_impl { /// Initialize a simd_obj_impl object with an initial value and step. simd_obj_impl(Ty Val, Ty Step) noexcept { __esimd_dbg_print(simd_obj_impl(Ty Val, Ty Step)); -#pragma unroll - for (int i = 0; i < N; ++i) { - M_data[i] = bitcast_to_raw_type(Val); - Val = binary_op(Val, Step); + if constexpr (is_wrapper_elem_type_v || !std::is_integral_v) { + for (int i = 0; i < N; ++i) { + M_data[i] = bitcast_to_raw_type(Val); + Val = binary_op(Val, Step); + } + } else { + M_data = make_vector(Val, Step); } } @@ -175,8 +209,8 @@ class simd_obj_impl { /// Construct from an array. To allow e.g. simd_mask_type m({1,0,0,1,...}). template > - simd_obj_impl(const RawTy (&&Arr)[N1]) noexcept { - __esimd_dbg_print(simd_obj_impl(const RawTy(&&Arr)[N1])); + simd_obj_impl(const Ty (&&Arr)[N1]) noexcept { + __esimd_dbg_print(simd_obj_impl(const Ty(&&Arr)[N1])); init_from_array(std::move(Arr)); } diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp index 33de91b2ad96d..8017d1f52ee56 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/math.hpp @@ -39,7 +39,10 @@ namespace esimd { /// @param src the input vector. /// @return vector of elements converted to \p T0 with saturation. template -__ESIMD_API simd saturate(simd src) { +__ESIMD_API std::enable_if_t || + std::is_same_v, + simd> +saturate(simd src) { if constexpr (detail::is_generic_floating_point_v) return __esimd_sat(src.data()); else if constexpr (detail::is_generic_floating_point_v) { @@ -54,9 +57,9 @@ __ESIMD_API simd saturate(simd src) { return __esimd_ustrunc_sat(src.data()); } else { if constexpr (std::is_signed::value) - return __esimd_sutrunc_sat(src.data()); - else return __esimd_sstrunc_sat(src.data()); + else + return __esimd_sutrunc_sat(src.data()); } } @@ -1723,7 +1726,12 @@ ESIMD_NODEBUG ESIMD_INLINE T exp(T src0) { simd Result = __esimd_##name(src0.data()); \ if (flag != saturation_on) \ return Result; \ - return esimd::saturate(Result); \ + if constexpr (!std::is_same_v) { \ + auto RawRes = esimd::saturate(Result).data(); \ + return detail::convert_vector(std::move(RawRes)); \ + } else { \ + return esimd::saturate(Result); \ + } \ } \ template \ __ESIMD_API T name(float src0, int flag = saturation_off) { \ diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index 5e923483ae844..4edd6cfde7fdf 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -116,81 +116,48 @@ __ESIMD_API SurfaceIndex get_surface_index(AccessorTy acc) { // /// Flat-address gather. /// -template > -__ESIMD_API std::enable_if_t<((n == 8 || n == 16 || n == 32) && - (ElemsPerAddr == 1 || ElemsPerAddr == 2 || - ElemsPerAddr == 4)), - simd> +template > +__ESIMD_API std::enable_if_t> gather(const Tx *p, simd offsets, simd_mask pred = 1) { simd offsets_i = convert(offsets); simd addrs(reinterpret_cast(p)); addrs = addrs + offsets_i; - if constexpr (sizeof(T) == 1 && ElemsPerAddr == 2) { - auto Ret = __esimd_svm_gather()>( - addrs.data(), detail::ElemsPerAddrEncoding(), - pred.data()); - return __esimd_rdregion(Ret, 0); - } else if constexpr (sizeof(T) == 1 && ElemsPerAddr == 1) { + if constexpr (sizeof(T) == 1) { auto Ret = __esimd_svm_gather()>( - addrs.data(), detail::ElemsPerAddrEncoding(), - pred.data()); - return __esimd_rdregion(Ret, 0); - } else if constexpr (sizeof(T) == 2 && ElemsPerAddr == 1) { + addrs.data(), detail::ElemsPerAddrEncoding<1>(), pred.data()); + return __esimd_rdregion(Ret, 0); + } else if constexpr (sizeof(T) == 2) { auto Ret = __esimd_svm_gather()>( addrs.data(), detail::ElemsPerAddrEncoding<2>(), pred.data()); return __esimd_rdregion(Ret, 0); - } else if constexpr (sizeof(T) == 2) - return __esimd_svm_gather()>( - addrs.data(), detail::ElemsPerAddrEncoding<2 * ElemsPerAddr>(), - pred.data()); - else - return __esimd_svm_gather()>( - addrs.data(), detail::ElemsPerAddrEncoding(), - pred.data()); + } else + return __esimd_svm_gather()>( + addrs.data(), detail::ElemsPerAddrEncoding<1>(), pred.data()); } /// Flat-address scatter. /// -template > -__ESIMD_API std::enable_if_t<((n == 8 || n == 16 || n == 32) && - (ElemsPerAddr == 1 || ElemsPerAddr == 2 || - ElemsPerAddr == 4))> -scatter(Tx *p, simd offsets, simd vals, +template > +__ESIMD_API std::enable_if_t +scatter(Tx *p, simd offsets, simd vals, simd_mask pred = 1) { simd offsets_i = convert(offsets); simd addrs(reinterpret_cast(p)); addrs = addrs + offsets_i; - if constexpr (sizeof(T) == 1 && ElemsPerAddr == 2) { - simd D; - D = __esimd_wrregion( - D.data(), vals.data(), 0); - __esimd_svm_scatter()>( - addrs.data(), D.data(), detail::ElemsPerAddrEncoding(), - pred.data()); - } else if constexpr (sizeof(T) == 1 && ElemsPerAddr == 1) { + if constexpr (sizeof(T) == 1) { simd D; - D = __esimd_wrregion( - D.data(), vals.data(), 0); + D = __esimd_wrregion(D.data(), vals.data(), 0); __esimd_svm_scatter()>( - addrs.data(), D.data(), detail::ElemsPerAddrEncoding(), - pred.data()); - } else if constexpr (sizeof(T) == 2 && ElemsPerAddr == 1) { + addrs.data(), D.data(), detail::ElemsPerAddrEncoding<1>(), pred.data()); + } else if constexpr (sizeof(T) == 2) { simd D; D = __esimd_wrregion(D.data(), vals.data(), 0); __esimd_svm_scatter()>( addrs.data(), D.data(), detail::ElemsPerAddrEncoding<2>(), pred.data()); - } else if constexpr (sizeof(T) == 2) - __esimd_svm_scatter()>( - addrs.data(), vals.data(), - detail::ElemsPerAddrEncoding<2 * ElemsPerAddr>(), pred.data()); - else - __esimd_svm_scatter()>( - addrs.data(), vals.data(), detail::ElemsPerAddrEncoding(), + } else + __esimd_svm_scatter()>( + addrs.data(), vals.data(), detail::ElemsPerAddrEncoding<1>(), pred.data()); } diff --git a/sycl/include/sycl/ext/oneapi/atomic_accessor.hpp b/sycl/include/sycl/ext/oneapi/atomic_accessor.hpp index 7bf6376fbe1ca..e034399adddba 100644 --- a/sycl/include/sycl/ext/oneapi/atomic_accessor.hpp +++ b/sycl/include/sycl/ext/oneapi/atomic_accessor.hpp @@ -18,7 +18,7 @@ namespace sycl { namespace ext { namespace oneapi { -#if __cplusplus > 201402L +#if __cplusplus >= 201703L template struct order_tag_t { explicit order_tag_t() = default; @@ -69,7 +69,7 @@ class atomic_accessor using AccessorT::AccessorT; -#if __cplusplus > 201402L +#if __cplusplus >= 201703L template @@ -108,7 +108,7 @@ class atomic_accessor } }; -#if __cplusplus > 201402L +#if __cplusplus >= 201703L template diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index f67d5268443d8..4b05475f86cc8 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -1740,7 +1740,7 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name, case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH: // TODO: Check if Intel device UUID extension is utilized for CUDA. // For details about this extension, see - // sycl/doc/extensions/IntelGPU/IntelGPUDeviceInfo.md + // sycl/doc/extensions/supported/SYCL_EXT_INTEL_DEVICE_INFO.md case PI_DEVICE_INFO_UUID: return PI_INVALID_VALUE; diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 1a2faac5589e1..865041a359eec 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -2269,7 +2269,7 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, case PI_DEVICE_INFO_UUID: // Intel extension for device UUID. This returns the UUID as // std::array. For details about this extension, - // see sycl/doc/extensions/IntelGPU/IntelGPUDeviceInfo.md. + // see sycl/doc/extensions/supported/SYCL_EXT_INTEL_DEVICE_INFO.md. return ReturnValue(Device->ZeDeviceProperties->uuid.id); case PI_DEVICE_INFO_EXTENSIONS: { // Convention adopted from OpenCL: @@ -3398,7 +3398,7 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size, if (Flags & PI_MEM_FLAGS_HOST_PTR_ALLOC) { // Having PI_MEM_FLAGS_HOST_PTR_ALLOC for buffer requires allocation of // pinned host memory, see: - // https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/UsePinnedMemoryProperty/UsePinnedMemoryPropery.adoc + // sycl/doc/extensions/supported/SYCL_EXT_ONEAPI_USE_PINNED_HOST_MEMORY_PROPERTY.asciidoc // We are however missing such functionality in Level Zero, so we just // ignore the flag for now. // diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index c53e73cdd5cf0..69f31a582f973 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -270,7 +270,7 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH: // TODO: Check if device UUID extension is enabled in OpenCL. // For details about Intel UUID extension, see - // sycl/doc/extensions/IntelGPU/IntelGPUDeviceInfo.md + // sycl/doc/extensions/supported/SYCL_EXT_INTEL_DEVICE_INFO.md case PI_DEVICE_INFO_UUID: // TODO: Implement. case PI_DEVICE_INFO_ATOMIC_64: diff --git a/sycl/source/detail/buffer_impl.cpp b/sycl/source/detail/buffer_impl.cpp index a88bb44ee38f5..02673bf239cb5 100644 --- a/sycl/source/detail/buffer_impl.cpp +++ b/sycl/source/detail/buffer_impl.cpp @@ -31,10 +31,20 @@ void *buffer_impl::allocateMem(ContextImplPtr Context, bool InitFromUserData, std::move(Context), this, HostPtr, HostPtrReadOnly, BaseT::getSize(), BaseT::MInteropEvent, BaseT::MInteropContext, MProps, OutEventToWait); } +void buffer_impl::constructorNotification(const detail::code_location &CodeLoc, + void *UserObj, const void *HostObj, + const void *Type, uint32_t Dim, + uint32_t ElemSize, size_t Range[3]) { + XPTIRegistry::bufferConstructorNotification(UserObj, CodeLoc, HostObj, Type, + Dim, ElemSize, Range); +} +// TODO: remove once ABI break is allowed void buffer_impl::constructorNotification(const detail::code_location &CodeLoc, void *UserObj) { - XPTIRegistry::bufferConstructorNotification(UserObj, CodeLoc); + size_t r[3] = {0, 0, 0}; + constructorNotification(CodeLoc, UserObj, nullptr, "", 0, 0, r); } + void buffer_impl::destructorNotification(void *UserObj) { XPTIRegistry::bufferDestructorNotification(UserObj); } diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 6b592371833b2..1cd77083e0c16 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -298,7 +298,6 @@ void *MemoryManager::allocateHostMemory(SYCLMemObjI *MemObj, void *UserPtr, return UserPtr; void *NewMem = MemObj->allocateHostMem(); - // Need to initialize new memory if user provides pointer to read only // memory. if (UserPtr && HostPtrReadOnly == true) diff --git a/sycl/source/detail/xpti_registry.cpp b/sycl/source/detail/xpti_registry.cpp index c49ba9de7674c..386da714f142d 100644 --- a/sycl/source/detail/xpti_registry.cpp +++ b/sycl/source/detail/xpti_registry.cpp @@ -17,22 +17,15 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace detail { #ifdef XPTI_ENABLE_INSTRUMENTATION -xpti::trace_event_data_t * -XPTIRegistry::createTraceEvent(void *Obj, const char *ObjName, uint64_t &IId, - const detail::code_location &CodeLoc, - uint16_t TraceEventType) { - std::string Name; - if (CodeLoc.fileName()) { - Name = std::string(CodeLoc.fileName()) + ":" + - std::to_string(CodeLoc.lineNumber()) + ":" + - std::to_string(CodeLoc.columnNumber()); - } else { - xpti::utils::StringHelper NG; - Name = NG.nameWithAddress(ObjName, Obj); - } - xpti::payload_t Payload( - Name.c_str(), (CodeLoc.fileName() ? CodeLoc.fileName() : ""), - CodeLoc.lineNumber(), CodeLoc.columnNumber(), (void *)Obj); +xpti::trace_event_data_t *XPTIRegistry::createTraceEvent( + const void *Obj, const void *FuncPtr, uint64_t &IId, + const detail::code_location &CodeLoc, uint16_t TraceEventType) { + xpti::utils::StringHelper NG; + auto Name = NG.nameWithAddress(CodeLoc.functionName(), + const_cast(FuncPtr)); + xpti::payload_t Payload(Name.c_str(), + (CodeLoc.fileName() ? CodeLoc.fileName() : ""), + CodeLoc.lineNumber(), CodeLoc.columnNumber(), Obj); // Calls could be at different user-code locations; We create a new event // based on the code location info and if this has been seen before, a @@ -43,16 +36,28 @@ XPTIRegistry::createTraceEvent(void *Obj, const char *ObjName, uint64_t &IId, #endif // XPTI_ENABLE_INSTRUMENTATION void XPTIRegistry::bufferConstructorNotification( - void *UserObj, const detail::code_location &CodeLoc) { + const void *UserObj, const detail::code_location &CodeLoc, + const void *HostObj, const void *Type, uint32_t Dim, uint32_t ElemSize, + size_t Range[3]) { (void)UserObj; (void)CodeLoc; + (void)HostObj; + (void)Type; + (void)Dim; + (void)ElemSize; + (void)Range; #ifdef XPTI_ENABLE_INSTRUMENTATION GlobalHandler::instance().getXPTIRegistry().initializeFrameworkOnce(); if (!xptiTraceEnabled()) return; uint64_t IId; - xpti::offload_buffer_data_t BufConstr{(uintptr_t)UserObj}; + xpti::offload_buffer_data_t BufConstr{(uintptr_t)UserObj, + (uintptr_t)HostObj, + (const char *)Type, + ElemSize, + Dim, + {Range[0], Range[1], Range[2]}}; xpti::trace_event_data_t *TraceEvent = createTraceEvent( UserObj, "buffer", IId, CodeLoc, xpti::trace_offload_buffer_event); @@ -61,7 +66,8 @@ void XPTIRegistry::bufferConstructorNotification( #endif } -void XPTIRegistry::bufferAssociateNotification(void *UserObj, void *MemObj) { +void XPTIRegistry::bufferAssociateNotification(const void *UserObj, + const void *MemObj) { (void)UserObj; (void)MemObj; #ifdef XPTI_ENABLE_INSTRUMENTATION @@ -77,7 +83,8 @@ void XPTIRegistry::bufferAssociateNotification(void *UserObj, void *MemObj) { #endif } -void XPTIRegistry::bufferReleaseNotification(void *UserObj, void *MemObj) { +void XPTIRegistry::bufferReleaseNotification(const void *UserObj, + const void *MemObj) { (void)UserObj; (void)MemObj; #ifdef XPTI_ENABLE_INSTRUMENTATION @@ -93,7 +100,7 @@ void XPTIRegistry::bufferReleaseNotification(void *UserObj, void *MemObj) { #endif } -void XPTIRegistry::bufferDestructorNotification(void *UserObj) { +void XPTIRegistry::bufferDestructorNotification(const void *UserObj) { (void)UserObj; #ifdef XPTI_ENABLE_INSTRUMENTATION if (!xptiTraceEnabled()) @@ -107,8 +114,8 @@ void XPTIRegistry::bufferDestructorNotification(void *UserObj) { } void XPTIRegistry::bufferAccessorNotification( - void *UserObj, void *AccessorObj, uint32_t Target, uint32_t Mode, - const detail::code_location &CodeLoc) { + const void *UserObj, const void *AccessorObj, uint32_t Target, + uint32_t Mode, const detail::code_location &CodeLoc) { (void)UserObj; (void)AccessorObj; (void)CodeLoc; diff --git a/sycl/source/detail/xpti_registry.hpp b/sycl/source/detail/xpti_registry.hpp index bd682cf79960a..d5b0b7f916b85 100644 --- a/sycl/source/detail/xpti_registry.hpp +++ b/sycl/source/detail/xpti_registry.hpp @@ -89,15 +89,16 @@ class XPTIRegistry { #endif // XPTI_ENABLE_INSTRUMENTATION } - static void - bufferConstructorNotification(void *UserObj, - const detail::code_location &CodeLoc); - static void bufferAssociateNotification(void *UserObj, void *MemObj); - static void bufferReleaseNotification(void *UserObj, void *MemObj); - static void bufferDestructorNotification(void *UserObj); - static void bufferAccessorNotification(void *UserObj, void *AccessorObj, - uint32_t Target, uint32_t Mode, - const detail::code_location &CodeLoc); + static void bufferConstructorNotification(const void *, + const detail::code_location &, + const void *, const void *, + uint32_t, uint32_t, size_t[3]); + static void bufferAssociateNotification(const void *, const void *); + static void bufferReleaseNotification(const void *, const void *); + static void bufferDestructorNotification(const void *); + static void bufferAccessorNotification(const void *, const void *, uint32_t, + uint32_t, + const detail::code_location &); private: std::unordered_set MActiveStreams; @@ -105,7 +106,7 @@ class XPTIRegistry { #ifdef XPTI_ENABLE_INSTRUMENTATION static xpti::trace_event_data_t * - createTraceEvent(void *Obj, const char *ObjName, uint64_t &IId, + createTraceEvent(const void *Obj, const void *ObjName, uint64_t &IId, const detail::code_location &CodeLoc, uint16_t TraceEventType); #endif // XPTI_ENABLE_INSTRUMENTATION diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 15a7e9e55e3e4..2681453647fb6 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3834,6 +3834,7 @@ _ZN2cl4sycl6detail11SYCLMemObjTC2EmRKNS0_7contextEmNS0_5eventESt10unique_ptrINS1 _ZN2cl4sycl6detail11buffer_impl11allocateMemESt10shared_ptrINS1_12context_implEEbPvRP9_pi_event _ZN2cl4sycl6detail11buffer_impl22destructorNotificationEPv _ZN2cl4sycl6detail11buffer_impl23constructorNotificationERKNS1_13code_locationEPv +_ZN2cl4sycl6detail11buffer_impl23constructorNotificationERKNS1_13code_locationEPvPKvS8_jjPm _ZN2cl4sycl6detail11make_deviceEmNS0_7backendE _ZN2cl4sycl6detail11make_kernelERKNS0_7contextERKNS0_13kernel_bundleILNS0_12bundle_stateE2EEEmbNS0_7backendE _ZN2cl4sycl6detail11make_kernelEmRKNS0_7contextENS0_7backendE diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 786b52a911e0e..2359db7857240 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -1508,6 +1508,7 @@ ?compile_with_kernel_name@program@sycl@cl@@AEAAXV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@0_J@Z ?compile_with_source@program@sycl@cl@@QEAAXV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@0@Z ?constructorNotification@buffer_impl@detail@sycl@cl@@QEAAXAEBUcode_location@234@PEAX@Z +?constructorNotification@buffer_impl@detail@sycl@cl@@QEAAXAEBUcode_location@234@PEAXPEBX2IIQEA_K@Z ?constructorNotification@detail@sycl@cl@@YAXPEAX0W4target@access@23@W4mode@523@AEBUcode_location@123@@Z ?contains_specialization_constants@kernel_bundle_plain@detail@sycl@cl@@QEBA_NXZ ?contextSetExtendedDeleter@pi@detail@sycl@cl@@YAXAEBVcontext@34@P6AXPEAX@Z1@Z diff --git a/sycl/test/esimd/ctor_codegen.cpp b/sycl/test/esimd/ctor_codegen.cpp new file mode 100644 index 0000000000000..7073dfd281096 --- /dev/null +++ b/sycl/test/esimd/ctor_codegen.cpp @@ -0,0 +1,102 @@ +// RUN: %clangxx -fsycl -fsycl-device-only -S %s -o - | FileCheck %s + +// Check efficiency of LLVM IR generated for various simd constructors. + +#include +#include + +using namespace sycl; +using namespace sycl::ext::intel::experimental::esimd; + +// clang-format off + +// Array-based constructor, FP element type, no loops exected - check. +SYCL_EXTERNAL auto foo(double i) SYCL_ESIMD_FUNCTION { +// CHECK: define dso_local spir_func void @_Z3food( +// CHECK: {{[^,]*}} %[[RES:[a-zA-Z0-9_\.]+]], +// CHECK: {{[^,]*}} %[[I:[a-zA-Z0-9_\.]+]]){{.*}} { + simd val({ i, i }); + return val; +// CHECK: %[[V0:[a-zA-Z0-9_\.]+]] = insertelement <2 x double> undef, double %[[I]], i64 0 +// CHECK-NEXT: %[[V1:[a-zA-Z0-9_\.]+]] = shufflevector <2 x double> %[[V0]], <2 x double> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: %[[MDATA:[a-zA-Z0-9_\.]+]] = getelementptr inbounds {{.*}} %[[RES]], i64 0, i32 0, i32 0 +// CHECK-NEXT: store <2 x double> %[[V1]], <2 x double> addrspace(4)* %[[MDATA]] +// CHECK-NEXT: ret void +// CHECK-NEXT: } +} + +// Base + step constructor, FP element type, loops exected - don't check. +SYCL_EXTERNAL auto bar() SYCL_ESIMD_FUNCTION { + simd val(17, 3); + return val; +} + +// Base + step constructor, integer element type, no loops exected - check. +SYCL_EXTERNAL auto baz() SYCL_ESIMD_FUNCTION { + // CHECK: define dso_local spir_func void @_Z3bazv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} { + simd val(17, 3); + return val; + // CHECK: %[[MDATA:[a-zA-Z0-9_\.]+]] = getelementptr inbounds {{.*}} %[[RES]], i64 0, i32 0, i32 0 + // CHECK-NEXT: store <2 x i32> , <2 x i32> addrspace(4)* %[[MDATA]] + // CHECK-NEXT: ret void + // CHECK-NEXT: } +} + +// Broadcast constructor, FP element type, no loops exected - check. +SYCL_EXTERNAL auto gee() SYCL_ESIMD_FUNCTION { +// CHECK: define dso_local spir_func void @_Z3geev({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} { + simd val(-7); + return val; +// CHECK: %[[MDATA:[a-zA-Z0-9_\.]+]] = getelementptr inbounds {{.*}} %[[RES]], i64 0, i32 0, i32 0 +// CHECK-NEXT: store <2 x float> , <2 x float> addrspace(4)* %[[MDATA]] +// CHECK-NEXT: ret void +// CHECK-NEXT: } +} + +// Array-based simd_mask constructor, no loops exected - check. +SYCL_EXTERNAL auto foomask() SYCL_ESIMD_FUNCTION { +// CHECK: define dso_local spir_func void @_Z7foomaskv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} { + simd_mask<2> val({ 1, 0 }); + return val; +// CHECK: %[[MDATA:[a-zA-Z0-9_\.]+]] = getelementptr inbounds {{.*}} %[[RES]], i64 0, i32 0, i32 0 +// CHECK-NEXT: store <2 x i16> , <2 x i16> addrspace(4)* %[[MDATA]] +// CHECK-NEXT: ret void +// CHECK-NEXT: } +} + +// Broadcast simd_mask constructor, no loops exected - check. +SYCL_EXTERNAL auto geemask() SYCL_ESIMD_FUNCTION { +// CHECK: define dso_local spir_func void @_Z7geemaskv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} { + simd_mask<2> val(1); + return val; +// CHECK: %[[MDATA:[a-zA-Z0-9_\.]+]] = getelementptr inbounds {{.*}} %[[RES]], i64 0, i32 0, i32 0 +// CHECK-NEXT: store <2 x i16> , <2 x i16> addrspace(4)* %[[MDATA]] +// CHECK-NEXT: ret void +// CHECK-NEXT: } +} + +// The element type is 'half', which requires conversion, so code generation +// is less efficient - has loop over elements. No much reason to check. +SYCL_EXTERNAL auto foohalf(half i) SYCL_ESIMD_FUNCTION { + simd val({ i, i }); + return val; +} + +// The element type is 'half', which requires conversion, so code generation +// is less efficient - has loop over elements. No much reason to check. +SYCL_EXTERNAL auto barhalf() SYCL_ESIMD_FUNCTION { + simd val(17, 3); + return val; +} + +// Here the element is half too, but code generation is efficient because +// no per-element operations are needed - scalar is converted before broadcasting. +SYCL_EXTERNAL auto geehalf() SYCL_ESIMD_FUNCTION { +// CHECK: define dso_local spir_func void @_Z7geehalfv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} { + simd val(-7); + return val; +// CHECK: %[[MDATA:[a-zA-Z0-9_\.]+]] = getelementptr inbounds {{.*}} %[[RES]], i64 0, i32 0, i32 0 +// CHECK-NEXT: store <2 x half> , <2 x half> addrspace(4)* %[[MDATA]] +// CHECK-NEXT: ret void +// CHECK-NEXT: } +} diff --git a/xpti/include/xpti/xpti_data_types.h b/xpti/include/xpti/xpti_data_types.h index 7b9a948742b16..a4f4eb6b4ec4b 100644 --- a/xpti/include/xpti/xpti_data_types.h +++ b/xpti/include/xpti/xpti_data_types.h @@ -168,7 +168,7 @@ struct payload_t { // valid since we can potentially reconstruct the name and the source file // information during post-processing step of symbol resolution; this // indicates a partial but valid payload. - payload_t(void *codeptr) { + payload_t(const void *codeptr) { code_ptr_va = codeptr; name = nullptr; ///< Invalid name string pointer source_file = nullptr; ///< Invalid source file string pointer @@ -193,7 +193,7 @@ struct payload_t { } } - payload_t(const char *func_name, void *codeptr) { + payload_t(const char *func_name, const void *codeptr) { code_ptr_va = codeptr; name = func_name; ///< Invalid name string pointer source_file = nullptr; ///< Invalid source file string pointer @@ -210,7 +210,7 @@ struct payload_t { // on dynamic backtrace as a possibility. In this case, we send in the // caller/callee information as a string in the form "caller->callee" that // will be used to generate the unique ID. - payload_t(const char *kname, const char *caller_callee, void *codeptr) { + payload_t(const char *kname, const char *caller_callee, const void *codeptr) { if (codeptr) { code_ptr_va = codeptr; flags |= (uint64_t)payload_flag_t::CodePointerAvailable; @@ -231,7 +231,7 @@ struct payload_t { // also have the function name and source file name along with the line and // column number of the trace point that forms the payload. payload_t(const char *kname, const char *sf, int line, int col, - void *codeptr) { + const void *codeptr) { code_ptr_va = codeptr; /// Capture the rest of the parameters name = kname; @@ -520,6 +520,16 @@ struct trace_event_data_t { struct offload_buffer_data_t { /// A pointer to user level memory offload object. uintptr_t user_object_handle = 0; + /// A pointer to host memory offload object. + uintptr_t host_object_handle = 0; + /// A string representing the type of buffer element. + const char *element_type = nullptr; + /// Buffer element size in bytes + uint32_t element_size = 0; + /// Buffer dimensions number. + uint32_t dim = 0; + /// Buffer size for each dimension. + size_t range[3] = {0, 0, 0}; }; /// Describes offload accessor diff --git a/xptifw/unit_test/xpti_api_tests.cpp b/xptifw/unit_test/xpti_api_tests.cpp index 6d2d3d0e5afcd..da5eb8b206753 100644 --- a/xptifw/unit_test/xpti_api_tests.cpp +++ b/xptifw/unit_test/xpti_api_tests.cpp @@ -433,8 +433,16 @@ TEST_F(xptiApiTest, xptiNotifySubscribersGoodInput) { StreamID, (uint16_t)xpti::trace_point_type_t::offload_alloc_destruct, fn_callback); EXPECT_EQ(Result, xpti::result_t::XPTI_RESULT_SUCCESS); + Result = xptiRegisterCallback( + StreamID, (uint16_t)xpti::trace_point_type_t::offload_alloc_release, + fn_callback); + EXPECT_EQ(Result, xpti::result_t::XPTI_RESULT_SUCCESS); + Result = xptiRegisterCallback( + StreamID, (uint16_t)xpti::trace_point_type_t::offload_alloc_accessor, + fn_callback); + EXPECT_EQ(Result, xpti::result_t::XPTI_RESULT_SUCCESS); - xpti::offload_buffer_data_t UserBufferData{0x01020304}; + xpti::offload_buffer_data_t UserBufferData{1, 5, "int", 4, 2, {3, 2, 0}}; xpti::offload_buffer_association_data_t AssociationData{0x01020304, 0x05060708}; xpti::offload_accessor_data_t UserAccessorData{0x01020304, 0x09000102, 1, 2};