Add get_info runtime query

intel · dm-vodopyanov · Aug 28, 2023 · Jan 9, 2023 · Jan 9, 2023 · Jan 10, 2023
commit 805630cdbb965baf6c6fa3157fda915f8e539139
@@ -58,11 +58,11 @@ XMX.
 == Overview
 The Intel backend implementations on both Intel AMX and Intel XMX
 support `joint_matrix`, `joint_matrix_load`, `joint_matrix_store`,
-`joint_matrix_mad`, `joint_matrix_fill`, `get_wi_data`, and the query
-interface, as they are defined in the sycl_ext_oneapi_matrix
-extension. There are additional specifics about the supported layouts
-that enable extra performance and functionality listed in this
-document.
+`joint_matrix_mad`, `joint_matrix_fill`, `joint_matrix_apply`, and the
+query interface, as they are defined in the sycl_ext_oneapi_matrix
+extension. Besides element-wise operations with mapping information,
+there are additional specifics about the supported layouts that enable
+extra performance and functionality listed in this document.
 This extension presents some supplementary Intel AMX and Intel XMX
 features not contained within the sycl_ext_oneapi_matrix
 extension. The additional features are built on top of the
@@ -75,11 +75,11 @@ AMX and Intel XMX backends.
 
 This extension provides a feature-test macro as described in the core SYCL
 specification. An implementation supporting this extension must
-predefine the macro `SYCL_EXT_INTEL_MATRIX` to one of the values defined in the table below.
-Applications can test for the existence of this macro to determine if the
-implementation supports this feature, or applications can test the macro's
-value to determine which of the extension's APIs the implementation
-supports.
+predefine the macro `SYCL_EXT_INTEL_MATRIX` to one of the values
+defined in the table below. Applications can test for the existence of
+this macro to determine if the implementation supports this feature,
+or applications can test the macro's value to determine which of the
+extension's APIs the implementation supports.
 
 [%header,cols="1,5"]
 |===
@@ -213,7 +213,7 @@ order to reason about the matrix view and extract the relevant
 piece. However, for element-wise operations where the same operation
 is performed on all the elements of the matrix, having all the WIs in
 the group apply the operation inside a loop iterating over the
-`length` of `wi_data` guarantees the whole matrix element-wise operation.   
+`length` of `wi_data` guarantees the whole matrix element-wise operation.
 
 Note that `get_wi_data` cannot return a fixed size array length
 because the length of the WI portion is a runtime variable for the
@@ -248,7 +248,7 @@ class wi_element {
   wi_element &operator*=(const T &rhs);
   wi_element &operator/=(const T &rhs);
 
-  std::tuple<size_t, size_t> get_coord();	
+  std::tuple<size_t, size_t> get_coord();
 };
 }
 ```
@@ -257,14 +257,14 @@ In the following example `wi_data_c` is a reference to the WI owned
 portion of the joint matrix `matC`. As such `wi_data_c[i] OP rhs`
 updates the corresponding matrix element in the joint_matrix `matC`.
 Vectorization along the sub group dimension will get enabled
-automatically to vectorize the contiguous portion of the matrix. 
+automatically to vectorize the contiguous portion of the matrix.
 
 
 ```c++
 auto wi_data_c = get_wi_data(sg, matC);
 for (int i = 0; i < wi_data_c.length(); i++)
         wi_data_c[i] *= alpha;    // Note that the indexing here "i"
-	is in the vector owned by a WI, not in the matrix C        
+	// is in the vector owned by a WI, not in the matrix C
 ```
 
 IMPORTANT: In the current implementation, only the `sub_group` scope
@@ -287,7 +287,7 @@ auto data = get_wi_data(sg, tA);
 for (int i = 0; i < data.length(); ++i) {
   auto [row, col] = data[i].get_coord();
   sum_of_local_rows[row] += data[i];
-}  
+}
 ```
 
 IMPORTANT: `get_coord` is not implemented yet.
@@ -314,7 +314,7 @@ for a 16-bit type.
       // a3, b3, c3, d3
       // a4, b4, c4, d4
       // ---------------------------------
-      // The same matrix reformatted in packed layout. 
+      // The same matrix reformatted in packed layout.
       // Here, packing of 2 elements is needed to form 32 bits.
       // Element a1 is contiguous in memory with element a2, etc.
       // ---------------------------------
@@ -332,7 +332,7 @@ for a 16-bit type.
       // a3, b3, c3, d3
       // a4, b4, c4, d4
       // ---------------------------------
-      // The same matrix reformatted in packed layout.  
+      // The same matrix reformatted in packed layout.
       // Here, packing of 4 elements is needed to form 32 bits.
       // Elements a1, a2, a3, a4 are contiguous in memory, etc.
       // ---------------------------------
@@ -348,7 +348,7 @@ range<2> L = {1, SG_SIZE};
 int8_t *memA = malloc_shared<int8_t>(M*K, q);
 int8_t *memB = malloc_shared<int8_t>(K*N, q);
 int32_t *memC = malloc_shared<int32_t>(M*N, q);
-q.parallel_for(nd_range<2>(G, L), [=](nd_item<2> item)                            
+q.parallel_for(nd_range<2>(G, L), [=](nd_item<2> item)
   [[sycl::reqd_sub_group_size(SG_SIZE)]] {
    const auto global_idx = item.get_global_id(0);
    const auto global_idy = item.get_global_id(1);
@@ -366,12 +366,12 @@ q.parallel_for(nd_range<2>(G, L), [=](nd_item<2> item)
 	  sg_startx * tM * K + k, K);
      joint_matrix_load(sg, tB,
           multi_ptr<int8_t, sycl::access::address_space::global_space>(memB) +
-	  k * N*4 + sg_starty/SG_SIZE*tN*4, N*4); 
+	  k * N*4 + sg_starty/SG_SIZE*tN*4, N*4);
      tC = joint_matrix_mad(sg, tA, tB, tC);
    }
    auto wi_data_c = ext::intel::experimental::matrix::get_wi_data(sg, tC);
    for (int i = 0; i < wi_data_c.length(); i++)
-     wi_data_c[i] *= alpha; 
+     wi_data_c[i] *= alpha;
    joint_matrix_store(sg, tC,
         multi_ptr<int32_t, sycl::access::address_space::global_space>(memC) +
 	sg_startx * tM * N + sg_starty/SG_SIZE*tN, N, layout::row_major);