/* ipt.hpp - In-Place Transposition * * When transitioning between different media, * e.g. CPU to GPU, CPU to Network, CPU to disk, * it's often necessary to physically transpose * multi-dimensional arrays to reformat as C or * Fortran order. Tranposing matrices is also * a common action in linear algebra, but often * you can get away with just changing the strides. * * An out-of-place transposition is easy to write, * often faster, but will spike peak memory consumption. * * This library grants the user the option of performing * an in-place transposition which trades CPU time for * peak memory usage. * * Author: William Silversmith * Date: Feb. 2019 */ #include #include #include #ifndef IN_PLACE_TRANSPOSE_H #define IN_PLACE_TRANSPOSE_H // ipt = in-place transpose // call as: // 2d: ipt::ipt(arr, sx, sy); // 3d: ipt::ipt(arr, sx, sy, sz); // 4d: ipt::ipt(arr, sx, sy, sz, sw); namespace ipt { template void square_ipt(T* arr, const size_t sx, const size_t sy) { T tmp = 0; size_t k = 0; size_t next_k = 0; size_t base_k = 0; // just for going faster for (size_t y = 0; y < sy; y++) { base_k = sx * y; for (size_t x = y; x < sx; x++) { k = x + base_k; next_k = y + sy * x; tmp = arr[next_k]; arr[next_k] = arr[k]; arr[k] = tmp; } } } /* A permutation, P(k), is a mapping of * one arrangement of numbers to another. * For an m x n array, the permuatation * mapping from C to Fortran order is: * * P(k) := mk mod mn - 1 * iP(k) := nk mod mn - 1 (the inverse) * * Where does this come from? Assume we are * going from C to Fortran order (it doesn't * matter either way). The indicies are defined * as: * * k = C(x,y) = x + sx * y * F(x,y) = y + sy * x * * The permutation P(k) is the transformation: * * P(C(x,y)) = F(x,y) * * 1. P(x + sx * y) = y + sx * x * 2. sy (x + sx y) = sy x + sx sy y * 3. Let q = (sx sy - 1) * 4. sy x + sx sy y % q * 5. ((sy x % q) + (sx sy y % q)) % q by distributive identity * 6. sy x is identical b/c q is always bigger * 7. sx sy y reduces to y * 8 q is always bigger than sy x + y so it disappears * * ==> P(k) = y + sy * x = F(x,y) * ==> P(k) = sy * k % (sx sy - 1) * * Note that P(0) and P(q) are always 0 and q respectively. * * Now we need a way to implement this insight. * How can we move the data around without using too * much extra space? A simple algorithm is * "follow-the-cycles". Each time you try moving a * k to P(k), it displaces the resident tile. Eventually, * this forms a cycle. When you reach the end of a cycle, * you can stop processing and move to unvisited parts of * the array. This requires storing a packed bit representation * of where we've visited to make sure we get everything. * This means we need to store between 2.0x and 1.016x * memory in the size of the original array depending on its * data type (2.0x would be a transpose of another bit packed * array and 1.016x would be 64-bit data types). * * There are fancier algorithms that use divide-and-conquer, * and SIMD tricks, and near zero extra memory, but * this is a good place to start. Fwiw, the bit vector * has an O(nm) time complexity (really 2nm) while the * sans-bit vector algorithms are O(nm log nm). */ template void rect_ipt(T* arr, const size_t sx, const size_t sy) { const size_t sxy = sx * sy; std::vector visited; visited.resize(sxy); visited[0] = true; visited[sxy - 1] = true; const size_t q = sxy - 1; size_t k, next_k; T tmp1, tmp2; for (size_t i = 1; i < q; i++) { if (visited[i]) { continue; } k = i; tmp1 = arr[k]; next_k = sy * k - q * (k / sx); // P(k) while (!visited[next_k]) { tmp2 = arr[next_k]; arr[next_k] = tmp1; tmp1 = tmp2; visited[next_k] = true; k = next_k; next_k = sy * k - q * (k / sx); // P(k) } } } // note: sx == sy == sz... find better convention? // still good for mutliple-dispatch. template void square_ipt( T* arr, const size_t sx, const size_t sy, const size_t sz ) { T tmp = 0; const size_t sxy = sx * sy; const size_t syz = sy * sz; size_t k = 0; size_t next_k = 0; size_t base_k = 0; for (size_t z = 0; z < sz; z++) { for (size_t y = 0; y < sy; y++) { base_k = sx * y + sxy * z; for (size_t x = z; x < sx; x++) { k = x + base_k; next_k = z + sz * y + syz * x; tmp = arr[next_k]; arr[next_k] = arr[k]; arr[k] = tmp; } } } } inline size_t P_3d( const size_t k, const size_t sx, const size_t sy, const size_t sz ) { const size_t sxy = sx * sy; // k = x + sx y + sx sy z size_t z = k / sxy; size_t y = (k - (z * sxy)) / sx; size_t x = k - sx * (y + z * sy); return z + sz * (y + sy * x); } template void rect_ipt( T* arr, const size_t sx, const size_t sy, const size_t sz ) { const size_t sxy = sx * sy; const size_t N = sxy * sz; std::vector visited; visited.resize(N); visited[0] = true; visited[N - 1] = true; size_t k, next_k; T tmp1 = 0, tmp2 = 0; for (size_t i = 1; i < (N - 1); i++) { if (visited[i]) { continue; } k = i; tmp1 = arr[k]; next_k = P_3d(k, sx, sy, sz); while (!visited[next_k]) { tmp2 = arr[next_k]; arr[next_k] = tmp1; tmp1 = tmp2; visited[next_k] = true; k = next_k; next_k = P_3d(k, sx, sy, sz); } } } inline size_t P_4d( const size_t k, const size_t sx, const size_t sy, const size_t sz, const size_t sw ) { const size_t sxy = sx * sy; const size_t sxyz = sxy * sz; // k = x + sx y + sx sy z + sx sy sz w size_t w = k / sxyz; size_t z = (k - w * sxyz) / sxy; size_t y = (k - (w * sxyz) - (z * sxy)) / sx; size_t x = k - (w * sxyz) - (z * sxy) - y * sx; return w + sw * (z + sz * (y + sy * x)); } template void rect_ipt( T* arr, const size_t sx, const size_t sy, const size_t sz, const size_t sw ) { const size_t N = sx * sy * sz * sw; std::vector visited; visited.resize(N); visited[0] = true; visited[N - 1] = true; size_t k, next_k; T tmp1 = 0, tmp2 = 0; for (size_t i = 1; i < (N - 1); i++) { if (visited[i]) { continue; } k = i; tmp1 = arr[k]; next_k = P_4d(k, sx, sy, sz, sw); while (!visited[next_k]) { tmp2 = arr[next_k]; arr[next_k] = tmp1; tmp1 = tmp2; visited[next_k] = true; k = next_k; next_k = P_4d(k, sx, sy, sz, sw); } } } template void ipt(T* arr, const size_t sx) { return; } template void ipt(T* arr, const size_t sx, const size_t sy) { if (sx * sy <= 1) { return; } if (sx == sy) { square_ipt(arr, sx, sy); } else { rect_ipt(arr, sx, sy); } } template void ipt(T* arr, const size_t sx, const size_t sy, const size_t sz) { if (sx * sy * sz <= 1) { return; } if (sx == sy && sy == sz) { square_ipt(arr, sx, sy, sz); } else { rect_ipt(arr, sx, sy, sz); } } template void ipt( T* arr, const size_t sx, const size_t sy, const size_t sz, const size_t sw ) { if (sx * sy * sz * sw <= 1) { return; } rect_ipt(arr, sx, sy, sz, sw); } }; namespace pyipt { template void _ipt2d(T* arr, const size_t sx, const size_t sy) { ipt::ipt(arr, sx, sy); } template void _ipt3d(T* arr, const size_t sx, const size_t sy, const size_t sz) { ipt::ipt(arr, sx, sy, sz); } template void _ipt4d( T* arr, const size_t sx, const size_t sy, const size_t sz, const size_t sw ) { ipt::ipt(arr, sx, sy, sz, sw); } }; #endif