From 8c5786bd5b677f5294be8ffb9fe049c6a1dbd67a Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Wed, 25 Jan 2017 13:36:21 +0100 Subject: [PATCH 1/7] Add the glue code for AFKMC2 --- .travis.yml | 5 ++++- kmcuda.cc | 20 ++++++++++++-------- kmcuda.h | 7 ++++++- kmeans.cu | 6 +++--- private.h | 8 ++++---- python.cc | 34 ++++++++++++++++++++++++++++------ 6 files changed, 57 insertions(+), 23 deletions(-) diff --git a/.travis.yml b/.travis.yml index 47881b8..55f5eff 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,4 +11,7 @@ before_script: - cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-8.0 .. script: - - make -j2 VERBOSE=1 \ No newline at end of file + - make -j2 VERBOSE=1 + +notifications: + email: false \ No newline at end of file diff --git a/kmcuda.cc b/kmcuda.cc index 1642337..afedb6d 100644 --- a/kmcuda.cc +++ b/kmcuda.cc @@ -189,10 +189,10 @@ static KMCUDAResult print_memory_stats(const std::vector &devs) { extern "C" { KMCUDAResult kmeans_init_centroids( - KMCUDAInitMethod method, uint32_t samples_size, uint16_t features_size, - uint32_t clusters_size, KMCUDADistanceMetric metric, uint32_t seed, - const std::vector &devs, int device_ptrs, int fp16x2, int32_t verbosity, - const float *host_centroids, const udevptrs &samples, + KMCUDAInitMethod method, void *init_params, uint32_t samples_size, + uint16_t features_size, uint32_t clusters_size, KMCUDADistanceMetric metric, + uint32_t seed, const std::vector &devs, int device_ptrs, int fp16x2, + int32_t verbosity, const float *host_centroids, const udevptrs &samples, udevptrs *dists, udevptrs *dev_sums, udevptrs *centroids) { srand(seed); switch (method) { @@ -237,7 +237,7 @@ KMCUDAResult kmeans_init_centroids( ); break; } - case kmcudaInitMethodPlusPlus: + case kmcudaInitMethodPlusPlus: { INFO("performing kmeans++...\n"); float smoke = NAN; uint32_t first_offset; @@ -307,13 +307,17 @@ KMCUDAResult kmeans_init_centroids( (j - 1) * features_size, features_size); } break; + } + case kmcudaInitMethodAFKMC2: { + break; + } } INFO("\rdone \n"); return kmcudaSuccess; } KMCUDAResult kmeans_cuda( - KMCUDAInitMethod init, float tolerance, float yinyang_t, + KMCUDAInitMethod init, void *init_params, float tolerance, float yinyang_t, KMCUDADistanceMetric metric, uint32_t samples_size, uint16_t features_size, uint32_t clusters_size, uint32_t seed, uint32_t device, int32_t device_ptrs, int32_t fp16x2, int32_t verbosity, const float *samples, float *centroids, @@ -392,8 +396,8 @@ KMCUDAResult kmeans_cuda( FOR_EACH_DEV(cudaProfilerStart()); #endif RETERR(kmeans_init_centroids( - init, samples_size, features_size, clusters_size, metric, seed, devs, - device_ptrs, fp16x2, verbosity, centroids, device_samples, + init, init_params, samples_size, features_size, clusters_size, metric, + seed, devs, device_ptrs, fp16x2, verbosity, centroids, device_samples, reinterpret_cast*>(&device_assignments), reinterpret_cast*>(&device_assignments_prev), &device_centroids), diff --git a/kmcuda.h b/kmcuda.h index b4a6c01..dbe4f11 100644 --- a/kmcuda.h +++ b/kmcuda.h @@ -15,6 +15,7 @@ enum KMCUDAResult { enum KMCUDAInitMethod { kmcudaInitMethodRandom = 0, kmcudaInitMethodPlusPlus, + kmcudaInitMethodAFKMC2, kmcudaInitMethodImport }; @@ -27,6 +28,10 @@ extern "C" { /// @brief Performs K-means clustering on GPU / CUDA. /// @param init centroids initialization method. +/// @param init_params pointer to a struct / number with centroid initialization +/// parameters. Ignored unless init == kmcudaInitMethodAFKMC2. +/// In case with kmcudaInitMethodAFKMC2 it is expected to be +/// uint32_t* to m; m == 0 means the default value (200). /// @param tolerance if the number of reassignments drop below this ratio, stop. /// @param yinyang_t the relative number of cluster groups, usually 0.1. /// @param metric the distance metric to use. The default is Euclidean (L2), can be @@ -53,7 +58,7 @@ extern "C" { /// the corresponding centroids. If nullptr, not calculated. /// @return KMCUDAResult. KMCUDAResult kmeans_cuda( - KMCUDAInitMethod init, float tolerance, float yinyang_t, + KMCUDAInitMethod init, void *init_params, float tolerance, float yinyang_t, KMCUDADistanceMetric metric, uint32_t samples_size, uint16_t features_size, uint32_t clusters_size, uint32_t seed, uint32_t device, int32_t device_ptrs, int32_t fp16x2, int32_t verbosity, const float *samples, float *centroids, diff --git a/kmeans.cu b/kmeans.cu index 8aa17ec..e81d67a 100644 --- a/kmeans.cu +++ b/kmeans.cu @@ -857,9 +857,9 @@ KMCUDAResult kmeans_cuda_yy( tmpbufs2.emplace_back(tmpbufs.back().get() + h_clusters_size, true); } RETERR(kmeans_init_centroids( - kmcudaInitMethodPlusPlus, h_clusters_size, h_features_size, h_yy_groups_size, - metric, 0, devs, -1, fp16x2, verbosity, nullptr, *centroids, &tmpbufs, drifts_yy, - centroids_yy), + kmcudaInitMethodPlusPlus, nullptr, h_clusters_size, h_features_size, + h_yy_groups_size, metric, 0, devs, -1, fp16x2, verbosity, nullptr, + *centroids, &tmpbufs, drifts_yy, centroids_yy), INFO("kmeans_init_centroids() failed for yinyang groups: %s\n", cudaGetErrorString(cudaGetLastError()))); RETERR(kmeans_cuda_lloyd( diff --git a/private.h b/private.h index 54766e8..820ff19 100644 --- a/private.h +++ b/private.h @@ -262,10 +262,10 @@ KMCUDAResult kmeans_cuda_setup( uint32_t yy_groups_size, const std::vector &devs, int32_t verbosity); KMCUDAResult kmeans_init_centroids( - KMCUDAInitMethod method, uint32_t samples_size, uint16_t features_size, - uint32_t clusters_size, KMCUDADistanceMetric metric, uint32_t seed, - const std::vector &devs, int device_ptrs, int fp16x2, int32_t verbosity, - const float *host_centroids, const udevptrs &samples, + KMCUDAInitMethod method, void *init_params, uint32_t samples_size, + uint16_t features_size, uint32_t clusters_size, KMCUDADistanceMetric metric, + uint32_t seed, const std::vector &devs, int device_ptrs, int fp16x2, + int32_t verbosity, const float *host_centroids, const udevptrs &samples, udevptrs *dists, udevptrs *dev_sums, udevptrs *centroids); KMCUDAResult kmeans_cuda_yy( diff --git a/python.cc b/python.cc index 7674a39..aae22dd 100644 --- a/python.cc +++ b/python.cc @@ -70,6 +70,7 @@ using pyarray = _pyobj; static const std::unordered_map init_methods { {"kmeans++", kmcudaInitMethodPlusPlus}, {"k-means++", kmcudaInitMethodPlusPlus}, + {"afkmc2", kmcudaInitMethodAFKMC2}, {"random", kmcudaInitMethodRandom} }; @@ -168,6 +169,7 @@ static bool get_samples( static PyObject *py_kmeans_cuda(PyObject *self, PyObject *args, PyObject *kwargs) { uint32_t clusters_size = 0, + afkmc2_m = 0, seed = static_cast(time(NULL)), device = 0; int32_t verbosity = 0; @@ -188,19 +190,39 @@ static PyObject *py_kmeans_cuda(PyObject *self, PyObject *args, PyObject *kwargs } KMCUDAInitMethod init; - if (init_obj == Py_None) { - init = kmcudaInitMethodPlusPlus; - } else if (PyUnicode_Check(init_obj)) { - pyobj bytes(PyUnicode_AsASCIIString(init_obj)); + auto set_init = [&init](PyObject *obj) { + pyobj bytes(PyUnicode_AsASCIIString(obj)); auto iminit = init_methods.find(PyBytes_AsString(bytes.get())); if (iminit == init_methods.end()) { PyErr_SetString( PyExc_ValueError, "Unknown centroids initialization method. Supported values are " "\"kmeans++\", \"random\" and ."); - return NULL; + return false; } init = iminit->second; + return true; + }; + + if (init_obj == Py_None) { + init = kmcudaInitMethodPlusPlus; + } else if (PyUnicode_Check(init_obj)) { + if (!set_init(init_obj)) { + return NULL; + } + } else if PyTuple_Check(init_obj) { + auto e1 = PyTuple_GetItem(init_obj, 0); + if (e1 == nullptr || e1 == Py_None) { + PyErr_SetString( + PyExc_ValueError, "centroid initialization method may not be null."); + return NULL; + } + if (!set_init(e1)) { + return NULL; + } + if (PyTuple_Size(init_obj) > 1 && init == kmcudaInitMethodAFKMC2) { + afkmc2_m = PyLong_AsUnsignedLong(PyTuple_GetItem(init_obj, 1)); + } } else { init = kmcudaInitMethodImport; } @@ -345,7 +367,7 @@ static PyObject *py_kmeans_cuda(PyObject *self, PyObject *args, PyObject *kwargs int result; Py_BEGIN_ALLOW_THREADS result = kmeans_cuda( - init, tolerance, yinyang_t, metric, samples_size, + init, &afkmc2_m, tolerance, yinyang_t, metric, samples_size, static_cast(features_size), clusters_size, seed, device, device_ptrs, fp16x2, verbosity, samples, centroids, assignments, adflag? &average_distance : nullptr); From b4d4f1df491e9416f0a96a8d770b04501d5afb2d Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Wed, 25 Jan 2017 18:11:42 +0100 Subject: [PATCH 2/7] Implement q calculation --- CMakeLists.txt | 3 +- kmcuda.cc | 84 +++++++++++++++--- kmcuda.h | 2 +- kmeans.cu | 226 +++++++++++++++++++++++++++++++------------------ knn.cu | 16 +--- private.h | 12 ++- test.py | 8 ++ tricks.cuh | 35 ++++++++ 8 files changed, 273 insertions(+), 113 deletions(-) create mode 100644 tricks.cuh diff --git a/CMakeLists.txt b/CMakeLists.txt index 98abcf3..204272f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,7 +14,7 @@ if (NOT DEFINED CUDA_ARCH) set(CUDA_ARCH "61") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -Wall -Werror -DCUDA_ARCH=${CUDA_ARCH} -std=c++11 ${OpenMP_CXX_FLAGS}") -set(SOURCE_FILES kmcuda.cc kmcuda.h wrappers.h private.h fp_abstraction.h +set(SOURCE_FILES kmcuda.cc kmcuda.h wrappers.h private.h fp_abstraction.h tricks.cuh metric_abstraction.h kmeans.cu knn.cu) if (NOT DISABLE_PYTHON) list(APPEND SOURCE_FILES python.cc) @@ -29,6 +29,7 @@ if (CMAKE_MAJOR_VERSION LESS 4 AND CMAKE_MINOR_VERSION LESS 3) set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std c++11") endif() cuda_add_library(KMCUDA SHARED ${SOURCE_FILES} OPTIONS ${NVCC_FLAGS}) +target_link_libraries(KMCUDA ${CUDA_curand_LIBRARY}) if(PYTHONLIBS_FOUND) include_directories(${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDES}) target_link_libraries(KMCUDA ${PYTHON_LIBRARIES}) diff --git a/kmcuda.cc b/kmcuda.cc index afedb6d..71c34da 100644 --- a/kmcuda.cc +++ b/kmcuda.cc @@ -2,13 +2,14 @@ #include #include #include -#include #include #include #include +#include #include #include +#include #ifdef PROFILE #include #endif @@ -188,12 +189,45 @@ static KMCUDAResult print_memory_stats(const std::vector &devs) { extern "C" { +static const std::map CURAND_ERRORS { + {CURAND_STATUS_SUCCESS, "CURAND_STATUS_SUCCESS"}, + {CURAND_STATUS_VERSION_MISMATCH, "CURAND_STATUS_VERSION_MISMATCH"}, + {CURAND_STATUS_NOT_INITIALIZED, "CURAND_STATUS_NOT_INITIALIZED"}, + {CURAND_STATUS_ALLOCATION_FAILED, "CURAND_STATUS_ALLOCATION_FAILED"}, + {CURAND_STATUS_TYPE_ERROR, "CURAND_STATUS_TYPE_ERROR"}, + {CURAND_STATUS_OUT_OF_RANGE, "CURAND_STATUS_OUT_OF_RANGE"}, + {CURAND_STATUS_LENGTH_NOT_MULTIPLE, "CURAND_STATUS_LENGTH_NOT_MULTIPLE"}, + {CURAND_STATUS_DOUBLE_PRECISION_REQUIRED, "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"}, + {CURAND_STATUS_LAUNCH_FAILURE, "CURAND_STATUS_LAUNCH_FAILURE"}, + {CURAND_STATUS_PREEXISTING_FAILURE, "CURAND_STATUS_PREEXISTING_FAILURE"}, + {CURAND_STATUS_INITIALIZATION_FAILED, "CURAND_STATUS_INITIALIZATION_FAILED"}, + {CURAND_STATUS_ARCH_MISMATCH, "CURAND_STATUS_ARCH_MISMATCH"}, + {CURAND_STATUS_INTERNAL_ERROR, "CURAND_STATUS_INTERNAL_ERROR"} +}; + +#define CURANDCH(cuda_call, ret, ...) \ +do { \ + auto __res = cuda_call; \ + if (__res != CURAND_STATUS_SUCCESS) { \ + DEBUG("%s\n", #cuda_call); \ + INFO("%s:%d -> %s\n", __FILE__, __LINE__, CURAND_ERRORS.find(__res)->second); \ + __VA_ARGS__; \ + return ret; \ + } \ +} while (false) + +class CurandGenerator : public unique_devptr_parent { + public: + explicit CurandGenerator(curandGenerator_t ptr) : unique_devptr_parent( + ptr, [](curandGenerator_t p){ curandDestroyGenerator(p); }) {} +}; + KMCUDAResult kmeans_init_centroids( - KMCUDAInitMethod method, void *init_params, uint32_t samples_size, + KMCUDAInitMethod method, const void *init_params, uint32_t samples_size, uint16_t features_size, uint32_t clusters_size, KMCUDADistanceMetric metric, uint32_t seed, const std::vector &devs, int device_ptrs, int fp16x2, int32_t verbosity, const float *host_centroids, const udevptrs &samples, - udevptrs *dists, udevptrs *dev_sums, udevptrs *centroids) { + udevptrs *dists, udevptrs *centroids) { srand(seed); switch (method) { case kmcudaInitMethodImport: @@ -253,9 +287,9 @@ KMCUDAResult kmeans_init_centroids( printf("kmeans++: dump %" PRIu32 " %" PRIu32 " %p\n", samples_size, features_size, host_dists.get()); FOR_EACH_DEVI( - printf("kmeans++: dev #%d: %p %p %p %p\n", devs[devi], + printf("kmeans++: dev #%d: %p %p %p\n", devs[devi], samples[devi].get(), (*centroids)[devi].get(), - (*dists)[devi].get(), (*dev_sums)[devi].get()); + (*dists)[devi].get()); ); } for (uint32_t i = 1; i < clusters_size; i++) { @@ -267,7 +301,7 @@ KMCUDAResult kmeans_init_centroids( float dist_sum = 0; RETERR(kmeans_cuda_plus_plus( samples_size, features_size, i, metric, devs, fp16x2, verbosity, - samples, centroids, dists, dev_sums, host_dists.get(), &dist_sum), + samples, centroids, dists, host_dists.get(), &dist_sum), DEBUG("\nkmeans_cuda_plus_plus failed\n")); if (dist_sum != dist_sum) { assert(dist_sum == dist_sum); @@ -309,6 +343,33 @@ KMCUDAResult kmeans_init_centroids( break; } case kmcudaInitMethodAFKMC2: { + curandGenerator_t rndgen_; + CURANDCH(curandCreateGenerator(&rndgen_, CURAND_RNG_PSEUDO_DEFAULT), + kmcudaRuntimeError); + CurandGenerator rndgen(rndgen_); + CURANDCH(curandSetPseudoRandomGeneratorSeed(rndgen.get(), seed), + kmcudaRuntimeError); + float smoke = NAN; + uint32_t first_offset; + while (smoke != smoke) { + first_offset = (rand() % samples_size) * features_size; + cudaSetDevice(devs[0]); + CUCH(cudaMemcpy(&smoke, samples[0].get() + first_offset, sizeof(float), + cudaMemcpyDeviceToHost), kmcudaMemoryCopyError); + } + INFO("afkmc2: calculating q (c0 = %" PRIu32 ")... ", + first_offset / features_size); + CUMEMCPY_D2D_ASYNC(*centroids, 0, samples, first_offset, features_size); + auto q = dists; + kmeans_cuda_afkmc2_calc_q( + samples_size, features_size, first_offset / features_size, metric, + devs, fp16x2, verbosity, samples, q); + INFO("done\n"); + + kmeans_init_centroids( + kmcudaInitMethodPlusPlus, init_params, samples_size, features_size, clusters_size, + metric, seed, devs, device_ptrs, fp16x2, verbosity, host_centroids, + samples, dists, centroids); break; } } @@ -317,15 +378,15 @@ KMCUDAResult kmeans_init_centroids( } KMCUDAResult kmeans_cuda( - KMCUDAInitMethod init, void *init_params, float tolerance, float yinyang_t, + KMCUDAInitMethod init, const void *init_params, float tolerance, float yinyang_t, KMCUDADistanceMetric metric, uint32_t samples_size, uint16_t features_size, uint32_t clusters_size, uint32_t seed, uint32_t device, int32_t device_ptrs, int32_t fp16x2, int32_t verbosity, const float *samples, float *centroids, uint32_t *assignments, float *average_distance) { - DEBUG("arguments: %d %.3f %.2f %d %" PRIu32 " %" PRIu16 " %" PRIu32 " %" - PRIu32 " %" PRIu32 " %d %" PRIi32 " %p %p %p %p\n", init, tolerance, - yinyang_t, metric, samples_size, features_size, clusters_size, seed, - device, fp16x2, verbosity, samples, centroids, assignments, + DEBUG("arguments: %d %p %.3f %.2f %d %" PRIu32 " %" PRIu16 " %" PRIu32 " %" + PRIu32 " %" PRIu32 " %d %" PRIi32 " %p %p %p %p\n", init, init_params, + tolerance, yinyang_t, metric, samples_size, features_size, clusters_size, + seed, device, fp16x2, verbosity, samples, centroids, assignments, average_distance); RETERR(check_kmeans_args( tolerance, yinyang_t, samples_size, features_size, clusters_size, @@ -399,7 +460,6 @@ KMCUDAResult kmeans_cuda( init, init_params, samples_size, features_size, clusters_size, metric, seed, devs, device_ptrs, fp16x2, verbosity, centroids, device_samples, reinterpret_cast*>(&device_assignments), - reinterpret_cast*>(&device_assignments_prev), &device_centroids), DEBUG("kmeans_init_centroids failed: %s\n", CUERRSTR())); RETERR(kmeans_cuda_yy( diff --git a/kmcuda.h b/kmcuda.h index dbe4f11..dba461b 100644 --- a/kmcuda.h +++ b/kmcuda.h @@ -58,7 +58,7 @@ extern "C" { /// the corresponding centroids. If nullptr, not calculated. /// @return KMCUDAResult. KMCUDAResult kmeans_cuda( - KMCUDAInitMethod init, void *init_params, float tolerance, float yinyang_t, + KMCUDAInitMethod init, const void *init_params, float tolerance, float yinyang_t, KMCUDADistanceMetric metric, uint32_t samples_size, uint16_t features_size, uint32_t clusters_size, uint32_t seed, uint32_t device, int32_t device_ptrs, int32_t fp16x2, int32_t verbosity, const float *samples, float *centroids, diff --git a/kmeans.cu b/kmeans.cu index e81d67a..1d92f17 100644 --- a/kmeans.cu +++ b/kmeans.cu @@ -7,8 +7,10 @@ #include "private.h" #include "metric_abstraction.h" +#include "tricks.cuh" -#define BS_KMPP 512 +#define BS_KMPP 1024 +#define BS_AFKMC2_Q 512 #define BS_LL_ASS 256 #define BS_LL_CNT 256 #define BS_YY_INI 256 @@ -31,61 +33,68 @@ __constant__ int d_shmem_size; // Device functions //---------------------------------------------------------- //////////////////////---------------------------------------------------------- -// https://devblogs.nvidia.com/parallelforall/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/ -__device__ __forceinline__ uint32_t atomicAggInc(uint32_t *ctr) { - int mask = __ballot(1); - int leader = __ffs(mask) - 1; - uint32_t res; - if ((threadIdx.x % 32) == leader) { - res = atomicAdd(ctr, __popc(mask)); - } - res = __shfl(res, leader); - return res + __popc(mask & ((1 << (threadIdx.x % 32)) - 1)); -} - template __global__ void kmeans_plus_plus( const uint32_t length, const uint32_t cc, const F *__restrict__ samples, const F *__restrict__ centroids, float *__restrict__ dists, - float *__restrict__ dist_sums) { + float *dists_sum) { uint32_t sample = blockIdx.x * blockDim.x + threadIdx.x; - if (sample >= length) { - return; - } - samples += static_cast(sample) * d_features_size; - centroids += (cc - 1) * d_features_size; - extern __shared__ float local_dists[]; float dist = 0; - if (_eq(samples[0], samples[0])) { - dist = METRIC::distance(samples, centroids); - } - float prev_dist; - if (cc == 1 || dist < (prev_dist = dists[sample])) { - dists[sample] = dist; - } else { - dist = prev_dist; + if (sample < length) { + samples += static_cast(sample) * d_features_size; + centroids += (cc - 1) * d_features_size; + if (_eq(samples[0], samples[0])) { + dist = METRIC::distance(samples, centroids); + } + float prev_dist; + if (cc == 1 || dist < (prev_dist = dists[sample])) { + dists[sample] = dist; + } else { + dist = prev_dist; + } } - local_dists[threadIdx.x] = dist; - uint32_t end = blockDim.x; - if ((blockIdx.x + 1) * blockDim.x > length) { - end = length - blockIdx.x * blockDim.x; + dist = warpReduceSum(dist); + if (threadIdx.x % 32 == 0) { + atomicAdd(dists_sum, dist); } - __syncthreads(); - if (threadIdx.x % 16 == 0) { - float psum = 0; - for (uint32_t i = threadIdx.x; i < end && i < threadIdx.x + 16; i++) { - psum += local_dists[i]; +} + +template +__global__ void kmeans_afkmc2_calc_q_dists( + const uint32_t offset, const uint32_t length, uint32_t c1_index, + const F *__restrict__ samples, float *__restrict__ dists, + float *dsum) { + volatile uint64_t sample = blockIdx.x * blockDim.x + threadIdx.x; + float dist = 0; + if (sample < length) { + sample += offset; + extern __shared__ float shmem_afkmc2[]; + auto c1 = reinterpret_cast(shmem_afkmc2); + uint16_t size_each = dupper(d_features_size, static_cast(blockDim.x)); + for (uint16_t i = size_each * threadIdx.x; + i < min(size_each * (threadIdx.x + 1), d_features_size); i++) { + c1[i] = samples[static_cast(c1_index) * d_features_size + i]; } - local_dists[threadIdx.x] = psum; + __syncthreads(); + dist = METRIC::distance(c1, samples + sample * d_features_size); + dist *= dist; + dists[sample] = dist; } - __syncthreads(); - if (threadIdx.x == 0) { - float block_sum = 0; - for (uint32_t i = 0; i < end; i += 16) { - block_sum += local_dists[i]; - } - dist_sums[blockIdx.x] = block_sum; + float sum = warpReduceSum(dist); + if (threadIdx.x % 32 == 0) { + atomicAdd(dsum, sum); + } +} + +__global__ void kmeans_afkmc2_calc_q( + const uint32_t offset, const uint32_t length, + float dsum, float *__restrict__ q) { + volatile uint64_t sample = blockIdx.x * blockDim.x + threadIdx.x; + if (sample >= length) { + return; } + sample += offset; + q[sample] = 1 / (2.f * d_samples_size) + q[sample] / (2 * dsum); } template @@ -554,7 +563,7 @@ __global__ void kmeans_calc_average_distance( uint32_t offset, uint32_t length, const F *__restrict__ samples, const F *__restrict__ centroids, const uint32_t *__restrict__ assignments, float *distance) { - uint64_t sample = blockIdx.x * blockDim.x + threadIdx.x; + volatile uint64_t sample = blockIdx.x * blockDim.x + threadIdx.x; float dist = 0; if (sample < length) { sample += offset; @@ -562,13 +571,8 @@ __global__ void kmeans_calc_average_distance( samples + sample * d_features_size, centroids + assignments[sample] * d_features_size); } - __shared__ float dists[BLOCK_SIZE]; - dists[threadIdx.x] = dist; - if (threadIdx.x == 0) { - float sum = 0; - for (int i = 0; i < BLOCK_SIZE; i++) { - sum += dists[i]; - } + float sum = warpReduceSum(dist); + if (threadIdx.x % 32 == 0) { atomicAdd(distance, sum); } } @@ -658,8 +662,7 @@ KMCUDAResult kmeans_cuda_plus_plus( uint32_t h_samples_size, uint32_t h_features_size, uint32_t cc, KMCUDADistanceMetric metric, const std::vector &devs, int fp16x2, int verbosity, const udevptrs &samples, udevptrs *centroids, - udevptrs *dists, udevptrs *dev_sums, float *host_dists, - float *dist_sum) { + udevptrs *dists, float *host_dists, float *dist_sum) { auto plan = distribute(h_samples_size, h_features_size * sizeof(float), devs); uint32_t max_len = 0; for (auto &p : plan) { @@ -668,11 +671,9 @@ KMCUDAResult kmeans_cuda_plus_plus( max_len = len; } } - CUMEMSET(*dev_sums, 0, upper(max_len, static_cast(BS_KMPP))); - size_t dist_sums_size = h_samples_size / BS_KMPP + devs.size(); - std::unique_ptr dist_sums(new float[dist_sums_size]); - memset(dist_sums.get(), 0, dist_sums_size * sizeof(float)); - + udevptrs dev_dists; + CUMALLOC(dev_dists, sizeof(float)); + CUMEMSET_ASYNC(dev_dists, 0, sizeof(float)); FOR_EACH_DEVI( uint32_t offset, length; std::tie(offset, length) = plan[devi]; @@ -681,11 +682,11 @@ KMCUDAResult kmeans_cuda_plus_plus( } dim3 block(BS_KMPP, 1, 1); dim3 grid(upper(length, block.x), 1, 1); - KERNEL_SWITCH(kmeans_plus_plus, <<>>( + KERNEL_SWITCH(kmeans_plus_plus, <<>>( length, cc, reinterpret_cast(samples[devi].get() + offset * h_features_size), reinterpret_cast((*centroids)[devi].get()), - (*dists)[devi].get(), (*dev_sums)[devi].get())); + (*dists)[devi].get(), dev_dists[devi].get())); ); uint32_t dist_offset = 0; FOR_EACH_DEVI( @@ -696,18 +697,78 @@ KMCUDAResult kmeans_cuda_plus_plus( CUCH(cudaMemcpyAsync( host_dists + offset, (*dists)[devi].get(), length * sizeof(float), cudaMemcpyDeviceToHost), kmcudaMemoryCopyError); - CUCH(cudaMemcpyAsync( - dist_sums.get() + dist_offset, (*dev_sums)[devi].get(), - grid.x * sizeof(float), cudaMemcpyDeviceToHost), kmcudaMemoryCopyError); dist_offset += grid.x; ); - SYNC_ALL_DEVS; - double ds = 0; - #pragma omp simd reduction(+:ds) - for (uint32_t i = 0; i < dist_offset; i++) { - ds += dist_sums[i]; - } - *dist_sum = ds; + float sum = 0; + FOR_EACH_DEVI( + if (std::get<1>(plan[devi]) == 0) { + continue; + } + float hdist; + CUCH(cudaMemcpy(&hdist, dev_dists[devi].get(), sizeof(float), + cudaMemcpyDeviceToHost), + kmcudaMemoryCopyError); + sum += hdist; + ); + *dist_sum = sum; + return kmcudaSuccess; +} + +KMCUDAResult kmeans_cuda_afkmc2_calc_q( + uint32_t h_samples_size, uint32_t h_features_size, uint32_t firstc, + KMCUDADistanceMetric metric, const std::vector &devs, int fp16x2, + int verbosity, const udevptrs &samples, udevptrs *q) { + auto plan = distribute(h_samples_size, h_features_size * sizeof(float), devs); + udevptrs dev_dists; + CUMALLOC(dev_dists, sizeof(float)); + CUMEMSET_ASYNC(dev_dists, 0, sizeof(float)); + FOR_EACH_DEVI( + uint32_t offset, length; + std::tie(offset, length) = plan[devi]; + if (length == 0) { + continue; + } + dim3 block(BS_AFKMC2_Q, 1, 1); + dim3 grid(upper(length, block.x), 1, 1); + int shmem = std::max( + BS_AFKMC2_Q, static_cast(h_features_size)) * sizeof(float); + KERNEL_SWITCH(kmeans_afkmc2_calc_q_dists, + <<>>( + offset, length, firstc, + reinterpret_cast(samples[devi].get()), + (*q)[devi].get(), dev_dists[devi].get())); + + ); + float dists_sum = 0; + FOR_EACH_DEVI( + if (std::get<1>(plan[devi]) == 0) { + continue; + } + float hdist; + CUCH(cudaMemcpy(&hdist, dev_dists[devi].get(), sizeof(float), + cudaMemcpyDeviceToHost), + kmcudaMemoryCopyError); + dists_sum += hdist; + ); + FOR_EACH_DEVI( + uint32_t offset, length; + std::tie(offset, length) = plan[devi]; + if (length == 0) { + continue; + } + dim3 block(BLOCK_SIZE, 1, 1); + dim3 grid(upper(length, block.x), 1, 1); + kmeans_afkmc2_calc_q<<>>( + offset, length, dists_sum, (*q)[devi].get()); + ); + FOR_EACH_DEVI( + uint32_t offset, length; + std::tie(offset, length) = plan[devi]; + FOR_OTHER_DEVS( + CUP2P(q, offset, length); + ); + ); + return kmcudaSuccess; } @@ -859,7 +920,7 @@ KMCUDAResult kmeans_cuda_yy( RETERR(kmeans_init_centroids( kmcudaInitMethodPlusPlus, nullptr, h_clusters_size, h_features_size, h_yy_groups_size, metric, 0, devs, -1, fp16x2, verbosity, nullptr, - *centroids, &tmpbufs, drifts_yy, centroids_yy), + *centroids, &tmpbufs, centroids_yy), INFO("kmeans_init_centroids() failed for yinyang groups: %s\n", cudaGetErrorString(cudaGetLastError()))); RETERR(kmeans_cuda_lloyd( @@ -1046,25 +1107,28 @@ KMCUDAResult kmeans_cuda_calc_average_distance( float *average_distance) { INFO("calculating the average distance...\n"); auto plans = distribute(h_samples_size, h_features_size * sizeof(float), devs); - float *dev_dists[devs.size()]; + udevptrs dev_dists; + CUMALLOC(dev_dists, sizeof(float)); + CUMEMSET_ASYNC(dev_dists, 0, sizeof(float)); FOR_EACH_DEVI( uint32_t offset, length; std::tie(offset, length) = plans[devi]; - CUCH(cudaMalloc(&dev_dists[devi], sizeof(float)), - kmcudaMemoryAllocationFailure); - CUCH(cudaMemsetAsync(dev_dists[devi], 0, sizeof(float)), - kmcudaRuntimeError); + if (length == 0) { + continue; + } dim3 block(BLOCK_SIZE, 1, 1); dim3 grid(upper(length, block.x), 1, 1); - KERNEL_SWITCH(kmeans_calc_average_distance, <<>>( + KERNEL_SWITCH(kmeans_calc_average_distance, + <<>>( offset, length, reinterpret_cast(samples[devi].get()), reinterpret_cast(centroids[devi].get()), - assignments[devi].get(), dev_dists[devi])); + assignments[devi].get(), dev_dists[devi].get())); ); float sum = 0; FOR_EACH_DEVI( float hdist; - CUCH(cudaMemcpy(&hdist, dev_dists[devi], sizeof(float), cudaMemcpyDeviceToHost), + CUCH(cudaMemcpy(&hdist, dev_dists[devi].get(), sizeof(float), + cudaMemcpyDeviceToHost), kmcudaMemoryCopyError); sum += hdist; ); diff --git a/knn.cu b/knn.cu index 3fcae6a..90a1c5d 100644 --- a/knn.cu +++ b/knn.cu @@ -2,6 +2,7 @@ #include "private.h" #include "metric_abstraction.h" +#include "tricks.cuh" #define CLUSTER_DISTANCES_BLOCK_SIZE 512 #define CLUSTER_DISTANCES_SHMEM 12288 // in float-s @@ -14,20 +15,6 @@ __constant__ uint32_t d_samples_size; __constant__ uint32_t d_clusters_size; __device__ unsigned long long int d_dists_calced; -template -FPATTR T dupper(T size, T each) { - T div = size / each; - if (div * each == size) { - return div; - } - return div + 1; -} - -template -FPATTR T dmin(T a, T b) { - return a <= b? a : b; -} - /// sample_dists musr be zero-ed! template __global__ void knn_calc_cluster_radiuses( @@ -90,6 +77,7 @@ __global__ void knn_calc_cluster_distances( // stage 1 - accumulate distances for (uint16_t fpos = 0; fpos < d_features_size; fpos += fstep) { + __syncthreads(); const uint16_t fsize = dmin( fstep, static_cast(d_features_size - fpos)); uint32_t cbase = x * bs + threadIdx.x; diff --git a/private.h b/private.h index 820ff19..fa00988 100644 --- a/private.h +++ b/private.h @@ -254,19 +254,23 @@ KMCUDAResult kmeans_cuda_plus_plus( uint32_t samples_size, uint32_t features_size, uint32_t cc, KMCUDADistanceMetric metric, const std::vector &devs, int fp16x2, int verbosity, const udevptrs &samples, udevptrs *centroids, - udevptrs *dists, udevptrs *dev_sums, float *host_dists, - float *dists_sum); + udevptrs *dists, float *host_dists, float *dists_sum); + +KMCUDAResult kmeans_cuda_afkmc2_calc_q( + uint32_t samples_size, uint32_t features_size, uint32_t firstc, + KMCUDADistanceMetric metric, const std::vector &devs, int fp16x2, + int verbosity, const udevptrs &samples, udevptrs *q); KMCUDAResult kmeans_cuda_setup( uint32_t samples_size, uint16_t features_size, uint32_t clusters_size, uint32_t yy_groups_size, const std::vector &devs, int32_t verbosity); KMCUDAResult kmeans_init_centroids( - KMCUDAInitMethod method, void *init_params, uint32_t samples_size, + KMCUDAInitMethod method, const void *init_params, uint32_t samples_size, uint16_t features_size, uint32_t clusters_size, KMCUDADistanceMetric metric, uint32_t seed, const std::vector &devs, int device_ptrs, int fp16x2, int32_t verbosity, const float *host_centroids, const udevptrs &samples, - udevptrs *dists, udevptrs *dev_sums, udevptrs *centroids); + udevptrs *dists, udevptrs *centroids); KMCUDAResult kmeans_cuda_yy( float tolerance, uint32_t yy_groups_size, uint32_t samples_size, diff --git a/test.py b/test.py index bd270d9..2e37e35 100644 --- a/test.py +++ b/test.py @@ -236,6 +236,14 @@ def test_import_lloyd(self): self.assertEqual(self._get_iters_number(self.stdout), 8) self._validate(centroids, assignments, 0.05) + def test_afkmc2_lloyd(self): + with self.stdout: + centroids, assignments = kmeans_cuda( + self.samples, 50, init=("afkmc2", 200), device=0, + verbosity=2, seed=3, tolerance=0.05, yinyang_t=0) + self.assertEqual(self._get_iters_number(self.stdout), 4) + self._validate(centroids, assignments, 0.05) + def test_random_lloyd_2gpus(self): with self.stdout: centroids, assignments = kmeans_cuda( diff --git a/tricks.cuh b/tricks.cuh new file mode 100644 index 0000000..efd50ae --- /dev/null +++ b/tricks.cuh @@ -0,0 +1,35 @@ +template +__device__ __forceinline__ T dupper(T size, T each) { + T div = size / each; + if (div * each == size) { + return div; + } + return div + 1; +} + +template +__device__ __forceinline__ T dmin(T a, T b) { + return a <= b? a : b; +} + +// https://devblogs.nvidia.com/parallelforall/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/ +__device__ __forceinline__ uint32_t atomicAggInc(uint32_t *ctr) { + int mask = __ballot(1); + int leader = __ffs(mask) - 1; + uint32_t res; + if ((threadIdx.x % 32) == leader) { + res = atomicAdd(ctr, __popc(mask)); + } + res = __shfl(res, leader); + return res + __popc(mask & ((1 << (threadIdx.x % 32)) - 1)); +} + +// https://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/ +template +__device__ __forceinline__ T warpReduceSum(T val) { + #pragma unroll + for (int offset = 16; offset > 0; offset /= 2) { + val += __shfl_down(val, offset); + } + return val; +} \ No newline at end of file From dc53fbf3b70cf5e2699ff62122a1d2e749edf61c Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Wed, 25 Jan 2017 19:15:36 +0100 Subject: [PATCH 3/7] Add the random step --- kmeans.cu | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++- knn.cu | 6 +++--- tricks.cuh | 4 ++-- 3 files changed, 65 insertions(+), 6 deletions(-) diff --git a/kmeans.cu b/kmeans.cu index 1d92f17..f0160f4 100644 --- a/kmeans.cu +++ b/kmeans.cu @@ -5,6 +5,8 @@ #include #include +#include + #include "private.h" #include "metric_abstraction.h" #include "tricks.cuh" @@ -17,6 +19,7 @@ #define BS_YY_GFL 512 #define BS_YY_LFL 512 #define BLOCK_SIZE 1024 // for all the rest of the kernels +#define SHMEM_AFKMC2_RC 8191 // in float-s, the actual value is +1 #define YINYANG_GROUP_TOLERANCE 0.02 #define YINYANG_DRAFT_REASSIGNMENTS 0.11 @@ -97,6 +100,56 @@ __global__ void kmeans_afkmc2_calc_q( q[sample] = 1 / (2.f * d_samples_size) + q[sample] / (2 * dsum); } +__global__ void kmeans_cuda_afkmc2_random_step( + const uint32_t length, const uint64_t seed, const uint64_t seq, + const float *__restrict__ q, uint32_t *__restrict__ choices, + float *__restrict__ samples) { + volatile uint32_t ti = blockIdx.x * blockDim.x + threadIdx.x; + curandState_t state; + curand_init(seed, ti, seq, &state); + float part = curand_uniform(&state); + if (ti < length) { + samples[ti] = curand_uniform(&state); + } + float accum = 0, corr = 0; + bool found = false; + __shared__ float shared_q[SHMEM_AFKMC2_RC + 1]; + int32_t *all_found = reinterpret_cast(shared_q + SHMEM_AFKMC2_RC); + const uint32_t size_each = dupper( + static_cast(SHMEM_AFKMC2_RC), blockDim.x); + for (uint32_t sample = 0; sample < d_samples_size; sample += SHMEM_AFKMC2_RC) { + __syncthreads(); + if (*all_found == 0) { + return; + } + for (uint32_t i = 0; + i < size_each && threadIdx.x * size_each + i < SHMEM_AFKMC2_RC; + i++) { + shared_q[threadIdx.x * size_each + i] = q[sample + threadIdx.x * size_each + i]; + } + __syncthreads(); + if (!found) { + int i = 0; + #pragma unroll 4 + for (; i < SHMEM_AFKMC2_RC && accum < part; i++) { + // Kahan summation with inverted c + float v = shared_q[i++]; + float y = _add(corr, v); + float t = accum + y; + corr = y - (t - accum); + accum = t; + } + if (accum >= part) { + if (ti < length) { + choices[ti] = sample + i; + } + found = true; + atomicSub(all_found, 1); + } + } + } +} + template __global__ void kmeans_assign_lloyd_smallc( const uint32_t length, const F *__restrict__ samples, @@ -768,10 +821,16 @@ KMCUDAResult kmeans_cuda_afkmc2_calc_q( CUP2P(q, offset, length); ); ); - return kmcudaSuccess; } +/*void func() { + kmeans_cuda_afkmc2_random_step( + const uint32_t length, const uint64_t seed, const uint64_t seq, + const float *__restrict__ q, uint32_t *__restrict__ choices, + float *__restrict__ samples) +}*/ + KMCUDAResult kmeans_cuda_lloyd( float tolerance, uint32_t h_samples_size, uint32_t h_clusters_size, uint16_t h_features_size, KMCUDADistanceMetric metric, bool resume, diff --git a/knn.cu b/knn.cu index 90a1c5d..79d811e 100644 --- a/knn.cu +++ b/knn.cu @@ -30,11 +30,11 @@ __global__ void knn_calc_cluster_radiuses( // stage 1 - accumulate partial distances for every sample __shared__ F shcents[CLUSTER_RADIUSES_SHMEM]; - volatile const int cent_step = dmin( + volatile const int cent_step = min( CLUSTER_RADIUSES_SHMEM / blockDim.x, static_cast(d_features_size)); F *volatile const my_cent = shcents + cent_step * threadIdx.x; for (int cfi = 0; cfi < d_features_size; cfi += cent_step) { - const int fsize = dmin(cent_step, d_features_size - cfi); + const int fsize = min(cent_step, d_features_size - cfi); for (int f = 0; f < fsize; f++) { my_cent[f] = centroids[ci * d_features_size + cfi + f]; } @@ -78,7 +78,7 @@ __global__ void knn_calc_cluster_distances( // stage 1 - accumulate distances for (uint16_t fpos = 0; fpos < d_features_size; fpos += fstep) { __syncthreads(); - const uint16_t fsize = dmin( + const uint16_t fsize = min( fstep, static_cast(d_features_size - fpos)); uint32_t cbase = x * bs + threadIdx.x; if (cbase < d_clusters_size) { diff --git a/tricks.cuh b/tricks.cuh index efd50ae..19f6a9d 100644 --- a/tricks.cuh +++ b/tricks.cuh @@ -7,10 +7,10 @@ __device__ __forceinline__ T dupper(T size, T each) { return div + 1; } -template +/*template __device__ __forceinline__ T dmin(T a, T b) { return a <= b? a : b; -} +}*/ // https://devblogs.nvidia.com/parallelforall/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/ __device__ __forceinline__ uint32_t atomicAggInc(uint32_t *ctr) { From c98f4502996c6e9333d50bf581b319b38c7c792c Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Wed, 25 Jan 2017 21:07:03 +0100 Subject: [PATCH 4/7] Add two more kernels --- kmcuda.cc | 69 +++++++++++++++++++------------------------------------ kmeans.cu | 69 ++++++++++++++++++++++++++++++++++++++++++++++--------- private.h | 11 ++++++++- 3 files changed, 91 insertions(+), 58 deletions(-) diff --git a/kmcuda.cc b/kmcuda.cc index 71c34da..91326a9 100644 --- a/kmcuda.cc +++ b/kmcuda.cc @@ -9,7 +9,6 @@ #include #include -#include #ifdef PROFILE #include #endif @@ -189,45 +188,12 @@ static KMCUDAResult print_memory_stats(const std::vector &devs) { extern "C" { -static const std::map CURAND_ERRORS { - {CURAND_STATUS_SUCCESS, "CURAND_STATUS_SUCCESS"}, - {CURAND_STATUS_VERSION_MISMATCH, "CURAND_STATUS_VERSION_MISMATCH"}, - {CURAND_STATUS_NOT_INITIALIZED, "CURAND_STATUS_NOT_INITIALIZED"}, - {CURAND_STATUS_ALLOCATION_FAILED, "CURAND_STATUS_ALLOCATION_FAILED"}, - {CURAND_STATUS_TYPE_ERROR, "CURAND_STATUS_TYPE_ERROR"}, - {CURAND_STATUS_OUT_OF_RANGE, "CURAND_STATUS_OUT_OF_RANGE"}, - {CURAND_STATUS_LENGTH_NOT_MULTIPLE, "CURAND_STATUS_LENGTH_NOT_MULTIPLE"}, - {CURAND_STATUS_DOUBLE_PRECISION_REQUIRED, "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"}, - {CURAND_STATUS_LAUNCH_FAILURE, "CURAND_STATUS_LAUNCH_FAILURE"}, - {CURAND_STATUS_PREEXISTING_FAILURE, "CURAND_STATUS_PREEXISTING_FAILURE"}, - {CURAND_STATUS_INITIALIZATION_FAILED, "CURAND_STATUS_INITIALIZATION_FAILED"}, - {CURAND_STATUS_ARCH_MISMATCH, "CURAND_STATUS_ARCH_MISMATCH"}, - {CURAND_STATUS_INTERNAL_ERROR, "CURAND_STATUS_INTERNAL_ERROR"} -}; - -#define CURANDCH(cuda_call, ret, ...) \ -do { \ - auto __res = cuda_call; \ - if (__res != CURAND_STATUS_SUCCESS) { \ - DEBUG("%s\n", #cuda_call); \ - INFO("%s:%d -> %s\n", __FILE__, __LINE__, CURAND_ERRORS.find(__res)->second); \ - __VA_ARGS__; \ - return ret; \ - } \ -} while (false) - -class CurandGenerator : public unique_devptr_parent { - public: - explicit CurandGenerator(curandGenerator_t ptr) : unique_devptr_parent( - ptr, [](curandGenerator_t p){ curandDestroyGenerator(p); }) {} -}; - KMCUDAResult kmeans_init_centroids( KMCUDAInitMethod method, const void *init_params, uint32_t samples_size, uint16_t features_size, uint32_t clusters_size, KMCUDADistanceMetric metric, uint32_t seed, const std::vector &devs, int device_ptrs, int fp16x2, int32_t verbosity, const float *host_centroids, const udevptrs &samples, - udevptrs *dists, udevptrs *centroids) { + udevptrs *dists, udevptrs *aux, udevptrs *centroids) { srand(seed); switch (method) { case kmcudaInitMethodImport: @@ -343,12 +309,14 @@ KMCUDAResult kmeans_init_centroids( break; } case kmcudaInitMethodAFKMC2: { - curandGenerator_t rndgen_; - CURANDCH(curandCreateGenerator(&rndgen_, CURAND_RNG_PSEUDO_DEFAULT), - kmcudaRuntimeError); - CurandGenerator rndgen(rndgen_); - CURANDCH(curandSetPseudoRandomGeneratorSeed(rndgen.get(), seed), - kmcudaRuntimeError); + uint32_t m = *reinterpret_cast(init_params); + if (m == 0) { + m = 200; + } else if (m > samples_size / 2) { + INFO("afkmc2: m > %" PRIu32 " is not supported (got %" PRIu32 ")\n", + samples_size / 2, m); + return kmcudaInvalidArguments; + } float smoke = NAN; uint32_t first_offset; while (smoke != smoke) { @@ -365,11 +333,19 @@ KMCUDAResult kmeans_init_centroids( samples_size, features_size, first_offset / features_size, metric, devs, fp16x2, verbosity, samples, q); INFO("done\n"); - - kmeans_init_centroids( - kmcudaInitMethodPlusPlus, init_params, samples_size, features_size, clusters_size, - metric, seed, devs, device_ptrs, fp16x2, verbosity, host_centroids, - samples, dists, centroids); + auto choices = std::unique_ptr(new uint32_t[m]); + auto random_samples = std::unique_ptr(new float[m]); + auto min_dists = std::unique_ptr(new float[m]); + for (uint32_t k = 1; k < clusters_size; k++) { + RETERR(kmeans_cuda_afkmc2_random_step( + k, m, seed, verbosity, q->back().get(), + reinterpret_cast(aux->back().get()), + choices.get(), aux->back().get() + m, random_samples.get())); + RETERR(kmeans_cuda_afkmc2_min_dist( + m, k, metric, fp16x2, verbosity, samples.back().get(), + reinterpret_cast(aux->back().get()), + centroids->back().get(), aux->back().get() + m, min_dists.get())); + } break; } } @@ -460,6 +436,7 @@ KMCUDAResult kmeans_cuda( init, init_params, samples_size, features_size, clusters_size, metric, seed, devs, device_ptrs, fp16x2, verbosity, centroids, device_samples, reinterpret_cast*>(&device_assignments), + reinterpret_cast*>(&device_assignments_prev), &device_centroids), DEBUG("kmeans_init_centroids failed: %s\n", CUERRSTR())); RETERR(kmeans_cuda_yy( diff --git a/kmeans.cu b/kmeans.cu index f0160f4..0df49ea 100644 --- a/kmeans.cu +++ b/kmeans.cu @@ -13,6 +13,7 @@ #define BS_KMPP 1024 #define BS_AFKMC2_Q 512 +#define BS_AFKMC2_R 512 #define BS_LL_ASS 256 #define BS_LL_CNT 256 #define BS_YY_INI 256 @@ -100,15 +101,15 @@ __global__ void kmeans_afkmc2_calc_q( q[sample] = 1 / (2.f * d_samples_size) + q[sample] / (2 * dsum); } -__global__ void kmeans_cuda_afkmc2_random_step( - const uint32_t length, const uint64_t seed, const uint64_t seq, +__global__ void kmeans_afkmc2_random_step( + const uint32_t m, const uint64_t seed, const uint64_t seq, const float *__restrict__ q, uint32_t *__restrict__ choices, float *__restrict__ samples) { volatile uint32_t ti = blockIdx.x * blockDim.x + threadIdx.x; curandState_t state; curand_init(seed, ti, seq, &state); float part = curand_uniform(&state); - if (ti < length) { + if (ti < m) { samples[ti] = curand_uniform(&state); } float accum = 0, corr = 0; @@ -140,7 +141,7 @@ __global__ void kmeans_cuda_afkmc2_random_step( accum = t; } if (accum >= part) { - if (ti < length) { + if (ti < m) { choices[ti] = sample + i; } found = true; @@ -150,6 +151,27 @@ __global__ void kmeans_cuda_afkmc2_random_step( } } +template +__global__ void kmeans_afkmc2_min_dist( + const uint32_t m, const uint32_t k, const F *__restrict__ samples, + const uint32_t *__restrict__ choices, const F *__restrict__ centroids, + float *__restrict__ min_dists) { + uint32_t chi = blockIdx.x * blockDim.x + threadIdx.x; + if (chi >= m) { + return; + } + float min_dist = FLT_MAX; + for (uint32_t c = 0; c < k; c++) { + float dist = METRIC::distance( + samples + static_cast(choices[chi]) * d_features_size, + centroids + c * d_features_size); + if (dist < min_dist) { + min_dist = dist; + } + } + min_dists[chi] = min_dist * min_dist; +} + template __global__ void kmeans_assign_lloyd_smallc( const uint32_t length, const F *__restrict__ samples, @@ -824,12 +846,37 @@ KMCUDAResult kmeans_cuda_afkmc2_calc_q( return kmcudaSuccess; } -/*void func() { - kmeans_cuda_afkmc2_random_step( - const uint32_t length, const uint64_t seed, const uint64_t seq, - const float *__restrict__ q, uint32_t *__restrict__ choices, - float *__restrict__ samples) -}*/ +KMCUDAResult kmeans_cuda_afkmc2_random_step( + uint32_t k, uint32_t m, uint64_t seed, int verbosity, const float *q, + uint32_t *d_choices, uint32_t *h_choices, float *d_samples, float *h_samples) { + dim3 block(BS_AFKMC2_R, 1, 1); + dim3 grid(upper(m, block.x), 1, 1); + int shmem = (SHMEM_AFKMC2_RC + 1) * sizeof(float); + kmeans_afkmc2_random_step<<>>( + m, seed, k, q, d_choices, d_samples); + CUCH(cudaMemcpy(h_choices, d_choices, m * sizeof(uint32_t), + cudaMemcpyDeviceToHost), + kmcudaMemoryCopyError); + CUCH(cudaMemcpy(h_samples, d_samples, m * sizeof(float), + cudaMemcpyDeviceToHost), + kmcudaMemoryCopyError); + return kmcudaSuccess; +} + +KMCUDAResult kmeans_cuda_afkmc2_min_dist( + const uint32_t m, const uint32_t k, KMCUDADistanceMetric metric, + int fp16x2, int32_t verbosity, const float *samples, const uint32_t *choices, + const float *centroids, float *d_min_dists, float *h_min_dists) { + dim3 block(BLOCK_SIZE, 1, 1); + dim3 grid(upper(m, block.x), 1, 1); + KERNEL_SWITCH(kmeans_afkmc2_min_dist, <<>>( + m, k, reinterpret_cast(samples), choices, + reinterpret_cast(centroids), d_min_dists)); + CUCH(cudaMemcpy(h_min_dists, d_min_dists, m * sizeof(float), + cudaMemcpyDeviceToHost), + kmcudaMemoryCopyError); + return kmcudaSuccess; +} KMCUDAResult kmeans_cuda_lloyd( float tolerance, uint32_t h_samples_size, uint32_t h_clusters_size, @@ -979,7 +1026,7 @@ KMCUDAResult kmeans_cuda_yy( RETERR(kmeans_init_centroids( kmcudaInitMethodPlusPlus, nullptr, h_clusters_size, h_features_size, h_yy_groups_size, metric, 0, devs, -1, fp16x2, verbosity, nullptr, - *centroids, &tmpbufs, centroids_yy), + *centroids, &tmpbufs, nullptr, centroids_yy), INFO("kmeans_init_centroids() failed for yinyang groups: %s\n", cudaGetErrorString(cudaGetLastError()))); RETERR(kmeans_cuda_lloyd( diff --git a/private.h b/private.h index fa00988..d0bf907 100644 --- a/private.h +++ b/private.h @@ -261,6 +261,15 @@ KMCUDAResult kmeans_cuda_afkmc2_calc_q( KMCUDADistanceMetric metric, const std::vector &devs, int fp16x2, int verbosity, const udevptrs &samples, udevptrs *q); +KMCUDAResult kmeans_cuda_afkmc2_random_step( + uint32_t k, uint32_t m, uint64_t seed, int verbosity, const float *q, + uint32_t *d_choices, uint32_t *h_choices, float *d_samples, float *h_samples); + +KMCUDAResult kmeans_cuda_afkmc2_min_dist( + const uint32_t m, const uint32_t k, KMCUDADistanceMetric metric, + int fp16x2, int32_t verbosity, const float *samples, const uint32_t *choices, + const float *centroids, float *d_min_dists, float *h_min_dists); + KMCUDAResult kmeans_cuda_setup( uint32_t samples_size, uint16_t features_size, uint32_t clusters_size, uint32_t yy_groups_size, const std::vector &devs, int32_t verbosity); @@ -270,7 +279,7 @@ KMCUDAResult kmeans_init_centroids( uint16_t features_size, uint32_t clusters_size, KMCUDADistanceMetric metric, uint32_t seed, const std::vector &devs, int device_ptrs, int fp16x2, int32_t verbosity, const float *host_centroids, const udevptrs &samples, - udevptrs *dists, udevptrs *centroids); + udevptrs *dists, udevptrs *aux, udevptrs *centroids); KMCUDAResult kmeans_cuda_yy( float tolerance, uint32_t yy_groups_size, uint32_t samples_size, From 6eddbf596c7d9961ce7e452841e249600f051670 Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Thu, 26 Jan 2017 18:45:58 +0100 Subject: [PATCH 5/7] Pass the smoke test for afkmc2 --- kmcuda.cc | 35 ++++++++++++++++++++++++++--------- kmeans.cu | 35 ++++++++++++++++++++--------------- private.h | 7 ++++--- 3 files changed, 50 insertions(+), 27 deletions(-) diff --git a/kmcuda.cc b/kmcuda.cc index 91326a9..a132ffe 100644 --- a/kmcuda.cc +++ b/kmcuda.cc @@ -328,23 +328,40 @@ KMCUDAResult kmeans_init_centroids( INFO("afkmc2: calculating q (c0 = %" PRIu32 ")... ", first_offset / features_size); CUMEMCPY_D2D_ASYNC(*centroids, 0, samples, first_offset, features_size); - auto q = dists; + auto q = std::unique_ptr(new float[samples_size]); kmeans_cuda_afkmc2_calc_q( samples_size, features_size, first_offset / features_size, metric, - devs, fp16x2, verbosity, samples, q); + devs, fp16x2, verbosity, samples, dists, q.get()); INFO("done\n"); - auto choices = std::unique_ptr(new uint32_t[m]); - auto random_samples = std::unique_ptr(new float[m]); - auto min_dists = std::unique_ptr(new float[m]); + auto cand_ind = std::unique_ptr(new uint32_t[m]); + auto rand_a = std::unique_ptr(new float[m]); + auto p_cand = std::unique_ptr(new float[m]); for (uint32_t k = 1; k < clusters_size; k++) { + if (verbosity > 1 || (verbosity > 0 && ( + clusters_size < 100 || k % (clusters_size / 100) == 0))) { + printf("\rstep %d", k); + fflush(stdout); + } RETERR(kmeans_cuda_afkmc2_random_step( - k, m, seed, verbosity, q->back().get(), + k, m, seed, verbosity, dists->back().get(), reinterpret_cast(aux->back().get()), - choices.get(), aux->back().get() + m, random_samples.get())); + cand_ind.get(), aux->back().get() + m, rand_a.get())); RETERR(kmeans_cuda_afkmc2_min_dist( - m, k, metric, fp16x2, verbosity, samples.back().get(), + k, m, metric, fp16x2, verbosity, samples.back().get(), reinterpret_cast(aux->back().get()), - centroids->back().get(), aux->back().get() + m, min_dists.get())); + centroids->back().get(), aux->back().get() + m, p_cand.get())); + float curr_prob = 0; + uint32_t curr_ind = 0; + for (uint32_t j = 0; j < m; j++) { + auto cand_prob = p_cand[j] / q[cand_ind[j]]; + if (curr_prob == 0 || cand_prob / curr_prob > rand_a[j]) { + curr_ind = j; + curr_prob = cand_prob; + } + } + CUMEMCPY_D2D_ASYNC(*centroids, k * features_size, + samples, cand_ind[curr_ind] * features_size, + features_size); } break; } diff --git a/kmeans.cu b/kmeans.cu index 0df49ea..b2753b8 100644 --- a/kmeans.cu +++ b/kmeans.cu @@ -116,6 +116,7 @@ __global__ void kmeans_afkmc2_random_step( bool found = false; __shared__ float shared_q[SHMEM_AFKMC2_RC + 1]; int32_t *all_found = reinterpret_cast(shared_q + SHMEM_AFKMC2_RC); + *all_found = blockDim.x; const uint32_t size_each = dupper( static_cast(SHMEM_AFKMC2_RC), blockDim.x); for (uint32_t sample = 0; sample < d_samples_size; sample += SHMEM_AFKMC2_RC) { @@ -123,26 +124,26 @@ __global__ void kmeans_afkmc2_random_step( if (*all_found == 0) { return; } - for (uint32_t i = 0; - i < size_each && threadIdx.x * size_each + i < SHMEM_AFKMC2_RC; + for (uint32_t i = 0, si = threadIdx.x * size_each; + i < size_each && (si = threadIdx.x * size_each + i) < SHMEM_AFKMC2_RC; i++) { - shared_q[threadIdx.x * size_each + i] = q[sample + threadIdx.x * size_each + i]; + shared_q[si] = q[sample + si]; } __syncthreads(); if (!found) { int i = 0; #pragma unroll 4 - for (; i < SHMEM_AFKMC2_RC && accum < part; i++) { + for (; i < SHMEM_AFKMC2_RC && accum < part && sample + i < d_samples_size; + i++) { // Kahan summation with inverted c - float v = shared_q[i++]; - float y = _add(corr, v); + float y = _add(corr, shared_q[i]); float t = accum + y; corr = y - (t - accum); accum = t; } if (accum >= part) { if (ti < m) { - choices[ti] = sample + i; + choices[ti] = sample + i - 1; } found = true; atomicSub(all_found, 1); @@ -792,7 +793,8 @@ KMCUDAResult kmeans_cuda_plus_plus( KMCUDAResult kmeans_cuda_afkmc2_calc_q( uint32_t h_samples_size, uint32_t h_features_size, uint32_t firstc, KMCUDADistanceMetric metric, const std::vector &devs, int fp16x2, - int verbosity, const udevptrs &samples, udevptrs *q) { + int verbosity, const udevptrs &samples, udevptrs *d_q, + float *h_q) { auto plan = distribute(h_samples_size, h_features_size * sizeof(float), devs); udevptrs dev_dists; CUMALLOC(dev_dists, sizeof(float)); @@ -811,7 +813,7 @@ KMCUDAResult kmeans_cuda_afkmc2_calc_q( <<>>( offset, length, firstc, reinterpret_cast(samples[devi].get()), - (*q)[devi].get(), dev_dists[devi].get())); + (*d_q)[devi].get(), dev_dists[devi].get())); ); float dists_sum = 0; @@ -834,15 +836,19 @@ KMCUDAResult kmeans_cuda_afkmc2_calc_q( dim3 block(BLOCK_SIZE, 1, 1); dim3 grid(upper(length, block.x), 1, 1); kmeans_afkmc2_calc_q<<>>( - offset, length, dists_sum, (*q)[devi].get()); + offset, length, dists_sum, (*d_q)[devi].get()); ); FOR_EACH_DEVI( uint32_t offset, length; std::tie(offset, length) = plan[devi]; + CUCH(cudaMemcpyAsync(h_q + offset, (*d_q)[devi].get() + offset, + length * sizeof(float), cudaMemcpyDeviceToHost), + kmcudaMemoryCopyError); FOR_OTHER_DEVS( - CUP2P(q, offset, length); + CUP2P(d_q, offset, length); ); ); + SYNC_ALL_DEVS; return kmcudaSuccess; } @@ -851,8 +857,7 @@ KMCUDAResult kmeans_cuda_afkmc2_random_step( uint32_t *d_choices, uint32_t *h_choices, float *d_samples, float *h_samples) { dim3 block(BS_AFKMC2_R, 1, 1); dim3 grid(upper(m, block.x), 1, 1); - int shmem = (SHMEM_AFKMC2_RC + 1) * sizeof(float); - kmeans_afkmc2_random_step<<>>( + kmeans_afkmc2_random_step<<>>( m, seed, k, q, d_choices, d_samples); CUCH(cudaMemcpy(h_choices, d_choices, m * sizeof(uint32_t), cudaMemcpyDeviceToHost), @@ -864,8 +869,8 @@ KMCUDAResult kmeans_cuda_afkmc2_random_step( } KMCUDAResult kmeans_cuda_afkmc2_min_dist( - const uint32_t m, const uint32_t k, KMCUDADistanceMetric metric, - int fp16x2, int32_t verbosity, const float *samples, const uint32_t *choices, + uint32_t k, uint32_t m, KMCUDADistanceMetric metric, int fp16x2, + int32_t verbosity, const float *samples, const uint32_t *choices, const float *centroids, float *d_min_dists, float *h_min_dists) { dim3 block(BLOCK_SIZE, 1, 1); dim3 grid(upper(m, block.x), 1, 1); diff --git a/private.h b/private.h index d0bf907..f82962f 100644 --- a/private.h +++ b/private.h @@ -259,15 +259,16 @@ KMCUDAResult kmeans_cuda_plus_plus( KMCUDAResult kmeans_cuda_afkmc2_calc_q( uint32_t samples_size, uint32_t features_size, uint32_t firstc, KMCUDADistanceMetric metric, const std::vector &devs, int fp16x2, - int verbosity, const udevptrs &samples, udevptrs *q); + int verbosity, const udevptrs &samples, udevptrs *d_q, + float *h_q); KMCUDAResult kmeans_cuda_afkmc2_random_step( uint32_t k, uint32_t m, uint64_t seed, int verbosity, const float *q, uint32_t *d_choices, uint32_t *h_choices, float *d_samples, float *h_samples); KMCUDAResult kmeans_cuda_afkmc2_min_dist( - const uint32_t m, const uint32_t k, KMCUDADistanceMetric metric, - int fp16x2, int32_t verbosity, const float *samples, const uint32_t *choices, + uint32_t k, uint32_t m, KMCUDADistanceMetric metric, int fp16x2, + int32_t verbosity, const float *samples, const uint32_t *choices, const float *centroids, float *d_min_dists, float *h_min_dists); KMCUDAResult kmeans_cuda_setup( From dd11f9031c9e94eb5dc68ba7d80fc22455a1e1c5 Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Fri, 27 Jan 2017 00:22:33 +0100 Subject: [PATCH 6/7] Finish the implementation and pass the rest of the tests --- README.md | 9 ++++++--- kmeans.cu | 59 +++++++++++++++++++++++++++++++++++++++++++++++++----- test.py | 29 ++++++++++++++++++++++++++- tricks.cuh | 40 ++++++++++++++++++++++++++++++++---- 4 files changed, 124 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 070b0bd..f7a5e69 100644 --- a/README.md +++ b/README.md @@ -25,14 +25,17 @@ have several days and 12 GB of GPU memory); 300K samples are grouped into 5000 clusters in 4½ minutes on NVIDIA Titan X (15 iterations); 3M samples and 1000 clusters take 20 minutes (33 iterations). Yinyang can be turned off to save GPU memory but the slower Lloyd will be used then. -Three centroid initialization schemes are supported: random, k-means++ and import. -Two distance metrics are supported: L2 (the usual one) and angular (refined cosine). +Four centroid initialization schemes are supported: random, k-means++, +[AFKMC2](http://olivierbachem.ch/files/afkmcmc-oral-pdf.pdf) and import. +Two distance metrics are supported: L2 (the usual one) and angular +(arccos of the scalar product). L1 is in development. 16-bit float support delivers 2x memory compression. If you've got several GPUs, they can be utilized together and it gives the corresponding linear speedup either for Lloyd or Yinyang. The code has been thoroughly tested to yield bit-to-bit identical -results from Yinyang and Lloyd. +results from Yinyang and Lloyd. AFKMC2 was converted from +[the reference code](https://github.com/obachem/kmc2). Read the articles: [1](http://blog.sourced.tech/post/towards_kmeans_on_gpu/), [2](https://blog.sourced.tech/post/kmcuda4/). diff --git a/kmeans.cu b/kmeans.cu index b2753b8..8e34709 100644 --- a/kmeans.cu +++ b/kmeans.cu @@ -14,6 +14,7 @@ #define BS_KMPP 1024 #define BS_AFKMC2_Q 512 #define BS_AFKMC2_R 512 +#define BS_AFKMC2_MDT 512 #define BS_LL_ASS 256 #define BS_LL_CNT 256 #define BS_YY_INI 256 @@ -21,6 +22,7 @@ #define BS_YY_LFL 512 #define BLOCK_SIZE 1024 // for all the rest of the kernels #define SHMEM_AFKMC2_RC 8191 // in float-s, the actual value is +1 +#define SHMEM_AFKMC2_MT 8192 #define YINYANG_GROUP_TOLERANCE 0.02 #define YINYANG_DRAFT_REASSIGNMENTS 0.11 @@ -173,6 +175,42 @@ __global__ void kmeans_afkmc2_min_dist( min_dists[chi] = min_dist * min_dist; } +// min_dists must be set to FLT_MAX or +inf or NAN! +template +__global__ void kmeans_afkmc2_min_dist_transposed( + const uint32_t m, const uint32_t k, const F *__restrict__ samples, + const uint32_t *__restrict__ choices, const F *__restrict__ centroids, + float *__restrict__ min_dists) { + uint32_t c = blockIdx.x * blockDim.x + threadIdx.x; + extern __shared__ float shared_min_dists[]; + uint32_t size_each = dupper(m, blockDim.x); + for (uint32_t i = size_each * threadIdx.x; + i < min(size_each * (threadIdx.x + 1), m); + i++) { + shared_min_dists[i] = FLT_MAX; + } + __syncthreads(); + for (uint32_t chi = 0; chi < m; chi++) { + float dist = FLT_MAX; + if (c < k) { + dist = METRIC::distance( + samples + static_cast(choices[chi]) * d_features_size, + centroids + c * d_features_size); + } + float warp_min = warpReduceMin(dist); + warp_min *= warp_min; + if (threadIdx.x % 32 == 0 && c < k) { + atomicMin(shared_min_dists + chi, warp_min); + } + } + __syncthreads(); + if (threadIdx.x == 0) { + for (uint32_t chi = 0; chi < m; chi++) { + atomicMin(min_dists + chi, shared_min_dists[chi]); + } + } +} + template __global__ void kmeans_assign_lloyd_smallc( const uint32_t length, const F *__restrict__ samples, @@ -872,11 +910,22 @@ KMCUDAResult kmeans_cuda_afkmc2_min_dist( uint32_t k, uint32_t m, KMCUDADistanceMetric metric, int fp16x2, int32_t verbosity, const float *samples, const uint32_t *choices, const float *centroids, float *d_min_dists, float *h_min_dists) { - dim3 block(BLOCK_SIZE, 1, 1); - dim3 grid(upper(m, block.x), 1, 1); - KERNEL_SWITCH(kmeans_afkmc2_min_dist, <<>>( - m, k, reinterpret_cast(samples), choices, - reinterpret_cast(centroids), d_min_dists)); + if (m > k || m > SHMEM_AFKMC2_MT) { + dim3 block(BLOCK_SIZE, 1, 1); + dim3 grid(upper(m, block.x), 1, 1); + KERNEL_SWITCH(kmeans_afkmc2_min_dist, <<>>( + m, k, reinterpret_cast(samples), choices, + reinterpret_cast(centroids), d_min_dists)); + } else { + dim3 block(BS_AFKMC2_MDT, 1, 1); + dim3 grid(upper(k, block.x), 1, 1); + CUCH(cudaMemsetAsync(d_min_dists, 0xff, m * sizeof(float)), + kmcudaRuntimeError); + KERNEL_SWITCH(kmeans_afkmc2_min_dist_transposed, + <<>>( + m, k, reinterpret_cast(samples), choices, + reinterpret_cast(centroids), d_min_dists)); + } CUCH(cudaMemcpy(h_min_dists, d_min_dists, m * sizeof(float), cudaMemcpyDeviceToHost), kmcudaMemoryCopyError); diff --git a/test.py b/test.py index 2e37e35..aafc0d9 100644 --- a/test.py +++ b/test.py @@ -239,7 +239,7 @@ def test_import_lloyd(self): def test_afkmc2_lloyd(self): with self.stdout: centroids, assignments = kmeans_cuda( - self.samples, 50, init=("afkmc2", 200), device=0, + self.samples, 50, init=("afkmc2", 200), device=1, verbosity=2, seed=3, tolerance=0.05, yinyang_t=0) self.assertEqual(self._get_iters_number(self.stdout), 4) self._validate(centroids, assignments, 0.05) @@ -264,6 +264,21 @@ def test_kmeanspp_lloyd_2gpus(self): self.assertEqual(assignments.shape, (13000,)) self._validate(centroids, assignments, 0.05) + def test_afkmc2_lloyd_2gpus(self): + with self.stdout: + centroids, assignments = kmeans_cuda( + self.samples, 50, init="afkmc2", device=0, + verbosity=2, seed=3, tolerance=0.05, yinyang_t=0) + self.assertEqual(self._get_iters_number(self.stdout), 4) + self._validate(centroids, assignments, 0.05) + + def test_afkmc2_big_k_lloyd(self): + with self.stdout: + kmeans_cuda( + self.samples, 200, init=("afkmc2", 100), device=0, + verbosity=2, seed=3, tolerance=0.05, yinyang_t=0) + self.assertEqual(self._get_iters_number(self.stdout), 4) + def test_random_lloyd_all_explicit_gpus(self): with self.assertRaises(ValueError): centroids, assignments = kmeans_cuda( @@ -435,6 +450,18 @@ def test_fp16_kmeanspp_lloyd(self): centroids = centroids.astype(numpy.float32) self._validate(centroids, assignments, 0.05) + @unittest.skipUnless(supports_fp16, + "16-bit floats are not supported by this CUDA arch") + def test_fp16_afkmc2_lloyd(self): + samples = self.samples.astype(numpy.float16) + with self.stdout: + centroids, assignments = kmeans_cuda( + samples, 50, init="afkmc2", device=1, + verbosity=2, seed=3, tolerance=0.05, yinyang_t=0) + self.assertEqual(self._get_iters_number(self.stdout), 4) + centroids = centroids.astype(numpy.float32) + self._validate(centroids, assignments, 0.05) + @unittest.skipUnless(supports_fp16, "16-bit floats are not supported by this CUDA arch") def test_fp16_kmeanspp_validate(self): diff --git a/tricks.cuh b/tricks.cuh index 19f6a9d..79f20ec 100644 --- a/tricks.cuh +++ b/tricks.cuh @@ -1,3 +1,7 @@ +#include + +#define warpSize 32 + template __device__ __forceinline__ T dupper(T size, T each) { T div = size / each; @@ -17,19 +21,47 @@ __device__ __forceinline__ uint32_t atomicAggInc(uint32_t *ctr) { int mask = __ballot(1); int leader = __ffs(mask) - 1; uint32_t res; - if ((threadIdx.x % 32) == leader) { + if ((threadIdx.x % warpSize) == leader) { res = atomicAdd(ctr, __popc(mask)); } res = __shfl(res, leader); - return res + __popc(mask & ((1 << (threadIdx.x % 32)) - 1)); + return res + __popc(mask & ((1 << (threadIdx.x % warpSize)) - 1)); } // https://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/ template __device__ __forceinline__ T warpReduceSum(T val) { #pragma unroll - for (int offset = 16; offset > 0; offset /= 2) { + for (int offset = warpSize / 2; offset > 0; offset /= 2) { val += __shfl_down(val, offset); } return val; -} \ No newline at end of file +} + +template +__device__ __forceinline__ T warpReduceMin(T val) { + #pragma unroll + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + val = min(val, __shfl_down(val, offset)); + } + return val; +} + +// https://github.com/parallel-forall/code-samples/blob/master/posts/cuda-aware-mpi-example/src/Device.cu#L53 +__device__ __forceinline__ void atomicMin( + float *const address, const float value) { + if (*address <= value) { + return; + } + + int32_t *const address_as_i = reinterpret_cast(address); + int32_t old = *address_as_i, assumed; + + do { + assumed = old; + if (__int_as_float(assumed) <= value) { + break; + } + old = atomicCAS(address_as_i, assumed, __float_as_int(value)); + } while (assumed != old); +} From 72eb2b69bb24c73ea5e40c49d2fb1ae0a9837c55 Mon Sep 17 00:00:00 2001 From: Vadim Markovtsev Date: Fri, 27 Jan 2017 00:26:56 +0100 Subject: [PATCH 7/7] Bump the version to 6.0.0 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ee06e9f..fec4914 100644 --- a/setup.py +++ b/setup.py @@ -46,7 +46,7 @@ def is_pure(self): setup( name="libKMCUDA", description="Accelerated K-means and K-nn on GPU", - version="5.1.0", + version="6.0.0", license="MIT", author="Vadim Markovtsev", author_email="vadim@sourced.tech",