From 73d604790334dbb75dbd4e820da78938db9e73b3 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Thu, 7 Apr 2022 15:18:31 +0200 Subject: [PATCH 01/68] added platform cuda --- itensor/tensor/lapack_wrap.h | 219 +++++++++++++++++++---------------- 1 file changed, 122 insertions(+), 97 deletions(-) diff --git a/itensor/tensor/lapack_wrap.h b/itensor/tensor/lapack_wrap.h index 618398231..0af8e3c49 100644 --- a/itensor/tensor/lapack_wrap.h +++ b/itensor/tensor/lapack_wrap.h @@ -56,16 +56,16 @@ using LAPACK_INT = lapack_int; using LAPACK_REAL = double; using LAPACK_COMPLEX = lapack_complex_double; -inline LAPACK_REAL& -realRef(LAPACK_COMPLEX & z) - { +inline LAPACK_REAL& +realRef(LAPACK_COMPLEX & z) + { auto* p = reinterpret_cast(&z); return p[0]; } -inline LAPACK_REAL& -imagRef(LAPACK_COMPLEX & z) - { +inline LAPACK_REAL& +imagRef(LAPACK_COMPLEX & z) + { auto* p = reinterpret_cast(&z); return p[1]; } @@ -87,10 +87,10 @@ imagRef(LAPACK_COMPLEX & z) using LAPACK_REAL = __CLPK_doublereal; using LAPACK_COMPLEX = __CLPK_doublecomplex; - inline LAPACK_REAL& + inline LAPACK_REAL& realRef(LAPACK_COMPLEX & z) { return z.r; } - inline LAPACK_REAL& + inline LAPACK_REAL& imagRef(LAPACK_COMPLEX & z) { return z.i; } } @@ -111,10 +111,10 @@ imagRef(LAPACK_COMPLEX & z) using LAPACK_REAL = double; using LAPACK_COMPLEX = MKL_Complex16; - inline LAPACK_REAL& + inline LAPACK_REAL& realRef(LAPACK_COMPLEX & z) { return z.real; } - inline LAPACK_REAL& + inline LAPACK_REAL& imagRef(LAPACK_COMPLEX & z) { return z.imag; } } @@ -135,13 +135,38 @@ imagRef(LAPACK_COMPLEX & z) LAPACK_REAL real, imag; } LAPACK_COMPLEX; - inline LAPACK_REAL& + inline LAPACK_REAL& realRef(LAPACK_COMPLEX & z) { return z.real; } - inline LAPACK_REAL& + inline LAPACK_REAL& imagRef(LAPACK_COMPLEX & z) { return z.imag; } } +#elif defined PLATFORM_cuda + +#define ITENSOR_USE_CUDA + +#include +#include +#include +#include + +namespace itensor { +cudaDataType_t typeComplexData = CUDA_C_64F; +cudaDataType_t typeRealData = CUDA_R_64F; +cublasComputeType_t typeCompute = CUBLAS_COMPUTE_64F; + +using LAPACK_INT = int; +using LAPACK_REAL = double; +using LAPACK_COMPLEX = cuDoubleComplex; + +inline LAPACK_REAL& +realRef(LAPACK_COMPLEX & z) { return z.x; } + +inline LAPACK_REAL& +imagRef(LAPACK_COMPLEX & z) { return z.y; } +} + #endif // different PLATFORM types @@ -174,14 +199,14 @@ LAPACK_REAL F77NAME(dnrm2)(LAPACK_INT* N, LAPACK_REAL* X, LAPACK_INT* incx); #ifdef ITENSOR_USE_CBLAS void cblas_daxpy(const int n, const double alpha, const double *X, const int incX, double *Y, const int incY); #else -void F77NAME(daxpy)(LAPACK_INT* n, LAPACK_REAL* alpha, +void F77NAME(daxpy)(LAPACK_INT* n, LAPACK_REAL* alpha, LAPACK_REAL* X, LAPACK_INT* incx, LAPACK_REAL* Y, LAPACK_INT* incy); #endif //ddot declaration #ifdef ITENSOR_USE_CBLAS -LAPACK_REAL +LAPACK_REAL cblas_ddot(const LAPACK_INT N, const LAPACK_REAL *X, const LAPACK_INT incx, const LAPACK_REAL *Y, const LAPACK_INT incy); #else LAPACK_REAL F77NAME(ddot)(LAPACK_INT* N, LAPACK_REAL* X, LAPACK_INT* incx, LAPACK_REAL* Y, LAPACK_INT* incy); @@ -189,7 +214,7 @@ LAPACK_REAL F77NAME(ddot)(LAPACK_INT* N, LAPACK_REAL* X, LAPACK_INT* incx, LAPAC //zdotc declaration #ifdef ITENSOR_USE_CBLAS -LAPACK_REAL +LAPACK_REAL cblas_zdotc_sub(const LAPACK_INT N, const void *X, const LAPACK_INT incx, const void *Y, const LAPACK_INT incy, void *res); #else LAPACK_COMPLEX F77NAME(zdotc)(LAPACK_INT* N, LAPACK_COMPLEX* X, LAPACK_INT* incx, LAPACK_COMPLEX* Y, LAPACK_INT* incy); @@ -211,19 +236,19 @@ void 
F77NAME(dgemm)(char*,char*,LAPACK_INT*,LAPACK_INT*,LAPACK_INT*, //zgemm declaration #ifdef PLATFORM_openblas -void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, - OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, - OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, - OPENBLAS_CONST blasint M, - OPENBLAS_CONST blasint N, +void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, + OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, + OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, + OPENBLAS_CONST blasint M, + OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, - OPENBLAS_CONST double *alpha, - OPENBLAS_CONST double *A, - OPENBLAS_CONST blasint lda, - OPENBLAS_CONST double *B, - OPENBLAS_CONST blasint ldb, - OPENBLAS_CONST double *beta, - double *C, + OPENBLAS_CONST double *alpha, + OPENBLAS_CONST double *A, + OPENBLAS_CONST blasint lda, + OPENBLAS_CONST double *B, + OPENBLAS_CONST blasint ldb, + OPENBLAS_CONST double *beta, + double *C, OPENBLAS_CONST blasint ldc); #else //platform not openblas @@ -257,22 +282,22 @@ void F77NAME(dgemv)(char* transa,LAPACK_INT* M,LAPACK_INT* N,LAPACK_REAL* alpha, //zgemv declaration #ifdef PLATFORM_openblas -void cblas_zgemv(OPENBLAS_CONST enum CBLAS_ORDER order, - OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, - OPENBLAS_CONST blasint m, +void cblas_zgemv(OPENBLAS_CONST enum CBLAS_ORDER order, + OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, + OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, - OPENBLAS_CONST double *alpha, - OPENBLAS_CONST double *a, - OPENBLAS_CONST blasint lda, - OPENBLAS_CONST double *x, - OPENBLAS_CONST blasint incx, - OPENBLAS_CONST double *beta, - double *y, + OPENBLAS_CONST double *alpha, + OPENBLAS_CONST double *a, + OPENBLAS_CONST blasint lda, + OPENBLAS_CONST double *x, + OPENBLAS_CONST blasint incx, + OPENBLAS_CONST double *beta, + double *y, OPENBLAS_CONST blasint incy); #else #ifdef ITENSOR_USE_CBLAS -void cblas_zgemv(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE trans, const LAPACK_INT m, - const LAPACK_INT n, const void *alpha, const void *a, const LAPACK_INT lda, +void cblas_zgemv(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE trans, const LAPACK_INT m, + const LAPACK_INT n, const void *alpha, const void *a, const LAPACK_INT lda, const void *x, const LAPACK_INT incx, const void *beta, void *y, const LAPACK_INT incy); #else void F77NAME(zgemv)(char* transa,LAPACK_INT* M,LAPACK_INT* N,LAPACK_COMPLEX* alpha, LAPACK_COMPLEX* A, @@ -282,8 +307,8 @@ void F77NAME(zgemv)(char* transa,LAPACK_INT* M,LAPACK_INT* N,LAPACK_COMPLEX* alp #endif //zgemv declaration #ifdef PLATFORM_acml -void F77NAME(dsyev)(char *jobz, char *uplo, int *n, double *a, int *lda, - double *w, double *work, int *lwork, int *info, +void F77NAME(dsyev)(char *jobz, char *uplo, int *n, double *a, int *lda, + double *w, double *work, int *lwork, int *info, int jobz_len, int uplo_len); #else void F77NAME(dsyev)(const char* jobz, const char* uplo, const LAPACK_INT* n, double* a, @@ -299,66 +324,66 @@ void F77NAME(dscal)(LAPACK_INT* N, LAPACK_REAL* alpha, LAPACK_REAL* X,LAPACK_INT #ifdef PLATFORM_acml -void F77NAME(dgesdd)(char *jobz, LAPACK_INT *m, LAPACK_INT *n, double *a, LAPACK_INT *lda, double *s, - double *u, LAPACK_INT *ldu, double *vt, LAPACK_INT *ldvt, +void F77NAME(dgesdd)(char *jobz, LAPACK_INT *m, LAPACK_INT *n, double *a, LAPACK_INT *lda, double *s, + double *u, LAPACK_INT *ldu, double *vt, LAPACK_INT *ldvt, double *work, LAPACK_INT *lwork, LAPACK_INT *iwork, LAPACK_INT *info, int jobz_len); #else -void F77NAME(dgesdd)(char *jobz, LAPACK_INT *m, LAPACK_INT *n, double *a, LAPACK_INT 
*lda, double *s, - double *u, LAPACK_INT *ldu, double *vt, LAPACK_INT *ldvt, +void F77NAME(dgesdd)(char *jobz, LAPACK_INT *m, LAPACK_INT *n, double *a, LAPACK_INT *lda, double *s, + double *u, LAPACK_INT *ldu, double *vt, LAPACK_INT *ldvt, double *work, LAPACK_INT *lwork, LAPACK_INT *iwork, LAPACK_INT *info); #endif #ifdef PLATFORM_acml - void F77NAME(dgesvd)(char *jobz, char* jobv, LAPACK_INT *m, LAPACK_INT *n, double *a, LAPACK_INT *lda, double *s, - double *u, LAPACK_INT *ldu, double *vt, LAPACK_INT *ldvt, + void F77NAME(dgesvd)(char *jobz, char* jobv, LAPACK_INT *m, LAPACK_INT *n, double *a, LAPACK_INT *lda, double *s, + double *u, LAPACK_INT *ldu, double *vt, LAPACK_INT *ldvt, double *work, LAPACK_INT *lwork, LAPACK_INT *info, int jobz_len); #else - void F77NAME(dgesvd)(char *jobz, char* jobv, LAPACK_INT *m, LAPACK_INT *n, double *a, LAPACK_INT *lda, double *s, - double *u, LAPACK_INT *ldu, double *vt, LAPACK_INT *ldvt, + void F77NAME(dgesvd)(char *jobz, char* jobv, LAPACK_INT *m, LAPACK_INT *n, double *a, LAPACK_INT *lda, double *s, + double *u, LAPACK_INT *ldu, double *vt, LAPACK_INT *ldvt, double *work, LAPACK_INT *lwork, LAPACK_INT *info); #endif #ifdef PLATFORM_acml - void F77NAME(zgesvd)(char *jobz, char* jobv, LAPACK_INT *m, LAPACK_INT *n, LAPACK_COMPLEX *a, LAPACK_INT *lda, LAPACK_REAL *s, - LAPACK_COMPLEX *u, LAPACK_INT *ldu, LAPACK_COMPLEX *vt, LAPACK_INT *ldvt, + void F77NAME(zgesvd)(char *jobz, char* jobv, LAPACK_INT *m, LAPACK_INT *n, LAPACK_COMPLEX *a, LAPACK_INT *lda, LAPACK_REAL *s, + LAPACK_COMPLEX *u, LAPACK_INT *ldu, LAPACK_COMPLEX *vt, LAPACK_INT *ldvt, LAPACK_COMPLEX *work, LAPACK_INT *lwork, LAPACK_REAL * rwork, LAPACK_INT *info, int jobz_len); #else - void F77NAME(zgesvd)(char *jobz, char* jobv, LAPACK_INT *m, LAPACK_INT *n, LAPACK_COMPLEX *a, LAPACK_INT *lda, LAPACK_REAL *s, - LAPACK_COMPLEX *u, LAPACK_INT *ldu, LAPACK_COMPLEX *vt, LAPACK_INT *ldvt, + void F77NAME(zgesvd)(char *jobz, char* jobv, LAPACK_INT *m, LAPACK_INT *n, LAPACK_COMPLEX *a, LAPACK_INT *lda, LAPACK_REAL *s, + LAPACK_COMPLEX *u, LAPACK_INT *ldu, LAPACK_COMPLEX *vt, LAPACK_INT *ldvt, LAPACK_COMPLEX *work, LAPACK_INT *lwork, LAPACK_REAL * rwork, LAPACK_INT *info); #endif #ifdef PLATFORM_acml -void F77NAME(zgesdd)(char *jobz, int *m, int *n, LAPACK_COMPLEX *a, int *lda, double *s, - LAPACK_COMPLEX *u, int *ldu, LAPACK_COMPLEX *vt, int *ldvt, - LAPACK_COMPLEX *work, int *lwork, double *rwork, int *iwork, int *info, +void F77NAME(zgesdd)(char *jobz, int *m, int *n, LAPACK_COMPLEX *a, int *lda, double *s, + LAPACK_COMPLEX *u, int *ldu, LAPACK_COMPLEX *vt, int *ldvt, + LAPACK_COMPLEX *work, int *lwork, double *rwork, int *iwork, int *info, int jobz_len); #else -void F77NAME(zgesdd)(char *jobz, LAPACK_INT *m, LAPACK_INT *n, LAPACK_COMPLEX *a, LAPACK_INT *lda, double *s, - LAPACK_COMPLEX *u, LAPACK_INT *ldu, LAPACK_COMPLEX *vt, LAPACK_INT *ldvt, +void F77NAME(zgesdd)(char *jobz, LAPACK_INT *m, LAPACK_INT *n, LAPACK_COMPLEX *a, LAPACK_INT *lda, double *s, + LAPACK_COMPLEX *u, LAPACK_INT *ldu, LAPACK_COMPLEX *vt, LAPACK_INT *ldvt, LAPACK_COMPLEX *work, LAPACK_INT *lwork, double *rwork, LAPACK_INT *iwork, LAPACK_INT *info); #endif -void F77NAME(dgeqrf)(LAPACK_INT *m, LAPACK_INT *n, double *a, LAPACK_INT *lda, +void F77NAME(dgeqrf)(LAPACK_INT *m, LAPACK_INT *n, double *a, LAPACK_INT *lda, double *tau, double *work, LAPACK_INT *lwork, LAPACK_INT *info); -void F77NAME(dorgqr)(LAPACK_INT *m, LAPACK_INT *n, LAPACK_INT *k, double *a, - LAPACK_INT *lda, double *tau, double *work, LAPACK_INT *lwork, +void 
F77NAME(dorgqr)(LAPACK_INT *m, LAPACK_INT *n, LAPACK_INT *k, double *a, + LAPACK_INT *lda, double *tau, double *work, LAPACK_INT *lwork, LAPACK_INT *info); - -void F77NAME(zgeqrf)(LAPACK_INT *m, LAPACK_INT *n, LAPACK_COMPLEX *a, LAPACK_INT *lda, + +void F77NAME(zgeqrf)(LAPACK_INT *m, LAPACK_INT *n, LAPACK_COMPLEX *a, LAPACK_INT *lda, LAPACK_COMPLEX *tau, LAPACK_COMPLEX *work, LAPACK_INT *lwork, LAPACK_INT *info); #ifdef PLATFORM_lapacke -void LAPACKE_zungqr(int matrix_layout, LAPACK_INT *m, LAPACK_INT *n, LAPACK_INT *k, LAPACK_COMPLEX *a, - LAPACK_INT *lda, LAPACK_COMPLEX *tau, LAPACK_COMPLEX *work, LAPACK_INT *lwork, +void LAPACKE_zungqr(int matrix_layout, LAPACK_INT *m, LAPACK_INT *n, LAPACK_INT *k, LAPACK_COMPLEX *a, + LAPACK_INT *lda, LAPACK_COMPLEX *tau, LAPACK_COMPLEX *work, LAPACK_INT *lwork, LAPACK_INT *info); #else -void F77NAME(zungqr)(LAPACK_INT *m, LAPACK_INT *n, LAPACK_INT *k, LAPACK_COMPLEX *a, - LAPACK_INT *lda, LAPACK_COMPLEX *tau, LAPACK_COMPLEX *work, LAPACK_INT *lwork, +void F77NAME(zungqr)(LAPACK_INT *m, LAPACK_INT *n, LAPACK_INT *k, LAPACK_COMPLEX *a, + LAPACK_INT *lda, LAPACK_COMPLEX *tau, LAPACK_COMPLEX *work, LAPACK_INT *lwork, LAPACK_INT *info); #endif @@ -388,51 +413,51 @@ LAPACK_REAL F77NAME(zlange)(char* norm, LAPACK_INT* m, LAPACK_INT* n, LAPACK_COM lapack_int LAPACKE_zheev(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda, double* w); #elif defined PLATFORM_acml -void F77NAME(zheev)(char *jobz, char *uplo, LAPACK_INT *n, LAPACK_COMPLEX *a, LAPACK_INT *lda, - double *w, LAPACK_COMPLEX *work, LAPACK_INT *lwork, double *rwork, +void F77NAME(zheev)(char *jobz, char *uplo, LAPACK_INT *n, LAPACK_COMPLEX *a, LAPACK_INT *lda, + double *w, LAPACK_COMPLEX *work, LAPACK_INT *lwork, double *rwork, LAPACK_INT *info, LAPACK_INT jobz_len, LAPACK_INT uplo_len); #else -void F77NAME(zheev)(char *jobz, char *uplo, LAPACK_INT *n, LAPACK_COMPLEX *a, LAPACK_INT *lda, - double *w, LAPACK_COMPLEX *work, LAPACK_INT *lwork, double *rwork, +void F77NAME(zheev)(char *jobz, char *uplo, LAPACK_INT *n, LAPACK_COMPLEX *a, LAPACK_INT *lda, + double *w, LAPACK_COMPLEX *work, LAPACK_INT *lwork, double *rwork, LAPACK_INT *info); #endif #ifdef PLATFORM_acml -void F77NAME(dsygv)(LAPACK_INT *itype, char *jobz, char *uplo, LAPACK_INT *n, double *a, - LAPACK_INT *lda, double *b, LAPACK_INT *ldb, double *w, double *work, - LAPACK_INT *lwork, LAPACK_INT *info, LAPACK_INT jobz_len, LAPACK_INT uplo_len); +void F77NAME(dsygv)(LAPACK_INT *itype, char *jobz, char *uplo, LAPACK_INT *n, double *a, + LAPACK_INT *lda, double *b, LAPACK_INT *ldb, double *w, double *work, + LAPACK_INT *lwork, LAPACK_INT *info, LAPACK_INT jobz_len, LAPACK_INT uplo_len); #else -void F77NAME(dsygv)(LAPACK_INT *itype, char *jobz, char *uplo, LAPACK_INT *n, double *a, - LAPACK_INT *lda, double *b, LAPACK_INT *ldb, double *w, double *work, +void F77NAME(dsygv)(LAPACK_INT *itype, char *jobz, char *uplo, LAPACK_INT *n, double *a, + LAPACK_INT *lda, double *b, LAPACK_INT *ldb, double *w, double *work, LAPACK_INT *lwork, LAPACK_INT *info); #endif #ifdef PLATFORM_acml -void F77NAME(dgeev)(char *jobvl, char *jobvr, LAPACK_INT *n, double *a, - LAPACK_INT *lda, double *wr, double *wi, double *vl, LAPACK_INT *ldvl, - double *vr, LAPACK_INT *ldvr, double *work, LAPACK_INT *lwork, +void F77NAME(dgeev)(char *jobvl, char *jobvr, LAPACK_INT *n, double *a, + LAPACK_INT *lda, double *wr, double *wi, double *vl, LAPACK_INT *ldvl, + double *vr, LAPACK_INT *ldvr, double *work, LAPACK_INT *lwork, LAPACK_INT 
*info, LAPACK_INT jobvl_len, LAPACK_INT jobvr_len); #else -void F77NAME(dgeev)(char *jobvl, char *jobvr, LAPACK_INT *n, double *a, - LAPACK_INT *lda, double *wr, double *wi, double *vl, LAPACK_INT *ldvl, - double *vr, LAPACK_INT *ldvr, double *work, LAPACK_INT *lwork, +void F77NAME(dgeev)(char *jobvl, char *jobvr, LAPACK_INT *n, double *a, + LAPACK_INT *lda, double *wr, double *wi, double *vl, LAPACK_INT *ldvl, + double *vr, LAPACK_INT *ldvr, double *work, LAPACK_INT *lwork, LAPACK_INT *info); #endif #ifdef PLATFORM_acml -void F77NAME(zgeev)(char *jobvl, char *jobvr, LAPACK_INT *n, LAPACK_COMPLEX *a, - LAPACK_INT *lda, LAPACK_COMPLEX *w, LAPACK_COMPLEX *vl, - LAPACK_INT *ldvl, LAPACK_COMPLEX *vr, LAPACK_INT *ldvr, - LAPACK_COMPLEX *work, LAPACK_INT *lwork, double *rwork, +void F77NAME(zgeev)(char *jobvl, char *jobvr, LAPACK_INT *n, LAPACK_COMPLEX *a, + LAPACK_INT *lda, LAPACK_COMPLEX *w, LAPACK_COMPLEX *vl, + LAPACK_INT *ldvl, LAPACK_COMPLEX *vr, LAPACK_INT *ldvr, + LAPACK_COMPLEX *work, LAPACK_INT *lwork, double *rwork, LAPACK_INT *info, LAPACK_INT jobvl_len, LAPACK_INT jobvr_len); #else -void F77NAME(zgeev)(char *jobvl, char *jobvr, LAPACK_INT *n, LAPACK_COMPLEX *a, - LAPACK_INT *lda, LAPACK_COMPLEX *w, LAPACK_COMPLEX *vl, - LAPACK_INT *ldvl, LAPACK_COMPLEX *vr, LAPACK_INT *ldvr, - LAPACK_COMPLEX *work, LAPACK_INT *lwork, double *rwork, +void F77NAME(zgeev)(char *jobvl, char *jobvr, LAPACK_INT *n, LAPACK_COMPLEX *a, + LAPACK_INT *lda, LAPACK_COMPLEX *w, LAPACK_COMPLEX *vl, + LAPACK_INT *ldvl, LAPACK_COMPLEX *vr, LAPACK_INT *ldvr, + LAPACK_COMPLEX *work, LAPACK_INT *lwork, double *rwork, LAPACK_INT *info); #endif @@ -483,7 +508,7 @@ zdotc_wrapper(LAPACK_INT N, // dgemm // void -gemm_wrapper(bool transa, +gemm_wrapper(bool transa, bool transb, LAPACK_INT m, LAPACK_INT n, @@ -498,7 +523,7 @@ gemm_wrapper(bool transa, // zgemm // void -gemm_wrapper(bool transa, +gemm_wrapper(bool transa, bool transb, LAPACK_INT m, LAPACK_INT n, @@ -513,7 +538,7 @@ gemm_wrapper(bool transa, // dgemv - matrix*vector multiply // void -gemv_wrapper(bool trans, +gemv_wrapper(bool trans, LAPACK_REAL alpha, LAPACK_REAL beta, LAPACK_INT m, @@ -528,7 +553,7 @@ gemv_wrapper(bool trans, // zgemv - matrix*vector multiply // void -gemv_wrapper(bool trans, +gemv_wrapper(bool trans, Cplx alpha, Cplx beta, LAPACK_INT m, @@ -694,7 +719,7 @@ zgesv_wrapper(LAPACK_INT n, // // dlange // -// returns the value of the 1-norm, Frobenius norm, infinity-norm, +// returns the value of the 1-norm, Frobenius norm, infinity-norm, // or the largest absolute value of any element of a general rectangular matrix. // double @@ -706,7 +731,7 @@ dlange_wrapper(char norm, // // zlange // -// returns the value of the 1-norm, Frobenius norm, infinity-norm, +// returns the value of the 1-norm, Frobenius norm, infinity-norm, // or the largest absolute value of any element of a general rectangular matrix. 
// LAPACK_REAL @@ -720,7 +745,7 @@ zlange_wrapper(char norm, // // Eigenvalues and eigenvectors of complex Hermitian matrix A // -LAPACK_INT +LAPACK_INT zheev_wrapper(LAPACK_INT N, //number of cols of A Cplx * A, //matrix A, on return contains eigenvectors LAPACK_REAL * d); //eigenvalues on return From 3ee56cc8a96297cc7e1aeab6a2b4ca7c9b87ed79 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 8 Apr 2022 13:54:45 +0200 Subject: [PATCH 02/68] added wrapper to all blas routines --- itensor/tensor/lapack_wrap.cc | 231 ++++++++++++++++++++++++++++------ 1 file changed, 191 insertions(+), 40 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 41d54825a..09060cd50 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -22,7 +22,7 @@ namespace itensor { // daxpy // Y += alpha*X // -void +void daxpy_wrapper(LAPACK_INT n, //number of elements of X,Y LAPACK_REAL alpha, //scale factor const LAPACK_REAL* X, //pointer to head of vector X @@ -32,6 +32,19 @@ daxpy_wrapper(LAPACK_INT n, //number of elements of X,Y { #ifdef ITENSOR_USE_CBLAS cblas_daxpy(n,alpha,X,incx,Y,incy); +#elif ITENSOR_USE_CUDA + cublasHandle_t handle; + cublasCreate(&handle); + LAPACK_REAL *d_X, *d_Y; + cudaMalloc(&d_X, n * sizeof(LAPACK_REAL)); + cudaMalloc(&d_Y, n * sizeof(LAPACK_REAL)); + cublasSetVector(n, sizeof(LAPACK_REAL), X, incx, d_X, incx); + cublasSetVector(n, sizeof(LAPACK_REAL), Y, incy, d_Y, incy); + cublasDaxpy(handle, n, &alpha, d_X, incx, d_Y, incy); + cublasGetVector(n, sizeof(LAPACK_REAL), d_Y, incy, Y, incy); + cudaFree(d_X); + cudaFree(d_Y); + cublasDestroy(handle); #else auto Xnc = const_cast(X); F77NAME(daxpy)(&n,&alpha,Xnc,&incx,Y,&incy); @@ -41,13 +54,24 @@ daxpy_wrapper(LAPACK_INT n, //number of elements of X,Y // // dnrm2 // -LAPACK_REAL +LAPACK_REAL dnrm2_wrapper(LAPACK_INT N, const LAPACK_REAL* X, LAPACK_INT incx) { #ifdef ITENSOR_USE_CBLAS return cblas_dnrm2(N,X,incx); +#elif ITENSOR_USE_CUDA + cublasHandle_t handle; + cublasCreate(&handle); + LAPACK_REAL *d_X; + cudaMalloc(&d_X, N * sizeof(LAPACK_REAL)); + cublasSetVector(N, sizeof(LAPACK_REAL), X, incx, d_X, incx); + LAPACK_REAL result; + cublasDnrm2(handle, N, d_X, incx, &result); + cudaFree(d_X); + cublasDestroy(handle); + return result; #else auto *Xnc = const_cast(X); return F77NAME(dnrm2)(&N,Xnc,&incx); @@ -58,7 +82,7 @@ dnrm2_wrapper(LAPACK_INT N, // // ddot // -LAPACK_REAL +LAPACK_REAL ddot_wrapper(LAPACK_INT N, const LAPACK_REAL* X, LAPACK_INT incx, @@ -67,6 +91,20 @@ ddot_wrapper(LAPACK_INT N, { #ifdef ITENSOR_USE_CBLAS return cblas_ddot(N,X,incx,Y,incy); +#elif ITENSOR_USE_CUDA + cublasHandle_t handle; + cublasCreate(&handle); + LAPACK_REAL *d_X, *d_Y; + cudaMalloc(&d_X, N * sizeof(LAPACK_REAL)); + cudaMalloc(&d_Y, N * sizeof(LAPACK_REAL)); + cublasSetVector(N, sizeof(LAPACK_REAL), X, incx, d_X, incx); + cublasSetVector(N, sizeof(LAPACK_REAL), Y, incy, d_Y, incy); + LAPACK_REAL result; + cublasDdot (handle, N, d_X, incx, d_Y, incy, &result); + cudaFree(d_X); + cudaFree(d_Y); + cublasDestroy(handle); + return result; #else auto *Xnc = const_cast(X); auto *Ync = const_cast(Y); @@ -98,6 +136,20 @@ zdotc_wrapper(LAPACK_INT N, #endif cblas_zdotc_sub(N,pX,incx,pY,incy,pres); return res; +#elif ITENSOR_USE_CUDA + cublasHandle_t handle; + cublasCreate(&handle); + LAPACK_COMPLEX res; + LAPACK_COMPLEX *d_X, *d_Y; + cudaMalloc(&d_X, N * sizeof(LAPACK_COMPLEX)); + cudaMalloc(&d_Y, N * sizeof(LAPACK_COMPLEX)); + cublasSetVector(N, sizeof(LAPACK_COMPLEX), X, incx, d_X, incx); + 
cublasSetVector(N, sizeof(LAPACK_COMPLEX), Y, incy, d_Y, incy); + cublasZdotc(handle, N, d_X, incx, d_Y, incy, &res); + cudaFree(d_X); + cudaFree(d_Y); + cublasDestroy(handle); + return res; #else auto ncX = const_cast(X); auto ncY = const_cast(Y); @@ -113,8 +165,8 @@ zdotc_wrapper(LAPACK_INT N, // // dgemm // -void -gemm_wrapper(bool transa, +void +gemm_wrapper(bool transa, bool transb, LAPACK_INT m, LAPACK_INT n, @@ -141,6 +193,34 @@ gemm_wrapper(bool transa, ldb = n; } cblas_dgemm(CblasColMajor,at,bt,m,n,k,alpha,A,lda,B,ldb,beta,C,m); +#elif ITENSOR_USE_CUDA + cublasOperation_t at = CUBLAS_OP_N; + cublasOperation_t bt = CUBLAS_OP_N; + if(transa) + { + at = CUBLAS_OP_T; + lda = k; + } + if(transb) + { + bt = CUBLAS_OP_T; + ldb = n; + } + cublasHandle_t handle; + cublasCreate(&handle); + LAPACK_REAL *d_A, *d_B, *d_C; + cudaMalloc(&d_A, m * k * sizeof(LAPACK_REAL)); + cudaMalloc(&d_B, k * n * sizeof(LAPACK_REAL)); + cudaMalloc(&d_C, m * n * sizeof(LAPACK_REAL)); + cublasSetMatrix(m, k, sizeof(LAPACK_REAL), A, lda, d_A, lda); + cublasSetMatrix(k, n, sizeof(LAPACK_REAL), B, ldb, d_B, ldb); + cublasSetMatrix(m, n, sizeof(LAPACK_REAL), C, m, d_C, m); + cublasDgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m) + cublasGetMatrix(m, n, sizeof(LAPACK_REAL), d_C, m, C, m); + cudaFree(d_A); + cudaFree(d_B); + cudaFree(d_C); + cublasDestroy(handle); #else auto *pA = const_cast(A); auto *pB = const_cast(B); @@ -163,8 +243,8 @@ gemm_wrapper(bool transa, // // zgemm // -void -gemm_wrapper(bool transa, +void +gemm_wrapper(bool transa, bool transb, LAPACK_INT m, LAPACK_INT n, @@ -204,6 +284,34 @@ gemm_wrapper(bool transa, auto* pB = reinterpret_cast(B); auto* pC = reinterpret_cast(C); cblas_zgemm(CblasColMajor,at,bt,m,n,k,palpha,pA,lda,pB,ldb,pbeta,pC,m); +#elif ITENSOR_USE_CUDA + cublasOperation_t at = CUBLAS_OP_N; + cublasOperation_t bt = CUBLAS_OP_N; + if(transa) + { + at = CUBLAS_OP_T; + lda = k; + } + if(transb) + { + bt = CUBLAS_OP_T; + ldb = n; + } + cublasHandle_t handle; + cublasCreate(&handle); + LAPACK_COMPLEX *d_A, *d_B, *d_C; + cudaMalloc(&d_A, m * k * sizeof(LAPACK_COMPLEX)); + cudaMalloc(&d_B, k * n * sizeof(LAPACK_COMPLEX)); + cudaMalloc(&d_C, m * n * sizeof(LAPACK_COMPLEX)); + cublasSetMatrix(m, k, sizeof(LAPACK_COMPLEX), A, lda, d_A, lda); + cublasSetMatrix(k, n, sizeof(LAPACK_COMPLEX), B, ldb, d_B, ldb); + cublasSetMatrix(m, n, sizeof(LAPACK_COMPLEX), C, m, d_C, m); + cublasZgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m) + cublasGetMatrix(m, n, sizeof(LAPACK_COMPLEX), d_C, m, C, m); + cudaFree(d_A); + cudaFree(d_B); + cudaFree(d_C); + cublasDestroy(handle); #else //platform not openblas #ifdef ITENSOR_USE_CBLAS auto at = CblasNoTrans, @@ -218,8 +326,8 @@ gemm_wrapper(bool transa, bt = CblasTrans; ldb = n; } - auto palpha = (void*)(&alpha); - auto pbeta = (void*)(&beta); + auto palpha = (void*)(&alpha); + auto pbeta = (void*)(&beta); cblas_zgemm(CblasColMajor,at,bt,m,n,k,palpha,(void*)A,lda,(void*)B,ldb,pbeta,(void*)C,m); #else //use Fortran zgemm auto *ncA = const_cast(A); @@ -246,8 +354,8 @@ gemm_wrapper(bool transa, #endif } -void -gemv_wrapper(bool trans, +void +gemv_wrapper(bool trans, LAPACK_REAL alpha, LAPACK_REAL beta, LAPACK_INT m, @@ -261,6 +369,23 @@ gemv_wrapper(bool trans, #ifdef ITENSOR_USE_CBLAS auto Tr = trans ? CblasTrans : CblasNoTrans; cblas_dgemv(CblasColMajor,Tr,m,n,alpha,A,m,x,incx,beta,y,incy); +#elif ITENSOR_USE_CUDA + cublasOperation_t tr = trans ? 
CUBLAS_OP_T : CUBLAS_OP_N; + cublasHandle_t handle; + cublasCreate(&handle); + LAPACK_REAL *d_A, *d_x, *d_y; + cudaMalloc(&d_A, m * n * sizeof(LAPACK_REAL)); + cudaMalloc(&d_x, m * sizeof(LAPACK_REAL)); + cudaMalloc(&d_y, m * sizeof(LAPACK_REAL)); + cublasSetMatrix(m, n, sizeof(LAPACK_REAL), A, m, d_A, m); + cublasSetVector(m, sizeof(LAPACK_REAL), x, incx, d_x, incx); + cublasSetVector(m, sizeof(LAPACK_REAL), y, incy, d_y, incy); + cublasDgemv(handle, tr, m, n, &alpha, d_A, m, d_x, incx, &beta, d_y, incy) + cublasGetVector(m, sizeof(LAPACK_REAL), d_y, incy, y, incy); + cudaFree(d_A); + cudaFree(d_x); + cudaFree(d_y); + cublasDestroy(handle); #else char Tr = trans ? 'T' : 'N'; F77NAME(dgemv)(&Tr,&m,&n,&alpha,const_cast(A),&m,const_cast(x),&incx,&beta,y,&incy); @@ -268,7 +393,7 @@ gemv_wrapper(bool trans, } void -gemv_wrapper(bool trans, +gemv_wrapper(bool trans, Cplx alpha, Cplx beta, LAPACK_INT m, @@ -295,21 +420,38 @@ gemv_wrapper(bool trans, auto* px = reinterpret_cast(x); auto* py = reinterpret_cast(y); cblas_zgemv(CblasColMajor,Tr,m,n,palpha,pA,m,px,incx,pbeta,py,incy); +#elif ITENSOR_USE_CUDA + cublasOperation_t tr = trans ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasHandle_t handle; + cublasCreate(&handle); + LAPACK_COMPLEX *d_A, *d_x, *d_y; + cudaMalloc(&d_A, m * n * sizeof(LAPACK_COMPLEX)); + cudaMalloc(&d_x, m * sizeof(LAPACK_COMPLEX)); + cudaMalloc(&d_y, m * sizeof(LAPACK_COMPLEX)); + cublasSetMatrix(m, n, sizeof(LAPACK_COMPLEX), A, m, d_A, m); + cublasSetVector(m, sizeof(LAPACK_COMPLEX), x, incx, d_x, incx); + cublasSetVector(m, sizeof(LAPACK_COMPLEX), y, incy, d_y, incy); + cublasZgemv(handle, tr, m, n, &alpha, d_A, m, d_x, incx, &beta, d_y, incy) + cublasGetVector(m, sizeof(LAPACK_COMPLEX), d_y, incy, y, incy); + cudaFree(d_A); + cudaFree(d_x); + cudaFree(d_y); + cublasDestroy(handle); #else //platform other than openblas #ifdef ITENSOR_USE_CBLAS auto Tr = trans ? CblasTrans : CblasNoTrans; - auto palpha = reinterpret_cast(&alpha); - auto pbeta = reinterpret_cast(&beta); + auto palpha = reinterpret_cast(&alpha); + auto pbeta = reinterpret_cast(&beta); cblas_zgemv(CblasColMajor,Tr,m,n,palpha,(void*)A,m,(void*)x,incx,pbeta,(void*)y,incy); #else char Tr = trans ? 
'T' : 'N'; - auto ncA = const_cast(A); - auto ncx = const_cast(x); - auto pA = reinterpret_cast(ncA); - auto px = reinterpret_cast(ncx); - auto py = reinterpret_cast(y); - auto palpha = reinterpret_cast(&alpha); - auto pbeta = reinterpret_cast(&beta); + auto ncA = const_cast(A); + auto ncx = const_cast(x); + auto pA = reinterpret_cast(ncA); + auto px = reinterpret_cast(ncx); + auto py = reinterpret_cast(y); + auto palpha = reinterpret_cast(&alpha); + auto pbeta = reinterpret_cast(&beta); F77NAME(zgemv)(&Tr,&m,&n,palpha,pA,&m,px,&incx,pbeta,py,&incy); #endif #endif @@ -319,7 +461,7 @@ gemv_wrapper(bool trans, // // dsyev // -void +void dsyev_wrapper(char jobz, //if jobz=='V', compute eigs and evecs char uplo, //if uplo=='U', read from upper triangle of A LAPACK_INT n, //number of cols of A @@ -349,7 +491,7 @@ dsyev_wrapper(char jobz, //if jobz=='V', compute eigs and evecs // // dscal // -void +void dscal_wrapper(LAPACK_INT N, LAPACK_REAL alpha, LAPACK_REAL* data, @@ -357,12 +499,22 @@ dscal_wrapper(LAPACK_INT N, { #ifdef ITENSOR_USE_CBLAS cblas_dscal(N,alpha,data,inc); +#elif ITENSOR_USE_CUDA + cublasHandle_t handle; + cublasCreate(&handle); + LAPACK_REAL *d_x; + cudaMalloc(&d_x, N * sizeof(LAPACK_REAL)); + cublasSetVector(N, sizeof(LAPACK_REAL), data, inc, d_x, inc); + cublasDscal(handle, N, &alpha, d_x, inc) + cublasGetVector(N, sizeof(LAPACK_REAL), d_x, inc, data, inc); + cudaFree(d_x); + cublasDestroy(handle); #else F77NAME(dscal)(&N,&alpha,data,&inc); #endif } -void +void zgesdd_wrapper(char *jobz, //char* specifying how much of U, V to compute //choosing *jobz=='S' computes min(m,n) cols of U, V LAPACK_INT *m, //number of rows of input matrix *A @@ -395,7 +547,7 @@ zgesdd_wrapper(char *jobz, //char* specifying how much of U, V to comp - void + void dgesdd_wrapper(char* jobz, //char* specifying how much of U, V to compute //choosing *jobz=='S' computes min(m,n) cols of U, V LAPACK_INT *m, //number of rows of input matrix *A @@ -423,7 +575,7 @@ dgesdd_wrapper(char* jobz, //char* specifying how much of U, V to comp - void + void zgesvd_wrapper(char *jobz, //char* specifying how much of U, V to compute //choosing *jobz=='S' computes min(m,n) cols of U, V LAPACK_INT *m, //number of rows of input matrix *A @@ -456,7 +608,7 @@ zgesvd_wrapper(char *jobz, //char* specifying how much of U, V to comp - void + void dgesvd_wrapper(char* jobz, //char* specifying how much of U, V to compute //choosing *jobz=='S' computes min(m,n) cols of U, V LAPACK_INT *m, //number of rows of input matrix *A @@ -469,7 +621,7 @@ dgesvd_wrapper(char* jobz, //char* specifying how much of U, V to comp { std::vector work; // std::vector superb; - + std::vector iwork; LAPACK_INT l = std::min(*m,*n), g = std::max(*m,*n); @@ -490,7 +642,7 @@ dgesvd_wrapper(char* jobz, //char* specifying how much of U, V to comp // // QR factorization of a real matrix A // -void +void dgeqrf_wrapper(LAPACK_INT* m, //number of rows of A LAPACK_INT* n, //number of cols of A LAPACK_REAL* A, //matrix A @@ -503,7 +655,7 @@ dgeqrf_wrapper(LAPACK_INT* m, //number of rows of A static const LAPACK_INT one = 1; std::vector work; LAPACK_INT lwork = std::max(one,4*std::max(*n,*m)); - work.resize(lwork+2); + work.resize(lwork+2); F77NAME(dgeqrf)(m,n,A,lda,tau,work.data(),&lwork,info); } @@ -512,7 +664,7 @@ dgeqrf_wrapper(LAPACK_INT* m, //number of rows of A // // Generates Q from output of QR factorization routine dgeqrf (see above) // -void +void dorgqr_wrapper(LAPACK_INT* m, //number of rows of A LAPACK_INT* n, //number of cols of A LAPACK_INT* k, 
//number of elementary reflectors, typically min(m,n) @@ -525,7 +677,7 @@ dorgqr_wrapper(LAPACK_INT* m, //number of rows of A static const LAPACK_INT one = 1; std::vector work; auto lwork = std::max(one,4*std::max(*n,*m)); - work.resize(lwork+2); + work.resize(lwork+2); F77NAME(dorgqr)(m,n,k,A,lda,tau,work.data(),&lwork,info); } @@ -535,7 +687,7 @@ dorgqr_wrapper(LAPACK_INT* m, //number of rows of A // // QR factorization of a complex matrix A // -void +void zgeqrf_wrapper(LAPACK_INT* m, //number of rows of A LAPACK_INT* n, //number of cols of A Cplx* A, //matrix A @@ -559,7 +711,7 @@ zgeqrf_wrapper(LAPACK_INT* m, //number of rows of A // // Generates Q from output of QR factorization routine zgeqrf (see above) // -void +void zungqr_wrapper(LAPACK_INT* m, //number of rows of A LAPACK_INT* n, //number of cols of A LAPACK_INT* k, //number of elementary reflectors, typically min(m,n) @@ -675,7 +827,7 @@ zlange_wrapper(char norm, // // Eigenvalues and eigenvectors of complex Hermitian matrix A // -LAPACK_INT +LAPACK_INT zheev_wrapper(LAPACK_INT N, //number of cols of A Cplx * A, //matrix A, on return contains eigenvectors LAPACK_REAL * d) //eigenvalues on return @@ -713,7 +865,7 @@ zheev_wrapper(LAPACK_INT N, //number of cols of A // A and B must be symmetric // B must be positive definite // -void +void dsygv_wrapper(char* jobz, //if 'V', compute both eigs and evecs //if 'N', only eigenvalues char* uplo, //if 'U', use upper triangle of A @@ -743,7 +895,7 @@ dsygv_wrapper(char* jobz, //if 'V', compute both eigs and evecs // Eigenvalues and eigenvectors of real, square matrix A // A can be a general real matrix, not assumed symmetric // -LAPACK_INT +LAPACK_INT dgeev_wrapper(char jobvl, //if 'V', compute left eigenvectors, else 'N' char jobvr, //if 'V', compute right eigenvectors, else 'N' LAPACK_INT n, //number of rows/cols of A @@ -758,7 +910,7 @@ dgeev_wrapper(char jobvl, //if 'V', compute left eigenvectors, else 'N' cpA.resize(n*n); std::copy(A,A+n*n,cpA.data()); - + LAPACK_INT nevecl = (jobvl == 'V' ? n : 1); LAPACK_INT nevecr = (jobvr == 'V' ? 
n : 1); LAPACK_INT info = 0; @@ -802,7 +954,7 @@ dgeev_wrapper(char jobvl, //if 'V', compute left eigenvectors, else 'N' // Eigenvalues and eigenvectors of complex, square matrix A // A can be a general complex matrix, not assumed symmetric // -LAPACK_INT +LAPACK_INT zgeev_wrapper(char jobvl, //if 'V', compute left eigenvectors, else 'N' char jobvr, //if 'V', compute right eigenvectors, else 'N' LAPACK_INT n, //number of rows/cols of A @@ -841,4 +993,3 @@ zgeev_wrapper(char jobvl, //if 'V', compute left eigenvectors, else 'N' } } //namespace itensor - From e421a33a417c2df90eae8f6d4d7bf5a0d65f7143 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Tue, 19 Apr 2022 14:57:05 +0200 Subject: [PATCH 03/68] added cuda in options.mk --- options.mk.sample | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/options.mk.sample b/options.mk.sample index 3412c8961..fe92f215b 100644 --- a/options.mk.sample +++ b/options.mk.sample @@ -9,8 +9,11 @@ ## GNU GCC compiler CCCOM=g++ -m64 -std=c++17 -fconcepts -fPIC +## CUDA compiler +#CCCOM=nvcc -m64 -std=c++17 -fconcepts -fPIC + ## Clang compiler (good to use on macOS) -## Note: try this if you have trouble compiling +## Note: try this if you have trouble compiling ## with g++ on macOS Mojave 10.14 #CCCOM=clang++ -std=c++17 -fPIC -Wno-gcc-compat @@ -26,23 +29,23 @@ CCCOM=g++ -m64 -std=c++17 -fconcepts -fPIC ######### ## [2] -## +## ## BLAS/LAPACK Related Options -## +## ## o The variable PLATFORM can be set to macos,lapack,mkl, or acml. ## This tells ITensor which style of BLAS/Lapack library to expect, ## and turns various lines of code on or off inside the files ## itensor/tensor/lapack_wrap.h and lapack_wrap.cc. -## -## o BLAS_LAPACK_LIBFLAGS specifies blas or lapack related -## flags used during the linking step. For example, +## +## o BLAS_LAPACK_LIBFLAGS specifies blas or lapack related +## flags used during the linking step. For example, ## flags of the type: ## -L/path/to/lapack/lib -llapack -lblas ## though the names of the static libraries (the files referred ## to by the -l flags) can be highly variable - see examples below. ## -## o BLAS_LAPACK_INCLUDEFLAGS are blas or lapack related flags -## needed during compilation. It may include flags such as +## o BLAS_LAPACK_INCLUDEFLAGS are blas or lapack related flags +## needed during compilation. It may include flags such as ## -I/path/to/lapack/include ## where "include" is a folder containing .h header files. ## @@ -60,7 +63,15 @@ BLAS_LAPACK_LIBFLAGS=-framework Accelerate ## #PLATFORM=lapack -#BLAS_LAPACK_LIBFLAGS=-lpthread -L/usr/lib -lblas -llapack +#BLAS_LAPACK_LIBFLAGS=-lpthread -L/usr/lib -lblas -llapack + +## +## Example using CUDA on GNU/LINUX systems +## (Path to lib/ folder may differ on your system) +## + +#PLATFORM=cuda +#BLAS_LAPACK_LIBFLAGS=-lpthread -L/usr/lib -lblas -llapack -lcublas ## ## Example using the Intel MKL library @@ -70,8 +81,8 @@ BLAS_LAPACK_LIBFLAGS=-framework Accelerate #PLATFORM=mkl ## MKL example - you may need to change the names of the folders below #BLAS_LAPACK_LIBFLAGS=-L/opt/intel/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_rt -lmkl_core -liomp5 -lpthread -## Example of using a sequential version of MKL. You may want to link to -## the sequential version of MKL if you are using ITensor's native multithreading, +## Example of using a sequential version of MKL. 
You may want to link to +## the sequential version of MKL if you are using ITensor's native multithreading, ## see section [4] for more details. #BLAS_LAPACK_LIBFLAGS=-L/opt/intel/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_rt -lmkl_core -lpthread #BLAS_LAPACK_INCLUDEFLAGS=-I/opt/intel/mkl/include @@ -100,12 +111,12 @@ BLAS_LAPACK_LIBFLAGS=-framework Accelerate ## If you want ITensor to be compiled with HDF5 support ## uncomment the line below, and edit it to be the ## location in which the HDF5 libraries and headers are -## installed. +## installed. ## ## To determine the prefix on your system, use the command: ## h5cc -show ## -## For more help, visit: +## For more help, visit: ## http://itensor.org/docs.cgi?vers=cppv3&page=install ## #HDF5_PREFIX=/usr/local @@ -120,7 +131,7 @@ BLAS_LAPACK_LIBFLAGS=-framework Accelerate ## conservation may benefit. ## ## If you want to enable OpenMP multithreading in ITensor, -## uncomment the line below. You must have OpenMP installed +## uncomment the line below. You must have OpenMP installed ## on your system. ## ## You should set the environment variable OMP_NUM_THREADS @@ -144,7 +155,7 @@ BLAS_LAPACK_LIBFLAGS=-framework Accelerate ## ## export OPENBLAS_NUM_THREADS=1 ## -## For more help, visit: +## For more help, visit: ## http://itensor.org/docs.cgi?vers=cppv3&page=install ## #ITENSOR_USE_OMP=1 From 40994fbf3710143662b070680930aa91464168dd Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Tue, 19 Apr 2022 15:17:33 +0200 Subject: [PATCH 04/68] fixed typos and compilation options --- itensor/tensor/lapack_wrap.cc | 22 +++++++++++----------- options.mk.sample | 6 +++--- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 09060cd50..46113bec4 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -32,7 +32,7 @@ daxpy_wrapper(LAPACK_INT n, //number of elements of X,Y { #ifdef ITENSOR_USE_CBLAS cblas_daxpy(n,alpha,X,incx,Y,incy); -#elif ITENSOR_USE_CUDA +#elif defined ITENSOR_USE_CUDA cublasHandle_t handle; cublasCreate(&handle); LAPACK_REAL *d_X, *d_Y; @@ -61,7 +61,7 @@ dnrm2_wrapper(LAPACK_INT N, { #ifdef ITENSOR_USE_CBLAS return cblas_dnrm2(N,X,incx); -#elif ITENSOR_USE_CUDA +#elif defined ITENSOR_USE_CUDA cublasHandle_t handle; cublasCreate(&handle); LAPACK_REAL *d_X; @@ -91,7 +91,7 @@ ddot_wrapper(LAPACK_INT N, { #ifdef ITENSOR_USE_CBLAS return cblas_ddot(N,X,incx,Y,incy); -#elif ITENSOR_USE_CUDA +#elif defined ITENSOR_USE_CUDA cublasHandle_t handle; cublasCreate(&handle); LAPACK_REAL *d_X, *d_Y; @@ -136,7 +136,7 @@ zdotc_wrapper(LAPACK_INT N, #endif cblas_zdotc_sub(N,pX,incx,pY,incy,pres); return res; -#elif ITENSOR_USE_CUDA +#elif defined ITENSOR_USE_CUDA cublasHandle_t handle; cublasCreate(&handle); LAPACK_COMPLEX res; @@ -193,7 +193,7 @@ gemm_wrapper(bool transa, ldb = n; } cblas_dgemm(CblasColMajor,at,bt,m,n,k,alpha,A,lda,B,ldb,beta,C,m); -#elif ITENSOR_USE_CUDA +#elif defined ITENSOR_USE_CUDA cublasOperation_t at = CUBLAS_OP_N; cublasOperation_t bt = CUBLAS_OP_N; if(transa) @@ -215,7 +215,7 @@ gemm_wrapper(bool transa, cublasSetMatrix(m, k, sizeof(LAPACK_REAL), A, lda, d_A, lda); cublasSetMatrix(k, n, sizeof(LAPACK_REAL), B, ldb, d_B, ldb); cublasSetMatrix(m, n, sizeof(LAPACK_REAL), C, m, d_C, m); - cublasDgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m) + cublasDgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m); cublasGetMatrix(m, n, sizeof(LAPACK_REAL), d_C, m, C, m); 
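 // A minimal error-handling sketch, not part of this patch (it assumes only
 // standard cuBLAS/CUDA APIs): every cuBLAS call above returns a
 // cublasStatus_t that these wrappers currently discard, so a failed
 // multiply would go unnoticed and C would silently hold stale data.
 // Capturing the status of the gemm call could look like
 //
 //     cublasStatus_t st = cublasDgemm(handle, at, bt, m, n, k, &alpha,
 //                                     d_A, lda, d_B, ldb, &beta, d_C, m);
 //     if(st != CUBLAS_STATUS_SUCCESS)
 //         throw std::runtime_error("cublasDgemm failed"); //needs <stdexcept>
 //
 // cudaMalloc likewise returns a cudaError_t that could be checked the same way.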
cudaFree(d_A); cudaFree(d_B); @@ -284,7 +284,7 @@ gemm_wrapper(bool transa, auto* pB = reinterpret_cast(B); auto* pC = reinterpret_cast(C); cblas_zgemm(CblasColMajor,at,bt,m,n,k,palpha,pA,lda,pB,ldb,pbeta,pC,m); -#elif ITENSOR_USE_CUDA +#elif defined ITENSOR_USE_CUDA cublasOperation_t at = CUBLAS_OP_N; cublasOperation_t bt = CUBLAS_OP_N; if(transa) @@ -306,7 +306,7 @@ gemm_wrapper(bool transa, cublasSetMatrix(m, k, sizeof(LAPACK_COMPLEX), A, lda, d_A, lda); cublasSetMatrix(k, n, sizeof(LAPACK_COMPLEX), B, ldb, d_B, ldb); cublasSetMatrix(m, n, sizeof(LAPACK_COMPLEX), C, m, d_C, m); - cublasZgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m) + cublasZgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m); cublasGetMatrix(m, n, sizeof(LAPACK_COMPLEX), d_C, m, C, m); cudaFree(d_A); cudaFree(d_B); @@ -369,7 +369,7 @@ gemv_wrapper(bool trans, #ifdef ITENSOR_USE_CBLAS auto Tr = trans ? CblasTrans : CblasNoTrans; cblas_dgemv(CblasColMajor,Tr,m,n,alpha,A,m,x,incx,beta,y,incy); -#elif ITENSOR_USE_CUDA +#elif defined ITENSOR_USE_CUDA cublasOperation_t tr = trans ? CUBLAS_OP_T : CUBLAS_OP_N; cublasHandle_t handle; cublasCreate(&handle); @@ -420,7 +420,7 @@ gemv_wrapper(bool trans, auto* px = reinterpret_cast(x); auto* py = reinterpret_cast(y); cblas_zgemv(CblasColMajor,Tr,m,n,palpha,pA,m,px,incx,pbeta,py,incy); -#elif ITENSOR_USE_CUDA +#elif defined ITENSOR_USE_CUDA cublasOperation_t tr = trans ? CUBLAS_OP_T : CUBLAS_OP_N; cublasHandle_t handle; cublasCreate(&handle); @@ -499,7 +499,7 @@ dscal_wrapper(LAPACK_INT N, { #ifdef ITENSOR_USE_CBLAS cblas_dscal(N,alpha,data,inc); -#elif ITENSOR_USE_CUDA +#elif defined ITENSOR_USE_CUDA cublasHandle_t handle; cublasCreate(&handle); LAPACK_REAL *d_x; diff --git a/options.mk.sample b/options.mk.sample index fe92f215b..992bf82bd 100644 --- a/options.mk.sample +++ b/options.mk.sample @@ -10,7 +10,7 @@ CCCOM=g++ -m64 -std=c++17 -fconcepts -fPIC ## CUDA compiler -#CCCOM=nvcc -m64 -std=c++17 -fconcepts -fPIC +#CCCOM=nvcc -m64 -std=c++17 ## Clang compiler (good to use on macOS) ## Note: try this if you have trouble compiling @@ -168,10 +168,10 @@ BLAS_LAPACK_LIBFLAGS=-framework Accelerate ## used to compile optimized and debug code, you can do so here. 
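## A minimal sketch, assuming CCCOM=nvcc (not part of the original sample):
## the -Wall and -Wno-unknown-pragmas flags removed below are host-compiler
## options that nvcc rejects when passed directly; they can instead be
## forwarded to the underlying host compiler through nvcc's -Xcompiler
## pass-through flag, e.g.
#OPTIMIZATIONS=-O2 -DNDEBUG -Xcompiler -Wall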
## Flags to give the compiler for "release mode" -OPTIMIZATIONS=-O2 -DNDEBUG -Wall -Wno-unknown-pragmas +OPTIMIZATIONS=-O2 -DNDEBUG ## Flags to give the compiler for "debug mode" -DEBUGFLAGS=-DDEBUG -g -Wall -Wno-unknown-pragmas -pedantic +DEBUGFLAGS=-DDEBUG -g -pedantic # ## Set this to 1 if you want ITensor to also build dynamic libraries ## These can be faster to link and give smaller binary sizes From 8d2b7452bf7f06143c236bf929b234619235d12b Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Tue, 19 Apr 2022 15:27:10 +0200 Subject: [PATCH 05/68] fixed type cast --- itensor/tensor/lapack_wrap.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 46113bec4..14ad0748f 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -149,7 +149,7 @@ zdotc_wrapper(LAPACK_INT N, cudaFree(d_X); cudaFree(d_Y); cublasDestroy(handle); - return res; + return (LAPACK_COMPLEX) res; #else auto ncX = const_cast(X); auto ncY = const_cast(Y); @@ -306,7 +306,7 @@ gemm_wrapper(bool transa, cublasSetMatrix(m, k, sizeof(LAPACK_COMPLEX), A, lda, d_A, lda); cublasSetMatrix(k, n, sizeof(LAPACK_COMPLEX), B, ldb, d_B, ldb); cublasSetMatrix(m, n, sizeof(LAPACK_COMPLEX), C, m, d_C, m); - cublasZgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m); + cublasZgemm(handle, at, bt, m, n, k, (LAPACK_COMPLEX*) &alpha, d_A, lda, d_B, ldb, (LAPACK_COMPLEX*) &beta, d_C, m); cublasGetMatrix(m, n, sizeof(LAPACK_COMPLEX), d_C, m, C, m); cudaFree(d_A); cudaFree(d_B); @@ -380,7 +380,7 @@ gemv_wrapper(bool trans, cublasSetMatrix(m, n, sizeof(LAPACK_REAL), A, m, d_A, m); cublasSetVector(m, sizeof(LAPACK_REAL), x, incx, d_x, incx); cublasSetVector(m, sizeof(LAPACK_REAL), y, incy, d_y, incy); - cublasDgemv(handle, tr, m, n, &alpha, d_A, m, d_x, incx, &beta, d_y, incy) + cublasDgemv(handle, tr, m, n, &alpha, d_A, m, d_x, incx, &beta, d_y, incy); cublasGetVector(m, sizeof(LAPACK_REAL), d_y, incy, y, incy); cudaFree(d_A); cudaFree(d_x); From 3596a6d0995ddd9aa779711bc05c1e90f1943ae0 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Tue, 19 Apr 2022 15:46:39 +0200 Subject: [PATCH 06/68] fixed types --- itensor/tensor/lapack_wrap.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 14ad0748f..3148c608b 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -149,7 +149,7 @@ zdotc_wrapper(LAPACK_INT N, cudaFree(d_X); cudaFree(d_Y); cublasDestroy(handle); - return (LAPACK_COMPLEX) res; + return (Cplx) res; #else auto ncX = const_cast(X); auto ncY = const_cast(Y); @@ -431,7 +431,7 @@ gemv_wrapper(bool trans, cublasSetMatrix(m, n, sizeof(LAPACK_COMPLEX), A, m, d_A, m); cublasSetVector(m, sizeof(LAPACK_COMPLEX), x, incx, d_x, incx); cublasSetVector(m, sizeof(LAPACK_COMPLEX), y, incy, d_y, incy); - cublasZgemv(handle, tr, m, n, &alpha, d_A, m, d_x, incx, &beta, d_y, incy) + cublasZgemv(handle, tr, m, n, (LAPACK_COMPLEX*) &alpha, d_A, m, d_x, incx, (LAPACK_COMPLEX*) &beta, d_y, incy) cublasGetVector(m, sizeof(LAPACK_COMPLEX), d_y, incy, y, incy); cudaFree(d_A); cudaFree(d_x); @@ -505,7 +505,7 @@ dscal_wrapper(LAPACK_INT N, LAPACK_REAL *d_x; cudaMalloc(&d_x, N * sizeof(LAPACK_REAL)); cublasSetVector(N, sizeof(LAPACK_REAL), data, inc, d_x, inc); - cublasDscal(handle, N, &alpha, d_x, inc) + cublasDscal(handle, N, &alpha, d_x, inc); cublasGetVector(N, sizeof(LAPACK_REAL), d_x, inc, data, inc); 
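 // A minimal sketch, not part of this patch: creating and destroying a
 // cublasHandle_t inside every wrapper call is costly, and the cuBLAS
 // documentation recommends reusing a handle across calls. One hedged
 // alternative (assuming these wrappers are only called from a single host
 // thread) is a function-local static initialized on first use:
 //
 //     static cublasHandle_t handle = [](){ cublasHandle_t h;
 //                                          cublasCreate(&h);
 //                                          return h; }();
 //
 // at the cost of never calling cublasDestroy for it.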
cudaFree(d_x);
 cublasDestroy(handle);

From 9753d7e91540008193ec366c97209e5d452d2c29 Mon Sep 17 00:00:00 2001
From: Mauro d'Arcangelo
Date: Tue, 19 Apr 2022 15:48:09 +0200
Subject: [PATCH 07/68] fixed types

---
 itensor/tensor/lapack_wrap.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc
index 3148c608b..e8a3d1e36 100644
--- a/itensor/tensor/lapack_wrap.cc
+++ b/itensor/tensor/lapack_wrap.cc
@@ -149,7 +149,7 @@ zdotc_wrapper(LAPACK_INT N,
 cudaFree(d_X);
 cudaFree(d_Y);
 cublasDestroy(handle);
- return (Cplx) res;
+ return Cplx(res.x, res.y);
 #else
 auto ncX = const_cast(X);
 auto ncY = const_cast(Y);
@@ -431,7 +431,7 @@ gemv_wrapper(
 cublasSetMatrix(m, n, sizeof(LAPACK_COMPLEX), A, m, d_A, m);
 cublasSetVector(m, sizeof(LAPACK_COMPLEX), x, incx, d_x, incx);
 cublasSetVector(m, sizeof(LAPACK_COMPLEX), y, incy, d_y, incy);
- cublasZgemv(handle, tr, m, n, (LAPACK_COMPLEX*) &alpha, d_A, m, d_x, incx, (LAPACK_COMPLEX*) &beta, d_y, incy)
+ cublasZgemv(handle, tr, m, n, (LAPACK_COMPLEX*) &alpha, d_A, m, d_x, incx, (LAPACK_COMPLEX*) &beta, d_y, incy);
 cublasGetVector(m, sizeof(LAPACK_COMPLEX), d_y, incy, y, incy);
 cudaFree(d_A);
 cudaFree(d_x);

From 84011c546ba640611e57f724d7d748c4a9ab11cd Mon Sep 17 00:00:00 2001
From: Mauro d'Arcangelo
Date: Tue, 19 Apr 2022 15:56:02 +0200
Subject: [PATCH 08/68] added fortran functions

---
 itensor/tensor/lapack_wrap.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/itensor/tensor/lapack_wrap.h b/itensor/tensor/lapack_wrap.h
index 0af8e3c49..7000c07c3 100644
--- a/itensor/tensor/lapack_wrap.h
+++ b/itensor/tensor/lapack_wrap.h
@@ -145,6 +145,7 @@ imagRef(LAPACK_COMPLEX & z)
 #elif defined PLATFORM_cuda

 #define ITENSOR_USE_CUDA
+#define LAPACK_REQUIRE_EXTERN

 #include
 #include
 #include
 #include

From c9535cd77c7a46aa8bb5ec54ef3aec0b029c5814 Mon Sep 17 00:00:00 2001
From: Mauro d'Arcangelo
Date: Thu, 28 Apr 2022 11:05:12 +0200
Subject: [PATCH 09/68] trying to fix already defined type error

---
 itensor/tensor/lapack_wrap.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/itensor/tensor/lapack_wrap.h b/itensor/tensor/lapack_wrap.h
index 7000c07c3..014cd1e57 100644
--- a/itensor/tensor/lapack_wrap.h
+++ b/itensor/tensor/lapack_wrap.h
@@ -145,7 +145,7 @@ imagRef(LAPACK_COMPLEX & z)
 #elif defined PLATFORM_cuda

 #define ITENSOR_USE_CUDA
-#define LAPACK_REQUIRE_EXTERN
+//#define LAPACK_REQUIRE_EXTERN

 #include
 #include
 #include
 #include

 namespace itensor {
-cudaDataType_t typeComplexData = CUDA_C_64F;
-cudaDataType_t typeRealData = CUDA_R_64F;
-cublasComputeType_t typeCompute = CUBLAS_COMPUTE_64F;
+//cudaDataType_t typeComplexData = CUDA_C_64F;
+//cudaDataType_t typeRealData = CUDA_R_64F;
+//cublasComputeType_t typeCompute = CUBLAS_COMPUTE_64F;

From c94c67dcffdf885611af11432549af594e925446 Mon Sep 17 00:00:00 2001
From: Mauro d'Arcangelo
Date: Thu, 28 Apr 2022 11:47:10 +0200
Subject: [PATCH 10/68] changed lapack in cuda platform

---
 itensor/tensor/lapack_wrap.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/itensor/tensor/lapack_wrap.h b/itensor/tensor/lapack_wrap.h
index 014cd1e57..9c1ecc18a 100644
--- a/itensor/tensor/lapack_wrap.h
+++ b/itensor/tensor/lapack_wrap.h
@@ -151,6 +151,8 @@ imagRef(LAPACK_COMPLEX & z)
 #include
 #include
 #include
+#include "lapacke.h"
+#undef I //lapacke.h includes complex.h which defined an `I` macro

 namespace itensor {
 //cudaDataType_t typeComplexData = 
CUDA_C_64F; From 16de1b7115f1dd840cabee80bc081cb8d9ce35fe Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Thu, 28 Apr 2022 11:51:49 +0200 Subject: [PATCH 11/68] reverted lapack in cuda platform --- itensor/tensor/lapack_wrap.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/itensor/tensor/lapack_wrap.h b/itensor/tensor/lapack_wrap.h index 9c1ecc18a..ddab8c504 100644 --- a/itensor/tensor/lapack_wrap.h +++ b/itensor/tensor/lapack_wrap.h @@ -145,14 +145,12 @@ imagRef(LAPACK_COMPLEX & z) #elif defined PLATFORM_cuda #define ITENSOR_USE_CUDA -//#define LAPACK_REQUIRE_EXTERN +#define LAPACK_REQUIRE_EXTERN #include #include #include #include -#include "lapacke.h" -#undef I //lapacke.h includes complex.h which defined an `I` macro namespace itensor { //cudaDataType_t typeComplexData = CUDA_C_64F; From 33f23b3a14ec0bb730ec4e94cf90b0ffe216a1c3 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Thu, 28 Apr 2022 12:02:55 +0200 Subject: [PATCH 12/68] set cuda default in options --- options.mk.sample | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/options.mk.sample b/options.mk.sample index 992bf82bd..280a8f0e2 100644 --- a/options.mk.sample +++ b/options.mk.sample @@ -7,10 +7,10 @@ ## ## Set which compiler to use by defining CCCOM: ## GNU GCC compiler -CCCOM=g++ -m64 -std=c++17 -fconcepts -fPIC +#CCCOM=g++ -m64 -std=c++17 -fconcepts -fPIC ## CUDA compiler -#CCCOM=nvcc -m64 -std=c++17 +CCCOM=nvcc -m64 -std=c++17 ## Clang compiler (good to use on macOS) ## Note: try this if you have trouble compiling @@ -54,8 +54,8 @@ CCCOM=g++ -m64 -std=c++17 -fconcepts -fPIC ## Mac OSX system ## -PLATFORM=macos -BLAS_LAPACK_LIBFLAGS=-framework Accelerate +#PLATFORM=macos +#BLAS_LAPACK_LIBFLAGS=-framework Accelerate ## ## Example using a C interface to LAPACK on GNU/LINUX systems @@ -70,8 +70,8 @@ BLAS_LAPACK_LIBFLAGS=-framework Accelerate ## (Path to lib/ folder may differ on your system) ## -#PLATFORM=cuda -#BLAS_LAPACK_LIBFLAGS=-lpthread -L/usr/lib -lblas -llapack -lcublas +PLATFORM=cuda +BLAS_LAPACK_LIBFLAGS=-lpthread -L/usr/lib -lblas -llapack -lcublas ## ## Example using the Intel MKL library @@ -171,7 +171,7 @@ BLAS_LAPACK_LIBFLAGS=-framework Accelerate OPTIMIZATIONS=-O2 -DNDEBUG ## Flags to give the compiler for "debug mode" -DEBUGFLAGS=-DDEBUG -g -pedantic +DEBUGFLAGS=-DDEBUG -g # ## Set this to 1 if you want ITensor to also build dynamic libraries ## These can be faster to link and give smaller binary sizes @@ -217,7 +217,7 @@ ITENSOR_LIBFLAGS += -fopenmp ITENSOR_LIBGFLAGS += -fopenmp endif -CCFLAGS=-I. $(ITENSOR_INCLUDEFLAGS) $(OPTIMIZATIONS) -Wno-unused-variable +CCFLAGS=-I. $(ITENSOR_INCLUDEFLAGS) $(OPTIMIZATIONS) CCGFLAGS=-I. 
$(ITENSOR_INCLUDEFLAGS) $(DEBUGFLAGS) LIBFLAGS=-L'$(ITENSOR_LIBDIR)' $(ITENSOR_LIBFLAGS) LIBGFLAGS=-L'$(ITENSOR_LIBDIR)' $(ITENSOR_LIBGFLAGS) From 3fcc47e34228ce2bd969324fdc30c2ffe10419cd Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 29 Apr 2022 16:46:35 +0200 Subject: [PATCH 13/68] added nrm2 debug print --- itensor/tensor/lapack_wrap.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index e8a3d1e36..fd20dc57a 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -68,7 +68,13 @@ dnrm2_wrapper(LAPACK_INT N, cudaMalloc(&d_X, N * sizeof(LAPACK_REAL)); cublasSetVector(N, sizeof(LAPACK_REAL), X, incx, d_X, incx); LAPACK_REAL result; - cublasDnrm2(handle, N, d_X, incx, &result); + cublasDnrm2(handle, N, d_X, incxr &result); + std::cout << result << " is the norm of:" << std:endl; + for(auto v : X) + { + std::cout << v << " "; + } + std::cout << std::endl << std::endl; cudaFree(d_X); cublasDestroy(handle); return result; From 89b729e77c9d5d1bdef2e37aa8e435300bc6ba67 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 29 Apr 2022 16:50:09 +0200 Subject: [PATCH 14/68] fixed typo in nrm2 --- itensor/tensor/lapack_wrap.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index fd20dc57a..d1d6686e1 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -68,7 +68,7 @@ dnrm2_wrapper(LAPACK_INT N, cudaMalloc(&d_X, N * sizeof(LAPACK_REAL)); cublasSetVector(N, sizeof(LAPACK_REAL), X, incx, d_X, incx); LAPACK_REAL result; - cublasDnrm2(handle, N, d_X, incxr &result); + cublasDnrm2(handle, N, d_X, incx, &result); std::cout << result << " is the norm of:" << std:endl; for(auto v : X) { From a4e089a437769b1642652157588315dd0a79de77 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 29 Apr 2022 16:50:56 +0200 Subject: [PATCH 15/68] fixed typo in std::endl --- itensor/tensor/lapack_wrap.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index d1d6686e1..96230c997 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -69,7 +69,7 @@ dnrm2_wrapper(LAPACK_INT N, cublasSetVector(N, sizeof(LAPACK_REAL), X, incx, d_X, incx); LAPACK_REAL result; cublasDnrm2(handle, N, d_X, incx, &result); - std::cout << result << " is the norm of:" << std:endl; + std::cout << result << " is the norm of:" << std::endl; for(auto v : X) { std::cout << v << " "; From 8a395da2d97c2ed9a83e2962c5783c8847f9f50e Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 29 Apr 2022 16:53:04 +0200 Subject: [PATCH 16/68] fixed vector printing --- itensor/tensor/lapack_wrap.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 96230c997..7ff396944 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -70,9 +70,9 @@ dnrm2_wrapper(LAPACK_INT N, LAPACK_REAL result; cublasDnrm2(handle, N, d_X, incx, &result); std::cout << result << " is the norm of:" << std::endl; - for(auto v : X) + for(int i=0; i Date: Fri, 29 Apr 2022 17:09:45 +0200 Subject: [PATCH 17/68] added debug in gemv --- itensor/tensor/lapack_wrap.cc | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 7ff396944..bfaea6d4c 100644 --- 
a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -391,6 +391,26 @@ gemv_wrapper(bool trans, cudaFree(d_A); cudaFree(d_x); cudaFree(d_y); + std::cout << "Matrix A:" << std::endl; + for(int i=0; i Date: Fri, 29 Apr 2022 17:20:14 +0200 Subject: [PATCH 18/68] added zgemv debug --- itensor/tensor/lapack_wrap.cc | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index bfaea6d4c..b3b213296 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -462,6 +462,25 @@ gemv_wrapper(bool trans, cudaFree(d_A); cudaFree(d_x); cudaFree(d_y); + std::cout << "Matrix A:" << std::endl; + for(int i=0; i Date: Fri, 29 Apr 2022 17:45:44 +0200 Subject: [PATCH 19/68] added debug dgemm --- itensor/tensor/lapack_wrap.cc | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index b3b213296..820796334 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -227,6 +227,27 @@ gemm_wrapper(bool transa, cudaFree(d_B); cudaFree(d_C); cublasDestroy(handle); + std::cout << "Matrix A:" << std::endl; + for(int i=0; i(A); auto *pB = const_cast(B); From 40aabd50708ee39755be37bad59a296d3d746d55 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 29 Apr 2022 17:54:48 +0200 Subject: [PATCH 20/68] dont actually use gemm on gpu --- itensor/tensor/lapack_wrap.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 820796334..6ac6a7f40 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -248,6 +248,7 @@ gemm_wrapper(bool transa, if(!((i+1)%m)) std::cout << std::endl; } std::cout << std::endl; + F77NAME(dgemm)(&at,&bt,&m,&n,&k,&alpha,pA,&lda,pB,&ldb,&beta,C,&m); #else auto *pA = const_cast(A); auto *pB = const_cast(B); From 5de66a00bb67af5e37436a0b733a7a09878719cb Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 29 Apr 2022 17:57:33 +0200 Subject: [PATCH 21/68] dont actually use gemm on gpu - fixed typo --- itensor/tensor/lapack_wrap.cc | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 6ac6a7f40..2d9f65ab2 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -248,7 +248,37 @@ gemm_wrapper(bool transa, if(!((i+1)%m)) std::cout << std::endl; } std::cout << std::endl; - F77NAME(dgemm)(&at,&bt,&m,&n,&k,&alpha,pA,&lda,pB,&ldb,&beta,C,&m); + + // I just want to try to not use gpu multiplication + auto *pA = const_cast(A); + auto *pB = const_cast(B); + char at = 'N'; + char bt = 'N'; + if(transa) + { + at = 'T'; + lda = k; + } + if(transb) + { + bt = 'T'; + ldb = n; + } + auto *pA = const_cast(A); + auto *pB = const_cast(B); + char at2 = 'N'; + char bt2 = 'N'; + if(transa) + { + at2 = 'T'; + lda = k; + } + if(transb) + { + bt2 = 'T'; + ldb = n; + } + F77NAME(dgemm)(&at2,&bt2,&m,&n,&k,&alpha,pA,&lda,pB,&ldb,&beta,C,&m); #else auto *pA = const_cast(A); auto *pB = const_cast(B); From 39d5b67c02a06835d1338cded6a778132323a1b6 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 29 Apr 2022 17:59:01 +0200 Subject: [PATCH 22/68] dont actually use gemm on gpu - fixed typo 2 --- itensor/tensor/lapack_wrap.cc | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 2d9f65ab2..c176cf199 
100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -252,20 +252,6 @@ gemm_wrapper(bool transa, // I just want to try to not use gpu multiplication auto *pA = const_cast(A); auto *pB = const_cast(B); - char at = 'N'; - char bt = 'N'; - if(transa) - { - at = 'T'; - lda = k; - } - if(transb) - { - bt = 'T'; - ldb = n; - } - auto *pA = const_cast(A); - auto *pB = const_cast(B); char at2 = 'N'; char bt2 = 'N'; if(transa) From fa88178ebb0162dfd28e55e506d2c151d9526686 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 29 Apr 2022 18:46:17 +0200 Subject: [PATCH 23/68] dont actually use gemm on gpu - dont print --- itensor/tensor/lapack_wrap.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index c176cf199..5ea4184bf 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -200,6 +200,7 @@ gemm_wrapper(bool transa, } cblas_dgemm(CblasColMajor,at,bt,m,n,k,alpha,A,lda,B,ldb,beta,C,m); #elif defined ITENSOR_USE_CUDA +/* cublasOperation_t at = CUBLAS_OP_N; cublasOperation_t bt = CUBLAS_OP_N; if(transa) @@ -248,6 +249,7 @@ gemm_wrapper(bool transa, if(!((i+1)%m)) std::cout << std::endl; } std::cout << std::endl; +*/ // I just want to try to not use gpu multiplication auto *pA = const_cast(A); From 83c8d2b049d64501d8e0abed7f4d756260eb801f Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Mon, 2 May 2022 14:31:17 +0200 Subject: [PATCH 24/68] output A B C --- itensor/tensor/lapack_wrap.cc | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 5ea4184bf..fd92ca1d4 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -69,12 +69,12 @@ dnrm2_wrapper(LAPACK_INT N, cublasSetVector(N, sizeof(LAPACK_REAL), X, incx, d_X, incx); LAPACK_REAL result; cublasDnrm2(handle, N, d_X, incx, &result); - std::cout << result << " is the norm of:" << std::endl; - for(int i=0; i(A); auto *pB = const_cast(B); @@ -267,6 +267,7 @@ gemm_wrapper(bool transa, ldb = n; } F77NAME(dgemm)(&at2,&bt2,&m,&n,&k,&alpha,pA,&lda,pB,&ldb,&beta,C,&m); +*/ #else auto *pA = const_cast(A); auto *pB = const_cast(B); From 3d19cddfbb90aaaca53f14f4981e0a368fc2c03e Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Mon, 2 May 2022 14:43:35 +0200 Subject: [PATCH 25/68] output A B C and transa b --- itensor/tensor/lapack_wrap.cc | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index fd92ca1d4..544fa1964 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -223,19 +223,27 @@ gemm_wrapper(bool transa, cublasSetMatrix(k, n, sizeof(LAPACK_REAL), B, ldb, d_B, ldb); cublasSetMatrix(m, n, sizeof(LAPACK_REAL), C, m, d_C, m); cublasDgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m); + double* A_copy; + malloc(A_copy, m * k * sizeof(double)); + cublasGetMatrix(m, k, sizeof(LAPACK_REAL), d_A, m, A_copy, m); + std::cout << "difference" << std::endl; + for(int i=0; i Date: Mon, 2 May 2022 14:45:14 +0200 Subject: [PATCH 26/68] fixed malloc --- itensor/tensor/lapack_wrap.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 544fa1964..b1b2fc3e7 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -224,7 +224,7 @@ gemm_wrapper(bool transa, 
cublasSetMatrix(m, n, sizeof(LAPACK_REAL), C, m, d_C, m); cublasDgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m); double* A_copy; - malloc(A_copy, m * k * sizeof(double)); + A_copy = malloc(m * k * sizeof(double)); cublasGetMatrix(m, k, sizeof(LAPACK_REAL), d_A, m, A_copy, m); std::cout << "difference" << std::endl; for(int i=0; i Date: Mon, 2 May 2022 14:46:15 +0200 Subject: [PATCH 27/68] fixed malloc --- itensor/tensor/lapack_wrap.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index b1b2fc3e7..5adafa6f8 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -224,7 +224,7 @@ gemm_wrapper(bool transa, cublasSetMatrix(m, n, sizeof(LAPACK_REAL), C, m, d_C, m); cublasDgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m); double* A_copy; - A_copy = malloc(m * k * sizeof(double)); + A_copy = (*double)malloc(m * k * sizeof(double)); cublasGetMatrix(m, k, sizeof(LAPACK_REAL), d_A, m, A_copy, m); std::cout << "difference" << std::endl; for(int i=0; i Date: Mon, 2 May 2022 14:47:00 +0200 Subject: [PATCH 28/68] fixed malloc --- itensor/tensor/lapack_wrap.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 5adafa6f8..8f88602ed 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -224,7 +224,7 @@ gemm_wrapper(bool transa, cublasSetMatrix(m, n, sizeof(LAPACK_REAL), C, m, d_C, m); cublasDgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m); double* A_copy; - A_copy = (*double)malloc(m * k * sizeof(double)); + A_copy = (double*)malloc(m * k * sizeof(double)); cublasGetMatrix(m, k, sizeof(LAPACK_REAL), d_A, m, A_copy, m); std::cout << "difference" << std::endl; for(int i=0; i Date: Mon, 2 May 2022 15:00:53 +0200 Subject: [PATCH 29/68] trying cudamemcpy --- itensor/tensor/lapack_wrap.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 8f88602ed..43d128744 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -224,14 +224,16 @@ gemm_wrapper(bool transa, cublasSetMatrix(m, n, sizeof(LAPACK_REAL), C, m, d_C, m); cublasDgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m); double* A_copy; - A_copy = (double*)malloc(m * k * sizeof(double)); - cublasGetMatrix(m, k, sizeof(LAPACK_REAL), d_A, m, A_copy, m); + A_copy = (LAPACK_REAL*)malloc(m * k * sizeof(LAPACK_REAL)); + //cublasGetMatrix(m, k, sizeof(LAPACK_REAL), d_A, m, A_copy, m); + cudaMemcpy( A_copy, d_A, m * k * sizeof(LAPACK_REAL), cudaMemcpyDeviceToHost); std::cout << "difference" << std::endl; for(int i=0; i Date: Mon, 2 May 2022 15:09:54 +0200 Subject: [PATCH 30/68] trying cudamemcpy instead of setmatrix --- itensor/tensor/lapack_wrap.cc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 43d128744..43d3920b8 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -219,14 +219,17 @@ gemm_wrapper(bool transa, cudaMalloc(&d_A, m * k * sizeof(LAPACK_REAL)); cudaMalloc(&d_B, k * n * sizeof(LAPACK_REAL)); cudaMalloc(&d_C, m * n * sizeof(LAPACK_REAL)); - cublasSetMatrix(m, k, sizeof(LAPACK_REAL), A, lda, d_A, lda); - cublasSetMatrix(k, n, sizeof(LAPACK_REAL), B, ldb, d_B, ldb); - cublasSetMatrix(m, n, 
sizeof(LAPACK_REAL), C, m, d_C, m); + //cublasSetMatrix(m, k, sizeof(LAPACK_REAL), A, lda, d_A, lda); + cudaMemcpy(d_A, A, m * k * sizeof(LAPACK_REAL), cudaMemcpyHostToDevice); + //cublasSetMatrix(k, n, sizeof(LAPACK_REAL), B, ldb, d_B, ldb); + cudaMemcpy(d_B, B, k * n * sizeof(LAPACK_REAL), cudaMemcpyHostToDevice); + //cublasSetMatrix(m, n, sizeof(LAPACK_REAL), C, m, d_C, m); + cudaMemcpy(d_C, C, m * n * sizeof(LAPACK_REAL), cudaMemcpyHostToDevice); cublasDgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m); double* A_copy; A_copy = (LAPACK_REAL*)malloc(m * k * sizeof(LAPACK_REAL)); //cublasGetMatrix(m, k, sizeof(LAPACK_REAL), d_A, m, A_copy, m); - cudaMemcpy( A_copy, d_A, m * k * sizeof(LAPACK_REAL), cudaMemcpyDeviceToHost); + cudaMemcpy(A_copy, d_A, m * k * sizeof(LAPACK_REAL), cudaMemcpyDeviceToHost); std::cout << "difference" << std::endl; for(int i=0; i Date: Mon, 2 May 2022 15:16:25 +0200 Subject: [PATCH 31/68] fixed gemm? --- itensor/tensor/lapack_wrap.cc | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 43d3920b8..d0de1fef2 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -226,21 +226,13 @@ gemm_wrapper(bool transa, //cublasSetMatrix(m, n, sizeof(LAPACK_REAL), C, m, d_C, m); cudaMemcpy(d_C, C, m * n * sizeof(LAPACK_REAL), cudaMemcpyHostToDevice); cublasDgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m); - double* A_copy; - A_copy = (LAPACK_REAL*)malloc(m * k * sizeof(LAPACK_REAL)); - //cublasGetMatrix(m, k, sizeof(LAPACK_REAL), d_A, m, A_copy, m); - cudaMemcpy(A_copy, d_A, m * k * sizeof(LAPACK_REAL), cudaMemcpyDeviceToHost); - std::cout << "difference" << std::endl; - for(int i=0; i Date: Tue, 3 May 2022 10:55:57 +0200 Subject: [PATCH 32/68] changed every set get with memcpy --- itensor/tensor/lapack_wrap.cc | 136 ++++++---------------------------- 1 file changed, 22 insertions(+), 114 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index d0de1fef2..aeee8d749 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -38,10 +38,10 @@ daxpy_wrapper(LAPACK_INT n, //number of elements of X,Y LAPACK_REAL *d_X, *d_Y; cudaMalloc(&d_X, n * sizeof(LAPACK_REAL)); cudaMalloc(&d_Y, n * sizeof(LAPACK_REAL)); - cublasSetVector(n, sizeof(LAPACK_REAL), X, incx, d_X, incx); - cublasSetVector(n, sizeof(LAPACK_REAL), Y, incy, d_Y, incy); + cudaMemcpy(d_X, X, n * sizeof(LAPACK_REAL), cudaMemcpyHostToDevice); + cudaMemcpy(d_Y, Y, n * sizeof(LAPACK_REAL), cudaMemcpyHostToDevice); cublasDaxpy(handle, n, &alpha, d_X, incx, d_Y, incy); - cublasGetVector(n, sizeof(LAPACK_REAL), d_Y, incy, Y, incy); + cudaMemcpy(Y, d_Y, n * sizeof(LAPACK_REAL), cudaMemcpyDeviceToHost); cudaFree(d_X); cudaFree(d_Y); cublasDestroy(handle); @@ -66,15 +66,9 @@ dnrm2_wrapper(LAPACK_INT N, cublasCreate(&handle); LAPACK_REAL *d_X; cudaMalloc(&d_X, N * sizeof(LAPACK_REAL)); - cublasSetVector(N, sizeof(LAPACK_REAL), X, incx, d_X, incx); + cudaMemcpy(d_X, X, N * sizeof(LAPACK_REAL), cudaMemcpyHostToDevice); LAPACK_REAL result; cublasDnrm2(handle, N, d_X, incx, &result); - //std::cout << result << " is the norm of:" << std::endl; - //for(int i=0; i(A); - auto *pB = const_cast(B); - char at2 = 'N'; - char bt2 = 'N'; - if(transa) - { - at2 = 'T'; - lda = k; - } - if(transb) - { - bt2 = 'T'; - ldb = n; - } - F77NAME(dgemm)(&at2,&bt2,&m,&n,&k,&alpha,pA,&lda,pB,&ldb,&beta,C,&m); -*/ #else auto *pA = 
const_cast(A); auto *pB = const_cast(B); @@ -356,11 +303,11 @@ gemm_wrapper(bool transa, cudaMalloc(&d_A, m * k * sizeof(LAPACK_COMPLEX)); cudaMalloc(&d_B, k * n * sizeof(LAPACK_COMPLEX)); cudaMalloc(&d_C, m * n * sizeof(LAPACK_COMPLEX)); - cublasSetMatrix(m, k, sizeof(LAPACK_COMPLEX), A, lda, d_A, lda); - cublasSetMatrix(k, n, sizeof(LAPACK_COMPLEX), B, ldb, d_B, ldb); - cublasSetMatrix(m, n, sizeof(LAPACK_COMPLEX), C, m, d_C, m); + cudaMemcpy(d_A, A, m * k * sizeof(LAPACK_COMPLEX), cudaMemcpyHostToDevice); + cudaMemcpy(d_B, B, k * n * sizeof(LAPACK_COMPLEX), cudaMemcpyHostToDevice); + cudaMemcpy(d_C, C, m * n * sizeof(LAPACK_COMPLEX), cudaMemcpyHostToDevice); cublasZgemm(handle, at, bt, m, n, k, (LAPACK_COMPLEX*) &alpha, d_A, lda, d_B, ldb, (LAPACK_COMPLEX*) &beta, d_C, m); - cublasGetMatrix(m, n, sizeof(LAPACK_COMPLEX), d_C, m, C, m); + cudaMemcpy(C, d_C, m * n * sizeof(LAPACK_COMPLEX), cudaMemcpyDeviceToHost); cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); @@ -430,34 +377,14 @@ gemv_wrapper(bool trans, cudaMalloc(&d_A, m * n * sizeof(LAPACK_REAL)); cudaMalloc(&d_x, m * sizeof(LAPACK_REAL)); cudaMalloc(&d_y, m * sizeof(LAPACK_REAL)); - cublasSetMatrix(m, n, sizeof(LAPACK_REAL), A, m, d_A, m); - cublasSetVector(m, sizeof(LAPACK_REAL), x, incx, d_x, incx); - cublasSetVector(m, sizeof(LAPACK_REAL), y, incy, d_y, incy); + cudaMemcpy(d_A, A, m * n * sizeof(LAPACK_REAL), cudaMemcpyHostToDevice); + cudaMemcpy(d_x, x, m * sizeof(LAPACK_REAL), cudaMemcpyHostToDevice); + cudaMemcpy(d_y, y, m * sizeof(LAPACK_REAL), cudaMemcpyHostToDevice); cublasDgemv(handle, tr, m, n, &alpha, d_A, m, d_x, incx, &beta, d_y, incy); - cublasGetVector(m, sizeof(LAPACK_REAL), d_y, incy, y, incy); + cudaMemcpy(y, d_y, m * sizeof(LAPACK_REAL), cudaMemcpyDeviceToHost); cudaFree(d_A); cudaFree(d_x); cudaFree(d_y); - std::cout << "Matrix A:" << std::endl; - for(int i=0; i Date: Tue, 3 May 2022 12:46:39 +0200 Subject: [PATCH 33/68] trying static cublas handle --- itensor/tensor/lapack_wrap.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index aeee8d749..965bbdb8f 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -62,15 +62,15 @@ dnrm2_wrapper(LAPACK_INT N, #ifdef ITENSOR_USE_CBLAS return cblas_dnrm2(N,X,incx); #elif defined ITENSOR_USE_CUDA - cublasHandle_t handle; - cublasCreate(&handle); + static cublasHandle_t handle; + //cublasCreate(&handle); LAPACK_REAL *d_X; cudaMalloc(&d_X, N * sizeof(LAPACK_REAL)); cudaMemcpy(d_X, X, N * sizeof(LAPACK_REAL), cudaMemcpyHostToDevice); LAPACK_REAL result; cublasDnrm2(handle, N, d_X, incx, &result); cudaFree(d_X); - cublasDestroy(handle); + //cublasDestroy(handle); return result; #else auto *Xnc = const_cast(X); @@ -206,8 +206,8 @@ gemm_wrapper(bool transa, bt = CUBLAS_OP_T; ldb = n; } - cublasHandle_t handle; - cublasCreate(&handle); + static cublasHandle_t handle; + //cublasCreate(&handle); LAPACK_REAL *d_A, *d_B, *d_C; cudaMalloc(&d_A, m * k * sizeof(LAPACK_REAL)); cudaMalloc(&d_B, k * n * sizeof(LAPACK_REAL)); @@ -220,7 +220,7 @@ gemm_wrapper(bool transa, cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); - cublasDestroy(handle); + //cublasDestroy(handle); #else auto *pA = const_cast(A); auto *pB = const_cast(B); From ff6ff239bb7fe0500aad3e6309286c254c51b614 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Tue, 3 May 2022 12:57:57 +0200 Subject: [PATCH 34/68] reverting static handle --- itensor/tensor/lapack_wrap.cc | 12 ++++++------ 1 file changed, 6 
insertions(+), 6 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 965bbdb8f..aeee8d749 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -62,15 +62,15 @@ dnrm2_wrapper(LAPACK_INT N, #ifdef ITENSOR_USE_CBLAS return cblas_dnrm2(N,X,incx); #elif defined ITENSOR_USE_CUDA - static cublasHandle_t handle; - //cublasCreate(&handle); + cublasHandle_t handle; + cublasCreate(&handle); LAPACK_REAL *d_X; cudaMalloc(&d_X, N * sizeof(LAPACK_REAL)); cudaMemcpy(d_X, X, N * sizeof(LAPACK_REAL), cudaMemcpyHostToDevice); LAPACK_REAL result; cublasDnrm2(handle, N, d_X, incx, &result); cudaFree(d_X); - //cublasDestroy(handle); + cublasDestroy(handle); return result; #else auto *Xnc = const_cast(X); @@ -206,8 +206,8 @@ gemm_wrapper(bool transa, bt = CUBLAS_OP_T; ldb = n; } - static cublasHandle_t handle; - //cublasCreate(&handle); + cublasHandle_t handle; + cublasCreate(&handle); LAPACK_REAL *d_A, *d_B, *d_C; cudaMalloc(&d_A, m * k * sizeof(LAPACK_REAL)); cudaMalloc(&d_B, k * n * sizeof(LAPACK_REAL)); @@ -220,7 +220,7 @@ gemm_wrapper(bool transa, cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); - //cublasDestroy(handle); + cublasDestroy(handle); #else auto *pA = const_cast(A); auto *pB = const_cast(B); From 3ffaf1d56afe426ac50f97fbc6f65f90d1e0a12c Mon Sep 17 00:00:00 2001 From: Kemal Bidzhiev Date: Thu, 5 May 2022 13:43:28 +0200 Subject: [PATCH 35/68] comment to the option.mk --- options.mk.sample | 1 + 1 file changed, 1 insertion(+) diff --git a/options.mk.sample b/options.mk.sample index 280a8f0e2..5908a012c 100644 --- a/options.mk.sample +++ b/options.mk.sample @@ -10,6 +10,7 @@ #CCCOM=g++ -m64 -std=c++17 -fconcepts -fPIC ## CUDA compiler +## comment line CCCOM=nvcc -m64 -std=c++17 ## Clang compiler (good to use on macOS) From 6a688e7b6da44b6f903dca10d1c2a487f67c7a29 Mon Sep 17 00:00:00 2001 From: Kemal Bidzhiev Date: Thu, 5 May 2022 13:49:24 +0200 Subject: [PATCH 36/68] comment in cuda section is removed --- options.mk.sample | 1 - 1 file changed, 1 deletion(-) diff --git a/options.mk.sample b/options.mk.sample index 5908a012c..280a8f0e2 100644 --- a/options.mk.sample +++ b/options.mk.sample @@ -10,7 +10,6 @@ #CCCOM=g++ -m64 -std=c++17 -fconcepts -fPIC ## CUDA compiler -## comment line CCCOM=nvcc -m64 -std=c++17 ## Clang compiler (good to use on macOS) From 61bf459db68ca2cce28c3836def56475257a9413 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Thu, 5 May 2022 14:01:42 +0200 Subject: [PATCH 37/68] comments showing how to replace storage type --- itensor/util/vector_no_init.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/itensor/util/vector_no_init.h b/itensor/util/vector_no_init.h index 19200ff92..e1e8ea177 100644 --- a/itensor/util/vector_no_init.h +++ b/itensor/util/vector_no_init.h @@ -67,7 +67,12 @@ class uninitialized_allocator }; template -using vector_no_init = std::vector>; +//#ifdef PLATFORM_cuda + //using vector_no_init = thrust::vector>; +//#else + using vector_no_init = std::vector>; +//#endif + } //namespace itensor From 0a0dc51847bdbdf1a25871309190fd4624c52f01 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Tue, 17 May 2022 16:13:07 +0200 Subject: [PATCH 38/68] thrust allocation --- itensor/util/vector_no_init.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/itensor/util/vector_no_init.h b/itensor/util/vector_no_init.h index e1e8ea177..9c1fc5a07 100644 --- a/itensor/util/vector_no_init.h +++ b/itensor/util/vector_no_init.h @@ -17,6 +17,7 @@ #define 
__ITENSOR_VECTOR_NO_INIT_H #include +#include namespace itensor { @@ -64,14 +65,16 @@ class uninitialized_allocator bool operator!=(uninitialized_allocator const&) { return false; } + //where is destroy? + }; template -//#ifdef PLATFORM_cuda - //using vector_no_init = thrust::vector>; -//#else +#ifdef PLATFORM_cuda + using vector_no_init = thrust::device_vector; +#else using vector_no_init = std::vector>; -//#endif +#endif } //namespace itensor From ba6080142beed3419f84c868fa809efdde529438 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Tue, 17 May 2022 16:35:27 +0200 Subject: [PATCH 39/68] safe pointer to thrust --- itensor/util/safe_ptr.h | 58 ++++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/itensor/util/safe_ptr.h b/itensor/util/safe_ptr.h index 12eb319a4..7d273d0bc 100644 --- a/itensor/util/safe_ptr.h +++ b/itensor/util/safe_ptr.h @@ -18,6 +18,7 @@ #include "itensor/util/print.h" #include "itensor/util/stdx.h" +#include namespace itensor { @@ -44,19 +45,19 @@ class SafePtr SafePtr(T* pt, size_t offset_end) : p_(pt), offset_(0), offset_end_(offset_end) - { + { //if(!p_) throw std::runtime_error("SafePtr: pointer is null"); } SafePtr(T* pt, size_t offset, size_t offset_end) : p_(pt), offset_(offset), offset_end_(offset_end) - { + { if(!p_) throw std::runtime_error("SafePtr: pointer is null"); } //Allow automatic conversion SafePtr -> SafePtr SafePtr(SafePtr const& P) - { + { operator=(P); } SafePtr& @@ -75,29 +76,29 @@ class SafePtr bool validOffset() const { return (offset_ < offset_end_); } size_t - range() const - { + range() const + { if(validOffset()) return (offset_end_-offset_); return 0ul; } pointer - get() const - { + get() const + { if(!p_) return nullptr; - return (p_+offset_); + return (p_+offset_); } pointer safeGet(size_t expected_range) { - if(!p_) + if(!p_) { throw std::runtime_error("SafePtr: dereferencing null pointer"); } if(!validOffset()) { - auto error_msg = + auto error_msg = format("SafePtr: offset >= offset_end (%d >= %d)", offset_,offset_end_); throw std::runtime_error(error_msg); @@ -105,7 +106,7 @@ class SafePtr auto actual_range = offsetEnd()-offset(); if(expected_range > actual_range) { - auto error_msg = + auto error_msg = format("SafePtr: expected_range > actual_range (%d > %d)", expected_range,actual_range); throw std::runtime_error(error_msg); @@ -118,7 +119,7 @@ class SafePtr SafePtr& operator+=(size_t shift) { - //if(!p_) + //if(!p_) // { // throw std::runtime_error("SafePtr: incrementing (+=) null pointer"); // } @@ -186,8 +187,8 @@ class SafePtr } bool - operator!=(SafePtr const& other) const - { + operator!=(SafePtr const& other) const + { if(p_ != other.p_) throw std::runtime_error("SafePtr: error, comparing two different starting pointers"); return (offset_ != other.offset_); @@ -198,20 +199,20 @@ class SafePtr pointer - getStart() const - { + getStart() const + { return p_; } private: pointer - safeFront() const + safeFront() const { if(!p_) throw std::runtime_error("SafePtr: dereferencing null pointer"); if(!validOffset()) { - auto error_msg = + auto error_msg = format("SafePtr: offset >= offset_end (%d >= %d)", offset_,offset_end_); throw std::runtime_error(error_msg); @@ -269,12 +270,21 @@ reinterpret(SafePtr const& p) #else -//bare pointer versions of macros -#define MAKE_SAFE_PTR(P,SZ) (P) -#define MAKE_SAFE_PTR_OFFSET(P,OFF,SZ) ((P)+(OFF)) -#define SAFE_PTR_GET(P,SZ) P -#define SAFE_REINTERPRET(NT,SP) reinterpret_cast(SP) -#define SAFE_PTR_OF(T) T* + #ifdef PLATFORM_cuda + 
//bare pointer versions of macros + #define MAKE_SAFE_PTR(P,SZ) (P) + #define MAKE_SAFE_PTR_OFFSET(P,OFF,SZ) ((P)+(OFF)) + #define SAFE_PTR_GET(P,SZ) P + #define SAFE_REINTERPRET(NT,SP) reinterpret_cast(SP) + #define SAFE_PTR_OF(T) thrust::device_ptr + #else + //bare pointer versions of macros + #define MAKE_SAFE_PTR(P,SZ) (P) + #define MAKE_SAFE_PTR_OFFSET(P,OFF,SZ) ((P)+(OFF)) + #define SAFE_PTR_GET(P,SZ) P + #define SAFE_REINTERPRET(NT,SP) reinterpret_cast(SP) + #define SAFE_PTR_OF(T) T* + #endif #endif From 7281ae33d06f1a6046b45095c0f7ce8d7a06ffda Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Tue, 17 May 2022 16:53:52 +0200 Subject: [PATCH 40/68] safe pointer reinterpret cast --- itensor/util/safe_ptr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/itensor/util/safe_ptr.h b/itensor/util/safe_ptr.h index 7d273d0bc..751cf4fed 100644 --- a/itensor/util/safe_ptr.h +++ b/itensor/util/safe_ptr.h @@ -275,7 +275,7 @@ reinterpret(SafePtr const& p) #define MAKE_SAFE_PTR(P,SZ) (P) #define MAKE_SAFE_PTR_OFFSET(P,OFF,SZ) ((P)+(OFF)) #define SAFE_PTR_GET(P,SZ) P - #define SAFE_REINTERPRET(NT,SP) reinterpret_cast(SP) + #define SAFE_REINTERPRET(NT,SP) reinterpret_cast>(SP) #define SAFE_PTR_OF(T) thrust::device_ptr #else //bare pointer versions of macros From 7b32260538484ea9502a2130bb5657e8e29f8c21 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Tue, 17 May 2022 16:54:58 +0200 Subject: [PATCH 41/68] fixed typo --- itensor/util/safe_ptr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/itensor/util/safe_ptr.h b/itensor/util/safe_ptr.h index 751cf4fed..4e832e2e6 100644 --- a/itensor/util/safe_ptr.h +++ b/itensor/util/safe_ptr.h @@ -275,7 +275,7 @@ reinterpret(SafePtr const& p) #define MAKE_SAFE_PTR(P,SZ) (P) #define MAKE_SAFE_PTR_OFFSET(P,OFF,SZ) ((P)+(OFF)) #define SAFE_PTR_GET(P,SZ) P - #define SAFE_REINTERPRET(NT,SP) reinterpret_cast>(SP) + #define SAFE_REINTERPRET(NT,SP) reinterpret_cast>(SP) #define SAFE_PTR_OF(T) thrust::device_ptr #else //bare pointer versions of macros From bb3a48214d2311c1337c8aef36e7475696eee821 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Tue, 17 May 2022 16:58:32 +0200 Subject: [PATCH 42/68] redefined complex if cuda --- itensor/types.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/itensor/types.h b/itensor/types.h index 2ed40e45e..834d874ef 100644 --- a/itensor/types.h +++ b/itensor/types.h @@ -28,18 +28,24 @@ namespace itensor { using Real = double; -using Cplx = std::complex; -using Complex = std::complex; +#ifdef PLATFORM_cuda + #include + using Cplx = cuDoubleComplex; + using Complex = cuDoubleComplex; +#else + using Cplx = std::complex; + using Complex = std::complex; +#endif const Cplx Complex_1 = Cplx(1,0); const Cplx Complex_i = Cplx(0,1); const Cplx Cplx_1 = Cplx(1,0); const Cplx Cplx_i = Cplx(0,1); -inline Real& +inline Real& realRef(Cplx & z) { return reinterpret_cast(&z)[0]; } -inline Real& +inline Real& imagRef(Cplx & z) { return reinterpret_cast(&z)[1]; } void inline @@ -78,11 +84,11 @@ formatVal(Cplx const& val) } template>> -constexpr const char* +constexpr const char* typeName(int=0) { return "Real"; } template>> -constexpr const char* +constexpr const char* typeName(long=0) { return "Cplx"; } } From 7d3823087967c81207ad9f11b75809e9de2c9ba2 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Tue, 17 May 2022 17:05:14 +0200 Subject: [PATCH 43/68] used thrust complex instead of cuda complex --- itensor/types.h | 6 +++--- 1 file changed, 3 insertions(+), 
3 deletions(-)

diff --git a/itensor/types.h b/itensor/types.h
index 834d874ef..56d545789 100644
--- a/itensor/types.h
+++ b/itensor/types.h
@@ -29,9 +29,9 @@ namespace itensor {
 using Real = double;
 #ifdef PLATFORM_cuda
- #include <cuComplex.h>
- using Cplx = cuDoubleComplex;
- using Complex = cuDoubleComplex;
+ #include <thrust/complex.h>
+ using Cplx = thrust::complex<double>;
+ using Complex = thrust::complex<double>;
 #else
 using Cplx = std::complex<double>;
 using Complex = std::complex<double>;

From 36dcbbf4d2371e2561d5cd592ac3d043d2674449 Mon Sep 17 00:00:00 2001
From: Mauro d'Arcangelo
Date: Tue, 17 May 2022 17:59:19 +0200
Subject: [PATCH 44/68] reverted to cuda complex

---
 itensor/types.h | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/itensor/types.h b/itensor/types.h
index 56d545789..1dd889785 100644
--- a/itensor/types.h
+++ b/itensor/types.h
@@ -29,18 +29,22 @@ namespace itensor {
 using Real = double;
 #ifdef PLATFORM_cuda
- #include <thrust/complex.h>
- using Cplx = thrust::complex<double>;
- using Complex = thrust::complex<double>;
+ #include <cuComplex.h>
+ using Cplx = cuDoubleComplex;
+ using Complex = cuDoubleComplex;
+ const Cplx Complex_1 = make_cuDoubleComplex(1,0);
+ const Cplx Complex_i = make_cuDoubleComplex(0,1);
+ const Cplx Cplx_1 = make_cuDoubleComplex(1,0);
+ const Cplx Cplx_i = make_cuDoubleComplex(0,1);
 #else
 using Cplx = std::complex<double>;
 using Complex = std::complex<double>;
+ const Cplx Complex_1 = Cplx(1,0);
+ const Cplx Complex_i = Cplx(0,1);
+ const Cplx Cplx_1 = Cplx(1,0);
+ const Cplx Cplx_i = Cplx(0,1);
 #endif
-const Cplx Complex_1 = Cplx(1,0);
-const Cplx Complex_i = Cplx(0,1);
-const Cplx Cplx_1 = Cplx(1,0);
-const Cplx Cplx_i = Cplx(0,1);
 inline Real&

From f77896e8bd561e8de147dc679ac37875141e8eb7 Mon Sep 17 00:00:00 2001
From: Mauro d'Arcangelo
Date: Tue, 17 May 2022 18:03:26 +0200
Subject: [PATCH 45/68] reverted to simple complex

---
 itensor/types.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/itensor/types.h b/itensor/types.h
index 1dd889785..53ab9d8a5 100644
--- a/itensor/types.h
+++ b/itensor/types.h
@@ -28,6 +28,7 @@ namespace itensor {
 using Real = double;
+/*
 #ifdef PLATFORM_cuda
 #include <cuComplex.h>
 using Cplx = cuDoubleComplex;
@@ -37,13 +38,14 @@ using Real = double;
 const Cplx Cplx_1 = make_cuDoubleComplex(1,0);
 const Cplx Cplx_i = make_cuDoubleComplex(0,1);
 #else
+*/
 using Cplx = std::complex<double>;
 using Complex = std::complex<double>;
 const Cplx Complex_1 = Cplx(1,0);
 const Cplx Complex_i = Cplx(0,1);
 const Cplx Cplx_1 = Cplx(1,0);
 const Cplx Cplx_i = Cplx(0,1);
-#endif
+//#endif
 inline Real&

From 6ef61858bd1a114f114c6e271819e6f35e4cb9af Mon Sep 17 00:00:00 2001
From: Mauro d'Arcangelo
Date: Wed, 18 May 2022 15:29:22 +0200
Subject: [PATCH 46/68] try cucomplex

---
 itensor/types.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/itensor/types.h b/itensor/types.h
index 53ab9d8a5..5cef40fd6 100644
--- a/itensor/types.h
+++ b/itensor/types.h
@@ -28,7 +28,7 @@ namespace itensor {
 using Real = double;
-/*
+/**/
 #ifdef PLATFORM_cuda
 #include <cuComplex.h>
 using Cplx = cuDoubleComplex;
@@ -38,14 +38,14 @@ using Real = double;
 const Cplx Cplx_1 = make_cuDoubleComplex(1,0);
 const Cplx Cplx_i = make_cuDoubleComplex(0,1);
 #else
-*/
+/**/
 using Cplx = std::complex<double>;
 using Complex = std::complex<double>;
 const Cplx Complex_1 = Cplx(1,0);
 const Cplx Complex_i = Cplx(0,1);
 const Cplx Cplx_1 = Cplx(1,0);
 const Cplx Cplx_i = Cplx(0,1);
 #endif
 inline Real&

From 83e89a46b6f176b55050d85062ff601990f88f25 Mon Sep 17 00:00:00 2001
From: Mauro d'Arcangelo
Date: Wed, 18 May 2022 15:33:57 +0200
Subject: [PATCH 47/68] use thrust complex

---
 itensor/types.h | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/itensor/types.h b/itensor/types.h
index 5cef40fd6..9a25fbd53 100644
--- a/itensor/types.h
+++ b/itensor/types.h
@@ -30,22 +30,19 @@ namespace itensor {
 using Real = double;
 /**/
 #ifdef PLATFORM_cuda
- #include <cuComplex.h>
- using Cplx = cuDoubleComplex;
- using Complex = cuDoubleComplex;
- const Cplx Complex_1 = make_cuDoubleComplex(1,0);
- const Cplx Complex_i = make_cuDoubleComplex(0,1);
- const Cplx Cplx_1 = make_cuDoubleComplex(1,0);
- const Cplx Cplx_i = make_cuDoubleComplex(0,1);
+ //#include <cuComplex.h>
+ #include <thrust/complex.h>
+ using Cplx = thrust::complex<double>;
+ using Complex = thrust::complex<double>;
 #else
 /**/
 using Cplx = std::complex<double>;
 using Complex = std::complex<double>;
- const Cplx Complex_1 = Cplx(1,0);
- const Cplx Complex_i = Cplx(0,1);
- const Cplx Cplx_1 = Cplx(1,0);
- const Cplx Cplx_i = Cplx(0,1);
 #endif
+const Cplx Complex_1 = Cplx(1,0);
+const Cplx Complex_i = Cplx(0,1);
+const Cplx Cplx_1 = Cplx(1,0);
+const Cplx Cplx_i = Cplx(0,1);
 inline Real&

From 1274526e29275319c69076b22aea5d617c160761 Mon Sep 17 00:00:00 2001
From: Mauro d'Arcangelo
Date: Wed, 18 May 2022 16:02:43 +0200
Subject: [PATCH 48/68] trying to fix thrust complex

---
 itensor/types.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/itensor/types.h b/itensor/types.h
index 9a25fbd53..80bbf445a 100644
--- a/itensor/types.h
+++ b/itensor/types.h
@@ -32,8 +32,10 @@ using Real = double;
 #ifdef PLATFORM_cuda
 //#include <cuComplex.h>
 #include <thrust/complex.h>
- using Cplx = thrust::complex<double>;
- using Complex = thrust::complex<double>;
+ using Cplx2 = thrust::complex<double>;
+ using Complex2 = thrust::complex<double>;
+ using Cplx = std::complex<double>;
+ using Complex = std::complex<double>;
 #else
 /**/
 using Cplx = std::complex<double>;

From cf7a55605aa706e26d3348280fa6f65bf4db8943 Mon Sep 17 00:00:00 2001
From: Mauro d'Arcangelo
Date: Wed, 18 May 2022 16:29:57 +0200
Subject: [PATCH 49/68] fixed typo in reinterpret cast

---
 itensor/types.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/itensor/types.h b/itensor/types.h
index 80bbf445a..655509088 100644
--- a/itensor/types.h
+++ b/itensor/types.h
@@ -31,9 +31,9 @@ using Real = double;
 /**/
 #ifdef PLATFORM_cuda
 //#include <cuComplex.h>
- #include <thrust/complex.h>
- using Cplx2 = thrust::complex<double>;
- using Complex2 = thrust::complex<double>;
+ //#include <thrust/complex.h>
+ //using Cplx2 = thrust::complex<double>;
+ //using Complex2 = thrust::complex<double>;
 using Cplx = std::complex<double>;
 using Complex = std::complex<double>;
 #else

From f15ac2979fd51206fe7b016b8d8ae56767f24a04 Mon Sep 17 00:00:00 2001
From: Mauro d'Arcangelo
Date: Wed, 18 May 2022 16:51:35 +0200
Subject: [PATCH 50/68] revert back

---
 itensor/types.h | 10 +++++-----
 itensor/util/safe_ptr.h | 4 +++-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/itensor/types.h b/itensor/types.h
index 655509088..15cf97f2d 100644
--- a/itensor/types.h
+++ b/itensor/types.h
@@ -29,18 +29,18 @@ namespace itensor {
 using Real = double;
 /**/
-#ifdef PLATFORM_cuda
+//#ifdef PLATFORM_cuda
 //#include <cuComplex.h>
 //#include <thrust/complex.h>
 //using Cplx2 = thrust::complex<double>;
 //using Complex2 = thrust::complex<double>;
-using Cplx = std::complex<double>;
-using Complex = std::complex<double>;
+//using Cplx = std::complex<double>;
+//using Complex = std::complex<double>;
-#else
+//#else
 /**/
 using Cplx = std::complex<double>;
 using Complex = std::complex<double>;
-#endif
+//#endif
 const Cplx Complex_1 = Cplx(1,0);
 const Cplx Complex_i = Cplx(0,1);
 const Cplx Cplx_1 = Cplx(1,0);
 const Cplx Cplx_i = Cplx(0,1);

diff --git a/itensor/util/safe_ptr.h b/itensor/util/safe_ptr.h
index 4e832e2e6..e386552fd 100644
---
a/itensor/util/safe_ptr.h +++ b/itensor/util/safe_ptr.h @@ -270,6 +270,7 @@ reinterpret(SafePtr const& p) #else +/* #ifdef PLATFORM_cuda //bare pointer versions of macros #define MAKE_SAFE_PTR(P,SZ) (P) @@ -278,13 +279,14 @@ reinterpret(SafePtr const& p) #define SAFE_REINTERPRET(NT,SP) reinterpret_cast>(SP) #define SAFE_PTR_OF(T) thrust::device_ptr #else + */ //bare pointer versions of macros #define MAKE_SAFE_PTR(P,SZ) (P) #define MAKE_SAFE_PTR_OFFSET(P,OFF,SZ) ((P)+(OFF)) #define SAFE_PTR_GET(P,SZ) P #define SAFE_REINTERPRET(NT,SP) reinterpret_cast(SP) #define SAFE_PTR_OF(T) T* - #endif + //#endif #endif From 04db28b218577ff4e775a610ee411969ea3ed7b8 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Wed, 18 May 2022 17:05:38 +0200 Subject: [PATCH 51/68] changed safe pointer behaviour --- itensor/util/safe_ptr.h | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/itensor/util/safe_ptr.h b/itensor/util/safe_ptr.h index e386552fd..3427dd50e 100644 --- a/itensor/util/safe_ptr.h +++ b/itensor/util/safe_ptr.h @@ -270,23 +270,18 @@ reinterpret(SafePtr const& p) #else -/* + #define MAKE_SAFE_PTR(P,SZ) (P) + #define MAKE_SAFE_PTR_OFFSET(P,OFF,SZ) ((P)+(OFF)) + #define SAFE_PTR_GET(P,SZ) P + #define SAFE_REINTERPRET(NT,SP) reinterpret_cast(SP) + #ifdef PLATFORM_cuda //bare pointer versions of macros - #define MAKE_SAFE_PTR(P,SZ) (P) - #define MAKE_SAFE_PTR_OFFSET(P,OFF,SZ) ((P)+(OFF)) - #define SAFE_PTR_GET(P,SZ) P - #define SAFE_REINTERPRET(NT,SP) reinterpret_cast>(SP) #define SAFE_PTR_OF(T) thrust::device_ptr #else - */ //bare pointer versions of macros - #define MAKE_SAFE_PTR(P,SZ) (P) - #define MAKE_SAFE_PTR_OFFSET(P,OFF,SZ) ((P)+(OFF)) - #define SAFE_PTR_GET(P,SZ) P - #define SAFE_REINTERPRET(NT,SP) reinterpret_cast(SP) #define SAFE_PTR_OF(T) T* - //#endif + #endif #endif From 14d7b64f5317fbe8edf24532bacb80ce7e5e23a4 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Wed, 18 May 2022 18:25:58 +0200 Subject: [PATCH 52/68] bypass gemm emulator --- itensor/tensor/gemm.cc | 33 ++++++++++++++++++++++----------- itensor/tensor/lapack_wrap.cc | 13 +------------ 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/itensor/tensor/gemm.cc b/itensor/tensor/gemm.cc index 5a76b8a6e..4b38bbd50 100644 --- a/itensor/tensor/gemm.cc +++ b/itensor/tensor/gemm.cc @@ -35,7 +35,7 @@ struct dgemmTask Real a, Real b) : Apart(Ap),Bpart(Bp),Cpart(Cp),alpha(a),beta(b) - { + { if(b != 0.0) copyFromC = true; } @@ -169,7 +169,18 @@ gemm_impl(MatRefc A, Real alpha, Real beta) { -#ifdef ITENSOR_USE_ZGEMM +#ifdef ITENSOR_USE_CUDA + gemm_wrapper(isTransposed(A), + isTransposed(B), + nrows(A), + ncols(B), + ncols(A), + alpha, + A.data(), + B.data(), + beta, + C.data()); +#elif defined ITENSOR_USE_ZGEMM gemm_wrapper(isTransposed(A), isTransposed(B), nrows(A), @@ -181,8 +192,8 @@ gemm_impl(MatRefc A, beta, C.data()); #else //emulate zgemm by calling dgemm four times - std::array - tasks = + std::array + tasks = {{dgemmTask(0,0,0,+alpha,beta), dgemmTask(1,1,0,-alpha), dgemmTask(0), @@ -202,8 +213,8 @@ gemm_impl(MatRefc A, Real alpha, Real beta) { - std::array - tasks = + std::array + tasks = {{dgemmTask(0,0,0,+alpha,beta), dgemmTask(0), dgemmTask(0,1,1,+alpha,beta), @@ -219,8 +230,8 @@ gemm_impl(MatRefc A, Real alpha, Real beta) { - std::array - tasks = + std::array + tasks = {{dgemmTask(0,0,0,+alpha,beta), dgemmTask(0), dgemmTask(1,0,1,+alpha,beta), @@ -252,14 +263,14 @@ gemm_impl(MatRefc A, // C = alpha*A*B + beta*C template void -gemm(MatRefc A, - MatRefc B, +gemm(MatRefc A, 
+ MatRefc B, MatRef> C, Real alpha, Real beta) { #ifdef DEBUG - if(!(isContiguous(A) && isContiguous(B) && isContiguous(C))) + if(!(isContiguous(A) && isContiguous(B) && isContiguous(C))) throw std::runtime_error("multiplication of non-contiguous MatrixRefs not currently supported"); #endif diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index aeee8d749..8a46ce2d0 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -299,18 +299,7 @@ gemm_wrapper(bool transa, } cublasHandle_t handle; cublasCreate(&handle); - LAPACK_COMPLEX *d_A, *d_B, *d_C; - cudaMalloc(&d_A, m * k * sizeof(LAPACK_COMPLEX)); - cudaMalloc(&d_B, k * n * sizeof(LAPACK_COMPLEX)); - cudaMalloc(&d_C, m * n * sizeof(LAPACK_COMPLEX)); - cudaMemcpy(d_A, A, m * k * sizeof(LAPACK_COMPLEX), cudaMemcpyHostToDevice); - cudaMemcpy(d_B, B, k * n * sizeof(LAPACK_COMPLEX), cudaMemcpyHostToDevice); - cudaMemcpy(d_C, C, m * n * sizeof(LAPACK_COMPLEX), cudaMemcpyHostToDevice); - cublasZgemm(handle, at, bt, m, n, k, (LAPACK_COMPLEX*) &alpha, d_A, lda, d_B, ldb, (LAPACK_COMPLEX*) &beta, d_C, m); - cudaMemcpy(C, d_C, m * n * sizeof(LAPACK_COMPLEX), cudaMemcpyDeviceToHost); - cudaFree(d_A); - cudaFree(d_B); - cudaFree(d_C); + cublasZgemm(handle, at, bt, m, n, k, (LAPACK_COMPLEX*) &alpha, A, lda, B, ldb, (LAPACK_COMPLEX*) &beta, C, m); cublasDestroy(handle); #else //platform not openblas #ifdef ITENSOR_USE_CBLAS From fc754b970a711518ee16845afc0a9805a2962b7e Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Wed, 18 May 2022 18:38:16 +0200 Subject: [PATCH 53/68] overrided zgemm for cuda --- itensor/tensor/lapack_wrap.cc | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 8a46ce2d0..2f50fc843 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -243,6 +243,37 @@ gemm_wrapper(bool transa, // // zgemm // +gemm_wrapper(bool transa, + bool transb, + LAPACK_INT m, + LAPACK_INT n, + LAPACK_INT k, + Cplx alpha, + const thrust::device_ptr A, + const thrust::device_ptr B, + Cplx beta, + thrust::device_ptr C) + { + LAPACK_INT lda = m, + ldb = k; + cublasOperation_t at = CUBLAS_OP_N; + cublasOperation_t bt = CUBLAS_OP_N; + if(transa) + { + at = CUBLAS_OP_T; + lda = k; + } + if(transb) + { + bt = CUBLAS_OP_T; + ldb = n; + } + cublasHandle_t handle; + cublasCreate(&handle); + cublasZgemm(handle, at, bt, m, n, k, (LAPACK_COMPLEX*) &alpha, A, lda, B, ldb, (LAPACK_COMPLEX*) &beta, C, m); + cublasDestroy(handle); +} + void gemm_wrapper(bool transa, bool transb, From 24020abbc646417f032d9d9bb753f68373432efe Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Wed, 18 May 2022 18:40:00 +0200 Subject: [PATCH 54/68] forgot to include thrust --- itensor/tensor/lapack_wrap.h | 1 + 1 file changed, 1 insertion(+) diff --git a/itensor/tensor/lapack_wrap.h b/itensor/tensor/lapack_wrap.h index ddab8c504..82fff9f40 100644 --- a/itensor/tensor/lapack_wrap.h +++ b/itensor/tensor/lapack_wrap.h @@ -151,6 +151,7 @@ imagRef(LAPACK_COMPLEX & z) #include #include #include +#include namespace itensor { //cudaDataType_t typeComplexData = CUDA_C_64F; From 986557b1d41b5ecb9323383c004f504547e8e401 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Wed, 18 May 2022 18:43:08 +0200 Subject: [PATCH 55/68] changed complex to lapack_complex --- itensor/tensor/lapack_wrap.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc 
index 2f50fc843..1d92c5d95 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -249,10 +249,10 @@ gemm_wrapper(bool transa, LAPACK_INT n, LAPACK_INT k, Cplx alpha, - const thrust::device_ptr A, - const thrust::device_ptr B, + const thrust::device_ptr A, + const thrust::device_ptr B, Cplx beta, - thrust::device_ptr C) + thrust::device_ptr C) { LAPACK_INT lda = m, ldb = k; From e10825c5284af6687fd16dac8fa53b6ce25ce541 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Wed, 18 May 2022 18:51:31 +0200 Subject: [PATCH 56/68] use raw pointers --- itensor/tensor/lapack_wrap.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 1d92c5d95..8155f6c58 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -270,7 +270,7 @@ gemm_wrapper(bool transa, } cublasHandle_t handle; cublasCreate(&handle); - cublasZgemm(handle, at, bt, m, n, k, (LAPACK_COMPLEX*) &alpha, A, lda, B, ldb, (LAPACK_COMPLEX*) &beta, C, m); + cublasZgemm(handle, at, bt, m, n, k, (LAPACK_COMPLEX*) &alpha, A.get(), lda, B.get(), ldb, (LAPACK_COMPLEX*) &beta, C.get(), m); cublasDestroy(handle); } From 09153526ed99c0579c9d9f7dbfe6fc4a7a68cded Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Wed, 18 May 2022 19:00:03 +0200 Subject: [PATCH 57/68] extarct raw pointer from thrust::device_ptr --- itensor/tensor/gemm.cc | 7 ++++--- itensor/tensor/lapack_wrap.cc | 8 ++++---- itensor/tensor/lapack_wrap.h | 12 ++++++++++++ 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/itensor/tensor/gemm.cc b/itensor/tensor/gemm.cc index 4b38bbd50..8a45bbb84 100644 --- a/itensor/tensor/gemm.cc +++ b/itensor/tensor/gemm.cc @@ -176,10 +176,11 @@ gemm_impl(MatRefc A, ncols(B), ncols(A), alpha, - A.data(), - B.data(), + (A.data()).get(), // .get() returns the raw pointer from thrust::device_ptr + (B.data()).get(), beta, - C.data()); + (C.data()).get()); + #elif defined ITENSOR_USE_ZGEMM gemm_wrapper(isTransposed(A), isTransposed(B), diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 8155f6c58..2faae5d76 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -249,10 +249,10 @@ gemm_wrapper(bool transa, LAPACK_INT n, LAPACK_INT k, Cplx alpha, - const thrust::device_ptr A, - const thrust::device_ptr B, + const LAPACK_COMPLEX* A, + const LAPACK_COMPLEX* B, Cplx beta, - thrust::device_ptr C) + LAPACK_COMPLEX* C) { LAPACK_INT lda = m, ldb = k; @@ -270,7 +270,7 @@ gemm_wrapper(bool transa, } cublasHandle_t handle; cublasCreate(&handle); - cublasZgemm(handle, at, bt, m, n, k, (LAPACK_COMPLEX*) &alpha, A.get(), lda, B.get(), ldb, (LAPACK_COMPLEX*) &beta, C.get(), m); + cublasZgemm(handle, at, bt, m, n, k, (LAPACK_COMPLEX*) &alpha, A, lda, B, ldb, (LAPACK_COMPLEX*) &beta, C, m); cublasDestroy(handle); } diff --git a/itensor/tensor/lapack_wrap.h b/itensor/tensor/lapack_wrap.h index 82fff9f40..8ddb3d6c5 100644 --- a/itensor/tensor/lapack_wrap.h +++ b/itensor/tensor/lapack_wrap.h @@ -524,6 +524,18 @@ gemm_wrapper(bool transa, // // zgemm // +void +gemm_wrapper(bool transa, + bool transb, + LAPACK_INT m, + LAPACK_INT n, + LAPACK_INT k, + Cplx alpha, + LAPACK_COMPLEX const* A, + LAPACK_COMPLEX const* B, + Cplx beta, + LAPACK_COMPLEX * C); + void gemm_wrapper(bool transa, bool transb, From 4fb774cf2e59f38e7aaf92b1b2706e51ae2b7512 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Wed, 18 May 2022 19:02:17 +0200 Subject: [PATCH 58/68] deleted cuda stuff from 
normal zgemm --- itensor/tensor/lapack_wrap.cc | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 2faae5d76..d111823e9 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -315,23 +315,6 @@ gemm_wrapper(bool transa, auto* pB = reinterpret_cast(B); auto* pC = reinterpret_cast(C); cblas_zgemm(CblasColMajor,at,bt,m,n,k,palpha,pA,lda,pB,ldb,pbeta,pC,m); -#elif defined ITENSOR_USE_CUDA - cublasOperation_t at = CUBLAS_OP_N; - cublasOperation_t bt = CUBLAS_OP_N; - if(transa) - { - at = CUBLAS_OP_T; - lda = k; - } - if(transb) - { - bt = CUBLAS_OP_T; - ldb = n; - } - cublasHandle_t handle; - cublasCreate(&handle); - cublasZgemm(handle, at, bt, m, n, k, (LAPACK_COMPLEX*) &alpha, A, lda, B, ldb, (LAPACK_COMPLEX*) &beta, C, m); - cublasDestroy(handle); #else //platform not openblas #ifdef ITENSOR_USE_CBLAS auto at = CblasNoTrans, From a7130afee3a9dbc8e8d9e87183f31f7dc0f299f4 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Wed, 18 May 2022 19:04:38 +0200 Subject: [PATCH 59/68] fixed return type of zgemm --- itensor/tensor/lapack_wrap.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index d111823e9..9dbfc4e84 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -243,6 +243,7 @@ gemm_wrapper(bool transa, // // zgemm // +void gemm_wrapper(bool transa, bool transb, LAPACK_INT m, From f3349d91504f9deb874c691fb9c8962e7d3ed7d6 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 20 May 2022 15:46:50 +0200 Subject: [PATCH 60/68] not using get() --- itensor/tensor/gemm.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/itensor/tensor/gemm.cc b/itensor/tensor/gemm.cc index 8a45bbb84..cc01d12d1 100644 --- a/itensor/tensor/gemm.cc +++ b/itensor/tensor/gemm.cc @@ -176,8 +176,8 @@ gemm_impl(MatRefc A, ncols(B), ncols(A), alpha, - (A.data()).get(), // .get() returns the raw pointer from thrust::device_ptr - (B.data()).get(), + A.data(), + B.data(), beta, (C.data()).get()); From d547310a4129605efbe9048f9ada264ef3a05ec9 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 20 May 2022 15:48:27 +0200 Subject: [PATCH 61/68] not using get() --- itensor/tensor/gemm.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/itensor/tensor/gemm.cc b/itensor/tensor/gemm.cc index cc01d12d1..f7d2d9af9 100644 --- a/itensor/tensor/gemm.cc +++ b/itensor/tensor/gemm.cc @@ -176,10 +176,10 @@ gemm_impl(MatRefc A, ncols(B), ncols(A), alpha, - A.data(), + A.data(), B.data(), beta, - (C.data()).get()); + C.data()); #elif defined ITENSOR_USE_ZGEMM gemm_wrapper(isTransposed(A), From 5be2e74f2593d5d5939abcfee061ab61f76cf417 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 20 May 2022 15:55:06 +0200 Subject: [PATCH 62/68] dont compile gemm_impl --- itensor/tensor/gemm.cc | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/itensor/tensor/gemm.cc b/itensor/tensor/gemm.cc index f7d2d9af9..620f71cd8 100644 --- a/itensor/tensor/gemm.cc +++ b/itensor/tensor/gemm.cc @@ -50,6 +50,19 @@ struct dgemmTask : copyToC(true),Cpart(Cp) { } }; +#ifdef ITENSOR_USE_CUDA + gemm_wrapper(isTransposed(A), + isTransposed(B), + nrows(A), + ncols(B), + ncols(A), + alpha, + A.data(), + B.data(), + beta, + C.data()); +#else + void cplxToRealBuf(SAFE_PTR_OF(const Real) C, @@ -169,19 +182,7 @@ gemm_impl(MatRefc A, Real alpha, Real beta) { -#ifdef 
ITENSOR_USE_CUDA - gemm_wrapper(isTransposed(A), - isTransposed(B), - nrows(A), - ncols(B), - ncols(A), - alpha, - A.data(), - B.data(), - beta, - C.data()); - -#elif defined ITENSOR_USE_ZGEMM +#if defined ITENSOR_USE_ZGEMM gemm_wrapper(isTransposed(A), isTransposed(B), nrows(A), @@ -203,7 +204,7 @@ gemm_impl(MatRefc A, dgemmTask(1) }}; gemm_emulator(A,B,C,alpha,beta,tasks); -#endif +#endif //ITENSOR_USE_ZGEMM } @@ -303,5 +304,7 @@ template void gemm(MatRefc, MatRefc, MatRef,Real,Real); template void gemm(MatRefc, MatRefc, MatRef,Real,Real); template void gemm(MatRefc, MatRefc, MatRef,Real,Real); +#endif //ITENSOR_USE_CUDA + } //namespace itensor From 69ab83b0c6228f5376d9cab5f69f5620ee1f3b9d Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 20 May 2022 15:58:59 +0200 Subject: [PATCH 63/68] gemm_impl --- itensor/tensor/gemm.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/itensor/tensor/gemm.cc b/itensor/tensor/gemm.cc index 620f71cd8..6f2a24644 100644 --- a/itensor/tensor/gemm.cc +++ b/itensor/tensor/gemm.cc @@ -50,7 +50,15 @@ struct dgemmTask : copyToC(true),Cpart(Cp) { } }; + #ifdef ITENSOR_USE_CUDA +void +gemm_impl(MatRefc A, + MatRefc B, + MatRef C, + Real alpha, + Real beta) + { gemm_wrapper(isTransposed(A), isTransposed(B), nrows(A), @@ -61,6 +69,7 @@ struct dgemmTask B.data(), beta, C.data()); + } #else From 304e7c76e094a06dbe830a3e554edd1abb9d961e Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 20 May 2022 16:04:43 +0200 Subject: [PATCH 64/68] implemented gemm --- itensor/tensor/gemm.cc | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/itensor/tensor/gemm.cc b/itensor/tensor/gemm.cc index 6f2a24644..c987a180b 100644 --- a/itensor/tensor/gemm.cc +++ b/itensor/tensor/gemm.cc @@ -70,6 +70,32 @@ gemm_impl(MatRefc A, beta, C.data()); } + +// C = alpha*A*B + beta*C +template +void +gemm(MatRefc A, + MatRefc B, + MatRef> C, + Real alpha, + Real beta) + { + if(isTransposed(C)) + { + //Do C = Bt*At instead of Ct=A*B + //Recall that C.data() points to elements of C, not C.t() + //regardless of whether C.transpose()==true or false + gemm_impl(transpose(B),transpose(A),transpose(C),alpha,beta); + } + else + { + gemm_impl(A,B,C,alpha,beta); + } + } +template void gemm(MatRefc, MatRefc, MatRef,Real,Real); +template void gemm(MatRefc, MatRefc, MatRef,Real,Real); +template void gemm(MatRefc, MatRefc, MatRef,Real,Real); +template void gemm(MatRefc, MatRefc, MatRef,Real,Real); #else From 0bb4e7e16d365963c00e93633598b661a45cc412 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 20 May 2022 16:08:31 +0200 Subject: [PATCH 65/68] restrict gemm to complex --- itensor/tensor/gemm.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/itensor/tensor/gemm.cc b/itensor/tensor/gemm.cc index c987a180b..fc80e5f8a 100644 --- a/itensor/tensor/gemm.cc +++ b/itensor/tensor/gemm.cc @@ -92,9 +92,9 @@ gemm(MatRefc A, gemm_impl(A,B,C,alpha,beta); } } -template void gemm(MatRefc, MatRefc, MatRef,Real,Real); -template void gemm(MatRefc, MatRefc, MatRef,Real,Real); -template void gemm(MatRefc, MatRefc, MatRef,Real,Real); +//template void gemm(MatRefc, MatRefc, MatRef,Real,Real); +//template void gemm(MatRefc, MatRefc, MatRef,Real,Real); +//template void gemm(MatRefc, MatRefc, MatRef,Real,Real); template void gemm(MatRefc, MatRefc, MatRef,Real,Real); #else From c53944becce8f135b1efee86c2a1bf71f716d938 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 20 May 2022 16:10:25 +0200 Subject: [PATCH 66/68] restrict gemm to 
complex or double --- itensor/tensor/gemm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/itensor/tensor/gemm.cc b/itensor/tensor/gemm.cc index fc80e5f8a..495f4c28d 100644 --- a/itensor/tensor/gemm.cc +++ b/itensor/tensor/gemm.cc @@ -92,7 +92,7 @@ gemm(MatRefc A, gemm_impl(A,B,C,alpha,beta); } } -//template void gemm(MatRefc, MatRefc, MatRef,Real,Real); +template void gemm(MatRefc, MatRefc, MatRef,Real,Real); //template void gemm(MatRefc, MatRefc, MatRef,Real,Real); //template void gemm(MatRefc, MatRefc, MatRef,Real,Real); template void gemm(MatRefc, MatRefc, MatRef,Real,Real); From e1128a4492343af9ba30758422301041b8a6b469 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 20 May 2022 16:14:20 +0200 Subject: [PATCH 67/68] double requires some more overloading so we just stick to gemm complex --- itensor/tensor/gemm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/itensor/tensor/gemm.cc b/itensor/tensor/gemm.cc index 495f4c28d..fc80e5f8a 100644 --- a/itensor/tensor/gemm.cc +++ b/itensor/tensor/gemm.cc @@ -92,7 +92,7 @@ gemm(MatRefc A, gemm_impl(A,B,C,alpha,beta); } } -template void gemm(MatRefc, MatRefc, MatRef,Real,Real); +//template void gemm(MatRefc, MatRefc, MatRef,Real,Real); //template void gemm(MatRefc, MatRefc, MatRef,Real,Real); //template void gemm(MatRefc, MatRefc, MatRef,Real,Real); template void gemm(MatRefc, MatRefc, MatRef,Real,Real); From 503df3700b97d7e0453564e25438d4de6d5453e7 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 20 May 2022 16:17:47 +0200 Subject: [PATCH 68/68] restrict contract to complex --- itensor/tensor/contract.cc | 260 +++++++++++++++++++------------------ 1 file changed, 132 insertions(+), 128 deletions(-) diff --git a/itensor/tensor/contract.cc b/itensor/tensor/contract.cc index d5806c0bb..d09c1a38b 100644 --- a/itensor/tensor/contract.cc +++ b/itensor/tensor/contract.cc @@ -46,7 +46,7 @@ printv(const vector& t, const F& f) { print("{ "); - for(const auto& i : t) + for(const auto& i : t) { f(i); print(" "); @@ -76,8 +76,8 @@ printv(const autovector& t) #define PRIL(a,l) print(#a,": "); printv(a,l); //template -//long -//find_index(vector const& v, +//long +//find_index(vector const& v, // T const& t) // { // using size_type = typename vector::size_type; @@ -87,8 +87,8 @@ printv(const autovector& t) // } // //template -//long -//find_index(const VarArray& v, +//long +//find_index(const VarArray& v, // const T& t) // { // using size_type = typename VarArray::size_type; @@ -98,8 +98,8 @@ printv(const autovector& t) // } // //template -//long -//find_index(const InfArray& v, +//long +//find_index(const InfArray& v, // const T& t) // { // using size_type = typename InfArray::size_type; @@ -120,7 +120,7 @@ class small_map std::array,N> d; int nd{0}; - B& + B& operator[](const A& a) { for(int i = 0; i < nd; ++i) @@ -136,15 +136,15 @@ class small_map // struct analyzing index pattern for C = A * B struct CProps { - Labels ai, - bi, + Labels ai, + bi, ci; - int nactiveA = 0, - nactiveB = 0, + int nactiveA = 0, + nactiveB = 0, nactiveC = 0; private: - Labels AtoB_, - AtoC_, + Labels AtoB_, + AtoC_, BtoC_; bool permuteA_ = false, permuteB_ = false, @@ -166,11 +166,11 @@ struct CProps Range newArange, newBrange, newCrange; - - CProps(Labels const& ai_, - Labels const& bi_, - Labels const& ci_) - : + + CProps(Labels const& ai_, + Labels const& bi_, + Labels const& ci_) + : ai(ai_),bi(bi_),ci(ci_), Acstart(ai_.size()), Bcstart(bi_.size()), @@ -212,7 +212,7 @@ struct CProps if(size_t(Austart) >= ai.size()) 
return true; auto aCind = AtoC(Austart); using size_type = decltype(ai.size()); - for(size_type i = 0; i < ai.size(); ++i) + for(size_type i = 0; i < ai.size(); ++i) if(!contractedA(i)) { if(AtoC(i) != aCind) return false; @@ -227,7 +227,7 @@ struct CProps if(size_t(Bustart) >= bi.size()) return true; auto bCind = BtoC(Bustart); using size_type = decltype(bi.size()); - for(size_type i = 0; i < bi.size(); ++i) + for(size_type i = 0; i < bi.size(); ++i) if(!contractedB(i)) { if(BtoC(i) != bCind) return false; @@ -256,7 +256,7 @@ struct CProps // computePerms(); - + //Use PC.size() as a check to see if we've already run this if(PC.size() != 0) return; @@ -282,19 +282,19 @@ struct CProps dmid *= A.extent(i); } for(int j = 0; j < rb; ++j) - if(!contractedB(j)) + if(!contractedB(j)) { dright *= B.extent(j); PC.setFromTo(c,BtoC(j)); ++c; } - if(!isTrivial(PC)) + if(!isTrivial(PC)) { permuteC_ = true; if(checkBCsameord() && checkACsameord()) { - //Can avoid permuting C by + //Can avoid permuting C by //computing Bt*At = Ct ctrans = true; permuteC_ = false; @@ -305,15 +305,15 @@ struct CProps permuteA_ = false; if(!(contractedA(0) || contractedA(ra-1))) { - //If contracted indices are not all at front or back, - //will have to permute A + //If contracted indices are not all at front or back, + //will have to permute A permuteA_ = true; } else { //Contracted ind start at front or back, check if contiguous for(int i = 0; i < ncont; ++i) - if(!contractedA(Acstart+i)) + if(!contractedA(Acstart+i)) { //Contracted indices not contiguous, must permute permuteA_ = true; @@ -325,7 +325,7 @@ struct CProps permuteB_ = false; if(!(contractedB(0) || contractedB(rb-1))) { - //If contracted indices are not all at front or back, + //If contracted indices are not all at front or back, //will have to permute B permuteB_ = true; } @@ -344,9 +344,9 @@ struct CProps { //Check if contracted inds. 
in same order for(int i = 0; i < ncont; ++i) - if(AtoB(Acstart+i) != (Bcstart+i)) + if(AtoB(Acstart+i) != (Bcstart+i)) { - //If not in same order, + //If not in same order, //must permute one of A or B //so permute the smaller one if(dleft < dright) permuteA_ = true; @@ -487,7 +487,7 @@ struct CProps ++c; } for(int j = 0; j < rb; ++j) - if(!contractedB(j)) + if(!contractedB(j)) { PC.setFromTo(c,BtoC_[j]); ++c; @@ -502,9 +502,9 @@ struct CProps permuteC_ = true; //Here we already know since pc_triv = false that //at best indices from B precede those from A (on result C) - //so if both sets remain in same order on C + //so if both sets remain in same order on C //just need to transpose C, not permute it - if(checkBCsameord() && checkACsameord()) + if(checkBCsameord() && checkACsameord()) { ctrans = true; permuteC_ = false; @@ -534,27 +534,27 @@ struct CProps if(!permuteB_) { for(decltype(rb) j = 0; j < rb; ++j) - if(!contractedB(j)) + if(!contractedB(j)) Rb.nextIndex(B.extent(j)); } else { for(decltype(rb) j = 0; j < rb; ++j) - if(!contractedB(j)) + if(!contractedB(j)) Rb.nextIndex(newBrange.extent(j)); } newCrange = Rb.build(); } } - void + void computePerms() { //Use !AtoB.empty() as a check to see if we've already run this if(!AtoB_.empty()) return; - int na = ai.size(), - nb = bi.size(), + int na = ai.size(), + nb = bi.size(), nc = ci.size(); AtoB_ = Labels(na,-1); @@ -563,7 +563,7 @@ struct CProps for(int i = 0; i < na; ++i) { for(int j = 0; j < nb; ++j) - if(ai[i] == bi[j]) + if(ai[i] == bi[j]) { ++ncont; if(i < Acstart) Acstart = i; @@ -576,7 +576,7 @@ struct CProps for(int i = 0; i < na; ++i) { for(int k = 0; k < nc; ++k) - if(ai[i] == ci[k]) + if(ai[i] == ci[k]) { if(i < Austart) Austart = i; AtoC_[i] = k; @@ -587,7 +587,7 @@ struct CProps for(int j = 0; j < nb; ++j) { for(int k = 0; k < nc; ++k) - if(bi[j] == ci[k]) + if(bi[j] == ci[k]) { if(j < Bustart) Bustart = j; BtoC_[j] = k; @@ -603,7 +603,7 @@ struct CProps computeNactive() { // Out of A, B and C (C = A*B), each index appears in two tensors. - // An active index appears as one of the first two indices in each of the two + // An active index appears as one of the first two indices in each of the two // tensor in which it appears. More specifically: // the first index of a tensor is active if its pair is also a first index, or if its // pair is a second index and that tensor's first index is active. @@ -620,9 +620,9 @@ struct CProps small_map indval; for(int i = 0; i <= 1; ++i) { - ++indval[ai[i]]; - ++indval[bi[i]]; - ++indval[ci[i]]; + ++indval[ai[i]]; + ++indval[bi[i]]; + ++indval[ci[i]]; } for(int elim = 1; elim <= 3; ++elim) // bad guys at position 0 kill the index at 1 @@ -640,20 +640,20 @@ struct CProps struct ABoffC { - MatrixRefc mA, + MatrixRefc mA, mB; MatrixRef mC; int offC; - ABoffC(MatrixRefc& mA_, - MatrixRefc& mB_, - MatrixRef& mC_, + ABoffC(MatrixRefc& mA_, + MatrixRefc& mB_, + MatrixRef& mC_, int offC_) - : - mA(mA_), - mB(mB_), - mC(mC_), - offC(offC_) + : + mA(mA_), + mB(mB_), + mC(mC_), + offC(offC_) { } void @@ -667,16 +667,16 @@ class CABqueue CABqueue() { } - void - addtask(MatrixRefc& mA, - MatrixRefc& mB, - MatrixRef& mC, + void + addtask(MatrixRefc& mA, + MatrixRefc& mB, + MatrixRef& mC, int offC) { subtask[offC].emplace_back(mA,mB,mC,offC); } - void + void run(int numthread) { @@ -693,8 +693,8 @@ class CABqueue ////////// //Loop over threads in a round-robin fashion. - //Assign all tasks with the same memory - //destination (offC) to the same thread. 
@@ -715,14 +715,14 @@ class CABqueue
             //printfln("task size for thread %d is %d",i,tt.size());
             futs[i] = std::async(std::launch::async,
                       [&tt]()
-                      { 
+                      {
                       for(const auto& task : tt)
                           task.execute();
                       }
                       );
             }
         //Wait for futures to complete
-        for(auto& ft : futs) 
+        for(auto& ft : futs)
             {
             ft.wait();
             }
@@ -731,7 +731,7 @@ class CABqueue
 
 
 template
-void 
+void
 contract(CProps const& p,
          TenRefc A,
          TenRefc B,
@@ -802,7 +802,7 @@ contract(CProps const& p,
         }
     else
         {
-        if(p.Ctrans()) 
+        if(p.Ctrans())
             {
             cref = transpose(makeMatRef(C.store(),ncols(bref),nrows(aref)));
             }
@@ -824,9 +824,9 @@ contract(CProps const& p,
     }
 
 template
-void 
-contractScalar(T1 a, 
-               TenRefc B, Labels const& bi, 
+void
+contractScalar(T1 a,
+               TenRefc B, Labels const& bi,
                TenRef> C, Labels const& ci,
                Real alpha,
                Real beta)
@@ -841,19 +841,19 @@ contractScalar(T1 a,
     }
 
 template
-void 
-contract(TenRefc A, Labels const& ai, 
-         TenRefc B, Labels const& bi, 
-         TenRef> C, 
+void
+contract(TenRefc A, Labels const& ai,
+         TenRefc B, Labels const& bi,
+         TenRef> C,
          Labels const& ci,
          Real alpha,
         Real beta)
    {
-    if(ai.empty()) 
+    if(ai.empty())
        {
        contractScalar(*A.data(),B,bi,C,ci,alpha,beta);
        }
-    else if(bi.empty()) 
+    else if(bi.empty())
        {
        contractScalar(*B.data(),A,ai,C,ci,alpha,beta);
        }
@@ -866,44 +866,48 @@ contract(TenRefc A, Labels const& ai,
     }
 
 //Explicit template instantiations:
-template void 
-contract(TenRefc, Labels const&, 
-         TenRefc, Labels const&, 
+/*
+template void
+contract(TenRefc, Labels const&,
+         TenRefc, Labels const&,
          TenRef , Labels const&,
          Real,Real);
-template void 
-contract(TenRefc, Labels const&, 
-         TenRefc, Labels const&, 
+template void
+contract(TenRefc, Labels const&,
+         TenRefc, Labels const&,
          TenRef , Labels const&,
          Real,Real);
-template void 
-contract(TenRefc, Labels const&, 
-         TenRefc, Labels const&, 
+template void
+contract(TenRefc, Labels const&,
+         TenRefc, Labels const&,
          TenRef , Labels const&,
          Real,Real);
-template void 
-contract(TenRefc, Labels const&, 
-         TenRefc, Labels const&, 
+*/
+template void
+contract(TenRefc, Labels const&,
+         TenRefc, Labels const&,
          TenRef , Labels const&,
          Real,Real);
-template void 
-contract(TenRefc, Labels const&, 
-         TenRefc, Labels const&, 
+/*
+template void
+contract(TenRefc, Labels const&,
+         TenRefc, Labels const&,
          TenRef , Labels const&,
         Real,Real);
-template void 
-contract(TenRefc, Labels const&, 
-         TenRefc, Labels const&, 
+template void
+contract(TenRefc, Labels const&,
+         TenRefc, Labels const&,
          TenRef , Labels const&,
         Real,Real);
-template void 
-contract(TenRefc, Labels const&, 
-         TenRefc, Labels const&, 
+template void
+contract(TenRefc, Labels const&,
+         TenRefc, Labels const&,
          TenRef , Labels const&,
         Real,Real);
-template void 
-contract(TenRefc, Labels const&, 
-         TenRefc, Labels const&, 
+*/
+template void
+contract(TenRefc, Labels const&,
+         TenRefc, Labels const&,
          TenRef , Labels const&,
          Real,Real);
 
@@ -913,12 +917,12 @@ struct MultInfo
     bool tA = false,
          tB = false,
          Bfirst = false;
-    MultInfo() {} 
+    MultInfo() {}
     };
 
 MultInfo static
 computeMultInfo(Labels const& ai,
-                Labels const& bi, 
+                Labels const& bi,
                 Labels const& ci)
     {
     MultInfo I;
@@ -982,9 +986,9 @@ computeMultInfo(Labels const& ai,
     }
 
 template
-void 
-contractloop(TenRefc A, Labels const& ai, 
-             TenRefc B, Labels const& bi, 
+void
+contractloop(TenRefc A, Labels const& ai,
+             TenRefc B, Labels const& bi,
              TenRef C, Labels const& ci,
              Args const& args)
     {
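// Sketch, not part of the patch: the CABqueue hunks above keep the existing
// scheduling policy. All tasks writing to the same destination offset (offC)
// are bound to one thread, and offsets are dealt out round-robin, so no two
// threads ever write the same block of C. A condensed version of that
// pattern with plain std::function jobs standing in for the real ABoffC
// tasks (a simplification made for illustration):
#include <functional>
#include <future>
#include <map>
#include <vector>

inline void
runRoundRobin(std::map<int,std::vector<std::function<void()>>>& subtask,
              int numthread)
    {
    //Deal task groups to threads; same offC key -> same thread
    std::vector<std::vector<std::function<void()>>> threadtask(numthread);
    int ss = 0;
    for(auto& t : subtask)
        {
        auto& tt = threadtask[ss%numthread];
        tt.insert(tt.end(),t.second.begin(),t.second.end());
        ++ss;
        }
    //Launch one future per thread, then wait for all of them
    std::vector<std::future<void>> futs(numthread);
    for(int i = 0; i < numthread; ++i)
        futs[i] = std::async(std::launch::async,
                  [&tt = threadtask[i]]()
                      {
                      for(auto& task : tt) task();
                      });
    for(auto& ft : futs) ft.wait();
    }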
@@ -1013,14 +1017,14 @@ contractloop(TenRefc A, Labels const& ai,
 
     auto nfo = computeMultInfo(ai,bi,ci);
 
-    auto Arow = A.extent(0); 
+    auto Arow = A.extent(0);
     auto Acol = A.extent(1);
-    auto Brow = B.extent(0); 
+    auto Brow = B.extent(0);
     auto Bcol = B.extent(1);
-    auto Crow = C.extent(0); 
+    auto Crow = C.extent(0);
     auto Ccol = C.extent(1);
 
-    detail::GCounter couA(ra), 
+    detail::GCounter couA(ra),
                      couB(rb);
     //Keep couA.i[0] and couA.i[1] fixed at 0
     couA.setRange(0,0,0);
@@ -1035,8 +1039,8 @@ contractloop(TenRefc A, Labels const& ai,
     for(int j = 2; j < rb; ++j)
         couB.setRange(j,0,B.extent(j)-1);
 
-    Labels aind(ra,0), 
-           bind(rb,0), 
+    Labels aind(ra,0),
+           bind(rb,0),
            cind(rc,0);
 
     CABqueue cabq;
@@ -1067,13 +1071,13 @@ contractloop(TenRefc A, Labels const& ai,
             cind[p.AtoC(ia)] = ival;
             }
         }
-    
+
     for(;couB.notDone(); ++couB)
         {
         for(int ib = 2; ib < rb; ++ib)
             {
             bind[ib] = couB[ib];
-            if(p.BtoC(ib) != -1) 
+            if(p.BtoC(ib) != -1)
                 if(!p.contractedB(ib))
                     cind[p.BtoC(ib)] = couB[ib];
             }
@@ -1100,15 +1104,15 @@ contractloop(TenRefc A, Labels const& ai,
     cabq.run(nthread);
     }
 
 template
-void 
-contractloop(TenRefc A, Labels const& ai, 
-             TenRefc B, Labels const& bi, 
+void
+contractloop(TenRefc A, Labels const& ai,
+             TenRefc B, Labels const& bi,
              TenRef C, Labels const& ci,
             Args const& args);
 template
-void 
-contractloop(TenRefc A, Labels const& ai, 
-             TenRefc B, Labels const& bi, 
+void
+contractloop(TenRefc A, Labels const& ai,
+             TenRefc B, Labels const& bi,
              TenRef C, Labels const& ci,
              Args const& args);
 
@@ -1118,29 +1122,29 @@ contractloop(TenRefc A, Labels const& ai,
 
 //All indices of B contracted
 //(A can have some uncontracted indices)
 template
-void 
-contractDiagFull(VectorRefc A, Labels const& ai, 
-                 TenRefc B, Labels const& bi, 
+void
+contractDiagFull(VectorRefc A, Labels const& ai,
+                 TenRefc B, Labels const& bi,
                  VectorRef C, Labels const& ci)
     {
     Error("contractDiagFull not yet implemented");
     }
 
 template
-void 
-contractDiagFull(VectorRefc A, Labels const& ai, 
-                 TenRefc B, Labels const& bi, 
+void
+contractDiagFull(VectorRefc A, Labels const& ai,
+                 TenRefc B, Labels const& bi,
                  VectorRef C, Labels const& ci);
 template
-void 
-contractDiagFull(VectorRefc A, Labels const& ai, 
-                 TenRefc B, Labels const& bi, 
+void
+contractDiagFull(VectorRefc A, Labels const& ai,
+                 TenRefc B, Labels const& bi,
                  VectorRef C, Labels const& ci);
 
 ////////////////////////////////////////////
 //
 // Non-Contracting Product Optimization Ideas
-// 
+//
 // o Identify common index to A,B,C with largest
 //   extent and make this the innermost loop
 //   (similar to big index in transform method in ten_impl.h)
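// Sketch, not part of the patch: the first idea in the list above is a loop
// ordering rule. When A, B and C share a common non-contracted index, putting
// the shared index with the largest extent innermost lets the kernel stream
// through memory contiguously. A toy rank-2 non-contracting product, with
// row-major layouts and the names below assumed purely for illustration:
#include <cstddef>
#include <vector>

inline void
nonContractingProduct(std::vector<double> const& A,
                      std::vector<double> const& B,
                      std::vector<double>& C,
                      std::size_t ni,
                      std::size_t nj) //nj = the large shared index
    {
    for(std::size_t i = 0; i < ni; ++i)
        for(std::size_t j = 0; j < nj; ++j) //innermost: largest extent
            C[i*nj+j] = A[i*nj+j] * B[i*nj+j];
    }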