From 73d604790334dbb75dbd4e820da78938db9e73b3 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Thu, 7 Apr 2022 15:18:31 +0200 Subject: [PATCH 01/68] added platform cuda --- itensor/tensor/lapack_wrap.h | 219 +++++++++++++++++++---------------- 1 file changed, 122 insertions(+), 97 deletions(-) diff --git a/itensor/tensor/lapack_wrap.h b/itensor/tensor/lapack_wrap.h index 618398231..0af8e3c49 100644 --- a/itensor/tensor/lapack_wrap.h +++ b/itensor/tensor/lapack_wrap.h @@ -56,16 +56,16 @@ using LAPACK_INT = lapack_int; using LAPACK_REAL = double; using LAPACK_COMPLEX = lapack_complex_double; -inline LAPACK_REAL& -realRef(LAPACK_COMPLEX & z) - { +inline LAPACK_REAL& +realRef(LAPACK_COMPLEX & z) + { auto* p = reinterpret_cast(&z); return p[0]; } -inline LAPACK_REAL& -imagRef(LAPACK_COMPLEX & z) - { +inline LAPACK_REAL& +imagRef(LAPACK_COMPLEX & z) + { auto* p = reinterpret_cast(&z); return p[1]; } @@ -87,10 +87,10 @@ imagRef(LAPACK_COMPLEX & z) using LAPACK_REAL = __CLPK_doublereal; using LAPACK_COMPLEX = __CLPK_doublecomplex; - inline LAPACK_REAL& + inline LAPACK_REAL& realRef(LAPACK_COMPLEX & z) { return z.r; } - inline LAPACK_REAL& + inline LAPACK_REAL& imagRef(LAPACK_COMPLEX & z) { return z.i; } } @@ -111,10 +111,10 @@ imagRef(LAPACK_COMPLEX & z) using LAPACK_REAL = double; using LAPACK_COMPLEX = MKL_Complex16; - inline LAPACK_REAL& + inline LAPACK_REAL& realRef(LAPACK_COMPLEX & z) { return z.real; } - inline LAPACK_REAL& + inline LAPACK_REAL& imagRef(LAPACK_COMPLEX & z) { return z.imag; } } @@ -135,13 +135,38 @@ imagRef(LAPACK_COMPLEX & z) LAPACK_REAL real, imag; } LAPACK_COMPLEX; - inline LAPACK_REAL& + inline LAPACK_REAL& realRef(LAPACK_COMPLEX & z) { return z.real; } - inline LAPACK_REAL& + inline LAPACK_REAL& imagRef(LAPACK_COMPLEX & z) { return z.imag; } } +#elif defined PLATFORM_cuda + +#define ITENSOR_USE_CUDA + +#include +#include +#include +#include + +namespace itensor { +cudaDataType_t typeComplexData = CUDA_C_64F; +cudaDataType_t typeRealData = CUDA_R_64F; +cublasComputeType_t typeCompute = CUBLAS_COMPUTE_64F; + +using LAPACK_INT = int; +using LAPACK_REAL = double; +using LAPACK_COMPLEX = cuDoubleComplex; + +inline LAPACK_REAL& +realRef(LAPACK_COMPLEX & z) { return z.x; } + +inline LAPACK_REAL& +imagRef(LAPACK_COMPLEX & z) { return z.y; } +} + #endif // different PLATFORM types @@ -174,14 +199,14 @@ LAPACK_REAL F77NAME(dnrm2)(LAPACK_INT* N, LAPACK_REAL* X, LAPACK_INT* incx); #ifdef ITENSOR_USE_CBLAS void cblas_daxpy(const int n, const double alpha, const double *X, const int incX, double *Y, const int incY); #else -void F77NAME(daxpy)(LAPACK_INT* n, LAPACK_REAL* alpha, +void F77NAME(daxpy)(LAPACK_INT* n, LAPACK_REAL* alpha, LAPACK_REAL* X, LAPACK_INT* incx, LAPACK_REAL* Y, LAPACK_INT* incy); #endif //ddot declaration #ifdef ITENSOR_USE_CBLAS -LAPACK_REAL +LAPACK_REAL cblas_ddot(const LAPACK_INT N, const LAPACK_REAL *X, const LAPACK_INT incx, const LAPACK_REAL *Y, const LAPACK_INT incy); #else LAPACK_REAL F77NAME(ddot)(LAPACK_INT* N, LAPACK_REAL* X, LAPACK_INT* incx, LAPACK_REAL* Y, LAPACK_INT* incy); @@ -189,7 +214,7 @@ LAPACK_REAL F77NAME(ddot)(LAPACK_INT* N, LAPACK_REAL* X, LAPACK_INT* incx, LAPAC //zdotc declaration #ifdef ITENSOR_USE_CBLAS -LAPACK_REAL +LAPACK_REAL cblas_zdotc_sub(const LAPACK_INT N, const void *X, const LAPACK_INT incx, const void *Y, const LAPACK_INT incy, void *res); #else LAPACK_COMPLEX F77NAME(zdotc)(LAPACK_INT* N, LAPACK_COMPLEX* X, LAPACK_INT* incx, LAPACK_COMPLEX* Y, LAPACK_INT* incy); @@ -211,19 +236,19 @@ void 
F77NAME(dgemm)(char*,char*,LAPACK_INT*,LAPACK_INT*,LAPACK_INT*, //zgemm declaration #ifdef PLATFORM_openblas -void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, - OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, - OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, - OPENBLAS_CONST blasint M, - OPENBLAS_CONST blasint N, +void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, + OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, + OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, + OPENBLAS_CONST blasint M, + OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, - OPENBLAS_CONST double *alpha, - OPENBLAS_CONST double *A, - OPENBLAS_CONST blasint lda, - OPENBLAS_CONST double *B, - OPENBLAS_CONST blasint ldb, - OPENBLAS_CONST double *beta, - double *C, + OPENBLAS_CONST double *alpha, + OPENBLAS_CONST double *A, + OPENBLAS_CONST blasint lda, + OPENBLAS_CONST double *B, + OPENBLAS_CONST blasint ldb, + OPENBLAS_CONST double *beta, + double *C, OPENBLAS_CONST blasint ldc); #else //platform not openblas @@ -257,22 +282,22 @@ void F77NAME(dgemv)(char* transa,LAPACK_INT* M,LAPACK_INT* N,LAPACK_REAL* alpha, //zgemv declaration #ifdef PLATFORM_openblas -void cblas_zgemv(OPENBLAS_CONST enum CBLAS_ORDER order, - OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, - OPENBLAS_CONST blasint m, +void cblas_zgemv(OPENBLAS_CONST enum CBLAS_ORDER order, + OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, + OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, - OPENBLAS_CONST double *alpha, - OPENBLAS_CONST double *a, - OPENBLAS_CONST blasint lda, - OPENBLAS_CONST double *x, - OPENBLAS_CONST blasint incx, - OPENBLAS_CONST double *beta, - double *y, + OPENBLAS_CONST double *alpha, + OPENBLAS_CONST double *a, + OPENBLAS_CONST blasint lda, + OPENBLAS_CONST double *x, + OPENBLAS_CONST blasint incx, + OPENBLAS_CONST double *beta, + double *y, OPENBLAS_CONST blasint incy); #else #ifdef ITENSOR_USE_CBLAS -void cblas_zgemv(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE trans, const LAPACK_INT m, - const LAPACK_INT n, const void *alpha, const void *a, const LAPACK_INT lda, +void cblas_zgemv(const CBLAS_ORDER Order, const CBLAS_TRANSPOSE trans, const LAPACK_INT m, + const LAPACK_INT n, const void *alpha, const void *a, const LAPACK_INT lda, const void *x, const LAPACK_INT incx, const void *beta, void *y, const LAPACK_INT incy); #else void F77NAME(zgemv)(char* transa,LAPACK_INT* M,LAPACK_INT* N,LAPACK_COMPLEX* alpha, LAPACK_COMPLEX* A, @@ -282,8 +307,8 @@ void F77NAME(zgemv)(char* transa,LAPACK_INT* M,LAPACK_INT* N,LAPACK_COMPLEX* alp #endif //zgemv declaration #ifdef PLATFORM_acml -void F77NAME(dsyev)(char *jobz, char *uplo, int *n, double *a, int *lda, - double *w, double *work, int *lwork, int *info, +void F77NAME(dsyev)(char *jobz, char *uplo, int *n, double *a, int *lda, + double *w, double *work, int *lwork, int *info, int jobz_len, int uplo_len); #else void F77NAME(dsyev)(const char* jobz, const char* uplo, const LAPACK_INT* n, double* a, @@ -299,66 +324,66 @@ void F77NAME(dscal)(LAPACK_INT* N, LAPACK_REAL* alpha, LAPACK_REAL* X,LAPACK_INT #ifdef PLATFORM_acml -void F77NAME(dgesdd)(char *jobz, LAPACK_INT *m, LAPACK_INT *n, double *a, LAPACK_INT *lda, double *s, - double *u, LAPACK_INT *ldu, double *vt, LAPACK_INT *ldvt, +void F77NAME(dgesdd)(char *jobz, LAPACK_INT *m, LAPACK_INT *n, double *a, LAPACK_INT *lda, double *s, + double *u, LAPACK_INT *ldu, double *vt, LAPACK_INT *ldvt, double *work, LAPACK_INT *lwork, LAPACK_INT *iwork, LAPACK_INT *info, int jobz_len); #else -void F77NAME(dgesdd)(char *jobz, LAPACK_INT *m, LAPACK_INT *n, double *a, LAPACK_INT 
*lda, double *s, - double *u, LAPACK_INT *ldu, double *vt, LAPACK_INT *ldvt, +void F77NAME(dgesdd)(char *jobz, LAPACK_INT *m, LAPACK_INT *n, double *a, LAPACK_INT *lda, double *s, + double *u, LAPACK_INT *ldu, double *vt, LAPACK_INT *ldvt, double *work, LAPACK_INT *lwork, LAPACK_INT *iwork, LAPACK_INT *info); #endif #ifdef PLATFORM_acml - void F77NAME(dgesvd)(char *jobz, char* jobv, LAPACK_INT *m, LAPACK_INT *n, double *a, LAPACK_INT *lda, double *s, - double *u, LAPACK_INT *ldu, double *vt, LAPACK_INT *ldvt, + void F77NAME(dgesvd)(char *jobz, char* jobv, LAPACK_INT *m, LAPACK_INT *n, double *a, LAPACK_INT *lda, double *s, + double *u, LAPACK_INT *ldu, double *vt, LAPACK_INT *ldvt, double *work, LAPACK_INT *lwork, LAPACK_INT *info, int jobz_len); #else - void F77NAME(dgesvd)(char *jobz, char* jobv, LAPACK_INT *m, LAPACK_INT *n, double *a, LAPACK_INT *lda, double *s, - double *u, LAPACK_INT *ldu, double *vt, LAPACK_INT *ldvt, + void F77NAME(dgesvd)(char *jobz, char* jobv, LAPACK_INT *m, LAPACK_INT *n, double *a, LAPACK_INT *lda, double *s, + double *u, LAPACK_INT *ldu, double *vt, LAPACK_INT *ldvt, double *work, LAPACK_INT *lwork, LAPACK_INT *info); #endif #ifdef PLATFORM_acml - void F77NAME(zgesvd)(char *jobz, char* jobv, LAPACK_INT *m, LAPACK_INT *n, LAPACK_COMPLEX *a, LAPACK_INT *lda, LAPACK_REAL *s, - LAPACK_COMPLEX *u, LAPACK_INT *ldu, LAPACK_COMPLEX *vt, LAPACK_INT *ldvt, + void F77NAME(zgesvd)(char *jobz, char* jobv, LAPACK_INT *m, LAPACK_INT *n, LAPACK_COMPLEX *a, LAPACK_INT *lda, LAPACK_REAL *s, + LAPACK_COMPLEX *u, LAPACK_INT *ldu, LAPACK_COMPLEX *vt, LAPACK_INT *ldvt, LAPACK_COMPLEX *work, LAPACK_INT *lwork, LAPACK_REAL * rwork, LAPACK_INT *info, int jobz_len); #else - void F77NAME(zgesvd)(char *jobz, char* jobv, LAPACK_INT *m, LAPACK_INT *n, LAPACK_COMPLEX *a, LAPACK_INT *lda, LAPACK_REAL *s, - LAPACK_COMPLEX *u, LAPACK_INT *ldu, LAPACK_COMPLEX *vt, LAPACK_INT *ldvt, + void F77NAME(zgesvd)(char *jobz, char* jobv, LAPACK_INT *m, LAPACK_INT *n, LAPACK_COMPLEX *a, LAPACK_INT *lda, LAPACK_REAL *s, + LAPACK_COMPLEX *u, LAPACK_INT *ldu, LAPACK_COMPLEX *vt, LAPACK_INT *ldvt, LAPACK_COMPLEX *work, LAPACK_INT *lwork, LAPACK_REAL * rwork, LAPACK_INT *info); #endif #ifdef PLATFORM_acml -void F77NAME(zgesdd)(char *jobz, int *m, int *n, LAPACK_COMPLEX *a, int *lda, double *s, - LAPACK_COMPLEX *u, int *ldu, LAPACK_COMPLEX *vt, int *ldvt, - LAPACK_COMPLEX *work, int *lwork, double *rwork, int *iwork, int *info, +void F77NAME(zgesdd)(char *jobz, int *m, int *n, LAPACK_COMPLEX *a, int *lda, double *s, + LAPACK_COMPLEX *u, int *ldu, LAPACK_COMPLEX *vt, int *ldvt, + LAPACK_COMPLEX *work, int *lwork, double *rwork, int *iwork, int *info, int jobz_len); #else -void F77NAME(zgesdd)(char *jobz, LAPACK_INT *m, LAPACK_INT *n, LAPACK_COMPLEX *a, LAPACK_INT *lda, double *s, - LAPACK_COMPLEX *u, LAPACK_INT *ldu, LAPACK_COMPLEX *vt, LAPACK_INT *ldvt, +void F77NAME(zgesdd)(char *jobz, LAPACK_INT *m, LAPACK_INT *n, LAPACK_COMPLEX *a, LAPACK_INT *lda, double *s, + LAPACK_COMPLEX *u, LAPACK_INT *ldu, LAPACK_COMPLEX *vt, LAPACK_INT *ldvt, LAPACK_COMPLEX *work, LAPACK_INT *lwork, double *rwork, LAPACK_INT *iwork, LAPACK_INT *info); #endif -void F77NAME(dgeqrf)(LAPACK_INT *m, LAPACK_INT *n, double *a, LAPACK_INT *lda, +void F77NAME(dgeqrf)(LAPACK_INT *m, LAPACK_INT *n, double *a, LAPACK_INT *lda, double *tau, double *work, LAPACK_INT *lwork, LAPACK_INT *info); -void F77NAME(dorgqr)(LAPACK_INT *m, LAPACK_INT *n, LAPACK_INT *k, double *a, - LAPACK_INT *lda, double *tau, double *work, LAPACK_INT *lwork, +void 
F77NAME(dorgqr)(LAPACK_INT *m, LAPACK_INT *n, LAPACK_INT *k, double *a, + LAPACK_INT *lda, double *tau, double *work, LAPACK_INT *lwork, LAPACK_INT *info); - -void F77NAME(zgeqrf)(LAPACK_INT *m, LAPACK_INT *n, LAPACK_COMPLEX *a, LAPACK_INT *lda, + +void F77NAME(zgeqrf)(LAPACK_INT *m, LAPACK_INT *n, LAPACK_COMPLEX *a, LAPACK_INT *lda, LAPACK_COMPLEX *tau, LAPACK_COMPLEX *work, LAPACK_INT *lwork, LAPACK_INT *info); #ifdef PLATFORM_lapacke -void LAPACKE_zungqr(int matrix_layout, LAPACK_INT *m, LAPACK_INT *n, LAPACK_INT *k, LAPACK_COMPLEX *a, - LAPACK_INT *lda, LAPACK_COMPLEX *tau, LAPACK_COMPLEX *work, LAPACK_INT *lwork, +void LAPACKE_zungqr(int matrix_layout, LAPACK_INT *m, LAPACK_INT *n, LAPACK_INT *k, LAPACK_COMPLEX *a, + LAPACK_INT *lda, LAPACK_COMPLEX *tau, LAPACK_COMPLEX *work, LAPACK_INT *lwork, LAPACK_INT *info); #else -void F77NAME(zungqr)(LAPACK_INT *m, LAPACK_INT *n, LAPACK_INT *k, LAPACK_COMPLEX *a, - LAPACK_INT *lda, LAPACK_COMPLEX *tau, LAPACK_COMPLEX *work, LAPACK_INT *lwork, +void F77NAME(zungqr)(LAPACK_INT *m, LAPACK_INT *n, LAPACK_INT *k, LAPACK_COMPLEX *a, + LAPACK_INT *lda, LAPACK_COMPLEX *tau, LAPACK_COMPLEX *work, LAPACK_INT *lwork, LAPACK_INT *info); #endif @@ -388,51 +413,51 @@ LAPACK_REAL F77NAME(zlange)(char* norm, LAPACK_INT* m, LAPACK_INT* n, LAPACK_COM lapack_int LAPACKE_zheev(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda, double* w); #elif defined PLATFORM_acml -void F77NAME(zheev)(char *jobz, char *uplo, LAPACK_INT *n, LAPACK_COMPLEX *a, LAPACK_INT *lda, - double *w, LAPACK_COMPLEX *work, LAPACK_INT *lwork, double *rwork, +void F77NAME(zheev)(char *jobz, char *uplo, LAPACK_INT *n, LAPACK_COMPLEX *a, LAPACK_INT *lda, + double *w, LAPACK_COMPLEX *work, LAPACK_INT *lwork, double *rwork, LAPACK_INT *info, LAPACK_INT jobz_len, LAPACK_INT uplo_len); #else -void F77NAME(zheev)(char *jobz, char *uplo, LAPACK_INT *n, LAPACK_COMPLEX *a, LAPACK_INT *lda, - double *w, LAPACK_COMPLEX *work, LAPACK_INT *lwork, double *rwork, +void F77NAME(zheev)(char *jobz, char *uplo, LAPACK_INT *n, LAPACK_COMPLEX *a, LAPACK_INT *lda, + double *w, LAPACK_COMPLEX *work, LAPACK_INT *lwork, double *rwork, LAPACK_INT *info); #endif #ifdef PLATFORM_acml -void F77NAME(dsygv)(LAPACK_INT *itype, char *jobz, char *uplo, LAPACK_INT *n, double *a, - LAPACK_INT *lda, double *b, LAPACK_INT *ldb, double *w, double *work, - LAPACK_INT *lwork, LAPACK_INT *info, LAPACK_INT jobz_len, LAPACK_INT uplo_len); +void F77NAME(dsygv)(LAPACK_INT *itype, char *jobz, char *uplo, LAPACK_INT *n, double *a, + LAPACK_INT *lda, double *b, LAPACK_INT *ldb, double *w, double *work, + LAPACK_INT *lwork, LAPACK_INT *info, LAPACK_INT jobz_len, LAPACK_INT uplo_len); #else -void F77NAME(dsygv)(LAPACK_INT *itype, char *jobz, char *uplo, LAPACK_INT *n, double *a, - LAPACK_INT *lda, double *b, LAPACK_INT *ldb, double *w, double *work, +void F77NAME(dsygv)(LAPACK_INT *itype, char *jobz, char *uplo, LAPACK_INT *n, double *a, + LAPACK_INT *lda, double *b, LAPACK_INT *ldb, double *w, double *work, LAPACK_INT *lwork, LAPACK_INT *info); #endif #ifdef PLATFORM_acml -void F77NAME(dgeev)(char *jobvl, char *jobvr, LAPACK_INT *n, double *a, - LAPACK_INT *lda, double *wr, double *wi, double *vl, LAPACK_INT *ldvl, - double *vr, LAPACK_INT *ldvr, double *work, LAPACK_INT *lwork, +void F77NAME(dgeev)(char *jobvl, char *jobvr, LAPACK_INT *n, double *a, + LAPACK_INT *lda, double *wr, double *wi, double *vl, LAPACK_INT *ldvl, + double *vr, LAPACK_INT *ldvr, double *work, LAPACK_INT *lwork, LAPACK_INT 
*info, LAPACK_INT jobvl_len, LAPACK_INT jobvr_len); #else -void F77NAME(dgeev)(char *jobvl, char *jobvr, LAPACK_INT *n, double *a, - LAPACK_INT *lda, double *wr, double *wi, double *vl, LAPACK_INT *ldvl, - double *vr, LAPACK_INT *ldvr, double *work, LAPACK_INT *lwork, +void F77NAME(dgeev)(char *jobvl, char *jobvr, LAPACK_INT *n, double *a, + LAPACK_INT *lda, double *wr, double *wi, double *vl, LAPACK_INT *ldvl, + double *vr, LAPACK_INT *ldvr, double *work, LAPACK_INT *lwork, LAPACK_INT *info); #endif #ifdef PLATFORM_acml -void F77NAME(zgeev)(char *jobvl, char *jobvr, LAPACK_INT *n, LAPACK_COMPLEX *a, - LAPACK_INT *lda, LAPACK_COMPLEX *w, LAPACK_COMPLEX *vl, - LAPACK_INT *ldvl, LAPACK_COMPLEX *vr, LAPACK_INT *ldvr, - LAPACK_COMPLEX *work, LAPACK_INT *lwork, double *rwork, +void F77NAME(zgeev)(char *jobvl, char *jobvr, LAPACK_INT *n, LAPACK_COMPLEX *a, + LAPACK_INT *lda, LAPACK_COMPLEX *w, LAPACK_COMPLEX *vl, + LAPACK_INT *ldvl, LAPACK_COMPLEX *vr, LAPACK_INT *ldvr, + LAPACK_COMPLEX *work, LAPACK_INT *lwork, double *rwork, LAPACK_INT *info, LAPACK_INT jobvl_len, LAPACK_INT jobvr_len); #else -void F77NAME(zgeev)(char *jobvl, char *jobvr, LAPACK_INT *n, LAPACK_COMPLEX *a, - LAPACK_INT *lda, LAPACK_COMPLEX *w, LAPACK_COMPLEX *vl, - LAPACK_INT *ldvl, LAPACK_COMPLEX *vr, LAPACK_INT *ldvr, - LAPACK_COMPLEX *work, LAPACK_INT *lwork, double *rwork, +void F77NAME(zgeev)(char *jobvl, char *jobvr, LAPACK_INT *n, LAPACK_COMPLEX *a, + LAPACK_INT *lda, LAPACK_COMPLEX *w, LAPACK_COMPLEX *vl, + LAPACK_INT *ldvl, LAPACK_COMPLEX *vr, LAPACK_INT *ldvr, + LAPACK_COMPLEX *work, LAPACK_INT *lwork, double *rwork, LAPACK_INT *info); #endif @@ -483,7 +508,7 @@ zdotc_wrapper(LAPACK_INT N, // dgemm // void -gemm_wrapper(bool transa, +gemm_wrapper(bool transa, bool transb, LAPACK_INT m, LAPACK_INT n, @@ -498,7 +523,7 @@ gemm_wrapper(bool transa, // zgemm // void -gemm_wrapper(bool transa, +gemm_wrapper(bool transa, bool transb, LAPACK_INT m, LAPACK_INT n, @@ -513,7 +538,7 @@ gemm_wrapper(bool transa, // dgemv - matrix*vector multiply // void -gemv_wrapper(bool trans, +gemv_wrapper(bool trans, LAPACK_REAL alpha, LAPACK_REAL beta, LAPACK_INT m, @@ -528,7 +553,7 @@ gemv_wrapper(bool trans, // zgemv - matrix*vector multiply // void -gemv_wrapper(bool trans, +gemv_wrapper(bool trans, Cplx alpha, Cplx beta, LAPACK_INT m, @@ -694,7 +719,7 @@ zgesv_wrapper(LAPACK_INT n, // // dlange // -// returns the value of the 1-norm, Frobenius norm, infinity-norm, +// returns the value of the 1-norm, Frobenius norm, infinity-norm, // or the largest absolute value of any element of a general rectangular matrix. // double @@ -706,7 +731,7 @@ dlange_wrapper(char norm, // // zlange // -// returns the value of the 1-norm, Frobenius norm, infinity-norm, +// returns the value of the 1-norm, Frobenius norm, infinity-norm, // or the largest absolute value of any element of a general rectangular matrix. 
// LAPACK_REAL @@ -720,7 +745,7 @@ zlange_wrapper(char norm, // // Eigenvalues and eigenvectors of complex Hermitian matrix A // -LAPACK_INT +LAPACK_INT zheev_wrapper(LAPACK_INT N, //number of cols of A Cplx * A, //matrix A, on return contains eigenvectors LAPACK_REAL * d); //eigenvalues on return From 3ee56cc8a96297cc7e1aeab6a2b4ca7c9b87ed79 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 8 Apr 2022 13:54:45 +0200 Subject: [PATCH 02/68] added wrapper to all blas routines --- itensor/tensor/lapack_wrap.cc | 231 ++++++++++++++++++++++++++++------ 1 file changed, 191 insertions(+), 40 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 41d54825a..09060cd50 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -22,7 +22,7 @@ namespace itensor { // daxpy // Y += alpha*X // -void +void daxpy_wrapper(LAPACK_INT n, //number of elements of X,Y LAPACK_REAL alpha, //scale factor const LAPACK_REAL* X, //pointer to head of vector X @@ -32,6 +32,19 @@ daxpy_wrapper(LAPACK_INT n, //number of elements of X,Y { #ifdef ITENSOR_USE_CBLAS cblas_daxpy(n,alpha,X,incx,Y,incy); +#elif ITENSOR_USE_CUDA + cublasHandle_t handle; + cublasCreate(&handle); + LAPACK_REAL *d_X, *d_Y; + cudaMalloc(&d_X, n * sizeof(LAPACK_REAL)); + cudaMalloc(&d_Y, n * sizeof(LAPACK_REAL)); + cublasSetVector(n, sizeof(LAPACK_REAL), X, incx, d_X, incx); + cublasSetVector(n, sizeof(LAPACK_REAL), Y, incy, d_Y, incy); + cublasDaxpy(handle, n, &alpha, d_X, incx, d_Y, incy); + cublasGetVector(n, sizeof(LAPACK_REAL), d_Y, incy, Y, incy); + cudaFree(d_X); + cudaFree(d_Y); + cublasDestroy(handle); #else auto Xnc = const_cast(X); F77NAME(daxpy)(&n,&alpha,Xnc,&incx,Y,&incy); @@ -41,13 +54,24 @@ daxpy_wrapper(LAPACK_INT n, //number of elements of X,Y // // dnrm2 // -LAPACK_REAL +LAPACK_REAL dnrm2_wrapper(LAPACK_INT N, const LAPACK_REAL* X, LAPACK_INT incx) { #ifdef ITENSOR_USE_CBLAS return cblas_dnrm2(N,X,incx); +#elif ITENSOR_USE_CUDA + cublasHandle_t handle; + cublasCreate(&handle); + LAPACK_REAL *d_X; + cudaMalloc(&d_X, N * sizeof(LAPACK_REAL)); + cublasSetVector(N, sizeof(LAPACK_REAL), X, incx, d_X, incx); + LAPACK_REAL result; + cublasDnrm2(handle, N, d_X, incx, &result); + cudaFree(d_X); + cublasDestroy(handle); + return result; #else auto *Xnc = const_cast(X); return F77NAME(dnrm2)(&N,Xnc,&incx); @@ -58,7 +82,7 @@ dnrm2_wrapper(LAPACK_INT N, // // ddot // -LAPACK_REAL +LAPACK_REAL ddot_wrapper(LAPACK_INT N, const LAPACK_REAL* X, LAPACK_INT incx, @@ -67,6 +91,20 @@ ddot_wrapper(LAPACK_INT N, { #ifdef ITENSOR_USE_CBLAS return cblas_ddot(N,X,incx,Y,incy); +#elif ITENSOR_USE_CUDA + cublasHandle_t handle; + cublasCreate(&handle); + LAPACK_REAL *d_X, *d_Y; + cudaMalloc(&d_X, N * sizeof(LAPACK_REAL)); + cudaMalloc(&d_Y, N * sizeof(LAPACK_REAL)); + cublasSetVector(N, sizeof(LAPACK_REAL), X, incx, d_X, incx); + cublasSetVector(N, sizeof(LAPACK_REAL), Y, incy, d_Y, incy); + LAPACK_REAL result; + cublasDdot (handle, N, d_X, incx, d_Y, incy, &result); + cudaFree(d_X); + cudaFree(d_Y); + cublasDestroy(handle); + return result; #else auto *Xnc = const_cast(X); auto *Ync = const_cast(Y); @@ -98,6 +136,20 @@ zdotc_wrapper(LAPACK_INT N, #endif cblas_zdotc_sub(N,pX,incx,pY,incy,pres); return res; +#elif ITENSOR_USE_CUDA + cublasHandle_t handle; + cublasCreate(&handle); + LAPACK_COMPLEX res; + LAPACK_COMPLEX *d_X, *d_Y; + cudaMalloc(&d_X, N * sizeof(LAPACK_COMPLEX)); + cudaMalloc(&d_Y, N * sizeof(LAPACK_COMPLEX)); + cublasSetVector(N, sizeof(LAPACK_COMPLEX), X, incx, d_X, incx); + 
cublasSetVector(N, sizeof(LAPACK_COMPLEX), Y, incy, d_Y, incy); + cublasZdotc(handle, N, d_X, incx, d_Y, incy, &res); + cudaFree(d_X); + cudaFree(d_Y); + cublasDestroy(handle); + return res; #else auto ncX = const_cast(X); auto ncY = const_cast(Y); @@ -113,8 +165,8 @@ zdotc_wrapper(LAPACK_INT N, // // dgemm // -void -gemm_wrapper(bool transa, +void +gemm_wrapper(bool transa, bool transb, LAPACK_INT m, LAPACK_INT n, @@ -141,6 +193,34 @@ gemm_wrapper(bool transa, ldb = n; } cblas_dgemm(CblasColMajor,at,bt,m,n,k,alpha,A,lda,B,ldb,beta,C,m); +#elif ITENSOR_USE_CUDA + cublasOperation_t at = CUBLAS_OP_N; + cublasOperation_t bt = CUBLAS_OP_N; + if(transa) + { + at = CUBLAS_OP_T; + lda = k; + } + if(transb) + { + bt = CUBLAS_OP_T; + ldb = n; + } + cublasHandle_t handle; + cublasCreate(&handle); + LAPACK_REAL *d_A, *d_B, *d_C; + cudaMalloc(&d_A, m * k * sizeof(LAPACK_REAL)); + cudaMalloc(&d_B, k * n * sizeof(LAPACK_REAL)); + cudaMalloc(&d_C, m * n * sizeof(LAPACK_REAL)); + cublasSetMatrix(m, k, sizeof(LAPACK_REAL), A, lda, d_A, lda); + cublasSetMatrix(k, n, sizeof(LAPACK_REAL), B, ldb, d_B, ldb); + cublasSetMatrix(m, n, sizeof(LAPACK_REAL), C, m, d_C, m); + cublasDgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m) + cublasGetMatrix(m, n, sizeof(LAPACK_REAL), d_C, m, C, m); + cudaFree(d_A); + cudaFree(d_B); + cudaFree(d_C); + cublasDestroy(handle); #else auto *pA = const_cast(A); auto *pB = const_cast(B); @@ -163,8 +243,8 @@ gemm_wrapper(bool transa, // // zgemm // -void -gemm_wrapper(bool transa, +void +gemm_wrapper(bool transa, bool transb, LAPACK_INT m, LAPACK_INT n, @@ -204,6 +284,34 @@ gemm_wrapper(bool transa, auto* pB = reinterpret_cast(B); auto* pC = reinterpret_cast(C); cblas_zgemm(CblasColMajor,at,bt,m,n,k,palpha,pA,lda,pB,ldb,pbeta,pC,m); +#elif ITENSOR_USE_CUDA + cublasOperation_t at = CUBLAS_OP_N; + cublasOperation_t bt = CUBLAS_OP_N; + if(transa) + { + at = CUBLAS_OP_T; + lda = k; + } + if(transb) + { + bt = CUBLAS_OP_T; + ldb = n; + } + cublasHandle_t handle; + cublasCreate(&handle); + LAPACK_COMPLEX *d_A, *d_B, *d_C; + cudaMalloc(&d_A, m * k * sizeof(LAPACK_COMPLEX)); + cudaMalloc(&d_B, k * n * sizeof(LAPACK_COMPLEX)); + cudaMalloc(&d_C, m * n * sizeof(LAPACK_COMPLEX)); + cublasSetMatrix(m, k, sizeof(LAPACK_COMPLEX), A, lda, d_A, lda); + cublasSetMatrix(k, n, sizeof(LAPACK_COMPLEX), B, ldb, d_B, ldb); + cublasSetMatrix(m, n, sizeof(LAPACK_COMPLEX), C, m, d_C, m); + cublasZgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m) + cublasGetMatrix(m, n, sizeof(LAPACK_COMPLEX), d_C, m, C, m); + cudaFree(d_A); + cudaFree(d_B); + cudaFree(d_C); + cublasDestroy(handle); #else //platform not openblas #ifdef ITENSOR_USE_CBLAS auto at = CblasNoTrans, @@ -218,8 +326,8 @@ gemm_wrapper(bool transa, bt = CblasTrans; ldb = n; } - auto palpha = (void*)(&alpha); - auto pbeta = (void*)(&beta); + auto palpha = (void*)(&alpha); + auto pbeta = (void*)(&beta); cblas_zgemm(CblasColMajor,at,bt,m,n,k,palpha,(void*)A,lda,(void*)B,ldb,pbeta,(void*)C,m); #else //use Fortran zgemm auto *ncA = const_cast(A); @@ -246,8 +354,8 @@ gemm_wrapper(bool transa, #endif } -void -gemv_wrapper(bool trans, +void +gemv_wrapper(bool trans, LAPACK_REAL alpha, LAPACK_REAL beta, LAPACK_INT m, @@ -261,6 +369,23 @@ gemv_wrapper(bool trans, #ifdef ITENSOR_USE_CBLAS auto Tr = trans ? CblasTrans : CblasNoTrans; cblas_dgemv(CblasColMajor,Tr,m,n,alpha,A,m,x,incx,beta,y,incy); +#elif ITENSOR_USE_CUDA + cublasOperation_t tr = trans ? 
CUBLAS_OP_T : CUBLAS_OP_N; + cublasHandle_t handle; + cublasCreate(&handle); + LAPACK_REAL *d_A, *d_x, *d_y; + cudaMalloc(&d_A, m * n * sizeof(LAPACK_REAL)); + cudaMalloc(&d_x, m * sizeof(LAPACK_REAL)); + cudaMalloc(&d_y, m * sizeof(LAPACK_REAL)); + cublasSetMatrix(m, n, sizeof(LAPACK_REAL), A, m, d_A, m); + cublasSetVector(m, sizeof(LAPACK_REAL), x, incx, d_x, incx); + cublasSetVector(m, sizeof(LAPACK_REAL), y, incy, d_y, incy); + cublasDgemv(handle, tr, m, n, &alpha, d_A, m, d_x, incx, &beta, d_y, incy) + cublasGetVector(m, sizeof(LAPACK_REAL), d_y, incy, y, incy); + cudaFree(d_A); + cudaFree(d_x); + cudaFree(d_y); + cublasDestroy(handle); #else char Tr = trans ? 'T' : 'N'; F77NAME(dgemv)(&Tr,&m,&n,&alpha,const_cast(A),&m,const_cast(x),&incx,&beta,y,&incy); @@ -268,7 +393,7 @@ gemv_wrapper(bool trans, } void -gemv_wrapper(bool trans, +gemv_wrapper(bool trans, Cplx alpha, Cplx beta, LAPACK_INT m, @@ -295,21 +420,38 @@ gemv_wrapper(bool trans, auto* px = reinterpret_cast(x); auto* py = reinterpret_cast(y); cblas_zgemv(CblasColMajor,Tr,m,n,palpha,pA,m,px,incx,pbeta,py,incy); +#elif ITENSOR_USE_CUDA + cublasOperation_t tr = trans ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasHandle_t handle; + cublasCreate(&handle); + LAPACK_COMPLEX *d_A, *d_x, *d_y; + cudaMalloc(&d_A, m * n * sizeof(LAPACK_COMPLEX)); + cudaMalloc(&d_x, m * sizeof(LAPACK_COMPLEX)); + cudaMalloc(&d_y, m * sizeof(LAPACK_COMPLEX)); + cublasSetMatrix(m, n, sizeof(LAPACK_COMPLEX), A, m, d_A, m); + cublasSetVector(m, sizeof(LAPACK_COMPLEX), x, incx, d_x, incx); + cublasSetVector(m, sizeof(LAPACK_COMPLEX), y, incy, d_y, incy); + cublasZgemv(handle, tr, m, n, &alpha, d_A, m, d_x, incx, &beta, d_y, incy) + cublasGetVector(m, sizeof(LAPACK_COMPLEX), d_y, incy, y, incy); + cudaFree(d_A); + cudaFree(d_x); + cudaFree(d_y); + cublasDestroy(handle); #else //platform other than openblas #ifdef ITENSOR_USE_CBLAS auto Tr = trans ? CblasTrans : CblasNoTrans; - auto palpha = reinterpret_cast(&alpha); - auto pbeta = reinterpret_cast(&beta); + auto palpha = reinterpret_cast(&alpha); + auto pbeta = reinterpret_cast(&beta); cblas_zgemv(CblasColMajor,Tr,m,n,palpha,(void*)A,m,(void*)x,incx,pbeta,(void*)y,incy); #else char Tr = trans ? 
'T' : 'N'; - auto ncA = const_cast(A); - auto ncx = const_cast(x); - auto pA = reinterpret_cast(ncA); - auto px = reinterpret_cast(ncx); - auto py = reinterpret_cast(y); - auto palpha = reinterpret_cast(&alpha); - auto pbeta = reinterpret_cast(&beta); + auto ncA = const_cast(A); + auto ncx = const_cast(x); + auto pA = reinterpret_cast(ncA); + auto px = reinterpret_cast(ncx); + auto py = reinterpret_cast(y); + auto palpha = reinterpret_cast(&alpha); + auto pbeta = reinterpret_cast(&beta); F77NAME(zgemv)(&Tr,&m,&n,palpha,pA,&m,px,&incx,pbeta,py,&incy); #endif #endif @@ -319,7 +461,7 @@ gemv_wrapper(bool trans, // // dsyev // -void +void dsyev_wrapper(char jobz, //if jobz=='V', compute eigs and evecs char uplo, //if uplo=='U', read from upper triangle of A LAPACK_INT n, //number of cols of A @@ -349,7 +491,7 @@ dsyev_wrapper(char jobz, //if jobz=='V', compute eigs and evecs // // dscal // -void +void dscal_wrapper(LAPACK_INT N, LAPACK_REAL alpha, LAPACK_REAL* data, @@ -357,12 +499,22 @@ dscal_wrapper(LAPACK_INT N, { #ifdef ITENSOR_USE_CBLAS cblas_dscal(N,alpha,data,inc); +#elif ITENSOR_USE_CUDA + cublasHandle_t handle; + cublasCreate(&handle); + LAPACK_REAL *d_x; + cudaMalloc(&d_x, N * sizeof(LAPACK_REAL)); + cublasSetVector(N, sizeof(LAPACK_REAL), data, inc, d_x, inc); + cublasDscal(handle, N, &alpha, d_x, inc) + cublasGetVector(N, sizeof(LAPACK_REAL), d_x, inc, data, inc); + cudaFree(d_x); + cublasDestroy(handle); #else F77NAME(dscal)(&N,&alpha,data,&inc); #endif } -void +void zgesdd_wrapper(char *jobz, //char* specifying how much of U, V to compute //choosing *jobz=='S' computes min(m,n) cols of U, V LAPACK_INT *m, //number of rows of input matrix *A @@ -395,7 +547,7 @@ zgesdd_wrapper(char *jobz, //char* specifying how much of U, V to comp - void + void dgesdd_wrapper(char* jobz, //char* specifying how much of U, V to compute //choosing *jobz=='S' computes min(m,n) cols of U, V LAPACK_INT *m, //number of rows of input matrix *A @@ -423,7 +575,7 @@ dgesdd_wrapper(char* jobz, //char* specifying how much of U, V to comp - void + void zgesvd_wrapper(char *jobz, //char* specifying how much of U, V to compute //choosing *jobz=='S' computes min(m,n) cols of U, V LAPACK_INT *m, //number of rows of input matrix *A @@ -456,7 +608,7 @@ zgesvd_wrapper(char *jobz, //char* specifying how much of U, V to comp - void + void dgesvd_wrapper(char* jobz, //char* specifying how much of U, V to compute //choosing *jobz=='S' computes min(m,n) cols of U, V LAPACK_INT *m, //number of rows of input matrix *A @@ -469,7 +621,7 @@ dgesvd_wrapper(char* jobz, //char* specifying how much of U, V to comp { std::vector work; // std::vector superb; - + std::vector iwork; LAPACK_INT l = std::min(*m,*n), g = std::max(*m,*n); @@ -490,7 +642,7 @@ dgesvd_wrapper(char* jobz, //char* specifying how much of U, V to comp // // QR factorization of a real matrix A // -void +void dgeqrf_wrapper(LAPACK_INT* m, //number of rows of A LAPACK_INT* n, //number of cols of A LAPACK_REAL* A, //matrix A @@ -503,7 +655,7 @@ dgeqrf_wrapper(LAPACK_INT* m, //number of rows of A static const LAPACK_INT one = 1; std::vector work; LAPACK_INT lwork = std::max(one,4*std::max(*n,*m)); - work.resize(lwork+2); + work.resize(lwork+2); F77NAME(dgeqrf)(m,n,A,lda,tau,work.data(),&lwork,info); } @@ -512,7 +664,7 @@ dgeqrf_wrapper(LAPACK_INT* m, //number of rows of A // // Generates Q from output of QR factorization routine dgeqrf (see above) // -void +void dorgqr_wrapper(LAPACK_INT* m, //number of rows of A LAPACK_INT* n, //number of cols of A LAPACK_INT* k, 
//number of elementary reflectors, typically min(m,n) @@ -525,7 +677,7 @@ dorgqr_wrapper(LAPACK_INT* m, //number of rows of A static const LAPACK_INT one = 1; std::vector work; auto lwork = std::max(one,4*std::max(*n,*m)); - work.resize(lwork+2); + work.resize(lwork+2); F77NAME(dorgqr)(m,n,k,A,lda,tau,work.data(),&lwork,info); } @@ -535,7 +687,7 @@ dorgqr_wrapper(LAPACK_INT* m, //number of rows of A // // QR factorization of a complex matrix A // -void +void zgeqrf_wrapper(LAPACK_INT* m, //number of rows of A LAPACK_INT* n, //number of cols of A Cplx* A, //matrix A @@ -559,7 +711,7 @@ zgeqrf_wrapper(LAPACK_INT* m, //number of rows of A // // Generates Q from output of QR factorization routine zgeqrf (see above) // -void +void zungqr_wrapper(LAPACK_INT* m, //number of rows of A LAPACK_INT* n, //number of cols of A LAPACK_INT* k, //number of elementary reflectors, typically min(m,n) @@ -675,7 +827,7 @@ zlange_wrapper(char norm, // // Eigenvalues and eigenvectors of complex Hermitian matrix A // -LAPACK_INT +LAPACK_INT zheev_wrapper(LAPACK_INT N, //number of cols of A Cplx * A, //matrix A, on return contains eigenvectors LAPACK_REAL * d) //eigenvalues on return @@ -713,7 +865,7 @@ zheev_wrapper(LAPACK_INT N, //number of cols of A // A and B must be symmetric // B must be positive definite // -void +void dsygv_wrapper(char* jobz, //if 'V', compute both eigs and evecs //if 'N', only eigenvalues char* uplo, //if 'U', use upper triangle of A @@ -743,7 +895,7 @@ dsygv_wrapper(char* jobz, //if 'V', compute both eigs and evecs // Eigenvalues and eigenvectors of real, square matrix A // A can be a general real matrix, not assumed symmetric // -LAPACK_INT +LAPACK_INT dgeev_wrapper(char jobvl, //if 'V', compute left eigenvectors, else 'N' char jobvr, //if 'V', compute right eigenvectors, else 'N' LAPACK_INT n, //number of rows/cols of A @@ -758,7 +910,7 @@ dgeev_wrapper(char jobvl, //if 'V', compute left eigenvectors, else 'N' cpA.resize(n*n); std::copy(A,A+n*n,cpA.data()); - + LAPACK_INT nevecl = (jobvl == 'V' ? n : 1); LAPACK_INT nevecr = (jobvr == 'V' ? 
n : 1); LAPACK_INT info = 0; @@ -802,7 +954,7 @@ dgeev_wrapper(char jobvl, //if 'V', compute left eigenvectors, else 'N' // Eigenvalues and eigenvectors of complex, square matrix A // A can be a general complex matrix, not assumed symmetric // -LAPACK_INT +LAPACK_INT zgeev_wrapper(char jobvl, //if 'V', compute left eigenvectors, else 'N' char jobvr, //if 'V', compute right eigenvectors, else 'N' LAPACK_INT n, //number of rows/cols of A @@ -841,4 +993,3 @@ zgeev_wrapper(char jobvl, //if 'V', compute left eigenvectors, else 'N' } } //namespace itensor - From e421a33a417c2df90eae8f6d4d7bf5a0d65f7143 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Tue, 19 Apr 2022 14:57:05 +0200 Subject: [PATCH 03/68] added cuda in options.mk --- options.mk.sample | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/options.mk.sample b/options.mk.sample index 3412c8961..fe92f215b 100644 --- a/options.mk.sample +++ b/options.mk.sample @@ -9,8 +9,11 @@ ## GNU GCC compiler CCCOM=g++ -m64 -std=c++17 -fconcepts -fPIC +## CUDA compiler +#CCCOM=nvcc -m64 -std=c++17 -fconcepts -fPIC + ## Clang compiler (good to use on macOS) -## Note: try this if you have trouble compiling +## Note: try this if you have trouble compiling ## with g++ on macOS Mojave 10.14 #CCCOM=clang++ -std=c++17 -fPIC -Wno-gcc-compat @@ -26,23 +29,23 @@ CCCOM=g++ -m64 -std=c++17 -fconcepts -fPIC ######### ## [2] -## +## ## BLAS/LAPACK Related Options -## +## ## o The variable PLATFORM can be set to macos,lapack,mkl, or acml. ## This tells ITensor which style of BLAS/Lapack library to expect, ## and turns various lines of code on or off inside the files ## itensor/tensor/lapack_wrap.h and lapack_wrap.cc. -## -## o BLAS_LAPACK_LIBFLAGS specifies blas or lapack related -## flags used during the linking step. For example, +## +## o BLAS_LAPACK_LIBFLAGS specifies blas or lapack related +## flags used during the linking step. For example, ## flags of the type: ## -L/path/to/lapack/lib -llapack -lblas ## though the names of the static libraries (the files referred ## to by the -l flags) can be highly variable - see examples below. ## -## o BLAS_LAPACK_INCLUDEFLAGS are blas or lapack related flags -## needed during compilation. It may include flags such as +## o BLAS_LAPACK_INCLUDEFLAGS are blas or lapack related flags +## needed during compilation. It may include flags such as ## -I/path/to/lapack/include ## where "include" is a folder containing .h header files. ## @@ -60,7 +63,15 @@ BLAS_LAPACK_LIBFLAGS=-framework Accelerate ## #PLATFORM=lapack -#BLAS_LAPACK_LIBFLAGS=-lpthread -L/usr/lib -lblas -llapack +#BLAS_LAPACK_LIBFLAGS=-lpthread -L/usr/lib -lblas -llapack + +## +## Example using CUDA on GNU/LINUX systems +## (Path to lib/ folder may differ on your system) +## + +#PLATFORM=cuda +#BLAS_LAPACK_LIBFLAGS=-lpthread -L/usr/lib -lblas -llapack -lcublas ## ## Example using the Intel MKL library @@ -70,8 +81,8 @@ BLAS_LAPACK_LIBFLAGS=-framework Accelerate #PLATFORM=mkl ## MKL example - you may need to change the names of the folders below #BLAS_LAPACK_LIBFLAGS=-L/opt/intel/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_rt -lmkl_core -liomp5 -lpthread -## Example of using a sequential version of MKL. You may want to link to -## the sequential version of MKL if you are using ITensor's native multithreading, +## Example of using a sequential version of MKL. 
You may want to link to +## the sequential version of MKL if you are using ITensor's native multithreading, ## see section [4] for more details. #BLAS_LAPACK_LIBFLAGS=-L/opt/intel/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_rt -lmkl_core -lpthread #BLAS_LAPACK_INCLUDEFLAGS=-I/opt/intel/mkl/include @@ -100,12 +111,12 @@ BLAS_LAPACK_LIBFLAGS=-framework Accelerate ## If you want ITensor to be compiled with HDF5 support ## uncomment the line below, and edit it to be the ## location in which the HDF5 libraries and headers are -## installed. +## installed. ## ## To determine the prefix on your system, use the command: ## h5cc -show ## -## For more help, visit: +## For more help, visit: ## http://itensor.org/docs.cgi?vers=cppv3&page=install ## #HDF5_PREFIX=/usr/local @@ -120,7 +131,7 @@ BLAS_LAPACK_LIBFLAGS=-framework Accelerate ## conservation may benefit. ## ## If you want to enable OpenMP multithreading in ITensor, -## uncomment the line below. You must have OpenMP installed +## uncomment the line below. You must have OpenMP installed ## on your system. ## ## You should set the environment variable OMP_NUM_THREADS @@ -144,7 +155,7 @@ BLAS_LAPACK_LIBFLAGS=-framework Accelerate ## ## export OPENBLAS_NUM_THREADS=1 ## -## For more help, visit: +## For more help, visit: ## http://itensor.org/docs.cgi?vers=cppv3&page=install ## #ITENSOR_USE_OMP=1 From 40994fbf3710143662b070680930aa91464168dd Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Tue, 19 Apr 2022 15:17:33 +0200 Subject: [PATCH 04/68] fixed typos and compilation options --- itensor/tensor/lapack_wrap.cc | 22 +++++++++++----------- options.mk.sample | 6 +++--- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 09060cd50..46113bec4 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -32,7 +32,7 @@ daxpy_wrapper(LAPACK_INT n, //number of elements of X,Y { #ifdef ITENSOR_USE_CBLAS cblas_daxpy(n,alpha,X,incx,Y,incy); -#elif ITENSOR_USE_CUDA +#elif defined ITENSOR_USE_CUDA cublasHandle_t handle; cublasCreate(&handle); LAPACK_REAL *d_X, *d_Y; @@ -61,7 +61,7 @@ dnrm2_wrapper(LAPACK_INT N, { #ifdef ITENSOR_USE_CBLAS return cblas_dnrm2(N,X,incx); -#elif ITENSOR_USE_CUDA +#elif defined ITENSOR_USE_CUDA cublasHandle_t handle; cublasCreate(&handle); LAPACK_REAL *d_X; @@ -91,7 +91,7 @@ ddot_wrapper(LAPACK_INT N, { #ifdef ITENSOR_USE_CBLAS return cblas_ddot(N,X,incx,Y,incy); -#elif ITENSOR_USE_CUDA +#elif defined ITENSOR_USE_CUDA cublasHandle_t handle; cublasCreate(&handle); LAPACK_REAL *d_X, *d_Y; @@ -136,7 +136,7 @@ zdotc_wrapper(LAPACK_INT N, #endif cblas_zdotc_sub(N,pX,incx,pY,incy,pres); return res; -#elif ITENSOR_USE_CUDA +#elif defined ITENSOR_USE_CUDA cublasHandle_t handle; cublasCreate(&handle); LAPACK_COMPLEX res; @@ -193,7 +193,7 @@ gemm_wrapper(bool transa, ldb = n; } cblas_dgemm(CblasColMajor,at,bt,m,n,k,alpha,A,lda,B,ldb,beta,C,m); -#elif ITENSOR_USE_CUDA +#elif defined ITENSOR_USE_CUDA cublasOperation_t at = CUBLAS_OP_N; cublasOperation_t bt = CUBLAS_OP_N; if(transa) @@ -215,7 +215,7 @@ gemm_wrapper(bool transa, cublasSetMatrix(m, k, sizeof(LAPACK_REAL), A, lda, d_A, lda); cublasSetMatrix(k, n, sizeof(LAPACK_REAL), B, ldb, d_B, ldb); cublasSetMatrix(m, n, sizeof(LAPACK_REAL), C, m, d_C, m); - cublasDgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m) + cublasDgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m); cublasGetMatrix(m, n, sizeof(LAPACK_REAL), d_C, m, C, m); 
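 // A minimal error-handling sketch, not part of this patch (it assumes only
 // standard cuBLAS/CUDA APIs): every cuBLAS call above returns a
 // cublasStatus_t that these wrappers currently discard, so a failed
 // multiply would go unnoticed and C would silently hold stale data.
 // Capturing the status of the gemm call could look like
 //
 //     cublasStatus_t st = cublasDgemm(handle, at, bt, m, n, k, &alpha,
 //                                     d_A, lda, d_B, ldb, &beta, d_C, m);
 //     if(st != CUBLAS_STATUS_SUCCESS)
 //         throw std::runtime_error("cublasDgemm failed"); //needs <stdexcept>
 //
 // cudaMalloc likewise returns a cudaError_t that could be checked the same way.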
cudaFree(d_A); cudaFree(d_B); @@ -284,7 +284,7 @@ gemm_wrapper(bool transa, auto* pB = reinterpret_cast(B); auto* pC = reinterpret_cast(C); cblas_zgemm(CblasColMajor,at,bt,m,n,k,palpha,pA,lda,pB,ldb,pbeta,pC,m); -#elif ITENSOR_USE_CUDA +#elif defined ITENSOR_USE_CUDA cublasOperation_t at = CUBLAS_OP_N; cublasOperation_t bt = CUBLAS_OP_N; if(transa) @@ -306,7 +306,7 @@ gemm_wrapper(bool transa, cublasSetMatrix(m, k, sizeof(LAPACK_COMPLEX), A, lda, d_A, lda); cublasSetMatrix(k, n, sizeof(LAPACK_COMPLEX), B, ldb, d_B, ldb); cublasSetMatrix(m, n, sizeof(LAPACK_COMPLEX), C, m, d_C, m); - cublasZgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m) + cublasZgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m); cublasGetMatrix(m, n, sizeof(LAPACK_COMPLEX), d_C, m, C, m); cudaFree(d_A); cudaFree(d_B); @@ -369,7 +369,7 @@ gemv_wrapper(bool trans, #ifdef ITENSOR_USE_CBLAS auto Tr = trans ? CblasTrans : CblasNoTrans; cblas_dgemv(CblasColMajor,Tr,m,n,alpha,A,m,x,incx,beta,y,incy); -#elif ITENSOR_USE_CUDA +#elif defined ITENSOR_USE_CUDA cublasOperation_t tr = trans ? CUBLAS_OP_T : CUBLAS_OP_N; cublasHandle_t handle; cublasCreate(&handle); @@ -420,7 +420,7 @@ gemv_wrapper(bool trans, auto* px = reinterpret_cast(x); auto* py = reinterpret_cast(y); cblas_zgemv(CblasColMajor,Tr,m,n,palpha,pA,m,px,incx,pbeta,py,incy); -#elif ITENSOR_USE_CUDA +#elif defined ITENSOR_USE_CUDA cublasOperation_t tr = trans ? CUBLAS_OP_T : CUBLAS_OP_N; cublasHandle_t handle; cublasCreate(&handle); @@ -499,7 +499,7 @@ dscal_wrapper(LAPACK_INT N, { #ifdef ITENSOR_USE_CBLAS cblas_dscal(N,alpha,data,inc); -#elif ITENSOR_USE_CUDA +#elif defined ITENSOR_USE_CUDA cublasHandle_t handle; cublasCreate(&handle); LAPACK_REAL *d_x; diff --git a/options.mk.sample b/options.mk.sample index fe92f215b..992bf82bd 100644 --- a/options.mk.sample +++ b/options.mk.sample @@ -10,7 +10,7 @@ CCCOM=g++ -m64 -std=c++17 -fconcepts -fPIC ## CUDA compiler -#CCCOM=nvcc -m64 -std=c++17 -fconcepts -fPIC +#CCCOM=nvcc -m64 -std=c++17 ## Clang compiler (good to use on macOS) ## Note: try this if you have trouble compiling @@ -168,10 +168,10 @@ BLAS_LAPACK_LIBFLAGS=-framework Accelerate ## used to compile optimized and debug code, you can do so here. 
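## A minimal sketch, assuming CCCOM=nvcc (not part of the original sample):
## the -Wall and -Wno-unknown-pragmas flags removed below are host-compiler
## options that nvcc rejects when passed directly; they can instead be
## forwarded to the underlying host compiler through nvcc's -Xcompiler
## pass-through flag, e.g.
#OPTIMIZATIONS=-O2 -DNDEBUG -Xcompiler -Wall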
## Flags to give the compiler for "release mode" -OPTIMIZATIONS=-O2 -DNDEBUG -Wall -Wno-unknown-pragmas +OPTIMIZATIONS=-O2 -DNDEBUG ## Flags to give the compiler for "debug mode" -DEBUGFLAGS=-DDEBUG -g -Wall -Wno-unknown-pragmas -pedantic +DEBUGFLAGS=-DDEBUG -g -pedantic # ## Set this to 1 if you want ITensor to also build dynamic libraries ## These can be faster to link and give smaller binary sizes From 8d2b7452bf7f06143c236bf929b234619235d12b Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Tue, 19 Apr 2022 15:27:10 +0200 Subject: [PATCH 05/68] fixed type cast --- itensor/tensor/lapack_wrap.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 46113bec4..14ad0748f 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -149,7 +149,7 @@ zdotc_wrapper(LAPACK_INT N, cudaFree(d_X); cudaFree(d_Y); cublasDestroy(handle); - return res; + return (LAPACK_COMPLEX) res; #else auto ncX = const_cast(X); auto ncY = const_cast(Y); @@ -306,7 +306,7 @@ gemm_wrapper(bool transa, cublasSetMatrix(m, k, sizeof(LAPACK_COMPLEX), A, lda, d_A, lda); cublasSetMatrix(k, n, sizeof(LAPACK_COMPLEX), B, ldb, d_B, ldb); cublasSetMatrix(m, n, sizeof(LAPACK_COMPLEX), C, m, d_C, m); - cublasZgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m); + cublasZgemm(handle, at, bt, m, n, k, (LAPACK_COMPLEX*) &alpha, d_A, lda, d_B, ldb, (LAPACK_COMPLEX*) &beta, d_C, m); cublasGetMatrix(m, n, sizeof(LAPACK_COMPLEX), d_C, m, C, m); cudaFree(d_A); cudaFree(d_B); @@ -380,7 +380,7 @@ gemv_wrapper(bool trans, cublasSetMatrix(m, n, sizeof(LAPACK_REAL), A, m, d_A, m); cublasSetVector(m, sizeof(LAPACK_REAL), x, incx, d_x, incx); cublasSetVector(m, sizeof(LAPACK_REAL), y, incy, d_y, incy); - cublasDgemv(handle, tr, m, n, &alpha, d_A, m, d_x, incx, &beta, d_y, incy) + cublasDgemv(handle, tr, m, n, &alpha, d_A, m, d_x, incx, &beta, d_y, incy); cublasGetVector(m, sizeof(LAPACK_REAL), d_y, incy, y, incy); cudaFree(d_A); cudaFree(d_x); From 3596a6d0995ddd9aa779711bc05c1e90f1943ae0 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Tue, 19 Apr 2022 15:46:39 +0200 Subject: [PATCH 06/68] fixed types --- itensor/tensor/lapack_wrap.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 14ad0748f..3148c608b 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -149,7 +149,7 @@ zdotc_wrapper(LAPACK_INT N, cudaFree(d_X); cudaFree(d_Y); cublasDestroy(handle); - return (LAPACK_COMPLEX) res; + return (Cplx) res; #else auto ncX = const_cast(X); auto ncY = const_cast(Y); @@ -431,7 +431,7 @@ gemv_wrapper(bool trans, cublasSetMatrix(m, n, sizeof(LAPACK_COMPLEX), A, m, d_A, m); cublasSetVector(m, sizeof(LAPACK_COMPLEX), x, incx, d_x, incx); cublasSetVector(m, sizeof(LAPACK_COMPLEX), y, incy, d_y, incy); - cublasZgemv(handle, tr, m, n, &alpha, d_A, m, d_x, incx, &beta, d_y, incy) + cublasZgemv(handle, tr, m, n, (LAPACK_COMPLEX*) &alpha, d_A, m, d_x, incx, (LAPACK_COMPLEX*) &beta, d_y, incy) cublasGetVector(m, sizeof(LAPACK_COMPLEX), d_y, incy, y, incy); cudaFree(d_A); cudaFree(d_x); @@ -505,7 +505,7 @@ dscal_wrapper(LAPACK_INT N, LAPACK_REAL *d_x; cudaMalloc(&d_x, N * sizeof(LAPACK_REAL)); cublasSetVector(N, sizeof(LAPACK_REAL), data, inc, d_x, inc); - cublasDscal(handle, N, &alpha, d_x, inc) + cublasDscal(handle, N, &alpha, d_x, inc); cublasGetVector(N, sizeof(LAPACK_REAL), d_x, inc, data, inc); 
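 // A minimal sketch, not part of this patch: creating and destroying a
 // cublasHandle_t inside every wrapper call is costly, and the cuBLAS
 // documentation recommends reusing a handle across calls. One hedged
 // alternative (assuming these wrappers are only called from a single host
 // thread) is a function-local static initialized on first use:
 //
 //     static cublasHandle_t handle = [](){ cublasHandle_t h;
 //                                          cublasCreate(&h);
 //                                          return h; }();
 //
 // at the cost of never calling cublasDestroy for it.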
cudaFree(d_x);
 cublasDestroy(handle);

From 9753d7e91540008193ec366c97209e5d452d2c29 Mon Sep 17 00:00:00 2001
From: Mauro d'Arcangelo
Date: Tue, 19 Apr 2022 15:48:09 +0200
Subject: [PATCH 07/68] fixed types

---
 itensor/tensor/lapack_wrap.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc
index 3148c608b..e8a3d1e36 100644
--- a/itensor/tensor/lapack_wrap.cc
+++ b/itensor/tensor/lapack_wrap.cc
@@ -149,7 +149,7 @@ zdotc_wrapper(LAPACK_INT N,
 cudaFree(d_X);
 cudaFree(d_Y);
 cublasDestroy(handle);
- return (Cplx) res;
+ return Cplx(res.x, res.y);
 #else
 auto ncX = const_cast(X);
 auto ncY = const_cast(Y);
@@ -431,7 +431,7 @@ gemv_wrapper(
 cublasSetMatrix(m, n, sizeof(LAPACK_COMPLEX), A, m, d_A, m);
 cublasSetVector(m, sizeof(LAPACK_COMPLEX), x, incx, d_x, incx);
 cublasSetVector(m, sizeof(LAPACK_COMPLEX), y, incy, d_y, incy);
- cublasZgemv(handle, tr, m, n, (LAPACK_COMPLEX*) &alpha, d_A, m, d_x, incx, (LAPACK_COMPLEX*) &beta, d_y, incy)
+ cublasZgemv(handle, tr, m, n, (LAPACK_COMPLEX*) &alpha, d_A, m, d_x, incx, (LAPACK_COMPLEX*) &beta, d_y, incy);
 cublasGetVector(m, sizeof(LAPACK_COMPLEX), d_y, incy, y, incy);
 cudaFree(d_A);
 cudaFree(d_x);

From 84011c546ba640611e57f724d7d748c4a9ab11cd Mon Sep 17 00:00:00 2001
From: Mauro d'Arcangelo
Date: Tue, 19 Apr 2022 15:56:02 +0200
Subject: [PATCH 08/68] added fortran functions

---
 itensor/tensor/lapack_wrap.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/itensor/tensor/lapack_wrap.h b/itensor/tensor/lapack_wrap.h
index 0af8e3c49..7000c07c3 100644
--- a/itensor/tensor/lapack_wrap.h
+++ b/itensor/tensor/lapack_wrap.h
@@ -145,6 +145,7 @@ imagRef(LAPACK_COMPLEX & z)
 #elif defined PLATFORM_cuda

 #define ITENSOR_USE_CUDA
+#define LAPACK_REQUIRE_EXTERN

 #include
 #include
 #include
 #include

From c9535cd77c7a46aa8bb5ec54ef3aec0b029c5814 Mon Sep 17 00:00:00 2001
From: Mauro d'Arcangelo
Date: Thu, 28 Apr 2022 11:05:12 +0200
Subject: [PATCH 09/68] trying to fix already defined type error

---
 itensor/tensor/lapack_wrap.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/itensor/tensor/lapack_wrap.h b/itensor/tensor/lapack_wrap.h
index 7000c07c3..014cd1e57 100644
--- a/itensor/tensor/lapack_wrap.h
+++ b/itensor/tensor/lapack_wrap.h
@@ -145,7 +145,7 @@ imagRef(LAPACK_COMPLEX & z)
 #elif defined PLATFORM_cuda

 #define ITENSOR_USE_CUDA
-#define LAPACK_REQUIRE_EXTERN
+//#define LAPACK_REQUIRE_EXTERN

 #include
 #include
 #include
 #include

 namespace itensor {
-cudaDataType_t typeComplexData = CUDA_C_64F;
-cudaDataType_t typeRealData = CUDA_R_64F;
-cublasComputeType_t typeCompute = CUBLAS_COMPUTE_64F;
+//cudaDataType_t typeComplexData = CUDA_C_64F;
+//cudaDataType_t typeRealData = CUDA_R_64F;
+//cublasComputeType_t typeCompute = CUBLAS_COMPUTE_64F;

From c94c67dcffdf885611af11432549af594e925446 Mon Sep 17 00:00:00 2001
From: Mauro d'Arcangelo
Date: Thu, 28 Apr 2022 11:47:10 +0200
Subject: [PATCH 10/68] changed lapack in cuda platform

---
 itensor/tensor/lapack_wrap.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/itensor/tensor/lapack_wrap.h b/itensor/tensor/lapack_wrap.h
index 014cd1e57..9c1ecc18a 100644
--- a/itensor/tensor/lapack_wrap.h
+++ b/itensor/tensor/lapack_wrap.h
@@ -151,6 +151,8 @@ imagRef(LAPACK_COMPLEX & z)
 #include
 #include
 #include
+#include "lapacke.h"
+#undef I //lapacke.h includes complex.h which defined an `I` macro

 namespace itensor {
 //cudaDataType_t typeComplexData = 
CUDA_C_64F; From 16de1b7115f1dd840cabee80bc081cb8d9ce35fe Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Thu, 28 Apr 2022 11:51:49 +0200 Subject: [PATCH 11/68] reverted lapack in cuda platform --- itensor/tensor/lapack_wrap.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/itensor/tensor/lapack_wrap.h b/itensor/tensor/lapack_wrap.h index 9c1ecc18a..ddab8c504 100644 --- a/itensor/tensor/lapack_wrap.h +++ b/itensor/tensor/lapack_wrap.h @@ -145,14 +145,12 @@ imagRef(LAPACK_COMPLEX & z) #elif defined PLATFORM_cuda #define ITENSOR_USE_CUDA -//#define LAPACK_REQUIRE_EXTERN +#define LAPACK_REQUIRE_EXTERN #include #include #include #include -#include "lapacke.h" -#undef I //lapacke.h includes complex.h which defined an `I` macro namespace itensor { //cudaDataType_t typeComplexData = CUDA_C_64F; From 33f23b3a14ec0bb730ec4e94cf90b0ffe216a1c3 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Thu, 28 Apr 2022 12:02:55 +0200 Subject: [PATCH 12/68] set cuda default in options --- options.mk.sample | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/options.mk.sample b/options.mk.sample index 992bf82bd..280a8f0e2 100644 --- a/options.mk.sample +++ b/options.mk.sample @@ -7,10 +7,10 @@ ## ## Set which compiler to use by defining CCCOM: ## GNU GCC compiler -CCCOM=g++ -m64 -std=c++17 -fconcepts -fPIC +#CCCOM=g++ -m64 -std=c++17 -fconcepts -fPIC ## CUDA compiler -#CCCOM=nvcc -m64 -std=c++17 +CCCOM=nvcc -m64 -std=c++17 ## Clang compiler (good to use on macOS) ## Note: try this if you have trouble compiling @@ -54,8 +54,8 @@ CCCOM=g++ -m64 -std=c++17 -fconcepts -fPIC ## Mac OSX system ## -PLATFORM=macos -BLAS_LAPACK_LIBFLAGS=-framework Accelerate +#PLATFORM=macos +#BLAS_LAPACK_LIBFLAGS=-framework Accelerate ## ## Example using a C interface to LAPACK on GNU/LINUX systems @@ -70,8 +70,8 @@ BLAS_LAPACK_LIBFLAGS=-framework Accelerate ## (Path to lib/ folder may differ on your system) ## -#PLATFORM=cuda -#BLAS_LAPACK_LIBFLAGS=-lpthread -L/usr/lib -lblas -llapack -lcublas +PLATFORM=cuda +BLAS_LAPACK_LIBFLAGS=-lpthread -L/usr/lib -lblas -llapack -lcublas ## ## Example using the Intel MKL library @@ -171,7 +171,7 @@ BLAS_LAPACK_LIBFLAGS=-framework Accelerate OPTIMIZATIONS=-O2 -DNDEBUG ## Flags to give the compiler for "debug mode" -DEBUGFLAGS=-DDEBUG -g -pedantic +DEBUGFLAGS=-DDEBUG -g # ## Set this to 1 if you want ITensor to also build dynamic libraries ## These can be faster to link and give smaller binary sizes @@ -217,7 +217,7 @@ ITENSOR_LIBFLAGS += -fopenmp ITENSOR_LIBGFLAGS += -fopenmp endif -CCFLAGS=-I. $(ITENSOR_INCLUDEFLAGS) $(OPTIMIZATIONS) -Wno-unused-variable +CCFLAGS=-I. $(ITENSOR_INCLUDEFLAGS) $(OPTIMIZATIONS) CCGFLAGS=-I. 
$(ITENSOR_INCLUDEFLAGS) $(DEBUGFLAGS) LIBFLAGS=-L'$(ITENSOR_LIBDIR)' $(ITENSOR_LIBFLAGS) LIBGFLAGS=-L'$(ITENSOR_LIBDIR)' $(ITENSOR_LIBGFLAGS) From 3fcc47e34228ce2bd969324fdc30c2ffe10419cd Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 29 Apr 2022 16:46:35 +0200 Subject: [PATCH 13/68] added nrm2 debug print --- itensor/tensor/lapack_wrap.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index e8a3d1e36..fd20dc57a 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -68,7 +68,13 @@ dnrm2_wrapper(LAPACK_INT N, cudaMalloc(&d_X, N * sizeof(LAPACK_REAL)); cublasSetVector(N, sizeof(LAPACK_REAL), X, incx, d_X, incx); LAPACK_REAL result; - cublasDnrm2(handle, N, d_X, incx, &result); + cublasDnrm2(handle, N, d_X, incxr &result); + std::cout << result << " is the norm of:" << std:endl; + for(auto v : X) + { + std::cout << v << " "; + } + std::cout << std::endl << std::endl; cudaFree(d_X); cublasDestroy(handle); return result; From 89b729e77c9d5d1bdef2e37aa8e435300bc6ba67 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 29 Apr 2022 16:50:09 +0200 Subject: [PATCH 14/68] fixed typo in nrm2 --- itensor/tensor/lapack_wrap.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index fd20dc57a..d1d6686e1 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -68,7 +68,7 @@ dnrm2_wrapper(LAPACK_INT N, cudaMalloc(&d_X, N * sizeof(LAPACK_REAL)); cublasSetVector(N, sizeof(LAPACK_REAL), X, incx, d_X, incx); LAPACK_REAL result; - cublasDnrm2(handle, N, d_X, incxr &result); + cublasDnrm2(handle, N, d_X, incx, &result); std::cout << result << " is the norm of:" << std:endl; for(auto v : X) { From a4e089a437769b1642652157588315dd0a79de77 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 29 Apr 2022 16:50:56 +0200 Subject: [PATCH 15/68] fixed typo in std::endl --- itensor/tensor/lapack_wrap.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index d1d6686e1..96230c997 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -69,7 +69,7 @@ dnrm2_wrapper(LAPACK_INT N, cublasSetVector(N, sizeof(LAPACK_REAL), X, incx, d_X, incx); LAPACK_REAL result; cublasDnrm2(handle, N, d_X, incx, &result); - std::cout << result << " is the norm of:" << std:endl; + std::cout << result << " is the norm of:" << std::endl; for(auto v : X) { std::cout << v << " "; From 8a395da2d97c2ed9a83e2962c5783c8847f9f50e Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 29 Apr 2022 16:53:04 +0200 Subject: [PATCH 16/68] fixed vector printing --- itensor/tensor/lapack_wrap.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 96230c997..7ff396944 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -70,9 +70,9 @@ dnrm2_wrapper(LAPACK_INT N, LAPACK_REAL result; cublasDnrm2(handle, N, d_X, incx, &result); std::cout << result << " is the norm of:" << std::endl; - for(auto v : X) + for(int i=0; i Date: Fri, 29 Apr 2022 17:09:45 +0200 Subject: [PATCH 17/68] added debug in gemv --- itensor/tensor/lapack_wrap.cc | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 7ff396944..bfaea6d4c 100644 --- 
a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -391,6 +391,26 @@ gemv_wrapper(bool trans, cudaFree(d_A); cudaFree(d_x); cudaFree(d_y); + std::cout << "Matrix A:" << std::endl; + for(int i=0; i Date: Fri, 29 Apr 2022 17:20:14 +0200 Subject: [PATCH 18/68] added zgemv debug --- itensor/tensor/lapack_wrap.cc | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index bfaea6d4c..b3b213296 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -462,6 +462,25 @@ gemv_wrapper(bool trans, cudaFree(d_A); cudaFree(d_x); cudaFree(d_y); + std::cout << "Matrix A:" << std::endl; + for(int i=0; i Date: Fri, 29 Apr 2022 17:45:44 +0200 Subject: [PATCH 19/68] added debug dgemm --- itensor/tensor/lapack_wrap.cc | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index b3b213296..820796334 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -227,6 +227,27 @@ gemm_wrapper(bool transa, cudaFree(d_B); cudaFree(d_C); cublasDestroy(handle); + std::cout << "Matrix A:" << std::endl; + for(int i=0; i(A); auto *pB = const_cast(B); From 40aabd50708ee39755be37bad59a296d3d746d55 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 29 Apr 2022 17:54:48 +0200 Subject: [PATCH 20/68] dont actually use gemm on gpu --- itensor/tensor/lapack_wrap.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 820796334..6ac6a7f40 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -248,6 +248,7 @@ gemm_wrapper(bool transa, if(!((i+1)%m)) std::cout << std::endl; } std::cout << std::endl; + F77NAME(dgemm)(&at,&bt,&m,&n,&k,&alpha,pA,&lda,pB,&ldb,&beta,C,&m); #else auto *pA = const_cast(A); auto *pB = const_cast(B); From 5de66a00bb67af5e37436a0b733a7a09878719cb Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 29 Apr 2022 17:57:33 +0200 Subject: [PATCH 21/68] dont actually use gemm on gpu - fixed typo --- itensor/tensor/lapack_wrap.cc | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 6ac6a7f40..2d9f65ab2 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -248,7 +248,37 @@ gemm_wrapper(bool transa, if(!((i+1)%m)) std::cout << std::endl; } std::cout << std::endl; - F77NAME(dgemm)(&at,&bt,&m,&n,&k,&alpha,pA,&lda,pB,&ldb,&beta,C,&m); + + // I just want to try to not use gpu multiplication + auto *pA = const_cast(A); + auto *pB = const_cast(B); + char at = 'N'; + char bt = 'N'; + if(transa) + { + at = 'T'; + lda = k; + } + if(transb) + { + bt = 'T'; + ldb = n; + } + auto *pA = const_cast(A); + auto *pB = const_cast(B); + char at2 = 'N'; + char bt2 = 'N'; + if(transa) + { + at2 = 'T'; + lda = k; + } + if(transb) + { + bt2 = 'T'; + ldb = n; + } + F77NAME(dgemm)(&at2,&bt2,&m,&n,&k,&alpha,pA,&lda,pB,&ldb,&beta,C,&m); #else auto *pA = const_cast(A); auto *pB = const_cast(B); From 39d5b67c02a06835d1338cded6a778132323a1b6 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 29 Apr 2022 17:59:01 +0200 Subject: [PATCH 22/68] dont actually use gemm on gpu - fixed typo 2 --- itensor/tensor/lapack_wrap.cc | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 2d9f65ab2..c176cf199 
100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -252,20 +252,6 @@ gemm_wrapper(bool transa, // I just want to try to not use gpu multiplication auto *pA = const_cast(A); auto *pB = const_cast(B); - char at = 'N'; - char bt = 'N'; - if(transa) - { - at = 'T'; - lda = k; - } - if(transb) - { - bt = 'T'; - ldb = n; - } - auto *pA = const_cast(A); - auto *pB = const_cast(B); char at2 = 'N'; char bt2 = 'N'; if(transa) From fa88178ebb0162dfd28e55e506d2c151d9526686 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 29 Apr 2022 18:46:17 +0200 Subject: [PATCH 23/68] dont actually use gemm on gpu - dont print --- itensor/tensor/lapack_wrap.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index c176cf199..5ea4184bf 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -200,6 +200,7 @@ gemm_wrapper(bool transa, } cblas_dgemm(CblasColMajor,at,bt,m,n,k,alpha,A,lda,B,ldb,beta,C,m); #elif defined ITENSOR_USE_CUDA +/* cublasOperation_t at = CUBLAS_OP_N; cublasOperation_t bt = CUBLAS_OP_N; if(transa) @@ -248,6 +249,7 @@ gemm_wrapper(bool transa, if(!((i+1)%m)) std::cout << std::endl; } std::cout << std::endl; +*/ // I just want to try to not use gpu multiplication auto *pA = const_cast(A); From 83c8d2b049d64501d8e0abed7f4d756260eb801f Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Mon, 2 May 2022 14:31:17 +0200 Subject: [PATCH 24/68] output A B C --- itensor/tensor/lapack_wrap.cc | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 5ea4184bf..fd92ca1d4 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -69,12 +69,12 @@ dnrm2_wrapper(LAPACK_INT N, cublasSetVector(N, sizeof(LAPACK_REAL), X, incx, d_X, incx); LAPACK_REAL result; cublasDnrm2(handle, N, d_X, incx, &result); - std::cout << result << " is the norm of:" << std::endl; - for(int i=0; i(A); auto *pB = const_cast(B); @@ -267,6 +267,7 @@ gemm_wrapper(bool transa, ldb = n; } F77NAME(dgemm)(&at2,&bt2,&m,&n,&k,&alpha,pA,&lda,pB,&ldb,&beta,C,&m); +*/ #else auto *pA = const_cast(A); auto *pB = const_cast(B); From 3d19cddfbb90aaaca53f14f4981e0a368fc2c03e Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Mon, 2 May 2022 14:43:35 +0200 Subject: [PATCH 25/68] output A B C and transa b --- itensor/tensor/lapack_wrap.cc | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index fd92ca1d4..544fa1964 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -223,19 +223,27 @@ gemm_wrapper(bool transa, cublasSetMatrix(k, n, sizeof(LAPACK_REAL), B, ldb, d_B, ldb); cublasSetMatrix(m, n, sizeof(LAPACK_REAL), C, m, d_C, m); cublasDgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m); + double* A_copy; + malloc(A_copy, m * k * sizeof(double)); + cublasGetMatrix(m, k, sizeof(LAPACK_REAL), d_A, m, A_copy, m); + std::cout << "difference" << std::endl; + for(int i=0; i Date: Mon, 2 May 2022 14:45:14 +0200 Subject: [PATCH 26/68] fixed malloc --- itensor/tensor/lapack_wrap.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 544fa1964..b1b2fc3e7 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -224,7 +224,7 @@ gemm_wrapper(bool transa, 
cublasSetMatrix(m, n, sizeof(LAPACK_REAL), C, m, d_C, m); cublasDgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m); double* A_copy; - malloc(A_copy, m * k * sizeof(double)); + A_copy = malloc(m * k * sizeof(double)); cublasGetMatrix(m, k, sizeof(LAPACK_REAL), d_A, m, A_copy, m); std::cout << "difference" << std::endl; for(int i=0; i Date: Mon, 2 May 2022 14:46:15 +0200 Subject: [PATCH 27/68] fixed malloc --- itensor/tensor/lapack_wrap.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index b1b2fc3e7..5adafa6f8 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -224,7 +224,7 @@ gemm_wrapper(bool transa, cublasSetMatrix(m, n, sizeof(LAPACK_REAL), C, m, d_C, m); cublasDgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m); double* A_copy; - A_copy = malloc(m * k * sizeof(double)); + A_copy = (*double)malloc(m * k * sizeof(double)); cublasGetMatrix(m, k, sizeof(LAPACK_REAL), d_A, m, A_copy, m); std::cout << "difference" << std::endl; for(int i=0; i Date: Mon, 2 May 2022 14:47:00 +0200 Subject: [PATCH 28/68] fixed malloc --- itensor/tensor/lapack_wrap.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 5adafa6f8..8f88602ed 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -224,7 +224,7 @@ gemm_wrapper(bool transa, cublasSetMatrix(m, n, sizeof(LAPACK_REAL), C, m, d_C, m); cublasDgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m); double* A_copy; - A_copy = (*double)malloc(m * k * sizeof(double)); + A_copy = (double*)malloc(m * k * sizeof(double)); cublasGetMatrix(m, k, sizeof(LAPACK_REAL), d_A, m, A_copy, m); std::cout << "difference" << std::endl; for(int i=0; i Date: Mon, 2 May 2022 15:00:53 +0200 Subject: [PATCH 29/68] trying cudamemcpy --- itensor/tensor/lapack_wrap.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 8f88602ed..43d128744 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -224,14 +224,16 @@ gemm_wrapper(bool transa, cublasSetMatrix(m, n, sizeof(LAPACK_REAL), C, m, d_C, m); cublasDgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m); double* A_copy; - A_copy = (double*)malloc(m * k * sizeof(double)); - cublasGetMatrix(m, k, sizeof(LAPACK_REAL), d_A, m, A_copy, m); + A_copy = (LAPACK_REAL*)malloc(m * k * sizeof(LAPACK_REAL)); + //cublasGetMatrix(m, k, sizeof(LAPACK_REAL), d_A, m, A_copy, m); + cudaMemcpy( A_copy, d_A, m * k * sizeof(LAPACK_REAL), cudaMemcpyDeviceToHost); std::cout << "difference" << std::endl; for(int i=0; i Date: Mon, 2 May 2022 15:09:54 +0200 Subject: [PATCH 30/68] trying cudamemcpy instead of setmatrix --- itensor/tensor/lapack_wrap.cc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 43d128744..43d3920b8 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -219,14 +219,17 @@ gemm_wrapper(bool transa, cudaMalloc(&d_A, m * k * sizeof(LAPACK_REAL)); cudaMalloc(&d_B, k * n * sizeof(LAPACK_REAL)); cudaMalloc(&d_C, m * n * sizeof(LAPACK_REAL)); - cublasSetMatrix(m, k, sizeof(LAPACK_REAL), A, lda, d_A, lda); - cublasSetMatrix(k, n, sizeof(LAPACK_REAL), B, ldb, d_B, ldb); - cublasSetMatrix(m, n, 
sizeof(LAPACK_REAL), C, m, d_C, m); + //cublasSetMatrix(m, k, sizeof(LAPACK_REAL), A, lda, d_A, lda); + cudaMemcpy(d_A, A, m * k * sizeof(LAPACK_REAL), cudaMemcpyHostToDevice); + //cublasSetMatrix(k, n, sizeof(LAPACK_REAL), B, ldb, d_B, ldb); + cudaMemcpy(d_B, B, k * n * sizeof(LAPACK_REAL), cudaMemcpyHostToDevice); + //cublasSetMatrix(m, n, sizeof(LAPACK_REAL), C, m, d_C, m); + cudaMemcpy(d_C, C, m * n * sizeof(LAPACK_REAL), cudaMemcpyHostToDevice); cublasDgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m); double* A_copy; A_copy = (LAPACK_REAL*)malloc(m * k * sizeof(LAPACK_REAL)); //cublasGetMatrix(m, k, sizeof(LAPACK_REAL), d_A, m, A_copy, m); - cudaMemcpy( A_copy, d_A, m * k * sizeof(LAPACK_REAL), cudaMemcpyDeviceToHost); + cudaMemcpy(A_copy, d_A, m * k * sizeof(LAPACK_REAL), cudaMemcpyDeviceToHost); std::cout << "difference" << std::endl; for(int i=0; i Date: Mon, 2 May 2022 15:16:25 +0200 Subject: [PATCH 31/68] fixed gemm? --- itensor/tensor/lapack_wrap.cc | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 43d3920b8..d0de1fef2 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -226,21 +226,13 @@ gemm_wrapper(bool transa, //cublasSetMatrix(m, n, sizeof(LAPACK_REAL), C, m, d_C, m); cudaMemcpy(d_C, C, m * n * sizeof(LAPACK_REAL), cudaMemcpyHostToDevice); cublasDgemm(handle, at, bt, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, m); - double* A_copy; - A_copy = (LAPACK_REAL*)malloc(m * k * sizeof(LAPACK_REAL)); - //cublasGetMatrix(m, k, sizeof(LAPACK_REAL), d_A, m, A_copy, m); - cudaMemcpy(A_copy, d_A, m * k * sizeof(LAPACK_REAL), cudaMemcpyDeviceToHost); - std::cout << "difference" << std::endl; - for(int i=0; i Date: Tue, 3 May 2022 10:55:57 +0200 Subject: [PATCH 32/68] changed every set get with memcpy --- itensor/tensor/lapack_wrap.cc | 136 ++++++---------------------------- 1 file changed, 22 insertions(+), 114 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index d0de1fef2..aeee8d749 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -38,10 +38,10 @@ daxpy_wrapper(LAPACK_INT n, //number of elements of X,Y LAPACK_REAL *d_X, *d_Y; cudaMalloc(&d_X, n * sizeof(LAPACK_REAL)); cudaMalloc(&d_Y, n * sizeof(LAPACK_REAL)); - cublasSetVector(n, sizeof(LAPACK_REAL), X, incx, d_X, incx); - cublasSetVector(n, sizeof(LAPACK_REAL), Y, incy, d_Y, incy); + cudaMemcpy(d_X, X, n * sizeof(LAPACK_REAL), cudaMemcpyHostToDevice); + cudaMemcpy(d_Y, Y, n * sizeof(LAPACK_REAL), cudaMemcpyHostToDevice); cublasDaxpy(handle, n, &alpha, d_X, incx, d_Y, incy); - cublasGetVector(n, sizeof(LAPACK_REAL), d_Y, incy, Y, incy); + cudaMemcpy(Y, d_Y, n * sizeof(LAPACK_REAL), cudaMemcpyDeviceToHost); cudaFree(d_X); cudaFree(d_Y); cublasDestroy(handle); @@ -66,15 +66,9 @@ dnrm2_wrapper(LAPACK_INT N, cublasCreate(&handle); LAPACK_REAL *d_X; cudaMalloc(&d_X, N * sizeof(LAPACK_REAL)); - cublasSetVector(N, sizeof(LAPACK_REAL), X, incx, d_X, incx); + cudaMemcpy(d_X, X, N * sizeof(LAPACK_REAL), cudaMemcpyHostToDevice); LAPACK_REAL result; cublasDnrm2(handle, N, d_X, incx, &result); - //std::cout << result << " is the norm of:" << std::endl; - //for(int i=0; i(A); - auto *pB = const_cast(B); - char at2 = 'N'; - char bt2 = 'N'; - if(transa) - { - at2 = 'T'; - lda = k; - } - if(transb) - { - bt2 = 'T'; - ldb = n; - } - F77NAME(dgemm)(&at2,&bt2,&m,&n,&k,&alpha,pA,&lda,pB,&ldb,&beta,C,&m); -*/ #else auto *pA = 
const_cast(A); auto *pB = const_cast(B); @@ -356,11 +303,11 @@ gemm_wrapper(bool transa, cudaMalloc(&d_A, m * k * sizeof(LAPACK_COMPLEX)); cudaMalloc(&d_B, k * n * sizeof(LAPACK_COMPLEX)); cudaMalloc(&d_C, m * n * sizeof(LAPACK_COMPLEX)); - cublasSetMatrix(m, k, sizeof(LAPACK_COMPLEX), A, lda, d_A, lda); - cublasSetMatrix(k, n, sizeof(LAPACK_COMPLEX), B, ldb, d_B, ldb); - cublasSetMatrix(m, n, sizeof(LAPACK_COMPLEX), C, m, d_C, m); + cudaMemcpy(d_A, A, m * k * sizeof(LAPACK_COMPLEX), cudaMemcpyHostToDevice); + cudaMemcpy(d_B, B, k * n * sizeof(LAPACK_COMPLEX), cudaMemcpyHostToDevice); + cudaMemcpy(d_C, C, m * n * sizeof(LAPACK_COMPLEX), cudaMemcpyHostToDevice); cublasZgemm(handle, at, bt, m, n, k, (LAPACK_COMPLEX*) &alpha, d_A, lda, d_B, ldb, (LAPACK_COMPLEX*) &beta, d_C, m); - cublasGetMatrix(m, n, sizeof(LAPACK_COMPLEX), d_C, m, C, m); + cudaMemcpy(C, d_C, m * n * sizeof(LAPACK_COMPLEX), cudaMemcpyDeviceToHost); cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); @@ -430,34 +377,14 @@ gemv_wrapper(bool trans, cudaMalloc(&d_A, m * n * sizeof(LAPACK_REAL)); cudaMalloc(&d_x, m * sizeof(LAPACK_REAL)); cudaMalloc(&d_y, m * sizeof(LAPACK_REAL)); - cublasSetMatrix(m, n, sizeof(LAPACK_REAL), A, m, d_A, m); - cublasSetVector(m, sizeof(LAPACK_REAL), x, incx, d_x, incx); - cublasSetVector(m, sizeof(LAPACK_REAL), y, incy, d_y, incy); + cudaMemcpy(d_A, A, m * n * sizeof(LAPACK_REAL), cudaMemcpyHostToDevice); + cudaMemcpy(d_x, x, m * sizeof(LAPACK_REAL), cudaMemcpyHostToDevice); + cudaMemcpy(d_y, y, m * sizeof(LAPACK_REAL), cudaMemcpyHostToDevice); cublasDgemv(handle, tr, m, n, &alpha, d_A, m, d_x, incx, &beta, d_y, incy); - cublasGetVector(m, sizeof(LAPACK_REAL), d_y, incy, y, incy); + cudaMemcpy(y, d_y, m * sizeof(LAPACK_REAL), cudaMemcpyDeviceToHost); cudaFree(d_A); cudaFree(d_x); cudaFree(d_y); - std::cout << "Matrix A:" << std::endl; - for(int i=0; i Date: Tue, 3 May 2022 12:46:39 +0200 Subject: [PATCH 33/68] trying static cublas handle --- itensor/tensor/lapack_wrap.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index aeee8d749..965bbdb8f 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -62,15 +62,15 @@ dnrm2_wrapper(LAPACK_INT N, #ifdef ITENSOR_USE_CBLAS return cblas_dnrm2(N,X,incx); #elif defined ITENSOR_USE_CUDA - cublasHandle_t handle; - cublasCreate(&handle); + static cublasHandle_t handle; + //cublasCreate(&handle); LAPACK_REAL *d_X; cudaMalloc(&d_X, N * sizeof(LAPACK_REAL)); cudaMemcpy(d_X, X, N * sizeof(LAPACK_REAL), cudaMemcpyHostToDevice); LAPACK_REAL result; cublasDnrm2(handle, N, d_X, incx, &result); cudaFree(d_X); - cublasDestroy(handle); + //cublasDestroy(handle); return result; #else auto *Xnc = const_cast(X); @@ -206,8 +206,8 @@ gemm_wrapper(bool transa, bt = CUBLAS_OP_T; ldb = n; } - cublasHandle_t handle; - cublasCreate(&handle); + static cublasHandle_t handle; + //cublasCreate(&handle); LAPACK_REAL *d_A, *d_B, *d_C; cudaMalloc(&d_A, m * k * sizeof(LAPACK_REAL)); cudaMalloc(&d_B, k * n * sizeof(LAPACK_REAL)); @@ -220,7 +220,7 @@ gemm_wrapper(bool transa, cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); - cublasDestroy(handle); + //cublasDestroy(handle); #else auto *pA = const_cast(A); auto *pB = const_cast(B); From ff6ff239bb7fe0500aad3e6309286c254c51b614 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Tue, 3 May 2022 12:57:57 +0200 Subject: [PATCH 34/68] reverting static handle --- itensor/tensor/lapack_wrap.cc | 12 ++++++------ 1 file changed, 6 
insertions(+), 6 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 965bbdb8f..aeee8d749 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -62,15 +62,15 @@ dnrm2_wrapper(LAPACK_INT N, #ifdef ITENSOR_USE_CBLAS return cblas_dnrm2(N,X,incx); #elif defined ITENSOR_USE_CUDA - static cublasHandle_t handle; - //cublasCreate(&handle); + cublasHandle_t handle; + cublasCreate(&handle); LAPACK_REAL *d_X; cudaMalloc(&d_X, N * sizeof(LAPACK_REAL)); cudaMemcpy(d_X, X, N * sizeof(LAPACK_REAL), cudaMemcpyHostToDevice); LAPACK_REAL result; cublasDnrm2(handle, N, d_X, incx, &result); cudaFree(d_X); - //cublasDestroy(handle); + cublasDestroy(handle); return result; #else auto *Xnc = const_cast(X); @@ -206,8 +206,8 @@ gemm_wrapper(bool transa, bt = CUBLAS_OP_T; ldb = n; } - static cublasHandle_t handle; - //cublasCreate(&handle); + cublasHandle_t handle; + cublasCreate(&handle); LAPACK_REAL *d_A, *d_B, *d_C; cudaMalloc(&d_A, m * k * sizeof(LAPACK_REAL)); cudaMalloc(&d_B, k * n * sizeof(LAPACK_REAL)); @@ -220,7 +220,7 @@ gemm_wrapper(bool transa, cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); - //cublasDestroy(handle); + cublasDestroy(handle); #else auto *pA = const_cast(A); auto *pB = const_cast(B); From 3ffaf1d56afe426ac50f97fbc6f65f90d1e0a12c Mon Sep 17 00:00:00 2001 From: Kemal Bidzhiev Date: Thu, 5 May 2022 13:43:28 +0200 Subject: [PATCH 35/68] comment to the option.mk --- options.mk.sample | 1 + 1 file changed, 1 insertion(+) diff --git a/options.mk.sample b/options.mk.sample index 280a8f0e2..5908a012c 100644 --- a/options.mk.sample +++ b/options.mk.sample @@ -10,6 +10,7 @@ #CCCOM=g++ -m64 -std=c++17 -fconcepts -fPIC ## CUDA compiler +## comment line CCCOM=nvcc -m64 -std=c++17 ## Clang compiler (good to use on macOS) From 6a688e7b6da44b6f903dca10d1c2a487f67c7a29 Mon Sep 17 00:00:00 2001 From: Kemal Bidzhiev Date: Thu, 5 May 2022 13:49:24 +0200 Subject: [PATCH 36/68] comment in cuda section is removed --- options.mk.sample | 1 - 1 file changed, 1 deletion(-) diff --git a/options.mk.sample b/options.mk.sample index 5908a012c..280a8f0e2 100644 --- a/options.mk.sample +++ b/options.mk.sample @@ -10,7 +10,6 @@ #CCCOM=g++ -m64 -std=c++17 -fconcepts -fPIC ## CUDA compiler -## comment line CCCOM=nvcc -m64 -std=c++17 ## Clang compiler (good to use on macOS) From 61bf459db68ca2cce28c3836def56475257a9413 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Thu, 5 May 2022 14:01:42 +0200 Subject: [PATCH 37/68] comments showing how to replace storage type --- itensor/util/vector_no_init.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/itensor/util/vector_no_init.h b/itensor/util/vector_no_init.h index 19200ff92..e1e8ea177 100644 --- a/itensor/util/vector_no_init.h +++ b/itensor/util/vector_no_init.h @@ -67,7 +67,12 @@ class uninitialized_allocator }; template -using vector_no_init = std::vector>; +//#ifdef PLATFORM_cuda + //using vector_no_init = thrust::vector>; +//#else + using vector_no_init = std::vector>; +//#endif + } //namespace itensor From 0a0dc51847bdbdf1a25871309190fd4624c52f01 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Tue, 17 May 2022 16:13:07 +0200 Subject: [PATCH 38/68] thrust allocation --- itensor/util/vector_no_init.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/itensor/util/vector_no_init.h b/itensor/util/vector_no_init.h index e1e8ea177..9c1fc5a07 100644 --- a/itensor/util/vector_no_init.h +++ b/itensor/util/vector_no_init.h @@ -17,6 +17,7 @@ #define 
__ITENSOR_VECTOR_NO_INIT_H #include +#include namespace itensor { @@ -64,14 +65,16 @@ class uninitialized_allocator bool operator!=(uninitialized_allocator const&) { return false; } + //where is destroy? + }; template -//#ifdef PLATFORM_cuda - //using vector_no_init = thrust::vector>; -//#else +#ifdef PLATFORM_cuda + using vector_no_init = thrust::device_vector; +#else using vector_no_init = std::vector>; -//#endif +#endif } //namespace itensor From ba6080142beed3419f84c868fa809efdde529438 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Tue, 17 May 2022 16:35:27 +0200 Subject: [PATCH 39/68] safe pointer to thrust --- itensor/util/safe_ptr.h | 58 ++++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/itensor/util/safe_ptr.h b/itensor/util/safe_ptr.h index 12eb319a4..7d273d0bc 100644 --- a/itensor/util/safe_ptr.h +++ b/itensor/util/safe_ptr.h @@ -18,6 +18,7 @@ #include "itensor/util/print.h" #include "itensor/util/stdx.h" +#include namespace itensor { @@ -44,19 +45,19 @@ class SafePtr SafePtr(T* pt, size_t offset_end) : p_(pt), offset_(0), offset_end_(offset_end) - { + { //if(!p_) throw std::runtime_error("SafePtr: pointer is null"); } SafePtr(T* pt, size_t offset, size_t offset_end) : p_(pt), offset_(offset), offset_end_(offset_end) - { + { if(!p_) throw std::runtime_error("SafePtr: pointer is null"); } //Allow automatic conversion SafePtr -> SafePtr SafePtr(SafePtr const& P) - { + { operator=(P); } SafePtr& @@ -75,29 +76,29 @@ class SafePtr bool validOffset() const { return (offset_ < offset_end_); } size_t - range() const - { + range() const + { if(validOffset()) return (offset_end_-offset_); return 0ul; } pointer - get() const - { + get() const + { if(!p_) return nullptr; - return (p_+offset_); + return (p_+offset_); } pointer safeGet(size_t expected_range) { - if(!p_) + if(!p_) { throw std::runtime_error("SafePtr: dereferencing null pointer"); } if(!validOffset()) { - auto error_msg = + auto error_msg = format("SafePtr: offset >= offset_end (%d >= %d)", offset_,offset_end_); throw std::runtime_error(error_msg); @@ -105,7 +106,7 @@ class SafePtr auto actual_range = offsetEnd()-offset(); if(expected_range > actual_range) { - auto error_msg = + auto error_msg = format("SafePtr: expected_range > actual_range (%d > %d)", expected_range,actual_range); throw std::runtime_error(error_msg); @@ -118,7 +119,7 @@ class SafePtr SafePtr& operator+=(size_t shift) { - //if(!p_) + //if(!p_) // { // throw std::runtime_error("SafePtr: incrementing (+=) null pointer"); // } @@ -186,8 +187,8 @@ class SafePtr } bool - operator!=(SafePtr const& other) const - { + operator!=(SafePtr const& other) const + { if(p_ != other.p_) throw std::runtime_error("SafePtr: error, comparing two different starting pointers"); return (offset_ != other.offset_); @@ -198,20 +199,20 @@ class SafePtr pointer - getStart() const - { + getStart() const + { return p_; } private: pointer - safeFront() const + safeFront() const { if(!p_) throw std::runtime_error("SafePtr: dereferencing null pointer"); if(!validOffset()) { - auto error_msg = + auto error_msg = format("SafePtr: offset >= offset_end (%d >= %d)", offset_,offset_end_); throw std::runtime_error(error_msg); @@ -269,12 +270,21 @@ reinterpret(SafePtr const& p) #else -//bare pointer versions of macros -#define MAKE_SAFE_PTR(P,SZ) (P) -#define MAKE_SAFE_PTR_OFFSET(P,OFF,SZ) ((P)+(OFF)) -#define SAFE_PTR_GET(P,SZ) P -#define SAFE_REINTERPRET(NT,SP) reinterpret_cast(SP) -#define SAFE_PTR_OF(T) T* + #ifdef PLATFORM_cuda + 
//bare pointer versions of macros + #define MAKE_SAFE_PTR(P,SZ) (P) + #define MAKE_SAFE_PTR_OFFSET(P,OFF,SZ) ((P)+(OFF)) + #define SAFE_PTR_GET(P,SZ) P + #define SAFE_REINTERPRET(NT,SP) reinterpret_cast(SP) + #define SAFE_PTR_OF(T) thrust::device_ptr + #else + //bare pointer versions of macros + #define MAKE_SAFE_PTR(P,SZ) (P) + #define MAKE_SAFE_PTR_OFFSET(P,OFF,SZ) ((P)+(OFF)) + #define SAFE_PTR_GET(P,SZ) P + #define SAFE_REINTERPRET(NT,SP) reinterpret_cast(SP) + #define SAFE_PTR_OF(T) T* + #endif #endif From 7281ae33d06f1a6046b45095c0f7ce8d7a06ffda Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Tue, 17 May 2022 16:53:52 +0200 Subject: [PATCH 40/68] safe pointer reinterpret cast --- itensor/util/safe_ptr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/itensor/util/safe_ptr.h b/itensor/util/safe_ptr.h index 7d273d0bc..751cf4fed 100644 --- a/itensor/util/safe_ptr.h +++ b/itensor/util/safe_ptr.h @@ -275,7 +275,7 @@ reinterpret(SafePtr const& p) #define MAKE_SAFE_PTR(P,SZ) (P) #define MAKE_SAFE_PTR_OFFSET(P,OFF,SZ) ((P)+(OFF)) #define SAFE_PTR_GET(P,SZ) P - #define SAFE_REINTERPRET(NT,SP) reinterpret_cast(SP) + #define SAFE_REINTERPRET(NT,SP) reinterpret_cast>(SP) #define SAFE_PTR_OF(T) thrust::device_ptr #else //bare pointer versions of macros From 7b32260538484ea9502a2130bb5657e8e29f8c21 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Tue, 17 May 2022 16:54:58 +0200 Subject: [PATCH 41/68] fixed typo --- itensor/util/safe_ptr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/itensor/util/safe_ptr.h b/itensor/util/safe_ptr.h index 751cf4fed..4e832e2e6 100644 --- a/itensor/util/safe_ptr.h +++ b/itensor/util/safe_ptr.h @@ -275,7 +275,7 @@ reinterpret(SafePtr const& p) #define MAKE_SAFE_PTR(P,SZ) (P) #define MAKE_SAFE_PTR_OFFSET(P,OFF,SZ) ((P)+(OFF)) #define SAFE_PTR_GET(P,SZ) P - #define SAFE_REINTERPRET(NT,SP) reinterpret_cast>(SP) + #define SAFE_REINTERPRET(NT,SP) reinterpret_cast>(SP) #define SAFE_PTR_OF(T) thrust::device_ptr #else //bare pointer versions of macros From bb3a48214d2311c1337c8aef36e7475696eee821 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Tue, 17 May 2022 16:58:32 +0200 Subject: [PATCH 42/68] redefined complex if cuda --- itensor/types.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/itensor/types.h b/itensor/types.h index 2ed40e45e..834d874ef 100644 --- a/itensor/types.h +++ b/itensor/types.h @@ -28,18 +28,24 @@ namespace itensor { using Real = double; -using Cplx = std::complex; -using Complex = std::complex; +#ifdef PLATFORM_cuda + #include + using Cplx = cuDoubleComplex; + using Complex = cuDoubleComplex; +#else + using Cplx = std::complex; + using Complex = std::complex; +#endif const Cplx Complex_1 = Cplx(1,0); const Cplx Complex_i = Cplx(0,1); const Cplx Cplx_1 = Cplx(1,0); const Cplx Cplx_i = Cplx(0,1); -inline Real& +inline Real& realRef(Cplx & z) { return reinterpret_cast(&z)[0]; } -inline Real& +inline Real& imagRef(Cplx & z) { return reinterpret_cast(&z)[1]; } void inline @@ -78,11 +84,11 @@ formatVal(Cplx const& val) } template>> -constexpr const char* +constexpr const char* typeName(int=0) { return "Real"; } template>> -constexpr const char* +constexpr const char* typeName(long=0) { return "Cplx"; } } From 7d3823087967c81207ad9f11b75809e9de2c9ba2 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Tue, 17 May 2022 17:05:14 +0200 Subject: [PATCH 43/68] used thrust complex instead of cuda complex --- itensor/types.h | 6 +++--- 1 file changed, 3 insertions(+), 
3 deletions(-)

diff --git a/itensor/types.h b/itensor/types.h
index 834d874ef..56d545789 100644
--- a/itensor/types.h
+++ b/itensor/types.h
@@ -29,9 +29,9 @@ namespace itensor {
 using Real = double;
 #ifdef PLATFORM_cuda
- #include <cuComplex.h>
- using Cplx = cuDoubleComplex;
- using Complex = cuDoubleComplex;
+ #include <thrust/complex.h>
+ using Cplx = thrust::complex<double>;
+ using Complex = thrust::complex<double>;
 #else
 using Cplx = std::complex<double>;
 using Complex = std::complex<double>;

From 36dcbbf4d2371e2561d5cd592ac3d043d2674449 Mon Sep 17 00:00:00 2001
From: Mauro d'Arcangelo
Date: Tue, 17 May 2022 17:59:19 +0200
Subject: [PATCH 44/68] reverted to cuda complex

---
 itensor/types.h | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/itensor/types.h b/itensor/types.h
index 56d545789..1dd889785 100644
--- a/itensor/types.h
+++ b/itensor/types.h
@@ -29,18 +29,22 @@ namespace itensor {
 using Real = double;
 #ifdef PLATFORM_cuda
- #include <thrust/complex.h>
- using Cplx = thrust::complex<double>;
- using Complex = thrust::complex<double>;
+ #include <cuComplex.h>
+ using Cplx = cuDoubleComplex;
+ using Complex = cuDoubleComplex;
+ const Cplx Complex_1 = make_cuDoubleComplex(1,0);
+ const Cplx Complex_i = make_cuDoubleComplex(0,1);
+ const Cplx Cplx_1 = make_cuDoubleComplex(1,0);
+ const Cplx Cplx_i = make_cuDoubleComplex(0,1);
 #else
 using Cplx = std::complex<double>;
 using Complex = std::complex<double>;
+ const Cplx Complex_1 = Cplx(1,0);
+ const Cplx Complex_i = Cplx(0,1);
+ const Cplx Cplx_1 = Cplx(1,0);
+ const Cplx Cplx_i = Cplx(0,1);
 #endif
-const Cplx Complex_1 = Cplx(1,0);
-const Cplx Complex_i = Cplx(0,1);
-const Cplx Cplx_1 = Cplx(1,0);
-const Cplx Cplx_i = Cplx(0,1);
 inline Real&

From f77896e8bd561e8de147dc679ac37875141e8eb7 Mon Sep 17 00:00:00 2001
From: Mauro d'Arcangelo
Date: Tue, 17 May 2022 18:03:26 +0200
Subject: [PATCH 45/68] reverted to simple complex

---
 itensor/types.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/itensor/types.h b/itensor/types.h
index 1dd889785..53ab9d8a5 100644
--- a/itensor/types.h
+++ b/itensor/types.h
@@ -28,6 +28,7 @@ namespace itensor {
 using Real = double;
+/*
 #ifdef PLATFORM_cuda
 #include <cuComplex.h>
 using Cplx = cuDoubleComplex;
@@ -37,13 +38,14 @@ using Real = double;
 const Cplx Cplx_1 = make_cuDoubleComplex(1,0);
 const Cplx Cplx_i = make_cuDoubleComplex(0,1);
 #else
+*/
 using Cplx = std::complex<double>;
 using Complex = std::complex<double>;
 const Cplx Complex_1 = Cplx(1,0);
 const Cplx Complex_i = Cplx(0,1);
 const Cplx Cplx_1 = Cplx(1,0);
 const Cplx Cplx_i = Cplx(0,1);
-#endif
+//#endif
 inline Real&

From 6ef61858bd1a114f114c6e271819e6f35e4cb9af Mon Sep 17 00:00:00 2001
From: Mauro d'Arcangelo
Date: Wed, 18 May 2022 15:29:22 +0200
Subject: [PATCH 46/68] try cucomplex

---
 itensor/types.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/itensor/types.h b/itensor/types.h
index 53ab9d8a5..5cef40fd6 100644
--- a/itensor/types.h
+++ b/itensor/types.h
@@ -28,7 +28,7 @@ namespace itensor {
 using Real = double;
-/*
+/**/
 #ifdef PLATFORM_cuda
 #include <cuComplex.h>
 using Cplx = cuDoubleComplex;
@@ -38,14 +38,14 @@ using Real = double;
 const Cplx Cplx_1 = make_cuDoubleComplex(1,0);
 const Cplx Cplx_i = make_cuDoubleComplex(0,1);
 #else
-*/
+/**/
 using Cplx = std::complex<double>;
 using Complex = std::complex<double>;
 const Cplx Complex_1 = Cplx(1,0);
 const Cplx Complex_i = Cplx(0,1);
 const Cplx Cplx_1 = Cplx(1,0);
 const Cplx Cplx_i = Cplx(0,1);
 #endif
 inline Real&

From 83e89a46b6f176b55050d85062ff601990f88f25 Mon Sep 17 00:00:00 2001
From: Mauro d'Arcangelo
Date: Wed, 18 May 2022 15:33:57 +0200
Subject: [PATCH 47/68] use thrust complex

---
 itensor/types.h | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/itensor/types.h b/itensor/types.h
index 5cef40fd6..9a25fbd53 100644
--- a/itensor/types.h
+++ b/itensor/types.h
@@ -30,22 +30,19 @@ namespace itensor {
 using Real = double;
 /**/
 #ifdef PLATFORM_cuda
- #include <cuComplex.h>
- using Cplx = cuDoubleComplex;
- using Complex = cuDoubleComplex;
- const Cplx Complex_1 = make_cuDoubleComplex(1,0);
- const Cplx Complex_i = make_cuDoubleComplex(0,1);
- const Cplx Cplx_1 = make_cuDoubleComplex(1,0);
- const Cplx Cplx_i = make_cuDoubleComplex(0,1);
+ //#include <cuComplex.h>
+ #include <thrust/complex.h>
+ using Cplx = thrust::complex<double>;
+ using Complex = thrust::complex<double>;
 #else
 /**/
 using Cplx = std::complex<double>;
 using Complex = std::complex<double>;
- const Cplx Complex_1 = Cplx(1,0);
- const Cplx Complex_i = Cplx(0,1);
- const Cplx Cplx_1 = Cplx(1,0);
- const Cplx Cplx_i = Cplx(0,1);
 #endif
+const Cplx Complex_1 = Cplx(1,0);
+const Cplx Complex_i = Cplx(0,1);
+const Cplx Cplx_1 = Cplx(1,0);
+const Cplx Cplx_i = Cplx(0,1);
 inline Real&

From 1274526e29275319c69076b22aea5d617c160761 Mon Sep 17 00:00:00 2001
From: Mauro d'Arcangelo
Date: Wed, 18 May 2022 16:02:43 +0200
Subject: [PATCH 48/68] trying to fix thrust complex

---
 itensor/types.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/itensor/types.h b/itensor/types.h
index 9a25fbd53..80bbf445a 100644
--- a/itensor/types.h
+++ b/itensor/types.h
@@ -32,8 +32,10 @@ using Real = double;
 #ifdef PLATFORM_cuda
 //#include <cuComplex.h>
 #include <thrust/complex.h>
- using Cplx = thrust::complex<double>;
- using Complex = thrust::complex<double>;
+ using Cplx2 = thrust::complex<double>;
+ using Complex2 = thrust::complex<double>;
+ using Cplx = std::complex<double>;
+ using Complex = std::complex<double>;
 #else
 /**/
 using Cplx = std::complex<double>;

From cf7a55605aa706e26d3348280fa6f65bf4db8943 Mon Sep 17 00:00:00 2001
From: Mauro d'Arcangelo
Date: Wed, 18 May 2022 16:29:57 +0200
Subject: [PATCH 49/68] fixed typo in reinterpret cast

---
 itensor/types.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/itensor/types.h b/itensor/types.h
index 80bbf445a..655509088 100644
--- a/itensor/types.h
+++ b/itensor/types.h
@@ -31,9 +31,9 @@ using Real = double;
 /**/
 #ifdef PLATFORM_cuda
 //#include <cuComplex.h>
- #include <thrust/complex.h>
- using Cplx2 = thrust::complex<double>;
- using Complex2 = thrust::complex<double>;
+ //#include <thrust/complex.h>
+ //using Cplx2 = thrust::complex<double>;
+ //using Complex2 = thrust::complex<double>;
 using Cplx = std::complex<double>;
 using Complex = std::complex<double>;
 #else

From f15ac2979fd51206fe7b016b8d8ae56767f24a04 Mon Sep 17 00:00:00 2001
From: Mauro d'Arcangelo
Date: Wed, 18 May 2022 16:51:35 +0200
Subject: [PATCH 50/68] revert back

---
 itensor/types.h | 10 +++++-----
 itensor/util/safe_ptr.h | 4 +++-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/itensor/types.h b/itensor/types.h
index 655509088..15cf97f2d 100644
--- a/itensor/types.h
+++ b/itensor/types.h
@@ -29,18 +29,18 @@ namespace itensor {
 using Real = double;
 /**/
-#ifdef PLATFORM_cuda
+//#ifdef PLATFORM_cuda
 //#include <cuComplex.h>
 //#include <thrust/complex.h>
 //using Cplx2 = thrust::complex<double>;
 //using Complex2 = thrust::complex<double>;
-using Cplx = std::complex<double>;
-using Complex = std::complex<double>;
+//using Cplx = std::complex<double>;
+//using Complex = std::complex<double>;
-#else
+//#else
 /**/
 using Cplx = std::complex<double>;
 using Complex = std::complex<double>;
-#endif
+//#endif
 const Cplx Complex_1 = Cplx(1,0);
 const Cplx Complex_i = Cplx(0,1);
 const Cplx Cplx_1 = Cplx(1,0);
 const Cplx Cplx_i = Cplx(0,1);

diff --git a/itensor/util/safe_ptr.h b/itensor/util/safe_ptr.h
index 4e832e2e6..e386552fd 100644
---
a/itensor/util/safe_ptr.h +++ b/itensor/util/safe_ptr.h @@ -270,6 +270,7 @@ reinterpret(SafePtr const& p) #else +/* #ifdef PLATFORM_cuda //bare pointer versions of macros #define MAKE_SAFE_PTR(P,SZ) (P) @@ -278,13 +279,14 @@ reinterpret(SafePtr const& p) #define SAFE_REINTERPRET(NT,SP) reinterpret_cast>(SP) #define SAFE_PTR_OF(T) thrust::device_ptr #else + */ //bare pointer versions of macros #define MAKE_SAFE_PTR(P,SZ) (P) #define MAKE_SAFE_PTR_OFFSET(P,OFF,SZ) ((P)+(OFF)) #define SAFE_PTR_GET(P,SZ) P #define SAFE_REINTERPRET(NT,SP) reinterpret_cast(SP) #define SAFE_PTR_OF(T) T* - #endif + //#endif #endif From 04db28b218577ff4e775a610ee411969ea3ed7b8 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Wed, 18 May 2022 17:05:38 +0200 Subject: [PATCH 51/68] changed safe pointer behaviour --- itensor/util/safe_ptr.h | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/itensor/util/safe_ptr.h b/itensor/util/safe_ptr.h index e386552fd..3427dd50e 100644 --- a/itensor/util/safe_ptr.h +++ b/itensor/util/safe_ptr.h @@ -270,23 +270,18 @@ reinterpret(SafePtr const& p) #else -/* + #define MAKE_SAFE_PTR(P,SZ) (P) + #define MAKE_SAFE_PTR_OFFSET(P,OFF,SZ) ((P)+(OFF)) + #define SAFE_PTR_GET(P,SZ) P + #define SAFE_REINTERPRET(NT,SP) reinterpret_cast(SP) + #ifdef PLATFORM_cuda //bare pointer versions of macros - #define MAKE_SAFE_PTR(P,SZ) (P) - #define MAKE_SAFE_PTR_OFFSET(P,OFF,SZ) ((P)+(OFF)) - #define SAFE_PTR_GET(P,SZ) P - #define SAFE_REINTERPRET(NT,SP) reinterpret_cast>(SP) #define SAFE_PTR_OF(T) thrust::device_ptr #else - */ //bare pointer versions of macros - #define MAKE_SAFE_PTR(P,SZ) (P) - #define MAKE_SAFE_PTR_OFFSET(P,OFF,SZ) ((P)+(OFF)) - #define SAFE_PTR_GET(P,SZ) P - #define SAFE_REINTERPRET(NT,SP) reinterpret_cast(SP) #define SAFE_PTR_OF(T) T* - //#endif + #endif #endif From 14d7b64f5317fbe8edf24532bacb80ce7e5e23a4 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Wed, 18 May 2022 18:25:58 +0200 Subject: [PATCH 52/68] bypass gemm emulator --- itensor/tensor/gemm.cc | 33 ++++++++++++++++++++++----------- itensor/tensor/lapack_wrap.cc | 13 +------------ 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/itensor/tensor/gemm.cc b/itensor/tensor/gemm.cc index 5a76b8a6e..4b38bbd50 100644 --- a/itensor/tensor/gemm.cc +++ b/itensor/tensor/gemm.cc @@ -35,7 +35,7 @@ struct dgemmTask Real a, Real b) : Apart(Ap),Bpart(Bp),Cpart(Cp),alpha(a),beta(b) - { + { if(b != 0.0) copyFromC = true; } @@ -169,7 +169,18 @@ gemm_impl(MatRefc A, Real alpha, Real beta) { -#ifdef ITENSOR_USE_ZGEMM +#ifdef ITENSOR_USE_CUDA + gemm_wrapper(isTransposed(A), + isTransposed(B), + nrows(A), + ncols(B), + ncols(A), + alpha, + A.data(), + B.data(), + beta, + C.data()); +#elif defined ITENSOR_USE_ZGEMM gemm_wrapper(isTransposed(A), isTransposed(B), nrows(A), @@ -181,8 +192,8 @@ gemm_impl(MatRefc A, beta, C.data()); #else //emulate zgemm by calling dgemm four times - std::array - tasks = + std::array + tasks = {{dgemmTask(0,0,0,+alpha,beta), dgemmTask(1,1,0,-alpha), dgemmTask(0), @@ -202,8 +213,8 @@ gemm_impl(MatRefc A, Real alpha, Real beta) { - std::array - tasks = + std::array + tasks = {{dgemmTask(0,0,0,+alpha,beta), dgemmTask(0), dgemmTask(0,1,1,+alpha,beta), @@ -219,8 +230,8 @@ gemm_impl(MatRefc A, Real alpha, Real beta) { - std::array - tasks = + std::array + tasks = {{dgemmTask(0,0,0,+alpha,beta), dgemmTask(0), dgemmTask(1,0,1,+alpha,beta), @@ -252,14 +263,14 @@ gemm_impl(MatRefc A, // C = alpha*A*B + beta*C template void -gemm(MatRefc A, - MatRefc B, +gemm(MatRefc A, 
+ MatRefc B, MatRef> C, Real alpha, Real beta) { #ifdef DEBUG - if(!(isContiguous(A) && isContiguous(B) && isContiguous(C))) + if(!(isContiguous(A) && isContiguous(B) && isContiguous(C))) throw std::runtime_error("multiplication of non-contiguous MatrixRefs not currently supported"); #endif diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index aeee8d749..8a46ce2d0 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -299,18 +299,7 @@ gemm_wrapper(bool transa, } cublasHandle_t handle; cublasCreate(&handle); - LAPACK_COMPLEX *d_A, *d_B, *d_C; - cudaMalloc(&d_A, m * k * sizeof(LAPACK_COMPLEX)); - cudaMalloc(&d_B, k * n * sizeof(LAPACK_COMPLEX)); - cudaMalloc(&d_C, m * n * sizeof(LAPACK_COMPLEX)); - cudaMemcpy(d_A, A, m * k * sizeof(LAPACK_COMPLEX), cudaMemcpyHostToDevice); - cudaMemcpy(d_B, B, k * n * sizeof(LAPACK_COMPLEX), cudaMemcpyHostToDevice); - cudaMemcpy(d_C, C, m * n * sizeof(LAPACK_COMPLEX), cudaMemcpyHostToDevice); - cublasZgemm(handle, at, bt, m, n, k, (LAPACK_COMPLEX*) &alpha, d_A, lda, d_B, ldb, (LAPACK_COMPLEX*) &beta, d_C, m); - cudaMemcpy(C, d_C, m * n * sizeof(LAPACK_COMPLEX), cudaMemcpyDeviceToHost); - cudaFree(d_A); - cudaFree(d_B); - cudaFree(d_C); + cublasZgemm(handle, at, bt, m, n, k, (LAPACK_COMPLEX*) &alpha, A, lda, B, ldb, (LAPACK_COMPLEX*) &beta, C, m); cublasDestroy(handle); #else //platform not openblas #ifdef ITENSOR_USE_CBLAS From fc754b970a711518ee16845afc0a9805a2962b7e Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Wed, 18 May 2022 18:38:16 +0200 Subject: [PATCH 53/68] overrided zgemm for cuda --- itensor/tensor/lapack_wrap.cc | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 8a46ce2d0..2f50fc843 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -243,6 +243,37 @@ gemm_wrapper(bool transa, // // zgemm // +gemm_wrapper(bool transa, + bool transb, + LAPACK_INT m, + LAPACK_INT n, + LAPACK_INT k, + Cplx alpha, + const thrust::device_ptr A, + const thrust::device_ptr B, + Cplx beta, + thrust::device_ptr C) + { + LAPACK_INT lda = m, + ldb = k; + cublasOperation_t at = CUBLAS_OP_N; + cublasOperation_t bt = CUBLAS_OP_N; + if(transa) + { + at = CUBLAS_OP_T; + lda = k; + } + if(transb) + { + bt = CUBLAS_OP_T; + ldb = n; + } + cublasHandle_t handle; + cublasCreate(&handle); + cublasZgemm(handle, at, bt, m, n, k, (LAPACK_COMPLEX*) &alpha, A, lda, B, ldb, (LAPACK_COMPLEX*) &beta, C, m); + cublasDestroy(handle); +} + void gemm_wrapper(bool transa, bool transb, From 24020abbc646417f032d9d9bb753f68373432efe Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Wed, 18 May 2022 18:40:00 +0200 Subject: [PATCH 54/68] forgot to include thrust --- itensor/tensor/lapack_wrap.h | 1 + 1 file changed, 1 insertion(+) diff --git a/itensor/tensor/lapack_wrap.h b/itensor/tensor/lapack_wrap.h index ddab8c504..82fff9f40 100644 --- a/itensor/tensor/lapack_wrap.h +++ b/itensor/tensor/lapack_wrap.h @@ -151,6 +151,7 @@ imagRef(LAPACK_COMPLEX & z) #include #include #include +#include namespace itensor { //cudaDataType_t typeComplexData = CUDA_C_64F; From 986557b1d41b5ecb9323383c004f504547e8e401 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Wed, 18 May 2022 18:43:08 +0200 Subject: [PATCH 55/68] changed complex to lapack_complex --- itensor/tensor/lapack_wrap.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc 
index 2f50fc843..1d92c5d95 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -249,10 +249,10 @@ gemm_wrapper(bool transa, LAPACK_INT n, LAPACK_INT k, Cplx alpha, - const thrust::device_ptr A, - const thrust::device_ptr B, + const thrust::device_ptr A, + const thrust::device_ptr B, Cplx beta, - thrust::device_ptr C) + thrust::device_ptr C) { LAPACK_INT lda = m, ldb = k; From e10825c5284af6687fd16dac8fa53b6ce25ce541 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Wed, 18 May 2022 18:51:31 +0200 Subject: [PATCH 56/68] use raw pointers --- itensor/tensor/lapack_wrap.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 1d92c5d95..8155f6c58 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -270,7 +270,7 @@ gemm_wrapper(bool transa, } cublasHandle_t handle; cublasCreate(&handle); - cublasZgemm(handle, at, bt, m, n, k, (LAPACK_COMPLEX*) &alpha, A, lda, B, ldb, (LAPACK_COMPLEX*) &beta, C, m); + cublasZgemm(handle, at, bt, m, n, k, (LAPACK_COMPLEX*) &alpha, A.get(), lda, B.get(), ldb, (LAPACK_COMPLEX*) &beta, C.get(), m); cublasDestroy(handle); } From 09153526ed99c0579c9d9f7dbfe6fc4a7a68cded Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Wed, 18 May 2022 19:00:03 +0200 Subject: [PATCH 57/68] extarct raw pointer from thrust::device_ptr --- itensor/tensor/gemm.cc | 7 ++++--- itensor/tensor/lapack_wrap.cc | 8 ++++---- itensor/tensor/lapack_wrap.h | 12 ++++++++++++ 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/itensor/tensor/gemm.cc b/itensor/tensor/gemm.cc index 4b38bbd50..8a45bbb84 100644 --- a/itensor/tensor/gemm.cc +++ b/itensor/tensor/gemm.cc @@ -176,10 +176,11 @@ gemm_impl(MatRefc A, ncols(B), ncols(A), alpha, - A.data(), - B.data(), + (A.data()).get(), // .get() returns the raw pointer from thrust::device_ptr + (B.data()).get(), beta, - C.data()); + (C.data()).get()); + #elif defined ITENSOR_USE_ZGEMM gemm_wrapper(isTransposed(A), isTransposed(B), diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 8155f6c58..2faae5d76 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -249,10 +249,10 @@ gemm_wrapper(bool transa, LAPACK_INT n, LAPACK_INT k, Cplx alpha, - const thrust::device_ptr A, - const thrust::device_ptr B, + const LAPACK_COMPLEX* A, + const LAPACK_COMPLEX* B, Cplx beta, - thrust::device_ptr C) + LAPACK_COMPLEX* C) { LAPACK_INT lda = m, ldb = k; @@ -270,7 +270,7 @@ gemm_wrapper(bool transa, } cublasHandle_t handle; cublasCreate(&handle); - cublasZgemm(handle, at, bt, m, n, k, (LAPACK_COMPLEX*) &alpha, A.get(), lda, B.get(), ldb, (LAPACK_COMPLEX*) &beta, C.get(), m); + cublasZgemm(handle, at, bt, m, n, k, (LAPACK_COMPLEX*) &alpha, A, lda, B, ldb, (LAPACK_COMPLEX*) &beta, C, m); cublasDestroy(handle); } diff --git a/itensor/tensor/lapack_wrap.h b/itensor/tensor/lapack_wrap.h index 82fff9f40..8ddb3d6c5 100644 --- a/itensor/tensor/lapack_wrap.h +++ b/itensor/tensor/lapack_wrap.h @@ -524,6 +524,18 @@ gemm_wrapper(bool transa, // // zgemm // +void +gemm_wrapper(bool transa, + bool transb, + LAPACK_INT m, + LAPACK_INT n, + LAPACK_INT k, + Cplx alpha, + LAPACK_COMPLEX const* A, + LAPACK_COMPLEX const* B, + Cplx beta, + LAPACK_COMPLEX * C); + void gemm_wrapper(bool transa, bool transb, From 4fb774cf2e59f38e7aaf92b1b2706e51ae2b7512 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Wed, 18 May 2022 19:02:17 +0200 Subject: [PATCH 58/68] deleted cuda stuff from 
normal zgemm --- itensor/tensor/lapack_wrap.cc | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index 2faae5d76..d111823e9 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -315,23 +315,6 @@ gemm_wrapper(bool transa, auto* pB = reinterpret_cast(B); auto* pC = reinterpret_cast(C); cblas_zgemm(CblasColMajor,at,bt,m,n,k,palpha,pA,lda,pB,ldb,pbeta,pC,m); -#elif defined ITENSOR_USE_CUDA - cublasOperation_t at = CUBLAS_OP_N; - cublasOperation_t bt = CUBLAS_OP_N; - if(transa) - { - at = CUBLAS_OP_T; - lda = k; - } - if(transb) - { - bt = CUBLAS_OP_T; - ldb = n; - } - cublasHandle_t handle; - cublasCreate(&handle); - cublasZgemm(handle, at, bt, m, n, k, (LAPACK_COMPLEX*) &alpha, A, lda, B, ldb, (LAPACK_COMPLEX*) &beta, C, m); - cublasDestroy(handle); #else //platform not openblas #ifdef ITENSOR_USE_CBLAS auto at = CblasNoTrans, From a7130afee3a9dbc8e8d9e87183f31f7dc0f299f4 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Wed, 18 May 2022 19:04:38 +0200 Subject: [PATCH 59/68] fixed return type of zgemm --- itensor/tensor/lapack_wrap.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/itensor/tensor/lapack_wrap.cc b/itensor/tensor/lapack_wrap.cc index d111823e9..9dbfc4e84 100644 --- a/itensor/tensor/lapack_wrap.cc +++ b/itensor/tensor/lapack_wrap.cc @@ -243,6 +243,7 @@ gemm_wrapper(bool transa, // // zgemm // +void gemm_wrapper(bool transa, bool transb, LAPACK_INT m, From f3349d91504f9deb874c691fb9c8962e7d3ed7d6 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 20 May 2022 15:46:50 +0200 Subject: [PATCH 60/68] not using get() --- itensor/tensor/gemm.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/itensor/tensor/gemm.cc b/itensor/tensor/gemm.cc index 8a45bbb84..cc01d12d1 100644 --- a/itensor/tensor/gemm.cc +++ b/itensor/tensor/gemm.cc @@ -176,8 +176,8 @@ gemm_impl(MatRefc A, ncols(B), ncols(A), alpha, - (A.data()).get(), // .get() returns the raw pointer from thrust::device_ptr - (B.data()).get(), + A.data(), + B.data(), beta, (C.data()).get()); From d547310a4129605efbe9048f9ada264ef3a05ec9 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 20 May 2022 15:48:27 +0200 Subject: [PATCH 61/68] not using get() --- itensor/tensor/gemm.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/itensor/tensor/gemm.cc b/itensor/tensor/gemm.cc index cc01d12d1..f7d2d9af9 100644 --- a/itensor/tensor/gemm.cc +++ b/itensor/tensor/gemm.cc @@ -176,10 +176,10 @@ gemm_impl(MatRefc A, ncols(B), ncols(A), alpha, - A.data(), + A.data(), B.data(), beta, - (C.data()).get()); + C.data()); #elif defined ITENSOR_USE_ZGEMM gemm_wrapper(isTransposed(A), From 5be2e74f2593d5d5939abcfee061ab61f76cf417 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 20 May 2022 15:55:06 +0200 Subject: [PATCH 62/68] dont compile gemm_impl --- itensor/tensor/gemm.cc | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/itensor/tensor/gemm.cc b/itensor/tensor/gemm.cc index f7d2d9af9..620f71cd8 100644 --- a/itensor/tensor/gemm.cc +++ b/itensor/tensor/gemm.cc @@ -50,6 +50,19 @@ struct dgemmTask : copyToC(true),Cpart(Cp) { } }; +#ifdef ITENSOR_USE_CUDA + gemm_wrapper(isTransposed(A), + isTransposed(B), + nrows(A), + ncols(B), + ncols(A), + alpha, + A.data(), + B.data(), + beta, + C.data()); +#else + void cplxToRealBuf(SAFE_PTR_OF(const Real) C, @@ -169,19 +182,7 @@ gemm_impl(MatRefc A, Real alpha, Real beta) { -#ifdef 
ITENSOR_USE_CUDA - gemm_wrapper(isTransposed(A), - isTransposed(B), - nrows(A), - ncols(B), - ncols(A), - alpha, - A.data(), - B.data(), - beta, - C.data()); - -#elif defined ITENSOR_USE_ZGEMM +#if defined ITENSOR_USE_ZGEMM gemm_wrapper(isTransposed(A), isTransposed(B), nrows(A), @@ -203,7 +204,7 @@ gemm_impl(MatRefc A, dgemmTask(1) }}; gemm_emulator(A,B,C,alpha,beta,tasks); -#endif +#endif //ITENSOR_USE_ZGEMM } @@ -303,5 +304,7 @@ template void gemm(MatRefc, MatRefc, MatRef,Real,Real); template void gemm(MatRefc, MatRefc, MatRef,Real,Real); template void gemm(MatRefc, MatRefc, MatRef,Real,Real); +#endif //ITENSOR_USE_CUDA + } //namespace itensor From 69ab83b0c6228f5376d9cab5f69f5620ee1f3b9d Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 20 May 2022 15:58:59 +0200 Subject: [PATCH 63/68] gemm_impl --- itensor/tensor/gemm.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/itensor/tensor/gemm.cc b/itensor/tensor/gemm.cc index 620f71cd8..6f2a24644 100644 --- a/itensor/tensor/gemm.cc +++ b/itensor/tensor/gemm.cc @@ -50,7 +50,15 @@ struct dgemmTask : copyToC(true),Cpart(Cp) { } }; + #ifdef ITENSOR_USE_CUDA +void +gemm_impl(MatRefc A, + MatRefc B, + MatRef C, + Real alpha, + Real beta) + { gemm_wrapper(isTransposed(A), isTransposed(B), nrows(A), @@ -61,6 +69,7 @@ struct dgemmTask B.data(), beta, C.data()); + } #else From 304e7c76e094a06dbe830a3e554edd1abb9d961e Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 20 May 2022 16:04:43 +0200 Subject: [PATCH 64/68] implemented gemm --- itensor/tensor/gemm.cc | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/itensor/tensor/gemm.cc b/itensor/tensor/gemm.cc index 6f2a24644..c987a180b 100644 --- a/itensor/tensor/gemm.cc +++ b/itensor/tensor/gemm.cc @@ -70,6 +70,32 @@ gemm_impl(MatRefc A, beta, C.data()); } + +// C = alpha*A*B + beta*C +template +void +gemm(MatRefc A, + MatRefc B, + MatRef> C, + Real alpha, + Real beta) + { + if(isTransposed(C)) + { + //Do C = Bt*At instead of Ct=A*B + //Recall that C.data() points to elements of C, not C.t() + //regardless of whether C.transpose()==true or false + gemm_impl(transpose(B),transpose(A),transpose(C),alpha,beta); + } + else + { + gemm_impl(A,B,C,alpha,beta); + } + } +template void gemm(MatRefc, MatRefc, MatRef,Real,Real); +template void gemm(MatRefc, MatRefc, MatRef,Real,Real); +template void gemm(MatRefc, MatRefc, MatRef,Real,Real); +template void gemm(MatRefc, MatRefc, MatRef,Real,Real); #else From 0bb4e7e16d365963c00e93633598b661a45cc412 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 20 May 2022 16:08:31 +0200 Subject: [PATCH 65/68] restrict gemm to complex --- itensor/tensor/gemm.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/itensor/tensor/gemm.cc b/itensor/tensor/gemm.cc index c987a180b..fc80e5f8a 100644 --- a/itensor/tensor/gemm.cc +++ b/itensor/tensor/gemm.cc @@ -92,9 +92,9 @@ gemm(MatRefc A, gemm_impl(A,B,C,alpha,beta); } } -template void gemm(MatRefc, MatRefc, MatRef,Real,Real); -template void gemm(MatRefc, MatRefc, MatRef,Real,Real); -template void gemm(MatRefc, MatRefc, MatRef,Real,Real); +//template void gemm(MatRefc, MatRefc, MatRef,Real,Real); +//template void gemm(MatRefc, MatRefc, MatRef,Real,Real); +//template void gemm(MatRefc, MatRefc, MatRef,Real,Real); template void gemm(MatRefc, MatRefc, MatRef,Real,Real); #else From c53944becce8f135b1efee86c2a1bf71f716d938 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 20 May 2022 16:10:25 +0200 Subject: [PATCH 66/68] restrict gemm to 
complex or double --- itensor/tensor/gemm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/itensor/tensor/gemm.cc b/itensor/tensor/gemm.cc index fc80e5f8a..495f4c28d 100644 --- a/itensor/tensor/gemm.cc +++ b/itensor/tensor/gemm.cc @@ -92,7 +92,7 @@ gemm(MatRefc A, gemm_impl(A,B,C,alpha,beta); } } -//template void gemm(MatRefc, MatRefc, MatRef,Real,Real); +template void gemm(MatRefc, MatRefc, MatRef,Real,Real); //template void gemm(MatRefc, MatRefc, MatRef,Real,Real); //template void gemm(MatRefc, MatRefc, MatRef,Real,Real); template void gemm(MatRefc, MatRefc, MatRef,Real,Real); From e1128a4492343af9ba30758422301041b8a6b469 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 20 May 2022 16:14:20 +0200 Subject: [PATCH 67/68] double requires some more overloading so we just stick to gemm complex --- itensor/tensor/gemm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/itensor/tensor/gemm.cc b/itensor/tensor/gemm.cc index 495f4c28d..fc80e5f8a 100644 --- a/itensor/tensor/gemm.cc +++ b/itensor/tensor/gemm.cc @@ -92,7 +92,7 @@ gemm(MatRefc A, gemm_impl(A,B,C,alpha,beta); } } -template void gemm(MatRefc, MatRefc, MatRef,Real,Real); +//template void gemm(MatRefc, MatRefc, MatRef,Real,Real); //template void gemm(MatRefc, MatRefc, MatRef,Real,Real); //template void gemm(MatRefc, MatRefc, MatRef,Real,Real); template void gemm(MatRefc, MatRefc, MatRef,Real,Real); From 503df3700b97d7e0453564e25438d4de6d5453e7 Mon Sep 17 00:00:00 2001 From: Mauro d'Arcangelo Date: Fri, 20 May 2022 16:17:47 +0200 Subject: [PATCH 68/68] restrict contract to complex --- itensor/tensor/contract.cc | 260 +++++++++++++++++++------------------ 1 file changed, 132 insertions(+), 128 deletions(-) diff --git a/itensor/tensor/contract.cc b/itensor/tensor/contract.cc index d5806c0bb..d09c1a38b 100644 --- a/itensor/tensor/contract.cc +++ b/itensor/tensor/contract.cc @@ -46,7 +46,7 @@ printv(const vector& t, const F& f) { print("{ "); - for(const auto& i : t) + for(const auto& i : t) { f(i); print(" "); @@ -76,8 +76,8 @@ printv(const autovector& t) #define PRIL(a,l) print(#a,": "); printv(a,l); //template -//long -//find_index(vector const& v, +//long +//find_index(vector const& v, // T const& t) // { // using size_type = typename vector::size_type; @@ -87,8 +87,8 @@ printv(const autovector& t) // } // //template -//long -//find_index(const VarArray& v, +//long +//find_index(const VarArray& v, // const T& t) // { // using size_type = typename VarArray::size_type; @@ -98,8 +98,8 @@ printv(const autovector& t) // } // //template -//long -//find_index(const InfArray& v, +//long +//find_index(const InfArray& v, // const T& t) // { // using size_type = typename InfArray::size_type; @@ -120,7 +120,7 @@ class small_map std::array,N> d; int nd{0}; - B& + B& operator[](const A& a) { for(int i = 0; i < nd; ++i) @@ -136,15 +136,15 @@ class small_map // struct analyzing index pattern for C = A * B struct CProps { - Labels ai, - bi, + Labels ai, + bi, ci; - int nactiveA = 0, - nactiveB = 0, + int nactiveA = 0, + nactiveB = 0, nactiveC = 0; private: - Labels AtoB_, - AtoC_, + Labels AtoB_, + AtoC_, BtoC_; bool permuteA_ = false, permuteB_ = false, @@ -166,11 +166,11 @@ struct CProps Range newArange, newBrange, newCrange; - - CProps(Labels const& ai_, - Labels const& bi_, - Labels const& ci_) - : + + CProps(Labels const& ai_, + Labels const& bi_, + Labels const& ci_) + : ai(ai_),bi(bi_),ci(ci_), Acstart(ai_.size()), Bcstart(bi_.size()), @@ -212,7 +212,7 @@ struct CProps if(size_t(Austart) >= ai.size()) 
return true; auto aCind = AtoC(Austart); using size_type = decltype(ai.size()); - for(size_type i = 0; i < ai.size(); ++i) + for(size_type i = 0; i < ai.size(); ++i) if(!contractedA(i)) { if(AtoC(i) != aCind) return false; @@ -227,7 +227,7 @@ struct CProps if(size_t(Bustart) >= bi.size()) return true; auto bCind = BtoC(Bustart); using size_type = decltype(bi.size()); - for(size_type i = 0; i < bi.size(); ++i) + for(size_type i = 0; i < bi.size(); ++i) if(!contractedB(i)) { if(BtoC(i) != bCind) return false; @@ -256,7 +256,7 @@ struct CProps // computePerms(); - + //Use PC.size() as a check to see if we've already run this if(PC.size() != 0) return; @@ -282,19 +282,19 @@ struct CProps dmid *= A.extent(i); } for(int j = 0; j < rb; ++j) - if(!contractedB(j)) + if(!contractedB(j)) { dright *= B.extent(j); PC.setFromTo(c,BtoC(j)); ++c; } - if(!isTrivial(PC)) + if(!isTrivial(PC)) { permuteC_ = true; if(checkBCsameord() && checkACsameord()) { - //Can avoid permuting C by + //Can avoid permuting C by //computing Bt*At = Ct ctrans = true; permuteC_ = false; @@ -305,15 +305,15 @@ struct CProps permuteA_ = false; if(!(contractedA(0) || contractedA(ra-1))) { - //If contracted indices are not all at front or back, - //will have to permute A + //If contracted indices are not all at front or back, + //will have to permute A permuteA_ = true; } else { //Contracted ind start at front or back, check if contiguous for(int i = 0; i < ncont; ++i) - if(!contractedA(Acstart+i)) + if(!contractedA(Acstart+i)) { //Contracted indices not contiguous, must permute permuteA_ = true; @@ -325,7 +325,7 @@ struct CProps permuteB_ = false; if(!(contractedB(0) || contractedB(rb-1))) { - //If contracted indices are not all at front or back, + //If contracted indices are not all at front or back, //will have to permute B permuteB_ = true; } @@ -344,9 +344,9 @@ struct CProps { //Check if contracted inds. 
in same order for(int i = 0; i < ncont; ++i) - if(AtoB(Acstart+i) != (Bcstart+i)) + if(AtoB(Acstart+i) != (Bcstart+i)) { - //If not in same order, + //If not in same order, //must permute one of A or B //so permute the smaller one if(dleft < dright) permuteA_ = true; @@ -487,7 +487,7 @@ struct CProps ++c; } for(int j = 0; j < rb; ++j) - if(!contractedB(j)) + if(!contractedB(j)) { PC.setFromTo(c,BtoC_[j]); ++c; @@ -502,9 +502,9 @@ struct CProps permuteC_ = true; //Here we already know since pc_triv = false that //at best indices from B precede those from A (on result C) - //so if both sets remain in same order on C + //so if both sets remain in same order on C //just need to transpose C, not permute it - if(checkBCsameord() && checkACsameord()) + if(checkBCsameord() && checkACsameord()) { ctrans = true; permuteC_ = false; @@ -534,27 +534,27 @@ struct CProps if(!permuteB_) { for(decltype(rb) j = 0; j < rb; ++j) - if(!contractedB(j)) + if(!contractedB(j)) Rb.nextIndex(B.extent(j)); } else { for(decltype(rb) j = 0; j < rb; ++j) - if(!contractedB(j)) + if(!contractedB(j)) Rb.nextIndex(newBrange.extent(j)); } newCrange = Rb.build(); } } - void + void computePerms() { //Use !AtoB.empty() as a check to see if we've already run this if(!AtoB_.empty()) return; - int na = ai.size(), - nb = bi.size(), + int na = ai.size(), + nb = bi.size(), nc = ci.size(); AtoB_ = Labels(na,-1); @@ -563,7 +563,7 @@ struct CProps for(int i = 0; i < na; ++i) { for(int j = 0; j < nb; ++j) - if(ai[i] == bi[j]) + if(ai[i] == bi[j]) { ++ncont; if(i < Acstart) Acstart = i; @@ -576,7 +576,7 @@ struct CProps for(int i = 0; i < na; ++i) { for(int k = 0; k < nc; ++k) - if(ai[i] == ci[k]) + if(ai[i] == ci[k]) { if(i < Austart) Austart = i; AtoC_[i] = k; @@ -587,7 +587,7 @@ struct CProps for(int j = 0; j < nb; ++j) { for(int k = 0; k < nc; ++k) - if(bi[j] == ci[k]) + if(bi[j] == ci[k]) { if(j < Bustart) Bustart = j; BtoC_[j] = k; @@ -603,7 +603,7 @@ struct CProps computeNactive() { // Out of A, B and C (C = A*B), each index appears in two tensors. - // An active index appears as one of the first two indices in each of the two + // An active index appears as one of the first two indices in each of the two // tensor in which it appears. More specifically: // the first index of a tensor is active if its pair is also a first index, or if its // pair is a second index and that tensor's first index is active. @@ -620,9 +620,9 @@ struct CProps small_map indval; for(int i = 0; i <= 1; ++i) { - ++indval[ai[i]]; - ++indval[bi[i]]; - ++indval[ci[i]]; + ++indval[ai[i]]; + ++indval[bi[i]]; + ++indval[ci[i]]; } for(int elim = 1; elim <= 3; ++elim) // bad guys at position 0 kill the index at 1 @@ -640,20 +640,20 @@ struct CProps struct ABoffC { - MatrixRefc mA, + MatrixRefc mA, mB; MatrixRef mC; int offC; - ABoffC(MatrixRefc& mA_, - MatrixRefc& mB_, - MatrixRef& mC_, + ABoffC(MatrixRefc& mA_, + MatrixRefc& mB_, + MatrixRef& mC_, int offC_) - : - mA(mA_), - mB(mB_), - mC(mC_), - offC(offC_) + : + mA(mA_), + mB(mB_), + mC(mC_), + offC(offC_) { } void @@ -667,16 +667,16 @@ class CABqueue CABqueue() { } - void - addtask(MatrixRefc& mA, - MatrixRefc& mB, - MatrixRef& mC, + void + addtask(MatrixRefc& mA, + MatrixRefc& mB, + MatrixRef& mC, int offC) { subtask[offC].emplace_back(mA,mB,mC,offC); } - void + void run(int numthread) { @@ -693,8 +693,8 @@ class CABqueue ////////// //Loop over threads in a round-robin fashion. - //Assign all tasks with the same memory - //destination (offC) to the same thread. 
@@ -715,14 +715,14 @@ class CABqueue
             //printfln("task size for thread %d is %d",i,tt.size());
             futs[i] = std::async(std::launch::async,
                       [&tt]()
-                      { 
+                      {
                       for(const auto& task : tt)
                           task.execute();
                       }
                       );
             }
         //Wait for futures to complete
-        for(auto& ft : futs) 
+        for(auto& ft : futs)
             {
             ft.wait();
             }
@@ -731,7 +731,7 @@ class CABqueue
 
 
 template
-void 
+void
 contract(CProps const& p,
          TenRefc A,
          TenRefc B,
@@ -802,7 +802,7 @@ contract(CProps const& p,
         }
     else
         {
-        if(p.Ctrans()) 
+        if(p.Ctrans())
             {
             cref = transpose(makeMatRef(C.store(),ncols(bref),nrows(aref)));
             }
@@ -824,9 +824,9 @@ contract(CProps const& p,
     }
 
 template
-void 
-contractScalar(T1 a, 
-               TenRefc B, Labels const& bi, 
+void
+contractScalar(T1 a,
+               TenRefc B, Labels const& bi,
                TenRef> C, Labels const& ci,
                Real alpha,
                Real beta)
@@ -841,19 +841,19 @@ contractScalar(T1 a,
     }
 
 template
-void 
-contract(TenRefc A, Labels const& ai, 
-         TenRefc B, Labels const& bi, 
-         TenRef> C, 
+void
+contract(TenRefc A, Labels const& ai,
+         TenRefc B, Labels const& bi,
+         TenRef> C,
          Labels const& ci,
          Real alpha,
         Real beta)
    {
-    if(ai.empty()) 
+    if(ai.empty())
        {
        contractScalar(*A.data(),B,bi,C,ci,alpha,beta);
        }
-    else if(bi.empty()) 
+    else if(bi.empty())
        {
        contractScalar(*B.data(),A,ai,C,ci,alpha,beta);
        }
@@ -866,44 +866,48 @@ contract(TenRefc A, Labels const& ai,
     }
 
 //Explicit template instantiations:
-template void 
-contract(TenRefc, Labels const&, 
-         TenRefc, Labels const&, 
+/*
+template void
+contract(TenRefc, Labels const&,
+         TenRefc, Labels const&,
          TenRef , Labels const&,
          Real,Real);
-template void 
-contract(TenRefc, Labels const&, 
-         TenRefc, Labels const&, 
+template void
+contract(TenRefc, Labels const&,
+         TenRefc, Labels const&,
          TenRef , Labels const&,
          Real,Real);
-template void 
-contract(TenRefc, Labels const&, 
-         TenRefc, Labels const&, 
+template void
+contract(TenRefc, Labels const&,
+         TenRefc, Labels const&,
          TenRef , Labels const&,
          Real,Real);
-template void 
-contract(TenRefc, Labels const&, 
-         TenRefc, Labels const&, 
+*/
+template void
+contract(TenRefc, Labels const&,
+         TenRefc, Labels const&,
          TenRef , Labels const&,
          Real,Real);
-template void 
-contract(TenRefc, Labels const&, 
-         TenRefc, Labels const&, 
+/*
+template void
+contract(TenRefc, Labels const&,
+         TenRefc, Labels const&,
          TenRef , Labels const&,
         Real,Real);
-template void 
-contract(TenRefc, Labels const&, 
-         TenRefc, Labels const&, 
+template void
+contract(TenRefc, Labels const&,
+         TenRefc, Labels const&,
          TenRef , Labels const&,
         Real,Real);
-template void 
-contract(TenRefc, Labels const&, 
-         TenRefc, Labels const&, 
+template void
+contract(TenRefc, Labels const&,
+         TenRefc, Labels const&,
          TenRef , Labels const&,
         Real,Real);
-template void 
-contract(TenRefc, Labels const&, 
-         TenRefc, Labels const&, 
+*/
+template void
+contract(TenRefc, Labels const&,
+         TenRefc, Labels const&,
          TenRef , Labels const&,
          Real,Real);
 
@@ -913,12 +917,12 @@ struct MultInfo
     bool tA = false,
          tB = false,
          Bfirst = false;
-    MultInfo() {} 
+    MultInfo() {}
     };
 
 MultInfo static
 computeMultInfo(Labels const& ai,
-                Labels const& bi, 
+                Labels const& bi,
                 Labels const& ci)
     {
     MultInfo I;
@@ -982,9 +986,9 @@ computeMultInfo(Labels const& ai,
     }
 
 template
-void 
-contractloop(TenRefc A, Labels const& ai, 
-             TenRefc B, Labels const& bi, 
+void
+contractloop(TenRefc A, Labels const& ai,
+             TenRefc B, Labels const& bi,
              TenRef C, Labels const& ci,
              Args const& args)
     {
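// Sketch, not part of the patch: the CABqueue hunks above keep the existing
// scheduling policy. All tasks writing to the same destination offset (offC)
// are bound to one thread, and offsets are dealt out round-robin, so no two
// threads ever write the same block of C. A condensed version of that
// pattern with plain std::function jobs standing in for the real ABoffC
// tasks (a simplification made for illustration):
#include <functional>
#include <future>
#include <map>
#include <vector>

inline void
runRoundRobin(std::map<int,std::vector<std::function<void()>>>& subtask,
              int numthread)
    {
    //Deal task groups to threads; same offC key -> same thread
    std::vector<std::vector<std::function<void()>>> threadtask(numthread);
    int ss = 0;
    for(auto& t : subtask)
        {
        auto& tt = threadtask[ss%numthread];
        tt.insert(tt.end(),t.second.begin(),t.second.end());
        ++ss;
        }
    //Launch one future per thread, then wait for all of them
    std::vector<std::future<void>> futs(numthread);
    for(int i = 0; i < numthread; ++i)
        futs[i] = std::async(std::launch::async,
                  [&tt = threadtask[i]]()
                      {
                      for(auto& task : tt) task();
                      });
    for(auto& ft : futs) ft.wait();
    }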
@@ -1013,14 +1017,14 @@ contractloop(TenRefc A, Labels const& ai,
 
     auto nfo = computeMultInfo(ai,bi,ci);
 
-    auto Arow = A.extent(0); 
+    auto Arow = A.extent(0);
     auto Acol = A.extent(1);
-    auto Brow = B.extent(0); 
+    auto Brow = B.extent(0);
     auto Bcol = B.extent(1);
-    auto Crow = C.extent(0); 
+    auto Crow = C.extent(0);
     auto Ccol = C.extent(1);
 
-    detail::GCounter couA(ra), 
+    detail::GCounter couA(ra),
                      couB(rb);
     //Keep couA.i[0] and couA.i[1] fixed at 0
     couA.setRange(0,0,0);
@@ -1035,8 +1039,8 @@ contractloop(TenRefc A, Labels const& ai,
     for(int j = 2; j < rb; ++j)
         couB.setRange(j,0,B.extent(j)-1);
 
-    Labels aind(ra,0), 
-           bind(rb,0), 
+    Labels aind(ra,0),
+           bind(rb,0),
            cind(rc,0);
 
     CABqueue cabq;
@@ -1067,13 +1071,13 @@ contractloop(TenRefc A, Labels const& ai,
             cind[p.AtoC(ia)] = ival;
             }
         }
-    
+
     for(;couB.notDone(); ++couB)
         {
         for(int ib = 2; ib < rb; ++ib)
             {
             bind[ib] = couB[ib];
-            if(p.BtoC(ib) != -1) 
+            if(p.BtoC(ib) != -1)
                 if(!p.contractedB(ib))
                     cind[p.BtoC(ib)] = couB[ib];
             }
@@ -1100,15 +1104,15 @@ contractloop(TenRefc A, Labels const& ai,
     cabq.run(nthread);
     }
 
 template
-void 
-contractloop(TenRefc A, Labels const& ai, 
-             TenRefc B, Labels const& bi, 
+void
+contractloop(TenRefc A, Labels const& ai,
+             TenRefc B, Labels const& bi,
              TenRef C, Labels const& ci,
             Args const& args);
 template
-void 
-contractloop(TenRefc A, Labels const& ai, 
-             TenRefc B, Labels const& bi, 
+void
+contractloop(TenRefc A, Labels const& ai,
+             TenRefc B, Labels const& bi,
              TenRef C, Labels const& ci,
              Args const& args);
 
@@ -1118,29 +1122,29 @@ contractloop(TenRefc A, Labels const& ai,
 
 //All indices of B contracted
 //(A can have some uncontracted indices)
 template
-void 
-contractDiagFull(VectorRefc A, Labels const& ai, 
-                 TenRefc B, Labels const& bi, 
+void
+contractDiagFull(VectorRefc A, Labels const& ai,
+                 TenRefc B, Labels const& bi,
                  VectorRef C, Labels const& ci)
     {
     Error("contractDiagFull not yet implemented");
     }
 
 template
-void 
-contractDiagFull(VectorRefc A, Labels const& ai, 
-                 TenRefc B, Labels const& bi, 
+void
+contractDiagFull(VectorRefc A, Labels const& ai,
+                 TenRefc B, Labels const& bi,
                  VectorRef C, Labels const& ci);
 template
-void 
-contractDiagFull(VectorRefc A, Labels const& ai, 
-                 TenRefc B, Labels const& bi, 
+void
+contractDiagFull(VectorRefc A, Labels const& ai,
+                 TenRefc B, Labels const& bi,
                  VectorRef C, Labels const& ci);
 
 ////////////////////////////////////////////
 //
 // Non-Contracting Product Optimization Ideas
-// 
+//
 // o Identify common index to A,B,C with largest
 //   extent and make this the innermost loop
 //   (similar to big index in transform method in ten_impl.h)
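// Sketch, not part of the patch: the first idea in the list above is a loop
// ordering rule. When A, B and C share a common non-contracted index, putting
// the shared index with the largest extent innermost lets the kernel stream
// through memory contiguously. A toy rank-2 non-contracting product, with
// row-major layouts and the names below assumed purely for illustration:
#include <cstddef>
#include <vector>

inline void
nonContractingProduct(std::vector<double> const& A,
                      std::vector<double> const& B,
                      std::vector<double>& C,
                      std::size_t ni,
                      std::size_t nj) //nj = the large shared index
    {
    for(std::size_t i = 0; i < ni; ++i)
        for(std::size_t j = 0; j < nj; ++j) //innermost: largest extent
            C[i*nj+j] = A[i*nj+j] * B[i*nj+j];
    }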