diff --git a/.github/workflows/ci-test.yml b/.github/workflows/ci-test.yml index be9fed1..0978da1 100644 --- a/.github/workflows/ci-test.yml +++ b/.github/workflows/ci-test.yml @@ -19,7 +19,7 @@ jobs: fail-fast: false matrix: - version: ['1.1', '1.2', '1.3', '1.4', '1.5', 'nightly'] + version: ['1.5', '1.6', '1.7', 'nightly'] os: [ubuntu-latest] include: diff --git a/Project.toml b/Project.toml index 2f25435..5f8968a 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "SetIntersectionProjection" uuid = "335f7d24-6316-57dd-9c3a-df470f2b739e" authors = ["Bas Peters <1.bas.peters@gmail.com>"] -version = "0.2.2" +version = "0.2.3" [deps] Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" @@ -21,7 +21,7 @@ TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" [compat] DistributedArrays = "0.6" Interpolations = "0.13" -JOLI = "0.7" +JOLI = "0.7,0.8" Parameters = "0.12" SortingAlgorithms = "1" TimerOutputs = "0.5.15" diff --git a/examples/ConstraintSetupExamples.jl b/examples/ConstraintSetupExamples.jl index 4d32496..72b04ed 100644 --- a/examples/ConstraintSetupExamples.jl +++ b/examples/ConstraintSetupExamples.jl @@ -27,16 +27,16 @@ else #error("download escalator video from http://cvxr.com/tfocs/demos/rpca/escalator_data.mat") end -mtrue = read(file, "X") -n1 = convert(Integer,read(file, "m")) -n2 = convert(Integer,read(file, "n")) -m_mat = convert(Array{TF,2},mtrue) -m_tensor = convert(Array{TF,3},reshape(mtrue,n1,n2,Integer(200))) +mtrue = read(file, "X"); +n1 = convert(Integer,read(file, "m")); +n2 = convert(Integer,read(file, "n")); +m_mat = convert(Array{TF,2},mtrue); +m_tensor = convert(Array{TF,3},reshape(mtrue,n1,n2,Integer(200))); #computational grid for the video -comp_grid = compgrid((1f0,1f0,1f0),(size(m_tensor,1),size(m_tensor,2), size(m_tensor,3))) +comp_grid = compgrid((1f0,1f0,1f0),(size(m_tensor,1),size(m_tensor,2), size(m_tensor,3))); -comp_grid_time_slice = compgrid((1f0,1f0),(size(m_tensor,1),size(m_tensor,2))) +comp_grid_time_slice = compgrid((1f0,1f0),(size(m_tensor,1),size(m_tensor,2))); ###################################################################### @@ -52,7 +52,7 @@ options = PARSDMM_options() #l1 (total variation) constraints (in one direction) #find a reasonable value for the l1-ball -(TD_OP, AtA_diag, dense, TD_n) = get_TD_operator(comp_grid,"D_z",options.FL) +(TD_OP, AtA_diag, dense, TD_n) = get_TD_operator(comp_grid,"D_z",options.FL); TV_z = norm(TD_OP*vec(m_tensor),1) m_min = 0.0 @@ -61,15 +61,15 @@ set_type = "l1" TD_OP = "D_z" app_mode = ("tensor","") custom_TD_OP = ([],false) -push!(constraint, set_definitions(set_type,TD_OP,m_min,m_max,app_mode,custom_TD_OP)) +push!(constraint, set_definitions(set_type,TD_OP,m_min,m_max,app_mode,custom_TD_OP)); -(P_sub,TD_OP,set_Prop) = setup_constraints(constraint,comp_grid,options.FL) -(TD_OP,AtA,l,y) = PARSDMM_precompute_distribute(TD_OP,set_Prop,comp_grid,options) +(P_sub,TD_OP,set_Prop) = setup_constraints(constraint,comp_grid,options.FL); +(TD_OP,AtA,l,y) = PARSDMM_precompute_distribute(TD_OP,set_Prop,comp_grid,options); @time (x,log_PARSDMM) = PARSDMM(vec(m_tensor),AtA,TD_OP,set_Prop,P_sub,comp_grid,options); @time (x,log_PARSDMM) = PARSDMM(vec(m_tensor),AtA,TD_OP,set_Prop,P_sub,comp_grid,options); -m_proj = reshape(x,comp_grid.n) +m_proj = reshape(x,comp_grid.n); figure(); for i=1:comp_grid.n[3] @@ -83,23 +83,23 @@ end ######## (anisotropic) total-variation on the time-derivative ######## ######## using JOLI operators ######## -options = PARSDMM_options() +options = PARSDMM_options(); #TV 
operator per time-slice -(TV, AtA_diag, dense, TD_n) = get_TD_operator(comp_grid_time_slice,"TV",options.FL) +(TV, AtA_diag, dense, TD_n) = get_TD_operator(comp_grid_time_slice,"TV",options.FL); #time derivative over the time-slices -(D, AtA_diag, dense, TD_n) = get_TD_operator(comp_grid,"D_z",options.FL) +(D, AtA_diag, dense, TD_n) = get_TD_operator(comp_grid,"D_z",options.FL); -CustomOP_explicit_sparse = kron(TV, SparseMatrixCSC{TF}(LinearAlgebra.I, comp_grid.n[3]-1,comp_grid.n[3]-1))* D +CustomOP_explicit_sparse = kron(TV, SparseMatrixCSC{TF}(LinearAlgebra.I, comp_grid.n[3]-1,comp_grid.n[3]-1))* D; -D = joMatrix(D) -TV = joMatrix(TV) -CustomOP_JOLI = joKron(TV, joEye(comp_grid.n[3]-1,DDT=Float32,RDT=Float32))* D +D = joMatrix(D); +TV = joMatrix(TV); +CustomOP_JOLI = joKron(TV, joEye(comp_grid.n[3]-1,DDT=Float32,RDT=Float32))* D; ## Solve using JOLI ## #initialize constraints - constraint = Vector{SetIntersectionProjection.set_definitions}() + constraint = Vector{SetIntersectionProjection.set_definitions}(); m_min = 0.0 m_max = 0.1*norm(CustomOP_JOLI*vec(m_tensor),1) @@ -122,7 +122,7 @@ CustomOP_JOLI = joKron(TV, joEye(comp_grid.n[3]-1,DDT=Float32,RDT=Float32))* D @time (x_joli,log_PARSDMM) = PARSDMM(vec(m_tensor),AtA,TD_OP,set_Prop,P_sub,comp_grid,options); ## solve using explicit sparse array ## - constraint = Vector{SetIntersectionProjection.set_definitions}() + constraint = Vector{SetIntersectionProjection.set_definitions}(); m_min = 0.0 m_max = 0.025*norm(CustomOP_explicit_sparse*vec(m_tensor),1) @@ -130,17 +130,17 @@ CustomOP_JOLI = joKron(TV, joEye(comp_grid.n[3]-1,DDT=Float32,RDT=Float32))* D TD_OP = "identity" app_mode = ("matrix","") custom_TD_OP = (CustomOP_explicit_sparse,false) - push!(constraint, set_definitions(set_type,TD_OP,m_min,m_max,app_mode,custom_TD_OP)) + push!(constraint, set_definitions(set_type,TD_OP,m_min,m_max,app_mode,custom_TD_OP)); options=PARSDMM_options() - (P_sub,TD_OP,set_Prop) = setup_constraints(constraint,comp_grid,options.FL) + (P_sub,TD_OP,set_Prop) = setup_constraints(constraint,comp_grid,options.FL); #set properties of custom operator set_Prop.AtA_diag[1] = false set_Prop.dense[1] = false set_Prop.banded[1] = true - (TD_OP,AtA,l,y) = PARSDMM_precompute_distribute(TD_OP,set_Prop,comp_grid,options) + (TD_OP,AtA,l,y) = PARSDMM_precompute_distribute(TD_OP,set_Prop,comp_grid,options); @time (x_sp,log_PARSDMM) = PARSDMM(vec(m_tensor),AtA,TD_OP,set_Prop,P_sub,comp_grid,options); @time (x_sp,log_PARSDMM) = PARSDMM(vec(m_tensor),AtA,TD_OP,set_Prop,P_sub,comp_grid,options); diff --git a/examples/projection_intersection_2D.jl b/examples/projection_intersection_2D.jl index 36119c4..8a58bed 100755 --- a/examples/projection_intersection_2D.jl +++ b/examples/projection_intersection_2D.jl @@ -114,7 +114,7 @@ options.parallel = true @time (x2,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options); @time (x2,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options); -#@time (x,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options); +@time (x,log_PARSDMM) = PARSDMM(m,AtA,TD_OP,set_Prop,P_sub,comp_grid,options); #plot figure(); @@ -125,27 +125,30 @@ savefig("projected_model_ParallelPARSDMM.png",bbox_inches="tight") #print timings in terminal log_PARSDMM.timing -# #use multilevel-serial (2-levels) -# options.parallel = false +#use multilevel-serial (2-levels) +options.parallel = false -# #2 levels, the gird point spacing at level 2 is 3X that of the original (level 1) grid -# n_levels = 2 -# coarsening_factor = 3 +#2 levels, the 
grid point spacing at level 2 is 3X that of the original (level 1) grid +n_levels = 2 +coarsening_factor = 3 -# #set up all required quantities for each level -# #(m_levels,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels)=setup_multi_level_PARSDMM(m,n_levels,coarsening_factor,comp_grid,constraint,options) -# (TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels)=setup_multi_level_PARSDMM(m,n_levels,coarsening_factor,comp_grid,constraint,options) +#set up all required quantities for each level +#(m_levels,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels)=setup_multi_level_PARSDMM(m,n_levels,coarsening_factor,comp_grid,constraint,options) +(TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels)=setup_multi_level_PARSDMM(m,n_levels,coarsening_factor,comp_grid,constraint,options) -# println("") -# println("PARSDMM multilevel-serial (bounds and bounds on D_z):") -# @time (x,log_PARSDMM) = PARSDMM_multi_level(m,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels,options); -# @time (x,log_PARSDMM) = PARSDMM_multi_level(m,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels,options); -# @time (x,log_PARSDMM) = PARSDMM_multi_level(m,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels,options); +println("") +println("PARSDMM multilevel-serial (bounds and bounds on D_z):") +@time (x,log_PARSDMM) = PARSDMM_multi_level(m,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels,options); +@time (x,log_PARSDMM) = PARSDMM_multi_level(m,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels,options); +@time (x,log_PARSDMM) = PARSDMM_multi_level(m,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels,options); -# figure(); -# subplot(2,1,1);imshow(permutedims(reshape(m,(comp_grid.n[1],comp_grid.n[2])),[2,1]),cmap="jet",vmin=vmi,vmax=vma,extent=[0, xmax, zmax, 0]); title("model to project") -# subplot(2,1,2);imshow(permutedims(reshape(x,(comp_grid.n[1],comp_grid.n[2])),[2,1]),cmap="jet",vmin=vmi,vmax=vma,extent=[0, xmax, zmax, 0]); title("Projection (bounds and bounds on D_z)") -# savefig("projected_model_MultilevelSerialPARSDMM.png",bbox_inches="tight") +figure(); +subplot(2,1,1);imshow(permutedims(reshape(m,(comp_grid.n[1],comp_grid.n[2])),[2,1]),cmap="jet",vmin=vmi,vmax=vma,extent=[0, xmax, zmax, 0]); title("model to project") +subplot(2,1,2);imshow(permutedims(reshape(x,(comp_grid.n[1],comp_grid.n[2])),[2,1]),cmap="jet",vmin=vmi,vmax=vma,extent=[0, xmax, zmax, 0]); title("Projection (bounds and bounds on D_z)") +savefig("projected_model_MultilevelSerialPARSDMM.png",bbox_inches="tight") + +#print timings in the terminal; note that for the multi-level version, the timing covers only the final level on the original grid +log_PARSDMM.timing # #now use multi-level with parallel PARSDMM # options.parallel=true diff --git a/examples/projection_intersection_3D.jl b/examples/projection_intersection_3D.jl index 8e10726..073ef6a 100644 --- a/examples/projection_intersection_3D.jl +++ b/examples/projection_intersection_3D.jl @@ -102,13 +102,13 @@ subplot(3, 3, 8);plot(log_PARSDMM.gamma) ;title("gamma") subplot(3, 3, 9);semilogy(log_PARSDMM.evol_x) ;title("x evolution") #plot -m_plot = reshape(m,comp_grid.n) +m_plot = reshape(m,comp_grid.n); figure(); subplot(3,1,1);imshow(m_plot[:,:,Int64(round(comp_grid.n[3]/2))],cmap="jet",vmin=vmi,vmax=vma,extent=[0, xmax, ymax, 0]); title("model to project x-y slice")
subplot(3,1,2);imshow(permutedims(m_plot[:,Int64(round(comp_grid.n[2]/2)),:],[2,1]),cmap="jet",vmin=vmi,vmax=vma,extent=[0, xmax, zmax, 0]); title("model to project x-z slice") subplot(3,1,3);imshow(permutedims(m_plot[Int64(round(comp_grid.n[1]/2)),:,:],[2,1]),cmap="jet",vmin=vmi,vmax=vma,extent=[0, ymax, zmax, 0]); title("model to project y-z slice") -x_plot = reshape(x,comp_grid.n) +x_plot = reshape(x,comp_grid.n); figure(); subplot(3,1,1);imshow(x_plot[:,:,Int64(round(comp_grid.n[3]/2))],cmap="jet",vmin=vmi,vmax=vma,extent=[0, xmax, ymax, 0]); title("projected model x-y slice") subplot(3,1,2);imshow(permutedims(x_plot[:,Int64(round(comp_grid.n[2]/2)),:],[2,1]),cmap="jet",vmin=vmi,vmax=vma,extent=[0, xmax, zmax, 0]); title("projected model x-z slice") @@ -121,13 +121,13 @@ n_levels = 2 coarsening_factor = 3 #set up all required quantities for each level -(TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels) = setup_multi_level_PARSDMM(m,n_levels,coarsening_factor,comp_grid,constraint,options) +(TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels) = setup_multi_level_PARSDMM(m,n_levels,coarsening_factor,comp_grid,constraint,options); println("") println("PARSDMM multilevel-serial (bounds and bounds on D_z):") -@time (x,log_PARSDMM) = PARSDMM_multi_level(m,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels,options) -@time (x,log_PARSDMM) = PARSDMM_multi_level(m,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels,options) -@time (x,log_PARSDMM) = PARSDMM_multi_level(m,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels,options) +@time (x,log_PARSDMM) = PARSDMM_multi_level(m,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels,options); +@time (x,log_PARSDMM) = PARSDMM_multi_level(m,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels,options); +@time (x,log_PARSDMM) = PARSDMM_multi_level(m,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels,options); #parallel single level println("") @@ -147,7 +147,7 @@ n_levels = 2 coarsening_factor = 3 #set up all required quantities for each level -(TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels) = setup_multi_level_PARSDMM(m,n_levels,coarsening_factor,comp_grid,constraint,options) +(TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels) = setup_multi_level_PARSDMM(m,n_levels,coarsening_factor,comp_grid,constraint,options); println("PARSDMM multilevel-parallel (bounds and bounds on D_z):") @time (x,log_PARSDMM) = PARSDMM_multi_level(m,TD_OP_levels,AtA_levels,P_sub_levels,set_Prop_levels,comp_grid_levels,options); diff --git a/src/CDS_MVp.jl b/src/CDS_MVp.jl index 4f5c190..e06b7ac 100755 --- a/src/CDS_MVp.jl +++ b/src/CDS_MVp.jl @@ -1,4 +1,4 @@ -export CDS_MVp, CDS_MVp2, CDS_MVp3, CDS_MVp4 +export CDS_MVp """ compute single-thread matrix vector product with vector x, output is vector y: y=A*x @@ -19,9 +19,9 @@ function CDS_MVp( r0 = max(1, 1-d) r1 = min(N, N-d) c0 = max(1, 1+d) - for r = r0 : r1 - c = r - r0 + c0 #original - @inbounds y[r] = y[r] + R[r,i] * x[c]#original + for r = r0 : r1 + c = r - r0 + c0 #original + @inbounds y[r] = y[r] + R[r,i] * x[c]#original end end return y @@ -51,7 +51,7 @@ end # return y # end -# function CDS_MVp( +# function CDS_MVp2( # N ::Integer, # ndiags ::Integer, # R ::Array{TF,2}, diff --git a/src/PARSDMM.jl b/src/PARSDMM.jl index 12d1a2e..b0d0f09 100755 --- a/src/PARSDMM.jl +++ b/src/PARSDMM.jl @@ -104,7 +104,7 @@ for i=1:maxit #main loop # 
x-minimization @timeit to "argmin x" begin copy!(x_old,x); - (x,iter,relres,x_solve_tol_ref) = argmin_x(Q,rhs,x,x_solve_tol_ref,i,log_PARSDMM,Q_offsets,Ax_out) + (x,iter,relres,x_solve_tol_ref) = argmin_x(Q,rhs,x,x_solve_tol_ref,i,log_PARSDMM,Q_offsets,Ax_out,comp_grid) log_PARSDMM.cg_it[i] = iter log_PARSDMM.cg_relres[i] = relres end #end timer for argmin x diff --git a/src/PARSDMM_initialize.jl b/src/PARSDMM_initialize.jl index 28efcc4..9829cdd 100755 --- a/src/PARSDMM_initialize.jl +++ b/src/PARSDMM_initialize.jl @@ -86,10 +86,13 @@ function PARSDMM_initialize( m = [m ; zeros(TF,length(m)) ] end if parallel - #f_compute_rel_feas = (feasibility_initial,TD_OP,P_sub) -> compute_relative_feasibility(m,feasibility_initial,TD_OP,P_sub) - #[@spawnat pid f_compute_rel_feas for pid in 2:nworkers()] - #feasibility_initial = pmap(f_compute_rel_feas, feasibility_initial,TD_OP,P_sub; distributed=true, batch_size=1, on_error=nothing, retry_delays=[], retry_check=nothing) - feasibility_initial = pmap((feasibility_initial,TD_OP,P_sub) -> compute_relative_feasibility(m,feasibility_initial,TD_OP,P_sub) , feasibility_initial,TD_OP,P_sub; distributed=true, batch_size=1, on_error=nothing, retry_delays=[], retry_check=nothing) + feasibility_initial = distribute(feasibility_initial) + [@sync @spawnat pid m for pid in P_sub.pids] + [@sync @spawnat pid compute_relative_feasibility(m,feasibility_initial[:L],TD_OP[:L],P_sub[:L]) for pid in P_sub.pids] + feasibility_initial = convert(Vector{TF},feasibility_initial) + + #using pmap + #feasibility_initial = pmap((feasibility_initial,TD_OP,P_sub) -> compute_relative_feasibility(m,feasibility_initial,TD_OP,P_sub) , feasibility_initial,TD_OP,P_sub; distributed=true, batch_size=1, on_error=nothing, retry_delays=[], retry_check=nothing) else for ii = 1:length(P_sub) feasibility_initial[ii] = norm(P_sub[ii](TD_OP[ii]*m) .- TD_OP[ii]*m) ./ (norm(TD_OP[ii]*m)+(100*eps(TF))); diff --git a/src/SetIntersectionProjection.jl b/src/SetIntersectionProjection.jl index 9048202..79751d3 100644 --- a/src/SetIntersectionProjection.jl +++ b/src/SetIntersectionProjection.jl @@ -15,6 +15,9 @@ using JOLI using JOLI.FFTW, JOLI.Wavelets using SortingAlgorithms using TimerOutputs +# using Flux +# using NNlib +# using CUDA export log_type_PARSDMM, set_properties, PARSDMM_options, set_definitions @@ -39,6 +42,7 @@ include("CDS_MVp.jl") include("CDS_MVp_MT.jl") include("CDS_MVp_MT_subfunc.jl") include("CDS_scaled_add!.jl") +#include("CDS2stencil.jl") #scripts for parallelism include("update_y_l_parallel.jl") @@ -53,7 +57,7 @@ include("setup_multi_level_PARSDMM.jl") include("constraint2coarse.jl") include("interpolate_y_l.jl") -#scripts for setting up constraints, projetors, linear operators +#scripts for setting up constraints, projectors, linear operators include("default_PARSDMM_options.jl") include("convert_options!.jl") include("get_discrete_Grad.jl"); @@ -97,13 +101,6 @@ mutable struct log_type_PARSDMM cg_it :: Vector{Integer} cg_relres :: Vector{Real} timing :: TimerOutput - # T_cg :: Real - # T_stop :: Real - # T_ini :: Real - # T_rhs :: Real - # T_adjust_rho_gamma:: Real - # T_y_l_upd :: Real - # T_Q_upd :: Real end @with_kw mutable struct PARSDMM_options diff --git a/src/argmin_x.jl b/src/argmin_x.jl index dd63247..663ce76 100755 --- a/src/argmin_x.jl +++ b/src/argmin_x.jl @@ -11,7 +11,8 @@ function argmin_x( i ::Integer, log_PARSDMM, Q_offsets=[], - Ax_out=zeros(TF,length(x)) ::Vector{TF} + Ax_out=zeros(TF,length(x)) ::Vector{TF}, + comp_grid=[] ) where {TF<:Real} #Initialize @@ -20,8 
+21,14 @@ function argmin_x( iter = 0 if typeof(Q)==Array{TF,2} #set up multi-threaded matrix-vector product in compressed diagonal storage format - Af1(in) = (fill!(Ax_out,TF(0)); CDS_MVp_MT(size(Q,1),size(Q,2),Q,Q_offsets,in,Ax_out); return Ax_out) + + # w = CDS2stencil(Q, Q_offsets, comp_grid.n) + # w = w |> gpu + # cdims = DenseConvDims(reshape(x, comp_grid.n ...,1,1), w, padding=Int(size(w,1)-1)/2) + Af1(in) = Ax_CDS_MT(in, Ax_out, Q, Q_offsets) + #Af1(in) = Ax_stencil_cuda(in, Ax_out, comp_grid, w, cdims) + #determine what relative residual CG needs to reach if i<3 #i is the PARSDMM iteration counter x_solve_tol_ref = TF(max(0.1*norm(Af1(x)-rhs)/norm(rhs),10*eps(TF))) #10% of current relative residual @@ -30,6 +37,7 @@ function argmin_x( end (x,flag,relres,iter) = cg(Af1,rhs,tol=x_solve_tol_ref,maxIter=1000,x=x,out=0); + #(x,flag,relres,iter) = cg(Q,Q_offsets,rhs, x_solve_tol_ref, 1000, x, 0) elseif typeof(Q)==SparseMatrixCSC{TF,Int64} #CG with native Julia sparse CSC format MPVs @@ -60,3 +68,29 @@ function argmin_x( return x,iter,relres,x_solve_tol_ref end + +function Ax_CDS_MT(in::Vector{TF}, Ax_out::Vector{TF}, Q::Array{TF,2}, Q_offsets::Vector{Int}) where TF <: Real + + fill!(Ax_out,TF(0)) + CDS_MVp_MT(size(Q,1),size(Q,2),Q,Q_offsets,in,Ax_out); + + return Ax_out +end + +# function Ax_stencil_cuda(in::Vector{TF}, Ax_out::Vector{TF}, comp_grid, w::CUDA.CuArray{TF, 5}, cdims) where TF <: Real +# in = in |> gpu +# Ax_out = Ax_out |>gpu +# fill!(Ax_out,TF(0)) +# Ax_out = reshape(Ax_out,comp_grid.n ...,1,1) +# in = reshape(in,comp_grid.n ...,1,1) +# conv!(Ax_out, in, w, cdims) + +# # #for now, boundary conditions are an issue, manually correct it +# # kernel_size = size(w,1) + + +# Ax_out = Ax_out |> cpu + +# return vec(Ax_out) + +# end diff --git a/src/cg.jl b/src/cg.jl index eb92023..43db4d7 100644 --- a/src/cg.jl +++ b/src/cg.jl @@ -41,11 +41,10 @@ Output: """ # function cg(A::Function,b::Vector; tol::Real=1e-2,maxIter::Integer=100,M::Function=identity,x::Vector=[],out::Integer=0, # storeInterm::Bool=false) -function cg(A::Function,b::Vector{TF}; tol::Real=1e-2,maxIter::Integer=100,M::Function=identity,x::Vector=[],out::Integer=0, - storeInterm::Bool=false) where {TF <: Real} +function cg(A::Function,b::Vector{TF}; tol::Real=1e-2,maxIter::Integer=100,M::Function=identity,x::Vector=[],out::Integer=0) where {TF <: Real} n = length(b) - if norm(b)==0; return zeros(eltype(b),n),-9,0.0,0,[0.0]; end + if norm(b)==0; return zeros(eltype(b),n), -9, 0f0, 0, [0f0]; end if isempty(x) x = zeros(TF,n)#x = zeros(eltype(b),n) r = copy(b) @@ -55,9 +54,9 @@ function cg(A::Function,b::Vector{TF}; tol::Real=1e-2,maxIter::Integer=100,M::Fu z = M(r)::Vector{TF} p = copy(z)::Vector{TF} - if storeInterm - X = zeros(n,maxIter) # allocate space for intermediates - end + # if storeInterm + # X = zeros(n,maxIter) # allocate space for intermediates + # end nr0 = norm(b) @@ -125,9 +124,85 @@ function cg(A::Function,b::Vector{TF}; tol::Real=1e-2,maxIter::Integer=100,M::Fu println("cg achieved desired tolerance at iteration %d. 
Residual norm is %1.2e.",lastIter,resvec[lastIter]) end end - if storeInterm - return X[:,1:lastIter],flag,resvec[lastIter],lastIter,resvec[1:lastIter] - else - return x,flag,resvec[lastIter],lastIter,resvec[1:lastIter] - end + return x,flag,resvec[lastIter],lastIter,resvec[1:lastIter] end + +#specifically for CDS/DIA storage matrices +# function cg(R::Matrix{TF},offset::Vector{TI},b::Vector{TF},tol::Real,maxIter::Integer,x::Vector{TF},out::Integer) where {TF <: Real, TI <: Integer} +# n = length(b) +# ndiags = size(R,2) +# r = zeros(TF,n) + +# if norm(b)==0; return zeros(eltype(b),n),-9,0.0,0,[0.0]; end + +# CDS_MVp_MT(n,ndiags,R,offset,x,r) +# r .= -r .+ b +# #r = b .- A(x)::Vector{TF} + +# z = copy(r) + +# p = copy(r)::Vector{TF} + +# nr0 = norm(b) + +# resvec = zeros(TF,maxIter) +# iter = 1 # makes iter available outside the loop +# flag = -1 + +# if norm(r)/nr0<=tol +# flag = 0; +# return x,flag,resvec[iter],iter,resvec[1:iter] +# end + +# alpha = TF(0) +# beta = TF(0) +# lastIter = 0 +# Ap = zeros(TF,n) +# for iter=1:maxIter +# lastIter = iter + +# #Ap = A(p)::Vector{TF} +# fill!(Ap,TF(0)) +# CDS_MVp_MT(n,ndiags,R,offset,p,Ap) +# gamma = dot(r,z) +# #gamma = BLAS.dot(n, r,1,z,1)# +# alpha = gamma / dot(p,Ap) +# #alpha = gamma / BLAS.dot(n, p,1,Ap,1)#dot(p,Ap) + +# if alpha==Inf || alpha<0 +# flag = -2; break +# end + +# BLAS.axpy!(n,alpha,p,1,x,1) # x = alpha*p+x +# BLAS.axpy!(n,-alpha,Ap,1,r,1) # r -= alpha*Ap + +# #resvec[iter] = BLAS.nrm2(n, r, 1) / nr0# +# resvec[iter] = norm(r)/nr0 +# if resvec[iter] <= tol +# flag = 0; break +# end + +# #z = M(r)::Vector{TF} +# z = copy(r) +# #beta = BLAS.dot(n, z,1,r,1) / gamma +# beta = dot(z,r) / gamma +# # the following two lines are equivalent to p = z + beta*p +# #p = BLAS.scal!(n,beta,p,1)::Vector{TF} +# #p = BLAS.axpy!(n,TF(1.0),z,1,p,1)::Vector{TF} +# p = BLAS.axpby!(TF(1.0), z, beta, p)::Vector{TF} # Overwrite Y with X*a + Y*b, where a and b are scalars. Return Y +# end +# +# if out>=0 +# if flag==-1 +# println("cg iterated maxIter (=%d) times but reached only residual norm %1.2e instead of tol=%1.2e.", +# maxIter,resvec[lastIter],tol) +# elseif flag==-2 +# println("Matrix A in cg has to be positive definite.") +# elseif flag==0 && out>=1 +# println("cg achieved desired tolerance at iteration %d. 
Residual norm is %1.2e.",lastIter,resvec[lastIter]) +# end +# end + +# return x,flag,resvec[lastIter],lastIter,resvec[1:lastIter] + +# end diff --git a/src/compute_relative_feasibility.jl b/src/compute_relative_feasibility.jl index 45b8282..bb2c2d4 100644 --- a/src/compute_relative_feasibility.jl +++ b/src/compute_relative_feasibility.jl @@ -6,19 +6,19 @@ r_feas = ||P(A*x)-A*x||/||A*x|| Helper function that is intended for the parallel version only, where just a single operator is sent to a worker where this script runs """ -# function compute_relative_feasibility(m::Vector{TF}, feasibility_initial::Vector{TF}, -# TD_OP::Vector{Union{SparseMatrixCSC{TF,TI},JOLI.joAbstractLinearOperator{TF,TF}}}, -# P_sub) where {TF<:Real,TI<:Integer} -function compute_relative_feasibility(m::Vector{TF},feasibility_initial::TF, - TD_OP::Union{SparseMatrixCSC{TF,TI},JOLI.joAbstractLinearOperator{TF,TF}}, - P_sub) where {TF<:Real,TI<:Integer} +function compute_relative_feasibility(m::Vector{TF}, feasibility_initial::Vector{TF}, + TD_OP::Vector{Union{SparseMatrixCSC{TF,TI},JOLI.joAbstractLinearOperator{TF,TF}}}, + P_sub) where {TF<:Real,TI<:Integer} +# function compute_relative_feasibility(m::Vector{TF},feasibility_initial::TF, +# TD_OP::Union{SparseMatrixCSC{TF,TI},JOLI.joAbstractLinearOperator{TF,TF}}, +# P_sub) where {TF<:Real,TI<:Integer} #note that P_sub mutates the input in-place - s_temp = TD_OP*m - norm(P_sub(deepcopy(s_temp)) .- s_temp) + s_temp = TD_OP[1]*m + norm(P_sub[1](deepcopy(s_temp)) .- s_temp) # feasibility_initial[1]=norm(P_sub[1](pm) .- pm) ./ (norm(pm)+(100*eps(TF))) - feasibility_initial = norm(P_sub(deepcopy(s_temp)) .- s_temp) ./ (norm(s_temp)+(100*eps(TF))) + feasibility_initial[1] = norm(P_sub[1](deepcopy(s_temp)) .- s_temp) ./ (norm(s_temp)+(100*eps(TF))) s_temp = [] - #gc()# check if we need this in julia >0.7 - return feasibility_initial + + #return feasibility_initial end
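Note on the compressed-diagonal-storage kernel touched in src/CDS_MVp.jl: the snippet below is an editorial sketch only, not part of the patch. It re-implements the diagonal loop from CDS_MVp on a small hand-built banded matrix and checks it against a sparse reference product; the names N, offsets, R, x and y are local to this example.

# sketch: CDS/DIA matrix-vector product on a toy tridiagonal matrix (assumption: not package code)
using LinearAlgebra, SparseArrays

N       = 6
offsets = [-1, 0, 1]                       # sub-, main- and super-diagonal
A       = spdiagm(-1 => fill(2f0, N-1), 0 => fill(4f0, N), 1 => fill(1f0, N-1))

# pack each diagonal d = offsets[i] into column i of R (CDS/DIA layout, row-indexed as in CDS_MVp)
R = zeros(Float32, N, length(offsets))
for (i, d) in enumerate(offsets), r in max(1, 1-d):min(N, N-d)
    R[r, i] = A[r, r+d]
end

# same loop structure as CDS_MVp: for each stored diagonal, y[r] += R[r,i] * x[r+d]
x = randn(Float32, N)
y = zeros(Float32, N)
for (i, d) in enumerate(offsets)
    r0, r1, c0 = max(1, 1-d), min(N, N-d), max(1, 1+d)
    for r = r0:r1
        y[r] += R[r, i] * x[r - r0 + c0]
    end
end

@assert y ≈ A * x                          # CDS product matches the sparse reference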
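Similarly, for the pmap-to-@spawnat change in src/PARSDMM_initialize.jl: a minimal standalone sketch (again not part of the patch) of the DistributedArrays local-part ([:L]) pattern the new code relies on. The toy array v and the doubling operation are illustrative assumptions.

# sketch: each worker updates only its own chunk of a distributed array (assumption: toy example)
using Distributed
addprocs(2)
@everywhere using DistributedArrays

v = distribute(collect(1.0:8.0))           # one chunk per worker
@sync for pid in procs(v)
    @spawnat pid (v[:L] .= 2 .* v[:L])     # [:L] is the local part on that worker
end
@assert convert(Vector{Float64}, v) == 2 .* collect(1.0:8.0)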