From 26e518fc9cb1afc8cf7d498014fdef9384ba6a34 Mon Sep 17 00:00:00 2001 From: BenjaProg Date: Wed, 14 Mar 2018 06:10:16 +0100 Subject: [PATCH] added SuperMUC subfolder, for DASH's plattform adjustments --- cowichan/SuperMUC/Makefile | 26 + cowichan/SuperMUC/Terminal_Color.h | 66 +++ cowichan/SuperMUC/benchAll.sh | 45 ++ cowichan/SuperMUC/chain/Makefile | 24 + cowichan/SuperMUC/chain/chain.cpp | 177 +++++++ cowichan/SuperMUC/make.defs | 12 + cowichan/SuperMUC/outer/Makefile | 22 + cowichan/SuperMUC/outer/outer.cpp | 129 +++++ cowichan/SuperMUC/outer/outer.h | 75 +++ cowichan/SuperMUC/product/Makefile | 22 + cowichan/SuperMUC/product/product.cpp | 109 +++++ cowichan/SuperMUC/product/product.h | 72 +++ cowichan/SuperMUC/randmat/Makefile | 24 + cowichan/SuperMUC/randmat/randmat.cpp | 110 +++++ cowichan/SuperMUC/randmat/randmat.h | 41 ++ cowichan/SuperMUC/thresh/Makefile | 22 + cowichan/SuperMUC/thresh/thresh.cpp | 146 ++++++ cowichan/SuperMUC/thresh/thresh.h | 121 +++++ cowichan/SuperMUC/winnow/Makefile | 22 + cowichan/SuperMUC/winnow/input | 24 + cowichan/SuperMUC/winnow/winnow.cpp | 193 ++++++++ cowichan/SuperMUC/winnow/winnow.h | 669 ++++++++++++++++++++++++++ 22 files changed, 2151 insertions(+) create mode 100644 cowichan/SuperMUC/Makefile create mode 100644 cowichan/SuperMUC/Terminal_Color.h create mode 100644 cowichan/SuperMUC/benchAll.sh create mode 100644 cowichan/SuperMUC/chain/Makefile create mode 100644 cowichan/SuperMUC/chain/chain.cpp create mode 100644 cowichan/SuperMUC/make.defs create mode 100644 cowichan/SuperMUC/outer/Makefile create mode 100644 cowichan/SuperMUC/outer/outer.cpp create mode 100644 cowichan/SuperMUC/outer/outer.h create mode 100644 cowichan/SuperMUC/product/Makefile create mode 100644 cowichan/SuperMUC/product/product.cpp create mode 100644 cowichan/SuperMUC/product/product.h create mode 100644 cowichan/SuperMUC/randmat/Makefile create mode 100644 cowichan/SuperMUC/randmat/randmat.cpp create mode 100644 
cowichan/SuperMUC/randmat/randmat.h create mode 100644 cowichan/SuperMUC/thresh/Makefile create mode 100644 cowichan/SuperMUC/thresh/thresh.cpp create mode 100644 cowichan/SuperMUC/thresh/thresh.h create mode 100644 cowichan/SuperMUC/winnow/Makefile create mode 100644 cowichan/SuperMUC/winnow/input create mode 100644 cowichan/SuperMUC/winnow/winnow.cpp create mode 100644 cowichan/SuperMUC/winnow/winnow.h diff --git a/cowichan/SuperMUC/Makefile b/cowichan/SuperMUC/Makefile new file mode 100644 index 0000000..2de486a --- /dev/null +++ b/cowichan/SuperMUC/Makefile @@ -0,0 +1,26 @@ +include ./make.defs + +SUBDIRS = randmat thresh winnow outer product chain + + +.PHONY: all clean all-clean +#### ------------------------------------------------------------ +all: dash + +clean: dash-clean +all-clean: clean +#################### ##################### +#################### ##################### +.PHONY: dash dash-clean $(SUBDIRS) +#### ------------------------------------------------------------ + +dash: MAKECMDGOALS = all +dash: $(SUBDIRS) + +dash-clean: MAKECMDGOALS = clean +dash-clean: $(SUBDIRS) + +$(SUBDIRS): + @$(MAKE) --no-print-directory -C $@ $(MAKECMDGOALS) +#################### ############################## + diff --git a/cowichan/SuperMUC/Terminal_Color.h b/cowichan/SuperMUC/Terminal_Color.h new file mode 100644 index 0000000..8122e9b --- /dev/null +++ b/cowichan/SuperMUC/Terminal_Color.h @@ -0,0 +1,66 @@ +#ifndef TERMINAL_COLOR_HEADER +#define TERMINAL_COLOR_HEADER + +#define TERMINAL_COLORS + +#define BEGIN_COLOR "\x1B[" +#define END_COLOR "\x1B[0m" + +// Color codes for colorized output on terminal +enum Code { + FBLK = 30, + FRED , + FGREEN , + FYEL , + FBLUE , + FMAG , + FCYN , + FWHT = 37, + BRED = 41, + BGREEN , + BYEL , + BBLUE , + BMAG , + BCYN , + BWHT = 47 +}; + + +// Helper struct for fmt function +template +struct sPar{ + public: + const T& in; + const Code code; + const int width; + sPar(const T& in_, const Code code_, const int width_ = 0) + 
:in(in_), code(code_), width(width_){}; +}; + +// Function for formated output and more beautiful code +template +inline const sPar fmt( const T& in, const Code code, const int width = 0 ) { + return sPar( in, code, width ); +} + +std::ostream& operator<<(std::ostream& os, const sPar& par) { + #ifdef TERMINAL_COLORS + return os << BEGIN_COLOR << par.code << "m" << std::setw(par.width) << static_cast(par.in) << END_COLOR; + #else + return os << std::setw(par.width) << static_cast(par.in); + #endif +} + +// overloading for transfer of several parameters +template +std::ostream& operator<<(std::ostream& os, const sPar& par) { + #ifdef TERMINAL_COLORS + return os << BEGIN_COLOR << par.code << "m" << std::setw(par.width) << par.in << END_COLOR; + #else + return os << std::setw(par.width) << par.in; + #endif +} + + + +#endif \ No newline at end of file diff --git a/cowichan/SuperMUC/benchAll.sh b/cowichan/SuperMUC/benchAll.sh new file mode 100644 index 0000000..c54fc01 --- /dev/null +++ b/cowichan/SuperMUC/benchAll.sh @@ -0,0 +1,45 @@ +#!/bin/bash + + +probIxStart=0 +probIxEnd=5 +jobsStart=1 +jobsEnd=24 +numberOfIterations=10 +nRowsCols=(100 400 4000 40000) +thresh=(10 25 50 75 100) +winnowNelts=(100 400 4000 40000) + + +# #bench winnows and chains +# for (( nRowsColsIX=3 ; nRowsColsIX<4 ; ++nRowsColsIX )); do + # for (( threshIX=3 ; threshIX<5 ; ++threshIX )); do + # for (( winnowNeltsIX=0 ; winnowNeltsIX<4 ; ++winnowNeltsIX )); do + # #bench winnows + # ./benchDash.sh 2 2 $jobsStart $jobsEnd $numberOfIterations ${nRowsCols[$nRowsColsIX]} ${thresh[$threshIX]} ${winnowNelts[$winnowNeltsIX]} + # #bench chains + # ./benchDash.sh 5 5 $jobsStart $jobsEnd $numberOfIterations ${nRowsCols[$nRowsColsIX]} ${thresh[$threshIX]} ${winnowNelts[$winnowNeltsIX]} + # done + # done +# done + + +# #bench randmats +# for (( nRowsColsIX=2 ; nRowsColsIX<4 ; ++nRowsColsIX )); do + # ./benchDash.sh 0 0 $jobsStart $jobsEnd $numberOfIterations ${nRowsCols[$nRowsColsIX]} 0 0 +# done + +# #bench 
threshs +# tmp=2 +# for (( nRowsColsIX=2 ; nRowsColsIX<4 ; ++nRowsColsIX )); do + # for (( threshIX=tmp ; threshIX<5 ; ++threshIX )); do + # tmp=0 + # ./benchDash.sh 1 1 $jobsStart $jobsEnd $numberOfIterations ${nRowsCols[$nRowsColsIX]} ${thresh[$threshIX]} 0 + # done +# done + + +#bench outers and products +for (( winnowNeltsIX=0 ; winnowNeltsIX<4 ; ++winnowNeltsIX )); do + ./benchDash.sh 3 4 $jobsStart $jobsEnd $numberOfIterations 0 0 ${winnowNelts[$winnowNeltsIX]} +done \ No newline at end of file diff --git a/cowichan/SuperMUC/chain/Makefile b/cowichan/SuperMUC/chain/Makefile new file mode 100644 index 0000000..cb73f6a --- /dev/null +++ b/cowichan/SuperMUC/chain/Makefile @@ -0,0 +1,24 @@ +include ../make.defs + +.PHONY: all +all: chain +#mpirun -n 4 ./$@ < main.in > rand.out2 +#diff -ws rand.out2 main.gold + +chain: chain.cpp ../randmat/randmat.h ../thresh/thresh.h ../winnow/winnow.h ../outer/outer.h ../product/product.h + $(CXX) -c $(INC) $< + $(CXX) -o $@ $@.o $(LIB) + + +.PHONY: printenv +printenv: + @echo "CXX = $(CXX)" + @echo "DART_IMPL = $(DART_IMPL)" + @echo "DASH_ROOT = $(DASH_ROOT)" + @echo "INC = $(INC)" + @echo "LIB = $(LIB)" + +.PHONY: clean +clean: + -rm chain + -rm chain.o diff --git a/cowichan/SuperMUC/chain/chain.cpp b/cowichan/SuperMUC/chain/chain.cpp new file mode 100644 index 0000000..677d797 --- /dev/null +++ b/cowichan/SuperMUC/chain/chain.cpp @@ -0,0 +1,177 @@ +#include +#include + +using std::cout; +using std::endl; +using std::cin; + +using dash::Shared; +using dash::NArray; +using dash::Array; + +using uint = unsigned int ; +using uchar = unsigned char ; +using MATRIX_T = uchar ; + +static int myid; + +#include "../randmat/randmat.h" +#include "../thresh/thresh.h" +// #include "../winnow/winnow_placeholder.h" +#include "../winnow/winnow.h" +#include "../outer/outer.h" +#include "../product/product.h" + +#include +#include +#include +using std::strcmp; + +struct InputPar { uint nRowsCols, seed, thresh, winnow_nelts; } in; + + +/* One 
unit has the job to read in the parameters. + * Because there's always a unit0, it reads the input parameter and + * distributes them to the rest of the units. + */ +inline void ReadPars( ) +{ + Shared input_transfer; + + if(0 == myid) + { + cin >> in.nRowsCols ; + cin >> in.seed ; + cin >> in.thresh ; + cin >> in.winnow_nelts ; + + input_transfer.set(in) ; + } + input_transfer.barrier() ; + in = input_transfer.get(); +} + +/* both parameters musst have same size! + * copy from a dash::Array to std::vector + */ +inline void CopyFromDashToStd( + Array const & dashVector, + vector & loclVector) +{ + if(0 == myid) + { + double * vec = loclVector.data( ); + for( double const i : dashVector ) + { + *(vec++) = i; + } + } + BroadcastOuterVecToUnits( loclVector ); //defined in product.h +} + +/* both parameters musst have same size! + * copy from a dash::Array to std::vector + */ +template +inline void CopyFromDashToStd( + Array const & dashVector, + vector & loclVector) +{ + if(0 == myid) + { + T * vec = loclVector.data( ); + for( T const i : dashVector ) + { + *(vec++) = i; + } + } + + // using dash::Team ; + // using dash::team_unit_t ; + + // Team & team = dash::Team::All( ); + + dart_ret_t ret = dart_bcast( + static_cast( loclVector.data() ), // buf + loclVector.size() * sizeof(T) , // nelem + DART_TYPE_BYTE , // dtype + dash::team_unit_t(0) , // root/source + dash::Team::All( ).dart_id( ) ); // team + + if( DART_OK != ret ) cout << "An error while BCAST has occured!" 
<< endl; +} + + +// @echo $(nRowsCols) $(seed) $(thresh) $(winnow_nelts) | $(TBB_ROOT)/chain/expertpar/main +int main( int argc, char* argv[] ) +{ + dash::init( &argc,&argv ); + + struct timespec start, stop; + double accum; + int is_bench = 0; + + for (int i = 1; i < argc; i++) { + if (!strcmp(argv[i], "--is_bench")) { + is_bench = 1; + } + } + + myid = dash::myid( ); + ReadPars( ); + + // initialize variables + NArray rand_mat ( in.nRowsCols , in.nRowsCols ); + NArray thresh_mask ( in.nRowsCols , in.nRowsCols ); + vector winnow_vecRes ( in.winnow_nelts ); //value defined in winnow.h + NArray outer_mat ( in.winnow_nelts, in.winnow_nelts ); + Array outer_vec ( in.winnow_nelts ); + vector prod_vec ( in.winnow_nelts ); + // Array < double > result ( in.winnow_nelts ); + res_array_t winnow_dashRes; // defined in winnow.h + + // after the run of outer "outer_vec" will be recycled/reused for the final output + auto & result = outer_vec; + + if( clock_gettime( CLOCK_MONOTONIC, &start) == -1 ) { + perror( "clock gettime error 1" ); + exit( EXIT_FAILURE ); + } + + // execute functions + Randmat( rand_mat, in.seed ); + + Thresh( rand_mat, thresh_mask, in.nRowsCols, in.nRowsCols, in.thresh ); + + // if(0 == myid){ winnow( in.nRowsCols, in.nRowsCols, rand_mat, thresh_mask, in.winnow_nelts, winnow_points );} + // BroadcastPointsToUnits( winnow_points ); + Winnow( in.nRowsCols, in.nRowsCols, rand_mat, thresh_mask, in.winnow_nelts, winnow_dashRes ); + CopyFromDashToStd( winnow_dashRes, winnow_vecRes ); + + Outer( winnow_vecRes, outer_mat, outer_vec, in.winnow_nelts ); + CopyFromDashToStd( outer_vec, prod_vec ); + + Product( prod_vec, outer_mat, result, in.winnow_nelts ); + + if( clock_gettime( CLOCK_MONOTONIC, &stop) == -1 ) { + perror( "clock gettime error 2" ); + exit( EXIT_FAILURE ); + } + + accum = ( stop.tv_sec - start.tv_sec ) + ( stop.tv_nsec - start.tv_nsec ) / 1e9; + + if( 0 == myid ){ + FILE* fp = fopen("./measurements.txt", "a"); + + if( !fp ) { + perror("File opening 
for benchmark results failed"); + return EXIT_FAILURE; + } + // Lang, Problem, rows, cols, thresh, winnow_nelts, jobs, time + fprintf( fp, "DASH ,Chain ,%5u,%5u,%3u,%5u,%2u,%.9lf,isBench:%d\n", in.nRowsCols, in.nRowsCols, in.thresh, in.winnow_nelts, dash::Team::All().size(), accum, is_bench ); + fclose ( fp ); + } + + if( !is_bench ){ PrintOutput( result, in.winnow_nelts ); } + dash::finalize( ); +} \ No newline at end of file diff --git a/cowichan/SuperMUC/make.defs b/cowichan/SuperMUC/make.defs new file mode 100644 index 0000000..4813cc7 --- /dev/null +++ b/cowichan/SuperMUC/make.defs @@ -0,0 +1,12 @@ +DASH_ROOT = $(HOME)/opt/dash-0.3.0 + +DART_IMPL = mpi + +INC=-I$(DASH_ROOT)/include +LIB=-L$(DASH_ROOT)/lib -ldash-$(DART_IMPL) -ldart-$(DART_IMPL) -ldart-base + +#CXXFLAGS= -g -O0 -Wall -DDASH_DEBUG=1 -DDASH_ENABLE_DEFAULT_INDEX_TYPE_LONG +#CXXFLAGS= -O3 -DDASH_ENABLE_DEFAULT_INDEX_TYPE_LONG + CXXFLAGS= -Ofast + + CXX = mpiCC -cxx=icpc -std=c++11 $(CXXFLAGS) diff --git a/cowichan/SuperMUC/outer/Makefile b/cowichan/SuperMUC/outer/Makefile new file mode 100644 index 0000000..e132dfa --- /dev/null +++ b/cowichan/SuperMUC/outer/Makefile @@ -0,0 +1,22 @@ +include ../make.defs + +.PHONY: all +all: outer + +outer: outer.cpp outer.h + $(CXX) -c $(INC) $< + $(CXX) -o $@ $@.o $(LIB) + + +.PHONY: printenv +printenv : + @echo "CXX = $(CXX)" + @echo "DART_IMPL = $(DART_IMPL)" + @echo "DASH_ROOT = $(DASH_ROOT)" + @echo "INC = $(INC)" + @echo "LIB = $(LIB)" + +.PHONY: clean +clean: + -rm outer + -rm outer.o diff --git a/cowichan/SuperMUC/outer/outer.cpp b/cowichan/SuperMUC/outer/outer.cpp new file mode 100644 index 0000000..8d8a04e --- /dev/null +++ b/cowichan/SuperMUC/outer/outer.cpp @@ -0,0 +1,129 @@ +#include + +using std::cout; +using std::cin; +using std::endl; +using dash::Shared; +using uint = unsigned int; + +uint nelts; +static int myid; + +using value = struct{ int row, col; }; //hast to be signed! 
+#include "outer.h" +#include +#include + +std::ifstream winnow_output; + + +inline void PrintOutput( + NArray < double, 2 > const & matOut , + Array < double > const & vec ) +{ + if( 0 == myid ){ + cout << nelts << "\n"; + uint count = 0; + cout << std::showpoint << std::fixed << std::setprecision(4); + + for(uint i = 0; i < matOut.extent(0); ++i) { + for(uint j = 0; j < matOut.extent(1); ++j) { + if(j) cout << " "; + cout << static_cast(matOut[i][j]); + } cout << "\n"; + } + + cout << "\n"; + + for(uint i = 0; i < vec.size(); ++i){ + if(i) cout << " "; + cout << static_cast(vec[i]); + } cout << endl; + } +} + + +inline void ReadNelts( char* argv[] ){ + + Shared nelts_transfer; + + if(0 == myid) + { + winnow_output.open(argv[1]); + winnow_output >> nelts; + + nelts_transfer.set(nelts); + } + nelts_transfer.barrier(); + nelts = nelts_transfer.get(); +} + + +inline void ReadVectorOfPoints( vector & points ) { + if( 0 == myid ) + { + for( uint i = 0; i < nelts; i++ ) { + winnow_output >> points[i].row; + winnow_output >> points[i].col; + } + + winnow_output.close(); + } +} + + +int main( int argc, char* argv[] ) +{ + dash::init( &argc,&argv ); + + struct timespec start, stop; + double accum; + int is_bench = 0; + + for (int i = 1; i < argc; i++) { + if (!strcmp(argv[i], "--is_bench")) { + is_bench = 1; + } + } + + myid = dash::myid( ); + ReadNelts( argv ); + + vector < value > points( nelts ); + NArray < double, 2 > matOut( nelts, nelts ); + Array < double > vec ( nelts ); + + //read input points on unit 0 and broadcast to all units + if (!is_bench) { ReadVectorOfPoints( points ); } + + if( clock_gettime( CLOCK_MONOTONIC, &start) == -1 ) { + perror( "clock gettime error 1" ); + exit( EXIT_FAILURE ); + } + + BroadcastPointsToUnits( points ); + Outer( points, matOut, vec, nelts ); + + if( clock_gettime( CLOCK_MONOTONIC, &stop) == -1 ) { + perror( "clock gettime error 2" ); + exit( EXIT_FAILURE ); + } + + accum = ( stop.tv_sec - start.tv_sec ) + ( stop.tv_nsec - 
start.tv_nsec ) / 1e9; + + + if( 0 == myid ){ + FILE* fp = fopen("./measurements.txt", "a"); + + if( !fp ) { + perror("File opening for benchmark results failed"); + return EXIT_FAILURE; + } + // Lang, Problem, rows, cols, thresh, winnow_nelts, jobs, time + fprintf( fp, "DASH ,Outer , , , ,%5u,%2u,%.9lf,isBench:%d\n", nelts, dash::Team::All().size(), accum, is_bench ); + fclose ( fp ); + } + + if (!is_bench) { PrintOutput(matOut, vec); } + dash::finalize( ); +} \ No newline at end of file diff --git a/cowichan/SuperMUC/outer/outer.h b/cowichan/SuperMUC/outer/outer.h new file mode 100644 index 0000000..4d5b02b --- /dev/null +++ b/cowichan/SuperMUC/outer/outer.h @@ -0,0 +1,75 @@ +using std::max; +using std::pair; +using std::vector; +using std::make_pair; + +using dash::Team; +using dash::Array; +using dash::NArray; +using dash::barrier; +using dash::team_unit_t; + + +inline double sqr(double const x) { return x * x; } + + +//calculates the distance between two points. +inline double distance( value const & x, value const & y ){ + return sqrt(sqr(x.row - y.row) + sqr(x.col - y.col)); +} + + +inline void Outer( + vector< value > const & points, + NArray< double, 2 > & matOut, + Array < double > & vec , + uint nelts ) +{ + /* "gRow" represents the global row number of the local matrix data + * the first local row has the initial global row number of "gRow" + * "end" holds the global row number exakt one past the last row + * number which is local at this unit. 
+ * "matP" will be used to linear iterate over the local data + * "matBegin" will be used to access local data via []operator + */ + auto gRow = matOut.pattern().global({0,0})[0]; + auto end = gRow + matOut.pattern().local_extents()[0]; + + double nmax; + value zero = {0,0}; + + auto matBegin = matOut.lbegin(); + auto matP = matOut.lbegin(); + + + for( decltype(gRow) i = 0; gRow < end; ++gRow, ++i ) { + + nmax = 0; + + for( decltype(gRow) j = 0; j < nelts; ++j,++matP ) { + if( gRow != j) { + *matP = distance(points[gRow], points[j]); + nmax = max( nmax, *matP ); + } + } + + matBegin[i*nelts+gRow] = nelts * nmax; + vec.local[i] = distance( zero, points[gRow] ); + } + + barrier( ); +} + + +template +inline void BroadcastPointsToUnits( vector & points ) +{ + dart_ret_t ret = dart_bcast( + static_cast( points.data() ), // buf + points.size( ) * sizeof(T) , // nelts + DART_TYPE_BYTE , // dtype + dash::team_unit_t(0) , // root + dash::Team::All( ).dart_id( ) ); // team + + if( DART_OK != ret ) cout << "An error while BCAST has occured!" 
<< endl; +} \ No newline at end of file diff --git a/cowichan/SuperMUC/product/Makefile b/cowichan/SuperMUC/product/Makefile new file mode 100644 index 0000000..6638414 --- /dev/null +++ b/cowichan/SuperMUC/product/Makefile @@ -0,0 +1,22 @@ +include ../make.defs + +.PHONY: all +all: product + +product: product.cpp product.h + $(CXX) -c $(INC) $< + $(CXX) -o $@ $@.o $(LIB) + + +.PHONY: printenv +printenv : + @echo "CXX = $(CXX)" + @echo "DART_IMPL = $(DART_IMPL)" + @echo "DASH_ROOT = $(DASH_ROOT)" + @echo "INC = $(INC)" + @echo "LIB = $(LIB)" + +.PHONY: clean +clean: + -rm product + -rm product.o diff --git a/cowichan/SuperMUC/product/product.cpp b/cowichan/SuperMUC/product/product.cpp new file mode 100644 index 0000000..ea4e1fb --- /dev/null +++ b/cowichan/SuperMUC/product/product.cpp @@ -0,0 +1,109 @@ +#include + +using std::cin; +using dash::Shared; + +using uint = unsigned int; + +uint nelts; +static int myid; + +#include "product.h" +#include +#include +#include + +using std::strcmp; + +std::ifstream outer_output; + +inline void ReadMatrixAndVector( + NArray < double, 2> & matIn, + vector < double > & vec ) +{ + if( 0 == myid ) + { + // read matrix + double tmp; + for ( auto i : matIn ){ outer_output >> tmp; i = tmp; } + + // //Read Vector + for (int i = 0; i < vec.size(); i++){ outer_output >> vec[i]; } + + outer_output.close(); + } +} + + +inline void ReadNelts( char * argv[] ) +{ + Shared nelts_transfer; + + if(0 == myid) + { + outer_output.open(argv[1]); + outer_output >> nelts; + + nelts_transfer.set(nelts); + } + nelts_transfer.barrier(); + nelts = nelts_transfer.get(); +} + + +int main( int argc, char* argv[] ) +{ + dash::init( &argc,&argv ); + myid = dash::myid( ); + + struct timespec start, stop; + double accum; + int is_bench = 0; + + for (int i = 1; i < argc; i++) { + if (!strcmp(argv[i], "--is_bench")) { + is_bench = 1; + } + } + + ReadNelts( argv ); + + NArray < double, 2 > matIn ( nelts, nelts ); + Array < double > result ( nelts ); + vector < 
double > vec ( nelts ); + + //read input on unit 0 + if (!is_bench) { ReadMatrixAndVector(matIn, vec); } + + if( clock_gettime( CLOCK_MONOTONIC, &start) == -1 ) { + perror( "clock gettime error 1" ); + exit( EXIT_FAILURE ); + } + + //broadcast vector from unit0 to all other units + BroadcastOuterVecToUnits(vec); + Product(vec, matIn, result, nelts ); + + if( clock_gettime( CLOCK_MONOTONIC, &stop) == -1 ) { + perror( "clock gettime error 2" ); + exit( EXIT_FAILURE ); + } + + accum = ( stop.tv_sec - start.tv_sec ) + ( stop.tv_nsec - start.tv_nsec ) / 1e9; + + + if( 0 == myid ){ + FILE* fp = fopen("./measurements.txt", "a"); + + if( !fp ) { + perror("File opening for benchmark results failed"); + return EXIT_FAILURE; + } + // Lang, Problem, rows, cols, thresh, winnow_nelts, jobs, time + fprintf( fp, "DASH ,Product, , , ,%5u,%2u,%.9lf,isBench:%d\n", nelts, dash::Team::All().size(), accum, is_bench ); + fclose ( fp ); + } + + if( !is_bench ){ PrintOutput( result, nelts ); } + dash::finalize( ); +} \ No newline at end of file diff --git a/cowichan/SuperMUC/product/product.h b/cowichan/SuperMUC/product/product.h new file mode 100644 index 0000000..76bc19d --- /dev/null +++ b/cowichan/SuperMUC/product/product.h @@ -0,0 +1,72 @@ +using std::cout; +using std::endl; +using std::vector; + +using dash::NArray; +using dash::Array; +using dash::team_unit_t; +using dash::Team; +using dash::barrier; + +inline void PrintOutput( Array const & result, uint const nelts) +{ + if(0 == myid){ + cout << nelts << "\n"; + cout << std::showpoint << std::fixed << std::setprecision(4); + + for(auto i : result) { + cout << static_cast(i) << " "; + } cout << endl; + } +} + + +inline void BroadcastOuterVecToUnits( vector & vec ) +{ + team_unit_t TeamUnit0ID = Team::All().myid( ); + TeamUnit0ID.id = 0; + dart_ret_t ret = dart_bcast( + static_cast(vec.data( )), // buf + vec.size( ) , // nelts + DART_TYPE_DOUBLE , // dtype + TeamUnit0ID , // root + Team::All().dart_id( ) ); // team + + if( DART_OK != 
ret ) cout << "An error while BCAST has occured!" << endl; +} + + +inline void Product( + vector < double > const & vec , + NArray < double, 2 > const & matIn , + Array < double > & result , + uint const nelts ) +{ + uint lclRows = matIn.pattern().local_extents()[0]; + double sum ; + double const * mPtr ; + double const * vPtr ; + double * res = result.lbegin(); + + for(uint i = 0; i < lclRows; ++i) + { + sum = 0; + mPtr = matIn.local.row(i).lbegin(); + vPtr = vec.data(); + + /* first loop iteration is done here. + * so in loop can the prefix operator be used + */ + sum += *mPtr * *vPtr; + + /* one less loop iteration because of the line before + * -> j != 0 but j = 1 + */ + for(uint j = 1; j < nelts ; ++j){ + sum += *++mPtr * *++vPtr; + } + *(res++) = sum; + } + + barrier(); +} diff --git a/cowichan/SuperMUC/randmat/Makefile b/cowichan/SuperMUC/randmat/Makefile new file mode 100644 index 0000000..6f01c0c --- /dev/null +++ b/cowichan/SuperMUC/randmat/Makefile @@ -0,0 +1,24 @@ +include ../make.defs + +.PHONY: all +all: randmat +#mpirun -n 4 ./$@ < main.in > rand.out2 +#diff -ws rand.out2 main.gold + +randmat: randmat.cpp randmat.h + $(CXX) -c $(INC) $< + $(CXX) -o $@ $@.o $(LIB) + + +.PHONY: printenv +printenv: + @echo "CXX = $(CXX)" + @echo "DART_IMPL = $(DART_IMPL)" + @echo "DASH_ROOT = $(DASH_ROOT)" + @echo "INC = $(INC)" + @echo "LIB = $(LIB)" + +.PHONY: clean +clean: + -rm randmat + -rm randmat.o diff --git a/cowichan/SuperMUC/randmat/randmat.cpp b/cowichan/SuperMUC/randmat/randmat.cpp new file mode 100644 index 0000000..c78ad7f --- /dev/null +++ b/cowichan/SuperMUC/randmat/randmat.cpp @@ -0,0 +1,110 @@ +#include +#include +//#define _POSIX_C_SOURCE 199506L + +using std::cout; +using std::endl; +using std::cin; + +using dash::Shared; + +using uint = unsigned int ; +using uchar = unsigned char; +using MATRIX_T = uchar ; + +struct InputPar { uint nrows, ncols, s; } in; +static int myid; + +#include "randmat.h" +#include +#include +/* + * One unit has the job to 
read in the parameters. + * Because there's always a unit0, it reads the input parameter and + * distributes them to the rest of the units. + */ +inline void ReadPars() +{ + Shared input_transfer; + + if(0 == myid) + { + cin >> in.nrows; + cin >> in.ncols; + cin >> in.s ; + + input_transfer.set(in); + } + input_transfer.barrier(); + in = input_transfer.get(); +} + + +/* + * This function prints the content of a 2D matrix to std::out. + * Datatypes are casted to for readable output + * (otherwise uchars would be printed as chars and not as numerics) + */ +template< typename T = MATRIX_T > +inline void Print2D( NArray< T, 2 > const & mat ) +{ + if(0==myid){ + for( int i = 0; i < mat.extent(0); i++ ) { + for( int j = 0; j < mat.extent(1); j++ ) { + cout << std::setw(3) << static_cast( mat(i,j) )<< " "; + } + cout << endl; + } cout << endl; + } +} + + +int main( int argc, char* argv[] ) +{ + dash::init( &argc,&argv ); + + struct timespec start, stop; + double accum; + int is_bench = 0; + + for (int i = 1; i < argc; i++) { + if (!strcmp(argv[i], "--is_bench")) { + is_bench = 1; + } + } + + myid = dash::myid( ); + ReadPars( ); + + NArray rand_mat ( in.nrows, in.ncols ); + + if( clock_gettime( CLOCK_MONOTONIC, &start) == -1 ) { + perror( "clock gettime error 1" ); + exit( EXIT_FAILURE ); + } + + Randmat( rand_mat, in.s ); + + if( clock_gettime( CLOCK_MONOTONIC, &stop) == -1 ) { + perror( "clock gettime error 2" ); + exit( EXIT_FAILURE ); + } + + accum = ( stop.tv_sec - start.tv_sec ) + ( stop.tv_nsec - start.tv_nsec ) / 1e9; + + + if( 0 == myid ){ + FILE* fp = fopen("./measurements.txt", "a"); + + if( !fp ) { + perror("File opening for benchmark results failed"); + return EXIT_FAILURE; + } + // Lang, Problem, rows, cols, thresh, winnow_nelts, jobs, time + fprintf( fp, "DASH ,Randmat,%5u,%5u, , ,%2u,%.9lf,isBench:%d\n", in.nrows, in.ncols, dash::Team::All().size(), accum, is_bench ); + fclose ( fp ); + } + + if (!is_bench) { Print2D( rand_mat ); } + dash::finalize( ); +} 
diff --git a/cowichan/SuperMUC/randmat/randmat.h b/cowichan/SuperMUC/randmat/randmat.h new file mode 100644 index 0000000..b5f98ee --- /dev/null +++ b/cowichan/SuperMUC/randmat/randmat.h @@ -0,0 +1,41 @@ +/* + * The Cowichan problems require that the output is independent of the + * numbers of processors used. For randmat() a common solution found + * in other implementations is to use a simple linear congruential + * random number generator (LCG) with a separate deterministic seed + * for each row and to parallelize over the rows of the matrix. This + * is also how the DASH solution below works. + * + * A potential alternative would be to use a counter-based random + * number generation scheme (e.g. random123) that can be easily + * parallelized. + */ + +using dash::barrier; +using dash::NArray; + +template< typename T = MATRIX_T > +inline void Randmat( + NArray< T, 2 > & rand_mat, + uint const seed ) +{ + const int LCG_A = 1664525, LCG_C = 1013904223; + + uint nrows = rand_mat.local.extent(0); // num of local rows + uint ncols = rand_mat.local.extent(1); // num of local cols + + auto gc = rand_mat.pattern( ).global( {0,0} ); + uint gbeg = gc[0]; // global row of local (0,0) + + if( 0 < rand_mat.local_size( ) ){ + for( uint i = 0; i < nrows; ++i ) { + uint s = seed + gbeg + i; + + for( int j = 0; j < ncols; ++j ) { + s = LCG_A * s + LCG_C; + rand_mat.lbegin( )[i*ncols + j] = ( (unsigned)s ) % 100; + } + } + } + barrier( ); +} \ No newline at end of file diff --git a/cowichan/SuperMUC/thresh/Makefile b/cowichan/SuperMUC/thresh/Makefile new file mode 100644 index 0000000..77a9525 --- /dev/null +++ b/cowichan/SuperMUC/thresh/Makefile @@ -0,0 +1,22 @@ +include ../make.defs + +.PHONY: all +all: thresh + +thresh: thresh.cpp thresh.h + $(CXX) -c $(INC) $< + $(CXX) -o $@ $@.o $(LIB) + + +.PHONY: printenv +printenv : + @echo "CXX = $(CXX)" + @echo "DART_IMPL = $(DART_IMPL)" + @echo "DASH_ROOT = $(DASH_ROOT)" + @echo "INC = $(INC)" + @echo "LIB = $(LIB)" + +.PHONY: clean 
+clean: + -rm thresh + -rm thresh.o diff --git a/cowichan/SuperMUC/thresh/thresh.cpp b/cowichan/SuperMUC/thresh/thresh.cpp new file mode 100644 index 0000000..1c1f05d --- /dev/null +++ b/cowichan/SuperMUC/thresh/thresh.cpp @@ -0,0 +1,146 @@ +#include +#include +#include + +using std::cout; +using std::endl; +using std::cin; +using uint = unsigned int ; +using uchar = unsigned char; +using MATRIX_T = uchar ; + +struct InputPar { uint nrows, ncols; } in; +uint percent; +static int myid; + +#include "thresh.h" +#include +#include + +std::ifstream randmat_output; + +/* + * This function prints the content of a 2D matrix to std::out. + * Datatypes are casted to for readable output + * (otherwise uchars would be printed as chars and not as numerics) + */ +inline void Print2D( NArray< bool, 2 > const & mat ) +{ + if(0==myid){ + + //cout << in.nrows << " " << in.ncols << "\n"; + + for( int i = 0; i < mat.extent(0); i++ ) { + for( int j = 0; j < mat.extent(1); j++ ) { + cout << static_cast( mat(i,j) )<< " "; + } + cout << "\n"; + } + } +} + +// unit0 reads the matrix with random values from std::in +template +inline void ReadRandMat( NArray< T, 2 > & rand_mat ) +{ + if(0 == myid){ + int tmp; + for ( auto i : rand_mat ){ + randmat_output >> tmp; + i = static_cast(tmp); + } + } +} + +/* + * One unit has the job to read in the parameters. + * Because there's always a unit0, it reads the input parameter and + * distributes them to the rest of the units. 
+ */ +inline void ReadRowsNCols( char * argv[] ) +{ + Shared input_transfer; + + if(0 == myid) + { + randmat_output.open(argv[1]); + + randmat_output >> in.nrows; + randmat_output >> in.ncols; + + input_transfer.set(in); + } + input_transfer.barrier(); + in = input_transfer.get(); +} + +inline void ReadPercentage( ) +{ + Shared percent_transfer; + + if(0 == myid) + { + randmat_output >> percent; + + percent_transfer.set(percent); + + randmat_output.close(); + } + percent_transfer.barrier(); + percent = percent_transfer.get(); +} + + +int main( int argc, char* argv[] ) +{ + dash::init( &argc, &argv ); + + struct timespec start, stop; + double accum; + int is_bench = 0; + + for (int i = 1; i < argc; i++) { + if (!strcmp(argv[i], "--is_bench")) { + is_bench = 1; + } + } + + myid = dash::myid( ); + ReadRowsNCols( argv ); + + NArray< MATRIX_T, 2 > rand_mat ( in.nrows, in.ncols ); + NArray< bool , 2 > thresh_mask ( in.nrows, in.ncols ); + + if (!is_bench) { ReadRandMat(rand_mat); } + ReadPercentage(); + + if( clock_gettime( CLOCK_MONOTONIC, &start) == -1 ) { + perror( "clock gettime error 1" ); + exit( EXIT_FAILURE ); + } + + Thresh( rand_mat, thresh_mask, in.nrows, in.ncols, percent ); + + if( clock_gettime( CLOCK_MONOTONIC, &stop) == -1 ) { + perror( "clock gettime error 2" ); + exit( EXIT_FAILURE ); + } + + accum = ( stop.tv_sec - start.tv_sec ) + ( stop.tv_nsec - start.tv_nsec ) / 1e9; + + + if( 0 == myid ){ + FILE* fp = fopen("./measurements.txt", "a"); + + if( !fp ) { + perror("File opening for benchmark results failed"); + return EXIT_FAILURE; + } + // Lang, Problem, rows, cols, thresh, winnow_nelts, jobs, time + fprintf( fp, "DASH ,Thresh ,%5u,%5u,%3u, ,%2u,%.9lf,isBench:%d\n", in.nrows, in.ncols, percent, dash::Team::All().size(), accum, is_bench ); + fclose ( fp ); + } + + if (!is_bench) { Print2D( thresh_mask ); } + dash::finalize( ); +} diff --git a/cowichan/SuperMUC/thresh/thresh.h b/cowichan/SuperMUC/thresh/thresh.h new file mode 100644 index 
0000000..0bee37c
--- /dev/null
+++ b/cowichan/SuperMUC/thresh/thresh.h
@@ -0,0 +1,121 @@
+using dash::NArray;
+using dash::Array;
+using dash::max_element;
+using dash::BLOCKED;
+using dash::size;
+using dash::barrier;
+using dash::Shared;
+// Thresh: sets thresh_mask[k] = (rand_mat[k] >= threshold); unit 0 derives the threshold from a global histogram and `percent`.
+template
+inline void Thresh(
+  NArray< T , 2 > const & rand_mat ,
+  NArray< bool, 2 > & thresh_mask,
+  uint const nrows ,
+  uint const ncols ,
+  uint const percent )
+{
+  // find max value in rand_mat
+  auto max_glob = max_element( rand_mat.begin( ), rand_mat.end( ) );
+
+  T max = *max_glob;
+
+  // get number of units running
+  size_t num_units = size( );
+
+  // create global histo array (max + 1 counters per unit), zeroed below
+  Array histo( (max + 1) * num_units, BLOCKED );
+
+  // initialize the histogram
+  for( uint * i = histo.lbegin(); i < histo.lend(); ++i) {
+    *i = 0;
+  }
+
+  // every unit generates a histogram for the local values
+  for( T const * i = rand_mat.lbegin( ); i < rand_mat.lend( ); ++i ) {
+    ++histo.local[*i];
+  }
+
+  /* barrier is necessary because if unit 0 is still calculating
+   * while another unit starts with dash::transform there could be a race condition
+   */
+  barrier( );
+
+  // add the values of the local histogram to the histogram of unit0
+  if( 0 != myid ) {
+    dash::transform(
+      histo.lbegin ( ) ,
+      histo.lend ( ) ,
+      histo.begin ( ) , // points to global begin -> lbegin of unit0
+      histo.begin ( ) ,
+      dash::plus ( ) );
+  }
+
+  // create new shared variable
+  Shared threshold;
+
+  // wait for all units to finish adding (especially unit0 should wait)
+  barrier( );
+
+/*
+ * In the following scope unit0 calculates the threshold for the
+ * matrix with random values. A given percentage defines how many values
+ * are to be kept in the result. Lower values are dropped first and so
+ * unit0 calculates which "low" values are dropped and thereby defines
+ * a threshold.
+ */
+  if( 0 == myid ) {
+
+    // if compiled; unit0 prints the global histogram (which resides at this point only on unit0)
+    #if 0
+      for( uint j = 0; j < histo.lsize( ); ++j ) {
+        if( histo.local[j] ) cout << std::setw(3) << j
+          << " counted: " << histo.local[j] << endl;
+      }
+    #endif
+
+    // count defines how many values are to be kept for the given percentage
+    uint count = ( static_cast(nrows) * ncols * percent ) / 100;
+    uint prefixsum = 0;
+    int i;
+
+    // find threshold: accumulate the histogram from the largest value downwards until more than count values are covered
+    for( i = max; i >= 0 && prefixsum <= count; --i ) {
+      prefixsum += histo.local[i];
+    }
+
+    threshold.set( ++i ); // the loop decremented once more after the deciding value, step back onto it
+    #if 0
+      cout << "original threshold: " << i << " - perc: " << percent << endl;
+    #endif
+  }
+
+  // wait for unit0 to finish and flush
+  threshold.barrier( );
+
+  int threshLclCpy = threshold.get( );
+
+  T const * src = rand_mat.lbegin( );
+  bool * i = thresh_mask.lbegin ( );
+
+  /* // debug output of rand_mat
+  int co = 0;
+  while( src < rand_mat.lend()){
+    cout << *src++ << " ";
+    if (++co == 9){
+      cout << endl;
+      co = 0;
+    }
+  }*/
+
+  // bounded element-wise loop: writes nothing on a unit whose local part of the mask is empty
+  for( ; i < thresh_mask.lend( ); ++i, ++src ) {
+    *i = ( *src >= threshLclCpy );
+  }
+
+  #if 0
+    cout << myid << " got threshold: " << threshLclCpy << endl;
+  #endif
+
+  // wait for all units to finish calculating their local boolean mask
+  barrier( );
+}
\ No newline at end of file
diff --git a/cowichan/SuperMUC/winnow/Makefile b/cowichan/SuperMUC/winnow/Makefile
new file mode 100644
index 0000000..d90a3d5
--- /dev/null
+++ b/cowichan/SuperMUC/winnow/Makefile
@@ -0,0 +1,22 @@
+include ../make.defs
+
+.PHONY: all
+all: winnow
+
+winnow: winnow.cpp winnow.h ../Terminal_Color.h
+	$(CXX) -c $(INC) $<
+	$(CXX) -o $@ $@.o $(LIB)
+
+.PHONY: printenv
+printenv :
+	@echo "CXX = $(CXX)"
+	@echo "DART_IMPL = $(DART_IMPL)"
+	@echo "DASH_ROOT = $(DASH_ROOT)"
+	@echo "INC = $(INC)"
+	@echo "LIB = $(LIB)"
+
+.PHONY: clean
+clean:
+	-rm winnow
+	-rm winnow.o
+
diff --git a/cowichan/SuperMUC/winnow/input b/cowichan/SuperMUC/winnow/input
new file
mode 100644 index 0000000..f547433 --- /dev/null +++ b/cowichan/SuperMUC/winnow/input @@ -0,0 +1,24 @@ +10 10 +98 5 20 63 78 57 92 67 6 5 +23 10 61 44 59 82 49 80 67 6 +48 19 2 25 44 7 10 93 32 7 +73 24 43 6 29 28 67 6 93 8 +98 29 80 87 10 53 24 19 58 9 +23 34 21 68 95 78 85 32 19 10 +48 39 62 53 80 3 42 45 84 11 +73 44 99 34 61 24 3 58 45 12 +98 49 40 15 46 49 60 71 10 17 +23 54 81 96 31 74 21 84 71 18 + +1 0 0 0 1 0 1 0 0 0 +0 0 0 0 0 1 0 1 0 0 +0 0 0 0 0 0 0 1 0 0 +1 0 0 0 0 0 0 0 1 0 +1 0 1 1 0 0 0 0 0 0 +0 0 0 1 1 1 1 0 0 0 +0 0 0 0 1 0 0 0 1 0 +1 0 1 0 0 0 0 0 0 0 +1 0 0 0 0 0 0 1 0 0 +0 0 1 1 0 1 0 1 1 0 + +3 diff --git a/cowichan/SuperMUC/winnow/winnow.cpp b/cowichan/SuperMUC/winnow/winnow.cpp new file mode 100644 index 0000000..acc1b98 --- /dev/null +++ b/cowichan/SuperMUC/winnow/winnow.cpp @@ -0,0 +1,193 @@ +#include + +using uint = unsigned int; + +// static variables +static struct InputPar { uint nrows, ncols; } in; +static uint nelts; +static uint thresh; +static int myid ; + +#include "winnow.h" +#include +#include + +std::ifstream raThr_output; + int is_bench = 0; + +/* + * One unit has the job to read in the parameters. + * Because there's always a unit0, it reads the input parameter and + * distributes them to the rest of the units. 
+ */
+inline void ReadRowsNCols( char * argv[] )
+{
+  Shared input_transfer;
+
+  if(0 == myid)
+  {
+    raThr_output.open(argv[1]); // stays open: ReadMatricesAndNelts() reads the rest and closes it
+
+    raThr_output >> in.nrows;
+    raThr_output >> in.ncols;
+
+    input_transfer.set(in);
+  }
+  input_transfer.barrier(); // all units meet here, afterwards each one fetches what unit 0 stored
+  in = input_transfer.get();
+}
+
+// Unit 0 reads both matrices (skipped when is_bench is set), then nelts (and thresh when is_bench is set); nelts and thresh are shared with all units.
+template< typename T = MATRIX_T >
+inline void ReadMatricesAndNelts( NArray& randMat, NArray& threshMask )
+{
+  Shared nelts_transfer;
+  Shared thresh_transfer;
+
+  if(0 == myid)
+  {
+    if (!is_bench) {
+      //read matrices
+      int tmp;
+
+      for ( auto i : randMat )
+      {
+        // scanf( "%u", &tmp ) , i = tmp;
+        raThr_output >> tmp;
+        i = static_cast(tmp);
+      }
+
+      bool tmpB;
+
+      for ( auto i : threshMask )
+      {
+        // scanf( "%u", &tmpB ) , i = tmpB;
+        raThr_output >> tmpB;
+        i = static_cast(tmpB);
+      }
+    }
+
+    raThr_output >> nelts;
+    if(is_bench) raThr_output >> thresh;
+    raThr_output.close();
+
+    thresh_transfer.set(thresh);
+    nelts_transfer.set(nelts);
+  }
+  nelts_transfer.barrier();
+  nelts = nelts_transfer.get();
+  thresh = thresh_transfer.get();
+}
+// Benchmark input: every unit fills its local matrix part with (row*ncols+col) % 100 and sets the mask at every threshInverse-th position.
+template< typename T = MATRIX_T >
+inline void FillOnTheFly( NArray& randMat, NArray& threshMask )
+{
+  auto gR = randMat.pattern( ).global( {0,0} );
+  auto gT = threshMask.pattern( ).global( {0,0} );
+
+  uint i = gR[0]; // global row of local (0,0)
+  uint j = 0; // NOTE(review): assumes the local part starts at column 0 and holds whole rows - confirm against the pattern
+  // cout << "I have:" << i << "\n";
+
+  for( T * ptr = randMat.lbegin(); ptr < randMat.lend(); ++ptr)
+  {
+    *ptr = (i*in.ncols+j) % 100;
+    if(++j == in.ncols) {++i;j=0;}
+  }
+
+  uint threshInverse = ( thresh >= 100 ) ? 1 : ( thresh ? 100 / thresh : 0 ); // 0 = select nothing; never divides or takes a modulo by zero
+  i = gT[0]; // global row of local (0,0)
+  j = 0;
+
+  for( bool * ptr = threshMask.lbegin(); ptr < threshMask.lend(); ++ptr)
+  {
+    if( threshInverse != 0 && ( ( i*in.ncols+j) % threshInverse ) == 0 ) {
+      *ptr = true;
+    }else{
+      *ptr = false;
+    }
+    if(++j == in.ncols) {++i;j=0;}
+  }
+
+  dash::barrier();
+}
+
+// Times one Winnow() run; unit 0 appends the measurement to ./measurements.txt and, unless "--is_bench" is given, prints the result.
+int main( int argc, char* argv[] )
+{
+  dash::init( &argc,&argv );
+
+  struct timespec start, stop;
+  double accum;
+
+  for (int i = 1; i < argc; i++) {
+    if (!strcmp(argv[i], "--is_bench")) {
+      is_bench = 1;
+    }
+  }
+
+  myid = dash::myid( );
+  ReadRowsNCols( argv );
+
+
+  NArray< MATRIX_T, 2 > randMat ( in.nrows, in.ncols );
+  NArray< bool , 2 > threshMask ( in.nrows, in.ncols );
+
+  res_array_t result;
+
+
+  #ifdef DEBUG // print error message if mask's and matrix's local size aren't identical
+    if( threshMask.local_size() != randMat.local_size() )
+    {
+      cout << "On unit " << myid
+           << " the local sizes of matrix and mask differ!\naborted on this unit\n";
+      return -1;
+    }
+  #endif
+
+  ReadMatricesAndNelts( randMat, threshMask );
+
+  if (is_bench) FillOnTheFly( randMat, threshMask );
+
+  if( clock_gettime( CLOCK_MONOTONIC, &start) == -1 ) {
+    perror( "clock gettime error 1" );
+    exit( EXIT_FAILURE );
+  }
+
+  Winnow( in.nrows, in.ncols, randMat, threshMask, nelts, result );
+
+  if( clock_gettime( CLOCK_MONOTONIC, &stop) == -1 ) {
+    perror( "clock gettime error 2" );
+    exit( EXIT_FAILURE );
+  }
+
+  // elapsed wall clock time in seconds
+  accum = ( stop.tv_sec - start.tv_sec ) + ( stop.tv_nsec - start.tv_nsec ) / 1e9;
+
+  if( 0 == myid ){
+    FILE* fp = fopen("./measurements.txt", "a");
+
+    if( !fp ) {
+      perror("File opening for benchmark results failed");
+      return EXIT_FAILURE;
+    }
+    // Lang, Problem, rows, cols, thresh, winnow_nelts, jobs, time (team size is cast so the argument matches the %2u conversion)
+    fprintf( fp, "DASH ,Winnow ,%5u,%5u,%3u,%5u,%2u,%.9lf,isBench:%d\n", in.nrows, in.ncols, thresh, nelts, static_cast<unsigned int>( dash::Team::All().size() ), accum, is_bench );
+    fclose ( fp );
+  }
+
+  if (!is_bench) {
+    // output
+    if( 0 == myid )
+    {
+      cout << nelts << "\n";
+
+      for( value it : result ) cout << it;
+
+      cout << endl;
+    }
+  }
+  dash::finalize( );
+}
+
+
diff --git a/cowichan/SuperMUC/winnow/winnow.h b/cowichan/SuperMUC/winnow/winnow.h
new file mode 100644
index 0000000..c423b68
--- /dev/null
+++ b/cowichan/SuperMUC/winnow/winnow.h
@@ -0,0 +1,669 @@
+#include "../Terminal_Color.h"
+
+#include // for std::malloc
+#include // for std::memcpy and std::memset
+
+// #define DEBUG
+// #define DEBUG_DETAILED // requires DEBUG
+
+
+#ifdef DEBUG
+  #include
+  #include
+#endif
+
+
+#define MAX_KEY 99
+#define MIN_NUM_ELEM_PER_UNIT 10
+
+
+using std::cout ;
+using std::cin ;
+using std::endl ;
+using std::vector ;
+using std::pair ;
+
+using dash::Team ;
+using dash::Array ;
+using dash::NArray ;
+using dash::Shared ;
+using dash::fill ;
+using dash::team_unit_t ;
+
+using uint = unsigned int ;
+using uchar = unsigned char ;
+using MATRIX_T = uchar ;
+
+using value = struct{ int row, col; }; // has to be signed! - results are stored in here
+
+using Point = struct{ MATRIX_T value; uint row, col; }; // results are created using these
+using unitRange = struct{ MATRIX_T begin, end; uint count; }; // value range [begin, end] a unit sorts and how many points fall into it
+
+using pattern_t = dash::CSRPattern< 1, dash::ROW_MAJOR, uint >;
+using extent_t = pattern_t::size_type;
+using res_array_t = dash::Array;
+
+// Points are ordered by value only; row and col do not take part in the comparison.
+bool operator<(const Point& lhs, const Point& rhs)
+{
+  return lhs.value < rhs.value;
+}
+
+
+std::ostream& operator<<(std::ostream& os, const Point& p)
+{
+  #ifdef DEBUG
+    // return os << "(" << fmt( p.value, FCYN ) << "," << fmt( p.row, FGREEN ) << "," << fmt( p.col, FGREEN ) << ")-";
+    return os << "(" << fmt( p.value, FCYN ) << "," << fmt( p.row, FGREEN ) << "," << fmt( p.col, FGREEN ) << ")";
+  #else
+    return os << p.row << " " << p.col << "\n";
+  #endif
+}
+
+std::ostream& operator<<(std::ostream& os, const value& p)
+{
+  return os << p.row << " " << p.col << "\n";
+}
+
+
+// #ifdef DEBUG
+  using std::this_thread::sleep_for;
+  inline void __sleep( uint const baseDur = 0, uint const mult = 10 )
+  {
+    uint SLEEP_TIME__ = (myid + 1) * mult + baseDur; // staggered by unit id so that the debug output of the units does not interleave
+    sleep_for(std::chrono::milliseconds(SLEEP_TIME__));
+  }
+// #endif
+// Winnow: collects the points selected by threshMask, bucket-sorts them by value across all units (histogram based distribution)
+// and stores every chunk-th point (chunk = number of points / nelts) as (row, col) in result. randMat and threshMask are deallocated.
+template
+inline void Winnow(
+  uint const nrows ,
+  uint const ncols ,
+  NArray< T,2> & randMat ,
+  NArray & threshMask ,
+  uint const nelts ,
+  res_array_t & result )
+{
+  Team & team = dash::Team::All ( );
+  size_t nUnits = team.size ( );
+
+  #ifdef DEBUG_DETAILED
+    if( 0 == myid ) cout << "nUnits: " << nUnits << endl;
+  #endif
+
+  /* create global histo array for sorting
+   * size += 1 for direct Index access -> histo[2]++ counts for value 2
+   * size += 1 for additional value at the end of the
+   * histogram -> used for counter how many values were found
+   */
+  Array histo( (MAX_KEY + 2) * nUnits, dash::BLOCKED );
+  // fill( histo.begin( ), histo.end( ), 0 );
+  // for(uint * it = histo.lbegin( ); it < histo.lend( ); ++it){*it = 0;}
+  std::memset( histo.lbegin(), 0, histo.lsize() * sizeof(uint) );
+
+  // local found points are gathered in this vector
+  vector pointsLocal;
+
+
+  // returns an object with the global row and column of the local coordinates {0,0}
+  auto globIndex = randMat.pattern( ).global( {0,0} );
+
+  uint gRow = globIndex[0] ;
+  uint gCol = globIndex[1] ;
+  uint & found = histo.local[MAX_KEY+1] ;
+  T const * matrEl = randMat.lbegin( ) ;
+
+
+  /* read in local part of mask - matrix combination
+   * and while doing that:
+   * generate histogram and count the values (-> found)
+   */
+  for ( bool const * maskEl = threshMask.lbegin( ); maskEl < threshMask.lend( ); ++maskEl , ++matrEl, ++gCol )
+  {
+    if( gCol == ncols ) gCol = 0, ++gRow; // end of row -> next row in matrix/mask
+    if( *maskEl )
+    {
+      pointsLocal.push_back( Point{ *matrEl, gRow, gCol } );
+      ++histo.local[*matrEl];
+    }
+  }
+  found = pointsLocal.size();
+
+  #ifdef DEBUG // print points found local
+    dash::barrier(); // only needed for better IO Output
+    __sleep( );
+
+    cout << "#" << fmt( myid, FBLUE, 2 ) << ": ";
+    for( auto it : pointsLocal){ cout << it << "-" ; }
+    cout << endl;
+
+    #ifdef DEBUG_DETAILED
+      __sleep(20);
+      cout << "#" << fmt( myid, FBLUE, 2 ) << ": found via pointsLocal:" << fmt( pointsLocal.size() , FRED, 2 ) << "\n";
+      cout << "#" << fmt( myid, FBLUE, 2 ) << ": found via histogram :" << fmt( found , FRED, 2 ) << "\n";
+      // cout << "#" << fmt( myid, FBLUE, 2 ) << ": histogram.lsize:" << fmt(found - histo.lbegin(), FRED, 2 ) << endl;
+    #endif
+  #endif
+
+  dash::barrier( ); // required: unit0 must have finished zeroing/filling its local histogram and assigning 'found' before other units add into it
+  if( 0 != myid ) {
+    dash::transform(
+      histo.lbegin ( ) ,
+      histo.lend ( ) ,
+      histo.begin ( ) ,
+      histo.begin ( ) ,
+      dash::plus ( ) );
+  }
+
+  // free randMat and threshMask
+  randMat.deallocate();
+  threshMask.deallocate();
+
+  // in this array will be the distribution info for creating buckets
+  unitRange * distr = static_cast( std::malloc( nUnits * sizeof(unitRange) ) );
+  unitRange * const distr_end = distr + nUnits;
+
+
+  // unit 0 has to wait for the rest to finish adding their values
+  dash::barrier();
+
+  if( 0 == myid ) { // calculate bucket distribution
+
+    /* calculate how many elements each unit should hold ideally
+     * increment by one for safety guarantees in distribution
+     * that is the assumption (ideal * nUnits > foundAllSize) == true
+     */
+    uint ideal = std::max( static_cast( MIN_NUM_ELEM_PER_UNIT ), (found / nUnits) + 1 );
+    // cout << "elements:" << found << "\n";
+
+    #ifdef DEBUG
+      __sleep();
+      cout << "ideal number of elements per unit: " << fmt( ideal, FRED ) << endl;
+
+      cout << "Histogram: ";
+      for( size_t i = 0; i < histo.lsize(); ++i ) {
+        if( histo.local[i] ) cout << fmt( i, FCYN ) << ":" << fmt( histo.local[i], FRED ) << ", ";
+      }
+      cout << endl;
+    #endif
+
+
+    // begin - 1 for loop logic (starting with prefix ++)
+    unitRange * uRPtr = distr;
+    uRPtr->begin = 0;
+
+    /* the loop for calculation of the distribution got a bit more complex
+     * because I wanted to iterate only once over "distr"
+     * therefore no initialization beforehand is needed
+     */
+    if ( 1 == nUnits )
+    {
+      uRPtr->end = MAX_KEY;
+      uRPtr->count = found;
+
+      ++uRPtr;
+    }else{
+      // T begin = 0;
+      uRPtr->count = 0; // needed if too few elements for acc >= ideal
+      uint acc = 0;
+      uint * hisIt = histo.lbegin();
+
+      // actual calculation of distribution (lsize()-1: the last entry is the 'found' counter, not a value bin)
+      for( size_t i = 0; i < histo.lsize()-1 ; ++i ) {
+        acc += *hisIt++;
+
+        if( acc >= ideal ){
+
+          // uRPtr->begin = begin;
+          uRPtr->end = i;
+          uRPtr->count = acc;
+
+          // begin = i+1;
+          acc = 0;
+
+          (++uRPtr)->begin = i + 1;
+          // uRPtr->count = 0;
+        }
+      }
+      if( acc >= MIN_NUM_ELEM_PER_UNIT ){ uRPtr->count = acc; }
+      else
+      {
+        if( uRPtr == distr ){ uRPtr->count += acc; }
+        else{ (--uRPtr)->count += acc; } // too small for an own bucket: merge the remainder into the previous one
+      }
+
+      uRPtr->end = MAX_KEY ;
+
+    }
+    // set the rest to zero
+    // while( ++uRPtr < distr_end ) { uRPtr->begin = 0; uRPtr->end = 0; uRPtr->count = 0; }
+    if( ++uRPtr < distr_end){
+      std::memset( uRPtr, 0, (distr_end - uRPtr) * sizeof(unitRange) );
+    }
+
+  } // end of unit 0 only part
+
+
+  /* Distribute calculated distribution data to every unit.
+   * Distribution data consists of unitRange structs.
+   * With this every unit knows which unit is in charge for
+   * sorting a specific value range.
+   * Every unit is then creating buckets to send the values to
+   * the responsible unit.
+   */
+  dart_ret_t ret = dart_bcast(
+    static_cast( distr ) , // buf
+    nUnits * sizeof(unitRange) , // nelem
+    DART_TYPE_BYTE , // dtype
+    team_unit_t(0) , // root/source
+    team.dart_id( ) ); // team
+
+  if( DART_OK != ret ) cout << "An Error while BCAST has occured!" << endl;
+
+  #ifdef DEBUG
+    __sleep();
+    cout << "#" << fmt( myid, FBLUE, 2 ) << ": ";
+    for( unitRange * i = distr; i < distr_end; ++i )
+    {
+      cout << "Range: "
+           << fmt( i->begin, FYEL )
+           << "-"
+           << fmt( i->end, FGREEN )
+           << " c:"
+           << i->count
+           << ", ";
+    }
+    cout << endl;
+  #endif
+
+  /* create dash::Array which will hold the data to be sorted.
+   * therefore the CSRPattern of DASH will be used.
+   * this allows different local sizes!
+   */
+  vector local_sizes;
+  for( unitRange * uRPtr = distr; uRPtr < distr_end; ++uRPtr )
+  {
+    local_sizes.push_back( uRPtr->count );
+  }
+
+
+  pattern_t pattern( local_sizes );
+  Array toSort( pattern );
+
+
+  //std::memset(toSort.lbegin(), 0, toSort.lsize() * sizeof(Point)); // not necessary anymore
+
+  #ifdef DEBUG
+    dash::barrier(); // only needed for better IO Output
+
+    __sleep();
+    cout << "#" << fmt( myid, FBLUE, 2 ) << ": local sizes: ";
+    for( auto it : local_sizes ){ cout << it << ", ";}
+    cout << endl;
+
+    #ifdef DEBUG_DETAILED
+      if( local_sizes.size() > myid ){
+        __sleep(20);
+        cout << "#" << fmt( myid, FBLUE, 2 )
+             << ":"
+             << " vec.size: " << fmt( local_sizes.size(), FCYN )
+             //<< " veCount: " << fmt( local_sizes[myid] , FCYN ) // careful! may not exist -> segfault chance!
+             << " lsize: " << fmt( toSort.lsize( ) , FCYN )
+             << " toSort lend - lbegin: " << fmt( toSort.lend ( ) - toSort.lbegin( ), FCYN )
+             << " Pattern size: " << fmt( pattern.size( ) , FCYN )
+             << " toSort size: " << fmt( toSort.size ( ) , FCYN )
+             << " sizeof(Point): " << fmt( sizeof(Point) , FCYN )
+             << " alignment: " << fmt( (reinterpret_cast(toSort.lbegin( )) % 64), FCYN )
+             << endl;
+      }
+
+      #if 0
+        dash::barrier(); // only needed for better IO Output
+
+        if( local_sizes.size() > myid ){
+          __sleep();
+          uint count = 0;
+          cout << "#" << myid << ": ";
+
+          for( Point * i = toSort.lbegin( ); i < toSort.lend( ) ; ++i ) {
+            cout << ++count << ", ";
+            i->value = myid; // test member access
+            i->row = myid; // test member access
+            i->col = myid; // test member access
+          }
+          cout << endl;
+        }
+      #endif
+    #endif
+  #endif
+
+
+  /* Each unit creates buckets that are sent to the responsible unit later.
+   * Note: every unit will get data from other units (e.g. if distribution specifies range 0-0)
+   * This is an array of vectors like "vector buckets[ involvedUnits ];"
+   * But on the Heap!
+   */
+  const size_t involvedUnits = local_sizes.size( );
+
+  vector ** buckets = static_cast **>( std::malloc( involvedUnits * sizeof(vector*) ) );
+  vector ** const buckets_end = buckets + involvedUnits;
+
+  // create a vector for each bucket pointer
+  for( vector ** bucket = buckets; bucket < buckets_end; ++bucket ){ *bucket = new vector; }
+
+
+  // iterate over pointsLocal and fill the buckets
+  for( Point const * lclPt = pointsLocal.data(); lclPt < pointsLocal.data() + pointsLocal.size(); ++lclPt )
+  {
+    unitRange * uRPtr = distr;
+    for( vector ** bucket = buckets; bucket < buckets_end; ++uRPtr, ++bucket )
+    {
+      if( lclPt->value <= uRPtr->end ) // if value is in Range for this unit
+      {
+        (*bucket)->push_back(*lclPt);
+        break;
+      }
+    }
+  }
+
+  // pointsLocal no longer needed
+  pointsLocal.clear();
+
+  #ifdef DEBUG
+    dash::barrier(); // only needed for better IO Output
+
+    __sleep();
+    uint counter = 0;
+
+    for( vector ** bucket = buckets; bucket < buckets_end; ++bucket, ++counter )
+    {
+      cout << "#" << fmt( myid, FBLUE, 2 ) << ": bucket: " << fmt( counter, BBLUE ) << ": ";
+
+      for( auto it : **bucket ){ cout << it << "-";}
+      cout << endl;
+    }
+  #endif
+
+  /* In comTable will be the information which unit has how many elements of type Point
+   * for other units and itself.
+   * The table can be thought of like "comTable[nUnits][local_sizes.size()]"
+   * First every unit saves the information in row[0]
+   */
+  uint * comTable = static_cast( std::malloc( sizeof(uint) * nUnits * involvedUnits ) );
+  uint * const comTable_end = comTable + ( nUnits * involvedUnits );
+
+  uint * thisUnitsRow_begin = new uint[involvedUnits];
+  uint * const thisUnitsRow_end = thisUnitsRow_begin + involvedUnits;
+
+  uint * poiCount = thisUnitsRow_begin;
+  for( vector ** bucket = buckets; bucket < buckets_end; ++bucket, ++poiCount )
+  {
+    *poiCount = (*bucket)->size();
+  }
+
+  #ifdef DEBUG
+    dash::barrier(); // only needed for better IO Output
+
+    __sleep();
+
+    cout << "#" << fmt( myid, FBLUE, 2 ) << ": poiCount: ";
+    for( poiCount = thisUnitsRow_begin; poiCount < thisUnitsRow_end; ++poiCount )
+    {
+      cout << fmt( *poiCount, FRED ) << ", ";
+    } cout << endl;
+  #endif
+
+  ret = dart_allgather(
+    thisUnitsRow_begin, // sendbuf
+    comTable , // recvbuf
+    involvedUnits , // nelem
+    DART_TYPE_UINT , // dtype
+    team.dart_id( ) ); // team
+
+
+  if( DART_OK != ret ) cout << "An Error while allgather has occured!" << endl;
+
+  #ifdef DEBUG_DETAILED
+    dash::barrier(); // only needed for better IO Output
+
+    __sleep();
+    counter = 0;
+    cout << "#" << fmt( myid, FBLUE, 2 ) << ": comTable:\n";
+    for( poiCount = comTable; poiCount < comTable_end; ++poiCount, ++counter )
+    {
+      if( counter == involvedUnits ) counter = 0, cout << "\n";
+      cout << fmt( *poiCount, FMAG ) << ", ";
+    } cout << endl;
+  #endif
+
+
+  /* only involvedUnits can have data for memcpy!
+   * and the units must have data for themselves as well
+   * logical lookup -> comTable[myid][myid] > 0
+   */
+  size_t forMySelf = comTable[myid * involvedUnits + myid];
+
+  if( myid < involvedUnits && forMySelf > 0)
+  {
+    uint * forThisUnit = comTable + myid;
+    uint lclOffset = 0;
+
+    // the points of the units with a lower id are placed in front of this unit's own points
+    for( int ID = 0; ID < myid; ++ID)
+    {
+      // lclOffset += comTable[ID * involvedUnits + myid]; //replaced by pointer logic
+      lclOffset += *(forThisUnit);
+      forThisUnit += involvedUnits ;
+    }
+
+    Point * lclDest = toSort.lbegin() + lclOffset;
+
+    std::memcpy( lclDest, buckets[myid]->data(), sizeof(Point) * forMySelf );
+  }
+
+
+  /* copy data for other units in a circle.
+   * every unit sends its data to its "right" neighbour
+   */
+  uint myRowOffset = myid * involvedUnits;
+
+  auto baseIterator = toSort.begin( );
+  // auto globDest = baseIterator +10;
+  int nextID = (myid + 1) % involvedUnits;
+
+
+  for( uint counter = 0; counter < involvedUnits; ++counter, nextID = (nextID + 1) % involvedUnits )
+  {
+    // if this unit has data for the next unit and is a remote unit (local unit was handled before)
+    if( comTable[ myRowOffset + nextID ] && nextID != myid )
+    {
+      // uint offsetToUnit = 0;
+      uint offsetOnUnit = 0;
+      uint * forNextUnit = comTable + nextID;
+      // extent_t * unitsLclSize = local_sizes.data();
+
+      // calculate offsetToUnit // not needed anymore since using .pattern.global(nextID, offsetOnUnit);
+      // for( int ID = 0; ID < nextID; ++ID, ++unitsLclSize ){ offsetToUnit += *unitsLclSize; }
+
+
+      // calculate offsetOnUnit: everything the units with a lower id send to nextID comes first
+      // offsetOnUnit += comTable[ID * involvedUnits + nextID]; //replaced by pointer logic
+      for( int ID = 0; ID < myid; ++ID, forNextUnit += involvedUnits ){ offsetOnUnit += *(forNextUnit); }
+
+
+      // global iterator/pointer to destination:
+      // auto globDest = baseIterator + offsetToUnit + offsetOnUnit;
+      auto globDest = baseIterator + toSort.pattern().global( team_unit_t(nextID), offsetOnUnit );
+
+      //dash::copy / MPI_Put to target unit
+      dash::copy( buckets[nextID]->data(), buckets[nextID]->data() + buckets[nextID]->size(), globDest );
+    }
+  }
+
+  for( vector ** bucket = buckets; bucket < buckets_end; ++bucket ){ delete *bucket; /* = new vector; */}
+
+  // wait before sorting to finish all puts from dash::copy
+  dash::barrier();
+
+  #ifdef DEBUG_DETAILED
+
+    __sleep();
+
+    // if( 0 == myid )
+    // {
+    //   vector testA;
+
+    //   for( Point it : toSort ){ testA.push_back( it ); }
+
+    //   std::sort( testA.begin( ), testA.end( ) );
+
+    //   cout << "here comes testA:\n";
+    //   for( auto it : testA ){ cout << it << "\n" ; }
+    //   cout << endl;
+    // }
+
+
+    if( 0 == myid )
+    {
+      cout << "toSort Array before sort:\n";
+      for( Point it : toSort ){
+        cout << it << "-";
+      } cout << endl;
+    }
+
+
+    // cout << "#" << fmt( myid, FBLUE, 2 ) << ": toSort Array before sort:\n";
+    // for( Point * it = toSort.lbegin(); it < toSort.lend(); ++it ){
+    //   cout << *it << "\n";
+    // } cout << endl;
+  #endif
+
+
+  // if( 0 < toSort.lsize() )
+  // {
+
+  std::stable_sort( toSort.lbegin( ), toSort.lend( ) );
+
+  // }
+  /* calculate local sizes for result array
+   * local_sizes will be recycled therefore
+   */
+
+  local_sizes.clear();
+  size_t chunk = ( 0 == nelts ) ? 0 : toSort.size( ) / nelts; // nelts == 0 would divide by zero; it now yields an empty result
+
+  if( 0 == chunk ){
+
+    local_sizes.push_back( nelts );
+    for( int id = 1; id < nUnits; ++id ) { local_sizes.push_back( 0 ); }
+
+  }else
+  {
+    // int diff = 0 ;
+    int div = 0 ;
+    int mod = 0 ;
+    uint countSoFar = 0 ;
+    uint lastIX = 0 ;
+    uint resulCount = 0 ;
+    uint rest = 0 ;
+    uint elemOnUnit = 0 ;
+    uint remain = nelts ;
+
+
+    for( unitRange * uRPtr = distr; uRPtr < distr_end; ++uRPtr )
+    {
+      // diff = uRPtr->count - rest;
+
+      // if( 0 > diff )
+      // {
+      //   elemOnUnit = 0 ;
+      //   rest -= uRPtr->count ;
+      // }else
+      // {
+      countSoFar += uRPtr->count;
+      lastIX = countSoFar - 1 ;
+      div = lastIX / chunk ;
+      // mod = lastIX % chunk ;
+
+      // if( 0 == myid ){
+      //   cout << "count:" << uRPtr->count << " rest:" << rest << "\n";
+      // }
+
+      elemOnUnit = div + 1 - resulCount ;
+      resulCount = div + 1 ;
+      // rest = mod ;
+
+      if( remain < elemOnUnit )
+      {
+        elemOnUnit = remain;
+        remain = 0 ;
+      }else
+      {
+        remain -= elemOnUnit;
+      }
+      // }
+
+
+      // if(0 ==myid)cout << "el:" << elemOnUnit << " r:" << rest << ", ";
+      local_sizes.push_back( elemOnUnit );
+
+    } //if(0 ==myid)cout << endl;
+  }
+
+  // allocate result dash::Array with CSR Pattern
+  pattern_t pattern_result ( local_sizes );
+  result.allocate ( pattern_result );
+  uint pos = 0;
+
+  /* generate local result.
+   * start by calculating local start index.
+   * using global indices therefore.
+   */
+  if( 0 < result.lsize( ) )
+  {
+    /* result.pattern().global(0) -> corresponds to: "how many result elements do the units before this unit have"
+     * toSort.pattern().global(0) -> corresponds to: "how many sorted elements do the units before this unit have"
+     */
+    pos = result.pattern().global(0) * chunk - toSort.pattern().global(0);
+    Point * src = toSort.lbegin() + pos;
+
+    for( value * dst = result.lbegin(); dst < result.lend(); ++dst, src += chunk )
+    {
+      dst->col = src->col;
+      dst->row = src->row;
+    }
+  }
+
+  dash::barrier();
+
+  #ifdef DEBUG
+    #ifdef DEBUG_DETAILED
+      __sleep();
+
+      cout << "#" << fmt( myid, FBLUE, 2 )
+           << ": chunk:" << chunk
+           << " res:" << fmt( result.pattern().global(0), FCYN, 2)
+           << " toS:" << fmt( toSort.pattern().global(0), FCYN, 2)
+           << " pos:" << fmt( pos, FCYN, 2)
+           << " lclSizesResult: ";
+
+      for( extent_t it : local_sizes ){ cout << it << ", "; }
+      cout << endl;
+    #endif
+
+    dash::barrier();
+
+    __sleep();
+
+    if( 0 == myid )
+    {
+      cout << "toSort Array after sort:\n";
+      for( Point it : toSort ){
+        cout << it << "-";
+      } cout << endl;
+    }
+  #endif
+
+  //no frees and deletes for potential speed up through cumulative memory freeing
+}
\ No newline at end of file