Commit

Merge pull request oneapi-src#79 from oneapi-src/ratul/dl-cifar/segfault_fix

dl-cifar - seg fault fix
rmukhopa authored Nov 14, 2024
2 parents 8543915 + b6611f1 commit 23b0362
Showing 6 changed files with 106 additions and 54 deletions.
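In brief: the layers and the network manager previously took dnnl::engine and dnnl::stream by value and stored moved-in copies, while main() built the SYCL context and queue as stack locals. This commit switches those members to non-owning pointers so every layer shares the single engine/stream pair created in main(), heap-allocates the SYCL device/context/queue with matching deletes at shutdown, re-enables weight reordering through a new ConvLayer::reorderWeightsIfRequired() step, and replaces the deprecated sycl::gpu_selector() with SYCL 2020's sycl::gpu_selector_v. Below is a minimal sketch of the ownership change only, using hypothetical Layer types rather than the workload's actual classes:

```cpp
#include <utility>
#include <oneapi/dnnl/dnnl.hpp>
using namespace dnnl;

// Before: each layer stored its own moved-in copy of the engine and stream.
struct LayerByValue {
    engine eng_;
    stream s_;
    LayerByValue(engine eng, stream s) : eng_(std::move(eng)), s_(std::move(s)) {}
};

// After: each layer holds non-owning pointers to the one engine/stream pair
// whose lifetime main() controls.
struct LayerByPointer {
    engine* eng_;
    stream* s_;
    LayerByPointer(engine* eng, stream* s) : eng_(eng), s_(s) {}
};

int main() {
    engine eng(engine::kind::cpu, 0);  // CPU engine just for this sketch
    stream s(eng);
    LayerByValue   v(eng, s);    // old style: per-layer handle copies
    LayerByPointer p(&eng, &s);  // new style: shared, borrowed handles
    return 0;
}
```

Note that dnnl::engine and dnnl::stream are themselves reference-counted handles, so the by-value copies were not inherently unsafe; the pointer form guarantees one shared instance and makes the lifetime explicit.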
64 changes: 45 additions & 19 deletions dl-mnist/SYCL/conv_layer.onednn.cpp
@@ -86,16 +86,16 @@ namespace dl_infra {
 }
 
 ConvLayer::ConvLayer(WorkloadParams* workloadParams, int index_in_network, int total_layers_in_nw,
-Timer* timer, TensorMgr* tensor_mgr, engine eng, stream s,
+Timer* timer, TensorMgr* tensor_mgr, engine *eng, stream *s,
 int input_tensor_dims[], int filter_tensor_dims[], int output_tensor_dims[]): workloadParams_(workloadParams) {
 
 Tracer::func_begin("ConvLayer::ConvLayer");
 
 index_in_network_ = index_in_network;
 total_layers_in_nw_ = total_layers_in_nw;
 timer_ = timer;
-eng_ = std::move(eng);
-s_ = std::move(s);
+eng_ = eng;
+s_ = s;
 tensor_mgr_ = tensor_mgr;
 
 input_tensor_dims_ = input_tensor_dims;
@@ -106,9 +106,9 @@ namespace dl_infra {
 }
 
 ConvLayer::ConvLayer(WorkloadParams* workloadParams, int index_in_network, int total_layers_in_nw,
-Timer* timer, TensorMgr* tensor_mgr, IConvLayer* nextConvLayer, engine eng, stream s,
+Timer* timer, TensorMgr* tensor_mgr, IConvLayer* nextConvLayer, engine *eng, stream *s,
 int input_tensor_dims[], int filter_tensor_dims[], int output_tensor_dims[])
-: ConvLayer(workloadParams, index_in_network, total_layers_in_nw, timer, tensor_mgr, std::move(eng), std::move(s), input_tensor_dims, filter_tensor_dims, output_tensor_dims) {
+: ConvLayer(workloadParams, index_in_network, total_layers_in_nw, timer, tensor_mgr, eng, s, input_tensor_dims, filter_tensor_dims, output_tensor_dims) {
 nextConvLayer_ = nextConvLayer;
 };
 
@@ -132,7 +132,7 @@ namespace dl_infra {
 #ifdef DEVICE_TIMER
 Time start = get_time_now();
 #endif
-conv_pd = convolution_forward::primitive_desc(eng_,
+conv_pd = convolution_forward::primitive_desc(*eng_,
 prop_kind::forward_inference, algo,
 tensor_mgr_->getTensorBagAt(index_in_network_)->conv_src_md,
 tensor_mgr_->getTensorBagAt(index_in_network_)->conv_weights_md,
@@ -151,10 +151,33 @@ namespace dl_infra {
 timer_->recordOpTimeTaken(index_in_network_, calculate_op_time_taken(start), "CONV_FORWARD CREATION");
 #endif
 createWorkspace();
 
+reorderWeightsIfRequired();
+
 Tracer::func_end("ConvLayer::initialize");
 }
 
+void ConvLayer::reorderWeightsIfRequired() {
+need_reorder_weights_ = conv_pd.weights_desc() != tensor_mgr_->getTensorBagAt(index_in_network_)->weights_mem_.get_desc();
+// if(need_reorder_weights_)
+//     std::cout << "need_reorder_weights_" << std::endl;
+auto conv_weights_mem = need_reorder_weights_ ? memory(conv_pd.weights_desc(), *eng_) : tensor_mgr_->getTensorBagAt(index_in_network_)->weights_mem_;
+
+if (need_reorder_weights_) {
+#ifdef DEVICE_TIMER
+start = get_time_now();
+#endif
+auto reorder_weights = reorder(tensor_mgr_->getTensorBagAt(index_in_network_)->weights_mem_, conv_weights_mem);
+reorder_weights.execute(*s_,
+{{DNNL_ARG_FROM, tensor_mgr_->getTensorBagAt(index_in_network_)->weights_mem_},
+{DNNL_ARG_TO, conv_weights_mem}});
+s_->wait(); // wait for the reorder to complete
+tensor_mgr_->getTensorBagAt(index_in_network_)->weights_mem_ = conv_weights_mem;
+#ifdef DEVICE_TIMER
+timer_->recordOpTimeTaken(index_in_network_, calculate_op_time_taken(start), "REORDER WEIGHTS");
+#endif
+}
+}
 
 void ConvLayer::doIOTensorAndWSAllocs() {
 Tracer::func_begin("ConvLayer::doTensorAndWSAllocs");
 
@@ -167,7 +190,7 @@ namespace dl_infra {
 #ifdef DEVICE_TIMER
 Time start = get_time_now();
 #endif
-auto sycl_queue = dnnl::sycl_interop::get_queue(dnnl::stream(eng_));
+auto sycl_queue = dnnl::sycl_interop::get_queue(dnnl::stream(*eng_));
 sycl::free(tensor_mgr_->getTensorBagAt(index_in_network_)->src_mem_.get_data_handle(), sycl_queue);
 #ifdef DEVICE_TIMER
 timer_->recordOpTimeTaken(index_in_network_, calculate_op_time_taken(start), "FREE_INPUT_DEV_PTR");
@@ -194,7 +217,7 @@ namespace dl_infra {
 #ifdef DEVICE_TIMER
 Time start = get_time_now();
 #endif
-conv_scratchpad_mem_ = memory(conv_pd.scratchpad_desc(), eng_);
+conv_scratchpad_mem_ = memory(conv_pd.scratchpad_desc(), *eng_);
 #ifdef DEVICE_TIMER
 timer_->recordOpTimeTaken(index_in_network_, calculate_op_time_taken(start), "MEMALLOC_SCRATCHPAD_DEV_MEM");
 #endif
@@ -225,6 +248,8 @@ namespace dl_infra {
 need_reorder_src_ = conv_pd.src_desc() != tensor_mgr_->getTensorBagAt(index_in_network_)->src_mem_.get_desc();
 
 //need_reorder_weights_ = conv_pd.weights_desc() != tensor_mgr_->getTensorBagAt(index_in_network_)->weights_mem_.get_desc();
+// if(need_reorder_weights_)
+//     std::cout << "need_reorder_weights_" << std::endl;
 
 if(index_in_network_ == total_layers_in_nw_-1) {
 need_reorder_dst_ = conv_pd.dst_desc() != tensor_mgr_->getTensorBagAt(index_in_network_)->dst_mem_.get_desc();
@@ -239,13 +264,14 @@ namespace dl_infra {
 #ifdef DEVICE_TIMER
 start = get_time_now();
 #endif
-auto conv_src_mem = need_reorder_src_ ? memory(conv_pd.src_desc(), eng_) : tensor_mgr_->getTensorBagAt(index_in_network_)->src_mem_;
-//auto conv_weights_mem = need_reorder_weights_ ? memory(conv_pd.weights_desc(), eng_) : tensor_mgr_->getTensorBagAt(index_in_network_)->weights_mem_;
+auto conv_src_mem = need_reorder_src_ ? memory(conv_pd.src_desc(), *eng_) : tensor_mgr_->getTensorBagAt(index_in_network_)->src_mem_;
+//auto conv_weights_mem = need_reorder_weights_ ? memory(conv_pd.weights_desc(), *eng_) : tensor_mgr_->getTensorBagAt(index_in_network_)->weights_mem_;
+auto conv_weights_mem = tensor_mgr_->getTensorBagAt(index_in_network_)->weights_mem_;
 
 // in this workload we will forego reordering of weights
 // we will assume that the pre-trained weights have been created in the memory format as determined by conv_pd.weights_desc()
-auto conv_weights_mem = tensor_mgr_->getTensorBagAt(index_in_network_)->weights_mem_;
-auto conv_dst_mem = memory(conv_pd.dst_desc(), eng_, tensor_mgr_->getTensorBagAt(index_in_network_)->dst_mem_.get_data_handle());
+//auto conv_weights_mem = tensor_mgr_->getTensorBagAt(index_in_network_)->weights_mem_;
+auto conv_dst_mem = memory(conv_pd.dst_desc(), *eng_, tensor_mgr_->getTensorBagAt(index_in_network_)->dst_mem_.get_data_handle());
 tensor_mgr_->getTensorBagAt(index_in_network_)->dst_mem_ = conv_dst_mem;
 #ifdef DEVICE_TIMER
 timer_->recordOpTimeTaken(index_in_network_, calculate_op_time_taken(start), "REORDERED MEM CREATE");
@@ -257,8 +283,8 @@ namespace dl_infra {
 #endif
 auto reorder_src = reorder(tensor_mgr_->getTensorBagAt(index_in_network_)->src_mem_, conv_src_mem);
 reorder_src.execute(
-s_, {{DNNL_ARG_FROM, tensor_mgr_->getTensorBagAt(index_in_network_)->src_mem_}, {DNNL_ARG_TO, conv_src_mem}});
-s_.wait(); // wait for the reorder to complete
+*s_, {{DNNL_ARG_FROM, tensor_mgr_->getTensorBagAt(index_in_network_)->src_mem_}, {DNNL_ARG_TO, conv_src_mem}});
+s_->wait(); // wait for the reorder to complete
 #ifdef DEVICE_TIMER
 timer_->recordOpTimeTaken(index_in_network_, calculate_op_time_taken(start), "REORDER SRC");
 #endif
@@ -267,10 +293,10 @@ namespace dl_infra {
 // if (need_reorder_weights_) {
 // //start = get_time_now();
 // auto reorder_weights = reorder(tensor_mgr_->getTensorBagAt(index_in_network_)->weights_mem_, conv_weights_mem);
-// reorder_weights.execute(s_,
+// reorder_weights.execute(*s_,
 // {{DNNL_ARG_FROM, tensor_mgr_->getTensorBagAt(index_in_network_)->weights_mem_},
 // {DNNL_ARG_TO, conv_weights_mem}});
-// s_.wait(); // wait for the reorder to complete
+// s_->wait(); // wait for the reorder to complete
 // timer_->recordOpTimeTaken(index_in_network_, calculate_op_time_taken(start), "REORDER WEIGHTS");
 // }
 //}
@@ -281,10 +307,10 @@ namespace dl_infra {
 // conv_.execute(s_,
 // {{DNNL_ARG_SRC, tensor_mgr_->getTensorBagAt(index_in_network_)->src_mem_}, {DNNL_ARG_WEIGHTS, tensor_mgr_->getTensorBagAt(index_in_network_)->weights_mem_},
 // {DNNL_ARG_DST, tensor_mgr_->getTensorBagAt(index_in_network_)->dst_mem_}});
-conv_.execute(s_,
+conv_.execute(*s_,
 {{DNNL_ARG_SRC, conv_src_mem}, {DNNL_ARG_WEIGHTS, conv_weights_mem},
 {DNNL_ARG_DST, conv_dst_mem}});
-s_.wait();
+s_->wait();
 #ifdef DEVICE_TIMER
 timer_->recordOpTimeTaken(index_in_network_, calculate_op_time_taken(start), "CONV_FORWARD EXECUTION");
 #endif
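The new ConvLayer::reorderWeightsIfRequired() above follows oneDNN's usual layout-propagation pattern: let the primitive descriptor pick its preferred weights format, compare it with the format the weights actually live in, and reorder once at initialization so the convolution never runs on mismatched layouts. A self-contained sketch of the same pattern, assuming a CPU engine and illustrative tensor shapes (the workload itself uses a SYCL GPU engine and its TensorMgr-managed memory):

```cpp
#include <oneapi/dnnl/dnnl.hpp>
using namespace dnnl;

int main() {
    engine eng(engine::kind::cpu, 0);
    stream s(eng);

    // Shapes are illustrative only: 1x8x32x32 input, 16 3x3 filters, no padding.
    memory::dims src_dims{1, 8, 32, 32}, wei_dims{16, 8, 3, 3}, dst_dims{1, 16, 30, 30};

    // format_tag::any lets the primitive choose its preferred (possibly blocked) layouts.
    auto conv_pd = convolution_forward::primitive_desc(eng,
        prop_kind::forward_inference, algorithm::convolution_direct,
        memory::desc(src_dims, memory::data_type::f32, memory::format_tag::any),
        memory::desc(wei_dims, memory::data_type::f32, memory::format_tag::any),
        memory::desc(dst_dims, memory::data_type::f32, memory::format_tag::any),
        memory::dims{1, 1}, memory::dims{0, 0}, memory::dims{0, 0});

    // Weights as the application supplies them, in plain OIHW order.
    memory user_weights({wei_dims, memory::data_type::f32, memory::format_tag::oihw}, eng);

    // Same check-then-reorder step as reorderWeightsIfRequired(): reorder only
    // when the primitive wants a different layout, and do it once up front.
    bool need_reorder = conv_pd.weights_desc() != user_weights.get_desc();
    memory conv_weights = need_reorder ? memory(conv_pd.weights_desc(), eng) : user_weights;
    if (need_reorder) {
        reorder(user_weights, conv_weights).execute(s,
            {{DNNL_ARG_FROM, user_weights}, {DNNL_ARG_TO, conv_weights}});
        s.wait();  // make the reordered weights visible before first use
    }
    return 0;
}
```

Doing this once at initialization, and caching the result back into the tensor bag, avoids paying the reorder cost on every inference iteration.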
11 changes: 6 additions & 5 deletions dl-mnist/SYCL/conv_layer.onednn.h
@@ -47,8 +47,8 @@ namespace dl_infra {
 private:
 int index_in_network_, total_layers_in_nw_;
 Timer* timer_;
-engine eng_;
-stream s_;
+engine *eng_;
+stream *s_;
 
 TensorMgr* tensor_mgr_;
 
@@ -71,7 +71,7 @@ namespace dl_infra {
 bool add_mem_transfer_time_ = false;
 
 bool need_reorder_src_ = false;
-//bool need_reorder_weights_ = false;
+bool need_reorder_weights_ = false;
 bool need_reorder_dst_ = false;
 
 void write_to_dnnl_memory(void *handle, dnnl::memory &mem);
@@ -83,10 +83,10 @@ namespace dl_infra {
 
 public:
 ConvLayer(WorkloadParams* workloadParams, int index_in_network, int total_layers_in_nw,
-Timer* timer, TensorMgr* tensor_mgr, engine eng, stream s,
+Timer* timer, TensorMgr* tensor_mgr, engine *eng, stream *s,
 int input_tensor_dims[], int filter_tensor_dims[], int output_tensor_dims[]);
 ConvLayer(WorkloadParams* workloadParams, int index_in_network, int total_layers_in_nw,
-Timer* timer, TensorMgr* tensor_mgr, IConvLayer* nextConvLayer, engine eng, stream s,
+Timer* timer, TensorMgr* tensor_mgr, IConvLayer* nextConvLayer, engine *eng, stream *s,
 int input_tensor_dims[], int filter_tensor_dims[], int output_tensor_dims[]);
 ~ConvLayer();
 
@@ -106,6 +106,7 @@ namespace dl_infra {
 void createWorkspace();
 void createTensorDescriptors();
 void createTensors();
+void reorderWeightsIfRequired();
 
 void calculateStrideDims();
 };
8 changes: 4 additions & 4 deletions dl-mnist/SYCL/dl_network_mgr.onednn.h
@@ -53,8 +53,8 @@ namespace dl_infra {
 class DlNetworkMgr {
 private:
 Timer* timer_, *dataFileReadTimer_;
-engine eng_;
-stream s_;
+engine *eng_;
+stream *s_;
 WorkloadParams* workloadParams_;
 //WorkloadParams::TensorMemPolicy tensorMemPolicy_;
 
@@ -69,8 +69,8 @@ namespace dl_infra {
 void initializeNetwork(string networkName);
 
 public:
-DlNetworkMgr(WorkloadParams* workloadParams, engine eng, stream s, Timer* timer, Timer* dataFileReadTimer)
-: workloadParams_(workloadParams), eng_(std::move(eng)), s_(std::move(s)), timer_(timer), tensorMgr(0), dataFileReadTimer_(dataFileReadTimer) {}
+DlNetworkMgr(WorkloadParams* workloadParams, engine* eng, stream* s, Timer* timer, Timer* dataFileReadTimer)
+: workloadParams_(workloadParams), eng_(eng), s_(s), timer_(timer), tensorMgr(0), dataFileReadTimer_(dataFileReadTimer) {}
 void createDLNetwork(string networkName, int no_of_conv_layers, int *conv_dims);
 void executeInferenceRun(string networkName);
 };
21 changes: 13 additions & 8 deletions dl-mnist/SYCL/main.onednn.cpp
@@ -52,39 +52,39 @@ int main(int argc, const char** argv) {
 cout << endl << "\t\tWelcome to DL-MNIST workload: SYCL version." << endl << endl;
 cout << "=======================================================================" << endl;
-sycl::device* dht = new sycl::device(sycl::gpu_selector());
+sycl::device* dht = new sycl::device(sycl::gpu_selector_v);
 #ifdef DEVICE_TIMER
 start = get_time_now();
 #endif
-sycl::context context(*dht);
+sycl::context *context = new sycl::context(*dht);
 #ifdef DEVICE_TIMER
 timer->recordOpTimeTaken(1000, calculate_op_time_taken(start), "CREATE_SYCL_CONTEXT");
 #endif
 //auto propList = sycl::property_list{sycl::property::queue::in_order()};
 #ifdef DEVICE_TIMER
 start = get_time_now();
 #endif
-sycl::queue deviceQueue1(context, *dht);
+sycl::queue *deviceQueue1 = new sycl::queue(*context, *dht);
 #ifdef DEVICE_TIMER
 timer->recordOpTimeTaken(1000, calculate_op_time_taken(start), "CREATE_SYCL_QUEUE");
 #endif
 #ifdef DEVICE_TIMER
 start = get_time_now();
 #endif
 //engine eng(engine::kind::gpu, 0);
-engine eng = dnnl::sycl_interop::make_engine(*dht, context);
+engine eng = dnnl::sycl_interop::make_engine(*dht, *context);
 
 #ifdef DEVICE_TIMER
 timer->recordOpTimeTaken(1000, calculate_op_time_taken(start), "CREATE_ONEDNN_ENGINE");
 #endif
 #ifdef DEVICE_TIMER
 start = get_time_now();
 #endif
 //stream s(eng);
-stream s = dnnl::sycl_interop::make_stream(eng, deviceQueue1);
+stream s = dnnl::sycl_interop::make_stream(eng, *deviceQueue1);
 #ifdef DEVICE_TIMER
 timer->recordOpTimeTaken(1000, calculate_op_time_taken(start), "CREATE_ONEDNN STREAM");
 #endif
-SYCL sycl(dnnl::sycl_interop::get_queue(s).get_device());
+SYCL sycl(*dht);
 sycl.DisplayProperties();
 cout << "=======================================================================" << endl;
 cout << endl;
@@ -146,7 +146,7 @@ int main(int argc, const char** argv) {
 cout.precision(3);
 
 int noOfIterations = workload_params.getNoOfIterations();
-DlNetworkMgr* dlNetworkMgr = new DlNetworkMgr(&workload_params, eng, s, timer, dataFileReadTimer);
+DlNetworkMgr* dlNetworkMgr = new DlNetworkMgr(&workload_params, &eng, &s, timer, dataFileReadTimer);
 
 string networkName1_1 = "nw_1.1";
 dlNetworkMgr->createDLNetwork(networkName1_1, 10, (int *)&conv_dims1);
@@ -173,6 +173,11 @@ int main(int argc, const char** argv) {
 cout << "Final time across all networks: " << timer->getTotalOpTime() << " s" << std::endl;
 #endif
 delete dlNetworkMgr;
+
+delete dht;
+delete context;
+delete deviceQueue1;
+
 delete timer;
 
 std::cout << "dl-mnist - total time for whole calculation: " << calculate_op_time_taken(wallClockStart) - dataFileReadTimer->getTotalOpTime()<< " s" << std::endl;
(Diffs for the remaining two changed files are not shown.)
