Commit

Merge branch 'master' into nrwu/fixppevalbatchdtype

jeffra authored Apr 26, 2023
2 parents d908937 + f7d71ec · commit 77ddbe4
Showing 76 changed files with 2,136 additions and 594 deletions.
5 changes: 4 additions & 1 deletion .github/ISSUE_TEMPLATE/deepspeed_chat_bug_report.md
@@ -8,7 +8,10 @@ assignees: ''
---

**Describe the bug**
A clear and concise description of what the bug is.
A clear and concise description of what the bug is. Please include which training step you are using and which model you are training.

**Log output**
If you used `train.py` to launch the application, please include the contents of the output log file.

**To Reproduce**
Steps to reproduce the behavior:
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -27,7 +27,7 @@ repos:
hooks:
- id: yapf

- repo: https://gitlab.com/daverona/pre-commit-cpp
- repo: https://gitlab.com/daverona/pre-commit/cpp
rev: 0.8.0
hooks:
- id: clang-format # formatter of C/C++ code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available
3 changes: 2 additions & 1 deletion README.md
@@ -2,6 +2,7 @@
[![PyPI version](https://badge.fury.io/py/deepspeed.svg)](https://pypi.org/project/deepspeed/)
[![Downloads](https://pepy.tech/badge/deepspeed)](https://pepy.tech/project/deepspeed)
[![Build](https://badgen.net/badge/build/check-status/blue)](#build-pipeline-status)
[![Twitter](https://img.shields.io/twitter/follow/MSFTDeepSpeed)](https://twitter.com/intent/follow?screen_name=MSFTDeepSpeed)


<div align="center">
@@ -97,7 +98,7 @@ DeepSpeed has been integrated with several different popular open-source DL fram
<img src="docs/assets/images/transformers-light.png#gh-light-mode-only" width="250px"><img src="docs/assets/images/transformers-dark.png#gh-dark-mode-only" width="250px"> | [Transformers with DeepSpeed](https://huggingface.co/docs/transformers/main/main_classes/deepspeed) |
| <img src="docs/assets/images/accelerate-light.png#gh-light-mode-only" width="250px"><img src="docs/assets/images/accelerate-dark.png#gh-dark-mode-only" width="250px"> | [Accelerate with DeepSpeed](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) |
| <img src="docs/assets/images/lightning-light.svg#gh-light-mode-only" width="200px"><img src="docs/assets/images/lightning-dark.svg#gh-dark-mode-only" width="200px"> | [Lightning with DeepSpeed](https://lightning.ai/docs/pytorch/stable/advanced/model_parallel.html#deepspeed) |
| <img src="docs/assets/images/mosaicml.svg" width="200px"> | [MosaicML with DeepSpeed](https://docs.mosaicml.com/en/latest/trainer/using_the_trainer.html?highlight=deepspeed#deepspeed-integration) |
| <img src="docs/assets/images/mosaicml.svg" width="200px"> | [MosaicML with DeepSpeed](https://docs.mosaicml.com/projects/composer/en/latest/trainer/using_the_trainer.html?highlight=deepspeed#deepspeed-integration) |
| <img src="docs/assets/images/determined.svg" width="225px"> | [Determined with DeepSpeed](https://docs.determined.ai/latest/training/apis-howto/deepspeed/overview.html) |

---
10 changes: 10 additions & 0 deletions csrc/includes/conversion_utils.h
@@ -334,6 +334,11 @@ DS_D_INLINE __half2 to(float2 val)
{
return __float22half2_rn(val);
}
template <>
DS_D_INLINE __half2 to(float val)
{
return __float2half2_rn(val);
}

#ifdef BF16_AVAILABLE
// No direct conversion
@@ -406,6 +411,11 @@ DS_D_INLINE __nv_bfloat162 to(float2 val)
return __float22bfloat162_rn(val);
}
template <>
DS_D_INLINE __nv_bfloat162 to(float val)
{
return __float2bfloat162_rn(val);
}
template <>
DS_D_INLINE __nv_bfloat162 to(__half2 val)
{
return to<__nv_bfloat162>(to<float2>(val));
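The two new specializations above fill a gap in the conversion table: converting a single float to the packed __half2 and __nv_bfloat162 types by broadcasting the value into both lanes. Below is a minimal standalone sketch of the __half2 case; it calls the same __float2half2_rn intrinsic the new specialization wraps, but the demo kernel and names are illustrative, not repo code.

// Standalone sketch (requires sm_53+): __float2half2_rn rounds the
// float to half once and duplicates it into both lanes of the pair.
#include <cuda_fp16.h>
#include <cstdio>

__global__ void broadcast_demo(float val, __half2* out)
{
    *out = __float2half2_rn(val);  // out->x == out->y == (half)val
}

int main()
{
    __half2* d_out;
    cudaMalloc(&d_out, sizeof(__half2));
    broadcast_demo<<<1, 1>>>(1.5f, d_out);

    __half2 h;
    cudaMemcpy(&h, d_out, sizeof(__half2), cudaMemcpyDeviceToHost);
    printf("%f %f\n", __half2float(h.x), __half2float(h.y));  // prints 1.5 1.5
    cudaFree(d_out);
    return 0;
}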
5 changes: 5 additions & 0 deletions csrc/includes/quantizer.h
@@ -5,7 +5,12 @@

#pragma once

#ifdef __HIP_PLATFORM_HCC__
#include <hip/hip_cooperative_groups.h>
#else
#include <cooperative_groups.h>
#endif

#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
5 changes: 5 additions & 0 deletions csrc/spatial/includes/spatial_cuda_layers.h
@@ -9,7 +9,12 @@
#define HALF_PRECISION_AVAILABLE = 1
#endif

#ifdef __HIP_PLATFORM_HCC__
#include <hip/hip_cooperative_groups.h>
#else
#include <cooperative_groups.h>
#endif

#include <cuda.h>
#include <cuda_fp16.h>

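Both quantizer.h and spatial_cuda_layers.h gain the same portability guard: under ROCm (where __HIP_PLATFORM_HCC__ is defined) the cooperative-groups header lives under hip/. A minimal sketch of the guard in use follows; the demo kernel is illustrative, not repo code.

// Portable cooperative-groups include, matching the guard added above.
#ifdef __HIP_PLATFORM_HCC__
#include <hip/hip_cooperative_groups.h>
#else
#include <cooperative_groups.h>
#endif

namespace cg = cooperative_groups;

// Illustrative kernel: partition the block into 32-thread tiles and
// record each thread's rank within its tile.
__global__ void tile_rank_demo(int* out)
{
    cg::thread_block b = cg::this_thread_block();
    cg::thread_block_tile<32> tile = cg::tiled_partition<32>(b);
    out[b.thread_rank()] = tile.thread_rank();
}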
4 changes: 2 additions & 2 deletions csrc/transformer/general_kernels.cu
@@ -162,7 +162,7 @@ void launch_fused_add2<float>(float* out,
int total_count = batch_size * seq_length * hidden_dim / 4;
dim3 grid_dim = DS_GET_BLOCKS(total_count); //(batch_size * seq_length);

dim3 block_dim = DS_CUDA_NUM_THREADS; //(hidden_dim / 4);
dim3 block_dim = DS_CUDA_NUM_THREADS; //(hidden_dim / 4);

fused_add2_kernel<<<grid_dim, block_dim, 0, stream>>>(total_count, out, inp1, inp2);
}
@@ -179,7 +179,7 @@ void launch_fused_add2<__half>(__half* out,
int total_count = batch_size * seq_length * hidden_dim / 4;
dim3 grid_dim = DS_GET_BLOCKS(total_count); //(batch_size * seq_length);

dim3 block_dim = DS_CUDA_NUM_THREADS; //(hidden_dim / 4);
dim3 block_dim = DS_CUDA_NUM_THREADS; //(hidden_dim / 4);

fused_add2_kernel<<<grid_dim, block_dim, 0, stream>>>(total_count, out, inp1, inp2);
}
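A note on the launch arithmetic visible in both instantiations: total_count divides the element count by 4, consistent with the kernel consuming its inputs as 4-wide vectors, and DS_GET_BLOCKS / DS_CUDA_NUM_THREADS size the grid. The repo's exact macro definitions are not shown here; the sketch below is the common idiom such macros follow, stated as an assumption.

// Assumed stand-ins for DS_CUDA_NUM_THREADS / DS_GET_BLOCKS: a fixed
// block size plus ceiling division so every element is covered.
constexpr int kThreadsPerBlock = 512;  // assumed value

inline int blocks_for(int total_count)
{
    return (total_count + kThreadsPerBlock - 1) / kThreadsPerBlock;
}

// Usage mirroring launch_fused_add2 (hidden_dim handled in vectors of 4):
// int total_count = batch_size * seq_length * hidden_dim / 4;
// dim3 grid_dim(blocks_for(total_count)), block_dim(kThreadsPerBlock);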
140 changes: 83 additions & 57 deletions csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu
@@ -3,6 +3,7 @@

// DeepSpeed Team

#include "conversion_utils.h"
#include "inference_cuda_layers.h"

#ifndef __HIP_PLATFORM_HCC__
@@ -12,8 +13,9 @@
namespace cg = cooperative_groups;

__global__ void apply_rotary_pos_emb(float* mixed_query,
float* key_layer,
template <typename T>
__global__ void apply_rotary_pos_emb(T* mixed_query,
T* key_layer,
unsigned rotary_dim,
unsigned seq_len,
unsigned seq_offset,
@@ -40,8 +42,8 @@ __global__ void apply_rotary_pos_emb(float* mixed_query,
while (lane < rotary_dim) {
float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim;
inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id;
float q = mixed_query[offset + lane];
float k = key_layer[k_offset + lane];
float q = conversion::to<float>(mixed_query[offset + lane]);
float k = conversion::to<float>(key_layer[k_offset + lane]);
float rotary_sign = (lane % 2 == 1 ? -1.0 : 1.0);
float q_rot = (q * rotary_sign);
float k_rot = (k * rotary_sign);
@@ -50,59 +52,14 @@ __global__ void apply_rotary_pos_emb(float* mixed_query,
q = q * cosf(inv_freq) + q_rot * sinf(inv_freq);
k = k * cosf(inv_freq) + k_rot * sinf(inv_freq);

mixed_query[offset + lane] = q;
key_layer[k_offset + lane] = k;
mixed_query[offset + lane] = conversion::to<T>(q);
key_layer[k_offset + lane] = conversion::to<T>(k);

lane += WARP_SIZE;
}
}
}

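The templated kernel above replaces the separate __half specialization that is deleted just below; conversion::to<float> / conversion::to<T> handle the per-type casts that were previously written by hand. For clarity, here is a host-side reference (a sketch, not repo code) of the rotation the kernel applies: adjacent elements (2i, 2i+1) share one angle, and g.shfl_xor(..., 1) is how each lane obtains its partner's value.

#include <cmath>
#include <vector>

// Host-side reference of the pairwise rotary embedding.
// theta matches the kernel: pow(10000, -((lane/2)*2)/rotary_dim) * seq_id.
void rotary_reference(std::vector<float>& q, int rotary_dim, int seq_id)
{
    for (int i = 0; i + 1 < rotary_dim; i += 2) {
        float exponent = static_cast<float>(i) / rotary_dim;
        float theta = std::pow(10000.0f, -exponent) * static_cast<float>(seq_id);
        float even = q[i], odd = q[i + 1];
        // Standard 2D rotation of the pair; in the kernel the cross terms
        // arrive via shfl_xor, with the sign folded into rotary_sign.
        q[i]     = even * std::cos(theta) - odd * std::sin(theta);
        q[i + 1] = odd * std::cos(theta) + even * std::sin(theta);
    }
}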
__global__ void apply_rotary_pos_emb(__half* mixed_query,
__half* key_layer,
unsigned rotary_dim,
unsigned seq_len,
unsigned seq_offset,
unsigned num_heads,
unsigned head_size,
unsigned total_count,
int max_out_tokens)
{
cg::thread_block b = cg::this_thread_block();
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);

int id = threadIdx.x;
int gid = id >> 5;
int lane = id & 0x1f;

unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid;
unsigned offset = head_id * head_size;

unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset;
unsigned seq_index = head_id % seq_len;
unsigned k_offset = (seq_index + (head_id / seq_len) * max_out_tokens) * head_size;

if (head_id < total_count) {
while (lane < rotary_dim) {
float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim;
inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id;
float q = (float)mixed_query[offset + lane];
float k = (float)key_layer[k_offset + lane];
float rotary_sign = (lane % 2 == 1 ? -1.0 : 1.0);
float q_rot = (q * rotary_sign);
float k_rot = (k * rotary_sign);
q_rot = g.shfl_xor(q_rot, 1);
k_rot = g.shfl_xor(k_rot, 1);
q = q * cosf(inv_freq) + q_rot * sinf(inv_freq);
k = k * cosf(inv_freq) + k_rot * sinf(inv_freq);

mixed_query[offset + lane] = (__half)q;
key_layer[k_offset + lane] = (__half)k;

lane += WARP_SIZE;
}
}
}
__global__ void apply_rotary_pos_emb1(float* mixed_query,
float* key_layer,
unsigned rotary_dim,
@@ -148,8 +105,10 @@ __global__ void apply_rotary_pos_emb1(float* mixed_query,
}
}
}
__global__ void apply_rotary_pos_emb1(__half* mixed_query,
__half* key_layer,

template <typename T>
__global__ void apply_rotary_pos_emb1(T* mixed_query,
T* key_layer,
unsigned rotary_dim,
unsigned seq_len,
unsigned seq_offset,
@@ -185,8 +144,8 @@ __global__ void apply_rotary_pos_emb1(__half* mixed_query,
while (lane < rotary_dim) {
float inv_freq = (float)((lane % half_dim) * 2) / (float)rotary_dim;
inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id;
float q = (float)mixed_query[offset + lane];
float k = (float)key_layer[k_offset + lane];
float q = conversion::to<float>(mixed_query[offset + lane]);
float k = conversion::to<float>(key_layer[k_offset + lane]);
float rotary_sign = (lane > (half_dim - 1) ? -1.0 : 1.0);
float q_rot = (q * rotary_sign);
float k_rot = (k * rotary_sign);
@@ -197,8 +156,8 @@ __global__ void apply_rotary_pos_emb1(__half* mixed_query,
q = q * cosf(inv_freq) + q_rot_tmp * sinf(inv_freq);
k = k * cosf(inv_freq) + k_rot_tmp * sinf(inv_freq);

mixed_query[offset + lane] = (__half)q;
key_layer[k_offset + lane] = (__half)k;
mixed_query[offset + lane] = conversion::to<T>(q);
key_layer[k_offset + lane] = conversion::to<T>(k);

lane += WARP_SIZE;
}
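apply_rotary_pos_emb1 pairs element i with element i + half_dim (rotate-halves style) rather than with its immediate neighbor, as the (lane % half_dim) * 2 exponent and the sign flip for the upper half indicate. A host-side sketch of that reading follows; it is partly an assumption, since the shuffle width sits in an elided part of the hunk.

#include <cmath>
#include <vector>

// Assumed host-side reference for apply_rotary_pos_emb1: rotate-halves
// pairing (i, i + half_dim), with the angle shared by both partners.
void rotary_half_split(std::vector<float>& q, int rotary_dim, int seq_id)
{
    const int half_dim = rotary_dim / 2;
    for (int i = 0; i < half_dim; ++i) {
        float exponent = static_cast<float>(i * 2) / rotary_dim;
        float theta = std::pow(10000.0f, -exponent) * static_cast<float>(seq_id);
        float lo = q[i], hi = q[i + half_dim];
        q[i]            = lo * std::cos(theta) - hi * std::sin(theta);
        q[i + half_dim] = hi * std::cos(theta) + lo * std::sin(theta);
    }
}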
@@ -256,6 +215,20 @@ template void launch_apply_rotary_pos_emb<float>(float*,
bool,
cudaStream_t,
int);
#ifdef BF16_AVAILABLE
template void launch_apply_rotary_pos_emb<__nv_bfloat16>(__nv_bfloat16*,
__nv_bfloat16*,
unsigned,
unsigned,
unsigned,
unsigned,
unsigned,
unsigned,
bool,
bool,
cudaStream_t,
int);
#endif
template void launch_apply_rotary_pos_emb<__half>(__half*,
__half*,
unsigned,
@@ -269,6 +242,59 @@ template void launch_apply_rotary_pos_emb<__half>(__half*,
cudaStream_t,
int);

template __global__ void apply_rotary_pos_emb(float* mixed_query,
float* key_layer,
unsigned rotary_dim,
unsigned seq_len,
unsigned seq_offset,
unsigned num_heads,
unsigned head_size,
unsigned total_count,
int max_out_tokens);

#ifdef BF16_AVAILABLE
template __global__ void apply_rotary_pos_emb(__nv_bfloat16* mixed_query,
__nv_bfloat16* key_layer,
unsigned rotary_dim,
unsigned seq_len,
unsigned seq_offset,
unsigned num_heads,
unsigned head_size,
unsigned total_count,
int max_out_tokens);
#endif

template __global__ void apply_rotary_pos_emb(__half* mixed_query,
__half* key_layer,
unsigned rotary_dim,
unsigned seq_len,
unsigned seq_offset,
unsigned num_heads,
unsigned head_size,
unsigned total_count,
int max_out_tokens);

#ifdef BF16_AVAILABLE
template __global__ void apply_rotary_pos_emb1(__nv_bfloat16* mixed_query,
__nv_bfloat16* key_layer,
unsigned rotary_dim,
unsigned seq_len,
unsigned seq_offset,
unsigned num_heads,
unsigned head_size,
unsigned total_count,
int max_out_tokens);
#endif

template __global__ void apply_rotary_pos_emb1(__half* mixed_query,
__half* key_layer,
unsigned rotary_dim,
unsigned seq_len,
unsigned seq_offset,
unsigned num_heads,
unsigned head_size,
unsigned total_count,
int max_out_tokens);
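The block of explicit instantiations above, including the new BF16_AVAILABLE-guarded ones, is needed because the kernel templates are defined in this .cu file: every concrete type launched from another translation unit must be instantiated by name or the linker finds no symbol. A minimal sketch of the pattern (illustrative names, not repo code):

#include <cuda_fp16.h>

// kernels.cu: template body visible only in this translation unit.
template <typename T>
__global__ void scale_kernel(T* data, float factor, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] = static_cast<T>(static_cast<float>(data[i]) * factor);
}

// Explicit instantiations emit concrete __global__ symbols so other
// .cpp/.cu files can launch scale_kernel<float> / scale_kernel<__half>.
template __global__ void scale_kernel<float>(float*, float, int);
template __global__ void scale_kernel<__half>(__half*, float, int);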
/*
__global__ void apply_rotary_pos_emb(float* mixed_query,
float* key_layer,
(remaining changed files not shown)