add MQA results (huggingface#8)
mayank31398 authored Dec 12, 2022
1 parent 66ed2bd commit 55d9634
Showing 12 changed files with 214 additions and 1 deletion.
51 changes: 51 additions & 0 deletions README.md
@@ -126,3 +126,54 @@ Latency (sec)
| 1664 | oom | 13.45 | 13.36 | oom |
| 1792 | oom | 14.65 | 14.85 | oom |
| 1920 | oom | oom | oom | oom |

## GPT2 Multi-Query Attention
```python
hidden_size = 2048
n_head = 16
n_layer = 24
total_params = 1126889472
```
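
For context, multi-query attention differs from standard multi-head attention in that all query heads share a single key/value head, which shrinks the KV projections and the per-token KV cache by roughly a factor of `n_head`. The sketch below is a minimal, hand-rolled PyTorch illustration of the idea, not the implementation benchmarked here; the module and argument names (`MultiQueryAttention`, `hidden_size`, `n_head`) are only illustrative.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class MultiQueryAttention(nn.Module):
    """Illustrative causal multi-query attention: one K/V head shared by all Q heads."""

    def __init__(self, hidden_size: int, n_head: int):
        super().__init__()
        self.n_head = n_head
        self.head_dim = hidden_size // n_head
        self.q_proj = nn.Linear(hidden_size, hidden_size)         # one query projection per head
        self.kv_proj = nn.Linear(hidden_size, 2 * self.head_dim)  # a single shared K and V head
        self.out_proj = nn.Linear(hidden_size, hidden_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        b, t, _ = x.shape
        q = self.q_proj(x).view(b, t, self.n_head, self.head_dim).transpose(1, 2)  # (b, n_head, t, d)
        k, v = self.kv_proj(x).split(self.head_dim, dim=-1)                        # (b, t, d) each
        k, v = k.unsqueeze(1), v.unsqueeze(1)                                       # broadcast over heads
        att = (q @ k.transpose(-2, -1)) / self.head_dim ** 0.5
        causal = torch.triu(torch.ones(t, t, dtype=torch.bool, device=x.device), diagonal=1)
        att = F.softmax(att.masked_fill(causal, float("-inf")), dim=-1)
        out = (att @ v).transpose(1, 2).reshape(b, t, -1)
        return self.out_proj(out)


# Quick smoke test with the config above.
mqa = MultiQueryAttention(hidden_size=2048, n_head=16)
y = mqa(torch.randn(1, 8, 2048))  # -> (1, 8, 2048)
```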

![image](images/GPT2-MQA-throughput.png)
![image](images/GPT2-MQA-latency.png)

Throughput (tokens/sec | msec/token)
| batch_size | HF (fp32) | HF (bf16) | HF (int8) |
|:----------:|:---------------:|:----------------:|:----------------:|
| 1 | 72.61 \| 13.77 | 68.89 \| 14.52 | 54.68 \| 18.29 |
| 2 | 139.03 \| 7.19 | 133.32 \| 7.50 | 106.70 \| 9.37 |
| 4 | 275.54 \| 3.63 | 273.12 \| 3.66 | 213.83 \| 4.68 |
| 8 | 538.85 \| 1.86 | 556.67 \| 1.80 | 432.10 \| 2.31 |
| 16 | 1015.47 \| 0.98 | 1096.44 \| 0.91 | 846.28 \| 1.18 |
| 32 | 1863.15 \| 0.54 | 2194.91 \| 0.46 | 1663.86 \| 0.60 |
| 64 | 3009.88 \| 0.33 | 4167.02 \| 0.24 | 3192.54 \| 0.31 |
| 128 | 3399.45 \| 0.29 | 6856.43 \| 0.15 | 5928.43 \| 0.17 |
| 256 | 4208.59 \| 0.24 | 11002.50 \| 0.09 | 9938.01 \| 0.10 |
| 512 | 4559.72 \| 0.22 | 13727.93 \| 0.07 | 13850.24 \| 0.07 |
| 1024 | 4969.87 \| 0.20 | 15122.67 \| 0.07 | 15604.99 \| 0.06 |
| 2048 | 5090.85 \| 0.20 | 16014.17 \| 0.06 | 16298.18 \| 0.06 |
| 4096 | 5212.22 \| 0.19 | 16570.20 \| 0.06 | 16884.37 \| 0.06 |
| 8192 | 5268.96 \| 0.19 | 16781.00 \| 0.06 | 17088.02 \| 0.06 |
| 16384 | oom | 16874.13 \| 0.06 | 17159.74 \| 0.06 |
| 32768 | oom | oom | oom |

Latency (sec)
| batch_size | HF (fp32) | HF (bf16) | HF (int8) |
|:----------:|:---------:|:---------:|:---------:|
| 1 | 1.38 | 1.45 | 1.83 |
| 2 | 1.44 | 1.50 | 1.87 |
| 4 | 1.45 | 1.46 | 1.87 |
| 8 | 1.48 | 1.44 | 1.85 |
| 16 | 1.58 | 1.46 | 1.89 |
| 32 | 1.72 | 1.46 | 1.92 |
| 64 | 2.13 | 1.54 | 2.00 |
| 128 | 3.77 | 1.87 | 2.16 |
| 256 | 6.08 | 2.33 | 2.58 |
| 512 | 11.23 | 3.73 | 3.70 |
| 1024 | 20.60 | 6.77 | 6.56 |
| 2048 | 40.23 | 12.79 | 12.57 |
| 4096 | 78.58 | 24.72 | 24.26 |
| 8192 | 155.48 | 48.82 | 47.94 |
| 16384 | oom | 97.10 | 95.48 |
| 32768 | oom | oom | oom |
Binary file modified images/BLOOM-latency.png
Binary file modified images/BLOOM-throughput.png
Binary file modified images/GPT2-MHA-latency.png
Binary file modified images/GPT2-MHA-throughput.png
Binary file added images/GPT2-MQA-latency.png
Binary file added images/GPT2-MQA-throughput.png
5 changes: 4 additions & 1 deletion scripts/make_graph_throughput.py
@@ -46,7 +46,10 @@ def parse_line(line: str, plot: str = "throughput") -> str:

 def parse_data(data: list):
     x = []
-    y = [[], [], [], []]
+    y = []
+    for i in range(len(data[0]) - 1):
+        y.append([])
+
     for dp in data:
         x.append(dp[0])
         for i in range(1, len(dp)):
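
This change lets `parse_data` allocate one y-series per value column instead of a hard-coded four. A tiny illustration of the resulting behavior follows; the diff context cuts off before the inner loop body, so the `append` shown there is an assumption, and the sample `data` values are taken from the MQA throughput table above.

```python
# Illustrative only: rows are (batch_size, col1, col2, ...).
data = [(1, 72.61, 68.89, 54.68), (2, 139.03, 133.32, 106.70)]

x = []
y = [[] for _ in range(len(data[0]) - 1)]  # one series per value column
for dp in data:
    x.append(dp[0])
    for i in range(1, len(dp)):
        y[i - 1].append(dp[i])  # assumed loop body; the hunk ends before it

print(x)  # [1, 2]
print(y)  # [[72.61, 139.03], [68.89, 133.32], [54.68, 106.70]]
```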
148 changes: 148 additions & 0 deletions scripts/parse_logs.py
@@ -0,0 +1,148 @@
import argparse
import copy
import os
from typing import Tuple

from markdownTable import markdownTable
from pandas import DataFrame


def get_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()

    parser.add_argument("--input_dir", type=str, required=True)

    args = parser.parse_args()
    return args


def parse_line(line: str) -> Tuple[str, str]:
    # Map a raw benchmark log line to (value, metric kind); returns (None, None)
    # for lines that carry no metric.
    line = line.strip()

    if line.endswith("tokens/sec"):
        line = line.split("Throughput (including tokenization) = ")[1]
        line = line.split(" tokens/sec")[0]

        return line, "throughput"
    elif line.endswith("msecs/token"):
        line = line.split("Throughput (including tokenization) = ")[1]
        line = line.split(" msecs/token")[0]

        return line, "inverse_throughput"
    elif line.startswith("Latency = ") and line.endswith("secs"):
        line = line.split("Latency = ")[1]
        line = line.split(" secs")[0]

        return line, "latency"
    elif "with batch size = " in line:
        line = line.split("with batch size = ")[1]

        return line, "batch_size"

    return None, None


def get_throughput_dataframe(results: dict, order: list) -> DataFrame:
    # Merge tokens/sec and msecs/token into a single "x \| y" table cell.
    throughput = copy.deepcopy(results["throughput"])
    for key in results["inverse_throughput"]:
        for index, value in enumerate(results["inverse_throughput"][key]):
            throughput[key][index] = throughput[key][index] + " \| " + value

    max_rows = -1
    batch_size_column = None
    for key in results["batch_size"]:
        bs = len(results["batch_size"][key])

        if bs > max_rows:
            max_rows = bs
            batch_size_column = results["batch_size"][key]

    # Pad shorter columns with "oom" so every column has one row per batch size.
    for key in throughput:
        while len(throughput[key]) < max_rows:
            throughput[key].append("oom")
    throughput["batch_size"] = batch_size_column

    df = DataFrame(throughput)
    df = df.loc[:, order]

    return df


def get_latency_dataframe(results: dict, order: list) -> DataFrame:
    latency = copy.deepcopy(results["latency"])

    max_rows = -1
    batch_size_column = None
    for key in results["batch_size"]:
        bs = len(results["batch_size"][key])

        if bs > max_rows:
            max_rows = bs
            batch_size_column = results["batch_size"][key]

    for key in latency:
        while len(latency[key]) < max_rows:
            latency[key].append("oom")
    latency["batch_size"] = batch_size_column

    df = DataFrame(latency)
    df = df.loc[:, order]

    return df


def make_table(results: dict):
    order = ["batch_size", "HF (fp32)", "HF (bf16)", "HF (int8)"]

    kwargs = dict(
        row_sep="markdown",
        padding_width=1,
    )

    throughput = get_throughput_dataframe(results, order)
    throughput = throughput.to_dict(orient="records")
    throughput = markdownTable(throughput).setParams(**kwargs).getMarkdown().split("```")[1]

    latency = get_latency_dataframe(results, order)
    latency = latency.to_dict(orient="records")
    latency = markdownTable(latency).setParams(**kwargs).getMarkdown().split("```")[1]

    return throughput, latency


def main() -> None:
    args = get_args()

    input_files = os.listdir(args.input_dir)
    results = {"throughput": {}, "inverse_throughput": {}, "latency": {}, "batch_size": {}}
    # One log file per precision; each becomes a column in the output tables.
    filename_column = {
        "fp32.log": "HF (fp32)",
        "bf16.log": "HF (bf16)",
        "int8.log": "HF (int8)",
        "fp16.log": "DS-inference (fp16)",
    }

    for filename in input_files:
        with open(os.path.join(args.input_dir, filename), "r") as f:
            lines = f.readlines()

        for line in lines:
            value, key = parse_line(line)

            if key is not None:
                column_name = filename_column[filename]
                if column_name not in results[key]:
                    results[key][column_name] = []
                results[key][column_name].append(value)

    throughput, latency = make_table(results)

    print("Throughput (tokens/sec | msec/token)")
    print(throughput)
    print()
    print("Latency (sec)")
    print(latency)


if __name__ == "__main__":
    main()
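
The script is presumably what generated the README tables above. It is pointed at a directory of benchmark logs, e.g. `python scripts/parse_logs.py --input_dir <log_dir>` (the exact directory layout is not shown in this commit), and expects the per-precision files to be named as in `filename_column`: `fp32.log`, `bf16.log`, `int8.log`, and optionally `fp16.log` for DeepSpeed-inference.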
1 change: 1 addition & 0 deletions run_batch_size.sh → scripts/run_batch_size.sh
@@ -7,6 +7,7 @@ do
make $1 batch_size=$bs
done

# split for loops
for i in {1..20}
do
bs=$(($i*128))
10 changes: 10 additions & 0 deletions scripts/run_batch_size1.sh
@@ -0,0 +1,10 @@
export CUDA_VISIBLE_DEVICES=0

rm -rf ./tmp

# split for loops
for i in {0..20}
do
bs=$((2**$i))
make $1 batch_size=$bs
done
File renamed without changes.
