add MQA results (huggingface#8)
mayank31398 authored Dec 12, 2022
1 parent 66ed2bd commit 55d9634
Showing 12 changed files with 214 additions and 1 deletion.
51 changes: 51 additions & 0 deletions README.md
@@ -126,3 +126,54 @@ Latency (sec)
| 1664 | oom | 13.45 | 13.36 | oom |
| 1792 | oom | 14.65 | 14.85 | oom |
| 1920 | oom | oom | oom | oom |

## GPT2 Multi-Query Attention
```python
hidden_size = 2048
n_head = 16
n_layer = 24
total_params = 1126889472
```
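
For context, multi-query attention differs from standard multi-head attention in that all query heads share a single key/value head, which shrinks the KV projections and the per-token KV cache by roughly a factor of `n_head`. The sketch below is a minimal, hand-rolled PyTorch illustration of the idea, not the implementation benchmarked here; the module and argument names (`MultiQueryAttention`, `hidden_size`, `n_head`) are only illustrative.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class MultiQueryAttention(nn.Module):
    """Illustrative causal multi-query attention: one K/V head shared by all Q heads."""

    def __init__(self, hidden_size: int, n_head: int):
        super().__init__()
        self.n_head = n_head
        self.head_dim = hidden_size // n_head
        self.q_proj = nn.Linear(hidden_size, hidden_size)         # one query projection per head
        self.kv_proj = nn.Linear(hidden_size, 2 * self.head_dim)  # a single shared K and V head
        self.out_proj = nn.Linear(hidden_size, hidden_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        b, t, _ = x.shape
        q = self.q_proj(x).view(b, t, self.n_head, self.head_dim).transpose(1, 2)  # (b, n_head, t, d)
        k, v = self.kv_proj(x).split(self.head_dim, dim=-1)                        # (b, t, d) each
        k, v = k.unsqueeze(1), v.unsqueeze(1)                                       # broadcast over heads
        att = (q @ k.transpose(-2, -1)) / self.head_dim ** 0.5
        causal = torch.triu(torch.ones(t, t, dtype=torch.bool, device=x.device), diagonal=1)
        att = F.softmax(att.masked_fill(causal, float("-inf")), dim=-1)
        out = (att @ v).transpose(1, 2).reshape(b, t, -1)
        return self.out_proj(out)


# Quick smoke test with the config above.
mqa = MultiQueryAttention(hidden_size=2048, n_head=16)
y = mqa(torch.randn(1, 8, 2048))  # -> (1, 8, 2048)
```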

![image](images/GPT2-MQA-throughput.png)
![image](images/GPT2-MQA-latency.png)

Throughput (tokens/sec | msec/token)
| batch_size | HF (fp32) | HF (bf16) | HF (int8) |
|:----------:|:---------------:|:----------------:|:----------------:|
| 1 | 72.61 \| 13.77 | 68.89 \| 14.52 | 54.68 \| 18.29 |
| 2 | 139.03 \| 7.19 | 133.32 \| 7.50 | 106.70 \| 9.37 |
| 4 | 275.54 \| 3.63 | 273.12 \| 3.66 | 213.83 \| 4.68 |
| 8 | 538.85 \| 1.86 | 556.67 \| 1.80 | 432.10 \| 2.31 |
| 16 | 1015.47 \| 0.98 | 1096.44 \| 0.91 | 846.28 \| 1.18 |
| 32 | 1863.15 \| 0.54 | 2194.91 \| 0.46 | 1663.86 \| 0.60 |
| 64 | 3009.88 \| 0.33 | 4167.02 \| 0.24 | 3192.54 \| 0.31 |
| 128 | 3399.45 \| 0.29 | 6856.43 \| 0.15 | 5928.43 \| 0.17 |
| 256 | 4208.59 \| 0.24 | 11002.50 \| 0.09 | 9938.01 \| 0.10 |
| 512 | 4559.72 \| 0.22 | 13727.93 \| 0.07 | 13850.24 \| 0.07 |
| 1024 | 4969.87 \| 0.20 | 15122.67 \| 0.07 | 15604.99 \| 0.06 |
| 2048 | 5090.85 \| 0.20 | 16014.17 \| 0.06 | 16298.18 \| 0.06 |
| 4096 | 5212.22 \| 0.19 | 16570.20 \| 0.06 | 16884.37 \| 0.06 |
| 8192 | 5268.96 \| 0.19 | 16781.00 \| 0.06 | 17088.02 \| 0.06 |
| 16384 | oom | 16874.13 \| 0.06 | 17159.74 \| 0.06 |
| 32768 | oom | oom | oom |

Latency (sec)
| batch_size | HF (fp32) | HF (bf16) | HF (int8) |
|:----------:|:---------:|:---------:|:---------:|
| 1 | 1.38 | 1.45 | 1.83 |
| 2 | 1.44 | 1.50 | 1.87 |
| 4 | 1.45 | 1.46 | 1.87 |
| 8 | 1.48 | 1.44 | 1.85 |
| 16 | 1.58 | 1.46 | 1.89 |
| 32 | 1.72 | 1.46 | 1.92 |
| 64 | 2.13 | 1.54 | 2.00 |
| 128 | 3.77 | 1.87 | 2.16 |
| 256 | 6.08 | 2.33 | 2.58 |
| 512 | 11.23 | 3.73 | 3.70 |
| 1024 | 20.60 | 6.77 | 6.56 |
| 2048 | 40.23 | 12.79 | 12.57 |
| 4096 | 78.58 | 24.72 | 24.26 |
| 8192 | 155.48 | 48.82 | 47.94 |
| 16384 | oom | 97.10 | 95.48 |
| 32768 | oom | oom | oom |
Binary file modified images/BLOOM-latency.png
Binary file modified images/BLOOM-throughput.png
Binary file modified images/GPT2-MHA-latency.png
Binary file modified images/GPT2-MHA-throughput.png
Binary file added images/GPT2-MQA-latency.png
Binary file added images/GPT2-MQA-throughput.png
5 changes: 4 additions & 1 deletion scripts/make_graph_throughput.py
@@ -46,7 +46,10 @@ def parse_line(line: str, plot: str = "throughput") -> str:

 def parse_data(data: list):
     x = []
-    y = [[], [], [], []]
+    y = []
+    for i in range(len(data[0]) - 1):
+        y.append([])
+
     for dp in data:
         x.append(dp[0])
         for i in range(1, len(dp)):
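
This change lets `parse_data` allocate one y-series per value column instead of a hard-coded four. A tiny illustration of the resulting behavior follows; the diff context cuts off before the inner loop body, so the `append` shown there is an assumption, and the sample `data` values are taken from the MQA throughput table above.

```python
# Illustrative only: rows are (batch_size, col1, col2, ...).
data = [(1, 72.61, 68.89, 54.68), (2, 139.03, 133.32, 106.70)]

x = []
y = [[] for _ in range(len(data[0]) - 1)]  # one series per value column
for dp in data:
    x.append(dp[0])
    for i in range(1, len(dp)):
        y[i - 1].append(dp[i])  # assumed loop body; the hunk ends before it

print(x)  # [1, 2]
print(y)  # [[72.61, 139.03], [68.89, 133.32], [54.68, 106.70]]
```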
148 changes: 148 additions & 0 deletions scripts/parse_logs.py
@@ -0,0 +1,148 @@
import argparse
import copy
import os
from typing import Tuple

from markdownTable import markdownTable
from pandas import DataFrame


def get_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()

    parser.add_argument("--input_dir", type=str, required=True)

    args = parser.parse_args()
    return args


def parse_line(line: str) -> Tuple[str, str]:
    # Map a raw benchmark log line to (value, metric kind); returns (None, None)
    # for lines that carry no metric.
    line = line.strip()

    if line.endswith("tokens/sec"):
        line = line.split("Throughput (including tokenization) = ")[1]
        line = line.split(" tokens/sec")[0]

        return line, "throughput"
    elif line.endswith("msecs/token"):
        line = line.split("Throughput (including tokenization) = ")[1]
        line = line.split(" msecs/token")[0]

        return line, "inverse_throughput"
    elif line.startswith("Latency = ") and line.endswith("secs"):
        line = line.split("Latency = ")[1]
        line = line.split(" secs")[0]

        return line, "latency"
    elif "with batch size = " in line:
        line = line.split("with batch size = ")[1]

        return line, "batch_size"

    return None, None


def get_throughput_dataframe(results: dict, order: list) -> DataFrame:
    # Merge tokens/sec and msecs/token into a single "x \| y" table cell.
    throughput = copy.deepcopy(results["throughput"])
    for key in results["inverse_throughput"]:
        for index, value in enumerate(results["inverse_throughput"][key]):
            throughput[key][index] = throughput[key][index] + " \| " + value

    max_rows = -1
    batch_size_column = None
    for key in results["batch_size"]:
        bs = len(results["batch_size"][key])

        if bs > max_rows:
            max_rows = bs
            batch_size_column = results["batch_size"][key]

    # Pad shorter columns with "oom" so every column has one row per batch size.
    for key in throughput:
        while len(throughput[key]) < max_rows:
            throughput[key].append("oom")
    throughput["batch_size"] = batch_size_column

    df = DataFrame(throughput)
    df = df.loc[:, order]

    return df


def get_latency_dataframe(results: dict, order: list) -> DataFrame:
    latency = copy.deepcopy(results["latency"])

    max_rows = -1
    batch_size_column = None
    for key in results["batch_size"]:
        bs = len(results["batch_size"][key])

        if bs > max_rows:
            max_rows = bs
            batch_size_column = results["batch_size"][key]

    for key in latency:
        while len(latency[key]) < max_rows:
            latency[key].append("oom")
    latency["batch_size"] = batch_size_column

    df = DataFrame(latency)
    df = df.loc[:, order]

    return df


def make_table(results: dict):
    order = ["batch_size", "HF (fp32)", "HF (bf16)", "HF (int8)"]

    kwargs = dict(
        row_sep="markdown",
        padding_width=1,
    )

    throughput = get_throughput_dataframe(results, order)
    throughput = throughput.to_dict(orient="records")
    throughput = markdownTable(throughput).setParams(**kwargs).getMarkdown().split("```")[1]

    latency = get_latency_dataframe(results, order)
    latency = latency.to_dict(orient="records")
    latency = markdownTable(latency).setParams(**kwargs).getMarkdown().split("```")[1]

    return throughput, latency


def main() -> None:
    args = get_args()

    input_files = os.listdir(args.input_dir)
    results = {"throughput": {}, "inverse_throughput": {}, "latency": {}, "batch_size": {}}
    # One log file per precision; each becomes a column in the output tables.
    filename_column = {
        "fp32.log": "HF (fp32)",
        "bf16.log": "HF (bf16)",
        "int8.log": "HF (int8)",
        "fp16.log": "DS-inference (fp16)",
    }

    for filename in input_files:
        with open(os.path.join(args.input_dir, filename), "r") as f:
            lines = f.readlines()

        for line in lines:
            value, key = parse_line(line)

            if key is not None:
                column_name = filename_column[filename]
                if column_name not in results[key]:
                    results[key][column_name] = []
                results[key][column_name].append(value)

    throughput, latency = make_table(results)

    print("Throughput (tokens/sec | msec/token)")
    print(throughput)
    print()
    print("Latency (sec)")
    print(latency)


if __name__ == "__main__":
    main()
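
The script is presumably what generated the README tables above. It is pointed at a directory of benchmark logs, e.g. `python scripts/parse_logs.py --input_dir <log_dir>` (the exact directory layout is not shown in this commit), and expects the per-precision files to be named as in `filename_column`: `fp32.log`, `bf16.log`, `int8.log`, and optionally `fp16.log` for DeepSpeed-inference.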
1 change: 1 addition & 0 deletions run_batch_size.sh → scripts/run_batch_size.sh
@@ -7,6 +7,7 @@ do
make $1 batch_size=$bs
done

# split for loops
for i in {1..20}
do
bs=$(($i*128))
10 changes: 10 additions & 0 deletions scripts/run_batch_size1.sh
@@ -0,0 +1,10 @@
export CUDA_VISIBLE_DEVICES=0

rm -rf ./tmp

# split for loops
for i in {0..20}
do
bs=$((2**$i))
make $1 batch_size=$bs
done
File renamed without changes.
