Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Batch merge #411

Draft
wants to merge 45 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
1b214d1
stash initial changes for now
jacob-morrison Aug 16, 2024
6178897
stash
jacob-morrison Aug 16, 2024
cc5670f
.
jacob-morrison Aug 17, 2024
d3ccf4a
.
jacob-morrison Aug 17, 2024
f9e0319
.
jacob-morrison Aug 17, 2024
4c161e1
.
jacob-morrison Aug 17, 2024
f282a3c
.
jacob-morrison Aug 18, 2024
488b71b
.
jacob-morrison Aug 18, 2024
623e68d
.
jacob-morrison Aug 18, 2024
758b2a9
.
jacob-morrison Aug 18, 2024
8b68012
fix
jacob-morrison Aug 18, 2024
09d6834
.
jacob-morrison Aug 18, 2024
baaa375
.
jacob-morrison Aug 18, 2024
e0b9a84
.
jacob-morrison Aug 18, 2024
88b1656
.
jacob-morrison Aug 18, 2024
32739a4
.
jacob-morrison Aug 18, 2024
f516abc
.
jacob-morrison Aug 19, 2024
f97c4d2
.
jacob-morrison Aug 19, 2024
ef11e70
.
jacob-morrison Aug 19, 2024
8250980
.
jacob-morrison Aug 19, 2024
b704fde
.
jacob-morrison Aug 19, 2024
8404b6c
.
jacob-morrison Aug 21, 2024
691143b
.
jacob-morrison Aug 21, 2024
1045794
Merge branch 'main' into batch-merge
jacob-morrison Sep 9, 2024
bbe7648
add
jacob-morrison Sep 10, 2024
45cabbb
.
jacob-morrison Sep 20, 2024
ace26b0
.
jacob-morrison Sep 20, 2024
7e7e1c1
test
jacob-morrison Sep 20, 2024
29a7a95
,
jacob-morrison Sep 20, 2024
9b29228
fix
jacob-morrison Sep 20, 2024
d07a819
test
jacob-morrison Sep 20, 2024
1812c40
Merge branch 'main' into batch-merge
jacob-morrison Oct 28, 2024
47fb938
push new commits
jacob-morrison Oct 29, 2024
c33ab5a
Merge branch 'main' into batch-merge
jacob-morrison Oct 29, 2024
9a13d8b
changes to support weka (rough draft for now)
jacob-morrison Oct 29, 2024
5aa6267
changes
jacob-morrison Oct 30, 2024
f4bbe02
update merge configs
jacob-morrison Oct 30, 2024
6377335
committing changes
jacob-morrison Nov 1, 2024
c5b9c0f
update
jacob-morrison Nov 3, 2024
67d05a4
update
jacob-morrison Nov 5, 2024
205c2f6
final configs
jacob-morrison Nov 5, 2024
bc2aec8
update
jacob-morrison Nov 13, 2024
f002136
Merge branch 'main' into batch-merge
jacob-morrison Nov 13, 2024
a0fc16f
update my branch with garbo
jacob-morrison Nov 17, 2024
9c0e769
dumping changes, not necessary for release
jacob-morrison Nov 21, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
dumping changes, not necessary for release
  • Loading branch information
jacob-morrison committed Nov 21, 2024
commit 9c0e76924d750ed5fc3c6f601bfe0d1112ee3088
Binary file added downsampling_bars.pdf
Binary file not shown.
268 changes: 179 additions & 89 deletions scripts/plot-downsampling.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
benchmark_data = {
"Avg.": {
"eval_setting": "",
"sft_5": 56.6,
"sft_10": 57.0,
"sft_25": 57.7,
"sft_50": 58.1,
"sft_75": 58.6,
"sft_full": 59.1
"sft_5": 57.69,
"sft_10": 58.06,
"sft_25": 58.64,
"sft_50": 59.18,
"sft_75": 59.57,
"sft_full": 60.08
},
"MMLU": {
"eval_setting": "5 shot",
Expand Down Expand Up @@ -35,6 +35,7 @@
"sft_75": 29.6,
"sft_full": 29.3
},
# TODO: BBH IS NOT UP TO DATE!!!
"BigBenchHard": {
"eval_setting": "3 shot, CoT",
"sft_5": 67.5,
Expand Down Expand Up @@ -109,12 +110,12 @@
},
"Safety": {
"eval_setting": "",
"sft_5": 75.3,
"sft_10": 77.8,
"sft_25": 79.9,
"sft_50": 79.1,
"sft_75": 80.2,
"sft_full": 80.2
"sft_5": 89.8,
"sft_10": 90.9,
"sft_25": 92.3,
"sft_50": 92.6,
"sft_75": 92.8,
"sft_full": 93.1
}
}

Expand All @@ -124,96 +125,185 @@
# Create x-axis values (SFT percentages)
x_values = [5, 10, 25, 50, 75, 100] # 100 represents full SFT

# Create figure and axis with a larger size
plt.figure(figsize=(12, 8))

# Color palette for different lines
colors = plt.cm.tab20(np.linspace(0, 1, len(benchmark_data)))

# Plot each benchmark
for (benchmark, data), color in zip(benchmark_data.items(), colors):
if benchmark != "Avg.": # Skip the average for now
y_values = [
data["sft_5"],
data["sft_10"],
data["sft_25"],
data["sft_50"],
data["sft_75"],
data["sft_full"]
]
plt.plot(x_values, y_values, marker='o', label=benchmark, color=color, linewidth=2)

# Add the average line with higher emphasis
avg_values = [
benchmark_data["Avg."]["sft_5"],
benchmark_data["Avg."]["sft_10"],
benchmark_data["Avg."]["sft_25"],
benchmark_data["Avg."]["sft_50"],
benchmark_data["Avg."]["sft_75"],
benchmark_data["Avg."]["sft_full"]
]
plt.plot(x_values, avg_values, 'k--', label='Average', linewidth=3, marker='s')
# # Create figure and axis with a larger size
# plt.figure(figsize=(12, 8))

# Customize the plot
plt.xlabel('SFT Percentage', fontsize=12)
plt.ylabel('Performance', fontsize=12)
plt.title('Benchmark Performance Across Different SFT Percentages', fontsize=14)
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
# # Color palette for different lines
# colors = plt.cm.tab20(np.linspace(0, 1, len(benchmark_data)))

# Set x-axis ticks
plt.xticks(x_values)
# # Plot each benchmark
# for (benchmark, data), color in zip(benchmark_data.items(), colors):
# if benchmark != "Avg.": # Skip the average for now
# y_values = [
# data["sft_5"],
# data["sft_10"],
# data["sft_25"],
# data["sft_50"],
# data["sft_75"],
# data["sft_full"]
# ]
# plt.plot(x_values, y_values, marker='o', label=benchmark, color=color, linewidth=2)

# Adjust layout to prevent label cutoff
plt.tight_layout()
# # Add the average line with higher emphasis
# avg_values = [
# benchmark_data["Avg."]["sft_5"],
# benchmark_data["Avg."]["sft_10"],
# benchmark_data["Avg."]["sft_25"],
# benchmark_data["Avg."]["sft_50"],
# benchmark_data["Avg."]["sft_75"],
# benchmark_data["Avg."]["sft_full"]
# ]
# plt.plot(x_values, avg_values, 'k--', label='Average', linewidth=3, marker='s')

# Show the plot
plt.show()
# # Customize the plot
# plt.xlabel('SFT Training Data Size', fontsize=12)
# plt.ylabel('Performance', fontsize=12)
# plt.title('Benchmark Performance Across Different SFT Percentages', fontsize=14)
# plt.grid(True, linestyle='--', alpha=0.7)
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)

# # Set x-axis ticks
# plt.xticks(x_values)

# # Adjust layout to prevent label cutoff
# plt.tight_layout()

# # Show the plot
# plt.show()

# Optional: Create a second plot focusing on specific benchmarks of interest
plt.figure(figsize=(12, 8))

# Define specific benchmarks and their colors
plot_config = {
'Avg.': '#0a3235', # Very dark teal (near-black) for average
'TruthfulQA': '#b11bE8', # Purple
'HumanEval+': '#f0529c', # Pink
'Safety': '#105257', # Dark teal
'GSM8K': '#0fcb8c' # Bright green
}
# plt.figure(figsize=(20, 8))

# Define benchmarks and SFT percentages
benchmarks = [
'Avg.',
'GSM8K',
'HumanEval+',
'Safety',
'TruthfulQA',
]
sft_percentages = ['5%', '10%', '25%', '50%', '75%', '100%']
# colors = ['#0A2B35', '#0fcb8c', '#105257', '#f0529c', '#838383', '#0a3235'] # One color for each percentage
colors = [
'#FAC4DD', # 10%
'#F8ADD0', # 20%
'#F697C3', # 40%
'#F480B6', # 60%
'#F269A9', # 80%
'#F0529C', # 100% - original pink
]

colors = [
"#E7EEEE", # RGB(231, 238, 238)
"#CEDCDD", # RGB(206, 220, 221)
"#B7CBCC", # RGB(183, 203, 204)
"#9FB9BB", # RGB(159, 185, 187)
"#88A8AB", # RGB(136, 168, 171)
"#F0529C", # PINK
"#6E979A", # RGB(110, 151, 154)
"#588689", # RGB(88, 134, 137)
"#3F7478", # RGB(63, 116, 120)
"#105257", # RGB(16, 82, 87)
"#0A3235", # RGB(10, 50, 53)
]

# Set up the plot
fig, ax = plt.subplots(figsize=(20, 8))

# Width of bars and positions
width = 0.12
n_percentages = len(sft_percentages)

# Plot each benchmark with its specified color
for benchmark, color in plot_config.items():
# Create bars for each benchmark
for i, benchmark in enumerate(benchmarks):
data = benchmark_data[benchmark]
y_values = [
values = [
data["sft_5"],
data["sft_10"],
data["sft_25"],
data["sft_50"],
data["sft_75"],
data["sft_full"]
]
# Make average line dashed and thicker
if benchmark == 'Avg.':
plt.plot(x_values, y_values, '--', marker='s', label=benchmark,
color=color, linewidth=3)
else:
plt.plot(x_values, y_values, marker='o', label=benchmark,
color=color, linewidth=2)

# Customize the focused plot
plt.xlabel('SFT Percentage', fontsize=12)
plt.ylabel('Performance', fontsize=12)
# plt.title('Selected Benchmark Performance Trends', fontsize=14)
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend(fontsize=10)
plt.xticks(x_values)

# Adjust layout
plt.tight_layout()

# Show the plot
# plt.show()

# Calculate positions for this benchmark's group of bars
x = i
for j in range(n_percentages):
bar_position = x - (n_percentages-1)*width/2 + j*width
bar = ax.bar(bar_position, values[j], width,
label=sft_percentages[j] if i == 0 else "",
color=colors[j],
edgecolor="black")

# Add value labels on top of bars
# ax.text(bar_position, values[j], f'{values[j]:.1f}', ha='center', va='bottom', fontsize=8)

# Customize the plot
# ax.set_xlabel('Benchmarks', fontsize=14)
ax.set_ylabel('Performance', fontsize=18)
plt.tick_params(axis='y', labelsize=18)
# ax.set_title('Performance by Benchmark and SFT Percentage', fontsize=14)

# Set x-axis ticks and labels
ax.set_xticks(range(len(benchmarks)))
ax.set_xticklabels(benchmarks, ha="center", fontsize=18)

ax.spines[["right", "top"]].set_visible(False)

# Add legend
# ax.legend(title='SFT Sample Size', loc='center', bbox_to_anchor=(0.885, 0.8))

# Add grid
# ax.grid(True, linestyle='--', alpha=0.3, axis='y')

# Adjust layout to accommodate legend
# plt.subplots_adjust(right=0.85)

# Save and show the plot
plt.savefig('downsampling_bars.pdf', bbox_inches='tight', dpi=300)
plt.show()

# # Define specific benchmarks and their colors
# plot_config = {
# 'Avg.': '#0a3235', # Very dark teal (near-black) for average
# 'TruthfulQA': '#b11bE8', # Purple
# 'HumanEval+': '#f0529c', # Pink
# 'Safety': '#105257', # Dark teal
# 'GSM8K': '#0fcb8c' # Bright green
# }

# # Plot each benchmark with its specified color
# for benchmark, color in plot_config.items():
# data = benchmark_data[benchmark]
# y_values = [
# data["sft_5"],
# data["sft_10"],
# data["sft_25"],
# data["sft_50"],
# data["sft_75"],
# data["sft_full"]
# ]
# # Make average line dashed and thicker
# if benchmark == 'Avg.':
# plt.plot(x_values, y_values, '--', marker='s', label=benchmark,
# color=color, linewidth=3)
# else:
# plt.plot(x_values, y_values, marker='o', label=benchmark,
# color=color, linewidth=2)

# # Customize the focused plot
# plt.xlabel('SFT Percentage', fontsize=12)
# plt.ylabel('Performance', fontsize=12)
# # plt.title('Selected Benchmark Performance Trends', fontsize=14)
# plt.grid(True, linestyle='--', alpha=0.7)
# plt.legend(fontsize=10)
# plt.xticks(x_values)

# # Adjust layout
# plt.tight_layout()

# # Show the plot
# # plt.show()

plt.savefig('downsampling.pdf', bbox_inches='tight', dpi=300)
plt.close()
# plt.savefig('downsampling.pdf', bbox_inches='tight', dpi=300)
# plt.close()
Loading