Commit

fix max norm clipping test in python packaging pipeline test (microsoft#6468)

* fix python packaging pipeline

* make clip norm test compatible with both V100 and M60 GPUs (see the sketch below)
pengwa authored Jan 27, 2021
1 parent b5d1a49 commit fd43806
Showing 1 changed file with 56 additions and 24 deletions.
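The gist of the fix, as a stripped-down sketch (the helper name, parameter set, and loss values below are illustrative, not the commit's actual test data): the pytest parametrization is generated by a helper that keys the expected loss baselines off the CUDA compute-capability major version, so the same test body checks V100 (capability 7.x) baselines on dev machines and M60 (capability 5.x) baselines in the Python packaging pipeline.

import pytest
import torch

def _max_norm_clip_data_sketch():
    # Requires a CUDA device, as the real tests do; major capability 7 => V100
    # (dev machines), 5 => M60 (CI packaging pipeline).
    major = torch.cuda.get_device_capability()[0]
    if major == 7:
        return [(0, 'cuda', 1.0, [10.54, 9.95, 9.50])]    # illustrative baselines only
    elif major == 5:
        return [(0, 'cuda', 1.0, [10.56, 9.98, 9.47])]    # illustrative baselines only
    # The commit's helpers only handle capabilities 7 and 5; failing loudly here
    # is this sketch's own choice.
    raise RuntimeError("no baseline recorded for compute capability %d.x" % major)

@pytest.mark.parametrize("seed,device,max_norm_clip,expected_loss", _max_norm_clip_data_sketch())
def test_max_norm_clip_sketch(seed, device, max_norm_clip, expected_loss):
    # The real tests build an ORTTrainer, run train_step for several steps, and
    # compare the collected losses against expected_loss within a relative tolerance.
    assert len(expected_loss) == 3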
@@ -1477,17 +1477,33 @@ def testTrainingGraphExport(debug_files):
     else:
         assert not os.path.isfile(path)
 
-@pytest.mark.parametrize("seed,device,max_norm_clip,gradient_accumulation_steps,total_steps,expected_loss", [
-    (0, 'cuda', 1.0, 1, 12, [10.536802, 9.95102, 9.495312, 9.067217, 8.735067, 8.447508,\
-        8.179443, 7.903837, 7.655049, 7.409669, 7.135822, 6.931838]),
-    (0, 'cuda', 0.1, 1, 12, [10.536802, 9.951735, 9.496659, 9.069328, 8.7381115, 8.4513855,\
-        8.184143, 7.9093056, 7.661127, 7.4162436, 7.142842, 6.9388437]),
-    (42, 'cuda', 1.0, 1, 12, [10.645588, 10.0333, 9.52253, 9.108369, 8.766306, 8.497426,\
-        8.199408, 7.958235, 7.659668, 7.459833, 7.170661, 6.9139776]),
-    (42, 'cuda', 0.1, 1, 12, [10.645588, 10.03406, 9.524019, 9.110594, 8.769308, 8.501322,\
-        8.204281, 7.963957, 7.6660814, 7.46682, 7.1780496, 6.92159]),
-])
-def testORTTrainerAdamMaxNormClip(seed, device, max_norm_clip, gradient_accumulation_steps,total_steps, expected_loss):
+
+def _adam_max_norm_clip_data():
+    device_capability_major = torch.cuda.get_device_capability()[0]
+    if device_capability_major == 7: # V100 for Dev machine
+        return [
+            (0, 'cuda', 1.0, 1, 12, [10.536802, 9.95102, 9.495312, 9.067217, 8.735067, 8.447508,\
+                8.179443, 7.903837, 7.655049, 7.409669, 7.135822, 6.931838]),
+            (0, 'cuda', 0.1, 1, 12, [10.536802, 9.951735, 9.496659, 9.069328, 8.7381115, 8.4513855,\
+                8.184143, 7.9093056, 7.661127, 7.4162436, 7.142842, 6.9388437]),
+            (42, 'cuda', 1.0, 1, 12, [10.645588, 10.0333, 9.52253, 9.108369, 8.766306, 8.497426,\
+                8.199408, 7.958235, 7.659668, 7.459833, 7.170661, 6.9139776]),
+            (42, 'cuda', 0.1, 1, 12, [10.645588, 10.03406, 9.524019, 9.110594, 8.769308, 8.501322,\
+                8.204281, 7.963957, 7.6660814, 7.46682, 7.1780496, 6.92159]),
+        ]
+    elif device_capability_major == 5: # M60 for CI machines (Python Packaging Pipeline)
+        return [
+            (0, 'cuda', 1.0, 1, 12, [10.564176, 9.97518, 9.474583, 9.076513, 8.724233, 8.440754,\
+                8.156107, 7.905789, 7.683374, 7.420595, 7.156939, 6.914139]),
+            (0, 'cuda', 0.1, 1, 12, [10.564176, 9.975904, 9.475933, 9.078633, 8.727232, 8.444615,\
+                8.160932, 7.911339, 7.689524, 7.427308, 7.164138, 6.921518]),
+            (42, 'cuda', 1.0, 1, 12, [10.660578, 10.027208, 9.518457, 9.10457, 8.767458, 8.469093,\
+                8.207001, 7.92541, 7.655277, 7.434964, 7.155968, 6.924634]),
+            (42, 'cuda', 0.1, 1, 12, [10.660578, 10.027987, 9.519927, 9.106762, 8.770505, 8.473034,\
+                8.211944, 7.931111, 7.661622, 7.441899, 7.163355, 6.932114]),
+        ]
+@pytest.mark.parametrize("seed,device,max_norm_clip,gradient_accumulation_steps,total_steps,expected_loss", _adam_max_norm_clip_data())
+def testORTTrainerAdamMaxNormClip(seed, device, max_norm_clip, gradient_accumulation_steps, total_steps, expected_loss):
     rtol = 1e-5
     torch.manual_seed(seed)
     set_seed(seed)
@@ -1505,22 +1521,38 @@ def testORTTrainerAdamMaxNormClip(seed, device, max_norm_clip, gradient_accumula
     for i in range(total_steps):
         data, targets = batcher_fn(train_data, i)
         loss, _ = trainer.train_step(data, targets)
-        actual_loss.append(loss.cpu())
+        actual_loss.append(loss.cpu().item())
 
     # Compare legacy vs experimental APIs
     _test_helpers.assert_model_outputs(expected_loss, actual_loss, rtol=rtol)
 
-@pytest.mark.parametrize("seed,device,max_norm_clip, gradient_accumulation_steps,total_steps,expected_loss", [
-    (0, 'cuda', 1.0, 1, 12, [10.536802, 10.409792, 10.354762, 10.253063, 10.213676, 10.113361,\
-        10.066136, 9.977713, 9.924597, 9.858974, 9.796471, 9.794921]),
-    (0, 'cuda', 0.1, 1, 12, [10.536802, 10.3714695, 10.276415, 10.13743, 10.063246, 9.93144,\
-        9.854875, 9.739198, 9.661381, 9.570321, 9.482681, 9.457669]),
-    (42, 'cuda', 1.0, 1, 12, [10.645588, 10.51151, 10.438802, 10.356055, 10.291667, 10.232069,\
-        10.168237, 10.074414, 9.990586, 9.9324, 9.891901, 9.788895]),
-    (42, 'cuda', 0.1, 1, 12, [10.645588, 10.473022, 10.359108, 10.238948, 10.141735, 10.049339,\
-        9.953887, 9.832249, 9.722989, 9.640278, 9.572205, 9.448381]),
-])
-def testORTTrainerLambMaxNormClip(seed, device, max_norm_clip, gradient_accumulation_steps, total_steps, expected_loss):
+
+def _lamb_max_norm_clip_data():
+    device_capability_major = torch.cuda.get_device_capability()[0]
+    if device_capability_major == 7: # V100 for Dev machine
+        return [
+            (0, 'cuda', 1.0, 1, 12, [10.536802, 10.409792, 10.354762, 10.253063, 10.213676, 10.113361,\
+                10.066136, 9.977713, 9.924597, 9.858974, 9.796471, 9.794921]),
+            (0, 'cuda', 0.1, 1, 12, [10.536802, 10.3714695, 10.276415, 10.13743, 10.063246, 9.93144,\
+                9.854875, 9.739198, 9.661381, 9.570321, 9.482681, 9.457669]),
+            (42, 'cuda', 1.0, 1, 12, [10.645588, 10.51151, 10.438802, 10.356055, 10.291667, 10.232069,\
+                10.168237, 10.074414, 9.990586, 9.9324, 9.891901, 9.788895]),
+            (42, 'cuda', 0.1, 1, 12, [10.645588, 10.473022, 10.359108, 10.238948, 10.141735, 10.049339,\
+                9.953887, 9.832249, 9.722989, 9.640278, 9.572205, 9.448381]),
+        ]
+    elif device_capability_major == 5: # M60 for CI machines (Python Packaging Pipeline)
+        return [
+            (0, 'cuda', 1.0, 1, 12, [10.564176, 10.429815, 10.331507, 10.261825, 10.19336, 10.110986,\
+                10.041771, 9.990074, 9.985901, 9.892414, 9.819457, 9.753627]),
+            (0, 'cuda', 0.1, 1, 12, [10.564176, 10.391491, 10.253088, 10.146585, 10.044328, 9.930671,\
+                9.830513, 9.752279, 9.72234, 9.606323, 9.506898, 9.417118]),
+            (42, 'cuda', 1.0, 1, 12, [10.660578, 10.510471, 10.431763, 10.358577, 10.301462, 10.209934,\
+                10.167318, 10.03529, 9.995482, 9.938999, 9.875689, 9.80955]),
+            (42, 'cuda', 0.1, 1, 12, [10.660578, 10.471846, 10.352203, 10.241209, 10.149426, 10.026606,\
+                9.952093, 9.792846, 9.726216, 9.645785, 9.556379, 9.467741]),
+        ]
+@pytest.mark.parametrize("seed,device,max_norm_clip, gradient_accumulation_steps,total_steps,expected_loss", _lamb_max_norm_clip_data())
+def testORTTrainerLambMaxNormClip(seed, device, max_norm_clip, gradient_accumulation_steps, total_steps, expected_loss):
     rtol = 1e-3
     torch.manual_seed(seed)
     set_seed(seed)
@@ -1538,7 +1570,7 @@ def testORTTrainerLambMaxNormClip(seed, device, max_norm_clip, gradient_accumula
     for i in range(total_steps):
         data, targets = batcher_fn(train_data, i)
         loss, _ = trainer.train_step(data, targets)
-        actual_loss.append(loss.cpu())
+        actual_loss.append(loss.cpu().item())
 
     # Compare legacy vs experimental APIs
     _test_helpers.assert_model_outputs(expected_loss, actual_loss, rtol=rtol)
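Both hunks also switch the recorded losses from 0-dim tensors (loss.cpu()) to plain Python floats (loss.cpu().item()). A tiny standalone illustration of that difference, not part of the commit:

import torch

loss = torch.tensor(10.536802)    # stand-in for the loss returned by train_step
as_tensor = loss.cpu()            # still a 0-dim torch.Tensor
as_float = loss.cpu().item()      # plain Python float, directly comparable to the expected lists
print(type(as_tensor).__name__, type(as_float).__name__)    # Tensor float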
