Skip to content

Commit

Permalink
Add AWS Inf2 instances support for aws_batch scheduler
Browse files Browse the repository at this point in the history
  • Loading branch information
Shixian Cui committed Nov 26, 2024
1 parent 26cb186 commit 894b557
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 0 deletions.
44 changes: 44 additions & 0 deletions torchx/specs/named_resources_aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,46 @@ def aws_trn1_32xlarge() -> Resource:
)


def aws_inf2_xlarge() -> Resource:
    """Return the named resource for the AWS ``inf2.xlarge`` instance type.

    The instance has 4 vCPUs, no GPUs, 16 GiB of host memory and a single
    Inferentia2 chip, exposed as one ``NEURON_DEVICE``.
    """
    return Resource(
        cpu=4,
        gpu=0,
        # memMB is host (instance) memory. The 32 GB figure in the AWS spec
        # table is the accelerator memory on the Inferentia2 chip; a request
        # above the host's 16 GiB cannot be satisfied by this instance type.
        memMB=16 * GiB,
        capabilities={K8S_ITYPE: "inf2.xlarge"},
        devices={NEURON_DEVICE: 1},
    )


def aws_inf2_8xlarge() -> Resource:
    """Return the named resource for the AWS ``inf2.8xlarge`` instance type.

    The instance has 32 vCPUs, no GPUs, 128 GiB of host memory and a single
    Inferentia2 chip, exposed as one ``NEURON_DEVICE``.
    """
    return Resource(
        cpu=32,
        gpu=0,
        # memMB is host (instance) memory: 128 GiB for this size. The 32 GB
        # figure in the AWS spec table is accelerator memory on the chip.
        memMB=128 * GiB,
        capabilities={K8S_ITYPE: "inf2.8xlarge"},
        devices={NEURON_DEVICE: 1},
    )


def aws_inf2_24xlarge() -> Resource:
    """Return the named resource for the AWS ``inf2.24xlarge`` instance type.

    The instance has 96 vCPUs, no GPUs, 384 GiB of host memory and six
    Inferentia2 chips, exposed as six ``NEURON_DEVICE`` entries.
    """
    return Resource(
        cpu=96,
        gpu=0,
        # memMB is host (instance) memory: 384 GiB for this size. The 192 GB
        # figure in the AWS spec table is the combined accelerator memory.
        memMB=384 * GiB,
        capabilities={K8S_ITYPE: "inf2.24xlarge"},
        devices={NEURON_DEVICE: 6},
    )


def aws_inf2_48xlarge() -> Resource:
    """Return the named resource for the AWS ``inf2.48xlarge`` instance type.

    The instance has 192 vCPUs, no GPUs, 768 GiB of host memory and twelve
    Inferentia2 chips, exposed as twelve ``NEURON_DEVICE`` entries.
    """
    return Resource(
        cpu=192,
        gpu=0,
        # memMB is host (instance) memory: 768 GiB for this size. The 384 GB
        # figure in the AWS spec table is the combined accelerator memory.
        memMB=768 * GiB,
        capabilities={K8S_ITYPE: "inf2.48xlarge"},
        devices={NEURON_DEVICE: 12},
    )


NAMED_RESOURCES: Mapping[str, Callable[[], Resource]] = {
"aws_t3.medium": aws_t3_medium,
"aws_m5.2xlarge": aws_m5_2xlarge,
Expand Down Expand Up @@ -383,4 +423,8 @@ def aws_trn1_32xlarge() -> Resource:
"aws_g6e.48xlarge": aws_g6e_48xlarge,
"aws_trn1.2xlarge": aws_trn1_2xlarge,
"aws_trn1.32xlarge": aws_trn1_32xlarge,
"aws_inf2.xlarge": aws_inf2_xlarge,
"aws_inf2.8xlarge": aws_inf2_8xlarge,
"aws_inf2.24xlarge": aws_inf2_24xlarge,
"aws_inf2.48xlarge": aws_inf2_48xlarge,
}
29 changes: 29 additions & 0 deletions torchx/specs/test/named_resources_aws_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@
aws_g6e_4xlarge,
aws_g6e_8xlarge,
aws_g6e_xlarge,
aws_inf2_24xlarge,
aws_inf2_48xlarge,
aws_inf2_8xlarge,
aws_inf2_xlarge,
aws_m5_2xlarge,
aws_p3_16xlarge,
aws_p3_2xlarge,
Expand Down Expand Up @@ -231,6 +235,31 @@ def test_aws_trn1(self) -> None:
self.assertEqual(trn1_32.memMB, trn1_2.memMB * 16)
self.assertEqual({EFA_DEVICE: 8, NEURON_DEVICE: 16}, trn1_32.devices)

def test_aws_inf2(self) -> None:
    """Check cpu, gpu, memMB and devices for every Inf2 named resource.

    Memory expectations are the host-memory column of the published AWS
    Inf2 instance table (16 / 128 / 384 / 768 GiB), not the accelerator
    memory column (32 / 32 / 192 / 384 GB).
    """
    # (factory, vCPUs, host memory in GiB, Inferentia2 chips)
    expected_specs = [
        (aws_inf2_xlarge, 4, 16, 1),
        (aws_inf2_8xlarge, 32, 128, 1),
        (aws_inf2_24xlarge, 96, 384, 6),
        (aws_inf2_48xlarge, 192, 768, 12),
    ]

    for factory, cpu, mem_gib, neuron_devices in expected_specs:
        # subTest keeps checking the remaining sizes after one fails and
        # names the offending factory in the failure report.
        with self.subTest(resource=factory.__name__):
            resource = factory()
            self.assertEqual(cpu, resource.cpu)
            self.assertEqual(0, resource.gpu)
            self.assertEqual(mem_gib * GiB, resource.memMB)
            self.assertEqual({NEURON_DEVICE: neuron_devices}, resource.devices)

def test_aws_m5_2xlarge(self) -> None:
resource = aws_m5_2xlarge()
self.assertEqual(8, resource.cpu)
Expand Down

0 comments on commit 894b557

Please sign in to comment.