Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NVIDIA GPU support #24836

Merged
merged 1 commit into from
May 12, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cmd/kubelet/app/options/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ func NewKubeletServer() *KubeletServer {
MaxPerPodContainerCount: 2,
MaxOpenFiles: 1000000,
MaxPods: 110,
NvidiaGPUs: 0,
MinimumGCAge: unversioned.Duration{Duration: 1 * time.Minute},
NetworkPluginDir: "/usr/libexec/kubernetes/kubelet-plugins/net/exec/",
NetworkPluginName: "",
Expand Down Expand Up @@ -231,6 +232,7 @@ func (s *KubeletServer) AddFlags(fs *pflag.FlagSet) {
fs.BoolVar(&s.BabysitDaemons, "babysit-daemons", s.BabysitDaemons, "If true, the node has babysitter process monitoring docker and kubelet.")
fs.MarkDeprecated("babysit-daemons", "Will be removed in a future version.")
fs.Int32Var(&s.MaxPods, "max-pods", s.MaxPods, "Number of Pods that can run on this Kubelet.")
fs.Int32Var(&s.NvidiaGPUs, "experimental-nvidia-gpus", s.NvidiaGPUs, "Number of NVIDIA GPU devices on this node. Only 0 (default) and 1 are currently supported.")
fs.StringVar(&s.DockerExecHandlerName, "docker-exec-handler", s.DockerExecHandlerName, "Handler to use when executing a command in a container. Valid values are 'native' and 'nsenter'. Defaults to 'native'.")
fs.StringVar(&s.NonMasqueradeCIDR, "non-masquerade-cidr", s.NonMasqueradeCIDR, "Traffic to IPs outside this range will use IP masquerade.")
fs.StringVar(&s.PodCIDR, "pod-cidr", "", "The CIDR to use for pod IP addresses, only used in standalone mode. In cluster mode, this is obtained from the master.")
Expand Down
4 changes: 4 additions & 0 deletions cmd/kubelet/app/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@ func UnsecuredKubeletConfig(s *options.KubeletServer) (*KubeletConfig, error) {
MaxOpenFiles: s.MaxOpenFiles,
MaxPerPodContainerCount: int(s.MaxPerPodContainerCount),
MaxPods: int(s.MaxPods),
NvidiaGPUs: int(s.NvidiaGPUs),
MinimumGCAge: s.MinimumGCAge.Duration,
Mounter: mounter,
NetworkPluginName: s.NetworkPluginName,
Expand Down Expand Up @@ -551,6 +552,7 @@ func SimpleKubelet(client *clientset.Clientset,
MaxOpenFiles: 1024,
MaxPerPodContainerCount: 2,
MaxPods: maxPods,
NvidiaGPUs: 0,
MinimumGCAge: minimumGCAge,
Mounter: mount.New(),
NodeStatusUpdateFrequency: nodeStatusUpdateFrequency,
Expand Down Expand Up @@ -743,6 +745,7 @@ type KubeletConfig struct {
NodeLabels map[string]string
NodeStatusUpdateFrequency time.Duration
NonMasqueradeCIDR string
NvidiaGPUs int
OOMAdjuster *oom.OOMAdjuster
OSInterface kubecontainer.OSInterface
PodCIDR string
Expand Down Expand Up @@ -852,6 +855,7 @@ func CreateAndInitKubelet(kc *KubeletConfig) (k KubeletBootstrap, pc *config.Pod
kc.PodCIDR,
kc.ReconcileCIDR,
kc.MaxPods,
kc.NvidiaGPUs,
kc.DockerExecHandler,
kc.ResolverConfig,
kc.CPUCFSQuota,
Expand Down
3 changes: 2 additions & 1 deletion docs/admin/kubelet.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ kubelet
--eviction-soft="": A set of eviction thresholds (e.g. memory.available<1.5Gi) that if met over a corresponding grace period would trigger a pod eviction.
--eviction-soft-grace-period="": A set of eviction grace periods (e.g. memory.available=1m30s) that correspond to how long a soft eviction threshold must hold before triggering a pod eviction.
--experimental-flannel-overlay[=false]: Experimental support for starting the kubelet with the default overlay network (flannel). Assumes flanneld is already running in client mode. [default=false]
--experimental-nvidia-gpus=0: Number of NVIDIA GPU devices on this node. Only 0 (default) and 1 are currently supported.
--file-check-frequency=20s: Duration between checking config files for new data
--google-json-key="": The Google Cloud Platform Service Account JSON Key to use for authentication.
--hairpin-mode="promiscuous-bridge": How should the kubelet setup hairpin NAT. This allows endpoints of a Service to loadbalance back to themselves if they should try to access their own Service. Valid values are "promiscuous-bridge", "hairpin-veth" and "none".
Expand Down Expand Up @@ -156,7 +157,7 @@ kubelet
--volume-stats-agg-period=1m0s: Specifies interval for kubelet to calculate and cache the volume disk usage for all pods and volumes. To disable volume calculations, set to 0. Default: '1m'
```

###### Auto generated by spf13/cobra on 21-Apr-2016
###### Auto generated by spf13/cobra on 3-May-2016


<!-- BEGIN MUNGE: GENERATED_ANALYTICS -->
Expand Down
1 change: 1 addition & 0 deletions hack/verify-flags/known-flags.txt
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ executor-path
executor-suicide-timeout
experimental-flannel-overlay
experimental-keystone-url
experimental-nvidia-gpus
experimental-prefix
external-hostname
external-ip
Expand Down
7 changes: 7 additions & 0 deletions pkg/api/resource_helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,13 @@ func (self *ResourceList) Pods() *resource.Quantity {
return &resource.Quantity{}
}

func (self *ResourceList) NvidiaGPU() *resource.Quantity {
if val, ok := (*self)[ResourceNvidiaGPU]; ok {
return &val
}
return &resource.Quantity{}
}

func GetContainerStatus(statuses []ContainerStatus, name string) (ContainerStatus, bool) {
for i := range statuses {
if statuses[i].Name == name {
Expand Down
7 changes: 7 additions & 0 deletions pkg/api/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -1917,13 +1917,20 @@ type NodeResources struct {
// ResourceName is the name identifying various resources in a ResourceList.
type ResourceName string

// Resource names must be not more than 63 characters, consisting of upper- or lower-case alphanumeric characters,
// with the -, _, and . characters allowed anywhere, except the first or last character.
// The default convention, matching that for annotations, is to use lower-case names, with dashes, rather than
// camel case, separating compound words.
// Fully-qualified resource typenames are constructed from a DNS-style subdomain, followed by a slash `/` and a name.
const (
// CPU, in cores. (500m = .5 cores)
ResourceCPU ResourceName = "cpu"
// Memory, in bytes. (500Gi = 500GiB = 500 * 1024 * 1024 * 1024)
ResourceMemory ResourceName = "memory"
// Volume size, in bytes (e,g. 5Gi = 5GiB = 5 * 1024 * 1024 * 1024)
ResourceStorage ResourceName = "storage"
// NVIDIA GPU, in devices. Alpha, might change: although fractional and allowing values >1, only one whole device per node is assigned.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't understand this comment. The resource is integer GPUs, not milliGPUs, so how can you specify fractional?

ResourceNvidiaGPU ResourceName = "alpha.kubernetes.io/nvidia-gpu"
// Number of Pods that may be running on this Node: see ResourcePods
)

Expand Down
8 changes: 8 additions & 0 deletions pkg/api/v1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -2302,13 +2302,21 @@ type NodeAddress struct {
// ResourceName is the name identifying various resources in a ResourceList.
type ResourceName string

// Resource names must be not more than 63 characters, consisting of upper- or lower-case alphanumeric characters,
// with the -, _, and . characters allowed anywhere, except the first or last character.
// The default convention, matching that for annotations, is to use lower-case names, with dashes, rather than
// camel case, separating compound words.
// Fully-qualified resource typenames are constructed from a DNS-style subdomain, followed by a slash `/` and a name.
const (
// CPU, in cores. (500m = .5 cores)
ResourceCPU ResourceName = "cpu"
// Memory, in bytes. (500Gi = 500GiB = 500 * 1024 * 1024 * 1024)
ResourceMemory ResourceName = "memory"
// Volume size, in bytes (e,g. 5Gi = 5GiB = 5 * 1024 * 1024 * 1024)
ResourceStorage ResourceName = "storage"
// NVIDIA GPU, in devices. Alpha, might change: although fractional and allowing values >1, only one whole device per node is assigned.
ResourceNvidiaGPU ResourceName = "alpha.kubernetes.io/nvidia-gpu"
// Number of Pods that may be running on this Node: see ResourcePods
)

// ResourceList is a set of (resource name, quantity) pairs.
Expand Down
7 changes: 5 additions & 2 deletions pkg/api/validation/validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -2464,6 +2464,7 @@ func validateBasicResource(quantity resource.Quantity, fldPath *field.Path) fiel
func ValidateResourceRequirements(requirements *api.ResourceRequirements, fldPath *field.Path) field.ErrorList {
allErrs := field.ErrorList{}
limPath := fldPath.Child("limits")
reqPath := fldPath.Child("requests")
for resourceName, quantity := range requirements.Limits {
fldPath := limPath.Key(string(resourceName))
// Validate resource name.
Expand All @@ -2474,12 +2475,14 @@ func ValidateResourceRequirements(requirements *api.ResourceRequirements, fldPat
// Check that request <= limit.
requestQuantity, exists := requirements.Requests[resourceName]
if exists {
if quantity.Cmp(requestQuantity) < 0 {
// For GPUs, require that no request be set.
if resourceName == api.ResourceNvidiaGPU {
allErrs = append(allErrs, field.Invalid(reqPath, requestQuantity.String(), "cannot be set"))
} else if quantity.Cmp(requestQuantity) < 0 {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should we also add quantity.Cmp(requrestQuantity) > 1? Because so far we only support /dev/nvidia0

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this is necessary. We would not, for example, validate that your ram is smaller than the largest ram machine.

allErrs = append(allErrs, field.Invalid(fldPath, quantity.String(), "must be greater than or equal to request"))
}
}
}
reqPath := fldPath.Child("requests")
for resourceName, quantity := range requirements.Requests {
fldPath := reqPath.Key(string(resourceName))
// Validate resource name.
Expand Down
35 changes: 35 additions & 0 deletions pkg/api/validation/validation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1375,6 +1375,22 @@ func TestValidateContainers(t *testing.T) {
},
ImagePullPolicy: "IfNotPresent",
},
{
Name: "resources-test-with-gpu",
Image: "image",
Resources: api.ResourceRequirements{
Requests: api.ResourceList{
api.ResourceName(api.ResourceCPU): resource.MustParse("10"),
api.ResourceName(api.ResourceMemory): resource.MustParse("10G"),
},
Limits: api.ResourceList{
api.ResourceName(api.ResourceCPU): resource.MustParse("10"),
api.ResourceName(api.ResourceMemory): resource.MustParse("10G"),
api.ResourceName(api.ResourceNvidiaGPU): resource.MustParse("1"),
},
},
ImagePullPolicy: "IfNotPresent",
},
{
Name: "resources-request-limit-simple",
Image: "image",
Expand Down Expand Up @@ -1598,6 +1614,25 @@ func TestValidateContainers(t *testing.T) {
ImagePullPolicy: "IfNotPresent",
},
},
"Resource can only have GPU limit": {
{
Name: "resources-request-limit-edge",
Image: "image",
Resources: api.ResourceRequirements{
Requests: api.ResourceList{
api.ResourceName(api.ResourceCPU): resource.MustParse("10"),
api.ResourceName(api.ResourceMemory): resource.MustParse("10G"),
api.ResourceName(api.ResourceNvidiaGPU): resource.MustParse("1"),
},
Limits: api.ResourceList{
api.ResourceName(api.ResourceCPU): resource.MustParse("10"),
api.ResourceName(api.ResourceMemory): resource.MustParse("10G"),
api.ResourceName(api.ResourceNvidiaGPU): resource.MustParse("1"),
},
},
ImagePullPolicy: "IfNotPresent",
},
},
"Request limit simple invalid": {
{
Name: "abc-123",
Expand Down
1 change: 1 addition & 0 deletions pkg/apis/componentconfig/deep_copy_generated.go
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,7 @@ func DeepCopy_componentconfig_KubeletConfiguration(in KubeletConfiguration, out
out.HairpinMode = in.HairpinMode
out.BabysitDaemons = in.BabysitDaemons
out.MaxPods = in.MaxPods
out.NvidiaGPUs = in.NvidiaGPUs
out.DockerExecHandlerName = in.DockerExecHandlerName
out.PodCIDR = in.PodCIDR
out.ResolverConfig = in.ResolverConfig
Expand Down
Loading