From e949256bc5591e1c733e5a6eb6d51c7065e5040c Mon Sep 17 00:00:00 2001 From: enxebre Date: Thu, 12 Dec 2024 12:58:54 +0100 Subject: [PATCH] Add karpenter operator This introduces a self-contained controller that knows how to run karpenter management side watching a guest cluster and managing karpenter CRDs --- .../controllers/karpenter/assets/assets.go | 13 + .../karpenter.k8s.aws_ec2nodeclasses.yaml | 739 ++++++++++++++++++ .../assets/karpenter.sh_nodeclaims.yaml | 382 +++++++++ .../assets/karpenter.sh_nodepools.yaml | 507 ++++++++++++ .../karpenter/karpenter_controller.go | 327 ++++++++ .../karpenter/karpenter_controller_test.go | 226 ++++++ .../controllers/karpenter/manifests.go | 420 ++++++++++ karpenter-operator/main.go | 124 +++ karpenter-operator/manifests/operator.go | 367 +++++++++ support/assets/readasset.go | 10 +- 10 files changed, 3113 insertions(+), 2 deletions(-) create mode 100644 karpenter-operator/controllers/karpenter/assets/assets.go create mode 100644 karpenter-operator/controllers/karpenter/assets/karpenter.k8s.aws_ec2nodeclasses.yaml create mode 100644 karpenter-operator/controllers/karpenter/assets/karpenter.sh_nodeclaims.yaml create mode 100644 karpenter-operator/controllers/karpenter/assets/karpenter.sh_nodepools.yaml create mode 100644 karpenter-operator/controllers/karpenter/karpenter_controller.go create mode 100644 karpenter-operator/controllers/karpenter/karpenter_controller_test.go create mode 100644 karpenter-operator/controllers/karpenter/manifests.go create mode 100644 karpenter-operator/main.go create mode 100644 karpenter-operator/manifests/operator.go diff --git a/karpenter-operator/controllers/karpenter/assets/assets.go b/karpenter-operator/controllers/karpenter/assets/assets.go new file mode 100644 index 0000000000..e49f902d92 --- /dev/null +++ b/karpenter-operator/controllers/karpenter/assets/assets.go @@ -0,0 +1,13 @@ +package assets + +import ( + "embed" +) + +//go:embed *.yaml +var f embed.FS + +// ReadFile reads and 
returns the content of the named file. +func ReadFile(name string) ([]byte, error) { + return f.ReadFile(name) +} diff --git a/karpenter-operator/controllers/karpenter/assets/karpenter.k8s.aws_ec2nodeclasses.yaml b/karpenter-operator/controllers/karpenter/assets/karpenter.k8s.aws_ec2nodeclasses.yaml new file mode 100644 index 0000000000..ff1bc9da2b --- /dev/null +++ b/karpenter-operator/controllers/karpenter/assets/karpenter.k8s.aws_ec2nodeclasses.yaml @@ -0,0 +1,739 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.5 + name: ec2nodeclasses.karpenter.k8s.aws +spec: + group: karpenter.k8s.aws + names: + categories: + - karpenter + kind: EC2NodeClass + listKind: EC2NodeClassList + plural: ec2nodeclasses + shortNames: + - ec2nc + - ec2ncs + singular: ec2nodeclass + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .status.conditions[?(@.type=="Ready")].status + name: Ready + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + - jsonPath: .spec.role + name: Role + priority: 1 + type: string + name: v1 + schema: + openAPIV3Schema: + description: EC2NodeClass is the Schema for the EC2NodeClass API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: |- + EC2NodeClassSpec is the top level specification for the AWS Karpenter Provider. + This will contain configuration necessary to launch instances in AWS. + properties: + amiFamily: + description: |- + AMIFamily dictates the UserData format and default BlockDeviceMappings used when generating launch templates. + This field is optional when using an alias amiSelectorTerm, and the value will be inferred from the alias' + family. When an alias is specified, this field may only be set to its corresponding family or 'Custom'. If no + alias is specified, this field is required. + NOTE: We ignore the AMIFamily for hashing here because we hash the AMIFamily dynamically by using the alias using + the AMIFamily() helper function + enum: + - AL2 + - AL2023 + - Bottlerocket + - Custom + - Windows2019 + - Windows2022 + type: string + amiSelectorTerms: + description: AMISelectorTerms is a list of or ami selector terms. The terms are ORed. + items: + description: |- + AMISelectorTerm defines selection logic for an ami used by Karpenter to launch nodes. + If multiple fields are used for selection, the requirements are ANDed. + properties: + alias: + description: |- + Alias specifies which EKS optimized AMI to select. + Each alias consists of a family and an AMI version, specified as "family@version". + Valid families include: al2, al2023, bottlerocket, windows2019, and windows2022. + The version can either be pinned to a specific AMI release, with that AMIs version format (ex: "al2023@v20240625" or "bottlerocket@v1.10.0"). + The version can also be set to "latest" for any family. Setting the version to latest will result in drift when a new AMI is released. This is **not** recommended for production environments. + Note: The Windows families do **not** support version pinning, and only latest may be used. 
+ maxLength: 30 + type: string + x-kubernetes-validations: + - message: '''alias'' is improperly formatted, must match the format ''family@version''' + rule: self.matches('^[a-zA-Z0-9]+@.+$') + - message: 'family is not supported, must be one of the following: ''al2'', ''al2023'', ''bottlerocket'', ''windows2019'', ''windows2022''' + rule: self.split('@')[0] in ['al2','al2023','bottlerocket','windows2019','windows2022'] + - message: windows families may only specify version 'latest' + rule: 'self.split(''@'')[0] in [''windows2019'',''windows2022''] ? self.split(''@'')[1] == ''latest'' : true' + id: + description: ID is the ami id in EC2 + pattern: ami-[0-9a-z]+ + type: string + name: + description: |- + Name is the ami name in EC2. + This value is the name field, which is different from the name tag. + type: string + owner: + description: |- + Owner is the owner for the ami. + You can specify a combination of AWS account IDs, "self", "amazon", and "aws-marketplace" + type: string + tags: + additionalProperties: + type: string + description: |- + Tags is a map of key/value tags used to select subnets + Specifying '*' for a value selects all values for a given tag key. 
+ maxProperties: 20 + type: object + x-kubernetes-validations: + - message: empty tag keys or values aren't supported + rule: self.all(k, k != '' && self[k] != '') + type: object + maxItems: 30 + minItems: 1 + type: array + x-kubernetes-validations: + - message: expected at least one, got none, ['tags', 'id', 'name', 'alias'] + rule: self.all(x, has(x.tags) || has(x.id) || has(x.name) || has(x.alias)) + - message: '''id'' is mutually exclusive, cannot be set with a combination of other fields in amiSelectorTerms' + rule: '!self.exists(x, has(x.id) && (has(x.alias) || has(x.tags) || has(x.name) || has(x.owner)))' + - message: '''alias'' is mutually exclusive, cannot be set with a combination of other fields in amiSelectorTerms' + rule: '!self.exists(x, has(x.alias) && (has(x.id) || has(x.tags) || has(x.name) || has(x.owner)))' + - message: '''alias'' is mutually exclusive, cannot be set with a combination of other amiSelectorTerms' + rule: '!(self.exists(x, has(x.alias)) && self.size() != 1)' + associatePublicIPAddress: + description: AssociatePublicIPAddress controls if public IP addresses are assigned to instances that are launched with the nodeclass. + type: boolean + blockDeviceMappings: + description: BlockDeviceMappings to be applied to provisioned nodes. + items: + properties: + deviceName: + description: The device name (for example, /dev/sdh or xvdh). + type: string + ebs: + description: EBS contains parameters used to automatically set up EBS volumes when an instance is launched. + properties: + deleteOnTermination: + description: DeleteOnTermination indicates whether the EBS volume is deleted on instance termination. + type: boolean + encrypted: + description: |- + Encrypted indicates whether the EBS volume is encrypted. Encrypted volumes can only + be attached to instances that support Amazon EBS encryption. If you are creating + a volume from a snapshot, you can't specify an encryption value. 
+ type: boolean + iops: + description: |- + IOPS is the number of I/O operations per second (IOPS). For gp3, io1, and io2 volumes, + this represents the number of IOPS that are provisioned for the volume. For + gp2 volumes, this represents the baseline performance of the volume and the + rate at which the volume accumulates I/O credits for bursting. + + The following are the supported values for each volume type: + + * gp3: 3,000-16,000 IOPS + + * io1: 100-64,000 IOPS + + * io2: 100-64,000 IOPS + + For io1 and io2 volumes, we guarantee 64,000 IOPS only for Instances built + on the Nitro System (https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-types.html#ec2-nitro-instances). + Other instance families guarantee performance up to 32,000 IOPS. + + This parameter is supported for io1, io2, and gp3 volumes only. This parameter + is not supported for gp2, st1, sc1, or standard volumes. + format: int64 + type: integer + kmsKeyID: + description: KMSKeyID (ARN) of the symmetric Key Management Service (KMS) CMK used for encryption. + type: string + snapshotID: + description: SnapshotID is the ID of an EBS snapshot + type: string + throughput: + description: |- + Throughput to provision for a gp3 volume, with a maximum of 1,000 MiB/s. + Valid Range: Minimum value of 125. Maximum value of 1000. + format: int64 + type: integer + volumeSize: + description: |- + VolumeSize in `Gi`, `G`, `Ti`, or `T`. You must specify either a snapshot ID or + a volume size. The following are the supported volumes sizes for each volume + type: + + * gp2 and gp3: 1-16,384 + + * io1 and io2: 4-16,384 + + * st1 and sc1: 125-16,384 + + * standard: 1-1,024 + pattern: ^((?:[1-9][0-9]{0,3}|[1-4][0-9]{4}|[5][0-8][0-9]{3}|59000)Gi|(?:[1-9][0-9]{0,3}|[1-5][0-9]{4}|[6][0-3][0-9]{3}|64000)G|([1-9]||[1-5][0-7]|58)Ti|([1-9]||[1-5][0-9]|6[0-3]|64)T)$ + type: string + volumeType: + description: |- + VolumeType of the block device. 
+ For more information, see Amazon EBS volume types (https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EBSVolumeTypes.html) + in the Amazon Elastic Compute Cloud User Guide. + enum: + - standard + - io1 + - io2 + - gp2 + - sc1 + - st1 + - gp3 + type: string + type: object + x-kubernetes-validations: + - message: snapshotID or volumeSize must be defined + rule: has(self.snapshotID) || has(self.volumeSize) + rootVolume: + description: |- + RootVolume is a flag indicating if this device is mounted as kubelet root dir. You can + configure at most one root volume in BlockDeviceMappings. + type: boolean + type: object + maxItems: 50 + type: array + x-kubernetes-validations: + - message: must have only one blockDeviceMappings with rootVolume + rule: self.filter(x, has(x.rootVolume)?x.rootVolume==true:false).size() <= 1 + context: + description: |- + Context is a Reserved field in EC2 APIs + https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_CreateFleet.html + type: string + detailedMonitoring: + description: DetailedMonitoring controls if detailed monitoring is enabled for instances that are launched + type: boolean + instanceProfile: + description: |- + InstanceProfile is the AWS entity that instances use. + This field is mutually exclusive from role. + The instance profile should already have a role assigned to it that Karpenter + has PassRole permission on for instance launch using this instanceProfile to succeed. + type: string + x-kubernetes-validations: + - message: instanceProfile cannot be empty + rule: self != '' + instanceStorePolicy: + description: InstanceStorePolicy specifies how to handle instance-store disks. + enum: + - RAID0 + type: string + kubelet: + description: |- + Kubelet defines args to be used when configuring kubelet on provisioned nodes. + They are a subset of the upstream types, recognizing not all options may be supported. + Wherever possible, the types and names should reflect the upstream kubelet types. 
+ properties: + clusterDNS: + description: |- + clusterDNS is a list of IP addresses for the cluster DNS server. + Note that not all providers may use all addresses. + items: + type: string + type: array + cpuCFSQuota: + description: CPUCFSQuota enables CPU CFS quota enforcement for containers that specify CPU limits. + type: boolean + evictionHard: + additionalProperties: + type: string + pattern: ^((\d{1,2}(\.\d{1,2})?|100(\.0{1,2})?)%||(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?)$ + description: EvictionHard is the map of signal names to quantities that define hard eviction thresholds + type: object + x-kubernetes-validations: + - message: valid keys for evictionHard are ['memory.available','nodefs.available','nodefs.inodesFree','imagefs.available','imagefs.inodesFree','pid.available'] + rule: self.all(x, x in ['memory.available','nodefs.available','nodefs.inodesFree','imagefs.available','imagefs.inodesFree','pid.available']) + evictionMaxPodGracePeriod: + description: |- + EvictionMaxPodGracePeriod is the maximum allowed grace period (in seconds) to use when terminating pods in + response to soft eviction thresholds being met. 
+ format: int32 + type: integer + evictionSoft: + additionalProperties: + type: string + pattern: ^((\d{1,2}(\.\d{1,2})?|100(\.0{1,2})?)%||(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?)$ + description: EvictionSoft is the map of signal names to quantities that define soft eviction thresholds + type: object + x-kubernetes-validations: + - message: valid keys for evictionSoft are ['memory.available','nodefs.available','nodefs.inodesFree','imagefs.available','imagefs.inodesFree','pid.available'] + rule: self.all(x, x in ['memory.available','nodefs.available','nodefs.inodesFree','imagefs.available','imagefs.inodesFree','pid.available']) + evictionSoftGracePeriod: + additionalProperties: + type: string + description: EvictionSoftGracePeriod is the map of signal names to quantities that define grace periods for each eviction signal + type: object + x-kubernetes-validations: + - message: valid keys for evictionSoftGracePeriod are ['memory.available','nodefs.available','nodefs.inodesFree','imagefs.available','imagefs.inodesFree','pid.available'] + rule: self.all(x, x in ['memory.available','nodefs.available','nodefs.inodesFree','imagefs.available','imagefs.inodesFree','pid.available']) + imageGCHighThresholdPercent: + description: |- + ImageGCHighThresholdPercent is the percent of disk usage after which image + garbage collection is always run. The percent is calculated by dividing this + field value by 100, so this field must be between 0 and 100, inclusive. + When specified, the value must be greater than ImageGCLowThresholdPercent. + format: int32 + maximum: 100 + minimum: 0 + type: integer + imageGCLowThresholdPercent: + description: |- + ImageGCLowThresholdPercent is the percent of disk usage before which image + garbage collection is never run. Lowest disk usage to garbage collect to. 
+ The percent is calculated by dividing this field value by 100, + so the field value must be between 0 and 100, inclusive. + When specified, the value must be less than imageGCHighThresholdPercent + format: int32 + maximum: 100 + minimum: 0 + type: integer + kubeReserved: + additionalProperties: + type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + description: KubeReserved contains resources reserved for Kubernetes system components. + type: object + x-kubernetes-validations: + - message: valid keys for kubeReserved are ['cpu','memory','ephemeral-storage','pid'] + rule: self.all(x, x=='cpu' || x=='memory' || x=='ephemeral-storage' || x=='pid') + - message: kubeReserved value cannot be a negative resource quantity + rule: self.all(x, !self[x].startsWith('-')) + maxPods: + description: |- + MaxPods is an override for the maximum number of pods that can run on + a worker node instance. + format: int32 + minimum: 0 + type: integer + podsPerCore: + description: |- + PodsPerCore is an override for the number of pods that can run on a worker node + instance based on the number of cpu cores. This value cannot exceed MaxPods, so, if + MaxPods is a lower value, that value will be used. + format: int32 + minimum: 0 + type: integer + systemReserved: + additionalProperties: + type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + description: SystemReserved contains resources reserved for OS system daemons and kernel memory. 
+ type: object + x-kubernetes-validations: + - message: valid keys for systemReserved are ['cpu','memory','ephemeral-storage','pid'] + rule: self.all(x, x=='cpu' || x=='memory' || x=='ephemeral-storage' || x=='pid') + - message: systemReserved value cannot be a negative resource quantity + rule: self.all(x, !self[x].startsWith('-')) + type: object + x-kubernetes-validations: + - message: imageGCHighThresholdPercent must be greater than imageGCLowThresholdPercent + rule: 'has(self.imageGCHighThresholdPercent) && has(self.imageGCLowThresholdPercent) ? self.imageGCHighThresholdPercent > self.imageGCLowThresholdPercent : true' + - message: evictionSoft OwnerKey does not have a matching evictionSoftGracePeriod + rule: has(self.evictionSoft) ? self.evictionSoft.all(e, (e in self.evictionSoftGracePeriod)):true + - message: evictionSoftGracePeriod OwnerKey does not have a matching evictionSoft + rule: has(self.evictionSoftGracePeriod) ? self.evictionSoftGracePeriod.all(e, (e in self.evictionSoft)):true + metadataOptions: + default: + httpEndpoint: enabled + httpProtocolIPv6: disabled + httpPutResponseHopLimit: 1 + httpTokens: required + description: |- + MetadataOptions for the generated launch template of provisioned nodes. + + This specifies the exposure of the Instance Metadata Service to + provisioned EC2 nodes. For more information, + see Instance Metadata and User Data + (https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-metadata.html) + in the Amazon Elastic Compute Cloud User Guide. + + Refer to recommended, security best practices + (https://aws.github.io/aws-eks-best-practices/security/docs/iam/#restrict-access-to-the-instance-profile-assigned-to-the-worker-node) + for limiting exposure of Instance Metadata and User Data to pods. + If omitted, defaults to httpEndpoint enabled, with httpProtocolIPv6 + disabled, with httpPutResponseLimit of 1, and with httpTokens + required. 
+ properties: + httpEndpoint: + default: enabled + description: |- + HTTPEndpoint enables or disables the HTTP metadata endpoint on provisioned + nodes. If metadata options is non-nil, but this parameter is not specified, + the default state is "enabled". + + If you specify a value of "disabled", instance metadata will not be accessible + on the node. + enum: + - enabled + - disabled + type: string + httpProtocolIPv6: + default: disabled + description: |- + HTTPProtocolIPv6 enables or disables the IPv6 endpoint for the instance metadata + service on provisioned nodes. If metadata options is non-nil, but this parameter + is not specified, the default state is "disabled". + enum: + - enabled + - disabled + type: string + httpPutResponseHopLimit: + default: 1 + description: |- + HTTPPutResponseHopLimit is the desired HTTP PUT response hop limit for + instance metadata requests. The larger the number, the further instance + metadata requests can travel. Possible values are integers from 1 to 64. + If metadata options is non-nil, but this parameter is not specified, the + default value is 1. + format: int64 + maximum: 64 + minimum: 1 + type: integer + httpTokens: + default: required + description: |- + HTTPTokens determines the state of token usage for instance metadata + requests. If metadata options is non-nil, but this parameter is not + specified, the default state is "required". + + If the state is optional, one can choose to retrieve instance metadata with + or without a signed token header on the request. If one retrieves the IAM + role credentials without a token, the version 1.0 role credentials are + returned. If one retrieves the IAM role credentials using a valid signed + token, the version 2.0 role credentials are returned. + + If the state is "required", one must send a signed token header with any + instance metadata retrieval requests. 
In this state, retrieving the IAM + role credentials always returns the version 2.0 credentials; the version + 1.0 credentials are not available. + enum: + - required + - optional + type: string + type: object + role: + description: |- + Role is the AWS identity that nodes use. This field is immutable. + This field is mutually exclusive from instanceProfile. + Marking this field as immutable avoids concerns around terminating managed instance profiles from running instances. + This field may be made mutable in the future, assuming the correct garbage collection and drift handling is implemented + for the old instance profiles on an update. + type: string + x-kubernetes-validations: + - message: role cannot be empty + rule: self != '' + - message: immutable field changed + rule: self == oldSelf + securityGroupSelectorTerms: + description: SecurityGroupSelectorTerms is a list of or security group selector terms. The terms are ORed. + items: + description: |- + SecurityGroupSelectorTerm defines selection logic for a security group used by Karpenter to launch nodes. + If multiple fields are used for selection, the requirements are ANDed. + properties: + id: + description: ID is the security group id in EC2 + pattern: sg-[0-9a-z]+ + type: string + name: + description: |- + Name is the security group name in EC2. + This value is the name field, which is different from the name tag. + type: string + tags: + additionalProperties: + type: string + description: |- + Tags is a map of key/value tags used to select subnets + Specifying '*' for a value selects all values for a given tag key. 
+ maxProperties: 20 + type: object + x-kubernetes-validations: + - message: empty tag keys or values aren't supported + rule: self.all(k, k != '' && self[k] != '') + type: object + maxItems: 30 + type: array + x-kubernetes-validations: + - message: securityGroupSelectorTerms cannot be empty + rule: self.size() != 0 + - message: expected at least one, got none, ['tags', 'id', 'name'] + rule: self.all(x, has(x.tags) || has(x.id) || has(x.name)) + - message: '''id'' is mutually exclusive, cannot be set with a combination of other fields in securityGroupSelectorTerms' + rule: '!self.all(x, has(x.id) && (has(x.tags) || has(x.name)))' + - message: '''name'' is mutually exclusive, cannot be set with a combination of other fields in securityGroupSelectorTerms' + rule: '!self.all(x, has(x.name) && (has(x.tags) || has(x.id)))' + subnetSelectorTerms: + description: SubnetSelectorTerms is a list of or subnet selector terms. The terms are ORed. + items: + description: |- + SubnetSelectorTerm defines selection logic for a subnet used by Karpenter to launch nodes. + If multiple fields are used for selection, the requirements are ANDed. + properties: + id: + description: ID is the subnet id in EC2 + pattern: subnet-[0-9a-z]+ + type: string + tags: + additionalProperties: + type: string + description: |- + Tags is a map of key/value tags used to select subnets + Specifying '*' for a value selects all values for a given tag key. 
+ maxProperties: 20 + type: object + x-kubernetes-validations: + - message: empty tag keys or values aren't supported + rule: self.all(k, k != '' && self[k] != '') + type: object + maxItems: 30 + type: array + x-kubernetes-validations: + - message: subnetSelectorTerms cannot be empty + rule: self.size() != 0 + - message: expected at least one, got none, ['tags', 'id'] + rule: self.all(x, has(x.tags) || has(x.id)) + - message: '''id'' is mutually exclusive, cannot be set with a combination of other fields in subnetSelectorTerms' + rule: '!self.all(x, has(x.id) && has(x.tags))' + tags: + additionalProperties: + type: string + description: Tags to be applied on ec2 resources like instances and launch templates. + type: object + x-kubernetes-validations: + - message: empty tag keys aren't supported + rule: self.all(k, k != '') + - message: tag contains a restricted tag matching eks:eks-cluster-name + rule: self.all(k, k !='eks:eks-cluster-name') + - message: tag contains a restricted tag matching kubernetes.io/cluster/ + rule: self.all(k, !k.startsWith('kubernetes.io/cluster') ) + - message: tag contains a restricted tag matching karpenter.sh/nodepool + rule: self.all(k, k != 'karpenter.sh/nodepool') + - message: tag contains a restricted tag matching karpenter.sh/nodeclaim + rule: self.all(k, k !='karpenter.sh/nodeclaim') + - message: tag contains a restricted tag matching karpenter.k8s.aws/ec2nodeclass + rule: self.all(k, k !='karpenter.k8s.aws/ec2nodeclass') + userData: + description: |- + UserData to be applied to the provisioned nodes. + It must be in the appropriate format based on the AMIFamily in use. Karpenter will merge certain fields into + this UserData to ensure nodes are being provisioned with the correct configuration. 
+ type: string + required: + - amiSelectorTerms + - securityGroupSelectorTerms + - subnetSelectorTerms + type: object + x-kubernetes-validations: + - message: must specify exactly one of ['role', 'instanceProfile'] + rule: (has(self.role) && !has(self.instanceProfile)) || (!has(self.role) && has(self.instanceProfile)) + - message: changing from 'instanceProfile' to 'role' is not supported. You must delete and recreate this node class if you want to change this. + rule: (has(oldSelf.role) && has(self.role)) || (has(oldSelf.instanceProfile) && has(self.instanceProfile)) + - message: if set, amiFamily must be 'AL2' or 'Custom' when using an AL2 alias + rule: '!has(self.amiFamily) || (self.amiSelectorTerms.exists(x, has(x.alias) && x.alias.find(''^[^@]+'') == ''al2'') ? (self.amiFamily == ''Custom'' || self.amiFamily == ''AL2'') : true)' + - message: if set, amiFamily must be 'AL2023' or 'Custom' when using an AL2023 alias + rule: '!has(self.amiFamily) || (self.amiSelectorTerms.exists(x, has(x.alias) && x.alias.find(''^[^@]+'') == ''al2023'') ? (self.amiFamily == ''Custom'' || self.amiFamily == ''AL2023'') : true)' + - message: if set, amiFamily must be 'Bottlerocket' or 'Custom' when using a Bottlerocket alias + rule: '!has(self.amiFamily) || (self.amiSelectorTerms.exists(x, has(x.alias) && x.alias.find(''^[^@]+'') == ''bottlerocket'') ? (self.amiFamily == ''Custom'' || self.amiFamily == ''Bottlerocket'') : true)' + - message: if set, amiFamily must be 'Windows2019' or 'Custom' when using a Windows2019 alias + rule: '!has(self.amiFamily) || (self.amiSelectorTerms.exists(x, has(x.alias) && x.alias.find(''^[^@]+'') == ''windows2019'') ? (self.amiFamily == ''Custom'' || self.amiFamily == ''Windows2019'') : true)' + - message: if set, amiFamily must be 'Windows2022' or 'Custom' when using a Windows2022 alias + rule: '!has(self.amiFamily) || (self.amiSelectorTerms.exists(x, has(x.alias) && x.alias.find(''^[^@]+'') == ''windows2022'') ? 
(self.amiFamily == ''Custom'' || self.amiFamily == ''Windows2022'') : true)' + - message: must specify amiFamily if amiSelectorTerms does not contain an alias + rule: 'self.amiSelectorTerms.exists(x, has(x.alias)) ? true : has(self.amiFamily)' + status: + description: EC2NodeClassStatus contains the resolved state of the EC2NodeClass + properties: + amis: + description: |- + AMI contains the current AMI values that are available to the + cluster under the AMI selectors. + items: + description: AMI contains resolved AMI selector values utilized for node launch + properties: + id: + description: ID of the AMI + type: string + name: + description: Name of the AMI + type: string + requirements: + description: Requirements of the AMI to be utilized on an instance type + items: + description: |- + A node selector requirement is a selector that contains values, a key, and an operator + that relates the key and values. + properties: + key: + description: The label key that the selector applies to. + type: string + operator: + description: |- + Represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: |- + An array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. If the operator is Gt or Lt, the values + array must have a single element, which will be interpreted as an integer. + This array is replaced during a strategic merge patch. 
+ items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + required: + - id + - requirements + type: object + type: array + conditions: + description: Conditions contains signals for health and readiness + items: + description: Condition aliases the upstream type and adds additional helper methods + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. 
+ maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + instanceProfile: + description: InstanceProfile contains the resolved instance profile for the role + type: string + securityGroups: + description: |- + SecurityGroups contains the current Security Groups values that are available to the + cluster under the SecurityGroups selectors. + items: + description: SecurityGroup contains resolved SecurityGroup selector values utilized for node launch + properties: + id: + description: ID of the security group + type: string + name: + description: Name of the security group + type: string + required: + - id + type: object + type: array + subnets: + description: |- + Subnets contains the current Subnet values that are available to the + cluster under the subnet selectors. + items: + description: Subnet contains resolved Subnet selector values utilized for node launch + properties: + id: + description: ID of the subnet + type: string + zone: + description: The associated availability zone + type: string + zoneID: + description: The associated availability zone ID + type: string + required: + - id + - zone + type: object + type: array + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/karpenter-operator/controllers/karpenter/assets/karpenter.sh_nodeclaims.yaml b/karpenter-operator/controllers/karpenter/assets/karpenter.sh_nodeclaims.yaml new file mode 100644 index 0000000000..7976cfe0f4 --- /dev/null +++ b/karpenter-operator/controllers/karpenter/assets/karpenter.sh_nodeclaims.yaml @@ -0,0 +1,382 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.5 + name: nodeclaims.karpenter.sh +spec: + group: karpenter.sh + names: + 
categories: + - karpenter + kind: NodeClaim + listKind: NodeClaimList + plural: nodeclaims + singular: nodeclaim + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .metadata.labels.node\.kubernetes\.io/instance-type + name: Type + type: string + - jsonPath: .metadata.labels.karpenter\.sh/capacity-type + name: Capacity + type: string + - jsonPath: .metadata.labels.topology\.kubernetes\.io/zone + name: Zone + type: string + - jsonPath: .status.nodeName + name: Node + type: string + - jsonPath: .status.conditions[?(@.type=="Ready")].status + name: Ready + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + - jsonPath: .status.providerID + name: ID + priority: 1 + type: string + - jsonPath: .metadata.labels.karpenter\.sh/nodepool + name: NodePool + priority: 1 + type: string + - jsonPath: .spec.nodeClassRef.name + name: NodeClass + priority: 1 + type: string + name: v1 + schema: + openAPIV3Schema: + description: NodeClaim is the Schema for the NodeClaims API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: NodeClaimSpec describes the desired state of the NodeClaim + properties: + expireAfter: + default: 720h + description: |- + ExpireAfter is the duration the controller will wait + before terminating a node, measured from when the node is created. This + is useful to implement features like eventually consistent node upgrade, + memory leak protection, and disruption testing. + pattern: ^(([0-9]+(s|m|h))+)|(Never)$ + type: string + nodeClassRef: + description: NodeClassRef is a reference to an object that defines provider specific configuration + properties: + group: + description: API version of the referent + pattern: ^[^/]*$ + type: string + kind: + description: 'Kind of the referent; More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds"' + type: string + name: + description: 'Name of the referent; More info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + required: + - group + - kind + - name + type: object + requirements: + description: Requirements are layered with GetLabels and applied to every node. + items: + description: |- + A node selector requirement with min values is a selector that contains values, a key, an operator that relates the key and values + and minValues that represent the requirement to have at least that many values. + properties: + key: + description: The label key that the selector applies to. 
+ type: string + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ + x-kubernetes-validations: + - message: label domain "kubernetes.io" is restricted + rule: self in ["beta.kubernetes.io/instance-type", "failure-domain.beta.kubernetes.io/region", "beta.kubernetes.io/os", "beta.kubernetes.io/arch", "failure-domain.beta.kubernetes.io/zone", "topology.kubernetes.io/zone", "topology.kubernetes.io/region", "node.kubernetes.io/instance-type", "kubernetes.io/arch", "kubernetes.io/os", "node.kubernetes.io/windows-build"] || self.find("^([^/]+)").endsWith("node.kubernetes.io") || self.find("^([^/]+)").endsWith("node-restriction.kubernetes.io") || !self.find("^([^/]+)").endsWith("kubernetes.io") + - message: label domain "k8s.io" is restricted + rule: self.find("^([^/]+)").endsWith("kops.k8s.io") || !self.find("^([^/]+)").endsWith("k8s.io") + - message: label domain "karpenter.sh" is restricted + rule: self in ["karpenter.sh/capacity-type", "karpenter.sh/nodepool"] || !self.find("^([^/]+)").endsWith("karpenter.sh") + - message: label "kubernetes.io/hostname" is restricted + rule: self != "kubernetes.io/hostname" + - message: label domain "karpenter.k8s.aws" is restricted + rule: self in ["karpenter.k8s.aws/instance-encryption-in-transit-supported", "karpenter.k8s.aws/instance-category", "karpenter.k8s.aws/instance-hypervisor", "karpenter.k8s.aws/instance-family", "karpenter.k8s.aws/instance-generation", "karpenter.k8s.aws/instance-local-nvme", "karpenter.k8s.aws/instance-size", "karpenter.k8s.aws/instance-cpu","karpenter.k8s.aws/instance-cpu-manufacturer","karpenter.k8s.aws/instance-memory", "karpenter.k8s.aws/instance-ebs-bandwidth", "karpenter.k8s.aws/instance-network-bandwidth", "karpenter.k8s.aws/instance-gpu-name", "karpenter.k8s.aws/instance-gpu-manufacturer", "karpenter.k8s.aws/instance-gpu-count", "karpenter.k8s.aws/instance-gpu-memory", 
"karpenter.k8s.aws/instance-accelerator-name", "karpenter.k8s.aws/instance-accelerator-manufacturer", "karpenter.k8s.aws/instance-accelerator-count"] || !self.find("^([^/]+)").endsWith("karpenter.k8s.aws") + minValues: + description: |- + This field is ALPHA and can be dropped or replaced at any time + MinValues is the minimum number of unique values required to define the flexibility of the specific requirement. + maximum: 50 + minimum: 1 + type: integer + operator: + description: |- + Represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + enum: + - In + - NotIn + - Exists + - DoesNotExist + - Gt + - Lt + values: + description: |- + An array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. If the operator is Gt or Lt, the values + array must have a single element, which will be interpreted as an integer. + This array is replaced during a strategic merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + maxLength: 63 + pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$ + required: + - key + - operator + type: object + maxItems: 100 + type: array + x-kubernetes-validations: + - message: requirements with operator 'In' must have a value defined + rule: 'self.all(x, x.operator == ''In'' ? x.values.size() != 0 : true)' + - message: requirements operator 'Gt' or 'Lt' must have a single positive integer value + rule: 'self.all(x, (x.operator == ''Gt'' || x.operator == ''Lt'') ? (x.values.size() == 1 && int(x.values[0]) >= 0) : true)' + - message: requirements with 'minValues' must have at least that many values specified in the 'values' field + rule: 'self.all(x, (x.operator == ''In'' && has(x.minValues)) ? 
x.values.size() >= x.minValues : true)' + resources: + description: Resources models the resource requirements for the NodeClaim to launch + properties: + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: Requests describes the minimum required resources for the NodeClaim to launch + type: object + type: object + startupTaints: + description: |- + StartupTaints are taints that are applied to nodes upon startup which are expected to be removed automatically + within a short period of time, typically by a DaemonSet that tolerates the taint. These are commonly used by + daemonsets to allow initialization and enforce startup ordering. StartupTaints are ignored for provisioning + purposes in that pods are not required to tolerate a StartupTaint in order to have nodes provisioned for them. + items: + description: |- + The node this Taint is attached to has the "effect" on + any pod that does not tolerate the Taint. + properties: + effect: + description: |- + Required. The effect of the taint on pods + that do not tolerate the taint. + Valid effects are NoSchedule, PreferNoSchedule and NoExecute. + type: string + enum: + - NoSchedule + - PreferNoSchedule + - NoExecute + key: + description: Required. The taint key to be applied to a node. + type: string + minLength: 1 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ + timeAdded: + description: |- + TimeAdded represents the time at which the taint was added. + It is only written for NoExecute taints. + format: date-time + type: string + value: + description: The taint value corresponding to the taint key. 
+ type: string + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ + required: + - effect + - key + type: object + type: array + taints: + description: Taints will be applied to the NodeClaim's node. + items: + description: |- + The node this Taint is attached to has the "effect" on + any pod that does not tolerate the Taint. + properties: + effect: + description: |- + Required. The effect of the taint on pods + that do not tolerate the taint. + Valid effects are NoSchedule, PreferNoSchedule and NoExecute. + type: string + enum: + - NoSchedule + - PreferNoSchedule + - NoExecute + key: + description: Required. The taint key to be applied to a node. + type: string + minLength: 1 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ + timeAdded: + description: |- + TimeAdded represents the time at which the taint was added. + It is only written for NoExecute taints. + format: date-time + type: string + value: + description: The taint value corresponding to the taint key. + type: string + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ + required: + - effect + - key + type: object + type: array + terminationGracePeriod: + description: |- + TerminationGracePeriod is the maximum duration the controller will wait before forcefully deleting the pods on a node, measured from when deletion is first initiated. + + Warning: this feature takes precedence over a Pod's terminationGracePeriodSeconds value, and bypasses any blocked PDBs or the karpenter.sh/do-not-disrupt annotation. + + This field is intended to be used by cluster administrators to enforce that nodes can be cycled within a given time period. + When set, drifted nodes will begin draining even if there are pods blocking eviction. 
Draining will respect PDBs and the do-not-disrupt annotation until the TGP is reached. + + Karpenter will preemptively delete pods so their terminationGracePeriodSeconds align with the node's terminationGracePeriod. + If a pod would be terminated without being granted its full terminationGracePeriodSeconds prior to the node timeout, + that pod will be deleted at T = node timeout - pod terminationGracePeriodSeconds. + + The feature can also be used to allow maximum time limits for long-running jobs which can delay node termination with preStop hooks. + If left undefined, the controller will wait indefinitely for pods to be drained. + pattern: ^([0-9]+(s|m|h))+$ + type: string + required: + - nodeClassRef + - requirements + type: object + x-kubernetes-validations: + - message: spec is immutable + rule: self == oldSelf + status: + description: NodeClaimStatus defines the observed state of NodeClaim + properties: + allocatable: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: Allocatable is the estimated allocatable capacity of the node + type: object + capacity: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: Capacity is the estimated full capacity of the node + type: object + conditions: + description: Conditions contains signals for health and readiness + items: + description: Condition aliases the upstream type and adds additional helper methods + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. 
If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + pattern: ^([A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?|)$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - status + - type + type: object + type: array + imageID: + description: ImageID is an identifier for the image that runs on the node + type: string + lastPodEventTime: + description: |- + LastPodEventTime is updated with the last time a pod was scheduled + or removed from the node. A pod going terminal or terminating + is also considered as removed. 
+ format: date-time + type: string + nodeName: + description: NodeName is the name of the corresponding node object + type: string + providerID: + description: ProviderID of the corresponding node object + type: string + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/karpenter-operator/controllers/karpenter/assets/karpenter.sh_nodepools.yaml b/karpenter-operator/controllers/karpenter/assets/karpenter.sh_nodepools.yaml new file mode 100644 index 0000000000..f20e73cd0d --- /dev/null +++ b/karpenter-operator/controllers/karpenter/assets/karpenter.sh_nodepools.yaml @@ -0,0 +1,507 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.5 + name: nodepools.karpenter.sh +spec: + group: karpenter.sh + names: + categories: + - karpenter + kind: NodePool + listKind: NodePoolList + plural: nodepools + singular: nodepool + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .spec.template.spec.nodeClassRef.name + name: NodeClass + type: string + - jsonPath: .status.resources.nodes + name: Nodes + type: string + - jsonPath: .status.conditions[?(@.type=="Ready")].status + name: Ready + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + - jsonPath: .spec.weight + name: Weight + priority: 1 + type: integer + - jsonPath: .status.resources.cpu + name: CPU + priority: 1 + type: string + - jsonPath: .status.resources.memory + name: Memory + priority: 1 + type: string + name: v1 + schema: + openAPIV3Schema: + description: NodePool is the Schema for the NodePools API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: |- + NodePoolSpec is the top level nodepool specification. Nodepools + launch nodes in response to pods that are unschedulable. A single nodepool + is capable of managing a diverse set of nodes. Node properties are determined + from a combination of nodepool and pod scheduling constraints. + properties: + disruption: + default: + consolidateAfter: 0s + description: Disruption contains the parameters that relate to Karpenter's disruption logic + properties: + budgets: + default: + - nodes: 10% + description: |- + Budgets is a list of Budgets. + If there are multiple active budgets, Karpenter uses + the most restrictive value. If left undefined, + this will default to one budget with a value to 10%. + items: + description: |- + Budget defines when Karpenter will restrict the + number of Node Claims that can be terminating simultaneously. + properties: + duration: + description: |- + Duration determines how long a Budget is active since each Schedule hit. + Only minutes and hours are accepted, as cron does not work in seconds. + If omitted, the budget is always active. + This is required if Schedule is set. + This regex has an optional 0s at the end since the duration.String() always adds + a 0s at the end. + pattern: ^((([0-9]+(h|m))|([0-9]+h[0-9]+m))(0s)?)$ + type: string + nodes: + default: 10% + description: |- + Nodes dictates the maximum number of NodeClaims owned by this NodePool + that can be terminating at once. 
This is calculated by counting nodes that + have a deletion timestamp set, or are actively being deleted by Karpenter. + This field is required when specifying a budget. + This cannot be of type intstr.IntOrString since kubebuilder doesn't support pattern + checking for int nodes for IntOrString nodes. + Ref: https://github.com/kubernetes-sigs/controller-tools/blob/55efe4be40394a288216dab63156b0a64fb82929/pkg/crd/markers/validation.go#L379-L388 + pattern: ^((100|[0-9]{1,2})%|[0-9]+)$ + type: string + reasons: + description: |- + Reasons is a list of disruption methods that this budget applies to. If Reasons is not set, this budget applies to all methods. + Otherwise, this will apply to each reason defined. + allowed reasons are Underutilized, Empty, and Drifted and additional CloudProvider-specific reasons. + items: + description: |- + DisruptionReason defines valid reasons for disruption budgets. + CloudProviders will need to append to the list of enums when implementing cloud provider disruption reasons + enum: + - Underutilized + - Empty + - Drifted + type: string + type: array + schedule: + description: |- + Schedule specifies when a budget begins being active, following + the upstream cronjob syntax. If omitted, the budget is always active. + Timezones are not supported. + This field is required if Duration is set. + pattern: ^(@(annually|yearly|monthly|weekly|daily|midnight|hourly))|((.+)\s(.+)\s(.+)\s(.+)\s(.+))$ + type: string + required: + - nodes + type: object + maxItems: 50 + type: array + x-kubernetes-validations: + - message: '''schedule'' must be set with ''duration''' + rule: self.all(x, has(x.schedule) == has(x.duration)) + consolidateAfter: + description: |- + ConsolidateAfter is the duration the controller will wait + before attempting to terminate nodes that are underutilized. + Refer to ConsolidationPolicy for how underutilization is considered. 
+ pattern: ^(([0-9]+(s|m|h))+)|(Never)$ + type: string + consolidationPolicy: + default: WhenEmptyOrUnderutilized + description: |- + ConsolidationPolicy describes which nodes Karpenter can disrupt through its consolidation + algorithm. This policy defaults to "WhenEmptyOrUnderutilized" if not specified + enum: + - WhenEmpty + - WhenEmptyOrUnderutilized + type: string + required: + - consolidateAfter + type: object + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: Limits define a set of bounds for provisioning capacity. + type: object + template: + description: |- + Template contains the template of possibilities for the provisioning logic to launch a NodeClaim with. + NodeClaims launched from this NodePool will often be further constrained than the template specifies. + properties: + metadata: + properties: + annotations: + additionalProperties: + type: string + description: |- + Annotations is an unstructured key value map stored with a resource that may be + set by external tools to store and retrieve arbitrary metadata. They are not + queryable and should be preserved when modifying objects. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations + type: object + labels: + additionalProperties: + type: string + maxLength: 63 + pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$ + description: |- + Map of string keys and values that can be used to organize and categorize + (scope and select) objects. May match selectors of replication controllers + and services. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels + type: object + maxProperties: 100 + x-kubernetes-validations: + - message: label domain "kubernetes.io" is restricted + rule: self.all(x, x in ["beta.kubernetes.io/instance-type", "failure-domain.beta.kubernetes.io/region", "beta.kubernetes.io/os", "beta.kubernetes.io/arch", "failure-domain.beta.kubernetes.io/zone", "topology.kubernetes.io/zone", "topology.kubernetes.io/region", "kubernetes.io/arch", "kubernetes.io/os", "node.kubernetes.io/windows-build"] || x.find("^([^/]+)").endsWith("node.kubernetes.io") || x.find("^([^/]+)").endsWith("node-restriction.kubernetes.io") || !x.find("^([^/]+)").endsWith("kubernetes.io")) + - message: label domain "k8s.io" is restricted + rule: self.all(x, x.find("^([^/]+)").endsWith("kops.k8s.io") || !x.find("^([^/]+)").endsWith("k8s.io")) + - message: label domain "karpenter.sh" is restricted + rule: self.all(x, x in ["karpenter.sh/capacity-type", "karpenter.sh/nodepool"] || !x.find("^([^/]+)").endsWith("karpenter.sh")) + - message: label "karpenter.sh/nodepool" is restricted + rule: self.all(x, x != "karpenter.sh/nodepool") + - message: label "kubernetes.io/hostname" is restricted + rule: self.all(x, x != "kubernetes.io/hostname") + - message: label domain "karpenter.k8s.aws" is restricted + rule: self.all(x, x in ["karpenter.k8s.aws/instance-encryption-in-transit-supported", "karpenter.k8s.aws/instance-category", "karpenter.k8s.aws/instance-hypervisor", "karpenter.k8s.aws/instance-family", "karpenter.k8s.aws/instance-generation", "karpenter.k8s.aws/instance-local-nvme", "karpenter.k8s.aws/instance-size", "karpenter.k8s.aws/instance-cpu","karpenter.k8s.aws/instance-cpu-manufacturer","karpenter.k8s.aws/instance-memory", "karpenter.k8s.aws/instance-ebs-bandwidth", "karpenter.k8s.aws/instance-network-bandwidth", "karpenter.k8s.aws/instance-gpu-name", "karpenter.k8s.aws/instance-gpu-manufacturer", "karpenter.k8s.aws/instance-gpu-count", 
"karpenter.k8s.aws/instance-gpu-memory", "karpenter.k8s.aws/instance-accelerator-name", "karpenter.k8s.aws/instance-accelerator-manufacturer", "karpenter.k8s.aws/instance-accelerator-count"] || !x.find("^([^/]+)").endsWith("karpenter.k8s.aws")) + type: object + spec: + description: |- + NodeClaimTemplateSpec describes the desired state of the NodeClaim in the Nodepool + NodeClaimTemplateSpec is used in the NodePool's NodeClaimTemplate, with the resource requests omitted since + users are not able to set resource requests in the NodePool. + properties: + expireAfter: + default: 720h + description: |- + ExpireAfter is the duration the controller will wait + before terminating a node, measured from when the node is created. This + is useful to implement features like eventually consistent node upgrade, + memory leak protection, and disruption testing. + pattern: ^(([0-9]+(s|m|h))+)|(Never)$ + type: string + nodeClassRef: + description: NodeClassRef is a reference to an object that defines provider specific configuration + properties: + group: + description: API version of the referent + pattern: ^[^/]*$ + type: string + kind: + description: 'Kind of the referent; More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds"' + type: string + name: + description: 'Name of the referent; More info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + required: + - group + - kind + - name + type: object + requirements: + description: Requirements are layered with GetLabels and applied to every node. + items: + description: |- + A node selector requirement with min values is a selector that contains values, a key, an operator that relates the key and values + and minValues that represent the requirement to have at least that many values. + properties: + key: + description: The label key that the selector applies to. 
+ type: string + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ + x-kubernetes-validations: + - message: label domain "kubernetes.io" is restricted + rule: self in ["beta.kubernetes.io/instance-type", "failure-domain.beta.kubernetes.io/region", "beta.kubernetes.io/os", "beta.kubernetes.io/arch", "failure-domain.beta.kubernetes.io/zone", "topology.kubernetes.io/zone", "topology.kubernetes.io/region", "node.kubernetes.io/instance-type", "kubernetes.io/arch", "kubernetes.io/os", "node.kubernetes.io/windows-build"] || self.find("^([^/]+)").endsWith("node.kubernetes.io") || self.find("^([^/]+)").endsWith("node-restriction.kubernetes.io") || !self.find("^([^/]+)").endsWith("kubernetes.io") + - message: label domain "k8s.io" is restricted + rule: self.find("^([^/]+)").endsWith("kops.k8s.io") || !self.find("^([^/]+)").endsWith("k8s.io") + - message: label domain "karpenter.sh" is restricted + rule: self in ["karpenter.sh/capacity-type", "karpenter.sh/nodepool"] || !self.find("^([^/]+)").endsWith("karpenter.sh") + - message: label "karpenter.sh/nodepool" is restricted + rule: self != "karpenter.sh/nodepool" + - message: label "kubernetes.io/hostname" is restricted + rule: self != "kubernetes.io/hostname" + - message: label domain "karpenter.k8s.aws" is restricted + rule: self in ["karpenter.k8s.aws/instance-encryption-in-transit-supported", "karpenter.k8s.aws/instance-category", "karpenter.k8s.aws/instance-hypervisor", "karpenter.k8s.aws/instance-family", "karpenter.k8s.aws/instance-generation", "karpenter.k8s.aws/instance-local-nvme", "karpenter.k8s.aws/instance-size", "karpenter.k8s.aws/instance-cpu","karpenter.k8s.aws/instance-cpu-manufacturer","karpenter.k8s.aws/instance-memory", "karpenter.k8s.aws/instance-ebs-bandwidth", "karpenter.k8s.aws/instance-network-bandwidth", "karpenter.k8s.aws/instance-gpu-name", "karpenter.k8s.aws/instance-gpu-manufacturer", 
"karpenter.k8s.aws/instance-gpu-count", "karpenter.k8s.aws/instance-gpu-memory", "karpenter.k8s.aws/instance-accelerator-name", "karpenter.k8s.aws/instance-accelerator-manufacturer", "karpenter.k8s.aws/instance-accelerator-count"] || !self.find("^([^/]+)").endsWith("karpenter.k8s.aws") + minValues: + description: |- + This field is ALPHA and can be dropped or replaced at any time + MinValues is the minimum number of unique values required to define the flexibility of the specific requirement. + maximum: 50 + minimum: 1 + type: integer + operator: + description: |- + Represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + enum: + - In + - NotIn + - Exists + - DoesNotExist + - Gt + - Lt + values: + description: |- + An array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. If the operator is Gt or Lt, the values + array must have a single element, which will be interpreted as an integer. + This array is replaced during a strategic merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + maxLength: 63 + pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$ + required: + - key + - operator + type: object + maxItems: 100 + type: array + x-kubernetes-validations: + - message: requirements with operator 'In' must have a value defined + rule: 'self.all(x, x.operator == ''In'' ? x.values.size() != 0 : true)' + - message: requirements operator 'Gt' or 'Lt' must have a single positive integer value + rule: 'self.all(x, (x.operator == ''Gt'' || x.operator == ''Lt'') ? (x.values.size() == 1 && int(x.values[0]) >= 0) : true)' + - message: requirements with 'minValues' must have at least that many values specified in the 'values' field + rule: 'self.all(x, (x.operator == ''In'' && has(x.minValues)) ? 
x.values.size() >= x.minValues : true)' + startupTaints: + description: |- + StartupTaints are taints that are applied to nodes upon startup which are expected to be removed automatically + within a short period of time, typically by a DaemonSet that tolerates the taint. These are commonly used by + daemonsets to allow initialization and enforce startup ordering. StartupTaints are ignored for provisioning + purposes in that pods are not required to tolerate a StartupTaint in order to have nodes provisioned for them. + items: + description: |- + The node this Taint is attached to has the "effect" on + any pod that does not tolerate the Taint. + properties: + effect: + description: |- + Required. The effect of the taint on pods + that do not tolerate the taint. + Valid effects are NoSchedule, PreferNoSchedule and NoExecute. + type: string + enum: + - NoSchedule + - PreferNoSchedule + - NoExecute + key: + description: Required. The taint key to be applied to a node. + type: string + minLength: 1 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ + timeAdded: + description: |- + TimeAdded represents the time at which the taint was added. + It is only written for NoExecute taints. + format: date-time + type: string + value: + description: The taint value corresponding to the taint key. + type: string + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ + required: + - effect + - key + type: object + type: array + taints: + description: Taints will be applied to the NodeClaim's node. + items: + description: |- + The node this Taint is attached to has the "effect" on + any pod that does not tolerate the Taint. + properties: + effect: + description: |- + Required. The effect of the taint on pods + that do not tolerate the taint. + Valid effects are NoSchedule, PreferNoSchedule and NoExecute. 
+ type: string + enum: + - NoSchedule + - PreferNoSchedule + - NoExecute + key: + description: Required. The taint key to be applied to a node. + type: string + minLength: 1 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ + timeAdded: + description: |- + TimeAdded represents the time at which the taint was added. + It is only written for NoExecute taints. + format: date-time + type: string + value: + description: The taint value corresponding to the taint key. + type: string + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ + required: + - effect + - key + type: object + type: array + terminationGracePeriod: + description: |- + TerminationGracePeriod is the maximum duration the controller will wait before forcefully deleting the pods on a node, measured from when deletion is first initiated. + + Warning: this feature takes precedence over a Pod's terminationGracePeriodSeconds value, and bypasses any blocked PDBs or the karpenter.sh/do-not-disrupt annotation. + + This field is intended to be used by cluster administrators to enforce that nodes can be cycled within a given time period. + When set, drifted nodes will begin draining even if there are pods blocking eviction. Draining will respect PDBs and the do-not-disrupt annotation until the TGP is reached. + + Karpenter will preemptively delete pods so their terminationGracePeriodSeconds align with the node's terminationGracePeriod. + If a pod would be terminated without being granted its full terminationGracePeriodSeconds prior to the node timeout, + that pod will be deleted at T = node timeout - pod terminationGracePeriodSeconds. + + The feature can also be used to allow maximum time limits for long-running jobs which can delay node termination with preStop hooks. + If left undefined, the controller will wait indefinitely for pods to be drained. 
+ pattern: ^([0-9]+(s|m|h))+$ + type: string + required: + - nodeClassRef + - requirements + type: object + required: + - spec + type: object + weight: + description: |- + Weight is the priority given to the nodepool during scheduling. A higher + numerical weight indicates that this nodepool will be ordered + ahead of other nodepools with lower weights. A nodepool with no weight + will be treated as if it is a nodepool with a weight of 0. + format: int32 + maximum: 100 + minimum: 1 + type: integer + required: + - template + type: object + status: + description: NodePoolStatus defines the observed state of NodePool + properties: + conditions: + description: Conditions contains signals for health and readiness + items: + description: Condition aliases the upstream type and adds additional helper methods + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. 
+ The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + resources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: Resources is the list of resources that have been provisioned. + type: object + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/karpenter-operator/controllers/karpenter/karpenter_controller.go b/karpenter-operator/controllers/karpenter/karpenter_controller.go new file mode 100644 index 0000000000..682b247bed --- /dev/null +++ b/karpenter-operator/controllers/karpenter/karpenter_controller.go @@ -0,0 +1,327 @@ +package karpenter + +import ( + "context" + "fmt" + "sort" + + hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1" + "github.com/openshift/hypershift/karpenter-operator/controllers/karpenter/assets" + supportassets "github.com/openshift/hypershift/support/assets" + "github.com/openshift/hypershift/support/upsert" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + 
"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/runtime/schema"
+	utilerrors "k8s.io/apimachinery/pkg/util/errors"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/cluster"
+	"sigs.k8s.io/controller-runtime/pkg/controller"
+	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
+	"sigs.k8s.io/controller-runtime/pkg/event"
+	"sigs.k8s.io/controller-runtime/pkg/handler"
+	"sigs.k8s.io/controller-runtime/pkg/source"
+)
+
+const (
+	karpenterFinalizer = "hypershift.openshift.io/karpenter-finalizer"
+	// userDataAMILabel is a label set in the userData secret generated for karpenter instances.
+	userDataAMILabel = "hypershift.openshift.io/ami"
+)
+
+// Karpenter CRDs applied guest side, loaded from embedded assets at package init.
+// NOTE(review): MustCRD presumably panics on a malformed asset (Must convention),
+// making startup fail fast — TODO confirm against support/assets.
+var (
+	crdEC2NodeClass = supportassets.MustCRD(assets.ReadFile, "karpenter.k8s.aws_ec2nodeclasses.yaml")
+	crdNodePool     = supportassets.MustCRD(assets.ReadFile, "karpenter.sh_nodepools.yaml")
+	crdNodeClaim    = supportassets.MustCRD(assets.ReadFile, "karpenter.sh_nodeclaims.yaml")
+)
+
+// Reconciler runs karpenter management side for a single hosted cluster: it
+// watches the guest cluster (CRDs, EC2NodeClass) and the management cluster
+// (karpenter Deployment) and converges both sides.
+type Reconciler struct {
+	ManagementClient          client.Client
+	GuestClient               client.Client
+	Namespace                 string
+	ControlPlaneOperatorImage string
+	upsert.CreateOrUpdateProvider
+}
+
+// SetupWithManager installs the karpenter CRDs guest side (so the watches below
+// can be created) and wires watches on those CRDs, on EC2NodeClass objects
+// guest side, and on the karpenter Deployment management side.
+func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, managementCluster cluster.Cluster) error {
+	r.ManagementClient = managementCluster.GetClient()
+	r.GuestClient = mgr.GetClient()
+	r.CreateOrUpdateProvider = upsert.New(false)
+
+	// First install the CRDs so we can create a watch below.
+	if err := r.reconcileCRDs(ctx, true); err != nil {
+		return err
+	}
+
+	c, err := controller.New("karpenter", mgr, controller.Options{Reconciler: r})
+	if err != nil {
+		return fmt.Errorf("failed to construct controller: %w", err)
+	}
+
+	// Watch CRDs guest side.
+	if err := c.Watch(source.Kind[client.Object](mgr.GetCache(), &apiextensionsv1.CustomResourceDefinition{}, handler.EnqueueRequestsFromMapFunc(
+		func(ctx context.Context, o client.Object) []ctrl.Request {
+			// Only watch our Karpenter CRDs
+			switch o.GetName() {
+			case "ec2nodeclasses.karpenter.k8s.aws",
+				"nodepools.karpenter.sh",
+				"nodeclaims.karpenter.sh":
+				return []ctrl.Request{{NamespacedName: client.ObjectKey{Namespace: r.Namespace}}}
+			}
+			return nil
+		}))); err != nil {
+		return fmt.Errorf("failed to watch CRDs: %w", err)
+	}
+
+	// Watch EC2NodeClass guest side.
+	if err := c.Watch(source.Kind[client.Object](mgr.GetCache(), &unstructured.Unstructured{Object: map[string]interface{}{
+		"apiVersion": "karpenter.k8s.aws/v1",
+		"kind":       "EC2NodeClass",
+	}}, &handler.EnqueueRequestForObject{})); err != nil {
+		return fmt.Errorf("failed to watch EC2NodeClass: %w", err)
+	}
+
+	// Watch the karpenter Deployment management side.
+	if err := c.Watch(source.Kind[client.Object](managementCluster.GetCache(), &appsv1.Deployment{}, handler.EnqueueRequestsFromMapFunc(
+		func(ctx context.Context, o client.Object) []ctrl.Request {
+			if o.GetNamespace() != r.Namespace || o.GetName() != "karpenter" {
+				return nil
+			}
+			return []ctrl.Request{{NamespacedName: client.ObjectKeyFromObject(o)}}
+		}))); err != nil {
+		return fmt.Errorf("failed to watch Deployment: %w", err)
+	}
+
+	// Trigger initial sync.
+	initialSync := make(chan event.GenericEvent)
+	if err := c.Watch(source.Channel(initialSync, &handler.EnqueueRequestForObject{})); err != nil {
+		return fmt.Errorf("failed to watch initial sync channel: %w", err)
+	}
+	// Fire one synthetic event so a first reconcile happens even before any
+	// watched resource changes.
+	go func() {
+		initialSync <- event.GenericEvent{Object: &hyperv1.HostedControlPlane{}}
+	}()
+
+	return nil
+}
+
+// Reconcile converges karpenter state for the HostedControlPlane found in
+// r.Namespace: it manages the karpenter finalizer on the HCP, the operator-owned
+// EC2NodeClass fields and default EC2NodeClass guest side, the karpenter
+// Deployment management side, and the karpenter CRDs guest side.
+func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	log := ctrl.LoggerFrom(ctx)
+	log.Info("Reconciling", "req", req)
+
+	hcp, err := r.getHCP(ctx)
+	if err != nil {
+		if apierrors.IsNotFound(err) {
+			return ctrl.Result{}, nil
+		}
+		return ctrl.Result{}, err
+	}
+	if hcp.DeletionTimestamp != nil {
+		// TODO(alberto): implement deletion. E.g. loop over nodeClaims delete them, wait and delete karpenter deployment.
+		if controllerutil.ContainsFinalizer(hcp, karpenterFinalizer) {
+			originalHCP := hcp.DeepCopy()
+			controllerutil.RemoveFinalizer(hcp, karpenterFinalizer)
+			// Optimistic lock so a concurrent update of the HCP fails the patch
+			// instead of silently clobbering it.
+			if err := r.ManagementClient.Patch(ctx, hcp, client.MergeFromWithOptions(originalHCP, client.MergeFromWithOptimisticLock{})); err != nil {
+				return ctrl.Result{}, fmt.Errorf("failed to remove finalizer from cluster: %w", err)
+			}
+		}
+		return ctrl.Result{}, nil
+	}
+	if !controllerutil.ContainsFinalizer(hcp, karpenterFinalizer) {
+		originalHCP := hcp.DeepCopy()
+		controllerutil.AddFinalizer(hcp, karpenterFinalizer)
+		if err := r.ManagementClient.Patch(ctx, hcp, client.MergeFromWithOptions(originalHCP, client.MergeFromWithOptimisticLock{})); err != nil {
+			return ctrl.Result{}, fmt.Errorf("failed to add finalizer to hostedControlPlane: %w", err)
+		}
+	}
+
+	// TODO(alberto):
+	// - reconcile validatingAdmissionPolicy to enforce shared ownership.
+	// - Watch userDataSecret.
+	// - Solve token rotation causing drift.
+	// - CSR approval.
+
+	userDataSecret, err := r.getUserDataSecret(ctx, hcp)
+	if err != nil {
+		return ctrl.Result{}, err
+	}
+
+	if err := r.reconcileEC2NodeClassOwnedFields(ctx, userDataSecret); err != nil {
+		return ctrl.Result{}, err
+	}
+
+	if err := r.reconcileEC2NodeClassDefault(ctx, userDataSecret, hcp); err != nil {
+		return ctrl.Result{}, err
+	}
+
+	if err := r.reconcileKarpenter(ctx, hcp); err != nil {
+		return ctrl.Result{}, fmt.Errorf("failed to reconcile karpenter deployment: %w", err)
+	}
+
+	if err := r.reconcileCRDs(ctx, false); err != nil {
+		return ctrl.Result{}, err
+	}
+
+	return ctrl.Result{}, nil
+}
+
+// reconcileCRDs reconciles the Karpenter CRDs. If onlyCreate is true it only
+// creates them (ignoring AlreadyExists) via the write-only, non-cached client
+// path; otherwise it runs a full CreateOrUpdate on each CRD.
+func (r *Reconciler) reconcileCRDs(ctx context.Context, onlyCreate bool) error {
+	log := ctrl.LoggerFrom(ctx)
+
+	errs := []error{}
+	var op controllerutil.OperationResult
+	var err error
+	for _, crd := range []*apiextensionsv1.CustomResourceDefinition{
+		crdEC2NodeClass,
+		crdNodePool,
+		crdNodeClaim,
+	} {
+		if onlyCreate {
+			if err := r.GuestClient.Create(ctx, crd); err != nil {
+				if !apierrors.IsAlreadyExists(err) {
+					errs = append(errs, err)
+				}
+			}
+		} else {
+			op, err = r.CreateOrUpdate(ctx, r.GuestClient, crd, func() error {
+				return nil
+			})
+			if err != nil {
+				errs = append(errs, err)
+			}
+
+		}
+	}
+	if err := utilerrors.NewAggregate(errs); err != nil {
+		return fmt.Errorf("failed to reconcile CRDs: %w", err)
+	}
+	// NOTE(review): op only reflects the last CRD processed in the loop above.
+	log.Info("Reconciled CRDs", "op", op)
+
+	return nil
+}
+
+// reconcileEC2NodeClassOwnedFields overwrites the operator-owned fields
+// (userData, amiFamily, amiSelectorTerms) on every EC2NodeClass guest side,
+// sourcing the AMI id and userData from the given userData secret.
+func (r *Reconciler) reconcileEC2NodeClassOwnedFields(ctx context.Context, userDataSecret *corev1.Secret) error {
+	log := ctrl.LoggerFrom(ctx)
+
+	ec2NodeClassList := &unstructured.UnstructuredList{}
+	ec2NodeClassList.SetGroupVersionKind(schema.GroupVersionKind{
+		Group:   "karpenter.k8s.aws",
+		Version: "v1",
+		Kind:    "EC2NodeClassList",
+	})
+	err := r.GuestClient.List(ctx, ec2NodeClassList)
+	if err != nil {
+		return fmt.Errorf("failed to get EC2NodeClassList: %w", err)
+	}
+
+	errs := []error{}
+	for _, ec2NodeClass := range ec2NodeClassList.Items {
+		ec2NodeClass.SetGroupVersionKind(schema.GroupVersionKind{
+			Group:   "karpenter.k8s.aws",
+			Version: "v1",
+			Kind:    "EC2NodeClass",
+		})
+		// NOTE(review): the type assertions below assume .spec is always present
+		// and a map; an EC2NodeClass admitted without spec would panic here —
+		// TODO confirm the CRD schema guarantees spec.
+		op, err := r.CreateOrUpdate(ctx, r.GuestClient, &ec2NodeClass, func() error {
+			ec2NodeClass.Object["spec"].(map[string]interface{})["userData"] = string(userDataSecret.Data["value"])
+			ec2NodeClass.Object["spec"].(map[string]interface{})["amiFamily"] = "Custom"
+			ec2NodeClass.Object["spec"].(map[string]interface{})["amiSelectorTerms"] = []map[string]interface{}{
+				{
+					"id": string(userDataSecret.Labels[userDataAMILabel]),
+				},
+			}
+			return nil
+		})
+		if err != nil {
+			errs = append(errs, err)
+		}
+		if err == nil {
+			log.Info("Set managed fields in ec2NodeClass", "ec2NodeClass", ec2NodeClass.GetName(), "op", op)
+		}
+	}
+	if err := utilerrors.NewAggregate(errs); err != nil {
+		return fmt.Errorf("failed to update EC2NodeClass: %w", err)
+	}
+	return nil
+}
+
+// reconcileEC2NodeClassDefault ensures a "default" EC2NodeClass exists guest
+// side, pointing at the AMI/userData from the userData secret and selecting
+// subnets and security groups via the karpenter.sh/discovery tag for this infraID.
+func (r *Reconciler) reconcileEC2NodeClassDefault(ctx context.Context, userDataSecret *corev1.Secret, hcp *hyperv1.HostedControlPlane) error {
+	log := ctrl.LoggerFrom(ctx)
+
+	// Create an unstructured object for the EC2NodeClass
+	ec2NodeClass := &unstructured.Unstructured{}
+	ec2NodeClass.SetGroupVersionKind(schema.GroupVersionKind{
+		Group:   "karpenter.k8s.aws",
+		Version: "v1",
+		Kind:    "EC2NodeClass",
+	})
+	ec2NodeClass.SetName("default")
+
+	op, err := r.CreateOrUpdate(ctx, r.GuestClient, ec2NodeClass, func() error {
+		// The whole spec is rebuilt here, so any user edits to the default
+		// EC2NodeClass spec are overwritten on each reconcile.
+		ec2NodeClass.Object["spec"] = map[string]interface{}{}
+		ec2NodeClass.Object["spec"].(map[string]interface{})["role"] = "KarpenterNodeRole-agl" // TODO(alberto): set a convention for this e.g. openshift-karpenter-infraID
+		ec2NodeClass.Object["spec"].(map[string]interface{})["userData"] = string(userDataSecret.Data["value"])
+		ec2NodeClass.Object["spec"].(map[string]interface{})["amiFamily"] = "Custom"
+		ec2NodeClass.Object["spec"].(map[string]interface{})["amiSelectorTerms"] = []map[string]interface{}{
+			{
+				"id": string(userDataSecret.Labels[userDataAMILabel]),
+			},
+		}
+		ec2NodeClass.Object["spec"].(map[string]interface{})["subnetSelectorTerms"] = []map[string]interface{}{
+			{
+				"tags": map[string]interface{}{
+					"karpenter.sh/discovery": hcp.Spec.InfraID,
+				},
+			},
+		}
+		ec2NodeClass.Object["spec"].(map[string]interface{})["securityGroupSelectorTerms"] = []map[string]interface{}{
+			{
+				"tags": map[string]interface{}{
+					"karpenter.sh/discovery": hcp.Spec.InfraID,
+				},
+			},
+		}
+		return nil
+	})
+	if err != nil {
+		return fmt.Errorf("failed to reconcile default EC2NodeClass: %w", err)
+	}
+	log.Info("Reconciled default EC2NodeClass", "op", op)
+	return nil
+}
+
+// getUserDataSecret returns the newest secret in r.Namespace labeled with the
+// "<hcp-name>-karpenter" NodePool label, or an error if none exist.
+func (r *Reconciler) getUserDataSecret(ctx context.Context, hcp *hyperv1.HostedControlPlane) (*corev1.Secret, error) {
+	labelSelector := labels.SelectorFromSet(labels.Set{hyperv1.NodePoolLabel: fmt.Sprintf("%s-karpenter", hcp.GetName())})
+	listOptions := &client.ListOptions{
+		LabelSelector: labelSelector,
+		Namespace:     r.Namespace,
+	}
+	secretList := &corev1.SecretList{}
+	err := r.ManagementClient.List(ctx, secretList, listOptions)
+	if err != nil {
+		return nil, fmt.Errorf("failed to list secrets: %w", err)
+	}
+
+	// Sort newest first so the most recently generated userData wins.
+	sort.Slice(secretList.Items, func(i, j int) bool {
+		return secretList.Items[i].CreationTimestamp.After(secretList.Items[j].CreationTimestamp.Time)
+	})
+	if len(secretList.Items) < 1 {
+		return nil, fmt.Errorf("expected 1 secret, got 0")
+	}
+	return &secretList.Items[0], err
+}
+
+// getHCP returns the HostedControlPlane living in r.Namespace (a single one is
+// expected; the first list item is used).
+func (r *Reconciler) getHCP(ctx context.Context) (*hyperv1.HostedControlPlane, error) {
+	hcpList := &hyperv1.HostedControlPlaneList{}
+	if err := r.ManagementClient.List(ctx, hcpList,
client.InNamespace(r.Namespace)); err != nil { + return nil, err + } + if len(hcpList.Items) == 0 { + return nil, fmt.Errorf("failed to find HostedControlPlane in namespace %s", r.Namespace) + } + + return &hcpList.Items[0], nil +} diff --git a/karpenter-operator/controllers/karpenter/karpenter_controller_test.go b/karpenter-operator/controllers/karpenter/karpenter_controller_test.go new file mode 100644 index 0000000000..6bb74ba742 --- /dev/null +++ b/karpenter-operator/controllers/karpenter/karpenter_controller_test.go @@ -0,0 +1,226 @@ +package karpenter + +import ( + "context" + "testing" + "time" + + . "github.com/onsi/gomega" + hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1" + "github.com/openshift/hypershift/support/upsert" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestReconcileEC2NodeClassDefault(t *testing.T) { + scheme := runtime.NewScheme() + // _ = corev1.AddToScheme(scheme) + _ = hyperv1.AddToScheme(scheme) + + // Register the EC2NodeClass GVK in the scheme + ec2NodeClassGVK := schema.GroupVersionKind{ + Group: "karpenter.k8s.aws", + Version: "v1", + Kind: "EC2NodeClass", + } + scheme.AddKnownTypeWithName(ec2NodeClassGVK, &unstructured.Unstructured{}) + scheme.AddKnownTypeWithName( + schema.GroupVersionKind{ + Group: "karpenter.k8s.aws", + Version: "v1", + Kind: "EC2NodeClassList", + }, + &unstructured.UnstructuredList{}, + ) + + testCases := []struct { + name string + userDataSecret *corev1.Secret + hcp *hyperv1.HostedControlPlane + wantErr bool + }{ + { + name: "When no errors it should create the default EC2NodeClass", + userDataSecret: &corev1.Secret{ + Data: map[string][]byte{ + "value": []byte("test-userdata"), + }, + 
ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + userDataAMILabel: "ami-123", + }, + }, + }, + hcp: &hyperv1.HostedControlPlane{ + Spec: hyperv1.HostedControlPlaneSpec{ + InfraID: "test-infra", + }, + }, + wantErr: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + g := NewWithT(t) + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + Build() + + r := &Reconciler{ + GuestClient: fakeClient, + CreateOrUpdateProvider: upsert.New(false), + } + + err := r.reconcileEC2NodeClassDefault(context.Background(), tc.userDataSecret, tc.hcp) + if (err != nil) != tc.wantErr { + t.Errorf("reconcileEC2NodeClassDefault() error = %v, wantErr %v", err, tc.wantErr) + return + } + + // Verify the EC2NodeClass was created. + got := &unstructured.Unstructured{} + got.SetGroupVersionKind(ec2NodeClassGVK) + + err = fakeClient.Get(context.Background(), types.NamespacedName{Name: "default"}, got) + if err != nil { + t.Errorf("failed to get EC2NodeClass: %v", err) + return + } + + spec, ok := got.Object["spec"].(map[string]interface{}) + if !ok { + t.Fatal("spec is not a map") + } + + // Verify basic fields + g.Expect(spec["role"]).To(Equal("KarpenterNodeRole-agl"), "role = %v, want KarpenterNodeRole-agl", spec["role"]) + g.Expect(spec["userData"]).To(Equal("test-userdata"), "userData = %v, want test-userdata", spec["userData"]) + g.Expect(spec["amiFamily"]).To(Equal("Custom"), "amiFamily = %v, want Custom", spec["amiFamily"]) + + // Verify amiSelectorTerms + amiTerms, ok := spec["amiSelectorTerms"].([]interface{}) + g.Expect(ok).To(BeTrue(), "amiSelectorTerms should be a slice") + g.Expect(len(amiTerms)).To(Equal(1), "amiSelectorTerms should have exactly one element") + + amiTerm, ok := amiTerms[0].(map[string]interface{}) + g.Expect(ok).To(BeTrue(), "amiTerm should be a map") + g.Expect(amiTerm["id"]).To(Equal("ami-123"), "unexpected amiSelectorTerms: %v", amiTerms) + + // Verify selector terms have correct tags + expectedTag := 
map[string]interface{}{ + "karpenter.sh/discovery": "test-infra", + } + + // Helper function to verify selector terms + verifySelectorTerms := func(field string, expectedTags map[string]interface{}) { + terms, ok := spec[field].([]interface{}) + g.Expect(ok).To(BeTrue(), "terms should be a slice for field %s", field) + g.Expect(len(terms)).To(Equal(1), "terms should have exactly one element for field %s", field) + + term, ok := terms[0].(map[string]interface{}) + g.Expect(ok).To(BeTrue(), "term should be a map for field %s", field) + + tags, ok := term["tags"].(map[string]interface{}) + g.Expect(ok).To(BeTrue(), "tags should be a map for field %s", field) + g.Expect(tags).To(Equal(expectedTags), "%s tags = %v, want %v", field, tags, expectedTags) + } + + verifySelectorTerms("subnetSelectorTerms", expectedTag) + verifySelectorTerms("securityGroupSelectorTerms", expectedTag) + }) + } +} + +func TestGetUserDataSecret(t *testing.T) { + g := NewWithT(t) + + scheme := runtime.NewScheme() + g.Expect(corev1.AddToScheme(scheme)).To(Succeed()) + + testCases := []struct { + name string + namespace string + hcp *hyperv1.HostedControlPlane + objects []client.Object + expectedError string + }{ + { + name: "when multiple exist it should return newest secret", + namespace: "test-namespace", + hcp: &hyperv1.HostedControlPlane{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hcp", + }, + }, + objects: []client.Object{ + &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "older-secret", + Namespace: "test-namespace", + CreationTimestamp: metav1.Time{Time: time.Now().Add(-1 * time.Hour)}, + Labels: map[string]string{ + hyperv1.NodePoolLabel: "test-hcp-karpenter", + }, + }, + }, + &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "newer-secret", + Namespace: "test-namespace", + CreationTimestamp: metav1.Time{Time: time.Now()}, + Labels: map[string]string{ + hyperv1.NodePoolLabel: "test-hcp-karpenter", + }, + }, + }, + }, + }, + { + name: "when no secrets exist it should 
return error", + namespace: "test-namespace", + hcp: &hyperv1.HostedControlPlane{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-hcp", + }, + }, + objects: []client.Object{}, + expectedError: "expected 1 secret, got 0", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + g := NewWithT(t) + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(tc.objects...). + Build() + + r := &Reconciler{ + ManagementClient: fakeClient, + Namespace: tc.namespace, + } + + secret, err := r.getUserDataSecret(context.Background(), tc.hcp) + + if tc.expectedError != "" { + g.Expect(err).To(MatchError(tc.expectedError)) + g.Expect(secret).To(BeNil()) + return + } + + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(secret).NotTo(BeNil()) + + g.Expect(secret.Name).To(Equal("newer-secret")) + }) + } +} diff --git a/karpenter-operator/controllers/karpenter/manifests.go b/karpenter-operator/controllers/karpenter/manifests.go new file mode 100644 index 0000000000..b2d75b171a --- /dev/null +++ b/karpenter-operator/controllers/karpenter/manifests.go @@ -0,0 +1,420 @@ +package karpenter + +import ( + "context" + "fmt" + + hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1" + "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/kas" + "github.com/openshift/hypershift/hypershift-operator/controllers/manifests/controlplaneoperator" + "github.com/openshift/hypershift/support/config" + "github.com/openshift/hypershift/support/util" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + intstr "k8s.io/apimachinery/pkg/util/intstr" + k8sutilspointer "k8s.io/utils/ptr" + client "sigs.k8s.io/controller-runtime/pkg/client" +) + +func KarpenterDeployment(namespace string) *appsv1.Deployment { + return &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + 
Name: "karpenter", + }, + } +} + +func KarpenterServiceAccount(controlPlaneNamespace string) *corev1.ServiceAccount { + return &corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: controlPlaneNamespace, + Name: "karpenter", + }, + } +} + +func KarpenterRole(controlPlaneNamespace string) *rbacv1.Role { + return &rbacv1.Role{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: controlPlaneNamespace, + Name: "karpenter", + }, + } +} + +func KarpenterRoleBinding(controlPlaneNamespace string) *rbacv1.RoleBinding { + return &rbacv1.RoleBinding{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: controlPlaneNamespace, + Name: "karpenter", + }, + } +} + +func karpenterSelector() map[string]string { + return map[string]string{ + "karpenter": "karpenter", + } +} + +func ReconcileKarpenterDeployment(deployment *appsv1.Deployment, + hcp *hyperv1.HostedControlPlane, + sa *corev1.ServiceAccount, + kubeConfigSecret *corev1.Secret, + availabilityProberImage, tokenMinterImage string, + credentialsSecret *corev1.Secret, + setDefaultSecurityContext bool, + ownerRef config.OwnerRef) error { + + deployment.Spec = appsv1.DeploymentSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: karpenterSelector(), + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: karpenterSelector(), + }, + Spec: corev1.PodSpec{ + ServiceAccountName: sa.Name, + TerminationGracePeriodSeconds: k8sutilspointer.To(int64(10)), + Tolerations: []corev1.Toleration{ + { + Key: "node-role.kubernetes.io/master", + Effect: corev1.TaintEffectNoSchedule, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "target-kubeconfig", + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{ + SecretName: kubeConfigSecret.Name, + DefaultMode: k8sutilspointer.To(int32(0640)), + Items: []corev1.KeyToPath{ + { + Key: "value", + Path: "target-kubeconfig", + }, + }, + }, + }, + }, + { + Name: "serviceaccount-token", + VolumeSource: corev1.VolumeSource{ + EmptyDir: 
&corev1.EmptyDirVolumeSource{}, + }, + }, + { + Name: "provider-creds", + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{ + SecretName: credentialsSecret.Name, + }, + }, + }, + }, + Containers: []corev1.Container{ + { + Name: "karpenter", + // TODO(alberto): lifecycle this image. + Image: "public.ecr.aws/karpenter/controller:1.0.7", + ImagePullPolicy: corev1.PullAlways, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "target-kubeconfig", + MountPath: "/mnt/kubeconfig", + }, + { + Name: "provider-creds", + MountPath: "/etc/provider", + }, + { + Name: "serviceaccount-token", + MountPath: "/var/run/secrets/openshift/serviceaccount", + }, + }, + Env: []corev1.EnvVar{ + { + Name: "KUBECONFIG", + Value: "/mnt/kubeconfig/target-kubeconfig", + }, + { + Name: "SYSTEM_NAMESPACE", + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: "metadata.namespace", + }, + }, + }, + { + Name: "DISABLE_WEBHOOK", + Value: "true", + }, + { + Name: "DISABLE_LEADER_ELECTION", + Value: "true", + }, + { + Name: "FEATURE_GATES", + Value: "Drift=true", + }, + { + Name: "AWS_REGION", + Value: "us-east-1", + }, + { + Name: "AWS_SHARED_CREDENTIALS_FILE", + Value: "/etc/provider/credentials", + }, + { + Name: "AWS_SDK_LOAD_CONFIG", + Value: "true", + }, + { + Name: "HEALTH_PROBE_PORT", + Value: "8081", + }, + // TODO (alberto): this is to satisfy current karpenter requirements. We should relax the req. 
+ { + Name: "CLUSTER_ENDPOINT", + Value: "https://fake.com", + }, + { + Name: "CLUSTER_NAME", + Value: "none", + }, + }, + // Command: []string{""}, + Args: []string{ + "--log-level=debug", + }, + LivenessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{ + HTTPGet: &corev1.HTTPGetAction{ + Path: "/healthz", + Port: intstr.FromString("http"), + Scheme: corev1.URISchemeHTTP, + }, + }, + InitialDelaySeconds: 60, + PeriodSeconds: 60, + SuccessThreshold: 1, + FailureThreshold: 5, + TimeoutSeconds: 5, + }, + + ReadinessProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{ + HTTPGet: &corev1.HTTPGetAction{ + Path: "/readyz", + Port: intstr.FromString("http"), + Scheme: corev1.URISchemeHTTP, + }, + }, + PeriodSeconds: 10, + SuccessThreshold: 1, + FailureThreshold: 3, + TimeoutSeconds: 5, + }, + Ports: []corev1.ContainerPort{ + { + Name: "metrics", + ContainerPort: 8000, + }, + { + Name: "http", + ContainerPort: 8081, + Protocol: corev1.ProtocolTCP, + }, + }, + }, + { + Name: "token-minter", + Command: []string{"/usr/bin/control-plane-operator", "token-minter"}, + Args: []string{ + "--service-account-namespace=kube-system", + "--service-account-name=karpenter", + "--token-file=/var/run/secrets/openshift/serviceaccount/token", + "--kubeconfig=/mnt/kubeconfig/target-kubeconfig", + }, + Resources: corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("10m"), + corev1.ResourceMemory: resource.MustParse("10Mi"), + }, + }, + Image: tokenMinterImage, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "target-kubeconfig", + MountPath: "/mnt/kubeconfig", + }, + { + Name: "serviceaccount-token", + MountPath: "/var/run/secrets/openshift/serviceaccount", + }, + }, + }, + }, + }, + }, + } + + util.AvailabilityProber(kas.InClusterKASReadyURL(hcp.Spec.Platform.Type), availabilityProberImage, &deployment.Spec.Template.Spec) + deploymentConfig := config.DeploymentConfig{ + AdditionalLabels: map[string]string{ + 
config.NeedManagementKASAccessLabel: "true",
+		},
+		Scheduling: config.Scheduling{
+			PriorityClass: config.DefaultPriorityClass,
+		},
+		SetDefaultSecurityContext: setDefaultSecurityContext,
+	}
+	// Allow the HCP annotation to override the default priority class.
+	if hcp.Annotations[hyperv1.ControlPlanePriorityClass] != "" {
+		deploymentConfig.Scheduling.PriorityClass = hcp.Annotations[hyperv1.ControlPlanePriorityClass]
+	}
+
+	replicas := k8sutilspointer.To(1)
+	deploymentConfig.SetDefaults(hcp, nil, replicas)
+	deploymentConfig.SetRestartAnnotation(hcp.ObjectMeta)
+	deploymentConfig.ApplyTo(deployment)
+
+	return nil
+}
+
+// ReconcileKarpenterRole sets the RBAC rules the karpenter ServiceAccount needs
+// management side: lease get/watch/create plus patch/update restricted to the
+// "karpenter-leader-election" lease.
+func ReconcileKarpenterRole(role *rbacv1.Role, owner config.OwnerRef) error {
+	owner.ApplyTo(role)
+	role.Rules = []rbacv1.PolicyRule{
+		{
+			APIGroups: []string{"coordination.k8s.io"},
+			Resources: []string{
+				"leases",
+			},
+			Verbs: []string{
+				"get",
+				"watch",
+				"create",
+			},
+		},
+		{
+			APIGroups: []string{"coordination.k8s.io"},
+			Resources: []string{
+				"leases",
+			},
+			Verbs: []string{
+				"patch",
+				"update",
+			},
+			ResourceNames: []string{
+				"karpenter-leader-election",
+			},
+		},
+	}
+	return nil
+}
+
+// ReconcileKarpenterRoleBinding binds the karpenter Role to the karpenter
+// ServiceAccount in the control plane namespace.
+func ReconcileKarpenterRoleBinding(binding *rbacv1.RoleBinding, role *rbacv1.Role, sa *corev1.ServiceAccount, owner config.OwnerRef) error {
+	owner.ApplyTo(binding)
+	binding.RoleRef = rbacv1.RoleRef{
+		APIGroup: "rbac.authorization.k8s.io",
+		Kind:     "Role",
+		Name:     role.Name,
+	}
+
+	binding.Subjects = []rbacv1.Subject{
+		{
+			Kind:      "ServiceAccount",
+			Name:      sa.Name,
+			Namespace: sa.Namespace,
+		},
+	}
+
+	return nil
+}
+
+// reconcileKarpenter orchestrates reconciliation of karpenter components.
+func (r *Reconciler) reconcileKarpenter(ctx context.Context, hcp *hyperv1.HostedControlPlane) error {
+	createOrUpdate := r.CreateOrUpdate
+	c := r.ManagementClient
+	ownerRef := config.OwnerRefFrom(hcp)
+	setDefaultSecurityContext := false
+	// The control plane operator image is reused for both sidecars.
+	availabilityProberImage := r.ControlPlaneOperatorImage
+	tokenMinterImage := r.ControlPlaneOperatorImage
+
+	role := KarpenterRole(hcp.Namespace)
+	_, err := createOrUpdate(ctx, c, role, func() error {
+		return ReconcileKarpenterRole(role, ownerRef)
+	})
+	if err != nil {
+		return fmt.Errorf("failed to reconcile karpenter role: %w", err)
+	}
+
+	serviceAccount := KarpenterServiceAccount(hcp.Namespace)
+	_, err = createOrUpdate(ctx, c, serviceAccount, func() error {
+		util.EnsurePullSecret(serviceAccount, controlplaneoperator.PullSecret("").Name)
+		ownerRef.ApplyTo(serviceAccount)
+		return nil
+	})
+	if err != nil {
+		return fmt.Errorf("failed to reconcile karpenter service account: %w", err)
+	}
+
+	roleBinding := KarpenterRoleBinding(hcp.Namespace)
+	_, err = createOrUpdate(ctx, c, roleBinding, func() error {
+		return ReconcileKarpenterRoleBinding(roleBinding, role, serviceAccount, ownerRef)
+	})
+	if err != nil {
+		return fmt.Errorf("failed to reconcile karpenter role binding: %w", err)
+	}
+
+	// AWS credentials template wiring STS web identity via the token-minter token.
+	awsCredentialsTemplate := `[default]
+	role_arn = %s
+	web_identity_token_file = /var/run/secrets/openshift/serviceaccount/token
+	sts_regional_endpoints = regional
+`
+	arn := hcp.Spec.AutoNode.Provisioner.Karpenter.AWS.RoleARN
+	credentialsSecret := &corev1.Secret{
+		ObjectMeta: metav1.ObjectMeta{
+			Namespace: hcp.Namespace,
+			Name:      "karpenter-credentials",
+		},
+	}
+	if _, err := createOrUpdate(ctx, c, credentialsSecret, func() error {
+		credentials := fmt.Sprintf(awsCredentialsTemplate, arn)
+		credentialsSecret.Data = map[string][]byte{"credentials": []byte(credentials)}
+		credentialsSecret.Type = corev1.SecretTypeOpaque
+		return nil
+	}); err != nil {
+		return fmt.Errorf("failed to reconcile karpenter credentials secret %s/%s: %w", credentialsSecret.Namespace, credentialsSecret.Name, err)
+	}
+
+	// The deployment depends on the kubeconfig being reported.
+	if hcp.Status.KubeConfig != nil {
+		// Resolve the kubeconfig secret for CAPI which is used for karpenter for convenience
+		capiKubeConfigSecret := &corev1.Secret{
+			ObjectMeta: metav1.ObjectMeta{
+				Namespace: hcp.Namespace,
+				Name:      fmt.Sprintf("%s-kubeconfig", hcp.Spec.InfraID),
+			},
+		}
+		err = c.Get(ctx, client.ObjectKeyFromObject(capiKubeConfigSecret), capiKubeConfigSecret)
+		if err != nil {
+			return fmt.Errorf("failed to get hosted controlplane kubeconfig secret %q: %w", capiKubeConfigSecret.Name, err)
+		}
+
+		deployment := KarpenterDeployment(hcp.Namespace)
+		_, err = createOrUpdate(ctx, c, deployment, func() error {
+			return ReconcileKarpenterDeployment(deployment, hcp, serviceAccount, capiKubeConfigSecret, availabilityProberImage, tokenMinterImage, credentialsSecret, setDefaultSecurityContext, ownerRef)
+		})
+		if err != nil {
+			return fmt.Errorf("failed to reconcile karpenter deployment: %w", err)
+		}
+	}
+
+	return nil
+}
diff --git a/karpenter-operator/main.go b/karpenter-operator/main.go
new file mode 100644
index 0000000000..3a949800c2
--- /dev/null
+++ b/karpenter-operator/main.go
@@ -0,0 +1,124 @@
+package main
+
+import (
+	"context"
+	"fmt"
+	"os"
+
+	"github.com/openshift/hypershift/karpenter-operator/controllers/karpenter"
+	hyperapi "github.com/openshift/hypershift/support/api"
+	"github.com/spf13/cobra"
+	"k8s.io/client-go/rest"
+	"k8s.io/client-go/tools/clientcmd"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/cache"
+	"sigs.k8s.io/controller-runtime/pkg/cluster"
+	"sigs.k8s.io/controller-runtime/pkg/healthz"
+	"sigs.k8s.io/controller-runtime/pkg/log/zap"
+)
+
+var (
+	setupLog = ctrl.Log.WithName("setup")
+
+	targetKubeconfig          string
+	namespace                 string
+	controlPlaneOperatorImage string
+)
+
+func main() {
+	var rootCmd = &cobra.Command{
+		Use:   "karpenter-operator",
+		Short:
"Karpenter Operator is a Kubernetes operator for managing Karpenter", + Run: func(cmd *cobra.Command, args []string) { + opts := zap.Options{ + Development: true, + } + // opts.BindFlags(flag.CommandLine) + // flag.Parse() + ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + + if err := run(ctrl.SetupSignalHandler()); err != nil { + setupLog.Error(err, "unable to start manager") + os.Exit(1) + } + }, + } + + rootCmd.PersistentFlags().StringVar(&targetKubeconfig, "target-kubeconfig", "", "Path to guest side kubeconfig file. Where the karpenter CRs (nodeClaim, nodePool, nodeClass) live") + rootCmd.PersistentFlags().StringVar(&namespace, "namespace", "", "The namespace to infer input for reconciliation, e.g the userData secret") + rootCmd.PersistentFlags().StringVar(&controlPlaneOperatorImage, "control-plane-operator-image", "", "The image to run the tokenMinter and the availability prober") + rootCmd.MarkPersistentFlagRequired("target-kubeconfig") + rootCmd.MarkPersistentFlagRequired("namespace") + rootCmd.MarkPersistentFlagRequired("control-plane-operator-image") + + if err := rootCmd.Execute(); err != nil { + setupLog.Error(err, "problem executing command") + os.Exit(1) + } +} + +func run(ctx context.Context) error { + managementKubeconfig, err := ctrl.GetConfig() + if err != nil { + return err + } + managementCluster, err := cluster.New(managementKubeconfig, func(opt *cluster.Options) { + opt.Cache = cache.Options{ + DefaultNamespaces: map[string]cache.Config{namespace: {}}, + Scheme: hyperapi.Scheme, + } + opt.Scheme = hyperapi.Scheme + }) + if err != nil { + return err + } + + guestKubeconfig, err := kubeconfigFromFile(targetKubeconfig) + if err != nil { + return fmt.Errorf("failed to create guest kubeconfig: %w", err) + } + + mgr, err := ctrl.NewManager(guestKubeconfig, ctrl.Options{ + Scheme: hyperapi.Scheme, + LeaderElection: false, + }) + if err != nil { + return fmt.Errorf("failed to create manager: %w", err) + } + + if err := mgr.Add(managementCluster); 
err != nil { + return fmt.Errorf("failed to add managementCluster to controller runtime manager: %v", err) + } + + // Add health check endpoints + if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { + return fmt.Errorf("failed to setup healthz check: %w", err) + } + if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { + return fmt.Errorf("failed to setup readyz check: %w", err) + } + + r := karpenter.Reconciler{ + Namespace: namespace, + ControlPlaneOperatorImage: controlPlaneOperatorImage, + } + if err := r.SetupWithManager(ctx, mgr, managementCluster); err != nil { + return fmt.Errorf("failed to setup controller with manager: %w", err) + } + + if err := mgr.Start(ctx); err != nil { + return fmt.Errorf("failed to start manager: %w", err) + } + + return nil +} + +func kubeconfigFromFile(path string) (*rest.Config, error) { + cfg, err := clientcmd.NewNonInteractiveDeferredLoadingClientConfig( + &clientcmd.ClientConfigLoadingRules{ExplicitPath: path}, + &clientcmd.ConfigOverrides{}).ClientConfig() + if err != nil { + return nil, fmt.Errorf("failed to construct kubeconfig from path %s: %w", path, err) + } + return cfg, nil +} diff --git a/karpenter-operator/manifests/operator.go b/karpenter-operator/manifests/operator.go new file mode 100644 index 0000000000..e25031aa4e --- /dev/null +++ b/karpenter-operator/manifests/operator.go @@ -0,0 +1,367 @@ +package manifests + +import ( + "context" + "fmt" + + hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1" + "github.com/openshift/hypershift/control-plane-operator/controllers/hostedcontrolplane/kas" + "github.com/openshift/hypershift/hypershift-operator/controllers/manifests/controlplaneoperator" + "github.com/openshift/hypershift/support/config" + "github.com/openshift/hypershift/support/upsert" + "github.com/openshift/hypershift/support/util" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 
+	intstr "k8s.io/apimachinery/pkg/util/intstr"
+	k8sutilspointer "k8s.io/utils/ptr"
+	client "sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+const (
+	name = "karpenter-operator"
+)
+
+func KarpenterOperatorDeployment(namespace string) *appsv1.Deployment {
+	return &appsv1.Deployment{
+		ObjectMeta: metav1.ObjectMeta{
+			Namespace: namespace,
+			Name:      name,
+		},
+	}
+}
+
+func KarpenterOperatorServiceAccount(controlPlaneNamespace string) *corev1.ServiceAccount {
+	return &corev1.ServiceAccount{
+		ObjectMeta: metav1.ObjectMeta{
+			Namespace: controlPlaneNamespace,
+			Name:      name,
+		},
+	}
+}
+
+func KarpenterOperatorRole(controlPlaneNamespace string) *rbacv1.Role {
+	return &rbacv1.Role{
+		ObjectMeta: metav1.ObjectMeta{
+			Namespace: controlPlaneNamespace,
+			Name:      name,
+		},
+	}
+}
+
+func KarpenterOperatorRoleBinding(controlPlaneNamespace string) *rbacv1.RoleBinding {
+	return &rbacv1.RoleBinding{
+		ObjectMeta: metav1.ObjectMeta{
+			Namespace: controlPlaneNamespace,
+			Name:      name,
+		},
+	}
+}
+
+func karpenterOperatorSelector() map[string]string {
+	return map[string]string{
+		"karpenter": "karpenter",
+	}
+}
+
+func ReconcileKarpenterOperatorDeployment(deployment *appsv1.Deployment,
+	hcp *hyperv1.HostedControlPlane,
+	sa *corev1.ServiceAccount,
+	kubeConfigSecret *corev1.Secret,
+	hypershiftOperatorImage string,
+	controlPlaneOperatorImage string,
+	setDefaultSecurityContext bool,
+	ownerRef config.OwnerRef) error {
+
+	deployment.Spec = appsv1.DeploymentSpec{
+		Selector: &metav1.LabelSelector{
+			MatchLabels: karpenterOperatorSelector(),
+		},
+		Template: corev1.PodTemplateSpec{
+			ObjectMeta: metav1.ObjectMeta{
+				Labels: karpenterOperatorSelector(),
+			},
+			Spec: corev1.PodSpec{
+				ServiceAccountName:            sa.Name,
+				TerminationGracePeriodSeconds: k8sutilspointer.To(int64(10)),
+				Tolerations: []corev1.Toleration{
+					{
+						Key:    "node-role.kubernetes.io/master",
+						Effect: corev1.TaintEffectNoSchedule,
+					},
+				},
+				Volumes: []corev1.Volume{
+					{
+						Name: "target-kubeconfig",
+						VolumeSource: corev1.VolumeSource{
+							Secret: &corev1.SecretVolumeSource{
+								SecretName:  kubeConfigSecret.Name,
+								DefaultMode: k8sutilspointer.To(int32(0640)),
+								Items: []corev1.KeyToPath{
+									{
+										Key:  "value",
+										Path: "target-kubeconfig",
+									},
+								},
+							},
+						},
+					},
+					{
+						Name: "serviceaccount-token",
+						VolumeSource: corev1.VolumeSource{
+							EmptyDir: &corev1.EmptyDirVolumeSource{},
+						},
+					},
+				},
+				Containers: []corev1.Container{
+					{
+						Name:            name,
+						Image:           hypershiftOperatorImage,
+						ImagePullPolicy: corev1.PullAlways,
+						VolumeMounts: []corev1.VolumeMount{
+							{
+								Name:      "target-kubeconfig",
+								MountPath: "/mnt/kubeconfig",
+							},
+						},
+						Env: []corev1.EnvVar{
+							{
+								Name: "MY_NAMESPACE",
+								ValueFrom: &corev1.EnvVarSource{
+									FieldRef: &corev1.ObjectFieldSelector{
+										FieldPath: "metadata.namespace",
+									},
+								},
+							},
+						},
+						Command: []string{
+							"/usr/bin/karpenter-operator",
+						},
+						Args: []string{
+							"--target-kubeconfig=/mnt/kubeconfig/target-kubeconfig",
+							"--namespace=$(MY_NAMESPACE)",
+							"--control-plane-operator-image=" + controlPlaneOperatorImage,
+						},
+						LivenessProbe: &corev1.Probe{
+							ProbeHandler: corev1.ProbeHandler{
+								HTTPGet: &corev1.HTTPGetAction{
+									Path:   "/healthz",
+									Port:   intstr.FromString("http"),
+									Scheme: corev1.URISchemeHTTP,
+								},
+							},
+							InitialDelaySeconds: 60,
+							PeriodSeconds:       60,
+							SuccessThreshold:    1,
+							FailureThreshold:    5,
+							TimeoutSeconds:      5,
+						},
+
+						ReadinessProbe: &corev1.Probe{
+							ProbeHandler: corev1.ProbeHandler{
+								HTTPGet: &corev1.HTTPGetAction{
+									Path:   "/readyz",
+									Port:   intstr.FromString("http"),
+									Scheme: corev1.URISchemeHTTP,
+								},
+							},
+							PeriodSeconds:    10,
+							SuccessThreshold: 1,
+							FailureThreshold: 3,
+							TimeoutSeconds:   5,
+						},
+						Ports: []corev1.ContainerPort{
+							{
+								Name:          "metrics",
+								ContainerPort: 8000,
+							},
+							{
+								Name:          "http",
+								ContainerPort: 8081,
+								Protocol:      corev1.ProtocolTCP,
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	util.AvailabilityProber(kas.InClusterKASReadyURL(hcp.Spec.Platform.Type), controlPlaneOperatorImage, &deployment.Spec.Template.Spec)
+
+	deploymentConfig := config.DeploymentConfig{
+		AdditionalLabels: map[string]string{
+			config.NeedManagementKASAccessLabel: "true",
+		},
+		Scheduling: config.Scheduling{
+			PriorityClass: config.DefaultPriorityClass,
+		},
+		SetDefaultSecurityContext: setDefaultSecurityContext,
+	}
+	if hcp.Annotations[hyperv1.ControlPlanePriorityClass] != "" {
+		deploymentConfig.Scheduling.PriorityClass = hcp.Annotations[hyperv1.ControlPlanePriorityClass]
+	}
+
+	replicas := k8sutilspointer.To(1)
+	deploymentConfig.SetDefaults(hcp, nil, replicas)
+	deploymentConfig.SetRestartAnnotation(hcp.ObjectMeta)
+	deploymentConfig.ApplyTo(deployment)
+
+	return nil
+}
+
+func ReconcileKarpenterOperatorRole(role *rbacv1.Role, owner config.OwnerRef) error {
+	role.Rules = []rbacv1.PolicyRule{
+		{
+			APIGroups: []string{"coordination.k8s.io"},
+			Resources: []string{
+				"leases",
+			},
+			Verbs: []string{
+				"get",
+				"watch",
+				"create",
+			},
+		},
+		{
+			APIGroups: []string{"coordination.k8s.io"},
+			Resources: []string{
+				"leases",
+			},
+			Verbs: []string{
+				"patch",
+				"update",
+			},
+			ResourceNames: []string{
+				"karpenter-leader-election",
+			},
+		},
+		{
+			APIGroups: []string{"apps"},
+			Resources: []string{
+				"deployments",
+			},
+			Verbs: []string{
+				"create",
+				"update",
+				"patch",
+				"delete",
+				"get",
+				"list",
+				"watch",
+			},
+		},
+		{
+			APIGroups: []string{""},
+			Resources: []string{
+				"secrets",
+				"serviceaccounts",
+			},
+			Verbs: []string{
+				"*",
+			},
+		},
+		{
+			APIGroups: []string{"hypershift.openshift.io"},
+			Resources: []string{
+				"hostedcontrolplanes",
+				"hostedcontrolplanes/finalizers",
+			},
+			Verbs: []string{
+				"*",
+			},
+		},
+		{
+			APIGroups: []string{"rbac.authorization.k8s.io"},
+			Resources: []string{
+				"roles",
+				"rolebindings",
+			},
+			Verbs: []string{
+				"get",
+				"list",
+				"watch",
+				"create",
+				"update",
+				"patch",
+				"delete",
+				"deletecollection",
+			},
+		},
+	}
+	return nil
+}
+
+func ReconcileKarpenterOperatorRoleBinding(binding *rbacv1.RoleBinding, role *rbacv1.Role, sa *corev1.ServiceAccount, owner config.OwnerRef) error {
+	binding.RoleRef = rbacv1.RoleRef{
+		APIGroup: "rbac.authorization.k8s.io",
+		Kind:     "Role",
+		Name:     role.Name,
+	}
+
+	binding.Subjects = []rbacv1.Subject{
+		{
+			Kind:      "ServiceAccount",
+			Name:      sa.Name,
+			Namespace: sa.Namespace,
+		},
+	}
+
+	return nil
+}
+
+// ReconcileKarpenterOperator orchestrates reconciliation of karpenter components.
+func ReconcileKarpenterOperator(ctx context.Context, createOrUpdate upsert.CreateOrUpdateFN, c client.Client, hypershiftOperatorImage, controlPlaneOperatorImage string, hcp *hyperv1.HostedControlPlane) error {
+	ownerRef := config.OwnerRefFrom(hcp)
+	setDefaultSecurityContext := false
+
+	role := KarpenterOperatorRole(hcp.Namespace)
+	_, err := createOrUpdate(ctx, c, role, func() error {
+		return ReconcileKarpenterOperatorRole(role, ownerRef)
+	})
+	if err != nil {
+		return fmt.Errorf("failed to reconcile karpenter role: %w", err)
+	}
+
+	serviceAccount := KarpenterOperatorServiceAccount(hcp.Namespace)
+	_, err = createOrUpdate(ctx, c, serviceAccount, func() error {
+		util.EnsurePullSecret(serviceAccount, controlplaneoperator.PullSecret("").Name)
+		return nil
+	})
+	if err != nil {
+		return fmt.Errorf("failed to reconcile karpenter service account: %w", err)
+	}
+
+	roleBinding := KarpenterOperatorRoleBinding(hcp.Namespace)
+	_, err = createOrUpdate(ctx, c, roleBinding, func() error {
+		return ReconcileKarpenterOperatorRoleBinding(roleBinding, role, serviceAccount, ownerRef)
+	})
+	if err != nil {
+		return fmt.Errorf("failed to reconcile karpenter role binding: %w", err)
+	}
+
+	// The deployment depends on the kubeconfig being reported.
+	if hcp.Status.KubeConfig != nil {
+		// Resolve the kubeconfig secret for CAPI which is reused for karpenter for convenience
+		capiKubeConfigSecret := &corev1.Secret{
+			ObjectMeta: metav1.ObjectMeta{
+				Namespace: hcp.Namespace,
+				Name:      fmt.Sprintf("%s-kubeconfig", hcp.Spec.InfraID),
+			},
+		}
+		err = c.Get(ctx, client.ObjectKeyFromObject(capiKubeConfigSecret), capiKubeConfigSecret)
+		if err != nil {
+			return fmt.Errorf("failed to get hosted controlplane kubeconfig secret %q: %w", capiKubeConfigSecret.Name, err)
+		}
+
+		deployment := KarpenterOperatorDeployment(hcp.Namespace)
+		_, err = createOrUpdate(ctx, c, deployment, func() error {
+			return ReconcileKarpenterOperatorDeployment(deployment, hcp, serviceAccount, capiKubeConfigSecret, hypershiftOperatorImage, controlPlaneOperatorImage, setDefaultSecurityContext, ownerRef)
+		})
+		if err != nil {
+			return fmt.Errorf("failed to reconcile karpenter deployment: %w", err)
+		}
+	}
+
+	return nil
+}
diff --git a/support/assets/readasset.go b/support/assets/readasset.go
index ac206f064b..63ca5dac44 100644
--- a/support/assets/readasset.go
+++ b/support/assets/readasset.go
@@ -3,13 +3,13 @@ package assets
 import (
 	"fmt"
 
-	"github.com/openshift/hypershift/support/api"
-
 	hyperv1 "github.com/openshift/hypershift/api/hypershift/v1beta1"
+	"github.com/openshift/hypershift/support/api"
 	appsv1 "k8s.io/api/apps/v1"
 	batchv1 "k8s.io/api/batch/v1"
 	corev1 "k8s.io/api/core/v1"
 	rbacv1 "k8s.io/api/rbac/v1"
+	apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
 	"k8s.io/apimachinery/pkg/runtime"
 )
 
@@ -83,6 +83,12 @@ func ShouldNodePool(reader AssetReader, fileName string) *hyperv1.NodePool {
 	return nodePool
 }
 
+func MustCRD(reader AssetReader, fileName string) *apiextensionsv1.CustomResourceDefinition {
+	crd := &apiextensionsv1.CustomResourceDefinition{}
+	deserializeResource(reader, fileName, crd)
+	return crd
+}
+
 func deserializeResource(reader AssetReader, fileName string, obj runtime.Object) {
 	data := MustAsset(reader, fileName)
 	gvks, _, err := api.Scheme.ObjectKinds(obj)