karpenter技术介绍-3-controller介绍-天翼云开发者社区

karpenter把功能拆得很细，用到了大量controller。下面的NewControllers函数负责创建并返回全部controller：

// NewControllers constructs every Karpenter controller and returns them as a
// single slice.
//
// The shared dependencies (cluster cache, provisioner, eviction queue and
// disruption queue) are created first because several controllers below are
// wired to them. The remaining controllers are appended group by group, in the
// same order in which they are constructed.
func NewControllers(
	mgr manager.Manager,
	clock clock.Clock,
	kubeClient client.Client,
	recorder events.Recorder,
	cloudProvider cloudprovider.CloudProvider,
) []controller.Controller {
	// Cache of the whole cluster.
	cluster := state.NewCluster(clock, kubeClient)
	// Core controller: launches new NodeClaims to accommodate pending pods.
	provisioner := provisioning.NewProvisioner(kubeClient, recorder, cloudProvider, cluster)
	// Pod eviction queue; its producer is the node terminator.
	evictionQueue := terminator.NewQueue(kubeClient, recorder)
	// Disruption queue; its producer is the disruption controller.
	disruptionQueue := orchestration.NewQueue(kubeClient, recorder, cluster, clock, provisioner)

	// The provisioner and both queues are controllers themselves.
	controllers := []controller.Controller{provisioner, evictionQueue, disruptionQueue}

	controllers = append(controllers,
		// Disruption controller: decides whether a NodeClaim needs to be
		// disrupted and, if so, sends a command to the disruption queue.
		disruption.NewController(clock, kubeClient, provisioner, cloudProvider, recorder, cluster, disruptionQueue),
		// Watches pending pods and feeds them into the Provisioner's batching window.
		provisioning.NewPodController(kubeClient, provisioner),
		// Watches schedulable nodes and feeds them into the Provisioner's batching window.
		provisioning.NewNodeController(kubeClient, provisioner),
		// Updates the hash on the NodePool and propagates it to the annotations
		// of all corresponding NodeClaims.
		nodepoolhash.NewController(kubeClient),
		// Deletes expired NodeClaims.
		expiration.NewController(clock, kubeClient),
	)

	// These five controllers keep the cluster cache up to date.
	controllers = append(controllers,
		informer.NewDaemonSetController(kubeClient, cluster),
		informer.NewNodeController(kubeClient, cluster),
		informer.NewPodController(kubeClient, cluster),
		informer.NewNodePoolController(kubeClient, cluster),
		informer.NewNodeClaimController(kubeClient, cluster),
	)

	// Watches nodes that are being deleted: deletes the NodeClaim, makes sure
	// the underlying instance is deleted, and so on, and finally removes the
	// node's finalizer. The terminator it uses pushes onto the eviction queue.
	nodeTerminator := terminator.NewTerminator(clock, kubeClient, evictionQueue, recorder)
	controllers = append(controllers,
		termination.NewController(clock, kubeClient, cloudProvider, nodeTerminator, recorder),
	)

	// These three controllers collect metrics.
	controllers = append(controllers,
		metricspod.NewController(kubeClient),
		metricsnodepool.NewController(kubeClient),
		metricsnode.NewController(cluster),
	)

	controllers = append(controllers,
		// When a NodePool's NodeClass reference changes, checks that the
		// NodeClass exists and updates the NodePool's conditions.
		nodepoolreadiness.NewController(kubeClient, cloudProvider),
		// Tallies the resource capacity currently allocated to each NodePool
		// and updates its resources field (cpu, mem).
		nodepoolcounter.NewController(kubeClient, cluster),
		// Validates the NodePool's properties and records the result in its Conditions.
		nodepoolvalidation.NewController(kubeClient),
	)

	controllers = append(controllers,
		// Watches pods and updates LastPodEventTime on the corresponding NodeClaim.
		podevents.NewController(clock, kubeClient),
		// Checks NodeClaim consistency and updates the NodeClaim's Conditions.
		nodeclaimconsistency.NewController(clock, kubeClient, recorder),
		// NodeClaim lifecycle controller: calls the provider to create/delete
		// and maintains the NodeClaim's conditions.
		nodeclaimlifecycle.NewController(clock, kubeClient, cloudProvider, recorder),
		// NodeClaim garbage-collection controller: deletes a NodeClaim that is
		// registered while its node is not ready.
		// NOTE(review): the original note is truncated after "not ready and";
		// an additional condition probably applies — confirm against the controller.
		nodeclaimgarbagecollection.NewController(clock, kubeClient, cloudProvider),
		// Maintains the Drifted and Consolidatable conditions of a NodeClaim
		// based on its state; these affect whether the NodeClaim gets disrupted.
		nodeclaimdisruption.NewController(clock, kubeClient, cloudProvider),
	)

	// Generic status controllers, one instantiated for NodeClaim and one for
	// NodePool; each gets its own event recorder named "karpenter".
	controllers = append(controllers,
		status.NewController[*v1.NodeClaim](kubeClient, mgr.GetEventRecorderFor("karpenter")),
		status.NewController[*v1.NodePool](kubeClient, mgr.GetEventRecorderFor("karpenter")),
	)

	return controllers
}

其中比较关键的controller有Provisioner、disruption和nodeclaimlifecycle

扩容流程

当出现pending状态的pod时会触发扩容流程

中断流程

当NodeClaim的Drifted或者其他指定condition为true，以及集群出现过剩容量时，会触发中断流程

func NewControllers( mgr manager.Manager, clock clock.Clock, kubeClient client.Client, recorder events.Recorder, cloudProvider cloudprovider.CloudProvider, ) []controller.Controller { // 整个集群的缓存 cluster := state.NewCluster(clock, kubeClient) // 核心控制器，启用新的NodeClaim以兼容pending pod p := provisioning.NewProvisioner(kubeClient, recorder, cloudProvider, cluster) // pod驱逐队列，生产者为node terminator evictionQueue := terminator.NewQueue(kubeClient, recorder) // 中断队列，生产者为disruption controller disruptionQueue := orchestration.NewQueue(kubeClient, recorder, cluster, clock, p) return []controller.Controller{ p, evictionQueue, disruptionQueue, // 中断控制器，判断NodeClaim是否需要中断，如需要则发送命令到中断队列 disruption.NewController(clock, kubeClient, p, cloudProvider, recorder, cluster, disruptionQueue), // watch pending pod，发送到Provisioner的批处理窗口 provisioning.NewPodController(kubeClient, p), // watch 可调度的node，发送到Provisioner的批处理窗口 provisioning.NewNodeController(kubeClient, p), // 更新hash到nodepool，更新到所有对应NodeClaim的annotations中 nodepoolhash.NewController(kubeClient), // 删除过期的NodeClaim expiration.NewController(clock, kubeClient), // 以下五个控制器用于更新cluster缓存 informer.NewDaemonSetController(kubeClient, cluster), informer.NewNodeController(kubeClient, cluster), informer.NewPodController(kubeClient, cluster), informer.NewNodePoolController(kubeClient, cluster), informer.NewNodeClaimController(kubeClient, cluster), // watch deleting node，执行的操作包括删除 NodeClaim、确保底层实例被删除等，最后移除node的finalizer termination.NewController(clock, kubeClient, cloudProvider, terminator.NewTerminator(clock, kubeClient, evictionQueue, recorder), recorder), // 以下三个控制器用于统计指标 metricspod.NewController(kubeClient), metricsnodepool.NewController(kubeClient), metricsnode.NewController(cluster), // 当nodepool的nodeclass引用发生变化时，校验是否存在nodeclass，并更新nodepool的conditions nodepoolreadiness.NewController(kubeClient, cloudProvider), // 统计当前集群已经分配给nodepool的资源容量，并更新resources字段（cpu、mem） nodepoolcounter.NewController(kubeClient, cluster), // 校验nodepool的属性是否合法，结果更新到Conditions中 
nodepoolvalidation.NewController(kubeClient), // watch pod，更新对应NodeClaim的LastPodEventTime podevents.NewController(clock, kubeClient), // 检查NodeClaim的一致性，并更新NodeClaim的Conditions nodeclaimconsistency.NewController(clock, kubeClient, recorder), // NodeClaim生命周期控制器，调用provider进行创建/删除，维护NodeClaim的conditions nodeclaimlifecycle.NewController(clock, kubeClient, cloudProvider, recorder), // NodeClaim gc控制器，当NodeClaim为已注册且对应node为not ready且时，删除该NodeClaim nodeclaimgarbagecollection.NewController(clock, kubeClient, cloudProvider), // 根据nodeclaim状态维护NodeClaim的 Drifted 和 Consolidatable 两个condiction，会影响NodeClaim是否会被中断 nodeclaimdisruption.NewController(clock, kubeClient, cloudProvider), status.NewController[*v1.NodeClaim](kubeClient, mgr.GetEventRecorderFor("karpenter")), status.NewController[*v1.NodePool](kubeClient, mgr.GetEventRecorderFor("karpenter")), } }

息壤智算

应用商城

定价

合作伙伴

开发者

支持与服务

了解天翼云

karpenter技术介绍-3-controller介绍

扩容流程

中断流程

karpenter技术介绍-3-controller介绍

扩容流程

中断流程

活动

息壤智算

应用商城

定价

合作伙伴

开发者

支持与服务

了解天翼云

karpenter技术介绍-3-controller介绍

扩容流程

中断流程

karpenter技术介绍-3-controller介绍

扩容流程

中断流程