kubernetes源码分析之kube-controller-m

2018-12-20  本文已影响0人  davisgao

1.controller-manager在集群中的作用

作为集群的管理控制中心,维护集群中的所有控制器,对维持集群的稳定和自我修复,实现高可用,副本控制等起关键作用。

2.controller-manager内部结构图

cm-inside.png

3.controller-manager源码中的关键性调用链

controller-manager.png

4.具体的源码分析过程

4.1.组件启动的入口

位置: k8s.io/kubernetes/cmd/kube-controller-manager/controller-manager.go

func main() {
    rand.Seed(time.Now().UTC().UnixNano())

    command := app.NewControllerManagerCommand()

    // TODO: once we switch everything over to Cobra commands, we can go back to calling
    // utilflag.InitFlags() (by removing its pflag.Parse() call). For now, we have to set the
    // normalize func and add the go flag set by hand.
    pflag.CommandLine.SetNormalizeFunc(utilflag.WordSepNormalizeFunc)
    pflag.CommandLine.AddGoFlagSet(goflag.CommandLine)
    // utilflag.InitFlags()
    logs.InitLogs()
    defer logs.FlushLogs()

    if err := command.Execute(); err != nil {
        fmt.Fprintf(os.Stderr, "%v\n", err)
        os.Exit(1)
    }
}

4.2.读取配置文件,进行配置读取和初始化默认配置

位置: k8s.io/kubernetes/cmd/kube-controller-manager/app/controllermanager.go ->NewControllerManagerCommand

func NewControllerManagerCommand() *cobra.Command {

    s, err := options.NewKubeControllerManagerOptions()
    if err != nil {
        glog.Fatalf("unable to initialize command options: %v", err)
    }

    cmd := &cobra.Command{
        Use: "kube-controller-manager",
        Long: `The Kubernetes controller manager is a daemon that embeds
the core control loops shipped with Kubernetes. In applications of robotics and
automation, a control loop is a non-terminating loop that regulates the state of
the system. In Kubernetes, a controller is a control loop that watches the shared
state of the cluster through the apiserver and makes changes attempting to move the
current state towards the desired state. Examples of controllers that ship with
Kubernetes today are the replication controller, endpoints controller, namespace
controller, and serviceaccounts controller.`,
        Run: func(cmd *cobra.Command, args []string) {
            verflag.PrintAndExitIfRequested()
            utilflag.PrintFlags(cmd.Flags())

            c, err := s.Config(KnownControllers(), ControllersDisabledByDefault.List())
            if err != nil {
                fmt.Fprintf(os.Stderr, "%v\n", err)
                os.Exit(1)
            }

            if err := Run(c.Complete()); err != nil {
                fmt.Fprintf(os.Stderr, "%v\n", err)
                os.Exit(1)
            }
        },
    }
    s.AddFlags(cmd.Flags(), KnownControllers(), ControllersDisabledByDefault.List())

    return cmd
}

4.3.组件启动执行

从main中的command.Execute()到4.2中构造的Run
位置: k8s.io/kubernetes/cmd/kube-controller-manager/app/controllermanager.go
//加载所有控制器,并将对应参数注入到控制器中

c, err := s.Config(KnownControllers(), ControllersDisabledByDefault.List())

位置: k8s.io/kubernetes/cmd/kube-controller-manager/app/controllermanager.go
KnownControllers()中的NewControllerInitializers初始化所有的控制器

func NewControllerInitializers(loopMode ControllerLoopMode) map[string]InitFunc {
    controllers := map[string]InitFunc{}
    controllers["endpoint"] = startEndpointController
    controllers["replicationcontroller"] = startReplicationController
    controllers["podgc"] = startPodGCController
    controllers["resourcequota"] = startResourceQuotaController
    controllers["namespace"] = startNamespaceController
    controllers["serviceaccount"] = startServiceAccountController
    controllers["garbagecollector"] = startGarbageCollectorController
    controllers["daemonset"] = startDaemonSetController
    controllers["job"] = startJobController
    controllers["deployment"] = startDeploymentController
    controllers["replicaset"] = startReplicaSetController
    controllers["horizontalpodautoscaling"] = startHPAController
    controllers["disruption"] = startDisruptionController
    controllers["statefulset"] = startStatefulSetController
    controllers["cronjob"] = startCronJobController
    controllers["csrsigning"] = startCSRSigningController
    controllers["csrapproving"] = startCSRApprovingController
    controllers["csrcleaner"] = startCSRCleanerController
    controllers["ttl"] = startTTLController
    controllers["bootstrapsigner"] = startBootstrapSignerController
    controllers["tokencleaner"] = startTokenCleanerController
    controllers["nodeipam"] = startNodeIpamController
    if loopMode == IncludeCloudLoops {
        controllers["service"] = startServiceController
        controllers["route"] = startRouteController
        // TODO: volume controller into the IncludeCloudLoops only set.
        // TODO: Separate cluster in cloud check from node lifecycle controller.
    }
    controllers["nodelifecycle"] = startNodeLifecycleController
    controllers["persistentvolume-binder"] = startPersistentVolumeBinderController
    controllers["attachdetach"] = startAttachDetachController
    controllers["persistentvolume-expander"] = startVolumeExpandController
    controllers["clusterrole-aggregation"] = startClusterRoleAggregrationController
    controllers["pvc-protection"] = startPVCProtectionController
    controllers["pv-protection"] = startPVProtectionController

    return controllers
}

位置: k8s.io/kubernetes/cmd/kube-controller-manager/app/controllermanager.go
真正进入执行

// Run runs the KubeControllerManagerOptions.  This should never exit.
func Run(c *config.CompletedConfig) error {
    // To help debugging, immediately log version
    glog.Infof("Version: %+v", version.Get())

    if cfgz, err := configz.New("componentconfig"); err == nil {
        cfgz.Set(c.ComponentConfig)
    } else {
        glog.Errorf("unable to register configz: %c", err)
    }

    // Start the controller manager HTTP server
    stopCh := make(chan struct{})
    if c.SecureServing != nil {
        handler := genericcontrollermanager.NewBaseHandler(&c.ComponentConfig.Debugging)
        handler = genericcontrollermanager.BuildHandlerChain(handler, &c.Authorization, &c.Authentication)
        if err := c.SecureServing.Serve(handler, 0, stopCh); err != nil {
            return err
        }
    }
    if c.InsecureServing != nil {
        handler := genericcontrollermanager.NewBaseHandler(&c.ComponentConfig.Debugging)
        handler = genericcontrollermanager.BuildHandlerChain(handler, &c.Authorization, &c.Authentication)
        if err := c.InsecureServing.Serve(handler, 0, stopCh); err != nil {
            return err
        }
    }

    run := func(stop <-chan struct{}) {
        rootClientBuilder := controller.SimpleControllerClientBuilder{
            ClientConfig: c.Kubeconfig,
        }
        var clientBuilder controller.ControllerClientBuilder
        if c.ComponentConfig.KubeCloudShared.UseServiceAccountCredentials {
            if len(c.ComponentConfig.SAController.ServiceAccountKeyFile) == 0 {
                // It'c possible another controller process is creating the tokens for us.
                // If one isn't, we'll timeout and exit when our client builder is unable to create the tokens.
                glog.Warningf("--use-service-account-credentials was specified without providing a --service-account-private-key-file")
            }
            clientBuilder = controller.SAControllerClientBuilder{
                ClientConfig:         restclient.AnonymousClientConfig(c.Kubeconfig),
                CoreClient:           c.Client.CoreV1(),
                AuthenticationClient: c.Client.AuthenticationV1(),
                Namespace:            "kube-system",
            }
        } else {
            clientBuilder = rootClientBuilder
        }
        ctx, err := CreateControllerContext(c, rootClientBuilder, clientBuilder, stop)
        if err != nil {
            glog.Fatalf("error building controller context: %v", err)
        }
        saTokenControllerInitFunc := serviceAccountTokenControllerStarter{rootClientBuilder: rootClientBuilder}.startServiceAccountTokenController

        //启动控制器
        if err := StartControllers(ctx, saTokenControllerInitFunc, NewControllerInitializers(ctx.LoopMode)); err != nil {
            glog.Fatalf("error starting controllers: %v", err)
        }

        ctx.InformerFactory.Start(ctx.Stop)
        close(ctx.InformersStarted)

        select {}
    }

    //note 如果未启用选主(只是单节点),直接启动,并且panic,不在往下走,因为run内部有select挂起
    if !c.ComponentConfig.GenericComponent.LeaderElection.LeaderElect {
        run(wait.NeverStop)
        panic("unreachable")
    }

    id, err := os.Hostname()
    if err != nil {
        return err
    }

    // add a uniquifier so that two processes on the same host don't accidentally both become active
    //生成唯一ID,相当于进程锁
    id = id + "_" + string(uuid.NewUUID())
    rl, err := resourcelock.New(c.ComponentConfig.GenericComponent.LeaderElection.ResourceLock,
        "kube-system",
        "kube-controller-manager",
        c.LeaderElectionClient.CoreV1(),
        resourcelock.ResourceLockConfig{
            Identity:      id,
            EventRecorder: c.EventRecorder,
        })
    if err != nil {
        glog.Fatalf("error creating lock: %v", err)
    }

    //进行选主,并在选为主节点后执行run
    leaderelection.RunOrDie(leaderelection.LeaderElectionConfig{
        Lock:          rl,
        LeaseDuration: c.ComponentConfig.GenericComponent.LeaderElection.LeaseDuration.Duration,
        RenewDeadline: c.ComponentConfig.GenericComponent.LeaderElection.RenewDeadline.Duration,
        RetryPeriod:   c.ComponentConfig.GenericComponent.LeaderElection.RetryPeriod.Duration,
        Callbacks: leaderelection.LeaderCallbacks{
            //选主完成后执行
            OnStartedLeading: run,
            OnStoppedLeading: func() {
                glog.Fatalf("leaderelection lost")
            },
        },
    })
    panic("unreachable")
}

转到run内部核心的三个动作 :CreateControllerContext 、 StartControllers和ctx.InformerFactory.Start

CreateControllerContext

位置: k8s.io/kubernetes/cmd/kube-controller-manager/app/controllermanager.go

func CreateControllerContext(s *config.CompletedConfig, rootClientBuilder, clientBuilder controller.ControllerClientBuilder, stop <-chan struct{}) (ControllerContext, error) {
    //拿到对APIServer资源的操作句柄
    versionedClient := rootClientBuilder.ClientOrDie("shared-informers")
    sharedInformers := informers.NewSharedInformerFactory(versionedClient, ResyncPeriod(s)())

    // If apiserver is not running we should wait for some time and fail only then. This is particularly
    // important when we start apiserver and controller manager at the same time.
    //gaogao note : 10s内检查APIserver服务是否可用
    if err := genericcontrollermanager.WaitForAPIServer(versionedClient, 10*time.Second); err != nil {
        return ControllerContext{}, fmt.Errorf("failed to wait for apiserver being healthy: %v", err)
    }

    // Use a discovery client capable of being refreshed.
    discoveryClient := rootClientBuilder.ClientOrDie("controller-discovery")
    //note:  DiscoveryClient = discoveryClient.Discovery()
    cachedClient := cacheddiscovery.NewMemCacheClient(discoveryClient.Discovery())
    restMapper := restmapper.NewDeferredDiscoveryRESTMapper(cachedClient)
    go wait.Until(func() {
        restMapper.Reset()
    }, 30*time.Second, stop)

    availableResources, err := GetAvailableResources(rootClientBuilder)
    if err != nil {
        return ControllerContext{}, err
    }

    cloud, loopMode, err := createCloudProvider(s.ComponentConfig.CloudProvider.Name, s.ComponentConfig.ExternalCloudVolumePlugin,
        s.ComponentConfig.CloudProvider.CloudConfigFile, s.ComponentConfig.KubeCloudShared.AllowUntaggedCloud, sharedInformers)
    if err != nil {
        return ControllerContext{}, err
    }

    ctx := ControllerContext{
        ClientBuilder:      clientBuilder,
        InformerFactory:    sharedInformers,
        ComponentConfig:    s.ComponentConfig,
        RESTMapper:         restMapper,
        AvailableResources: availableResources,
        Cloud:              cloud,
        LoopMode:           loopMode,
        Stop:               stop,
        InformersStarted:   make(chan struct{}),
        ResyncPeriod:       ResyncPeriod(s),
    }
    return ctx, nil
}

StartControllers

位置: k8s.io/kubernetes/cmd/kube-controller-manager/app/controllermanager.go
启动初始化的所有控制器

func StartControllers(ctx ControllerContext, startSATokenController InitFunc, controllers map[string]InitFunc) error {
    ···
    for controllerName, initFn := range controllers {
        if !ctx.IsControllerEnabled(controllerName) {
            glog.Warningf("%q is disabled", controllerName)
            continue
        }

        time.Sleep(wait.Jitter(ctx.ComponentConfig.GenericComponent.ControllerStartInterval.Duration, ControllerStartJitter))

        glog.V(1).Infof("Starting %q", controllerName)
        //note : initFn为初始化controller是创建的初始化函数
        started, err := initFn(ctx)
        ···
    }

    return nil
}

ctx.InformerFactory.Start

controller-manager中的informer开始启动监听资源的事件,将事件放到自己的队列中(具有限流特性)。处理进程从队列总获取事件开始进行任务处理。

将新建的ReplicaSet,放入队列

// obj could be an *apps.ReplicaSet, or a DeletionFinalStateUnknown marker item.
func (rsc *ReplicaSetController) enqueueReplicaSet(obj interface{}) {
    key, err := controller.KeyFunc(obj)
    if err != nil {
        utilruntime.HandleError(fmt.Errorf("couldn't get key for object %+v: %v", obj, err))
        return
    }
    rsc.queue.Add(key)
}

从队列中获取对象进行处理(具体过程见下方)

func (rsc *ReplicaSetController) processNextWorkItem() bool {
    key, quit := rsc.queue.Get()
    if quit {
        return false
    }
    defer rsc.queue.Done(key)

    err := rsc.syncHandler(key.(string))
    if err == nil {
        rsc.queue.Forget(key)
        return true
    }

    utilruntime.HandleError(fmt.Errorf("Sync %q failed with %v", key, err))
    rsc.queue.AddRateLimited(key)

    return true
}

4.4.以startReplicaSetController为例分析controller的启动和执行过程

在StartControllers中initFn方法是NewControllerInitializers中初始化Controller是定义,以下主要看下startReplicaSetController。
位置: k8s.io/kubernetes/cmd/kube-controller-manager/app/apps.go
其中NewReplicaSetController主要是初始化ReplicaSetController的结构,包括apiserver的客户端,informer的回调函数等等。NewReplicaSetController->NewBaseController

func startReplicaSetController(ctx ControllerContext) (bool, error) {
    if !ctx.AvailableResources[schema.GroupVersionResource{Group: "apps", Version: "v1", Resource: "replicasets"}] {
        return false, nil
    }
    go replicaset.NewReplicaSetController(
        ctx.InformerFactory.Apps().V1().ReplicaSets(),
        ctx.InformerFactory.Core().V1().Pods(),
        ctx.ClientBuilder.ClientOrDie("replicaset-controller"),
        replicaset.BurstReplicas,
    ).Run(int(ctx.ComponentConfig.ReplicaSetController.ConcurrentRSSyncs), ctx.Stop)
    return true, nil
}

关键函数run:k8s.io/kubernetes/pkg/controller/replicaset/replica_set.go
run中执行rsc.worker。

// Run begins watching and syncing.
func (rsc *ReplicaSetController) Run(workers int, stopCh <-chan struct{}) {
    defer utilruntime.HandleCrash()
    defer rsc.queue.ShutDown()

    controllerName := strings.ToLower(rsc.Kind)
    glog.Infof("Starting %v controller", controllerName)
    defer glog.Infof("Shutting down %v controller", controllerName)

    if !controller.WaitForCacheSync(rsc.Kind, stopCh, rsc.podListerSynced, rsc.rsListerSynced) {
        return
    }

    for i := 0; i < workers; i++ {
        go wait.Until(rsc.worker, time.Second, stopCh)
    }

    <-stopCh
}

rsc.worker即为rsc.syncHandler,而syncHandler在创建时来源于rsc.syncReplicaSet(见NewBaseController方法)
那么我们转到syncReplicaSet
位置:k8s.io/kubernetes/pkg/controller/replicaset/replica_set.go
updateReplicaSetStatus:在pod死亡或者新建时更新

func (rsc *ReplicaSetController) syncReplicaSet(key string) error {

    startTime := time.Now()
    defer func() {
        glog.V(4).Infof("Finished syncing %v %q (%v)", rsc.Kind, key, time.Since(startTime))
    }()
    //从key中解析出namespace和name
    namespace, name, err := cache.SplitMetaNamespaceKey(key)
    if err != nil {
        return err
    }
        //根据名称通过apiserver获取rs
    rs, err := rsc.rsLister.ReplicaSets(namespace).Get(name)
    if errors.IsNotFound(err) {
        glog.V(4).Infof("%v %v has been deleted", rsc.Kind, key)
        rsc.expectations.DeleteExpectations(key)
        return nil
    }
    if err != nil {
        return err
    }

    rsNeedsSync := rsc.expectations.SatisfiedExpectations(key)
    selector, err := metav1.LabelSelectorAsSelector(rs.Spec.Selector)
    if err != nil {
        utilruntime.HandleError(fmt.Errorf("Error converting pod selector to selector: %v", err))
        return nil
    }

    // list all pods to include the pods that don't match the rs`s selector
    // anymore but has the stale controller ref.
    // TODO: Do the List and Filter in a single pass, or use an index.
    allPods, err := rsc.podLister.Pods(rs.Namespace).List(labels.Everything())
    if err != nil {
        return err
    }
    // Ignore inactive pods.
    var filteredPods []*v1.Pod
    for _, pod := range allPods {
        if controller.IsPodActive(pod) {
            filteredPods = append(filteredPods, pod)
        }
    }

    // NOTE: filteredPods are pointing to objects from cache - if you need to
    // modify them, you need to copy it first.
    filteredPods, err = rsc.claimPods(rs, selector, filteredPods)
    if err != nil {
        return err
    }

    var manageReplicasErr error
    if rsNeedsSync && rs.DeletionTimestamp == nil {
        manageReplicasErr = rsc.manageReplicas(filteredPods, rs)
    }
    rs = rs.DeepCopy()
    newStatus := calculateStatus(rs, filteredPods, manageReplicasErr)

    // Always updates status as pods come up or die.
    //在pod死亡或者新建时更新
    updatedRS, err := updateReplicaSetStatus(rsc.kubeClient.AppsV1().ReplicaSets(rs.Namespace), rs, newStatus)
    ···
    return manageReplicasErr
}

转到updateReplicaSetStatus:k8s.io/kubernetes/pkg/controller/replicaset/replica_set_utils.go
调用UpdateStatus,通过apiserver更新

func updateReplicaSetStatus(c appsclient.ReplicaSetInterface, rs *apps.ReplicaSet, newStatus apps.ReplicaSetStatus) (*apps.ReplicaSet, error) {
    ···
        updatedRS, updateErr = c.UpdateStatus(rs)
    ···
    return nil, updateErr
}
func (c *replicaSets) UpdateStatus(replicaSet *v1.ReplicaSet) (result *v1.ReplicaSet, err error) {
    result = &v1.ReplicaSet{}
    err = c.client.Put().
        Namespace(c.ns).
        Resource("replicasets").
        Name(replicaSet.Name).
        SubResource("status").
        Body(replicaSet).
        Do().
        Into(result)
    return
}

5.此处强调一下controller-manager中PodGCController的清理依据

1.gc掉超过阈值限制的pod,按时间排序gc

func (gcc *PodGCController) gcTerminated(pods []*v1.Pod) {
    terminatedPods := []*v1.Pod{}
    for _, pod := range pods {
        if isPodTerminated(pod) {
            terminatedPods = append(terminatedPods, pod)
        }
    }

    terminatedPodCount := len(terminatedPods)
    sort.Sort(byCreationTimestamp(terminatedPods))

    deleteCount := terminatedPodCount - gcc.terminatedPodThreshold

    if deleteCount > terminatedPodCount {
        deleteCount = terminatedPodCount
    }
    if deleteCount > 0 {
        glog.Infof("garbage collecting %v pods", deleteCount)
    }

    var wait sync.WaitGroup
    for i := 0; i < deleteCount; i++ {
        wait.Add(1)
        go func(namespace string, name string) {
            defer wait.Done()
            if err := gcc.deletePod(namespace, name); err != nil {
                // ignore not founds
                defer utilruntime.HandleError(err)
            }
        }(terminatedPods[i].Namespace, terminatedPods[i].Name)
    }
    wait.Wait()
}

2.gc掉孤儿pod:pod上的node信息不在当前可调度的节点上,即没有和有效node绑定

func (gcc *PodGCController) gcOrphaned(pods []*v1.Pod) {
    glog.V(4).Infof("GC'ing orphaned")
    // We want to get list of Nodes from the etcd, to make sure that it's as fresh as possible.
    nodes, err := gcc.kubeClient.CoreV1().Nodes().List(metav1.ListOptions{})
    if err != nil {
        return
    }
    nodeNames := sets.NewString()
    for i := range nodes.Items {
        nodeNames.Insert(nodes.Items[i].Name)
    }

    for _, pod := range pods {
        if pod.Spec.NodeName == "" {
            continue
        }
        if nodeNames.Has(pod.Spec.NodeName) {
            continue
        }
        glog.V(2).Infof("Found orphaned Pod %v assigned to the Node %v. Deleting.", pod.Name, pod.Spec.NodeName)
        if err := gcc.deletePod(pod.Namespace, pod.Name); err != nil {
            utilruntime.HandleError(err)
        } else {
            glog.V(0).Infof("Forced deletion of orphaned Pod %s succeeded", pod.Name)
        }
    }
}

3.gc掉没有调度成功的pod:表现在pod的NodeName为空,主要由于资源等条件不满足

func (gcc *PodGCController) gcUnscheduledTerminating(pods []*v1.Pod) {
    glog.V(4).Infof("GC'ing unscheduled pods which are terminating.")

    for _, pod := range pods {
        if pod.DeletionTimestamp == nil || len(pod.Spec.NodeName) > 0 {
            continue
        }

        glog.V(2).Infof("Found unscheduled terminating Pod %v not assigned to any Node. Deleting.", pod.Name)
        if err := gcc.deletePod(pod.Namespace, pod.Name); err != nil {
            utilruntime.HandleError(err)
        } else {
            glog.V(0).Infof("Forced deletion of unscheduled terminating Pod %s succeeded", pod.Name)
        }
    }
}










上一篇 下一篇

猜你喜欢

热点阅读