从源码看pod 驱逐机制
2024-01-12 本文已影响0人
wwq2020
背景
pod驱逐有2种场景
1 controller-manager基于taint的驱逐(node not ready)
2 kubelet的驱逐
源码
controller-manager
taint
pkg/controller/nodelifecycle/node_lifecycle_controller.go中
创建NodeLifecycle控制器
func NewNodeLifecycleController(
ctx context.Context,
leaseInformer coordinformers.LeaseInformer,
podInformer coreinformers.PodInformer,
nodeInformer coreinformers.NodeInformer,
daemonSetInformer appsv1informers.DaemonSetInformer,
kubeClient clientset.Interface,
nodeMonitorPeriod time.Duration,
nodeStartupGracePeriod time.Duration,
nodeMonitorGracePeriod time.Duration,
evictionLimiterQPS float32,
secondaryEvictionLimiterQPS float32,
largeClusterThreshold int32,
unhealthyZoneThreshold float32,
) (*Controller, error) {
...
}
运行控制器
func (nc *Controller) Run(ctx context.Context) {
...
驱逐pod
go wait.UntilWithContext(ctx, nc.doPodProcessingWorker, time.Second)
...
添加taint
go wait.UntilWithContext(ctx, nc.doNoExecuteTaintingPass, scheduler.NodeEvictionPeriod)
...
监控节点健康状况
go wait.UntilWithContext(ctx, func(ctx context.Context) {
if err := nc.monitorNodeHealth(ctx); err != nil {
logger.Error(err, "Error monitoring node health")
}
}, nc.nodeMonitorPeriod)
...
}
func (nc *Controller) doPodProcessingWorker(ctx context.Context) {
...
nc.processPod(ctx, podItem)
...
}
添加taint
func (nc *Controller) doNoExecuteTaintingPass(ctx context.Context) {
...
var zoneNoExecuteTainterKeys []string
func() {
nc.evictorLock.Lock()
defer nc.evictorLock.Unlock()
zoneNoExecuteTainterKeys = make([]string, 0, len(nc.zoneNoExecuteTainter))
for k := range nc.zoneNoExecuteTainter {
zoneNoExecuteTainterKeys = append(zoneNoExecuteTainterKeys, k)
}
}()
...
for _, k := range zoneNoExecuteTainterKeys {
...
根据ready condition添加污点
taintToAdd := v1.Taint{}
oppositeTaint := v1.Taint{}
switch condition.Status {
case v1.ConditionFalse:
taintToAdd = *NotReadyTaintTemplate
oppositeTaint = *UnreachableTaintTemplate
case v1.ConditionUnknown:
taintToAdd = *UnreachableTaintTemplate
oppositeTaint = *NotReadyTaintTemplate
default:
...
return true, 0
}
result := controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{&oppositeTaint}, node)
...
}
...
}
监控节点健康状况
func (nc *Controller) monitorNodeHealth(ctx context.Context) error {
...
处理基于污点的驱逐
nc.processTaintBaseEviction(ctx, node, &observedReadyCondition)
...
}
...
}
func (nc *Controller) processTaintBaseEviction(ctx context.Context, node *v1.Node, observedReadyCondition *v1.NodeCondition) {
...
将unreachable污点替换为not-ready污点
if taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate) {
taintToAdd := *NotReadyTaintTemplate
if !controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{UnreachableTaintTemplate}, node) {
...
}
标记节点准备添加污点
} else if nc.markNodeForTainting(node, v1.ConditionFalse) {
...
}
...
}
标记节点准备添加污点
func (nc *Controller) markNodeForTainting(node *v1.Node, status v1.ConditionStatus) bool {
...
if !taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate) {
nc.zoneNoExecuteTainter[nodetopology.GetZoneKey(node)].Remove(node.Name)
}
...
}
至此node异常时已根据ready condition添加上not-ready或unreachable污点
evict
pkg/controller/tainteviction/taint_eviction.go中
evict控制器
func New(ctx context.Context, c clientset.Interface, podInformer corev1informers.PodInformer, nodeInformer corev1informers.NodeInformer, controllerName string) (*Controller, error) {
...
}
运行控制器
func (tc *Controller) Run(ctx context.Context) {
...
启动工作goroutine
go tc.worker(ctx, i, wg.Done, ctx.Done())
...
}
func (tc *Controller) worker(ctx context.Context, worker int, done func(), stopCh <-chan struct{}) {
...
处理节点更新
case nodeUpdate := <-tc.nodeUpdateChannels[worker]:
tc.handleNodeUpdate(ctx, nodeUpdate)
...
}
func (tc *Controller) handleNodeUpdate(ctx context.Context, nodeUpdate nodeUpdateItem) {
...
pods, err := tc.getPodsAssignedToNode(node.Name)
...
处理节点上的pod
tc.processPodOnNode(ctx, podNamespacedName, node.Name, pod.Spec.Tolerations, taints, now)
...
}
处理节点上的pod
func (tc *Controller) processPodOnNode(
ctx context.Context,
podNamespacedName types.NamespacedName,
nodeName string,
tolerations []v1.Toleration,
taints []v1.Taint,
now time.Time,
) {
...
添加到工作队列,到达触发时间(triggerTime)后执行
tc.taintEvictionQueue.AddWork(ctx, NewWorkArgs(podNamespacedName.Name, podNamespacedName.Namespace), startTime, triggerTime)
}
至此pod已加入驱逐队列,到达触发时间后即完成pod evict
kubelet
pkg/kubelet/eviction/eviction_manager.go中
func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, monitoringInterval time.Duration) {
...
定期检查驱逐
evictedPods, err := m.synchronize(diskInfoProvider, podFunc)
...
}
检查是否满足条件进行驱逐
func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc) ([]*v1.Pod, error) {
...
没有达到任何驱逐阈值(资源充足),无需驱逐
if len(thresholds) == 0 {
klog.V(3).InfoS("Eviction manager: no resources are starved")
return nil, nil
}
...
驱逐pod
if m.evictPod(pod, gracePeriodOverride, message, annotations, condition) {
...
}
驱逐pod
func (m *managerImpl) evictPod(pod *v1.Pod, gracePeriodOverride int64, evictMsg string, annotations map[string]string, condition *v1.PodCondition) bool {
...
执行驱逐pod
err := m.killPodFunc(pod, true, &gracePeriodOverride, func(status *v1.PodStatus) {
status.Phase = v1.PodFailed
status.Reason = Reason
status.Message = evictMsg
if condition != nil {
podutil.UpdatePodCondition(status, condition)
}
})
...
}