kubelet源码分析 添加 pod
创始人
2024-03-07 21:36:27
0

添加pod
1触发HandlePodAdditions函数进行创建

  • 对pod根据时间戳升序排序
  • 如果是静态pod,走静态pod处理
  • 第17行,如果这个pod不是在停止中,就需要进行特殊处理。(pod是否能在node上创建的校验)
  • 第18行,流程2
  • 第19行,流程3
  • 第20行,如果不能创建,则通过rejectPod函数记录一下原因。
  • 通过校验后。dispatchWork触发,执行和上述一样的函数进行创建pod
  • 最后一行,把这个pod增加到定时探测中
case kubetypes.ADD:klog.V(2).InfoS("SyncLoop ADD", "source", u.Source, "pods", klog.KObjs(u.Pods))handler.HandlePodAdditions(u.Pods)func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) {start := kl.clock.Now()sort.Sort(sliceutils.PodsByCreationTime(pods))for _, pod := range pods {existingPods := kl.podManager.GetPods() kl.podManager.AddPod(pod)if kubetypes.IsMirrorPod(pod) {kl.handleMirrorPod(pod, start)continue}if !kl.podWorkers.IsPodTerminationRequested(pod.UID) {activePods := kl.filterOutTerminatedPods(existingPods)if ok, reason, message := kl.canAdmitPod(activePods, pod); !ok {kl.rejectPod(pod, reason, message)continue}}mirrorPod, _ := kl.podManager.GetMirrorPodByPod(pod)kl.dispatchWork(pod, kubetypes.SyncPodCreate, mirrorPod, start)kl.probeManager.AddPod(pod)}
}

2.把所有pod进行校验,留下所有已经被接纳并且还活着的pod。

  • 第4行和7行,如果pod已经停止完成,或者pod已处于终态(运行完成或失败)并且不在停止中,就跳过
  • 否则记录存入切片里返回。
func (kl *Kubelet) filterOutInactivePods(pods []*v1.Pod) []*v1.Pod {filteredPods := make([]*v1.Pod, 0, len(pods))for _, p := range pods {if kl.podWorkers.IsPodKnownTerminated(p.UID) {continue}if kl.isAdmittedPodTerminal(p) && !kl.podWorkers.IsPodTerminationRequested(p.UID) {continue}filteredPods = append(filteredPods, p)}return filteredPods
}

3.校验是否能在node上创建
第四行为函数校验,这里情况比较多,不全部贴代码,大概是验证一下pod的可创建性,有以下几种函数

  • node是否已经ready。node是否有内存、cpu、磁盘、进程压力。网络是否配置正确。
  • sysctl是否是有效的(pod的securityContext下的Sysctls)如果不修改内核,直接返回true
  • node节点上的资源是否足够、亲和性等(再做一遍部分scheduler的工作,确保安全可靠)
  • 拓扑管理
func (kl *Kubelet) canAdmitPod(pods []*v1.Pod, pod *v1.Pod) (bool, string, string) {attrs := &lifecycle.PodAdmitAttributes{Pod: pod, OtherPods: pods}for _, podAdmitHandler := range kl.admitHandlers {if result := podAdmitHandler.Admit(attrs); !result.Admit {return false, result.Reason, result.Message}}return true, "", ""
}

4.校验pod的几个重要函数。这个函数主要检查node当前的状况(是否存在压力)是否允许创建pod

  • 如果全部准备就绪,返回true
  • 如果是关键pod。可以根据pod的优先级判断是否可以创建(2000000000优先级可以直接创建),或者是静态pod也可以直接创建不用理会是否有其他压力
  • 如果是内存压力,查看一下pod的QOS级别(第15行,流程4.1),如果不是BestEffort,可以进行创建。这里可以简单介绍一下pod的级别,如果limit和request都设置了并且相等,则是Guaranteed级别,当资源紧张时,淘汰的级别也是最低的;如果limit大于request,则是Burstable级别,也就是爆发性增长的。淘汰级别第二;如果都未设置,代表不限制,则是BestEffort级别,优先淘汰。
  • 检查pod的容忍度,如果能接受有内存压力,就返回true
  • 如果是其他压力,如cpu压力,磁盘压力,网络配置问题等,就不能创建了
pkg/kubelet/eviction/eviction_manager.go
func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult {m.RLock()defer m.RUnlock()if len(m.nodeConditions) == 0 {return lifecycle.PodAdmitResult{Admit: true}}if kubelettypes.IsCriticalPod(attrs.Pod) {return lifecycle.PodAdmitResult{Admit: true}}nodeOnlyHasMemoryPressureCondition := hasNodeCondition(m.nodeConditions, v1.NodeMemoryPressure) && len(m.nodeConditions) == 1if nodeOnlyHasMemoryPressureCondition {notBestEffort := v1.PodQOSBestEffort != v1qos.GetPodQOS(attrs.Pod)if notBestEffort {return lifecycle.PodAdmitResult{Admit: true}}if v1helper.TolerationsTolerateTaint(attrs.Pod.Spec.Tolerations, &v1.Taint{Key:    v1.TaintNodeMemoryPressure,Effect: v1.TaintEffectNoSchedule,}) {return lifecycle.PodAdmitResult{Admit: true}}}klog.InfoS("Failed to admit pod to node", "pod", klog.KObj(attrs.Pod), "nodeCondition", m.nodeConditions)return lifecycle.PodAdmitResult{Admit:   false,Reason:  Reason,Message: fmt.Sprintf(nodeConditionMessageFmt, m.nodeConditions),}
}

4.1.获得pod的QOS等级。官方文档说是每个容器的limits和requests都需要相等才是Guaranteed,其实是判断所有容器的总和

  • 两个for循环中。遍历所有容器,把limits和requests的总和统计
  • 如果pod中有某个容器没有同时设置cpu和memory的limits,则不是Guaranteed
  • 如果都未设置过,则是最低等级BestEffort
  • 如果都设置了,并且总和相等,则是最高等级Guaranteed
  • 如果都设置了,但是不相等,则是中间等级Burstable
func GetPodQOS(pod *v1.Pod) v1.PodQOSClass {requests := v1.ResourceList{}limits := v1.ResourceList{}zeroQuantity := resource.MustParse("0")isGuaranteed := trueallContainers := []v1.Container{}allContainers = append(allContainers, pod.Spec.Containers...)allContainers = append(allContainers, pod.Spec.InitContainers...)for _, container := range allContainers {		for name, quantity := range container.Resources.Requests {if !isSupportedQoSComputeResource(name) {continue}if quantity.Cmp(zeroQuantity) == 1 {delta := quantity.DeepCopy()if _, exists := requests[name]; !exists {requests[name] = delta} else {delta.Add(requests[name])requests[name] = delta}}}qosLimitsFound := sets.NewString()for name, quantity := range container.Resources.Limits {if !isSupportedQoSComputeResource(name) {continue}if quantity.Cmp(zeroQuantity) == 1 {qosLimitsFound.Insert(string(name))delta := quantity.DeepCopy()if _, exists := limits[name]; !exists {limits[name] = delta} else {delta.Add(limits[name])limits[name] = delta}}}if !qosLimitsFound.HasAll(string(v1.ResourceMemory), string(v1.ResourceCPU)) {isGuaranteed = false}}if len(requests) == 0 && len(limits) == 0 {return v1.PodQOSBestEffort}if isGuaranteed {for name, req := range requests {if lim, exists := limits[name]; !exists || lim.Cmp(req) != 0 {isGuaranteed = falsebreak}}}if isGuaranteed &&len(requests) == len(limits) {return v1.PodQOSGuaranteed}return v1.PodQOSBurstable
}

5.验证sysctls。有的容器需要修改系统内核,这里需要对准许修改的内核做校验

  • 如果没有修改sysctls,则直接返回正确
  • 如果修改了sysctls,则验证命名空间。有许多的 sysctl 参数都是有命名空间的,如果有命名空间是ipc或者Network的而且还用了宿主机的hostipc或hostNet,则返回错误
func (w *patternAllowlist) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult {pod := attrs.Podif pod.Spec.SecurityContext == nil || len(pod.Spec.SecurityContext.Sysctls) == 0 {return lifecycle.PodAdmitResult{Admit: true,}}var hostNet, hostIPC boolif pod.Spec.SecurityContext != nil {hostNet = pod.Spec.HostNetworkhostIPC = pod.Spec.HostIPC}for _, s := range pod.Spec.SecurityContext.Sysctls {if err := w.validateSysctl(s.Name, hostNet, hostIPC); err != nil {return lifecycle.PodAdmitResult{Admit:   false,Reason:  ForbiddenReason,Message: fmt.Sprintf("forbidden sysctl: %v", err),}}}return lifecycle.PodAdmitResult{Admit: true,}
}

6.验证资源是否足够

  • 第2行,获得node的信息,如果未获得,初始化一下,都失败的话,就返回node错误了
  • 初始化一下node信息(保证数据最新最可靠),存入所有pod,并且加入pod的亲和性
  • 更新插件资源
  • 验证是否有node不符合的pod扩展请求(requests下),排除出去
  • 验证pod资源是否都没问题。污点、容忍度是否有问题
  • 如果都匹配,验证标签是否匹配,验证pod.spec.OS是否有,如果传了是否等于GOOS
  • 都没问题,返回true
func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult {node, err := w.getNodeAnyWayFunc()if err != nil {klog.ErrorS(err, "Cannot get Node info")return PodAdmitResult{Admit:   false,Reason:  "InvalidNodeInfo",Message: "Kubelet cannot get node info.",}}admitPod := attrs.Pod  //要添加的podpods := attrs.OtherPods //所有占用资源的podnodeInfo := schedulerframework.NewNodeInfo(pods...)  //把pod信息存入到nodeinfo里(流程6.1)nodeInfo.SetNode(node) //把运行中的node和node的可用资源复制给nodeInfoif err = w.pluginResourceUpdateFunc(nodeInfo, attrs); err != nil { //更新插件资源message := fmt.Sprintf("Update plugin resources failed due to %v, which is unexpected.", err)klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "message", message)return PodAdmitResult{Admit:   false,Reason:  "UnexpectedAdmissionError",Message: message,}}podWithoutMissingExtendedResources := removeMissingExtendedResources(admitPod, nodeInfo)//验证node里不符合的pod的扩展请求(requests下的)reasons := generalFilter(podWithoutMissingExtendedResources, nodeInfo)//验证pod资源是否都没问题(scheduler的函数验证)fit := len(reasons) == 0if !fit {//资源有问题,例cpu超出reasons, err = w.admissionFailureHandler.HandleAdmissionFailure(admitPod, reasons)//在检测一次是否可以接纳(判断优先级,或者是静态pod)fit = len(reasons) == 0 && err == nilif err != nil {message := fmt.Sprintf("Unexpected error while attempting to recover from admission failure: %v", err)klog.InfoS("Failed to admit pod, unexpected error while attempting to recover from admission failure", "pod", klog.KObj(admitPod), "err", err)return PodAdmitResult{Admit:   fit,Reason:  "UnexpectedAdmissionError",Message: message,}}}if !fit {var reason stringvar message stringif len(reasons) == 0 {message = fmt.Sprint("GeneralPredicates failed due to unknown reason, which is unexpected.")klog.InfoS("Failed to admit pod: GeneralPredicates failed due to unknown reason, which is unexpected", "pod", klog.KObj(admitPod))return PodAdmitResult{Admit:   fit,Reason:  "UnknownReason",Message: message,}}// If there are failed predicates, we only 
return the first one as a reason.r := reasons[0]switch re := r.(type) {//错误的类型case *PredicateFailureError: //算法错误类型reason = re.PredicateNamemessage = re.Error()klog.V(2).InfoS("Predicate failed on Pod", "pod", klog.KObj(admitPod), "err", message)case *InsufficientResourceError: //资源类型错误,如cpu不足,reason = fmt.Sprintf("OutOf%s", re.ResourceName)message = re.Error()klog.V(2).InfoS("Predicate failed on Pod", "pod", klog.KObj(admitPod), "err", message)default:reason = "UnexpectedPredicateFailureType"message = fmt.Sprintf("GeneralPredicates failed due to %v, which is unexpected.", r)klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "err", message)}return PodAdmitResult{ //返回错误信息及其类型Admit:   fit,Reason:  reason,Message: message,}}if rejectPodAdmissionBasedOnOSSelector(admitPod, node) {//验证标签return PodAdmitResult{Admit:   false,Reason:  "PodOSSelectorNodeLabelDoesNotMatch",Message: "Failed to admit pod as the `kubernetes.io/os` label doesn't match node label",}}if rejectPodAdmissionBasedOnOSField(admitPod) {//验证pod.spec.OSreturn PodAdmitResult{Admit:   false,Reason:  "PodOSNotSupported",Message: "Failed to admit pod as the OS field doesn't match node OS",}}return PodAdmitResult{Admit: true,}
}

7.加入资源和亲和性

  • 亲和性的跳过了。比较简单但是乱。就是把pod的亲和性组装一下给node,没什么可谈的
  • 这里直接跳到了资源上,统计一下pod的资源占比。所有容器资源相加。然后和init的需要的比较,取max最大的就是最终资源。如果是非0情况下,cpu是0.1核,内存是200M。这里还会计算pod的开销Overhead。会得到所有容器消耗后加上overhead的消耗
  • 把pod的信息都存入到node中。
  • 如果有亲和性,加入到node
  • 如果pod占用了端口,把端口信息加入到node
  • 如果使用了PVC,对PVC的使用量+1
// AddPodInfo accounts for podInfo on this node.
//
// It adds the pod's computed resources to Requested and NonZeroRequested,
// appends the pod to Pods and, where applicable, to the affinity and
// required-anti-affinity lists. It then updates used ports and PVC reference
// counts, and finally assigns a new Generation.
func (n *NodeInfo) AddPodInfo(podInfo *PodInfo) {
	// res holds the pod's total resources; non0CPU and non0Mem are the
	// non-zero CPU and memory figures.
	res, non0CPU, non0Mem := calculateResource(podInfo.Pod)

	n.Requested.MilliCPU += res.MilliCPU
	n.Requested.Memory += res.Memory
	n.Requested.EphemeralStorage += res.EphemeralStorage

	// Allocate the scalar map lazily: writing to a nil map would panic.
	if n.Requested.ScalarResources == nil && len(res.ScalarResources) > 0 {
		n.Requested.ScalarResources = map[v1.ResourceName]int64{}
	}
	for rName, rQuant := range res.ScalarResources {
		n.Requested.ScalarResources[rName] += rQuant
	}

	n.NonZeroRequested.MilliCPU += non0CPU
	n.NonZeroRequested.Memory += non0Mem

	n.Pods = append(n.Pods, podInfo)
	if podWithAffinity(podInfo.Pod) {
		n.PodsWithAffinity = append(n.PodsWithAffinity, podInfo)
	}
	if podWithRequiredAntiAffinity(podInfo.Pod) {
		n.PodsWithRequiredAntiAffinity = append(n.PodsWithRequiredAntiAffinity, podInfo)
	}

	// Record the pod's ports and PVC references.
	n.updateUsedPorts(podInfo.Pod, true)
	n.updatePVCRefCounts(podInfo.Pod, true)

	n.Generation = nextGeneration()
}

相关内容

热门资讯

AWSECS:访问外部网络时出... 如果您在AWS ECS中部署了应用程序,并且该应用程序需要访问外部网络,但是无法正常访问,可能是因为...
AWSElasticBeans... 在Dockerfile中手动配置nginx反向代理。例如,在Dockerfile中添加以下代码:FR...
银河麒麟V10SP1高级服务器... 银河麒麟高级服务器操作系统简介: 银河麒麟高级服务器操作系统V10是针对企业级关键业务...
北信源内网安全管理卸载 北信源内网安全管理是一款网络安全管理软件,主要用于保护内网安全。在日常使用过程中,卸载该软件是一种常...
AWR报告解读 WORKLOAD REPOSITORY PDB report (PDB snapshots) AW...
AWS管理控制台菜单和权限 要在AWS管理控制台中创建菜单和权限,您可以使用AWS Identity and Access Ma...
​ToDesk 远程工具安装及... 目录 前言 ToDesk 优势 ToDesk 下载安装 ToDesk 功能展示 文件传输 设备链接 ...
群晖外网访问终极解决方法:IP... 写在前面的话 受够了群晖的quickconnet的小水管了,急需一个新的解决方法&#x...
不能访问光猫的的管理页面 光猫是现代家庭宽带网络的重要组成部分,它可以提供高速稳定的网络连接。但是,有时候我们会遇到不能访问光...
Azure构建流程(Power... 这可能是由于配置错误导致的问题。请检查构建流程任务中的“发布构建制品”步骤,确保正确配置了“Arti...