
2021-11-29  本文已影响0人  微凉哇

基于kubernetes v1.18.6,关于基于windows平台运行kubelet的相关代码逻辑不作解析。



  1. 拉取镜像
  2. 创建容器
  3. 启动容器
  4. 执行容器启动后的钩子


  1. 设置容器重启次数
  2. 生成创建容器所需配置
  3. 创建容器
  4. 预启动容器
  5. 生成容器引用信息


// startContainer (excerpt): only the container-config generation step of the
// method is listed here. The identifiers container, restartCount and imageRef
// are used below but declared in a part of the method that is not shown.
// NOTE(review): closing braces are missing from this listing.
func (m *kubeGenericRuntimeManager) startContainer(podSandboxID string, podSandboxConfig *runtimeapi.PodSandboxConfig, spec *startSpec, pod *v1.Pod, podStatus *kubecontainer.PodStatus, pullSecrets []v1.Secret, podIP string, podIPs []string) (string, error) {
    // Generate the container config: first resolve the target ID of the ephemeral container.
    target, err := spec.getTargetID(podStatus)
    if err != nil {
        // Report only the gRPC status message: record a warning event on the pod
        // and return the message together with ErrCreateContainerConfig.
        s, _ := grpcstatus.FromError(err)
        m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", s.Message())
        return s.Message(), ErrCreateContainerConfig

    containerConfig, cleanupAction, err := m.generateContainerConfig(container, pod, restartCount, podIP, imageRef, podIPs, target)
    // The cleanup callback is deferred before the error check, so it also runs
    // when config generation failed but still returned a callback.
    if cleanupAction != nil {
        defer cleanupAction()
    if err != nil {
        // Same reporting as above: warning event plus ErrCreateContainerConfig.
        s, _ := grpcstatus.FromError(err)
        m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", s.Message())
        return s.Message(), ErrCreateContainerConfig




  1. 生成创建容器所需配置
  2. 根据镜像名称,调用容器运行时,获取运行容器启动命令的用户
  3. 检测运行容器启动命令的用户是否违反pod安全上下文设置(runAsNonRoot: true时,不允许容器以root用户启动)
  4. 生成日志目录(格式为: /var/log/pods/<pod namespace>_<pod name>_<pod uid>/<容器名称>)
  5. 针对windows平台,定义额外配置
  6. 定义容器内的环境变量
  7. 组装配置项并返回



// generateContainerConfig generates container config for kubelet runtime v1.
//
// It returns the assembled *runtimeapi.ContainerConfig, the cleanup callback
// obtained from GenerateRunContainerOptions, and an error. If generating the
// run options fails, both the config and the callback are nil; on any later
// failure the config is nil but the callback is still returned with the error.
func (m *kubeGenericRuntimeManager) generateContainerConfig(container *v1.Container, pod *v1.Pod, restartCount int, podIP, imageRef string, podIPs []string, nsTarget *kubecontainer.ContainerID) (*runtimeapi.ContainerConfig, func(), error) {
    // Generate the options needed to create the container: environment variables, mounts,
    // host devices mapped into the container, port mappings, annotations, whether the
    // root filesystem is read-only, the hostname, and so on.
    opts, cleanupAction, err := m.runtimeHelper.GenerateRunContainerOptions(pod, container, podIP, podIPs)
    if err != nil {
        return nil, nil, err

    // Ask the container runtime, by image name, which user runs the container's start command.
    uid, username, err := m.getImageUser(container.Image)
    if err != nil {
        return nil, cleanupAction, err

    // Verify RunAsNonRoot. Non-root verification only supports numeric user.
    // Check whether that user violates the pod security context
    // (with runAsNonRoot: true the container must not start as root).
    if err := verifyRunAsNonRoot(pod, container, uid, username); err != nil {
        return nil, cleanupAction, err

    // Resolve the container's start command and arguments, expanded against opts.Envs.
    command, args := kubecontainer.ExpandContainerCommandAndArgs(container, opts.Envs)

    // Create the log directory (format: /var/log/pods/<pod namespace>_<pod name>_<pod uid>/<container name>).
    logDir := BuildContainerLogsDirectory(pod.Namespace, pod.Name, pod.UID, container.Name)
    err = m.osInterface.MkdirAll(logDir, 0755)
    if err != nil {
        return nil, cleanupAction, fmt.Errorf("create container log directory for container %s failed: %v", container.Name, err)

    // Container log path inside the pod's log directory: <container name>/<restart count>.log
    containerLogsPath := buildContainerLogsPath(container.Name, restartCount)

    // The restart count is passed to the runtime as the metadata Attempt value.
    restartCountUint32 := uint32(restartCount)
    // Assemble the container config.
    config := &runtimeapi.ContainerConfig{
        Metadata: &runtimeapi.ContainerMetadata{
            Name:    container.Name,
            Attempt: restartCountUint32,
        Image:       &runtimeapi.ImageSpec{Image: imageRef},
        Command:     command,
        Args:        args,
        WorkingDir:  container.WorkingDir,
        Labels:      newContainerLabels(container, pod),
        Annotations: newContainerAnnotations(container, pod, restartCount, opts),
        Devices:     makeDevices(opts),
        Mounts:      m.makeMounts(opts, container),
        LogPath:     containerLogsPath,
        Stdin:       container.Stdin,
        StdinOnce:   container.StdinOnce,
        Tty:         container.TTY,

    // set platform specific configurations.
    // NOTE(review): the original note here mentions only extra configuration for Windows;
    // nothing in this call is visibly Windows-only — confirm against the helper.
    if err := m.applyPlatformSpecificContainerConfig(config, container, pod, uid, username, nsTarget); err != nil {
        return nil, cleanupAction, err

    // set environment variables
    // Copy each entry of opts.Envs into a runtimeapi.KeyValue for the container.
    envs := make([]*runtimeapi.KeyValue, len(opts.Envs))
    for idx := range opts.Envs {
        e := opts.Envs[idx]
        envs[idx] = &runtimeapi.KeyValue{
            Key:   e.Name,
            Value: e.Value,
    config.Envs = envs

    return config, cleanupAction, nil



  1. 返回值一containerConfig: ContainerConfig对象,容器配置属性。


// ContainerConfig holds all the required and optional fields for creating a
// container.
type ContainerConfig struct {
    // Metadata of the container. This information will uniquely identify the
    // container, and the runtime should leverage this to ensure correct
    // operation. The runtime may also use this information to improve UX, such
    // as by constructing a readable name.
    Metadata *ContainerMetadata `protobuf:"bytes,1,opt,name=metadata,proto3" json:"metadata,omitempty"`
    // Image to use.
    Image *ImageSpec `protobuf:"bytes,2,opt,name=image,proto3" json:"image,omitempty"`
    // Command to execute (i.e., entrypoint for docker)
    Command []string `protobuf:"bytes,3,rep,name=command,proto3" json:"command,omitempty"`
    // Args for the Command (i.e., command for docker)
    Args []string `protobuf:"bytes,4,rep,name=args,proto3" json:"args,omitempty"`
    // Current working directory of the command.
    WorkingDir string `protobuf:"bytes,5,opt,name=working_dir,json=workingDir,proto3" json:"working_dir,omitempty"`
    // List of environment variable to set in the container.
    Envs []*KeyValue `protobuf:"bytes,6,rep,name=envs,proto3" json:"envs,omitempty"`
    // Mounts for the container.
    Mounts []*Mount `protobuf:"bytes,7,rep,name=mounts,proto3" json:"mounts,omitempty"`
    // Devices for the container.
    Devices []*Device `protobuf:"bytes,8,rep,name=devices,proto3" json:"devices,omitempty"`
    // Key-value pairs that may be used to scope and select individual resources.
    // Label keys are of the form:
    //     label-key ::= prefixed-name | name
    //     prefixed-name ::= prefix '/' name
    //     prefix ::= DNS_SUBDOMAIN
    //     name ::= DNS_LABEL
    Labels map[string]string `protobuf:"bytes,9,rep,name=labels,proto3" json:"labels,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"`
    // Unstructured key-value map that may be used by the kubelet to store and
    // retrieve arbitrary metadata.
    // Annotations MUST NOT be altered by the runtime; the annotations stored
    // here MUST be returned in the ContainerStatus associated with the container
    // this ContainerConfig creates.
    // In general, in order to preserve a well-defined interface between the
    // kubelet and the container runtime, annotations SHOULD NOT influence
    // runtime behaviour.
    Annotations map[string]string `protobuf:"bytes,10,rep,name=annotations,proto3" json:"annotations,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"`
    // Path relative to PodSandboxConfig.LogDirectory for container to store
    // the log (STDOUT and STDERR) on the host.
    // E.g.,
    //     PodSandboxConfig.LogDirectory = `/var/log/pods/<podUID>/`
    //     ContainerConfig.LogPath = `containerName/Instance#.log`
    // WARNING: Log management and how kubelet should interface with the
    // container logs are under active discussion in
    // https://issues.k8s.io/24677. There *may* be future change of direction
    // for logging as the discussion carries on.
    LogPath string `protobuf:"bytes,11,opt,name=log_path,json=logPath,proto3" json:"log_path,omitempty"`
    // Variables for interactive containers, these have very specialized
    // use-cases (e.g. debugging).
    // TODO: Determine if we need to continue supporting these fields that are
    // part of Kubernetes's Container Spec.
    Stdin     bool `protobuf:"varint,12,opt,name=stdin,proto3" json:"stdin,omitempty"`
    StdinOnce bool `protobuf:"varint,13,opt,name=stdin_once,json=stdinOnce,proto3" json:"stdin_once,omitempty"`
    Tty       bool `protobuf:"varint,14,opt,name=tty,proto3" json:"tty,omitempty"`
    // Configuration specific to Linux containers.
    Linux *LinuxContainerConfig `protobuf:"bytes,15,opt,name=linux,proto3" json:"linux,omitempty"`
    // Configuration specific to Windows containers.
    Windows              *WindowsContainerConfig `protobuf:"bytes,16,opt,name=windows,proto3" json:"windows,omitempty"`
    // The XXX_ fields below have no protobuf tag and are tagged json:"-",
    // so they are left out of the JSON encoding.
    XXX_NoUnkeyedLiteral struct{}                `json:"-"`
    XXX_sizecache        int32                   `json:"-"`


// LinuxContainerResources groups the Linux resource settings of a container:
// CPU CFS period, quota and shares, the memory limit, the OOM score
// adjustment, cpuset CPUs/memory nodes, and hugepage limits.
type LinuxContainerResources struct {
    // CPU CFS (Completely Fair Scheduler) period. Default: 0 (not specified).
    CpuPeriod int64 `protobuf:"varint,1,opt,name=cpu_period,json=cpuPeriod,proto3" json:"cpu_period,omitempty"`
    // CPU CFS (Completely Fair Scheduler) quota. Default: 0 (not specified).
    CpuQuota int64 `protobuf:"varint,2,opt,name=cpu_quota,json=cpuQuota,proto3" json:"cpu_quota,omitempty"`
    // CPU shares (relative weight vs. other containers). Default: 0 (not specified).
    CpuShares int64 `protobuf:"varint,3,opt,name=cpu_shares,json=cpuShares,proto3" json:"cpu_shares,omitempty"`
    // Memory limit in bytes. Default: 0 (not specified).
    MemoryLimitInBytes int64 `protobuf:"varint,4,opt,name=memory_limit_in_bytes,json=memoryLimitInBytes,proto3" json:"memory_limit_in_bytes,omitempty"`
    // OOMScoreAdj adjusts the oom-killer score. Default: 0 (not specified).
    OomScoreAdj int64 `protobuf:"varint,5,opt,name=oom_score_adj,json=oomScoreAdj,proto3" json:"oom_score_adj,omitempty"`
    // CpusetCpus constrains the allowed set of logical CPUs. Default: "" (not specified).
    CpusetCpus string `protobuf:"bytes,6,opt,name=cpuset_cpus,json=cpusetCpus,proto3" json:"cpuset_cpus,omitempty"`
    // CpusetMems constrains the allowed set of memory nodes. Default: "" (not specified).
    CpusetMems string `protobuf:"bytes,7,opt,name=cpuset_mems,json=cpusetMems,proto3" json:"cpuset_mems,omitempty"`
    // List of HugepageLimits to limit the HugeTLB usage of container per page size. Default: nil (not specified).
    HugepageLimits       []*HugepageLimit `protobuf:"bytes,8,rep,name=hugepage_limits,json=hugepageLimits,proto3" json:"hugepage_limits,omitempty"`
    XXX_NoUnkeyedLiteral struct{}         `json:"-"`
    XXX_sizecache        int32            `json:"-"`


// LinuxContainerSecurityContext holds linux security configuration that will be applied to a container.
type LinuxContainerSecurityContext struct {
    // Capabilities to add or drop.
    Capabilities *Capability `protobuf:"bytes,1,opt,name=capabilities,proto3" json:"capabilities,omitempty"`
    // In privileged mode the following apply:
    // 1. All Linux capabilities are added.
    // 2. Sensitive paths (e.g. kernel module paths in sysfs) are not masked.
    // 3. Any sysfs and procfs mounts are mounted read-write.
    // 4. AppArmor is not configured.
    // 5. Seccomp is not configured.
    // 6. The device cgroup does not restrict access to any device.
    // 7. All devices in the host's /dev are available inside the container.
    // 8. SELinux is not configured.
    Privileged bool `protobuf:"varint,2,opt,name=privileged,proto3" json:"privileged,omitempty"`
    // Configurations for the container's namespaces.
    // Only used if the container uses namespace for isolation.
    NamespaceOptions *NamespaceOption `protobuf:"bytes,3,opt,name=namespace_options,json=namespaceOptions,proto3" json:"namespace_options,omitempty"`
    // SELinux context to be optionally applied.
    SelinuxOptions *SELinuxOption `protobuf:"bytes,4,opt,name=selinux_options,json=selinuxOptions,proto3" json:"selinux_options,omitempty"`
    // User (numeric user ID) to run the container as.
    RunAsUser *Int64Value `protobuf:"bytes,5,opt,name=run_as_user,json=runAsUser,proto3" json:"run_as_user,omitempty"`
    // GID to run the container process as. run_as_group should only be specified
    // when run_as_user or run_as_username is specified; otherwise, the runtime
    // MUST error.
    RunAsGroup *Int64Value `protobuf:"bytes,12,opt,name=run_as_group,json=runAsGroup,proto3" json:"run_as_group,omitempty"`
    // User (by name) to run the container as; the user must already exist and
    // is not created automatically.
    RunAsUsername string `protobuf:"bytes,6,opt,name=run_as_username,json=runAsUsername,proto3" json:"run_as_username,omitempty"`
    // Whether the root filesystem is read-only.
    ReadonlyRootfs bool `protobuf:"varint,7,opt,name=readonly_rootfs,json=readonlyRootfs,proto3" json:"readonly_rootfs,omitempty"`
    // List of groups applied to the first process run in the container, in
    // addition to the container's primary GID.
    SupplementalGroups []int64 `protobuf:"varint,8,rep,packed,name=supplemental_groups,json=supplementalGroups,proto3" json:"supplemental_groups,omitempty"`
    // AppArmor profile for the container, candidate values are:
    // * runtime/default: equivalent to not specifying a profile.
    // * unconfined: no profiles are loaded
    // * localhost/<profile_name>: profile loaded on the node
    //    (localhost) by name. The possible profile names are detailed at
    //    http://wiki.apparmor.net/index.php/AppArmor_Core_Policy_Reference
    ApparmorProfile string `protobuf:"bytes,9,opt,name=apparmor_profile,json=apparmorProfile,proto3" json:"apparmor_profile,omitempty"`
    // Seccomp profile for the container, candidate values are:
    // * runtime/default: the default profile for the container runtime
    // * unconfined: unconfined profile, ie, no seccomp sandboxing
    // * localhost/<full-path-to-profile>: the profile installed on the node.
    //   <full-path-to-profile> is the full path of the profile.
    // Default: "", which is identical with unconfined.
    SeccompProfilePath string `protobuf:"bytes,10,opt,name=seccomp_profile_path,json=seccompProfilePath,proto3" json:"seccomp_profile_path,omitempty"`
    // no_new_privs defines if the flag for no_new_privs should be set on the
    // container.
    NoNewPrivs bool `protobuf:"varint,11,opt,name=no_new_privs,json=noNewPrivs,proto3" json:"no_new_privs,omitempty"`
    // Paths that should be hidden (masked).
    MaskedPaths []string `protobuf:"bytes,13,rep,name=masked_paths,json=maskedPaths,proto3" json:"masked_paths,omitempty"`
    // readonly_paths is a slice of paths that should be set as readonly by the
    // container runtime, this can be passed directly to the OCI spec.
    ReadonlyPaths        []string `protobuf:"bytes,14,rep,name=readonly_paths,json=readonlyPaths,proto3" json:"readonly_paths,omitempty"`
    XXX_NoUnkeyedLiteral struct{} `json:"-"`
    XXX_sizecache        int32    `json:"-"`
  1. 返回值二cleanupAction: 容器带有子路径的卷成功运行或启动失败后的回调函数

  2. 返回值三为异常(error)






// GenerateRunContainerOptions generates the RunContainerOptions, which can be used by
// the container runtime to set parameters for launching a container.
//
// Besides the options it returns the cleanup callback produced by makeMounts
// and an error. The callback is nil for every failure that happens before
// makeMounts is called.
func (kl *Kubelet) GenerateRunContainerOptions(pod *v1.Pod, container *v1.Container, podIP string, podIPs []string) (*kubecontainer.RunContainerOptions, func(), error) {
    // Start from the options supplied by the container manager; the fields
    // below are then set or appended on top of them.
    opts, err := kl.containerManager.GetResources(pod, container)
    if err != nil {
        return nil, nil, err

    // Determine the pod's hostname and host domain name.
    hostname, hostDomainName, err := kl.GeneratePodHostNameAndDomain(pod)
    if err != nil {
        return nil, nil, err
    opts.Hostname = hostname
    podName := volumeutil.GetUniquePodName(pod)
    volumes := kl.volumeManager.GetMountedVolumesForPod(podName)

    opts.PortMappings = kubecontainer.MakePortMappings(container)

    // Block volumes are handed to the container as devices.
    blkutil := volumepathhandler.NewBlockVolumePathHandler()
    blkVolumes, err := kl.makeBlockVolumes(pod, container, volumes, blkutil)
    if err != nil {
        return nil, nil, err
    opts.Devices = append(opts.Devices, blkVolumes...)

    envs, err := kl.makeEnvironmentVariables(pod, container, podIP, podIPs)
    if err != nil {
        return nil, nil, err
    opts.Envs = append(opts.Envs, envs...)

    // only podIPs is sent to makeMounts, as podIPs is populated even if dual-stack feature flag is not enabled.
    mounts, cleanupAction, err := makeMounts(pod, kl.getPodDir(pod.UID), container, hostname, hostDomainName, podIPs, volumes, kl.hostutil, kl.subpather, opts.Envs)
    if err != nil {
        return nil, cleanupAction, err
    opts.Mounts = append(opts.Mounts, mounts...)

    // adding TerminationMessagePath on Windows is only allowed if ContainerD is used. Individual files cannot
    // be mounted as volumes using Docker for Windows.
    supportsSingleFileMapping := kl.containerRuntime.SupportsSingleFileMapping()
    if len(container.TerminationMessagePath) != 0 && supportsSingleFileMapping {
        p := kl.getPodContainerDir(pod.UID, container.Name)
        // Best effort: a failure to create the directory is only logged and
        // PodContainerDir is left unset.
        if err := os.MkdirAll(p, 0750); err != nil {
            klog.Errorf("Error on creating %q: %v", p, err)
        } else {
            opts.PodContainerDir = p

    // only do this check if the experimental behavior is enabled, otherwise allow it to default to false
    if kl.experimentalHostUserNamespaceDefaulting {
        opts.EnableHostUserNamespace = kl.enableHostUserNamespace(pod)

    return opts, cleanupAction, nil









原生的块设备(Raw Block Devices)还通常由能自己实现某种存储服务的软件(软件定义的存储系统)使用。





  1. 创建pvc,其中存储类kubernetes-csi-rbd-sc为ceph rbd类型
$ cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: my-pvc
spec:
  accessModes:
    - ReadWriteMany
  volumeMode: Block
  storageClassName: kubernetes-csi-rbd-sc
  resources:
    requests:
      storage: 1Gi
EOF
  1. pod定义使用my-pvc
$ cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: my-pod
spec:
  containers:
    - name: my-container
      image: busybox
      command:
        - sleep
        - "3600"
      volumeDevices:
        - devicePath: /dev/block
          name: my-volume
      imagePullPolicy: IfNotPresent
  volumes:
    - name: my-volume
      persistentVolumeClaim:
        claimName: my-pvc
EOF
  1. 进入pod内查看块设备
$ kubectl exec -it my-pod -- sh
/ # ls -l /dev/block
brwxrwxrwx    1 root     disk      252, 352 Nov 22 05:48 /dev/block





// makeBlockVolumes builds the list of block devices to expose to the container,
// one candidate per entry in container.VolumeDevices.
//
// For each entry the DevicePath must be absolute and podVolumes must hold a
// volume of the same name with a non-nil BlockVolumeMapper; otherwise an error
// is returned. A device is added only when the symlink under the pod device
// map path exists; its host path is that symlink and its container path is the
// declared DevicePath.
func (kl *Kubelet) makeBlockVolumes(pod *v1.Pod, container *v1.Container, podVolumes kubecontainer.VolumeMap, blkutil volumepathhandler.BlockVolumePathHandler) ([]kubecontainer.DeviceInfo, error) {
    var devices []kubecontainer.DeviceInfo
    for _, device := range container.VolumeDevices {
        // check path is absolute
        if !filepath.IsAbs(device.DevicePath) {
            return nil, fmt.Errorf("error DevicePath `%s` must be an absolute path", device.DevicePath)
        vol, ok := podVolumes[device.Name]
        if !ok || vol.BlockVolumeMapper == nil {
            klog.Errorf("Block volume cannot be satisfied for container %q, because the volume is missing or the volume mapper is nil: %+v", container.Name, device)
            return nil, fmt.Errorf("cannot find volume %q to pass into container %q", device.Name, container.Name)
        // Get a symbolic link associated to a block device under pod device path
        dirPath, volName := vol.BlockVolumeMapper.GetPodDeviceMapPath()
        symlinkPath := path.Join(dirPath, volName)
        if islinkExist, checkErr := blkutil.IsSymlinkExist(symlinkPath); checkErr != nil {
            return nil, checkErr
        } else if islinkExist {
            // Check readOnly in PVCVolumeSource and set read only permission if it's true.
            // The default permission string is "mrw"; read-only volumes get "r".
            permission := "mrw"
            if vol.ReadOnly {
                permission = "r"
            klog.V(4).Infof("Device will be attached to container %q. Path on host: %v", container.Name, symlinkPath)
            devices = append(devices, kubecontainer.DeviceInfo{PathOnHost: symlinkPath, PathInContainer: device.DevicePath, Permissions: permission})

    return devices, nil


  1. 判断声明的块设备挂载路径(devicePath)是否为绝对路径(如: /dev/block),如果非绝对路径(block)返回异常。
  2. 判断卷组(spec.volumes)内是否含有该设备的pvc(卷声明)
  3. 映射主机上块设备路径与容器内路径:


$ ls -l /var/lib/kubelet/pods/66d92c5f-ef2f-40a4-9e6c-bc46235db4cb/volumeDevices/kubernetes.io~csi/pvc-26cf725d-be5b-4ba8-9d59-540a35014df1
lrwxrwxrwx 1 root root 142 Nov 22 13:48 /var/lib/kubelet/pods/66d92c5f-ef2f-40a4-9e6c-bc46235db4cb/volumeDevices/kubernetes.io~csi/pvc-26cf725d-be5b-4ba8-9d59-540a35014df1 -> /var/lib/kubelet/plugins/kubernetes.io/csi/volumeDevices/publish/pvc-26cf725d-be5b-4ba8-9d59-540a35014df1/66d92c5f-ef2f-40a4-9e6c-bc46235db4cb
$ ls -l /var/lib/kubelet/plugins/kubernetes.io/csi/volumeDevices/publish/pvc-26cf725d-be5b-4ba8-9d59-540a35014df1/66d92c5f-ef2f-40a4-9e6c-bc46235db4cb
brwxrwxrwx 1 root disk 252, 352 Nov 22 13:48 /var/lib/kubelet/plugins/kubernetes.io/csi/volumeDevices/publish/pvc-26cf725d-be5b-4ba8-9d59-540a35014df1/66d92c5f-ef2f-40a4-9e6c-bc46235db4cb


src: /{kubelet data dir}/pods/{podUid}/{DefaultKubeletVolumeDevicesDirName}/{escapeQualifiedPluginName}/, {volumeName}
dst: /var/lib/kubelet/plugins/kubernetes.io/{PluginName}/{DefaultKubeletVolumeDevicesDirName}/{volumePluginDependentPath}/{pod uuid}


$ fdisk  /var/lib/kubelet/plugins/kubernetes.io/csi/volumeDevices/publish/pvc-26cf725d-be5b-4ba8-9d59-540a35014df1/66d92c5f-ef2f-40a4-9e6c-bc46235db4cb
Welcome to fdisk (util-linux 2.23.2).

Changes will remain in memory only, until you decide to write them.
Be careful before using the write command.

Command (m for help): n
Partition type:
   p   primary (0 primary, 0 extended, 4 free)
   e   extended
Select (default p): p
Partition number (1-4, default 1): 1
First sector (2048-2097151, default 2048):
Using default value 2048
Last sector, +sectors or +size{K,M,G} (2048-2097151, default 2097151):
Using default value 2097151
Partition 1 of type Linux and of size 1023 MiB is set

Command (m for help): w
The partition table has been altered!

Calling ioctl() to re-read partition table.
Syncing disks.



由于没有看过CSI的源码,这里推测:这个块设备应该是起到桥梁的作用(桥接CSI Agent与pod容器内的原生块设备),本质为链接而非实体,块设备实体由CSI管理。





  1. 判断pod下卷组是否含有容器所定义的卷,不存在返回异常
  2. 如果卷支持SELinux,并且它还没有被重新标记,而且它不是只读卷,重新标记它并将其标记为已标记卷
  3. 判断卷的挂载路径(volumeMounts.mountPath)是否为空,为空的话返回异常
  4. 解析volumeMounts.subPath或volumeMounts.subPathExpr(同一个卷只能存在其中一个字段,否则异常返回):
  5. 解析pod的spec.hostAliases数组,写入容器的/etc/hosts内。如果该Pod使用主机网络命名空间,主机的/etc/hosts内容也将写入容器的/etc/hosts

值得注意的是,NSA与CISA发布的《Kubernetes加固指南》认为子路径存在安全隐患,不建议使用。



// makeMounts determines the mount points for the given container.
//
// It walks container.VolumeMounts, resolves each one against podVolumes, and
// returns the resulting mounts, a cleanup callback, and an error. The /etc/hosts
// mount is appended when the conditions described below hold.
//
// cleanupAction starts out nil and is only assigned by
// subpather.PrepareSafeSubpath, so it stays nil when no mount uses a subPath.
// NOTE(review): it is reassigned on every subPath mount in the loop, so only
// the value from the last call is returned — confirm that this callback also
// covers the subPaths prepared for earlier mounts.
func makeMounts(pod *v1.Pod, podDir string, container *v1.Container, hostName, hostDomain string, podIPs []string, podVolumes kubecontainer.VolumeMap, hu hostutil.HostUtils, subpather subpath.Interface, expandEnvs []kubecontainer.EnvVar) ([]kubecontainer.Mount, func(), error) {
    // Kubernetes only mounts on /etc/hosts if:
    // - container is not an infrastructure (pause) container
    // - container is not already mounting on /etc/hosts
    // - OS is not Windows
    // Kubernetes will not mount /etc/hosts if:
    // - when the Pod sandbox is being created, its IP is still unknown. Hence, PodIP will not have been set.
    mountEtcHostsFile := len(podIPs) > 0 && runtime.GOOS != "windows"
    klog.V(3).Infof("container: %v/%v/%v podIPs: %q creating hosts mount: %v", pod.Namespace, pod.Name, container.Name, podIPs, mountEtcHostsFile)
    mounts := []kubecontainer.Mount{}
    var cleanupAction func()
    for i, mount := range container.VolumeMounts {
        // do not mount /etc/hosts if container is already mounting on the path
        mountEtcHostsFile = mountEtcHostsFile && (mount.MountPath != etcHostsPath)
        vol, ok := podVolumes[mount.Name]
        if !ok || vol.Mounter == nil {
            klog.Errorf("Mount cannot be satisfied for container %q, because the volume is missing (ok=%v) or the volume mounter (vol.Mounter) is nil (vol=%+v): %+v", container.Name, ok, vol, mount)
            return nil, cleanupAction, fmt.Errorf("cannot find volume %q to mount into container %q", mount.Name, container.Name)

        relabelVolume := false
        // If the volume supports SELinux and it has not been
        // relabeled already and it is not a read-only volume,
        // relabel it and mark it as labeled
        if vol.Mounter.GetAttributes().Managed && vol.Mounter.GetAttributes().SupportsSELinux && !vol.SELinuxLabeled {
            vol.SELinuxLabeled = true
            relabelVolume = true
        // Host-side path of the volume, as reported for its mounter.
        hostPath, err := volumeutil.GetPath(vol.Mounter)
        if err != nil {
            return nil, cleanupAction, err

        // subPath is mount.SubPath unless mount.SubPathExpr is set, in which
        // case it is the expression expanded against expandEnvs.
        subPath := mount.SubPath
        if mount.SubPathExpr != "" {
            if !utilfeature.DefaultFeatureGate.Enabled(features.VolumeSubpath) {
                return nil, cleanupAction, fmt.Errorf("volume subpaths are disabled")

            if !utilfeature.DefaultFeatureGate.Enabled(features.VolumeSubpathEnvExpansion) {
                return nil, cleanupAction, fmt.Errorf("volume subpath expansion is disabled")

            subPath, err = kubecontainer.ExpandContainerVolumeMounts(mount, expandEnvs)

            if err != nil {
                return nil, cleanupAction, err

        if subPath != "" {
            if !utilfeature.DefaultFeatureGate.Enabled(features.VolumeSubpath) {
                return nil, cleanupAction, fmt.Errorf("volume subpaths are disabled")

            // A subPath must be relative and must pass the no-backsteps validation.
            if filepath.IsAbs(subPath) {
                return nil, cleanupAction, fmt.Errorf("error SubPath `%s` must not be an absolute path", subPath)

            err = volumevalidation.ValidatePathNoBacksteps(subPath)
            if err != nil {
                return nil, cleanupAction, fmt.Errorf("unable to provision SubPath `%s`: %v", subPath, err)

            volumePath := hostPath
            hostPath = filepath.Join(volumePath, subPath)

            // A PathExists error is only logged; the directory is then not created here.
            if subPathExists, err := hu.PathExists(hostPath); err != nil {
                klog.Errorf("Could not determine if subPath %s exists; will not attempt to change its permissions", hostPath)
            } else if !subPathExists {
                // Create the sub path now because if it's auto-created later when referenced, it may have an
                // incorrect ownership and mode. For example, the sub path directory must have at least g+rwx
                // when the pod specifies an fsGroup, and if the directory is not created here, Docker will
                // later auto-create it with the incorrect mode 0750
                // Make extra care not to escape the volume!
                perm, err := hu.GetMode(volumePath)
                if err != nil {
                    return nil, cleanupAction, err
                if err := subpather.SafeMakeDir(subPath, volumePath, perm); err != nil {
                    // Don't pass detailed error back to the user because it could give information about host filesystem
                    klog.Errorf("failed to create subPath directory for volumeMount %q of container %q: %v", mount.Name, container.Name, err)
                    return nil, cleanupAction, fmt.Errorf("failed to create subPath directory for volumeMount %q of container %q", mount.Name, container.Name)
            // PrepareSafeSubpath replaces hostPath with the path to mount and
            // supplies the cleanup callback returned to the caller.
            hostPath, cleanupAction, err = subpather.PrepareSafeSubpath(subpath.Subpath{
                VolumeMountIndex: i,
                Path:             hostPath,
                VolumeName:       vol.InnerVolumeSpecName,
                VolumePath:       volumePath,
                PodDir:           podDir,
                ContainerName:    container.Name,
            if err != nil {
                // Don't pass detailed error back to the user because it could give information about host filesystem
                klog.Errorf("failed to prepare subPath for volumeMount %q of container %q: %v", mount.Name, container.Name, err)
                return nil, cleanupAction, fmt.Errorf("failed to prepare subPath for volumeMount %q of container %q", mount.Name, container.Name)

        // Docker Volume Mounts fail on Windows if it is not of the form C:/
        if volumeutil.IsWindowsLocalPath(runtime.GOOS, hostPath) {
            hostPath = volumeutil.MakeAbsolutePath(runtime.GOOS, hostPath)

        containerPath := mount.MountPath
        // IsAbs returns false for UNC path/SMB shares/named pipes in Windows. So check for those specifically and skip MakeAbsolutePath
        if !volumeutil.IsWindowsUNCPath(runtime.GOOS, containerPath) && !filepath.IsAbs(containerPath) {
            containerPath = volumeutil.MakeAbsolutePath(runtime.GOOS, containerPath)

        propagation, err := translateMountPropagation(mount.MountPropagation)
        if err != nil {
            return nil, cleanupAction, err
        klog.V(5).Infof("Pod %q container %q mount %q has propagation %q", format.Pod(pod), container.Name, mount.Name, propagation)

        // The mount is read-only if either the volumeMount or the volume's
        // own attributes request it.
        mustMountRO := vol.Mounter.GetAttributes().ReadOnly

        mounts = append(mounts, kubecontainer.Mount{
            Name:           mount.Name,
            ContainerPath:  containerPath,
            HostPath:       hostPath,
            ReadOnly:       mount.ReadOnly || mustMountRO,
            SELinuxRelabel: relabelVolume,
            Propagation:    propagation,
    // Append the /etc/hosts mount, built from the pod's host aliases, when it
    // is still wanted after the loop.
    if mountEtcHostsFile {
        hostAliases := pod.Spec.HostAliases
        hostsMount, err := makeHostsMount(podDir, podIPs, hostName, hostDomain, hostAliases, pod.Spec.HostNetwork)
        if err != nil {
            return nil, cleanupAction, err
        mounts = append(mounts, *hostsMount)
    return mounts, cleanupAction, nil



  1. 生成创建容器所需配置
  2. 根据镜像名称,调用容器运行时,获取运行容器启动命令的用户
  3. 检测运行容器启动命令的用户是否违反pod安全上下文设置(runAsNonRoot: true时,不允许容器以root用户启动)
  4. 生成日志目录(格式为: /var/log/pods/<pod namespace>_<pod name>_<pod uid>/<容器名称>)
  5. 针对windows平台,定义额外配置
  6. 定义容器内的环境变量
  7. 组装配置项并返回


Raw Block Volume 支持进入 Beta

上一篇 下一篇

