Cni terway-ENI独占模式源码详解

2024-02-22  本文已影响0人  Teddy_b

Terway

ENI独占模式

源码分析

// podNetworkType maps the daemon mode (and, in VPC mode, the pod itself) to
// the pod network type.
//
//   - daemon.ModeENIMultiIP yields daemon.PodNetworkTypeENIMultiIP.
//   - daemon.ModeVPC yields daemon.PodNetworkTypeVPCENI when the pod opts in
//     to an ENI (see below), otherwise daemon.PodNetworkTypeVPCIP.
//   - daemon.ModeENIOnly yields daemon.PodNetworkTypeVPCENI.
//
// NOTE(review): as excerpted here there is no return after the switch, so the
// result for any other daemonMode is not shown — confirm against the full source.
func podNetworkType(daemonMode string, pod *corev1.Pod) string {
    switch daemonMode {
    case daemon.ModeENIMultiIP:
        return daemon.PodNetworkTypeENIMultiIP
    case daemon.ModeVPC:
        podAnnotation := pod.GetAnnotations()
        useENI := false
        // The podNeedEni annotation opts in unless its value is empty,
        // ConditionFalse or "0".
        if needEni, ok := podAnnotation[podNeedEni]; ok && (needEni != "" && needEni != ConditionFalse && needEni != "0") {
            useENI = true
        }

        // Any container whose resource requests contain
        // deviceplugin.ENIResName also opts the pod in.
        for _, c := range pod.Spec.Containers {
            if _, ok := c.Resources.Requests[deviceplugin.ENIResName]; ok {
                useENI = true
                break
            }
        }

        if useENI {
            return daemon.PodNetworkTypeVPCENI
        }
        return daemon.PodNetworkTypeVPCIP
    case daemon.ModeENIOnly:
        return daemon.PodNetworkTypeVPCENI
    }

}

ENI独占模式时,对应的POD网络模式是VPC-ENI,此时的网络资源请求类型就不一样了

switch pod.PodNetworkType {

    case daemon.PodNetworkTypeVPCENI:
        reply.IPType = rpc.IPType_TypeVPCENI

        else {
            req := &eni.LocalIPRequest{}

            resourceRequests = append(resourceRequests, req)
        }

    }

对于VPC-ENI类型,可以看到此时的网络资源请求类型是LocalIPRequest

// Allocate starts an asynchronous IP allocation for the given CNI request.
//
// In the portion shown it returns (nil, Full trace) when IPv4 is enabled, no
// IPv4 address is currently available, and the number of known plus pending
// IPv4 addresses has reached l.cap. Otherwise it returns a channel that is
// handed to allocWorker, together with a nil trace slice.
//
// NOTE(review): this is an abridged excerpt — `lo` is not declared here
// (presumably the request converted to *LocalIPRequest), expectV6 is never
// changed, the callback body is elided, and locking around l.* is not shown.
func (l *Local) Allocate(ctx context.Context, cni *daemon.CNI, request ResourceRequest) (chan *AllocResp, []Trace) {
    
    expectV4 := 0
    expectV6 := 0

    if l.enableIPv4 {
        ipv4 := l.ipv4.PeekAvailable(cni.PodID, lo.IPv4)
        // No free IP and no room for another one: report Full right away.
        if ipv4 == nil && len(l.ipv4)+l.allocatingV4 >= l.cap {
            return nil, []Trace{{Condition: Full}}
        } else if ipv4 == nil {
            // No free IP but still below capacity: ask for one more.
            expectV4 = 1
        }
    }

    // Record the pending demand, then wake everything waiting on l.cond.
    l.allocatingV4 += expectV4

    l.cond.Broadcast()

    respCh := make(chan *AllocResp)

    // The actual pick-up of an IP happens in allocWorker on its own goroutine.
    go l.allocWorker(ctx, cni, lo, respCh, func() {
            ...
    })

    return respCh, nil
}

LocalIPRequest这种类型分配IP的流程相对复杂一点,这里它会维护一个IP可用集合,分配IP的时候就是遍历这个集合,从中获取可用的IP

所谓可用的IP就是集合中还没绑定POD的那些IP

// PeekAvailable returns an entry of the set whose status is ipStatusValid and
// whose podID is empty, or nil when no such entry exists.
//
// NOTE(review): podID and prefer are not used in this excerpt; any preference
// or pod-matching logic is presumably elided — confirm against the full source.
func (s Set) PeekAvailable(podID string, prefer netip.Addr) *IP {
    
    for _, v := range s {
        // An entry is available when it is valid and not bound to any pod.
        if  v.status == ipStatusValid  && v.podID == ""{
            return v
        }
    }
    return nil
}

如果集合中没有可用IP,它会通过条件变量(l.cond)通知其它协程帮它分配IP,然后自己等待IP分配好之后,再遍历集合去获取可用的IP

// allocWorker loops until an available IP can be picked from the local set.
//
// In the portion shown, when IPv4 is enabled it peeks the IPv4 set; if nothing
// is available it waits on l.cond and retries, otherwise it copies the address
// into ip.IPv4 and returns.
//
// NOTE(review): this is an abridged excerpt — locking, the use of resp, respCh
// and onErrLocked, and the declaration of ipv4 are not shown here.
func (l *Local) allocWorker(ctx context.Context, cni *daemon.CNI, request *LocalIPRequest, respCh chan *AllocResp, onErrLocked func()) {

    for {
        resp := &AllocResp{}

        var ip types.IPSet2
        if l.enableIPv4 {
            ipv4 = l.ipv4.PeekAvailable(cni.PodID, request.IPv4)
            if ipv4 == nil {
                // Nothing free yet: block on l.cond, then re-check the set.
                l.cond.Wait()
                continue
            }
            ip.IPv4 = ipv4.ip
        }

        return
    }
}

这里没有可用IP时它会通过l.cond.Broadcast()去唤醒协程帮它分配IP,在其它协程帮它分配好IP之前,它通过l.cond.Wait()将自己挂起,等待其它协程唤醒自己

可见真正干活的是另外的协程

// factoryAllocWorker is the loop that satisfies pending allocation demand.
//
// It takes l.cond.L once up front and, in the portion shown, releases it only
// around the short sleep and around the rate-limited ENI creation call. Each
// iteration:
//  1. waits on l.cond while both l.allocatingV4 and l.allocatingV6 are <= 0;
//  2. sleeps 300ms with the lock released;
//  3. if no ENI exists yet (l.eni == nil), creates one via l.factory, stores
//     it, reduces the pending counters and adds the returned IPv4 addresses
//     to l.ipv4;
//  4. broadcasts on l.cond.
//
// NOTE(review): this is an abridged excerpt — the errors from
// l.rateLimitEni.Wait and CreateNetworkInterface are not checked here, log and
// ipv6Set are unused, and the branch for an already existing ENI is not shown.
func (l *Local) factoryAllocWorker(ctx context.Context) {
    l.cond.L.Lock()

    log := logf.FromContext(ctx)
    for {

        // No pending demand for either family: wait until signalled.
        if l.allocatingV4 <= 0 && l.allocatingV6 <= 0 {
            l.cond.Wait()
            continue
        }

        // wait a small period
        l.cond.L.Unlock()
        time.Sleep(300 * time.Millisecond)
        l.cond.L.Lock()

        if l.eni == nil {
            // create eni
            // Request at least one IPv4 address, and at most batchSize per family.
            v4Count := min(l.batchSize, max(l.allocatingV4, 1))
            v6Count := min(l.batchSize, l.allocatingV6)

            l.status = statusCreating
            // The lock is dropped while waiting on the rate limiter and the factory call.
            l.cond.L.Unlock()

            err := l.rateLimitEni.Wait(ctx)
            
            eni, ipv4Set, ipv6Set, err := l.factory.CreateNetworkInterface(v4Count, v6Count, l.eniType)
            
            l.cond.L.Lock()

            l.eni = eni

            // Subtract what was just requested from the pending counters,
            // clamping at zero.
            l.allocatingV4 -= v4Count
            l.allocatingV6 -= v6Count

            l.allocatingV4 = max(l.allocatingV4, 0)
            l.allocatingV6 = max(l.allocatingV6, 0)

            // Add every returned IPv4 address to the set; the boolean passed
            // to NewValidIP is true for the address equal to the ENI primary IP.
            // If the primary IP fails to parse, no address is added.
            primary, err := netip.ParseAddr(eni.PrimaryIP.IPv4.String())
            if err == nil {
                for _, v := range ipv4Set {
                    l.ipv4.Add(NewValidIP(v, netip.MustParseAddr(v.String()) == primary))
                }
            }

            l.status = statusInUse
        } 

        // Wake everything waiting on l.cond so waiters can re-check the set.
        l.cond.Broadcast()
    }
}

这个协程就是真正分配IP的了,在不需要分配IP的时候,即l.allocatingV4 <= 0 且 l.allocatingV6 <= 0,它会一直挂起,等待被需要分配IP的协程唤醒

上述有了分配IP的需求进来了,它就会被唤醒干活了

// CreateNetworkInterface creates an ENI through a.openAPI, attaches it to
// a.instanceID, validates its IPv4 addresses against instance metadata, and
// fills in the vSwitch CIDR and gateway read from metadata.
//
// ipv4 and ipv6 are the address counts passed to the create call; eniType is
// not referenced in this excerpt. It returns the ENI description together with
// the IPv4 and IPv6 address lists.
//
// NOTE(review): this is an abridged excerpt — error checks are elided (the
// function always returns a nil error as shown), and `trunk`, `v6Set` and the
// assignment of vswID are not visible here.
func (a *Aliyun) CreateNetworkInterface(ipv4, ipv6 int, eniType string) (*daemon.ENI, []netip.Addr, []netip.Addr, error) {
    // ctx carries a 60 second timeout; it is used for the attach and metadata steps.
    ctx, cancel := context.WithTimeout(a.ctx, time.Second*60)
    defer cancel()

    // 1. create eni
    var eni *client.NetworkInterface
    var vswID string

    // NOTE(review): the backoff is driven by a.ctx rather than the 60s ctx
    // above — confirm this is intended.
    err := wait.ExponentialBackoffWithContext(a.ctx, backoff.Backoff(backoff.ENICreate), func(ctx context.Context) (bool, error) {
        vsw, innerErr := a.vsw.GetOne(ctx, a.openAPI, a.zoneID, a.vSwitchOptions)
        
        eni, innerErr = a.openAPI.CreateNetworkInterface(ctx, trunk, vswID, a.securityGroupIDs, a.resourceGroupID, ipv4, ipv6, a.eniTags)
        
        return true, nil
    })

    // Copy the fields of the API result into the daemon-level ENI description.
    r := &daemon.ENI{
        ID:        eni.NetworkInterfaceID,
        MAC:       eni.MacAddress,
        VSwitchID: eni.VSwitchID,
        Type:      eni.Type,
    }

    r.PrimaryIP.SetIP(eni.PrivateIPAddress)

    // Parse each entry of PrivateIPSets into a netip.Addr.
    v4Set, err := func() ([]netip.Addr, error) {
        var ips []netip.Addr
        for _, v := range eni.PrivateIPSets {
            addr, err := netip.ParseAddr(v.PrivateIpAddress)
            ips = append(ips, addr)
        }
        return ips, nil
    }()


    // 2. attach eni
    err = a.openAPI.AttachNetworkInterface(ctx, eni.NetworkInterfaceID, a.instanceID, "")

    // 3. wait metadata ready & update cidr
    // The callback supplies the IPv4 addresses metadata reports for this MAC.
    err = validateIPInMetadata(ctx, v4Set, func() []netip.Addr {
        exists, err := metadata.GetIPv4ByMac(r.MAC)
        
        return exists
    })


    prefix, err := metadata.GetVSwitchCIDR(eni.MacAddress)
    r.VSwitchCIDR.SetIPNet(prefix.String())

    gw, err := metadata.GetENIGatewayAddr(eni.MacAddress)
    r.GatewayIP.SetIP(gw.String())


    return r, v4Set, v6Set, nil
}

这里主要就是和阿里云 云主机相关的一些交互了

curl http://100.100.100.200/latest/meta-data/vswitch-id

vsw-8vbddxzcxxxxxxp1evxd6r
curl http://100.100.100.200/latest/meta-data/instance-id

i-8vb4cxxxxxxxxxxxxxzahaxyv
curl http://100.100.100.200/latest/meta-data/network/interfaces/macs/00:11:22:33:44:55/private-ipv4s

192.168.128.15
curl http://100.100.100.200/latest/meta-data/network/interfaces/macs/00:11:22:33:44:55/vswitch-cidr-block

192.168.128.0/24
curl http://100.100.100.200/latest/meta-data/network/interfaces/macs/00:11:22:33:44:55/gateway

192.168.128.253

上述ENI准备好了之后,就会把对应的IP地址加入到集合里,然后唤醒需要分配IP的协程即可

有了IP之后,就会转换为网络配置

// ToRPC converts the allocated local IP resource into a one-element slice of
// rpc.NetConf.
//
// The pod IP comes from l.IP; the pod CIDR, gateway and MAC come from l.ENI.
// ServiceCIDR is left nil here, Trunk is false, Vid is 0, and DefaultRoute is
// true.
func (l *LocalIPResource) ToRPC() []*rpc.NetConf {
    cfg := &rpc.NetConf{
        BasicInfo: &rpc.BasicInfo{
            PodIP:       l.IP.ToRPC(),
            PodCIDR:     l.ENI.VSwitchCIDR.ToRPC(),
            GatewayIP:   l.ENI.GatewayIP.ToRPC(),
            ServiceCIDR: nil,
        },
        ENIInfo: &rpc.ENIInfo{
            MAC:       l.ENI.MAC,
            Trunk:     false,
            Vid:       0,
            GatewayIP: l.ENI.GatewayIP.ToRPC(),
        },
        Pod:          nil,
        IfName:       "",
        ExtraRoutes:  nil,
        DefaultRoute: true,
    }

    return []*rpc.NetConf{cfg}
}

然后补充Service CIDR,获取方式和前面VPC模式是一样的

c.BasicInfo.ServiceCIDR = n.k8s.GetServiceCIDR().ToRPC()

有了网络配置后,就可以开始配置网卡了,由于此时的ipType 对应的是VPC-ENI,所以对应的网卡配置类型为独占ENI

// getDatePath selects the data path for the given IP type. In the portion
// shown, rpc.IPType_TypeVPCENI maps to types.ExclusiveENI.
//
// NOTE(review): this is an abridged excerpt — the other cases, the fall-through
// return, and any use of vlanStripType and trunk are not shown here.
func getDatePath(ipType rpc.IPType, vlanStripType types.VlanStripType, trunk bool) types.DataPath {
    switch ipType {
    case rpc.IPType_TypeVPCENI:
        return types.ExclusiveENI
    }
}

因为已经分配好了IP地址,所以这里就不需要IPAM插件了,直接使用分配好的IP地址即可

switch setupCfg.DP {
        case types.ExclusiveENI:
            
            if setupCfg.ContainerIfName == args.IfName {
                containerIPNet = setupCfg.ContainerIPNet
                gatewayIPSet = setupCfg.GatewayIP
            }

            err = datapath.NewExclusiveENIDriver().Setup(setupCfg, cniNetns)

最后再看下网卡配置过程

// Setup wires an exclusive ENI into the container network namespace.
//
// Steps visible in this excerpt:
//  1. look up the ENI link by cfg.ENIIndex and move it into netNS;
//  2. inside netNS, configure that link with the config built by
//     generateContCfgForExclusiveENI;
//  3. when peer creation is enabled and the container interface is "eth0",
//     create a veth pair, read the host-side MAC, and configure the
//     container-side peer (defaultVethForENI) with generateVeth1Cfg;
//  4. back in the calling namespace, configure the host-side veth with
//     generateHostSlaveCfg.
//
// NOTE(review): this is an abridged excerpt — error checks are elided, so the
// function always returns nil as shown. Step 4 also runs unconditionally here,
// even when step 3 is skipped; confirm against the full source.
func (r *ExclusiveENI) Setup(cfg *types.SetupConfig, netNS ns.NetNS) error {
    // 1. move link in
    nicLink, err := netlink.LinkByIndex(cfg.ENIIndex)
    
    // Keep a handle to the current namespace so the closure below can run
    // code in it while executing inside netNS.
    hostNetNS, err := ns.GetCurrentNS()
    
    defer hostNetNS.Close()

    err = utils.LinkSetNsFd(nicLink, netNS)

    // 2. setup addr and default route
    err = netNS.Do(func(netNS ns.NetNS) error {
        // 2.1 setup addr
        // The link is looked up again by its name from inside netNS.
        contLink, err := netlink.LinkByName(nicLink.Attrs().Name)

        contCfg := generateContCfgForExclusiveENI(cfg, contLink)
        err = nic.Setup(contLink, contCfg)

        // for now we only create slave link for eth0
        if !cfg.DisableCreatePeer && cfg.ContainerIfName == "eth0" {
            err = veth.Setup(&veth.Veth{
                IfName:   cfg.HostVETHName, // name for host ns side
                PeerName: defaultVethForENI,
            }, hostNetNS)

            // Fetch the host-side peer's MAC from the host namespace; it is
            // passed to generateVeth1Cfg below.
            var mac net.HardwareAddr
            err = hostNetNS.Do(func(netNS ns.NetNS) error {
                hostPeer, innerErr := netlink.LinkByName(cfg.HostVETHName)
                mac = hostPeer.Attrs().HardwareAddr
                return innerErr
            })

            veth1, err := netlink.LinkByName(defaultVethForENI)

            veth1Cfg := generateVeth1Cfg(cfg, veth1, mac)
            return nic.Setup(veth1, veth1Cfg)
        }
        return nil
    })


    // Configure the host-side veth from the calling namespace.
    hostPeer, err := netlink.LinkByName(cfg.HostVETHName)

    hostPeerCfg := generateHostSlaveCfg(cfg, hostPeer)
    err = nic.Setup(hostPeer, hostPeerCfg)

    return nil
}

容器内的网卡配置时, 首先直接将ENI设备移到容器命名空间内,可见这种模式下容器是直接分配的ENI网卡

然后配置容器ENI网卡名称、设置ENI网卡的IP地址、默认路由

// generateContCfgForExclusiveENI builds the nic.Conf applied to the ENI link
// inside the container: interface name cfg.ContainerIfName, MTU cfg.MTU, the
// container addresses, and, when an IPv4 address is present and
// cfg.DefaultRoute is set, a default route via cfg.GatewayIP.IPv4.
//
// rules and sysctl are declared but never populated in this excerpt, so they
// are passed through as nil.
//
// NOTE(review): this is an abridged excerpt — the `if` branch preceding the
// `else` is elided, and Dst is shown as a string literal, presumably a
// simplification made for the article.
func generateContCfgForExclusiveENI(cfg *types.SetupConfig, link netlink.Link) *nic.Conf {
    var addrs []*netlink.Addr
    var routes []*netlink.Route
    var rules []*netlink.Rule
    var sysctl map[string][]string

        else {
        addrs = utils.NewIPNetToMaxMask(cfg.ContainerIPNet)
    }

    if cfg.ContainerIPNet.IPv4 != nil {
        // add default route
        if cfg.DefaultRoute {
            routes = append(routes, &netlink.Route{
                LinkIndex: link.Attrs().Index,
                Scope:     netlink.SCOPE_UNIVERSE,
                Dst:       "0.0.0.0/0",
                Gw:        cfg.GatewayIP.IPv4,
                Flags:     int(netlink.FLAG_ONLINK),
            })
        }
    }

    contCfg := &nic.Conf{
        IfName: cfg.ContainerIfName,
        MTU:    cfg.MTU,
        Addrs:  addrs,
        Routes: routes,
        Rules:  rules,
        SysCtl: sysctl,
    }
    return contCfg
}

设置ENI网卡名称为eth0、然后设置的IP地址就是ENI的IP地址、然后添加默认路由,注意这里默认路由的网关设备就是ENI的网关地址

default via  192.168.128.253  dev eth0 onlink

如此,ENI网卡就配置好了,但是还需要一个veth网卡

err = veth.Setup(&veth.Veth{
                IfName:   cfg.HostVETHName, // name for host ns side
                PeerName: "veth1",
            }, hostNetNS)

veth网卡在容器内的网卡名称就是veth1,在宿主机上的名称就是calixxxxxxxxxxx

然后配置容器内veth网卡的名称、配置veth网卡的IP地址、配置veth网卡的路由、配置veth网卡的静态ARP

// generateVeth1Cfg builds the nic.Conf for the container-side veth ("veth1").
//
// When the container has an IPv4 address it adds a link-scope route to
// 169.254.1.1 on this link, a route for the service CIDR via 169.254.1.1 (only
// when cfg.ServiceCIDR has an IPv4 value), and a permanent neighbour entry
// mapping 169.254.1.1 to peerMAC.
//
// NOTE(review): this is an abridged excerpt — several field values (Dst, Gw,
// IP, Addrs) are written as sample literals rather than Go expressions,
// presumably substituted by the article author; sysctl is never populated here.
func generateVeth1Cfg(cfg *types.SetupConfig, link netlink.Link, peerMAC net.HardwareAddr) *nic.Conf {
    var routes []*netlink.Route
    var neighs []*netlink.Neigh
    var sysctl map[string][]string

    if cfg.ContainerIPNet.IPv4 != nil {
        // 169.254.1.1 dev veth1
        routes = append(routes, &netlink.Route{
            LinkIndex: link.Attrs().Index,
            Scope:     netlink.SCOPE_LINK,
            Dst:       "169.254.1.1",
        })

        // Service traffic is routed through 169.254.1.1 on this link.
        if cfg.ServiceCIDR != nil && cfg.ServiceCIDR.IPv4 != nil {
            routes = append(routes, &netlink.Route{
                LinkIndex: link.Attrs().Index,
                Dst:       "10.96.0.0/12",
                Gw:        "169.254.1.1/32",
                Flags:     int(netlink.FLAG_ONLINK),
            })
        }
        // Permanent neighbour entry: 169.254.1.1 resolves to peerMAC.
        neighs = append(neighs, &netlink.Neigh{
            LinkIndex:    link.Attrs().Index,
            IP:           169.254.1.1,
            HardwareAddr: peerMAC,
            State:        netlink.NUD_PERMANENT,
        })
    }

    contCfg := &nic.Conf{
        IfName: "veth1",
        MTU:    cfg.MTU,
        Addrs:  "192.168.128.15/32",
        Routes: routes,
        Neighs: neighs,
        SysCtl: sysctl,
    }
    return contCfg
}

设置容器内veth网卡的名称为veth1

veth1网卡的IP地址仍为ENI的IP地址

然后是veth1上的路由

169.254.1.1 dev veth1 scope link
10.96.0.0/12 via 169.254.1.1 dev veth1 onlink

然后是静态ARP,对应的MAC地址就是宿主机上calixxxxxxxxxx设备的MAC地址

? (169.254.1.1) at da:44:55:66:77:88 [ether] on veth1

最后是宿主机上的veth网卡配置

// generateHostSlaveCfg builds the nic.Conf for the host-side veth, named
// cfg.HostVETHName.
//
// When the container has an IPv4 address it assigns an address to the link and
// adds a link-scope route to the container IP on this link.
//
// NOTE(review): this is an abridged excerpt — IPNet and Dst are written as
// sample string literals, presumably substituted by the article author, and
// `sysctl` is not declared in the portion shown.
func generateHostSlaveCfg(cfg *types.SetupConfig, link netlink.Link) *nic.Conf {
    var addrs []*netlink.Addr
    var routes []*netlink.Route

    if cfg.ContainerIPNet.IPv4 != nil {
        addrs = append(addrs, &netlink.Addr{
            IPNet: "169.254.1.1/32",
        })

        // add route to container
        routes = append(routes, &netlink.Route{
            LinkIndex: link.Attrs().Index,
            Scope:     netlink.SCOPE_LINK,
            Dst:       "192.168.128.15/32",
        })
    }
    contCfg := &nic.Conf{
        IfName: cfg.HostVETHName,
        MTU:    cfg.MTU,
        Addrs:  addrs,
        Routes: routes,
        SysCtl: sysctl,
    }

    return contCfg
}

首先设置宿主机上的veth网卡名称为calixxxxxxxxxxx

设置calixxxxxxxxxxxxxx网卡IP地址为169.254.1.1/32

设置calixxxxxxxxxxxxxx网卡上指向容器IP的路由

192.168.128.15/32 dev calixxxxxxxxxxxxxx scope link

可以看到这种模式下,容器内是有两个网卡的,其中ENI网卡直连的是VPC;另外的veth网卡是处理Service请求的

参考

上一篇下一篇

猜你喜欢

热点阅读