CNI Terway — ENI多IP模式源码详解

2024-02-22  本文已影响0人  Teddy_b

ENIIP模式

ENI共享模式,单个ENI可以配置多个辅助IP

源码解析

// podNetworkType maps the daemon run mode to the per-pod network type.
// In ENI multi-IP mode every pod is typed PodNetworkTypeENIMultiIP.
// NOTE(review): excerpt — the switch and the function are truncated here;
// the full source also handles the remaining daemon modes.
func podNetworkType(daemonMode string, pod *corev1.Pod) string {
    switch daemonMode {
    case daemon.ModeENIMultiIP:
        return daemon.PodNetworkTypeENIMultiIP
}

ENI多IP模式下,对应的POD网络模式是ENI-MultiIP,此时的网络资源请求类型又不一样了

// Excerpt: selecting the reply IP type and the resource request for a pod
// whose network type is ENI multi-IP.
switch pod.PodNetworkType {
    case daemon.PodNetworkTypeENIMultiIP:
        reply.IPType = rpc.IPType_TypeENIMultiIP

        // NOTE(review): dangling `else` — the matching `if` (fixed-IP /
        // trunking branches) was elided from this excerpt.
        else {
            // Default path: request one secondary IP from the local ENI pool.
            req := &eni.LocalIPRequest{}

            resourceRequests = append(resourceRequests, req)
        }
}

LocalIPRequest类型的申请IP流程和ENI独占模式的流程基本上是一样的

最大的区别是分配IP时候会判断是申请新的ENI还是申请辅助IP

// Allocate tries to reserve an IP on this local ENI for the pod identified
// by cni.PodID. It returns a Trace with Condition Full when a fresh IP would
// be needed but assigned + in-flight IPv4 addresses already reach the ENI's
// secondary-IP capacity (l.cap), so the caller moves on to the next ENI.
// NOTE(review): excerpt — the IPv6 branch and the tail of the function are
// elided.
func (l *Local) Allocate(ctx context.Context, cni *daemon.CNI, request ResourceRequest) (chan *AllocResp, []Trace) {

    expectV4 := 0

    if l.enableIPv4 {
        // Prefer reusing an already-assigned, still-available IPv4.
        ipv4 := l.ipv4.PeekAvailable(cni.PodID, lo.IPv4)
        if ipv4 == nil && len(l.ipv4)+l.allocatingV4 >= l.cap {
            // A new IP is required but the ENI is at capacity: give up here.
            return nil, []Trace{{Condition: Full}}
        } else if ipv4 == nil {
            // Below capacity: schedule one new IPv4 allocation.
            expectV4 = 1
        }
    }

}

这里分配IP的时候,需要留意一下这个判断ipv4 == nil && len(l.ipv4)+l.allocatingV4 >= l.cap,对应的是如果需要重新分配IP并且当前分配数量已经达到了IP容量时,会直接返回

// getPoolConfig derives the IP-pool configuration from the daemon config and
// the instance-type limits. MaxIPPerENI is the value that later decides
// whether allocation creates a new ENI or adds a secondary IP to an
// existing one.
// NOTE(review): excerpt — the switch and function are truncated here, and
// the original article's indentation is preserved as-is.
func getPoolConfig(cfg *daemon.Config, daemonMode string, limit *instance.Limits) (*types.PoolConfig, error) {

    poolConfig := &types.PoolConfig{
        SecurityGroupIDs:          cfg.GetSecurityGroups(),
        VSwitchSelectionPolicy:    cfg.VSwitchSelectionPolicy,
        DisableSecurityGroupCheck: cfg.DisableSecurityGroupCheck,
        BatchSize:                 10,
    }

    switch daemonMode {
    case daemon.ModeVPC, daemon.ModeENIOnly:
        // Exclusive-ENI modes: a single (primary) IP per ENI.
        poolConfig.MaxIPPerENI = 1
case daemon.ModeENIMultiIP:
        // Shared mode: capacity is the instance type's IPv4-per-adapter limit.
        ipPerENI := limit.IPv4PerAdapter
                poolConfig.MaxIPPerENI = ipPerENI

这就决定了在分配IP时是继续使用当前的ENI,还是申请新的ENI

// Allocate walks the managed network interfaces in order and keeps the first
// one that yields an allocation channel. A full interface returns ch == nil
// (with traces explaining why), so the loop advances to the next ENI.
// NOTE(review): excerpt — declaration of `ch`/`traces`/`request` and the
// tail of the function are elided.
func (m *Manager) Allocate(ctx context.Context, cni *daemon.CNI, req *AllocRequest) (NetworkResources, error) {
        for _, ni := range m.networkInterfaces {
            var tr []Trace
            ch, tr = ni.Allocate(ctx, cni, request)
            if ch != nil {
                break
            }
            // Keep the per-ENI failure traces for diagnostics.
            traces = append(traces, tr...)
        }

对于独占ENI来说,由于容量是1,所以在遍历networkInterfaces的时候已经分配过IP的ENI会直接返回nil,进而开始遍历下一个ENI

而ENI共享IP来说,由于容量往往是大于1的,所以在遍历networkInterfaces的时候,仍然可以继续在当前ENI上申请辅助IP,对应的分配流程就会有点不同了

// factoryAllocWorker is the background worker that calls the cloud API.
// When the ENI already exists (l.eni != nil) it assigns up to batchSize
// secondary IPv4 addresses to that ENI instead of creating a new one.
// NOTE(review): excerpt — locking, error handling and the create-ENI branch
// are elided, so the brace nesting here is incomplete.
func (l *Local) factoryAllocWorker(ctx context.Context) {

        if l.eni == nil {
            ...
        } else {
            eniID := l.eni.ID
            // Cap the request at the configured batch size.
            v4Count := min(l.batchSize, l.allocatingV4)
            
            if v4Count > 0 {
            
                // Ask the cloud factory for v4Count secondary IPs on this ENI.
                ipv4Set, err := l.factory.AssignNIPv4(eniID, v4Count, l.eni.MAC)
                l.cond.L.Lock()

                // Account for the in-flight allocations just satisfied
                // (clamped at zero).
                l.allocatingV4 -= v4Count
                l.allocatingV4 = max(l.allocatingV4, 0)

                // Publish the new IPs into the locally available pool.
                l.ipv4.PutValid(ipv4Set...)
            }

        // Wake goroutines blocked in Allocate waiting for IPs.
        l.cond.Broadcast()
    }
}

此时分配IP的时候,假设是另一个POD进来分配IP,由于上一个POD分配IP的时候ENI已经创建过了,所以这里会判断到l.eni != nil

可以看到这个分支里就是通过AssignNIPv4为当前的ENI分配辅助IP

还有一个区别是IPType不一样了

// getDatePath maps the rpc IP type to a datapath implementation; ENI
// multi-IP pods use the IPVlan datapath (with a policy-route fallback at
// setup time).
// NOTE(review): excerpt — the name is presumably getDataPath in the real
// source; switch and function are truncated here.
func getDatePath(ipType rpc.IPType, vlanStripType types.VlanStripType, trunk bool) types.DataPath {
    switch ipType {
    case rpc.IPType_TypeENIMultiIP:
        return types.IPVlan
}

IPVlan模式下设置网卡的流程如下

// Excerpt: datapath dispatch during CNI ADD. The IPVlan branch is taken
// only when the node config enables ipvlan AND the running kernel supports
// it (>= 4.19); otherwise execution falls through to the policy-route
// (veth) datapath.
switch setupCfg.DP {
case types.IPVlan:
            utils.Hook.AddExtraInfo("dp", "ipvlan")

            if conf.IPVlan() {
                available := false
                // Runtime probe for kernel ipvlan support.
                available, err = datapath.CheckIPVLanAvailable()
                if err != nil {
                    return
                }
                if available {
                    // Record pod IP/gateway only for the primary interface.
                    if setupCfg.ContainerIfName == args.IfName {
                        containerIPNet = setupCfg.ContainerIPNet
                        gatewayIPSet = setupCfg.GatewayIP
                    }
                    err = datapath.NewIPVlanDriver().Setup(setupCfg, cniNetns)
                    
                    continue
                }
            }
            // ipvlan not available: fall back to the veth/policy-route path.
            fallthrough
        case types.PolicyRoute:
            utils.Hook.AddExtraInfo("dp", "policyRoute")

            if setupCfg.ContainerIfName == args.IfName {
                containerIPNet = setupCfg.ContainerIPNet
                gatewayIPSet = setupCfg.GatewayIP
            }
            err = datapath.NewPolicyRoute().Setup(setupCfg, cniNetns)
}

如果我们的CNI配置文件里指定了eniip_virtual_type: ipvlan并且内核版本大于4.19,那么会使用IPVlan方式配置网络

{
    "name": "networks",
    "cniVersion": "0.4.0",
    "ipam": {
        "type": "host-local",
        "subnet": "10.250.7.0/24",
        "dataDir": "/var/lib/cni/",
        "routes": [
            { "dst": "0.0.0.0/0" }
        ]
    },
    "eniip_virtual_type": "ipvlan"
}

否则将使用veth的方式配置网络

veth策略

先来看下veth方式配置网络,即types.PolicyRoute这个分支

可以看到直接使用的是分配到的ENI IP和网关

然后配置网卡信息

// Setup wires up the veth/policy-route datapath for one pod:
//  1. create the veth pair (container eth0 <-> host cali* peer);
//  2. inside the pod netns, configure the container end (address, default
//     route via 169.254.1.1, static ARP to the host peer's MAC);
//  3. configure the ENI with a per-ENI route table (link index + 1000);
//  4. configure the host peer with the to/from-container policy rules.
// NOTE(review): excerpt — every error check between steps is elided.
func (d *PolicyRoute) Setup(cfg *types.SetupConfig, netNS ns.NetNS) error {
    vethCfg := &veth.Veth{
        IfName:   cfg.ContainerIfName,
        PeerName: cfg.HostVETHName,
        MTU:      cfg.MTU,
    }
    // 1. create the veth pair spanning host and pod namespaces.
    err := veth.Setup(vethCfg, netNS)
    

    hostVETH, err := netlink.LinkByName(cfg.HostVETHName)
    
    err = netNS.Do(func(_ ns.NetNS) error {
        // 2. add address for container interface
        contLink, err := netlink.LinkByName(cfg.ContainerIfName)

        contCfg := generateContCfgForPolicy(cfg, contLink, hostVETH.Attrs().HardwareAddr)
        err = nic.Setup(contLink, contCfg)
    })

    // 3. per-ENI route table: utils.GetRouteTableID = 1000 + link index.
    eni, err := netlink.LinkByIndex(cfg.ENIIndex)

    table := utils.GetRouteTableID(eni.Attrs().Index)

    eniCfg := generateENICfgForPolicy(cfg, eni, table)
    err = nic.Setup(eni, eniCfg)

    // 4. host-side peer: /32 route to the pod plus the two policy rules.
    hostVETHCfg := generateHostPeerCfgForPolicy(cfg, hostVETH, table)
    err = nic.Setup(hostVETH, hostVETHCfg)

    return nil
}
// generateContCfgForPolicy builds the container-side veth configuration:
// the pod IP, a default route via the link-local gateway 169.254.1.1, and a
// permanent neighbor (ARP) entry binding that gateway to the host peer's
// MAC so all egress traffic lands on the host veth end.
// NOTE(review): simplified excerpt — Dst/Gw/IP appear as string literals,
// but netlink.Route/Neigh take net.IPNet / net.IP values in the real
// source; `rules` and `sysctl` referenced below are declared in elided
// code. Not compilable as shown.
func generateContCfgForPolicy(cfg *types.SetupConfig, link netlink.Link, mac net.HardwareAddr) *nic.Conf {
    var routes []*netlink.Route
    var neighs []*netlink.Neigh

    if cfg.ContainerIPNet.IPv4 != nil {
        // add default route
        if cfg.DefaultRoute {
            routes = append(routes, &netlink.Route{
                LinkIndex: link.Attrs().Index,
                Scope:     netlink.SCOPE_UNIVERSE,
                Dst:       "0.0.0.0/0",
                Gw:        "169.254.1.1",
                Flags:     int(netlink.FLAG_ONLINK),
            })
        }

        // Static ARP: the link-local gateway resolves to the host peer MAC.
        neighs = append(neighs, &netlink.Neigh{
            LinkIndex:    link.Attrs().Index,
            IP:           "169.254.1.1",
            HardwareAddr: mac,
            State:        netlink.NUD_PERMANENT,
        })
    }

    contCfg := &nic.Conf{
        IfName: cfg.ContainerIfName,
        MTU:    cfg.MTU,
        Addrs:  utils.NewIPNetToMaxMask(cfg.ContainerIPNet),
        Routes: routes,
        Rules:  rules,
        Neighs: neighs,
        SysCtl: sysctl,
    }

    return contCfg
}
default via 169.254.1.1 dev eth0 onlink

? (169.254.1.1) at da:44:55:66:77:88 [ether] on eth0
// generateHostPeerCfgForPolicy assembles the nic.Conf for the host-side
// veth peer: a /32 route pointing at the container, plus the two policy
// rules that steer to-container traffic through the main table (pref 512)
// and from-container traffic through the ENI's own table (pref 2048).
func generateHostPeerCfgForPolicy(cfg *types.SetupConfig, link netlink.Link, table int) *nic.Conf {
    var (
        addrs  []*netlink.Addr
        routes []*netlink.Route
        rules  []*netlink.Rule
        sysctl map[string][]string
    )

    if ipv4 := cfg.ContainerIPNet.IPv4; ipv4 != nil {
        dst := utils.NewIPNetWithMaxMask(ipv4)

        // Direct /32 route: host -> container through the veth peer.
        routes = append(routes, &netlink.Route{
            LinkIndex: link.Attrs().Index,
            Scope:     netlink.SCOPE_LINK,
            Dst:       dst,
        })

        // Traffic addressed to the container resolves via the main table
        // (higher priority, so node-local pod-to-pod skips the ENI).
        toContainer := netlink.NewRule()
        toContainer.Dst = dst
        toContainer.Table = unix.RT_TABLE_MAIN
        toContainer.Priority = 512
        rules = append(rules, toContainer)

        // Traffic sourced from the container leaves via the ENI's table.
        fromContainer := netlink.NewRule()
        fromContainer.Src = dst
        fromContainer.Table = table
        fromContainer.Priority = 2048
        rules = append(rules, fromContainer)
    }

    return &nic.Conf{
        MTU:       cfg.MTU,
        Addrs:     addrs,
        Routes:    routes,
        Rules:     rules,
        SysCtl:    sysctl,
        StripVlan: false,
    }
}

路由规则包括

10.250.7.2 dev calixxxxxxxxxx scope link

ip rule add from all to 10.250.7.2 pref 512 table main

ip rule add from 10.250.7.2 to all pref 2048 table 1005

这里不仅添加了路由规则,还添加了路由策略,结合宿主机上的默认路由策略,添加后的效果

# 添加前,只有系统自带的三条路由策略
mwr@ubuntu:~$ ip rule show
0:      from all lookup local 
32766:  from all lookup main 
32767:  from all lookup default 

# 添加后
0:      from all lookup local 
512:    from all to 10.250.7.2 lookup main
2048:   from 10.250.7.2 to all lookup 1005
32766:  from all lookup main 
32767:  from all lookup default 

这里的10.250.7.2对应的是容器IP,1005是自定义的路由表ID,通过ENI网卡的序号+1000得到的

// GetRouteTableID returns the policy-routing table ID for the ENI with the
// given link index. Table IDs start at 1000 so they never collide with the
// kernel's reserved tables (local/main/default, IDs 253-255).
func GetRouteTableID(linkIndex int) int {
	// Bug fix: the original referenced an undefined `eni` variable; the
	// link index is already supplied as the parameter.
	return 1000 + linkIndex
}
// generateENICfgForPolicy builds the nic.Conf applied to the ENI link for
// the veth/policy-route datapath: a default route via the ENI gateway,
// installed into the per-ENI route table so from-container traffic egresses
// through its owning ENI.
// NOTE(review): simplified excerpt — Dst is shown as a string literal, but
// netlink.Route takes a *net.IPNet in the real source.
func generateENICfgForPolicy(cfg *types.SetupConfig, link netlink.Link, table int) *nic.Conf {
    var routes []*netlink.Route
    var rules []*netlink.Rule
    var neighs []*netlink.Neigh
    var sysctl map[string][]string

    if cfg.ContainerIPNet.IPv4 != nil {
        // add default route
        gw := cfg.GatewayIP.IPv4
        
        routes = append(routes, &netlink.Route{
            LinkIndex: link.Attrs().Index,
            Scope:     netlink.SCOPE_UNIVERSE,
            // Bug fix: use the caller-supplied table (1000 + link index)
            // instead of the hard-coded example value 1005.
            Table:     table,
            Dst:       "0.0.0.0/0",
            Gw:        gw,
            Flags:     int(netlink.FLAG_ONLINK),
        })
    }

    contCfg := &nic.Conf{
        MTU:       cfg.MTU,
        Addrs:     utils.NewIPNetToMaxMask(cfg.HostIPSet),
        Routes:    routes,
        Rules:     rules,
        Neighs:    neighs,
        SysCtl:    sysctl,
        StripVlan: cfg.StripVlan, // if trunking is enabled, remove the vlan tag
    }

    return contCfg
}

在路由表1005中添加了默认路由

# ip route add default via 10.250.7.1 dev eni table 1005

default via 10.250.7.1 dev eni onlink  

总结一下veth模式

与独占ENI模式相比,ENI设备是存在宿主机命名空间下的

容器内只有一个veth网卡,和宿主机上的calixxxxxxxxx网卡匹配,并且容器网卡的IP就是ENI网卡的IP地址

同时,这里为每个ENI网卡都会新建一条策略路由以及一个新的路由表,确保这个辅助IP的数据包从宿主机发出去的时候是从它所属的ENI发送出去

对应的规则是2048 from 10.250.7.2 to all lookup 1005,即从容器IP(辅助IP)出去的都走自定义的策略路由,自定义的策略路由只有一条默认路由,从ENI网卡转发到ENI网关

同时,由于ip rule add from all to 10.250.7.2 pref 512 table main这条策略路由的优先级高于ip rule add from 10.250.7.2 to all pref 2048 table 1005,所以同节点的POD访问会走main路由表,即通过calixxxxxxxx设备直接访问,不经过ENI网卡

ipvlan策略

对于配置了使用ipvlan策略的、同时内核版本高于4.19的,会使用ipvlan策略,对应的是types.IPVlan这个分支

可以看到也是直接使用的是分配到的ENI IP和网关

然后配置网卡信息

// Setup wires up the ipvlan datapath for one pod:
//  1. configure the parent ENI link;
//  2. create an ipvlan slave inside the pod netns as the container ifname;
//  3. inside the netns, assign the pod IP, default route and static ARP;
//  4. create the host-side ipvl_N slave so host<->pod traffic works
//     (plain ipvlan slaves cannot reach the parent's namespace).
// NOTE(review): excerpt — intermediate error checks are elided.
func (d *IPvlanDriver) Setup(cfg *types.SetupConfig, netNS ns.NetNS) error {
    var err error

    parentLink, err := netlink.LinkByIndex(cfg.ENIIndex)

    // 1. parent ENI configuration.
    eniCfg := generateENICfgForIPVlan(cfg, parentLink)
    err = nic.Setup(parentLink, eniCfg)

    // Create the ipvlan slave in the pod namespace.
    err = ipvlan.Setup(&ipvlan.IPVlan{
        Parent:  parentLink.Attrs().Name,
        PreName: cfg.HostVETHName,
        IfName:  cfg.ContainerIfName,
        MTU:     cfg.MTU,
    }, netNS)

    // 2. setup addr and default route
    err = netNS.Do(func(netNS ns.NetNS) error {
        contLink, err := netlink.LinkByName(cfg.ContainerIfName)
        
        contCfg := generateContCfgForIPVlan(cfg, contLink)
        err = nic.Setup(contLink, contCfg)
        
    })

    // Host-side companion slave (ipvl_N) for host<->pod connectivity.
    if err := d.setupInitNamespace(parentLink, cfg); err != nil {
    }

    return nil
}
unc generateContCfgForIPVlan(cfg *types.SetupConfig, link netlink.Link) *nic.Conf {
    var addrs []*netlink.Addr
    var routes []*netlink.Route
    var rules []*netlink.Rule

    var neighs []*netlink.Neigh

    if cfg.ContainerIPNet.IPv4 != nil {
         else {
            addrs = append(addrs, &netlink.Addr{IPNet: cfg.ContainerIPNet.IPv4})
        }

        // add default route
        if cfg.DefaultRoute {
            routes = append(routes, &netlink.Route{
                LinkIndex: link.Attrs().Index,
                Scope:     netlink.SCOPE_UNIVERSE,
                Dst:       "0.0.0.0/0",
                Gw:        cfg.GatewayIP.IPv4,
                Flags:     int(netlink.FLAG_ONLINK),
            })
        }
        routes = append(routes, &netlink.Route{
            LinkIndex: link.Attrs().Index,
            Scope:     netlink.SCOPE_LINK,
            Dst:       utils.NewIPNetWithMaxMask(cfg.HostIPSet.IPv4),
        })

        neighs = append(neighs, &netlink.Neigh{
            LinkIndex:    link.Attrs().Index,
            IP:           cfg.HostIPSet.IPv4.IP,
            HardwareAddr: link.Attrs().HardwareAddr,
            State:        netlink.NUD_PERMANENT,
        })
    }

    contCfg := &nic.Conf{
        IfName:    cfg.ContainerIfName,
        MTU:       cfg.MTU,
        Addrs:     addrs,
        Routes:    routes,
        Rules:     rules,
        Neighs:    neighs,
        SysCtl:    sysctl,
        StripVlan: false,
    }

    return contCfg
}

这里设置ipvlan子设备网卡名称为eth0,然后直接把辅助IP设置到ipvlan子设备上,作为容器IP

然后添加默认路由和静态ARP,其中10.250.7.1是ENI网关地址,10.250.7.100是云主机IP

default via 10.250.7.1 dev eth0 onlink

10.250.7.100 dev eth0 scope link

? (10.250.7.100) at da:44:55:66:77:88 [ether] on eth0

由于ipvlan子设备默认是无法访问宿主机的(MAC地址都相同,不知道回包给哪一个),所以还会在宿主机上创建宿主机IP的ipvlan子设备

// setupInitNamespace creates (if missing) the host-namespace ipvlan slave
// ipvl_N on the same parent ENI and configures it with the node IP plus a
// /32 route towards the container — the standard workaround for ipvlan
// slaves being unable to talk to their parent's namespace directly.
// NOTE(review): excerpt — error handling is elided.
func (d *IPvlanDriver) setupInitNamespace(parentLink netlink.Link, cfg *types.SetupConfig) error {
    // setup slave nic
    slaveName := d.initSlaveName(parentLink.Attrs().Index)
    slaveLink, err := d.createSlaveIfNotExist(parentLink, slaveName, cfg.MTU)

    slaveCfg := generateSlaveLinkCfgForIPVlan(cfg, slaveLink)
    err = nic.Setup(slaveLink, slaveCfg)

    return nil
}

这里首先在宿主机命名空间中创建L2模式的ipvlan子设备,子设备名称为ipvl_5,名称根据ENI网卡的序号生成

// initSlaveName derives the deterministic name of the host-side ipvlan
// slave device ("ipvl_<parent link index>") created for a parent ENI.
func (d *IPvlanDriver) initSlaveName(parentIndex int) string {
	// Bug fix: the original referenced an undefined `eni`; use the
	// parentIndex parameter that callers already pass in.
	return fmt.Sprintf("ipvl_%d", parentIndex)
}

// Excerpt: create the host-side ipvlan slave device on the parent ENI.
err = utils.LinkAdd(&netlink.IPVlan{
        LinkAttrs: netlink.LinkAttrs{
            Name:        slaveName,
            ParentIndex: parentLink.Attrs().Index,
            MTU:         mtu,
        },
        // L2 mode: the slave shares the parent's MAC and handles its own ARP.
        Mode: netlink.IPVLAN_MODE_L2,
    })

然后设置这个ipvlan子设备的IP地址

// generateSlaveLinkCfgForIPVlan builds the nic.Conf for the host-namespace
// ipvlan slave (ipvl_N): it carries the node's own IP (host scope) and a
// /32 route towards the container, providing the host<->pod connectivity
// that plain ipvlan slaves lack.
func generateSlaveLinkCfgForIPVlan(cfg *types.SetupConfig, link netlink.Link) *nic.Conf {
    var (
        addrs  []*netlink.Addr
        routes []*netlink.Route
        sysctl map[string][]string
    )

    if cfg.ContainerIPNet.IPv4 != nil {
        // The slave device carries the node IP, scoped to the host.
        hostAddr := &netlink.Addr{
            IPNet: utils.NewIPNetWithMaxMask(cfg.HostIPSet.IPv4),
            Scope: int(netlink.SCOPE_HOST),
        }
        addrs = append(addrs, hostAddr)

        // /32 route steering host traffic for the pod into this slave.
        routes = append(routes, &netlink.Route{
            LinkIndex: link.Attrs().Index,
            Scope:     netlink.SCOPE_LINK,
            Dst:       utils.NewIPNetWithMaxMask(cfg.ContainerIPNet.IPv4),
        })
    }

    return &nic.Conf{
        MTU:    cfg.MTU,
        Addrs:  addrs,
        Routes: routes,
        SysCtl: sysctl,
    }
}

这里设置ipvlan的IP地址就是宿主机的IP地址,然后添加到容器IP的默认路由

10.250.7.2 dev ipvl_5 scope link

如此一来,容器内配置了宿主机的IP和MAC地址,宿主机上也配置了容器IP的路由,同时这两个ipvlan设备都属于ENI网卡,容器就能正常访问宿主机了

容器内访问其它的都会直接转发到ENI网关,由网关去转发

都由网关去转发,会造成一个新问题是:Service将无法访问

为了解决这个问题,引入了cilium ebpf能力,Service在POD的网络命名空间内就会被ebpf转为某个Service后端pod的IP,然后直接通过ENI网关转发,相对比较复杂,待进一步研究

参考

上一篇下一篇

猜你喜欢

热点阅读