veth虚拟网卡

2020-09-22 本文已影响0人分享放大价值

veth是kernel提供的一种虚拟网卡，总是成对出现。在一端发送数据，就可以在另一端接收到，类似一根网线。那么它是如何实现的呢？今天就看一下它的实现。

veth创建

可以使用ip命令创建，如下。

[root@localhost ~]# ip link add vetha type veth peer name vethb

在kernel端需要提前加载veth module，如下。这个module做的事情很简单，就是注册一个 rtnl_link_ops。

[root@localhost ~]# modprobe veth
[root@localhost ~]# lsmod | grep veth
veth                   13410  0

//veth.ko 初始化
/* Link kind string for this driver; used as .kind in veth_link_ops below. */
#define DRV_NAME    "veth"
/*
 * rtnl_link_ops descriptor for the "veth" link kind.
 *
 * It is handed to rtnl_link_register() at init time. rtnl_newlink later
 * looks the ops up by kind string and, when .newlink is set, calls it to
 * finish creating the device.
 */
static struct rtnl_link_ops veth_link_ops = {
    /* Kind string that rtnl_link_ops_get() matches against. */
    .kind       = DRV_NAME,
    /* Size of struct veth_priv — presumably the area netdev_priv() returns. */
    .priv_size  = sizeof(struct veth_priv),
    .setup      = veth_setup,
    .validate   = veth_validate,
    /* Called from rtnl_newlink as ops->newlink(net, dev, tb, data). */
    .newlink    = veth_newlink,
    /* Set explicitly, so __rtnl_link_register() does not install its default. */
    .dellink    = veth_dellink,
    .policy     = veth_policy,
    .maxtype    = VETH_INFO_MAX,
};

/*
 * init/fini
 */

/*
 * Init-time entry point (marked __init): registers veth_link_ops through
 * rtnl_link_register() and returns that call's result unchanged
 * (0 on success, a negative errno such as -EEXIST otherwise).
 */
static __init int veth_init(void)
{
    return rtnl_link_register(&veth_link_ops);
}
/*
 * rtnl_link_register - register a link kind while holding the rtnl lock.
 * @ops: the rtnl_link_ops to register
 *
 * Wraps __rtnl_link_register() in rtnl_lock()/rtnl_unlock().
 *
 * Returns whatever __rtnl_link_register() returns: 0 on success or
 * -EEXIST when the kind is already registered.
 */
int rtnl_link_register(struct rtnl_link_ops *ops)
{
    int ret;

    rtnl_lock();
    ret = __rtnl_link_register(ops);
    rtnl_unlock();

    return ret;
}
/*
 * __rtnl_link_register - add @ops to the list of known link kinds.
 * @ops: the rtnl_link_ops to register
 *
 * Lock-free variant; rtnl_link_register() calls it with the rtnl lock held.
 *
 * Returns 0 on success, or -EEXIST if rtnl_link_ops_get() already finds
 * an entry for ops->kind.
 */
int __rtnl_link_register(struct rtnl_link_ops *ops)
{
    if (rtnl_link_ops_get(ops->kind))
        return -EEXIST;

    /*
     * Only ops that can create devices (i.e. that provide setup) get the
     * default dellink handler. Ops without setup keep dellink unset,
     * which disables rtnl_dellink for them.
     */
    if (!ops->dellink) {
        if (ops->setup)
            ops->dellink = unregister_netdevice_queue;
    }

    list_add_tail(&ops->list, &link_ops);

    return 0;
}

通过命令 ip link add vetha type veth peer name vethb 创建veth时，在kernel中调用rtnl_newlink，会根据传入的type查找rtnl_link_ops，再调用rtnl_link_ops的newlink创建veth的peer，并将两端veth分别放在对方的私有数据中。

rtnl_newlink
    if (linkinfo[IFLA_INFO_KIND]) {
        nla_strlcpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind));
        ops = rtnl_link_ops_get(kind);

    struct net_device *dev;
    dev = rtnl_create_link(dest_net, ifname, name_assign_type, ops, tb);
    
    if (ops->newlink) {
        err = ops->newlink(net, dev, tb, data);

static int veth_newlink(struct net *src_net, struct net_device *dev,
             struct nlattr *tb[], struct nlattr *data[])

    struct net_device *peer;
    peer = rtnl_create_link(net, ifname, name_assign_type,
                &veth_link_ops, tbp);

    register_netdevice(peer);
    register_netdevice(dev);

    /*
     * tie the deviced together
     */
   //这里是关键，将peer放在dev的priv中，将dev放在peer的priv中，
   //这样将这两个虚拟设备绑定到一起。
    priv = netdev_priv(dev);
    rcu_assign_pointer(priv->peer, peer);

    priv = netdev_priv(peer);
    rcu_assign_pointer(priv->peer, dev);

static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
    //取出peer设备
    struct veth_priv *priv = netdev_priv(dev);
    rcv = rcu_dereference(priv->peer);
    //调用 dev_forward_skb 发送数据时，已经换成peer设备rcv了
    //相当于 peer 设备的接收
    if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) {
        struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
        u64_stats_update_begin(&stats->syncp);
        stats->bytes += length;
        stats->packets++;
        u64_stats_update_end(&stats->syncp);
    } else {
drop:
        atomic64_inc(&priv->dropped);
    }
    rcu_read_unlock();
    return NETDEV_TX_OK;
}
/*
 * dev_forward_skb - hand @skb to @dev as if @dev had just received it.
 *
 * Runs the forwardability checks and packet preparation in
 * __dev_forward_skb(). If that returns non-zero, the value is returned
 * as-is. Otherwise the packet is passed to netif_rx_internal(), which
 * feeds it into the host protocol stack, and that result is returned.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
    int rc = __dev_forward_skb(dev, skb);

    if (rc)
        return rc;

    return netif_rx_internal(skb);
}
/*
 * __dev_forward_skb - prepare @skb for reception on @dev.
 *
 * The packet is dropped (dev->rx_dropped incremented, skb freed,
 * NET_RX_DROP returned) when either:
 *   - the skb has SKBTX_DEV_ZEROCOPY set and skb_copy_ubufs() fails, or
 *   - is_skb_forwardable() rejects it for @dev.
 *
 * Otherwise the skb is scrubbed, skb->protocol is set from
 * eth_type_trans() (which also retargets the skb to @dev), the receive
 * checksum is adjusted for the pulled Ethernet header, and 0 is returned.
 */
int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
    /* Only call skb_copy_ubufs() when the zerocopy flag is set. */
    if ((skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) &&
        skb_copy_ubufs(skb, GFP_ATOMIC))
        goto drop;

    if (unlikely(!is_skb_forwardable(dev, skb)))
        goto drop;

    skb_scrub_packet(skb, true);
    skb->protocol = eth_type_trans(skb, dev);
    skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

    return 0;

drop:
    atomic_long_inc(&dev->rx_dropped);
    kfree_skb(skb);
    return NET_RX_DROP;
}
/*
 * Sets skb->pkt_type according to the destination MAC address.
 *
 * Also retargets the skb to @dev, records the MAC header position and
 * pulls the Ethernet header (ETH_HLEN bytes).
 *
 * Classification shown in this listing:
 *   - multicast destination equal to dev->broadcast -> PACKET_BROADCAST
 *   - any other multicast destination               -> PACKET_MULTICAST
 *   - unicast destination != dev->dev_addr          -> PACKET_OTHERHOST
 * In every other case pkt_type is left untouched by the code shown.
 */
__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
{
    unsigned short _service_access_point;
    const unsigned short *sap;
    const struct ethhdr *eth;

    /* From here on the packet belongs to the receiving device. */
    skb->dev = dev;
    skb_reset_mac_header(skb);
    skb_pull_inline(skb, ETH_HLEN);
    eth = eth_hdr(skb);

    if (unlikely(is_multicast_ether_addr(eth->h_dest))) {
        if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast))
            skb->pkt_type = PACKET_BROADCAST;
        else
            skb->pkt_type = PACKET_MULTICAST;
    }
    /* Unicast frame whose destination MAC is not this device's own address. */
    else if (unlikely(!ether_addr_equal_64bits(eth->h_dest,
                           dev->dev_addr)))
        skb->pkt_type = PACKET_OTHERHOST;
  /* The rest of the function (protocol detection, return) is elided in this listing. */
  ....
}

veth使用

我见过的使用方法有两种，如下

a. 一端在 root namespace，另一端放在其他 namespace，连接两个namespace，比如在k8s中，calico, cilium和ovs这些cni都是如此实现的。
b. 两端分别放在两个网桥上，连接这两个网桥。

第二种情况比较简单，重点说一下第一种情况遇到的几个问题。

实验步骤如下，
创建一对veth口，vetha和vethb，
创建一个namespace test，
将vetha放入namespace test，
将vetha和vethb都up起来，
给vetha配置ip 1.1.1.2

[root@localhost ~]# ip link add vetha type veth peer name vethb
[root@localhost ~]# ip link set dev vethb up
[root@localhost ~]# ip netns add test
[root@localhost ~]# ip link set dev vetha netns test
[root@localhost ~]# ip netns exec test ip link set dev vetha up
[root@localhost ~]# ip netns exec test ip address add dev vetha 1.1.1.2/24

[root@localhost ~]# ip netns exec test ip a
1: lo: <LOOPBACK> mtu 65536 qdisc noop state DOWN qlen 1
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
30: vetha@if29: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP qlen 1000
    link/ether 96:7f:a6:ea:93:23 brd ff:ff:ff:ff:ff:ff link-netnsid 0
    inet 1.1.1.2/24 scope global vetha
       valid_lft forever preferred_lft forever
    inet6 fe80::74ef:e3ff:fe5d:2db0/64 scope link
       valid_lft forever preferred_lft forever

[root@localhost ~]# ip netns exec test ip r
1.1.1.0/24 dev vetha proto kernel scope link src 1.1.1.2

[root@localhost ~]# ip netns exec test arp -n
Address                  HWtype  HWaddress           Flags Mask            Iface
1.1.1.4                          (incomplete)                              vetha

[root@localhost ~]# ifconfig vethb
vethb: flags=4163<UP,BROADCAST,RUNNING,MULTICAST>  mtu 1500
        inet6 fe80::947f:a6ff:feea:9321  prefixlen 64  scopeid 0x20<link>
        ether 96:7f:a6:ea:93:21  txqueuelen 1000  (Ethernet)
        RX packets 208784  bytes 12836250 (12.2 MiB)
        RX errors 0  dropped 0  overruns 0  frame 0
        TX packets 1143  bytes 62469 (61.0 KiB)
        TX errors 0  dropped 0 overruns 0  carrier 0  collisions 0

问题1：如果在test namespace中，设置一个静态arp，ping一个不存在的地址1.1.1.4

[root@localhost ~]# ip netns exec test arp -s 1.1.1.4 00:00:00:00:00:01
[root@localhost ~]# ip netns exec test arp -n
Address                  HWtype  HWaddress           Flags Mask            Iface
1.1.1.4                  ether   00:00:00:00:00:01   CM                    vetha

icmp报文到达vethb设备后，走host的协议栈，匹配到host的默认路由，按理应该会从em1发出去，但实际结果是vethb可以抓到icmp报文，em1抓不到。

[root@localhost ~]# ip r
default via 10.164.129.1 dev em1 proto static metric 100
10.10.10.0/24 dev gre10 proto kernel scope link src 10.10.10.1
10.164.129.0/24 dev em1 proto kernel scope link src 10.164.129.16 metric 100

[root@localhost ~]# tcpdump -vne -i vethb icmp
tcpdump: listening on vethb, link-type EN10MB (Ethernet), capture size 262144 bytes
08:04:49.386991 f6:d4:d2:de:20:be > 00:00:00:00:00:01, ethertype IPv4 (0x0800), length 98: (tos 0x0, ttl 64, id 48297, offset 0, flags [DF], proto ICMP (1), length 84)
    1.1.1.2 > 1.1.1.4: ICMP echo request, id 30782, seq 21, length 64
08:04:50.386985 f6:d4:d2:de:20:be > 00:00:00:00:00:01, ethertype IPv4 (0x0800), length 98: (tos 0x0, ttl 64, id 48427, offset 0, flags [DF], proto ICMP (1), length 84)
    1.1.1.2 > 1.1.1.4: ICMP echo request, id 30782, seq 22, length 64
^C
2 packets captured
2 packets received by filter
0 packets dropped by kernel
[root@localhost ~]# tcpdump -vne -i em1 icmp
tcpdump: listening on em1, link-type EN10MB (Ethernet), capture size 262144 bytes
^C
0 packets captured
0 packets received by filter
0 packets dropped by kernel

原因是icmp报文的目的mac为00:00:00:00:00:01，而在vethb收到此报文后，在函数eth_type_trans中会根据目的mac给skb->pkt_type赋值，因为目的mac不为vethb的mac，所以skb->pkt_type被设置成PACKET_OTHERHOST。

veth_xmit ->dev_forward_skb -> __dev_forward_skb -> eth_type_trans
    if (unlikely(is_multicast_ether_addr(eth->h_dest))) {
        if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast))
            skb->pkt_type = PACKET_BROADCAST;
        else
            skb->pkt_type = PACKET_MULTICAST;
    }
    else if (unlikely(!ether_addr_equal_64bits(eth->h_dest,
                           dev->dev_addr)))
        skb->pkt_type = PACKET_OTHERHOST;

随后将报文送入主机协议栈，在ip_rcv中有个判断如果skb->pkt_type == PACKET_OTHERHOST就直接drop报文，比较恶心的是，这个drop没有统计信息可看。
看来得在test namespace中将1.1.1.4对应的mac设置为vethb的mac地址。

netif_rx_internal -> enqueue_to_backlog -> process_backlog ->__netif_receive_skb -> __netif_receive_skb_core -> ip_rcv
/*
 * IPv4 receive handler (listing shows only the pkt_type check).
 *
 * A packet whose pkt_type is PACKET_OTHERHOST is freed with kfree_skb()
 * and NET_RX_DROP is returned; no counter is updated on this path in the
 * code shown.
 *
 * NOTE(review): this listing appears abridged — as written, every path
 * falls through to the drop label, and iph, len and the out label are
 * unused.
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
    const struct iphdr *iph;
    u32 len;

    /* When the interface is in promisc. mode, drop all the crap
     * that it receives, do not try to analyse it.
     */
    if (skb->pkt_type == PACKET_OTHERHOST)
        goto drop;
drop:
    kfree_skb(skb);
out:
    return NET_RX_DROP;
}

问题2: 如下，将1.1.1.4对应的mac修改为vethb的mac了，但是仍然有问题，vethb可以收到，em1收不到。

[root@localhost ~]# ip link show dev vethb
33: vethb@if34: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP mode DEFAULT qlen 1000
    link/ether 96:ec:6a:a8:67:ed brd ff:ff:ff:ff:ff:ff link-netnsid 0
[root@localhost ~]# ip netns exec test arp -d 1.1.1.4
[root@localhost ~]# ip netns exec test arp -s 1.1.1.4 96:ec:6a:a8:67:ed
[root@localhost ~]# ip netns exec test arp -n
Address                  HWtype  HWaddress           Flags Mask            Iface
1.1.1.4                  ether   96:ec:6a:a8:67:ed   CM                    vetha

[root@localhost ~]# ip link show dev vethb
33: vethb@if34: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP mode DEFAULT qlen 1000
    link/ether 96:ec:6a:a8:67:ed brd ff:ff:ff:ff:ff:ff link-netnsid 0
[root@localhost ~]# tcpdump -vne -i vethb icmp
tcpdump: listening on vethb, link-type EN10MB (Ethernet), capture size 262144 bytes
08:15:21.495979 f6:d4:d2:de:20:be > 96:ec:6a:a8:67:ed, ethertype IPv4 (0x0800), length 98: (tos 0x0, ttl 64, id 23666, offset 0, flags [DF], proto ICMP (1), length 84)
    1.1.1.2 > 1.1.1.4: ICMP echo request, id 31770, seq 32, length 64
^C
1 packet captured
1 packet received by filter
0 packets dropped by kernel
[root@localhost ~]# tcpdump -vne -i em1 icmp
tcpdump: listening on em1, link-type EN10MB (Ethernet), capture size 262144 bytes
^C
0 packets captured
0 packets received by filter
0 packets dropped by kernel

问题3: 这又是另一个问题了。上面em1仍然收不到报文，原因是在调用ip_route_input_noref查找路由时，虽然可以匹配到默认路由，但是因为 vethb 没有开启forward功能，所以仍然会失败。

ip_route_input_noref -> ip_route_input_slow
    fl4.flowi4_oif = 0;
    fl4.flowi4_iif = dev->ifindex;
    fl4.flowi4_mark = skb->mark;
    fl4.flowi4_tos = tos;
    fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
    fl4.daddr = daddr;
    fl4.saddr = saddr;
    err = fib_lookup(net, &fl4, &res);

    if (!IN_DEV_FORWARD(in_dev)) {
        err = -EHOSTUNREACH;
        goto no_route;
    }
no_route:
    RT_CACHE_STAT_INC(in_no_route);
    res.type = RTN_UNREACHABLE;
    res.fi = NULL;

可以通过下面的命令查看丢包计数 in_no_route

cat /proc/net/stat/rt_cache | awk -F " " '{print $5}'

接下来使能vethb的forwarding试试看

[root@localhost ~]# echo 1 > /proc/sys/net/ipv4/conf/vethb/forwarding
[root@localhost ~]# cat /proc/sys/net/ipv4/conf/vethb/forwarding
1

注意：如果 /proc/sys/net/ipv4/conf/all/forwarding 使能了，则新创建的网卡的forwarding 功能都会默认被使能。

问题4: 再次ping还是不行，这是由于反向路径检查失败导致的。内核会调用fib_validate_source，以报文的源ip作为目的地址查找路由表，此时只能匹配到默认路由，而默认路由的出接口（em1）和报文的入接口（vethb）不是同一个，所以检查失败。也就是说，去往报文源地址的路由出接口必须与该报文的入接口是同一个设备，这称为对称路由。

ip_route_input_slow -> __mkroute_input
/* Ignore rp_filter for packets protected by IPsec. */
    err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
                  in_dev->dev, in_dev, &itag);
    if (err < 0) {
        ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
                     saddr);

        goto cleanup;
    }

/*
 * fib_validate_source - entry point of source-address (reverse-path) validation.
 *
 * The effective rp_filter value is IN_DEV_RPFILTER(idev), except that
 * packets with an IPsec secpath are treated as if rp_filter were 0.
 *
 * Fast path: the reverse lookup is skipped, *itag is set to 0 and 0 is
 * returned when all of the following hold:
 *   - rp_filter is 0,
 *   - fib_num_tclassid_users() is 0 for the device's netns,
 *   - accept_local is enabled on @idev,
 *   - @dev is not @oif, or tx redirects are off on @idev.
 *
 * Otherwise the result of __fib_validate_source() is returned.
 */
int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
            u8 tos, int oif, struct net_device *dev,
            struct in_device *idev, u32 *itag)
{
    int rpf;

    /* Reverse-path filter switch; 0 means the check is not enforced. */
    if (secpath_exists(skb))
        rpf = 0;
    else
        rpf = IN_DEV_RPFILTER(idev);

    if (rpf || fib_num_tclassid_users(dev_net(dev)) ||
        !IN_DEV_ACCEPT_LOCAL(idev) ||
        (dev->ifindex == oif && IN_DEV_TX_REDIRECTS(idev)))
        return __fib_validate_source(skb, src, dst, tos, oif, dev, rpf, idev, itag);

    *itag = 0;
    return 0;
}

/*
 * Slow path of source validation: performs a reverse route lookup, using
 * the packet's source address as the lookup destination, and checks
 * whether the resulting route points back at the receiving device.
 *
 * @rpf is the effective rp_filter value computed by the caller.
 *
 * Returns:
 *   0 or 1   - accepted; the value is (nh_scope >= RT_SCOPE_HOST), or
 *              plain 0 from the last_resort path
 *   -EINVAL  - the reverse route is neither unicast nor an accepted
 *              local route
 *   -EXDEV   - reverse-path filter failure
 */
static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
                 u8 tos, int oif, struct net_device *dev,
                 int rpf, struct in_device *idev, u32 *itag)
{
    int ret, no_addr;
    struct fib_result res;
    struct flowi4 fl4;
    struct net *net;
    bool dev_match;

    /* Build the reversed flow: source and destination are swapped. */
    fl4.flowi4_oif = 0;
    /* Falls back to LOOPBACK_IFINDEX when oif is 0. */
    fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
    fl4.daddr = src;
    fl4.saddr = dst;
    fl4.flowi4_tos = tos;
    fl4.flowi4_scope = RT_SCOPE_UNIVERSE;

    /* True when the input device has no entries in ifa_list. */
    no_addr = idev->ifa_list == NULL;

    fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;

    net = dev_net(dev);
    /* No reverse route at all: accept only if rp_filter is off. */
    if (fib_lookup(net, &fl4, &res))
        goto last_resort;
    /* Only unicast routes, or local routes when accept_local is set, qualify. */
    if (res.type != RTN_UNICAST &&
        (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
        goto e_inval;
    if (!rpf && !fib_num_tclassid_users(dev_net(dev)) &&
        (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev)))
        goto last_resort;
    fib_combine_itag(itag, &res);
    dev_match = false;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
    /* With multipath, any next hop leaving through dev counts as a match. */
    for (ret = 0; ret < res.fi->fib_nhs; ret++) {
        struct fib_nh *nh = &res.fi->fib_nh[ret];

        if (nh->nh_dev == dev) {
            dev_match = true;
            break;
        }
    }
#else
    /*
     * The check passes here only when the reverse route's output device
     * is the device the packet arrived on.
     */
    if (FIB_RES_DEV(res) == dev)
        dev_match = true;
#endif
    if (dev_match) {
        ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
        return ret;
    }
    if (no_addr)
        goto last_resort;
    /* rp_filter == 1: a device mismatch is an immediate failure. */
    if (rpf == 1)
        goto e_rpf;
    /* Otherwise retry the lookup restricted to the receiving device. */
    fl4.flowi4_oif = dev->ifindex;

    ret = 0;
    if (fib_lookup(net, &fl4, &res) == 0) {
        if (res.type == RTN_UNICAST)
            ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
    }
    return ret;

last_resort:
    /* Accept without a tag unless reverse-path filtering is enabled. */
    if (rpf)
        goto e_rpf;
    *itag = 0;
    return 0;

e_inval:
    return -EINVAL;
e_rpf:
    return -EXDEV;
}

可以通过下面的命令查看drop计数 in_martian_src

[root@localhost ~]# cat /proc/net/stat/rt_cache | awk -F " " '{print $8}'

这个问题的解决办法有两个

a. 添加对称路由
b. 关闭反向路径检查

a. 添加对称路由。如下，在host上添加一条经vethb到达test namespace（1.1.1.2）的路由后，icmp报文可以从em1发出去了。

[root@localhost ~]# ip route add 1.1.1.2 dev vethb
[root@localhost ~]# ip r
default via 10.164.129.1 dev em1 proto static metric 100
1.1.1.2 dev vethb scope link
10.10.10.0/24 dev gre10 proto kernel scope link src 10.10.10.1
10.164.129.0/24 dev em1 proto kernel scope link src 10.164.129.16 metric 100
169.254.0.0/16 dev provisioning_nw scope link metric 1016
169.254.0.0/16 dev idrac_nw scope link metric 1017
192.168.0.0/24 dev provisioning_nw proto kernel scope link src 192.168.0.253
192.168.10.0/24 dev idrac_nw proto kernel scope link src 192.168.10.13
192.168.122.0/24 dev virbr0 proto kernel scope link src 192.168.122.1
[root@localhost ~]# tcpdump -vne -i em1 icmp or arp
tcpdump: listening on em1, link-type EN10MB (Ethernet), capture size 262144 bytes
08:56:10.509045 90:b1:1c:55:37:1e > 00:00:0c:07:ac:02, ethertype IPv4 (0x0800), length 98: (tos 0x0, ttl 63, id 28970, offset 0, flags [DF], proto ICMP (1), length 84)
    1.1.1.2 > 1.1.1.4: ICMP echo request, id 1009, seq 1664, length 64
08:56:11.509051 90:b1:1c:55:37:1e > 00:00:0c:07:ac:02, ethertype IPv4 (0x0800), length 98: (tos 0x0, ttl 63, id 29200, offset 0, flags [DF], proto ICMP (1), length 84)
    1.1.1.2 > 1.1.1.4: ICMP echo request, id 1009, seq 1665, length 64

b. 关闭反向路径检查
实际生效的rp_filter取设备自身配置和all配置两者中的最大值，所以必须把设备和all的rp_filter都关闭。

/* Effective rp_filter value for in_dev: see IN_DEV_MAXCONF below. */
#define IN_DEV_RPFILTER(in_dev)     IN_DEV_MAXCONF((in_dev), RP_FILTER)

/*
 * Evaluates to the larger of the netns-wide "all" setting and the
 * per-device setting for attr. A setting therefore only evaluates to 0
 * when both the "all" value and the device value are 0.
 */
#define IN_DEV_MAXCONF(in_dev, attr) \
    (max(IPV4_DEVCONF_ALL(dev_net(in_dev->dev), attr), \
         IN_DEV_CONF_GET((in_dev), attr)))

[root@localhost ~]# ip route del 1.1.1.2 dev vethb
[root@localhost ~]# echo 0 > /proc/sys/net/ipv4/conf/all/rp_filter
[root@localhost ~]# echo 0 > /proc/sys/net/ipv4/conf/vethb/rp_filter

[root@localhost ~]# tcpdump -vne -i em1 icmp or arp
tcpdump: listening on em1, link-type EN10MB (Ethernet), capture size 262144 bytes
09:01:22.555047 90:b1:1c:55:37:1e > 00:00:0c:07:ac:02, ethertype IPv4 (0x0800), length 98: (tos 0x0, ttl 63, id 58344, offset 0, flags [DF], proto ICMP (1), length 84)
    1.1.1.2 > 1.1.1.4: ICMP echo request, id 1009, seq 1976, length 64
09:01:23.555046 90:b1:1c:55:37:1e > 00:00:0c:07:ac:02, ethertype IPv4 (0x0800), length 98: (tos 0x0, ttl 63, id 58481, offset 0, flags [DF], proto ICMP (1), length 84)
    1.1.1.2 > 1.1.1.4: ICMP echo request, id 1009, seq 1977, length 64

proxy_arp

到这里icmp报文算是成功发出去了，但是test namespace中1.1.1.4的mac地址是手动设置的，不太灵活，可以使用设备的 proxy_arp 功能。

//使能 proxy_arp
[root@localhost ~]# echo 1 >  /proc/sys/net/ipv4/conf/vethb/proxy_arp

[root@localhost ~]# taskset -c 3 ip netns exec test ping 1.1.1.4
PING 1.1.1.4 (1.1.1.4) 56(84) bytes of data.
^C
--- 1.1.1.4 ping statistics ---
1 packets transmitted, 0 received, 100% packet loss, time 0ms

//学到了vethb的mac地址
[root@localhost ~]# ip netns exec test arp -n
Address                  HWtype  HWaddress           Flags Mask            Iface
1.1.1.4                  ether   96:ec:6a:a8:67:ed   C                     vetha

但是在 arp 处理过程中，也会经过查找路由、反向路径检查等流程，所以上面的问题也都会遇到，按照上面的方法设置一下就行。

arp_process
    if (arp->ar_op == htons(ARPOP_REQUEST) &&
        ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
        rt = skb_rtable(skb);
        addr_type = rt->rt_type;

        if (addr_type == RTN_LOCAL) {
        ...
      } else if (IN_DEV_FORWARD(in_dev)) {
            if (addr_type == RTN_UNICAST  &&
                (arp_fwd_proxy(in_dev, dev, rt) ||
                 arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
                 (rt->dst.dev != dev &&
                  pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) {
                n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
                if (n)
                    neigh_release(n);

                if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
                    skb->pkt_type == PACKET_HOST ||
                    NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) {
                    arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
                         dev, tip, sha, dev->dev_addr,
                         sha);
                } else {
                    pneigh_enqueue(&arp_tbl,
                               in_dev->arp_parms, skb);
                    return 0;
                }
                goto out;
            }

总结

a. 如果veth一端在其他namespace（如test），另一端在root namespace且没有被加入到网桥，那么从test namespace中通过veth发出的报文，其目的mac如果是单播地址，就必须是veth的peer设备的mac。可以在test namespace中静态配置arp，或者在peer设备上开启代理arp。
b. 为了通过反向路径检查，可以关闭反向路径检查或者设置对称路由。
c. 必须使能设备的 forwarding 功能。

用到的命令如下，其实在k8s的calico cni网络中，基本上就是下面的几个设置。

echo 1 >  /proc/sys/net/ipv4/conf/vethb/proxy_arp
echo 1 > /proc/sys/net/ipv4/conf/vethb/forwarding
echo 0 > /proc/sys/net/ipv4/conf/vethb/rp_filter
ip route add 1.1.1.2 dev vethb

veth虚拟网卡

veth创建

veth使用

proxy_arp

总结

猜你喜欢

热点阅读