DPVS SNAT Implementation

DPVS traffic is divided by direction into inbound and outbound.

Outbound packet processing flow

An internal server needs to access some API on the public network, and its request packet arrives at the DPVS server. The packet enters through one of the NIC queues, queuex, is received by cpux, and the related processing logic starts there. The receive path roughly follows the standard per-lcore polling pattern sketched below, and eventually delivers the packet to dp_vs_in.
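To make the "a packet from queue x is handled by cpu x" point concrete, here is a minimal sketch of the standard DPDK per-lcore polling pattern that DPVS's netif layer follows. process_packet, the 1:1 queue-id/lcore-id mapping, and the burst size are illustrative assumptions, not DPVS code:

#include <rte_ethdev.h>
#include <rte_lcore.h>
#include <rte_mbuf.h>

/* placeholder for the real L2/L3 processing, which for IP packets destined
 * to this host eventually reaches dp_vs_in */
static void process_packet(struct rte_mbuf *m)
{
    rte_pktmbuf_free(m);
}

/* every worker lcore polls its own RX queue, so whatever the NIC steers into
 * queue x is always handled by cpu x (queue-id == lcore-id assumed here) */
static int rx_loop(void *arg)
{
    uint16_t port_id  = *(uint16_t *)arg;
    uint16_t queue_id = (uint16_t)rte_lcore_id();
    struct rte_mbuf *mbufs[32];

    for (;;) {
        uint16_t nb = rte_eth_rx_burst(port_id, queue_id, mbufs, 32);
        for (uint16_t i = 0; i < nb; i++)
            process_packet(mbufs[i]);
    }
    return 0;
}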

dp_vs_in is actually just a wrapper that calls __dp_vs_in (listed in full later in this article), which is where the main logic lives:

  In conn_sched, for a new SNAT connection:

  1. Call svc->scheduler->schedule to pick one RS from the RS list as dest.
  2. For SNAT mode, call dp_vs_snat_schedule().
  3. With dest now confirmed as the new source IP address, call sa_fetch to pick a matching source port for it (a toy sketch of this idea follows after the lists below).
  4. Call dp_vs_conn_fill_param to copy the relevant data into the param variable.

  The new connection is then created:

  1. Allocate memory for the new conn by calling dp_vs_conn_alloc.
  2. Initialize the ntuple hashes that link the conn into the inbound and outbound connection tables.
  3. Initialize the remaining fields of the new conn.
  4. Bind the conn to its dest and set the corresponding xmit function.
  5. Add the new conn to the connection table with dp_vs_conn_hash.
  6. Initialize the related counters and timers.
  7. At this point a new SNAT connection is fully set up.

  Finally, the packet is transmitted:

  1. Call the relevant interfaces to look up the output route and rewrite the L3 and L4 headers.
  2. Resolve the next hop and related information, then call neigh_output to send the packet out.
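As a toy illustration of the sa_fetch step above, the sketch below picks a free local port for the chosen SNAT source address, so that the (lip, lport) pair uniquely identifies the connection for the return traffic. The sa_pool structure and sa_pool_fetch function are simplified stand-ins, not the real DPVS sa_pool implementation, which among other things keeps per-lcore pools coordinated with the NIC's flow rules:

#include <stdint.h>
#include <stdbool.h>

/* illustrative only: one pool per local (SNAT source) address */
struct sa_pool {
    uint32_t lip;          /* local address chosen as the new source IP */
    bool     used[65536];  /* naive per-port usage map */
    uint16_t hint;         /* next port to try */
};

/* pick a free source port; returns 0 on success, -1 if the pool is exhausted */
static int sa_pool_fetch(struct sa_pool *pool, uint16_t *lport)
{
    for (int i = 0; i < 65536; i++) {
        uint16_t p = (uint16_t)(pool->hint + i);
        if (p < 1024)          /* skip well-known ports */
            continue;
        if (!pool->used[p]) {
            pool->used[p] = true;
            pool->hint = (uint16_t)(p + 1);
            *lport = p;
            return 0;
        }
    }
    return -1;
}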

This covers the receiving, processing, and forwarding of the packet, completing the entire SNAT outbound flow.

Inbound packet processing flow

After the internal server has sent its request, the external server does its work and sends back a response packet, which arrives at the DPVS server. The DPDK driver matches FDIR rules on the key header fields, so the packet enters through the same NIC queue queuex and is received by the same cpux; as a result, the whole connection is handled by one and the same cpux.

The receive processing before dp_vs_in is basically identical to the outbound case, so it is not analyzed again here; a generic sketch of the queue-steering idea follows.
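The steering itself is done by the NIC. DPVS programs it through its own netif/sa_pool layer (historically via the flow-director APIs); the generic rte_flow sketch below only illustrates the idea of pinning replies for a given local address and port to one RX queue, and is not the code DPVS actually uses:

#include <rte_flow.h>

/* steer packets whose destination is (lip_be, lport_be) to queue_id, i.e. to
 * the lcore that created the SNAT connection; addresses/ports in network order */
static struct rte_flow *steer_to_queue(uint16_t port_id, uint32_t lip_be,
                                       uint16_t lport_be, uint16_t queue_id,
                                       struct rte_flow_error *err)
{
    struct rte_flow_attr attr = { .ingress = 1 };

    struct rte_flow_item_ipv4 ip_spec = { .hdr.dst_addr = lip_be };
    struct rte_flow_item_ipv4 ip_mask = { .hdr.dst_addr = 0xffffffff };
    struct rte_flow_item_tcp  tcp_spec = { .hdr.dst_port = lport_be };
    struct rte_flow_item_tcp  tcp_mask = { .hdr.dst_port = 0xffff };

    struct rte_flow_item pattern[] = {
        { .type = RTE_FLOW_ITEM_TYPE_ETH },
        { .type = RTE_FLOW_ITEM_TYPE_IPV4, .spec = &ip_spec, .mask = &ip_mask },
        { .type = RTE_FLOW_ITEM_TYPE_TCP,  .spec = &tcp_spec, .mask = &tcp_mask },
        { .type = RTE_FLOW_ITEM_TYPE_END },
    };

    struct rte_flow_action_queue queue = { .index = queue_id };
    struct rte_flow_action actions[] = {
        { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &queue },
        { .type = RTE_FLOW_ACTION_TYPE_END },
    };

    return rte_flow_create(port_id, &attr, pattern, actions, err);
}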

By repeating the outbound and inbound flows, the connection carries the application traffic through multiple packet exchanges, which is how the SNAT function is realized.


Open issues:

How NAT mode works

For inbound traffic, what is actually done is DNAT: the destination IP is rewritten from the LB VIP to the real RS IP, so the backend RS can see the client IP. For outbound traffic, SNAT is done: the source address is rewritten to the LB VIP.
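A minimal sketch of that rewrite, assuming DPDK's rte_ipv4_hdr layout, a hypothetical nat_conn structure holding the VIP and RS address, and that the mbuf data pointer already sits at the IP header; real DPVS also rewrites L4 ports, updates the L4 checksum, and prefers hardware checksum offload:

#include <rte_ip.h>
#include <rte_mbuf.h>

struct nat_conn {
    uint32_t vaddr;  /* LB VIP (network byte order) */
    uint32_t daddr;  /* real server IP (network byte order) */
};

/* inbound (client -> RS): DNAT, rewrite destination VIP -> RS IP */
static void nat_in_rewrite(struct rte_mbuf *mbuf, const struct nat_conn *conn)
{
    struct rte_ipv4_hdr *iph = rte_pktmbuf_mtod(mbuf, struct rte_ipv4_hdr *);

    iph->dst_addr = conn->daddr;
    iph->hdr_checksum = 0;
    iph->hdr_checksum = rte_ipv4_cksum(iph);  /* or leave to HW offload */
}

/* outbound (RS -> client): SNAT, rewrite source RS IP -> VIP */
static void nat_out_rewrite(struct rte_mbuf *mbuf, const struct nat_conn *conn)
{
    struct rte_ipv4_hdr *iph = rte_pktmbuf_mtod(mbuf, struct rte_ipv4_hdr *);

    iph->src_addr = conn->vaddr;
    iph->hdr_checksum = 0;
    iph->hdr_checksum = rte_ipv4_cksum(iph);
}

The full __dp_vs_in hook, with its comments translated, is listed below: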

/* return verdict INET_XXX
 * af from mbuf->l3_type? No! The field is rewritten by netif and conflicts with
 * m.packet_type(an union), so using a wrapper to get af.
 * */
static int __dp_vs_in(void *priv, struct rte_mbuf *mbuf,
                      const struct inet_hook_state *state, int af)
{
    struct dp_vs_iphdr iph;
    struct dp_vs_proto *prot;
    struct dp_vs_conn *conn;
    int dir, verdict, err, related;
    bool drop = false;
    lcoreid_t cid, peer_cid;
    eth_type_t etype = mbuf->packet_type; /* FIXME: use other field ? */
    assert(mbuf && state);
    // get the id of the lcore we are currently running on
    cid = peer_cid = rte_lcore_id();
    
    // the packet is not addressed to this host: return ACCEPT and let ipv4_rcv_fin handle it
    if (unlikely(etype != ETH_PKT_HOST))
        return INET_ACCEPT;

    // fill the internal dp_vs_iphdr; on error (mostly a wrong address family) return ACCEPT
    if (dp_vs_fill_iphdr(af, mbuf, &iph) != EDPVS_OK)
        return INET_ACCEPT;
    // handle ICMP messages, similar to the icmp_error handling logic in the Linux kernel
    if (unlikely(iph.proto == IPPROTO_ICMP ||
                 iph.proto == IPPROTO_ICMPV6)) {
        /* handle related ICMP error to existing conn */
        verdict = dp_vs_in_icmp(af, mbuf, &related);
        if (related || verdict != INET_ACCEPT)
            return verdict;
        /* let unrelated and valid ICMP goes down,
         * may implement ICMP fwd in the futher. */
    }
    // look up the L4 protocol handler; currently tcp, udp and icmp are implemented
    prot = dp_vs_proto_lookup(iph.proto);
    if (unlikely(!prot))
        return INET_ACCEPT;

    /*
     * Defrag ipvs-forwarding TCP/UDP is not supported for some reasons,
     *
     * - RSS/flow-director do not support TCP/UDP fragments, means it's
     *   not able to direct frags to same lcore as original TCP/UDP packets.
     * - per-lcore conn table will miss if frags reachs wrong lcore.
     *
     * If we redirect frags to "correct" lcore, it may cause performance
     * issue. Also it need to understand RSS algorithm. Moreover, for the
     * case frags in same flow are not occur in same lcore, a global lock is
     * needed, which is not a good idea.
     */ // IP fragments are not supported for now; this is related to flow director
    if (af == AF_INET && ip4_is_frag(ip4_hdr(mbuf))) {
        RTE_LOG(DEBUG, IPVS, "%s: frag not support.
", __func__);
        return INET_DROP;
    }
    // call the proto's conn_lookup function to find the session; for TCP it is tcp_conn_lookup.
    // The packet may be dropped here. dir is set to the traffic direction (client-to-LB or
    // real-server-to-LB), and peer_cid is the id of the lcore that should handle this connection.
    /* packet belongs to existing connection ? */
    conn = prot->conn_lookup(prot, &iph, mbuf, &dir, false, &drop, &peer_cid);

    if (unlikely(drop)) {
        RTE_LOG(DEBUG, IPVS, "%s: deny ip try to visit.
", __func__);
        return INET_DROP;
    }

    /*
     * The connection is not locally found, however the redirect is found so
     * forward the packet to the remote redirect owner core.
     */
    // if the connection is not handled on this lcore, restore mbuf->data_off to point at the L2
    // header and forward the mbuf to the owner lcore; on a successful ring enqueue this returns
    // INET_STOLEN, otherwise the packet is dropped
    if (cid != peer_cid) {
        /* recover mbuf.data_off to outer Ether header */
        rte_pktmbuf_prepend(mbuf, (uint16_t)sizeof(struct rte_ether_hdr));

        return dp_vs_redirect_pkt(mbuf, peer_cid);
    }
    // a brand-new connection has no session yet; conn_sched picks a backend real server for the request and sets up the connection
    if (unlikely(!conn)) {
        /* try schedule RS and create new connection */
        // call the proto's conn_sched hook to pick a backend rs and create the connection; on failure, return verdict
        if (prot->conn_sched(prot, &iph, mbuf, &conn, &verdict) != EDPVS_OK) {
            /* RTE_LOG(DEBUG, IPVS, "%s: fail to schedule.\n", __func__); */
            return verdict;
        }
        // SNAT mode means an internal server is accessing an external service
        // (internal server ---> dpvs ---> external server), so set dir = DPVS_CONN_DIR_OUTBOUND
        /* only SNAT triggers connection by inside-outside traffic. */
        if (conn->dest->fwdmode == DPVS_FWD_MODE_SNAT)
            dir = DPVS_CONN_DIR_OUTBOUND;
        else // all other modes set dir = DPVS_CONN_DIR_INBOUND
            dir = DPVS_CONN_DIR_INBOUND;
    } else {
        /* assert(conn->dest->svc != NULL); */
        if (conn->dest && conn->dest->svc &&
                prot->conn_expire_quiescent &&
                (conn->dest->svc->flags & DPVS_CONN_F_EXPIRE_QUIESCENT)) {
            if (rte_atomic16_read(&conn->dest->weight) == 0) {
                RTE_LOG(INFO, IPVS, "%s: the conn is quiescent, expire it right now,"
                        " and drop the packet!
", __func__);
                prot->conn_expire_quiescent(conn);
                dp_vs_conn_put(conn);
                return INET_DROP;
            }
        }
    }
    // special handling for syn-proxy
    if (conn->flags & DPVS_CONN_F_SYNPROXY) {
        if (dir == DPVS_CONN_DIR_INBOUND) {
            /* Filter out-in ack packet when cp is at SYN_SENT state.
             * Drop it if not a valid packet, store it otherwise */
            if (0 == dp_vs_synproxy_filter_ack(mbuf, conn, prot,
                                               &iph, &verdict)) {
                dp_vs_stats_in(conn, mbuf);
                dp_vs_conn_put(conn);
                return verdict;
            }

            /* "Reuse" synproxy sessions.
             * "Reuse" means update syn_proxy_seq struct
             * and clean ack_mbuf etc. */
            if (0 != dp_vs_synproxy_ctrl_conn_reuse) {
                if (0 == dp_vs_synproxy_reuse_conn(af, mbuf, conn, prot,
                                                   &iph, &verdict)) {
                    dp_vs_stats_in(conn, mbuf);
                    dp_vs_conn_put(conn);
                    return verdict;
                }
            }
        } else {
            /* Syn-proxy 3 logic: receive syn-ack from rs */
            if (dp_vs_synproxy_synack_rcv(mbuf, conn, prot,
                                          iph.len, &verdict) == 0) {
                dp_vs_stats_out(conn, mbuf);
                dp_vs_conn_put(conn);
                return verdict;
            }
        }
    }
    // state transition; for TCP this is tcp_state_trans
    if (prot->state_trans) {
        err = prot->state_trans(prot, conn, mbuf, dir);
        if (err != EDPVS_OK)
            RTE_LOG(WARNING, IPVS, "%s: fail to trans state.", __func__);
    }
    conn->old_state = conn->state;

    /* holding the conn, need a "put" later. */
    // choose how to transmit the packet based on the traffic direction dir
    if (dir == DPVS_CONN_DIR_INBOUND)
        return xmit_inbound(mbuf, prot, conn);
    else
        return xmit_outbound(mbuf, prot, conn);
}

Original article: https://www.cnblogs.com/codestack/p/15717792.html
