IPVS分析

1. Netfilter & CONNTRACK & IPVS结构图
[文章图片:IPVS分析]

2. IPVS
ipvs只有DNAT和de-DNAT功能,它独立于iptables和conntrack,实现了自己的一套连接跟踪表和NAT机制。
2.1 ipvs与conntrack的联系: ipvs仅仅在做DNAT后对conntrack连接进行更新,防止回包因为没有记录而被丢弃
ipvs在TUNNEL模式下,会调用nf_conntrack_confirm函数对连接进行确认
2.2 ipvs注册的钩子函数

static const struct nf_hook_ops ip_vs_ops[] = { /* After packet filtering, change source only for VS/NAT */ { .hook= ip_vs_reply4, /* ip_vs_out */ .pf= NFPROTO_IPV4, .hooknum= NF_INET_LOCAL_IN, .priority= NF_IP_PRI_NAT_SRC - 2, }, /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be * applied to IPVS. */ { .hook= ip_vs_remote_request4, .pf= NFPROTO_IPV4, .hooknum= NF_INET_LOCAL_IN, .priority= NF_IP_PRI_NAT_SRC - 1, }, /* Before ip_vs_in, change source only for VS/NAT */ { .hook= ip_vs_local_reply4, .pf= NFPROTO_IPV4, .hooknum= NF_INET_LOCAL_OUT, .priority= NF_IP_PRI_NAT_DST + 1, }, /* After mangle, schedule and forward local requests */ { .hook= ip_vs_local_request4, .pf= NFPROTO_IPV4, .hooknum= NF_INET_LOCAL_OUT, .priority= NF_IP_PRI_NAT_DST + 2, }, /* After packet filtering (but before ip_vs_out_icmp), catch icmp * destined for 0.0.0.0/0, which is for incoming IPVS connections */ { .hook= ip_vs_forward_icmp, .pf= NFPROTO_IPV4, .hooknum= NF_INET_FORWARD, .priority= 99, }, /* After packet filtering, change source only for VS/NAT */ { .hook= ip_vs_reply4, .pf= NFPROTO_IPV4, .hooknum= NF_INET_FORWARD, .priority= 100, }, };

2.3 IPVS中tcp协议的状态转换表
/* *Timeout table[state] */ static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { [IP_VS_TCP_S_NONE]=2*HZ, [IP_VS_TCP_S_ESTABLISHED]=15*60*HZ, [IP_VS_TCP_S_SYN_SENT]=2*60*HZ, [IP_VS_TCP_S_SYN_RECV]=1*60*HZ, [IP_VS_TCP_S_FIN_WAIT]=2*60*HZ, [IP_VS_TCP_S_TIME_WAIT]=2*60*HZ, [IP_VS_TCP_S_CLOSE]=10*HZ, [IP_VS_TCP_S_CLOSE_WAIT]=60*HZ, [IP_VS_TCP_S_LAST_ACK]=30*HZ, [IP_VS_TCP_S_LISTEN]=2*60*HZ, [IP_VS_TCP_S_SYNACK]=120*HZ, [IP_VS_TCP_S_LAST]=2*HZ, }; #define sNO IP_VS_TCP_S_NONE #define sES IP_VS_TCP_S_ESTABLISHED #define sSS IP_VS_TCP_S_SYN_SENT #define sSR IP_VS_TCP_S_SYN_RECV #define sFW IP_VS_TCP_S_FIN_WAIT #define sTW IP_VS_TCP_S_TIME_WAIT #define sCL IP_VS_TCP_S_CLOSE #define sCW IP_VS_TCP_S_CLOSE_WAIT #define sLA IP_VS_TCP_S_LAST_ACK #define sLI IP_VS_TCP_S_LISTEN #define sSA IP_VS_TCP_S_SYNACKstatic struct tcp_states_t tcp_states[] = { /*INPUT ip_vs_in调用 */ /*sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA初始状态*/ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},/*OUTPUT ip_vs_out调用 */ /*sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA初始状态*/ /*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }}, /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},/*INPUT-ONLY ip_vs_in在没有收到回包时调用*/ /*sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, };

2.4 ip_vs_in 为目的地为虚拟服务器的数据包确认连接,并为连接分配后端,然后转发数据包
  1. 数据包四元组匹配到了连接记录
    1. 连接不复用
      • 释放连接
    2. 连接复用
      • 复用连接
  2. 数据包四元组没有匹配到连接记录,或者连接被释放
    1. 目的地是虚拟服务器
      • 分配后端,新建连接
    2. 目的地不是虚拟服务器
      • 返回ACCEPT
  3. 统计计数,更新四层协议连接状态
  4. 执行DNAT,转发数据包到LOCAL_OUT
  5. 更新连接保持时间
源码分析:
/*
 *	Check if it's for virtual services, look it up,
 *	and send it on its way...
 *
 *	Flow, as visible in this function:
 *	  1. Skip packets already marked by IPVS, packets that are not for
 *	     this host (outside LOCAL_OUT), and netns where IPVS is off.
 *	  2. Hand ICMP / ICMPv6 to the dedicated ICMP handlers.
 *	  3. Look the packet up in the IPVS connection table; depending on
 *	     conn_reuse_mode, an existing entry may be dropped so the packet
 *	     gets rescheduled.
 *	  4. If there is no usable entry, try to schedule a new one.
 *	  5. Update stats and protocol state, transmit via cp->packet_xmit,
 *	     feed the sync code and release the connection reference.
 *
 *	@ipvs:    per-netns IPVS state
 *	@hooknum: netfilter hook the packet was seen on
 *	@skb:     the packet
 *	@af:      address family (AF_INET or AF_INET6)
 *
 *	Returns a netfilter verdict: NF_ACCEPT or NF_DROP from the checks
 *	below, or whatever the ICMP handler, ip_vs_try_to_schedule() or
 *	cp->packet_xmit() returned.
 */
static unsigned int
ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af)
{
	struct ip_vs_iphdr iph;
	struct ip_vs_protocol *pp;
	struct ip_vs_proto_data *pd;
	struct ip_vs_conn *cp;
	int ret, pkts;
	int conn_reuse_mode;
	struct sock *sk;

	/* Already marked as IPVS request or reply? Then do not process it again. */
	if (skb->ipvs_property)
		return NF_ACCEPT;

	/*
	 *	Big tappo:
	 *	- remote client: only PACKET_HOST
	 *	- route: used for struct net when skb->dev is unset
	 */
	if (unlikely((skb->pkt_type != PACKET_HOST &&
		      hooknum != NF_INET_LOCAL_OUT) ||
		     !skb_dst(skb))) {
		ip_vs_fill_iph_skb(af, skb, false, &iph);
		IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
			      " ignored in hook %u\n",
			      skb->pkt_type, iph.protocol,
			      IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
		return NF_ACCEPT;
	}
	/* ipvs enabled in this netns ? */
	if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
		return NF_ACCEPT;

	/* Parse the IP header into iph */
	ip_vs_fill_iph_skb(af, skb, false, &iph);

	/* Get the socket the packet belongs to */
	/* Bad... Do not break raw sockets */
	sk = skb_to_full_sk(skb);
	if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&
		     af == AF_INET)) {

		if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag)
			return NF_ACCEPT;
	}

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6) {
		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
			int related;
			int verdict = ip_vs_in_icmp_v6(ipvs, skb, &related,
						       hooknum, &iph);

			/* Only a related ICMPv6 packet ends processing here */
			if (related)
				return verdict;
		}
	} else
#endif
		if (unlikely(iph.protocol == IPPROTO_ICMP)) {
			int related;
			int verdict = ip_vs_in_icmp(ipvs, skb, &related,
						    hooknum);

			/* Only a related ICMP packet ends processing here */
			if (related)
				return verdict;
		}

	/* Protocol supported? (is this a protocol IPVS handles) */
	pd = ip_vs_proto_data_get(ipvs, iph.protocol);
	if (unlikely(!pd)) {
		/* The only way we'll see this packet again is if it's
		 * encapsulated, so mark it with ipvs_property=1 so we
		 * skip it if we're ignoring tunneled packets
		 */
		if (sysctl_ignore_tunneled(ipvs))
			skb->ipvs_property = 1;

		return NF_ACCEPT;
	}
	pp = pd->pp;
	/*
	 * Check if the packet belongs to an existing connection entry:
	 * look it up in the IPVS connection table.
	 */
	cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto,
			     ipvs, af, skb, &iph);

	/* conn_reuse_mode is the IPVS connection-reuse setting.
	 * iph.fragoffs is the fragment offset.
	 * is_new_conn(): per the article, tests the SYN flag of the TCP
	 * header - its body is not shown here, confirm.
	 */
	conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
	if (conn_reuse_mode && !iph.fragoffs && is_new_conn(skb, &iph) && cp) {
		/* Reached when an entry was found, the packet starts a new
		 * connection, it is not a fragment, and conn_reuse_mode is
		 * non-zero.
		 */
		bool uses_ct = false, resched = false;

		/* Check expire_nodest_conn and the weight of the entry's destination */
		if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
		    unlikely(!atomic_read(&cp->dest->weight))) {
			/* expire_nodest_conn means: expire connections whose
			 * backend is unavailable. We get here when the
			 * destination's weight is 0.
			 */
			resched = true;
			/* Is this connection using nf_conntrack? */
			uses_ct = ip_vs_conn_uses_conntrack(cp, skb);
		/* Otherwise, check whether the previous connection may be released */
		} else if (is_new_conn_expected(cp, conn_reuse_mode)) {
			/* Is this connection using nf_conntrack? */
			uses_ct = ip_vs_conn_uses_conntrack(cp, skb);
			if (!atomic_read(&cp->n_control)) {
				resched = true;
			} else {
				/* Do not reschedule controlling connection
				 * that uses conntrack while it is still
				 * referenced by controlled connection(s).
				 */
				resched = !uses_ct;
			}
		}

		if (resched) {
			/* Expire the previous connection early, but only when
			 * no controlled connections reference it.
			 */
			if (!atomic_read(&cp->n_control))
				ip_vs_conn_expire_now(cp);
			__ip_vs_conn_put(cp);
			/* Article author's note: this is a known problem - when
			 * conntrack is in use the packet is simply dropped, so
			 * the client has to retransmit, which (per the article)
			 * adds about 1s of delay.
			 */
			if (uses_ct)
				return NF_DROP;
			cp = NULL;
		}
	}

	if (unlikely(!cp)) {
		/* Reached when there is no connection entry, or the entry
		 * is not being reused.
		 */
		int v;

		/* Create the connection entry and pick the destination */
		if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph))
			/* Scheduling did not yield a connection: return the
			 * verdict it produced (per the article, ACCEPT for
			 * packets that match no IPVS service).
			 */
			return v;
	}

	/* From here on the packet belongs to an IPVS service */

	IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet");

	/* Check the server status */
	if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
		/* the destination server is not available */

		/* Snapshot the flags before the reference is dropped below */
		__u32 flags = cp->flags;

		/* when timer already started, silently drop the packet.*/
		if (timer_pending(&cp->timer))
			__ip_vs_conn_put(cp);
		else
			ip_vs_conn_put(cp);

		if (sysctl_expire_nodest_conn(ipvs) &&
		    !(flags & IP_VS_CONN_F_ONE_PACKET)) {
			/* try to expire the connection immediately */
			ip_vs_conn_expire_now(cp);
		}

		return NF_DROP;
	}

	/* Update statistics counters */
	ip_vs_in_stats(cp, skb);
	/* Update the layer-4 protocol connection state */
	ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
	if (cp->packet_xmit)
		/* Per the article: after DNAT the packet is sent on to
		 * LOCAL_OUT, and ret is NF_STOLEN on success.
		 */
		ret = cp->packet_xmit(skb, cp, pp, &iph);
		/* do not touch skb anymore */
	else {
		IP_VS_DBG_RL("warning: packet_xmit is null");
		ret = NF_ACCEPT;
	}

	/* Increase its packet counter and check if it is needed
	 * to be synchronized
	 *
	 * Sync connection if it is about to close to
	 * encourage the standby servers to update the connections timeout
	 *
	 * For ONE_PKT let ip_vs_sync_conn() do the filter work.
	 */

	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
		pkts = sysctl_sync_threshold(ipvs);
	else
		pkts = atomic_add_return(1, &cp->in_pkts);

	if (ipvs->sync_state & IP_VS_STATE_MASTER)
		ip_vs_sync_conn(ipvs, cp, pkts);
	else if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control)
		/* increment is done inside ip_vs_sync_conn too */
		atomic_inc(&cp->control->in_pkts);

	/* Release the reference; per the article this also refreshes the
	 * connection entry's timeout.
	 */
	ip_vs_conn_put(cp);
	return ret;
}

2.5 ip_vs_out
为回包确认所属连接,并将回包做还原(de-DNAT)处理
源码分析:
/*
 *	Check if outgoing packet belongs to the established ip_vs_conn.
 *
 *	Reply-direction handler: finds the IPVS connection a reply packet
 *	belongs to and, for connections using the MASQ forwarding method,
 *	passes it to handle_response() to be rewritten. Packets that match
 *	nothing continue normal traversal (NF_ACCEPT), except that an ICMP
 *	port-unreachable may be sent and the packet dropped when it comes
 *	from a known real service and nat_icmp_send is enabled.
 *
 *	@ipvs:    per-netns IPVS state
 *	@hooknum: netfilter hook the packet was seen on
 *	@skb:     the packet
 *	@af:      address family (AF_INET or AF_INET6)
 *
 *	Returns a netfilter verdict (NF_ACCEPT, NF_DROP, NF_STOLEN, or the
 *	value returned by the ICMP handler / handle_response()).
 */
static unsigned int
ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af)
{
	struct ip_vs_iphdr iph;
	struct ip_vs_protocol *pp;
	struct ip_vs_proto_data *pd;
	struct ip_vs_conn *cp;
	struct sock *sk;

	EnterFunction(11);

	/* Already marked as IPVS request or reply? Then it was handled before. */
	if (skb->ipvs_property)
		return NF_ACCEPT;

	/* Get the socket associated with the packet (the article's author
	 * was unsure here; the result is stored in a struct sock pointer).
	 */
	sk = skb_to_full_sk(skb);
	/* Bad... Do not break raw sockets */
	if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&
		     af == AF_INET)) {

		if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag)
			return NF_ACCEPT;
	}

	if (unlikely(!skb_dst(skb)))
		return NF_ACCEPT;

	if (!ipvs->enable)
		return NF_ACCEPT;

	/* Parse the IP header into iph */
	ip_vs_fill_iph_skb(af, skb, false, &iph);
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6) {
		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
			int related;
			int verdict = ip_vs_out_icmp_v6(ipvs, skb, &related,
							hooknum, &iph);

			/* Only a related ICMPv6 packet ends processing here */
			if (related)
				return verdict;
		}
	} else
#endif
		if (unlikely(iph.protocol == IPPROTO_ICMP)) {
			int related;
			int verdict = ip_vs_out_icmp(ipvs, skb, &related, hooknum);

			/* Only a related ICMP packet ends processing here */
			if (related)
				return verdict;
		}

	/* Is this a layer-4 protocol that IPVS supports? */
	pd = ip_vs_proto_data_get(ipvs, iph.protocol);
	if (unlikely(!pd))
		return NF_ACCEPT;
	pp = pd->pp;

	/* reassemble IP fragments */
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET)
#endif
		if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) {
			if (ip_vs_gather_frags(ipvs, skb,
					       ip_vs_defrag_user(hooknum)))
				return NF_STOLEN;

			/* Re-parse the header after reassembly */
			ip_vs_fill_iph_skb(AF_INET, skb, false, &iph);
		}

	/*
	 * Check if the packet belongs to an existing entry
	 */
	cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto,
			     ipvs, af, skb, &iph);

	if (likely(cp)) {
		/* Only MASQ-forwarded connections are rewritten here */
		if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
			goto ignore_cp;
		/* de-DNAT and connection state update */
		return handle_response(af, skb, pd, cp, &iph, hooknum);
	}

	/* UDP: check for real-server-started requests */
	if (atomic_read(&ipvs->conn_out_counter)) {
		/* Currently only for UDP:
		 * connection oriented protocols typically use
		 * ephemeral ports for outgoing connections, so
		 * related incoming responses would not match any VS
		 */
		if (pp->protocol == IPPROTO_UDP) {
			cp = __ip_vs_rs_conn_out(hooknum, ipvs, af, skb, &iph);
			if (likely(cp))
				return handle_response(af, skb, pd, cp, &iph,
						       hooknum);
		}
	}

	/* ICMP notification: no connection entry matched the packet */
	if (sysctl_nat_icmp_send(ipvs) &&
	    (pp->protocol == IPPROTO_TCP ||
	     pp->protocol == IPPROTO_UDP ||
	     pp->protocol == IPPROTO_SCTP)) {
		__be16 _ports[2], *pptr;

		pptr = frag_safe_skb_hp(skb, iph.len,
					sizeof(_ports), _ports);
		if (pptr == NULL)
			return NF_ACCEPT;	/* Not for me */
		/* pptr[0] is the first of the two 16-bit port fields read above */
		if (ip_vs_has_real_service(ipvs, af, iph.protocol, &iph.saddr,
					   pptr[0])) {
			/*
			 * Notify the real server: there is no
			 * existing entry if it is not RST
			 * packet or not TCP packet.
			 */
			if ((iph.protocol != IPPROTO_TCP &&
			     iph.protocol != IPPROTO_SCTP)
			     || ((iph.protocol == IPPROTO_TCP
				  && !is_tcp_reset(skb, iph.len))
				 || (iph.protocol == IPPROTO_SCTP
					&& !is_sctp_abort(skb,
						iph.len)))) {
#ifdef CONFIG_IP_VS_IPV6
				if (af == AF_INET6) {
					/* Fall back to the loopback device when none is set */
					if (!skb->dev)
						skb->dev = ipvs->net->loopback_dev;
					icmpv6_send(skb,
						    ICMPV6_DEST_UNREACH,
						    ICMPV6_PORT_UNREACH,
						    0);
				} else
#endif
					icmp_send(skb,
						  ICMP_DEST_UNREACH,
						  ICMP_PORT_UNREACH, 0);
				return NF_DROP;
			}
		}
	}

out:
	IP_VS_DBG_PKT(12, af, pp, skb, iph.off,
		      "ip_vs_out: packet continues traversal as normal");
	return NF_ACCEPT;

ignore_cp:
	/* Connection found but not MASQ: drop our reference and accept */
	__ip_vs_conn_put(cp);
	goto out;
}

/*
 *	Handle response packets: rewrite addresses and send away...
 *
 *	Rewrites the reply so that its source address becomes the
 *	connection's virtual address (cp->vaddr), re-routes it, updates
 *	stats and protocol state, marks the skb as IPVS-processed and
 *	releases the connection reference.
 *
 *	@af:      address family (AF_INET or AF_INET6)
 *	@skb:     the reply packet
 *	@pd:      per-netns protocol data; pd->pp supplies the protocol ops
 *	@cp:      the connection the reply belongs to
 *	@iph:     parsed IP header of the packet
 *	@hooknum: netfilter hook the packet was seen on
 *
 *	Returns NF_ACCEPT on success. On failure the skb is freed here and
 *	NF_STOLEN is returned. The connection reference is put on both paths.
 */
static unsigned int
handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
		struct ip_vs_conn *cp, struct ip_vs_iphdr *iph,
		unsigned int hooknum)
{
	struct ip_vs_protocol *pp = pd->pp;

	IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet");

	if (skb_ensure_writable(skb, iph->len))
		goto drop;

	/* mangle the packet: layer-4 de-DNAT via the protocol's snat_handler */
	if (pp->snat_handler &&
	    !SNAT_CALL(pp->snat_handler, skb, pp, cp, iph))
		goto drop;

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		ipv6_hdr(skb)->saddr = cp->vaddr.in6;
	else
#endif
	{
		/* Layer-3 de-DNAT: source becomes the virtual address, then
		 * the IP header checksum is recomputed.
		 */
		ip_hdr(skb)->saddr = cp->vaddr.ip;
		ip_send_check(ip_hdr(skb));
	}

	/*
	 * nf_iterate does not expect change in the skb->dst->dev.
	 * It looks like it is not fatal to enable this code for hooks
	 * where our handlers are at the end of the chain list and
	 * when all next handlers use skb->dst->dev and not outdev.
	 * It will definitely route properly the inout NAT traffic
	 * when multiple paths are used.
	 */

	/* For policy routing, packets originating from this
	 * machine itself may be routed differently to packets
	 * passing through.  We want this packet to be routed as
	 * if it came from this machine itself.  So re-compute
	 * the routing information.
	 */
	/* Re-route; per the article this is governed by the snat_reroute
	 * setting - not visible in this excerpt, confirm.
	 */
	if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum))
		goto drop;

	IP_VS_DBG_PKT(10, af, pp, skb, iph->off, "After SNAT");

	/* Update statistics counters */
	ip_vs_out_stats(cp, skb);
	/* Update the layer-4 protocol state */
	ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
	/* Mark as processed so the IPVS hooks skip this skb from now on */
	skb->ipvs_property = 1;
	if (!(cp->flags & IP_VS_CONN_F_NFCT))
		ip_vs_notrack(skb);
	else
		ip_vs_update_conntrack(skb, cp, 0);
	/* Release the reference; per the article this also refreshes the
	 * connection entry's timeout.
	 */
	ip_vs_conn_put(cp);

	LeaveFunction(11);
	return NF_ACCEPT;

drop:
	/* The skb is freed here, so report it as stolen rather than dropped */
	ip_vs_conn_put(cp);
	kfree_skb(skb);
	LeaveFunction(11);
	return NF_STOLEN;
}

    推荐阅读