SNAT的MASQUERADE地址选择与端口选择 c

环境：

版本：kernel-5.4.54 amd64 双核 ubuntu18.04
k8s集群网络组件：flannel，kube-proxy: ipvs
代码工具：vs code

1.概述

SNAT（源地址转换）是IPTABLES的NAT表的核心功能，广泛应用于路由器、云服务器、K8S集群等内网环境中，是内核网络子系统中不可或缺的功能。
IPTABLES的NAT完全依赖于netfilter的conntrack，对于没有进行conntrack的数据包无法进行NAT
在K8S集群中DNAT用于负载均衡，SNAT用来保证节点转发的数据包能回到节点去完成de-DNAT还原，而不是直接发给客户端。
- 客户端访问的是负载均衡IP，后端IP直接回包给客户端的话，客户端无法识别；
- 后端IP回包先转给负载均衡器，将后端IP还原成负载均衡IP之后再发给客户端
IPTABLES和IPVS都可以实现DNAT负载均衡的功能，但是SNAT只能由IPTABLES实现
查看集群中IPTABLES的SNAT规则

root@cluster1-worker1:~# iptables -t nat -nL
Chain PREROUTING (policy ACCEPT)
target          prot opt source               destination
KUBE-SERVICES   all  --  0.0.0.0/0            0.0.0.0/0            /* kubernetes service portals */
DOCKER          all  --  0.0.0.0/0            0.0.0.0/0            ADDRTYPE match dst-type LOCAL

Chain INPUT (policy ACCEPT)
target          prot opt source               destination

Chain OUTPUT (policy ACCEPT)
target          prot opt source               destination
KUBE-SERVICES   all  --  0.0.0.0/0            0.0.0.0/0            /* kubernetes service portals */
DOCKER          all  --  0.0.0.0/0            !127.0.0.0/8         ADDRTYPE match dst-type LOCAL

Chain POSTROUTING (policy ACCEPT)
target          prot opt source               destination
KUBE-POSTROUTING  all  --  0.0.0.0/0          0.0.0.0/0            /* kubernetes postrouting rules */
MASQUERADE      all  --  172.17.0.0/16        0.0.0.0/0
RETURN          all  --  10.244.0.0/16        10.244.0.0/16
MASQUERADE      all  --  10.244.0.0/16        !224.0.0.0/4
RETURN          all  --  !10.244.0.0/16       10.244.2.0/24
MASQUERADE      all  --  !10.244.0.0/16       10.244.0.0/16
...

Chain KUBE-POSTROUTING (1 references)
target          prot opt source               destination
MASQUERADE      all  --  0.0.0.0/0            0.0.0.0/0            /* Kubernetes endpoints dst ip:port, source ip for solving hairpin purpose */ match-set KUBE-LOOP-BACK dst,dst,src
RETURN          all  --  0.0.0.0/0            0.0.0.0/0            mark match ! 0x4000/0x4000
MARK            all  --  0.0.0.0/0            0.0.0.0/0            MARK xor 0x4000
MASQUERADE      all  --  0.0.0.0/0            0.0.0.0/0            /* kubernetes service traffic requiring SNAT */
...

分析MASQUERADE是如何SNAT的对于我们了解集群间网络通信很有帮助
2.概念

2.1 de-SNAT

为什么要做de-SNAT？
假设本机将POD1发出的包进行了SNAT，源IP从POD1-IP变成了HOST-IP；这样服务端回包目的地是HOST-IP，但是需要收包的是POD1，如果不de-SNAT把回包的目的地改为POD1-IP，POD1就无法收到数据包
2.2 netfilter中的与SNAT有关的钩子点

文章图片

K8S集群的SNAT规则是在POST_ROUTING做SNAT，在PRE_ROUTING做de-SNAT
3.代码分析

3.1 MASQUERADE在NAT表中注册的钩子函数

static struct xt_target masquerade_tg_reg[] __read_mostly = { { #if IS_ENABLED(CONFIG_IPV6) .name= "MASQUERADE", .family= NFPROTO_IPV6, .target= masquerade_tg6, .targetsize = sizeof(struct nf_nat_range), .table= "nat", .hooks= 1 << NF_INET_POST_ROUTING, .checkentry = masquerade_tg6_checkentry, .destroy= masquerade_tg_destroy, .me= THIS_MODULE, }, { #endif .name= "MASQUERADE", .family= NFPROTO_IPV4, .target= masquerade_tg, .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), .table= "nat", .hooks= 1 << NF_INET_POST_ROUTING, .checkentry = masquerade_tg_check, .destroy= masquerade_tg_destroy, .me= THIS_MODULE, } };

3.2 masquerade_tg分析

文章图片

static unsigned int masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) { struct nf_nat_range2 range; const struct nf_nat_ipv4_multi_range_compat *mr; /* 获取规则的配置和SNAT的可用端口范围 */ mr = par->targinfo; range.flags = mr->range[0].flags; range.min_proto = mr->range[0].min; range.max_proto = mr->range[0].max; /* 核心函数 */ return nf_nat_masquerade_ipv4(skb, xt_hooknum(par), &range, xt_out(par)); }

3.2.1 nf_nat_masquerade_ipv4分析

unsigned int nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, const struct nf_nat_range2 *range, const struct net_device *out) { struct nf_conn *ct; struct nf_conn_nat *nat; enum ip_conntrack_info ctinfo; struct nf_nat_range2 newrange; const struct rtable *rt; __be32 newsrc, nh; WARN_ON(hooknum != NF_INET_POST_ROUTING); /* 获取conntrack连接信息 */ ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); /* Source address is 0.0.0.0 - locally generated packet that is * probably not supposed to be masqueraded. */ if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0) return NF_ACCEPT; /* 获取路由表 */ rt = skb_rtable(skb); /* 下一跳的地址 */ nh = rt_nexthop(rt, ip_hdr(skb)->daddr); /* 选择最合适的SNAT源地址 */ newsrc = https://www.it610.com/article/inet_select_addr(out, nh, RT_SCOPE_UNIVERSE); if (!newsrc) { pr_info("%s ate my IP address\n", out->name); return NF_DROP; }nat = nf_ct_nat_ext_add(ct); if (nat) nat->masq_index = out->ifindex; /* Transfer from original range. */ /* 设置可用的源地址和源端口范围 */ memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); newrange.flags= range->flags | NF_NAT_RANGE_MAP_IPS; newrange.min_addr.ip = newsrc; newrange.max_addr.ip = newsrc; newrange.min_proto= range->min_proto; newrange.max_proto= range->max_proto; /* Hand modified range to generic setup. */ /* 根据可用范围确定SNAT源地址，并修改连接记录 */ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); }

3.2.2 nf_nat_setup_info分析

/*
 * Set up NAT state on an unconfirmed conntrack entry.
 *
 * Asks get_unique_tuple() for a tuple within @range; if it differs from the
 * current one, rewrites the reply tuple of @ct and sets the matching
 * IPS_SRC_NAT / IPS_DST_NAT status bit. For source manipulation the entry is
 * also linked into the nf_nat_bysource hash. Finally the matching
 * IPS_*_NAT_DONE bit is set.
 *
 * Returns NF_ACCEPT on success or when @ct is already confirmed; NF_DROP when
 * NAT was already initialized for @maniptype or when the sequence-adjust
 * extension cannot be added.
 */
unsigned int
nf_nat_setup_info(struct nf_conn *ct,
		  const struct nf_nat_range2 *range,
		  enum nf_nat_manip_type maniptype)
{
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_tuple curr_tuple, new_tuple;

	/* Can't setup nat info for confirmed ct. */
	if (nf_ct_is_confirmed(ct))
		return NF_ACCEPT;

	WARN_ON(maniptype != NF_NAT_MANIP_SRC &&
		maniptype != NF_NAT_MANIP_DST);

	if (WARN_ON(nf_nat_initialized(ct, maniptype)))
		return NF_DROP;

	/* What we've got will look like inverse of reply. Normally
	 * this is what is in the conntrack, except for prior
	 * manipulations (future optimization: if num_manips == 0,
	 * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
	 */
	nf_ct_invert_tuple(&curr_tuple,
			   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	/* Obtain a unique tuple from the allowed range. */
	get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);

	if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
		struct nf_conntrack_tuple reply;

		/* Alter conntrack table so will recognize replies:
		 * the reply-direction tuple becomes the inverse of the
		 * newly chosen tuple.
		 */
		nf_ct_invert_tuple(&reply, &new_tuple);
		nf_conntrack_alter_reply(ct, &reply);

		/* Non-atomic: we own this at the moment. */
		/* Flag which kind of NAT this connection needs. */
		if (maniptype == NF_NAT_MANIP_SRC)
			ct->status |= IPS_SRC_NAT;
		else
			ct->status |= IPS_DST_NAT;

		/* A helper is attached but no seqadj extension yet: add one,
		 * and drop the packet if that fails.
		 */
		if (nfct_help(ct) && !nfct_seqadj(ct))
			if (!nfct_seqadj_ext_add(ct))
				return NF_DROP;
	}

	/* Add the connection entry to the bysource table. */
	if (maniptype == NF_NAT_MANIP_SRC) {
		unsigned int srchash;
		spinlock_t *lock;

		/* Bucket is derived from the original-direction tuple. */
		srchash = hash_by_src(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
		lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
		spin_lock_bh(lock);
		hlist_add_head_rcu(&ct->nat_bysource,
				   &nf_nat_bysource[srchash]);
		spin_unlock_bh(lock);
	}

	/* It's done. */
	if (maniptype == NF_NAT_MANIP_DST)
		ct->status |= IPS_DST_NAT_DONE;
	else
		ct->status |= IPS_SRC_NAT_DONE;

	return NF_ACCEPT;
}

3.2.3 get_unique_tuple分析

/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
 * we change the source to map into the range. For NF_INET_PRE_ROUTING
 * and NF_INET_LOCAL_OUT, we change the destination to map into the
 * range. It might not be possible to get a unique tuple, but we try.
 * At worst (or if we race), we will end up with a final duplicate in
 * __nf_conntrack_confirm and drop the packet.
 *
 * @tuple is the output; @orig_tuple is the tuple before manipulation.
 */
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
		 const struct nf_conntrack_tuple *orig_tuple,
		 const struct nf_nat_range2 *range,
		 struct nf_conn *ct,
		 enum nf_nat_manip_type maniptype)
{
	const struct nf_conntrack_zone *zone;
	struct net *net = nf_ct_net(ct);

	zone = nf_ct_zone(ct);

	/* 1) If this srcip/proto/src-proto-part is currently mapped,
	 * and that same mapping gives a unique tuple within the given
	 * range, use that.
	 *
	 * This is only required for source (ie. NAT/masq) mappings.
	 * So far, we don't do local source mappings, so multiple
	 * manips not an issue.
	 */
	/* First see whether the unmodified tuple already fits the range, or
	 * whether an existing source mapping can be reused.
	 */
	if (maniptype == NF_NAT_MANIP_SRC &&
	    !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
		/* Reached only for source NAT without any random-port flag. */
		/* try the original tuple first */
		if (in_range(orig_tuple, range)) {
			/* In range without rewriting; accept it if it is not
			 * already in use.
			 */
			if (!nf_nat_used_tuple(orig_tuple, ct)) {
				*tuple = *orig_tuple;
				return;
			}
		/* Otherwise look for an existing mapping of this source
		 * (per the article author: looked up by source-address hash
		 * among existing SNAT'ed connections).
		 */
		} else if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) {
			pr_debug("get_unique_tuple: Found current src map\n");
			/* Reuse the mapping only if the resulting tuple is unused. */
			if (!nf_nat_used_tuple(tuple, ct))
				return;
		}
	}

	/* Reached for random-port ranges, or when step 1 found nothing usable. */
	/* 2) Select the least-used IP/proto combination in the given range */
	*tuple = *orig_tuple;
	find_best_ips_proto(zone, tuple, range, ct, maniptype);

	/* 3) The per-protocol part of the manip is made to map into
	 * the range to make a unique tuple.
	 */

	/* Only bother mapping if it's not already in range and unique */
	/* i.e. first test the tuple without touching the port. */
	if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
		if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
			/* Explicit port range, no offset flag: done if the
			 * port is in range and either the range is a single
			 * value or the tuple is unused.
			 */
			if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
			    l4proto_in_range(tuple, maniptype, &range->min_proto, &range->max_proto) &&
			    (range->min_proto.all == range->max_proto.all ||
			     !nf_nat_used_tuple(tuple, ct)))
				return;
		} else if (!nf_nat_used_tuple(tuple, ct)) {
			/* No explicit port range: done if the tuple is unused. */
			return;
		}
	}

	/* Last chance: get protocol to try to obtain unique tuple. */
	/* Let the L4 protocol code choose a port within the allowed range. */
	nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct);
}

先不做对数据包的修改，这里只修改conntrack连接记录，后续根据连接记录对数据包修改
对数据包的修改和de-SNAT在NAT分析文档中：IPTABLES的连接跟踪与NAT分析
3.3 SNAT与MASQ区别

3.3.1 SNAT钩子函数

static struct xt_target xt_nat_target_reg[] __read_mostly = { { .name= "SNAT", .revision= 0, .checkentry = xt_nat_checkentry_v0, .destroy= xt_nat_destroy, .target= xt_snat_target_v0, .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), .family= NFPROTO_IPV4, .table= "nat", .hooks= (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN), .me= THIS_MODULE, }, ...

3.3.2 xt_snat_target_v0分析

static unsigned int xt_snat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; struct nf_nat_range2 range; enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct != NULL && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); /* 获取范围 */ xt_nat_convert_range(&range, &mr->range[0]); /* 根据可用范围确定SNAT源地址，并修改连接记录 */ return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); }

可以看到SNAT和MASQ最后都调用了nf_nat_setup_info，区别是MASQ在前面有一个选择最合适源IP的步骤。