2.6.1*Linux内核中NAT处理的改变

本文档的Copyleft归yfydz所有,使用GPL发布,可以自由拷贝,转载,转载时请保持文档的完整性,严禁用于任何商业用途。
msn: [email protected]
来源:http://yfydz.cublog.cn

1. 前言
在2.6.10内核后,netfilter的NAT部分作了一些较大的修改,而连接跟踪部分修改了一些但不大,本文大致描述一下NAT修改部分的情况。
 
以下内核代码版本为2.6.17.11。

2. 修改情况
 
2.1 多连接协议的跟踪和NAT
 
对于多连接协议,如FTP等,netfilter的处理进行了较大的修改,表现为:
1) 多连接跟踪改为一个nf_hook_ops点处理
2) 数据内容部分的NAT直接在跟踪函数中调用处理,不再由NAT部分处理

连接help操作在INPUT和POSTROUTING点上进行,优先级仅高于SEQ_ADJUST和confirm,数据包到这时地址端口部分已经经过NAT了。
/* net/ipv4/netfilter/ip_conntrack_standalone.c */
......
 {
  .hook  = ip_conntrack_help,
  .owner  = THIS_MODULE,
  .pf  = PF_INET,
  .hooknum = NF_IP_POST_ROUTING,
  .priority = NF_IP_PRI_CONNTRACK_HELPER,
 },
 {
  .hook  = ip_conntrack_help,
  .owner  = THIS_MODULE,
  .pf  = PF_INET,
  .hooknum = NF_IP_LOCAL_IN,
  .priority = NF_IP_PRI_CONNTRACK_HELPER,
 },
.......

/*
 * Netfilter hook function that hands a packet to the conntrack helper
 * attached to its connection, if there is one.
 *
 * Only pskb is used; hooknum, in, out and okfn are ignored here.
 * Returns NF_ACCEPT when the packet has no tracked connection or the
 * connection has no helper; otherwise returns the helper's verdict.
 */
static unsigned int ip_conntrack_help(unsigned int hooknum,
          struct sk_buff **pskb,
          const struct net_device *in,
          const struct net_device *out,
          int (*okfn)(struct sk_buff *))
{
 enum ip_conntrack_info ctinfo;
 struct ip_conntrack *ct = ip_conntrack_get(*pskb, &ctinfo);

 /* Nothing to do without a connection that carries a helper. */
 if (!ct || !ct->helper)
  return NF_ACCEPT;

 /* This is where we call the helper: as the packet goes out. */
 return ct->helper->help(pskb, ct, ctinfo);
}
 
多连接协议的跟踪辅助结构struct ip_conntrack_helper内容变化不大,但原来的NAT辅助结构struct ip_nat_helper已经取消了,数据内容的修改直接在跟踪函数中调用。
以FTP为例:
 
/* net/ipv4/netfilter/ip_conntrack_ftp.c */
 
/*
 * FTP connection-tracking helper.
 *
 * Scans the TCP payload of a packet on connection ct for the patterns in
 * search[] that apply to the packet's direction.  On a match it allocates
 * an expectation for the announced address/port and either passes it to
 * ip_nat_ftp_hook (when that hook is set) or registers it directly with
 * ip_conntrack_expect_related().
 *
 * pskb:   packet being inspected.
 * ct:     connection the packet belongs to.
 * ctinfo: conntrack state/direction of the packet.
 *
 * Returns NF_ACCEPT when the packet is not inspected, nothing matches, or
 * the expectation was registered; NF_DROP on a partial match
 * (found == -1), when the expectation cannot be allocated, or when it
 * cannot be registered.  When ip_nat_ftp_hook is set, its return value is
 * returned instead.
 *
 * ip_ftp_lock is taken with spin_lock_bh() before the payload is fetched
 * and released at the "out" label.
 */
static int help(struct sk_buff **pskb,
  struct ip_conntrack *ct,
  enum ip_conntrack_info ctinfo)
{
 unsigned int dataoff, datalen;
 struct tcphdr _tcph, *th;
 char *fb_ptr;
 int ret;
 u32 seq, array[6] = { 0 };
 int dir = CTINFO2DIR(ctinfo);
 unsigned int matchlen, matchoff;
 struct ip_ct_ftp_master *ct_ftp_info = &ct->help.ct_ftp_info;
 struct ip_conntrack_expect *exp;
 unsigned int i;
 int found = 0, ends_in_nl;
 /* Until there's been traffic both ways, don't look in packets. */
 if (ctinfo != IP_CT_ESTABLISHED
     && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) {
  DEBUGP("ftp: Conntrackinfo = %u\n", ctinfo);
  return NF_ACCEPT;
 }
// TCP header; it may be copied into the local _tcph buffer
 th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4,
    sizeof(_tcph), &_tcph);
 if (th == NULL)
  return NF_ACCEPT;
 dataoff = (*pskb)->nh.iph->ihl*4 + th->doff*4;
 /* No data? */
 if (dataoff >= (*pskb)->len) {
  DEBUGP("ftp: pskblen = %u\n", (*pskb)->len);
  return NF_ACCEPT;
 }
 datalen = (*pskb)->len - dataoff;
 spin_lock_bh(&ip_ftp_lock);
// Fetch the FTP payload.  It may be copied into ftp_buffer instead of being
// parsed in place in the skb.  (Article author's guess: this may be related
// to 2.6 kernel preemption -- it would be troublesome if we were preempted
// and another kernel context then operated on the same skb.)
 fb_ptr = skb_header_pointer(*pskb, dataoff,
        (*pskb)->len - dataoff, ftp_buffer);
 BUG_ON(fb_ptr == NULL);
 ends_in_nl = (fb_ptr[datalen - 1] == '\n');
 seq = ntohl(th->seq) + datalen;
 /* Look up to see if we're just after a \n. */
// Check that this sequence number directly follows a newline, to guard
// against the sequence-number issue described in Phrack 63.
 if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) {
  /* Now if this ends in \n, update ftp info. */
  /* NOTE(review): the argument list below has no comma after
     seq_aft_nl[0][dir] and uses old_seq_aft_nl_set / old_seq_aft_nl,
     which are not declared in this function.  Presumably this only
     builds because DEBUGP expands to nothing -- confirm before
     enabling debug output. */
  DEBUGP("ip_conntrack_ftp_help: wrong seq pos %s(%u) or %s(%u)\n",
         ct_ftp_info->seq_aft_nl[0][dir]
         old_seq_aft_nl_set ? "":"(UNSET) ", old_seq_aft_nl);
  ret = NF_ACCEPT;
  goto out_update_nl;
 }
 /* Initialize IP array to expected address (it's not mentioned
           in EPSV responses) */
// The expected address; if NAT has been applied, this is already the
// post-NAT address.
 array[0] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 24) & 0xFF;
 array[1] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 16) & 0xFF;
 array[2] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 8) & 0xFF;
 array[3] = ntohl(ct->tuplehash[dir].tuple.src.ip) & 0xFF;
// Pattern search: try each search[] entry that applies to this direction
 for (i = 0; i < ARRAY_SIZE(search); i++) {
  if (search[i].dir != dir) continue;
  found = find_pattern(fb_ptr, (*pskb)->len - dataoff,
         search[i].pattern,
         search[i].plen,
         search[i].skip,
         search[i].term,
         &matchoff, &matchlen,
         array,
         search[i].getnum);
  if (found) break;
 }
 if (found == -1) {
  /* We don't usually drop packets.  After all, this is
     connection tracking, not packet filtering.
     However, it is necessary for accurate tracking in
     this case. */
  if (net_ratelimit())
   printk("conntrack_ftp: partial %s %u+%u\n",
          search[i].pattern,
          ntohl(th->seq), datalen);
  ret = NF_DROP;
  goto out;
 } else if (found == 0) { /* No match */
  ret = NF_ACCEPT;
  goto out_update_nl;
 }
 DEBUGP("conntrack_ftp: match `%s' (%u bytes at %u)\n",
        fb_ptr + matchoff, matchlen, ntohl(th->seq) + matchoff);
   
 /* Allocate expectation which will be inserted */
// The expectation is allocated dynamically here.  Per the article, earlier
// versions used a static one at this point and only allocated dynamically
// when it was registered.
 exp = ip_conntrack_expect_alloc(ct);
 if (exp == NULL) {
  ret = NF_DROP;
  goto out;
 }
 /* We refer to the reverse direction ("!dir") tuples here,
  * because we're expecting something in the other direction.
  * Doesn't matter unless NAT is happening.  */
 exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
 if (htonl((array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3])
     != ct->tuplehash[dir].tuple.src.ip) {
  /* Enrico Scholz's passive FTP to partially RNAT'd ftp
     server: it really wants us to connect to a
     different IP address.  Simply don't record it for
     NAT. */
  DEBUGP("conntrack_ftp: NOT RECORDING: %u,%u,%u,%u != %u.%u.%u.%u\n",
         array[0], array[1], array[2], array[3],
         NIPQUAD(ct->tuplehash[dir].tuple.src.ip));
  /* Thanks to Cristiano Lincoln Mattos
     <[email protected]> for reporting this potential
     problem (DMZ machines opening holes to internal
     networks, or the packet filter itself). */
  if (!loose) {
   ret = NF_ACCEPT;
   goto out_put_expect;
  }
  exp->tuple.dst.ip = htonl((array[0] << 24) | (array[1] << 16)
      | (array[2] << 8) | array[3]);
 }
 exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
 exp->tuple.dst.u.tcp.port = htons(array[4] << 8 | array[5]);
 exp->tuple.src.u.tcp.port = 0; /* Don't care. */
 exp->tuple.dst.protonum = IPPROTO_TCP;
 exp->mask = ((struct ip_conntrack_tuple)
  { { 0xFFFFFFFF, { 0 } },
    { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});
 exp->expectfn = NULL;
 exp->flags = 0;
 /* Now, NAT might want to mangle the packet, and register the
  * (possibly changed) expectation itself. */
 if (ip_nat_ftp_hook)
// If a NAT mangling hook is registered, call it directly here; the hook
// function is defined in ip_nat_ftp.c.
  ret = ip_nat_ftp_hook(pskb, ctinfo, search[i].ftptype,
          matchoff, matchlen, exp, &seq);
// Open question from the article author: once ip_nat_ftp is part of the
// kernel, ip_nat_ftp_hook is set.  Is it then called even when the system
// only routes and performs no NAT?  Nothing here distinguishes whether NAT
// is actually needed, which looks questionable.
 else {
// Otherwise register the expectation directly.
// In 2.6.1* ip_conntrack_expect_related() only has to be called once
// (ignoring the port-already-in-use case), whereas earlier versions called
// it once during tracking and once more after NAT.
  /* Can't expect this?  Best to drop packet now. */
  if (ip_conntrack_expect_related(exp) != 0)
   ret = NF_DROP;
  else
   ret = NF_ACCEPT;
 }
out_put_expect:
 ip_conntrack_expect_put(exp);
out_update_nl:
 /* Now if this ends in \n, update ftp info.  Seq may have been
  * adjusted by NAT code. */
 if (ends_in_nl)
  update_nl_seq(seq, ct_ftp_info,dir, *pskb);
 out:
 spin_unlock_bh(&ip_ftp_lock);
 return ret;
}
 
FTP的NAT处理文件就只是定义ip_nat_ftp_hook函数,不再像以前那样既要在NAT操作中修改数据,还要修改期待的子连接;现在就只是纯粹的内容数据修改,对期待连接只检查新的端口是否已被使用,因此实际上和以前的NAT操作相比简单了很多:
 
/* net/ipv4/netfilter/ip_nat_ftp.c */
 
/*
 * Module initialisation: install ip_nat_ftp() as ip_nat_ftp_hook so the
 * FTP conntrack helper can call it.  Always returns 0.
 */
static int __init ip_nat_ftp_init(void)
{
 /* The hook must still be unset when we install ours. */
 BUG_ON(ip_nat_ftp_hook != NULL);
 ip_nat_ftp_hook = ip_nat_ftp;

 return 0;
}
 
/*
 * NAT side of FTP handling, called through ip_nat_ftp_hook.
 *
 * Registers the expectation exp for the data connection, trying the
 * original port first and then successive port numbers, and rewrites the
 * matched part of the payload through mangle[type].
 *
 * pskb:     packet whose payload is rewritten.
 * ctinfo:   conntrack state/direction of the packet.
 * type:     selects the entry of mangle[] to use.
 * matchoff: offset of the matched text, passed through to mangle[type].
 * matchlen: length of the matched text, passed through to mangle[type].
 * exp:      expectation to fill in and register; exp->master is the
 *           connection the packet belongs to.
 * seq:      passed through to mangle[type].
 *
 * Returns NF_ACCEPT on success.  Returns NF_DROP when no port could be
 * registered, or when mangling fails (the expectation is then removed
 * again).
 */
static unsigned int ip_nat_ftp(struct sk_buff **pskb,
          enum ip_conntrack_info ctinfo,
          enum ip_ct_ftp_type type,
          unsigned int matchoff,
          unsigned int matchlen,
          struct ip_conntrack_expect *exp,
          u32 *seq)
{
 int dir = CTINFO2DIR(ctinfo);
 struct ip_conntrack *ct = exp->master;
 u_int32_t newip;
 u_int16_t port;

 DEBUGP("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen);

 /* Connection will come from wherever this packet goes, hence !dir */
 newip = ct->tuplehash[!dir].tuple.dst.ip;
 exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
 exp->dir = !dir;

 /* The expected connection has to be NATed the same way as this one.
  * Per the article, expectfn is invoked from init_conntrack() to set up
  * the NAT information of the child connection. */
 exp->expectfn = ip_nat_follow_master;

 /* Try to get same port: if not, try to change it.  The 16-bit counter
  * wrapping round to 0 means every candidate was refused. */
 port = ntohs(exp->saved_proto.tcp.port);
 while (port != 0) {
  exp->tuple.dst.u.tcp.port = htons(port);
  /* Registration succeeds only if this port can replace the original. */
  if (ip_conntrack_expect_related(exp) == 0)
   break;
  port++;
 }

 /* No free port: drop the packet. */
 if (port == 0)
  return NF_DROP;

 if (mangle[type](pskb, newip, port, matchoff, matchlen, ct, ctinfo,
     seq))
  return NF_ACCEPT;

 /* Rewriting the payload failed: withdraw the expectation and drop. */
 ip_conntrack_unexpect_related(exp);
 return NF_DROP;
}
 
2.2 由sk_buff包获取相应连接的信息的修改
 
在struct sk_buff中直接增加了nfctinfo字段,因此可以直接得到sk_buff包是哪种类型,不像以前那样通过在ip_conntrack结构中的一个数组位置来表示是哪种类型,相应的nfct字段就直接指向连接本身。这样减少了struct ip_conntrack结构的大小,而sk_buff中nfctinfo字段是利用原来其他字段的空闲区记录的,没有增加sk_buff的大小。
/* include/linux/skbuff.h */
struct sk_buff{
......
 __u8   local_df:1,
    cloned:1,
    ip_summed:2,
    nohdr:1,
    nfctinfo:3;
......
 struct nf_conntrack *nfct;
......
 
2.3 隐含的HOOK点

PREROUTING: DEFRAG, CONNTRACK
OUTPUT: DEFRAG, CONNTRACK
POSTROUTING: conn_help, SEQ_ADJUST,confirm
INPUT: conn_help, SEQ_ADJUST,confirm
 
优先级值定义:
/* include/linux/netfilter_ipv4.h */
/*
 * Priority values for IPv4 netfilter hook functions, listed in ascending
 * numeric order.
 * NOTE(review): presumably hooks with smaller values run earlier at a
 * given hook point -- confirm against the netfilter core.
 */
enum nf_ip_hook_priorities {
 NF_IP_PRI_FIRST                     = INT_MIN,

 /* Defragmentation, raw table and connection tracking. */
 NF_IP_PRI_CONNTRACK_DEFRAG          = -400,
 NF_IP_PRI_RAW                       = -300,
 NF_IP_PRI_SELINUX_FIRST             = -225,
 NF_IP_PRI_CONNTRACK                 = -200,
 NF_IP_PRI_BRIDGE_SABOTAGE_FORWARD   = -175,

 /* Mangle, destination NAT, filter and source NAT. */
 NF_IP_PRI_MANGLE                    = -150,
 NF_IP_PRI_NAT_DST                   = -100,
 NF_IP_PRI_BRIDGE_SABOTAGE_LOCAL_OUT = -50,
 NF_IP_PRI_FILTER                    = 0,
 NF_IP_PRI_NAT_SRC                   = 100,
 NF_IP_PRI_SELINUX_LAST              = 225,

 /* The three largest values: conntrack helper, NAT sequence adjustment
  * and conntrack confirm. */
 NF_IP_PRI_CONNTRACK_HELPER          = INT_MAX - 2,
 NF_IP_PRI_NAT_SEQ_ADJUST            = INT_MAX - 1,
 NF_IP_PRI_CONNTRACK_CONFIRM         = INT_MAX,

 NF_IP_PRI_LAST                      = INT_MAX,
};

桥下的netfilter控制已经正式集成到内核中,不再像以前那样需要打补丁了。

协议数据内容NAT时序列号调整也使用单独的nf_hook_ops,分别作用在INPUT和POSTROUTING挂接点上,以前也是在NAT模块中一起进行:
 
/* net/ipv4/netfilter/ip_nat_standalone.c */
......
 {
  .hook  = ip_nat_adjust,
  .owner  = THIS_MODULE,
  .pf  = PF_INET,
  .hooknum = NF_IP_POST_ROUTING,
  .priority = NF_IP_PRI_NAT_SEQ_ADJUST,
 },
......
 {
  .hook  = ip_nat_adjust,
  .owner  = THIS_MODULE,
  .pf  = PF_INET,
  .hooknum = NF_IP_LOCAL_IN,
  .priority = NF_IP_PRI_NAT_SEQ_ADJUST,
 },
......
 
/*
 * Netfilter hook function that applies TCP sequence number adjustment to
 * packets of connections flagged with IPS_SEQ_ADJUST_BIT.
 *
 * Only pskb is used; hooknum, in, out and okfn are ignored here.
 * Returns NF_DROP when ip_nat_seq_adjust() reports failure, NF_ACCEPT in
 * every other case.
 */
static unsigned int
ip_nat_adjust(unsigned int hooknum,
       struct sk_buff **pskb,
       const struct net_device *in,
       const struct net_device *out,
       int (*okfn)(struct sk_buff *))
{
 enum ip_conntrack_info ctinfo;
 struct ip_conntrack *ct = ip_conntrack_get(*pskb, &ctinfo);

 /* Untracked packet, or connection not flagged for adjustment. */
 if (!ct || !test_bit(IPS_SEQ_ADJUST_BIT, &ct->status))
  return NF_ACCEPT;

 DEBUGP("ip_nat_standalone: adjusting sequence number\n");
 return ip_nat_seq_adjust(pskb, ct, ctinfo) ? NF_ACCEPT : NF_DROP;
}
 
其中标志位:IPS_SEQ_ADJUST_BIT是在ip_nat_mangle_tcp_packet()函数中发现修改后的数据长度和原数据长度不同时设置的。
 
2.4 普通数据的NAT

现在的NAT只处理TCP/IP头部分,包括地址和传输层的端口,不再处理应用部分的数据的修改。而且对于已经建立好NAT对应关系的已建立(ESTABLISHED)的包,相关处理函数由以前的do_binding()改为ip_nat_packet():
/* net/ipv4/netfilter/ip_nat_core.c */
 
/*
 * Do packet manipulations according to ip_nat_setup_info.
 *
 * ct:      connection the packet belongs to.
 * ctinfo:  conntrack state/direction of the packet.
 * hooknum: hook being traversed; it selects source or destination
 *          manipulation via HOOK2MANIP().
 * pskb:    packet to rewrite.
 *
 * Returns NF_DROP when manip_pkt() fails.  Returns NF_ACCEPT otherwise,
 * including when the connection needs no manipulation of this kind.
 */
unsigned int ip_nat_packet(struct ip_conntrack *ct,
      enum ip_conntrack_info ctinfo,
      unsigned int hooknum,
      struct sk_buff **pskb)
{
 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
 enum ip_nat_manip_type mtype = HOOK2MANIP(hooknum);
 /* Status bit matching the kind of manipulation done at this hook. */
 unsigned long statusbit = (mtype == IP_NAT_MANIP_SRC) ? IPS_SRC_NAT
             : IPS_DST_NAT;
 struct ip_conntrack_tuple target;

 /* Invert if this is reply dir. */
 if (dir == IP_CT_DIR_REPLY)
  statusbit ^= IPS_NAT_MASK;

 /* Non-atomic: these bits don't change.  Per the article, the NAT type
  * bits in ct->status are set by ip_nat_setup_info(). */
 if (!(ct->status & statusbit))
  return NF_ACCEPT;

 /* We are aiming to look like inverse of other direction. */
 invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);

 /* Rewrite the packet to match the target tuple. */
 if (!manip_pkt(target.dst.protonum, pskb, 0, &target, mtype))
  return NF_DROP;

 return NF_ACCEPT;
}
可看到ip_nat_packet()函数比以前的do_binding()函数简洁了很多。

2.5  连接状态值
 
/* include/linux/netfilter/nf_conntrack_common.h */
 
/*
 * Bitset representing status of connection.
 * Every flag is given twice: as a bit index (*_BIT) and as the matching
 * mask value.
 */
enum ip_conntrack_status {
 /* Bit 0: this is an expected connection.  Never changes. */
 IPS_EXPECTED_BIT     = 0,
 IPS_EXPECTED         = (1 << IPS_EXPECTED_BIT),

 /* Bit 1: packets were seen in both directions.  Set only, never unset. */
 IPS_SEEN_REPLY_BIT   = 1,
 IPS_SEEN_REPLY       = (1 << IPS_SEEN_REPLY_BIT),

 /* Bit 2: the conntrack should never be early-expired. */
 IPS_ASSURED_BIT      = 2,
 IPS_ASSURED          = (1 << IPS_ASSURED_BIT),

 /* Bit 3: connection is confirmed, the originating packet left the box. */
 IPS_CONFIRMED_BIT    = 3,
 IPS_CONFIRMED        = (1 << IPS_CONFIRMED_BIT),

 /* Bit 4: source NAT needed in the original direction.  Never changes. */
 IPS_SRC_NAT_BIT      = 4,
 IPS_SRC_NAT          = (1 << IPS_SRC_NAT_BIT),

 /* Bit 5: destination NAT needed in the original direction.  Never
  * changes. */
 IPS_DST_NAT_BIT      = 5,
 IPS_DST_NAT          = (1 << IPS_DST_NAT_BIT),

 /* Source and destination NAT bits together. */
 IPS_NAT_MASK         = (IPS_DST_NAT | IPS_SRC_NAT),

 /* Bit 6: TCP sequence numbers of the connection need adjusting. */
 IPS_SEQ_ADJUST_BIT   = 6,
 IPS_SEQ_ADJUST       = (1 << IPS_SEQ_ADJUST_BIT),

 /* Bits 7 and 8: NAT initialization bits. */
 IPS_SRC_NAT_DONE_BIT = 7,
 IPS_SRC_NAT_DONE     = (1 << IPS_SRC_NAT_DONE_BIT),
 IPS_DST_NAT_DONE_BIT = 8,
 IPS_DST_NAT_DONE     = (1 << IPS_DST_NAT_DONE_BIT),

 /* Both NAT initialization bits together. */
 IPS_NAT_DONE_MASK    = (IPS_DST_NAT_DONE | IPS_SRC_NAT_DONE),

 /* Bit 9: connection is dying (removed from lists).  Cannot be unset. */
 IPS_DYING_BIT        = 9,
 IPS_DYING            = (1 << IPS_DYING_BIT),
};

在连接结构struct ip_conntrack中的status参数用来描述该连接的一些状态,用上面的枚举类型描述,和2.4相比,增加了关于NAT操作的一些新的位。
 
 
3. 结论

2.6.1*以后的Linux内核中的NAT部分进行了较大的变化,修改后的NAT部分只专注于IP层和传输层参数的修改,剔除了内容级别的修改,内容级别的修改变成连接跟踪的单独模块进行,使系统更简化,同时由skbuff数据包获取连接信息的操作也更加简单,理论上系统效率会高一些。

发表于: 2006-08-28,修改于: 2006-08-28 08:30,已浏览3556次,有评论2条 推荐 投诉
	网友: rha030 	时间:2007-11-23 09:55:14 IP地址:211.103.237.★
	

我最近在学习netfilter,在网上找到你的文章,对我帮助很大,所以首先要谢谢你!

        我现在有一个问题想不清楚,就是nat是怎么使用conntrack->tuplehash的,ORIG和REPLY tuple分别指的是什么呢?前者指的是NAT转换前的?还有就是如果NAT设置了SNAT或者DNAT,另外一个方向就不需要再进行相应的设置了,就是靠这个tuplehash保存相应的信息来做的吧,可能搞清了第一个问题,这个问题也就明白了,下面是你博客上文章的一部分,没有看懂。。。请指教!

         谢谢!我的邮箱是[email protected]











orig_tp =

    conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */

// 通过连接反方向的tuple获取连接正方向的tuple值存到curr_tuple中,

// 也就是NAT转换前的tuple

 invert_tuplepr(&curr_tuple,

         &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);

// 获取地址转换后的新的tuple值到new_tuple中,已经包括了传输层上的转换

 get_unique_tuple(&new_tuple, &curr_tuple, range, conntrack, maniptype);

// 检查转换前后的tuple值是否相同,new_tuple是NAT后的新的原始方向的tuple

 if (!ip_ct_tuple_equal(&new_tuple, &curr_tuple)) {

// 不同,进行NAT转换

  struct ip_conntrack_tuple reply;

  /* Alter conntrack table so will recognize replies. */

// 获取转换后的连接响应方向的tuple值到reply中

  invert_tuplepr(&reply, &new_tuple);

// 修改连接中的响应方向的tuple值

// 即conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = reply

  ip_conntrack_alter_reply(conntrack, &reply);


	网友: yfydz 	时间:2007-11-23 11:02:02 IP地址:218.247.216.★
	

ORIG就是从发起方看起来的网络连接,REPLY是从响应方看起来的网络连接

比如说内部机器10.1.1.1:tcp:1024->1.1.1.1:tcp:80,这是ORIG, SNAT后是2.2.2.2,SNAT后端口是1025,这时REPLY的连接就是:1.1.1.1:tcp:80->2.2.2.2:tcp:1025,ORI的源就是REP的目的,ORI的目的就是REP的源,如果两个不同的话就进行NAT操作

你可能感兴趣的:(数据结构,linux,.net,网络应用,网络协议)