在分析icmp数据包处理流程之前,我有如下疑问:
1、为什么要为每一个cpu创建一个仅用于发送icmp报文的socket呢,不使用socket不也是可以把icmp报文发送出去吗?
2、ping的工作原理是什么呢?
3、Traceroute的工作原理是什么呢?
一、icmp协议的初始化
1)ICMP接收处理函数的初始化
我们知道icmp协议是附属于ip层的3层协议,且是将icmp数据存放于ip数据包的数据部分的3层协议。而tcp、udp也是将tcp、udp数据存放于ip数据包的数据部分的4层协议。
虽然icmp与tcp等协议不属于同一个网络层,但是都是在3层ip协议处理完以后,才会交给icmp、tcp的处理函数去处理。因此在linux中,都是调用inet_add_protocol将其接收处理函数相关的数据结构添加到数组inet_protos中去的(关于三、四层接收数据处理函数的注册相关的知识请参看http://blog.csdn.net/lickylin/article/details/22900401)。
Icmp的接收处理函数相关的结构体定义如下:
/*
 * Receive-side protocol descriptor for ICMP.
 * .handler names the function that processes received ICMP packets.
 */
static const struct net_protocol icmp_protocol = {
	.netns_ok  = 1,
	.no_policy = 1,
	.handler   = icmp_rcv,
};
在inet_init初始化时,即会调用inet_add_protocol将tcp、udp、icmp、igmp等协议相关的接收处理结构体注册,并保存在数组inet_protos中。
当接收的数据包的协议为icmp时,即会调用icmp_rcv进行后续处理。
2) icmp协议模块的初始化
主要是调用函数register_pernet_subsys(关于该函数的工作流程,请参看http://blog.csdn.net/lickylin/article/details/18013879),将icmp协议模块注册到网络命名空间中,并调用ops->init执行协议初始化相关的代码。
对于icmp,其pernet_operations的定义如下:
/*
 * Per-network-namespace operations for ICMP:
 * icmp_sk_init is the init hook, icmp_sk_exit the exit hook.
 */
static struct pernet_operations __net_initdata icmp_sk_ops = {
	.init = icmp_sk_init,
	.exit = icmp_sk_exit,
};
在调用register_pernet_subsys将icmp协议模块注册到网络命名空间后,即会调用icmp_sk_init进行icmp协议初始化相关的功能,我们分析下icmp_sk_init。
该函数主要实现以下功能:
/*
 * icmp_sk_init - per-namespace ICMP initialisation.
 *
 * 1. Creates one raw ICMP socket per possible CPU; these sockets are
 *    used for sending ICMP packets.
 * 2. Installs the default values of the ICMP sysctls (rate limiting,
 *    which requests/errors are ignored, ...).
 *
 * Returns 0 on success, -ENOMEM when the socket pointer array cannot be
 * allocated, or the negative error reported by inet_ctl_sock_create().
 */
static int __net_init icmp_sk_init(struct net *net)
{
	int cpu, err;

	/* Zero-filled array holding one struct sock pointer per CPU id. */
	net->ipv4.icmp_sk = kzalloc(nr_cpu_ids * sizeof(struct sock *),
				    GFP_KERNEL);
	if (!net->ipv4.icmp_sk)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		struct sock *sk;

		/*
		 * Create a SOCK_RAW / IPPROTO_ICMP control socket.
		 * NOTE(review): the accompanying article states that this
		 * helper also unhashes the socket from the raw socket hash
		 * table so that it never receives packets - not visible in
		 * this excerpt, confirm against inet_ctl_sock_create().
		 */
		err = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
					   IPPROTO_ICMP, net);
		if (err < 0)
			goto fail;

		net->ipv4.icmp_sk[cpu] = sk;

		/*
		 * Enough space for 2 64K ICMP packets, including
		 * sk_buff struct overhead.
		 */
		sk->sk_sndbuf = (2 * ((64 * 1024) + sizeof(struct sk_buff)));

		/* Speedup sock_wfree() */
		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT;
	}

	/* Answer echo requests by default. */
	net->ipv4.sysctl_icmp_echo_ignore_all = 0;
	/* Ignore echo/timestamp requests sent to broadcast/multicast. */
	net->ipv4.sysctl_icmp_echo_ignore_broadcasts = 1;
	/* Silently ignore bogus ICMP errors sent in reply to broadcasts. */
	net->ipv4.sysctl_icmp_ignore_bogus_error_responses = 1;
	/* Rate limit interval. */
	net->ipv4.sysctl_icmp_ratelimit = 1 * HZ;
	/*
	 * Bitmask of rate-limited ICMP types: bits 3, 4, 11 and 12
	 * (dest unreachable, source quench, time exceeded, parameter problem).
	 */
	net->ipv4.sysctl_icmp_ratemask = 0x1818;
	net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0;

	return 0;

fail:
	/* Tear down whatever was created; untouched slots are still NULL. */
	for_each_possible_cpu(cpu)
		inet_ctl_sock_destroy(net->ipv4.icmp_sk[cpu]);
	kfree(net->ipv4.icmp_sk);
	return err;
}
疑问:新创建socket时,为什么要将其从hash链表raw_v4_hashinfo.ht[RAW_HTABLE_SIZE]中删除呢?
因为我们只使用这个socket进行发送数据包,而不需要使用该socket接收数据包。所以此处将其从hash链表raw_v4_hashinfo.ht[RAW_HTABLE_SIZE]中删除。
为什么不使用该socket直接接收icmp报文呢,我的理解是如果使用该socket接收报文,就需要在kernel创建一个内核线程,用于侦听是否有数据到达该socket,然后再进行处理。
而直接使用内核四层协议接收处理函数的注册流程,可以很方便的就能对接收的icmp报文进行处理,而且使用的内核资源比较少,所以对于kernel创建的socket,其接收操作基本上是使用内核四层协议接收处理函数的注册流程实现的。而对于应用层创建的icmp相关的socket则不会执行上述操作。
二、ICMP协议的接收处理函数
Icmp接收处理函数为icmp_rcv,下面分析这个函数。
主要功能:
1、 对数据包进行合理性检查
2、 根据icmp的类型,调用icmp_pointers中对应的处理函数
int icmp_rcv(struct sk_buff *skb)
{
structicmphdr *icmph;
structrtable *rt = skb_rtable(skb);
structnet *net = dev_net(rt->u.dst.dev);
/*
基于策略的高扩展性的网络安全架构,对于这个内核子架构不清楚
此处分析不了,跳过。
*/
if(!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
structsec_path *sp = skb_sec_path(skb);
intnh;
if(!(sp && sp->xvec[sp->len - 1]->props.flags &
XFRM_STATE_ICMP))
gotodrop;
if(!pskb_may_pull(skb, sizeof(*icmph) + sizeof(struct iphdr)))
gotodrop;
nh= skb_network_offset(skb);
skb_set_network_header(skb,sizeof(*icmph));
if(!xfrm4_policy_check_reverse(NULL, XFRM_POLICY_IN, skb))
gotodrop;
skb_set_network_header(skb,nh);
}
ICMP_INC_STATS_BH(net,ICMP_MIB_INMSGS);
/*验证校验和信息*/
switch(skb->ip_summed) {
caseCHECKSUM_COMPLETE:
if(!csum_fold(skb->csum))
break;
/*fall through */
caseCHECKSUM_NONE:
skb->csum= 0;
if(__skb_checksum_complete(skb))
gotoerror;
}
if(!pskb_pull(skb, sizeof(*icmph)))
gotoerror;
/*获取icmp头部*/
icmph= icmp_hdr(skb);
ICMPMSGIN_INC_STATS_BH(net,icmph->type);
/*
对于不支持的icmp报文,直接丢掉
*/
if(icmph->type > NR_ICMP_TYPES)
gotoerror;
/*
判断是否丢弃掉多播类型的icmp数据包
1、只处理echo、timestamp、address_mask_request、address_mask_reply类型的多播icmp数据包
*/
if(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
/*
* RFC1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be
* silently ignored (we let user decide with asysctl).
* RFC1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
* discarded if to broadcast/multicast.
*/
if((icmph->type == ICMP_ECHO ||
icmph->type == ICMP_TIMESTAMP)&&
net->ipv4.sysctl_icmp_echo_ignore_broadcasts) {
gotoerror;
}
if(icmph->type != ICMP_ECHO &&
icmph->type != ICMP_TIMESTAMP &&
icmph->type != ICMP_ADDRESS &&
icmph->type != ICMP_ADDRESSREPLY) {
gotoerror;
}
}
/*根据icmp数据包类型,调用相应的处理函数*/
icmp_pointers[icmph->type].handler(skb);
drop:
kfree_skb(skb);
return0;
error:
ICMP_INC_STATS_BH(net,ICMP_MIB_INERRORS);
gotodrop;
}
对于icmp_pointers的定义如下:
/*
* This table is the definition of how wehandle ICMP.
*/
static const struct icmp_controlicmp_pointers[NR_ICMP_TYPES + 1] = {
[ICMP_ECHOREPLY]= {
.handler= icmp_discard,
},
[1]= {
.handler= icmp_discard,
.error= 1,
},
[2]= {
.handler= icmp_discard,
.error= 1,
},
[ICMP_DEST_UNREACH]= {
.handler= icmp_unreach,
.error= 1,
},
[ICMP_SOURCE_QUENCH]= {
.handler= icmp_unreach,
.error= 1,
},
[ICMP_REDIRECT]= {
.handler= icmp_redirect,
.error= 1,
},
[6]= {
.handler= icmp_discard,
.error= 1,
},
[7]= {
.handler= icmp_discard,
.error= 1,
},
[ICMP_ECHO]= {
.handler= icmp_echo,
},
[9]= {
.handler= icmp_discard,
.error= 1,
},
[10]= {
.handler= icmp_discard,
.error= 1,
},
[ICMP_TIME_EXCEEDED]= {
.handler= icmp_unreach,
.error= 1,
},
[ICMP_PARAMETERPROB]= {
.handler= icmp_unreach,
.error= 1,
},
[ICMP_TIMESTAMP]= {
.handler= icmp_timestamp,
},
[ICMP_TIMESTAMPREPLY]= {
.handler= icmp_discard,
},
[ICMP_INFO_REQUEST]= {
.handler= icmp_discard,
},
[ICMP_INFO_REPLY]= {
.handler= icmp_discard,
},
[ICMP_ADDRESS]= {
.handler= icmp_address,
},
[ICMP_ADDRESSREPLY]= {
.handler= icmp_address_reply,
},
};
目前内核处理的icmp报文有icmp_unreach、icmp_address、icmp_address_reply、icmp_timestamp、icmp_echo、icmp_redirect。
icmp_echo
/*
该函数主要是将icmp的type设置为ICMP_ECHOREPLY,并调用icmp_reply将该数据包发送出去
*/
static void icmp_echo(struct sk_buff *skb)
{
structnet *net;
net= dev_net(skb_dst(skb)->dev);
if(!net->ipv4.sysctl_icmp_echo_ignore_all) {
structicmp_bxm icmp_param;
icmp_param.data.icmph =*icmp_hdr(skb);
icmp_param.data.icmph.type= ICMP_ECHOREPLY;
icmp_param.skb = skb;
icmp_param.offset = 0;
icmp_param.data_len =skb->len;
icmp_param.head_len =sizeof(struct icmphdr);
icmp_reply(&icmp_param,skb);
}
}
Timestamp
/*
设置时间戳的值,并将icmp的type设置为ICMP_TIMESTAMPREPLY,并通过icmp_reply发送出去
*/
static void icmp_timestamp(struct sk_buff*skb)
{
structtimespec tv;
structicmp_bxm icmp_param;
/*
* Tooshort.
*/
if(skb->len < 4)
gotoout_err;
/*
* Fillin the current time as ms since midnight UT:
*/
getnstimeofday(&tv);
icmp_param.data.times[1]= htonl((tv.tv_sec % 86400) * MSEC_PER_SEC +
tv.tv_nsec / NSEC_PER_MSEC);
icmp_param.data.times[2]= icmp_param.data.times[1];
if(skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4))
BUG();
icmp_param.data.icmph =*icmp_hdr(skb);
icmp_param.data.icmph.type= ICMP_TIMESTAMPREPLY;
icmp_param.data.icmph.code= 0;
icmp_param.skb = skb;
icmp_param.offset = 0;
icmp_param.data_len =0;
icmp_param.head_len =sizeof(struct icmphdr) + 12;
icmp_reply(&icmp_param,skb);
out:
return;
out_err:
ICMP_INC_STATS_BH(dev_net(skb_dst(skb)->dev),ICMP_MIB_INERRORS);
gotoout;
}
Unreach 数据处理
功能:根据icmp中有效载荷数据的值,调用传输层的错误处理函数进行处理
static void icmp_unreach(struct sk_buff*skb)
{
structiphdr *iph;
structicmphdr *icmph;
inthash, protocol;
conststruct net_protocol *ipprot;
u32info = 0;
structnet *net;
net= dev_net(skb_dst(skb)->dev);
/*
* Incompleteheader ?
* Onlychecks for the IP header, there should be an
* additionalcheck for longer headers in upper levels.
*/
if(!pskb_may_pull(skb, sizeof(struct iphdr)))
gotoout_err;
/*获取icmp首部*/
icmph= icmp_hdr(skb);
iph = (struct iphdr *)skb->data;
/*判断ip首部是否完整*/
if(iph->ihl < 5) /* Mangled header, drop. */
gotoout_err;
/*仅处理type类型为3或者12的数据包
1、当类型为3时,仅处理code为frag needed的报文
a)当系统不支持pmtu时,丢弃该数据包
b)当系统支持pmtu时,调用ip_rt_frag_needed修改pmtu的值
2、当type类型为12时,则通过icmph->un.gateway获取出错偏移值(相对于数据包)
*/
if(icmph->type == ICMP_DEST_UNREACH) {
switch(icmph->code & 15) {
caseICMP_NET_UNREACH:
caseICMP_HOST_UNREACH:
caseICMP_PROT_UNREACH:
caseICMP_PORT_UNREACH:
break;
caseICMP_FRAG_NEEDED:
if(ipv4_config.no_pmtu_disc) {
LIMIT_NETDEBUG(KERN_INFO"ICMP: %pI4: fragmentation needed and DF set.\n",
&iph->daddr);
}else {
info= ip_rt_frag_needed(net, iph,
ntohs(icmph->un.frag.mtu),
skb->dev);
if(!info)
gotoout;
}
break;
caseICMP_SR_FAILED:
LIMIT_NETDEBUG(KERN_INFO"ICMP: %pI4: Source Route Failed.\n",
&iph->daddr);
break;
default:
break;
}
if(icmph->code > NR_ICMP_UNREACH)
gotoout;
}else if (icmph->type == ICMP_PARAMETERPROB)
info= ntohl(icmph->un.gateway) >> 24;
/*
* Throwit at our lower layers
*
* RFC1122: 3.2.2 MUST extract the protocol ID from the passed
* header.
* RFC1122: 3.2.2.1 MUST pass ICMP unreach messages to the
* transport layer.
* RFC1122: 3.2.2.2 MUST pass ICMP time expired messages to
* transport layer.
*/
/*
* Checkthe other end isnt violating RFC 1122. Some routers send
* bogusresponses to broadcast frames. If you see this message
* firstcheck your netmask matches at both ends, if it does then
* getthe other vendor to fix their kit.
*/
/*
对于目的地址是广播的icmp数据包,且需要忽略时,则打印错误并
忽略该数据包
*/
if(!net->ipv4.sysctl_icmp_ignore_bogus_error_responses &&
inet_addr_type(net, iph->daddr) ==RTN_BROADCAST) {
if(net_ratelimit())
printk(KERN_WARNING"%pI4 sent an invalid ICMP "
"type %u, code %u "
"error to a broadcast: %pI4 on%s\n",
&ip_hdr(skb)->saddr,
icmph->type, icmph->code,
&iph->daddr,
skb->dev->name);
gotoout;
}
/*
检测icmp报文中有效载荷部分内容长度是否大于等于ip头部信息加上8字节
在发送icmp差错报文时,会将icmp数据部分的值设置为ip头部信息+ ip有效载荷的前8个字节,
这样就可以判断是传输层的那个应用数据发送出错
*/
if(!pskb_may_pull(skb, iph->ihl * 4 + 8))
gotoout;
/*
此时的iph,是icmp有效载荷中的ip头部信息,在icmp_rcv中已经将skb->data指向icmp报文的
有效载荷部分了。
*/
iph= (struct iphdr *)skb->data;
/*获取传输层协议值*/
protocol= iph->protocol;
/*
首先调用raw_icmp_error,将差错信息发送给感兴趣的raw sockte
*/
raw_icmp_error(skb,protocol, info);
/*根据protocol值,查找符合条件的4层接收处理hash数组inet_protos,
调用其错误处理函数进行后续处理*/
hash= protocol & (MAX_INET_PROTOS - 1);
rcu_read_lock();
ipprot= rcu_dereference(inet_protos[hash]);
if(ipprot && ipprot->err_handler)
ipprot->err_handler(skb,info);
rcu_read_unlock();
out:
return;
out_err:
ICMP_INC_STATS_BH(net,ICMP_MIB_INERRORS);
gotoout;
}
重定向处理:
功能:根据icmp中数据部分中的值,调用ip_rt_redirect,进行后续处理(对于路由重定向的处理代码不熟悉,下次分析路由分支时再仔细分析)。
static void icmp_redirect(struct sk_buff*skb)
{
structiphdr *iph;
/*数据包有效性检查*/
if(skb->len < sizeof(struct iphdr))
gotoout_err;
if(!pskb_may_pull(skb, sizeof(struct iphdr)))
gotoout;
/*获取icmp数据部分携带的ip头部信息*/
iph= (struct iphdr *)skb->data;
/*对于code为ICMP_REDIR_NET 、ICMP_REDIR_NETTOS 、ICMP_REDIR_HOST 、ICMP_REDIR_HOSTTOS ,调用ip_rt_redirect 进行路由重定向的处理*/
switch(icmp_hdr(skb)->code & 7) {
caseICMP_REDIR_NET:
caseICMP_REDIR_NETTOS:
/*
* As per RFC recommendations now handle it asa host redirect.
*/
caseICMP_REDIR_HOST:
caseICMP_REDIR_HOSTTOS:
ip_rt_redirect(ip_hdr(skb)->saddr,iph->daddr,
icmp_hdr(skb)->un.gateway,
iph->saddr, skb->dev);
break;
}
out:
return;
out_err:
ICMP_INC_STATS_BH(dev_net(skb->dev),ICMP_MIB_INERRORS);
gotoout;
}
Icmp_reply
在前面介绍icmp echo的应答以及icmp timestamp的应答时,函数都是调用icmp_reply发送数据的,下面分析一下这个函数
功能:
1、查找路由,若查找失败,直接返回;查找成功执行第二步
2、调用速率限制函数icmpv4_xrlim_allow进行速率限制,当允许发送时,执行第三步,否则返回
3、调用icmp_push_reply发送数据
/*
* Driving logic for building and sending ICMPmessages.
*/
static void icmp_reply(struct icmp_bxm*icmp_param, struct sk_buff *skb)
{
structipcm_cookie ipc;
structrtable *rt = skb_rtable(skb);
structnet *net = dev_net(rt->u.dst.dev);
structsock *sk;
structinet_sock *inet;
__be32daddr;
if(ip_options_echo(&icmp_param->replyopts, skb))
return;
sk= icmp_xmit_lock(net);
if(sk == NULL)
return;
inet= inet_sk(sk);
icmp_param->data.icmph.checksum= 0;
inet->tos= ip_hdr(skb)->tos;
daddr= ipc.addr = rt->rt_src;
ipc.opt= NULL;
ipc.shtx.flags= 0;
if(icmp_param->replyopts.optlen) {
ipc.opt= &icmp_param->replyopts;
if(ipc.opt->srr)
daddr= icmp_param->replyopts.faddr;
}
{
structflowi fl = { .nl_u = { .ip4_u =
{ .daddr = daddr,
.saddr= rt->rt_spec_dst,
.tos= RT_TOS(ip_hdr(skb)->tos) } },
.proto = IPPROTO_ICMP };
security_skb_classify_flow(skb,&fl);
if(ip_route_output_key(net, &rt, &fl))
gotoout_unlock;
}
if(icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type,
icmp_param->data.icmph.code))
icmp_push_reply(icmp_param,&ipc, &rt);
ip_rt_put(rt);
out_unlock:
icmp_xmit_unlock(sk);
}
a) 速率控制函数icmpv4_xrlim_allow
对于速率控制函数,主要是由两个函数完成icmpv4_xrlim_allow、xrlim_allow。
其中xrlim_allow是进行真正的限速操作;而icmpv4_xrlim_allow主要是返回是否成功,其对于不需要限速的数据,即返回允许通过
功能:xrlim_allow采用令牌桶算法,判断当前是否有足够的令牌用于数据发送,这个函数的流程还是比较简单的。
int xrlim_allow(struct dst_entry *dst, inttimeout)
{
unsignedlong now, token = dst->rate_tokens;
intrc = 0;
now= jiffies;
token+= now - dst->rate_last;
dst->rate_last= now;
if(token > XRLIM_BURST_FACTOR * timeout)
token= XRLIM_BURST_FACTOR * timeout;
if(token >= timeout) {
token-= timeout;
rc= 1;
}
dst->rate_tokens= token;
returnrc;
}
功能:判断是否允许发送数据
1、 对于不支持的icmp type类型,返回允许发送
2、 对于type类型为ICMP_DEST_UNREACH code为ICMP_FRAG_NEEDED的数据包,允许发送
3、 对于目的设备为回环设备的,返回允许发送
4、 对于其他类型的icmp报文,只有ipv4.sysctl_icmp_ratemask中对应位为1的数据包才会进行限速,对于其他类型的数据包,直接返回允许发送(即不限速)
static inline int icmpv4_xrlim_allow(structnet *net, struct rtable *rt,
inttype, int code)
{
structdst_entry *dst = &rt->u.dst;
intrc = 1;
if(type > NR_ICMP_TYPES)
gotoout;
/*Don't limit PMTU discovery. */
if(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
gotoout;
/*No rate limit on loopback */
if(dst->dev && (dst->dev->flags&IFF_LOOPBACK))
gotoout;
/*Limit if icmp type is enabled in ratemask. */
if((1 << type) & net->ipv4.sysctl_icmp_ratemask)
rc= xrlim_allow(dst, net->ipv4.sysctl_icmp_ratelimit);
out:
returnrc;
}
b) 数据发送icmp_push_reply
功能:
1、 调用ip_append_data,将数据缓存到sk->sk_write_queue中
2、 若缓存成功,则计算icmp校验和,并调用ip_push_pending_frames将数据发送出去;若缓存失败,则调用ip_flush_pending_frames丢弃已缓存的数据
static void icmp_push_reply(struct icmp_bxm*icmp_param,
struct ipcm_cookie *ipc, struct rtable**rt)
{
structsock *sk;
structsk_buff *skb;
/*获取socket*/
sk= icmp_sk(dev_net((*rt)->u.dst.dev));
/*调用ip_append_data,将要发送的数据缓存到sk->sk_write_queue
并调用ip_push_pending_frames,将数据发送出去*/
if(ip_append_data(sk, icmp_glue_bits, icmp_param,
icmp_param->data_len+icmp_param->head_len,
icmp_param->head_len,
ipc, rt, MSG_DONTWAIT) < 0)
ip_flush_pending_frames(sk);
elseif ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
structicmphdr *icmph = icmp_hdr(skb);
__wsumcsum = 0;
structsk_buff *skb1;
skb_queue_walk(&sk->sk_write_queue,skb1) {
csum= csum_add(csum, skb1->csum);
}
csum= csum_partial_copy_nocheck((void *)&icmp_param->data,
(char *)icmph,
icmp_param->head_len, csum);
icmph->checksum= csum_fold(csum);
skb->ip_summed= CHECKSUM_NONE;
ip_push_pending_frames(sk);
}
}
icmp_send函数
当由于入口数据包处理失败等原因需要通知发送方时,上层协议会调用icmp_send发送icmp差错报文,下面分析这个函数
功能:发送一个icmp error 数据包
不能发送icmp error 数据包的条件
1、对于入口数据包是多播的数据包(硬件或者ip地址为多播地址),不发送icmp error 数据包
2、对于入口数据包有分段的,仅对首个分段的入口数据包,发送icmp error数据包
3、入口数据包本身是icmp error类型的,不发送针对该入口数据包的icmp error
若入口数据包不满足上述条件,则需要发送针对该数据包的icmp error类型数据
1、查找路由
2、当路由查找成功后,则会调用icmp_push_reply 将数据发送出去
/*
 * icmp_send - send an ICMP message in response to the packet skb_in.
 *
 * skb_in: the received packet that triggered the message
 * type, code: ICMP type and code of the message to send
 * info: stored in the un.gateway field of the ICMP header
 *
 * Nothing is sent when skb_in has no route, its IP header is not inside
 * the buffer, it was not addressed to this host at link level, it was
 * routed as broadcast/multicast, it is not the first fragment, or (for
 * error types) it is itself an ICMP error. Otherwise the message is
 * routed, rate limited and sent via icmp_push_reply().
 */
void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
{
struct iphdr *iph;
int room;
struct icmp_bxm icmp_param;
struct rtable *rt = skb_rtable(skb_in);
struct ipcm_cookie ipc;
__be32 saddr;
u8 tos;
struct net *net;
struct sock *sk;
if (!rt)
goto out;
net = dev_net(rt->u.dst.dev);
/*
* Find the original header. It is expected to be valid, of course.
* Check this, icmp_send is called from the most obscure devices
* sometimes.
*/
iph = ip_hdr(skb_in);
/*
Sanity check: the IP header must lie between skb_in->head and
skb_in->tail.
*/
if ((u8 *)iph < skb_in->head ||
(skb_in->network_header + sizeof(*iph)) > skb_in->tail)
goto out;
/*
* No replies to physical multicast/broadcast
*/
/* Give up unless the packet was addressed to this host at link level. */
if (skb_in->pkt_type != PACKET_HOST)
goto out;
/*
* Now check at the protocol level
*/
/*
Give up if the packet was routed as broadcast or multicast.
*/
if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
goto out;
/*
* Only reply to fragment 0. We byte re-order the constant
* mask for efficiency.
*/
/*
For fragmented packets only the first fragment triggers an ICMP message.
*/
if (iph->frag_off & htons(IP_OFFSET))
goto out;
/*
* If we send an ICMP error to an ICMP error a mess would result..
*/
/*
If we are about to send an error and the received packet is itself an
ICMP error, do not answer it.
*/
if (icmp_pointers[type].error) {
/*
* We are an error, check if we are replying to an
* ICMP error
*/
if (iph->protocol == IPPROTO_ICMP) {
u8 _inner_type, *itp;
itp = skb_header_pointer(skb_in,
skb_network_header(skb_in) +
(iph->ihl << 2) +
offsetof(struct icmphdr,
type) -
skb_in->data,
sizeof(_inner_type),
&_inner_type);
if (itp == NULL)
goto out;
/*
* Assume any unknown ICMP type is an error. This
* isn't specified by the RFC, but think about it..
*/
if (*itp > NR_ICMP_TYPES ||
icmp_pointers[*itp].error)
goto out;
}
}
/* Disable bottom halves and try-lock this CPU's ICMP socket; NULL means the lock was not obtained. */
sk = icmp_xmit_lock(net);
if (sk == NULL)
return;
/*
* Construct source address and options.
*/
/*
1. Packet destined to this host: use its destination address as the
source address of the ICMP message.
2. Otherwise: if sysctl_icmp_errors_use_inbound_ifaddr is set and the
inbound interface is known, pick an address of that interface;
else leave the source as 0.
*/
saddr = iph->daddr;
if (!(rt->rt_flags & RTCF_LOCAL)) {
struct net_device *dev = NULL;
rcu_read_lock();
if (rt->fl.iif &&
net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
dev = dev_get_by_index_rcu(net, rt->fl.iif);
if (dev)
saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
else
saddr = 0;
rcu_read_unlock();
}
/* TOS: errors use the original TOS bits with "internet control" precedence, other messages copy the original TOS. */
tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
IPTOS_PREC_INTERNETCONTROL) :
iph->tos;
if (ip_options_echo(&icmp_param.replyopts, skb_in))
goto out_unlock;
/*
* Prepare data for ICMP header.
*/
/* Fill in the ICMP header fields. */
icmp_param.data.icmph.type= type;
icmp_param.data.icmph.code= code;
icmp_param.data.icmph.un.gateway = info;
icmp_param.data.icmph.checksum= 0;
icmp_param.skb = skb_in;
icmp_param.offset = skb_network_offset(skb_in);
inet_sk(sk)->tos = tos;
ipc.addr = iph->saddr;
ipc.opt = &icmp_param.replyopts;
ipc.shtx.flags = 0;
{
struct flowi fl = {
.nl_u = {
.ip4_u = {
.daddr = icmp_param.replyopts.srr ?
icmp_param.replyopts.faddr :
iph->saddr,
.saddr = saddr,
.tos = RT_TOS(tos)
}
},
.proto = IPPROTO_ICMP,
.uli_u = {
.icmpt = {
.type = type,
.code = code
}
}
};
int err;
struct rtable *rt2;
/* NOTE(review): XFRM/security related; not analysed by the article, which assumes XFRM is disabled. */
security_skb_classify_flow(skb_in, &fl);
/*
With XFRM out of the picture:
a) if no route is found, jump to out_unlock, which releases the socket
lock via icmp_xmit_unlock();
b) if a route is found, continue to rate limiting
(icmpv4_xrlim_allow) and transmission (icmp_push_reply).
*/
if (__ip_route_output_key(net, &rt, &fl))
goto out_unlock;
/* No need to clone since we're just using its address. */
rt2 = rt;
/* NOTE(review): XFRM lookup and reverse-session relookup follow; not analysed by the article. */
err = xfrm_lookup(net, (struct dst_entry **)&rt, &fl, NULL, 0);
switch (err) {
case 0:
if (rt != rt2)
goto route_done;
break;
case -EPERM:
rt = NULL;
break;
default:
goto out_unlock;
}
if (xfrm_decode_session_reverse(skb_in, &fl, AF_INET))
goto relookup_failed;
if (inet_addr_type(net, fl.fl4_src) == RTN_LOCAL)
err = __ip_route_output_key(net, &rt2, &fl);
else {
struct flowi fl2 = {};
struct dst_entry *odst;
fl2.fl4_dst = fl.fl4_src;
if (ip_route_output_key(net, &rt2, &fl2))
goto relookup_failed;
/* Ugh! */
odst = skb_dst(skb_in);
err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src,
RT_TOS(tos), rt2->u.dst.dev);
dst_release(&rt2->u.dst);
rt2 = skb_rtable(skb_in);
skb_dst_set(skb_in, odst);
}
if (err)
goto relookup_failed;
err = xfrm_lookup(net, (struct dst_entry **)&rt2, &fl, NULL,
XFRM_LOOKUP_ICMP);
switch (err) {
case 0:
dst_release(&rt->u.dst);
rt = rt2;
break;
case -EPERM:
goto ende;
default:
relookup_failed:
if (!rt)
goto out_unlock;
break;
}
}
route_done:
if (!icmpv4_xrlim_allow(net, rt, type, code))
goto ende;
/* RFC says return as much as we can without exceeding 576 bytes. */
room = dst_mtu(&rt->u.dst);
if (room > 576)
room = 576;
room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen;
room -= sizeof(struct icmphdr);
icmp_param.data_len = skb_in->len - icmp_param.offset;
if (icmp_param.data_len > room)
icmp_param.data_len = room;
icmp_param.head_len = sizeof(struct icmphdr);
icmp_push_reply(&icmp_param, &ipc, &rt);
ende:
ip_rt_put(rt);
out_unlock:
icmp_xmit_unlock(sk);
out:;
}
其他函数:
/*
 * icmp_sk - return the ICMP socket that belongs to the CPU currently
 * executing, taken from the namespace's per-CPU socket array.
 */
static struct sock *icmp_sk(struct net *net)
{
	struct sock **percpu_socks = net->ipv4.icmp_sk;

	return percpu_socks[smp_processor_id()];
}
/*
 * icmp_xmit_lock - disable bottom halves and try to take the spinlock of
 * the current CPU's ICMP socket.
 *
 * Returns the locked socket, or NULL (with bottom halves re-enabled) when
 * the lock is already held.
 */
static inline struct sock *icmp_xmit_lock(struct net *net)
{
	struct sock *sk;

	local_bh_disable();

	sk = icmp_sk(net);
	if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
		/* This can happen if the output path signals a
		 * dst_link_failure() for an outgoing ICMP packet.
		 */
		local_bh_enable();
		sk = NULL;
	}

	return sk;
}
/*
 * icmp_xmit_unlock - counterpart of icmp_xmit_lock(): release the socket
 * spinlock and re-enable bottom halves in one step.
 */
static inline void icmp_xmit_unlock(struct sock *sk)
{
	spin_unlock_bh(&(sk->sk_lock.slock));
}
至此,完成icmp数据收发流程的分析