Packet Reception in the Network and Transport Layers
- Where does the network layer receive packets?
- How are they handed up to the transport layer?
We will not discuss the routing subsystem here; that is, we assume we already know which network-layer function should handle the packet.
Framework
With the groundwork laid in the previous section, we know which handler functions were registered when the protocol stack initialized. We pick up at the function-pointer call in __netif_receive_skb_core, ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);, which invokes ip_rcv for IPv4 and ipv6_rcv for IPv6. We continue the analysis with ip_rcv as our example; keep the overall call chain in mind as a blueprint while reading the code below.
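As a reminder of where that function pointer comes from, here is the registration of ip_rcv as the ETH_P_IP handler, excerpted (and abbreviated) from net/ipv4/af_inet.c:
static struct packet_type ip_packet_type __read_mostly = {
	.type = cpu_to_be16(ETH_P_IP),
	.func = ip_rcv,
};

static int __init inet_init(void)
{
	//......
	dev_add_pack(&ip_packet_type);	// pt_prev->func resolves to ip_rcv for IPv4 frames
	//......
}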
Implementation
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
	const struct iphdr *iph;
	struct net *net;
	u32 len;

	//......

	return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
		       net, NULL, skb, dev, NULL,
		       ip_rcv_finish);
}
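The elided section is where the sanity checks happen. A sketch of the most important ones, paraphrasing the actual ip_rcv body (labels and surrounding cleanup omitted):
	// Paraphrase of ip_rcv's elided validation (not verbatim):
	// reject headers that are too short or not IPv4,
	if (iph->ihl < 5 || iph->version != 4)
		goto inhdr_error;
	// verify the IP header checksum,
	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
		goto csum_error;
	// and make sure tot_len is consistent with the skb.
	len = ntohs(iph->tot_len);
	if (skb->len < len || len < (iph->ihl * 4))
		goto inhdr_error;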
After performing the necessary checks on the skb's IP header, ip_rcv, as usual, hands off to ip_rcv_finish. Functions with this finish suffix are invoked only after the netfilter hooks pass the packet; netfilter itself, like the routing subsystem, is a big topic that we defer to later.
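The key to reading NF_HOOK is its last argument, the continuation ("okfn"): the hooks registered at NF_INET_PRE_ROUTING run first, and only if the packet survives them does ip_rcv_finish get called. Conceptually (a simplified sketch for intuition, with run_hooks standing in for the real hook iteration; this is not the kernel source):
// Simplified model of NF_HOOK (a sketch, not the kernel source):
// run the netfilter hooks for this hook point; on NF_ACCEPT, call
// the continuation okfn, here ip_rcv_finish.
static inline int NF_HOOK_sketch(struct net *net, struct sk_buff *skb,
				 int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
	int verdict = run_hooks(net, skb);	/* hypothetical helper */

	if (verdict == NF_ACCEPT)
		return okfn(net, NULL, skb);	/* continue up the stack */
	return verdict;				/* dropped/stolen/queued by a hook */
}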
static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);
	int (*edemux)(struct sk_buff *skb);
	struct net_device *dev = skb->dev;
	struct rtable *rt;
	int err;

	/* if ingress device is enslaved to an L3 master device pass the
	 * skb to its handler for processing
	 */
	skb = l3mdev_ip_rcv(skb);
	if (!skb)
		return NET_RX_SUCCESS;

	// Early demux: give the L4 protocol a chance to find the socket
	// (and its cached route) before the routing decision.
	if (net->ipv4.sysctl_ip_early_demux &&
	    !skb_dst(skb) &&
	    !skb->sk &&
	    !ip_is_fragment(iph)) {
		const struct net_protocol *ipprot;
		int protocol = iph->protocol;

		ipprot = rcu_dereference(inet_protos[protocol]);
		if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) {
			err = edemux(skb);
			if (unlikely(err))
				goto drop_error;
			/* must reload iph, skb->head might have changed */
			iph = ip_hdr(skb);
		}
	}

	/*
	 * Initialise the virtual path cache for the packet. It describes
	 * how the packet travels inside Linux networking.
	 */
	// Route lookup: attach the dst_entry that decides between local
	// delivery and forwarding.
	if (!skb_valid_dst(skb)) {
		err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
					   iph->tos, dev);
		if (unlikely(err))
			goto drop_error;
	}

#ifdef CONFIG_IP_ROUTE_CLASSID
	if (unlikely(skb_dst(skb)->tclassid)) {
		struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
		u32 idx = skb_dst(skb)->tclassid;

		st[idx&0xFF].o_packets++;
		st[idx&0xFF].o_bytes += skb->len;
		st[(idx>>16)&0xFF].i_packets++;
		st[(idx>>16)&0xFF].i_bytes += skb->len;
	}
#endif

	if (iph->ihl > 5 && ip_rcv_options(skb))
		goto drop;

	rt = skb_rtable(skb);
	if (rt->rt_type == RTN_MULTICAST) {
		__IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST) {
		__IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len);
	} else if (skb->pkt_type == PACKET_BROADCAST ||
		   skb->pkt_type == PACKET_MULTICAST) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		/* RFC 1122 3.3.6:
		 *
		 * When a host sends a datagram to a link-layer broadcast
		 * address, the IP destination address MUST be a legal IP
		 * broadcast or IP multicast address.
		 *
		 * A host SHOULD silently discard a datagram that is received
		 * via a link-layer broadcast (see Section 2.4) but does not
		 * specify an IP multicast or broadcast destination address.
		 *
		 * This doesn't explicitly say L2 *broadcast*, but broadcast is
		 * in a way a form of multicast and the most common use case for
		 * this is 802.11 protecting against cross-station spoofing (the
		 * so-called "hole-196" attack) so do it for both.
		 */
		if (in_dev &&
		    IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST))
			goto drop;
	}

	// Call the handler the routing subsystem installed: ip_local_deliver
	// or ip_forward.
	return dst_input(skb);

drop:
	kfree_skb(skb);
	return NET_RX_DROP;

drop_error:
	if (err == -EXDEV)
		__NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
	goto drop;
}
The most important task of ip_rcv_finish is to consult the routing subsystem, via ip_route_input_noref, to determine who should handle the packet next. As stated at the opening, we assume here that we already know where the packet will be handed off (the dispatch itself is sketched after this list):
- Packets destined for the local host: ip_local_deliver
- Packets to be forwarded: ip_forward
- Multicast packets: ip_mr_input
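The dispatch to one of these handlers is a one-liner: the route lookup stored the function pointer in the packet's dst_entry, and dst_input simply calls through it (the inline from include/net/dst.h in the kernel version analyzed here):
// dst_input: call the input handler the routing subsystem installed,
// e.g. ip_local_deliver for local traffic or ip_forward for transit.
static inline int dst_input(struct sk_buff *skb)
{
	return skb_dst(skb)->input(skb);
}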
Next we continue the analysis with a TCP packet destined for the local host.
int ip_local_deliver(struct sk_buff *skb)
{
	/*
	 * Reassemble IP fragments.
	 */
	struct net *net = dev_net(skb->dev);

	if (ip_is_fragment(ip_hdr(skb))) {
		if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER))
			return 0;
	}

	return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
		       net, NULL, skb, skb->dev, NULL,
		       ip_local_deliver_finish);
}
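The fragment test is just a flag check on the IP header: a packet is (part of) a fragment if the more-fragments bit is set or the fragment offset is non-zero (the inline from include/net/ip.h):
// Only complete datagrams may be delivered to L4; fragments are held
// back for reassembly by ip_defrag above.
static inline bool ip_is_fragment(const struct iphdr *iph)
{
	return (iph->frag_off & htons(IP_MF | IP_OFFSET)) != 0;
}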
Using the same approach, we continue with ip_local_deliver_finish.
static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	__skb_pull(skb, skb_network_header_len(skb));

	rcu_read_lock();
	{
		int protocol = ip_hdr(skb)->protocol;
		const struct net_protocol *ipprot;
		int raw;

	resubmit:
		raw = raw_local_deliver(skb, protocol);

		// Fetch the matching net_protocol entry from the inet_protos
		// array; for TCP this is tcp_protocol.
		ipprot = rcu_dereference(inet_protos[protocol]);
		if (ipprot) {
			int ret;

			if (!ipprot->no_policy) {
				if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
					kfree_skb(skb);
					goto out;
				}
				nf_reset(skb);
			}
			// Call the L4 protocol handler; for TCP this is
			// tcp_protocol->handler, i.e. tcp_v4_rcv().
			ret = ipprot->handler(skb);
			if (ret < 0) {
				protocol = -ret;
				goto resubmit;
			}
			__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
		} else {
			if (!raw) {
				if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
					__IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
					icmp_send(skb, ICMP_DEST_UNREACH,
						  ICMP_PROT_UNREACH, 0);
				}
				kfree_skb(skb);
			} else {
				__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
				consume_skb(skb);
			}
		}
	}
 out:
	rcu_read_unlock();
	return 0;
}
ip_local_deliver_finish uses ip_hdr(skb)->protocol as an index into the global inet_protos array to find the handler for the matching protocol. What is inet_protos, and where is it initialized? We covered this when discussing protocol-stack initialization: inet_init registers each entry through inet_add_protocol.
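For reference, the table entry is a small struct of function pointers, and registration is a single cmpxchg into the slot (excerpted from include/net/protocol.h and net/ipv4/protocol.c of the kernel version analyzed here):
// The per-protocol handler table entry: early_demux and handler are
// the two hooks we saw used in ip_rcv_finish and ip_local_deliver_finish.
struct net_protocol {
	int	(*early_demux)(struct sk_buff *skb);
	int	(*early_demux_handler)(struct sk_buff *skb);
	int	(*handler)(struct sk_buff *skb);
	void	(*err_handler)(struct sk_buff *skb, u32 info);
	unsigned int	no_policy:1,
			netns_ok:1,
			icmp_strict_tag_validation:1;
};

// Registration: atomically install the entry at inet_protos[protocol]
// if that slot is still empty.
int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
{
	return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],
			NULL, prot) ? 0 : -1;
}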
if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)	// register the TCP protocol
	pr_crit("%s: Cannot add TCP protocol\n", __func__);

static struct net_protocol tcp_protocol = {
	.early_demux		= tcp_v4_early_demux,
	.early_demux_handler	= tcp_v4_early_demux,
	.handler		= tcp_v4_rcv,	// invoked as ipprot->handler(skb) in ip_local_deliver_finish
	.err_handler		= tcp_v4_err,
	.no_policy		= 1,
	.netns_ok		= 1,
	.icmp_strict_tag_validation = 1,
};
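Note the early_demux member: this is the hook ip_rcv_finish called before the route lookup. For TCP it tries to find an established socket by the four-tuple and reuse that socket's cached input route. A simplified sketch of the idea follows (the real tcp_v4_early_demux has additional guards around socket state, refcounting, and dst validity):
// The idea behind tcp_v4_early_demux (a sketch, not the full code):
// look up an established sock before routing and reuse its cached
// route, so the fast path can skip the full FIB lookup.
int tcp_v4_early_demux_sketch(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);
	struct sock *sk;

	sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
				       iph->saddr, th->source,
				       iph->daddr, ntohs(th->dest),
				       skb->skb_iif, inet_sdif(skb));
	if (sk) {
		skb->sk = sk;					/* socket resolved early */
		if (sk->sk_rx_dst)
			skb_dst_set_noref(skb, sk->sk_rx_dst);	/* reuse cached route */
	}
	return 0;
}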
Let's continue with tcp_v4_rcv.
int tcp_v4_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	int sdif = inet_sdif(skb);
	const struct iphdr *iph;
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret;

	//......

lookup:
	// Find the corresponding sock by the four-tuple.
	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
			       th->dest, sdif, &refcounted);
	if (!sk)
		goto no_tcp_socket;

process:
	// TIME_WAIT gets its own handling (not analyzed this time; we will
	// cover it in detail with the TCP teardown state transitions).
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	//......

	// sock_owned_by_user simply tests (sk)->sk_lock.owned, which is 1
	// while a process context is using this sock.
	if (!sock_owned_by_user(sk)) {
		// tcp_v4_do_rcv processes the skb (essentially placing it
		// directly on the receive queue).
		ret = tcp_v4_do_rcv(sk, skb);
	} else if (tcp_add_backlog(sk, skb)) {	// a process holds the sock, so park the skb on sk_backlog
		goto discard_and_relse;
	}
	bh_unlock_sock(sk);

	//......
}
tcp_v4_rcv mainly does the following:
- set up TCP_SKB_CB (the per-skb TCP control buffer);
- look up the control block (the sock) for the segment;
- branch on the control block's state, with dedicated handling for TCP_TIME_WAIT, TCP_NEW_SYN_RECV, and TCP_LISTEN;
- receive the TCP segment (or park it on the backlog; see the sketch below).
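A natural question is when sk_backlog gets drained. When the owning process releases the sock (release_sock), the kernel replays every queued skb through sk_backlog_rcv, which for TCP points back at tcp_v4_do_rcv. A simplified sketch of that drain loop, modeled on __release_sock (locking and batching details elided):
// Simplified model of __release_sock's backlog drain (a sketch, not
// verbatim kernel code): every skb parked while the process owned the
// sock is replayed through sk_backlog_rcv, i.e. tcp_v4_do_rcv for TCP.
static void release_sock_backlog_sketch(struct sock *sk)
{
	struct sk_buff *skb = sk->sk_backlog.head;

	while (skb) {
		struct sk_buff *next = skb->next;

		skb->next = NULL;
		sk_backlog_rcv(sk, skb);	/* -> sk->sk_backlog_rcv(sk, skb) */
		skb = next;
	}
	sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
}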
We assume the TCP connection is already established when this packet arrives (sk->sk_state == TCP_ESTABLISHED). We follow tcp_v4_do_rcv as it stores the packet on the receive queue, where it waits to be handed to the corresponding application process.
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
		struct dst_entry *dst = sk->sk_rx_dst;

		sock_rps_save_rxhash(sk, skb);
		sk_mark_napi_id(sk, skb);
		if (dst) {
			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
			    !dst->ops->check(dst, 0)) {
				dst_release(dst);
				sk->sk_rx_dst = NULL;
			}
		}
		tcp_rcv_established(sk, skb);
		return 0;
	}
	//......
}
As shown above, for an established connection tcp_v4_do_rcv uses tcp_rcv_established to append the packet to the receive queue of the connection's struct sock, after which the application can pick the data up through read-style I/O system calls. tcp_rcv_established is fairly involved, so I plan to treat it separately; see also this in-depth analysis of the tcp_rcv_established function.
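To close the loop end to end: once tcp_rcv_established has queued data on the sock's receive queue, an ordinary blocking read on the application side is what finally consumes it (a minimal userspace sketch; connfd is assumed to be a connected TCP socket set up elsewhere):
// Minimal consumer side: a blocking read() on a connected TCP socket
// dequeues the bytes that tcp_rcv_established placed on the receive
// queue (via tcp_recvmsg in the kernel).
#include <stdio.h>
#include <unistd.h>

/* connfd is assumed to be a connected TCP socket file descriptor. */
static void drain_socket(int connfd)
{
	char buf[4096];
	ssize_t n;

	while ((n = read(connfd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, (size_t)n, stdout);	/* consume the payload */

	if (n < 0)
		perror("read");
}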
Summary
With that, the protocol stack's receive path is complete. It took five posts to get here, and clearly almost no packet-processing details were covered; we did not even introduce how sk_buff is used. We simply walked through the protocols' call flow once, so that later, when analyzing the details, we have a mental map of which part of the stack, and which path, the current processing belongs to.
I plan to split the follow-up content into the following major parts:
- the network programming interfaces and their handling in the kernel;
- the protocol stack's transmit path;
- the processing details at each layer of the protocol stack.