本篇讨论IP包的收发(暂不包括路由)
先来看inet_init,
首先是调用proto_register,注册了tcp_prot, udp_prot, raw_prot,其中proto_register前半部分是初始化各种slab_cache,后半部分把这些struct proto结构链到proto_list里
其次调用sock_register,内核有一个全局的net_proto_family结构的net_families数组,inet_init调用sock_register就是把inet_family_ops加到net_families[PF_INET]中,inet_family_ops结构如下
/*
 * Address-family descriptor for PF_INET.
 * Per the surrounding notes, inet_init() passes this object to
 * sock_register(), which records it in the global net_families array.
 * .create names inet_create as the creation callback for this family
 * (presumably invoked when a PF_INET socket is created -- confirm in
 * the socket() path).
 */
static struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create,
.owner = THIS_MODULE,
};
接着调用inet_add_protocol,去填充inet_protos数组,inet_protos是一个全局的指针数组,其定义如下:
/*
 * Global table of pointers to net_protocol descriptors, with
 * MAX_INET_PROTOS slots. Per the surrounding notes it is populated
 * through inet_add_protocol() during inet_init().
 */
const struct net_protocol *inet_protos[MAX_INET_PROTOS] ____cacheline_aligned_in_smp;
可以看出数组最大长度MAX_INET_PROTOS为256,in.h里对所有的协议做了定义
/*
 * Standard well-defined IP protocols.
 *
 * Each enumerator is an L4 protocol number. Note that the entries are
 * not listed in strictly ascending order (IPPROTO_GRE = 47 appears
 * before IPPROTO_IPV6 = 41). IPPROTO_MAX has no explicit value, so it
 * takes the value following IPPROTO_RAW, i.e. 256.
 */
enum {
IPPROTO_IP = 0, /* Dummy protocol for TCP */
IPPROTO_ICMP = 1, /* Internet Control Message Protocol */
IPPROTO_IGMP = 2, /* Internet Group Management Protocol */
IPPROTO_IPIP = 4, /* IPIP tunnels (older KA9Q tunnels use 94) */
IPPROTO_TCP = 6, /* Transmission Control Protocol */
IPPROTO_EGP = 8, /* Exterior Gateway Protocol */
IPPROTO_PUP = 12, /* PUP protocol */
IPPROTO_UDP = 17, /* User Datagram Protocol */
IPPROTO_IDP = 22, /* XNS IDP protocol */
IPPROTO_DCCP = 33, /* Datagram Congestion Control Protocol */
IPPROTO_RSVP = 46, /* RSVP protocol */
IPPROTO_GRE = 47, /* Cisco GRE tunnels (rfc 1701,1702) */
IPPROTO_IPV6 = 41, /* IPv6-in-IPv4 tunnelling */
IPPROTO_ESP = 50, /* Encapsulation Security Payload protocol */
IPPROTO_AH = 51, /* Authentication Header protocol */
IPPROTO_BEETPH = 94, /* IP option pseudo header for BEET */
IPPROTO_PIM = 103, /* Protocol Independent Multicast */
IPPROTO_COMP = 108, /* Compression Header protocol */
IPPROTO_SCTP = 132, /* Stream Control Transport Protocol */
IPPROTO_UDPLITE = 136, /* UDP-Lite (RFC 3828) */
IPPROTO_RAW = 255, /* Raw IP packets */
IPPROTO_MAX /* one past the largest protocol number above (256) */
};
inet_init里只往inet_protos里注册了ICMP, IGMP, TCP, UDP这几种协议,以TCP为例,其net_protocol定义为
/*
 * net_protocol descriptor for TCP, the entry registered in inet_protos.
 * Per the surrounding notes, the IP layer (e.g. ip_local_deliver_finish)
 * looks up the descriptor for a packet's protocol and calls ->handler,
 * which for TCP is tcp_v4_rcv.
 */
static const struct net_protocol tcp_protocol = {
.handler = tcp_v4_rcv, /* receive entry point called by the IP layer */
.err_handler = tcp_v4_err, /* presumably the error callback -- not discussed in these notes */
/* GSO/GRO callbacks (going by the field names); not covered here. */
.gso_send_check = tcp_v4_gso_send_check,
.gso_segment = tcp_tso_segment,
.gro_receive = tcp4_gro_receive,
.gro_complete = tcp4_gro_complete,
.no_policy = 1,
.netns_ok = 1,
};
IP层在把报文往上送的时候,e.g. ip_local_deliver_finish,实际上就是根据IP头里的protocol字段(iph->protocol,注意不是skb->protocol,后者保存的是L3协议类型,如ETH_P_IP)在inet_protos里找到对应的net_protocol结构,然后调用net_protocol->handler函数,e.g. 如果是TCP协议的skb,这时就调用tcp_v4_rcv
下面开始初始化inetsw数组以及inetsw_array数组,inetsw是个list_head数组,每个索引代表了一种socket类型(即socket(2)的第二个参数,由四层决定的),如 SOCK_STREAM, SOCK_DGRAM, SOCK_RAW等,定义如下
/*
 * Socket types. Per the surrounding notes, each value is used as an
 * index into the inetsw array of list heads. The values are not
 * contiguous: nothing is defined for 7-9 and SOCK_PACKET is 10.
 */
enum sock_type {
SOCK_STREAM = 1,
SOCK_DGRAM = 2,
SOCK_RAW = 3,
SOCK_RDM = 4,
SOCK_SEQPACKET = 5,
SOCK_DCCP = 6,
SOCK_PACKET = 10,
};
inetsw_array数组是一个inet_protosw类型的数组,定义如下
/*
 * Built-in socket-interface table for IPv4: one inet_protosw entry per
 * (socket type, L4 protocol) pair. Each entry binds a struct proto
 * (.prot) and a proto_ops table (.ops) to that pair.
 */
static struct inet_protosw inetsw_array[] =
{
/* SOCK_STREAM / TCP: permanent entry, flagged as an inet_connection_sock. */
{
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.no_check = 0,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},
/* SOCK_DGRAM / UDP: permanent entry. */
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_PERMANENT,
},
/* SOCK_RAW: protocol is the IPPROTO_IP wildcard; not marked permanent. */
{
.type = SOCK_RAW,
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
}
};
而inet_protosw定义如下
/*
 * This is used to register socket interfaces for IP protocols.
 * One instance describes a single (type, protocol) combination and the
 * proto / proto_ops implementations that serve it.
 */
struct inet_protosw {
/* List linkage; presumably chains this entry onto the inetsw list for its type -- confirm in inet_register_protosw. */
struct list_head list;
/* These two fields form the lookup key. */
unsigned short type; /* This is the 2nd argument to socket(2). */
unsigned short protocol; /* This is the L4 protocol number. */
struct proto *prot; /* e.g. &tcp_prot, &udp_prot, &raw_prot */
const struct proto_ops *ops; /* e.g. &inet_stream_ops, &inet_dgram_ops */
char no_check; /* checksum on rcv/xmit/none? */
unsigned char flags; /* See INET_PROTOSW_* below. */
};
/* Bit flags for inet_protosw.flags; they may be OR-ed together. */
#define INET_PROTOSW_REUSE 0x01 /* Are ports automatically reusable? */
#define INET_PROTOSW_PERMANENT 0x02 /* Permanent protocols are unremovable. */
#define INET_PROTOSW_ICSK 0x04 /* Is this an inet_connection_sock? */
可以看出,inet_protosw的list成员就是用来把该结构挂到inetsw[type]这条链表上的list_head节点(inet_init会把inetsw_array里的每一项按type注册到对应的inetsw链表中)
最后是分别调用 arp_init, ip_init, tcp_v4_init, tcp_init, udp_init 等,这里略过了
下面来谈IP协议,这里我们略过IP option部分,因为实际应用的网络几乎不会有IP option出现,先看IP头部
struct iphdr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u8 ihl:4,
version:4;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u8 version:4,
ihl:4;
#else
#error "Please fix
#endif
__u8 tos;
__be16 tot_len;
__be16 id;
__be16 frag_off;
__u8 ttl;
__u8 protocol;
__sum16 check;
__be32 saddr;
__be32 daddr;
/*The options start here. */
};
ihl单位是4字节,一般而言(不带option时)IP头长度是20字节,因此这个值一般是5
tot_len单位是字节
id一般用于IP的分段/组合,同一IP包的所有分段其ID值是相同的
protocol表示4层协议值
check是IP首部的校验和
sk_buff 结构中,skb->csum保存了L4的校验和,skb->ip_summed表示校验和的状态
CHECKSUM_NONE,表示L4校验和无效,需要重新计算
CHECKSUM_HW(较新的内核中已被拆分为接收方向的CHECKSUM_COMPLETE和发送方向的CHECKSUM_PARTIAL),表示网卡已经计算出了L4校验和并保存在skb->csum中,但程序仍需要据此再次验证L4校验和
CHECKSUM_UNNECESSARY,表示L4校验和无需验证
/*
 * packet_type registration for IPv4 (ETH_P_IP, stored in big-endian
 * form). Per the surrounding notes, L2 uses this entry to find ip_rcv
 * and so hand received frames up to L3.
 */
static struct packet_type ip_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IP),
.func = ip_rcv, /* L3 receive entry point */
/* GSO/GRO callbacks (going by the field names); not covered here. */
.gso_send_check = inet_gso_send_check,
.gso_segment = inet_gso_segment,
.gro_receive = inet_gro_receive,
.gro_complete = inet_gro_complete,
};
L2层通过ip_packet_type找到ip_rcv函数,从而把报文传到L3,下面分析下ip_rcv 函数:
/* When the interface is in promisc. mode, drop all the crap
* that it receives, do not try to analyse it.
*/
if (skb->pkt_type == PACKET_OTHERHOST)
goto drop;
IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);
if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
goto out;
}
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto inhdr_error;
iph = ip_hdr(skb);
如果skb是通过混杂模式获取的且不是发往本机的,直接丢弃;如果skb是share的,调用skb_share_check复制一份出来处理;pskb_may_pull这个函数比较复杂,其目的是,确保在skb->data开始的线性内存里面至少有 iphdr 的内容(这里要提下sk_buff这个结构的复杂性就在于:真正的报文内容很多情况下并不全都存放在skb->data所指向的线性内存中。通常情况下,这块线性内存的末尾会紧跟着一个skb_shared_info结构:如果IP包没有分片的话,这里描述的是scatter-gather方式存放的报文内容,这些内容是分散在各个不同的内存页中的,用一个 skb_frag_t 数组frags表示,nr_frags里保存了数组中元素的个数;如果IP包存在分片的话,可以看到还有一个sk_buff的链表frag_list,里面就是各个分片的skb),如果skb->data开始的线性内存里数据不够,pskb_may_pull会扩充这个skb的线性内存区,然后把frags或者frag_list里的IP头内容拷出来填到skb线性内存里
if (iph->ihl < 5 || iph->version != 4)
goto inhdr_error;
if (!pskb_may_pull(skb, iph->ihl*4))
goto inhdr_error;
iph = ip_hdr(skb);
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
goto inhdr_error;
len = ntohs(iph->tot_len);
if (skb->len < len) {
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
} else if (len < (iph->ihl*4))
goto inhdr_error;
这段代码基本都是做一些check,略过了
/* Our transport medium may have padded the buffer out. Now we know it
* is IP we can trim to the true length of the frame.
* Note this now means skb->len holds ntohs(iph->tot_len).
*/
if (pskb_trim_rcsum(skb, len)) {
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
goto drop;
}
pskb_trim_rcsum用于去掉L2用来padding的部分,并重新计算checksum,了解下就行了
return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL,
ip_rcv_finish);
最后走一遍netfilter,如果不被DROP或啥的,进入ip_rcv_finish
/*
 * ip_rcv_finish - continuation of IPv4 receive, entered from ip_rcv()
 * through the NF_INET_PRE_ROUTING hook.
 *
 * Makes sure the skb has a route attached, processes IP options when
 * the header carries any, updates multicast/broadcast input counters,
 * and then passes the packet to dst_input().
 *
 * Returns whatever dst_input() returns, or NET_RX_DROP after freeing
 * the skb when routing or option processing fails.
 */
static int ip_rcv_finish(struct sk_buff *skb)
{
	const struct iphdr *hdr = ip_hdr(skb);
	struct rtable *route;

	/* No dst attached yet: resolve the input route from the header. */
	if (!skb_dst(skb)) {
		int rc = ip_route_input(skb, hdr->daddr, hdr->saddr, hdr->tos,
					skb->dev);

		if (unlikely(rc)) {
			/* Only these two failures are accounted in the MIB. */
			switch (rc) {
			case -EHOSTUNREACH:
				IP_INC_STATS_BH(dev_net(skb->dev),
						IPSTATS_MIB_INADDRERRORS);
				break;
			case -ENETUNREACH:
				IP_INC_STATS_BH(dev_net(skb->dev),
						IPSTATS_MIB_INNOROUTES);
				break;
			default:
				break;
			}
			goto drop;
		}
	}

	/* ihl above 5 means the header is longer than 20 bytes: options. */
	if (hdr->ihl > 5) {
		if (ip_rcv_options(skb))
			goto drop;
	}

	route = skb_rtable(skb);
	switch (route->rt_type) {
	case RTN_MULTICAST:
		IP_UPD_PO_STATS_BH(dev_net(route->u.dst.dev),
				   IPSTATS_MIB_INMCAST, skb->len);
		break;
	case RTN_BROADCAST:
		IP_UPD_PO_STATS_BH(dev_net(route->u.dst.dev),
				   IPSTATS_MIB_INBCAST, skb->len);
		break;
	default:
		break;
	}

	return dst_input(skb);

drop:
	kfree_skb(skb);
	return NET_RX_DROP;
}
ip_rcv_finish首先调用ip_route_input获取目的地路由,关于路由的部分放到以后说,这里通过本地路由表,会得知这个包究竟是应该本地接收还是给转发出去,ip_route_input会把路由信息存到 (struct dst_entry *)skb->_skb_dst 中,而这个dst_entry->input 的函数指针究竟指向ip_local_deliver还是ip_forward是在ip_route_input_slow里决定的(ip_route_input_slow由ip_route_input调用)
ip_route_input_slow中,先调用fib_lookup查找路由表:如果查到的是广播或本地路由(RTN_BROADCAST/RTN_LOCAL),则跳到brd_input/local_input,表示是本地接收;否则(转发路由)调用ip_mkroute_input。ip_mkroute_input会调用__mkroute_input,里面会调用dst_alloc创建一个rtable,并设置rth->u.dst.input = ip_forward,代码段如下:
rth = dst_alloc(&ipv4_dst_ops);
if (!rth) {
err = -ENOBUFS;
goto cleanup;
}
atomic_set(&rth->u.dst.__refcnt, 1);
rth->u.dst.flags= DST_HOST;
if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
rth->u.dst.flags |= DST_NOPOLICY;
if (IN_DEV_CONF_GET(out_dev, NOXFRM))
rth->u.dst.flags |= DST_NOXFRM;
rth->fl.fl4_dst = daddr;
rth->rt_dst = daddr;
rth->fl.fl4_tos = tos;
rth->fl.mark = skb->mark;
rth->fl.fl4_src = saddr;
rth->rt_src = saddr;
rth->rt_gateway = daddr;
rth->rt_iif =
rth->fl.iif = in_dev->dev->ifindex;
rth->u.dst.dev = (out_dev)->dev;
dev_hold(rth->u.dst.dev);
rth->idev = in_dev_get(rth->u.dst.dev);
rth->fl.oif = 0;
rth->rt_spec_dst= spec_dst;
rth->u.dst.input = ip_forward;
rth->u.dst.output = ip_output;
rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
rt_set_nexthop(rth, res, itag);
rth->rt_flags = flags;
如果是broadcast input, 或者local_input,会走进如下代码段:
local_input:
rth = dst_alloc(&ipv4_dst_ops);
if (!rth)
goto e_nobufs;
rth->u.dst.output= ip_rt_bug;
rth->rt_genid = rt_genid(net);
atomic_set(&rth->u.dst.__refcnt, 1);
rth->u.dst.flags= DST_HOST;
if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
rth->u.dst.flags |= DST_NOPOLICY;
rth->fl.fl4_dst = daddr;
rth->rt_dst = daddr;
rth->fl.fl4_tos = tos;
rth->fl.mark = skb->mark;
rth->fl.fl4_src = saddr;
rth->rt_src = saddr;
#ifdef CONFIG_NET_CLS_ROUTE
rth->u.dst.tclassid = itag;
#endif
rth->rt_iif =
rth->fl.iif = dev->ifindex;
rth->u.dst.dev = net->loopback_dev;
dev_hold(rth->u.dst.dev);
rth->idev = in_dev_get(rth->u.dst.dev);
rth->rt_gateway = daddr;
rth->rt_spec_dst= spec_dst;
rth->u.dst.input= ip_local_deliver;
rth->rt_flags = flags|RTCF_LOCAL;
if (res.type == RTN_UNREACHABLE) {
rth->u.dst.input= ip_error;
rth->u.dst.error= -err;
rth->rt_flags &= ~RTCF_LOCAL;
}
rth->rt_type = res.type;
hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
err = rt_intern_hash(hash, rth, NULL, skb);
goto done;