用户态的程序:
#include
#include
#include
#include
#include
#define length 40
int main()
{
struct sockaddr_in serv;
char buff[length];
int sockfd,n,i;
for(i=0;i
该程序是一个简单的udp程序,向服务器发送时间信息请求,如果服务器开启了相应的服务,
就会返回类似的输出:buff=07 DEC 2013 11:40:40 CST
本文主要分析sendto函数具体做了什么。
本机的信息:ip:192.168.1.109 mac:1c:65:9d:2c:fe:f7
通过路由器连入网络,路由器信息:ip:192.168.1.1 mac:00:23:cd:5b:ea:d6
发送的包有40个字节的信息,内容为0x35到0x5c
首先通过tcpdump查看这种组合下发出去的包的内容,先有个直观的感觉:
11:31:39.553588 IP 192.168.1.109.55527 > 115.239.210.27.13: UDP, length 40
0x0000: 0023 cd5b ead6 1c65 9d2c fef7 0800 4500 .#.[...e.,....E.
0x0010: 0044 0000 4000 4011 3289 c0a8 016d 73ef .D..@[email protected].
0x0020: d21b d8e7 000d 0030 78bf 3536 3738 393a .......0x.56789:
0x0030: 3b3c 3d3e 3f40 4142 4344 4546 4748 494a ;<=>?@ABCDEFGHIJ
0x0040: 4b4c 4d4e 4f50 5152 5354 5556 5758 595a KLMNOPQRSTUVWXYZ
0x0050: 5b5c [\
要读懂这串信息,需要知道L2 L3 L4的头结构
ethernet header长度为14字节
ip header长度为20字节(不考虑option)
udp header 长度为8个字节
tcpdump抓到的总共为82个字节(L2:14 + L3:20 + L4:8 + LOAD:40)
用户态的代码中只指定了服务器的ip地址和端口号以及负载的内容,通过内核相应的处理后,填充了L2,L3以及L4的头结构,
其中包含本地的ip地址,MAC地址,本地端口号,路由器的MAC地址,以及类型相关的内容。
后面这些分量的赋值涉及linux内核的整个网络协议。
首先略过整个内核协议栈,手动构造发送的包,看需要做什么操作才能把包发出去。
内核调用dev_queue_xmit(skb)函数实现发包流程,因此手动构造skb包,然后调用该函数,看能否正常发出
static int create_init(void)
{
struct net_device *dev,*dev_tmp;
int hh_len;
int alloc_len;
char *data_addr;
struct iphdr *iph;
struct ethhdr *eth;
struct udphdr *uh;
struct sk_buff *skb;
for_each_netdev(&init_net, dev)
if(strncmp(dev->name,"wlan0",5)==0)
dev=dev_tmp;
hh_len= LL_RESERVED_SPACE(dev);
init_dest_mac(dest_mac);
/* based on __ip_append_data */
alloc_len=LOAD_SIZE+L4_SIZE+L3_SIZE+hh_len+15;
skb=alloc_skb(alloc_len, GFP_ATOMIC);
if(!skb)
return -1;
skb->dev=dev;
skb_reserve(skb, hh_len);//date+=hh_len;tail+=hh_len
skb_put(skb,LOAD_SIZE+L4_SIZE+L3_SIZE);//tal+= ; skb->len+=
skb_set_network_header(skb, 0);
skb->transport_header = (skb->network_header + L3_SIZE);
data_addr=skb->data+L3_SIZE+L4_SIZE;
skb_fill_load(data_addr,LOAD_SIZE);
/* based on __ip_make_skb */
iph = (struct iphdr *)skb->data;
iph->version = 4;
iph->ihl = 5;
iph->tos = 0;
iph->frag_off = htons(IP_DF);
iph->ttl = 0x40;
iph->protocol = IPPROTO_UDP; //0x11
iph->tot_len=htons(0x44);
iph->saddr = SOURCE_IP;
iph->daddr = DEST_IP;
iph->id=0;
/* based on udp_send_skb */
uh = udp_hdr(skb);
uh->source = htons(SOURCE_PORT);
uh->dest = htons(DEST_PORT);
uh->len = htons(skb->len - L3_SIZE);
uh->check = 0;
/* based on eth_header */
eth = (struct ethhdr *)skb_push(skb, L2_SIZE);//skb->data -= ;skb->len +=
eth->h_proto =htons(ETH_P_IP); //0x0800
memcpy(eth->h_source, dev->dev_addr, ETH_ALEN);
memcpy(eth->h_dest, dest_mac, ETH_ALEN);
dev_queue_xmit(skb);
return 0;
}
这样构造的包是能够正常发送的,因此我们主要关注协议栈是如何自动完成上面模块中手动参与制定的部分。
1:net_device如何选择,用户态代码中只有目的ip,如何决定使用哪个net_device,系统中可能有多个网卡。
通过jprobe可以确定源IP以及net_device是在udp_sendmsg函数中的这段代码中实现的:
if (rt == NULL) {
struct net *net = sock_net(sk);
fl4 = &fl4_stack;
flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
RT_SCOPE_UNIVERSE, sk->sk_protocol,
inet_sk_flowi_flags(sk)|FLOWI_FLAG_CAN_SLEEP,
faddr, saddr, dport, inet->inet_sport);
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
rt = NULL;
if (err == -ENETUNREACH)
IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
goto out;
}
err = -EACCES;
if ((rt->rt_flags & RTCF_BROADCAST) &&
!sock_flag(sk, SOCK_BROADCAST))
goto out;
if (connected)
sk_dst_set(sk, dst_clone(&rt->dst));
}
struct flowi4 {
struct flowi_common __fl_common;
#define flowi4_oif __fl_common.flowic_oif // 0
#define flowi4_iif __fl_common.flowic_iif // 发包流程,该字段为0
#define flowi4_mark __fl_common.flowic_mark // 0
#define flowi4_tos __fl_common.flowic_tos // 0
#define flowi4_scope __fl_common.flowic_scope // RT_SCOPE_UNIVERSE
#define flowi4_proto __fl_common.flowic_proto // IPPROTO_UDP
#define flowi4_flags __fl_common.flowic_flags
#define flowi4_secid __fl_common.flowic_secid //0
__be32 daddr;
__be32 saddr;
union flowi_uli uli;
#define fl4_sport uli.ports.sport
#define fl4_dport uli.ports.dport
#define fl4_icmp_type uli.icmpt.type
#define fl4_icmp_code uli.icmpt.code
#define fl4_ipsec_spi uli.spi
#define fl4_mh_type uli.mht.type
#define fl4_gre_key uli.gre_key
} __attribute__((__aligned__(BITS_PER_LONG/8)));
接着以flowi4为参数进行路由查找:
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
struct rtable *rth;
unsigned int hash;
if (!rt_caching(net))
goto slow_output;
hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
rcu_read_lock_bh();
for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
rth = rcu_dereference_bh(rth->dst.rt_next)) {
if (rth->rt_key_dst == flp4->daddr &&
rth->rt_key_src == flp4->saddr &&
rt_is_output_route(rth) &&
rth->rt_oif == flp4->flowi4_oif &&
rth->rt_mark == flp4->flowi4_mark &&
!((rth->rt_key_tos ^ flp4->flowi4_tos) &
(IPTOS_RT_MASK | RTO_ONLINK)) &&
net_eq(dev_net(rth->dst.dev), net) &&
!rt_is_expired(rth)) {
dst_use(&rth->dst, jiffies);
RT_CACHE_STAT_INC(out_hit);
rcu_read_unlock_bh();
if (!flp4->saddr)
flp4->saddr = rth->rt_src;
if (!flp4->daddr)
flp4->daddr = rth->rt_dst;
return rth;
}
RT_CACHE_STAT_INC(out_hlist_search);
}
rcu_read_unlock_bh();
slow_output:
return ip_route_output_slow(net, flp4);
}
首先从cache中查找,查不到再走ip_route_output_slow流程
cache查找比较简单,利用flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net)这四个字段生成hash值,
根据这个hash值去rt_hash_table全局变量对应的hash表中查找,对比需要比较的字段,满足则返回缓存项。
由于route cache对性能的影响不确定,在linux-3.6内核以后被删除:
http://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=89aef8921bfbac22f00e04f8450f6e447db13e42
struct rtable {
struct dst_entry dst;
/* Lookup key. */
__be32 rt_key_dst;
__be32 rt_key_src;
int rt_genid;
unsigned rt_flags;
__u16 rt_type;
__u8 rt_key_tos;
__be32 rt_dst; /* Path destination */
__be32 rt_src; /* Path source */
int rt_route_iif;
int rt_iif;
int rt_oif;
__u32 rt_mark;
/* Info on neighbour */
__be32 rt_gateway;
/* Miscellaneous cached information */
__be32 rt_spec_dst; /* RFC1122 specific destination */
u32 rt_peer_genid;
struct inet_peer *peer; /* long-living peer info */
struct fib_info *fi; /* for client ref to shared metrics */
};
重点关注ip_route_output_slow(net, flp4)函数,其调用fib_lookup(net, fl4, &res)函数进行路由寻找:
int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
{
struct fib_lookup_arg arg = {
.result = res,
.flags = FIB_LOOKUP_NOREF,
};
int err;
err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg);
res->r = arg.rule;
return err;
}
在初始化fib4_rules_init函数中ipv4.rules_ops赋值为fib4_rules_ops_template
static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = {
.family = AF_INET,
.rule_size = sizeof(struct fib4_rule),
.addr_size = sizeof(u32),
.action = fib4_rule_action,
.match = fib4_rule_match,
.configure = fib4_rule_configure,
.compare = fib4_rule_compare,
.fill = fib4_rule_fill,
.default_pref = fib_default_rule_pref,
.nlmsg_payload = fib4_rule_nlmsg_payload,
.flush_cache = fib4_rule_flush_cache,
.nlgroup = RTNLGRP_IPV4_RULE,
.policy = fib4_rule_policy,
.owner = THIS_MODULE,
};
同时插入三种表对应的路由规则,通过ip命令查看:
root:/home/# ip rule list
0: from all lookup local
32766: from all lookup main
32767: from all lookup default
static int fib_default_rules_init(struct fib_rules_ops *ops)
{
int err;
err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, 0);
if (err < 0)
return err;
err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0);
if (err < 0)
return err;
err = fib_default_rule_add(ops, 0x7FFF, RT_TABLE_DEFAULT, 0);
if (err < 0)
return err;
return 0;
}
int fib_default_rule_add(struct fib_rules_ops *ops,
u32 pref, u32 table, u32 flags)
{
struct fib_rule *r;
r = kzalloc(ops->rule_size, GFP_KERNEL);
if (r == NULL)
return -ENOMEM;
atomic_set(&r->refcnt, 1);
r->action = FR_ACT_TO_TBL;
r->pref = pref;
r->table = table;
r->flags = flags;
r->fr_net = hold_net(ops->fro_net);
/* The lock is not required here, the list in unreacheable
* at the moment this function is called */
list_add_tail(&r->list, &ops->rules_list);
return 0;
}
函数fib_default_rule_add只赋值了很少的分量,因此fib_rule_match总是返回1
int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl,
int flags, struct fib_lookup_arg *arg)
{
struct fib_rule *rule;
int err;
rcu_read_lock();
list_for_each_entry_rcu(rule, &ops->rules_list, list) {
jumped:
if (!fib_rule_match(rule, ops, fl, flags)) //返回1
continue;
if (rule->action == FR_ACT_GOTO) {
struct fib_rule *target;
target = rcu_dereference(rule->ctarget);
if (target == NULL) {
continue;
} else {
rule = target;
goto jumped;
}
} else if (rule->action == FR_ACT_NOP)
continue;
else
err = ops->action(rule, fl, flags, arg); //fib4_rule_action,对于非本地的ip,都需要先local再到main查找
if (err != -EAGAIN) {
if ((arg->flags & FIB_LOOKUP_NOREF) ||
likely(atomic_inc_not_zero(&rule->refcnt))) {
arg->rule = rule;
goto out;
}
break;
}
}
err = -ESRCH;
out:
rcu_read_unlock();
return err;
}
根据规则找到相应的表:
static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
int flags, struct fib_lookup_arg *arg)
{
int err = -EAGAIN;
struct fib_table *tbl;
switch (rule->action) {
case FR_ACT_TO_TBL:
break;
case FR_ACT_UNREACHABLE:
err = -ENETUNREACH;
goto errout;
case FR_ACT_PROHIBIT:
err = -EACCES;
goto errout;
case FR_ACT_BLACKHOLE:
default:
err = -EINVAL;
goto errout;
}
tbl = fib_get_table(rule->fr_net, rule->table);
if (!tbl)
goto errout;
err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *) arg->result, arg->flags);
if (err > 0)
err = -EAGAIN;
errout:
return err;
}
fib_get_table函数根据编译选项的不同,实现方式不一样,CONFIG_IP_MULTIPLE_TABLES
http://www.compendium.com.ar/policy-routing.txt
http://www.linuxjournal.com/content/linux-advanced-routing-tutorial
struct fib_table *fib_get_table(struct net *net, u32 id)
{
struct fib_table *tb;
struct hlist_node *node;
struct hlist_head *head;
unsigned int h;
if (id == 0)
id = RT_TABLE_MAIN;
h = id & (FIB_TABLE_HASHSZ - 1);
rcu_read_lock();
head = &net->ipv4.fib_table_hash[h];
hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
if (tb->tb_id == id) {
rcu_read_unlock();
return tb;
}
}
rcu_read_unlock();
return NULL;
}
enum rt_class_t {
RT_TABLE_UNSPEC=0,
/* User defined values */
RT_TABLE_COMPAT=252,
RT_TABLE_DEFAULT=253,
RT_TABLE_MAIN=254,
RT_TABLE_LOCAL=255,
RT_TABLE_MAX=0xFFFFFFFF
};
通过ip命令可以增加其他的table,以及显示特定的table:
root@ # ip route add 10.2.0.0/16 via 10.2.2.1 table 1
root@ # ip route list table local
broadcast 10.0.0.0 dev eth0 proto kernel scope link src 10.2.1.2
local 10.2.1.2 dev eth0 proto kernel scope host src 10.2.1.2
broadcast 10.255.255.255 dev eth0 proto kernel scope link src 10.2.1.2
broadcast 127.0.0.0 dev lo proto kernel scope link src 127.0.0.1
local 127.0.0.0/8 dev lo proto kernel scope host src 127.0.0.1
local 127.0.0.1 dev lo proto kernel scope host src 127.0.0.1
broadcast 127.255.255.255 dev lo proto kernel scope link src 127.0.0.1
broadcast 192.168.1.0 dev wlan0 proto kernel scope link src 192.168.1.109
local 192.168.1.109 dev wlan0 proto kernel scope host src 192.168.1.109
broadcast 192.168.1.255 dev wlan0 proto kernel scope link src 192.168.1.109
root@ # ip route list table main
default via 192.168.1.1 dev wlan0 proto static
10.0.0.0/8 dev eth0 proto kernel scope link src 10.2.1.2
192.168.1.0/24 dev wlan0 proto kernel scope link src 192.168.1.109 metric 9
root@ # ip route list table default
root@ # ip route list table 1
10.2.0.0/16 via 10.2.2.1 dev eth0
root@ #
对于本文的用户程序,由于目的IP不在local地址范围内,因此会选择table main
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
struct fib_result *res, int fib_flags)
该函数以flp为参数到tb对应的表格中寻找,找到的结果放入res中
res->prefixlen = li->plen;
res->nh_sel = nhsel;
res->type = fa->fa_type; //RTN_UNICAST
res->scope = fa->fa_info->fib_scope;
res->fi = fi;
res->table = tb;
res->fa_head = &li->falh;
根据找到的ret进行处理;
if (res.type == RTN_LOCAL) {
if (!fl4->saddr) {
if (res.fi->fib_prefsrc)
fl4->saddr = res.fi->fib_prefsrc;
else
fl4->saddr = fl4->daddr;
}
dev_out = net->loopback_dev;
fl4->flowi4_oif = dev_out->ifindex;
res.fi = NULL;
flags |= RTCF_LOCAL;
goto make_route;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
fib_select_multipath(&res);
else
#endif
if (!res.prefixlen &&
res.table->tb_num_default > 1 &&
res.type == RTN_UNICAST && !fl4->flowi4_oif)
fib_select_default(&res);
if (!fl4->saddr)
fl4->saddr = FIB_RES_PREFSRC(net, res);
dev_out = FIB_RES_DEV(res);
fl4->flowi4_oif = dev_out->ifindex;
根据res生成rtable,并且插入hash表中,rt_intern_hash函数中完成路由ip和mac地址的绑定
make_route:
rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
dev_out, flags);
if (!IS_ERR(rth)) {
unsigned int hash;
hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
rt_genid(dev_net(dev_out)));
rth = rt_intern_hash(hash, rth, NULL, orig_oif);
}
根据ip地址从neighbour子系统中寻找对应的neighbour数据结构,然后赋值给dst
static int rt_bind_neighbour(struct rtable *rt)
{
struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
if (IS_ERR(n))
return PTR_ERR(n);
dst_set_neighbour(&rt->dst, n);
return 0;
}