Linux kernel version: 5.4.1
"Netfilter" here refers specifically to the netfilter framework inside the kernel.
"iptables" refers to the user-space configuration tool.
The most commonly used firewall tool on Linux is iptables. iptables works by interacting with the packet-filtering hooks in the protocol stack; these kernel hooks make up the netfilter framework.
Every packet entering the network subsystem (inbound or outbound) triggers these hooks as it traverses the stack, and programs can process traffic at key points in its path by registering hook functions there. The iptables kernel modules register handlers at these hook points, which is how configuring iptables rules makes traffic conform to the firewall policy.
iptables/netfilter organizes rules into tables according to **what they are used for**:
Table | Purpose |
---|---|
Filter | Decides whether a packet is allowed through; the most commonly used table. |
NAT | Implements network address translation rules, typically used to route packets to networks that cannot be reached directly. |
Mangle | Modifies a packet's IP header; for example, it can change the TTL. |
Raw | Exempts packets from connection tracking. |
Security | Sets SELinux marks on packets, affecting how SELinux (or other systems that interpret SELinux security contexts) handles them. |
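To make the division of labor concrete, here is roughly how the tables map onto everyday iptables invocations (a usage sketch; interfaces, ports and addresses are placeholders, and the TTL and NOTRACK extensions are assumed to be available):
# Filter: decide whether packets pass
iptables -t filter -A INPUT -p tcp --dport 22 -j ACCEPT
# NAT: rewrite addresses, e.g. masquerade traffic leaving eth0
iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE
# Mangle: modify the IP header, e.g. the TTL
iptables -t mangle -A PREROUTING -j TTL --ttl-set 64
# Raw: exempt traffic from connection tracking
iptables -t raw -A PREROUTING -p udp --dport 53 -j NOTRACK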
Within each table, rules are further organized into chains; the built-in chains are triggered by the built-in hooks. A chain essentially determines **when** its rules are evaluated.
Chain | Hook macro |
---|---|
PREROUTING | NF_INET_PRE_ROUTING |
INPUT | NF_INET_LOCAL_IN |
FORWARD | NF_INET_FORWARD |
OUTPUT | NF_INET_LOCAL_OUT |
POSTROUTING | NF_INET_POST_ROUTING |
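The hook macros in the table are simply the members of enum nf_inet_hooks:
/* File: linux-5.4.1\include\uapi\linux\netfilter.h */
enum nf_inet_hooks {
NF_INET_PRE_ROUTING,
NF_INET_LOCAL_IN,
NF_INET_FORWARD,
NF_INET_LOCAL_OUT,
NF_INET_POST_ROUTING,
NF_INET_NUMHOOKS
};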
The chains present in each table differ: iptables/netfilter does different work at different points in a packet's path, as each table's valid-hooks definition shows.
/* File: linux-5.4.1\net\ipv4\netfilter\iptable_filter.c */
#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
(1 << NF_INET_FORWARD) | \
(1 << NF_INET_LOCAL_OUT))
/* File: linux-5.4.1\net\ipv4\netfilter\iptable_nat.c */
static const struct xt_table nf_nat_ipv4_table = {
.name = "nat",
.valid_hooks = (1 << NF_INET_PRE_ROUTING) |
(1 << NF_INET_POST_ROUTING) |
(1 << NF_INET_LOCAL_OUT) |
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
.table_init = iptable_nat_table_init,
};
/* File: linux-5.4.1\net\ipv4\netfilter\iptable_mangle.c */
#define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
(1 << NF_INET_LOCAL_IN) | \
(1 << NF_INET_FORWARD) | \
(1 << NF_INET_LOCAL_OUT) | \
(1 << NF_INET_POST_ROUTING))
/* File: linux-5.4.1\net\ipv4\netfilter\iptable_raw.c */
#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
/* File: linux-5.4.1\net\ipv4\netfilter\iptable_security.c */
#define SECURITY_VALID_HOOKS (1 << NF_INET_LOCAL_IN) | \
(1 << NF_INET_FORWARD) | \
(1 << NF_INET_LOCAL_OUT)
Chain \ Table | Filter | NAT | Mangle | Raw | Security |
---|---|---|---|---|---|
PREROUTING | | Y | Y | Y | |
INPUT | Y | Y | Y | | Y |
FORWARD | Y | | Y | | Y |
OUTPUT | Y | Y | Y | Y | Y |
POSTROUTING | | Y | Y | | |
When a packet triggers a netfilter hook, processing happens in a well-defined order and under specific conditions. Which hooks fire depends on the packet's direction (ingress/egress), the routing decision, and filtering results.
Certain events cause a table's chain to be skipped. For example, only the first packet of a connection is matched against the NAT rules; the verdict on that packet is then applied to every later packet of the connection, and reply packets automatically get the reverse NAT rules applied.
The flow a packet triggers is shown below (ct stands for conntrack):
NF_HOOK is where the protocol code enters the netfilter framework; once the netfilter modules are done, the continuation int (*okfn)(struct net *, struct sock *, struct sk_buff *) carries on with normal processing.
The netfilter modules mainly match the packet against the rules configured through iptables; only if the packet passes all the rules on the chain does the continuation get to run.
/* File:linux-5.4.1\include\linux\netfilter.h */
static inline int
NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *in, struct net_device *out,
int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn);
/* depending on nf_hook()'s return value, decide whether to run the continuation */
if (ret == 1)
ret = okfn(net, sk, skb);
return ret;
}
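iptables is not the only client of these hooks: any kernel module can register its own function at a hook point with nf_register_net_hook(), the same mechanism the iptables tables use underneath. A minimal sketch (my_hook/my_ops are made-up names; assuming a 5.4-era kernel):
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <net/net_namespace.h>

static unsigned int my_hook(void *priv, struct sk_buff *skb,
                            const struct nf_hook_state *state)
{
    return NF_ACCEPT; /* let everything pass; NF_DROP would discard the packet */
}

static struct nf_hook_ops my_ops = {
    .hook     = my_hook,
    .pf       = NFPROTO_IPV4,
    .hooknum  = NF_INET_PRE_ROUTING,
    .priority = NF_IP_PRI_FIRST, /* run before everything else on this hook */
};

static int __init my_init(void)
{
    return nf_register_net_hook(&init_net, &my_ops); /* per-netns registration */
}

static void __exit my_exit(void)
{
    nf_unregister_net_hook(&init_net, &my_ops);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");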
/* File:linux-5.4.1\include\linux\netfilter.h */
/**
* nf_hook - call a netfilter hook
*
* Returns 1 if the hook has allowed the packet to pass. The function
* okfn must be invoked by the caller in this case. Any other return
* value indicates the packet has been consumed by the hook.
*/
static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
struct sock *sk, struct sk_buff *skb,
struct net_device *indev, struct net_device *outdev,
int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
struct nf_hook_entries *hook_head = NULL;
int ret = 1;
#ifdef CONFIG_JUMP_LABEL
if (__builtin_constant_p(pf) &&
__builtin_constant_p(hook) &&
!static_key_false(&nf_hooks_needed[pf][hook]))
return 1;
#endif
rcu_read_lock();
/* dispatch on the network-layer protocol family */
switch (pf) {
case NFPROTO_IPV4:
hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]);
break;
case NFPROTO_IPV6:
hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]);
break;
case NFPROTO_ARP:
#ifdef CONFIG_NETFILTER_FAMILY_ARP
if (WARN_ON_ONCE(hook >= ARRAY_SIZE(net->nf.hooks_arp)))
break;
hook_head = rcu_dereference(net->nf.hooks_arp[hook]);
#endif
break;
case NFPROTO_BRIDGE:
#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
hook_head = rcu_dereference(net->nf.hooks_bridge[hook]);
#endif
break;
#if IS_ENABLED(CONFIG_DECNET)
case NFPROTO_DECNET:
hook_head = rcu_dereference(net->nf.hooks_decnet[hook]);
break;
#endif
default:
WARN_ON_ONCE(1);
break;
}
if (hook_head) {
struct nf_hook_state state;
nf_hook_state_init(&state, hook, pf, indev, outdev,
sk, net, okfn);
ret = nf_hook_slow(skb, &state, hook_head, 0);
// a return of 1 means NF_ACCEPT: okfn() must be invoked by the caller
// -EPERM means NF_DROP: NF_HOOK() will not run okfn()
}
rcu_read_unlock();
return ret;
}
The rule-matching flow inside the hook functions themselves is covered in the "Rule matching flow" subsection.
/* File: linux-5.4.1\net\netfilter\core.c */
/* Returns 1 if okfn() needs to be executed by the caller,
* -EPERM for NF_DROP, 0 otherwise. Caller must hold rcu_read_lock.
*
* nf_hook_slow() iterates over and executes the registered hook functions, i.e. performs rule matching; they run in table-priority order.
*
*/
int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
const struct nf_hook_entries *e, unsigned int s)
{
unsigned int verdict;
int ret;
for (; s < e->num_hook_entries; s++) {
/* Run the corresponding hook function.
* The hooks here are the ones the five iptables tables registered
* at init time:
* 1. Filter:   iptable_filter_hook()
* 2. Mangle:   iptable_mangle_hook()
* 3. NAT:      iptable_nat_do_chain()
* 4. Raw:      iptable_raw_hook()
* 5. Security: iptable_security_hook()
*/
verdict = nf_hook_entry_hookfn(&e->hooks[s], skb, state);
switch (verdict & NF_VERDICT_MASK) {
case NF_ACCEPT:
break;
case NF_DROP:
kfree_skb(skb);
ret = NF_DROP_GETERR(verdict);
if (ret == 0)
ret = -EPERM;
return ret;
case NF_QUEUE:
ret = nf_queue(skb, state, s, verdict);
if (ret == 1)
continue;
return ret;
default:
/* Implicit handling for NF_STOLEN, as well as any other
* non conventional verdicts.
*/
return 0;
}
}
return 1;
}
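The verdicts returned by nf_hook_entry_hookfn() (masked with NF_VERDICT_MASK above) are the standard netfilter responses:
/* File: linux-5.4.1\include\uapi\linux\netfilter.h */
/* Responses from hook functions. */
#define NF_DROP 0
#define NF_ACCEPT 1
#define NF_STOLEN 2
#define NF_QUEUE 3
#define NF_REPEAT 4
#define NF_STOP 5	/* Deprecated, for userspace nf_queue compatibility. */
#define NF_MAX_VERDICT NF_STOP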
/* File: linux-5.4.1\include\uapi\linux\netfilter_ipv4\ip_tables.h */
/* This structure defines each of the firewall rules. Consists of 3
parts which are
1) general IP header stuff
2) match specific stuff
3) the target to perform if the rule matches
The standard matching structure mainly contains:
the packet's source and destination IP,
the input/output interfaces,
masks, and so on.
*/
struct ipt_entry {
/* the IP header fields this rule matches on */
struct ipt_ip ip;
/* Mark with fields that we care about.
* bit vector flagging which parts of the packet this rule cares about
*/
unsigned int nfcache;
/* Size of ipt_entry + matches
Offset of the target area; the target normally follows the match area,
which in turn sits at the end of ipt_entry.
Initialized to sizeof(struct ipt_entry), i.e. assuming no matches.
*/
__u16 target_offset;
/* Size of ipt_entry + matches + target
Offset of the next rule relative to this one, i.e. the total space this
rule occupies. Initialized to sizeof(struct ipt_entry) +
sizeof(struct ipt_target), i.e. no matches.
*/
__u16 next_offset;
/* Back pointer
bit vector recording which hook invoked this rule; can be used to
validate the rule */
unsigned int comefrom;
/* Packet and byte counters.
number of packets and total bytes this rule has processed */
struct xt_counters counters;
/* The matches (if any), then the target.
start of the matches, followed by the target */
unsigned char elems[0];
};
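target_offset and next_offset are what turn a ruleset into one flat blob the kernel can walk by pointer arithmetic. A minimal sketch of such a walk (walk_entries is a hypothetical helper, not in the kernel tree):
static void walk_entries(const void *blob, unsigned int total_size)
{
    unsigned int off = 0;

    while (off < total_size) {
        const struct ipt_entry *e = blob + off;
        /* matches sit in elems[]; the target begins at target_offset */
        const struct xt_entry_target *t = (const void *)e + e->target_offset;

        pr_info("rule at offset %u, target %s\n", off, t->u.user.name);
        off += e->next_offset; /* the next rule follows immediately */
    }
}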
iptables rules are created by the iptables command and appended to the appropriate chain. Once the user has configured a rule, what is passed to the kernel is an ipt_replace structure containing everything the kernel needs.
/* File: linux-5.4.1\include\uapi\linux\netfilter_ipv4\ip_tables.h */
/* The argument to IPT_SO_SET_REPLACE. */
/* After the user configures iptables rules, the kernel receives an ipt_replace structure containing everything it needs */
struct ipt_replace {
/* Which table. The table name */
char name[XT_TABLE_MAXNAMELEN];
/* Which hook entry points are valid: bitmask. You can't
change this. */
unsigned int valid_hooks;
/* Number of entries. The number of new entries */
unsigned int num_entries;
/* Total size of new entries */
unsigned int size;
/* Hook entry points. */
unsigned int hook_entry[NF_INET_NUMHOOKS];
/* Underflow points. */
unsigned int underflow[NF_INET_NUMHOOKS];
/* Information about old entries: the old rule counters */
/* Number of counters (must be equal to current number of entries). */
unsigned int num_counters;
/* The old entries' counters. */
struct xt_counters __user *counters;
/* The entries (hang off end: not really an array). */
/* The rules themselves */
struct ipt_entry entries[0];
};
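From user space, the blob travels through an ordinary setsockopt() call. A minimal sketch of this control path (error handling omitted; repl is assumed to be a fully built blob, and the caller needs CAP_NET_ADMIN):
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/netfilter_ipv4/ip_tables.h>

int push_ruleset(struct ipt_replace *repl, size_t total_len)
{
    int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);

    /* lands in do_ipt_set_ctl() -> do_replace() inside the kernel */
    return setsockopt(fd, IPPROTO_IP, IPT_SO_SET_REPLACE, repl, total_len);
}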
ipt_match is simply an alias for xt_match:
#define ipt_match xt_match
struct xt_match {
struct list_head list;
const char name[XT_EXTENSION_MAXNAMELEN];
u_int8_t revision;
/* Return true or false: return FALSE and set *hotdrop = 1 to
force immediate packet drop. */
/* Arguments changed since 2.6.9, as this must now handle
non-linear skb, using skb_header_pointer and
skb_ip_make_writable. */
/* The match function, the heart of the extension: returns true if the packet matches, false otherwise */
bool (*match)(const struct sk_buff *skb,
struct xt_action_param *);
/* Called when user tries to insert an entry of this type.
Called before a rule using this match is inserted into a table, to validate it; returns 0 on success, and an error return keeps the rule out of iptables. */
int (*checkentry)(const struct xt_mtchk_param *);
/* Called when entry of this type deleted.
Called when an entry containing this match is deleted; paired with checkentry, usable for dynamic memory allocation and release */
void (*destroy)(const struct xt_mtdtor_param *);
#ifdef CONFIG_COMPAT
/* Called when userspace align differs from kernel space one */
void (*compat_from_user)(void *dst, const void *src);
int (*compat_to_user)(void __user *dst, const void *src);
#endif
/* Set this to THIS_MODULE if you are a module, otherwise NULL
i.e. whether this match lives in a module */
struct module *me;
const char *table;
unsigned int matchsize;
unsigned int usersize;
#ifdef CONFIG_COMPAT
unsigned int compatsize;
#endif
unsigned int hooks;
unsigned short proto;
unsigned short family;
};
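A match extension is essentially an xt_match filled in and handed to xt_register_match(). A minimal sketch (a hypothetical "always" match that matches every packet; a real extension would also ship a userspace libxt_* plugin so the iptables command can parse its options):
static bool always_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
    return true; /* match every packet */
}

static struct xt_match always_mt_reg __read_mostly = {
    .name      = "always",
    .revision  = 0,
    .family    = NFPROTO_IPV4,
    .match     = always_mt,
    .matchsize = 0, /* no per-rule match data */
    .me        = THIS_MODULE,
};

/* in module init/exit:
 *     xt_register_match(&always_mt_reg);
 *     xt_unregister_match(&always_mt_reg);
 */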
ipt_target is likewise an alias for xt_target:
#define ipt_target xt_target
/* Registration hooks for targets. */
struct xt_target {
struct list_head list;
const char name[XT_EXTENSION_MAXNAMELEN];
u_int8_t revision;
/* Returns verdict. Argument order changed since 2.6.9, as this
must now handle non-linear skbs, using skb_copy_bits and
skb_ip_make_writable.
The target callback: returns XT_CONTINUE if rule traversal should go on,
otherwise a verdict such as NF_ACCEPT or NF_DROP; the caller decides what to do with the packet based on this return value.
*/
unsigned int (*target)(struct sk_buff *skb,
const struct xt_action_param *);
/* Called when user tries to insert an entry of this type:
hook_mask is a bitmask of hooks from which it can be
called. */
/* Should return 0 on success or an error code otherwise (-Exxxx).
Called before a rule using this target is inserted into a table, to validate it; returns 0 on success, and an error return keeps the rule out of iptables */
int (*checkentry)(const struct xt_tgchk_param *);
/* Called when entry of this type deleted.
Called when a rule containing this target is removed from its table; paired with checkentry, usable for dynamic memory allocation and release */
void (*destroy)(const struct xt_tgdtor_param *);
#ifdef CONFIG_COMPAT
/* Called when userspace align differs from kernel space one */
void (*compat_from_user)(void *dst, const void *src);
int (*compat_to_user)(void __user *dst, const void *src);
#endif
/* Set this to THIS_MODULE if you are a module, otherwise NULL
i.e. whether this target lives in a module (NULL means built-in) */
struct module *me;
const char *table;
unsigned int targetsize;
unsigned int usersize;
#ifdef CONFIG_COMPAT
unsigned int compatsize;
#endif
unsigned int hooks;
unsigned short proto;
unsigned short family;
};
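Target extensions are registered the same way through xt_register_target(). A symmetric sketch (a hypothetical "MYDROP" target; the callback's return value is the verdict, or XT_CONTINUE to keep traversing the rules):
static unsigned int mydrop_tg(struct sk_buff *skb,
                              const struct xt_action_param *par)
{
    return NF_DROP; /* final verdict for this packet */
}

static struct xt_target mydrop_tg_reg __read_mostly = {
    .name       = "MYDROP",
    .revision   = 0,
    .family     = NFPROTO_IPV4,
    .target     = mydrop_tg,
    .targetsize = 0, /* no per-rule target data */
    .me         = THIS_MODULE,
};

/* registered with xt_register_target(&mydrop_tg_reg) in module init */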
Every configured iptables rule consists of a match part and a target part. When a packet passes a hook point, netfilter walks the rules of each table attached there one by one; if the packet satisfies a rule's match conditions, the kernel executes that rule's target.
In ip_tables.c, ip_tables_init() calls nf_register_sockopt(&ipt_sockopts) to register the corresponding get and set handlers.
/* File: linux-5.4.1\net\ipv4\netfilter\ip_tables.c */
static struct nf_sockopt_ops ipt_sockopts = {
.pf = PF_INET,
.set_optmin = IPT_BASE_CTL,
.set_optmax = IPT_SO_SET_MAX+1,
.set = do_ipt_set_ctl, /* the set handler */
#ifdef CONFIG_COMPAT
.compat_set = compat_do_ipt_set_ctl,
#endif
.get_optmin = IPT_BASE_CTL,
.get_optmax = IPT_SO_GET_MAX+1,
.get = do_ipt_get_ctl, /* the get handler */
#ifdef CONFIG_COMPAT
.compat_get = compat_do_ipt_get_ctl,
#endif
.owner = THIS_MODULE,
};
static int __init ip_tables_init(void)
{
int ret;
ret = register_pernet_subsys(&ip_tables_net_ops);
if (ret < 0)
goto err1;
/* No one else will be downing sem now, so we won't sleep */
ret = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
if (ret < 0)
goto err2;
ret = xt_register_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt));
if (ret < 0)
goto err4;
/* Register setsockopt */
/* Register a socket option used to read and write the iptables
* configuration; firewall rules and NAT mappings are ultimately handed
* to the kernel through this interface.
*/
ret = nf_register_sockopt(&ipt_sockopts);
if (ret < 0)
goto err5;
return 0;
err5:
xt_unregister_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt));
err4:
xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
err2:
unregister_pernet_subsys(&ip_tables_net_ops);
err1:
return ret;
}
When the user changes the iptables rules, the set handler replaces the old ruleset with the ipt_replace passed down from user space.
/* File: linux-5.4.1\net\ipv4\netfilter\ip_tables.c */
/* Replace the old iptables rules with the ipt_replace passed from user space; the work happens in do_replace() */
static int
do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
{
int ret;
if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
switch (cmd) {
case IPT_SO_SET_REPLACE:
ret = do_replace(sock_net(sk), user, len);
break;
case IPT_SO_SET_ADD_COUNTERS:
ret = do_add_counters(sock_net(sk), user, len, 0);
break;
default:
ret = -EINVAL;
}
return ret;
}
/* File: linux-5.4.1\net\ipv4\netfilter\ip_tables.c */
static int
do_replace(struct net *net, const void __user *user, unsigned int len)
{
int ret;
struct ipt_replace tmp;
struct xt_table_info *newinfo;
void *loc_cpu_entry;
struct ipt_entry *iter;
/* copy the struct ipt_replace from user space */
if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
return -EFAULT;
/* overflow check */
if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
return -ENOMEM;
if (tmp.num_counters == 0)
return -EINVAL;
tmp.name[sizeof(tmp.name)-1] = 0;
/* Allocate an xt_table_info and size its entries member
according to ipt_replace.size
*/
newinfo = xt_alloc_table_info(tmp.size);
if (!newinfo)
return -ENOMEM;
loc_cpu_entry = newinfo->entries;
/* copy ipt_replace.entries into the kernel */
if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
tmp.size) != 0) {
ret = -EFAULT;
goto free_newinfo;
}
/* fill in the xt_table_info members from the ipt_replace */
ret = translate_table(net, newinfo, loc_cpu_entry, &tmp);
if (ret != 0)
goto free_newinfo;
/* Find the table to update by ipt_replace.name and install
the xt_table_info built above into that xt_table
*/
ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
tmp.num_counters, tmp.counters);
if (ret)
goto free_newinfo_untrans;
return 0;
free_newinfo_untrans:
xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
cleanup_entry(iter, net);
free_newinfo:
xt_free_table_info(newinfo);
return ret;
}
The end result is that some xt_table in the kernel, e.g. the NAT table, has been updated.
iptables has five tables, and the rules in each serve a different purpose. Every table has its own hook function, but they all end up in ipt_do_table() to match rules.
Table | Hook function |
---|---|
Filter | iptable_filter_hook() |
NAT | iptable_nat_do_chain() |
Mangle | iptable_mangle_hook() |
Raw | iptable_raw_hook() |
Security | iptable_security_hook() |
/* File: linux-5.4.1\net\ipv4\netfilter\ip_tables.c */
/* Returns one of the generic firewall policies, like NF_ACCEPT. */
/* Walk all rules on this hook's chain, perform the standard and extension matches, and execute the matching rule's target */
unsigned int
ipt_do_table(struct sk_buff *skb,
const struct nf_hook_state *state,
struct xt_table *table)
{
unsigned int hook = state->hook;
static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
const struct iphdr *ip;
/* Initializing verdict to NF_DROP keeps gcc happy. */
unsigned int verdict = NF_DROP;
const char *indev, *outdev;
const void *table_base;
struct ipt_entry *e, **jumpstack;
unsigned int stackidx, cpu;
const struct xt_table_info *private;
struct xt_action_param acpar;
unsigned int addend;
/* Initialization */
stackidx = 0;
ip = ip_hdr(skb);
indev = state->in ? state->in->name : nulldevname;
outdev = state->out ? state->out->name : nulldevname;
/* We handle fragments by dealing with the first fragment as
* if it was a normal packet. All other fragments are treated
* normally, except that they will NEVER match rules that ask
* things we don't know, ie. tcp syn flag or ports). If the
* rule is also a fragment-specific rule, non-fragments won't
* match it. */
acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
acpar.thoff = ip_hdrlen(skb);
acpar.hotdrop = false;
acpar.state = state;
WARN_ON(!(table->valid_hooks & (1 << hook)));
local_bh_disable();
addend = xt_write_recseq_begin();
private = READ_ONCE(table->private); /* Address dependency. */
cpu = smp_processor_id();
/* address of the first rule */
table_base = private->entries;
jumpstack = (struct ipt_entry **)private->jumpstack[cpu];
/* Switch to alternate jumpstack if we're being invoked via TEE.
* TEE issues XT_CONTINUE verdict on original skb so we must not
* clobber the jumpstack.
*
* For recursion via REJECT or SYNPROXY the stack will be clobbered
* but it is no problem since absolute verdict is issued by these.
*/
if (static_key_false(&xt_tee_enabled))
jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
/* get the first rule of the chain for this hook */
e = get_entry(table_base, private->hook_entry[hook]);
do {
const struct xt_entry_target *t;
const struct xt_entry_match *ematch;
struct xt_counters *counter;
WARN_ON(!e);
/* the standard match */
if (!ip_packet_match(ip, indev, outdev,
&e->ip, acpar.fragoff)) {
no_match:
/* no match: move on to the next rule */
e = ipt_next_entry(e);
continue;
}
/* the extension matches */
xt_ematch_foreach(ematch, e) {
acpar.match = ematch->u.kernel.match;
acpar.matchinfo = ematch->data;
/* a single non-matching extension fails the whole rule */
if (!acpar.match->match(skb, &acpar))
goto no_match;
}
counter = xt_get_this_cpu_counter(&e->counters);
ADD_COUNTER(*counter, skb->len, 1);
/* both the standard and the extension matches passed */
/* fetch the target */
t = ipt_get_target_c(e);
WARN_ON(!t->u.kernel.target);
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
/* The packet is traced: log it */
if (unlikely(skb->nf_trace))
trace_packet(state->net, skb, hook, state->in,
state->out, table->name, private, e);
#endif
/* Standard target? */
if (!t->u.kernel.target->target) {
int v;
v = ((struct xt_standard_target *)t)->verdict;
/* a verdict, not a jump to a user-defined chain */
if (v < 0) {
/* Pop from stack? */
/* not XT_RETURN: final verdict, break out */
if (v != XT_RETURN) {
verdict = (unsigned int)(-v) - 1;
break;
}
/* XT_RETURN: continue with the next rule */
if (stackidx == 0) {
e = get_entry(table_base,
private->underflow[hook]);
} else {
e = jumpstack[--stackidx];
e = ipt_next_entry(e);
}
continue;
}
/* record the jump origin so that on return we can resume matching at the following rule */
if (table_base + v != ipt_next_entry(e) &&
!(e->ip.flags & IPT_F_GOTO)) {
if (unlikely(stackidx >= private->stacksize)) {
verdict = NF_DROP;
break;
}
jumpstack[stackidx++] = e;
}
/* jump to the user-defined chain */
e = get_entry(table_base, v);
continue;
}
/* extension target: run the target callback */
acpar.target = t->u.kernel.target;
acpar.targinfo = t->data;
verdict = t->u.kernel.target->target(skb, &acpar);
/* keep matching */
if (verdict == XT_CONTINUE) {
/* Target might have changed stuff. */
ip = ip_hdr(skb);
e = ipt_next_entry(e);
}
/* otherwise break out and handle the verdict */
else {
/* Verdict */
break;
}
} while (!acpar.hotdrop);
xt_write_recseq_end(addend);
local_bh_enable();
/* hotdrop flag */
if (acpar.hotdrop)
return NF_DROP;
else return verdict;
}
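The v < 0 branch above relies on how the standard target encodes verdicts: a built-in verdict NF_X is stored as -NF_X - 1, which is why it is decoded with (unsigned int)(-v) - 1, and XT_RETURN is just the encoded form of NF_REPEAT:
/* File: linux-5.4.1\include\uapi\linux\netfilter\x_tables.h */
/* CONTINUE verdict for targets */
#define XT_CONTINUE 0xFFFFFFFF

/* For standard target */
#define XT_RETURN (-NF_REPEAT - 1)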
The standard match:
/* File: linux-5.4.1\net\ipv4\netfilter\ip_tables.c */
/* Returns whether matches rule or not. */
/* Performance critical - called for every packet
*
* This is the standard match.
*/
static inline bool
ip_packet_match(const struct iphdr *ip,
const char *indev,
const char *outdev,
const struct ipt_ip *ipinfo,
int isfrag)
{
unsigned long ret;
/* check the source and destination IP */
if (NF_INVF(ipinfo, IPT_INV_SRCIP,
(ip->saddr & ipinfo->smsk.s_addr) != ipinfo->src.s_addr) ||
NF_INVF(ipinfo, IPT_INV_DSTIP,
(ip->daddr & ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr))
return false;
/* check the input and output interfaces */
ret = ifname_compare_aligned(indev, ipinfo->iniface, ipinfo->iniface_mask);
if (NF_INVF(ipinfo, IPT_INV_VIA_IN, ret != 0))
return false;
ret = ifname_compare_aligned(outdev, ipinfo->outiface, ipinfo->outiface_mask);
if (NF_INVF(ipinfo, IPT_INV_VIA_OUT, ret != 0))
return false;
/* Check specific protocol: does the protocol field match? */
if (ipinfo->proto &&
NF_INVF(ipinfo, IPT_INV_PROTO, ip->protocol != ipinfo->proto))
return false;
/* If we have a fragment rule but the packet is not a fragment
* then we return zero
* (handles matching of fragmented packets)
*/
if (NF_INVF(ipinfo, IPT_INV_FRAG,
(ipinfo->flags & IPT_F_FRAG) && !isfrag))
return false;
/* every check passed: the rule matches */
return true;
}
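Every check above goes through NF_INVF, which implements iptables' "!" inversion syntax (e.g. ! -s 10.0.0.1): the raw comparison result is XORed with whether the corresponding inversion flag is set:
/* File: linux-5.4.1\include\linux\netfilter.h */
#define NF_INVF(ptr, flag, boolean) \
((boolean) ^ !!((ptr)->invflags & (flag)))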
A target says what to do with a packet once it matches; common ones include ACCEPT, DROP, REJECT and the NAT targets (SNAT/DNAT/MASQUERADE).
iptables ships a great many targets, but not every table supports all of them:
the targets available to a rule are determined by its table and chain and by the extensions that are enabled; see "Iptables Targets And Jumps" for the per-table details.
/* File: linux-5.4.1\net\ipv4\netfilter\ip_tables.c */
/* for const-correctness */
static inline const struct xt_entry_target *
ipt_get_target_c(const struct ipt_entry *e)
{
return ipt_get_target((struct ipt_entry *)e);
}
/* File: linux-5.4.1\include\uapi\linux\netfilter_ipv4\ip_tables.h */
/* Helper functions */
static __inline__ struct xt_entry_target *
ipt_get_target(struct ipt_entry *e)
{
return (void *)e + e->target_offset;
}
The netfilter HOOK points across the whole IP receive/transmit path are shown in the figure below.
We do not examine the following stages here; we only study the protocol stack:
NIC hardware -> DMA -> CPU hardware interrupt (IRQ) -> NIC driver -> softirq -> kernel network core -> protocol stack
/* File: linux-5.4.1\net\core\dev.c */
static inline int deliver_skb(struct sk_buff *skb,
struct packet_type *pt_prev,
struct net_device *orig_dev)
{
if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
return -ENOMEM;
refcount_inc(&skb->users);
/* calls ip_rcv() */
return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
/* .func = ip_rcv below is the interface through which dev calls into ip_rcv() */
static struct packet_type ip_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IP),
.func = ip_rcv,
.list_func = ip_list_rcv,
};
When the core network stack dispatches a packet to the IPv4 module, the ip_rcv() function handles it.
/* File: linux-5.4.1\net\ipv4\ip_input.c */
/*
* IP receive entry point
*
* @skb      the received IP packet
* @dev      the network device the packet arrived on
* @pt       the packet_type through which the packet was delivered
* @orig_dev the device the packet originally arrived on
*
*/
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
struct net_device *orig_dev)
{
struct net *net = dev_net(dev);
skb = ip_rcv_core(skb, net); /* sanity-check the packet; drop it if the checks fail */
if (skb == NULL)
return NET_RX_DROP;
/* NF_HOOK is where this function enters the netfilter framework;
* once the netfilter modules finish, ip_rcv_finish() is called
*/
return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
net, NULL, skb, dev, NULL,
ip_rcv_finish);
}
/*
* Main IP Receive routine.
*
* Split out of ip_rcv() into a helper of its own.
*/
static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
{
const struct iphdr *iph;
u32 len;
/* When the interface is in promisc. mode, drop all the crap
* that it receives, do not try to analyse it.
*
* skb->pkt_type is set in eth_type_trans() (net/ethernet/eth.c),
* based on the packet's destination MAC.
*/
if (skb->pkt_type == PACKET_OTHERHOST)
goto drop;
__IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len);
skb = skb_share_check(skb, GFP_ATOMIC);
/* share check: if the skb is shared, clone it first, since we may modify it below */
if (!skb) {
__IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
goto out;
}
if (!pskb_may_pull(skb, sizeof(struct iphdr))) /* is the header long enough? */
goto inhdr_error;
iph = ip_hdr(skb);
/*
* RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
*
* Is the datagram acceptable?
*
* 1. Length at least the size of an ip header
* 2. Version of 4
* 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums]
* 4. Doesn't have a bogus length
*/
if (iph->ihl < 5 || iph->version != 4)
goto inhdr_error;
BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1);
BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0);
BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE);
__IP_ADD_STATS(net,
IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
if (!pskb_may_pull(skb, iph->ihl*4)) /* check the full header length */
goto inhdr_error;
iph = ip_hdr(skb);
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
goto csum_error;
len = ntohs(iph->tot_len);
if (skb->len < len) {
__IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
} else if (len < (iph->ihl*4))
goto inhdr_error;
/* Our transport medium may have padded the buffer out. Now we know it
* is IP we can trim to the true length of the frame.
* Note this now means skb->len holds ntohs(iph->tot_len).
*
* Re-trim the skb to the IP total length, discarding the useless tail.
*/
if (pskb_trim_rcsum(skb, len)) {
__IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
goto drop;
}
iph = ip_hdr(skb);
skb->transport_header = skb->network_header + iph->ihl*4;
/* Remove any debris in the socket control block */
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
IPCB(skb)->iif = skb->skb_iif;
/* Must drop socket now because of tproxy. */
skb_orphan(skb);
return skb;
csum_error:
__IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
inhdr_error:
__IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
drop:
kfree_skb(skb);
out:
return NULL;
}
In the NF_HOOK call inside ip_rcv(), nf_hook()'s return value decides whether the continuation ip_rcv_finish() runs. nf_hook() in turn decides whether to drop the packet according to the rules the tables have registered on the NF_INET_PRE_ROUTING chain.
/* File:linux-5.4.1\include\linux\netfilter.h */
static inline int
NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb,
struct net_device *in, struct net_device *out,
int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn);
/* depending on nf_hook()'s return value, decide whether to run the continuation */
if (ret == 1)
ret = okfn(net, sk, skb);
return ret;
}
Getting this far means the packet satisfied the rules of the incoming stage and was not dropped. What happens next is decided by the input route lookup, which is the main job of ip_rcv_finish():
/* File: linux-5.4.1\net\ipv4\ip_input.c */
static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
int ret;
/* if ingress device is enslaved to an L3 master device pass the
* skb to its handler for processing
*/
skb = l3mdev_ip_rcv(skb);
if (!skb)
return NET_RX_SUCCESS;
ret = ip_rcv_finish_core(net, sk, skb, dev);
/* Finally, based on the input route, the packet is either delivered
* locally or forwarded; dst_input() resolves to one of two functions:
* 1. ip_local_deliver(), corresponding to NF_INET_LOCAL_IN;
* 2. ip_forward(), corresponding to NF_INET_FORWARD;
*/
if (ret != NET_RX_DROP)
ret = dst_input(skb);
return ret;
}
static int ip_rcv_finish_core(struct net *net, struct sock *sk,
struct sk_buff *skb, struct net_device *dev)
{
const struct iphdr *iph = ip_hdr(skb);
int (*edemux)(struct sk_buff *skb);
struct rtable *rt;
int err;
if (net->ipv4.sysctl_ip_early_demux &&
!skb_dst(skb) &&
!skb->sk &&
!ip_is_fragment(iph)) {
const struct net_protocol *ipprot;
int protocol = iph->protocol;
ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) {
err = INDIRECT_CALL_2(edemux, tcp_v4_early_demux,
udp_v4_early_demux, skb);
if (unlikely(err))
goto drop_error;
/* must reload iph, skb->head might have changed */
iph = ip_hdr(skb);
}
}
/*
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
*
* If no input route has been found for this packet yet,
* ip_route_input_noref() looks one up; if that fails, the
* packet is dropped.
*/
if (!skb_valid_dst(skb)) {
err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
iph->tos, dev);
if (unlikely(err))
goto drop_error;
}
#ifdef CONFIG_IP_ROUTE_CLASSID
if (unlikely(skb_dst(skb)->tclassid)) {
struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
u32 idx = skb_dst(skb)->tclassid;
st[idx&0xFF].o_packets++;
st[idx&0xFF].o_bytes += skb->len;
st[(idx>>16)&0xFF].i_packets++;
st[(idx>>16)&0xFF].i_bytes += skb->len;
}
#endif
/* For IP options, the route is looked up via
* ip_rcv_options() -> ip_options_rcv_srr() -> ip_route_input()
*/
if (iph->ihl > 5 && ip_rcv_options(skb, dev))
goto drop;
rt = skb_rtable(skb);
if (rt->rt_type == RTN_MULTICAST) {
__IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len);
} else if (rt->rt_type == RTN_BROADCAST) {
__IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len);
} else if (skb->pkt_type == PACKET_BROADCAST ||
skb->pkt_type == PACKET_MULTICAST) {
struct in_device *in_dev = __in_dev_get_rcu(dev);
/* RFC 1122 3.3.6:
*
* When a host sends a datagram to a link-layer broadcast
* address, the IP destination address MUST be a legal IP
* broadcast or IP multicast address.
*
* A host SHOULD silently discard a datagram that is received
* via a link-layer broadcast (see Section 2.4) but does not
* specify an IP multicast or broadcast destination address.
*
* This doesn't explicitly say L2 *broadcast*, but broadcast is
* in a way a form of multicast and the most common use case for
* this is 802.11 protecting against cross-station spoofing (the
* so-called "hole-196" attack) so do it for both.
*/
if (in_dev &&
IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST))
goto drop;
}
return NET_RX_SUCCESS;
drop:
kfree_skb(skb);
return NET_RX_DROP;
drop_error:
if (err == -EXDEV)
__NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
goto drop;
}
Based on the route, the packet is handed over for local delivery:
/* File: linux-5.4.1\net\ipv4\ip_input.c */
/*
* Deliver IP Packets to the higher protocol layers.
*/
int ip_local_deliver(struct sk_buff *skb)
{
/*
* Reassemble IP fragments.
*/
struct net *net = dev_net(skb->dev);
if (ip_is_fragment(ip_hdr(skb))) {
/* reassemble IP fragments via ip_defrag() */
if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER))
return 0;
}
return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
net, NULL, skb, skb->dev, NULL,
ip_local_deliver_finish);
}
Finally, the packet is handed up to L4 through the matching protocol handler.
/* File: linux-5.4.1\net\ipv4\ip_input.c */
static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
__skb_pull(skb, skb_network_header_len(skb));
rcu_read_lock();
ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol);
rcu_read_unlock();
return 0;
}
/*
* ip_local_deliver_finish() passes an input packet from the network layer
* up to the transport layer:
*
* 1. strip the IP header
* 2. if raw sockets listen on this protocol, deliver a copy of the packet to them
* 3. finally, hand the packet to the transport layer's receive routine for processing
*/
void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol)
{
const struct net_protocol *ipprot;
int raw, ret;
resubmit:
/* deliver to raw sockets first, if any */
raw = raw_local_deliver(skb, protocol);
ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot) {
if (!ipprot->no_policy) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
kfree_skb(skb);
return;
}
nf_reset_ct(skb);
}
/*
* If ipprot->handler is TCP's, this runs
* tcp_v4_rcv(skb);
* if it is UDP's,
* udp_rcv(skb);
* otherwise the protocol's own handler runs:
* ipprot->handler(skb);
*/
ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv,
skb);
if (ret < 0) {
protocol = -ret;
goto resubmit;
}
__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
} else {
if (!raw) {
if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
__IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
icmp_send(skb, ICMP_DEST_UNREACH,
ICMP_PROT_UNREACH, 0);
}
kfree_skb(skb);
} else {
__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
consume_skb(skb);
}
}
}
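The inet_protos[] handlers consulted above are registered at boot from net/ipv4/af_inet.c through inet_add_protocol(); abridged to the relevant fields, the TCP and UDP entries look roughly like this (a sketch from that file, not a verbatim quote):
/* File: linux-5.4.1\net\ipv4\af_inet.c (abridged) */
static struct net_protocol tcp_protocol = {
.early_demux = tcp_v4_early_demux,
.handler = tcp_v4_rcv,
.err_handler = tcp_v4_err,
...
};
static struct net_protocol udp_protocol = {
.early_demux = udp_v4_early_demux,
.handler = udp_rcv,
.err_handler = udp_err,
...
};
/* registered via inet_add_protocol(&tcp_protocol, IPPROTO_TCP), etc. */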
/* File: linux-5.4.1\net\ipv4\ip_forward.c */
/* Registered via ip_route_input_slow() -> ip_mkroute_input().
*
* Forwarding of IP packets is handled by ip_forward(), invoked from
* ip_rcv_finish() through the input route.
*/
int ip_forward(struct sk_buff *skb)
{
u32 mtu;
struct iphdr *iph; /* Our header */
struct rtable *rt; /* Route we use */
struct ip_options *opt = &(IPCB(skb)->opt);
struct net *net;
/* that should never happen */
if (skb->pkt_type != PACKET_HOST)
goto drop;
if (unlikely(skb->sk))
goto drop;
if (skb_warn_if_lro(skb))
goto drop;
if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))
goto drop;
if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
return NET_RX_SUCCESS;
skb_forward_csum(skb);
net = dev_net(skb->dev);
/*
* According to the RFC, we must first decrease the TTL field. If
* that reaches zero, we must reply an ICMP control message telling
* that the packet's lifetime expired.
*/
if (ip_hdr(skb)->ttl <= 1)
goto too_many_hops;
if (!xfrm4_route_forward(skb))
goto drop;
rt = skb_rtable(skb);
if (opt->is_strictroute && rt->rt_uses_gateway)
goto sr_failed;
IPCB(skb)->flags |= IPSKB_FORWARDED;
mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
if (ip_exceeds_mtu(skb, mtu)) {
IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(mtu));
goto drop;
}
/* We are about to mangle packet. Copy it! */
if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len))
goto drop;
iph = ip_hdr(skb);
/* Decrease ttl after skb cow done */
ip_decrease_ttl(iph);
/*
* We now generate an ICMP HOST REDIRECT giving the route
* we calculated.
*/
if (IPCB(skb)->flags & IPSKB_DOREDIRECT && !opt->srr &&
!skb_sec_path(skb))
ip_rt_send_redirect(skb);
if (net->ipv4.sysctl_ip_fwd_update_priority)
skb->priority = rt_tos2priority(iph->tos);
return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
net, NULL, skb, skb->dev, rt->dst.dev,
ip_forward_finish);
sr_failed:
/*
* Strict routing permits no gatewaying
*/
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
goto drop;
too_many_hops:
/* Tell the sender its packet died... */
__IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
drop:
kfree_skb(skb);
return NET_RX_DROP;
}
/* File: linux-5.4.1\net\ipv4\tcp.c */
/* tcp_sendmsg() sends data handed down from the upper layers.
* All TCP sends eventually funnel into tcp_sendmsg(); e.g. the send()
* system call enters sys_sendto(), which fills in a msghdr, calls
* sock_sendmsg(), and finally reaches tcp_sendmsg().
*/
int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
int ret;
/* serialize the whole send operation */
lock_sock(sk);
ret = tcp_sendmsg_locked(sk, msg, size);
release_sock(sk);
return ret;
}
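Seen from user space, the entry into this path is just a send() on a connected TCP socket. A minimal sketch (address and port are placeholders; error handling omitted):
#include <arpa/inet.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in dst = {
        .sin_family = AF_INET,
        .sin_port   = htons(80),
    };

    inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr); /* documentation address */
    connect(fd, (struct sockaddr *)&dst, sizeof(dst));
    /* send() -> sys_sendto() -> sock_sendmsg() -> inet_sendmsg() -> tcp_sendmsg() */
    send(fd, "hello", 5, 0);
    close(fd);
    return 0;
}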
/* File: linux-5.4.1\net\ipv4\tcp.c */
/* Called by tcp_sendmsg().
* tcp_sendmsg_locked() mainly copies user data into skbs and queues them
* on the socket's send queue, then calls tcp_write_xmit() to push as much
* of that queue as possible out onto the wire.
* TCP send-buffer management also mostly lives in this function.
*/
int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
{
struct tcp_sock *tp = tcp_sk(sk);
struct ubuf_info *uarg = NULL;
struct sk_buff *skb;
struct sockcm_cookie sockc;
int flags, err, copied = 0;
int mss_now = 0, size_goal, copied_syn = 0;
int process_backlog = 0;
bool zc = false;
long timeo;
flags = msg->msg_flags;
/* MSG_ZEROCOPY was requested: zero-copy send */
if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) {
skb = tcp_write_queue_tail(sk);
uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
if (!uarg) {
err = -ENOBUFS;
goto out_err;
}
zc = sk->sk_route_caps & NETIF_F_SG;
if (!zc)
uarg->zerocopy = 0;
}
/* MSG_FASTOPEN was requested: go through tcp_sendmsg_fastopen() */
if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
!tp->repair) {
err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg);
if (err == -EINPROGRESS && copied_syn > 0)
goto out;
else if (err)
goto out_err;
}
/* fetch the send timeout */
timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
tcp_rate_check_app_limited(sk); /* is sending application-limited? */
/* Wait for a connection to finish. One exception is TCP Fast Open
* (passive side) where data is allowed to be sent before a connection
* is fully established.
*
* If the connect has not completed yet, wait for it to finish;
* if the socket is non-blocking, return immediately.
*/
if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
!tcp_passive_fastopen(sk)) {
err = sk_stream_wait_connect(sk, &timeo);
if (err != 0)
goto do_error;
}
/* TCP repair mode: tcp_send_rcvq() puts the data on the receive queue */
if (unlikely(tp->repair)) {
if (tp->repair_queue == TCP_RECV_QUEUE) {
copied = tcp_send_rcvq(sk, msg, size);
goto out_nopush;
}
err = -EINVAL;
if (tp->repair_queue == TCP_NO_QUEUE)
goto out_err;
/* 'common' sending to sendq */
}
sockcm_init(&sockc, sk);
if (msg->msg_controllen) {
err = sock_cmsg_send(sk, msg, &sockc);
if (unlikely(err)) {
err = -EINVAL;
goto out_err;
}
}
/* This should be in poll */
sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
/* Ok commence sending. */
copied = 0;
restart:
/* Get the current MSS and size_goal, the largest chunk the device can
* take; with GSO, size_goal is a multiple of the MSS.
*/
mss_now = tcp_send_mss(sk, &size_goal, flags);
err = -EPIPE;
/* if the send side has been fully shut down, bail out and set err */
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto do_error;
/* iterate over the user-space data blocks */
while (msg_data_left(msg)) {
int copy = 0;
/* the last skb on the send queue */
skb = tcp_write_queue_tail(sk);
if (skb)
copy = size_goal - skb->len;
/* need a fresh skb to hold the data */
if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
bool first_skb;
new_segment:
/* If the total queued size sk_wmem_queued has reached the send buffer
* limit sk_sndbuf, or the unsent data exceeds the user's setting,
* wait for space.
*/
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
if (unlikely(process_backlog >= 16)) {
process_backlog = 0;
if (sk_flush_backlog(sk))
goto restart;
}
first_skb = tcp_rtx_and_write_queues_empty(sk);
/* Allocate send buffer;
* the allocation is usually one MSS, as computed via select_size().
*/
skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
first_skb);
if (!skb)
goto wait_for_memory;
process_backlog++;
skb->ip_summed = CHECKSUM_PARTIAL;
/* queue the skb on sk_write_queue and update the send head */
skb_entail(sk, skb);
copy = size_goal;
/* All packets are restored as if they have
* already been sent. skb_mstamp_ns isn't set to
* avoid wrong rtt estimation.
*
* With TCP_REPAIR, give the skb a "send time" up front.
*/
if (tp->repair)
TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
}
/* Try to append data to the end of skb. */
if (copy > msg_data_left(msg))
copy = msg_data_left(msg);
/* Where to copy to? */
if (skb_availroom(skb) > 0 && !zc) {
/* We have some space in skb head. Superb! */
copy = min_t(int, copy, skb_availroom(skb));
err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
if (err)
goto do_fault;
}
/* the skb's linear area is full and we are not doing zero-copy: use the paged area */
else if (!zc) {
bool merge = true;
int i = skb_shinfo(skb)->nr_frags;
struct page_frag *pfrag = sk_page_frag(sk);
if (!sk_page_frag_refill(sk, pfrag))
goto wait_for_memory;
if (!skb_can_coalesce(skb, i, pfrag->page,
pfrag->offset)) {
if (i >= sysctl_max_skb_frags) {
/* mark the segment with a TCP PSH flag */
tcp_mark_push(tp, skb);
goto new_segment;
}
merge = false;
}
copy = min_t(int, copy, pfrag->size - pfrag->offset);
if (!sk_wmem_schedule(sk, copy))
goto wait_for_memory;
err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
pfrag->page,
pfrag->offset,
copy);
if (err)
goto do_error;
/* Update the skb. */
if (merge) {
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
} else {
skb_fill_page_desc(skb, i, pfrag->page,
pfrag->offset, copy);
page_ref_inc(pfrag->page);
}
pfrag->offset += copy;
}
else { /* the zero-copy path */
err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
if (err == -EMSGSIZE || err == -EEXIST) {
tcp_mark_push(tp, skb);
goto new_segment;
}
if (err < 0)
goto do_error;
copy = err;
}
/* if this is the first copy, clear the PSH flag */
if (!copied)
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
TCP_SKB_CB(skb)->end_seq += copy;/* advance the send queue's last sequence number */
tcp_skb_pcount_set(skb, 0);
/* amount of data copied to the send queue so far */
copied += copy;
/* if all the data has been copied, we are done */
if (!msg_data_left(msg)) {
if (unlikely(flags & MSG_EOR))
TCP_SKB_CB(skb)->eor = 1;
goto out;
}
/* If the skb can still take more data, or we are sending out-of-band data,
* or TCP_REPAIR is in use, keep copying and do not send yet.
*/
if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair))
continue;
if (forced_push(tp)) {
tcp_mark_push(tp, skb);
/* push every skb on the socket's send queue out */
__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
}
else if (skb == tcp_send_head(sk))
/* this skb is the head of the send queue: push just this one */
tcp_push_one(sk, mss_now);
continue;
wait_for_sndbuf:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
if (copied)
tcp_push(sk, flags & ~MSG_MORE, mss_now,
TCP_NAGLE_PUSH, size_goal);
err = sk_stream_wait_memory(sk, &timeo);
if (err != 0)
goto do_error;
mss_now = tcp_send_mss(sk, &size_goal, flags);
}
out:
/* if some data has been copied to the send queue, try to push it out right away */
if (copied) {
tcp_tx_timestamp(sk, sockc.tsflags);
tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
}
out_nopush:
sock_zerocopy_put(uarg);
return copied + copied_syn;
do_error:
skb = tcp_write_queue_tail(sk);
do_fault:
tcp_remove_empty_skb(sk, skb);
if (copied + copied_syn)
goto out;
out_err:
sock_zerocopy_put_abort(uarg, true);
err = sk_stream_error(sk, flags, err);
/* make sure we wake any epoll edge trigger waiter */
if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
err == -EAGAIN)) {
sk->sk_write_space(sk);
tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
return err;
}
/* File: linux-5.4.1\net\ipv4\tcp_output.c */
/* This routine writes packets to the network. It advances the
* send_head. This happens as incoming acks open up the remote
* window for us.
*
* LARGESEND note: !tcp_urg_mode is overkill, only frames between
* snd_up-64k-mss .. snd_up cannot be large. However, taking into
* account rare use of URG, this is not a big flaw.
*
* Send at most one packet when push_one > 0. Temporarily ignore
* cwnd limit to force at most one packet out when push_one == 2.
* Returns true, if no segments are in flight and we have queued segments,
* but cannot send anything now because of SWS or another problem.
*
*
* This function sends the SKBs on the send queue out; a return of false
* means everything sendable was sent. The flow:
* 1. check whether the current state is TCP_CLOSE
* 2. check the congestion window size
* 3. check whether the current segment lies entirely within the send window
* 4. check whether the segment must obey the Nagle algorithm
* 5. once the checks pass, send the SKB
* 6. loop, checking and sending every unsent SKB on the send queue
*
* Parameters:
* mss_now: the currently effective MSS
* nonagle: whether the Nagle algorithm is bypassed
*
* Ultimately calls tcp_transmit_skb().
*
* Congestion handling:
* 1. with pacing enabled, check whether it is time to send; if not, stop
* 2. if the congestion window is 0, stop sending
* 3. if this packet exceeds the send window, stop sending
* 4. if all checks pass, wrap the data into a TCP segment and send it
*/
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
unsigned int tso_segs, sent_pkts;
int cwnd_quota;
int result;
bool is_cwnd_limited = false, is_rwnd_limited = false;
u32 max_segs;
sent_pkts = 0;
/* refresh tp->tcp_mstamp to the current time in microseconds */
tcp_mstamp_refresh(tp);
if (!push_one) {
/* Do MTU probing. */
result = tcp_mtu_probe(sk);
if (!result) {
return false;
} else if (result > 0) {
sent_pkts = 1;
}
}
/* estimate the maximum number of segments to send */
max_segs = tcp_tso_segs(sk, mss_now);
/* as long as the send queue is not empty, keep sending segments */
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
/* "skb_mstamp_ns" is used as a start point for the retransmit timer */
skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache;
list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
tcp_init_tso_segs(skb, mss_now);
goto repair; /* Skip network transmission */
}
if (tcp_pacing_check(sk))
break;
/* Set up the TSO fields: the GSO type, GSO segment size, and so on.
* They are used for software TSO: if the device does not support TSO
* but TSO is in use, the segment is segmented in software before
* being handed to the device.
*
*/
tso_segs = tcp_init_tso_segs(skb, mss_now);
BUG_ON(!tso_segs);
/* Check whether we may send data now:
* confirm the current send window size and
* test the congestion window; a quota of 0
* means the congestion window is already full.
*/
cwnd_quota = tcp_cwnd_test(tp, skb);
if (!cwnd_quota) {
if (push_one == 2)
/* Force out a loss probe pkt. */
cwnd_quota = 1;
else
break;
}
/* Check that the whole segment (the linear area plus the scatter/gather area
* in shinfo) lies within the send window; only then may it be sent now.
*/
if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
is_rwnd_limited = true;
break;
}
if (tso_segs == 1) {
/* No TSO segmentation needed: run the Nagle test
* to decide whether the segment may be sent immediately.
*/
if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
(tcp_skb_is_last(sk, skb) ?
nonagle : TCP_NAGLE_PUSH))))
break;
} else {
/* TSO segmentation is needed: check whether the segment
* should be deferred; if so, it cannot be sent now.
*/
if (!push_one &&
tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
&is_rwnd_limited, max_segs))
break;
}
/* limit is the re-segmentation length, initialized to the current MSS */
limit = mss_now;
/* only TSO segments need further splitting */
if (tso_segs > 1 && !tcp_urg_mode(tp))
/* cap the split point by the smaller of the send window and the congestion window, i.e. weigh the available congestion window against the scatter/gather pages filled in tcp_sendmsg() */
limit = tcp_mss_split_point(sk, skb, mss_now,
min_t(unsigned int,
cwnd_quota,
max_segs),
nonagle);
/* With the split length known, if the SKB holds more data than that,
* tso_fragment() splits it; if splitting fails, do not send for now.
*/
if (skb->len > limit &&
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;
if (tcp_small_queue_check(sk, skb, 0))
break;
/* Hand the data to the network layer through the address-family-specific
* af_specific->queue_xmit(); IPv4 uses ip_queue_xmit().
*/
if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
break;
repair:
/* Advance the send_head. This one is sent out.
* This call will increment packets_out.
*/
tcp_event_new_data_sent(sk, skb);
/* if the segment sent was smaller than the MSS, record the last byte of the most recent small packet */
tcp_minshall_update(tp, mss_now, skb);
sent_pkts += tcp_skb_pcount(skb);/* update the total number of segments sent in this call */
if (push_one)
break;
}
/* if the receive window was the limiting factor, start the probe timer */
if (is_rwnd_limited)
tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
else
tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);
/* if anything was sent, validate the congestion window and return success */
if (likely(sent_pkts)) {
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += sent_pkts;
/* Send one loss probe per tail loss episode. */
if (push_one != 2)
tcp_schedule_loss_probe(sk, false);
is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
tcp_cwnd_validate(sk, is_cwnd_limited);
return false;
}
return !tp->packets_out && !tcp_write_queue_empty(sk);
}
/* File: linux-5.4.1\net\ipv4\tcp_output.c */
/* This routine actually transmits TCP packets queued in by
* tcp_do_sendmsg(). This is used by both the initial
* transmission and possible later retransmissions.
* All SKB's seen here are completely headerless. It is our
* job to build the TCP header, and pass the packet down to
* IP so it can do the same plus pass the packet off to the
* device.
*
* We are working here with either a clone of the original
* SKB, or a fresh unique copy made by the retransmit engine.
*
*
* A TCP segment is normally sent through tcp_transmit_skb(): the function
* builds the TCP header for the outgoing segment, then calls through the
* network-layer interface into IP, finally reaching the network device.
* Because the SKB is freed once it has been handed to the device, while
* TCP may only really release the data after the corresponding ACK
* arrives, the segment is cloned or copied before sending, depending on
* the arguments.
*
* All TCP transmissions end up here. clone_it says whether the first SKB
* of the send queue is cloned or used directly:
* application data is sent from a clone, since the data may only be
* deleted once the peer ACKs it; a pure ACK needs no clone.
* Without TSO/GSO support, skb->len here is the MSS; with support, and
* with data in shinfo, the length is the smaller of shinfo and the
* congestion window.
*
* Congestion handling:
* 1. tcp_event_ack_sent: counts how many ACKs we have sent
* 2. tcp_event_data_sent: congestion-state accounting after sending data
* 3. tcp_internal_pacing: computes when BBR may send the next packet
*/
static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_sock *inet;
struct tcp_sock *tp;
struct tcp_skb_cb *tcb;
struct tcp_out_options opts;
unsigned int tcp_options_size, tcp_header_size;
struct sk_buff *oskb = NULL;
struct tcp_md5sig_key *md5;
struct tcphdr *th;
u64 prior_wstamp;
int err;
BUG_ON(!skb || !tcp_skb_pcount(skb));
tp = tcp_sk(sk);
prior_wstamp = tp->tcp_wstamp_ns;
tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
/* clone or copy the packet to send, depending on clone_it */
if (clone_it) {
TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
- tp->snd_una;
oskb = skb;
tcp_skb_tsorted_save(oskb) {
if (unlikely(skb_cloned(oskb)))
skb = pskb_copy(oskb, gfp_mask);
else
skb = skb_clone(oskb, gfp_mask);
} tcp_skb_tsorted_restore(oskb);
if (unlikely(!skb))
return -ENOBUFS;
}
/* Get the inet and TCP transport control blocks, the SKB's private TCP
* control block, and the current TCP header length
*/
inet = inet_sk(sk);
tcb = TCP_SKB_CB(skb);
memset(&opts, 0, sizeof(opts));
/* is this a SYN segment? some options may only appear in SYN segments and need special handling */
if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
} else {
tcp_options_size = tcp_established_options(sk, skb, &opts,
&md5);
/* Force a PSH flag on all (GSO) packets to expedite GRO flush
* at receiver : This slightly improve GRO performance.
* Note that we do not force the PSH flag for non GSO packets,
* because they might be sent under high congestion events,
* and in this case it is better to delay the delivery of 1-MSS
* packets and thus the corresponding ACK packet that would
* release the following packet.
*/
if (tcp_skb_pcount(skb) > 1)
tcb->tcp_flags |= TCPHDR_PSH;
}
tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
/* if no packet is in qdisc/device queue, then allow XPS to select
* another queue. We can be called from tcp_tsq_handler()
* which holds one reference to sk.
*
* TODO: Ideally, in-flight pure ACK packets should not matter here.
* One way to get this would be to set skb->truesize = 2 on them.
*/
skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
/* If we had to use memory reserve to allocate this skb,
* this might cause drops if packet is looped back :
* Other socket might not have SOCK_MEMALLOC.
* Packets not looped back do not care about pfmemalloc.
*/
skb->pfmemalloc = 0;
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
skb_orphan(skb);
skb->sk = sk;
skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
skb_set_hash_from_sk(skb, sk);
refcount_add(skb->truesize, &sk->sk_wmem_alloc);
skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
/* Build TCP header and checksum it. */
th = (struct tcphdr *)skb->data;
th->source = inet->inet_sport;
th->dest = inet->inet_dport;
th->seq = htonl(tcb->seq);
th->ack_seq = htonl(rcv_nxt);
*(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
tcb->tcp_flags);
th->check = 0;
th->urg_ptr = 0;
/* The urg_mode check is necessary during a below snd_una win probe
*
* Two conditions decide whether the urgent pointer and URG flag are set:
* 1. urgent mode was requested for this send
* 2. the urgent pointer lies within 65535 bytes of this segment's
* starting sequence number (mainly a validity check on the pointer).
*/
if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
if (before(tp->snd_up, tcb->seq + 0x10000)) {
th->urg_ptr = htons(tp->snd_up - tcb->seq);
th->urg = 1;
} else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
th->urg_ptr = htons(0xFFFF);
th->urg = 1;
}
}
/* the TCP header is done; now build the TCP options */
tcp_options_write((__be32 *)(th + 1), tp, &opts);
skb_shinfo(skb)->gso_type = sk->sk_gso_type;
if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
th->window = htons(tcp_select_window(sk));
tcp_ecn_send(sk, skb, th, tcp_header_size);
} else {
/* RFC1323: The window in SYN & SYN/ACK segments
* is never scaled.
*/
th->window = htons(min(tp->rcv_wnd, 65535U));
}
#ifdef CONFIG_TCP_MD5SIG
/* Calculate the MD5 hash, as we have all we need now */
if (md5) {
sk_nocaps_add(sk, NETIF_F_GSO_MASK);
tp->af_specific->calc_md5_hash(opts.hash_location,
md5, sk, skb);
}
#endif
/* Compute the checksum via the send_check interface and store it in the
* TCP header; for IPv4 TCP, send_check is initialized to tcp_v4_send_check()
*/
icsk->icsk_af_ops->send_check(sk, skb);
/* If the outgoing segment carries an ACK, tell the delayed-ACK machinery:
* decrement the quick-ACK count and stop the delayed-ACK timer.
*/
if (likely(tcb->tcp_flags & TCPHDR_ACK))
tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
if (skb->len != tcp_header_size) {
tcp_event_data_sent(tp, sk);/* congestion-state accounting after sending data */
tp->data_segs_out += tcp_skb_pcount(skb);/* count the data segments sent */
tp->bytes_sent += skb->len - tcp_header_size;
}
if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
tcp_skb_pcount(skb));
/* count all segments sent */
tp->segs_out += tcp_skb_pcount(skb);
/* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */
skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
/* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */
/* Cleanup our debris for IP stacks */
memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
sizeof(struct inet6_skb_parm)));
tcp_add_tx_delay(skb, tp);
/* Transmit the packet via the queue_xmit interface and return an error
* code on failure; for TCP this interface is implemented by ip_queue_xmit()
*/
err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
if (unlikely(err > 0)) {
/* On a local send failure, enter CWR, much as if an explicit congestion
* notification had been received; then map the error and return it
*/
tcp_enter_cwr(sk);
err = net_xmit_eval(err);
}
if (!err && oskb) {
tcp_update_skb_after_send(sk, oskb, prior_wstamp);
tcp_rate_skb_sent(sk, oskb);
}
return err;
}
/* File: linux-5.4.1\net\ipv4\ip_output.c */
/* Note: skb->sk can be different from sk, in case of tunnels
*
* In TCP, the interface used to wrap a segment into an IP packet depends
* on the kind of segment: ip_queue_xmit() is by far the most common,
* while ip_build_and_send_pkt() and ip_send_reply() are only used when
* sending special segments.
*
* @skb: the TCP segment to be encapsulated into an IP packet
*
* TCP jumps here from tcp_transmit_skb().
*/
int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
__u8 tos)
{
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
struct ip_options_rcu *inet_opt;
struct flowi4 *fl4;
struct rtable *rt;
struct iphdr *iph;
int res;
/* Skip all of this if the packet is already routed,
* f.e. by something like SCTP.
*/
rcu_read_lock();
inet_opt = rcu_dereference(inet->inet_opt);
fl4 = &fl->u.ip4;
rt = skb_rtable(skb);
if (rt)
goto packet_routed;
/* Make sure we can route this packet. */
rt = (struct rtable *)__sk_dst_check(sk, 0);
if (!rt) {
__be32 daddr;
/* Use correct destination address if we have options. */
daddr = inet->inet_daddr;
if (inet_opt && inet_opt->opt.srr)
daddr = inet_opt->opt.faddr;
/* If this fails, retransmit mechanism of transport layer will
* keep trying until route appears or the connection times
* itself out.
*/
rt = ip_route_output_ports(net, fl4, sk,
daddr, inet->inet_saddr,
inet->inet_dport,
inet->inet_sport,
sk->sk_protocol,
RT_CONN_FLAGS_TOS(sk, tos),
sk->sk_bound_dev_if);
if (IS_ERR(rt))
goto no_route;
sk_setup_caps(sk, &rt->dst);
}
skb_dst_set_noref(skb, &rt->dst);
packet_routed:
if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
goto no_route;
/* OK, we know where to send it, allocate and build IP header. */
skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
skb_reset_network_header(skb);
iph = ip_hdr(skb);
*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff));
if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
iph->frag_off = htons(IP_DF);
else
iph->frag_off = 0;
iph->ttl = ip_select_ttl(inet, &rt->dst);
iph->protocol = sk->sk_protocol;
ip_copy_addrs(iph, fl4);
/* Transport layer set skb->h.foo itself. */
if (inet_opt && inet_opt->opt.optlen) {
iph->ihl += inet_opt->opt.optlen >> 2;
ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
}
ip_select_ident_segs(net, skb, sk,
skb_shinfo(skb)->gso_segs ?: 1);
/* TODO : should we use skb->sk here instead of sk ? */
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
res = ip_local_out(net, sk, skb);
rcu_read_unlock();
return res;
no_route:
rcu_read_unlock();
IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
kfree_skb(skb);
return -EHOSTUNREACH;
}
/* File: linux-5.4.1\net\ipv4\udp.c */
/* udp_sendmsg(): sends application-layer data */
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct inet_sock *inet = inet_sk(sk);
struct udp_sock *up = udp_sk(sk);
DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
struct flowi4 fl4_stack;
struct flowi4 *fl4;
int ulen = len;
struct ipcm_cookie ipc;
struct rtable *rt = NULL;
int free = 0;
int connected = 0;
__be32 daddr, faddr, saddr;
__be16 dport;
u8 tos;
int err, is_udplite = IS_UDPLITE(sk);
/* the local corkreq flag depends on several factors; it is passed on to ip_append_data() to say whether buffering should be used */
int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
struct sk_buff *skb;
struct ip_options_data opt_copy;
/* payloads larger than 64K are rejected */
if (len > 0xFFFF)
return -EMSGSIZE;
/*
* Check the flags.
*
* UDP does not support sending out-of-band data
*/
if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
return -EOPNOTSUPP;
getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
fl4 = &inet->cork.fl.u.ip4;
/* UDP output is already in progress (corked) */
if (up->pending) {
/*
* There are pending frames.
* The socket lock must be held while it's corked.
*/
lock_sock(sk);
if (likely(up->pending)) {
if (unlikely(up->pending != AF_INET)) {
release_sock(sk);
return -EINVAL;
}
goto do_append_data;/* really corked: jump straight to do_append_data to handle the UDP data */
}
release_sock(sk);
}
ulen += sizeof(struct udphdr);/* total length of the UDP datagram */
/*
* Get and verify the address.
*/
if (usin) {
if (msg->msg_namelen < sizeof(*usin))/* validate the destination address length */
return -EINVAL;
if (usin->sin_family != AF_INET) { /* check the address family */
if (usin->sin_family != AF_UNSPEC)
return -EAFNOSUPPORT;
}
/* cache the destination address and port */
daddr = usin->sin_addr.s_addr;
dport = usin->sin_port;
if (dport == 0)
return -EINVAL;
} else {
if (sk->sk_state != TCP_ESTABLISHED)
return -EDESTADDRREQ;
daddr = inet->inet_daddr;
dport = inet->inet_dport;
/* Open fast path for connected socket.
Route will not be used, if at least one option is set.
*/
connected = 1;
}
ipcm_init_sk(&ipc, inet);
ipc.gso_size = up->gso_size;
if (msg->msg_controllen) {
err = udp_cmsg_send(sk, msg, &ipc.gso_size);
if (err > 0)
err = ip_cmsg_send(sk, msg, &ipc,
sk->sk_family == AF_INET6);
if (unlikely(err < 0)) {
kfree(ipc.opt);
return err;
}
if (ipc.opt)
free = 1;
connected = 0;
}
if (!ipc.opt) {
struct ip_options_rcu *inet_opt;
rcu_read_lock();
inet_opt = rcu_dereference(inet->inet_opt);
if (inet_opt) {
memcpy(&opt_copy, inet_opt,
sizeof(*inet_opt) + inet_opt->opt.optlen);
ipc.opt = &opt_copy.opt;
}
rcu_read_unlock();
}
if (cgroup_bpf_enabled && !connected) {
err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
(struct sockaddr *)usin, &ipc.addr);
if (err)
goto out_free;
if (usin) {
if (usin->sin_port == 0) {
/* BPF program set invalid port. Reject it. */
err = -EINVAL;
goto out_free;
}
daddr = usin->sin_addr.s_addr;
dport = usin->sin_port;
}
}
saddr = ipc.addr;
ipc.addr = faddr = daddr;
if (ipc.opt && ipc.opt->opt.srr) {
if (!daddr) {
err = -EINVAL;
goto out_free;
}
faddr = ipc.opt->opt.faddr;
connected = 0;
}
tos = get_rttos(&ipc, inet);
if (sock_flag(sk, SOCK_LOCALROUTE) ||
(msg->msg_flags & MSG_DONTROUTE) ||
(ipc.opt && ipc.opt->opt.is_strictroute)) {
tos |= RTO_ONLINK;
connected = 0;
}
if (ipv4_is_multicast(daddr)) {
if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
ipc.oif = inet->mc_index;
if (!saddr)
saddr = inet->mc_addr;
connected = 0;
} else if (!ipc.oif) {
ipc.oif = inet->uc_index;
} else if (ipv4_is_lbcast(daddr) && inet->uc_index) {
/* oif is set, packet is to local broadcast and
* and uc_index is set. oif is most likely set
* by sk_bound_dev_if. If uc_index != oif check if the
* oif is an L3 master and uc_index is an L3 slave.
* If so, we want to allow the send using the uc_index.
*/
if (ipc.oif != inet->uc_index &&
ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk),
inet->uc_index)) {
ipc.oif = inet->uc_index;
}
}
if (connected)
rt = (struct rtable *)sk_dst_check(sk, 0);
if (!rt) {
struct net *net = sock_net(sk);
__u8 flow_flags = inet_sk_flowi_flags(sk);
fl4 = &fl4_stack;
flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, tos,
RT_SCOPE_UNIVERSE, sk->sk_protocol,
flow_flags,
faddr, saddr, dport, inet->inet_sport,
sk->sk_uid);
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
/* look the route up in the routing table */
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
rt = NULL;
if (err == -ENETUNREACH)
IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
goto out;
}
err = -EACCES;
if ((rt->rt_flags & RTCF_BROADCAST) &&
!sock_flag(sk, SOCK_BROADCAST)) /* broadcast destination, but broadcasting is not allowed: bail out */
goto out;
if (connected)
sk_dst_set(sk, dst_clone(&rt->dst));
}
if (msg->msg_flags&MSG_CONFIRM)
goto do_confirm;
back_from_confirm:
saddr = fl4->saddr;/* get the source and destination addresses from the route */
if (!ipc.addr)
daddr = ipc.addr = fl4->daddr;
/* Lockless fast path for the non-corking case. */
if (!corkreq) {
struct inet_cork cork;
skb = ip_make_skb(sk, fl4, getfrag, msg, ulen,
sizeof(struct udphdr), &ipc, &rt,
&cork, msg->msg_flags);
err = PTR_ERR(skb);
if (!IS_ERR_OR_NULL(skb))
/* Transmit the packet */
err = udp_send_skb(skb, fl4, &cork);
goto out;
}
lock_sock(sk);
if (unlikely(up->pending)) {
/* The socket is already corked while preparing it. */
/* ... which is an evident application bug. --ANK */
release_sock(sk);
net_dbg_ratelimited("socket already corked\n");
err = -EINVAL;
goto out;
}
/*
* Now cork the socket to pend data.
*/
fl4 = &inet->cork.fl.u.ip4;
fl4->daddr = daddr;
fl4->saddr = saddr;
fl4->fl4_dport = dport;
fl4->fl4_sport = inet->inet_sport;
up->pending = AF_INET; /* This flag marks that a send is in progress */
do_append_data:
/* Accumulate the total length queued for sending */
up->len += ulen;
/* Hand the data to the IP layer and let it do the fragmentation */
err = ip_append_data(sk, fl4, getfrag, msg, ulen,
sizeof(struct udphdr), &ipc, &rt,
corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
if (err)
udp_flush_pending_frames(sk);
else if (!corkreq)
/* No corking requested (no more data to follow), so call udp_push_pending_frames() to transmit now */
err = udp_push_pending_frames(sk);
else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
up->pending = 0;
release_sock(sk);
out:
ip_rt_put(rt); /* Send complete: drop our reference on the route */
out_free:
if (free)
kfree(ipc.opt);
if (!err)
return len;
/*
* ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. Reporting
* ENOBUFS might not be good (it's not tunable per se), but otherwise
* we don't have a good statistic (IpOutDiscards but it can be too many
* things). We could add another new stat but at least for now that
* seems like overkill.
*/
if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
UDP_INC_STATS(sock_net(sk),
UDP_MIB_SNDBUFERRORS, is_udplite);
}
return err;
do_confirm:
/* We get here when the send had MSG_CONFIRM set.
 * MSG_PROBE means the call only probes the path (e.g. for path MTU
 * discovery) without actually sending data; if it is absent, or if there
 * is payload, jump back and transmit.
 */
if (msg->msg_flags & MSG_PROBE)
dst_confirm_neigh(&rt->dst, &fl4->daddr);
if (!(msg->msg_flags&MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
goto out;
}
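The corking branches above (corkreq, up->pending, do_append_data) are driven from userspace. Below is a minimal sketch, with a hypothetical socket fd and destination, of how an application triggers that path; clearing UDP_CORK is what finally pushes the pending frames out:
#include <netinet/in.h>
#include <netinet/udp.h>	/* UDP_CORK */
#include <sys/socket.h>

/* Cork the socket, queue two partial payloads, then uncork: both
 * sendto() calls take the do_append_data path in udp_sendmsg(), and
 * one UDP datagram leaves the host only when the cork is removed. */
static int send_corked(int fd, const struct sockaddr_in *dst)
{
	int on = 1, off = 0;

	if (setsockopt(fd, IPPROTO_UDP, UDP_CORK, &on, sizeof(on)) < 0)
		return -1;
	sendto(fd, "part1-", 6, 0, (const struct sockaddr *)dst, sizeof(*dst));
	sendto(fd, "part2", 5, 0, (const struct sockaddr *)dst, sizeof(*dst));
	/* Clearing the cork flushes the pending frames as one datagram. */
	return setsockopt(fd, IPPROTO_UDP, UDP_CORK, &off, sizeof(off));
}
Passing MSG_MORE on each send has the same effect on corkreq as setting UDP_CORK.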
/* File: linux-5.4.1\net\ipv4\udp.c */
static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
struct inet_cork *cork)
{
struct sock *sk = skb->sk;
struct inet_sock *inet = inet_sk(sk);
struct udphdr *uh;
int err = 0;
int is_udplite = IS_UDPLITE(sk);
int offset = skb_transport_offset(skb);
int len = skb->len - offset;
int datalen = len - sizeof(*uh);
__wsum csum = 0;
/*
* Create a UDP header
*/
uh = udp_hdr(skb);
uh->source = inet->inet_sport;
uh->dest = fl4->fl4_dport;
uh->len = htons(len);
uh->check = 0;
if (cork->gso_size) {
const int hlen = skb_network_header_len(skb) +
sizeof(struct udphdr);
if (hlen + cork->gso_size > cork->fragsize) {
kfree_skb(skb);
return -EINVAL;
}
if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS) {
kfree_skb(skb);
return -EINVAL;
}
if (sk->sk_no_check_tx) {
kfree_skb(skb);
return -EINVAL;
}
if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite ||
dst_xfrm(skb_dst(skb))) {
kfree_skb(skb);
return -EIO;
}
if (datalen > cork->gso_size) {
skb_shinfo(skb)->gso_size = cork->gso_size;
skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen,
cork->gso_size);
}
goto csum_partial;
}
if (is_udplite) /* UDP-Lite */
csum = udplite_csum(skb);
else if (sk->sk_no_check_tx) { /* UDP csum off */
skb->ip_summed = CHECKSUM_NONE;
goto send;
} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
csum_partial:
udp4_hwcsum(skb, fl4->saddr, fl4->daddr);
goto send;
} else
csum = udp_csum(skb);
/* add protocol-dependent pseudo-header */
uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len,
sk->sk_protocol, csum);
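	/* In one's-complement arithmetic 0x0000 and 0xFFFF both encode zero,
	 * and RFC 768 reserves a transmitted checksum of 0 to mean "sender
	 * computed no checksum", so a genuinely zero result goes out as
	 * 0xFFFF (CSUM_MANGLED_0). */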
if (uh->check == 0)
uh->check = CSUM_MANGLED_0;
send:
err = ip_send_skb(sock_net(sk), skb);
if (err) {
if (err == -ENOBUFS && !inet->recverr) {
UDP_INC_STATS(sock_net(sk),
UDP_MIB_SNDBUFERRORS, is_udplite);
err = 0;
}
} else
UDP_INC_STATS(sock_net(sk),
UDP_MIB_OUTDATAGRAMS, is_udplite);
return err;
}
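The cork->gso_size branch at the top of udp_send_skb() is enabled from userspace through the UDP_SEGMENT socket option (or the per-call cmsg handled by udp_cmsg_send() in udp_sendmsg()). A minimal sketch, assuming a kernel with UDP GSO support and a hypothetical already-created socket fd:
#include <netinet/in.h>
#include <netinet/udp.h>
#include <sys/socket.h>

#ifndef UDP_SEGMENT
#define UDP_SEGMENT 103		/* from include/uapi/linux/udp.h */
#endif

/* Ask the kernel to slice each large write into 1200-byte UDP payloads:
 * udp_send_skb() then marks the skb SKB_GSO_UDP_L4 and defers the actual
 * segmentation to the GSO layer or the NIC. */
static int enable_udp_gso(int fd)
{
	int gso_size = 1200;

	return setsockopt(fd, IPPROTO_UDP, UDP_SEGMENT,
			  &gso_size, sizeof(gso_size));
}
The sanity checks in the gso_size block above then apply: the segment size plus headers must fit within cork->fragsize, the total may not exceed UDP_MAX_SEGMENTS segments, and hardware checksumming (CHECKSUM_PARTIAL) is required.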
/* File: linux-5.4.1\net\ipv4\ip_output.c */
int ip_send_skb(struct net *net, struct sk_buff *skb)
{
int err;
err = ip_local_out(net, skb->sk, skb);
if (err) {
if (err > 0)
err = net_xmit_errno(err);
if (err)
IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
}
return err;
}
/* File: linux-5.4.1\net\ipv4\ip_output.c */
/* From ip_local_out() the packet eventually reaches the device-layer transmit routine dev_queue_xmit() */
int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
int err;
err = __ip_local_out(net, sk, skb);
if (likely(err == 1))
err = dst_output(net, sk, skb);
return err;
}
/* Called once the IP header has been filled in */
int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct iphdr *iph = ip_hdr(skb);
iph->tot_len = htons(skb->len);
ip_send_check(iph);
/* if egress device is enslaved to an L3 master device pass the
* skb to its handler for processing
*
 * This is how VRF (l3mdev) transmit is implemented.
*/
skb = l3mdev_ip_out(sk, skb);
if (unlikely(!skb))
return 0;
skb->protocol = htons(ETH_P_IP);
return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
net, sk, skb, NULL, skb_dst(skb)->dev,
dst_output);
}
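The nf_hook() call above is where every locally generated IPv4 packet meets the NF_INET_LOCAL_OUT hook described earlier. For illustration, a minimal (hypothetical) kernel-module sketch that registers its own function at the same hook point:
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <net/net_namespace.h>

/* Runs at NF_INET_LOCAL_OUT, i.e. the point __ip_local_out() passes
 * through above. NF_ACCEPT lets the packet continue to dst_output(). */
static unsigned int my_local_out_hook(void *priv, struct sk_buff *skb,
				      const struct nf_hook_state *state)
{
	return NF_ACCEPT;
}

static struct nf_hook_ops my_ops = {
	.hook		= my_local_out_hook,
	.pf		= NFPROTO_IPV4,
	.hooknum	= NF_INET_LOCAL_OUT,
	.priority	= NF_IP_PRI_FILTER,	/* same slot as the filter table */
};

static int __init my_init(void)
{
	return nf_register_net_hook(&init_net, &my_ops);
}

static void __exit my_exit(void)
{
	nf_unregister_net_hook(&init_net, &my_ops);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");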
/* File: linux-5.4.1\include\net\dst.h */
/* Output packet to network from transport.
 * The output() callback is:
 * 1. ip_output() for unicast packets;
 * 2. ip_mc_output() for multicast packets.
 */
static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
return skb_dst(skb)->output(net, sk, skb);
}
/* File: linux-5.4.1\net\ipv4\ip_output.c */
/* Unicast packets */
int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb_dst(skb)->dev;
IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
/* After netfilter processing, ip_finish_output() continues the IP packet output */
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
net, sk, skb, NULL, dev,
ip_finish_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
/* File: linux-5.4.1\net\ipv4\ip_output.c */
static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
int ret;
ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
switch (ret) {
case NET_XMIT_SUCCESS:
return __ip_finish_output(net, sk, skb);
case NET_XMIT_CN:
return __ip_finish_output(net, sk, skb) ? : ret;
default:
kfree_skb(skb);
return ret;
}
}
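BPF_CGROUP_RUN_PROG_INET_EGRESS() gives a cgroup-attached BPF program a veto over each egress IP packet before __ip_finish_output() runs. A minimal sketch of such a program (restricted C built with clang/libbpf; the runt-packet policy is made up purely for illustration):
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Attached with attach type BPF_CGROUP_INET_EGRESS. Returning 1 lets
 * ip_finish_output() proceed; returning 0 routes the skb into the
 * default: branch above, i.e. kfree_skb(). */
SEC("cgroup_skb/egress")
int drop_runt_packets(struct __sk_buff *skb)
{
	if (skb->len < 64)	/* hypothetical policy: drop runt packets */
		return 0;	/* drop */
	return 1;		/* allow */
}

char _license[] SEC("license") = "GPL";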
static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
unsigned int mtu;
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
if (skb_dst(skb)->xfrm) {
IPCB(skb)->flags |= IPSKB_REROUTED;
return dst_output(net, sk, skb);
}
#endif
mtu = ip_skb_dst_mtu(sk, skb);
/* GSO packets go through ip_finish_output_gso() */
if (skb_is_gso(skb))
return ip_finish_output_gso(net, sk, skb, mtu);
/* If the packet is longer than the MTU, call ip_fragment() to split the
 * IP packet.
 *
 * Even without TSO/GSO, TCP builds its skbs according to the MSS, so
 * skb->len stays within the MTU: TCP segmentation is not the same thing
 * as IP fragmentation, and in practice only UDP takes the IP
 * fragmentation path here.
 */
if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
return ip_fragment(net, sk, skb, mtu, ip_finish_output2);
return ip_finish_output2(net, sk, skb);
}
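The fragmentation arithmetic ip_fragment() applies is worth making concrete: each fragment's data length must be a multiple of 8 bytes, because the IP header's fragment-offset field counts 8-byte units. A small userspace sketch of the layout, assuming a 20-byte IP header with no options:
#include <stdio.h>

/* Print the fragments a single datagram would be carved into.
 * Example: payload_len = 4000, mtu = 1500 yields data lengths
 * 1480 + 1480 + 1040 at offsets 0, 185 and 370 (in 8-byte units). */
static void show_fragments(unsigned int payload_len, unsigned int mtu)
{
	const unsigned int ihl = 20;			/* header, no options */
	unsigned int max_data = (mtu - ihl) & ~7U;	/* multiple of 8 */
	unsigned int off = 0;

	while (payload_len > 0) {
		unsigned int data = payload_len > max_data ? max_data
							   : payload_len;
		printf("frag: offset=%u, %u data bytes, MF=%d\n",
		       off / 8, data, payload_len > data);
		off += data;
		payload_len -= data;
	}
}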
/* File: linux-5.4.1\net\ipv4\ip_output.c */
/*
 * This function hands the packet to the network device via the neighbour
 * subsystem. It first calls __ipv4_neigh_lookup_noref() to look the
 * neighbour up in the neighbour table; if none is found, __neigh_create()
 * builds a new entry.
 */
static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
struct rtable *rt = (struct rtable *)dst;
struct net_device *dev = dst->dev;
unsigned int hh_len = LL_RESERVED_SPACE(dev);
struct neighbour *neigh;
bool is_v6gw = false;
if (rt->rt_type == RTN_MULTICAST) {
IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len);
} else if (rt->rt_type == RTN_BROADCAST)
IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len);
/* Be paranoid, rather than too clever. */
if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
struct sk_buff *skb2;
skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
if (!skb2) {
kfree_skb(skb);
return -ENOMEM;
}
if (skb->sk)
skb_set_owner_w(skb2, skb->sk);
consume_skb(skb);
skb = skb2;
}
if (lwtunnel_xmit_redirect(dst->lwtstate)) {
int res = lwtunnel_xmit(skb);
if (res < 0 || res == LWTUNNEL_XMIT_DONE)
return res;
}
rcu_read_lock_bh();
/* Look the neighbour up in the neighbour table */
neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
/*
 * If the link-layer header is cached, neigh_hh_output() transmits the
 * packet. Otherwise, if a neighbour entry exists, the packet goes out
 * through that entry's output function.
 * Either way it ends up in the L2 routine dev_queue_xmit().
 */
if (!IS_ERR(neigh)) {
int res;
sock_confirm_neigh(skb, neigh);
/* if crossing protocols, can not use the cached header */
res = neigh_output(neigh, skb, is_v6gw);
rcu_read_unlock_bh();
return res;
}
rcu_read_unlock_bh();
net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
__func__);
kfree_skb(skb);
return -EINVAL;
}
/* File: linux-5.4.1\include\net\neighbour.h */
static inline int neigh_output(struct neighbour *n, struct sk_buff *skb,
bool skip_cache)
{
const struct hh_cache *hh = &n->hh;
/* If the neighbour is connected and the cached header is populated */
if ((n->nud_state & NUD_CONNECTED) && hh->hh_len && !skip_cache)
return neigh_hh_output(hh, skb);
else
return n->output(n, skb); /* Taken initially, while n->output still points at neigh_resolve_output() */
}
static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
{
unsigned int hh_alen = 0;
unsigned int seq;
unsigned int hh_len;
do {
seq = read_seqbegin(&hh->hh_lock);
hh_len = hh->hh_len;
if (likely(hh_len <= HH_DATA_MOD)) {
hh_alen = HH_DATA_MOD;
/* skb_push() would proceed silently if we have room for
* the unaligned size but not for the aligned size:
* check headroom explicitly.
*/
if (likely(skb_headroom(skb) >= HH_DATA_MOD)) {
/* this is inlined by gcc */
memcpy(skb->data - HH_DATA_MOD, hh->hh_data,
HH_DATA_MOD);
}
} else {
hh_alen = HH_DATA_ALIGN(hh_len);
if (likely(skb_headroom(skb) >= hh_alen)) {
memcpy(skb->data - hh_alen, hh->hh_data,
hh_alen);
}
}
} while (read_seqretry(&hh->hh_lock, seq));
if (WARN_ON_ONCE(skb_headroom(skb) < hh_alen)) {
kfree_skb(skb);
return NET_XMIT_DROP;
}
__skb_push(skb, hh_len);
return dev_queue_xmit(skb);
}
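The HH_DATA_MOD handling above exists so that the common case, a 14-byte Ethernet header, can be copied as a single aligned 16-byte block. The alignment arithmetic (constants as in include/net/neighbour.h):
#define HH_DATA_MOD	16
#define HH_DATA_ALIGN(len) \
	(((len) + (HH_DATA_MOD - 1)) & ~(HH_DATA_MOD - 1))

/* Ethernet: hh_len = 14 <= HH_DATA_MOD, so the fast path copies a fixed
 * 16 bytes; a longer header, e.g. 18 bytes with a VLAN tag, takes the
 * slow path and copies HH_DATA_ALIGN(18) = 32 bytes. */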
/* File: linux-5.4.1\net\core\dev.c */
int dev_queue_xmit(struct sk_buff *skb)
{
return __dev_queue_xmit(skb, NULL);
}
/**
* __dev_queue_xmit - transmit a buffer
* @skb: buffer to transmit
* @sb_dev: suboordinate device used for L2 forwarding offload
*
* Queue a buffer for transmission to a network device. The caller must
* have set the device and priority and built the buffer before calling
* this function. The function can be called from an interrupt.
*
* A negative errno code is returned on a failure. A success does not
* guarantee the frame will be transmitted as it may be dropped due
* to congestion or traffic shaping.
*
* -----------------------------------------------------------------------------------
* I notice this method can also return errors from the queue disciplines,
* including NET_XMIT_DROP, which is a positive value. So, errors can also
* be positive.
*
* Regardless of the return value, the skb is consumed, so it is currently
* difficult to retry a send to this method. (You can bump the ref count
* before sending to hold a reference for retry if you are careful.)
*
* When calling this method, interrupts MUST be enabled. This is because
* the BH enable code must have IRQs enabled so that it will not deadlock.
* --BLG
*
 *
 * This is the unified transmit interface that the network core offers to
 * the protocol layers: IP, ARP and the other upper-layer protocols all
 * hand their outgoing data to the core through this function.
 *
 * If traffic control is supported, the outgoing packet is enqueued
 * according to the qdisc rules and the TX softirq is raised at a suitable
 * time to dequeue packets and send them out through the device; without
 * traffic control, the packet is transmitted on the device directly.
 * A failed submission returns the corresponding error code, yet success
 * does not guarantee transmission either, since traffic control may still
 * drop the packet under congestion.
 * dev_queue_xmit() must be called with interrupts enabled, because only
 * then can the bottom half be activated.
 */
static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
{
struct net_device *dev = skb->dev;
struct netdev_queue *txq;
struct Qdisc *q;
int rc = -ENOMEM;
bool again = false;
skb_reset_mac_header(skb);
if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
/* Disable soft irqs for various locks below. Also
* stops preemption for RCU.
*/
rcu_read_lock_bh();
skb_update_prio(skb);
qdisc_pkt_len_init(skb);
#ifdef CONFIG_NET_CLS_ACT
skb->tc_at_ingress = 0;
# ifdef CONFIG_NET_EGRESS
if (static_branch_unlikely(&egress_needed_key)) {
skb = sch_handle_egress(skb, &rc, dev);
if (!skb)
goto out;
}
# endif
#endif
/* If device/qdisc don't need skb->dst, release it right now while
* its hot in this cpu cache.
*/
if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
skb_dst_drop(skb);
else
skb_dst_force(skb);
txq = netdev_core_pick_tx(dev, skb, sb_dev);
/* This in effect fetches the qdisc from net_device->netdev_queue, i.e.
 * the device's root qdisc. Physical NICs use a FIFO qdisc by default, so
 * the enqueue member is non-NULL; only virtual devices may lack one.
 */
q = rcu_dereference_bh(txq->qdisc);
trace_net_dev_queue(skb);
/*
 * If the qdisc has an enqueue function, enqueue the packet:
 *
 * the packet is inserted into the queue according to the queuing
 * discipline, traffic shaping is applied, and the queue is scheduled
 * to transmit; then return.
 */
if (q->enqueue) {
rc = __dev_xmit_skb(skb, q, dev, txq);
goto out;
}
/* The device has no queue. Common case for software devices:
* loopback, all the sorts of tunnels...
* Really, it is unlikely that netif_tx_lock protection is necessary
* here. (f.e. loopback and IP tunnels are clean ignoring statistics
* counters.)
* However, it is possible, that they rely on protection
* made by us here.
* Check this and shot the lock. It is not prone from deadlocks.
* Either shot noqueue qdisc, it is even simpler 8)
*
 * If the device is up but has no qdisc (no QoS), output the packet directly
*/
if (dev->flags & IFF_UP) {
int cpu = smp_processor_id(); /* ok because BHs are off */
if (txq->xmit_lock_owner != cpu) {
if (dev_xmit_recursion())
goto recursion_alert;
/* Validate the skb before handing it to the driver */
skb = validate_xmit_skb(skb, dev, &again);
if (!skb)
goto out;
HARD_TX_LOCK(dev, txq, cpu);
if (!netif_xmit_stopped(txq)) {
dev_xmit_recursion_inc();
skb = dev_hard_start_xmit(skb, dev, txq, &rc);
dev_xmit_recursion_dec();
if (dev_xmit_complete(rc)) {
HARD_TX_UNLOCK(dev, txq);
goto out;
}
}
HARD_TX_UNLOCK(dev, txq);
net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
dev->name);
} else {
/* Recursion is detected! It is possible,
* unfortunately
*/
recursion_alert:
net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
dev->name);
}
}
rc = -ENETDOWN;
rcu_read_unlock_bh();
atomic_long_inc(&dev->tx_dropped);
kfree_skb_list(skb);
return rc;
out:
rcu_read_unlock_bh();
return rc;
}
/* File: linux-5.4.1\net\core\dev.c */
/* dev_hard_start_xmit() hands the outgoing packets to the network device's
 * transmit interface, completing the packet output.
 *
 * The skb arrives here via ip_local_out(), with the IP layer and everything
 * above it already encapsulated; from this function on, the L2 transmit
 * path is taken.
 *
 * xmit_one() does the actual transmit.
 */
struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
struct netdev_queue *txq, int *ret)
{
struct sk_buff *skb = first;
int rc = NETDEV_TX_OK;
while (skb) {
struct sk_buff *next = skb->next;
skb_mark_not_on_list(skb);
rc = xmit_one(skb, dev, txq, next != NULL);
if (unlikely(!dev_xmit_complete(rc))) {
skb->next = next;
goto out;
}
skb = next;
if (netif_tx_queue_stopped(txq) && skb) {
rc = NETDEV_TX_BUSY;
break;
}
}
out:
*ret = rc;
return skb;
}
/* File: linux-5.4.1\net\core\dev.c */
/* Called from dev_hard_start_xmit();
 * transmits the data via netdev_start_xmit().
 */
static int xmit_one(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq, bool more)
{
unsigned int len;
int rc;
if (dev_nit_active(dev))
dev_queue_xmit_nit(skb, dev);
len = skb->len;
trace_net_dev_start_xmit(skb, dev);
rc = netdev_start_xmit(skb, dev, txq, more);
trace_net_dev_xmit(skb, rc, dev, len);
return rc;
}
/* File: linux-5.4.1\include\linux\netdevice.h */
static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq, bool more)
{
const struct net_device_ops *ops = dev->netdev_ops;
netdev_tx_t rc;
rc = __netdev_start_xmit(ops, skb, dev, more);
if (rc == NETDEV_TX_OK)
txq_trans_update(txq);
return rc;
}
static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops,
struct sk_buff *skb, struct net_device *dev,
bool more)
{
__this_cpu_write(softnet_data.xmit.more, more);
return ops->ndo_start_xmit(skb, dev);
}
Finally,
ops->ndo_start_xmit(skb, dev);
is resolved through the driver of the specific network device; for example, in the e1000e driver:
/* File: linux-5.4.1\drivers\net\ethernet\intel\e1000e\netdev.c */
static const struct net_device_ops e1000e_netdev_ops = {
	.ndo_open		= e1000e_open,
	.ndo_stop		= e1000e_close,
	.ndo_start_xmit		= e1000_xmit_frame, /* the driver function this pointer resolves to */
	.ndo_get_stats64	= e1000e_get_stats64,
	.ndo_set_rx_mode	= e1000e_set_rx_mode,
	.ndo_set_mac_address	= e1000_set_mac,
	.ndo_change_mtu		= e1000_change_mtu,
	.ndo_do_ioctl		= e1000_ioctl,
	.ndo_tx_timeout		= e1000_tx_timeout,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_vlan_rx_add_vid	= e1000_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid	= e1000_vlan_rx_kill_vid,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller	= e1000_netpoll,
#endif
	.ndo_set_features	= e1000_set_features,
	.ndo_fix_features	= e1000_fix_features,
	.ndo_features_check	= passthru_features_check,
};
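e1000_xmit_frame() itself is hardware-specific, but every .ndo_start_xmit implementation follows the same contract: consume the skb and return NETDEV_TX_OK, or stop the queue and return NETDEV_TX_BUSY when the hardware cannot take more. A stripped-down, purely hypothetical sketch of that contract:
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Hypothetical minimal transmit routine. A real driver would map the
 * skb for DMA and post it to a hardware TX ring instead of freeing it. */
static netdev_tx_t toy_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	bool ring_full = false;		/* stand-in for a TX-ring check */

	if (ring_full) {
		/* Tell the core to stop feeding us; the qdisc requeues. */
		netif_stop_queue(dev);
		return NETDEV_TX_BUSY;
	}

	dev->stats.tx_packets++;
	dev->stats.tx_bytes += skb->len;
	dev_kfree_skb_any(skb);		/* consumed: "transmitted" */
	return NETDEV_TX_OK;
}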