iptable_netfilter介绍以及简单代码分析

前言

Linux kernel 版本:5.4.1

Netfilter特指内核中的netfilter框架

iptables指用户空间的配置工具

概念

Linux 上最常用的防火墙工具是 iptables。iptables 与协议栈内有包过滤功能的 hook 交 互来完成工作。这些内核 hook 构成了 netfilter 框架。

每个进入网络系统的包(接收或发送)在经过协议栈时都会触发这些 hook,程序 可以通过注册 hook 函数的方式在一些关键路径上处理网络流量。iptables 相关的内核模 块在这些 hook 点注册了处理函数,因此可以通过配置 iptables 规则来使得网络流量符合 防火墙规则。

Iptable/netfilter的表和链(Table and Chains)

Iptable/netfilter使用table来组织规则,根据**用来做什么**的判断标准,将规则分为不同table。

Table名 作用描述
Filter 用于判断是否允许一个包通过,最常用的table。
NAT 用于实现网络地址转换规则,一般用于将数据包路由到无法直接访问的网络。
Mangle 用于修改包的IP头。例如,可以修改包的TTL。
Raw 控制数据包不被connecting tracking所追踪。
Security 用于给包打上SELinux标记,以此影响SELinux或其他可以解读SELinux安全上线问的系统处理包的行为。

而在每个table内部,规则被进一步组织成chain,内置的chain是由内置的hook触发的。chain基本上能决定规则**何时**被触发。

Chain名称 HOOK宏
PREROUTING NF_INET_PRE_ROUTING
INPUT NF_INET_LOCAL_IN
FORWARD NF_INET_FORWARD
OUTPUT NF_INET_LOCAL_OUT
POSTROUTING NF_INET_POST_ROUTING

每个table里面的chain不是相同的,即数据包在不同的处理流程的位置,Iptable/netfilter的处理方法不一样,具体见每个table的宏定义。

  • Filter
/* File: linux-5.4.1\net\ipv4\netfilter\iptable_filter.c */
#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
			    (1 << NF_INET_FORWARD) | \
			    (1 << NF_INET_LOCAL_OUT))
  • NAT
/* File: linux-5.4.1\net\ipv4\netfilter\iptable_nat.c */
static const struct xt_table nf_nat_ipv4_table = {
	.name		= "nat",
	.valid_hooks	= (1 << NF_INET_PRE_ROUTING) |
			  (1 << NF_INET_POST_ROUTING) |
			  (1 << NF_INET_LOCAL_OUT) |
			  (1 << NF_INET_LOCAL_IN),
	.me		= THIS_MODULE,
	.af		= NFPROTO_IPV4,
	.table_init	= iptable_nat_table_init,
};
  • Mangle
/* File: linux-5.4.1\net\ipv4\netfilter\iptable_mangle.c */
#define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
			    (1 << NF_INET_LOCAL_IN) | \
			    (1 << NF_INET_FORWARD) | \
			    (1 << NF_INET_LOCAL_OUT) | \
			    (1 << NF_INET_POST_ROUTING))
  • Raw
/* File: X:\linux-5.4.1\net\ipv4\netfilter\iptable_raw.c */
#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
  • Security
/* File: X:\linux-5.4.1\net\ipv4\netfilter\iptable_security.c */
#define SECURITY_VALID_HOOKS	(1 << NF_INET_LOCAL_IN) | \
				(1 << NF_INET_FORWARD) | \
				(1 << NF_INET_LOCAL_OUT)
Chains/Table Filter NAT Mangle Raw Security
PREROUTING Y Y Y
INPUT Y Y Y Y
FORWARD Y Y Y
OUTPUT Y Y Y Y Y
POSTROUTING Y Y

当一个包触发 netfilter hook 时,处理过程存在先后顺序和处理条件。 触发哪个 hook (列)和包的方向(ingress/egress)、路由判断、过滤条件等相关。

特定事件会导致 table 的 chain 被跳过。例如,只有每个连接的第一个包会去匹配 NAT 规则,对这个包的动作会应用于此连接后面的所有包。到这个连接的应答包会被自动应用反 方向的 NAT 规则。

具体的包触发流程如下:

iptable_netfilter介绍以及简单代码分析_第1张图片

这里的ct表示的是conntrack

netfilter钩子(hooks)

NF_HOOK()函数

NF_HOOK是协议和Netfilter框架的切入点, 完成netfilter模块的处理后,调用int (*okfn)(struct net *, struct sock *, struct sk_buff *)进行后续处理流程。

而在netfilter模块的处理中,主要的就是对iptables所配置的规则进行匹配,如果这个chain上所有的规则匹配均通过,则可以执行后续的函数。

/* File:linux-5.4.1\include\linux\netfilter.h */
static inline int
NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *in, struct net_device *out,
	int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
	int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn);
    /* 根据nf_hook()的返回值,决定要不要执行后续函数 */
	if (ret == 1)
		ret = okfn(net, sk, skb);
	return ret;
}

nf_hook()函数

/* File:linux-5.4.1\include\linux\netfilter.h */
/**
 *	nf_hook - call a netfilter hook
 *
 *	Returns 1 if the hook has allowed the packet to pass.  The function
 *	okfn must be invoked by the caller in this case.  Any other return
 *	value indicates the packet has been consumed by the hook.
 *
 * 
 *  调用hook函数,如果该函数允许数据包通过,返回1
 *  在这种情况下,调用方必须调用函数okfn。任何其他的返回值都表示数据包已被hook使用。
 */
static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
			  struct sock *sk, struct sk_buff *skb,
			  struct net_device *indev, struct net_device *outdev,
			  int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
	struct nf_hook_entries *hook_head = NULL;
	int ret = 1;

#ifdef CONFIG_JUMP_LABEL
	if (__builtin_constant_p(pf) &&
	    __builtin_constant_p(hook) &&
	    !static_key_false(&nf_hooks_needed[pf][hook]))
		return 1;
#endif

	rcu_read_lock();
    /* 判断网络层协议 */
	switch (pf) {
	case NFPROTO_IPV4:
		hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]);
		break;
	case NFPROTO_IPV6:
		hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]);
		break;
	case NFPROTO_ARP:
#ifdef CONFIG_NETFILTER_FAMILY_ARP
		if (WARN_ON_ONCE(hook >= ARRAY_SIZE(net->nf.hooks_arp)))
			break;
		hook_head = rcu_dereference(net->nf.hooks_arp[hook]);
#endif
		break;
	case NFPROTO_BRIDGE:
#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
		hook_head = rcu_dereference(net->nf.hooks_bridge[hook]);
#endif
		break;
#if IS_ENABLED(CONFIG_DECNET)
	case NFPROTO_DECNET:
		hook_head = rcu_dereference(net->nf.hooks_decnet[hook]);
		break;
#endif
	default:
		WARN_ON_ONCE(1);
		break;
	}

	if (hook_head) {
		struct nf_hook_state state;

		nf_hook_state_init(&state, hook, pf, indev, outdev,
				   sk, net, okfn);

		ret = nf_hook_slow(skb, &state, hook_head, 0); 
		// 返回1 ,代表NF_ACCEPT,okfn()需要被别的caller执行
		// 返回-EPERM,代表NF_DROP,也就是NF_HOOK中,不再执行okfn()函数
	}
	rcu_read_unlock();

	return ret;
}

nf_hook_slow()

具体hook函数中的规则匹配流程,见《规则匹配流程》小节

/* File: linux-5.4.1\net\netfilter\core.c */
/* Returns 1 if okfn() needs to be executed by the caller,
 * -EPERM for NF_DROP, 0 otherwise.  Caller must hold rcu_read_lock. 
 *
 * nf_hook_slow()用于完成成钩子函数遍历执行,也就是规则匹配,这里的执行顺序是按照table优先级执行
 *
 */
int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
		 const struct nf_hook_entries *e, unsigned int s)
{
	unsigned int verdict;
	int ret;

	for (; s < e->num_hook_entries; s++) {
		/* 执行对应的hook函数
	     * 这里的hook函数,是在iptables中的5个table在init的时候挂载的几个
	     * 1、Filter   : iptable_filter_hook();
	     * 2、Mangle   : iptable_mangle_hook();
	     * 3、NAT      : iptable_nat_do_hook();
	     * 4、Raw      : iptable_raw_hook();
	     * 5、Security : iptable_security_hook();
	     */
		verdict = nf_hook_entry_hookfn(&e->hooks[s], skb, state);
		switch (verdict & NF_VERDICT_MASK) {
		case NF_ACCEPT:
			break;
		case NF_DROP:
			kfree_skb(skb);
			ret = NF_DROP_GETERR(verdict);
			if (ret == 0)
				ret = -EPERM;
			return ret;
		case NF_QUEUE:
			ret = nf_queue(skb, state, s, verdict);
			if (ret == 1)
				continue;
			return ret;
		default:
			/* Implicit handling for NF_STOLEN, as well as any other
			 * non conventional verdicts.
			 */
			return 0;
		}
	}

	return 1;
}

Iptables规则匹配

相关结构体

ipt_entry结构体

/* File: linux-5.4.1\include\uapi\linux\netfilter_ipv4\ip_tables.h */
/* This structure defines each of the firewall rules.  Consists of 3
   parts which are 
   1) general IP header stuff 
   2) match specific stuff 
   3) the target to perform if the rule matches 
   
   标准匹配结构,主要包括:
   数据包的源、目的IP
   出/入接口
   掩码等
 */
struct ipt_entry {
    /* 要匹配的报文IP头 */
	struct ipt_ip ip;

	/* Mark with fields that we care about. 
	 * 位向量,标识本规则关心报文的什么部分
	 */
	unsigned int nfcache;

	/* Size of ipt_entry + matches 
 	   target区的偏移,通常target区位于match区之后,而match区则在ipt_entry的末尾;
       初始化为sizeof(struct ipt_entry),即假定没有match
	 */
	__u16 target_offset;
    
	/* Size of ipt_entry + matches + target 
	   下一条规则相对于本规则的偏移,也即本规则所用空间的总和,
        初始化为sizeof(struct ipt_entry)+sizeof(struct ipt_target),即没有match
	 */
	__u16 next_offset;

	/* Back pointer 
	 位向量,标记调用本规则的HOOK号,可用于检查规则的有效性 */
	unsigned int comefrom;

	/* Packet and byte counters. 
	 记录该规则处理过的报文数和报文总字节数 */
	struct xt_counters counters;

	/* The matches (if any), then the target. 
	 target或者是match的起始位置 */
	unsigned char elems[0];
};

ipt_replace结构体

iptables的规则是由iptables的命令创建并添加到对应的链上,用户配置完一条iptables规则之后,传给内核的是一个ipt_replace结构,其中包含了内核所需的所有的内容。

/* File: linux-5.4.1\include\uapi\linux\netfilter_ipv4\ip_tables.h */
/* The argument to IPT_SO_SET_REPLACE. */
/* 用户配置完iptables规则之后,传给内核的时一个ipt-table结构,其中包含了内核所需要的所有内容 */
struct ipt_replace {
	/* Which table.  表名 */
	char name[XT_TABLE_MAXNAMELEN];

	/* Which hook entry points are valid: bitmask.  You can't
           change this. */
	unsigned int valid_hooks;

	/* Number of entries 新的entry数 */
	unsigned int num_entries;

	/* Total size of new entries */
	unsigned int size;

	/* Hook entry points. */
	unsigned int hook_entry[NF_INET_NUMHOOKS];

	/* Underflow points. */
	unsigned int underflow[NF_INET_NUMHOOKS];

	/* Information about old entries: 旧的规则数*/
	/* Number of counters (must be equal to current number of entries). */
	unsigned int num_counters;
	/* The old entries' counters. */
	struct xt_counters __user *counters;

	/* The entries (hang off end: not really an array). */
	/* 规则本身 */
	struct ipt_entry entries[0];
};

ipt_match结构体

ipt_match就是xt_match

#define ipt_match xt_match 

struct xt_match {
	struct list_head list;

	const char name[XT_EXTENSION_MAXNAMELEN];
	u_int8_t revision;

	/* Return true or false: return FALSE and set *hotdrop = 1 to
           force immediate packet drop. */
	/* Arguments changed since 2.6.9, as this must now handle
	   non-linear skb, using skb_header_pointer and
	   skb_ip_make_writable. */
    /* 匹配函数,最重要的部分,返回true表示匹配成功,返回false表示匹配失败 */
	bool (*match)(const struct sk_buff *skb,
		      struct xt_action_param *);

	/* Called when user tries to insert an entry of this type. 
	 在使用本Match的规则注入表中之前调用,进行有效性检查,如果返回0,规则就不会加入iptables中.  */
	int (*checkentry)(const struct xt_mtchk_param *);

	/* Called when entry of this type deleted. 
	 删除包含本match的entry时调用,与checkentry配合可用于动态内存分配和释放 */
	void (*destroy)(const struct xt_mtdtor_param *);
#ifdef CONFIG_COMPAT
	/* Called when userspace align differs from kernel space one */
	void (*compat_from_user)(void *dst, const void *src);
	int (*compat_to_user)(void __user *dst, const void *src);
#endif
	/* Set this to THIS_MODULE if you are a module, otherwise NULL 
	 是否为模块 */
	struct module *me;

	const char *table;
	unsigned int matchsize;
	unsigned int usersize;
#ifdef CONFIG_COMPAT
	unsigned int compatsize;
#endif
	unsigned int hooks;
	unsigned short proto;

	unsigned short family;
};

ipt_target结构体

ipt_target就是xt_target

#define ipt_target 

/* Registration hooks for targets. */
struct xt_target {
	struct list_head list;

	const char name[XT_EXTENSION_MAXNAMELEN];
	u_int8_t revision;

	/* Returns verdict. Argument order changed since 2.6.9, as this
	   must now handle non-linear skbs, using skb_copy_bits and
	   skb_ip_make_writable. 
       target的模块函数,如果需要继续处理则返回IPT_CONTINUE(-1),否则返回NF_ACCEPT、NF_DROP等值,
       它的调用者根据它的返回值来判断如何处理它处理过的报文
	 */
	unsigned int (*target)(struct sk_buff *skb,
			       const struct xt_action_param *);

	/* Called when user tries to insert an entry of this type:
           hook_mask is a bitmask of hooks from which it can be
           called. */
	/* Should return 0 on success or an error code otherwise (-Exxxx). 
	 在使用本Match的规则注入表中之前调用,进行有效性检查,如果返回0,规则就不会加入iptables中 */
	int (*checkentry)(const struct xt_tgchk_param *);

	/* Called when entry of this type deleted. 
	 在包含本Target的规则从表中删除时调用,与checkentry配合可用于动态内存分配和释放 */
	void (*destroy)(const struct xt_tgdtor_param *);
#ifdef CONFIG_COMPAT
	/* Called when userspace align differs from kernel space one */
	void (*compat_from_user)(void *dst, const void *src);
	int (*compat_to_user)(void __user *dst, const void *src);
#endif
	/* Set this to THIS_MODULE if you are a module, otherwise NULL 
	 表示当前Target是否为模块(NULL为否) */
	struct module *me;

	const char *table;
	unsigned int targetsize;
	unsigned int usersize;
#ifdef CONFIG_COMPAT
	unsigned int compatsize;
#endif
	unsigned int hooks;
	unsigned short proto;

	unsigned short family;
};

对应的命令结构

每一条iptables配置的rule都包含了匹配条件(match)部分和动作(target)。当报文途径HOOK点时,Netfilter会逐个遍历挂在该钩子点上的表的rule,若报文满足rule的匹配条件,内核就会执行动作(target)。

iptable_netfilter介绍以及简单代码分析_第2张图片

添加一条规则的流程

在ip_tables.c中,ip_tables_init函数中调用hf_register_sockopts(&ipt_sockopts)注册对应的get和set方法

/* File: linux-5.4.1\net\ipv4\netfilter\ip_tables.c */
static struct nf_sockopt_ops ipt_sockopts = {
	.pf		    = PF_INET,
	.set_optmin	= IPT_BASE_CTL,
	.set_optmax	= IPT_SO_SET_MAX+1,
	.set		= do_ipt_set_ctl,//set方法
#ifdef CONFIG_COMPAT
	.compat_set	= compat_do_ipt_set_ctl,
#endif
	.get_optmin	= IPT_BASE_CTL,
	.get_optmax	= IPT_SO_GET_MAX+1,
	.get		= do_ipt_get_ctl,//get方法
#ifdef CONFIG_COMPAT
	.compat_get	= compat_do_ipt_get_ctl,
#endif
	.owner		= THIS_MODULE,
};

static int __init ip_tables_init(void)
{
	int ret;

	ret = register_pernet_subsys(&ip_tables_net_ops);
	if (ret < 0)
		goto err1;

	/* No one else will be downing sem now, so we won't sleep */
	ret = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
	if (ret < 0)
		goto err2;
	ret = xt_register_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt));
	if (ret < 0)
		goto err4;

	/* Register setsockopt */
	/* 注册一个socket option,这个option用于读或写iptable的配置,
     * 如:Linux的防火墙规则、NAT转换映射最终都是通过这个接口通知内核的
	 */
	ret = nf_register_sockopt(&ipt_sockopts);
	if (ret < 0)
		goto err5;

	return 0;

err5:
	xt_unregister_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt));
err4:
	xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
err2:
	unregister_pernet_subsys(&ip_tables_net_ops);
err1:
	return ret;
}

用户改变iptables规则后,如set方法就是将用户空间传过来的ipt_replace来替换旧的iptables规则。

/* File: linux-5.4.1\net\ipv4\netfilter\ip_tables.c */
/* 将用户空间传过来的ipt_replace来替换旧的iptables规则。该工作在do_replace()函数中完成 */
static int
do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
{
	int ret;

	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	switch (cmd) {
	case IPT_SO_SET_REPLACE:
		ret = do_replace(sock_net(sk), user, len);
		break;

	case IPT_SO_SET_ADD_COUNTERS:
		ret = do_add_counters(sock_net(sk), user, len, 0);
		break;

	default:
		ret = -EINVAL;
	}

	return ret;
}
/* File: linux-5.4.1\net\ipv4\netfilter\ip_tables.c */
static int
do_replace(struct net *net, const void __user *user, unsigned int len)
{
	int ret;
	struct ipt_replace tmp;
	struct xt_table_info *newinfo;
	void *loc_cpu_entry;
	struct ipt_entry *iter;

    /* 拷贝用户空间的struct ip_replace结构 */
	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
		return -EFAULT;

	/* overflow check */
	if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
		return -ENOMEM;
	if (tmp.num_counters == 0)
		return -EINVAL;

	tmp.name[sizeof(tmp.name)-1] = 0;

    /* 分配一个xt_table_info结构,并根据ipt_replace.size
       给xt_table_info的entry成员分配空间 
    */
	newinfo = xt_alloc_table_info(tmp.size);
	if (!newinfo)
		return -ENOMEM;

	loc_cpu_entry = newinfo->entries;
	/* 将ipt_replace.entries的内容拷贝到内核 */
	if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
			   tmp.size) != 0) {
		ret = -EFAULT;
		goto free_newinfo;
	}
    /* 根据ipt_replace给xt_table_info结构各个成员赋值 */
	ret = translate_table(net, newinfo, loc_cpu_entry, &tmp);
	if (ret != 0)
		goto free_newinfo;

    /* 根据ipt_replace的name找到要更新的表,
       用上面的xt_table_info为xt_table赋值
     */
	ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
			   tmp.num_counters, tmp.counters);
	if (ret)
		goto free_newinfo_untrans;
	return 0;

 free_newinfo_untrans:
	xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
		cleanup_entry(iter, net);
 free_newinfo:
	xt_free_table_info(newinfo);
	return ret;
}

最终的结果是更新了内核中的某个xt_table表,如NAT表。

规则匹配流程

iptables中有5种tables,不同table的对应的rule作用不一致,每个tables都有它对应的hook函数,但最终都是走的ip_do_table()函数来匹配规则。

Tables Hook函数
Filter iptable_filter_hook()
NAT iptable_nat_do_chain()
Mangle iptable_mangle_hook()
Raw iptable_raw_hook()
Security iptable_security_hook()
/* File: linux-5.4.1\net\ipv4\netfilter\ip_tables.c  */
/* Returns one of the generic firewall policies, like NF_ACCEPT. */
/* 遍历钩子链上的所有规则,进行标准匹配和扩展匹配,执行其target操作 */
unsigned int
ipt_do_table(struct sk_buff *skb,
	     const struct nf_hook_state *state,
	     struct xt_table *table)
{
	unsigned int hook = state->hook;
	static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
	const struct iphdr *ip;
	/* Initializing verdict to NF_DROP keeps gcc happy. */
	unsigned int verdict = NF_DROP;
	const char *indev, *outdev;
	const void *table_base;
	struct ipt_entry *e, **jumpstack;
	unsigned int stackidx, cpu;
	const struct xt_table_info *private;
	struct xt_action_param acpar;
	unsigned int addend;

	/* Initialization */
	stackidx = 0;
	ip = ip_hdr(skb);
	indev = state->in ? state->in->name : nulldevname;
	outdev = state->out ? state->out->name : nulldevname;
	/* We handle fragments by dealing with the first fragment as
	 * if it was a normal packet.  All other fragments are treated
	 * normally, except that they will NEVER match rules that ask
	 * things we don't know, ie. tcp syn flag or ports).  If the
	 * rule is also a fragment-specific rule, non-fragments won't
	 * match it. */
	acpar.fragoff = ntohs(ip->frag_off) & IP_OFFSET;
	acpar.thoff   = ip_hdrlen(skb);
	acpar.hotdrop = false;
	acpar.state   = state;

	WARN_ON(!(table->valid_hooks & (1 << hook)));
	local_bh_disable();
	addend = xt_write_recseq_begin();
	private = READ_ONCE(table->private); /* Address dependency. */
	cpu        = smp_processor_id();

	/* 首个规则地址 */
	table_base = private->entries;
	jumpstack  = (struct ipt_entry **)private->jumpstack[cpu];

	/* Switch to alternate jumpstack if we're being invoked via TEE.
	 * TEE issues XT_CONTINUE verdict on original skb so we must not
	 * clobber the jumpstack.
	 *
	 * For recursion via REJECT or SYNPROXY the stack will be clobbered
	 * but it is no problem since absolute verdict is issued by these.
	 */
	if (static_key_false(&xt_tee_enabled))
		jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);

    /* 获取对应链上的首个匹配规则 */
	e = get_entry(table_base, private->hook_entry[hook]);

	do {
		const struct xt_entry_target *t;
		const struct xt_entry_match *ematch;
		struct xt_counters *counter;

		WARN_ON(!e);
		/* 标准match */
		if (!ip_packet_match(ip, indev, outdev,
		    &e->ip, acpar.fragoff)) {
 no_match:
 	        /* 未匹配成功,继续下一个规则 */
			e = ipt_next_entry(e);
			continue;
		}
        /* 扩展match */
		xt_ematch_foreach(ematch, e) {
			acpar.match     = ematch->u.kernel.match;
			acpar.matchinfo = ematch->data;
			/* 只要有返回不匹配的,则说明匹配当前规则失败 */
			if (!acpar.match->match(skb, &acpar))
				goto no_match;
		}

		counter = xt_get_this_cpu_counter(&e->counters);
		ADD_COUNTER(*counter, skb->len, 1);

        /* 标准match和扩展match都成功 */

		/* 获取target */
		t = ipt_get_target_c(e);
		WARN_ON(!t->u.kernel.target);

#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
		/* The packet is traced: log it */
		if (unlikely(skb->nf_trace))
			trace_packet(state->net, skb, hook, state->in,
				     state->out, table->name, private, e);
#endif
		/* Standard target? */
        /* 标准target */
		if (!t->u.kernel.target->target) {
			int v;

			v = ((struct xt_standard_target *)t)->verdict;
		    /* 不会跳转到用户自定义规则 */
			if (v < 0) {
				/* Pop from stack? */
			    /* 不是XT_RETURN,则跳出结果 */
				if (v != XT_RETURN) {
					verdict = (unsigned int)(-v) - 1;
					break;
				}

				/* XT_RETURN则继续匹配下一个规则 */
				if (stackidx == 0) {
					e = get_entry(table_base,
					    private->underflow[hook]);
				} else {
					e = jumpstack[--stackidx];
					e = ipt_next_entry(e);
				}
				continue;
			}

			/* 记录跳转规则,以便返回时获取吓一跳规则进行后续匹配 */
			if (table_base + v != ipt_next_entry(e) &&
			    !(e->ip.flags & IPT_F_GOTO)) {
				if (unlikely(stackidx >= private->stacksize)) {
					verdict = NF_DROP;
					break;
				}
				jumpstack[stackidx++] = e;
			}

			/* 获取自定义规则 */
			e = get_entry(table_base, v);
			continue;
		}

        /* 扩展target,执行target回调 */
		acpar.target   = t->u.kernel.target;
		acpar.targinfo = t->data;

		verdict = t->u.kernel.target->target(skb, &acpar);

		/* 需要继续匹配 */
		if (verdict == XT_CONTINUE) {
			/* Target might have changed stuff. */
			ip = ip_hdr(skb);
			e = ipt_next_entry(e);
		}
		/* 跳出处理匹配结果 */
		else {
			/* Verdict */
			break;
		}
	} while (!acpar.hotdrop);

	xt_write_recseq_end(addend);
	local_bh_enable();

	/* drop标记 */
	if (acpar.hotdrop)
		return NF_DROP;
	else return verdict;
}

标准匹配Match

/* File: linux-5.4.1\net\ipv4\netfilter\ip_tables.c */
/* Returns whether matches rule or not. */
/* Performance critical - called for every packet 
 *
 * 标准匹配match
 */
static inline bool
ip_packet_match(const struct iphdr *ip,
		const char *indev,
		const char *outdev,
		const struct ipt_ip *ipinfo,
		int isfrag)
{
	unsigned long ret;
    /* 处理源IP和目的IP */
	if (NF_INVF(ipinfo, IPT_INV_SRCIP,
		    (ip->saddr & ipinfo->smsk.s_addr) != ipinfo->src.s_addr) ||
	    NF_INVF(ipinfo, IPT_INV_DSTIP,
		    (ip->daddr & ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr))
		return false;
    /* 处理输入输出接口 */
	ret = ifname_compare_aligned(indev, ipinfo->iniface, ipinfo->iniface_mask);

	if (NF_INVF(ipinfo, IPT_INV_VIA_IN, ret != 0))
		return false;

	ret = ifname_compare_aligned(outdev, ipinfo->outiface, ipinfo->outiface_mask);

	if (NF_INVF(ipinfo, IPT_INV_VIA_OUT, ret != 0))
		return false;

	/* Check specific protocol 检查协议字段是否匹配*/
	if (ipinfo->proto &&
	    NF_INVF(ipinfo, IPT_INV_PROTO, ip->protocol != ipinfo->proto))
		return false;

	/* If we have a fragment rule but the packet is not a fragment
	 * then we return zero 
	 * 处理分片包的匹配情况
	 */
	if (NF_INVF(ipinfo, IPT_INV_FRAG,
		    (ipinfo->flags & IPT_F_FRAG) && !isfrag))
		return false;
    /* 以上所有匹配均通过,返回true */
	return true;
}

Target流程

什么是target

Targets就是找到匹配的数据包之后怎么办,常见的有下面几种:

  • DROP:直接将数据包丢弃,不再进行后续的处理
  • RETURN: 跳出当前chain,该chain里后续的rule不再执行
  • QUEUE: 将数据包放入用户空间的队列,供用户空间的程序处理
  • ACCEPT: 同意数据包通过,继续执行后续的rule
  • 跳转到其它用户自定义的chain继续执行

当然iptables包含的targets很多很多,但并不是每个表都支持所有的targets,
rule所支持的target由它所在的表和chain以及所开启的扩展功能来决定,具体每个表支持的targets请参考Iptables Targets And Jumps

标准target

/* File: linux-5.4.1\net\ipv4\netfilter\ip_tables.c */
/* for const-correctness */
static inline const struct xt_entry_target *
ipt_get_target_c(const struct ipt_entry *e)
{
	return ipt_get_target((struct ipt_entry *)e);
}

/* File: linux-5.4.1\include\uapi\linux\netfilter_ipv4\ip_tables.h */
/* Helper functions */
static __inline__ struct xt_entry_target *
ipt_get_target(struct ipt_entry *e)
{
	return (void *)e + e->target_offset;
}

IP报文流程

netfilter在整个的IP报文收发流程源码的HOOK点如下图所示:

iptable_netfilter介绍以及简单代码分析_第3张图片

IP报文接受流程

以下这段这里不做研究,我们这里只研究协议栈

硬件网卡->DMA驱动->CPU硬件中断处理(IRQ)->NIC Driver->软中断处理->内核网络模块->协议栈
/* File: linux-5.4.1\net\core\dev.c */
static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
		return -ENOMEM;
	refcount_inc(&skb->users);
    /* 调用ip_rcv() */
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
/* 这个 .func = ip_rcv,是dev调用ip_rcv的接口 */
static struct packet_type ip_packet_type __read_mostly = {
	.type = cpu_to_be16(ETH_P_IP),
	.func = ip_rcv,
	.list_func = ip_list_rcv,
};

接受

ip_rcv()

从net协议栈分发到Ipv4模块时,是由ip_rcv(函数)来处理。

/* File: linux-5.4.1\net\ipv4\ip_input.c */
/*
 * IP receive entry point
 *
 * @skb      接收到的IP数据包
 * @dev      接收到的IP包当前的输入网络设备
 * @pt       输入此数据包的网络层输入接口
 * @orig_dev 接受到的IP数据包原始的输入网络设备
 *
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
	   struct net_device *orig_dev)
{
	struct net *net = dev_net(dev);

	skb = ip_rcv_core(skb, net);    //对收到的报文进行处理,根据处理结果,决定是否drop
	if (skb == NULL)
		return NET_RX_DROP;

    /*  NF_HOOK是该函数和Netfilter框架的切入点
     *  完成netfilter模块的处理后,调用ip_rcv_finish()
	 */
	return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
		       net, NULL, skb, dev, NULL,
		       ip_rcv_finish);
}


/*
 * 	Main IP Receive routine.
 *
 *  主IP接受程序
 *  是从ip_rcv()函数中,把这块的功能剥离出来成一个单独的函数
 */
static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
{
	const struct iphdr *iph;
	u32 len;

	/* When the interface is in promisc. mode, drop all the crap
	 * that it receives, do not try to analyse it.
     *
	 * skb->pkt_type 值的判断是在 eth_type_trans函数(net/ethernet/eth.c)中做判定的
	 * 这里的判定的依据是报文的[目的MAC]来判断的
	 */
	if (skb->pkt_type == PACKET_OTHERHOST)
		goto drop;

	__IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len);

	skb = skb_share_check(skb, GFP_ATOMIC); 
	//共享检查,如果是共享数据包,因为可能要修改skb中的信息,所以要先复制一份副本,再进一步处理
	if (!skb) {
		__IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
		goto out;
	}

	if (!pskb_may_pull(skb, sizeof(struct iphdr))) //检查首部是否够长
		goto inhdr_error;

	iph = ip_hdr(skb);

	/*
	 *	RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
	 *
	 *	Is the datagram acceptable?
	 *
	 *	1.	Length at least the size of an ip header
	 *	2.	Version of 4
	 *	3.	Checksums correctly. [Speed optimisation for later, skip loopback checksums]
	 *	4.	Doesn't have a bogus length
	 */

	if (iph->ihl < 5 || iph->version != 4)
		goto inhdr_error;

	BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1);
	BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0);
	BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE);
	__IP_ADD_STATS(net,
		       IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
		       max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));

	if (!pskb_may_pull(skb, iph->ihl*4))//对数据报的头长度进行检查
		goto inhdr_error;

	iph = ip_hdr(skb);

	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
		goto csum_error;

	len = ntohs(iph->tot_len);
	if (skb->len < len) {
		__IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
		goto drop;
	} else if (len < (iph->ihl*4))
		goto inhdr_error;

	/* Our transport medium may have padded the buffer out. Now we know it
	 * is IP we can trim to the true length of the frame.
	 * Note this now means skb->len holds ntohs(iph->tot_len).
	 *
	 * 根据ip包总长度,重新计算skb的长度,去掉末尾无用信息。
	 */
	if (pskb_trim_rcsum(skb, len)) {
		__IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	iph = ip_hdr(skb);
	skb->transport_header = skb->network_header + iph->ihl*4;

	/* Remove any debris in the socket control block */
	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
	IPCB(skb)->iif = skb->skb_iif;

	/* Must drop socket now because of tproxy. */
	skb_orphan(skb);

	return skb;

csum_error:
	__IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
inhdr_error:
	__IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
drop:
	kfree_skb(skb);
out:
	return NULL;
}

NF_HOOK()

在ip_rcv()中的NF_HOOK中,根据nf_hook()的返回值,决定要不要执行后续函数ip_rcv_finish()。在nf_hook()函数中,根据NF_INET_PRE_ROUTING链上table中对应的规则,来决定要不要drop。

/* File:linux-5.4.1\include\linux\netfilter.h */
static inline int
NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb,
	struct net_device *in, struct net_device *out,
	int (*okfn)(struct net *, struct sock *, struct sk_buff *))
{
	int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn);
    /* 根据nf_hook()的返回值,决定要不要执行后续函数 */
	if (ret == 1)
		ret = okfn(net, sk, skb);
	return ret;
}

ip_rcv_finish()

走到这步,说明报文符合incoming阶段的规则,没有被drop掉。后面怎么走,根据查找输入路由缓存决定。

这里的主要功能为:

  1. 如果还没有为该数据包查找输入路由缓存,则调用ip_route_input()为其查找输入路由缓存;
  2. 处理IP数据包首部中的选项;
  3. 根据输入路由缓存输入到本地或者转发。
/* File: linux-5.4.1\net\ipv4\ip_input.c */
static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	int ret;

	/* if ingress device is enslaved to an L3 master device pass the
	 * skb to its handler for processing
	 */
	skb = l3mdev_ip_rcv(skb);
	if (!skb)
		return NET_RX_SUCCESS;

	ret = ip_rcv_finish_core(net, sk, skb, dev);

	/* 最后,根据输入路由缓存,决定是"输入到本地"或者"转发"
	 * 这里的dst_input,有两种可能
	 * 1、ip_local_deliver(), 对应于NF_INET_LOCAL_IN;
	 * 2、ip_forward(), 对应于NF_INET_FORWARD;
	 */
	if (ret != NET_RX_DROP)
		ret = dst_input(skb);
	return ret;
}

static int ip_rcv_finish_core(struct net *net, struct sock *sk,
			      struct sk_buff *skb, struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);
	int (*edemux)(struct sk_buff *skb);
	struct rtable *rt;
	int err;

	if (net->ipv4.sysctl_ip_early_demux &&
	    !skb_dst(skb) &&
	    !skb->sk &&
	    !ip_is_fragment(iph)) {
		const struct net_protocol *ipprot;
		int protocol = iph->protocol;

		ipprot = rcu_dereference(inet_protos[protocol]);
		if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) {
			err = INDIRECT_CALL_2(edemux, tcp_v4_early_demux,
					      udp_v4_early_demux, skb);
			if (unlikely(err))
				goto drop_error;
			/* must reload iph, skb->head might have changed */
			iph = ip_hdr(skb);
		}
	}

	/*
	 *	Initialise the virtual path cache for the packet. It describes
	 *	how the packet travels inside Linux networking.
	 *
	 *  用于为该packet初始化其在linux网络中的路径缓存
	 *
	 *  如果还没有为该数据包查找到输入路由缓存。则调用ip_route_input_noref()来为其查找,
	 *  若失败,则丢弃packet。
	 */
	if (!skb_valid_dst(skb)) {
		err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
					   iph->tos, dev);
		if (unlikely(err))
			goto drop_error;
	}

#ifdef CONFIG_IP_ROUTE_CLASSID
	if (unlikely(skb_dst(skb)->tclassid)) {
		struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
		u32 idx = skb_dst(skb)->tclassid;
		st[idx&0xFF].o_packets++;
		st[idx&0xFF].o_bytes += skb->len;
		st[(idx>>16)&0xFF].i_packets++;
		st[(idx>>16)&0xFF].i_bytes += skb->len;
	}
#endif
    /* 通过
     *   ip_rcv_option()->ip_options_rcv_srr()->ip_route_input()来查找路由缓存 
     */
	if (iph->ihl > 5 && ip_rcv_options(skb, dev))
		goto drop;

	rt = skb_rtable(skb);
	if (rt->rt_type == RTN_MULTICAST) {
		__IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST) {
		__IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len);
	} else if (skb->pkt_type == PACKET_BROADCAST ||
		   skb->pkt_type == PACKET_MULTICAST) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		/* RFC 1122 3.3.6:
		 *
		 *   When a host sends a datagram to a link-layer broadcast
		 *   address, the IP destination address MUST be a legal IP
		 *   broadcast or IP multicast address.
		 *
		 *   A host SHOULD silently discard a datagram that is received
		 *   via a link-layer broadcast (see Section 2.4) but does not
		 *   specify an IP multicast or broadcast destination address.
		 *
		 * This doesn't explicitly say L2 *broadcast*, but broadcast is
		 * in a way a form of multicast and the most common use case for
		 * this is 802.11 protecting against cross-station spoofing (the
		 * so-called "hole-196" attack) so do it for both.
		 */
		if (in_dev &&
		    IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST))
			goto drop;
	}

	return NET_RX_SUCCESS;

drop:
	kfree_skb(skb);
	return NET_RX_DROP;

drop_error:
	if (err == -EXDEV)
		__NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
	goto drop;
}

输入到本地

ip_local_deliver()

根据路由缓存,转到本地。

/* File: linux-5.4.1\net\ipv4\ip_input.c */
/*
 * 	Deliver IP Packets to the higher protocol layers.
 */
int ip_local_deliver(struct sk_buff *skb)
{
	/*
	 *	Reassemble IP fragments.
	 */
	struct net *net = dev_net(skb->dev);

	if (ip_is_fragment(ip_hdr(skb))) {
		/* ip数据包分片重组,ip_defrag() */
		if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER))
			return 0;
	}

	return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
		       net, NULL, skb, skb->dev, NULL,
		       ip_local_deliver_finish);
}

ip_local_deliver_finish()

这里最终,报文根据对应的协议handler处理函数,上发到L4层处理。

/* File: linux-5.4.1\net\ipv4\ip_input.c */
static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	__skb_pull(skb, skb_network_header_len(skb));

	rcu_read_lock();
	ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol);
	rcu_read_unlock();

	return 0;
}
/*
 * 通过ip_local_deliver_finish()将输入数据包从网络层传递到传输层。过程如下:
 *
 * 1、去掉IP首部
 * 2、如果是raw套接字接受数据包,则需要复制一份副本,输入到该接受数据包的套接字。
 * 3、最后,通过传输层的接受例程,将数据包传递到传输层,由传输层进行处理。
 */
void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol)
{
	const struct net_protocol *ipprot;
	int raw, ret;

resubmit:
	/* 分发处理原始套接字 */
	raw = raw_local_deliver(skb, protocol);

	ipprot = rcu_dereference(inet_protos[protocol]);
	if (ipprot) {
		if (!ipprot->no_policy) {
			if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
				kfree_skb(skb);
				return;
			}
			nf_reset_ct(skb);
		}
		/*
		 * 如果ipprot->handler是tcp,
 		 *    则执行tcp_v4_rcv(skb);
		 * 如果是udp,
		 *    则执行udp_rcv(skb);
		 * 如果都不是,则用handler对应的协议函数处理
		 *    ipprot->handler(skb);
		 */
		ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv,
				      skb);
		if (ret < 0) {
			protocol = -ret;
			goto resubmit;
		}
		__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
	} else {
		if (!raw) {
			if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
				__IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
				icmp_send(skb, ICMP_DEST_UNREACH,
					  ICMP_PROT_UNREACH, 0);
			}
			kfree_skb(skb);
		} else {
			__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
			consume_skb(skb);
		}
	}
}

转发出去

ip_forward()

/* File: linux-5.4.1\net\ipv4\ip_forward.c */
/* 函数在ip_route_input_slow->ip_mkroute_input注册
 *
 * IP数据包的转发是由ip_forward()处理,在ip_rcv_finish()
 * 通过输入路由缓存被调用。
 */
int ip_forward(struct sk_buff *skb)
{
	u32 mtu;
	struct iphdr *iph;	/* Our header */
	struct rtable *rt;	/* Route we use */
	struct ip_options *opt	= &(IPCB(skb)->opt);
	struct net *net;

	/* that should never happen */
	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))
		goto drop;

	if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
		return NET_RX_SUCCESS;

	skb_forward_csum(skb);
	net = dev_net(skb->dev);

	/*
	 *	According to the RFC, we must first decrease the TTL field. If
	 *	that reaches zero, we must reply an ICMP control message telling
	 *	that the packet's lifetime expired.
	 */
	if (ip_hdr(skb)->ttl <= 1)
		goto too_many_hops;

	if (!xfrm4_route_forward(skb))
		goto drop;

	rt = skb_rtable(skb);

	if (opt->is_strictroute && rt->rt_uses_gateway)
		goto sr_failed;

	IPCB(skb)->flags |= IPSKB_FORWARDED;
	mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
	if (ip_exceeds_mtu(skb, mtu)) {
		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(mtu));
		goto drop;
	}

	/* We are about to mangle packet. Copy it! */
	if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len))
		goto drop;
	iph = ip_hdr(skb);

	/* Decrease ttl after skb cow done */
	ip_decrease_ttl(iph);

	/*
	 *	We now generate an ICMP HOST REDIRECT giving the route
	 *	we calculated.
	 */
	if (IPCB(skb)->flags & IPSKB_DOREDIRECT && !opt->srr &&
	    !skb_sec_path(skb))
		ip_rt_send_redirect(skb);

	if (net->ipv4.sysctl_ip_fwd_update_priority)
		skb->priority = rt_tos2priority(iph->tos);

	return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, rt->dst.dev,
		       ip_forward_finish);

sr_failed:
	/*
	 *	Strict routing permits no gatewaying
	 */
	 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
	 goto drop;

too_many_hops:
	/* Tell the sender its packet died... */
	__IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
	icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
drop:
	kfree_skb(skb);
	return NET_RX_DROP;
}

IP报文发送流程

tcp发数据

tcp_sendmsg()

/* File: linux-5.4.1\net\ipv4\tcp.c */
/* tcp发送上层数据包 
 * tcp发送数据最终都会调用到tcp_sendmsg(), 如:
 * send系统调用会直接调用sys_sendto(), 然后填充msghdr数据结构, 并调用sock_sendmsg,
 * 最终会调用__sock_sendmsg().在这个函数里会初始化sock_iocb结构,然后调用tcp_sendmsg()
 */
int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
	int ret;
    /* 给整个发送消息的过程上锁 */
	lock_sock(sk);
	ret = tcp_sendmsg_locked(sk, msg, size);
	release_sock(sk);

	return ret;
}

tcp_sendmsg_locked()

/* File: linux-5.4.1\net\ipv4\tcp.c */
/* 由tcp_sendmsg()调用
 * tcp_sendmsg_locked()的主要工作是把用户层的数据,填充到skb中,然后加入到sock的发送队列。
 * 之后电泳tcp_write_xmit()来把sock发送队列中的skb尽量的发送出去。
 * 另外。TCP的发送缓存的管理业主要在这个函数中
 */
int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct ubuf_info *uarg = NULL;
	struct sk_buff *skb;
	struct sockcm_cookie sockc;
	int flags, err, copied = 0;
	int mss_now = 0, size_goal, copied_syn = 0;
	int process_backlog = 0;
	bool zc = false;
	long timeo;

	flags = msg->msg_flags;

    /* 如果设置了MSG_ZEROCOPY,零拷贝 */
	if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) {
		skb = tcp_write_queue_tail(sk);
		uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
		if (!uarg) {
			err = -ENOBUFS;
			goto out_err;
		}

		zc = sk->sk_route_caps & NETIF_F_SG;
		if (!zc)
			uarg->zerocopy = 0;
	}
    /* 如果设置了MSG_FASTOPEN,调用tcp_sendmsg_fastopen() */
	if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
	    !tp->repair) {
		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg);
		if (err == -EINPROGRESS && copied_syn > 0)
			goto out;
		else if (err)
			goto out_err;
	}
    /* 取得发送超时时间 */
	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
    /* 标记套接字是否有应用程序限制 */
	tcp_rate_check_app_limited(sk);  /* is sending application-limited? */

	/* Wait for a connection to finish. One exception is TCP Fast Open
	 * (passive side) where data is allowed to be sent before a connection
	 * is fully established.
	 *
	 * 如果connect还没有完成,则等待连接完成,
	 * 如果是非阻塞,则直接返回。
	 */
	if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
	    !tcp_passive_fastopen(sk)) {
		err = sk_stream_wait_connect(sk, &timeo);
		if (err != 0)
			goto do_error;
	}
    /* 调用tcp_send_rcvq()将数据放入接收队列中 */
	if (unlikely(tp->repair)) {
		if (tp->repair_queue == TCP_RECV_QUEUE) {
			copied = tcp_send_rcvq(sk, msg, size);
			goto out_nopush;
		}

		err = -EINVAL;
		if (tp->repair_queue == TCP_NO_QUEUE)
			goto out_err;

		/* 'common' sending to sendq */
	}

	sockcm_init(&sockc, sk);
	if (msg->msg_controllen) {
		err = sock_cmsg_send(sk, msg, &sockc);
		if (unlikely(err)) {
			err = -EINVAL;
			goto out_err;
		}
	}

	/* This should be in poll */
	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	/* Ok commence sending. */
	copied = 0;

restart:
	/* 获取当前的MSS、网络设备支持的最大数据长度size_goal
	 * 如果支持GSo, size_goal会是MSS的整数倍。
	 */
	mss_now = tcp_send_mss(sk, &size_goal, flags);

	err = -EPIPE;
	/* 如果发送端已经完全关闭则返回,并设置err */
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto do_error;
    /* 遍历用户层的数据块数组 */
	while (msg_data_left(msg)) {
		int copy = 0;
        /* 发送队列的最后一个skb */
		skb = tcp_write_queue_tail(sk);
		if (skb)
			copy = size_goal - skb->len;
        /* 使用新的skb来装数据 */
		if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
			bool first_skb;

new_segment:
            /* 如果发送队列的总大小sk_wmem_queued大于等于发送缓存的上限
             * sk_sndbuf,或者发送缓存中尚未发送的数据量唱过用户的设置值,
             * 就进入等待。
             */
			if (!sk_stream_memory_free(sk))
				goto wait_for_sndbuf;

			if (unlikely(process_backlog >= 16)) {
				process_backlog = 0;
				if (sk_flush_backlog(sk))
					goto restart;
			}
			first_skb = tcp_rtx_and_write_queues_empty(sk);
			/* 申请发送缓存
			 * alloc的大小一般都是等于mss的大小,这里通过select_size得到的。
			 */
			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
						  first_skb);
			if (!skb)
				goto wait_for_memory;

			process_backlog++;
			skb->ip_summed = CHECKSUM_PARTIAL;

			/* 将这个skb加入到sk_write_queue队列中,并更新sk_send_head()域 */
			skb_entail(sk, skb);
			copy = size_goal;

			/* All packets are restored as if they have
			 * already been sent. skb_mstamp_ns isn't set to
			 * avoid wrong rtt estimation.
			 *
			 * 如果使用了TCP REPAIR选项,那么为skb设置“发送时间”
			 */
			if (tp->repair)
				TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED;
		}

		/* Try to append data to the end of skb. */
		if (copy > msg_data_left(msg))
			copy = msg_data_left(msg);

		/* Where to copy to? */
		if (skb_availroom(skb) > 0 && !zc) {
			/* We have some space in skb head. Superb! */
			copy = min_t(int, copy, skb_availroom(skb));
			err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
			if (err)
				goto do_fault;
		} 
		/* 如果skb的线性数据区已经用完了,且 不是零拷贝,那么使用分页区 */
		else if (!zc) {
			bool merge = true;
			int i = skb_shinfo(skb)->nr_frags;
			struct page_frag *pfrag = sk_page_frag(sk);

			if (!sk_page_frag_refill(sk, pfrag))
				goto wait_for_memory;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				if (i >= sysctl_max_skb_frags) {
					/* 给TCP加一个PSH标记 */
					tcp_mark_push(tp, skb);
					goto new_segment;
				}
				merge = false;
			}

			copy = min_t(int, copy, pfrag->size - pfrag->offset);

			if (!sk_wmem_schedule(sk, copy))
				goto wait_for_memory;

			err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
						       pfrag->page,
						       pfrag->offset,
						       copy);
			if (err)
				goto do_error;

			/* Update the skb. */
			if (merge) {
				skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			} else {
				skb_fill_page_desc(skb, i, pfrag->page,
						   pfrag->offset, copy);
				page_ref_inc(pfrag->page);
			}
			pfrag->offset += copy;
		} 
		else {/* 使用零拷贝 */
			err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
			if (err == -EMSGSIZE || err == -EEXIST) {
				tcp_mark_push(tp, skb);
				goto new_segment;
			}
			if (err < 0)
				goto do_error;
			copy = err;
		}
        /* 如果这是第一次拷贝,取消PSH标志 */
		if (!copied)
			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;

		WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
		TCP_SKB_CB(skb)->end_seq += copy;/* 更新发送队列的最后一个序号 */
		tcp_skb_pcount_set(skb, 0);
        /* 已经拷贝到发送队列的数据量 */
		copied += copy;

		/* 如果所有数据都拷贝好了,退出 */
		if (!msg_data_left(msg)) {
			if (unlikely(flags & MSG_EOR))
				TCP_SKB_CB(skb)->eor = 1;
			goto out;
		}
        /* 如果skb还可以继续填充数据,或者发送的是带外数据,或者使用TCP REPAIR选项
         * 那么继续拷贝数据,先不发送。
         */
		if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair))
			continue;

		if (forced_push(tp)) {
			tcp_mark_push(tp, skb);
			/* 把sk发送队列中所有的skb全部发送出去 */
			__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
		} 
		else if (skb == tcp_send_head(sk))
			/* 发送队列不为空,则只发送一个skb */
			tcp_push_one(sk, mss_now);
		continue;

wait_for_sndbuf:
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
		if (copied)
			tcp_push(sk, flags & ~MSG_MORE, mss_now,
				 TCP_NAGLE_PUSH, size_goal);

		err = sk_stream_wait_memory(sk, &timeo);
		if (err != 0)
			goto do_error;

		mss_now = tcp_send_mss(sk, &size_goal, flags);
	}

out:
    /* 如果已经有数据复制到发送队列了,就尝试立即发送 */
	if (copied) {
		tcp_tx_timestamp(sk, sockc.tsflags);
		tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
	}
out_nopush:
	sock_zerocopy_put(uarg);
	return copied + copied_syn;

do_error:
	skb = tcp_write_queue_tail(sk);
do_fault:
	tcp_remove_empty_skb(sk, skb);

	if (copied + copied_syn)
		goto out;
out_err:
	sock_zerocopy_put_abort(uarg, true);
	err = sk_stream_error(sk, flags, err);
	/* make sure we wake any epoll edge trigger waiter */
	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
		     err == -EAGAIN)) {
		sk->sk_write_space(sk);
		tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
	}
	return err;
}

tcp_write_xmit()

/* File: linux-5.4.1\net\ipv4\tcp_output.c */
/* This routine writes packets to the network.  It advances the
 * send_head.  This happens as incoming acks open up the remote
 * window for us.
 *
 * LARGESEND note: !tcp_urg_mode is overkill, only frames between
 * snd_up-64k-mss .. snd_up cannot be large. However, taking into
 * account rare use of URG, this is not a big flaw.
 *
 * Send at most one packet when push_one > 0. Temporarily ignore
 * cwnd limit to force at most one packet out when push_one == 2.

 * Returns true, if no segments are in flight and we have queued segments,
 * but cannot send anything now because of SWS or another problem.
 *
 *
 * 该函数讲发送队列上的SKB发送出去,返回值为0表示发送成功。
 * 过程如下:
 * 1、检测当前转台是否是TCP_CLOSE
 * 2、检测用塞窗口大小
 * 3、检测当前段是否完全处在发送窗口内
 * 4、检测端是否使用nagle算法
 * 5、通过以上检测后,将SKB发送出去
 * 6、循环检测并发送【发送队列】上所有未发送的SKB
 * 
 * 参数说明:
 * mss_now:当前有效的MSS
 * nonagle:标识是否启用nonagle算法
 *
 * 最终调用tcp_transmit_skb
 * 
 * 对拥塞进行处理:
 * 1、如果开启了pacing发送,检查是否应该发送,如果还不到发送时机,停止发送
 * 2、如果拥塞窗口为0,停止发送
 * 3、如果该数据包颤过了发送窗口,停止发送
 * 4、如果上面条件均通过,就把数据包封装成tcp_segment发送出去
 */
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
			   int push_one, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	unsigned int tso_segs, sent_pkts;
	int cwnd_quota;
	int result;
	bool is_cwnd_limited = false, is_rwnd_limited = false;
	u32 max_segs;

	sent_pkts = 0;
    /* 更改tp->tcp_mstamp为当前时间的us表示 */
	tcp_mstamp_refresh(tp);
	if (!push_one) {
		/* Do MTU probing. 探测pmtu */
		result = tcp_mtu_probe(sk);
		if (!result) {
			return false;
		} else if (result > 0) {
			sent_pkts = 1;
		}
	}
    /* 预测发送的最大seg数 */
	max_segs = tcp_tso_segs(sk, mss_now);
	/* 如果发送队列不空,则准备开始发送段 */
	while ((skb = tcp_send_head(sk))) {
		unsigned int limit;

		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
			/* "skb_mstamp_ns" is used as a start point for the retransmit timer */
			skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache;
			list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
			tcp_init_tso_segs(skb, mss_now);
			goto repair; /* Skip network transmission */
		}

		if (tcp_pacing_check(sk))
			break;
        /* 设置有关tso的信息,包括GSO类型、GSO分段的大小等。这些
		 * 信息是准备给软件TSO分段使用的。如果网络设备不支持TSO,
		 * 但又使用了TSO功能,则段在提交给网络设备之前,需要进行
		 * 软分段,即由代码实现TSO分段。
		 *
		 */
		tso_segs = tcp_init_tso_segs(skb, mss_now);
		BUG_ON(!tso_segs);

		/* 检查目前是否可以发送数据,
		 * 确认当前发送窗口的大小。
		 * 检测拥塞窗口的大小,如果
		 * 为0,则说明拥塞窗口已经满
		 */
		cwnd_quota = tcp_cwnd_test(tp, skb);
		if (!cwnd_quota) {
			if (push_one == 2)
				/* Force out a loss probe pkt. */
				cwnd_quota = 1;
			else
				break;
		}
        /* 检测当前段(包括线性区和分散聚合I/O区shifo)是否完全处在发送窗口内,如果是
         * 则可以发送,否则目前不能发送。
         */
		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
			is_rwnd_limited = true;
			break;
		}

		if (tso_segs == 1) {
			/* 如果无需TSO分段,则检测是否使用Nagle算法,
			 * 并确定当前能否立即发送该段。
			 */
			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
						     (tcp_skb_is_last(sk, skb) ?
						      nonagle : TCP_NAGLE_PUSH))))
				break;
		} else {
			/* 如果需要TSO分段,则检测该段是否应该延迟发送, 
		     * 如果是,则目前不能发送。
		     */
			if (!push_one &&
			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
						 &is_rwnd_limited, max_segs))
				break;
		}
		
        /* limit为再次分段的段长,初始化为当前的MSS */
		limit = mss_now;
		/* 判断当前段是不是TSO分段的段,如果是才处理 */
		if (tso_segs > 1 && !tcp_urg_mode(tp))
			/* 以发送窗口和拥塞窗口的最小值作为分段段长。也就是比较可用拥塞窗口和tcp_sendmsg中的发散聚合I/O页 */
			limit = tcp_mss_split_point(sk, skb, mss_now,
						    min_t(unsigned int,
							  cwnd_quota,
							  max_segs),
						    nonagle);
        /* 得到分段段长后,如果SKB中的数据长度大于分段长度,则调用tso_fragment()根据段长进行分段
         * 如果分段失败,则目前暂不发送。
         */
		if (skb->len > limit &&
		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
			break;

		if (tcp_small_queue_check(sk, skb, 0))
			break;
        /* 使用地址族相关的af_specific->queue_xmit()函数
         * 将数据转发到网络层。IPv4使用的是 ip_queue_xmit()
         */
		if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
			break;

repair:
		/* Advance the send_head.  This one is sent out.
		 * This call will increment packets_out.
		 */
		tcp_event_new_data_sent(sk, skb);
        /* 如果发送的段小于MSS,则更新最近发送的小包的最后一个字节序号 */
		tcp_minshall_update(tp, mss_now, skb);
		sent_pkts += tcp_skb_pcount(skb);/* 更新在函数中已发送的总段数 */

		if (push_one)
			break;
	}
    /* 如果接受窗口有限制,应该启动探测机制 */
	if (is_rwnd_limited)
		tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
	else
		tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);
    /* 如果本次有数据发送,则对TCP拥塞窗口进行确认,最后返回成功 */
	if (likely(sent_pkts)) {
		if (tcp_in_cwnd_reduction(sk))
			tp->prr_out += sent_pkts;

		/* Send one loss probe per tail loss episode. */
		if (push_one != 2)
			tcp_schedule_loss_probe(sk, false);
		is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
		tcp_cwnd_validate(sk, is_cwnd_limited);
		return false;
	}
	
	return !tp->packets_out && !tcp_write_queue_empty(sk);
}

__tcp_transmit_skb()

/* File: linux-5.4.1\net\ipv4\tcp_output.c */
/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 *
 *
 * 通常要发送一个TCP字段都是通过tcp_transmit_skb()的,该函数会给
 * 待发送的段构造TCP首部,然后调用网络层接口到IP层,最终抵达网络设备。
 * 由于在成功发送到网络设备后会释放该SKB,而TCP必须要街道对应的ACK后
 * 才能真正释放数据,因此在发送前会根据参数确定是克隆还是复制一份SKB
 * 用于发送。
 *
 * 最终的tcp发送都会调用这个clone_it()表示发送【发送队列】的第一个SKB的时候,
 * 采用克隆还是直接使用skb.
 * 如果发送应用层的数据使用克隆的,等待对方应答ACK回答才能把数据删除。
 * 如果是回送ack信息的,则无需克隆。
 * 如果不支持TSO或者GSO,这里的SKB->len为mss,否则如果支持,并且有数据在shinfo中,
 * 则这里的SKB长度为shinfo或者拥塞窗口的最小值。
 *
 * 对于拥塞的处理:
 * 1、tcp_event_ack_sent 统计我们发送了多少ack
 * 2、tcp_event_data_sent 数据包发送后的拥塞状态记账
 * 3、tcp_internal_pacing 计算bbr下一次发送数据包的时间
 */
static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
			      int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet;
	struct tcp_sock *tp;
	struct tcp_skb_cb *tcb;
	struct tcp_out_options opts;
	unsigned int tcp_options_size, tcp_header_size;
	struct sk_buff *oskb = NULL;
	struct tcp_md5sig_key *md5;
	struct tcphdr *th;
	u64 prior_wstamp;
	int err;

	BUG_ON(!skb || !tcp_skb_pcount(skb));
	tp = tcp_sk(sk);
	prior_wstamp = tp->tcp_wstamp_ns;
	tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
	skb->skb_mstamp_ns = tp->tcp_wstamp_ns;

	/* 根据clone_it确定是否克隆待发送的数据包 */
	if (clone_it) {
		TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
			- tp->snd_una;
		oskb = skb;

		tcp_skb_tsorted_save(oskb) {
			if (unlikely(skb_cloned(oskb)))
				skb = pskb_copy(oskb, gfp_mask);
			else
				skb = skb_clone(oskb, gfp_mask);
		} tcp_skb_tsorted_restore(oskb);

		if (unlikely(!skb))
			return -ENOBUFS;
	}
    /* 获取INET层和TCP层的传输控制块、SKB中的TCP私有控制块 
	 * 以及当前TCP首部长度
	 */
	inet = inet_sk(sk);
	tcb = TCP_SKB_CB(skb);
	memset(&opts, 0, sizeof(opts));

    /* 判断当前TCP段是不是SYN段,因为有些选项只能出现在SYN段中,需作特别处理 */
	if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
	} else {
		tcp_options_size = tcp_established_options(sk, skb, &opts,
							   &md5);
		/* Force a PSH flag on all (GSO) packets to expedite GRO flush
		 * at receiver : This slightly improve GRO performance.
		 * Note that we do not force the PSH flag for non GSO packets,
		 * because they might be sent under high congestion events,
		 * and in this case it is better to delay the delivery of 1-MSS
		 * packets and thus the corresponding ACK packet that would
		 * release the following packet.
		 */
		if (tcp_skb_pcount(skb) > 1)
			tcb->tcp_flags |= TCPHDR_PSH;
	}
	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);

	/* if no packet is in qdisc/device queue, then allow XPS to select
	 * another queue. We can be called from tcp_tsq_handler()
	 * which holds one reference to sk.
	 *
	 * TODO: Ideally, in-flight pure ACK packets should not matter here.
	 * One way to get this would be to set skb->truesize = 2 on them.
	 */
	skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);

	/* If we had to use memory reserve to allocate this skb,
	 * this might cause drops if packet is looped back :
	 * Other socket might not have SOCK_MEMALLOC.
	 * Packets not looped back do not care about pfmemalloc.
	 */
	skb->pfmemalloc = 0;

	skb_push(skb, tcp_header_size);
	skb_reset_transport_header(skb);

	skb_orphan(skb);
	skb->sk = sk;
	skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
	skb_set_hash_from_sk(skb, sk);
	refcount_add(skb->truesize, &sk->sk_wmem_alloc);

	skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);

	/* Build TCP header and checksum it. */
	th = (struct tcphdr *)skb->data;
	th->source		= inet->inet_sport;
	th->dest		= inet->inet_dport;
	th->seq			= htonl(tcb->seq);
	th->ack_seq		= htonl(rcv_nxt);
	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
					tcb->tcp_flags);

	th->check		= 0;
	th->urg_ptr		= 0;

	/* The urg_mode check is necessary during a below snd_una win probe 
	 *
	 * 判断是否需要设置紧急指针和带外数据标志,判断条件有两个,
	 * 1、发送时是否设置了紧急方式
	 * 2、紧急指针是否在以该报文数据序号为起始的65535范围内,
	 * 其中,第二个条件主要是判断紧急指针的合法性。
	 */
	if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
		if (before(tp->snd_up, tcb->seq + 0x10000)) {
			th->urg_ptr = htons(tp->snd_up - tcb->seq);
			th->urg = 1;
		} else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
			th->urg_ptr = htons(0xFFFF);
			th->urg = 1;
		}
	}
    /* TCP首部调整完毕,开始构建TCP首部选项 */
	tcp_options_write((__be32 *)(th + 1), tp, &opts);
	skb_shinfo(skb)->gso_type = sk->sk_gso_type;
	if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
		th->window      = htons(tcp_select_window(sk));
		tcp_ecn_send(sk, skb, th, tcp_header_size);
	} else {
		/* RFC1323: The window in SYN & SYN/ACK segments
		 * is never scaled.
		 */
		th->window	= htons(min(tp->rcv_wnd, 65535U));
	}
#ifdef CONFIG_TCP_MD5SIG
	/* Calculate the MD5 hash, as we have all we need now */
	if (md5) {
		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		tp->af_specific->calc_md5_hash(opts.hash_location,
					       md5, sk, skb);
	}
#endif
    /* 调用IPv4执行校验 和 接口send_check计算校验和,并设置到TCP首部中。
     * 在TCP中,send_check接口被初始化为tcp_v4_send_check()
     */
	icsk->icsk_af_ops->send_check(sk, skb);

	/* 如果发送出去的段有ACK标志,则需要通知延时确认模块,递减
	 * 快速发送ACK段的数量,同时停止延时确认定时器。
	 */
	if (likely(tcb->tcp_flags & TCPHDR_ACK))
		tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);

    /*  */
	if (skb->len != tcp_header_size) {
		tcp_event_data_sent(tp, sk);/* 数据包发送后的拥塞状态记账 */
		tp->data_segs_out += tcp_skb_pcount(skb);/* 统计发送出去的data seg数量 */
		tp->bytes_sent += skb->len - tcp_header_size;
	}

	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
		TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
			      tcp_skb_pcount(skb));
    /* 统计发送出去的所有seg数量 */
	tp->segs_out += tcp_skb_pcount(skb);
	/* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */
	skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
	skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);

	/* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */

	/* Cleanup our debris for IP stacks */
	memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
			       sizeof(struct inet6_skb_parm)));

	tcp_add_tx_delay(skb, tp);

	/* 调用发送接口queue_xmit发送报文,如果失败,返回错误码
	 * 在TCP中,该接口实现函数为ip_queue_xmit()
	 */
	err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);

	if (unlikely(err > 0)) {
		/* 当发送失败时,类似接收到显式拥塞通知,使拥塞控制进入CWR状态。
		 * 最后,根据错误信息,返回发送是否成功
		 */
		tcp_enter_cwr(sk);
		err = net_xmit_eval(err);
	}
	if (!err && oskb) {
		tcp_update_skb_after_send(sk, oskb, prior_wstamp);
		tcp_rate_skb_sent(sk, oskb);
	}
	return err;
}

__ip_queue_xmit()

/* File: linux-5.4.1\net\ipv4\ip_output.c */
/* Note: skb->sk can be different from sk, in case of tunnels 
 *
 * 在TCP中,将TCP段打包成IP数据包的方法根据TCP类型的不同而有多种接口。
 * 其中,最常用的就是ip_queue_xmit(),而ip_build_and_send_pkt()和
 * ip_send_reply()只有在发送特定段时才会被调用
 *
 * @skb: 待封装成IP数据包的TCP段
 * 
 * TCP发送的时候从tcp_transmit_skb函数里面跳转过来。
 */
int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
		    __u8 tos)
{
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options_rcu *inet_opt;
	struct flowi4 *fl4;
	struct rtable *rt;
	struct iphdr *iph;
	int res;

	/* Skip all of this if the packet is already routed,
	 * f.e. by something like SCTP.
	 */
	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	fl4 = &fl->u.ip4;
	rt = skb_rtable(skb);
	if (rt)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (!rt) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (inet_opt && inet_opt->opt.srr)
			daddr = inet_opt->opt.faddr;

		/* If this fails, retransmit mechanism of transport layer will
		 * keep trying until route appears or the connection times
		 * itself out.
		 */
		rt = ip_route_output_ports(net, fl4, sk,
					   daddr, inet->inet_saddr,
					   inet->inet_dport,
					   inet->inet_sport,
					   sk->sk_protocol,
					   RT_CONN_FLAGS_TOS(sk, tos),
					   sk->sk_bound_dev_if);
		if (IS_ERR(rt))
			goto no_route;
		sk_setup_caps(sk, &rt->dst);
	}
	skb_dst_set_noref(skb, &rt->dst);

packet_routed:
	if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	ip_copy_addrs(iph, fl4);

	/* Transport layer set skb->h.foo itself. */

	if (inet_opt && inet_opt->opt.optlen) {
		iph->ihl += inet_opt->opt.optlen >> 2;
		ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
	}

	ip_select_ident_segs(net, skb, sk,
			     skb_shinfo(skb)->gso_segs ?: 1);

	/* TODO : should we use skb->sk here instead of sk ? */
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	
	res = ip_local_out(net, sk, skb);
	rcu_read_unlock();
	return res;

no_route:
	rcu_read_unlock();
	IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}

udp发数据

udp_sendmsg()

/* File: linux-5.4.1\net\ipv4\udp.c */
/* 应用层数据发包 */
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct udp_sock *up = udp_sk(sk);
	DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
	struct flowi4 fl4_stack;
	struct flowi4 *fl4;
	int ulen = len;
	struct ipcm_cookie ipc;
	struct rtable *rt = NULL;
	int free = 0;
	int connected = 0;
	__be32 daddr, faddr, saddr;
	__be16 dport;
	u8  tos;
	int err, is_udplite = IS_UDPLITE(sk);
	/* 局部标志corkreq的初始化取决于多个因素,而此标志回传给ip_append_data,用于指出是由应该使用缓冲区机制。 */
	int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
	int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
	struct sk_buff *skb;
	struct ip_options_data opt_copy;


    /* 如果长度大于64k, 返回错误 */
	if (len > 0xFFFF)
		return -EMSGSIZE;

	/*
	 *	Check the flags.
	 *
	 *  udp不支持带外数据的发送
	 */

	if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
		return -EOPNOTSUPP;

	getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;

	fl4 = &inet->cork.fl.u.ip4;
	/* UDP正在输出数据 */
	if (up->pending) {
		/*
		 * There are pending frames.
		 * The socket lock must be held while it's corked.
		 */
		lock_sock(sk);
		if (likely(up->pending)) {
			if (unlikely(up->pending != AF_INET)) {
				release_sock(sk);
				return -EINVAL;
			}
			goto do_append_data;/* 确实在输出数据,跳转到do_append_data直接处理UDP数据 */
		}
		release_sock(sk);
	}
	ulen += sizeof(struct udphdr);/* 计算UDP报文总长度 */

	/*
	 *	Get and verify the address.
	 */
	if (usin) {
		if (msg->msg_namelen < sizeof(*usin))/* 检查目的地址长度 */
			return -EINVAL;
		if (usin->sin_family != AF_INET) {   /* 检查协议族 */
			if (usin->sin_family != AF_UNSPEC)
				return -EAFNOSUPPORT; 
		}
        /* 缓存目的地址和端口 */
		daddr = usin->sin_addr.s_addr;
		dport = usin->sin_port;
		if (dport == 0)
			return -EINVAL;
	} else {
		if (sk->sk_state != TCP_ESTABLISHED)
			return -EDESTADDRREQ;
		daddr = inet->inet_daddr;
		dport = inet->inet_dport;
		/* Open fast path for connected socket.
		   Route will not be used, if at least one option is set.
		 */
		connected = 1;
	}

	ipcm_init_sk(&ipc, inet);
	ipc.gso_size = up->gso_size;

	if (msg->msg_controllen) {
		err = udp_cmsg_send(sk, msg, &ipc.gso_size);
		if (err > 0)
			err = ip_cmsg_send(sk, msg, &ipc,
					   sk->sk_family == AF_INET6);
		if (unlikely(err < 0)) {
			kfree(ipc.opt);
			return err;
		}
		if (ipc.opt)
			free = 1;
		connected = 0;
	}
	if (!ipc.opt) {
		struct ip_options_rcu *inet_opt;

		rcu_read_lock();
		inet_opt = rcu_dereference(inet->inet_opt);
		if (inet_opt) {
			memcpy(&opt_copy, inet_opt,
			       sizeof(*inet_opt) + inet_opt->opt.optlen);
			ipc.opt = &opt_copy.opt;
		}
		rcu_read_unlock();
	}

	if (cgroup_bpf_enabled && !connected) {
		err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
					    (struct sockaddr *)usin, &ipc.addr);
		if (err)
			goto out_free;
		if (usin) {
			if (usin->sin_port == 0) {
				/* BPF program set invalid port. Reject it. */
				err = -EINVAL;
				goto out_free;
			}
			daddr = usin->sin_addr.s_addr;
			dport = usin->sin_port;
		}
	}

	saddr = ipc.addr;
	ipc.addr = faddr = daddr;

	if (ipc.opt && ipc.opt->opt.srr) {
		if (!daddr) {
			err = -EINVAL;
			goto out_free;
		}
		faddr = ipc.opt->opt.faddr;
		connected = 0;
	}
	tos = get_rttos(&ipc, inet);
	if (sock_flag(sk, SOCK_LOCALROUTE) ||
	    (msg->msg_flags & MSG_DONTROUTE) ||
	    (ipc.opt && ipc.opt->opt.is_strictroute)) {
		tos |= RTO_ONLINK;
		connected = 0;
	}

	if (ipv4_is_multicast(daddr)) {
		if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
			ipc.oif = inet->mc_index;
		if (!saddr)
			saddr = inet->mc_addr;
		connected = 0;
	} else if (!ipc.oif) {
		ipc.oif = inet->uc_index;
	} else if (ipv4_is_lbcast(daddr) && inet->uc_index) {
		/* oif is set, packet is to local broadcast and
		 * and uc_index is set. oif is most likely set
		 * by sk_bound_dev_if. If uc_index != oif check if the
		 * oif is an L3 master and uc_index is an L3 slave.
		 * If so, we want to allow the send using the uc_index.
		 */
		if (ipc.oif != inet->uc_index &&
		    ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk),
							      inet->uc_index)) {
			ipc.oif = inet->uc_index;
		}
	}

	if (connected)
		rt = (struct rtable *)sk_dst_check(sk, 0);

	if (!rt) {
		struct net *net = sock_net(sk);
		__u8 flow_flags = inet_sk_flowi_flags(sk);

		fl4 = &fl4_stack;

		flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, tos,
				   RT_SCOPE_UNIVERSE, sk->sk_protocol,
				   flow_flags,
				   faddr, saddr, dport, inet->inet_sport,
				   sk->sk_uid);

		security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
		/* 在路由表中查询路由 */
		rt = ip_route_output_flow(net, fl4, sk);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			if (err == -ENETUNREACH)
				IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
			goto out;
		}

		err = -EACCES;
		if ((rt->rt_flags & RTCF_BROADCAST) &&
		    !sock_flag(sk, SOCK_BROADCAST)) /* 广播地址,但不允许进行广播,退出     */
			goto out;
		if (connected)
			sk_dst_set(sk, dst_clone(&rt->dst));进行
	}

	if (msg->msg_flags&MSG_CONFIRM)
		goto do_confirm;
back_from_confirm:

	saddr = fl4->saddr;/* 从路由中获取源地址和目的地址 */
	if (!ipc.addr)
		daddr = ipc.addr = fl4->daddr;

	/* Lockless fast path for the non-corking case. */
	if (!corkreq) {
		struct inet_cork cork;

		skb = ip_make_skb(sk, fl4, getfrag, msg, ulen,
				  sizeof(struct udphdr), &ipc, &rt,
				  &cork, msg->msg_flags);
		err = PTR_ERR(skb);
		if (!IS_ERR_OR_NULL(skb))
			/* 输出报文 */
			err = udp_send_skb(skb, fl4, &cork);
		goto out;
	}

	lock_sock(sk);
	if (unlikely(up->pending)) {
		/* The socket is already corked while preparing it. */
		/* ... which is an evident application bug. --ANK */
		release_sock(sk);

		net_dbg_ratelimited("socket already corked\n");
		err = -EINVAL;
		goto out;
	}
	/*
	 *	Now cork the socket to pend data.
	 */
	fl4 = &inet->cork.fl.u.ip4;
	fl4->daddr = daddr;
	fl4->saddr = saddr;
	fl4->fl4_dport = dport;
	fl4->fl4_sport = inet->inet_sport;
	up->pending = AF_INET; /* 此标志表示正在发送数据 */

do_append_data:
	/* 累计发送报文长度 */
	up->len += ulen;
	/* 发送数据给ip层,让ip层自己分片 */
	err = ip_append_data(sk, fl4, getfrag, msg, ulen,
			     sizeof(struct udphdr), &ipc, &rt,
			     corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
	if (err)
		udp_flush_pending_frames(sk);
	else if (!corkreq)
		/* 没有后续数据或者IP选项步缓存数据,则调用udp_push_pending_frames发送数据 */
		err = udp_push_pending_frames(sk);
	else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
		up->pending = 0;
	release_sock(sk);

out:
	ip_rt_put(rt);/* 发送完成,递减对路由的引用 */
out_free:
	if (free)
		kfree(ipc.opt);
	if (!err)
		return len;
	/*
	 * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space.  Reporting
	 * ENOBUFS might not be good (it's not tunable per se), but otherwise
	 * we don't have a good statistic (IpOutDiscards but it can be too many
	 * things).  We could add another new stat but at least for now that
	 * seems like overkill.
	 */
	if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
		UDP_INC_STATS(sock_net(sk),
			      UDP_MIB_SNDBUFERRORS, is_udplite);
	}
	return err;

do_confirm:
	/* 发送数据时设置了MSG_CONFIRM标志
	 * MSG_CONFIRM标志表示仅仅用来发现路径,并不直接发送数据。如果没有指定这个标志,则发送数据
	 */
	if (msg->msg_flags & MSG_PROBE)
		dst_confirm_neigh(&rt->dst, &fl4->daddr);
	if (!(msg->msg_flags&MSG_PROBE) || len)
		goto back_from_confirm;
	err = 0;
	goto out;
}

udp_send_skb()

/* File: linux-5.4.1\net\ipv4\udp.c */
static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
			struct inet_cork *cork)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct udphdr *uh;
	int err = 0;
	int is_udplite = IS_UDPLITE(sk);
	int offset = skb_transport_offset(skb);
	int len = skb->len - offset;
	int datalen = len - sizeof(*uh);
	__wsum csum = 0;

	/*
	 * Create a UDP header
	 */
	uh = udp_hdr(skb);
	uh->source = inet->inet_sport;
	uh->dest = fl4->fl4_dport;
	uh->len = htons(len);
	uh->check = 0;

	if (cork->gso_size) {
		const int hlen = skb_network_header_len(skb) +
				 sizeof(struct udphdr);

		if (hlen + cork->gso_size > cork->fragsize) {
			kfree_skb(skb);
			return -EINVAL;
		}
		if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS) {
			kfree_skb(skb);
			return -EINVAL;
		}
		if (sk->sk_no_check_tx) {
			kfree_skb(skb);
			return -EINVAL;
		}
		if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite ||
		    dst_xfrm(skb_dst(skb))) {
			kfree_skb(skb);
			return -EIO;
		}

		if (datalen > cork->gso_size) {
			skb_shinfo(skb)->gso_size = cork->gso_size;
			skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
			skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen,
								 cork->gso_size);
		}
		goto csum_partial;
	}

	if (is_udplite)  				 /*     UDP-Lite      */
		csum = udplite_csum(skb);

	else if (sk->sk_no_check_tx) {			 /* UDP csum off */

		skb->ip_summed = CHECKSUM_NONE;
		goto send;

	} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
csum_partial:

		udp4_hwcsum(skb, fl4->saddr, fl4->daddr);
		goto send;

	} else
		csum = udp_csum(skb);

	/* add protocol-dependent pseudo-header */
	uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len,
				      sk->sk_protocol, csum);
	if (uh->check == 0)
		uh->check = CSUM_MANGLED_0;

send:
	err = ip_send_skb(sock_net(sk), skb);
	if (err) {
		if (err == -ENOBUFS && !inet->recverr) {
			UDP_INC_STATS(sock_net(sk),
				      UDP_MIB_SNDBUFERRORS, is_udplite);
			err = 0;
		}
	} else
		UDP_INC_STATS(sock_net(sk),
			      UDP_MIB_OUTDATAGRAMS, is_udplite);
	return err;
}

ip_send_skb()

/* File: linux-5.4.1\net\ipv4\ip_output.c */
int ip_send_skb(struct net *net, struct sk_buff *skb)
{
	int err;

	err = ip_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

IP协议栈发送报文

ip_local_out()

/* File: linux-5.4.1\net\ipv4\ip_output.c */
/* 通过ip_local_out()最终会走到IP层输出函数dev_queue_xmit() */
int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(net, sk, skb);
	if (likely(err == 1))
		err = dst_output(net, sk, skb);

	return err;
}

/* 当IP头封装好后,调用__ip_local_out() */
int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);

	/* if egress device is enslaved to an L3 master device pass the
	 * skb to its handler for processing
	 *
	 * vrf发包实现
	 */
	skb = l3mdev_ip_out(sk, skb);
	if (unlikely(!skb))
		return 0;

	skb->protocol = htons(ETH_P_IP);

	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
		       net, sk, skb, NULL, skb_dst(skb)->dev,
		       dst_output);
}

dst_output()

/* File: linux-5.4.1\include\net\dst.h */
/* Output packet to network from transport.  
 * output()函数:
 * 1、如果是单薄数据包,设置的是ip_output();
 * 2、如果是组播数据包,设置的是ip_mc_output();
 */
static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	return skb_dst(skb)->output(net, sk, skb);
}

ip_output()

/* File: linux-5.4.1\net\ipv4\ip_output.c */
/* 单播报文 */
int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/* 经过netfilter处理后,调用ip_finish_output()继续IP数据包的输出 */
	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

ip_finish_output()

/* File: linux-5.4.1\net\ipv4\ip_output.c */
static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	switch (ret) {
	case NET_XMIT_SUCCESS:
		return __ip_finish_output(net, sk, skb);
	case NET_XMIT_CN:
		return __ip_finish_output(net, sk, skb) ? : ret;
	default:
		kfree_skb(skb);
		return ret;
	}
}

static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif
	mtu = ip_skb_dst_mtu(sk, skb);
    /* 支持gso的话用ip_finish_output_gso()函数 */
	if (skb_is_gso(skb))
		return ip_finish_output_gso(net, sk, skb, mtu);

	/* 如果数据包长度大于MTU,则调用ip_fragment()对IP数据包进行分片处理 
	 * 
	 * 如果不支持TSO或者GSO,tcp发送的时候时按照mms来组织skb的,所以skb->len会等于mtu, 
	 * 所以TCP叫分段,和IP分片不一样,只有UDP才有IP分片
	 */
	if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
		return ip_fragment(net, sk, skb, mtu, ip_finish_output2);

	return ip_finish_output2(net, sk, skb);
}

ip_finish_output2()

/* File: linux-5.4.1\net\ipv4\ip_output.c */
/* 
 * 此函数通过另据子系统将数据包输出到网络设备。
 * 先调用__ipv4_neigh_lookup_noref()从邻居表查找邻居,
 * 如果没找到,则用__netgh_create新建一个。
 */
static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;
	bool is_v6gw = false;

	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (!skb2) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	/* 从邻居表查找邻居 */
	neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);

	/*  
	 * 如果缓存了链路层的首部,则调用
	 * neigh_hh_output()输出数据包。否则,
	 * 若存在对应的邻居项,则通过邻居项的输出方式输出数据包。
	 * 最后调用二层函数,dev_queue_xmit()
	 */
	if (!IS_ERR(neigh)) {
		int res;

		sock_confirm_neigh(skb, neigh);
		/* if crossing protocols, can not use the cached header */
		res = neigh_output(neigh, skb, is_v6gw);
		rcu_read_unlock_bh();
		return res;
	}
	rcu_read_unlock_bh();

	net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
			    __func__);
	kfree_skb(skb);
	return -EINVAL;
}

neigh_output()

/* File: linux-5.4.1\include\net\neighbour.h */
static inline int neigh_output(struct neighbour *n, struct sk_buff *skb,
			       bool skip_cache)
{
	const struct hh_cache *hh = &n->hh;
    /* 如果neighbour已连接且hh已设置 */
	if ((n->nud_state & NUD_CONNECTED) && hh->hh_len && !skip_cache)
		return neigh_hh_output(hh, skb);
	else
		return n->output(n, skb);/* 初始阶段调用此函数,此时neigh_resolve_output函数 */
}


static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
{
	unsigned int hh_alen = 0;
	unsigned int seq;
	unsigned int hh_len;

	do {
		seq = read_seqbegin(&hh->hh_lock);
		hh_len = hh->hh_len;
		if (likely(hh_len <= HH_DATA_MOD)) {
			hh_alen = HH_DATA_MOD;

			/* skb_push() would proceed silently if we have room for
			 * the unaligned size but not for the aligned size:
			 * check headroom explicitly.
			 */
			if (likely(skb_headroom(skb) >= HH_DATA_MOD)) {
				/* this is inlined by gcc */
				memcpy(skb->data - HH_DATA_MOD, hh->hh_data,
				       HH_DATA_MOD);
			}
		} else {
			hh_alen = HH_DATA_ALIGN(hh_len);

			if (likely(skb_headroom(skb) >= hh_alen)) {
				memcpy(skb->data - hh_alen, hh->hh_data,
				       hh_alen);
			}
		}
	} while (read_seqretry(&hh->hh_lock, seq));

	if (WARN_ON_ONCE(skb_headroom(skb) < hh_alen)) {
		kfree_skb(skb);
		return NET_XMIT_DROP;
	}

	__skb_push(skb, hh_len);
	return dev_queue_xmit(skb);
}

转交至设备驱动

dev_queue_xmit()

/* File:  X:\linux-5.4.1\net\core\dev.c*/
int dev_queue_xmit(struct sk_buff *skb)
{
	return __dev_queue_xmit(skb, NULL);
}

/**
 *	__dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *	@sb_dev: suboordinate device used for L2 forwarding offload
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 *
 *
 *
 *
 * 网络接口核心层向网络协议层提供的统一的发送接口,无论IP还是ARP协议,以及其他各种底层协议,
 * 通过这个函数把要发送的数据传递给网络接口核心层。
 *
 * 若支持流量控制,则将带输出额数据包根据规则加入到输出网络队列,并在合适的时机激活网络设备
 * 输出软中断,依次将报文从队列中取出通过网络设备输出。若不支持流量控制,则直接将数据包从网
 * 络设备输出。
 * 如果提交失败,则返回相应的错误吗,然而返回成功也并不能确保数据包被成功发送,因为有可能由
 * 于拥塞而导致流量控制机制将数据包丢弃。
 * 调用dev_queue_xmit()函数输出数据包,前提时必须启用中断,只有启动中断之后才能激活下半部。
 */
static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;
	bool again = false;

	skb_reset_mac_header(skb);

	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	skb_update_prio(skb);

	qdisc_pkt_len_init(skb);
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_at_ingress = 0;
# ifdef CONFIG_NET_EGRESS
	if (static_branch_unlikely(&egress_needed_key)) {
		skb = sch_handle_egress(skb, &rc, dev);
		if (!skb)
			goto out;
	}
# endif
#endif
	/* If device/qdisc don't need skb->dst, release it right now while
	 * its hot in this cpu cache.
	 */
	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
		skb_dst_drop(skb);
	else
		skb_dst_force(skb);

	txq = netdev_core_pick_tx(dev, skb, sb_dev);

	/* 实际上就是获取net_device->netdev_queue, 也就是该dev设备的根qdisc
	 * 对于物理网卡而言,缺省使用的时FIFO qdisc, 该成员函数非空,只有逻辑网卡才可能为空
	 */
	q = rcu_dereference_bh(txq->qdisc);

	trace_net_dev_queue(skb);

	/*
	 * 如果队列输入非空,将数据包入队
	 *
	 * 将待发送数据按排队规则插入到队列,然后进行流量控制,
	 * 调度对立输出数据包,完成后返回。
	 */
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	 * loopback, all the sorts of tunnels...

	 * Really, it is unlikely that netif_tx_lock protection is necessary
	 * here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	 * counters.)
	 * However, it is possible, that they rely on protection
	 * made by us here.

	 * Check this and shot the lock. It is not prone from deadlocks.
	 * Either shot noqueue qdisc, it is even simpler 8)
	 *
	 * 如果设备已打开但未启用QoS,则直接输出数据包
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		if (txq->xmit_lock_owner != cpu) {
			if (dev_xmit_recursion())
				goto recursion_alert;

			/* 检查skb是否有效 */
			skb = validate_xmit_skb(skb, dev, &again);
			if (!skb)
				goto out;

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_xmit_stopped(txq)) {
				dev_xmit_recursion_inc();
				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
				dev_xmit_recursion_dec();
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
					     dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
					     dev->name);
		}
	}

	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	atomic_long_inc(&dev->tx_dropped);
	kfree_skb_list(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}

dev_hard_start_xmit()

/* File: linux-5.4.1\net\core\dev.c */
/* dev_hard_start_xmit()将输出的数据包提交给网络设备的输出接口,完成数据包的输出
 *
 * SKB通过ip_local_out()走到这里, 在ip_local_out()中已经把IP层以及其以上各层已经封装完毕。该函数后
 * 开始走二层封装。
 *
 * xmit_one()负责输出数据
 */
struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
				    struct netdev_queue *txq, int *ret)
{
	struct sk_buff *skb = first;
	int rc = NETDEV_TX_OK;

	while (skb) {
		struct sk_buff *next = skb->next;

		skb_mark_not_on_list(skb);
		rc = xmit_one(skb, dev, txq, next != NULL);
		if (unlikely(!dev_xmit_complete(rc))) {
			skb->next = next;
			goto out;
		}

		skb = next;
		if (netif_tx_queue_stopped(txq) && skb) {
			rc = NETDEV_TX_BUSY;
			break;
		}
	}

out:
	*ret = rc;
	return skb;
}

xmit_one()

/* File: linux-5.4.1\net\core\dev.c */
/* 由 dev_hard_start_xmit() 调用,
 * netdev_start_xmit()用输出数据
 *
 */
static int xmit_one(struct sk_buff *skb, struct net_device *dev,
		    struct netdev_queue *txq, bool more)
{
	unsigned int len;
	int rc;

	if (dev_nit_active(dev))
		dev_queue_xmit_nit(skb, dev);

	len = skb->len;
	trace_net_dev_start_xmit(skb, dev);
	rc = netdev_start_xmit(skb, dev, txq, more);
	trace_net_dev_xmit(skb, rc, dev, len);

	return rc;
}

netdev_start_xmit()

/* File: linux-5.4.1\include\linux\netdevice.h */
static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev,
					    struct netdev_queue *txq, bool more)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	netdev_tx_t rc;

	rc = __netdev_start_xmit(ops, skb, dev, more);
	if (rc == NETDEV_TX_OK)
		txq_trans_update(txq);

	return rc;
}

static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops,
					      struct sk_buff *skb, struct net_device *dev,
					      bool more)
{
	__this_cpu_write(softnet_data.xmit.more, more);
	return ops->ndo_start_xmit(skb, dev);
}

最后这里的

ops->ndo_start_xmit(skb, dev);

就是根据网络设备驱动来调用的,如e1000的网络驱动:

/* File: linux-5.4.1\drivers\net\ethernet\intel\e1000e\netdev.c */
static const struct net_device_ops e1000e_netdev_ops = {
	.ndo_open		= e1000e_open,
	.ndo_stop		= e1000e_close,
	.ndo_start_xmit		= e1000_xmit_frame,  /* 这里是对应的函数指针挂载的驱动函数 */
	.ndo_get_stats64	= e1000e_get_stats64,
	.ndo_set_rx_mode	= e1000e_set_rx_mode,
	.ndo_set_mac_address	= e1000_set_mac,
	.ndo_change_mtu		= e1000_change_mtu,
	.ndo_do_ioctl		= e1000_ioctl,
	.ndo_tx_timeout		= e1000_tx_timeout,
	.ndo_validate_addr	= eth_validate_addr,

	.ndo_vlan_rx_add_vid	= e1000_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid	= e1000_vlan_rx_kill_vid,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller	= e1000_netpoll,
#endif
	.ndo_set_features = e1000_set_features,
	.ndo_fix_features = e1000_fix_features,
	.ndo_features_check	= passthru_features_check,
};

iptable_netfilter介绍以及简单代码分析_第4张图片

你可能感兴趣的:(linux,linux内核,编程)