分类流控qdisc之htb

文章目录

  • tc参数
    • 配置示例
  • 算法思想
  • 用户态实现
    • 数据结构
      • htb全局配置参数: tc_htb_glob
      • htb类配置参数: tc_htb_opt
    • qdisc配置参数解析: htb_parse_opt()
    • class配置参数解析: htb_parse_copt()
  • 内核实现
    • 数据结构
      • qdisc操作集私有数据: htb_sched
      • htb类结构: htb_class
    • qdisc操作集: htb_qdisc_ops
      • 初始化: htb_init()
    • qdisc类操作集: htb_class_ops
      • change()回调: htb_change_class()
      • 入队回调: htb_enqueue()
        • htb分类: htb_classify()
      • 出队: htb_dequeue()

tc参数

# 命令行语法
tc qdisc ... dev dev ( parent classid | root) [ handle major: ] htb [ default minor-id ]

tc class ... dev dev parent major:[minor] [ classid major:minor ] htb rate rate [ ceil rate ] burst bytes [ cburst bytes ] [ prio priority ]
  • “parent classid | root”: 如果htb作为队列的根qdisc,那么应该选择root。否则应该指明其属于哪个类的孩子;

  • “handle major:”: 指定该qdisc的句柄,该major无需和parent classid的主号码一致;

  • “default minor-id”: 表示如果数据包没有被分到某个class,那么应该将其分到哪个class,即默认class的次号码(主号码是明确的,就是该qdisc的主号码);

  • parent major:[minor]: 指定class的父亲,如果class的父亲就是qdisc,那么该class也称为根class,其父亲的minor可以不指定;其它class则必须指定其父亲class的句柄;

  • classid major:minor: 指定class自己的句柄;

  • rate rate: 指定该class要配置的速率;

  • ceil rate: htb在条件允许的情况下可以让class的速率超过rate参数,但是最大不能超过本参数。一个class其超过rate的速率其实是和父亲借来的(不用还),特别注意根class是不能向qdisc借用速率的;

  • prio priority: 指定class的优先级,值越小,优先级越高。当同一层的class都可以发送时,htb会对这些class进行遍历,每个class传输其quantum个字节后轮询下一个;

配置示例

# htb为eth0的根qdisc,其句柄为1:,默认的数据包分类为1:12
tc qdisc add dev eth0 root handle 1: htb default 12

# 建立一个如右图所示的class树,rate分别代表其速率,ceil分别代表其最大速率
tc class add dev eth0 parent 1: classid 1:1 htb rate 100kbps ceil 100kbps                      1:1
tc class add dev eth0 parent 1:1 classid 1:10 htb rate 30kbps ceil 100kbps                  /   |   \
tc class add dev eth0 parent 1:1 classid 1:11 htb rate 10kbps ceil 100kbps               1:10  1:11  1:12
tc class add dev eth0 parent 1:1 classid 1:12 htb rate 60kbps ceil 100kbps

算法思想

htb将class组织成一棵树,它将待传输的数据包缓存在叶子class的队列中,其它class是不可以缓存流量的,但是孩子class可以共享parent class的流量配额。

htb中的每个class虽然都配置了rate,但是并不是说class只能以指定的rate传输数据包,当网卡比较空闲时,是可以以高于rate的速率传输的,但不会高于ceil。简单来说,思想就是空闲时共享,繁忙时按照比例分配带宽。

每个class在某个时刻可以处于三种状态中的一种:

  1. CAN_SEND(令牌充足,此时发送速率小于rate);
  2. MAY_BORROW(没有令牌,但可借用,此时发送速率大于rate但小于ceil);
  3. CANT_SEND(没有令牌不可借用,此时发送速率大于ceil);

htb在收到出队请求时,按照下述原则出包:

  1. htb算法从class树的底部开始往上找处于CAN_SEND状态的class,如果找到某一层有CAN_SEND状态的类则停止;

  2. 如果该层中有多个class处于CAN_SEND状态则选取优先级最高(priority最小)的class,如果最高优先级还是有多个class,那就在这些类中轮询处理,每个类每发送自己的quantum个字节后,轮到下一个类发送;

  3. 只有叶子class才可以缓存数据包,如果步骤1,2最终选到了中间类,那么会顺着树往下找,找到一个孩子叶子class,并且该叶子类处于MAY_BORROW状态,将自己富余的令牌借给该叶子class让其传输。同样的道理,可能会有多个孩子叶子class处于MAY_BORROW状态,这里的处理跟步骤2是一样的,也是按照轮询处理;

用户态实现

数据结构

htb全局配置参数: tc_htb_glob

/*
 * Global (per-qdisc) HTB parameters. The tc tool fills this structure in
 * htb_parse_opt() and sends it to the kernel as the TCA_HTB_INIT attribute.
 */
struct tc_htb_glob {
    __u32 version;		/* to match HTB/TC */
    __u32 rate2quantum;	/* bps->quantum divisor */
    __u32 defcls;		/* default class number */
    __u32 debug;		/* debug flags */

    /* stats */
    __u32 direct_pkts; /* count of non shaped packets */
};

htb类配置参数: tc_htb_opt

/*
 * Per-class HTB parameters. The tc tool fills this structure in
 * htb_parse_class_opt() and sends it as the TCA_HTB_PARMS attribute.
 */
struct tc_htb_opt {
	struct tc_ratespec 	rate;	/* rate spec of the class itself */
	struct tc_ratespec 	ceil;	/* ceiling rate spec */
	__u32	buffer;		/* burst, as computed by tc_calc_xmittime() for rate */
	__u32	cbuffer;	/* cburst, as computed by tc_calc_xmittime() for ceil */
	__u32	quantum;	/* explicit quantum; 0 lets the kernel derive it from rate */
	__u32	level;		/* out only */
	__u32	prio; // priority of the class
};

qdisc配置参数解析: htb_parse_opt()

static int htb_parse_opt(struct qdisc_util *qu, int argc, char **argv,
    struct nlmsghdr *n)
{
	unsigned int direct_qlen = ~0U;
	struct tc_htb_glob opt = {
		.rate2quantum = 10,
		.version = 3,
	};
	struct rtattr *tail;
	unsigned int i; char *p;

	while (argc > 0) {
		if (matches(*argv, "r2q") == 0) {
			NEXT_ARG();
			if (get_u32(&opt.rate2quantum, *argv, 10)) {
				explain1("r2q"); return -1;
			}
		} else if (matches(*argv, "default") == 0) {
			NEXT_ARG();
			if (get_u32(&opt.defcls, *argv, 16)) {
				explain1("default"); return -1;
			}
		} else if (matches(*argv, "debug") == 0) {
			NEXT_ARG(); p = *argv;
			for (i = 0; i < 16; i++, p++) {
				if (*p < '0' || *p > '3') break;
				opt.debug |= (*p-'0')<<(2*i);
			}
		} else if (matches(*argv, "direct_qlen") == 0) {
			NEXT_ARG();
			if (get_u32(&direct_qlen, *argv, 10)) {
				explain1("direct_qlen"); return -1;
			}
		} else {
			fprintf(stderr, "What is \"%s\"?\n", *argv);
			explain();
			return -1;
		}
		argc--; argv++;
	}
	tail = NLMSG_TAIL(n);
	addattr_l(n, 1024, TCA_OPTIONS, NULL, 0);
	addattr_l(n, 2024, TCA_HTB_INIT, &opt, NLMSG_ALIGN(sizeof(opt)));
	if (direct_qlen != ~0U)
		addattr_l(n, 2024, TCA_HTB_DIRECT_QLEN, &direct_qlen, sizeof(direct_qlen));
	tail->rta_len = (void *) NLMSG_TAIL(n) - (void *) tail;
	return 0;
}

class配置参数解析: htb_parse_copt()

绝大多数参数和tbf的参数相同。

/*
 * Parse the command-line options of an "htb" class and append them to the
 * netlink request @n as a nested TCA_OPTIONS attribute holding
 * TCA_HTB_PARMS, TCA_HTB_RTAB and TCA_HTB_CTAB (plus TCA_HTB_RATE64 /
 * TCA_HTB_CEIL64 when a rate does not fit in 32 bits).
 *
 * "rate" is mandatory; "ceil" defaults to the rate, and missing burst
 * values are derived from the rate and the mtu.
 * Returns 0 on success, -1 on any parse or computation error.
 */
static int htb_parse_class_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n)
{
	struct tc_htb_opt opt = {};
	__u32 rate_tab[256], ceil_tab[256];
	__u64 rate64 = 0, ceil64 = 0;
	unsigned burst = 0, cburst = 0;
	int rate_cell_log = -1, ceil_cell_log = -1;
	unsigned int mtu = 1600; /* eth packet len */
	unsigned int linklayer = LINKLAYER_ETHERNET; /* Assume ethernet */
	unsigned short mpu = 0;
	unsigned short overhead = 0;
	struct rtattr *nest;
	int ok = 0;

	for (; argc > 0; argc--, argv++) {
		if (!matches(*argv, "prio")) {
			/* class priority */
			NEXT_ARG();
			if (get_u32(&opt.prio, *argv, 10)) {
				explain1("prio");
				return -1;
			}
			ok++;
		} else if (!matches(*argv, "mtu")) {
			NEXT_ARG();
			if (get_u32(&mtu, *argv, 10)) {
				explain1("mtu");
				return -1;
			}
		} else if (!matches(*argv, "mpu")) {
			NEXT_ARG();
			if (get_u16(&mpu, *argv, 10)) {
				explain1("mpu");
				return -1;
			}
		} else if (!matches(*argv, "overhead")) {
			NEXT_ARG();
			if (get_u16(&overhead, *argv, 10)) {
				explain1("overhead");
				return -1;
			}
		} else if (!matches(*argv, "linklayer")) {
			NEXT_ARG();
			if (get_linklayer(&linklayer, *argv)) {
				explain1("linklayer");
				return -1;
			}
		} else if (!matches(*argv, "quantum")) {
			NEXT_ARG();
			if (get_u32(&opt.quantum, *argv, 10)) {
				explain1("quantum");
				return -1;
			}
		} else if (!matches(*argv, "burst") ||
			   !strcmp(*argv, "buffer") ||
			   !strcmp(*argv, "maxburst")) {
			NEXT_ARG();
			if (get_size_and_cell(&burst, &rate_cell_log, *argv) < 0) {
				explain1("buffer");
				return -1;
			}
			ok++;
		} else if (!matches(*argv, "cburst") ||
			   !strcmp(*argv, "cbuffer") ||
			   !strcmp(*argv, "cmaxburst")) {
			NEXT_ARG();
			if (get_size_and_cell(&cburst, &ceil_cell_log, *argv) < 0) {
				explain1("cbuffer");
				return -1;
			}
			ok++;
		} else if (!strcmp(*argv, "ceil")) {
			NEXT_ARG();
			if (ceil64 != 0) {
				fprintf(stderr, "Double \"ceil\" spec\n");
				return -1;
			}
			if (get_rate64(&ceil64, *argv)) {
				explain1("ceil");
				return -1;
			}
			ok++;
		} else if (!strcmp(*argv, "rate")) {
			NEXT_ARG();
			if (rate64 != 0) {
				fprintf(stderr, "Double \"rate\" spec\n");
				return -1;
			}
			if (get_rate64(&rate64, *argv)) {
				explain1("rate");
				return -1;
			}
			ok++;
		} else if (!strcmp(*argv, "help")) {
			explain();
			return -1;
		} else {
			fprintf(stderr, "What is \"%s\"?\n", *argv);
			explain();
			return -1;
		}
	}

	if (rate64 == 0) {
		fprintf(stderr, "\"rate\" is required.\n");
		return -1;
	}

	/* a missing ceil defaults to the rate */
	if (ceil64 == 0)
		ceil64 = rate64;

	/* the 32-bit fields saturate; the exact value travels in the *64 attrs */
	if (rate64 >= (1ULL << 32))
		opt.rate.rate = ~0U;
	else
		opt.rate.rate = rate64;

	if (ceil64 >= (1ULL << 32))
		opt.ceil.rate = ~0U;
	else
		opt.ceil.rate = ceil64;

	/* minimal allowed burst derived from the rate; adding the mtu keeps
	   the buffer larger than one packet and leaves some safeguard space */
	if (burst == 0)
		burst = rate64 / get_hz() + mtu;
	if (cburst == 0)
		cburst = ceil64 / get_hz() + mtu;

	opt.rate.overhead = overhead;
	opt.rate.mpu = mpu;
	opt.ceil.overhead = overhead;
	opt.ceil.mpu = mpu;

	if (tc_calc_rtable(&opt.rate, rate_tab, rate_cell_log, mtu, linklayer) < 0) {
		fprintf(stderr, "htb: failed to calculate rate table.\n");
		return -1;
	}
	opt.buffer = tc_calc_xmittime(rate64, burst);

	if (tc_calc_rtable(&opt.ceil, ceil_tab, ceil_cell_log, mtu, linklayer) < 0) {
		fprintf(stderr, "htb: failed to calculate ceil rate table.\n");
		return -1;
	}
	opt.cbuffer = tc_calc_xmittime(ceil64, cburst);

	/* open the nested TCA_OPTIONS attribute; its length is patched below */
	nest = NLMSG_TAIL(n);
	addattr_l(n, 1024, TCA_OPTIONS, NULL, 0);

	if (rate64 >= (1ULL << 32))
		addattr_l(n, 1124, TCA_HTB_RATE64, &rate64, sizeof(rate64));

	if (ceil64 >= (1ULL << 32))
		addattr_l(n, 1224, TCA_HTB_CEIL64, &ceil64, sizeof(ceil64));

	addattr_l(n, 2024, TCA_HTB_PARMS, &opt, sizeof(opt));
	addattr_l(n, 3024, TCA_HTB_RTAB, rate_tab, 1024);
	addattr_l(n, 4024, TCA_HTB_CTAB, ceil_tab, 1024);
	nest->rta_len = (void *) NLMSG_TAIL(n) - (void *) nest;

	return 0;
}

内核实现

数据结构

qdisc操作集私有数据: htb_sched

/*
 * Private data of one HTB qdisc instance (its size is what
 * htb_qdisc_ops.priv_size reports).
 */
struct htb_sched {
	struct Qdisc_class_hash clhash; // hash table holding the classes of this qdisc
	struct list_head drops[TC_HTB_NUMPRIO]; /* active leaves (for drops) */

	/* self list - roots of self generating tree */
	struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
	int row_mask[TC_HTB_MAXDEPTH];
	struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
	u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];

	/* self wait list - roots of wait PQs per row */
	struct rb_root wait_pq[TC_HTB_MAXDEPTH];

	/* time of nearest event per level (row) */
	psched_time_t near_ev_cache[TC_HTB_MAXDEPTH];

	int defcls;	// minor id of the default class used for unclassified packets

	// filter list of the qdisc; decides how packets are classified
	struct tcf_proto *filter_list;

	int rate2quantum;	/* quant = rate / rate2quantum */
	psched_time_t now;	/* cached dequeue time */
	struct qdisc_watchdog watchdog;

	/* non shaped skbs; let them go directly thru */
	 // direct queue: packets placed here bypass the class tree and are sent as-is
	struct sk_buff_head direct_queue;
	int direct_qlen; // max number of packets in the direct queue; beyond it packets are dropped
	long direct_pkts; // running count of packets enqueued to the direct queue
};

htb类结构: htb_class

/* interior & leaf nodes; props specific to leaves are marked L: */
struct htb_class {
	struct Qdisc_class_common common; // generic class fields (classid, hash linkage)
	/* general class parameters */
	struct gnet_stats_basic bstats;
	struct gnet_stats_queue qstats;
	struct gnet_stats_rate_est rate_est;
	struct tc_htb_xstats xstats;	/* our special stats */
	int refcnt;	// reference count

	/* topology */
	int level;		/* our level (see above) */
	unsigned int children; // number of direct children
	struct htb_class *parent; // parent class; NULL for a root class

	int prio; /* these two are used only by leaves... */
	int quantum; /* but stored for parent-to-leaf return */

	union {
		struct htb_class_leaf {
			struct Qdisc *q; // qdisc attached to the leaf class
			int deficit[TC_HTB_MAXDEPTH];
			struct list_head drop_list;
		} leaf; // valid while the class is a leaf (level == 0)
		struct htb_class_inner {
			struct rb_root feed[TC_HTB_NUMPRIO];	/* feed trees */
			struct rb_node *ptr[TC_HTB_NUMPRIO];	/* current class ptr */
			/* When class changes from state 1->2 and disconnects from
			   parent's feed then we lost ptr value and start from the
			   first child again. Here we store classid of the
			   last valid ptr (used when ptr is NULL). */
			u32 last_ptr_id[TC_HTB_NUMPRIO];
		} inner; // valid while the class is an inner node
	} un;
	struct rb_node node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
	struct rb_node pq_node;	/* node for event queue */
	psched_time_t pq_key;

	int prio_activity;	/* for which prios are we active */
	enum htb_cmode cmode;	/* current mode of the class */

	// filter chain owned by this class; NULL when none is configured
	struct tcf_proto *filter_list;
	int filter_cnt;

	int warned;	/* only one warning about non work conserving .. */

	/* token bucket parameters */
	struct qdisc_rate_table *rate;	/* rate table of the class itself */
	struct qdisc_rate_table *ceil;	/* ceiling rate (limits borrows too) */
	long buffer, cbuffer;	/* token bucket depth/rate */
	psched_tdiff_t mbuffer;	/* max wait time */
	long tokens, ctokens;	/* current number of tokens */
	psched_time_t t_c;	/* checkpoint time */
};

qdisc操作集: htb_qdisc_ops

/*
 * Qdisc operations table for the "htb" discipline: wires the qdisc-level
 * callbacks and points at htb_class_ops for class-level operations.
 */
static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&htb_class_ops,
	.id		=	"htb",
	.priv_size	=	sizeof(struct htb_sched),	/* per-instance private area */
	.enqueue	=	htb_enqueue,
	.dequeue	=	htb_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	htb_drop,
	.init		=	htb_init,
	.reset		=	htb_reset,
	.destroy	=	htb_destroy,
	.change		=	NULL /* htb_change */,	/* no qdisc-level change callback */
	.dump		=	htb_dump,
	.owner		=	THIS_MODULE,
};

初始化: htb_init()

/*
 * Netlink attribute policy for the attributes nested inside TCA_OPTIONS;
 * passed to nla_parse_nested() by htb_init() and htb_change_class().
 */
static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
};

/*
 * Initialise a freshly created HTB qdisc from the user-supplied options.
 *
 * @sch: qdisc being initialised
 * @opt: nested TCA_OPTIONS attribute; must contain TCA_HTB_INIT
 *
 * Returns 0 on success, -EINVAL when the options are missing or carry the
 * wrong version, or the error reported by the attribute parser / class
 * hash allocation.
 */
static int htb_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct htb_sched *priv = qdisc_priv(sch);
	struct nlattr *attrs[TCA_HTB_INIT + 1];	/* parsed user-space options */
	struct tc_htb_glob *glob;
	int prio;
	int rc;

	if (opt == NULL)
		return -EINVAL;

	/* unpack the nested attributes into attrs[] */
	rc = nla_parse_nested(attrs, TCA_HTB_INIT, opt, htb_policy);
	if (rc < 0)
		return rc;

	if (!attrs[TCA_HTB_INIT]) {
		printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n");
		return -EINVAL;
	}

	/* the major version sent by tc must match the kernel's */
	glob = nla_data(attrs[TCA_HTB_INIT]);
	if (glob->version != (HTB_VER >> 16)) {
		printk(KERN_ERR "HTB: need tc/htb version %d (minor is %d), you have %d\n",
		       HTB_VER >> 16, HTB_VER & 0xffff, glob->version);
		return -EINVAL;
	}

	/* allocate and initialise the hash table that holds the classes */
	rc = qdisc_class_hash_init(&priv->clhash);
	if (rc < 0)
		return rc;

	for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
		INIT_LIST_HEAD(&priv->drops[prio]);

	qdisc_watchdog_init(&priv->watchdog, sch);
	skb_queue_head_init(&priv->direct_queue);

	/* direct queue limit follows the device, but is never below 2 */
	priv->direct_qlen = qdisc_dev(sch)->tx_queue_len;
	if (priv->direct_qlen < 2)	/* some devices have zero tx_queue_len */
		priv->direct_qlen = 2;

	/* rate2quantum is used as a divisor, so clamp it to at least 1 */
	priv->rate2quantum = glob->rate2quantum;
	if (priv->rate2quantum < 1)
		priv->rate2quantum = 1;

	priv->defcls = glob->defcls;
	return 0;
}

qdisc类操作集: htb_class_ops

/*
 * Class operations table for HTB; referenced from htb_qdisc_ops.cl_ops.
 * .change (htb_change_class) handles both class creation and modification.
 */
static const struct Qdisc_class_ops htb_class_ops = {
	.graft		=	htb_graft,
	.leaf		=	htb_leaf,
	.qlen_notify	=	htb_qlen_notify,
	.get		=	htb_get,
	.put		=	htb_put,
	.change		=	htb_change_class,
	.delete		=	htb_delete,
	.walk		=	htb_walk,
	.tcf_chain	=	htb_find_tcf,
	.bind_tcf	=	htb_bind_filter,
	.unbind_tcf	=	htb_unbind_filter,
	.dump		=	htb_dump_class,
	.dump_stats	=	htb_dump_class_stats,
};

change()回调: htb_change_class()

@sch: 要操作的qdisc对象;
@classid: 要操作的类句柄,必须指定;
@parentid: 要操作的类的parent,可能为htb qdisc的句柄,也可能为某个类的句柄
@tca: 用户态参数信息;
@arg: 出参,保存修改后class标识信息(本函数为指针);
/*
 * htb_change_class - create a new HTB class or update an existing one.
 * @sch:      qdisc being operated on
 * @classid:  handle of the class to create or modify
 * @parentid: handle of the parent: the qdisc itself (root class) or a class
 * @tca:      netlink attributes received from user space
 * @arg:      in: pointer to the existing class, or 0 to create one;
 *            out: pointer to the created/modified class
 *
 * Returns 0 on success or a negative errno. Every failure path leaves
 * through the "failure" label so that the rate-table references taken by
 * qdisc_get_rtab() are always released.
 */
static int htb_change_class(struct Qdisc *sch, u32 classid,
    u32 parentid, struct nlattr **tca, unsigned long *arg)
{
	int err = -EINVAL;
	struct htb_sched *q = qdisc_priv(sch);
	struct htb_class *cl = (struct htb_class *)*arg, *parent;
	struct nlattr *opt = tca[TCA_OPTIONS];
	struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
	struct nlattr *tb[TCA_HTB_RTAB + 1];
	struct tc_htb_opt *hopt;

	/* extract all subattrs from opt attr */
	if (!opt)
		goto failure;
	/* unpack the user-space parameters into tb[] */
	err = nla_parse_nested(tb, TCA_HTB_RTAB, opt, htb_policy);
	if (err < 0)
		goto failure;
	err = -EINVAL;
	if (tb[TCA_HTB_PARMS] == NULL)
		goto failure;

	/* look up the parent class; a root class has the qdisc as its parent,
	   so no class object is found for it and parent stays NULL */
	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);

	/* per-class parameters */
	hopt = nla_data(tb[TCA_HTB_PARMS]);

	/* both the rate table and the ceil rate table are mandatory */
	rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]);
	ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]);
	if (!rtab || !ctab)
		goto failure;

	if (!cl) {		/* new class */
		struct Qdisc *new_q;
		int prio;
		struct {
			struct nlattr		nla;
			struct gnet_estimator	opt;
		} est = {
			.nla = {
				.nla_len	= nla_attr_size(sizeof(est.opt)),
				.nla_type	= TCA_RATE,
			},
			.opt = {
				/* 4s interval, 16s averaging constant */
				.interval	= 2,
				.ewma_log	= 2,
			},
		};

		/* classid must be non-zero, share the qdisc's major number
		   and not be in use already */
		if (!classid || TC_H_MAJ(classid ^ sch->handle) || htb_find(classid, sch))
			goto failure;

		/* check maximal depth */
		if (parent && parent->parent && parent->parent->level < 2) {
			printk(KERN_ERR "htb: tree is too deep\n");
			goto failure;
		}
		/* allocate the class object */
		err = -ENOBUFS;
		if ((cl = kzalloc(sizeof(*cl), GFP_KERNEL)) == NULL)
			goto failure;

		/* fall back to the built-in default estimator when user space
		   did not supply TCA_RATE */
		err = gen_new_estimator(&cl->bstats, &cl->rate_est,
					qdisc_root_sleeping_lock(sch),
					tca[TCA_RATE] ? : &est.nla);
		if (err) {
			kfree(cl);
			goto failure;
		}
		/* initialise the fields of the new class */
		cl->refcnt = 1;
		cl->children = 0;
		INIT_LIST_HEAD(&cl->un.leaf.drop_list);
		RB_CLEAR_NODE(&cl->pq_node);

		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
			RB_CLEAR_NODE(&cl->node[prio]);

		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
		   so that can't be used inside of sch_tree_lock
		   -- thanks to Karlis Peisenieks */
		/* the new class gets a default pfifo qdisc */
		new_q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
					  &pfifo_qdisc_ops, classid);
		sch_tree_lock(sch);
		if (parent && !parent->level) {
			/* the parent was a leaf (level 0) and now gets its
			   first child, so it must become an inner node */
			unsigned int qlen = parent->un.leaf.q->q.qlen;

			/* turn parent into inner node */
			qdisc_reset(parent->un.leaf.q);
			qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen);
			/* inner classes carry no qdisc: destroy the old leaf one */
			qdisc_destroy(parent->un.leaf.q);
			if (parent->prio_activity)
				htb_deactivate(q, parent);

			/* remove from evt list because of level change */
			if (parent->cmode != HTB_CAN_SEND) {
				htb_safe_rb_erase(&parent->pq_node, q->wait_pq);
				parent->cmode = HTB_CAN_SEND;
			}
			parent->level = (parent->parent ? parent->parent->level : TC_HTB_MAXDEPTH) - 1;
			/* the union now holds inner-node data; start from zero */
			memset(&parent->un.inner, 0, sizeof(parent->un.inner));
		}
		/* the new class has no children yet, i.e. it is a leaf:
		   attach its qdisc (noop_qdisc if the allocation failed) */
		cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
		/* record classid and parent */
		cl->common.classid = classid;
		cl->parent = parent;

		/* set class to be in HTB_CAN_SEND state */
		cl->tokens = hopt->buffer;
		cl->ctokens = hopt->cbuffer;
		cl->mbuffer = 60 * PSCHED_TICKS_PER_SEC;	/* 1min */
		cl->t_c = psched_get_time();
		cl->cmode = HTB_CAN_SEND;

		/* publish the class in the class hash table */
		qdisc_class_hash_insert(&q->clhash, &cl->common);
		if (parent)
			parent->children++;
	} else {		/* modify an existing class */
		if (tca[TCA_RATE]) {
			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
			    qdisc_root_sleeping_lock(sch), tca[TCA_RATE]);
			/* the tree lock is not held yet; leave through the
			   common error path so rtab/ctab are released
			   instead of being leaked by a bare return */
			if (err)
				goto failure;
		}
		sch_tree_lock(sch);
	}

	/* it used to be a nasty bug here, we have to check that node
	   is really leaf before changing cl->un.leaf ! */
	if (!cl->level) {
		cl->quantum = rtab->rate.rate / q->rate2quantum;
		if (!hopt->quantum && cl->quantum < 1000) {
			printk(KERN_WARNING
			       "HTB: quantum of class %X is small. Consider r2q change.\n",
			       cl->common.classid);
			cl->quantum = 1000;
		}
		if (!hopt->quantum && cl->quantum > 200000) {
			printk(KERN_WARNING
			       "HTB: quantum of class %X is big. Consider r2q change.\n",
			       cl->common.classid);
			cl->quantum = 200000;
		}
		/* an explicit quantum from user space overrides the derived one */
		if (hopt->quantum)
			cl->quantum = hopt->quantum;
		if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
			cl->prio = TC_HTB_NUMPRIO - 1;
	}

	cl->buffer = hopt->buffer;
	cl->cbuffer = hopt->cbuffer;
	/* swap in the new rate tables, dropping the references to the old ones */
	if (cl->rate)
		qdisc_put_rtab(cl->rate);
	cl->rate = rtab;
	if (cl->ceil)
		qdisc_put_rtab(cl->ceil);
	cl->ceil = ctab;
	sch_tree_unlock(sch);
	/* give the class hash a chance to grow */
	qdisc_class_hash_grow(sch, &q->clhash);
	/* hand the class pointer back to the caller */
	*arg = (unsigned long)cl;
	return 0;

failure:
	if (rtab)
		qdisc_put_rtab(rtab);
	if (ctab)
		qdisc_put_rtab(ctab);
	return err;
}

入队回调: htb_enqueue()

/*
 * Enqueue callback of the HTB qdisc.
 *
 * Classifies @skb with htb_classify() and then either puts it on the direct
 * queue, drops it, or enqueues it on the qdisc of the selected leaf class.
 * Returns NET_XMIT_SUCCESS when the packet was queued, otherwise the
 * drop/bypass code produced by the failing step.
 */
static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	int uninitialized_var(ret);
	struct htb_sched *q = qdisc_priv(sch);
	// classify the packet to find the class it belongs to
	struct htb_class *cl = htb_classify(skb, sch, &ret);

	if (cl == HTB_DIRECT) { // packet goes to the direct queue
		/* enqueue to helper queue */
		if (q->direct_queue.qlen < q->direct_qlen) {
			__skb_queue_tail(&q->direct_queue, skb);
			q->direct_pkts++;
		} else {
			// direct queue is full: drop the packet
			kfree_skb(skb);
			sch->qstats.drops++;
			return NET_XMIT_DROP;
		}
#ifdef CONFIG_NET_CLS_ACT
	} else if (!cl) {
		// classification returned NULL: free the packet and report ret;
		// it is counted as a drop only when the BYPASS flag is set
		if (ret & __NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
#endif
	} else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q)) != NET_XMIT_SUCCESS) {
	    // a leaf class was found but its qdisc refused the packet
		if (net_xmit_drop_count(ret)) {
			sch->qstats.drops++;
			cl->qstats.drops++;
		}
		return ret;
	} else {
	    // a leaf class was found and the packet was queued: update the
	    // class statistics and mark the class active
		cl->bstats.packets += skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
		cl->bstats.bytes += qdisc_pkt_len(skb);
		htb_activate(q, cl);
	}
    // packet was queued (direct queue or leaf): update the qdisc statistics
	sch->q.qlen++;
	sch->bstats.packets += skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
	sch->bstats.bytes += qdisc_pkt_len(skb);
	return NET_XMIT_SUCCESS;
}
htb分类: htb_classify()

注释解释的非常清晰:

  1. 返回NULL时,htb会将数据包丢弃。比如速率太高,流控策略已经不允许入队列;
  2. 返回HTB_DIRECT(实际是-1)会将数据包放入htb的直接发送队列。这种数据包不会受htb的流控策略,htb会将它们直接发送给网络设备;
  3. 高层协议可以通过设置skb->priority字段,可以跳过htb自己的filter策略,指定让数据包直接分到某个叶子class;特别的,skb->priority如果指定的是根qdisc的句柄,那么也会返回HTB_DIRECT;
  4. 返回叶子class,htb会将数据包放入该class的qdisc队列;
/**
 * htb_classify - classify a packet into a class
 *
 * Returns NULL when the packet should be dropped, HTB_DIRECT (-1) when it
 * should be passed straight through, and a leaf class in all other cases.
 *
 * Direct class selection through skb->priority is tried first. Then the
 * filters of the qdisc are consulted, followed by the filters of inner
 * nodes whenever a filter points at an inner node. A resulting classid of
 * MAJOR:0 sends the skb to the special internal fifo (direct). If no valid
 * leaf has been found, the MAJOR:default leaf is used; if that is
 * unsuccessful too, the direct queue is returned.
 */
#define HTB_DIRECT (struct htb_class*)-1

static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
{
	struct htb_sched *priv = qdisc_priv(sch);
	struct htb_class *hit;
	struct tcf_result res;
	struct tcf_proto *chain;
	int verdict;

	/* allow to select class by setting skb->priority to valid classid;
	   note that nfmark can be used too by attaching filter fw with no
	   rules in it */
	if (skb->priority == sch->handle)
		return HTB_DIRECT;	/* X:0 (direct flow) selected */

	/* skb->priority names a leaf class directly */
	hit = htb_find(skb->priority, sch);
	if (hit != NULL && hit->level == 0)
		return hit;

	/* run the filter chains, descending through inner classes */
	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	for (chain = priv->filter_list; chain; chain = hit->filter_list) {
		verdict = tc_classify(skb, chain, &res);
		if (verdict < 0)
			break;
#ifdef CONFIG_NET_CLS_ACT
		if (verdict == TC_ACT_QUEUED || verdict == TC_ACT_STOLEN) {
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			return NULL;
		}
		if (verdict == TC_ACT_SHOT)
			return NULL;
#endif
		hit = (void *)res.class;
		if (hit == NULL) {
			/* no class pointer in the result: go by classid */
			if (res.classid == sch->handle)
				return HTB_DIRECT;	/* X:0 (direct flow) */
			hit = htb_find(res.classid, sch);
			if (hit == NULL)
				break;	/* filter selected invalid classid */
		}
		if (!hit->level)
			return hit;	/* we hit leaf; return it */
		/* inner class: the loop continues with its own filter chain */
	}

	/* classification failed: fall back to the default class */
	hit = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), priv->defcls), sch);
	if (!hit || hit->level)
		return HTB_DIRECT;	/* bad default .. this is safe bet */
	return hit;
}

出队: htb_dequeue()

/*
 * Dequeue callback of the HTB qdisc.
 *
 * Packets on the direct queue are served first. Otherwise the levels are
 * scanned from 0 upwards and, within a level, the active priorities recorded
 * in row_mask are tried via htb_dequeue_tree(). When nothing can be sent the
 * overlimit counter is bumped and the watchdog is armed for the nearest
 * pending event. Returns the dequeued skb, or NULL when none is available.
 */
static struct sk_buff *htb_dequeue(struct Qdisc *sch)
{
	struct sk_buff *skb = NULL;
	struct htb_sched *q = qdisc_priv(sch);
	int level;
	psched_time_t next_event;
	unsigned long start_at;

	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
	// the direct queue is always served first
	skb = __skb_dequeue(&q->direct_queue);
	if (skb != NULL) {
		sch->flags &= ~TCQ_F_THROTTLED;
		sch->q.qlen--;
		return skb;
	}
    // nothing queued anywhere in the qdisc
	if (!sch->q.qlen)
		goto fin;
	q->now = psched_get_time();
	start_at = jiffies;

	// upper bound for the next wake-up; lowered below by per-level events
	next_event = q->now + 5 * PSCHED_TICKS_PER_SEC;
    // walk the levels starting at 0 (the leaf level) looking for a packet
	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
		/* common case optimization - skip event handler quickly */
		int m;
		psched_time_t event;

		if (q->now >= q->near_ev_cache[level]) {
			// the cached event time has passed: process this level's events
			event = htb_do_events(q, level, start_at);
			if (!event)
				event = q->now + PSCHED_TICKS_PER_SEC;
			q->near_ev_cache[level] = event;
		} else
			event = q->near_ev_cache[level];

		if (next_event > event)
			next_event = event;

		// m is the inverted row mask, so each zero bit of m is a bit set
		// in row_mask; the bit is set in m once that priority was tried
		m = ~q->row_mask[level];
		while (m != (int)(-1)) {
			int prio = ffz(m);
			m |= 1 << prio;
			skb = htb_dequeue_tree(q, prio, level);
			if (likely(skb != NULL)) {
				sch->q.qlen--;
				sch->flags &= ~TCQ_F_THROTTLED;
				goto fin;
			}
		}
	}
	// packets are queued but none may be sent now: count the overlimit
	// and let the watchdog fire at the nearest event
	sch->qstats.overlimits++;
	qdisc_watchdog_schedule(&q->watchdog, next_event);
fin:
	return skb;
}

你可能感兴趣的:(linux网络设备接口层,流量控制,htb)