Linux 流量控制源码学习
红色粗体斜体,是流控相关的部分。
/*
 * struct net_device - per-interface network device descriptor, as quoted
 * in these notes.
 *
 * The members that reference struct Qdisc (queue_lock, qdisc,
 * qdisc_sleeping, qdisc_list, tx_queue_len, ingress_lock, qdisc_ingress)
 * are the ones that tie the device to the traffic-control code.
 *
 * The NETIF_F_* and HAVE_* macros are defined inline next to the members
 * they describe; they do not affect the structure layout.
 */
struct net_device
{
	/*
	 * This is the first field of the "visible" part of this structure
	 * (i.e. as seen by users in the "Space.c" file). It is the name
	 * the interface.
	 */
	char name[IFNAMSIZ];
	/* device name hash chain */
	struct hlist_node name_hlist;
	/*
	 * I/O specific fields
	 * FIXME: Merge these and struct ifmap into one
	 */
	unsigned long mem_end; /* shared mem end */
	unsigned long mem_start; /* shared mem start */
	unsigned long base_addr; /* device I/O address */
	unsigned int irq; /* device IRQ number */
	/*
	 * Some hardware also needs these fields, but they are not
	 * part of the usual set specified in Space.c.
	 */
	unsigned char if_port; /* Selectable AUI, TP,..*/
	unsigned char dma; /* DMA channel */
	unsigned long state;
	struct net_device *next;
	/* The device initialization function. Called only once. */
	int (*init)(struct net_device *dev);
	/* ------- Fields preinitialized in Space.c finish here ------- */
	/* Net device features */
	unsigned long features;
#define NETIF_F_SG 1 /* Scatter/gather IO. */
#define NETIF_F_IP_CSUM 2 /* Can checksum only TCP/UDP over IPv4. */
#define NETIF_F_NO_CSUM 4 /* Does not require checksum. F.e. loopback. */
#define NETIF_F_HW_CSUM 8 /* Can checksum all the packets. */
#define NETIF_F_HIGHDMA 32 /* Can DMA to high memory. */
#define NETIF_F_FRAGLIST 64 /* Scatter/gather IO. */
#define NETIF_F_HW_VLAN_TX 128 /* Transmit VLAN hw acceleration */
#define NETIF_F_HW_VLAN_RX 256 /* Receive VLAN hw acceleration */
#define NETIF_F_HW_VLAN_FILTER 512 /* Receive filtering on VLAN */
#define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */
#define NETIF_F_GSO 2048 /* Enable software GSO. */
#define NETIF_F_LLTX 4096 /* LockLess TX */
	/* Segmentation offload features */
#define NETIF_F_GSO_SHIFT 16
#define NETIF_F_GSO_MASK 0xffff0000
#define NETIF_F_TSO (SKB_GSO_TCPV4 << NETIF_F_GSO_SHIFT)
#define NETIF_F_UFO (SKB_GSO_UDP << NETIF_F_GSO_SHIFT)
#define NETIF_F_GSO_ROBUST (SKB_GSO_DODGY << NETIF_F_GSO_SHIFT)
#define NETIF_F_TSO_ECN (SKB_GSO_TCP_ECN << NETIF_F_GSO_SHIFT)
#define NETIF_F_TSO6 (SKB_GSO_TCPV6 << NETIF_F_GSO_SHIFT)
	/* List of features with software fallbacks. */
#define NETIF_F_GSO_SOFTWARE (NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6)
#define NETIF_F_GEN_CSUM (NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
#define NETIF_F_ALL_CSUM (NETIF_F_IP_CSUM | NETIF_F_GEN_CSUM)
	struct net_device *next_sched;
	/* Interface index. Unique device identifier */
	int ifindex;
	int iflink;
	struct net_device_stats* (*get_stats)(struct net_device *dev);
	struct iw_statistics* (*get_wireless_stats)(struct net_device *dev);
	/* List of functions to handle Wireless Extensions (instead of ioctl).
	 * See <net/iw_handler.h> for details.
	 * (The tail of this comment was lost in the notes, which left it
	 * unterminated and hid the member below; restored here.) */
	const struct iw_handler_def * wireless_handlers;
	/* Instance data managed by the core of Wireless Extensions. */
	struct iw_public_data * wireless_data;
	struct ethtool_ops *ethtool_ops;
	/*
	 * This marks the end of the "visible" part of the structure. All
	 * fields hereafter are internal to the system, and may change at
	 * will (read: may be cleaned up at will).
	 */
	unsigned int flags; /* interface flags (a la BSD) */
	unsigned short gflags;
	unsigned short priv_flags; /* Like 'flags' but invisible to userspace. */
	unsigned short padded; /* How much padding added by alloc_netdev() */
	unsigned char operstate; /* RFC2863 operstate */
	unsigned char link_mode; /* mapping policy to operstate */
	unsigned mtu; /* interface MTU value */
	unsigned short type; /* interface hardware type */
	unsigned short hard_header_len; /* hardware hdr length */
	struct net_device *master; /* Pointer to master device of a group,
				    * which this device is member of.
				    */
	/* Interface address info. */
	unsigned char perm_addr[MAX_ADDR_LEN]; /* permanent hw address */
	unsigned char addr_len; /* hardware address length */
	unsigned short dev_id; /* for shared network cards */
	struct dev_mc_list *mc_list; /* Multicast mac addresses */
	int mc_count; /* Number of installed mcasts */
	int promiscuity;
	int allmulti;
	/* Protocol specific pointers */
	void *atalk_ptr; /* AppleTalk link */
	void *ip_ptr; /* IPv4 specific data */
	void *dn_ptr; /* DECnet specific data */
	void *ip6_ptr; /* IPv6 specific data */
	void *ec_ptr; /* Econet specific data */
	void *ax25_ptr; /* AX.25 specific data */
	/*
	 * Cache line mostly used on receive path (including eth_type_trans())
	 */
	struct list_head poll_list ____cacheline_aligned_in_smp;
	/* Link to poll list */
	int (*poll) (struct net_device *dev, int *quota);
	int quota;
	int weight;
	unsigned long last_rx; /* Time of last Rx */
	/* Interface address info used in eth_type_trans() */
	unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address, (before bcast
						 because most packets are unicast) */
	unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */
	/*
	 * Cache line mostly used on queue transmit path (qdisc)
	 */
	/* device queue lock */
	spinlock_t queue_lock ____cacheline_aligned_in_smp;
	struct Qdisc *qdisc;
	struct Qdisc *qdisc_sleeping;
	struct list_head qdisc_list;
	unsigned long tx_queue_len; /* Max frames per queue allowed */
	/* Partially transmitted GSO packet. */
	struct sk_buff *gso_skb;
	/* ingress path synchronizer */
	spinlock_t ingress_lock;
	struct Qdisc *qdisc_ingress;
	/*
	 * One part is mostly used on xmit path (device)
	 */
	/* hard_start_xmit synchronizer */
	spinlock_t _xmit_lock ____cacheline_aligned_in_smp;
	/* cpu id of processor entered to hard_start_xmit or -1,
	   if nobody entered there.
	 */
	int xmit_lock_owner;
	void *priv; /* pointer to private data */
	int (*hard_start_xmit) (struct sk_buff *skb,
				struct net_device *dev);
	/* These may be needed for future network-power-down code. */
	unsigned long trans_start; /* Time (in jiffies) of last Tx */
	int watchdog_timeo; /* used by dev_watchdog() */
	struct timer_list watchdog_timer;
	/*
	 * refcnt is a very hot point, so align it on SMP
	 */
	/* Number of references to this device */
	atomic_t refcnt ____cacheline_aligned_in_smp;
	/* delayed register/unregister */
	struct list_head todo_list;
	/* device index hash chain */
	struct hlist_node index_hlist;
	/* register/unregister state machine */
	enum { NETREG_UNINITIALIZED=0,
	       NETREG_REGISTERED, /* completed register_netdevice */
	       NETREG_UNREGISTERING, /* called unregister_netdevice */
	       NETREG_UNREGISTERED, /* completed unregister todo */
	       NETREG_RELEASED, /* called free_netdev */
	} reg_state;
	/* Called after device is detached from network. */
	void (*uninit)(struct net_device *dev);
	/* Called after last user reference disappears. */
	void (*destructor)(struct net_device *dev);
	/* Pointers to interface service routines. */
	int (*open)(struct net_device *dev);
	int (*stop)(struct net_device *dev);
#define HAVE_NETDEV_POLL
	int (*hard_header) (struct sk_buff *skb,
			    struct net_device *dev,
			    unsigned short type,
			    void *daddr,
			    void *saddr,
			    unsigned len);
	int (*rebuild_header)(struct sk_buff *skb);
#define HAVE_MULTICAST
	void (*set_multicast_list)(struct net_device *dev);
#define HAVE_SET_MAC_ADDR
	int (*set_mac_address)(struct net_device *dev,
			       void *addr);
#define HAVE_PRIVATE_IOCTL
	int (*do_ioctl)(struct net_device *dev,
			struct ifreq *ifr, int cmd);
#define HAVE_SET_CONFIG
	int (*set_config)(struct net_device *dev,
			  struct ifmap *map);
#define HAVE_HEADER_CACHE
	int (*hard_header_cache)(struct neighbour *neigh,
				 struct hh_cache *hh);
	void (*header_cache_update)(struct hh_cache *hh,
				    struct net_device *dev,
				    unsigned char * haddr);
#define HAVE_CHANGE_MTU
	int (*change_mtu)(struct net_device *dev, int new_mtu);
#define HAVE_TX_TIMEOUT
	void (*tx_timeout) (struct net_device *dev);
	void (*vlan_rx_register)(struct net_device *dev,
				 struct vlan_group *grp);
	void (*vlan_rx_add_vid)(struct net_device *dev,
				unsigned short vid);
	void (*vlan_rx_kill_vid)(struct net_device *dev,
				 unsigned short vid);
	int (*hard_header_parse)(struct sk_buff *skb,
				 unsigned char *haddr);
	int (*neigh_setup)(struct net_device *dev, struct neigh_parms *);
#ifdef CONFIG_NETPOLL
	struct netpoll_info *npinfo;
#endif
#ifdef CONFIG_NET_POLL_CONTROLLER
	void (*poll_controller)(struct net_device *dev);
#endif
	/* bridge stuff */
	struct net_bridge_port *br_port;
#ifdef CONFIG_NET_DIVERT
	/* this will get initialized at each interface type init routine */
	struct divert_blk *divert;
#endif /* CONFIG_NET_DIVERT */
	/* class/net/name entry */
	struct class_device class_dev;
	/* space for optional statistics and wireless sysfs groups */
	struct attribute_group *sysfs_groups[3];
};
/*
 * struct Qdisc - one queueing discipline instance attached to a net_device.
 * Carries the enqueue/dequeue entry points, a pointer to its Qdisc_ops,
 * its packet queue `q`, and statistics.
 */
struct Qdisc
{
int (*enqueue)(struct sk_buff *skb, struct Qdisc *dev);// enqueue operation of this queue
struct sk_buff * (*dequeue)(struct Qdisc *dev);// dequeue operation of this queue
unsigned flags;
#define TCQ_F_BUILTIN 1
#define TCQ_F_THROTTLED 2
#define TCQ_F_INGRESS 4
int padded;
struct Qdisc_ops *ops;// operations structure for this queue
u32 handle;// identifier (handle) of this queue
u32 parent;// for our traffic-control implementation this is ... (note left unfinished in the original)
atomic_t refcnt;
struct sk_buff_head q;
struct net_device *dev;// interface device this queue is associated with
struct list_head list;// linked into the qdisc_list list of the interface's net_device
struct gnet_stats_basic bstats;
struct gnet_stats_queue qstats;
struct gnet_stats_rate_est rate_est;
spinlock_t *stats_lock;// corresponds to the queue_lock of the interface's net_device
struct rcu_head q_rcu;
int (*reshape_fail)(struct sk_buff *skb,
struct Qdisc *q);
/* This field is deprecated, but it is still used by CBQ
* and it will live until better solution will be invented.
*/
struct Qdisc *__parent;
};
/*
 * struct Qdisc_ops - operations table for one queueing-discipline type
 * (e.g. htb). The notes below describe htb_qdisc_ops being added to the
 * global qdisc_base list of registered operations; `next` is presumably
 * the link used for that list.
 */
struct Qdisc_ops
{
	struct Qdisc_ops *next;		/* next entry (the notes call it "the next queue") */
	struct Qdisc_class_ops *cl_ops;	/* class operations of this queue type */
	char id[IFNAMSIZ];		/* queue type identifier, e.g. "htb" */
	int priv_size;			/* for htb: size of struct htb_sched */
	int (*enqueue)(struct sk_buff *, struct Qdisc *);	/* enqueue */
	struct sk_buff * (*dequeue)(struct Qdisc *);		/* dequeue */
	int (*requeue)(struct sk_buff *, struct Qdisc *);	/* requeue */
	unsigned int (*drop)(struct Qdisc *);			/* drop a packet */
	int (*init)(struct Qdisc *, struct rtattr *arg);	/* initialize */
	void (*reset)(struct Qdisc *);				/* reset */
	void (*destroy)(struct Qdisc *);			/* destroy */
	int (*change)(struct Qdisc *, struct rtattr *arg);	/* set to NULL for htb */
	/* The original notes label both dump hooks "debugging".
	 * NOTE(review): they take an sk_buff / gnet_dump target, so they
	 * look like state/statistics export hooks rather than debug
	 * helpers - confirm against the kernel source. */
	int (*dump)(struct Qdisc *, struct sk_buff *);
	int (*dump_stats)(struct Qdisc *, struct gnet_dump *);
	struct module *owner;		/* owning module */
};
/*
 * struct htb_sched - private scheduler state of an HTB qdisc.
 * Per the Qdisc_ops notes, priv_size for htb is the size of this struct.
 * Holds the class lists/hash, the per-level/per-priority rb-trees, the
 * qdisc-level filter list, timers and the direct (unshaped) queue.
 */
struct htb_sched
{
struct list_head root; /* root classes list */// list of all root (base) classes
struct list_head hash[HTB_HSIZE]; /* hashed by classid */// class hash buckets
struct list_head drops[TC_HTB_NUMPRIO]; /* active leaves (for drops) */
/* self list - roots of self generating tree */
struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
int row_mask[TC_HTB_MAXDEPTH];
struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
/* self wait list - roots of wait PQs per row */
struct rb_root wait_pq[TC_HTB_MAXDEPTH];
/* time of nearest event per level (row) */
unsigned long near_ev_cache[TC_HTB_MAXDEPTH];
/* cached value of jiffies in dequeue */
unsigned long jiffies;
/* whether we hit non-work conserving class during this dequeue; we use */
int nwc_hit; /* this to disable mindelay complaint in dequeue */
int defcls; /* class where unclassified flows go to */
u32 debug; /* subsystem debug levels */
/* filters for qdisc itself */
struct tcf_proto *filter_list;// all filters of this type
int filter_cnt;
int rate2quantum; /* quant = rate / rate2quantum */
psched_time_t now; /* cached dequeue time */
struct timer_list timer; /* send delay timer */
#ifdef HTB_RATECM
struct timer_list rttim; /* rate computer timer */
int recmp_bucket; /* which hash bucket to recompute next */
#endif
/* non shaped skbs; let them go directly thru */
struct sk_buff_head direct_queue;
int direct_qlen; /* max qlen of above */
long direct_pkts;
};
/*
 * struct htb_class - one class in the HTB hierarchy.
 * The `un` union holds either leaf data (attached qdisc, prio, quantum,
 * deficits) or inner-node data (per-priority feed trees). The token
 * bucket fields at the end (tokens/ctokens, buffer/cbuffer, rate/ceil)
 * are the ones updated by the HTB_ACCNT macro quoted later in these notes.
 */
struct htb_class
{
#ifdef HTB_DEBUG
unsigned magic;
#endif
/* general class parameters */
u32 classid;
struct gnet_stats_basic bstats;
struct gnet_stats_queue qstats;
struct gnet_stats_rate_est rate_est;
struct tc_htb_xstats xstats;/* our special stats */
int refcnt; /* usage count of this class */
#ifdef HTB_RATECM
/* rate measurement counters */
unsigned long rate_bytes,sum_bytes;
unsigned long rate_packets,sum_packets;
#endif
/* topology */
int level; /* our level (see above) */
struct htb_class *parent; /* parent class */
struct list_head hlist; /* classid hash list item */
struct list_head sibling; /* sibling list item */
struct list_head children; /* children list */
union {
struct htb_class_leaf {
struct Qdisc *q;
int prio;
int aprio;
int quantum;
int deficit[TC_HTB_MAXDEPTH];
struct list_head drop_list;
} leaf;
struct htb_class_inner {
struct rb_root feed[TC_HTB_NUMPRIO]; /* feed trees */
struct rb_node *ptr[TC_HTB_NUMPRIO]; /* current class ptr */
/* When class changes from state 1->2 and disconnects from
parent's feed then we lost ptr value and start from the
first child again. Here we store classid of the
last valid ptr (used when ptr is NULL). */
u32 last_ptr_id[TC_HTB_NUMPRIO];
} inner;
} un;
struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */
struct rb_node pq_node; /* node for event queue */
unsigned long pq_key; /* the same type as jiffies global */
int prio_activity; /* for which prios are we active */
enum htb_cmode cmode; /* current mode of the class */
/* class attached filters */
struct tcf_proto *filter_list;
int filter_cnt;
int warned; /* only one warning about non work conserving .. */
/* token bucket parameters */
struct qdisc_rate_table *rate; /* rate table of the class itself */
struct qdisc_rate_table *ceil; /* ceiling rate (limits borrows too) */
long buffer,cbuffer; /* token bucket depth/rate */
psched_tdiff_t mbuffer; /* max wait time */
long tokens,ctokens; /* current number of tokens */// current token counts
psched_time_t t_c; /* checkpoint time */
};
/*
 * struct tc_htb_opt - HTB class options. The notes further down quote
 * iproute's htb_parse_class_opt filling opt.rate.rate and opt.buffer
 * in a variable of this type.
 */
struct tc_htb_opt
{
struct tc_ratespec rate;// guaranteed rate
struct tc_ratespec ceil;// maximum (ceiling) rate
__u32 buffer;
__u32 cbuffer;
__u32 quantum;
__u32 level; /* out only */
__u32 prio;
};
/*
 * struct tcf_proto - one filter (classifier) instance. This is the type
 * held by the filter_list members of htb_sched and htb_class.
 * `next` chains instances together; `classify` is the per-packet entry
 * point that fills a tcf_result; `ops` points at the tcf_proto_ops
 * table for the filter.
 */
struct tcf_proto
{
/* Fast access part */
struct tcf_proto *next;
void *root;
int (*classify)(struct sk_buff*, struct tcf_proto*,
struct tcf_result *);
u32 protocol;
/* All the rest */
u32 prio;
u32 classid;
struct Qdisc *q;
void *data;
struct tcf_proto_ops *ops;
};
/*
 * struct tcf_proto_ops - operations table for one filter type, named
 * by `kind`. The notes describe cls_fw_ops being added to the global
 * tcf_proto_base list; `next` is presumably the link for that list.
 */
struct tcf_proto_ops
{
struct tcf_proto_ops *next;
char kind[IFNAMSIZ];
int (*classify)(struct sk_buff*, struct tcf_proto*,
struct tcf_result *);
int (*init)(struct tcf_proto*);
void (*destroy)(struct tcf_proto*);
unsigned long (*get)(struct tcf_proto*, u32 handle);
void (*put)(struct tcf_proto*, unsigned long);
int (*change)(struct tcf_proto*, unsigned long,
u32 handle, struct rtattr **,
unsigned long *);
int (*delete)(struct tcf_proto*, unsigned long);
void (*walk)(struct tcf_proto*, struct tcf_walker *arg);
/* rtnetlink specific */
int (*dump)(struct tcf_proto*, unsigned long,
struct sk_buff *skb, struct tcmsg*);
struct module *owner;
};
/*
 * struct fw_head - table of HTSIZE fw_filter pointers.
 * NOTE(review): presumably a hash table whose buckets are fw_filter
 * chains linked through fw_filter.next - confirm against cls_fw.c.
 */
struct fw_head
{
struct fw_filter *ht[HTSIZE];
};
/*
 * struct fw_filter - one entry of the fw filter.
 * Carries an id, the classification result `res`, an optional input
 * device name (only with CONFIG_NET_CLS_IND) and filter extensions.
 * `next` links entries into a singly linked chain.
 */
struct fw_filter
{
struct fw_filter *next;
u32 id;
struct tcf_result res;
#ifdef CONFIG_NET_CLS_IND
char indev[IFNAMSIZ];
#endif /* CONFIG_NET_CLS_IND */
struct tcf_exts exts;
};
将全局变量Htb队列规则操作htb_qdisc_ops加入到全局变量当前注册在系统中的所有队列规则操作链表qdisc_base中。
Linux源码对应的函数调用如下:
注意:
对于原有的class进行移植,对于我们的实现不存在这种情况。
构造响应消息,通过netlink连接,发送响应消息到用户层。
执行如下操作
Linux源码中创建class的函数调用如下所示,
注意:
叶子class指向的qdisc是fifo qdisc,而不是初始化的htb qdisc。
将全局变量过滤器操作cls_fw_ops加入到全局变量当前注册在系统中的所有过滤器链表tcf_proto_base。
在全局变量rtnetlink_links中添加过滤器操作。
代码结构如下所示:
入队的代码结构如下所示,
高效管理,见附录,红黑树
主要参照htb_charge_class函数的统计操作。
Rate和ceil:配置输入的是bit/s,在输入内核前转换成byte/s。
Tokens和ctokens:(参看iproute源码的htb_parse_class_opt函数)
buffer = opt.rate.rate / get_hz() + mtu;
转换成文字表达为:
Buffer = 配置的速率转换成byte/s的值/250 + mtu
其中opt.rate.rate是转换成byte/s的值,mtu默认为1600;get_hz()的值为250。
opt.buffer = tc_calc_xmittime(opt.rate.rate, buffer);
转换成文字表达为:
opt.buffer = (1000000*buffer)/配置的速率转换成byte/s的值
说明:可以看出opt.buffer随配置的速率的增加而减少,极限值为1000000/250 = 4000。
举例说明:
./tc class add dev eth0 parent 1:1 classid 1:12 htb rate 1kbit ceil 1kbit prio 0
打印信息为:
htb_parse_class_opt: <<<
htb_parse_class_opt: <<<
htb_parse_class_opt: <<<
有如下几种情况:
令牌桶的一个令牌,指的是该tc class 发送1个byte需要的ticks。
/*
 * HTB_ACCNT(T, B, R) - charge a packet against one token counter of `cl`.
 *
 *   T - name of the token field in *cl (e.g. tokens / ctokens)
 *   B - name of the matching bucket-depth field (e.g. buffer / cbuffer)
 *   R - name of the matching rate-table field (e.g. rate / ceil)
 *
 * Relies on these names being in scope at the point of use:
 *   cl    - pointer to the class being charged
 *   diff  - value added to the current tokens before charging
 *   bytes - packet size passed to L2T()
 *   toks  - scratch variable; holds the new token value afterwards
 *
 * Steps: add diff, cap at the bucket depth cl->B, subtract the L2T()
 * cost of the packet, and never let the result fall to -mbuffer or below
 * (it is pinned to 1 - mbuffer instead).
 *
 * Wrapped in do { } while (0) so the macro expands to exactly one
 * statement and is safe under an unbraced if/else.
 */
#define HTB_ACCNT(T,B,R) do { \
	toks = diff + cl->T; \
	if (toks > cl->B) toks = cl->B; \
	toks -= L2T(cl, cl->R, bytes); \
	if (toks <= -cl->mbuffer) toks = 1-cl->mbuffer; \
	cl->T = toks; \
} while (0)
/*
 * L2T - look up the rate-table entry for a packet of `size` bytes.
 *
 * The table index is size >> rate->rate.cell_log. The highest index
 * used is 255; a larger index is counted in cl->xstats.giants and the
 * entry at index 255 is returned instead.
 *
 * @cl:   class whose giants counter is bumped for oversized packets
 * @rate: rate table to consult
 * @size: packet length
 *
 * Returns rate->data[index]. Per the surrounding notes this is the cost
 * of the packet in ticks - NOTE(review): unit taken from the prose, not
 * from this code.
 */
static __inline__ long L2T(struct htb_class *cl,struct qdisc_rate_table *rate,
			   int size)
{
	int idx = size >> rate->rate.cell_log;

	if (idx <= 255)
		return rate->data[idx];

	/* Oversized packet: account for it and use the last table cell. */
	cl->xstats.giants++;
	return rate->data[255];
}
根据
上面的两个条件是对上限的限制,所以,判断条件是ceil令牌数小于某个给定的值,而不是大于某个给定的值。
至于class如何向父类借用带宽,取决于其父类是否处在发送模式的状态树中。在htb_dequeue函数中会遍历所有的level,如果父节点有剩余带宽可以使用,则它的子节点会继续发送报文。
根据所耗用的带宽,每个HTB类可能处于3种状态之一。
绿色:类的实际带宽小于等于其limit-at带宽。在这种状态下,类被附在它所在层的相应优先级的self slot上,并被允许满足其limit-at带宽,无论它父类的限制带宽是多少。例如:如果有一个leaf class的limit-at=512000,它父类的max-limit=limit-at=128000,该leaf class仍然可以达到512kbps!
黄色:类的实际带宽大于其limit-at,但小于等于max-limit。在此状态下,类被附于它父类inner feed的与它优先级相应的inner slot上;而父类的inner feed可能附于“祖父类”的相应优先级的inner slot(在父类也是黄色状态时)或者它(父类)所在层相应优先级的self slot(在父类是绿色状态时)上。在转换为这种状态时,类与它所在层的self feed“断开”,与它父类的inner feed“连接”。
红色:类的实际带宽超过max-limit。这个类不能从其父类处借用带宽。
大体的流程应该是这个样子,具体很多细节没有进行分析,时间和精力有限就分析到这里了。