ovs支持qos功能,其根据datapath类型调用不同的接口,对于kernel datapath,qos使用kernel提供的tc模块实现了入方向限速和出方向流量整形。对于userspace datapath,qos使用了dpdk提供的qos lib实现比较简单的入方向和出方向限速。
kernel的tc提供了多种调度机制,但是ovs目前只支持如下两种:htb和hfsc,更多qos参数可参考ovs db中qos和queue的定义。
image.png
也可以参考这篇文章,专门讲述qos。
先看一下代码中配置qos的地方,配置这部分是不区分哪种qos的,适用于kernel和userspace qos,区别是根据类型调用不同的api。
userspace qos配置的例子可以参考官网。
不管哪种qos,配置都会下到最底层接口收发包模块,不会在datapath或者slow path中。
遍历所有bridge上所有port下的interface,执行iface_configure_qos
static void
bridge_reconfigure(const struct ovsrec_open_vswitch *ovs_cfg)
HMAP_FOR_EACH (br, node, &all_bridges) {
struct port *port;
/* We need the datapath ID early to allow LACP ports to use it as the
* default system ID. */
bridge_configure_datapath_id(br);
HMAP_FOR_EACH (port, hmap_node, &br->ports) {
struct iface *iface;
LIST_FOR_EACH (iface, port_elem, &port->ifaces) {
...
iface_configure_qos(iface, port->cfg->qos);
...
}
}
...
}
配置接口的qos,入方向,出方向,队列上的配置都在此函数完成。对于userspace qos来说,调用dpdk提供的接口,入方向qos配置最终调用 netdev_dpdk_set_policing,出方向qos配置调用 netdev_dpdk_set_qos
static void
iface_configure_qos(struct iface *iface, const struct ovsrec_qos *qos)
{
struct ofpbuf queues_buf;
ofpbuf_init(&queues_buf, 0);
if (!qos || qos->type[0] == '\0') {
netdev_set_qos(iface->netdev, NULL, NULL);
} else {
const struct ovsdb_datum *queues;
struct netdev_queue_dump dump;
unsigned int queue_id;
struct smap details;
bool queue_zero;
size_t i;
/* Configure top-level Qos for 'iface'. */
//class->set_qos(netdev, type, details) -- netdev_dpdk_set_qos
netdev_set_qos(iface->netdev, qos->type, &qos->other_config);
/* Deconfigure queues that were deleted. */
queues = ovsrec_qos_get_queues(qos, OVSDB_TYPE_INTEGER, OVSDB_TYPE_UUID);
smap_init(&details);
NETDEV_QUEUE_FOR_EACH (&queue_id, &details, &dump, iface->netdev) {
if (!queue_ids_include(queues, queue_id)) {
netdev_delete_queue(iface->netdev, queue_id);
}
}
smap_destroy(&details);
/* Configure queues for 'iface'. */
queue_zero = false;
for (i = 0; i < qos->n_queues; i++) {
const struct ovsrec_queue *queue = qos->value_queues[i];
queue_id = qos->key_queues[i];
if (queue_id == 0) {
queue_zero = true;
}
if (queue->n_dscp == 1) {
struct ofproto_port_queue *port_queue;
port_queue = ofpbuf_put_uninit(&queues_buf,
sizeof *port_queue);
port_queue->queue = queue_id;
port_queue->dscp = queue->dscp[0];
}
//class->set_queue(netdev, queue_id, details)
netdev_set_queue(iface->netdev, queue_id, &queue->other_config);
}
if (!queue_zero) {
smap_init(&details);
netdev_set_queue(iface->netdev, 0, &details);
smap_destroy(&details);
}
}
if (iface->ofp_port != OFPP_NONE) {
const struct ofproto_port_queue *port_queues = queues_buf.data;
size_t n_queues = queues_buf.size / sizeof *port_queues;
ofproto_port_set_queues(iface->port->bridge->ofproto, iface->ofp_port,
port_queues, n_queues);
}
//netdev->netdev_class->set_policing(netdev, kbits_rate, kbits_burst) -- netdev_dpdk_set_qos
netdev_set_policing(iface->netdev,
MIN(UINT32_MAX, iface->cfg->ingress_policing_rate),
MIN(UINT32_MAX, iface->cfg->ingress_policing_burst));
ofpbuf_uninit(&queues_buf);
}
入方向设置 netdev_dpdk_set_policing
/* Installs, replaces or removes the ingress policer of 'netdev'.
 *
 * 'policer_rate' and 'policer_burst' are in kbits (see the conversion in
 * netdev_dpdk_policer_construct()).  A rate of 0 removes the policer and
 * forces the burst to 0; a non-zero rate with a burst of 0 uses a burst of
 * 8000.
 *
 * When the normalized rate and burst equal the values already stored on the
 * device nothing is touched.  Otherwise the old policer, if any, is freed
 * after an RCU grace period and the new one (or NULL) is published through
 * 'dev->ingress_policer'.
 *
 * Always returns 0; the stored rate and burst are updated even when
 * netdev_dpdk_policer_construct() returns NULL. */
static int
netdev_dpdk_set_policing(struct netdev* netdev, uint32_t policer_rate,
                         uint32_t policer_burst)
{
    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
    struct ingress_policer *old_policer;
    struct ingress_policer *new_policer = NULL;

    /* Normalize the burst: meaningless without a rate, defaulted when the
     * user gave a rate but no burst. */
    if (!policer_rate) {
        policer_burst = 0;
    } else if (!policer_burst) {
        policer_burst = 8000;
    }

    ovs_mutex_lock(&dev->mutex);

    old_policer = ovsrcu_get_protected(struct ingress_policer *,
                                       &dev->ingress_policer);

    if (dev->policer_rate != policer_rate
        || dev->policer_burst != policer_burst) {
        /* Settings changed: retire the current policer, if there is one. */
        if (old_policer) {
            ovsrcu_postpone(free, old_policer);
        }

        /* Build the replacement; a zero rate means "no policer". */
        if (policer_rate != 0) {
            new_policer = netdev_dpdk_policer_construct(policer_rate,
                                                        policer_burst);
        }

        /* Publish it on the device so the receive path picks it up. */
        ovsrcu_set(&dev->ingress_policer, new_policer);
        dev->policer_rate = policer_rate;
        dev->policer_burst = policer_burst;
    }
    /* Otherwise assume the settings have not changed since they were last
     * applied and leave everything alone. */

    ovs_mutex_unlock(&dev->mutex);

    return 0;
}
//rte_meter_srtcm_config 为dpdk提供的api
/* Allocates an ingress policer and configures its single-rate meter.
 *
 * 'rate' and 'burst' are given in kbits and converted to bytes
 * (x 1000 / 8, evaluated in 64 bits) before being stored as the meter's CIR
 * and CBS; EBS is set to 0.
 *
 * Returns the new policer, which the caller owns, or NULL when
 * rte_meter_srtcm_config() reports an error (the allocation is freed and an
 * error is logged in that case). */
static struct ingress_policer *
netdev_dpdk_policer_construct(uint32_t rate, uint32_t burst)
{
    struct ingress_policer *ip = xmalloc(sizeof *ip);

    rte_spinlock_init(&ip->policer_lock);

    /* rte_meter works in bytes, the arguments are in kbits. */
    ip->app_srtcm_params.cir = (uint64_t) rate * 1000 / 8;
    ip->app_srtcm_params.cbs = (uint64_t) burst * 1000 / 8;
    ip->app_srtcm_params.ebs = 0;

    if (rte_meter_srtcm_config(&ip->in_policer, &ip->app_srtcm_params)) {
        VLOG_ERR("Could not create rte meter for ingress policer");
        free(ip);
        return NULL;
    }

    return ip;
}
出方向设置 netdev_dpdk_set_qos
当前看的是2.8.2版本的ovs,目前只支持一种出方向限速egress_policer_ops 。最终也是调用dpdk的lib api rte_meter_srtcm_config进行配置,和入方向配置使用的函数相同。
/* Callback table for the "egress-policer" QoS type.
 *
 * netdev_dpdk_set_qos() obtains a table like this one from
 * qos_lookup_name() and calls its qos_construct, qos_is_equal and
 * qos_destruct members; the transmit path calls qos_run through
 * 'qos_conf->ops'.
 *
 * NOTE(review): the initializer is positional, so the entries must follow
 * the member order of struct dpdk_qos_ops, whose declaration is not visible
 * here -- confirm against it before reordering. */
static const struct dpdk_qos_ops egress_policer_ops = {
"egress-policer", /* qos_name */
egress_policer_qos_construct, /* reached via new_ops->qos_construct */
egress_policer_qos_destruct,
egress_policer_qos_get,
egress_policer_qos_is_equal,
egress_policer_run /* reached via qos_conf->ops->qos_run */
};
static int
netdev_dpdk_set_qos(struct netdev *netdev, const char *type,
const struct smap *details)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
const struct dpdk_qos_ops *new_ops = NULL;
struct qos_conf *qos_conf, *new_qos_conf = NULL;
int error = 0;
ovs_mutex_lock(&dev->mutex);
qos_conf = ovsrcu_get_protected(struct qos_conf *, &dev->qos_conf);
//目前只有一种:egress_policer_ops,name为egress-policer
new_ops = qos_lookup_name(type);
if (!new_ops || !new_ops->qos_construct) {
new_qos_conf = NULL;
if (type && type[0]) {
error = EOPNOTSUPP;
}
} else if (qos_conf && qos_conf->ops == new_ops
&& qos_conf->ops->qos_is_equal(qos_conf, details)) {
new_qos_conf = qos_conf;
} else {
//egress_policer_qos_construct
error = new_ops->qos_construct(details, &new_qos_conf);
}
if (error) {
VLOG_ERR("Failed to set QoS type %s on port %s: %s",
type, netdev->name, rte_strerror(error));
}
if (new_qos_conf != qos_conf) {
ovsrcu_set(&dev->qos_conf, new_qos_conf);
if (qos_conf) {
ovsrcu_postpone(qos_conf->ops->qos_destruct, qos_conf);
}
}
ovs_mutex_unlock(&dev->mutex);
return error;
}
/* qos_construct callback for the "egress-policer" QoS type.
 *
 * Allocates an egress policer, fills its meter parameters from 'details'
 * and configures the meter with rte_meter_srtcm_config().
 *
 * On success stores the embedded qos_conf in '*conf' and returns 0.  On
 * failure frees the policer, stores NULL in '*conf' and returns the negated
 * rte_meter_srtcm_config() result. */
static int
egress_policer_qos_construct(const struct smap *details,
                             struct qos_conf **conf)
{
    struct egress_policer *ep = xmalloc(sizeof *ep);
    int rc;

    qos_conf_init(&ep->qos_conf, &egress_policer_ops);

    /* Translate the user-supplied settings into meter parameters. */
    egress_policer_details_to_param(details, &ep->app_srtcm_params);

    rc = rte_meter_srtcm_config(&ep->egress_meter, &ep->app_srtcm_params);
    if (rc) {
        free(ep);
        *conf = NULL;
        return -rc;
    }

    *conf = &ep->qos_conf;
    return 0;
}
入方向qos处理
ovs dpdk中收包函数有两种,一种是vhostuser类型,调用netdev_dpdk_vhost_rxq_recv,一种是物理网卡类型的,调用netdev_dpdk_rxq_recv
netdev_dpdk_vhost_rxq_recv
netdev_dpdk_rxq_recv
struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
//从接口设备取出policer
struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
int nb_rx;
int dropped = 0;
//调用dpdk函数rte_eth_rx_burst收包
nb_rx = rte_eth_rx_burst(rx->port_id, rxq->queue_id,
(struct rte_mbuf **) batch->packets,
NETDEV_MAX_BURST);
//如果policer不为空,说明配置了qos,开始处理报文。
if (policer) {
dropped = nb_rx;
nb_rx = ingress_policer_run(policer, (struct rte_mbuf **) batch->packets, nb_rx);
rte_spinlock_lock(&policer->policer_lock);
cnt = netdev_dpdk_policer_run(&policer->in_policer, pkts, pkt_cnt);
int i = 0;
int cnt = 0;
struct rte_mbuf *pkt = NULL;
uint64_t current_time = rte_rdtsc();
for (i = 0; i < pkt_cnt; i++) {
pkt = pkts[i];
/* Handle current packet */
if (netdev_dpdk_policer_pkt_handle(meter, pkt, current_time)) {
if (cnt != i) {
pkts[cnt] = pkt;
}
cnt++;
} else {
rte_pktmbuf_free(pkt);
}
}
return cnt;
rte_spinlock_unlock(&policer->policer_lock);
dropped -= nb_rx;
}
//调用dpdk函数rte_meter_srtcm_color_blind_check对每个报文进行着色处理,绿色表示允许报文通过,红色和黄色表示报文超速,需要丢弃。
/* Runs one packet through 'meter' at timestamp 'time'.
 *
 * The length handed to the meter is the packet length minus the size of an
 * Ethernet header.  Returns true only when the colour-blind srTCM check
 * marks the packet green; the caller drops everything else. */
static inline bool
netdev_dpdk_policer_pkt_handle(struct rte_meter_srtcm *meter,
                               struct rte_mbuf *pkt, uint64_t time)
{
    const uint32_t metered_len = rte_pktmbuf_pkt_len(pkt)
                                 - sizeof(struct ether_hdr);
    const bool is_green =
        rte_meter_srtcm_color_blind_check(meter, time, metered_len)
        == e_RTE_METER_GREEN;

    return is_green;
}
出方向qos处理
出方向处理也是调用rte_meter_srtcm_color_blind_check对报文着色,根据颜色类型判断是否丢包。
netdev_dpdk_send__
__netdev_dpdk_vhost_send
cnt = netdev_dpdk_qos_run(dev, pkts, cnt);
struct qos_conf *qos_conf = ovsrcu_get(struct qos_conf *, &dev->qos_conf);
if (qos_conf) {
rte_spinlock_lock(&qos_conf->lock);
//egress_policer_run
cnt = qos_conf->ops->qos_run(qos_conf, pkts, cnt);
int cnt = 0;
struct egress_policer *policer =
CONTAINER_OF(conf, struct egress_policer, qos_conf);
cnt = netdev_dpdk_policer_run(&policer->egress_meter, pkts, pkt_cnt);
int i = 0;
int cnt = 0;
struct rte_mbuf *pkt = NULL;
uint64_t current_time = rte_rdtsc();
for (i = 0; i < pkt_cnt; i++) {
pkt = pkts[i];
/* Handle current packet */
if (netdev_dpdk_policer_pkt_handle(meter, pkt, current_time)) {
if (cnt != i) {
pkts[cnt] = pkt;
}
cnt++;
} else {
rte_pktmbuf_free(pkt);
}
}
return cnt;
return cnt;
rte_spinlock_unlock(&qos_conf->lock);
}
return cnt;
也可参考:ovs qos - 简书 (jianshu.com)