ovs中的分类器(classifier)中有多种用处,主要有openflow流表,路由处理,tunnel相关的,本文结合openflow流表看一下分类器的实现。
数据结构
下图为openflow流表和分类器的数据结构关系,其中右边橘色部分为分类器的结构,左边为openflow的结构,属于分类器的使用者。
下面简单看一个各个数据结构的作用
ofproto
ofproto用来表示一个bridge。
tables[255]: 表示此bridge上最多可以有255个oftable,即openflow流表table个数。
n_tables: 表示实际生效的oftable个数,在ofproto_init_tables中被初始化为255。
tables_version: 用来控制流表是否可被查询到。
expirable: 链表上挂的是此bridge上所有oftable中需要超时处理的openflow流表struct rule。
oftable
oftable用来表示一个openflow流表table。
cls: 指向分类器用来保存流表,
name: 默认为NULL,可使用如下命令更改name
ovs-vsctl -- --id=@ft create Flow_Table name=test1 -- set Bridge br10 flow_tables=0=@ft
max_flows: 支持最大openflow流表个数,默认为UINT_MAX,可通过如下命令修改
ovs-vsctl -- --id=@ft create Flow_Table flow_limit=10000 -- set Bridge br10 flow_tables=0=@ft
n_flows: 当前配置的流表个数。
miss_config: 用来决定查找流表失败后的行为,支持如下配置
OFPUTIL_TABLE_MISS_DEFAULT: 由不同版本openflow协议规定,1.0-1.2版本,报文需要被送到controller,1.3及以后的版本,直接丢包
OFPUTIL_TABLE_MISS_CONTROLLER: 将报文通过packet-in消息发给controller
OFPUTIL_TABLE_MISS_CONTINUE: 继续在下一个table进行匹配
OFPUTIL_TABLE_MISS_DROP: 直接丢包
rule
struct rule表示openflow流表信息。
cr: 指向cls_rule,包含流表优先级和匹配域,最终需要将cls_rule插入分类器,struct rule对分类器不可见。
table_id: 表示将流表插入哪个table。
hard_timeout: 流表超时时间,从创建流表,或者修改流表开始计时,超时时间到后,不管此流表有没有被使用,都会被删除。
idle_timeout: 流表空闲超时时间,从最近流表被使用开始计时,如果指定时间内此流表没有被使用,则被删除。
expirable: 一个链表节点,如果hard_timeout或者idel_timeout任意一个被设置了,则将流表插入ofproto->expirable链表。
actions: 指向rule_actions,用来指定流表的action,可以指定多个action。
rule_actions
rule_actions表示流表的action,其中ofpacts_len指定了ofpacts的大小,ofpacts指定了具体的action,可以有多个action,即多个struct ofpact。
ofpact
struct ofpact表示具体的action,其中type指定了action类型,如下所示
#define OFPACTS \
/* Output. */ \
OFPACT(OUTPUT, ofpact_output, ofpact, "output") \
OFPACT(GROUP, ofpact_group, ofpact, "group") \
OFPACT(CONTROLLER, ofpact_controller, userdata, "controller") \
OFPACT(ENQUEUE, ofpact_enqueue, ofpact, "enqueue") \
OFPACT(OUTPUT_REG, ofpact_output_reg, ofpact, "output_reg") \
OFPACT(BUNDLE, ofpact_bundle, slaves, "bundle") \
...
/* Instructions. */ \
OFPACT(METER, ofpact_meter, ofpact, "meter") \
OFPACT(CLEAR_ACTIONS, ofpact_null, ofpact, "clear_actions") \
OFPACT(WRITE_ACTIONS, ofpact_nest, actions, "write_actions") \
OFPACT(WRITE_METADATA, ofpact_metadata, ofpact, "write_metadata") \
OFPACT(GOTO_TABLE, ofpact_goto_table, ofpact, "goto_table")
/* enum ofpact_type, with a member OFPACT_ for each action. */
enum OVS_PACKED_ENUM ofpact_type {
#define OFPACT(ENUM, STRUCT, MEMBER, NAME) OFPACT_##ENUM,
OFPACTS
#undef OFPACT
};
struct classifier
struct classifier用来表示分类器,里面保存的是openflow流表。
n_rules表示流表个数。
subtables: 有序数组,保存cls_subtable,根据cls_subtable->max_priority排序,查找流表时使用。
subtables_map: 也是用来保存cls_subtable,插入流表时使用。
tries[3]: 前缀树,根据流表中的cls_trie->field字段将流表插入树根节点cls_trie->root。最多支持3个前缀树。默认为MFF_IPV4_DST和MFF_IPV4_SRC两个前缀树。
/* Prefix trie for a 'field' */
struct cls_trie {
const struct mf_field *field; /* Trie field, or NULL. */
rcu_trie_ptr root; /* NULL if none. */
};
n_tries: 当前配置的前缀树的个数。
flow_segments[3]: 保存staged查找用到的field的结束位置
//查找openflow流表时,将struct flow分为四段内容进行分段查找,
//第一段是metadata,从0-FLOW_SEGMENT_1_ENDS_AT,
//第二段是l2从FLOW_SEGMENT_1_ENDS_AT到FLOW_SEGMENT_2_ENDS_AT,
//第三段是l3从FLOW_SEGMENT_2_ENDS_AT到FLOW_SEGMENT_3_ENDS_AT,
//第四段是l4从FLOW_SEGMENT_3_ENDS_AT到FLOW_U64S
enum {
FLOW_SEGMENT_1_ENDS_AT = offsetof(struct flow, dl_dst),
FLOW_SEGMENT_2_ENDS_AT = offsetof(struct flow, nw_src),
FLOW_SEGMENT_3_ENDS_AT = offsetof(struct flow, tp_src),
};
/* U64 indices for segmented flow classification. */
const uint8_t flow_segment_u64s[4] = {
FLOW_SEGMENT_1_ENDS_AT / sizeof(uint64_t),
FLOW_SEGMENT_2_ENDS_AT / sizeof(uint64_t),
FLOW_SEGMENT_3_ENDS_AT / sizeof(uint64_t),
FLOW_U64S
};
classifier_init(&table->cls, flow_segment_u64s);
n_flow_segments: 分段个数。
publish: 控制是否将subtable插入有序数组subtables,即控制subtable是否对查找可见。
struct cls_subtable
struct cls_subtable表示有相同mask的流表的集合,比如"ip, nw_src=10.10.0.0/16"和"ip, nw_src=192.168.0.0/16"这两条流表有相同的mask,则它们就会被插入同一个cls_subtable。
cmap_node: classfier->subtables_map的一个节点。
max_priority: 保存struct cls_subtable中所有流表中的最大优先级。
max_count: 有最大优先级流表的个数。
rules_list: 链表,用来保存流表struct cls_rule。添加流表时,使用CLS_FOR_EACH_TARGET宏遍历所有的rule。
struct flowmap index_maps[4]: 将流表mask分成cls->n_flow_segments+1段,每段保存到对应的struct flowmap。
/* Init indices for segmented lookup, if any. */
prev = 0;
for (i = 0; i < cls->n_flow_segments; i++) {
stage_map = miniflow_get_map_in_range(&mask->masks, prev,
cls->flow_segments[i]);
/* Add an index if it adds mask bits. */
if (!flowmap_is_empty(stage_map)) {
ccmap_init(&subtable->indices[index]);
*CONST_CAST(struct flowmap *, &subtable->index_maps[index])
= stage_map;
index++;
}
prev = cls->flow_segments[i];
}
/* Map for the final stage. */
*CONST_CAST(struct flowmap *, &subtable->index_maps[index])
= miniflow_get_map_in_range(&mask->masks, prev, FLOW_U64S);
/* Check if the final stage adds any bits. */
if (index > 0) {
if (flowmap_is_empty(subtable->index_maps[index])) {
/* Remove the last index, as it has the same fields as the rules
* map. */
--index;
ccmap_destroy(&subtable->indices[index]);
}
}
*CONST_CAST(uint8_t *, &subtable->n_indices) = index;
n_indices: 分段的个数。
indices[3]: 保存hash值。将流表匹配域分成n_indices段,并和对应的mask index_maps相与后计算hash值。
for (i = 0; i < subtable->n_indices; i++) {
ihash[i] = minimatch_hash_range(&rule->match, subtable->index_maps[i],
&mask_offset, &basis);
}
trie_plen[3]: 根据前缀树用的field计算当前subtable的mask的field的长度,如果为0,说明不需要前缀树,否则在查找流表时,需要先在前缀树中查找。
for (i = 0; i < cls->n_tries; i++) {
subtable->trie_plen[i] = minimask_get_prefix_len(mask,
cls->tries[i].field);
}
struct cmap rules: 用来保存struct cls_match。匹配流表时find_match使用。
struct minimask mask: 当前subtable的mask。
struct cls_rule
cls_rule指定了流表的优先级和匹配域。
priority: 流表优先级。
cls_match: 指向struct cls_match。
match: 指向struct minimatch,包含具体的匹配字段及其mask。
struct minimatch
minimatch指定了具体的匹配字段及其mask。
struct minimatch {
union {
struct {
struct miniflow *flow;
struct minimask *mask;
};
struct miniflow *flows[2];
};
};
分类器实现原理
分类器使用TSS(tuple space search)算法查找流表,具体为插入流表时根据流表mask生成不同的subtable,有相同mask的流表需要插入到同一个subtable,在subtable中根据mask和流表相与后的结果计算出hash值,插入subtable的hashmap subtable->rules中。
查找流表时遍历subtable,使用从报文提前的flow信息和subtable的mask相与计算hash后,查找subtable的hash表进行匹配。最坏情况下需要查找所有的subtable。
为了提高查找速度,ovs使用了如下的优化技术,其中优先级排序是为了提高查找openflow流表的速度,而分段查找和前缀匹配主要是为了让下发到megaflow的流表尽量模糊,这样报文可以更多的命中megaflow,不至于miss后查找openflow流表。
Tuple Priority Sorting(优先级排序)
假如没有优先级排序的优化,查询流表时需要遍历所有的subtable中的所有流表,因为即使很早就能找到匹配项,也不确定是否还有优先级更高的流表存在,所以需要遍历所有的流表。
为了解决这个问题,在subtable T中引入了变量T.pri_max用来保存subtable T中所有流表的优先级的最大值,同时将subtable T根据T.pri_max按照递减顺序插入分类器的有序数组subtables中。这样在查找流表时,按照优先级从高到低遍历有序数组subtables,如果找到了匹配流表F,则将F保存到B中,遍历下一个subtable时,如果B.pri 大于等于 T.pri_max,则说明当前和之后的subtable肯定没有比B优先级更高的流表,则B就是优先级最高的匹配流表。如果B.pri小于T.pri_max,则说明T中可能有更匹配的流表,继续查找当前subtable T中的流表,如果匹配到了F,并且F.pri大于B.pri,说明找到了更高优先级的流表,将F保存到B,继续遍历。伪码如下图所示
Staged Lookup(分段查找)
查找到openflow流表后,需要将匹配时用到的field下发到datapath指导后续报文转发。
先看一下没有 分段查找优化时会有什么问题,假如有如下两条流表信息
priority=200, ip, nw_dst=11.0.0.0/8 dst_port=80 action=output:1
priority=100, ip, nw_dst=10.0.0.0/8 action=output:2
一条数据流匹配上面流表
10.5.6.6: 45666 -> 10.5.6.7:22
根据Tuple Priority Sorting优化后的TSS查找,首先匹配优先级为200的流表,显然匹配不上,但是下发megaflow流表的掩码会被设置包含目的ip和目的端口号,接着匹配优先级100的流表,可以匹配成功,最终下发到megaflow的流表如下
ip,nw_dst=10.0.0.0/8, dst_port=22 action=output:2
如果再有相同目的网段ip,但是目的端口号不同的数据流经过ovs,还是会miss megaflow,走慢速路径的处理。
为了解决这个问题引入了分段查找,具体实现为将flow信息分为四个group: metadata(比如入端口), l2, l3和l4。然后将subtable中的一个hash表改成4个hash表,第一个hash表只包含metadata信息,第二个包含metadat和l2信息,第三个包含metadata,l2和l3信息,第三个包含metadat,l2,l3和l4信息,即全部信息,也就是以前的hash表。这样的话,报文在匹配时,首先匹配metadata,如果不能命中,就不用继续匹配后面的group,如果能命令metadata,则继续匹配l2,依次进行。
有了分段查找后,再回到上面的例子,数据流在匹配优先级为200的流表时,metadata和l2的掩码为0,不用匹配,l3掩码非0,进行匹配并且不能成功,则结束匹配此流表,此时是没有用到dst_port的,所以也不用将dst_port下发到megaflow。最终下发的流表为
ip,nw_dst=10.0.0.0/8 action=output:2
后面相同目的网段ip,但是目的端口号不同的数据流经过ovs,就可以命中 megaflow实现快速转发。
Prefix Tracking
假如还是如下两条流表,但是第一条目的ip掩码改成32位
priority=200, ip, nw_dst=11.5.6.7/32 dst_port=80 action=output:1
priority=100, ip, nw_dst=10.0.0.0/8 action=output:2
数据流 10.5.6.6: 45666 -> 10.5.6.7:22 匹配上面的流表时,根据优先级排序和分段查找,会先匹配11.5.6.7/32,尽管匹配不上,但是也会将32位掩码下发到megaflow中。
ip,nw_dst=10.5.6.7/32 action=output:2
后续再有相同网段不同目的ip的报文经过ovs时,仍然需要走慢速路径。
解决这个问题的办法是引入了基于ipv4/v6的前缀树prefix trie,注意这个trie是每个openflow table都有的,并且所有的subtable共用相同的trie。当需要匹配ip地址时,在查找hash表前,先根据LPM查找前缀树,来决定(1)megaflow流表需要的最长前缀,(2)哪个hash表可以直接跳过。
假如有如下流表指定的subnet
20 /8
10.1 /16
10.2 /16
10.1.3 /24
10.1.4.5/32
对应的前缀树如下所示
查找前缀树时,如果遇到了叶子节点,说明查找成功,可以结束查找,下发到megaflow流表的不用关心ip地址剩余的位数,比如10.1.3.5查找前缀树,叶子节点为3,则会安装10.1.3/24到megaflow,20.0.5.1查找前缀树,叶子节点为20,安装20/8。如果由于没有匹配到节点而导致查找结束,也要将查找失败的bit安装到megaflow,比如10.5.6.7在查找5时失败,则安装10.5/16到megaflow。同时也会跳过此tuple的查找,因为这个分类器中没有匹配流表。
再回到例子中,查找分类器的subtable前,先用目的ip 10.5.6.7根据LPM匹配分类器的前缀树,如果匹配成功,并且匹配的掩码长度为8,说明有匹配流表,接着遍历subtable,先匹配包含优先级为200的流表所在subtable,其掩码长度为32,大于8,说明匹配的流表不在此subtable,接着匹配优先级为100的流表所在subtable,其掩码长度为8,说明匹配的流表可能在此subtable,所以可继续查找此subtable中的流表。
如果查找前缀树失败,说明此分类器中没有匹配的流表,不用再查找subtable。
源码
classifier_init
分类器的初始化在oftable_init时被调用。
/* Default fields to use for prefix tries in each flow table, unless something
* else is configured. */
const enum mf_field_id default_prefix_fields[2] =
{ MFF_IPV4_DST, MFF_IPV4_SRC };
static void
oftable_init(struct oftable *table)
//初始化分类器
classifier_init(&table->cls, flow_segment_u64s);
//设置前缀树,默认为src ip和dst ip
classifier_set_prefix_fields(&table->cls, default_prefix_fields,
ARRAY_SIZE(default_prefix_fields));
初始化分类器struct classifier相关字段,参数flow_segments指定了段分类器的个数和结束位置。
/* Initializes 'cls' as a classifier that initially contains no classification
* rules. */
void
classifier_init(struct classifier *cls, const uint8_t *flow_segments)
{
cls->n_rules = 0;
cmap_init(&cls->subtables_map);
pvector_init(&cls->subtables);
cls->n_flow_segments = 0;
//设置分段的结束位置
if (flow_segments) {
while (cls->n_flow_segments < CLS_MAX_INDICES
&& *flow_segments < FLOW_U64S) {
cls->flow_segments[cls->n_flow_segments++] = *flow_segments++;
}
}
//前缀树默认为0
cls->n_tries = 0;
for (int i = 0; i < CLS_MAX_TRIES; i++) {
trie_init(cls, i, NULL);
}
cls->publish = true;
}
classifier_insert
将openflow规则插入分类器。
/* Inserts 'rule' into 'cls'. Until 'rule' is removed from 'cls', the caller
* must not modify or free it.
*
* 'cls' must not contain an identical rule (including wildcards, values of
* fixed fields, and priority). Use classifier_find_rule_exactly() to find
* such a rule. */
void
classifier_insert(struct classifier *cls, const struct cls_rule *rule,
ovs_version_t version, const struct cls_conjunction conj[],
size_t n_conj)
{
const struct cls_rule *displaced_rule
= classifier_replace(cls, rule, version, conj, n_conj);
ovs_assert(!displaced_rule);
}
/* Inserts 'rule' into 'cls' in 'version'. Until 'rule' is removed from 'cls',
* the caller must not modify or free it.
*
* If 'cls' already contains an identical rule (including wildcards, values of
* fixed fields, and priority) that is visible in 'version', replaces the old
* rule by 'rule' and returns the rule that was replaced. The caller takes
* ownership of the returned rule and is thus responsible for destroying it
* with cls_rule_destroy(), after RCU grace period has passed (see
* ovsrcu_postpone()).
*
* Returns NULL if 'cls' does not contain a rule with an identical key, after
* inserting the new rule. In this case, no rules are displaced by the new
* rule, even rules that cannot have any effect because the new rule matches a
* superset of their flows and has higher priority.
*/
const struct cls_rule *
classifier_replace(struct classifier *cls, const struct cls_rule *rule,
ovs_version_t version,
const struct cls_conjunction *conjs, size_t n_conjs)
{
struct cls_match *new;
struct cls_subtable *subtable;
uint32_t ihash[CLS_MAX_INDICES];
struct cls_match *head;
unsigned int mask_offset;
size_t n_rules = 0;
uint32_t basis;
uint32_t hash;
unsigned int i;
//申请struct cls_match,将rule中相关字段赋值到struct cls_match
/* 'new' is initially invisible to lookups. */
new = cls_match_alloc(rule, version, conjs, n_conjs);
ovsrcu_set(&CONST_CAST(struct cls_rule *, rule)->cls_match, new);
//根据流表的mask查找subtable
subtable = find_subtable(cls, rule->match.mask);
if (!subtable) {
//没有找到则新建subtable,后面会分析此函数
subtable = insert_subtable(cls, rule->match.mask);
}
/* Compute hashes in segments. */
basis = 0;
mask_offset = 0;
//在insert_subtable中会将mask分段,并保存到index_maps
//这里根据分段的mask index_maps,计算分段hash
for (i = 0; i < subtable->n_indices; i++) {
ihash[i] = minimatch_hash_range(&rule->match, subtable->index_maps[i],
&mask_offset, &basis);
}
//计算全部mask的hash值
hash = minimatch_hash_range(&rule->match, subtable->index_maps[i],
&mask_offset, &basis);
//根据rule匹配域查找hash表subtable->rules
head = find_equal(subtable, rule->match.flow, hash);
//没查到说明是新rule
if (!head) {
/* Add rule to tries.
*
* Concurrent readers might miss seeing the rule until this update,
* which might require being fixed up by revalidation later. */
//在insert_subtable中,如果subtable的mask包含前缀树域,比如指定了ip地址,
//则设置subtable->trie_plen为mask指定的ip掩码长度。则将rule插入分类器的前缀树,用于实现prefix tracking
for (i = 0; i < cls->n_tries; i++) {
if (subtable->trie_plen[i]) {
trie_insert(&cls->tries[i], rule, subtable->trie_plen[i]);
}
}
//将rule插入port前缀树
/* Add rule to ports trie. */
if (subtable->ports_mask_len) {
/* We mask the value to be inserted to always have the wildcarded
* bits in known (zero) state, so we can include them in comparison
* and they will always match (== their original value does not
* matter). */
ovs_be32 masked_ports = minimatch_get_ports(&rule->match);
trie_insert_prefix(&subtable->ports_trie, &masked_ports,
subtable->ports_mask_len);
}
//将分段hash值插入subtable->indices,用于实现staged lookup
/* Add new node to segment indices. */
for (i = 0; i < subtable->n_indices; i++) {
ccmap_inc(&subtable->indices[i], ihash[i]);
}
//将rule插入subtable->rules,用于hash查找
//同时返回subtable->rules中rule的总个数
n_rules = cmap_insert(&subtable->rules, &new->cmap_node, hash);
} else { /* Equal rules exist in the classifier already. */
struct cls_match *prev, *iter;
//找到了有相同匹配域的head(单向链表保存有相同匹配域的规则),但是rule的其他字段比如priority,action等可能不同,所以需要从head开始向后遍历,如果
//找到相同优先级的cls_match则替换,如果rule优先级大于iter,则插入iter之前,如果rule优先级最小,则插到最后
/* Scan the list for the insertion point that will keep the list in
* order of decreasing priority. Insert after rules marked invisible
* in any version of the same priority. */
FOR_EACH_RULE_IN_LIST_PROTECTED (iter, prev, head) {
if (rule->priority > iter->priority
|| (rule->priority == iter->priority
&& !cls_match_is_eventually_invisible(iter))) {
break;
}
}
/* Replace 'iter' with 'new' or insert 'new' between 'prev' and
* 'iter'. */
if (iter) {
struct cls_rule *old;
//优先级相同,替换iter
if (rule->priority == iter->priority) {
cls_match_replace(prev, iter, new);
old = CONST_CAST(struct cls_rule *, iter->cls_rule);
} else {//rule优先级大于iter,查到iter前面
cls_match_insert(prev, iter, new);
old = NULL;
}
/* Replace the existing head in data structures, if rule is the new
* head. */
if (iter == head) {
cmap_replace(&subtable->rules, &head->cmap_node,
&new->cmap_node, hash);
}
if (old) {
struct cls_conjunction_set *conj_set;
conj_set = ovsrcu_get_protected(struct cls_conjunction_set *,
&iter->conj_set);
if (conj_set) {
ovsrcu_postpone(free, conj_set);
}
ovsrcu_set(&old->cls_match, NULL); /* Marks old rule as removed
* from the classifier. */
ovsrcu_postpone(cls_match_free_cb, iter);
/* No change in subtable's max priority or max count. */
/* Make 'new' visible to lookups in the appropriate version. */
cls_match_set_remove_version(new, OVS_VERSION_NOT_REMOVED);
/* Make rule visible to iterators (immediately). */
rculist_replace(CONST_CAST(struct rculist *, &rule->node),
&old->node);
/* Return displaced rule. Caller is responsible for keeping it
* around until all threads quiesce. */
return old;
}
} else {
//插到链表最后面
/* 'new' is new node after 'prev' */
cls_match_insert(prev, iter, new);
}
}
/* Make 'new' visible to lookups in the appropriate version. */
cls_match_set_remove_version(new, OVS_VERSION_NOT_REMOVED);
//将rule插入链表subtable->rules_list
/* Make rule visible to iterators (immediately). */
rculist_push_back(&subtable->rules_list,
CONST_CAST(struct rculist *, &rule->node));
/* Rule was added, not replaced. Update 'subtable's 'max_priority' and
* 'max_count', if necessary.
*
* The rule was already inserted, but concurrent readers may not see the
* rule yet as the subtables vector is not updated yet. This will have to
* be fixed by revalidation later. */
//n_rules等于1,说明当前rule是subtable中第一个rule,
if (n_rules == 1) {
//只有一个rule,所以max_priority为rule的priority
subtable->max_priority = rule->priority;
subtable->max_count = 1;
//subtable也是刚创建,将subtable插入有序数组cls->subtables,根据rule的priority进行排序
pvector_insert(&cls->subtables, subtable, rule->priority);
} else if (rule->priority == subtable->max_priority) {
//插入的rule不是subtable第一个rule,但是和之前插入的rule的优先级的最大值相同
++subtable->max_count;
} else if (rule->priority > subtable->max_priority) {
//插入的rule不是subtable第一个rule,并且rule的priority比之前插入的rule最大优先级大
//则更新subtable的最大优先级
subtable->max_priority = rule->priority;
subtable->max_count = 1;
//根据最新的最大优先级,对cls->subtables重新排序
pvector_change_priority(&cls->subtables, subtable, rule->priority);
}
//更新分类器rule总数
/* Nothing was replaced. */
cls->n_rules++;
//上面只是将subtable插入cls->subtables的临时变量tmp中,调用pvector_publish时才会真正将tmp中的subtable更新到pvec->impl,这样才对查找可见。
if (cls->publish) {
pvector_publish(&cls->subtables);
}
return NULL;
}
根据流表的mask创建新的subtable,并插入cls->subtables_map
/* The new subtable will be visible to the readers only after this. */
static struct cls_subtable *
insert_subtable(struct classifier *cls, const struct minimask *mask)
{
uint32_t hash = minimask_hash(mask, 0);
struct cls_subtable *subtable;
int i, index = 0;
struct flowmap stage_map;
uint8_t prev;
size_t count = miniflow_n_values(&mask->masks);
//分配subtable内存,MINIFLOW_VALUES_SIZE(count)表示mask占用内存
subtable = xzalloc(sizeof *subtable + MINIFLOW_VALUES_SIZE(count));
cmap_init(&subtable->rules);
//复制mask
miniflow_clone(CONST_CAST(struct miniflow *, &subtable->mask.masks),
&mask->masks, count);
//将mask进行分段并保存
/* Init indices for segmented lookup, if any. */
prev = 0;
for (i = 0; i < cls->n_flow_segments; i++) {
stage_map = miniflow_get_map_in_range(&mask->masks, prev,
cls->flow_segments[i]);
/* Add an index if it adds mask bits. */
if (!flowmap_is_empty(stage_map)) {
ccmap_init(&subtable->indices[index]);
*CONST_CAST(struct flowmap *, &subtable->index_maps[index])
= stage_map;
index++;
}
prev = cls->flow_segments[i];
}
/* Map for the final stage. */
*CONST_CAST(struct flowmap *, &subtable->index_maps[index])
= miniflow_get_map_in_range(&mask->masks, prev, FLOW_U64S);
/* Check if the final stage adds any bits. */
if (index > 0) {
if (flowmap_is_empty(subtable->index_maps[index])) {
/* Remove the last index, as it has the same fields as the rules
* map. */
--index;
ccmap_destroy(&subtable->indices[index]);
}
}
*CONST_CAST(uint8_t *, &subtable->n_indices) = index;
//如果mask包含分类器的前缀树域,则计算掩码长度,后面插入此subtable的rule也要同时插入前缀树。
for (i = 0; i < cls->n_tries; i++) {
subtable->trie_plen[i] = minimask_get_prefix_len(mask,
cls->tries[i].field);
}
//mask中如果包含tp_src,则需要将rule按照tp_src插入前缀树subtable->ports_trie
/* Ports trie. */
ovsrcu_set_hidden(&subtable->ports_trie, NULL);
*CONST_CAST(int *, &subtable->ports_mask_len)
= 32 - ctz32(ntohl(MINIFLOW_GET_BE32(&mask->masks, tp_src)));
/* List of rules. */
rculist_init(&subtable->rules_list);
//最后插入cls->subtables_map
cmap_insert(&cls->subtables_map, &subtable->cmap_node, hash);
return subtable;
}
classifier_lookup
分类器查找函数用于在分类器中查找匹配flow,并且对指定版本可见的流表信息,参数wc是需要在classifier_lookup过程中赋值的,将查找分类器时用到的field mask保存到wc,以便下发megaflow时使用。
在查找中还用到了上面介绍的Tuple Priority Sorting,Staged Lookup和Prefix Tracking等优化技术。
/* Finds and returns the highest-priority rule in 'cls' that matches 'flow' and
* that is visible in 'version'. Returns a null pointer if no rules in 'cls'
* match 'flow'. If multiple rules of equal priority match 'flow', returns one
* arbitrarily.
*
* If a rule is found and 'wc' is non-null, bitwise-OR's 'wc' with the
* set of bits that were significant in the lookup. At some point
* earlier, 'wc' should have been initialized (e.g., by
* flow_wildcards_init_catchall()).
*
* 'flow' is non-const to allow for temporary modifications during the lookup.
* Any changes are restored before returning. */
const struct cls_rule *
classifier_lookup(const struct classifier *cls, ovs_version_t version,
struct flow *flow, struct flow_wildcards *wc)
{
return classifier_lookup__(cls, version, flow, wc, true);
}
/* Like classifier_lookup(), except that support for conjunctive matches can be
* configured with 'allow_conjunctive_matches'. That feature is not exposed
* externally because turning off conjunctive matches is only useful to avoid
* recursion within this function itself.
*
* 'flow' is non-const to allow for temporary modifications during the lookup.
* Any changes are restored before returning. */
static const struct cls_rule *
classifier_lookup__(const struct classifier *cls, ovs_version_t version,
struct flow *flow, struct flow_wildcards *wc,
bool allow_conjunctive_matches)
{
struct trie_ctx trie_ctx[CLS_MAX_TRIES];
const struct cls_match *match;
/* Highest-priority flow in 'cls' that certainly matches 'flow'. */
//hard用来保存具有最高优先级的匹配规则
const struct cls_match *hard = NULL;
int hard_pri = INT_MIN; /* hard ? hard->priority : INT_MIN. */
//下面几个变量用来插入联合规则,暂时先不考虑这个
/* Highest-priority conjunctive flows in 'cls' matching 'flow'. Since
* these are (components of) conjunctive flows, we can only know whether
* the full conjunctive flow matches after seeing multiple of them. Thus,
* we refer to these as "soft matches". */
struct cls_conjunction_set *soft_stub[64];
struct cls_conjunction_set **soft = soft_stub;
size_t n_soft = 0, allocated_soft = ARRAY_SIZE(soft_stub);
int soft_pri = INT_MIN; /* n_soft ? MAX(soft[*]->priority) : INT_MIN. */
/* Synchronize for cls->n_tries and subtable->trie_plen. They can change
* when table configuration changes, which happens typically only on
* startup. */
atomic_thread_fence(memory_order_acquire);
/* Initialize trie contexts for find_match_wc(). */
for (int i = 0; i < cls->n_tries; i++) {
trie_ctx_init(&trie_ctx[i], &cls->tries[i]);
}
//主循环,遍历分类器的有序数组cls->subtables,
//宏PVECTOR_FOR_EACH_PRIORITY 保证subtable的最大优先级大于hard_pri + 1,如果不大于循环就结束,这是Tuple Priority Sorting的实现要求。
/* Main loop. */
struct cls_subtable *subtable;
PVECTOR_FOR_EACH_PRIORITY (subtable, hard_pri + 1, 2, sizeof *subtable,
&cls->subtables) {
struct cls_conjunction_set *conj_set;
/* Skip subtables with no match, or where the match is lower-priority
* than some certain match we've already found. */
//到subtable查找规则
match = find_match_wc(subtable, version, flow, trie_ctx, cls->n_tries, wc);
//没有匹配的,或者匹配到的优先级比上次的低,则继续查找下一个subtable
if (!match || match->priority <= hard_pri) {
continue;
}
conj_set = ovsrcu_get(struct cls_conjunction_set *, &match->conj_set);
if (!conj_set) {
/* 'match' isn't part of a conjunctive match. It's the best
* certain match we've got so far, since we know that it's
* higher-priority than hard_pri.
*
* (There might be a higher-priority conjunctive match. We can't
* tell yet.) */
//保存匹配match及其优先级
hard = match;
hard_pri = hard->priority;
} else if (allow_conjunctive_matches) {
...
}
}
/* In the common case, at this point we have no soft matches and we can
* return immediately. (We do the same thing if we have potential soft
* matches but none of them are higher-priority than our hard match.) */
if (hard_pri >= soft_pri) {
if (soft != soft_stub) {
free(soft);
}
//找到匹配规则,返回其对应的流表cls_rule
return hard ? hard->cls_rule : NULL;
}
...
}
在subtable查找flow匹配的规则
static const struct cls_match *
find_match_wc(const struct cls_subtable *subtable, ovs_version_t version,
const struct flow *flow, struct trie_ctx trie_ctx[CLS_MAX_TRIES],
unsigned int n_tries, struct flow_wildcards *wc)
{
if (OVS_UNLIKELY(!wc)) {
return find_match(subtable, version, flow,
flow_hash_in_minimask(flow, &subtable->mask, 0));
}
uint32_t basis = 0, hash;
const struct cls_match *rule = NULL;
struct flowmap stages_map = FLOWMAP_EMPTY_INITIALIZER;
unsigned int mask_offset = 0;
int i;
//分段查找
/* Try to finish early by checking fields in segments. */
for (i = 0; i < subtable->n_indices; i++) {
//先查找前缀树,如果查找失败,则说明此分类器中没有匹配flow的流表,直接返回。
if (check_tries(trie_ctx, n_tries, subtable->trie_plen,
subtable->index_maps[i], flow, wc)) {
/* 'wc' bits for the trie field set, now unwildcard the preceding
* bits used so far. */
goto no_match;
}
//合并分段map
/* Accumulate the map used so far. */
stages_map = flowmap_or(stages_map, subtable->index_maps[i]);
//增量计算flow hash
hash = flow_hash_in_minimask_range(flow, &subtable->mask,
subtable->index_maps[i],
&mask_offset, &basis);
//根据hash查找subtable->indices,查不到则说明此subtable没有匹配的规则,no_match结束查找。
if (!ccmap_find(&subtable->indices[i], hash)) {
goto no_match;
}
}
/* Trie check for the final range. */
if (check_tries(trie_ctx, n_tries, subtable->trie_plen,
subtable->index_maps[i], flow, wc)) {
goto no_match;
}
//前面几段查找都匹配,则计算最全hash值,调用find_match查找hash表
hash = flow_hash_in_minimask_range(flow, &subtable->mask,
subtable->index_maps[i],
&mask_offset, &basis);
rule = find_match(subtable, version, flow, hash);
if (!rule && subtable->ports_mask_len) {
/* The final stage had ports, but there was no match. Instead of
* unwildcarding all the ports bits, use the ports trie to figure out a
* smaller set of bits to unwildcard. */
unsigned int mbits;
ovs_be32 value, plens, mask;
mask = MINIFLOW_GET_BE32(&subtable->mask.masks, tp_src);
value = ((OVS_FORCE ovs_be32 *)flow)[TP_PORTS_OFS32] & mask;
mbits = trie_lookup_value(&subtable->ports_trie, &value, &plens, 32);
((OVS_FORCE ovs_be32 *)&wc->masks)[TP_PORTS_OFS32] |=
mask & be32_prefix_mask(mbits);
goto no_match;
}
//查找到规则,将subtable->mask保存到wc
/* Must unwildcard all the fields, as they were looked at. */
flow_wildcards_fold_minimask(wc, &subtable->mask);
return rule;
no_match:
/* Unwildcard the bits in stages so far, as they were used in determining
* there is no match. */
//没有匹配规则,将查找用到的mask stages_map保存到wc
flow_wildcards_fold_minimask_in_map(wc, &subtable->mask, stages_map);
return NULL;
}
调用到find_match说明肯定有匹配规则,找到优先级最高的规则即可
static inline const struct cls_match *
find_match(const struct cls_subtable *subtable, ovs_version_t version,
const struct flow *flow, uint32_t hash)
{
const struct cls_match *head, *rule;
CMAP_FOR_EACH_WITH_HASH (head, cmap_node, hash, &subtable->rules) {
if (OVS_LIKELY(miniflow_and_mask_matches_flow(&head->flow,
&subtable->mask,
flow))) {
/* Return highest priority rule that is visible. */
CLS_MATCH_FOR_EACH (rule, head) {
if (OVS_LIKELY(cls_match_visible_in_version(rule, version))) {
return rule;
}
}
}
}
return NULL;
}
添加流表时如果不指定优先级,则默认优先级为
/* By default, choose a priority in the middle. */
#define OFP_DEFAULT_PRIORITY 0x8000
参考
https://www.usenix.org/conference/nsdi15/technical-sessions/presentation/pfaff
https://software.intel.com/content/www/us/en/develop/articles/ovs-dpdk-datapath-classifier.html
https://software.intel.com/content/www/us/en/develop/articles/ovs-dpdk-datapath-classifier-part-2.html
https://zhuanlan.zhihu.com/p/66561734
https://segmentfault.com/a/1190000020458867
https://www.sdnlab.com/15713.html