ovs分类器实现

ovs中的分类器(classifier)有多种用处,主要有openflow流表,路由处理,tunnel相关的处理,本文结合openflow流表看一下分类器的实现。

数据结构

下图为openflow流表和分类器的数据结构关系,其中右边橘色部分为分类器的结构,左边为openflow的结构,属于分类器的使用者。


image.png

下面简单看一下各个数据结构的作用
ofproto
ofproto用来表示一个bridge。
tables[255]: 表示此bridge上最多可以有255个oftable,即openflow流表table个数。
n_tables: 表示实际生效的oftable个数,在ofproto_init_tables中被初始化为255。
tables_version: 用来控制流表是否可被查询到。
expirable: 链表上挂的是此bridge上所有oftable中需要超时处理的openflow流表struct rule。

oftable
oftable用来表示一个openflow流表table。
cls: 指向分类器用来保存流表,
name: 默认为NULL,可使用如下命令更改name

ovs-vsctl -- --id=@ft create Flow_Table name=test1 -- set Bridge br10 flow_tables=0=@ft

max_flows: 支持最大openflow流表个数,默认为UINT_MAX,可通过如下命令修改

ovs-vsctl -- --id=@ft create Flow_Table flow_limit=10000 -- set Bridge br10 flow_tables=0=@ft

n_flows: 当前配置的流表个数。
miss_config: 用来决定查找流表失败后的行为,支持如下配置

OFPUTIL_TABLE_MISS_DEFAULT: 由不同版本openflow协议规定,1.0-1.2版本,报文需要被送到controller,1.3及以后的版本,直接丢包
OFPUTIL_TABLE_MISS_CONTROLLER: 将报文通过packet-in消息发给controller
OFPUTIL_TABLE_MISS_CONTINUE: 继续在下一个table进行匹配
OFPUTIL_TABLE_MISS_DROP: 直接丢包

rule
struct rule表示openflow流表信息。
cr: 指向cls_rule,包含流表优先级和匹配域,最终需要将cls_rule插入分类器,struct rule对分类器不可见。
table_id: 表示将流表插入哪个table。
hard_timeout: 流表超时时间,从创建流表,或者修改流表开始计时,超时时间到后,不管此流表有没有被使用,都会被删除。
idle_timeout: 流表空闲超时时间,从最近流表被使用开始计时,如果指定时间内此流表没有被使用,则被删除。
expirable: 一个链表节点,如果hard_timeout或者idle_timeout任意一个被设置了,则将流表插入ofproto->expirable链表。
actions: 指向rule_actions,用来指定流表的action,可以指定多个action。

rule_actions
rule_actions表示流表的action,其中ofpacts_len指定了ofpacts的大小,ofpacts指定了具体的action,可以有多个action,即多个struct ofpact。

ofpact
struct ofpact表示具体的action,其中type指定了action类型,如下所示

#define OFPACTS                                                         \
    /* Output. */                                                       \
    OFPACT(OUTPUT,          ofpact_output,      ofpact, "output")       \
    OFPACT(GROUP,           ofpact_group,       ofpact, "group")        \
    OFPACT(CONTROLLER,      ofpact_controller,  userdata, "controller") \
    OFPACT(ENQUEUE,         ofpact_enqueue,     ofpact, "enqueue")      \
    OFPACT(OUTPUT_REG,      ofpact_output_reg,  ofpact, "output_reg")   \
    OFPACT(BUNDLE,          ofpact_bundle,      slaves, "bundle")       \
    ...
    /* Instructions. */                                                 \
    OFPACT(METER,           ofpact_meter,       ofpact, "meter")        \
    OFPACT(CLEAR_ACTIONS,   ofpact_null,        ofpact, "clear_actions") \
    OFPACT(WRITE_ACTIONS,   ofpact_nest,        actions, "write_actions") \
    OFPACT(WRITE_METADATA,  ofpact_metadata,    ofpact, "write_metadata") \
    OFPACT(GOTO_TABLE,      ofpact_goto_table,  ofpact, "goto_table")

/* enum ofpact_type, with a member OFPACT_ for each action.
 *
 * Built with the X-macro pattern: OFPACT() is temporarily defined to emit
 * "OFPACT_<ENUM>," and the OFPACTS list is then expanded, so every entry of
 * that list becomes one enumerator, in list order.  OFPACT() is undefined
 * again right afterwards so the list can be reused with other expansions. */
enum OVS_PACKED_ENUM ofpact_type {
#define OFPACT(ENUM, STRUCT, MEMBER, NAME) OFPACT_##ENUM,
    OFPACTS
#undef OFPACT
};

struct classifier
struct classifier用来表示分类器,里面保存的是openflow流表。
n_rules表示流表个数。
subtables: 有序数组,保存cls_subtable,根据cls_subtable->max_priority排序,查找流表时使用。
subtables_map: 也是用来保存cls_subtable,插入流表时使用。
tries[3]: 前缀树,根据流表中的cls_trie->field字段将流表插入树根节点cls_trie->root。最多支持3个前缀树。默认为MFF_IPV4_DST和MFF_IPV4_SRC两个前缀树。

/* Prefix trie for a 'field'.  A classifier holds an array of these in its
 * 'tries' member, one slot per prefix field. */
struct cls_trie {
    const struct mf_field *field; /* Trie field, or NULL. */
    rcu_trie_ptr root;            /* NULL if none. */
};

n_tries: 当前配置的前缀树的个数。
flow_segments[3]: 保存staged查找用到的field的结束位置

// When looking up OpenFlow rules, struct flow is split into four segments
// that are searched stage by stage:
// segment 1 is metadata, from 0 to FLOW_SEGMENT_1_ENDS_AT,
// segment 2 is L2, from FLOW_SEGMENT_1_ENDS_AT to FLOW_SEGMENT_2_ENDS_AT,
// segment 3 is L3, from FLOW_SEGMENT_2_ENDS_AT to FLOW_SEGMENT_3_ENDS_AT,
// segment 4 is L4, from FLOW_SEGMENT_3_ENDS_AT to FLOW_U64S.
enum {
    FLOW_SEGMENT_1_ENDS_AT = offsetof(struct flow, dl_dst),
    FLOW_SEGMENT_2_ENDS_AT = offsetof(struct flow, nw_src),
    FLOW_SEGMENT_3_ENDS_AT = offsetof(struct flow, tp_src),
};
/* U64 indices for segmented flow classification: each byte offset above is
 * divided by sizeof(uint64_t); the last entry is FLOW_U64S. */
const uint8_t flow_segment_u64s[4] = {
    FLOW_SEGMENT_1_ENDS_AT / sizeof(uint64_t),
    FLOW_SEGMENT_2_ENDS_AT / sizeof(uint64_t),
    FLOW_SEGMENT_3_ENDS_AT / sizeof(uint64_t),
    FLOW_U64S
};
// Excerpt: the table's classifier is initialized with these segment ends.
classifier_init(&table->cls, flow_segment_u64s);

n_flow_segments: 分段个数。
publish: 控制是否将subtable插入有序数组subtables,即控制subtable是否对查找可见。

struct cls_subtable
struct cls_subtable表示有相同mask的流表的集合,比如"ip, nw_src=10.10.0.0/16"和"ip, nw_src=192.168.0.0/16"这两条流表有相同的mask,则它们就会被插入同一个cls_subtable。
cmap_node: classifier->subtables_map的一个节点。
max_priority: 保存struct cls_subtable中所有流表中的最大优先级。
max_count: 有最大优先级流表的个数。
rules_list: 链表,用来保存流表struct cls_rule。添加流表时,使用CLS_FOR_EACH_TARGET宏遍历所有的rule。
struct flowmap index_maps[4]: 将流表mask分成cls->n_flow_segments+1段,每段保存到对应的struct flowmap。

    /* Init indices for segmented lookup, if any. */
    prev = 0;
    for (i = 0; i < cls->n_flow_segments; i++) {
        stage_map = miniflow_get_map_in_range(&mask->masks, prev,
                                              cls->flow_segments[i]);
        /* Add an index if it adds mask bits. */
        if (!flowmap_is_empty(stage_map)) {
            ccmap_init(&subtable->indices[index]);
            *CONST_CAST(struct flowmap *, &subtable->index_maps[index])
                = stage_map;
            index++;
        }
        prev = cls->flow_segments[i];
    }
    /* Map for the final stage. */
    *CONST_CAST(struct flowmap *, &subtable->index_maps[index])
        = miniflow_get_map_in_range(&mask->masks, prev, FLOW_U64S);
    /* Check if the final stage adds any bits. */
    if (index > 0) {
        if (flowmap_is_empty(subtable->index_maps[index])) {
            /* Remove the last index, as it has the same fields as the rules
             * map. */
            --index;
            ccmap_destroy(&subtable->indices[index]);
        }
    }
    *CONST_CAST(uint8_t *, &subtable->n_indices) = index;

n_indices: 分段的个数。
indices[3]: 保存hash值。将流表匹配域分成n_indices段,并和对应的mask index_maps相与后计算hash值。

    for (i = 0; i < subtable->n_indices; i++) {
        ihash[i] = minimatch_hash_range(&rule->match, subtable->index_maps[i],
                                        &mask_offset, &basis);
    }

trie_plen[3]: 根据前缀树用的field计算当前subtable的mask的field的长度,如果为0,说明不需要前缀树,否则在查找流表时,需要先在前缀树中查找。

    for (i = 0; i < cls->n_tries; i++) {
        subtable->trie_plen[i] = minimask_get_prefix_len(mask,
                                                         cls->tries[i].field);
    }

struct cmap rules: 用来保存struct cls_match。匹配流表时find_match使用。
struct minimask mask: 当前subtable的mask。

struct cls_rule
cls_rule指定了流表的优先级和匹配域。
priority: 流表优先级。
cls_match: 指向struct cls_match。
match: 指向struct minimatch,包含具体的匹配字段及其mask。

struct minimatch
minimatch指定了具体的匹配字段及其mask。

/* A compact match: 'flow' and 'mask' together describe the matched fields
 * and their mask.  The anonymous union overlays the same two pointers with
 * the array 'flows[2]', so they can also be reached by index. */
struct minimatch {
    union {
        struct {
            struct miniflow *flow;
            struct minimask *mask;
        };
        struct miniflow *flows[2];
    };
};

分类器实现原理

分类器使用TSS(tuple space search)算法查找流表,具体为插入流表时根据流表mask生成不同的subtable,有相同mask的流表需要插入到同一个subtable,在subtable中根据mask和流表相与后的结果计算出hash值,插入subtable的hashmap subtable->rules中。
查找流表时遍历subtable,使用从报文提取的flow信息和subtable的mask相与计算hash后,查找subtable的hash表进行匹配。最坏情况下需要查找所有的subtable。

为了提高查找速度,ovs使用了如下的优化技术,其中优先级排序是为了提高查找openflow流表的速度,而分段查找和前缀匹配主要是为了让下发到megaflow的流表尽量模糊,这样报文可以更多的命中megaflow,不至于miss后查找openflow流表。

Tuple Priority Sorting(优先级排序)
假如没有优先级排序的优化,查询流表时需要遍历所有的subtable中的所有流表,因为即使很早就能找到匹配项,也不确定是否还有优先级更高的流表存在,所以需要遍历所有的流表。
为了解决这个问题,在subtable T中引入了变量T.pri_max用来保存subtable T中所有流表的优先级的最大值,同时将subtable T根据T.pri_max按照递减顺序插入分类器的有序数组subtables中。这样在查找流表时,按照优先级从高到低遍历有序数组subtables,如果找到了匹配流表F,则将F保存到B中,遍历下一个subtable时,如果B.pri 大于等于 T.pri_max,则说明当前和之后的subtable肯定没有比B优先级更高的流表,则B就是优先级最高的匹配流表。如果B.pri小于T.pri_max,则说明T中可能有更匹配的流表,继续查找当前subtable T中的流表,如果匹配到了F,并且F.pri大于B.pri,说明找到了更高优先级的流表,将F保存到B,继续遍历。伪码如下图所示

image.png

Staged Lookup(分段查找)
查找到openflow流表后,需要将匹配时用到的field下发到datapath指导后续报文转发。
先看一下没有 分段查找优化时会有什么问题,假如有如下两条流表信息

priority=200, ip, nw_dst=11.0.0.0/8 dst_port=80 action=output:1
priority=100, ip, nw_dst=10.0.0.0/8  action=output:2

一条数据流匹配上面流表

10.5.6.6: 45666 -> 10.5.6.7:22

根据Tuple Priority Sorting优化后的TSS查找,首先匹配优先级为200的流表,显然匹配不上,但是下发megaflow流表的掩码会被设置包含目的ip和目的端口号,接着匹配优先级100的流表,可以匹配成功,最终下发到megaflow的流表如下

ip,nw_dst=10.0.0.0/8, dst_port=22 action=output:2

如果再有相同目的网段ip,但是目的端口号不同的数据流经过ovs,还是会miss megaflow,走慢速路径的处理。

为了解决这个问题引入了分段查找,具体实现为将flow信息分为四个group: metadata(比如入端口), l2, l3和l4。然后将subtable中的一个hash表改成4个hash表,第一个hash表只包含metadata信息,第二个包含metadata和l2信息,第三个包含metadata,l2和l3信息,第四个包含metadata,l2,l3和l4信息,即全部信息,也就是以前的hash表。这样的话,报文在匹配时,首先匹配metadata,如果不能命中,就不用继续匹配后面的group,如果能命中metadata,则继续匹配l2,依次进行。

有了分段查找后,再回到上面的例子,数据流在匹配优先级为200的流表时,metadata和l2的掩码为0,不用匹配,l3掩码非0,进行匹配并且不能成功,则结束匹配此流表,此时是没有用到dst_port的,所以也不用将dst_port下发到megaflow。最终下发的流表为

ip,nw_dst=10.0.0.0/8 action=output:2

后面相同目的网段ip,但是目的端口号不同的数据流经过ovs,就可以命中 megaflow实现快速转发。

Prefix Tracking
假如还是如下两条流表,但是第一条目的ip掩码改成32位

priority=200, ip, nw_dst=11.5.6.7/32 dst_port=80 action=output:1
priority=100, ip, nw_dst=10.0.0.0/8  action=output:2

数据流 10.5.6.6: 45666 -> 10.5.6.7:22 匹配上面的流表时,根据优先级排序和分段查找,会先匹配11.5.6.7/32,尽管匹配不上,但是也会将32位掩码下发到megaflow中。

ip,nw_dst=10.5.6.7/32 action=output:2

后续再有相同网段不同目的ip的报文经过ovs时,仍然需要走慢速路径。
解决这个问题的办法是引入了基于ipv4/v6的前缀树prefix trie,注意这个trie是每个openflow table都有的,并且所有的subtable共用相同的trie。当需要匹配ip地址时,在查找hash表前,先根据LPM查找前缀树,来决定(1)megaflow流表需要的最长前缀,(2)哪个hash表可以直接跳过。

假如有如下流表指定的subnet

20 /8
10.1 /16
10.2 /16
10.1.3 /24
10.1.4.5/32

对应的前缀树如下所示


image.png

查找前缀树时,如果遇到了叶子节点,说明查找成功,可以结束查找,下发到megaflow流表的不用关心ip地址剩余的位数,比如10.1.3.5查找前缀树,叶子节点为3,则会安装10.1.3/24到megaflow,20.0.5.1查找前缀树,叶子节点为20,安装20/8。如果由于没有匹配到节点而导致查找结束,也要将查找失败的bit安装到megaflow,比如10.5.6.7在查找5时失败,则安装10.5/16到megaflow。同时也会跳过此tuple的查找,因为这个分类器中没有匹配流表。

再回到例子中,查找分类器的subtable前,先用目的ip 10.5.6.7根据LPM匹配分类器的前缀树,如果匹配成功,并且匹配的掩码长度为8,说明有匹配流表,接着遍历subtable,先匹配包含优先级为200的流表所在subtable,其掩码长度为32,大于8,说明匹配的流表不在此subtable,接着匹配优先级为100的流表所在subtable,其掩码长度为8,说明匹配的流表可能在此subtable,所以可继续查找此subtable中的流表。
如果查找前缀树失败,说明此分类器中没有匹配的流表,不用再查找subtable。

源码

classifier_init
分类器的初始化在oftable_init时被调用。

/* Default fields to use for prefix tries in each flow table, unless something
 * else is configured. */
const enum mf_field_id default_prefix_fields[2] =
    { MFF_IPV4_DST, MFF_IPV4_SRC };

/* Excerpt of oftable_init(): only the classifier-related calls are shown. */
static void
oftable_init(struct oftable *table)
  // Initialize the table's classifier with the flow segment boundaries.
  classifier_init(&table->cls, flow_segment_u64s);
  // Configure the prefix tries; the defaults are the IPv4 destination and
  // source address fields listed in default_prefix_fields.
  classifier_set_prefix_fields(&table->cls, default_prefix_fields,
                                 ARRAY_SIZE(default_prefix_fields));

初始化分类器struct classifier相关字段,参数flow_segments指定了分段的个数和结束位置。

/* Sets up 'cls' as an empty classifier: no rules, no subtables, no prefix
 * tries, and publishing enabled.
 *
 * 'flow_segments', if nonnull, lists the segment end positions used for
 * staged lookup.  Entries are copied until either CLS_MAX_INDICES of them
 * have been taken or an entry that is not below FLOW_U64S is reached. */
void
classifier_init(struct classifier *cls, const uint8_t *flow_segments)
{
    int trie_idx;

    /* Empty rule containers. */
    cls->n_rules = 0;
    cmap_init(&cls->subtables_map);
    pvector_init(&cls->subtables);

    /* Record the segment end positions. */
    cls->n_flow_segments = 0;
    if (flow_segments != NULL) {
        const uint8_t *seg;

        for (seg = flow_segments;
             cls->n_flow_segments < CLS_MAX_INDICES && *seg < FLOW_U64S;
             seg++) {
            cls->flow_segments[cls->n_flow_segments] = *seg;
            cls->n_flow_segments++;
        }
    }

    /* No prefix tries are configured initially; every slot is initialized
     * with a NULL field. */
    cls->n_tries = 0;
    trie_idx = 0;
    while (trie_idx < CLS_MAX_TRIES) {
        trie_init(cls, trie_idx, NULL);
        trie_idx++;
    }

    cls->publish = true;
}

classifier_insert
将openflow规则插入分类器。

/* Adds 'rule' to 'cls' in 'version'.  The caller must neither modify nor free
 * 'rule' for as long as it remains in 'cls'.
 *
 * An identical rule (same wildcards, same fixed field values and same
 * priority) must not already be present in 'cls'; this is asserted.  Use
 * classifier_find_rule_exactly() to look for such a rule beforehand. */
void
classifier_insert(struct classifier *cls, const struct cls_rule *rule,
                  ovs_version_t version, const struct cls_conjunction conj[],
                  size_t n_conj)
{
    const struct cls_rule *replaced;

    /* classifier_replace() does the actual insertion and hands back any rule
     * it displaced; here nothing may be displaced. */
    replaced = classifier_replace(cls, rule, version, conj, n_conj);
    ovs_assert(!replaced);
}

/* Inserts 'rule' into 'cls' in 'version'.  Until 'rule' is removed from 'cls',
 * the caller must not modify or free it.
 *
 * If 'cls' already contains an identical rule (including wildcards, values of
 * fixed fields, and priority) that is visible in 'version', replaces the old
 * rule by 'rule' and returns the rule that was replaced.  The caller takes
 * ownership of the returned rule and is thus responsible for destroying it
 * with cls_rule_destroy(), after RCU grace period has passed (see
 * ovsrcu_postpone()).
 *
 * Returns NULL if 'cls' does not contain a rule with an identical key, after
 * inserting the new rule.  In this case, no rules are displaced by the new
 * rule, even rules that cannot have any effect because the new rule matches a
 * superset of their flows and has higher priority.
 */
const struct cls_rule *
classifier_replace(struct classifier *cls, const struct cls_rule *rule,
                   ovs_version_t version,
                   const struct cls_conjunction *conjs, size_t n_conjs)
{
    struct cls_match *new;
    struct cls_subtable *subtable;
    uint32_t ihash[CLS_MAX_INDICES];
    struct cls_match *head;
    unsigned int mask_offset;
    size_t n_rules = 0;
    uint32_t basis;
    uint32_t hash;
    unsigned int i;
    // Allocate a struct cls_match for 'rule' (cls_match_alloc() presumably
    // copies the relevant fields of 'rule' into it) and link it from 'rule'.
    /* 'new' is initially invisible to lookups. */
    new = cls_match_alloc(rule, version, conjs, n_conjs);
    ovsrcu_set(&CONST_CAST(struct cls_rule *, rule)->cls_match, new);
    // Look up the subtable that corresponds to this rule's mask.
    subtable = find_subtable(cls, rule->match.mask);
    if (!subtable) {
        // None found: create a new subtable for this mask.
        subtable = insert_subtable(cls, rule->match.mask);
    }

    /* Compute hashes in segments. */
    basis = 0;
    mask_offset = 0;
    // insert_subtable() split the mask into stages and stored them in
    // index_maps; compute one hash per stage from those per-stage maps.
    for (i = 0; i < subtable->n_indices; i++) {
        ihash[i] = minimatch_hash_range(&rule->match, subtable->index_maps[i],
                                        &mask_offset, &basis);
    }
    // Final stage: 'basis' and 'mask_offset' carry over from the stages
    // above, so this yields the hash over the complete mask.
    hash = minimatch_hash_range(&rule->match, subtable->index_maps[i],
                                &mask_offset, &basis);
    // Look for rules with the same match key in the hash table
    // subtable->rules.
    head = find_equal(subtable, rule->match.flow, hash);
    // Nothing found: this match key is new to the subtable.
    if (!head) {
        /* Add rule to tries.
         *
         * Concurrent readers might miss seeing the rule until this update,
         * which might require being fixed up by revalidation later. */
        // insert_subtable() set subtable->trie_plen[i] to the prefix length
        // of the mask on trie field i (e.g. an IP prefix length).  When it is
        // nonzero the rule is also inserted into that classifier trie, which
        // is what prefix tracking relies on.
        for (i = 0; i < cls->n_tries; i++) {
            if (subtable->trie_plen[i]) {
                trie_insert(&cls->tries[i], rule, subtable->trie_plen[i]);
            }
        }
        // Insert the rule's ports into the subtable's ports trie.
        /* Add rule to ports trie. */
        if (subtable->ports_mask_len) {
            /* We mask the value to be inserted to always have the wildcarded
             * bits in known (zero) state, so we can include them in comparison
             * and they will always match (== their original value does not
             * matter). */
            ovs_be32 masked_ports = minimatch_get_ports(&rule->match);

            trie_insert_prefix(&subtable->ports_trie, &masked_ports,
                               subtable->ports_mask_len);
        }
        // Record the per-stage hashes in subtable->indices; staged lookup
        // probes these.
        /* Add new node to segment indices. */
        for (i = 0; i < subtable->n_indices; i++) {
            ccmap_inc(&subtable->indices[i], ihash[i]);
        }
        // Insert 'new' into subtable->rules for hash lookup.  The return
        // value of cmap_insert() is treated below as the number of entries
        // now in subtable->rules.
        n_rules = cmap_insert(&subtable->rules, &new->cmap_node, hash);
    } else {   /* Equal rules exist in the classifier already. */
        struct cls_match *prev, *iter;
        // 'head' starts the singly linked list of rules sharing this match
        // key.  Other attributes (e.g. priority, actions) may still differ,
        // so walk the list from 'head': an entry of equal priority is
        // replaced, a lower-priority entry gets 'new' inserted in front of
        // it, and if 'new' has the lowest priority it is appended at the end.
        /* Scan the list for the insertion point that will keep the list in
         * order of decreasing priority.  Insert after rules marked invisible
         * in any version of the same priority. */
        FOR_EACH_RULE_IN_LIST_PROTECTED (iter, prev, head) {
            if (rule->priority > iter->priority
                || (rule->priority == iter->priority
                    && !cls_match_is_eventually_invisible(iter))) {
                break;
            }
        }

        /* Replace 'iter' with 'new' or insert 'new' between 'prev' and
         * 'iter'. */
        if (iter) {
            struct cls_rule *old;
            // Same priority: replace 'iter'.
            if (rule->priority == iter->priority) {
                cls_match_replace(prev, iter, new);
                old = CONST_CAST(struct cls_rule *, iter->cls_rule);
            } else {// 'rule' outranks 'iter': insert 'new' in front of 'iter'.
                cls_match_insert(prev, iter, new);
                old = NULL;
            }

            /* Replace the existing head in data structures, if rule is the new
             * head. */
            if (iter == head) {
                cmap_replace(&subtable->rules, &head->cmap_node,
                             &new->cmap_node, hash);
            }

            if (old) {
                struct cls_conjunction_set *conj_set;

                conj_set = ovsrcu_get_protected(struct cls_conjunction_set *,
                                                &iter->conj_set);
                if (conj_set) {
                    ovsrcu_postpone(free, conj_set);
                }

                ovsrcu_set(&old->cls_match, NULL); /* Marks old rule as removed
                                                    * from the classifier. */
                ovsrcu_postpone(cls_match_free_cb, iter);

                /* No change in subtable's max priority or max count. */

                /* Make 'new' visible to lookups in the appropriate version. */
                cls_match_set_remove_version(new, OVS_VERSION_NOT_REMOVED);

                /* Make rule visible to iterators (immediately). */
                rculist_replace(CONST_CAST(struct rculist *, &rule->node),
                                &old->node);

                /* Return displaced rule.  Caller is responsible for keeping it
                 * around until all threads quiesce. */
                return old;
            }
        } else {
            // Append at the end of the list.
            /* 'new' is new node after 'prev' */
            cls_match_insert(prev, iter, new);
        }
    }

    /* Make 'new' visible to lookups in the appropriate version. */
    cls_match_set_remove_version(new, OVS_VERSION_NOT_REMOVED);
    // Append the rule to the list subtable->rules_list.
    /* Make rule visible to iterators (immediately). */
    rculist_push_back(&subtable->rules_list,
                      CONST_CAST(struct rculist *, &rule->node));

    /* Rule was added, not replaced.  Update 'subtable's 'max_priority' and
     * 'max_count', if necessary.
     *
     * The rule was already inserted, but concurrent readers may not see the
     * rule yet as the subtables vector is not updated yet.  This will have to
     * be fixed by revalidation later. */
    // n_rules == 1 means this is the first rule in the subtable.
    if (n_rules == 1) {
        // Only one rule, so max_priority is that rule's priority.
        subtable->max_priority = rule->priority;
        subtable->max_count = 1;
        // The subtable is new as well: add it to the priority-ordered vector
        // cls->subtables, keyed by the rule's priority.
        pvector_insert(&cls->subtables, subtable, rule->priority);
    } else if (rule->priority == subtable->max_priority) {
        // Not the first rule, but its priority equals the current maximum.
        ++subtable->max_count;
    } else if (rule->priority > subtable->max_priority) {
        // Not the first rule, and its priority exceeds the current maximum:
        // raise the subtable's maximum priority.
        subtable->max_priority = rule->priority;
        subtable->max_count = 1;
        // Re-sort cls->subtables according to the new maximum priority.
        pvector_change_priority(&cls->subtables, subtable, rule->priority);
    }
    // Update the classifier's total rule count.
    /* Nothing was replaced. */
    cls->n_rules++;
    
    // Per the pvector implementation (not shown here), the changes made to
    // cls->subtables above are staged in a temporary copy ('tmp'); only
    // pvector_publish() moves them into pvec->impl, which is what lookups
    // read.
    if (cls->publish) {
        pvector_publish(&cls->subtables);
    }

    return NULL;
}

根据流表的mask创建新的subtable,并插入cls->subtables_map

/* Creates a subtable for 'mask', adds it to 'cls->subtables_map' and returns
 * it.  The subtable gets its own copy of 'mask', the per-stage maps used for
 * staged lookup, the per-trie prefix lengths and an empty ports trie.
 *
 * The new subtable will be visible to the readers only after this. */
static struct cls_subtable *
insert_subtable(struct classifier *cls, const struct minimask *mask)
{
    uint32_t hash = minimask_hash(mask, 0);
    struct cls_subtable *subtable;
    int i, index = 0;
    struct flowmap stage_map;
    uint8_t prev;
    size_t count = miniflow_n_values(&mask->masks);
    // Allocate the subtable; MINIFLOW_VALUES_SIZE(count) is the extra space
    // for the mask's values.
    subtable = xzalloc(sizeof *subtable + MINIFLOW_VALUES_SIZE(count));
    cmap_init(&subtable->rules);
    // Copy the mask into the subtable.
    miniflow_clone(CONST_CAST(struct miniflow *, &subtable->mask.masks),
                   &mask->masks, count);
    // Split the mask into stages and store each stage's map.
    /* Init indices for segmented lookup, if any. */
    prev = 0;
    for (i = 0; i < cls->n_flow_segments; i++) {
        stage_map = miniflow_get_map_in_range(&mask->masks, prev,
                                              cls->flow_segments[i]);
        /* Add an index if it adds mask bits. */
        if (!flowmap_is_empty(stage_map)) {
            ccmap_init(&subtable->indices[index]);
            *CONST_CAST(struct flowmap *, &subtable->index_maps[index])
                = stage_map;
            index++;
        }
        prev = cls->flow_segments[i];
    }
    /* Map for the final stage. */
    *CONST_CAST(struct flowmap *, &subtable->index_maps[index])
        = miniflow_get_map_in_range(&mask->masks, prev, FLOW_U64S);
    /* Check if the final stage adds any bits. */
    if (index > 0) {
        if (flowmap_is_empty(subtable->index_maps[index])) {
            /* Remove the last index, as it has the same fields as the rules
             * map. */
            --index;
            ccmap_destroy(&subtable->indices[index]);
        }
    }
    *CONST_CAST(uint8_t *, &subtable->n_indices) = index;
    // If the mask covers one of the classifier's trie fields, record its
    // prefix length; rules inserted into this subtable later are then also
    // inserted into that trie.
    for (i = 0; i < cls->n_tries; i++) {
        subtable->trie_plen[i] = minimask_get_prefix_len(mask,
                                                         cls->tries[i].field);
    }

    // If the mask covers the transport ports (the 32 bits read at tp_src),
    // rules are also inserted into subtable->ports_trie.  ports_mask_len is
    // 32 minus ctz32() of the host-order ports mask.
    /* Ports trie. */
    ovsrcu_set_hidden(&subtable->ports_trie, NULL);
    *CONST_CAST(int *, &subtable->ports_mask_len)
        = 32 - ctz32(ntohl(MINIFLOW_GET_BE32(&mask->masks, tp_src)));

    /* List of rules. */
    rculist_init(&subtable->rules_list);
    // Finally add the subtable to cls->subtables_map.
    cmap_insert(&cls->subtables_map, &subtable->cmap_node, hash);

    return subtable;
}

classifier_lookup
分类器查找函数用于在分类器中查找匹配flow,并且对指定版本可见的流表信息,参数wc是需要在classifier_lookup过程中赋值的,将查找分类器时用到的field mask保存到wc,以便下发megaflow时使用。

在查找中还用到了上面介绍的Tuple Priority Sorting,Staged Lookup和Prefix Tracking等优化技术。

/* Looks up 'flow' in 'cls' and returns the highest-priority rule that matches
 * it and is visible in 'version', or a null pointer when no rule matches.
 * When several matching rules share the same priority, an arbitrary one of
 * them is returned.
 *
 * When a rule is found and 'wc' is nonnull, the bits that were significant in
 * the lookup are bitwise-OR'ed into 'wc'.  'wc' should have been initialized
 * at some earlier point (e.g., by flow_wildcards_init_catchall()).
 *
 * 'flow' is non-const because the lookup may modify it temporarily; any such
 * changes are undone before returning. */
const struct cls_rule *
classifier_lookup(const struct classifier *cls, ovs_version_t version,
                  struct flow *flow, struct flow_wildcards *wc)
{
    /* External callers always get conjunctive match support. */
    const bool allow_conjunctive_matches = true;

    return classifier_lookup__(cls, version, flow, wc,
                               allow_conjunctive_matches);
}

/* Like classifier_lookup(), except that support for conjunctive matches can be
 * configured with 'allow_conjunctive_matches'.  That feature is not exposed
 * externally because turning off conjunctive matches is only useful to avoid
 * recursion within this function itself.
 *
 * 'flow' is non-const to allow for temporary modifications during the lookup.
 * Any changes are restored before returning.
 *
 * (This is an excerpt: the conjunctive-match handling is elided with "...".) */
static const struct cls_rule *
classifier_lookup__(const struct classifier *cls, ovs_version_t version,
                    struct flow *flow, struct flow_wildcards *wc,
                    bool allow_conjunctive_matches)
{
    struct trie_ctx trie_ctx[CLS_MAX_TRIES];
    const struct cls_match *match;
    /* Highest-priority flow in 'cls' that certainly matches 'flow'. */
    // 'hard' holds the highest-priority matching rule found so far.
    const struct cls_match *hard = NULL;
    int hard_pri = INT_MIN;     /* hard ? hard->priority : INT_MIN. */

    // The following variables support conjunctive matches, which this
    // walkthrough sets aside for now.
    /* Highest-priority conjunctive flows in 'cls' matching 'flow'.  Since
     * these are (components of) conjunctive flows, we can only know whether
     * the full conjunctive flow matches after seeing multiple of them.  Thus,
     * we refer to these as "soft matches". */
    struct cls_conjunction_set *soft_stub[64];
    struct cls_conjunction_set **soft = soft_stub;
    size_t n_soft = 0, allocated_soft = ARRAY_SIZE(soft_stub);
    int soft_pri = INT_MIN;    /* n_soft ? MAX(soft[*]->priority) : INT_MIN. */

    /* Synchronize for cls->n_tries and subtable->trie_plen.  They can change
     * when table configuration changes, which happens typically only on
     * startup. */
    atomic_thread_fence(memory_order_acquire);

    /* Initialize trie contexts for find_match_wc(). */
    for (int i = 0; i < cls->n_tries; i++) {
        trie_ctx_init(&trie_ctx[i], &cls->tries[i]);
    }
    // Main loop: walk the priority-ordered vector cls->subtables.
    // PVECTOR_FOR_EACH_PRIORITY is given hard_pri + 1 as its priority bound,
    // so iteration ends once the remaining subtables cannot hold anything
    // above hard_pri (macro body not shown here; presumably it visits entries
    // whose priority is >= the bound).  This is the Tuple Priority Sorting
    // optimization.
    /* Main loop. */
    struct cls_subtable *subtable;
    PVECTOR_FOR_EACH_PRIORITY (subtable, hard_pri + 1, 2, sizeof *subtable,
                               &cls->subtables) {
        struct cls_conjunction_set *conj_set;

        /* Skip subtables with no match, or where the match is lower-priority
         * than some certain match we've already found. */
        // Search this subtable for a match.
        match = find_match_wc(subtable, version, flow, trie_ctx, cls->n_tries, wc);
        // No match, or a match that does not beat the best one so far: move
        // on to the next subtable.
        if (!match || match->priority <= hard_pri) {
            continue;
        }

        conj_set = ovsrcu_get(struct cls_conjunction_set *, &match->conj_set);
        if (!conj_set) {
            /* 'match' isn't part of a conjunctive match.  It's the best
             * certain match we've got so far, since we know that it's
             * higher-priority than hard_pri.
             *
             * (There might be a higher-priority conjunctive match.  We can't
             * tell yet.) */
            // Remember this match and its priority.
            hard = match;
            hard_pri = hard->priority;
        } else if (allow_conjunctive_matches) {
                ...
        }
    }

    /* In the common case, at this point we have no soft matches and we can
     * return immediately.  (We do the same thing if we have potential soft
     * matches but none of them are higher-priority than our hard match.) */
    if (hard_pri >= soft_pri) {
        if (soft != soft_stub) {
            free(soft);
        }
        // Return the cls_rule of the best match, or NULL if nothing matched.
        return hard ? hard->cls_rule : NULL;
    }
    ...
}

在subtable查找flow匹配的规则

/* Searches 'subtable' for a rule that matches 'flow' and is visible in
 * 'version'.  Returns the rule, or NULL if there is none.
 *
 * If 'wc' is NULL this is a plain find_match() on the hash of 'flow' under the
 * subtable's mask.  Otherwise the lookup proceeds stage by stage (consulting
 * the prefix tries and the per-stage indices) so that it can stop early, and
 * 'wc' is updated with the mask bits that were actually examined. */
static const struct cls_match *
find_match_wc(const struct cls_subtable *subtable, ovs_version_t version,
              const struct flow *flow, struct trie_ctx trie_ctx[CLS_MAX_TRIES],
              unsigned int n_tries, struct flow_wildcards *wc)
{
    if (OVS_UNLIKELY(!wc)) {
        return find_match(subtable, version, flow,
                          flow_hash_in_minimask(flow, &subtable->mask, 0));
    }

    uint32_t basis = 0, hash;
    const struct cls_match *rule = NULL;
    struct flowmap stages_map = FLOWMAP_EMPTY_INITIALIZER;
    unsigned int mask_offset = 0;
    int i;
    // Staged lookup.
    /* Try to finish early by checking fields in segments. */
    for (i = 0; i < subtable->n_indices; i++) {
        // Consult the prefix tries first; a true result from check_tries()
        // means this subtable cannot match 'flow', so bail out.
        if (check_tries(trie_ctx, n_tries, subtable->trie_plen,
                        subtable->index_maps[i], flow, wc)) {
            /* 'wc' bits for the trie field set, now unwildcard the preceding
             * bits used so far. */
            goto no_match;
        }
        // Merge this stage's map into the accumulated map.
        /* Accumulate the map used so far. */
        stages_map = flowmap_or(stages_map, subtable->index_maps[i]);
        // Extend the flow hash incrementally with this stage.
        hash = flow_hash_in_minimask_range(flow, &subtable->mask,
                                           subtable->index_maps[i],
                                           &mask_offset, &basis);
        // Probe subtable->indices[i] with the hash; a miss means no rule in
        // this subtable matches, so stop at no_match.
        if (!ccmap_find(&subtable->indices[i], hash)) {
            goto no_match;
        }
    }
    /* Trie check for the final range. */
    if (check_tries(trie_ctx, n_tries, subtable->trie_plen,
                    subtable->index_maps[i], flow, wc)) {
        goto no_match;
    }
    // Every earlier stage hit: compute the full hash and look it up in the
    // hash table via find_match().
    hash = flow_hash_in_minimask_range(flow, &subtable->mask,
                                       subtable->index_maps[i],
                                       &mask_offset, &basis);
    rule = find_match(subtable, version, flow, hash);
    if (!rule && subtable->ports_mask_len) {
        /* The final stage had ports, but there was no match.  Instead of
         * unwildcarding all the ports bits, use the ports trie to figure out a
         * smaller set of bits to unwildcard. */
        unsigned int mbits;
        ovs_be32 value, plens, mask;

        mask = MINIFLOW_GET_BE32(&subtable->mask.masks, tp_src);
        value = ((OVS_FORCE ovs_be32 *)flow)[TP_PORTS_OFS32] & mask;
        mbits = trie_lookup_value(&subtable->ports_trie, &value, &plens, 32);

        ((OVS_FORCE ovs_be32 *)&wc->masks)[TP_PORTS_OFS32] |=
            mask & be32_prefix_mask(mbits);

        goto no_match;
    }
    // Reached with a matching rule, or with rule == NULL when the subtable
    // has no ports mask: fold the whole subtable->mask into wc.
    /* Must unwildcard all the fields, as they were looked at. */
    flow_wildcards_fold_minimask(wc, &subtable->mask);
    return rule;

no_match:
    /* Unwildcard the bits in stages so far, as they were used in determining
     * there is no match. */
    // No match: fold only the mask bits of the stages examined so far
    // (stages_map) into wc.
    flow_wildcards_fold_minimask_in_map(wc, &subtable->mask, stages_map);
    return NULL;
}

find_match根据hash查找hash表subtable->rules,逐个比较匹配域,返回对指定version可见的优先级最高的规则;如果没有匹配的规则则返回NULL。
/* Looks in 'subtable->rules' under 'hash' for an entry whose match key, under
 * the subtable's mask, agrees with 'flow'.  Returns the first rule in that
 * entry's list that is visible in 'version' (the list is kept in order of
 * decreasing priority), or NULL if there is no such rule. */
static inline const struct cls_match *
find_match(const struct cls_subtable *subtable, ovs_version_t version,
           const struct flow *flow, uint32_t hash)
{
    const struct cls_match *list_head;

    /* Visit every list head stored under 'hash'. */
    CMAP_FOR_EACH_WITH_HASH (list_head, cmap_node, hash, &subtable->rules) {
        const struct cls_match *candidate;
        bool key_matches = miniflow_and_mask_matches_flow(&list_head->flow,
                                                          &subtable->mask,
                                                          flow);

        if (OVS_LIKELY(key_matches)) {
            /* Walk the list from its head and stop at the first visible
             * rule. */
            CLS_MATCH_FOR_EACH (candidate, list_head) {
                bool visible = cls_match_visible_in_version(candidate,
                                                            version);

                if (OVS_LIKELY(visible)) {
                    return candidate;
                }
            }
        }
    }

    return NULL;
}

添加流表时如果不指定优先级,则默认优先级为

/* By default, choose a priority in the middle. */
#define OFP_DEFAULT_PRIORITY 0x8000

参考

https://www.usenix.org/conference/nsdi15/technical-sessions/presentation/pfaff
https://software.intel.com/content/www/us/en/develop/articles/ovs-dpdk-datapath-classifier.html
https://software.intel.com/content/www/us/en/develop/articles/ovs-dpdk-datapath-classifier-part-2.html
https://zhuanlan.zhihu.com/p/66561734
https://segmentfault.com/a/1190000020458867
https://www.sdnlab.com/15713.html

你可能感兴趣的:(ovs分类器实现)