Whether it is the kernel datapath or the DPDK-based userspace datapath, a packet that misses in the flow table goes into upcall processing (I like to call this path the slow path, which makes the datapath the fast path~).
The upcall handler function udpif_upcall_handler is set up in udpif_start_threads, which also creates the udpif_revalidator threads.
/* Starts the handler and revalidator threads, must be enclosed in
* ovsrcu quiescent state. */
static void
udpif_start_threads(struct udpif *udpif, size_t n_handlers,
size_t n_revalidators)
{
if (udpif && n_handlers && n_revalidators) {
size_t i;
bool enable_ufid;
udpif->n_handlers = n_handlers;
udpif->n_revalidators = n_revalidators;
udpif->handlers = xzalloc(udpif->n_handlers * sizeof *udpif->handlers);
for (i = 0; i < udpif->n_handlers; i++) {
struct handler *handler = &udpif->handlers[i];
handler->udpif = udpif;
handler->handler_id = i;
handler->thread = ovs_thread_create(
"handler", udpif_upcall_handler, handler);
}
enable_ufid = ofproto_dpif_get_enable_ufid(udpif->backer);
atomic_init(&udpif->enable_ufid, enable_ufid);
dpif_enable_upcall(udpif->dpif);
ovs_barrier_init(&udpif->reval_barrier, udpif->n_revalidators);
ovs_barrier_init(&udpif->pause_barrier, udpif->n_revalidators + 1);
udpif->reval_exit = false;
udpif->pause = false;
udpif->revalidators = xzalloc(udpif->n_revalidators
* sizeof *udpif->revalidators);
for (i = 0; i < udpif->n_revalidators; i++) {
struct revalidator *revalidator = &udpif->revalidators[i];
revalidator->udpif = udpif;
revalidator->thread = ovs_thread_create(
"revalidator", udpif_revalidator, revalidator);
}
}
}
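The "ovsrcu quiescent state" requirement in the comment above is satisfied by the caller, udpif_set_threads. Roughly, condensed from ofproto/ofproto-dpif-upcall.c with error handling omitted (details vary by OVS version):
void
udpif_set_threads(struct udpif *udpif, size_t n_handlers,
                  size_t n_revalidators)
{
    ovsrcu_quiesce_start();
    if (udpif->n_handlers != n_handlers
        || udpif->n_revalidators != n_revalidators) {
        udpif_stop_threads(udpif);
    }
    if (!udpif->handlers && !udpif->revalidators) {
        /* Tell the dpif how many upcall channels to set up, then spawn
         * the handler and revalidator threads. */
        dpif_handlers_set(udpif->dpif, n_handlers);
        udpif_start_threads(udpif, n_handlers, n_revalidators);
    }
    ovsrcu_quiesce_end();
}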
udpif_upcall_handler waits by polling a file descriptor; when an upcall is delivered, it enters the recv_upcalls handler.
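The handler thread's main loop is short; paraphrased (slightly condensed) from ofproto/ofproto-dpif-upcall.c:
static void *
udpif_upcall_handler(void *arg)
{
    struct handler *handler = arg;
    struct udpif *udpif = handler->udpif;

    while (!latch_is_set(&udpif->exit_latch)) {
        if (recv_upcalls(handler)) {
            /* Handled a batch; poll again right away in case more upcalls
             * are already queued. */
            poll_immediate_wake();
        } else {
            /* Nothing to do: sleep until the dpif has an upcall for this
             * handler_id, or until we are asked to exit. */
            dpif_recv_wait(udpif->dpif, handler->handler_id);
            latch_wait(&udpif->exit_latch);
        }
        poll_block();
    }
    return NULL;
}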
Let's first look at a few related data structures. struct udpif is the structure ofproto-dpif uses for upcall handling; it covers two logically separate parts: upcall handling and flow revalidation (reclaiming stale flows).
/* An upcall handler for ofproto_dpif.
*
* udpif keeps records of two kind of logically separate units:
*
* upcall handling
* ---------------
*
* - An array of 'struct handler's for upcall handling and flow
* installation.
*
* flow revalidation
* -----------------
*
* - Revalidation threads which read the datapath flow table and maintains
* them.
*/
struct udpif {
struct ovs_list list_node; /* In all_udpifs list. */
struct dpif *dpif; /* Datapath handle. */
struct dpif_backer *backer; /* Opaque dpif_backer pointer. */
struct handler *handlers; /* Upcall handlers. */
size_t n_handlers;
struct revalidator *revalidators; /* Flow revalidators. */
size_t n_revalidators;
struct latch exit_latch; /* Tells child threads to exit. */
/* There are 'N_UMAPS' maps containing 'struct udpif_key' elements.
*
* During the flow dump phase, revalidators insert into these with a random
* distribution. During the garbage collection phase, each revalidator
* takes care of garbage collecting a slice of these maps. */
struct umap *ukeys;
};
struct umap wraps a cmap, OVS's concurrent cuckoo-hash table, and is used to look up datapath flows through their udpif_key; when a struct udpif is created, it allocates N_UMAPS of these maps.
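As a sketch, each umap is just a concurrent map plus a mutex, and a ukey is assigned to one of the N_UMAPS maps by its hash (the struct is from ofproto/ofproto-dpif-upcall.c; the helper below is a hypothetical condensation of ukey_install__, not the real function):
struct umap {
    struct ovs_mutex mutex;            /* Take for writing to the following. */
    struct cmap cmap;                  /* Datapath flow keys. */
};

/* Hypothetical helper: how a new ukey lands in one of udpif->ukeys[]. */
static void
ukey_insert_sketch(struct udpif *udpif, struct udpif_key *new_ukey)
{
    struct umap *umap = &udpif->ukeys[new_ukey->hash % N_UMAPS];

    ovs_mutex_lock(&umap->mutex);
    cmap_insert(&umap->cmap, &new_ukey->cmap_node, new_ukey->hash);
    ovs_mutex_unlock(&umap->mutex);
}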
struct dp_packet is the wrapper around the actual packet; with the DPDK datapath, this metadata is laid out in the memory immediately following the embedded rte_mbuf.
/* Buffer for holding packet data. A dp_packet is automatically reallocated
* as necessary if it grows too large for the available memory.
*/
struct dp_packet {
#ifdef DPDK_NETDEV
struct rte_mbuf mbuf; /* DPDK mbuf */
#else
void *base_; /* First byte of allocated space. */
uint16_t allocated_; /* Number of bytes allocated. */
uint16_t data_ofs; /* First byte actually in use. */
uint32_t size_; /* Number of bytes in use. */
uint32_t rss_hash; /* Packet hash. */
bool rss_hash_valid; /* Is the 'rss_hash' valid? */
#endif
enum dp_packet_source source; /* Source of memory allocated as 'base'. */
uint8_t l2_pad_size; /* Detected l2 padding size.
* Padding is non-pullable. */
uint16_t l2_5_ofs; /* MPLS label stack offset, or UINT16_MAX */
uint16_t l3_ofs; /* Network-level header offset,
* or UINT16_MAX. */
uint16_t l4_ofs; /* Transport-level header offset,
or UINT16_MAX. */
uint32_t cutlen; /* length in bytes to cut from the end. */
union {
struct pkt_metadata md;
uint64_t data[DP_PACKET_CONTEXT_SIZE / 8];
};
};
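Callers are not supposed to read these fields directly; they go through the inline accessors in lib/dp-packet.h (dp_packet_data(), dp_packet_size(), dp_packet_l3(), ...), which hide whether a given value lives in the embedded rte_mbuf or in the plain fields. A hypothetical helper, just for illustration (not part of OVS):
/* Works identically on DPDK and non-DPDK builds because it only uses the
 * dp-packet accessors. */
static void
print_packet_layout(const struct dp_packet *p)
{
    const void *data = dp_packet_data(p);   /* first byte of the frame */
    uint32_t size = dp_packet_size(p);      /* number of bytes in use */
    const void *l3 = dp_packet_l3(p);       /* NULL until the packet is parsed */

    printf("%"PRIu32" bytes at %p, l3 %s\n",
           size, data, l3 ? "present" : "not parsed");
}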
/* Datapath packet metadata */
struct pkt_metadata {
uint32_t recirc_id; /* Recirculation id carried with the
recirculating packets. 0 for packets
received from the wire. */
uint32_t dp_hash; /* hash value computed by the recirculation
action. */
uint32_t skb_priority; /* Packet priority for QoS. */
uint32_t pkt_mark; /* Packet mark. */
uint16_t ct_state; /* Connection state. */
uint16_t ct_zone; /* Connection zone. */
uint32_t ct_mark; /* Connection mark. */
ovs_u128 ct_label; /* Connection label. */
union flow_in_port in_port; /* Input port. */
struct flow_tnl tunnel; /* Encapsulating tunnel parameters. Note that
* if 'ip_dst' == 0, the rest of the fields may
* be uninitialized. */
};
/* Tunnel information used in flow key and metadata. */
struct flow_tnl {
ovs_be32 ip_dst;
struct in6_addr ipv6_dst;
ovs_be32 ip_src;
struct in6_addr ipv6_src;
ovs_be64 tun_id;
uint16_t flags;
uint8_t ip_tos;
uint8_t ip_ttl;
ovs_be16 tp_src;
ovs_be16 tp_dst;
ovs_be16 gbp_id;
uint8_t gbp_flags;
uint8_t pad1[5]; /* Pad to 64 bits. */
struct tun_metadata metadata;
};
struct dpif_upcall represents one packet's upcall; besides the packet data it also carries the netlink attributes delivered with the upcall.
/* A packet passed up from the datapath to userspace.
*
* The 'packet', 'key' and 'userdata' may point into data in a buffer
* provided by the caller, so the buffer should be released only after the
* upcall processing has been finished.
*
* While being processed, the 'packet' may be reallocated, so the packet must
* be separately released with ofpbuf_uninit().
*/
struct dpif_upcall {
/* All types. */
enum dpif_upcall_type type;
struct dp_packet packet; /* Packet data. */
struct nlattr *key; /* Flow key. */
size_t key_len; /* Length of 'key' in bytes. */
ovs_u128 ufid; /* Unique flow identifier for 'key'. */
struct nlattr *mru; /* Maximum receive unit. */
struct nlattr *cutlen; /* Number of bytes shrink from the end. */
/* DPIF_UC_ACTION only. */
struct nlattr *userdata; /* Argument to OVS_ACTION_ATTR_USERSPACE. */
struct nlattr *out_tun_key; /* Output tunnel key. */
struct nlattr *actions; /* Argument to OVS_ACTION_ATTR_USERSPACE. */
};
recv_upcalls handles up to UPCALL_MAX_BATCH upcalls per pass. Taking a single upcall as an example, the first call is dpif_recv, which dispatches to the function registered as dpif_class->recv; the received data is stored in a struct dpif_upcall and a struct ofpbuf.
/* Polls for an upcall from 'dpif' for an upcall handler. Since there
 * can be multiple poll loops, 'handler_id' is needed as index to
* identify the corresponding poll loop. If successful, stores the upcall
* into '*upcall', using 'buf' for storage. Should only be called if
* 'recv_set' has been used to enable receiving packets from 'dpif'.
*
* 'upcall->key' and 'upcall->userdata' point into data in the caller-provided
* 'buf', so their memory cannot be freed separately from 'buf'.
*
* The caller owns the data of 'upcall->packet' and may modify it. If
* packet's headroom is exhausted as it is manipulated, 'upcall->packet'
* will be reallocated. This requires the data of 'upcall->packet' to be
* released with ofpbuf_uninit() before 'upcall' is destroyed. However,
* when an error is returned, the 'upcall->packet' may be uninitialized
* and should not be released.
*
* Returns 0 if successful, otherwise a positive errno value. Returns EAGAIN
* if no upcall is immediately available. */
int
dpif_recv(struct dpif *dpif, uint32_t handler_id, struct dpif_upcall *upcall,
struct ofpbuf *buf)
{
int error = EAGAIN;
if (dpif->dpif_class->recv) {
error = dpif->dpif_class->recv(dpif, handler_id, upcall, buf);
if (!error) {
dpif_print_packet(dpif, upcall);
} else if (error != EAGAIN) {
log_operation(dpif, "recv", error);
}
}
return error;
}
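To keep the following steps in context, here is a heavily abridged sketch of the recv_upcalls() batching loop (error paths, MRU/fragment handling, statistics, and final cleanup are omitted; see ofproto/ofproto-dpif-upcall.c for the real thing):
static size_t
recv_upcalls(struct handler *handler)
{
    struct udpif *udpif = handler->udpif;
    uint64_t recv_stubs[UPCALL_MAX_BATCH][512 / 8];
    struct ofpbuf recv_bufs[UPCALL_MAX_BATCH];
    struct dpif_upcall dupcalls[UPCALL_MAX_BATCH];
    struct upcall upcalls[UPCALL_MAX_BATCH];
    struct flow flows[UPCALL_MAX_BATCH];
    size_t n_upcalls = 0;

    while (n_upcalls < UPCALL_MAX_BATCH) {
        struct ofpbuf *recv_buf = &recv_bufs[n_upcalls];
        struct dpif_upcall *dupcall = &dupcalls[n_upcalls];
        struct upcall *upcall = &upcalls[n_upcalls];
        struct flow *flow = &flows[n_upcalls];

        /* Step 1: pull one upcall out of the dpif. */
        ofpbuf_use_stub(recv_buf, recv_stubs[n_upcalls],
                        sizeof recv_stubs[n_upcalls]);
        if (dpif_recv(udpif->dpif, handler->handler_id, dupcall, recv_buf)) {
            ofpbuf_uninit(recv_buf);
            break;
        }

        /* Step 2: parse the netlink flow key and build the struct upcall. */
        if (odp_flow_key_to_flow(dupcall->key, dupcall->key_len, flow)
            == ODP_FIT_ERROR
            || upcall_receive(upcall, udpif->backer, &dupcall->packet,
                              dupcall->type, dupcall->userdata, flow,
                              0 /* mru handling omitted */,
                              &dupcall->ufid, PMD_ID_NULL)) {
            dp_packet_uninit(&dupcall->packet);
            ofpbuf_uninit(recv_buf);
            break;
        }
        upcall->key = dupcall->key;
        upcall->key_len = dupcall->key_len;

        /* Step 3: for a miss, translate the OpenFlow pipeline into actions. */
        process_upcall(udpif, upcall, &upcall->odp_actions, &upcall->wc);

        n_upcalls++;
    }

    /* Step 4: install flows / execute actions for the whole batch. */
    if (n_upcalls) {
        handle_upcalls(handler->udpif, upcalls, n_upcalls);
    }
    return n_upcalls;
}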
The second step is to call upcall_receive, which builds up a struct upcall.
static int
upcall_receive(struct upcall *upcall, const struct dpif_backer *backer,
const struct dp_packet *packet, enum dpif_upcall_type type,
const struct nlattr *userdata, const struct flow *flow,
const unsigned int mru,
const ovs_u128 *ufid, const unsigned pmd_id)
{
int error;
error = xlate_lookup(backer, flow, &upcall->ofproto, &upcall->ipfix,
&upcall->sflow, NULL, &upcall->in_port);
if (error) {
return error;
}
upcall->recirc = NULL;
upcall->have_recirc_ref = false;
upcall->flow = flow;
upcall->packet = packet;
upcall->ufid = ufid;
upcall->pmd_id = pmd_id;
upcall->type = type;
upcall->userdata = userdata;
ofpbuf_use_stub(&upcall->odp_actions, upcall->odp_actions_stub,
sizeof upcall->odp_actions_stub);
ofpbuf_init(&upcall->put_actions, 0);
upcall->xout_initialized = false;
upcall->ukey_persists = false;
upcall->ukey = NULL;
upcall->key = NULL;
upcall->key_len = 0;
upcall->mru = mru;
upcall->out_tun_key = NULL;
upcall->actions = NULL;
return 0;
}
/* Given a datapath and flow metadata ('backer', and 'flow' respectively),
* optionally populates 'ofproto' with the ofproto_dpif, 'ofp_in_port' with the
* openflow in_port, and 'ipfix', 'sflow', and 'netflow' with the appropriate
* handles for those protocols if they're enabled. Caller may use the returned
* pointers until quiescing, for longer term use additional references must
* be taken.
*
* Returns 0 if successful, ENODEV if the parsed flow has no associated ofproto.
*/
int
xlate_lookup(const struct dpif_backer *backer, const struct flow *flow,
struct ofproto_dpif **ofprotop, struct dpif_ipfix **ipfix,
struct dpif_sflow **sflow, struct netflow **netflow,
ofp_port_t *ofp_in_port)
Finally, process_upcall handles the struct upcall. The handling differs by upcall type; here we only look at MISS_UPCALL, which ends up in upcall_xlate.
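The relevant branch of process_upcall() is small (paraphrased; the sFlow/IPFIX/flow-sample branches and BAD_UPCALL handling are collapsed here):
static int
process_upcall(struct udpif *udpif, struct upcall *upcall,
               struct ofpbuf *odp_actions, struct flow_wildcards *wc)
{
    switch (classify_upcall(upcall->type, upcall->userdata)) {
    case MISS_UPCALL:
        /* Flow-table miss: run the OpenFlow pipeline to produce actions. */
        upcall_xlate(udpif, upcall, odp_actions, wc);
        return 0;

    case SFLOW_UPCALL:
    case FLOW_SAMPLE_UPCALL:
    case IPFIX_UPCALL:
        /* Sampling upcalls are handed to the respective module (omitted). */
        break;

    case BAD_UPCALL:
        break;
    }
    return EAGAIN;
}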
upcall_xlate first initializes an xlate_in; struct xlate_in looks like this:
struct xlate_in {
struct ofproto_dpif *ofproto;
/* Flow to which the OpenFlow actions apply. xlate_actions() will modify
* this flow when actions change header fields. */
struct flow flow;
/* The packet corresponding to 'flow', or a null pointer if we are
* revalidating without a packet to refer to. */
const struct dp_packet *packet;
/* Should OFPP_NORMAL update the MAC learning table? Should "learn"
* actions update the flow table?
*
* We want to update these tables if we are actually processing a packet,
* or if we are accounting for packets that the datapath has processed, but
* not if we are just revalidating. */
bool may_learn;
/* The rule initiating translation or NULL. If both 'rule' and 'ofpacts'
* are NULL, xlate_actions() will do the initial rule lookup itself. */
struct rule_dpif *rule;
/* The actions to translate. If 'rule' is not NULL, these may be NULL. */
const struct ofpact *ofpacts;
size_t ofpacts_len;
/* Union of the set of TCP flags seen so far in this flow. (Used only by
* NXAST_FIN_TIMEOUT. Set to zero to avoid updating updating rules'
* timeouts.) */
uint16_t tcp_flags;
/* If nonnull, flow translation calls this function just before executing a
* resubmit or OFPP_TABLE action. In addition, disables logging of traces
* when the recursion depth is exceeded.
*
* 'rule' is the rule being submitted into. It will be null if the
* resubmit or OFPP_TABLE action didn't find a matching rule.
*
* 'indentation' is the resubmit recursion depth at time of invocation,
* suitable for indenting the output.
*
* This is normally null so the client has to set it manually after
* calling xlate_in_init(). */
void (*resubmit_hook)(struct xlate_in *, struct rule_dpif *rule,
int indentation);
/* If nonnull, flow translation calls this function to report some
* significant decision, e.g. to explain why OFPP_NORMAL translation
* dropped a packet. 'indentation' is the resubmit recursion depth at time
* of invocation, suitable for indenting the output. */
void (*report_hook)(struct xlate_in *, int indentation,
const char *format, va_list args);
/* If nonnull, flow translation credits the specified statistics to each
* rule reached through a resubmit or OFPP_TABLE action.
*
* This is normally null so the client has to set it manually after
* calling xlate_in_init(). */
const struct dpif_flow_stats *resubmit_stats;
/* Counters carried over from a pre-existing translation of a related flow.
* This can occur due to, e.g., the translation of an ARP packet that was
* generated as the result of outputting to a tunnel port. In that case,
* the original flow going to the tunnel is the related flow. Since the
* two flows are different, they should not use the same xlate_ctx
* structure. However, we still need limit the maximum recursion across
* the entire translation.
*
* These fields are normally set to zero, so the client has to set them
* manually after calling xlate_in_init(). In that case, they should be
* copied from the same-named fields in the related flow's xlate_ctx.
*
* These fields are really implementation details; the client doesn't care
* about what they mean. See the corresponding fields in xlate_ctx for
* real documentation. */
int indentation;
int depth;
int resubmits;
/* If nonnull, flow translation populates this cache with references to all
* modules that are affected by translation. This 'xlate_cache' may be
* passed to xlate_push_stats() to perform the same function as
* xlate_actions() without the full cost of translation.
*
* This is normally null so the client has to set it manually after
* calling xlate_in_init(). */
struct xlate_cache *xcache;
/* If nonnull, flow translation puts the resulting datapath actions in this
* buffer. If null, flow translation will not produce datapath actions. */
struct ofpbuf *odp_actions;
/* If nonnull, flow translation populates this with wildcards relevant in
* translation. Any fields that were used to calculate the action are set,
* to allow caching and kernel wildcarding to work. For example, if the
* flow lookup involved performing the "normal" action on IPv4 and ARP
* packets, 'wc' would have the 'in_port' (always set), 'dl_type' (flow
* match), 'vlan_tci' (normal action), and 'dl_dst' (normal action) fields
* set. */
struct flow_wildcards *wc;
/* The frozen state to be resumed, as returned by xlate_lookup(). */
const struct frozen_state *frozen_state;
};
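In upcall_xlate the interesting fields are filled in roughly like this (a condensed fragment; 'wc' and 'odp_actions' are the &upcall->wc and &upcall->odp_actions pointers passed down from process_upcall, and recirculation and error handling are omitted):
    struct dpif_flow_stats stats;
    struct xlate_in xin;
    int error;

    /* Credit this single packet's statistics to the rules it traverses. */
    stats.n_packets = 1;
    stats.n_bytes = dp_packet_size(upcall->packet);
    stats.used = time_msec();
    stats.tcp_flags = ntohs(upcall->flow->tcp_flags);

    xlate_in_init(&xin, upcall->ofproto,
                  ofproto_dpif_get_tables_version(upcall->ofproto),
                  upcall->flow, upcall->in_port, NULL, stats.tcp_flags,
                  upcall->packet, wc, odp_actions);
    xin.resubmit_stats = &stats;

    error = xlate_actions(&xin, &upcall->xout);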
xlate_actions then generates the struct xlate_out that the datapath needs. xlate_actions is a fairly involved function; its most important call is rule_dpif_lookup_from_table, which finds the matching flow-table rule from which the actions are generated.
rule_dpif_lookup_from_table in turn walks the chained OpenFlow tables in order, calling rule_dpif_lookup_in_table on each table.
/* Look up 'flow' in 'ofproto''s classifier version 'version', starting from
* table '*table_id'. Returns the rule that was found, which may be one of the
* special rules according to packet miss handling. If 'may_packet_in' is
* false, returning of the miss_rule (which issues packet ins for the
* controller) is avoided. Updates 'wc', if nonnull, to reflect the fields
* that were used during the lookup.
*
* If 'honor_table_miss' is true, the first lookup occurs in '*table_id', but
* if none is found then the table miss configuration for that table is
* honored, which can result in additional lookups in other OpenFlow tables.
* In this case the function updates '*table_id' to reflect the final OpenFlow
* table that was searched.
*
* If 'honor_table_miss' is false, then only one table lookup occurs, in
* '*table_id'.
*
* The rule is returned in '*rule', which is valid at least until the next
* RCU quiescent period. If the '*rule' needs to stay around longer, the
* caller must take a reference.
*
* 'in_port' allows the lookup to take place as if the in port had the value
* 'in_port'. This is needed for resubmit action support.
*
* 'flow' is non-const to allow for temporary modifications during the lookup.
* Any changes are restored before returning. */
struct rule_dpif *
rule_dpif_lookup_from_table(struct ofproto_dpif *ofproto,
ovs_version_t version, struct flow *flow,
struct flow_wildcards *wc,
const struct dpif_flow_stats *stats,
uint8_t *table_id, ofp_port_t in_port,
bool may_packet_in, bool honor_table_miss)
{
ovs_be16 old_tp_src = flow->tp_src, old_tp_dst = flow->tp_dst;
ofp_port_t old_in_port = flow->in_port.ofp_port;
enum ofputil_table_miss miss_config;
struct rule_dpif *rule;
uint8_t next_id;
/* We always unwildcard nw_frag (for IP), so they
* need not be unwildcarded here. */
if (flow->nw_frag & FLOW_NW_FRAG_ANY
&& ofproto->up.frag_handling != OFPUTIL_FRAG_NX_MATCH) {
if (ofproto->up.frag_handling == OFPUTIL_FRAG_NORMAL) {
/* We must pretend that transport ports are unavailable. */
flow->tp_src = htons(0);
flow->tp_dst = htons(0);
} else {
/* Must be OFPUTIL_FRAG_DROP (we don't have OFPUTIL_FRAG_REASM).
* Use the drop_frags_rule (which cannot disappear). */
rule = ofproto->drop_frags_rule;
if (stats) {
struct oftable *tbl = &ofproto->up.tables[*table_id];
unsigned long orig;
atomic_add_relaxed(&tbl->n_matched, stats->n_packets, &orig);
}
return rule;
}
}
/* Look up a flow with 'in_port' as the input port. Then restore the
* original input port (otherwise OFPP_NORMAL and OFPP_IN_PORT will
* have surprising behavior). */
flow->in_port.ofp_port = in_port;
/* Our current implementation depends on n_tables == N_TABLES, and
* TBL_INTERNAL being the last table. */
BUILD_ASSERT_DECL(N_TABLES == TBL_INTERNAL + 1);
miss_config = OFPUTIL_TABLE_MISS_CONTINUE;
for (next_id = *table_id;
next_id < ofproto->up.n_tables;
next_id++, next_id += (next_id == TBL_INTERNAL))
{
*table_id = next_id;
rule = rule_dpif_lookup_in_table(ofproto, version, next_id, flow, wc);
if (stats) {
struct oftable *tbl = &ofproto->up.tables[next_id];
unsigned long orig;
atomic_add_relaxed(rule ? &tbl->n_matched : &tbl->n_missed,
stats->n_packets, &orig);
}
if (rule) {
goto out; /* Match. */
}
if (honor_table_miss) {
miss_config = ofproto_table_get_miss_config(&ofproto->up,
*table_id);
if (miss_config == OFPUTIL_TABLE_MISS_CONTINUE) {
continue;
}
}
break;
}
/* Miss. */
rule = ofproto->no_packet_in_rule;
if (may_packet_in) {
if (miss_config == OFPUTIL_TABLE_MISS_CONTINUE
|| miss_config == OFPUTIL_TABLE_MISS_CONTROLLER) {
struct ofport_dpif *port;
port = ofp_port_to_ofport(ofproto, old_in_port);
if (!port) {
VLOG_WARN_RL(&rl, "packet-in on unknown OpenFlow port %"PRIu16,
old_in_port);
} else if (!(port->up.pp.config & OFPUTIL_PC_NO_PACKET_IN)) {
rule = ofproto->miss_rule;
}
} else if (miss_config == OFPUTIL_TABLE_MISS_DEFAULT &&
connmgr_wants_packet_in_on_miss(ofproto->up.connmgr)) {
rule = ofproto->miss_rule;
}
}
out:
/* Restore port numbers, as they may have been modified above. */
flow->tp_src = old_tp_src;
flow->tp_dst = old_tp_dst;
/* Restore the old in port. */
flow->in_port.ofp_port = old_in_port;
return rule;
}
rule_dpif_lookup_in_table, for its part, calls classifier_lookup to find the rule in that table's classifier; the details of struct classifier are left for a later analysis.
/* Finds and returns the highest-priority rule in 'cls' that matches 'flow' and
* that is visible in 'version'. Returns a null pointer if no rules in 'cls'
* match 'flow'. If multiple rules of equal priority match 'flow', returns one
* arbitrarily.
*
* If a rule is found and 'wc' is non-null, bitwise-OR's 'wc' with the
* set of bits that were significant in the lookup. At some point
* earlier, 'wc' should have been initialized (e.g., by
* flow_wildcards_init_catchall()).
*
* 'flow' is non-const to allow for temporary modifications during the lookup.
* Any changes are restored before returning. */
const struct cls_rule *
classifier_lookup(const struct classifier *cls, ovs_version_t version,
struct flow *flow, struct flow_wildcards *wc)
{
return classifier_lookup__(cls, version, flow, wc, true);
}
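For reference, rule_dpif_lookup_in_table itself is little more than a wrapper that picks the right table's classifier (paraphrased from ofproto/ofproto-dpif.c):
struct rule_dpif *
rule_dpif_lookup_in_table(struct ofproto_dpif *ofproto, ovs_version_t version,
                          uint8_t table_id, struct flow *flow,
                          struct flow_wildcards *wc)
{
    struct classifier *cls = &ofproto->up.tables[table_id].cls;
    return rule_dpif_cast(rule_from_cls_rule(classifier_lookup(cls, version,
                                                               flow, wc)));
}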
xlate_actions ultimately calls do_xlate_actions, which performs a different operation on the flow for each action type, emitting the corresponding OVS_ACTION_ATTR_* datapath actions along the way.
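Its overall shape is a big switch with one case per OpenFlow action type (heavily abridged sketch; only OFPACT_OUTPUT is shown with its real helper, everything else is collapsed):
static void
do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len,
                 struct xlate_ctx *ctx)
{
    const struct ofpact *a;

    OFPACT_FOR_EACH (a, ofpacts, ofpacts_len) {
        switch (a->type) {
        case OFPACT_OUTPUT:
            /* May eventually emit an OVS_ACTION_ATTR_OUTPUT action. */
            xlate_output_action(ctx, ofpact_get_OUTPUT(a)->port,
                                ofpact_get_OUTPUT(a)->max_len, true);
            break;

        /* Header-rewrite actions (set-field, push/pop VLAN or MPLS, ...)
         * modify ctx->xin->flow; the matching datapath set/push actions are
         * emitted later when the flow changes are committed. */

        default:
            /* Dozens of other OFPACT_* cases omitted. */
            break;
        }
    }
}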
Alright, back to recv_upcalls: the last step is handle_upcalls, which installs the flows into the datapath. handle_upcalls ultimately calls dpif_operate to push the flows down, and dpif_operate dispatches to dpif_class->operate, which, depending on the dpif implementation, is dpif_netdev_operate or dpif_netlink_operate.
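Conceptually, each miss upcall in the batch becomes up to two datapath operations: a DPIF_OP_FLOW_PUT that installs the new flow and a DPIF_OP_EXECUTE that pushes the triggering packet through the generated actions. A condensed sketch of what handle_upcalls() builds for one upcall before calling dpif_operate() (ukey bookkeeping and the megaflow mask are omitted):
    struct dpif_op put_op, exec_op;
    struct dpif_op *opsp[2];

    memset(&put_op, 0, sizeof put_op);
    put_op.type = DPIF_OP_FLOW_PUT;
    put_op.u.flow_put.flags = DPIF_FP_CREATE;
    put_op.u.flow_put.key = upcall->key;              /* netlink flow key */
    put_op.u.flow_put.key_len = upcall->key_len;
    put_op.u.flow_put.ufid = upcall->ufid;
    put_op.u.flow_put.actions = upcall->put_actions.data;
    put_op.u.flow_put.actions_len = upcall->put_actions.size;

    memset(&exec_op, 0, sizeof exec_op);
    exec_op.type = DPIF_OP_EXECUTE;
    exec_op.u.execute.packet = CONST_CAST(struct dp_packet *, upcall->packet);
    exec_op.u.execute.actions = upcall->odp_actions.data;
    exec_op.u.execute.actions_len = upcall->odp_actions.size;

    opsp[0] = &put_op;
    opsp[1] = &exec_op;
    dpif_operate(udpif->dpif, opsp, 2);   /* -> dpif_netdev_operate() or
                                           * dpif_netlink_operate() */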