When a packet fails to match any flow in the kernel flow table, the kernel sends it to userspace over netlink for processing.
In userspace, the udpif_upcall_handler threads receive the upcall messages sent by the kernel.
Creating the udpif_upcall_handler threads:
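The handler threads are spawned when the datapath ports are (re)configured. Below is a minimal sketch of the spawning loop, modeled on udpif_start_threads() in ofproto/ofproto-dpif-upcall.c; the helper names follow the OVS tree but details vary by version:
/* Sketch: spawn one handler thread per configured handler; each thread
 * runs udpif_upcall_handler() with its own handler_id. */
udpif->handlers = xzalloc(udpif->n_handlers * sizeof *udpif->handlers);
for (size_t i = 0; i < udpif->n_handlers; i++) {
    struct handler *handler = &udpif->handlers[i];

    handler->udpif = udpif;
    handler->handler_id = i;
    handler->thread = ovs_thread_create(
        "handler", udpif_upcall_handler, handler);
}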
The upcall reception path through udpif_upcall_handler:
udpif_upcall_handler ---> recv_upcalls ---> process_upcall ---> upcall_xlate ---> xlate_actions ---> do_xlate_actions ---> handle_upcalls
(handle_upcalls is invoked from recv_upcalls on the whole batch once translation completes.)
The udpif_upcall_handler thread loops, receiving upcall requests:
/* The upcall handler thread reads a batch of upcalls from the dpif,
 * processes the batch and installs the corresponding flows, then
 * repeats. */
static void *
udpif_upcall_handler(void *arg)
{
    struct handler *handler = arg;
    struct udpif *udpif = handler->udpif;

    while (!latch_is_set(&handler->udpif->exit_latch)) {
        if (recv_upcalls(handler)) {
            /* We processed upcalls: ask the poll loop to wake up
             * immediately so we can try for another batch. */
            poll_immediate_wake();
        } else {
            /* Nothing received: sleep until the dpif has upcalls for
             * this handler or the thread is told to exit. */
            dpif_recv_wait(udpif->dpif, handler->handler_id);
            latch_wait(&udpif->exit_latch);
        }
        poll_block();
    }

    return NULL;
}
Receiving and processing upcall requests:
static size_t recv_upcalls(struct handler *handler)
This function calls odp_flow_key_to_flow() to recover the flow from the kernel's Netlink key, upcall_receive() to read the upcall information, and flow_extract() to extract the flow information from the packet itself.
static size_t
recv_upcalls(struct handler *handler)
{
struct udpif *udpif = handler->udpif;
uint64_t recv_stubs[UPCALL_MAX_BATCH][512 / 8];
struct ofpbuf recv_bufs[UPCALL_MAX_BATCH];
struct dpif_upcall dupcalls[UPCALL_MAX_BATCH];
struct upcall upcalls[UPCALL_MAX_BATCH];
struct flow flows[UPCALL_MAX_BATCH];
size_t n_upcalls, i;
    n_upcalls = 0;
    /* Read up to UPCALL_MAX_BATCH upcalls from the dpif in one pass. */
    while (n_upcalls < UPCALL_MAX_BATCH) {
struct ofpbuf *recv_buf = &recv_bufs[n_upcalls];
struct dpif_upcall *dupcall = &dupcalls[n_upcalls];
struct upcall *upcall = &upcalls[n_upcalls];
struct flow *flow = &flows[n_upcalls];
unsigned int mru;
int error;
ofpbuf_use_stub(recv_buf, recv_stubs[n_upcalls],
sizeof recv_stubs[n_upcalls]);
if (dpif_recv(udpif->dpif, handler->handler_id, dupcall, recv_buf)) {
ofpbuf_uninit(recv_buf);
break;
}
        /* Convert the Netlink flow key from the kernel into a struct
         * flow. */
        if (odp_flow_key_to_flow(dupcall->key, dupcall->key_len, flow)
            == ODP_FIT_ERROR) {
            goto free_dupcall;
        }
if (dupcall->mru) {
mru = nl_attr_get_u16(dupcall->mru);
} else {
mru = 0;
}
error = upcall_receive(upcall, udpif->backer, &dupcall->packet,
dupcall->type, dupcall->userdata, flow, mru,
&dupcall->ufid, PMD_ID_NULL);
if (error) {
if (error == ENODEV) {
/* Received packet on datapath port for which we couldn't
* associate an ofproto. This can happen if a port is removed
* while traffic is being received. Print a rate-limited
* message in case it happens frequently. */
dpif_flow_put(udpif->dpif, DPIF_FP_CREATE, dupcall->key,
dupcall->key_len, NULL, 0, NULL, 0,
&dupcall->ufid, PMD_ID_NULL, NULL);
VLOG_INFO_RL(&rl, "received packet on unassociated datapath "
"port %"PRIu32, flow->in_port.odp_port);
}
goto free_dupcall;
}
upcall->key = dupcall->key;
upcall->key_len = dupcall->key_len;
upcall->ufid = &dupcall->ufid;
upcall->out_tun_key = dupcall->out_tun_key;
upcall->actions = dupcall->actions;
        /* Initialize the packet metadata from the flow, then dissect
         * the packet itself to refine the flow. */
        pkt_metadata_from_flow(&dupcall->packet.md, flow);
        flow_extract(&dupcall->packet, flow);
error = process_upcall(udpif, upcall,
&upcall->odp_actions, &upcall->wc);
if (error) {
goto cleanup;
}
n_upcalls++;
continue;
cleanup:
upcall_uninit(upcall);
free_dupcall:
dp_packet_uninit(&dupcall->packet);
ofpbuf_uninit(recv_buf);
}
if (n_upcalls) {
handle_upcalls(handler->udpif, upcalls, n_upcalls);
for (i = 0; i < n_upcalls; i++) {
dp_packet_uninit(&dupcalls[i].packet);
ofpbuf_uninit(&recv_bufs[i]);
upcall_uninit(&upcalls[i]);
}
}
return n_upcalls;
}
Next, process_upcall() is called to handle the upcall. The upcall types are:
enum upcall_type {
    BAD_UPCALL,             /* Some kind of bug somewhere. */
    MISS_UPCALL,            /* A flow miss. */
    SFLOW_UPCALL,           /* sFlow sample. */
    FLOW_SAMPLE_UPCALL,     /* Per-flow sampling. */
    IPFIX_UPCALL            /* Per-bridge sampling. */
};
A packet that misses in the kernel datapath is classified as MISS_UPCALL.
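The dispatch at the heart of process_upcall() can be sketched as follows (simplified: the real function also unpacks the sFlow/flow-sample/IPFIX cookies from the Netlink userdata before handing sampled packets to the corresponding subsystem):
/* Sketch: classify_upcall() maps the dpif upcall type plus its Netlink
 * userdata to one of the enum upcall_type values above. */
switch (classify_upcall(upcall->type, upcall->userdata)) {
case MISS_UPCALL:
    /* Flow-table miss: translate the packet through the OpenFlow
     * pipeline (upcall_xlate ---> xlate_actions) to build datapath
     * actions; only this case leads to a flow being installed. */
    upcall_xlate(udpif, upcall, odp_actions, wc);
    return 0;

case SFLOW_UPCALL:
case FLOW_SAMPLE_UPCALL:
case IPFIX_UPCALL:
    /* Sampled packets are passed to the sFlow/IPFIX subsystems; no
     * datapath flow is installed for them. */
    break;

case BAD_UPCALL:
    break;
}
return EAGAIN;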
xlate_actions() then performs the action translation. This function calls rule_dpif_lookup_from_table() to look up a matching rule in the flow table; once a rule is found it calls do_xlate_actions(), which performs a different operation for each kind of action.
enum xlate_error
xlate_actions(struct xlate_in *xin, struct xlate_out *xout)
{
*xout = (struct xlate_out) {
.slow = 0,
.recircs = RECIRC_REFS_EMPTY_INITIALIZER,
};
struct xlate_cfg *xcfg = ovsrcu_get(struct xlate_cfg *, &xcfgp);
struct xbridge *xbridge = xbridge_lookup(xcfg, xin->ofproto);
if (!xbridge) {
return XLATE_BRIDGE_NOT_FOUND;
}
struct flow *flow = &xin->flow;
uint8_t stack_stub[1024];
uint64_t action_set_stub[1024 / 8];
uint64_t frozen_actions_stub[1024 / 8];
uint64_t actions_stub[256 / 8];
struct ofpbuf scratch_actions = OFPBUF_STUB_INITIALIZER(actions_stub);
struct xlate_ctx ctx = {
.xin = xin,
.xout = xout,
.base_flow = *flow,
.orig_tunnel_ipv6_dst = flow_tnl_dst(&flow->tunnel),
.xbridge = xbridge,
.stack = OFPBUF_STUB_INITIALIZER(stack_stub),
.rule = xin->rule,
.wc = (xin->wc
? xin->wc
: &(struct flow_wildcards) { .masks = { .dl_type = 0 } }),
.odp_actions = xin->odp_actions ? xin->odp_actions : &scratch_actions,
.depth = xin->depth,
.resubmits = xin->resubmits,
.in_group = false,
.in_action_set = false,
.table_id = 0,
.rule_cookie = OVS_BE64_MAX,
.orig_skb_priority = flow->skb_priority,
.sflow_n_outputs = 0,
.sflow_odp_port = 0,
.nf_output_iface = NF_OUT_DROP,
.exit = false,
.error = XLATE_OK,
.mirrors = 0,
.freezing = false,
.recirc_update_dp_hash = false,
.frozen_actions = OFPBUF_STUB_INITIALIZER(frozen_actions_stub),
.pause = NULL,
.was_mpls = false,
.conntracked = false,
.ct_nat_action = NULL,
.action_set_has_group = false,
.action_set = OFPBUF_STUB_INITIALIZER(action_set_stub),
};
/* 'base_flow' reflects the packet as it came in, but we need it to reflect
* the packet as the datapath will treat it for output actions. Our
* datapath doesn't retain tunneling information without us re-setting
* it, so clear the tunnel data.
*/
memset(&ctx.base_flow.tunnel, 0, sizeof ctx.base_flow.tunnel);
ofpbuf_reserve(ctx.odp_actions, NL_A_U32_SIZE);
xlate_wc_init(&ctx);
COVERAGE_INC(xlate_actions);
xin->trace = xlate_report(&ctx, OFT_BRIDGE, "bridge(\"%s\")",
xbridge->name);
if (xin->frozen_state) {
const struct frozen_state *state = xin->frozen_state;
struct ovs_list *old_trace = xin->trace;
xin->trace = xlate_report(&ctx, OFT_THAW, "thaw");
if (xin->ofpacts_len > 0 || ctx.rule) {
xlate_report_error(&ctx, "Recirculation conflict (%s)!",
xin->ofpacts_len ? "actions" : "rule");
ctx.error = XLATE_RECIRCULATION_CONFLICT;
goto exit;
}
/* Set the bridge for post-recirculation processing if needed. */
if (!uuid_equals(&ctx.xbridge->ofproto->uuid, &state->ofproto_uuid)) {
struct xlate_cfg *xcfg = ovsrcu_get(struct xlate_cfg *, &xcfgp);
const struct xbridge *new_bridge
= xbridge_lookup_by_uuid(xcfg, &state->ofproto_uuid);
if (OVS_UNLIKELY(!new_bridge)) {
/* Drop the packet if the bridge cannot be found. */
xlate_report_error(&ctx, "Frozen bridge no longer exists.");
ctx.error = XLATE_BRIDGE_NOT_FOUND;
xin->trace = old_trace;
goto exit;
}
ctx.xbridge = new_bridge;
/* The bridge is now known so obtain its table version. */
ctx.xin->tables_version
= ofproto_dpif_get_tables_version(ctx.xbridge->ofproto);
}
/* Set the thawed table id. Note: A table lookup is done only if there
* are no frozen actions. */
ctx.table_id = state->table_id;
xlate_report(&ctx, OFT_THAW,
"Resuming from table %"PRIu8, ctx.table_id);
if (!state->conntracked) {
clear_conntrack(&ctx);
}
/* Restore pipeline metadata. May change flow's in_port and other
* metadata to the values that existed when freezing was triggered. */
frozen_metadata_to_flow(&state->metadata, flow);
/* Restore stack, if any. */
if (state->stack) {
ofpbuf_put(&ctx.stack, state->stack, state->stack_size);
}
/* Restore mirror state. */
ctx.mirrors = state->mirrors;
/* Restore action set, if any. */
if (state->action_set_len) {
xlate_report_actions(&ctx, OFT_THAW, "Restoring action set",
state->action_set, state->action_set_len);
flow->actset_output = OFPP_UNSET;
xlate_write_actions__(&ctx, state->action_set,
state->action_set_len);
}
/* Restore frozen actions. If there are no actions, processing will
* start with a lookup in the table set above. */
xin->ofpacts = state->ofpacts;
xin->ofpacts_len = state->ofpacts_len;
if (state->ofpacts_len) {
xlate_report_actions(&ctx, OFT_THAW, "Restoring actions",
xin->ofpacts, xin->ofpacts_len);
}
xin->trace = old_trace;
} else if (OVS_UNLIKELY(flow->recirc_id)) {
xlate_report_error(&ctx,
"Recirculation context not found for ID %"PRIx32,
flow->recirc_id);
ctx.error = XLATE_NO_RECIRCULATION_CONTEXT;
goto exit;
}
/* Tunnel metadata in udpif format must be normalized before translation. */
if (flow->tunnel.flags & FLOW_TNL_F_UDPIF) {
const struct tun_table *tun_tab = ofproto_get_tun_tab(
&ctx.xbridge->ofproto->up);
int err;
err = tun_metadata_from_geneve_udpif(tun_tab, &xin->upcall_flow->tunnel,
&xin->upcall_flow->tunnel,
&flow->tunnel);
if (err) {
xlate_report_error(&ctx, "Invalid Geneve tunnel metadata");
ctx.error = XLATE_INVALID_TUNNEL_METADATA;
goto exit;
}
} else if (!flow->tunnel.metadata.tab) {
/* If the original flow did not come in on a tunnel, then it won't have
* FLOW_TNL_F_UDPIF set. However, we still need to have a metadata
* table in case we generate tunnel actions. */
flow->tunnel.metadata.tab = ofproto_get_tun_tab(
&ctx.xbridge->ofproto->up);
}
ctx.wc->masks.tunnel.metadata.tab = flow->tunnel.metadata.tab;
if (!xin->ofpacts && !ctx.rule) {
ctx.rule = rule_dpif_lookup_from_table(
ctx.xbridge->ofproto, ctx.xin->tables_version, flow, ctx.wc,
ctx.xin->resubmit_stats, &ctx.table_id,
flow->in_port.ofp_port, true, true, ctx.xin->xcache);
if (ctx.xin->resubmit_stats) {
rule_dpif_credit_stats(ctx.rule, ctx.xin->resubmit_stats);
}
if (ctx.xin->xcache) {
struct xc_entry *entry;
entry = xlate_cache_add_entry(ctx.xin->xcache, XC_RULE);
entry->rule = ctx.rule;
ofproto_rule_ref(&ctx.rule->up);
}
xlate_report_table(&ctx, ctx.rule, ctx.table_id);
}
/* Get the proximate input port of the packet. (If xin->frozen_state,
* flow->in_port is the ultimate input port of the packet.) */
struct xport *in_port = get_ofp_port(xbridge,
ctx.base_flow.in_port.ofp_port);
/* Tunnel stats only for not-thawed packets. */
if (!xin->frozen_state && in_port && in_port->is_tunnel) {
if (ctx.xin->resubmit_stats) {
netdev_vport_inc_rx(in_port->netdev, ctx.xin->resubmit_stats);
if (in_port->bfd) {
bfd_account_rx(in_port->bfd, ctx.xin->resubmit_stats);
}
}
if (ctx.xin->xcache) {
struct xc_entry *entry;
entry = xlate_cache_add_entry(ctx.xin->xcache, XC_NETDEV);
entry->dev.rx = netdev_ref(in_port->netdev);
entry->dev.bfd = bfd_ref(in_port->bfd);
}
}
if (!xin->frozen_state && process_special(&ctx, in_port)) {
/* process_special() did all the processing for this packet.
*
* We do not perform special processing on thawed packets, since that
* was done before they were frozen and should not be redone. */
} else if (in_port && in_port->xbundle
&& xbundle_mirror_out(xbridge, in_port->xbundle)) {
xlate_report_error(&ctx, "dropping packet received on port "
"%s, which is reserved exclusively for mirroring",
in_port->xbundle->name);
} else {
/* Sampling is done on initial reception; don't redo after thawing. */
unsigned int user_cookie_offset = 0;
if (!xin->frozen_state) {
user_cookie_offset = compose_sflow_action(&ctx);
compose_ipfix_action(&ctx, ODPP_NONE);
}
size_t sample_actions_len = ctx.odp_actions->size;
if (tnl_process_ecn(flow)
&& (!in_port || may_receive(in_port, &ctx))) {
const struct ofpact *ofpacts;
size_t ofpacts_len;
if (xin->ofpacts) {
ofpacts = xin->ofpacts;
ofpacts_len = xin->ofpacts_len;
} else if (ctx.rule) {
const struct rule_actions *actions
= rule_get_actions(&ctx.rule->up);
ofpacts = actions->ofpacts;
ofpacts_len = actions->ofpacts_len;
ctx.rule_cookie = ctx.rule->up.flow_cookie;
} else {
OVS_NOT_REACHED();
}
mirror_ingress_packet(&ctx);
do_xlate_actions(ofpacts, ofpacts_len, &ctx);
if (ctx.error) {
goto exit;
}
/* We've let OFPP_NORMAL and the learning action look at the
* packet, so cancel all actions and freezing if forwarding is
* disabled. */
if (in_port && (!xport_stp_forward_state(in_port) ||
!xport_rstp_forward_state(in_port))) {
ctx.odp_actions->size = sample_actions_len;
ctx_cancel_freeze(&ctx);
ofpbuf_clear(&ctx.action_set);
}
if (!ctx.freezing) {
xlate_action_set(&ctx);
}
if (ctx.freezing) {
finish_freezing(&ctx);
}
}
/* Output only fully processed packets. */
if (!ctx.freezing
&& xbridge->has_in_band
&& in_band_must_output_to_local_port(flow)
&& !actions_output_to_local_port(&ctx)) {
compose_output_action(&ctx, OFPP_LOCAL, NULL);
}
if (user_cookie_offset) {
fix_sflow_action(&ctx, user_cookie_offset);
}
}
if (nl_attr_oversized(ctx.odp_actions->size)) {
/* These datapath actions are too big for a Netlink attribute, so we
* can't hand them to the kernel directly. dpif_execute() can execute
* them one by one with help, so just mark the result as SLOW_ACTION to
* prevent the flow from being installed. */
COVERAGE_INC(xlate_actions_oversize);
ctx.xout->slow |= SLOW_ACTION;
} else if (too_many_output_actions(ctx.odp_actions)) {
COVERAGE_INC(xlate_actions_too_many_output);
ctx.xout->slow |= SLOW_ACTION;
}
/* Do netflow only for packets on initial reception, that are not sent to
* the controller. We consider packets sent to the controller to be part
* of the control plane rather than the data plane. */
if (!xin->frozen_state
&& xbridge->netflow
&& !(xout->slow & SLOW_CONTROLLER)) {
if (ctx.xin->resubmit_stats) {
netflow_flow_update(xbridge->netflow, flow,
ctx.nf_output_iface,
ctx.xin->resubmit_stats);
}
if (ctx.xin->xcache) {
struct xc_entry *entry;
entry = xlate_cache_add_entry(ctx.xin->xcache, XC_NETFLOW);
entry->nf.netflow = netflow_ref(xbridge->netflow);
entry->nf.flow = xmemdup(flow, sizeof *flow);
entry->nf.iface = ctx.nf_output_iface;
}
}
/* Translate tunnel metadata masks to udpif format if necessary. */
if (xin->upcall_flow->tunnel.flags & FLOW_TNL_F_UDPIF) {
if (ctx.wc->masks.tunnel.metadata.present.map) {
const struct flow_tnl *upcall_tnl = &xin->upcall_flow->tunnel;
struct geneve_opt opts[TLV_TOT_OPT_SIZE /
sizeof(struct geneve_opt)];
tun_metadata_to_geneve_udpif_mask(&flow->tunnel,
&ctx.wc->masks.tunnel,
upcall_tnl->metadata.opts.gnv,
upcall_tnl->metadata.present.len,
opts);
memset(&ctx.wc->masks.tunnel.metadata, 0,
sizeof ctx.wc->masks.tunnel.metadata);
memcpy(&ctx.wc->masks.tunnel.metadata.opts.gnv, opts,
upcall_tnl->metadata.present.len);
}
ctx.wc->masks.tunnel.metadata.present.len = 0xff;
ctx.wc->masks.tunnel.metadata.tab = NULL;
ctx.wc->masks.tunnel.flags |= FLOW_TNL_F_UDPIF;
} else if (!xin->upcall_flow->tunnel.metadata.tab) {
/* If we didn't have options in UDPIF format and didn't have an existing
* metadata table, then it means that there were no options at all when
* we started processing and any wildcards we picked up were from
* action generation. Without options on the incoming packet, wildcards
* aren't meaningful. To avoid them possibly getting misinterpreted,
* just clear everything. */
if (ctx.wc->masks.tunnel.metadata.present.map) {
memset(&ctx.wc->masks.tunnel.metadata, 0,
sizeof ctx.wc->masks.tunnel.metadata);
} else {
ctx.wc->masks.tunnel.metadata.tab = NULL;
}
}
xlate_wc_finish(&ctx);
exit:
/* Reset the table to what it was when we came in. If we only fetched
* it locally, then it has no meaning outside of flow translation. */
flow->tunnel.metadata.tab = xin->upcall_flow->tunnel.metadata.tab;
ofpbuf_uninit(&ctx.stack);
ofpbuf_uninit(&ctx.action_set);
ofpbuf_uninit(&ctx.frozen_actions);
ofpbuf_uninit(&scratch_actions);
/* Make sure we return a "drop flow" in case of an error. */
if (ctx.error) {
xout->slow = 0;
if (xin->odp_actions) {
ofpbuf_clear(xin->odp_actions);
}
}
return ctx.error;
}
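do_xlate_actions() itself is a long switch over the rule's OpenFlow action list. A minimal sketch of its dispatch loop, showing only two of the many OFPACT_* cases (the helper signatures follow the same OVS tree as the listings above):
/* Sketch: walk the ofpact list and translate each OpenFlow action
 * into datapath actions appended to ctx->odp_actions. */
const struct ofpact *a;

OFPACT_FOR_EACH (a, ofpacts, ofpacts_len) {
    if (ctx->error) {
        break;
    }
    switch (a->type) {
    case OFPACT_OUTPUT:
        /* Compose a datapath output action for the OpenFlow port. */
        xlate_output_action(ctx, ofpact_get_OUTPUT(a)->port,
                            ofpact_get_OUTPUT(a)->max_len, true);
        break;

    case OFPACT_RESUBMIT:
        /* Re-run the flow-table lookup, possibly in another table. */
        xlate_ofpact_resubmit(ctx, ofpact_get_RESUBMIT(a));
        break;

    default:
        /* ...dozens of other OFPACT_* cases in the real function... */
        break;
    }
}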
handle_upcalls() pushes the flow rules down into the kernel datapath. It batches the flow-put and packet-execute operations and hands them to dpif_operate(), which for the kernel datapath calls dpif_netlink_operate() to modify the datapath flow table.
static void
handle_upcalls(struct udpif *udpif, struct upcall *upcalls,
size_t n_upcalls)
{
struct dpif_op *opsp[UPCALL_MAX_BATCH * 2];
struct ukey_op ops[UPCALL_MAX_BATCH * 2];
size_t n_ops, n_opsp, i;
/* Handle the packets individually in order of arrival.
*
* - For SLOW_CFM, SLOW_LACP, SLOW_STP, and SLOW_BFD, translation is what
* processes received packets for these protocols.
*
* - For SLOW_CONTROLLER, translation sends the packet to the OpenFlow
* controller.
*
* The loop fills 'ops' with an array of operations to execute in the
* datapath. */
n_ops = 0;
for (i = 0; i < n_upcalls; i++) {
struct upcall *upcall = &upcalls[i];
const struct dp_packet *packet = upcall->packet;
struct ukey_op *op;
        /* Install a datapath flow for this upcall if it is eligible
         * for caching in the kernel. */
        if (should_install_flow(udpif, upcall)) {
struct udpif_key *ukey = upcall->ukey;
if (ukey_install(udpif, ukey)) {
upcall->ukey_persists = true;
put_op_init(&ops[n_ops++], ukey, DPIF_FP_CREATE);
}
}
        /* If translation produced actions, also execute them on this
         * packet right away. */
        if (upcall->odp_actions.size) {
op = &ops[n_ops++];
op->ukey = NULL;
op->dop.type = DPIF_OP_EXECUTE;
op->dop.u.execute.packet = CONST_CAST(struct dp_packet *, packet);
op->dop.u.execute.flow = upcall->flow;
odp_key_to_pkt_metadata(upcall->key, upcall->key_len,
&op->dop.u.execute.packet->md);
op->dop.u.execute.actions = upcall->odp_actions.data;
op->dop.u.execute.actions_len = upcall->odp_actions.size;
op->dop.u.execute.needs_help = (upcall->xout.slow & SLOW_ACTION) != 0;
op->dop.u.execute.probe = false;
op->dop.u.execute.mtu = upcall->mru;
}
}
/* Execute batch. */
n_opsp = 0;
for (i = 0; i < n_ops; i++) {
opsp[n_opsp++] = &ops[i].dop;
}
dpif_operate(udpif->dpif, opsp, n_opsp);
for (i = 0; i < n_ops; i++) {
struct udpif_key *ukey = ops[i].ukey;
if (ukey) {
ovs_mutex_lock(&ukey->mutex);
if (ops[i].dop.error) {
transition_ukey(ukey, UKEY_EVICTED);
} else {
transition_ukey(ukey, UKEY_OPERATIONAL);
}
ovs_mutex_unlock(&ukey->mutex);
}
}
}
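For completeness, put_op_init() prepares the DPIF_OP_FLOW_PUT operation from the ukey that caches the flow's key, mask, and actions. A sketch along the lines of the same OVS tree (exact fields vary by version):
/* Sketch: initialize a flow-put datapath operation from a ukey. */
static void
put_op_init(struct ukey_op *op, struct udpif_key *ukey,
            enum dpif_flow_put_flags flags)
{
    op->ukey = ukey;
    op->dop.type = DPIF_OP_FLOW_PUT;
    op->dop.u.flow_put.flags = flags;
    op->dop.u.flow_put.key = ukey->key;
    op->dop.u.flow_put.key_len = ukey->key_len;
    op->dop.u.flow_put.mask = ukey->mask;
    op->dop.u.flow_put.mask_len = ukey->mask_len;
    op->dop.u.flow_put.ufid = &ukey->ufid;
    op->dop.u.flow_put.pmd_id = ukey->pmd_id;
    op->dop.u.flow_put.stats = NULL;
    /* The cached actions are read under RCU via ukey_get_actions(). */
    ukey_get_actions(ukey, &op->dop.u.flow_put.actions,
                     &op->dop.u.flow_put.actions_len);
}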