I. Exact-Match Flow Generation for the First Packet
When the first packet of a flow reaches the OVS switch, no flow rule based on the destination MAC has been installed yet, so the packet is sent up to userspace via an upcall for learning; the rule generated at this point floods the packet to the other ports.
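The fast-path/slow-path split this describes can be pictured with a small standalone sketch. Everything below (toy_flow, toy_upcall, toy_receive) is invented for illustration and is not the OVS datapath API: a miss in an exact-match cache invokes a slow-path handler, and the computed actions are installed so that later packets of the same flow hit the cache directly.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CACHE_SLOTS 8

/* Toy exact-match flow cache keyed by destination MAC, standing in for the
 * datapath's flow table.  A miss triggers an "upcall" to a slow-path handler
 * that computes the actions and installs them for later packets. */
struct toy_flow {
    bool    valid;
    uint8_t dst_mac[6];
    char    actions[64];
};

static struct toy_flow cache[CACHE_SLOTS];

/* Slow path: with nothing learned yet, the only safe action is to flood. */
static const char *toy_upcall(const uint8_t dst_mac[6])
{
    (void) dst_mac;
    return "flood";
}

static const char *toy_receive(const uint8_t dst_mac[6])
{
    for (int i = 0; i < CACHE_SLOTS; i++) {
        if (cache[i].valid && !memcmp(cache[i].dst_mac, dst_mac, 6)) {
            return cache[i].actions;          /* fast path: cache hit */
        }
    }
    /* Cache miss: consult the slow path, then install the result. */
    const char *actions = toy_upcall(dst_mac);
    for (int i = 0; i < CACHE_SLOTS; i++) {
        if (!cache[i].valid) {
            cache[i].valid = true;
            memcpy(cache[i].dst_mac, dst_mac, 6);
            snprintf(cache[i].actions, sizeof cache[i].actions, "%s", actions);
            break;
        }
    }
    return actions;
}

int main(void)
{
    const uint8_t mac[6] = { 0x52, 0x54, 0x00, 0x12, 0x34, 0x56 };
    printf("first packet:  %s\n", toy_receive(mac));  /* miss -> upcall */
    printf("second packet: %s\n", toy_receive(mac));  /* hit */
    return 0;
}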
1. The upcall_xlate function
upcall->dump_seq = seq_read(udpif->dump_seq);
upcall->reval_seq = seq_read(udpif->reval_seq);
xlate_actions(&xin, &upcall->xout);
upcall->xout_initialized = true;
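The two seq_read() calls snapshot OVS "seq" objects: counters that are bumped whenever the state they guard (here, the flow dump cycle and the OpenFlow tables) changes, so a later comparison reveals staleness. A minimal single-threaded sketch of the idea, with a made-up toy_seq in place of the real lib/seq API (which is thread-safe and supports blocking waits):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for OVS's struct seq: a counter that is bumped whenever
 * the thing it guards (e.g. the OpenFlow tables) changes. */
struct toy_seq {
    uint64_t value;
};

static uint64_t toy_seq_read(const struct toy_seq *seq)  { return seq->value; }
static void     toy_seq_change(struct toy_seq *seq)      { seq->value++; }

int main(void)
{
    struct toy_seq reval_seq = { .value = 1 };

    /* Snapshot at translation time, as upcall_xlate() does. */
    uint64_t snapshot = toy_seq_read(&reval_seq);

    /* Later, a flow-table change bumps the sequence... */
    toy_seq_change(&reval_seq);

    /* ...so a revalidator can tell the cached translation is stale. */
    bool stale = toy_seq_read(&reval_seq) != snapshot;
    printf("translation stale: %s\n", stale ? "yes" : "no");
    return 0;
}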
2. The xlate_actions function

    /* The bridge is now known so obtain its table version. */
    ctx.tables_version = ofproto_dpif_get_tables_version(ctx.xbridge->ofproto);
    if (!xin->ofpacts && !ctx.rule) {
        ctx.rule = rule_dpif_lookup_from_table(  // look up the OpenFlow tables
            ctx.xbridge->ofproto, ctx.tables_version, flow, xin->wc,
            ctx.xin->resubmit_stats, &ctx.table_id,
            flow->in_port.ofp_port, true, true);
        if (ctx.xin->resubmit_stats) {
            rule_dpif_credit_stats(ctx.rule, ctx.xin->resubmit_stats);
        }
        if (ctx.xin->xcache) {
            struct xc_entry *entry;

            entry = xlate_cache_add_entry(ctx.xin->xcache, XC_RULE);
            entry->u.rule = ctx.rule;
            rule_dpif_ref(ctx.rule);
        }
        if (OVS_UNLIKELY(ctx.xin->resubmit_hook)) {
            ctx.xin->resubmit_hook(ctx.xin, ctx.rule, 0);
        }
    }
    xout->fail_open = ctx.rule && rule_dpif_is_fail_open(ctx.rule);

    /* Get the proximate input port of the packet. (If xin->recirc,
     * flow->in_port is the ultimate input port of the packet.) */
    struct xport *in_port = get_ofp_port(xbridge,
                                         ctx.base_flow.in_port.ofp_port);

    /* Tunnel stats only for non-recirculated packets. */
    if (!xin->recirc && in_port && in_port->is_tunnel) {
        if (ctx.xin->resubmit_stats) {
            netdev_vport_inc_rx(in_port->netdev, ctx.xin->resubmit_stats);
            if (in_port->bfd) {
                bfd_account_rx(in_port->bfd, ctx.xin->resubmit_stats);
            }
        }
        if (ctx.xin->xcache) {
            struct xc_entry *entry;

            entry = xlate_cache_add_entry(ctx.xin->xcache, XC_NETDEV);
            entry->u.dev.rx = netdev_ref(in_port->netdev);
            entry->u.dev.bfd = bfd_ref(in_port->bfd);
        }
    }
    if (!xin->recirc && process_special(&ctx, in_port)) {
        /* process_special() did all the processing for this packet.
         *
         * We do not perform special processing on recirculated packets, as
         * recirculated packets are not really received by the bridge. */
    } else if (in_port && in_port->xbundle
               && xbundle_mirror_out(xbridge, in_port->xbundle)) {
        if (ctx.xin->packet != NULL) {
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
            VLOG_WARN_RL(&rl, "bridge %s: dropping packet received on port "
                         "%s, which is reserved exclusively for mirroring",
                         ctx.xbridge->name, in_port->xbundle->name);
        }
    } else {
        /* Sampling is done only for packets really received by the bridge. */
        unsigned int user_cookie_offset = 0;

        if (!xin->recirc) {
            user_cookie_offset = compose_sflow_action(&ctx);
            compose_ipfix_action(&ctx, ODPP_NONE);
        }
        size_t sample_actions_len = ctx.odp_actions->size;

        if (tnl_process_ecn(flow)
            && (!in_port || may_receive(in_port, &ctx))) {
            const struct ofpact *ofpacts;
            size_t ofpacts_len;

            if (xin->ofpacts) {
                ofpacts = xin->ofpacts;
                ofpacts_len = xin->ofpacts_len;
            } else if (ctx.rule) {
                const struct rule_actions *actions
                    = rule_dpif_get_actions(ctx.rule);
                ofpacts = actions->ofpacts;
                ofpacts_len = actions->ofpacts_len;
                ctx.rule_cookie = rule_dpif_get_flow_cookie(ctx.rule);
            } else {
                OVS_NOT_REACHED();
            }

            mirror_ingress_packet(&ctx);
            do_xlate_actions(ofpacts, ofpacts_len, &ctx);  // translate the OpenFlow actions into exact-match datapath actions
            if (ctx.error) {
                goto exit;
            }

            /* We've let OFPP_NORMAL and the learning action look at the
             * packet, so drop it now if forwarding is disabled. */
            if (in_port && (!xport_stp_forward_state(in_port) ||
                            !xport_rstp_forward_state(in_port))) {
                /* Drop all actions added by do_xlate_actions() above. */
                ctx.odp_actions->size = sample_actions_len;

                /* Undo changes that may have been done for recirculation. */
                if (exit_recirculates(&ctx)) {
                    ctx.action_set.size = ctx.recirc_action_offset;
                    ctx.recirc_action_offset = -1;
                    ctx.last_unroll_offset = -1;
                }
            } else if (ctx.action_set.size) {
                /* Translate action set only if not dropping the packet and
                 * not recirculating. */
                if (!exit_recirculates(&ctx)) {
                    xlate_action_set(&ctx);
                }
            }

            /* Check if need to recirculate. */
            if (exit_recirculates(&ctx)) {
                compose_recirculate_action(&ctx);
            }
        }
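Two details above are worth isolating: sample_actions_len checkpoints the datapath action buffer before action translation, and the STP/RSTP-blocked branch rolls everything back simply by resetting the buffer's size. A minimal sketch of that checkpoint/rollback idea, with a made-up toy_actbuf in place of OVS's ofpbuf:

#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Toy stand-in for OVS's ofpbuf used as an action list: appending is just
 * a size bump, so "undo everything since a checkpoint" is a size reset. */
struct toy_actbuf {
    char   data[256];
    size_t size;
};

static void toy_append(struct toy_actbuf *b, const char *act)
{
    size_t len = strlen(act) + 1;           /* include NUL as a separator */
    memcpy(b->data + b->size, act, len);
    b->size += len;
}

int main(void)
{
    struct toy_actbuf buf = { .size = 0 };

    toy_append(&buf, "sample(sflow)");      /* sampling actions */
    size_t checkpoint = buf.size;           /* like sample_actions_len */

    toy_append(&buf, "output:2");           /* added by action translation */
    toy_append(&buf, "output:3");

    /* Port turns out to be STP-blocked: drop everything after sampling. */
    buf.size = checkpoint;
    printf("bytes kept after rollback: %zu\n", buf.size);
    return 0;
}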
3. The do_xlate_actions function
    switch (a->type) {
    case OFPACT_OUTPUT:
        xlate_output_action(ctx, ofpact_get_OUTPUT(a)->port,
                            ofpact_get_OUTPUT(a)->max_len, true);
        break;

For an output action, xlate_output_action() in turn dispatches on the port number:

    switch (port) {
    case OFPP_IN_PORT:
        compose_output_action(ctx, ctx->xin->flow.in_port.ofp_port, NULL);
        break;
    case OFPP_TABLE:
        xlate_table_action(ctx, ctx->xin->flow.in_port.ofp_port,
                           0, may_packet_in, true);
        break;
    case OFPP_NORMAL:
        xlate_normal(ctx);  // the NORMAL rule: L2 switching with MAC learning
        break;
    case OFPP_FLOOD:
        flood_packets(ctx, false);
        break;
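As a compact picture of this dispatch loop, here is a toy action interpreter. The toy_ofpact names are invented for illustration; the real action list lives in ofp-actions.h, and the real translation emits binary datapath actions rather than text:

#include <stdio.h>

/* Toy version of the do_xlate_actions() dispatch: walk a list of OpenFlow
 * actions and emit a datapath action for each. */
enum toy_ofpact_type { TOY_OFPACT_OUTPUT, TOY_OFPACT_SET_VLAN, TOY_OFPACT_DROP };

struct toy_ofpact {
    enum toy_ofpact_type type;
    int                  arg;   /* port number or VLAN id */
};

static void toy_xlate_actions(const struct toy_ofpact *acts, size_t n)
{
    for (size_t i = 0; i < n; i++) {
        switch (acts[i].type) {
        case TOY_OFPACT_OUTPUT:
            printf("datapath action: output:%d\n", acts[i].arg);
            break;
        case TOY_OFPACT_SET_VLAN:
            printf("datapath action: push_vlan(vid=%d)\n", acts[i].arg);
            break;
        case TOY_OFPACT_DROP:
            /* Dropping is simply emitting no output action. */
            break;
        }
    }
}

int main(void)
{
    const struct toy_ofpact acts[] = {
        { TOY_OFPACT_SET_VLAN, 100 },
        { TOY_OFPACT_OUTPUT,   2   },
    };
    toy_xlate_actions(acts, sizeof acts / sizeof acts[0]);
    return 0;
}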
4. The xlate_normal function

    } else {
        ovs_rwlock_rdlock(&ctx->xbridge->ml->rwlock);
        mac = mac_learning_lookup(ctx->xbridge->ml, flow->dl_dst, vlan);
        mac_port = mac ? mac_entry_get_port(ctx->xbridge->ml, mac) : NULL;
        ovs_rwlock_unlock(&ctx->xbridge->ml->rwlock);

        if (mac_port) {
            struct xlate_cfg *xcfg = ovsrcu_get(struct xlate_cfg *, &xcfgp);
            struct xbundle *mac_xbundle = xbundle_lookup(xcfg, mac_port);
            if (mac_xbundle && mac_xbundle != in_xbundle) {
                xlate_report(ctx, "forwarding to learned port");
                output_normal(ctx, mac_xbundle, vlan);  // destination MAC matched, so the output port is determined
            } else if (!mac_xbundle) {
                xlate_report(ctx, "learned port is unknown, dropping");
            } else {
                xlate_report(ctx, "learned port is input port, dropping");
            }
        } else {
            xlate_report(ctx, "no learned MAC for destination, flooding");
            xlate_normal_flood(ctx, in_xbundle, vlan);  // first packet: MAC not learned yet, flood to the other ports
        }
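This flood-then-learn behaviour is the heart of the mechanism. A self-contained sketch (toy_mac_lookup/toy_mac_learn are stand-ins for OVS's mac_learning_lookup() and friends; all toy_* names are invented) shows why the first packet floods while the reply and every later packet go out a single learned port:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TOY_PORTS 4
#define TOY_SLOTS 16

/* Toy MAC-learning table standing in for ctx->xbridge->ml. */
struct toy_mac_entry {
    uint8_t mac[6];
    int     port;               /* -1 marks an empty slot */
};

static struct toy_mac_entry ml[TOY_SLOTS];

static int toy_mac_lookup(const uint8_t mac[6])
{
    for (int i = 0; i < TOY_SLOTS; i++) {
        if (ml[i].port >= 0 && !memcmp(ml[i].mac, mac, 6)) {
            return ml[i].port;
        }
    }
    return -1;
}

static void toy_mac_learn(const uint8_t mac[6], int port)
{
    for (int i = 0; i < TOY_SLOTS; i++) {
        if (ml[i].port < 0 || !memcmp(ml[i].mac, mac, 6)) {
            memcpy(ml[i].mac, mac, 6);
            ml[i].port = port;
            return;
        }
    }
}

/* Toy xlate_normal(): learn the source, then unicast if the destination
 * is known, otherwise flood to every port except the input port. */
static void toy_normal(const uint8_t src[6], const uint8_t dst[6], int in_port)
{
    toy_mac_learn(src, in_port);

    int out = toy_mac_lookup(dst);
    if (out >= 0 && out != in_port) {
        printf("forwarding to learned port %d\n", out);
    } else if (out == in_port) {
        printf("learned port is input port, dropping\n");
    } else {
        printf("no learned MAC for destination, flooding:");
        for (int p = 0; p < TOY_PORTS; p++) {
            if (p != in_port) {
                printf(" output:%d", p);
            }
        }
        printf("\n");
    }
}

int main(void)
{
    for (int i = 0; i < TOY_SLOTS; i++) {
        ml[i].port = -1;
    }
    const uint8_t host_a[6] = { 0x02, 0, 0, 0, 0, 0xaa };
    const uint8_t host_b[6] = { 0x02, 0, 0, 0, 0, 0xbb };

    toy_normal(host_a, host_b, 1);   /* first packet: B unknown -> flood  */
    toy_normal(host_b, host_a, 2);   /* reply: A was learned -> unicast 1 */
    toy_normal(host_a, host_b, 1);   /* now B is known too -> unicast 2   */
    return 0;
}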
II. Flow Refresh by the Revalidator

Once the reply to the first packet reaches the switch, the flow installed for the original direction can be refreshed, because the port behind the destination MAC is now known. OVS performs this refresh in its revalidator threads.
1. The revalidate function
static void
revalidate(struct revalidator *revalidator)
{
    uint64_t odp_actions_stub[1024 / 8];
    struct ofpbuf odp_actions = OFPBUF_STUB_INITIALIZER(odp_actions_stub);
    struct udpif *udpif = revalidator->udpif;
    struct dpif_flow_dump_thread *dump_thread;
    uint64_t dump_seq, reval_seq;
    unsigned int flow_limit;

    dump_seq = seq_read(udpif->dump_seq);
    reval_seq = seq_read(udpif->reval_seq);
    atomic_read_relaxed(&udpif->flow_limit, &flow_limit);
    dump_thread = dpif_flow_dump_thread_create(udpif->dump);
    for (;;) {
        struct ukey_op ops[REVALIDATE_MAX_BATCH];
        int n_ops = 0;

        struct dpif_flow flows[REVALIDATE_MAX_BATCH];
        const struct dpif_flow *f;
        int n_dumped;

        long long int max_idle;
        long long int now;
        size_t n_dp_flows;
        bool kill_them_all;

        n_dumped = dpif_flow_dump_next(dump_thread, flows, ARRAY_SIZE(flows));  // dump a batch of flows from the datapath
        if (!n_dumped) {
            break;
        }

        now = time_msec();

        /* In normal operation we want to keep flows around until they have
         * been idle for 'ofproto_max_idle' milliseconds.  However:
         *
         *     - If the number of datapath flows climbs above 'flow_limit',
         *       drop that down to 100 ms to try to bring the flows down to
         *       the limit.
         *
         *     - If the number of datapath flows climbs above twice
         *       'flow_limit', delete all the datapath flows as an emergency
         *       measure.  (We reassess this condition for the next batch of
         *       datapath flows, so we will recover before all the flows are
         *       gone.) */
        n_dp_flows = udpif_get_n_flows(udpif);
        kill_them_all = n_dp_flows > flow_limit * 2;
        max_idle = n_dp_flows > flow_limit ? 100 : ofproto_max_idle;

        for (f = flows; f < &flows[n_dumped]; f++) {
            long long int used = f->stats.used;
            struct recirc_refs recircs = RECIRC_REFS_EMPTY_INITIALIZER;
            enum reval_result result;
            struct udpif_key *ukey;
            bool already_dumped;
            int error;

            if (ukey_acquire(udpif, f, &ukey, &error)) {
                if (error == EBUSY) {
                    /* Another thread is processing this flow, so don't bother
                     * processing it. */
                    COVERAGE_INC(upcall_ukey_contention);
                } else {
                    log_unexpected_flow(f, error);
                    if (error != ENOENT) {
                        delete_op_init__(udpif, &ops[n_ops++], f);
                    }
                }
                continue;
            }

            already_dumped = ukey->dump_seq == dump_seq;
            if (already_dumped) {
                /* The flow has already been handled during this flow dump
                 * operation. Skip it. */
                if (ukey->xcache) {
                    COVERAGE_INC(dumped_duplicate_flow);
                } else {
                    COVERAGE_INC(dumped_new_flow);
                }
                ovs_mutex_unlock(&ukey->mutex);
                continue;
            }

            if (!used) {
                used = ukey->created;
            }
            if (kill_them_all || (used && used < now - max_idle)) {
                result = UKEY_DELETE;
            } else {
                result = revalidate_ukey(udpif, ukey, &f->stats, &odp_actions,  // re-translate and check whether the flow has changed
                                         reval_seq, &recircs);
            }
            ukey->dump_seq = dump_seq;
            ukey->flow_exists = result != UKEY_DELETE;

            if (result != UKEY_KEEP) {
                /* Takes ownership of 'recircs'. */
                reval_op_init(&ops[n_ops++], result, udpif, ukey, &recircs,
                              &odp_actions);
            }
            ovs_mutex_unlock(&ukey->mutex);
        }

        if (n_ops) {
            push_ukey_ops__(udpif, ops, n_ops);  // push any accumulated modify/delete operations to the datapath
        }
        ovsrcu_quiesce();
    }
    dpif_flow_dump_thread_destroy(dump_thread);
    ofpbuf_uninit(&odp_actions);
}
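The max_idle/kill_them_all policy above is easy to tabulate. A small sketch, assuming the default ofproto_max_idle of 10000 ms (configurable in OVS via other_config:max-idle); the toy_* helpers are invented for illustration:

#include <stdbool.h>
#include <stdio.h>

/* Toy version of the expiry policy in revalidate(): the idle timeout
 * shrinks under pressure, and everything is deleted past 2x the limit. */
static long long toy_max_idle(size_t n_dp_flows, size_t flow_limit)
{
    return n_dp_flows > flow_limit ? 100 : 10000;
}

static bool toy_kill_them_all(size_t n_dp_flows, size_t flow_limit)
{
    return n_dp_flows > flow_limit * 2;
}

int main(void)
{
    size_t flow_limit = 1000;
    size_t loads[] = { 500, 1500, 2500 };

    for (int i = 0; i < 3; i++) {
        printf("%zu flows: max_idle=%lld ms, kill_all=%s\n",
               loads[i], toy_max_idle(loads[i], flow_limit),
               toy_kill_them_all(loads[i], flow_limit) ? "yes" : "no");
    }
    return 0;
}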
2. The revalidate_ukey function

static enum reval_result
revalidate_ukey(struct udpif *udpif, struct udpif_key *ukey,
                const struct dpif_flow_stats *stats,
                struct ofpbuf *odp_actions, uint64_t reval_seq,
                struct recirc_refs *recircs)
    OVS_REQUIRES(ukey->mutex)
{
    struct xlate_out xout, *xoutp;
    struct netflow *netflow;
    struct ofproto_dpif *ofproto;
    struct dpif_flow_stats push;
    struct flow flow;
    struct flow_wildcards dp_mask, wc;
    enum reval_result result;
    ofp_port_t ofp_in_port;
    struct xlate_in xin;
    long long int last_used;
    int error;
    bool need_revalidate;

    result = UKEY_DELETE;
    xoutp = NULL;
    netflow = NULL;

    ofpbuf_clear(odp_actions);
    need_revalidate = (ukey->reval_seq != reval_seq);
    last_used = ukey->stats.used;
    push.used = stats->used;
    push.tcp_flags = stats->tcp_flags;
    push.n_packets = (stats->n_packets > ukey->stats.n_packets
                      ? stats->n_packets - ukey->stats.n_packets
                      : 0);
    push.n_bytes = (stats->n_bytes > ukey->stats.n_bytes
                    ? stats->n_bytes - ukey->stats.n_bytes
                    : 0);

    if (need_revalidate && last_used
        && !should_revalidate(udpif, push.n_packets, last_used)) {
        goto exit;
    }

    /* We will push the stats, so update the ukey stats cache. */
    ukey->stats = *stats;
    if (!push.n_packets && !need_revalidate) {
        result = UKEY_KEEP;
        goto exit;
    }

    if (ukey->xcache && !need_revalidate) {
        xlate_push_stats(ukey->xcache, &push);
        result = UKEY_KEEP;
        goto exit;
    }

    if (odp_flow_key_to_flow(ukey->key, ukey->key_len, &flow)
        == ODP_FIT_ERROR) {
        goto exit;
    }

    error = xlate_lookup(udpif->backer, &flow, &ofproto, NULL, NULL, &netflow,
                         &ofp_in_port);
    if (error) {
        goto exit;
    }

    if (need_revalidate) {
        xlate_cache_clear(ukey->xcache);
    }
    if (!ukey->xcache) {
        ukey->xcache = xlate_cache_new();
    }

    xlate_in_init(&xin, ofproto, &flow, ofp_in_port, NULL, push.tcp_flags,
                  NULL, need_revalidate ? &wc : NULL, odp_actions);
    if (push.n_packets) {
        xin.resubmit_stats = &push;
        xin.may_learn = true;
    }
    xin.xcache = ukey->xcache;
    xlate_actions(&xin, &xout);  // re-translate; the destination MAC's port is now known, so the generated actions differ
    xoutp = &xout;

    if (!need_revalidate) {
        result = UKEY_KEEP;
        goto exit;
    }

    if (xout.slow) {
        ofpbuf_clear(odp_actions);
        compose_slow_path(udpif, &xout, &flow, flow.in_port.odp_port,
                          odp_actions);
    }

    if (odp_flow_key_to_mask(ukey->mask, ukey->mask_len, ukey->key,
                             ukey->key_len, &dp_mask, &flow)
        == ODP_FIT_ERROR) {
        goto exit;
    }

    /* Do not modify if any bit is wildcarded by the installed datapath flow,
     * but not the newly revalidated wildcard mask (wc), i.e., if revalidation
     * tells that the datapath flow is now too generic and must be narrowed
     * down.  Note that we do not know if the datapath has ignored any of the
     * wildcarded bits, so we may be overly conservative here. */
    if (flow_wildcards_has_extra(&dp_mask, &wc)) {
        goto exit;
    }

    if (!ofpbuf_equal(odp_actions,
                      ovsrcu_get(struct ofpbuf *, &ukey->actions))) {  // actions differ from what is installed, so the flow must be refreshed
        /* The datapath mask was OK, but the actions seem to have changed.
         * Let's modify it in place. */
        result = UKEY_MODIFY;
        /* Transfer recirc action ID references to the caller. */
        recirc_refs_swap(recircs, &xoutp->recircs);
        goto exit;
    }

    result = UKEY_KEEP;

exit:
    if (result != UKEY_DELETE) {
        ukey->reval_seq = reval_seq;
    }
    if (netflow && result == UKEY_DELETE) {
        netflow_flow_clear(netflow, &flow);
    }
    xlate_out_uninit(xoutp);
    return result;
}
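Two pieces of revalidate_ukey() are worth isolating: the clamped stats delta (datapath counters can go backwards if a flow is recreated, so the difference is floored at zero) and the final byte-compare of the freshly translated actions against the installed ones, which is exactly what turns the first packet's flood rule into a unicast rule. A toy sketch with invented toy_* names:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

enum toy_reval_result { TOY_KEEP, TOY_MODIFY, TOY_DELETE };

/* Monotonic-delta guard as in revalidate_ukey(): if the datapath counter
 * went backwards, credit zero rather than a huge unsigned difference. */
static uint64_t toy_stats_delta(uint64_t now, uint64_t cached)
{
    return now > cached ? now - cached : 0;
}

/* Decide KEEP vs MODIFY the way the ofpbuf_equal() check does: byte-compare
 * the freshly translated actions against the installed ones. */
static enum toy_reval_result
toy_compare_actions(const void *installed, size_t installed_len,
                    const void *fresh, size_t fresh_len)
{
    if (installed_len != fresh_len
        || memcmp(installed, fresh, fresh_len)) {
        return TOY_MODIFY;
    }
    return TOY_KEEP;
}

int main(void)
{
    printf("packet delta: %llu\n",
           (unsigned long long) toy_stats_delta(120, 100));

    const char old_acts[] = "output:1,output:2,output:3";  /* flood */
    const char new_acts[] = "output:2";                    /* learned port */
    printf("result: %s\n",
           toy_compare_actions(old_acts, sizeof old_acts,
                               new_acts, sizeof new_acts) == TOY_MODIFY
           ? "UKEY_MODIFY" : "UKEY_KEEP");
    return 0;
}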