OVS vswitchd Module Analysis (2)


3. The bridge and ofproto modules are the core of vswitchd. At startup it initializes the bridges (tied to the OVSDB configuration), and the main loop then repeatedly runs the worker RPC service, the bridges, the unixctl server, and the network devices.
----------vswitchd/ovs-vswitchd.c
    bridge_init(remote);
    free(remote);

    exiting = false;
    while (!exiting) {
        worker_run();
        bridge_run_fast();
        bridge_run();
        bridge_run_fast();
        unixctl_server_run(unixctl);
        netdev_run();
............
---------lib/worker.c
3.1 If this process has started a worker subprocess (client_sock >= 0) and the worker has not failed, worker_run() reads any pending reply from the worker and invokes the reply's reply_cb callback. In effect this is process-based multithreading: a helper process stands by at all times to service the main process's requests and send back replies.
void worker_run(void){
    if (worker_is_running()) {
        int error;

        error = rxbuf_run(&client_rx, client_sock, sizeof(struct worker_reply));
        if (!error) {
            struct worker_reply *reply = client_rx.header.data;
            reply->reply_cb(&client_rx.payload, client_rx.fds, client_rx.n_fds, reply->reply_aux);
            rxbuf_clear(&client_rx);
        } else if (error != EAGAIN) {
            worker_broke();
            VLOG_ABORT("receive from worker failed (%s)",
                       ovs_retval_to_string(error));
        }
    }
}
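To make the callback plumbing concrete, here is a minimal sketch of what a reply handler on the main-process side could look like. The signature is inferred from the call in worker_run() above (struct ofpbuf is from lib/ofpbuf.h); the function name and body are hypothetical, not OVS code:

/* Hypothetical reply callback matching the invocation in worker_run(). */
static void
my_reply_cb(struct ofpbuf *payload, const int fds[], size_t n_fds, void *aux)
{
    /* 'payload' holds the worker's reply data; 'fds' carries any file
     * descriptors passed back over the socket; 'aux' is the cookie that
     * was supplied when the request was originally issued. */
}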

3.2  bridge_run_fast() performs periodic activity: it iterates over all bridges and lets each one do its OpenFlow switch processing (through bridge->ofproto). Calling this function several times per poll-loop iteration makes sense because the ofproto-dpif implementation of ofproto can exploit the extra calls for better performance.
     bridge_run() maintains system state by talking to ovsdb; its core logic still delegates the real work to ofproto.

void bridge_run_fast(void) {
    struct bridge *br;

    HMAP_FOR_EACH (br, node, &all_bridges) {
        ofproto_run_fast(br->ofproto);
    }
}

void bridge_run(void) {
    static const struct ovsrec_open_vswitch null_cfg;
    const struct ovsrec_open_vswitch *cfg;
    struct ovsdb_idl_txn *reconf_txn = NULL;

    bool vlan_splinters_changed;
    struct bridge *br;

    ovsrec_open_vswitch_init((struct ovsrec_open_vswitch *) &null_cfg);

    /* (Re)configure if necessary. */
    if (!reconfiguring) {
        ovsdb_idl_run(idl);

        if (ovsdb_idl_is_lock_contended(idl)) {
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
            struct bridge *br, *next_br;

            VLOG_ERR_RL(&rl, "another ovs-vswitchd process is running, disabling this process until it goes away");

            HMAP_FOR_EACH_SAFE (br, next_br, node, &all_bridges) {
                bridge_destroy(br);
            }
            return;
        } else if (!ovsdb_idl_has_lock(idl)) {
            return;
        }
    }
    cfg = ovsrec_open_vswitch_first(idl);

    /* Let each bridge do the work that it needs to do. */
    HMAP_FOR_EACH (br, node, &all_bridges) {
        ofproto_run(br->ofproto);
    }

    /* Re-configure SSL. We do this on every trip through the main loop, instead of just when the database changes, because the contents of the key and certificate files can change without the database changing; we do it before bridge_reconfigure() because that function might initiate SSL connections and thus requires SSL to be configured. */
    if (cfg && cfg->ssl) {
        const struct ovsrec_ssl *ssl = cfg->ssl;

        stream_ssl_set_key_and_cert(ssl->private_key, ssl->certificate);
        stream_ssl_set_ca_cert_file(ssl->ca_cert, ssl->bootstrap_ca_cert);
    }

    if (!reconfiguring) {
        /* If VLAN splinters are in use, then we need to reconfigure if VLAN usage has changed. */
        vlan_splinters_changed = false;
        if (vlan_splinters_enabled_anywhere) {
            HMAP_FOR_EACH (br, node, &all_bridges) {
                if (ofproto_has_vlan_usage_changed(br->ofproto)) {
                    vlan_splinters_changed = true;
                    break;
                }
            }
        }

        if (ovsdb_idl_get_seqno(idl) != idl_seqno || vlan_splinters_changed) {
            idl_seqno = ovsdb_idl_get_seqno(idl);
            if (cfg) {
                reconf_txn = ovsdb_idl_txn_create(idl);
                bridge_reconfigure(cfg);
            } else {
                /* We still need to reconfigure to avoid dangling pointers to
                 * now-destroyed ovsrec structures inside bridge data. */
                bridge_reconfigure(&null_cfg);
            }
        }
    }

    if (reconfiguring) {
        if (cfg) {
            if (!reconf_txn) {
                reconf_txn = ovsdb_idl_txn_create(idl);
            }
            if (bridge_reconfigure_continue(cfg)) {
                ovsrec_open_vswitch_set_cur_cfg(cfg, cfg->next_cfg);
            }
        } else {
            bridge_reconfigure_continue(&null_cfg);
        }
    }

    if (reconf_txn) {
        ovsdb_idl_txn_commit(reconf_txn);
        ovsdb_idl_txn_destroy(reconf_txn);
        reconf_txn = NULL;
    }

    /* Refresh interface and mirror stats if necessary. */
    if (time_msec() >= iface_stats_timer) {
        if (cfg) {
            struct ovsdb_idl_txn *txn;

            txn = ovsdb_idl_txn_create(idl);
            HMAP_FOR_EACH (br, node, &all_bridges) {
                struct port *port;
                struct mirror *m;

                HMAP_FOR_EACH (port, hmap_node, &br->ports) {
                    struct iface *iface;

                    LIST_FOR_EACH (iface, port_elem, &port->ifaces) {
                        iface_refresh_stats(iface);
                        iface_refresh_status(iface);
                    }
                }

                HMAP_FOR_EACH (m, hmap_node, &br->mirrors) {
                    mirror_refresh_stats(m);
                }

            }
            refresh_controller_status();
            ovsdb_idl_txn_commit(txn);
            ovsdb_idl_txn_destroy(txn); /* XXX */
        }

        iface_stats_timer = time_msec() + IFACE_STATS_INTERVAL;
    }

    run_system_stats();
    refresh_instant_stats();
}


3.3 Definitions of struct bridge and struct ofproto (vswitchd/bridge.c, ofproto/ofproto-provider.h)

struct bridge {
    struct hmap_node node;      /* In the 'all_bridges' hmap, which manages all bridges. */
    char *name;                 /* User-specified arbitrary name. */
    char *type;                 /* Datapath type. */
    uint8_t ea[ETH_ADDR_LEN];   /* Bridge Ethernet Address. */
    uint8_t default_ea[ETH_ADDR_LEN]; /* Default MAC. */
    const struct ovsrec_bridge *cfg;

    /* OpenFlow switch processing. */
    struct ofproto *ofproto;    /* OpenFlow switch. */

    /* Bridge ports. */
    struct hmap ports;          /* "struct port"s indexed by name. */
    struct hmap ifaces;         /* "struct iface"s indexed by ofp_port. */
    struct hmap iface_by_name;  /* "struct iface"s indexed by name. */

    struct list ofpp_garbage;   /* "struct ofpp_garbage" slated for removal. */
    struct hmap if_cfg_todo;    /* "struct if_cfg"s slated for creation.   Indexed on 'cfg->name'. */

    /* Port mirroring. */
    struct hmap mirrors;        /* "struct mirror" indexed by UUID. */
};
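The hmap fields above follow OVS's intrusive-container pattern: the bookkeeping node (hmap_node) is embedded inside the object, and iteration macros such as HMAP_FOR_EACH recover the enclosing object from the node via pointer arithmetic. A standalone illustration of the same idea (plain C, not OVS code):

#include <stddef.h>
#include <stdio.h>

struct node { struct node *next; };          /* stand-in for hmap_node */

struct my_bridge {
    struct node node;                        /* embedded node */
    const char *name;
};

#define CONTAINER_OF(ptr, type, member) \
    ((type *) ((char *) (ptr) - offsetof(type, member)))

int main(void)
{
    struct my_bridge br0 = { { NULL }, "br0" };
    struct node *n = &br0.node;
    /* Recover the enclosing object from its embedded node, which is exactly
     * what HMAP_FOR_EACH does for each element it visits. */
    struct my_bridge *br = CONTAINER_OF(n, struct my_bridge, node);
    printf("%s\n", br->name);                /* prints "br0" */
    return 0;
}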
--------
 Represents an OpenFlow switch. In general, an ofproto implementation should only query these fields and never modify them.

struct ofproto {
    struct hmap_node hmap_node;    /* In the global hmap of all ofprotos. */
    const struct ofproto_class *ofproto_class;   /* The concrete OpenFlow switch implementation. */
    char *type;                 /* Datapath type. */
    char *name;                 /* Datapath name. */

    /* Settings. */
    uint64_t fallback_dpid;     /* Datapath ID if no better choice found. */
    uint64_t datapath_id;       /* Datapath ID. */
    unsigned flow_eviction_threshold; /* Threshold at which to begin flow table eviction. Only affects the
                                       * ofproto-dpif implementation */
    bool forward_bpdu;          /* Option to allow forwarding of BPDU frames 
                                 * when NORMAL action is invoked. */
    char *mfr_desc;             /* Manufacturer. */
    char *hw_desc;              /* Hardware. */
    char *sw_desc;              /* Software version. */
    char *serial_desc;          /* Serial number. */
    char *dp_desc;              /* Datapath description. */
    enum ofp_config_flags frag_handling; /* One of OFPC_*.  */

    /* Datapath. */
    struct hmap ports;          /* Contains "struct ofport"s. */
    struct shash port_by_name; 
    uint16_t max_ports;         /* Max possible OpenFlow port num, plus one. */

    /* Flow tables. */
    struct oftable *tables;
    int n_tables;

    /* OpenFlow connections. */
    struct connmgr *connmgr;

    /* Flow table operation tracking. */
    int state;                  /* Internal state. */
    struct list pending;        /* List of "struct ofopgroup"s. */
    unsigned int n_pending;     /* list_size(&pending). */
    struct hmap deletions;      /* All OFOPERATION_DELETE "ofoperation"s. */

    /* Flow table operation logging. */
    int n_add, n_delete, n_modify; /* Number of unreported ops of each kind. */
    long long int first_op, last_op; /* Range of times for unreported ops. */
    long long int next_op_report;    /* Time to report ops, or LLONG_MAX. */
    long long int op_backoff;        /* Earliest time to report ops again. */
};


3.4 bridge_run_fast() does its core work through ofproto_run_fast(), which calls the run_fast function of the concrete OpenFlow implementation (p->ofproto_class->run_fast(p)):
ofproto.c: ofproto_run_fast() --> ofproto-dpif.c: run_fast(). run_fast() first iterates over all ports, calling port_run_fast() on each, then checks for upcalls (packets the kernel datapath could not handle and has punted up to userspace) and calls handle_upcalls().
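For reference, ofproto_run_fast() itself is only a thin dispatch through the class vtable; approximately (see ofproto/ofproto.c):

int ofproto_run_fast(struct ofproto *p) {
    int error;

    /* run_fast is optional, so tolerate a null pointer. */
    error = p->ofproto_class->run_fast ? p->ofproto_class->run_fast(p) : 0;
    if (error && error != EAGAIN) {
        VLOG_ERR_RL(&rl, "%s: fastpath run failed (%s)",
                    p->name, strerror(error));
    }
    return error;
}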

static int run_fast(struct ofproto *ofproto_) {
    struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_);
    /* ofproto_dpif_cast() uses CONTAINER_OF to recover the address of the
     * enclosing struct ofproto_dpif from the pointer to its embedded 'up'
     * member; nothing new is constructed. */
    struct ofport_dpif *ofport;
    unsigned int work;

    HMAP_FOR_EACH (ofport, up.hmap_node, &ofproto->up.ports) {
        port_run_fast(ofport);
    }

    /* Handle upcalls in batches to amortize per-call overhead. */
    work = 0;
    while (work < FLOW_MISS_MAX_BATCH) {
        int retval = handle_upcalls(ofproto, FLOW_MISS_MAX_BATCH - work);
        if (retval <= 0) {
            return -retval;
        }
        work += retval;
    }
    return 0;
}
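The cast helper used above is a one-liner built on CONTAINER_OF; approximately (see ofproto/ofproto-dpif.c):

static struct ofproto_dpif *ofproto_dpif_cast(const struct ofproto *ofproto) {
    assert(ofproto->ofproto_class == &ofproto_dpif_class);
    return CONTAINER_OF(ofproto, struct ofproto_dpif, up);
}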


struct ofproto_dpif {
    struct hmap_node all_ofproto_dpifs_node; /* In 'all_ofproto_dpifs'. */
    struct ofproto up;
    struct dpif *dpif;

    /* Special OpenFlow rules. */
    struct rule_dpif *miss_rule; /* Sends flow table misses to controller. */
    struct rule_dpif *no_packet_in_rule; /* Drops flow table misses. */

    /* Statistics. */
    uint64_t n_matches;

    /* Bridging. */
    struct netflow *netflow;
    struct dpif_sflow *sflow;
    struct hmap bundles;        /* Contains "struct ofbundle"s. */
    struct mac_learning *ml;
    struct ofmirror *mirrors[MAX_MIRRORS];
    bool has_mirrors;
    bool has_bonded_bundles;

    /* Expiration. */
    struct timer next_expiration;

    /* Facets. */
    struct hmap facets;
    struct hmap subfacets;
    struct governor *governor;

    /* Revalidation. */
    struct table_dpif tables[N_TABLES];
    enum revalidate_reason need_revalidate;
    struct tag_set revalidate_set;

    /* Support for debugging async flow mods. */
    struct list completions;

    bool has_bundle_action; /* True when the first bundle action appears. */
    struct netdev_stats stats; /* To account packets generated and consumed in
                                * userspace. */

    /* Spanning tree. */
    struct stp *stp;
    long long int stp_last_tick;

    /* VLAN splinters. */
    struct hmap realdev_vid_map; /* (realdev,vid) -> vlandev. */
    struct hmap vlandev_map;     /* vlandev -> (realdev,vid). */
};


3.5  port_run_fast() checks, against the CFM (Connectivity Fault Management) transmission interval ccm_interval (lib/cfm.c), whether a Continuity Check Message (CCM, from IEEE 802.1ag) is due. If so, it composes a CCM into a packet and sends it. Sending builds netlink message attributes (a flow key and actions) and ultimately calls the execute method of struct dpif_class (see the fields of dpif_linux_class for the concrete implementation); this appears to deliver the packet to the kernel for processing via the Generic Netlink mechanism.

static void port_run_fast(struct ofport_dpif *ofport) {
    if (ofport->cfm && cfm_should_send_ccm(ofport->cfm)) {
        struct ofpbuf packet;

        ofpbuf_init(&packet, 0);
        cfm_compose_ccm(ofport->cfm, &packet, ofport->up.pp.hw_addr);
        send_packet(ofport, &packet);
        ofpbuf_uninit(&packet);
    }
}
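cfm_should_send_ccm() is essentially an interval-timer check: it returns true once the configured CCM interval has elapsed since the last transmission (see lib/cfm.c). Schematically, with illustrative names rather than the real struct cfm fields:

#include <stdbool.h>

/* Sketch of interval-based transmit gating; not the real cfm code. */
static bool
should_send(long long int now_msec, long long int *next_tx_msec,
            long long int interval_msec)
{
    if (now_msec >= *next_tx_msec) {
        *next_tx_msec = now_msec + interval_msec;  /* re-arm the timer */
        return true;
    }
    return false;
}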
----
struct ofport_dpif {
    struct ofport up;

    uint32_t odp_port;
    struct ofbundle *bundle;    /* Bundle that contains this port, if any. */
    struct list bundle_node;    /* In struct ofbundle's "ports" list. */
    struct cfm *cfm;            /* Connectivity Fault Management, if any. */
    tag_type tag;               /* Tag associated with this port. */
    uint32_t bond_stable_id;    /* stable_id to use as bond slave, or 0. */
    bool may_enable;            /* May be enabled in bonds. */
    long long int carrier_seq;  /* Carrier status changes. */

    /* Spanning tree. */
    struct stp_port *stp_port;  /* Spanning Tree Protocol, if any. */
    enum stp_state stp_state;   /* Always STP_DISABLED if STP not in use. */
    long long int stp_state_entered;

    struct hmap priorities;     /* Map of attached 'priority_to_dscp's. */
};

/* Sends 'packet' out 'ofport'. */
static int send_packet(const struct ofport_dpif *ofport, struct ofpbuf *packet) {
    const struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofport->up.ofproto);
    struct ofpbuf key, odp_actions;
    struct odputil_keybuf keybuf;
    uint16_t odp_port;
    struct flow flow;
    int error;

    flow_extract(packet, 0, 0, NULL, 0, &flow);   /* Extract flow fields from the packet. */
    odp_port = vsp_realdev_to_vlandev(ofproto, ofport->odp_port, flow.vlan_tci);
    /* VLAN splinters: map the real device and vlan_tci to the VLAN device (e.g. eth0.19). */
    if (odp_port != ofport->odp_port) {
        eth_pop_vlan(packet); /* Strip the outermost VLAN header from the packet and zero flow.vlan_tci. */
        flow.vlan_tci = htons(0);
    }

    ofpbuf_use_stack(&key, &keybuf, sizeof keybuf);
    odp_flow_key_from_flow(&key, &flow);
    /* Builds netlink attributes in 'key' from the fields of 'flow', e.g.
     * nl_msg_put_u32(buf, OVS_KEY_ATTR_IN_PORT,
     *                ofp_port_to_odp_port(flow->in_port));   -- lib/odp-util.c */
    ofpbuf_init(&odp_actions, 32);
    compose_sflow_action(ofproto, &odp_actions, &flow, odp_port); /* Adds an sFlow sampling action, if sFlow is enabled; the output action is appended just below. */

    nl_msg_put_u32(&odp_actions, OVS_ACTION_ATTR_OUTPUT, odp_port);
    error = dpif_execute(ofproto->dpif, key.data, key.size, odp_actions.data, odp_actions.size, packet);
    /* Packs the key and actions built above into a struct dpif_execute (lib/dpif.h) and invokes the implementation's execute method. */
    ofpbuf_uninit(&odp_actions);

    if (error) {
        VLOG_WARN_RL(&rl, "%s: failed to send packet on port %"PRIu32" (%s)",
                     ofproto->up.name, odp_port, strerror(error));
    }
    ofproto_update_local_port_stats(ofport->up.ofproto, packet->size, 0);
    return error;
}
-----------
dpif_execute() causes the datapath interface to execute the given actions on the Ethernet frame in 'packet'. The key might seem redundant given the packet, but it carries metadata that cannot be recovered from the packet itself, such as the tunnel and in_port.
int dpif_execute(struct dpif *dpif, const struct nlattr *key, size_t key_len,
                 const struct nlattr *actions, size_t actions_len,
                 const struct ofpbuf *buf)
{
    struct dpif_execute execute;

    execute.key = key;
    execute.key_len = key_len;
    execute.actions = actions;
    execute.actions_len = actions_len;
    execute.packet = buf;
    return dpif_execute__(dpif, &execute);
}
---------
static int dpif_execute__(struct dpif *dpif, const struct dpif_execute *execute)
{
    int error;

    COVERAGE_INC(dpif_execute);
    if (execute->actions_len > 0) {
        error = dpif->dpif_class->execute(dpif, execute);
    } else {
        error = 0;
    }

    log_execute_message(dpif, execute, error);
    return error;
}
(The details of the remaining steps are still to be filled in.)
-----> dpif_linux_execute ---> dpif_linux_execute__ ---> nl_sock_transact -->
       (lib/dpif-linux.c)                                (lib/netlink-socket.c)
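The indirection dpif->dpif_class->execute(dpif, execute) is how the generic dpif layer reaches the Linux-datapath implementation. An abridged sketch of the provider interface (member order and the omitted fields are assumptions; see lib/dpif-provider.h):

struct dpif_class {
    const char *type;      /* e.g. "system" for the Linux kernel datapath */
    /* ... many other operations elided ... */
    int (*execute)(struct dpif *dpif, const struct dpif_execute *execute);
    /* ... */
};

dpif_linux_class supplies dpif_linux_execute for this slot; that function marshals the key, actions, and packet into an OVS_PACKET_CMD_EXECUTE Generic Netlink message and transacts it with the kernel through nl_sock_transact().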
--------------------------------------------
3.6  handle_upcalls() receives from the kernel datapath the packets (flows?) that the kernel could not handle, each wrapped in a struct dpif_upcall (lib/dpif.h).

static int handle_upcalls(struct ofproto_dpif *ofproto, unsigned int max_batch) {
    struct dpif_upcall misses[FLOW_MISS_MAX_BATCH];
    struct ofpbuf miss_bufs[FLOW_MISS_MAX_BATCH];
    uint64_t miss_buf_stubs[FLOW_MISS_MAX_BATCH][4096 / 8];
    int n_processed;
    int n_misses;
    int i;

    assert(max_batch <= FLOW_MISS_MAX_BATCH);

    n_misses = 0;
    for (n_processed = 0; n_processed < max_batch; n_processed++) {
        struct dpif_upcall *upcall = &misses[n_misses];
        struct ofpbuf *buf = &miss_bufs[n_misses];
        int error;

        ofpbuf_use_stub(buf, miss_buf_stubs[n_misses], sizeof miss_buf_stubs[n_misses]);
        /* Use the stack-allocated stub as the buffer's initial storage, so
         * that small upcalls avoid a heap allocation. */
        error = dpif_recv(ofproto->dpif, upcall, buf);
        if (error) {
            ofpbuf_uninit(buf);
            break;
        }

        switch (classify_upcall(upcall)) {
        case MISS_UPCALL:
            /* Handle it later. */
            n_misses++;
            break;

        case SFLOW_UPCALL:
            if (ofproto->sflow) {
                handle_sflow_upcall(ofproto, upcall);
            }
            /* For SFLOW_UPCALL and BAD_UPCALL, the buf holding the upcall
             * message is freed as soon as the upcall has been handled;
             * MISS_UPCALLs are deferred and processed in a batch by
             * handle_miss_upcalls() below. */
            ofpbuf_uninit(buf);
            break;

        case BAD_UPCALL:
            ofpbuf_uninit(buf);
            break;
        }
    }
    /* Handle deferred MISS_UPCALL processing. */
    handle_miss_upcalls(ofproto, misses, n_misses);
    for (i = 0; i < n_misses; i++) {
        ofpbuf_uninit(&miss_bufs[i]);
    }
    return n_processed;
}
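ofpbuf_use_stub() above points each ofpbuf at caller-provided storage (the miss_buf_stubs array), so a small upcall costs no heap allocation; the buffer switches to malloc'd memory only if the data outgrows the stub, which is why ofpbuf_uninit() must still be called. The same small-buffer optimization as a standalone sketch (not OVS code; names are illustrative):

#include <stdlib.h>
#include <string.h>

struct buf {
    char *data;
    size_t size, capacity;
    int on_heap;                 /* nonzero if 'data' was malloc'd */
};

static void buf_use_stub(struct buf *b, char *stub, size_t n)
{
    b->data = stub; b->size = 0; b->capacity = n; b->on_heap = 0;
}

static void buf_put(struct buf *b, const void *p, size_t n)
{
    if (b->size + n > b->capacity) {   /* outgrew the stub: move to heap */
        size_t cap = (b->size + n) * 2;
        char *data = malloc(cap);
        memcpy(data, b->data, b->size);
        if (b->on_heap) {
            free(b->data);
        }
        b->data = data; b->capacity = cap; b->on_heap = 1;
    }
    memcpy(b->data + b->size, p, n);
    b->size += n;
}

static void buf_uninit(struct buf *b)  /* safe for stub or heap storage */
{
    if (b->on_heap) {
        free(b->data);
    }
}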
--------
/* Polls 'dpif' for an upcall. If one is pending, fills in '*upcall' and stores its data in 'buf'. The caller must first have enabled receiving packets from the dpif with dpif_recv_set(). 'upcall->packet' and 'upcall->key' point into the data stored in the caller-provided 'buf', so they must not be freed separately. */

int dpif_recv(struct dpif *dpif, struct dpif_upcall *upcall, struct ofpbuf *buf)
{
    int error = dpif->dpif_class->recv(dpif, upcall, buf);
    if (!error && !VLOG_DROP_DBG(&dpmsg_rl)) {
        struct ds flow;   // dynamic string (lib/dynamic-string.h)
        char *packet;
          /* Returns a string describing the contents of the Ethernet frame (lib/ofp-print.c). */
        packet = ofp_packet_to_string(upcall->packet->data, upcall->packet->size);

        ds_init(&flow);
        odp_flow_key_format(upcall->key, upcall->key_len, &flow);
          /* Parses the OVS_KEY_ATTR_* attributes from upcall->key into the dynamic string, purely for log output. */
        VLOG_DBG("%s: %s upcall:\n%s\n%s",
                 dpif_name(dpif), dpif_upcall_type_to_string(upcall->type), ds_cstr(&flow), packet);

        ds_destroy(&flow);
        free(packet);
    } else if (error && error != EAGAIN) {
        log_operation(dpif, "recv", error);
    }
    return error;
}
--------------

static void handle_miss_upcalls(struct ofproto_dpif *ofproto, struct dpif_upcall *upcalls,size_t n_upcalls) {
    struct dpif_upcall *upcall;
    struct flow_miss *miss;
    struct flow_miss misses[FLOW_MISS_MAX_BATCH];
    struct flow_miss_op flow_miss_ops[FLOW_MISS_MAX_BATCH * 2];
    struct dpif_op *dpif_ops[FLOW_MISS_MAX_BATCH * 2];
    struct hmap todo;
    int n_misses;
    size_t n_ops;
    size_t i;

    /* Construct the to-do list: extract the flow from each packet, and group packets that share the same flow into a single "flow_miss" structure so that they can be processed together. */
    hmap_init(&todo);
    n_misses = 0;
    for (upcall = upcalls; upcall < &upcalls[n_upcalls]; upcall++) {
        struct flow_miss *miss = &misses[n_misses];
        struct flow_miss *existing_miss;
        struct flow flow;
        uint32_t hash;

        /* Like odp_flow_key_to_flow(), ofproto_dpif_extract_flow_key() (ofproto/ofproto-dpif.c) converts the 'key_len' bytes of OVS_KEY_ATTR_* attributes in 'key' into a flow structure, returning an ODP_FIT_* value that indicates how well upcall->key fits what we expect. */
        miss->key_fitness = ofproto_dpif_extract_flow_key(ofproto, upcall->key, upcall->key_len, &flow, &miss->initial_tci, upcall->packet);
        if (miss->key_fitness == ODP_FIT_ERROR) {
            continue;
        }
          /* flow_extract() (lib/flow.c) fills in the fields of 'miss->flow' from the packet plus the given 'skb_priority', tunnel, and in_port metadata (important); it also makes various pointers inside the packet valid, e.g. packet->l4 = b.data. */
        flow_extract(upcall->packet, flow.skb_priority, flow.skb_mark, &flow.tunnel, flow.in_port, &miss->flow);

        /* Add new packets to the to-do list (hashing on the flow in struct flow_miss). */
        hash = flow_hash(&miss->flow, 0);
        existing_miss = flow_miss_find(&todo, &miss->flow, hash);
        if (!existing_miss) {
            hmap_insert(&todo, &miss->hmap_node, hash);
            miss->key = upcall->key;
            miss->key_len = upcall->key_len;
            miss->upcall_type = upcall->type;
            list_init(&miss->packets);

            n_misses++;
        } else {
            miss = existing_miss;
        }
        list_push_back(&miss->packets, &upcall->packet->list_node);
    }

   /* Then process the packets on the to-do list, checking whether each flow matches the flow table; handle_flow_miss() calls handle_flow_miss_without_facet() or handle_flow_miss_with_facet() accordingly. */
    n_ops = 0;
    HMAP_FOR_EACH (miss, hmap_node, &todo) {
        handle_flow_miss(ofproto, miss, flow_miss_ops, &n_ops);
    }
    assert(n_ops <= ARRAY_SIZE(flow_miss_ops));

    /* Execute batch. */
    for (i = 0; i < n_ops; i++) {
        dpif_ops[i] = &flow_miss_ops[i].dpif_op;
    }
    dpif_operate(ofproto->dpif, dpif_ops, n_ops);
     /* Dispatches each dpif_op by its operation type to dpif_flow_put/del/execute__() (lib/dpif.c). */
    /* Free memory and update facets. */
    for (i = 0; i < n_ops; i++) {
        struct flow_miss_op *op = &flow_miss_ops[i];

        switch (op->dpif_op.type) {
        case DPIF_OP_EXECUTE:
            break;

        case DPIF_OP_FLOW_PUT:
            if (!op->dpif_op.error) {
                op->subfacet->path = subfacet_want_path(op->subfacet->slow);
            }
            break;

        case DPIF_OP_FLOW_DEL:
            NOT_REACHED();
        }

        free(op->garbage);
    }
    hmap_destroy(&todo);
}
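The pattern here, deduplicating a batch so that per-flow work (flow table lookup, facet setup) runs once per distinct flow rather than once per packet, is the key performance idea. A standalone sketch of the same grouping logic (plain C, not OVS code; a linear scan stands in for the hmap lookup):

#include <stdio.h>
#include <string.h>

struct todo_item { const char *flow; int n_packets; };

int main(void)
{
    const char *batch[] = { "flowA", "flowB", "flowA", "flowA", "flowB" };
    struct todo_item todo[5];
    size_t n_todo = 0;

    for (size_t i = 0; i < sizeof batch / sizeof batch[0]; i++) {
        size_t j;
        for (j = 0; j < n_todo; j++) {       /* find an existing miss */
            if (!strcmp(todo[j].flow, batch[i])) {
                break;
            }
        }
        if (j == n_todo) {                   /* first packet of this flow */
            todo[n_todo].flow = batch[i];
            todo[n_todo].n_packets = 0;
            n_todo++;
        }
        todo[j].n_packets++;                 /* queue the packet */
    }
    for (size_t j = 0; j < n_todo; j++) {    /* per-flow work, done once */
        printf("%s: %d packet(s)\n", todo[j].flow, todo[j].n_packets);
    }
    return 0;
}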