+-------------------+
| ovs-vswitchd |<-->ovsdb-server
+-------------------+
| ofproto |<-->OpenFlow controllers
+--------+-+--------+
| netdev | | ofproto|
+--------+ |provider|
| netdev | +--------+
|provider|
+--------+
ovs-vswitchd : OVS的主要用户态(userspace)程序,它通过IPC通道从ovsdb-server读取配置并下发到ofproto层,同时把状态和统计信息写回数据库。
ofproto : Open vSwitch库,实现了OpenFlow交换机,它负责与OpenFlow controller通信,并通过"ofproto provider"与底层的交换机软件或硬件交互。
netdev : 网络设备的抽象,底层由"netdev provider"实现。
_
| +-------------------+
| | ovs-vswitchd |<-->ovsdb-server
| +-------------------+
| | ofproto |<-->OpenFlow controllers
| +--------+-+--------+ _
| | netdev | |ofproto-| |
| +--------+ | dpif | |
| | netdev | +--------+ |
| |provider| | dpif | |
userspace | +---||---+ +--------+ |
| || | dpif | | implementation of
| || |provider| | ofproto provider
|_ || +---||---+ |
|| || |
_ +---||-----+---||---+ |
| | |datapath| |
kernel | | +--------+ _|
| | |
|_ +--------||---------+
||
physical
NIC
"netdev provider"实现了针对特定操作系统和硬件的网络设备接口,例如Linux上的eth0。
netdev_class是netdev provider需要实现的接口类(由一组函数指针组成):
struct netdev_class {
/* Type of netdevs in this class, e.g. "system", "tap", "gre", etc.
*
* One of the providers should supply a "system" type, since this is
* the type assumed if no type is specified when opening a netdev.
* The "system" type corresponds to an existing network device on
* the system. */
const char *type;
/* If 'true' then this netdev should be polled by PMD threads. */
bool is_pmd;
/* ## ------------------- ## */
/* ## Top-Level Functions ## */
/* ## ------------------- ## */
/* Called when the netdev provider is registered, typically at program
* startup. Returning an error from this function will prevent any network
* device in this class from being opened.
*
* This function may be set to null if a network device class needs no
* initialization at registration time. */
int (*init)(void);
/* Performs periodic work needed by netdevs of this class. May be null if
* no periodic work is necessary.
*
* 'netdev_class' points to the class. It is useful in case the same
* function is used to implement different classes. */
void (*run)(const struct netdev_class *netdev_class);
/* Arranges for poll_block() to wake up if the "run" member function needs
* to be called. Implementations are additionally required to wake
* whenever something changes in any of its netdevs which would cause their
* ->change_seq() function to change its result. May be null if nothing is
* needed here.
*
* 'netdev_class' points to the class. It is useful in case the same
* function is used to implement different classes. */
void (*wait)(const struct netdev_class *netdev_class);
/* ## ---------------- ## */
/* ## netdev Functions ## */
/* ## ---------------- ## */
struct netdev *(*alloc)(void);
int (*construct)(struct netdev *);
void (*destruct)(struct netdev *);
void (*dealloc)(struct netdev *);
int (*get_config)(const struct netdev *netdev, struct smap *args);
/* Changes the device 'netdev''s configuration to 'args'.
int (*set_config)(struct netdev *netdev, const struct smap *args,
char **errp);
const struct netdev_tunnel_config *
(*get_tunnel_config)(const struct netdev *netdev);
int (*build_header)(const struct netdev *, struct ovs_action_push_tnl *data,
const struct netdev_tnl_build_header_params *params);
void (*push_header)(const struct netdev *,
struct dp_packet *packet,
const struct ovs_action_push_tnl *data);
struct dp_packet * (*pop_header)(struct dp_packet *packet);
int (*get_numa_id)(const struct netdev *netdev);
int (*set_tx_multiq)(struct netdev *netdev, unsigned int n_txq);
int (*send)(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
bool concurrent_txq);
void (*send_wait)(struct netdev *netdev, int qid);
int (*set_etheraddr)(struct netdev *netdev, const struct eth_addr mac);
int (*get_etheraddr)(const struct netdev *netdev, struct eth_addr *mac);
int (*get_mtu)(const struct netdev *netdev, int *mtup);
int (*set_mtu)(struct netdev *netdev, int mtu);
int (*get_ifindex)(const struct netdev *netdev);
int (*get_carrier)(const struct netdev *netdev, bool *carrier);
long long int (*get_carrier_resets)(const struct netdev *netdev);
int (*set_miimon_interval)(struct netdev *netdev, long long int interval);
int (*get_stats)(const struct netdev *netdev, struct netdev_stats *);
int (*get_custom_stats)(const struct netdev *netdev,
struct netdev_custom_stats *custom_stats);
int (*get_features)(const struct netdev *netdev,
enum netdev_features *current,
enum netdev_features *advertised,
enum netdev_features *supported,
enum netdev_features *peer);
int (*set_advertisements)(struct netdev *netdev,
enum netdev_features advertise);
enum netdev_pt_mode (*get_pt_mode)(const struct netdev *netdev);
int (*set_policing)(struct netdev *netdev, unsigned int kbits_rate,
unsigned int kbits_burst);
int (*get_qos_types)(const struct netdev *netdev, struct sset *types);
int (*get_qos_capabilities)(const struct netdev *netdev,
const char *type,
struct netdev_qos_capabilities *caps);
int (*get_qos)(const struct netdev *netdev,
const char **typep, struct smap *details);
int (*set_qos)(struct netdev *netdev,
const char *type, const struct smap *details);
int (*get_queue)(const struct netdev *netdev,
unsigned int queue_id, struct smap *details);
int (*set_queue)(struct netdev *netdev,
unsigned int queue_id, const struct smap *details);
int (*delete_queue)(struct netdev *netdev, unsigned int queue_id);
int (*get_queue_stats)(const struct netdev *netdev, unsigned int queue_id,
struct netdev_queue_stats *stats);
int (*queue_dump_start)(const struct netdev *netdev, void **statep);
int (*queue_dump_next)(const struct netdev *netdev, void *state,
unsigned int *queue_id, struct smap *details);
int (*queue_dump_done)(const struct netdev *netdev, void *state);
int (*dump_queue_stats)(const struct netdev *netdev,
void (*cb)(unsigned int queue_id,
struct netdev_queue_stats *,
void *aux),
void *aux);
int (*set_in4)(struct netdev *netdev, struct in_addr addr,
struct in_addr mask);
int (*get_addr_list)(const struct netdev *netdev, struct in6_addr **in,
struct in6_addr **mask, int *n_in6);
int (*add_router)(struct netdev *netdev, struct in_addr router);
int (*get_next_hop)(const struct in_addr *host, struct in_addr *next_hop,
char **netdev_name);
int (*get_status)(const struct netdev *netdev, struct smap *smap);
int (*arp_lookup)(const struct netdev *netdev, ovs_be32 ip,
struct eth_addr *mac);
int (*update_flags)(struct netdev *netdev, enum netdev_flags off,
enum netdev_flags on, enum netdev_flags *old_flags);
int (*reconfigure)(struct netdev *netdev);
/* ## -------------------- ## */
/* ## netdev_rxq Functions ## */
/* ## -------------------- ## */
struct netdev_rxq *(*rxq_alloc)(void);
int (*rxq_construct)(struct netdev_rxq *);
void (*rxq_destruct)(struct netdev_rxq *);
void (*rxq_dealloc)(struct netdev_rxq *);
int (*rxq_recv)(struct netdev_rxq *rx, struct dp_packet_batch *batch,
int *qfill);
void (*rxq_wait)(struct netdev_rxq *rx);
/* Discards all packets waiting to be received from 'rx'. */
int (*rxq_drain)(struct netdev_rxq *rx);
/* ## -------------------------------- ## */
/* ## netdev flow offloading functions ## */
/* ## -------------------------------- ## */
int (*flow_flush)(struct netdev *);
int (*flow_dump_create)(struct netdev *, struct netdev_flow_dump **dump);
int (*flow_dump_destroy)(struct netdev_flow_dump *);
bool (*flow_dump_next)(struct netdev_flow_dump *, struct match *,
struct nlattr **actions,
struct dpif_flow_stats *stats,
struct dpif_flow_attrs *attrs, ovs_u128 *ufid,
struct ofpbuf *rbuffer, struct ofpbuf *wbuffer);
int (*flow_put)(struct netdev *, struct match *, struct nlattr *actions,
size_t actions_len, const ovs_u128 *ufid,
struct offload_info *info, struct dpif_flow_stats *);
int (*flow_get)(struct netdev *, struct match *, struct nlattr **actions,
const ovs_u128 *ufid, struct dpif_flow_stats *,
struct dpif_flow_attrs *, struct ofpbuf *wbuffer);
int (*flow_del)(struct netdev *, const ovs_u128 *ufid,
struct dpif_flow_stats *);
int (*init_flow_api)(struct netdev *);
uint32_t (*get_block_id)(struct netdev *);
};
OVS已经实现的网络设备有多种,如netdev-linux.c、netdev-vport.c、netdev-dummy.c等。下面具体讲一下内核datapath中的netdev vport:对于任意的net_device设备,当把设备连接到br上时(即add port),需要把OVS的接收函数hook到net_device的包接收函数中,这样net_device收到的包就不会进入常规的内核协议栈,而是由OVS接过来处理,实现如下:
netdev_create
/* Creates a vport for the network device named parms->name.
 *
 * The vport is allocated with ovs_vport_alloc() and then handed to
 * ovs_netdev_link().  If allocation yields an error pointer, that pointer is
 * returned unchanged and no link is attempted. */
static struct vport *netdev_create(const struct vport_parms *parms)
{
	struct vport *port = ovs_vport_alloc(0, &ovs_netdev_vport_ops, parms);

	return IS_ERR(port) ? port : ovs_netdev_link(port, parms->name);
}
ovs_netdev_link
/* Attaches 'vport' to the network device called 'name'.
 *
 * The device is looked up with dev_get_by_name() in the namespace returned by
 * ovs_dp_get_net(vport->dp) and stored in vport->dev.  Loopback devices,
 * devices whose type is neither ARPHRD_ETHER nor ARPHRD_NONE, and OVS internal
 * devices are refused with -EINVAL.  Between rtnl_lock() and rtnl_unlock() the
 * device is linked under the datapath device, netdev_frame_hook is registered
 * as its rx handler with 'vport' as the handler argument, and then
 * dev_disable_lro(), dev_set_promiscuity(..., 1) and the IFF_OVS_DATAPATH
 * private flag are applied.
 *
 * Returns 'vport' on success.  On failure the steps already taken are undone
 * in reverse order, 'vport' is freed with ovs_vport_free() and ERR_PTR(err) is
 * returned, so the caller must not use 'vport' afterwards. */
struct vport *ovs_netdev_link(struct vport *vport, const char *name)
{
int err;
vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), name);
if (!vport->dev) {
err = -ENODEV;
goto error_free_vport;
}
// Reject the device with -EINVAL if it is a loopback device, if its type is
// neither Ethernet nor ARPHRD_NONE, or if it is an OVS internal device.
if (vport->dev->flags & IFF_LOOPBACK ||
(vport->dev->type != ARPHRD_ETHER &&
vport->dev->type != ARPHRD_NONE) ||
ovs_is_internal_dev(vport->dev)) {
err = -EINVAL;
goto error_put;
}
/* Everything up to rtnl_unlock() runs with the RTNL lock held. */
rtnl_lock();
/* Make the datapath device the upper (master) device of vport->dev. */
err = netdev_master_upper_dev_link(vport->dev,
get_dpdev(vport->dp),
NULL, NULL, NULL);
if (err)
goto error_unlock;
/* Hook packet reception: netdev_frame_hook is registered with 'vport' as
 * its handler data. */
err = netdev_rx_handler_register(vport->dev, netdev_frame_hook,
vport);
if (err)
goto error_master_upper_dev_unlink;
dev_disable_lro(vport->dev);
dev_set_promiscuity(vport->dev, 1);
vport->dev->priv_flags |= IFF_OVS_DATAPATH;
rtnl_unlock();
return vport;
/* Error unwinding: each label undoes one earlier step and falls through to
 * the labels below it. */
error_master_upper_dev_unlink:
netdev_upper_dev_unlink(vport->dev, get_dpdev(vport->dp));
error_unlock:
rtnl_unlock();
error_put:
/* Presumably drops the reference taken by dev_get_by_name() -- confirm. */
dev_put(vport->dev);
error_free_vport:
ovs_vport_free(vport);
return ERR_PTR(err);
}
通过netdev_rx_handler_register将vport与dev连接在一起:dev保存在vport->dev中,vport则作为rx handler的私有数据注册到dev上。netdev_frame_hook实际上是调用了netdev_port_receive,netdev_port_receive接着又调用了ovs_vport_receive,之后的过程就是我们所熟悉的了。
ofproto_class定义了ofproto Provider实现的一个接口,具体实现为ofproto_dpif_class,除了ofproto_class,还有几个重要的代表不同组成部分的数据结构,如struct ofport,rule,oftable,ofgroup等。
ofproto_class
/* Interface implemented by an ofproto provider (abridged excerpt).
 *
 * Every member is a function pointer.  The members shown cover provider and
 * type-level functions, the ofproto life cycle, port handling, rule handling
 * and the NetFlow/sFlow/IPFIX configuration hooks.  Member order matters to
 * any initializer that is written positionally. */
struct ofproto_class {
    /* Provider and datapath-type level functions. */
    void (*init)(const struct shash *iface_hints);
    void (*enumerate_types)(struct sset *types);
    int (*enumerate_names)(const char *type, struct sset *names);
    int (*del)(const char *type, const char *name);
    const char *(*port_open_type)(const char *datapath_type,
                                  const char *port_type);
    int (*type_run)(const char *type);
    void (*type_wait)(const char *type);

    /* Life-cycle and periodic hooks for a 'struct ofproto'. */
    struct ofproto *(*alloc)(void);
    int (*construct)(struct ofproto *ofproto);
    void (*destruct)(struct ofproto *ofproto, bool del);
    void (*dealloc)(struct ofproto *ofproto);
    int (*run)(struct ofproto *ofproto);
    void (*wait)(struct ofproto *ofproto);
    void (*get_memory_usage)(const struct ofproto *ofproto,
                             struct simap *usage);
    void (*type_get_memory_usage)(const char *type, struct simap *usage);
    void (*flush)(struct ofproto *ofproto);
    void (*query_tables)(struct ofproto *ofproto,
                         struct ofputil_table_features *features,
                         struct ofputil_table_stats *stats);
    void (*set_tables_version)(struct ofproto *ofproto, ovs_version_t version);

    /* Life-cycle hooks and operations for a 'struct ofport'. */
    struct ofport *(*port_alloc)(void);
    int (*port_construct)(struct ofport *ofport);
    void (*port_destruct)(struct ofport *ofport, bool del);
    void (*port_dealloc)(struct ofport *ofport);
    void (*port_modified)(struct ofport *ofport);
    int (*port_query_by_name)(const struct ofproto *ofproto,
                              const char *devname, struct ofproto_port *port);
    int (*port_add)(struct ofproto *ofproto, struct netdev *netdev);
    int (*port_del)(struct ofproto *ofproto, ofp_port_t ofp_port);
    int (*port_set_config)(const struct ofport *port, const struct smap *cfg);

    /* Get port stats */
    int (*port_get_stats)(const struct ofport *port,
                          struct netdev_stats *stats);
    int (*port_dump_start)(const struct ofproto *ofproto, void **statep);
    int (*port_dump_next)(const struct ofproto *ofproto, void *state,
                          struct ofproto_port *port);
    int (*port_dump_done)(const struct ofproto *ofproto, void *state);
    int (*port_poll)(const struct ofproto *ofproto, char **devnamep);
    void (*port_poll_wait)(const struct ofproto *ofproto);
    int (*port_is_lacp_current)(const struct ofport *port);
    int (*port_get_lacp_stats)(const struct ofport *port,
                               struct lacp_slave_stats *stats);

    enum ofperr (*rule_choose_table)(const struct ofproto *ofproto,
                                     const struct minimatch *match,
                                     uint8_t *table_idp);

    /* Rules indexed on their cookie values, in all flow tables. */
    struct rule *(*rule_alloc)(void);
    enum ofperr (*rule_construct)(struct rule *rule)
        /* OVS_REQUIRES(ofproto_mutex) */;
    void (*rule_insert)(struct rule *rule, struct rule *old_rule,
                        bool forward_counts)
        /* OVS_REQUIRES(ofproto_mutex) */;
    void (*rule_delete)(struct rule *rule) /* OVS_REQUIRES(ofproto_mutex) */;
    void (*rule_destruct)(struct rule *rule);
    void (*rule_dealloc)(struct rule *rule);

    /* ## ------------------------- ## */
    /* ## OFPP_NORMAL configuration ## */
    /* ## ------------------------- ## */

    /* Configures NetFlow on 'ofproto' according to the options in
     * 'netflow_options', or turns off NetFlow if 'netflow_options' is NULL.
     *
     * EOPNOTSUPP as a return value indicates that 'ofproto' does not support
     * NetFlow, as does a null pointer. */
    int (*set_netflow)(struct ofproto *ofproto,
                       const struct netflow_options *netflow_options);
    void (*get_netflow_ids)(const struct ofproto *ofproto,
                            uint8_t *engine_type, uint8_t *engine_id);

    /* Configures sFlow on 'ofproto' according to the options in
     * 'sflow_options', or turns off sFlow if 'sflow_options' is NULL.
     *
     * EOPNOTSUPP as a return value indicates that 'ofproto' does not support
     * sFlow, as does a null pointer. */
    int (*set_sflow)(struct ofproto *ofproto,
                     const struct ofproto_sflow_options *sflow_options);

    /* Configures IPFIX on 'ofproto' according to the options in
     * 'bridge_exporter_options' and the 'flow_exporters_options'
     * array, or turns off IPFIX if 'bridge_exporter_options' and
     * 'flow_exporters_options' is NULL.
     *
     * EOPNOTSUPP as a return value indicates that 'ofproto' does not support
     * IPFIX, as does a null pointer. */
    int (*set_ipfix)(
        struct ofproto *ofproto,
        const struct ofproto_ipfix_bridge_exporter_options
            *bridge_exporter_options,
        const struct ofproto_ipfix_flow_exporter_options
            *flow_exporters_options, size_t n_flow_exporters_options);

    /* Gets IPFIX stats on 'ofproto' according to the exporter of bridge
     * IPFIX or flow-based IPFIX.
     *
     * OFPERR_NXST_NOT_CONFIGURED as a return value indicates that bridge
     * IPFIX or flow-based IPFIX is not configured. */
    int (*get_ipfix_stats)(
        const struct ofproto *ofproto,
        bool bridge_ipfix, struct ovs_list *replies
        );

    /* The remaining members of the provider interface are omitted from this
     * excerpt. */
};
ofproto_dpif是内置的ofproto provider。datapath先尝试匹配收到的数据包,如果匹配成功,则执行action;否则将数据包上送给ofproto-dpif(它通过dpif访问datapath,而dpif将自己的大部分功能下放给dpif provider)。ofproto-dpif尝试进行full flow table lookup,如果找到,则执行action;否则把数据包传送给controller。
dpif class是datapath interface实现的工厂接口类,用于和实际的datapath, e.g. openvswitch.ko, 或者userspace datapath交互。目前已有的两个dpif的实现是dpif-netlink和dpif-netdev,前者是基于内核datapath的dpif实现,后者基于用户态datapath,具体定义如下:
dpif_netdev_class
/* dpif provider instance whose type string is "netdev".
 *
 * The initializer is positional: each entry fills the next member of
 * 'struct dpif_class' in declaration order, so entries must not be reordered,
 * added or removed without changing the struct to match.  A NULL entry leaves
 * that member unset; the trailing comment names the member it stands for.
 *
 * NOTE(review): the recv_set/handlers_set/recv/recv_wait/recv_purge slots are
 * NULL while the register_upcall_cb/enable_upcall/disable_upcall slots are
 * filled, which suggests upcalls are delivered through a registered callback
 * here -- confirm against the dpif-netdev implementation. */
const struct dpif_class dpif_netdev_class = {
"netdev",
dpif_netdev_init,
dpif_netdev_enumerate,
dpif_netdev_port_open_type,
dpif_netdev_open,
dpif_netdev_close,
dpif_netdev_destroy,
dpif_netdev_run,
dpif_netdev_wait,
dpif_netdev_get_stats,
dpif_netdev_port_add,
dpif_netdev_port_del,
dpif_netdev_port_set_config,
dpif_netdev_port_query_by_number,
dpif_netdev_port_query_by_name,
NULL, /* port_get_pid */
dpif_netdev_port_dump_start,
dpif_netdev_port_dump_next,
dpif_netdev_port_dump_done,
dpif_netdev_port_poll,
dpif_netdev_port_poll_wait,
dpif_netdev_flow_flush,
dpif_netdev_flow_dump_create,
dpif_netdev_flow_dump_destroy,
dpif_netdev_flow_dump_thread_create,
dpif_netdev_flow_dump_thread_destroy,
dpif_netdev_flow_dump_next,
dpif_netdev_operate,
NULL, /* recv_set */
NULL, /* handlers_set */
dpif_netdev_set_config,
dpif_netdev_queue_to_priority,
NULL, /* recv */
NULL, /* recv_wait */
NULL, /* recv_purge */
dpif_netdev_register_dp_purge_cb,
dpif_netdev_register_upcall_cb,
dpif_netdev_enable_upcall,
dpif_netdev_disable_upcall,
dpif_netdev_get_datapath_version,
dpif_netdev_ct_dump_start,
dpif_netdev_ct_dump_next,
dpif_netdev_ct_dump_done,
dpif_netdev_ct_flush,
dpif_netdev_ct_set_maxconns,
dpif_netdev_ct_get_maxconns,
dpif_netdev_ct_get_nconns,
NULL, /* ct_set_limits */
NULL, /* ct_get_limits */
NULL, /* ct_del_limits */
dpif_netdev_meter_get_features,
dpif_netdev_meter_set,
dpif_netdev_meter_get,
dpif_netdev_meter_del,
};
dpif_netlink_class
/* dpif provider instance whose type string is "system".
 *
 * The initializer is positional: each entry fills the next member of
 * 'struct dpif_class' in declaration order, so entries must not be reordered,
 * added or removed without changing the struct to match.  A NULL entry leaves
 * that member unset; the trailing comment names the member it stands for.
 *
 * NOTE(review): here the recv_set/handlers_set/recv/recv_wait/recv_purge
 * slots are filled and the upcall-callback slots are NULL, which suggests
 * upcalls are received by polling this dpif -- confirm against the
 * dpif-netlink implementation. */
const struct dpif_class dpif_netlink_class = {
"system",
NULL, /* init */
dpif_netlink_enumerate,
NULL, /* presumably port_open_type, judging by slot position -- confirm */
dpif_netlink_open,
dpif_netlink_close,
dpif_netlink_destroy,
dpif_netlink_run,
NULL, /* wait */
dpif_netlink_get_stats,
dpif_netlink_port_add,
dpif_netlink_port_del,
NULL, /* port_set_config */
dpif_netlink_port_query_by_number,
dpif_netlink_port_query_by_name,
dpif_netlink_port_get_pid,
dpif_netlink_port_dump_start,
dpif_netlink_port_dump_next,
dpif_netlink_port_dump_done,
dpif_netlink_port_poll,
dpif_netlink_port_poll_wait,
dpif_netlink_flow_flush,
dpif_netlink_flow_dump_create,
dpif_netlink_flow_dump_destroy,
dpif_netlink_flow_dump_thread_create,
dpif_netlink_flow_dump_thread_destroy,
dpif_netlink_flow_dump_next,
dpif_netlink_operate,
dpif_netlink_recv_set,
dpif_netlink_handlers_set,
NULL, /* set_config */
dpif_netlink_queue_to_priority,
dpif_netlink_recv,
dpif_netlink_recv_wait,
dpif_netlink_recv_purge,
NULL, /* register_dp_purge_cb */
NULL, /* register_upcall_cb */
NULL, /* enable_upcall */
NULL, /* disable_upcall */
dpif_netlink_get_datapath_version, /* get_datapath_version */
dpif_netlink_ct_dump_start,
dpif_netlink_ct_dump_next,
dpif_netlink_ct_dump_done,
dpif_netlink_ct_flush,
NULL, /* ct_set_maxconns */
NULL, /* ct_get_maxconns */
NULL, /* ct_get_nconns */
dpif_netlink_ct_set_limits,
dpif_netlink_ct_get_limits,
dpif_netlink_ct_del_limits,
dpif_netlink_meter_get_features,
dpif_netlink_meter_set,
dpif_netlink_meter_get,
dpif_netlink_meter_del,
};
ofproto_dpif_class
/* The ofproto provider instance built on top of dpif.
 *
 * The initializer is positional: each entry fills the next member of
 * 'struct ofproto_class' in declaration order, so entries must not be
 * reordered, added or removed without changing the struct to match.  A NULL
 * entry leaves that member unset; the trailing comment names the member it
 * stands for.
 *
 * NOTE(review): this table has more entries than the abridged
 * 'struct ofproto_class' excerpt shown earlier in this file (for example
 * 'port_reconfigured' has no counterpart there), so the two listings cannot be
 * matched slot-for-slot as printed -- check against the full struct
 * definition. */
const struct ofproto_class ofproto_dpif_class = {
init,
enumerate_types,
enumerate_names,
del,
port_open_type,
type_run,
type_wait,
alloc,
construct,
destruct,
dealloc,
run,
ofproto_dpif_wait,
NULL, /* get_memory_usage. */
type_get_memory_usage,
flush,
query_tables,
set_tables_version,
port_alloc,
port_construct,
port_destruct,
port_dealloc,
port_modified,
port_reconfigured,
port_query_by_name,
port_add,
port_del,
port_set_config,
port_get_stats,
port_dump_start,
port_dump_next,
port_dump_done,
port_poll,
port_poll_wait,
port_is_lacp_current,
port_get_lacp_stats,
NULL, /* rule_choose_table */
rule_alloc,
rule_construct,
rule_insert,
NULL, /* rule_delete */
rule_destruct,
rule_dealloc,
rule_get_stats,
packet_xlate,
packet_xlate_revert,
packet_execute,
set_frag_handling,
nxt_resume,
set_netflow,
get_netflow_ids,
set_sflow,
set_ipfix,
get_ipfix_stats,
set_cfm,
cfm_status_changed,
get_cfm_status,
set_lldp,
get_lldp_status,
set_aa,
aa_mapping_set,
aa_mapping_unset,
aa_vlan_get_queued,
aa_vlan_get_queue_size,
set_bfd,
bfd_status_changed,
get_bfd_status,
set_stp,
get_stp_status,
set_stp_port,
get_stp_port_status,
get_stp_port_stats,
set_rstp,
get_rstp_status,
set_rstp_port,
get_rstp_port_status,
set_queues,
bundle_set,
bundle_remove,
mirror_set__,
mirror_get_stats__,
set_flood_vlans,
is_mirror_output_bundle,
forward_bpdu_changed,
set_mac_table_config,
set_mcast_snooping,
set_mcast_snooping_port,
meter_get_features,
meter_set,
meter_get,
meter_del,
group_alloc, /* group_alloc */
group_construct, /* group_construct */
group_destruct, /* group_destruct */
group_dealloc, /* group_dealloc */
NULL, /* group_modify */
group_get_stats, /* group_get_stats */
get_datapath_version, /* get_datapath_version */
type_set_config,
ct_flush, /* ct_flush */
};
《How to Port Open vSwitch to New Software or Hardware》