ovs-vsctl 这个应用程序的主要职责是根据用户的命令和 ovsdb 沟通,将配置信息更新到数据库中;而 vswitchd 会在需要重新配置的时候和 ovsdb 打交道,而后和内核 datapath 通信执行真正的动作(通过 netlink 传递)。这里规定了命令的语法格式(vsctl_command_syntax)以及所支持的所有命令,这里主要看 add-port 相关的。OVS_VPORT_CMD_NEW 到达内核后的处理流程图见图 1。
图 1
/* Syntax and callbacks for one vsctl command: its name, bounds on its
 * argument count, the callbacks that implement it, and the options it
 * accepts.  (The stray garbled "??);" token from the transcription has been
 * removed; it made the struct invalid C.) */
struct
vsctl_command_syntax {
const char *name; /* e.g. "add-br" */
int min_args; /* Min number of arguments following name. */
int max_args; /* Max number of arguments following name. */
/* If nonnull, calls ovsdb_idl_add_column() or ovsdb_idl_add_table() to
 * pre-register the columns/tables that run() will read or write, so the
 * IDL replicates them before the command executes. */
void (*prerequisites)(struct vsctl_context *ctx);
/* Runs the command's transaction and stores any output in ctx->output or
 * ctx->table.  If one of the command's prerequisites is not yet satisfied,
 * sets ctx->try_again = true so the command is retried later (currently
 * only "wait-until" does this). */
void (*run)(struct vsctl_context *ctx);
/* If nonnull, called after the transaction commits, for post-processing;
 * ctx->output is the output left by run().  Currently only "create" has
 * post-processing. */
void (*postprocess)(struct vsctl_context *ctx);
const char *options; /* Comma-separated options, e.g. "--a,--b". */
enum { RO, RW } mode; /* Whether the command may modify the OVSDB. */
};
/* Table of every command ovs-vsctl understands, terminated by an all-NULL
 * sentinel row.  Each row gives the command name, its min/max argument
 * counts, its prerequisites/run/postprocess callbacks, the comma-separated
 * options it accepts, and whether it is read-only (RO) or may modify the
 * database (RW). */
static const struct vsctl_command_syntax
all_commands[] = {
/* Open vSwitch commands. */
{"init", 0, 0, NULL, cmd_init, NULL, "", RW},
{"show", 0, 0, pre_cmd_show, cmd_show, NULL, "", RO},
/* Bridge commands. */
{"add-br", 1, 3, pre_get_info, cmd_add_br, NULL, "--may-exist", RW},
{"del-br", 1, 1, pre_get_info, cmd_del_br, NULL, "--if-exists", RW},
{"list-br", 0, 0, pre_get_info, cmd_list_br, NULL, "--real,--fake", RO},
{"br-exists", 1, 1, pre_get_info, cmd_br_exists, NULL, "", RO},
{"br-to-vlan", 1, 1, pre_get_info, cmd_br_to_vlan, NULL, "", RO},
{"br-to-parent", 1, 1, pre_get_info, cmd_br_to_parent, NULL, "", RO},
{"br-set-external-id", 2, 3, pre_cmd_br_set_external_id, cmd_br_set_external_id, NULL, "", RW},
{"br-get-external-id", 1, 2, pre_cmd_br_get_external_id, cmd_br_get_external_id, NULL, "", RO},
/* Port commands. */
{"list-ports", 1, 1, pre_get_info, cmd_list_ports, NULL, "", RO},
{"add-port", 2, INT_MAX, pre_get_info, cmd_add_port, NULL, "--may-exist", RW},
{"add-bond", 4, INT_MAX, pre_get_info, cmd_add_bond, NULL, "--may-exist,--fake-iface", RW},
{"del-port", 1, 2, pre_get_info, cmd_del_port, NULL, "--if-exists,--with-iface", RW},
{"port-to-br", 1, 1, pre_get_info, cmd_port_to_br, NULL, "", RO},
/* Interface commands. */
{"list-ifaces", 1, 1, pre_get_info, cmd_list_ifaces, NULL, "", RO},
{"iface-to-br", 1, 1, pre_get_info, cmd_iface_to_br, NULL, "", RO},
/* Controller commands. */
{"get-controller", 1, 1, pre_controller, cmd_get_controller, NULL, "", RO},
{"del-controller", 1, 1, pre_controller, cmd_del_controller, NULL, "", RW},
{"set-controller", 1, INT_MAX, pre_controller, cmd_set_controller, NULL,
"", RW},
{"get-fail-mode", 1, 1, pre_get_info, cmd_get_fail_mode, NULL, "", RO},
{"del-fail-mode", 1, 1, pre_get_info, cmd_del_fail_mode, NULL, "", RW},
{"set-fail-mode", 2, 2, pre_get_info, cmd_set_fail_mode, NULL, "", RW},
/* Manager commands. */
{"get-manager", 0, 0, pre_manager, cmd_get_manager, NULL, "", RO},
{"del-manager", 0, 0, pre_manager, cmd_del_manager, NULL, "", RW},
{"set-manager", 1, INT_MAX, pre_manager, cmd_set_manager, NULL, "", RW},
/* SSL commands. */
{"get-ssl", 0, 0, pre_cmd_get_ssl, cmd_get_ssl, NULL, "", RO},
{"del-ssl", 0, 0, pre_cmd_del_ssl, cmd_del_ssl, NULL, "", RW},
{"set-ssl", 3, 3, pre_cmd_set_ssl, cmd_set_ssl, NULL, "--bootstrap", RW},
/* Switch commands. */
{"emer-reset", 0, 0, pre_cmd_emer_reset, cmd_emer_reset, NULL, "", RW},
/* Database commands. */
{"comment", 0, INT_MAX, NULL, NULL, NULL, "", RO},
{"get", 2, INT_MAX, pre_cmd_get, cmd_get, NULL, "--if-exists,--id=", RO},
{"list", 1, INT_MAX, pre_cmd_list, cmd_list, NULL, "--columns=", RO},
{"find", 1, INT_MAX, pre_cmd_find, cmd_find, NULL, "--columns=", RO},
{"set", 3, INT_MAX, pre_cmd_set, cmd_set, NULL, "", RW},
{"add", 4, INT_MAX, pre_cmd_add, cmd_add, NULL, "", RW},
{"remove", 4, INT_MAX, pre_cmd_remove, cmd_remove, NULL, "", RW},
{"clear", 3, INT_MAX, pre_cmd_clear, cmd_clear, NULL, "", RW},
{"create", 2, INT_MAX, pre_create, cmd_create, post_create, "--id=", RW},
{"destroy", 1, INT_MAX, pre_cmd_destroy, cmd_destroy, NULL, "--if-exists,--all", RW},
{"wait-until", 2, INT_MAX, pre_cmd_wait_until, cmd_wait_until, NULL, "", RO},
/* Sentinel marking the end of the table. */
{NULL, 0, 0, NULL, NULL, NULL, NULL, RO},
};
/* Implements "ovs-vsctl add-port BRIDGE PORT [COLUMN[:KEY]=VALUE]...":
 * argv[1] is the bridge name, argv[2] the port/interface name, and any
 * remaining arguments are column settings applied to the new Port record. */
static void
cmd_add_port(struct vsctl_context *ctx) {
bool may_exist = shash_find(&ctx->options, "--may-exist") != NULL;
/* Checks whether the "--may-exist" option is present in ctx->options. */
add_port(ctx, ctx->argv[1], ctx->argv[2], may_exist, false, &ctx->argv[2], 1, &ctx->argv[3], ctx->argc - 3);
}
/* Creates a Port record named 'port_name' on bridge 'br_name', containing
 * the 'n_ifaces' Interface records named in 'iface_names', and applies the
 * 'n_settings' column settings in 'settings'.  All records are added to the
 * OVSDB transaction in ctx->txn.
 *
 * If 'may_exist' is true and an identical port (same name, same bridge,
 * same interface set) already exists, returns without doing anything; a
 * conflicting pre-existing port is a fatal error (vsctl_fatal() exits). */
static void
add_port(struct vsctl_context *ctx, const char *br_name, const char *port_name, bool may_exist,
bool fake_iface, char *iface_names[], int n_ifaces, char *settings[], int n_settings) {
struct vsctl_port *vsctl_port;
struct vsctl_bridge *bridge;
struct ovsrec_interface **ifaces;
struct ovsrec_port *port;
size_t i;
vsctl_context_populate_cache(ctx);
if (may_exist) {
/* NOTE(review): this declaration shadows the outer 'vsctl_port'. */
struct vsctl_port *vsctl_port;
vsctl_port = find_port(ctx, port_name, false);
if (vsctl_port) {
/* The port exists.  Verify it matches exactly what was requested:
 * compare the requested interface names against the existing ones
 * (both sorted so order does not matter). */
struct svec want_names, have_names;
svec_init(&want_names);
for (i = 0; i < n_ifaces; i++) {
svec_add(&want_names, iface_names[i]);
}
svec_sort(&want_names);
svec_init(&have_names);
for (i = 0; i < vsctl_port->port_cfg->n_interfaces; i++) {
svec_add(&have_names, vsctl_port->port_cfg->interfaces[i]->name);
}
svec_sort(&have_names);
/* Fatal if the port is attached to a different bridge. */
if (strcmp(vsctl_port->bridge->name, br_name)) {
char *command = vsctl_context_to_string(ctx);
vsctl_fatal("\"%s\" but %s is actually attached to bridge %s", command, port_name, vsctl_port->bridge->name);
}
/* Fatal if the port's interface set differs from the request. */
if (!svec_equal(&want_names, &have_names)) {
char *have_names_string = svec_join(&have_names, ", ", "");
char *command = vsctl_context_to_string(ctx);
vsctl_fatal("\"%s\" but %s actually has interface(s) %s",command, port_name, have_names_string);
}
svec_destroy(&want_names);
svec_destroy(&have_names);
/* Identical port already exists; nothing to do. */
return;
}
}
/* Make sure neither the port name nor any interface name collides with
 * an existing record. */
check_conflicts(ctx, port_name, xasprintf("cannot create a port named %s", port_name));
for (i = 0; i < n_ifaces; i++) {
check_conflicts(ctx, iface_names[i], xasprintf("cannot create an interface named %s", iface_names[i]));
}
bridge = find_bridge(ctx, br_name, true);
/* Insert the Interface rows, then the Port row referencing them. */
ifaces = xmalloc(n_ifaces * sizeof *ifaces);
for (i = 0; i < n_ifaces; i++) {
ifaces[i] = ovsrec_interface_insert(ctx->txn);
ovsrec_interface_set_name(ifaces[i], iface_names[i]);
}
port = ovsrec_port_insert(ctx->txn);
ovsrec_port_set_name(port, port_name);
ovsrec_port_set_interfaces(port, ifaces, n_ifaces);
ovsrec_port_set_bond_fake_iface(port, fake_iface);
/* A "fake" bridge is really a VLAN on its parent; tag the port. */
if (bridge->parent) {
int64_t tag = bridge->vlan;
ovsrec_port_set_tag(port, &tag, 1);
}
for (i = 0; i < n_settings; i++) {
set_column(get_table("Port"), &port->header_, settings[i], ctx->symtab);
}
/* Attach the port to the real (parent) bridge record and refresh the
 * local cache so later commands in the same run see it. */
bridge_insert_port((bridge->parent ? bridge->parent->br_cfg : bridge->br_cfg), port);
vsctl_port = add_port_to_cache(ctx, bridge, port);
for (i = 0; i < n_ifaces; i++) {
add_iface_to_cache(ctx, vsctl_port, ifaces[i]);
}
free(ifaces);
}
void ovsrec_port_set_name(const struct ovsrec_port *row, const char *name) {
struct ovsdb_datum datum;
assert(inited);
datum.n = 1;
datum.keys = xmalloc(sizeof *datum.keys);
datum.keys[0].string = xstrdup(name);
datum.values = NULL;
ovsdb_idl_txn_write(&row->header_, &ovsrec_port_columns[OVSREC_PORT_COL_NAME], &datum);
}
上面都是db相关操作,当更新到ovsdb后这个应用程序的任务就完成了,接下来就是守护进程合适的时候响应这些新的配置。
函数 ofproto_port_add(ofproto/ofproto.c line 1353)将netdev (struct netdev是一个open network device实体,lib/netdev-provider.h)增加为ofproto 的端口,如果成功的话 ofp_portp更新为新端口的openflow port number;
疑问:何时根据ovsdb来调用这里的ofproto_port_add 呢?
/* Attempts to add 'netdev' as a port of 'ofproto'.  If 'ofp_portp' is
 * nonnull, stores the new port's OpenFlow port number there on success and
 * OFPP_NONE on failure.  Returns 0 on success, a positive errno value on
 * failure. */
int
ofproto_port_add(struct ofproto *ofproto, struct netdev *netdev, uint16_t *ofp_portp) {
uint16_t new_ofp_port;
int error = ofproto->ofproto_class->port_add(ofproto, netdev, &new_ofp_port);
if (!error) {
/* Refresh ofproto's view of the device we just attached. */
update_port(ofproto, netdev_get_name(netdev));
}
if (ofp_portp != NULL) {
*ofp_portp = error ? OFPP_NONE : new_ofp_port;
}
return error;
}
函数 port_add(ofproto/ofproto-dpif.c) 与 dpif_port_add(lib/dpif.c) 将 netdev 增设为 dpif(struct dpif 是 ovs datapath 接口,见 dpif-provider.h)的一个端口,接下来会执行具体的 struct dpif_class dpif_linux_class 的 port_add 函数。
/* ofproto-dpif's implementation of the port_add hook: attaches 'netdev' to
 * the underlying dpif, then translates the assigned datapath port number
 * into an OpenFlow port number for the caller. */
static int
port_add(struct ofproto *ofproto_, struct netdev *netdev, uint16_t *ofp_portp) {
struct ofproto_dpif *ofproto = ofproto_dpif_cast(ofproto_);
uint16_t odp_port = UINT16_MAX;
int error = dpif_port_add(ofproto->dpif, netdev, &odp_port);
if (error) {
return error;
}
*ofp_portp = odp_port_to_ofp_port(odp_port);
return 0;
}
/* Adds 'netdev' as a port of 'dpif'.  If 'port_nop' is nonnull, '*port_nop'
 * may request a specific datapath port number on entry (UINT16_MAX for
 * "any"); on return it holds the assigned number, or UINT16_MAX on failure.
 * Returns 0 on success, a positive errno value otherwise. */
int
dpif_port_add(struct dpif *dpif, struct netdev *netdev, uint16_t *port_nop) {
const char *netdev_name = netdev_get_name(netdev);
uint16_t port_no = port_nop ? *port_nop : UINT16_MAX;
int error;
COVERAGE_INC(dpif_port_add); /* operation counter */
error = dpif->dpif_class->port_add(dpif, netdev, &port_no);
if (error) {
VLOG_WARN_RL(&error_rl, "%s: failed to add %s as port: %s",dpif_name(dpif), netdev_name, strerror(error));
port_no = UINT16_MAX;
} else {
VLOG_DBG_RL(&dpmsg_rl, "%s: added %s as port %"PRIu16,dpif_name(dpif), netdev_name, port_no);
}
if (port_nop) {
*port_nop = port_no;
}
return error;
}
当打开一个已存在的datapath或者创建并打开一个datapath都会调用 lib/dpif.c:do_open()函数,其中会调用 dp_initialize() 来注册这两个dpif_class 到 registered_dpif_class 中,然后调用其open函数,这里就是dpif_linux_open,主要功能是查看那4个generic family是否存在,并且创建netlink 套接字等等netlink 用户空间的准备工作; 这里会调用 dpif_linux_class里面的port_add handler,构造相应的nlmsg发送到内核空间。
/* Opens (and, if 'create' is true, creates) the datapath named 'name' of
 * the given 'type'.  On success stores the new dpif in '*dpifp' and bumps
 * the provider's refcount; on failure stores NULL.  Returns 0 or a
 * positive errno value. */
static int
do_open (const char *name, const char *type, bool create, struct dpif **dpifp)
{
struct registered_dpif_class *reg_class;
struct dpif *dpif = NULL;
int error;
dp_initialize();
type = dpif_normalize_type(type);
/* Look up the provider registered for this datapath type. */
reg_class = shash_find_data(&dpif_classes, type);
if (!reg_class) {
VLOG_WARN("could not create datapath %s of unknown type %s", name,type);
*dpifp = NULL;
return EAFNOSUPPORT;
}
error = reg_class->dpif_class->open(reg_class->dpif_class,name, create, &dpif);
if (!error) {
assert(dpif->dpif_class == reg_class->dpif_class);
reg_class->refcount++;
}
*dpifp = error ? NULL : dpif;
return error;
}
/* The dpif providers compiled into this build: the Linux kernel datapath
 * (only when LINUX_DATAPATH is defined) and the userspace "netdev"
 * datapath. */
static const struct dpif_class *
base_dpif_classes[] = {
#ifdef LINUX_DATAPATH
&dpif_linux_class,
#endif
&dpif_netdev_class,
};
/* Registers every provider in base_dpif_classes[] exactly once; subsequent
 * calls are no-ops. */
static void
dp_initialize(void)
{
static int status = -1;
int i;
if (status >= 0) {
return; /* already initialized */
}
status = 0;
for (i = 0; i < ARRAY_SIZE(base_dpif_classes); i++) {
dp_register_provider(base_dpif_classes[i]);
}
}
/* Registers a new datapath provider. After successful registration, new
* datapaths of that type can be opened using dpif_open(). */
int
dp_register_provider(const struct dpif_class *new_class) {
struct registered_dpif_class *registered_class;
registered_class = xmalloc(sizeof *registered_class);
registered_class->dpif_class = new_class;
registered_class->refcount = 0;
shash_add(&dpif_classes, new_class->type, registered_class); //增加到hash表中;
return 0;
}
------------lib/dpif-linux.c
/* The dpif provider for the Linux kernel datapath.  Its type name is
 * "system"; all operations are implemented by sending generic netlink
 * messages to the openvswitch kernel module. */
const struct dpif_class
dpif_linux_class = {
"system",
dpif_linux_enumerate,
dpif_linux_open,
dpif_linux_close,
dpif_linux_destroy,
dpif_linux_run,
dpif_linux_wait,
dpif_linux_get_stats,
dpif_linux_port_add,
dpif_linux_port_del,
dpif_linux_port_query_by_number,
dpif_linux_port_query_by_name,
dpif_linux_get_max_ports,
dpif_linux_port_get_pid,
dpif_linux_port_dump_start,
dpif_linux_port_dump_next,
dpif_linux_port_dump_done,
dpif_linux_port_poll,
dpif_linux_port_poll_wait,
dpif_linux_flow_get,
dpif_linux_flow_put,
dpif_linux_flow_del,
dpif_linux_flow_flush,
dpif_linux_flow_dump_start,
dpif_linux_flow_dump_next,
dpif_linux_flow_dump_done,
dpif_linux_execute,
dpif_linux_operate,
dpif_linux_recv_set,
dpif_linux_queue_to_priority,
dpif_linux_recv,
dpif_linux_recv_wait,
dpif_linux_recv_purge,
};
/* Opens the kernel datapath named 'name', creating it first if 'create' is
 * true.  On success stores a new dpif in '*dpifp' and returns 0; otherwise
 * returns a positive errno value. */
static int
dpif_linux_open(const struct dpif_class *class OVS_UNUSED, const char *name, bool create, struct dpif **dpifp)
{
struct dpif_linux_dp dp_request, dp;
struct ofpbuf *buf;
uint32_t upcall_pid;
int error;
/* Make sure the genl families are resolved and genl_sock exists. */
error =
dpif_linux_init();
if (error) {
return error;
}
dpif_linux_dp_init(&dp_request); /* zeroes the request */
if (create) {
dp_request.cmd = OVS_DP_CMD_NEW;
/* upcall_pid 0 means "no upcalls" until recv_set enables them. */
upcall_pid = 0;
dp_request.upcall_pid = &upcall_pid;
} else {
dp_request.cmd = OVS_DP_CMD_GET;
}
dp_request.name = name;
error =
dpif_linux_dp_transact(&dp_request, &dp, &buf);
/* Sends the command (create, or fetch an existing DP) to the kernel via
 * generic netlink and parses the reply into 'dp'. */
if (error) {
return error;
}
/* 'dp' points into 'buf'; open_dpif copies what it needs before we free. */
open_dpif(&dp, dpifp);
ofpbuf_delete(buf);
return 0;
}
---->
/* One-time initialization for the Linux dpif: resolves the four OVS generic
 * netlink families, creates the global genl_sock, and sets up the vport
 * multicast-group notifier.  The result (0 or errno) is cached in a static
 * so later calls return the same value without redoing the work. */
static int
dpif_linux_init(void) {
static int error = -1; /* -1 = not yet attempted */
if (error < 0) {
unsigned int ovs_vport_mcgroup;
error = nl_lookup_genl_family(OVS_DATAPATH_FAMILY,&ovs_datapath_family);
if (!error) {
error = nl_lookup_genl_family(OVS_VPORT_FAMILY, &ovs_vport_family);
}
if (!error) {
error = nl_lookup_genl_family(OVS_FLOW_FAMILY, &ovs_flow_family);
}
if (!error) {
error = nl_lookup_genl_family(OVS_PACKET_FAMILY,&ovs_packet_family);
}
if (!error) {
error =
nl_sock_create(NETLINK_GENERIC, &genl_sock);
/* Creates the netlink socket 'genl_sock' used for all transactions. */
}
if (!error) {
error = nl_lookup_genl_mcgroup(OVS_VPORT_FAMILY, OVS_VPORT_MCGROUP,&ovs_vport_mcgroup,OVS_VPORT_MCGROUP_FALLBACK_ID);
}
if (!error) {
static struct dpif_linux_vport vport;
nln = nln_create(NETLINK_GENERIC, ovs_vport_mcgroup, dpif_linux_nln_parse, &vport); /* creates a netlink notifier for vport changes */
}
}
return error;
}
/* Adds 'netdev' as a vport of the kernel datapath behind 'dpif_'.  If
 * '*port_nop' is not UINT16_MAX it requests that specific port number;
 * otherwise successive numbers are tried until the kernel accepts one.
 * On success stores the port number the kernel actually assigned in
 * '*port_nop' (previously the reply's port number was never written back,
 * so auto-allocated adds left '*port_nop' at UINT16_MAX).  Returns 0 or a
 * positive errno value. */
static int
dpif_linux_port_add(struct dpif *dpif_, struct netdev *netdev, uint16_t *port_nop) {
struct dpif_linux *dpif = dpif_linux_cast(dpif_);
const char *name = netdev_get_name(netdev);
const char *type = netdev_get_type(netdev);
struct dpif_linux_vport request, reply;
const struct ofpbuf *options;
struct ofpbuf *buf;
int error, i = 0, max_ports = MAX_PORTS;
dpif_linux_vport_init(&request); /* prepare the request to send to the kernel */
request.cmd = OVS_VPORT_CMD_NEW;
request.dp_ifindex = dpif->dp_ifindex;
request.type = netdev_vport_get_vport_type(netdev);
request.name = name;
options = netdev_vport_get_options(netdev);
if (options && options->size) {
request.options = options->data;
request.options_len = options->size;
}
if (request.type == OVS_VPORT_TYPE_NETDEV) {
/* LRO-merged frames confuse forwarding; turn LRO off on the device. */
netdev_linux_ethtool_set_flag(netdev, ETH_FLAG_LRO, "LRO", false);
}
/* Unless a specific port was requested, loop until we find a port
 * that isn't used. */
do {
uint32_t upcall_pid;
request.port_no = *port_nop != UINT16_MAX ? *port_nop : ++dpif->alloc_port_no;
upcall_pid = dpif_linux_port_get_pid(dpif_, request.port_no);
request.upcall_pid = &upcall_pid;
error =
dpif_linux_vport_transact(&request, &reply, &buf);
if (!error) {
/* Report the kernel-assigned port number to the caller.  Read it
 * before deleting 'buf': port_no was copied by value out of the
 * reply message. */
*port_nop = reply.port_no;
}
ofpbuf_delete(buf);
} while ((*port_nop == UINT16_MAX) && (i++ < max_ports)&& (error == EBUSY || error == EFBIG));
return error;
}
在datapath的内核层处理 request:dpif_linux_dp_to_ofpbuf 通过nl_msg_put_* 将request中描述的netlink msg增加到ofpbuf中,而后通过genl_sock发送到kernel 等待回复,成功后bufp指向的是响应消息体,然后从中解析到 reply 中。
/* Executes 'request' against the kernel vport family over genl_sock.  If
 * 'reply' and 'bufp' are nonnull (they must be both set or both null), on
 * success '*bufp' holds the raw reply message and 'reply' is parsed from it;
 * 'reply' points into '*bufp', so the caller must not free '*bufp' while it
 * still uses 'reply'.  On failure 'reply' is zeroed and '*bufp' is NULL.
 * Returns 0 or a positive errno value. */
int
dpif_linux_vport_transact(const struct dpif_linux_vport *request, struct dpif_linux_vport *reply, struct ofpbuf **bufp) {
struct ofpbuf *request_buf;
int error;
assert((reply != NULL) == (bufp != NULL));
error =
dpif_linux_init();
if (error) {
if (reply) {
*bufp = NULL;
dpif_linux_vport_init(reply);
}
return error;
}
/* Serialize the request into a netlink message and send it. */
request_buf = ofpbuf_new(1024);
dpif_linux_vport_to_ofpbuf(request, request_buf);
error =
nl_sock_transact(genl_sock, request_buf, bufp);
ofpbuf_delete(request_buf);
if (reply) {
if (!error) {
error =
dpif_linux_vport_from_ofpbuf(reply, *bufp);
}
if (error) {
/* Parsing failed (or transact failed): leave a clean, empty reply. */
dpif_linux_vport_init(reply);
ofpbuf_delete(*bufp);
*bufp = NULL;
}
}
return error;
}
----> lib/netlink-socket.c:nl_sock_transact通过sock发送请求到内核空间等待回复,
int
nl_sock_transact(struct nl_sock *sock, const struct ofpbuf *request, struct ofpbuf **replyp) {
struct nl_transaction *transactionp;
struct nl_transaction transaction;
transaction.request = CONST_CAST(struct ofpbuf *, request);
transaction.reply = replyp ? ofpbuf_new(1024) : NULL;
transactionp = &transaction;
nl_sock_transact_multiple(sock, &transactionp, 1);
if (replyp) {
if (transaction.error) {
ofpbuf_delete(transaction.reply);
*replyp = NULL;
} else {
*replyp = transaction.reply;
}
}
return transaction.error;
}
当成功收到内核的回复之后 dpif_linux_vport_from_ofpbuf 将 ofpbuf 中的内容按 netlink attribute 的方式(消息结构是 nlmsghdr | genlmsghdr | ovs_header | attributes)解析到 vport 中。dpif_linux_vport 中有指向 ofpbuf 的指针,所以当 vport 还在使用时调用者要确保不释放 buf。此时得到的 vport 后来如何被使用的呢?
/* Parses the vport netlink reply in 'buf' (layout: nlmsghdr | genlmsghdr |
 * ovs_header | attributes) into 'vport'.  String/blob fields of 'vport'
 * point directly into 'buf', so 'buf' must outlive 'vport'.  Returns 0 on
 * success or EINVAL if the message is malformed or not a vport message. */
static int
dpif_linux_vport_from_ofpbuf(struct dpif_linux_vport *vport, const struct ofpbuf *buf) {
/* Attribute policy: which attrs are expected, their types and sizes. */
static const struct nl_policy ovs_vport_policy[] = {
[OVS_VPORT_ATTR_PORT_NO] = { .type = NL_A_U32 },
[OVS_VPORT_ATTR_TYPE] = { .type = NL_A_U32 },
[OVS_VPORT_ATTR_NAME] = { .type = NL_A_STRING, .max_len = IFNAMSIZ },
[OVS_VPORT_ATTR_UPCALL_PID] = { .type = NL_A_U32 },
[OVS_VPORT_ATTR_STATS] = { NL_POLICY_FOR(struct ovs_vport_stats), .optional = true },
[OVS_VPORT_ATTR_ADDRESS] = { .type = NL_A_UNSPEC, .min_len = ETH_ADDR_LEN,
.max_len = ETH_ADDR_LEN, .optional = true },
[OVS_VPORT_ATTR_OPTIONS] = { .type = NL_A_NESTED, .optional = true },
};
struct nlattr *a[ARRAY_SIZE(ovs_vport_policy)];
struct ovs_header *ovs_header;
struct nlmsghdr *nlmsg;
struct genlmsghdr *genl;
struct ofpbuf b;
dpif_linux_vport_init(vport);
/* Walk the three fixed headers, then policy-check the attributes. */
ofpbuf_use_const(&b, buf->data, buf->size);
nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg);
genl = ofpbuf_try_pull(&b, sizeof *genl);
ovs_header = ofpbuf_try_pull(&b, sizeof *ovs_header);
if (!nlmsg || !genl || !ovs_header
|| nlmsg->nlmsg_type != ovs_vport_family
|| !nl_policy_parse(&b, 0, ovs_vport_policy, a,
ARRAY_SIZE(ovs_vport_policy))) {
return EINVAL;
}
vport->cmd = genl->cmd;
vport->dp_ifindex = ovs_header->dp_ifindex;
vport->port_no = nl_attr_get_u32(a[OVS_VPORT_ATTR_PORT_NO]);
vport->type = nl_attr_get_u32(a[OVS_VPORT_ATTR_TYPE]);
/* The following fields alias data inside 'buf' (not copied). */
vport->name = nl_attr_get_string(a[OVS_VPORT_ATTR_NAME]);
if (a[OVS_VPORT_ATTR_UPCALL_PID]) {
vport->upcall_pid = nl_attr_get(a[OVS_VPORT_ATTR_UPCALL_PID]);
}
if (a[OVS_VPORT_ATTR_STATS]) {
vport->stats = nl_attr_get(a[OVS_VPORT_ATTR_STATS]);
}
if (a[OVS_VPORT_ATTR_ADDRESS]) {
vport->address = nl_attr_get(a[OVS_VPORT_ATTR_ADDRESS]);
}
if (a[OVS_VPORT_ATTR_OPTIONS]) {
vport->options = nl_attr_get(a[OVS_VPORT_ATTR_OPTIONS]);
vport->options_len = nl_attr_get_size(a[OVS_VPORT_ATTR_OPTIONS]);
}
return 0;
}
在datapath模块启动的时候会注册一些generic netlink family。解析并执行从用户空间传来的指令,这里是
OVS_VPORT_CMD_NEW,就会呼叫对应的operation。
/* Registers every generic netlink family in dp_genl_families[] together
 * with its operations and, where present, its multicast group.  On failure
 * unregisters the families registered so far and returns the error.
 * (The 'error' label was missing from this excerpt; without it the two
 * 'goto error' statements do not compile.) */
static int
dp_register_genl(void) {
int n_registered;
int err;
int i;
n_registered = 0;
for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) {
const struct genl_family_and_ops *f = &dp_genl_families[i];
err =
genl_register_family_with_ops(f->family, f->ops,f->n_ops);
if (err)
goto error;
n_registered++;
if (f->group) {
err = genl_register_mc_group(f->family, f->group);
if (err)
goto error;
}
}
return 0;

error:
/* Roll back the families that were successfully registered. */
dp_unregister_genl(n_registered);
return err;
}
/* The four OVS generic netlink families (datapath, vport, flow, packet),
 * each with its operation table and optional multicast group (the packet
 * family has none). */
static const struct genl_family_and_ops
dp_genl_families[] = {
{ &dp_datapath_genl_family, dp_datapath_genl_ops, ARRAY_SIZE(dp_datapath_genl_ops), &ovs_dp_datapath_multicast_group },
{ &dp_vport_genl_family, dp_vport_genl_ops, ARRAY_SIZE(dp_vport_genl_ops), &ovs_dp_vport_multicast_group },
{ &dp_flow_genl_family, dp_flow_genl_ops, ARRAY_SIZE(dp_flow_genl_ops), &ovs_dp_flow_multicast_group },
{ &dp_packet_genl_family, dp_packet_genl_ops, ARRAY_SIZE(dp_packet_genl_ops), NULL },
};
/* Generic netlink family for vport commands; the kernel assigns the family
 * id at registration time (GENL_ID_GENERATE).  (Fixed "tatic" -> "static",
 * a transcription typo that made this invalid C.) */
static struct genl_family
dp_vport_genl_family = {
.id = GENL_ID_GENERATE,
.hdrsize = sizeof(struct ovs_header),
.name = OVS_VPORT_FAMILY,
.version = OVS_VPORT_VERSION,
.maxattr = OVS_VPORT_ATTR_MAX,
SET_NETNSOK
};
/* Multicast group on which vport change notifications are broadcast. */
struct genl_multicast_group ovs_dp_vport_multicast_group = {
.name = OVS_VPORT_MCGROUP
};
/* Handlers for the vport family's commands.  OVS_VPORT_CMD_NEW from
 * userspace is dispatched to ovs_vport_cmd_new() below. */
static struct genl_ops
dp_vport_genl_ops[] = {
{ .cmd = OVS_VPORT_CMD_NEW,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = vport_policy,
.doit = ovs_vport_cmd_new
},
{ .cmd = OVS_VPORT_CMD_DEL,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = vport_policy,
.doit = ovs_vport_cmd_del
},
{ .cmd = OVS_VPORT_CMD_GET,
.flags = 0, /* OK for unprivileged users. */
.policy = vport_policy,
.doit = ovs_vport_cmd_get,
.dumpit = ovs_vport_cmd_dump
},
{ .cmd = OVS_VPORT_CMD_SET,
.flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = vport_policy,
.doit = ovs_vport_cmd_set,
},
};
/* Kernel-side handler for OVS_VPORT_CMD_NEW: validates the attributes,
 * finds (or allocates) a free port number on the target datapath, creates
 * the vport, and multicasts a notification of the new port.  Runs under the
 * RTNL lock.  Returns 0 on success or a negative errno value. */
static int
ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr **a = info->attrs;
struct ovs_header *ovs_header = info->userhdr;
struct vport_parms parms;
struct sk_buff *reply;
struct vport *vport;
struct datapath *dp;
u32 port_no;
int err;
/* Name, type and upcall PID are mandatory for creating a vport. */
err = -EINVAL;
if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] || !a[OVS_VPORT_ATTR_UPCALL_PID])
goto exit;
err = ovs_vport_cmd_validate(a);
if (err)
goto exit;
rtnl_lock();
dp =
get_dp(sock_net(skb->sk), ovs_header->dp_ifindex);
/* Looks up the net_device for dp_ifindex, then the vport attached to
 * that device, and returns vport->dp. */
err = -ENODEV;
if (!dp)
goto exit_unlock;
if (a[OVS_VPORT_ATTR_PORT_NO]) {
/* A specific port number was requested: it must be in range and free. */
port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]);
err = -EFBIG;
if (port_no >= DP_MAX_PORTS)
goto exit_unlock;
vport = ovs_vport_rtnl(dp, port_no);
/* ovs_lookup_vport(dp, port_no): returns the vport with that number,
 * if any -- an existing one means the number is taken. */
err = -EBUSY;
if (vport)
goto exit_unlock;
} else { /* No port number requested: take the lowest free one. */
for (port_no = 1; ; port_no++) {
if (port_no >= DP_MAX_PORTS) {
err = -EFBIG;
goto exit_unlock;
}
vport = ovs_vport_rtnl(dp, port_no);
if (!vport)
break;
}
}
/* Fill in the parameter block for the new vport. */
parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]);
parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]);
parms.options = a[OVS_VPORT_ATTR_OPTIONS];
parms.dp = dp;
parms.port_no = port_no;
parms.upcall_portid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]);
vport = new_vport(&parms);
err = PTR_ERR(vport);
if (IS_ERR(vport))
goto exit_unlock;
ovs_dp_sysfs_add_if(vport);
err = change_vport(vport, a);
if (!err) {
/* Build the notification message describing the new vport. */
reply = ovs_vport_cmd_build_info(vport, info->snd_portid,
info->snd_seq,
OVS_VPORT_CMD_NEW);
if (IS_ERR(reply))
err = PTR_ERR(reply);
}
if (err) {
/* Roll back: detach the vport we just created. */
ovs_dp_detach_port(vport);
goto exit_unlock;
}
genl_notify(reply, genl_info_net(info), info->snd_portid,
ovs_dp_vport_multicast_group.id, info->nlhdr, GFP_KERNEL);
exit_unlock:
rtnl_unlock();
exit:
return err;
}
/* Creates a vport from 'parms' and, on success, links it into its
 * datapath's per-port hash table and announces the new link via rtnetlink.
 * Returns the vport or an ERR_PTR-encoded error. */
static struct vport *
new_vport(const struct vport_parms *parms)
{
struct vport *vport = ovs_vport_add(parms);
if (IS_ERR(vport)) {
return vport; /* propagate the encoded error */
}
hlist_add_head_rcu(&vport->dp_hash_node,
vport_hash_bucket(parms->dp, vport->port_no));
dp_ifinfo_notify(RTM_NEWLINK, vport);
return vport;
}
函数 ovs_vport_add() 是内核真正增加 vport 设备的地方(datapath/vport.c):根据要添加端口的参数从 vport_ops_list 中找到对应的类型;parms 是要新增的 vport 的信息。有了具体设备的类型后,就调用相应的 create 方法(这里是 ovs_netdev_vport_ops)。
/* Creates a vport of the type named in 'parms' by dispatching to the
 * matching vport_ops' create() and inserts it into the global name-hash.
 * Must be called with the RTNL lock held.  Returns the vport or an
 * ERR_PTR-encoded error (-EAFNOSUPPORT for an unknown type). */
struct vport *
ovs_vport_add(const struct vport_parms *parms)
{
struct vport *vport;
int err = 0;
int i;
ASSERT_RTNL(); /* caller must hold the RTNL lock */
for (i = 0; i < n_vport_types; i++) {
if (vport_ops_list[i]->type == parms->type) {
struct hlist_head *bucket;
/* ovs_vport_init() copies base_vport_ops_list[] into
 * vport_ops_list at module load time. */
vport = vport_ops_list[i]->create(parms);
if (IS_ERR(vport)) {
err = PTR_ERR(vport);
goto out;
}
/* Index the new vport by name for lookups. */
bucket = hash_bucket(ovs_dp_get_net(vport->dp), vport->ops->get_name(vport));
hlist_add_head_rcu(&vport->hash_node, bucket);
return vport;
}
}
err = -EAFNOSUPPORT;
out:
return ERR_PTR(err);
}
/* The built-in vport types; copied into vport_ops_list by ovs_vport_init().
 * CAPWAP support requires kernel >= 2.6.26. */
static const struct vport_ops *
base_vport_ops_list[] = {
&ovs_netdev_vport_ops,
&ovs_internal_vport_ops,
&ovs_patch_vport_ops,
&ovs_gre_vport_ops,
&ovs_gre_ft_vport_ops,
&ovs_gre64_vport_ops,
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
&ovs_capwap_vport_ops,
#endif
};
/* vport_ops for attaching an existing Linux network device (type
 * OVS_VPORT_TYPE_NETDEV) to a datapath. */
const struct vport_ops
ovs_netdev_vport_ops = {
.type = OVS_VPORT_TYPE_NETDEV,
.flags = VPORT_F_REQUIRED,
.init = netdev_init,
.exit = netdev_exit,
.create = netdev_create,
.destroy = netdev_destroy,
.set_addr = ovs_netdev_set_addr,
.get_name = ovs_netdev_get_name,
.get_addr = ovs_netdev_get_addr,
.get_kobj = ovs_netdev_get_kobj,
.get_dev_flags = ovs_netdev_get_dev_flags,
.is_running = ovs_netdev_is_running,
.get_operstate = ovs_netdev_get_operstate,
.get_ifindex = ovs_netdev_get_ifindex,
.get_mtu = ovs_netdev_get_mtu,
.send = netdev_send,
};
/* create() callback for netdev vports: looks up the net_device named in
 * 'parms', validates it (no loopback, Ethernet only, not an OVS internal
 * device), registers the rx handler that steals its packets into the
 * datapath, and puts it into promiscuous mode.  Returns the vport or an
 * ERR_PTR-encoded error; on failure all partial state is unwound via the
 * goto chain below. */
static struct vport *
netdev_create(const struct vport_parms *parms)
{
struct vport *vport;
struct netdev_vport *netdev_vport;
int err;
vport = ovs_vport_alloc(sizeof(struct netdev_vport), &ovs_netdev_vport_ops, parms);
if (IS_ERR(vport)) {
err = PTR_ERR(vport);
goto error;
}
netdev_vport = netdev_vport_priv(vport);
/* dev_get_by_name() takes a reference; released on the error_put path. */
netdev_vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), parms->name);
if (!netdev_vport->dev) {
err = -ENODEV;
goto error_free_vport;
}
/* Reject devices that cannot serve as datapath ports. */
if (netdev_vport->dev->flags & IFF_LOOPBACK || netdev_vport->dev->type != ARPHRD_ETHER ||
ovs_is_internal_dev(netdev_vport->dev)) {
err = -EINVAL;
goto error_put;
}
/* netdev_frame_hook will now receive every frame from this device. */
err =
netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook, vport);
if (err)
goto error_put;
dev_set_promiscuity(netdev_vport->dev, 1);
netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH;
return vport;
error_put:
dev_put(netdev_vport->dev);
error_free_vport:
ovs_vport_free(vport);
error:
return ERR_PTR(err);
}
这个时候我们就要处理来自网络接口的数据了,见
packet处理流程。