Linux网络事件通知机制

1. kernel space –> kernel space

1.1. notifier_block原理介绍

linux内核中各个子系统相互依赖,当其中某个子系统状态发生改变时,就必须使用一定的机制告知使用其服务的其他子系统,以便其他子系统采取相应的措施。为满足这样的需求,内核实现了事件通知链机制。

网络子系统的通知链有三个:

  • netdev_chain,表示网络设备状态变化;
  • inetaddr_chain,表示ipv4地址发生变化;
  • inet6addr_chain,表示ipv6地址发生变化;

网络子系统中是由下面三个函数来注册netdev_chain、inetaddr_chain和inet6addr_chain的事件通知处理函数的。

/*
 * Registration entry points for the three networking notifier chains.
 * Each one takes the notifier_block to register; the chains served are
 * netdev_chain, inetaddr_chain and inet6addr_chain respectively.
 */
int register_netdevice_notifier(struct notifier_block *nb);
int register_inetaddr_notifier(struct notifier_block *nb);
int register_inet6addr_notifier(struct notifier_block *nb);

可见上面三个注册函数都有一个关键的结构体,struct notifier_block;

/*
 * One entry in a notifier chain.
 *
 * notifier_call: callback run when an event is raised on the chain; it is
 *                handed this block, the event value and an opaque pointer.
 * next:          link to the following entry, used to walk the chain.
 * priority:      priority of this entry (0 is the usual default).
 */
struct notifier_block {
    int (*notifier_call)(struct notifier_block *, unsigned long, void *);
    struct notifier_block *next;
    int priority;
};
notifier_call: 当相对应事件发生时应该调用的函数;
next: 注册会把nb添加到对应的链表去,上面三个注册函数对应的链表分别为,netdev_chain、inetaddr_chain和inet6addr_chain,next用于遍历链表;
priority: 表示优先级,一般默认为0

例如在ip_fib_init函数中,注册了一个fib_netdev_notifier的网络设备状态变化监听器(netdev_chain),register_netdevice_notifier(&fib_netdev_notifier);
其中fib_netdev_notifier为:

/* Notifier block whose callback is fib_netdev_event; ip_fib_init registers it on netdev_chain. */
static struct notifier_block fib_netdev_notifier = {
    .notifier_call = fib_netdev_event,
};

处理函数是fib_netdev_event,进到fib_netdev_event函数中可以看到对NETDEV_UP、NETDEV_DOWN、NETDEV_CHANGEMTU、NETDEV_CHANGE事件的处理;

注册好事件监听以及处理函数后,是如何触发事件通知的呢?

/*
 * Raise event `val` for `dev` on netdev_chain.
 *
 * val: event value being announced, e.g. NETDEV_UP.
 * dev: network device the event refers to; passed through to the chain.
 *
 * Returns the result of raw_notifier_call_chain().
 *
 * NOTE(review): the upstream kernel appears to spell this function
 * call_netdevice_notifiers() — confirm against the kernel tree in use.
 */
int call_netdevice_notifier(unsigned long val, struct net_device *dev)
{
        /* presumably checks that the caller holds the RTNL lock — verify */
        ASSERT_RTNL();
        return raw_notifier_call_chain(&netdev_chain, val, dev);
}

raw_notifier_call_chain –> __raw_notifier_call_chain –> notifier_call_chain –> nb->notifier_call;
可见是通过call_netdevice_notifier(内核源码中的实际函数名为call_netdevice_notifiers)来遍历netdev_chain的监听器(struct notifier_block *nb),并依次调用其notifier_call,对应到上面那个例子就是fib_netdev_event;

同样,对于inetaddr_chain,通知函数是blocking_notifier_call_chain(&inetaddr_chain, val, v);对于inet6addr_chain,通知函数是inet6addr_notifier_call_chain –> atomic_notifier_call_chain(&inet6addr_chain, val, v);

1.2 常见的网络子系统的事件和通知

netdev_chain通知链,包括的事件有:

  • NETDEV_CHANGENAME,设置网口名称,dev_change_name;
  • NETDEV_FEAT_CHANGE,硬件feature改变,例如聚合,netdev_features_change;
  • NETDEV_CHANGE,载波状态发生改变,linkwatch_do_dev;
  • NETDEV_NOTIFY_PEERS,
  • NETDEV_PRE_UP,
  • NETDEV_UP,打开网口,dev_open;
  • NETDEV_GOING_DOWN,即将关闭网口,__dev_close_many;
  • NETDEV_DOWN,关闭网口,dev_close_many;
  • NETDEV_CHANGEMTU,设置mtu,dev_set_mtu;
  • NETDEV_CHANGEADDR,设置mac地址,dev_set_mac_address;
  • NETDEV_UNREGISTER,
  • NETDEV_POST_INIT,
  • NETDEV_REGISTER,网络设备注册完成,register_netdevice;
  • NETDEV_UNREGISTER_FINAL,

netdev_chain通知链的事件全部在net/core/dev.c中通知;

inetaddr_chain通知链,包括的事件有:

  • NETDEV_DOWN,网口上的ipaddr被删除,inet_del_ifa;
  • NETDEV_UP,网口上增加ipaddr,inet_insert_ifa;

2. kernel space -> user space

2.1. netlink原理介绍

netlink是一种特殊的socket,它是linux所特有的,类似于BSD中的AF_ROUTE,但又远比它的功能强大,目前使用netlink进行应用与内核通讯的应用很多,包括:NETLINK_ROUTE(路由daemon)、NETLINK_W1(1-wire子系统)、NETLINK_USERSOCK(用户态socket协议)、NETLINK_NFLOG(netfilter日志)、NETLINK_XFRM(IPsec安全策略)、NETLINK_SELINUX(SELinux事件通知)、NETLINK_ISCSI(iSCSI子系统)、NETLINK_AUDIT(进程审计)、NETLINK_FIB_LOOKUP(转发信息表查询)、NETLINK_CONNECTOR(netlink connector)、NETLINK_NETFILTER(netfilter子系统)、NETLINK_IP6_FW(IPv6防火墙)、NETLINK_DNRTMSG(DECnet路由信息)、NETLINK_KOBJECT_UEVENT(内核事件向用户态通知)、NETLINK_GENERIC(通用netlink);

其中网络子系统中最常使用的几个有NETLINK_KOBJECT_UEVENT、NETLINK_ROUTE、NETLINK_NETFILTER;

以dev_open为例,在打开网口后,kernel space会通知user space网口的状态变成IFF_UP|IFF_RUNNING,rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);

/*
 * Notify user space about a link change on `dev`.
 *
 * type:   rtnetlink message type to send (e.g. RTM_NEWLINK).
 * dev:    device the message describes.
 * change: change mask passed through to rtnl_fill_ifinfo().
 *
 * If the message cannot be allocated or filled, the error is reported on
 * the RTNLGRP_LINK group through rtnl_set_sk_err() and nothing is sent.
 */
void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change)
{
        struct net *net = dev_net(dev);
        struct sk_buff *skb;
        int err = -ENOBUFS;

        /* allocate a new netlink message sized by if_nlmsg_size() */
        skb = nlmsg_new(if_nlmsg_size(dev, 0), GFP_KERNEL);
        if (skb == NULL)
                goto errout;

        /* fill the message: type, dev and change are written into skb */
        err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0);
        if (err < 0) {
                /* -EMSGSIZE implies BUG in if_nlmsg_size() */
                WARN_ON(err == -EMSGSIZE);
                kfree_skb(skb);
                goto errout;
        }
        /* send the message to the RTNLGRP_LINK group */
        rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL);
        return;
errout:
        if (err < 0)
                rtnl_set_sk_err(net, RTNLGRP_LINK, err);
}

其中nlmsg_new就是申请一定长度的sk_buff的内存,这个长度包括:

/* Total size: the aligned ifinfomsg header plus room for every attribute listed below. */
return NLMSG_ALIGN(sizeof(struct ifinfomsg))
       + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
       + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */
       + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */
       + nla_total_size(sizeof(struct rtnl_link_ifmap))
       + nla_total_size(sizeof(struct rtnl_link_stats))
       + nla_total_size(sizeof(struct rtnl_link_stats64))
       + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
       + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */
       + nla_total_size(4) /* IFLA_TXQLEN */
       + nla_total_size(4) /* IFLA_WEIGHT */
       + nla_total_size(4) /* IFLA_MTU */
       + nla_total_size(4) /* IFLA_LINK */
       + nla_total_size(4) /* IFLA_MASTER */
       + nla_total_size(1) /* IFLA_CARRIER */
       + nla_total_size(4) /* IFLA_PROMISCUITY */
       + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */
       + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */
       + nla_total_size(1) /* IFLA_OPERSTATE */
       + nla_total_size(1) /* IFLA_LINKMODE */
       + nla_total_size(ext_filter_mask
                        & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */
       + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */
       + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */
       + rtnl_link_get_size(dev) /* IFLA_LINKINFO */
       + rtnl_link_get_af_size(dev); /* IFLA_AF_SPEC */

rtnl_notify最终调用nlmsg_notify,nlmsg_unicast,netlink_unicast将netlink事件发到user space;
其中在rtnl_notify中使用到一个sock:struct sock *rtnl = net->rtnl;最终netlink_unicast就是通过rtnl这个socket发送消息的;
其中net->rtnl在rtnetlink_net_init中初始化;

/*
 * Set up rtnetlink for `net`: create the kernel-side NETLINK_ROUTE socket
 * and store it in net->rtnl.
 *
 * Returns 0 on success, -ENOMEM if the socket could not be created.
 */
static int __net_init rtnetlink_net_init(struct net *net)
{
        struct sock *sk;
        struct netlink_kernel_cfg cfg = {
                .groups         = RTNLGRP_MAX,
                .input          = rtnetlink_rcv,
                .cb_mutex       = &rtnl_mutex,
                .flags          = NL_CFG_F_NONROOT_RECV,
        };

        sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg); /* create the netlink socket, protocol NETLINK_ROUTE */
        if (!sk)
                return -ENOMEM;
        net->rtnl = sk;
        return 0;
}

2.2. 常见的网络内核消息通知

2.2.1. linkUp/linkDown,网口链路状态变化事件

linkwatch_do_dev -->
  netdev_state_change
    rtmsg_ifinfo(RTM_NEWLINK, dev, 0)

在rtnl_fill_ifinfo里面,会获取dev的flag,ifm->ifi_flags = dev_get_flags(dev);

/*
 * Compute the interface flags reported for `dev`.
 *
 * Starts from dev->flags with IFF_PROMISC, IFF_ALLMULTI, IFF_RUNNING,
 * IFF_LOWER_UP and IFF_DORMANT masked out, takes IFF_PROMISC and
 * IFF_ALLMULTI from dev->gflags instead, and then, only while the device
 * is running, re-derives IFF_RUNNING, IFF_LOWER_UP and IFF_DORMANT from
 * the current operational, carrier and dormant state.
 *
 * Returns the resulting flag word.
 */
unsigned int dev_get_flags(const struct net_device *dev)
{
        unsigned int flags;

        flags = (dev->flags & ~(IFF_PROMISC |
                                IFF_ALLMULTI |
                                IFF_RUNNING |
                                IFF_LOWER_UP |
                                IFF_DORMANT)) |
                (dev->gflags & (IFF_PROMISC |
                                IFF_ALLMULTI));

        if (netif_running(dev)) {
                if (netif_oper_up(dev))
                        flags |= IFF_RUNNING;
                /* carrier check: IFF_LOWER_UP is set only when the carrier is ok */
                if (netif_carrier_ok(dev))
                        flags |= IFF_LOWER_UP;
                if (netif_dormant(dev))
                        flags |= IFF_DORMANT;
        }

        return flags;
}

netlink类型是NETLINK_ROUTE,消息类型是RTM_NEWLINK;

2.2.2. interfaceAdded/interfaceRemoved,网口添加或移除事件

register_netdev -->
  register_netdevice -->
    netdev_register_kobject -->
      device_add -->
        kobject_uevent(&dev->kobj, KOBJ_ADD)

可见网口添加和移除走的是linux通用热插拔事件流程,网口添加时是KOBJ_ADD,网口移除时是KOBJ_REMOVE,netlink类型是NETLINK_KOBJECT_UEVENT;

2.2.3. addressUpdated/addressRemoved,配置ipaddr,ipaddr删除

__inet_insert_ifa -->
  rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid) -->
    rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);

配置ipaddr用的netlink类型是NETLINK_ROUTE,消息类型是RTM_NEWADDR;
删除ipaddr用的netlink类型是NETLINK_ROUTE,消息类型是RTM_DELADDR;

2.2.4. routeUpdated/routeRemoved,更新路由,删除路由

fib_table_insert -->
  rtmsg_fib(RTM_NEWROUTE, ...) -->
    rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE, info->nlh, GFP_KERNEL)

更新路由用的netlink类型是NETLINK_ROUTE,消息类型是RTM_NEWROUTE;
删除路由用的netlink类型是NETLINK_ROUTE,消息类型是RTM_DELROUTE;

2.3. 用户态接收消息

android对kernel发出的网络相关netlink事件的接收是在netd和libsysutils里面实现的,其中netd注册了接收netlink的socket,libsysutils负责接收和事件解析;

2.3.1. 注册socket

// Kernel uevent socket: group mask 0xffffffff (all bits set), payloads
// handled in ASCII format. Bail out with -1 if the handler cannot be set up.
if ((mUeventHandler = setupSocket(&mUeventSock, NETLINK_KOBJECT_UEVENT,
         0xffffffff, NetlinkListener::NETLINK_FORMAT_ASCII)) == NULL) {
    return -1;
}

// Route socket: subscribes to the link, IPv4/IPv6 address, IPv6 route and
// ND user-option groups; payloads are handled in binary format.
if ((mRouteHandler = setupSocket(&mRouteSock, NETLINK_ROUTE,
                                 RTMGRP_LINK |
                                 RTMGRP_IPV4_IFADDR |
                                 RTMGRP_IPV6_IFADDR |
                                 RTMGRP_IPV6_ROUTE |
                                 (1 << (RTNLGRP_ND_USEROPT - 1)),
         NetlinkListener::NETLINK_FORMAT_BINARY)) == NULL) {
    return -1;
}
/**
 * Create, configure and bind a netlink socket, then start a NetlinkHandler
 * that listens on it.
 *
 * @param sock          out-parameter receiving the socket descriptor; if a
 *                      later step fails the descriptor is closed before returning
 * @param netlinkFamily netlink protocol passed to socket() (e.g. NETLINK_ROUTE)
 * @param groups        multicast group mask stored in nl_groups for bind()
 * @param format        message format handed to the NetlinkHandler
 * @return the started handler, or NULL if any step failed
 */
NetlinkHandler *NetlinkManager::setupSocket(int *sock, int netlinkFamily,
    int groups, int format) {

    struct sockaddr_nl nladdr;
    int sz = 64 * 1024;  // requested receive-buffer size in bytes
    int on = 1;

    memset(&nladdr, 0, sizeof(nladdr));
    nladdr.nl_family = AF_NETLINK;
    nladdr.nl_pid = getpid();
    nladdr.nl_groups = groups;

    if ((*sock = socket(PF_NETLINK, SOCK_DGRAM, netlinkFamily)) < 0) {
        ALOGE("Unable to create netlink socket: %s", strerror(errno));
        return NULL;
    }

    if (setsockopt(*sock, SOL_SOCKET, SO_RCVBUFFORCE, &sz, sizeof(sz)) < 0) {
        ALOGE("Unable to set uevent socket SO_RCVBUFFORCE option: %s", strerror(errno));
        close(*sock);
        return NULL;
    }

    if (setsockopt(*sock, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on)) < 0) {
        SLOGE("Unable to set uevent socket SO_PASSCRED option: %s", strerror(errno));
        close(*sock);
        return NULL;
    }

    if (bind(*sock, (struct sockaddr *) &nladdr, sizeof(nladdr)) < 0) {
        ALOGE("Unable to bind netlink socket: %s", strerror(errno));
        close(*sock);
        return NULL;
    }

    NetlinkHandler *handler = new NetlinkHandler(this, *sock, format);
    if (handler->start()) {
        ALOGE("Unable to start NetlinkHandler: %s", strerror(errno));
        close(*sock);
        // The handler was never handed to the caller, so release it here
        // instead of leaking it.
        delete handler;
        return NULL;
    }

    return handler;
}

2.3.2. 接收消息

/**
 * Read one netlink datagram from the client's socket, decode it and
 * dispatch it through onEvent().
 *
 * @param cli client wrapper providing the socket to read from
 * @return false if receiving failed, true otherwise (even when the message
 *         could not be decoded)
 */
bool NetlinkListener::onDataAvailable(SocketClient *cli)
{
    uid_t senderUid = -1;
    const int fd = cli->getSocket();

    const ssize_t received = TEMP_FAILURE_RETRY(uevent_kernel_multicast_uid_recv(
                                       fd, mBuffer, sizeof(mBuffer), &senderUid));
    if (received < 0) {
        if (senderUid > 0) {
            LOG_EVENT_INT(65537, senderUid);
        }
        SLOGE("recvmsg failed (%s)", strerror(errno));
        return false;
    }

    NetlinkEvent *event = new NetlinkEvent();
    const bool decoded = event->decode(mBuffer, received, mFormat);
    if (decoded) {
        onEvent(event);
    } else if (mFormat != NETLINK_FORMAT_BINARY) {
        // A failed binary parse is not reported: it can simply mean the
        // buffer held no messages we are interested in.
        SLOGE("Error decoding NetlinkEvent");
    }
    delete event;

    return true;
}

2.3.3. 消息解析

/**
 * Decode a raw netlink buffer using the parser that matches `format`.
 *
 * @param buffer raw message bytes
 * @param size   number of bytes in buffer
 * @param format NetlinkListener::NETLINK_FORMAT_BINARY selects the binary
 *               parser; any other value selects the ASCII parser
 * @return the result of the selected parser
 */
bool NetlinkEvent::decode(char *buffer, int size, int format) {
    if (format != NetlinkListener::NETLINK_FORMAT_BINARY) {
        // ASCII messages: NlActionAdd, NlActionRemove
        return parseAsciiNetlinkMessage(buffer, size);
    }

    // Binary messages:
    //   parseIfInfoMessage: NlActionLinkUp, NlActionLinkDown
    //   parseIfAddrMessage: NlActionAddressUpdated, NlActionAddressRemoved
    //   parseRtMessage:     NlActionRouteUpdated, NlActionRouteRemoved
    return parseBinaryNetlinkMessage(buffer, size);
}

3. user space –> kernel space

用户态经常需要配置一些参数到内核态,常见的机制有netlink和ioctl;
android netd中的RouteControl.cpp中就有很多路由的设置,其使用的就是netlink socket;

/**
 * Send one rtnetlink request and wait for the kernel's reply.
 *
 * @param action netlink message type (stored in nlmsg_type)
 * @param flags  netlink message flags (stored in nlmsg_flags)
 * @param iov    I/O vector for the request; iov[0] is overwritten here with
 *               the netlink header, so the caller's payload starts at iov[1]
 * @param iovlen number of entries in iov, including iov[0]
 * @return the error field of the reply (0, or a negative errno);
 *         -EBADMSG if the reply has an unexpected size; -errno if
 *         socket/connect/writev/recv failed
 */
WARN_UNUSED_RESULT int sendNetlinkRequest(uint16_t action, uint16_t flags, iovec* iov, int iovlen) {
    nlmsghdr nlmsg = {
        .nlmsg_type = action,
        .nlmsg_flags = flags,
    };
    iov[0].iov_base = &nlmsg;
    iov[0].iov_len = sizeof(nlmsg);
    // nlmsg_len starts at zero and becomes the total length of all segments.
    for (int i = 0; i < iovlen; ++i) {
        nlmsg.nlmsg_len += iov[i].iov_len;
    }

    int ret;
    struct {
        nlmsghdr msg;
        nlmsgerr err;
    } response;

    int sock = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE);  // create the netlink socket
    if (sock != -1 &&
            connect(sock, reinterpret_cast<const sockaddr*>(&NETLINK_ADDRESS),
                    sizeof(NETLINK_ADDRESS)) != -1 &&
            writev(sock, iov, iovlen) != -1 &&
            (ret = recv(sock, &response, sizeof(response), 0)) != -1) {  // send the request and read the reply
        if (ret == sizeof(response)) {
            ret = response.err.error;  // Netlink errors are negative errno.
            if (ret) {
                ALOGE("netlink response contains error (%s)", strerror(-ret));
            }
        } else {
            ALOGE("bad netlink response message size (%d != %zu)", ret, sizeof(response));
            ret = -EBADMSG;
        }
    } else {
        // Capture errno before logging: the logging call may overwrite it,
        // which would make the returned error code wrong.
        const int savedErrno = errno;
        ALOGE("netlink socket/connect/writev/recv failed (%s)", strerror(savedErrno));
        ret = -savedErrno;
    }

    if (sock != -1) {
        close(sock);
    }

    return ret;
}

发送的消息有:

  • RTM_NEWRULE: 添加策略路由表
  • RTM_DELRULE: 删除策略路由表
  • RTM_NEWROUTE: 插入路由表项
  • RTM_DELROUTE: 删除路由表项

kernel中在rtnetlink_net_init中就注册了一个接收用户态netlink事件的socket:

/*
 * Set up rtnetlink for `net`: create the kernel-side NETLINK_ROUTE socket,
 * with rtnetlink_rcv as its input callback, and store it in net->rtnl.
 *
 * Returns 0 on success, -ENOMEM if the socket could not be created.
 */
static int __net_init rtnetlink_net_init(struct net *net)
{
        struct sock *sk;
        struct netlink_kernel_cfg cfg = {
                .groups         = RTNLGRP_MAX,
                .input          = rtnetlink_rcv, /* receive function */
                .cb_mutex       = &rtnl_mutex,
                .flags          = NL_CFG_F_NONROOT_RECV,
        };

        sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg);
        if (!sk)
                return -ENOMEM;
        net->rtnl = sk;
        return 0;
}

rtnetlink_rcv –> netlink_rcv_skb –> rtnetlink_rcv_msg –> rtnl_get_doit –> doit;
其中rtnl_get_doit就是从一个数组中取消息处理函数,由family和type来做匹配,而这个消息处理函数是在ip_fib_init中添加的;

/* PF_INET route messages: the third argument is the handler for that message type. */
rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);

在fib_rules_init中添加RTM_NEWRULE和RTM_DELRULE的处理函数:

/* Policy-rule messages, registered for PF_UNSPEC (not tied to one address family). */
rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, NULL);

综上,RTM_NEWRULE、RTM_DELRULE、RTM_NEWROUTE、RTM_DELROUTE对应于kernel的处理函数是:
* RTM_NEWRULE –> fib_nl_newrule
* RTM_DELRULE –> fib_nl_delrule
* RTM_NEWROUTE –> inet_rtm_newroute
* RTM_DELROUTE –> inet_rtm_delroute

此外,还有一些用户态消息与内核态处理函数的映射关系:

/* Neighbour subsystem */
rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, NULL);
/* GET has no doit handler; it is served by the dump callback, as RTM_GETADDR is below */
rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info, NULL);

/* IPv4 address management */
rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL);
rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL);
rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL);

你可能感兴趣的:(Linux网络事件通知机制)