Linux内核中各个子系统相互依赖,当其中某个子系统的状态发生改变时,就必须使用一定的机制告知使用其服务的其他子系统,以便其他子系统采取相应的措施。为满足这样的需求,内核实现了事件通知链机制。
网络子系统的通知链有三个:
网络子系统中是由下面三个函数来注册netdev_chain、inetaddr_chain和inet6addr_chain的事件通知处理函数的。
/*
 * Registration entry points for the three networking notifier chains
 * (netdev_chain, inetaddr_chain and inet6addr_chain respectively).
 * nb: the listener to add to the chain.
 */
int register_netdevice_notifier(struct notifier_block *nb);
int register_inetaddr_notifier(struct notifier_block *nb);
int register_inet6addr_notifier(struct notifier_block *nb);
可见上面三个注册函数都有一个关键的结构体,struct notifier_block;
/*
 * One listener entry on a notifier chain.
 */
struct notifier_block {
	/* Callback to invoke when the corresponding event occurs. */
	int (*notifier_call)(struct notifier_block *, unsigned long, void *);
	/* Next entry in the chain this block was registered on. */
	struct notifier_block *next;
	/* Priority of this listener; normally left at 0. */
	int priority;
};
notifier_call: 当对应的事件发生时应予调用的函数;
next: 注册会把nb添加到对应的链表去,上面三个注册函数对应的链表分别为,netdev_chain、inetaddr_chain和inet6addr_chain,next用于遍历链表;
priority: 表示优先级,一般默认为0;
例如在ip_fib_init函数中,注册了一个fib_netdev_notifier的网络设备状态变化监听器(netdev_chain),register_netdevice_notifier(&fib_netdev_notifier);
其中fib_netdev_notifier为:
/* Listener whose callback for delivered events is fib_netdev_event(). */
static struct notifier_block fib_netdev_notifier = {
.notifier_call = fib_netdev_event,
};
处理函数是fib_netdev_event,进到fib_netdev_event函数中可以看到对NETDEV_UP、NETDEV_DOWN、NETDEV_CHANGEMTU、NETDEV_CHANGE事件的处理;
注册好事件监听以及处理函数后,是如何触发事件通知的呢?
/*
 * Deliver a net-device event to every listener on netdev_chain.
 *
 * val: event code being delivered, e.g. NETDEV_UP
 * dev: the device the event refers to; passed to each callback as its data
 *
 * Returns the value produced by raw_notifier_call_chain().
 *
 * NOTE(review): mainline appears to name this call_netdevice_notifiers()
 * (plural) - confirm against the kernel version being quoted.
 */
int call_netdevice_notifier(unsigned long val, struct net_device *dev)
{
	ASSERT_RTNL();
	return raw_notifier_call_chain(&netdev_chain, val, dev);
}
raw_notifier_call_chain –> __raw_notifier_call_chain –> notifier_call_chain –> nb->notifier_call;
可见是通过call_netdevice_notifier来遍历netdev_chain上的监听器(struct notifier_block *nb),并依次调用其notifier_call,对应到上面那个例子就是fib_netdev_event;
同样,对于inetaddr_chain,通知函数是blocking_notifier_call_chain(&inetaddr_chain, val, v);对于inet6addr_chain,通知函数是inet6addr_notifier_call_chain –> atomic_notifier_call_chain(&inet6addr_chain, val, v);
netdev_chain通知链,包括的事件有:
netdev_chain通知链的事件全部在net/core/dev.c中通知;
inetaddr_chain通知链,包括的事件有:
netlink是一种特殊的socket,它是linux所特有的,类似于BSD中的AF_ROUTE,但又远比它的功能强大。目前使用netlink进行应用与内核通讯的应用很多,包括:NETLINK_ROUTE(路由daemon)、NETLINK_W1(1-wire子系统)、NETLINK_USERSOCK(用户态socket协议)、NETLINK_NFLOG(netfilter日志)、NETLINK_XFRM(ipsec安全策略)、NETLINK_SELINUX(SELinux事件通知)、NETLINK_ISCSI(iSCSI子系统)、NETLINK_AUDIT(进程审计)、NETLINK_FIB_LOOKUP(转发信息表查询)、NETLINK_CONNECTOR(netlink connector)、NETLINK_NETFILTER(netfilter子系统)、NETLINK_IP6_FW(IPv6防火墙)、NETLINK_DNRTMSG(DECnet路由信息)、NETLINK_KOBJECT_UEVENT(内核事件向用户态通知)、NETLINK_GENERIC(通用netlink);
其中网络子系统中最常使用的几个有NETLINK_KOBJECT_UEVENT、NETLINK_ROUTE、NETLINK_NETFILTER;
以dev_open为例,在打开网口后,kernel space会通知user space网口的状态变成IFF_UP|IFF_RUNNING,rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
/*
 * Build a link-info netlink message for dev and hand it to rtnl_notify()
 * for the RTNLGRP_LINK group.
 *
 * type:   message type placed in the message (e.g. RTM_NEWLINK)
 * dev:    device being reported
 * change: change mask placed in the message
 *
 * On allocation or fill failure the error is recorded with
 * rtnl_set_sk_err() instead of a message being sent.
 */
void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change)
{
	struct net *net = dev_net(dev);
	struct sk_buff *skb;
	int err = -ENOBUFS;
	size_t if_info_size;

	/* Allocate a new nlmsg sized for this device. */
	if_info_size = if_nlmsg_size(dev, 0);
	skb = nlmsg_new(if_info_size, GFP_KERNEL);
	if (skb != NULL) {
		/* Fill the nlmsg: type, dev and change go into the skb. */
		err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0);
		if (err >= 0) {
			/* Send the message. */
			rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL);
			return;
		}

		/* -EMSGSIZE implies BUG in if_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
	}

	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_LINK, err);
}
其中nlmsg_new就是申请一定长度的sk_buff的内存,这个长度包括:
return NLMSG_ALIGN(sizeof(struct ifinfomsg))
+ nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
+ nla_total_size(IFALIASZ) /* IFLA_IFALIAS */
+ nla_total_size(IFNAMSIZ) /* IFLA_QDISC */
+ nla_total_size(sizeof(struct rtnl_link_ifmap))
+ nla_total_size(sizeof(struct rtnl_link_stats))
+ nla_total_size(sizeof(struct rtnl_link_stats64))
+ nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
+ nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */
+ nla_total_size(4) /* IFLA_TXQLEN */
+ nla_total_size(4) /* IFLA_WEIGHT */
+ nla_total_size(4) /* IFLA_MTU */
+ nla_total_size(4) /* IFLA_LINK */
+ nla_total_size(4) /* IFLA_MASTER */
+ nla_total_size(1) /* IFLA_CARRIER */
+ nla_total_size(4) /* IFLA_PROMISCUITY */
+ nla_total_size(4) /* IFLA_NUM_TX_QUEUES */
+ nla_total_size(4) /* IFLA_NUM_RX_QUEUES */
+ nla_total_size(1) /* IFLA_OPERSTATE */
+ nla_total_size(1) /* IFLA_LINKMODE */
+ nla_total_size(ext_filter_mask
& RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */
+ rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */
+ rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */
+ rtnl_link_get_size(dev) /* IFLA_LINKINFO */
+ rtnl_link_get_af_size(dev); /* IFLA_AF_SPEC */
rtnl_notify最终调用nlmsg_notify、nlmsg_unicast、netlink_unicast,将netlink事件发到user space;
其中在rtnl_notify中使用到一个sock:struct sock *rtnl = net->rtnl;最终netlink_unicast就是通过rtnl这个socket发送消息的;
其中net->rtnl在rtnetlink_net_init中初始化;
/*
 * Per-namespace init: create the kernel-side NETLINK_ROUTE socket and
 * store it in net->rtnl.
 *
 * Returns 0 on success, -ENOMEM if the socket could not be created.
 */
static int __net_init rtnetlink_net_init(struct net *net)
{
	struct netlink_kernel_cfg rtnl_cfg = {
		.groups		= RTNLGRP_MAX,
		.input		= rtnetlink_rcv,
		.cb_mutex	= &rtnl_mutex,
		.flags		= NL_CFG_F_NONROOT_RECV,
	};
	/* Create the netlink socket; its protocol is NETLINK_ROUTE. */
	struct sock *rtnl_sk = netlink_kernel_create(net, NETLINK_ROUTE, &rtnl_cfg);

	if (rtnl_sk == NULL)
		return -ENOMEM;

	net->rtnl = rtnl_sk;
	return 0;
}
linkwatch_do_dev -->
netdev_state_change
rtmsg_ifinfo(RTM_NEWLINK, dev, 0)
在rtnl_fill_ifinfo里面,会获取dev的flag,ifm->ifi_flags = dev_get_flags(dev);
/*
 * Compute the interface flag word reported for dev.
 *
 * Starts from dev->flags with IFF_PROMISC, IFF_ALLMULTI, IFF_RUNNING,
 * IFF_LOWER_UP and IFF_DORMANT masked out, takes IFF_PROMISC/IFF_ALLMULTI
 * from dev->gflags instead, and then, only while netif_running(dev) holds,
 * sets the three state bits from the corresponding netif_*() checks.
 *
 * Returns the assembled flag word.
 */
unsigned int dev_get_flags(const struct net_device *dev)
{
	unsigned int flags;

	flags = (dev->flags & ~(IFF_PROMISC |
				IFF_ALLMULTI |
				IFF_RUNNING |
				IFF_LOWER_UP |
				IFF_DORMANT)) |
		(dev->gflags & (IFF_PROMISC |
				IFF_ALLMULTI));

	if (netif_running(dev)) {
		if (netif_oper_up(dev))
			flags |= IFF_RUNNING;
		/* Carrier check: IFF_LOWER_UP is set only when the carrier is ok. */
		if (netif_carrier_ok(dev))
			flags |= IFF_LOWER_UP;
		if (netif_dormant(dev))
			flags |= IFF_DORMANT;
	}

	return flags;
}
netlink类型是NETLINK_ROUTE,消息类型是RTM_NEWLINK;
register_netdev -->
register_netdevice -->
netdev_register_kobject -->
device_add -->
kobject_uevent(&dev->kobj, KOBJ_ADD)
可见网口的添加和移除走的是linux通用热插拔事件流程,网口添加时是KOBJ_ADD,网口移除时是KOBJ_REMOVE,netlink类型是NETLINK_KOBJECT_UEVENT;
__inet_insert_ifa -->
rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid) -->
rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
配置ipaddr用的netlink类型是NETLINK_ROUTE,消息类型是RTM_NEWADDR;
删除ipaddr用的netlink类型是NETLINK_ROUTE,消息类型是RTM_DELADDR;
fib_table_insert -->
rtmsg_fib(RTM_NEWROUTE, ...) -->
rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE, info->nlh, GFP_KERNEL)
更新路由用的netlink类型是NETLINK_ROUTE,消息类型是RTM_NEWROUTE;
删除路由用的netlink类型是NETLINK_ROUTE,消息类型是RTM_DELROUTE;
android对网络发自kernel的netlink事件的接收是在netd和libsysutils里面实现的,其中netd注册了接收netlink的socket,libsysutils负责接收和事件解析;
if ((mUeventHandler = setupSocket(&mUeventSock, NETLINK_KOBJECT_UEVENT,
0xffffffff, NetlinkListener::NETLINK_FORMAT_ASCII)) == NULL) {
return -1;
}
if ((mRouteHandler = setupSocket(&mRouteSock, NETLINK_ROUTE,
RTMGRP_LINK |
RTMGRP_IPV4_IFADDR |
RTMGRP_IPV6_IFADDR |
RTMGRP_IPV6_ROUTE |
(1 << (RTNLGRP_ND_USEROPT - 1)),
NetlinkListener::NETLINK_FORMAT_BINARY)) == NULL) {
return -1;
}
/**
 * Creates a netlink socket, binds it to the requested multicast groups and
 * starts a NetlinkHandler on it.
 *
 * @param sock          out: receives the socket descriptor
 * @param netlinkFamily netlink protocol passed to socket(), e.g. NETLINK_ROUTE
 * @param groups        multicast group mask stored in nl_groups
 * @param format        message format handed to the NetlinkHandler
 * @return the started handler, or NULL on failure. On every failure after
 *         socket() succeeded the descriptor is closed before returning.
 */
NetlinkHandler *NetlinkManager::setupSocket(int *sock, int netlinkFamily,
    int groups, int format) {

    struct sockaddr_nl nladdr;
    int sz = 64 * 1024;
    int on = 1;

    memset(&nladdr, 0, sizeof(nladdr));
    nladdr.nl_family = AF_NETLINK;
    nladdr.nl_pid = getpid();
    nladdr.nl_groups = groups;

    if ((*sock = socket(PF_NETLINK, SOCK_DGRAM, netlinkFamily)) < 0) {
        ALOGE("Unable to create netlink socket: %s", strerror(errno));
        return NULL;
    }

    if (setsockopt(*sock, SOL_SOCKET, SO_RCVBUFFORCE, &sz, sizeof(sz)) < 0) {
        ALOGE("Unable to set uevent socket SO_RCVBUFFORCE option: %s", strerror(errno));
        close(*sock);
        return NULL;
    }

    if (setsockopt(*sock, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on)) < 0) {
        SLOGE("Unable to set uevent socket SO_PASSCRED option: %s", strerror(errno));
        close(*sock);
        return NULL;
    }

    if (bind(*sock, (struct sockaddr *) &nladdr, sizeof(nladdr)) < 0) {
        ALOGE("Unable to bind netlink socket: %s", strerror(errno));
        close(*sock);
        return NULL;
    }

    NetlinkHandler *handler = new NetlinkHandler(this, *sock, format);
    if (handler->start()) {
        ALOGE("Unable to start NetlinkHandler: %s", strerror(errno));
        // The handler never started, so nobody else owns it: free it here
        // rather than leaking it on this error path.
        delete handler;
        close(*sock);
        return NULL;
    }

    return handler;
}
/**
 * Reads one datagram from the client's socket into mBuffer, decodes it as a
 * NetlinkEvent and dispatches it through onEvent().
 *
 * @param cli client whose socket has data pending
 * @return false if the receive failed, true otherwise (including when the
 *         message could not be decoded)
 */
bool NetlinkListener::onDataAvailable(SocketClient *cli)
{
    const int fd = cli->getSocket();
    uid_t senderUid = -1;

    const ssize_t received = TEMP_FAILURE_RETRY(uevent_kernel_multicast_uid_recv(
            fd, mBuffer, sizeof(mBuffer), &senderUid));
    if (received < 0) {
        if (senderUid > 0) {
            LOG_EVENT_INT(65537, senderUid);
        }
        SLOGE("recvmsg failed (%s)", strerror(errno));
        return false;
    }

    NetlinkEvent *event = new NetlinkEvent();
    const bool decoded = event->decode(mBuffer, received, mFormat);
    if (decoded) {
        onEvent(event);
    } else if (mFormat != NETLINK_FORMAT_BINARY) {
        // A failed binary parse is not reported: it can simply mean the
        // buffer held no messages we are interested in.
        SLOGE("Error decoding NetlinkEvent");
    }
    delete event;

    return true;
}
/**
 * Decodes a received netlink buffer according to its format.
 *
 * @param buffer raw message bytes
 * @param size   number of bytes in buffer
 * @param format NetlinkListener::NETLINK_FORMAT_BINARY selects the binary
 *               parser; any other value selects the ASCII parser
 * @return the result of the selected parser
 */
bool NetlinkEvent::decode(char *buffer, int size, int format) {
    if (format != NetlinkListener::NETLINK_FORMAT_BINARY) {
        /**
         * ASCII messages: NlActionAdd, NlActionRemove
         */
        return parseAsciiNetlinkMessage(buffer, size);
    }

    /**
     * Binary messages:
     * parseIfInfoMessage: NlActionLinkUp, NlActionLinkDown
     * parseIfAddrMessage: NlActionAddressUpdated, NlActionAddressRemoved
     * parseRtMessage: NlActionRouteUpdated, NlActionRouteRemoved
     */
    return parseBinaryNetlinkMessage(buffer, size);
}
用户态经常需要配置一些参数到内核态,常见的机制有netlink和ioctl;
android netd中的RouteController.cpp中就有很多路由的设置,其使用的就是netlink socket;
// Sends one netlink request over a fresh NETLINK_ROUTE socket and waits for
// the kernel's reply.
//
// |action| and |flags| become nlmsg_type and nlmsg_flags of the header.
// |iov| holds |iovlen| entries; iov[0] is overwritten here to point at the
// header, and nlmsg_len is set to the sum of all entry lengths.
//
// Returns the error field of the reply (0 on success, negative errno
// otherwise), -EBADMSG if the reply has an unexpected size, or -errno if
// socket/connect/writev/recv failed.
WARN_UNUSED_RESULT int sendNetlinkRequest(uint16_t action, uint16_t flags, iovec* iov, int iovlen) {
    nlmsghdr nlmsg = {
        .nlmsg_type = action,
        .nlmsg_flags = flags,
    };
    iov[0].iov_base = &nlmsg;
    iov[0].iov_len = sizeof(nlmsg);
    for (int i = 0; i < iovlen; ++i) {
        nlmsg.nlmsg_len += iov[i].iov_len;
    }

    int ret;
    struct {
        nlmsghdr msg;
        nlmsgerr err;
    } response;

    // Create the netlink socket, then send the message and receive the reply.
    int sock = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE);
    if (sock != -1 &&
            connect(sock, reinterpret_cast<const sockaddr*>(&NETLINK_ADDRESS),
                    sizeof(NETLINK_ADDRESS)) != -1 &&
            writev(sock, iov, iovlen) != -1 &&
            (ret = recv(sock, &response, sizeof(response), 0)) != -1) {
        if (ret == sizeof(response)) {
            ret = response.err.error;  // Netlink errors are negative errno.
            if (ret) {
                ALOGE("netlink response contains error (%s)", strerror(-ret));
            }
        } else {
            ALOGE("bad netlink response message size (%d != %zu)", ret, sizeof(response));
            ret = -EBADMSG;
        }
    } else {
        // Capture errno before logging: the logging call may modify it, which
        // would make the returned code differ from the failure being reported.
        const int savedErrno = errno;
        ALOGE("netlink socket/connect/writev/recv failed (%s)", strerror(savedErrno));
        ret = -savedErrno;
    }

    if (sock != -1) {
        close(sock);
    }

    return ret;
}
发送的消息有:
kernel中在rtnetlink_net_init中就注册了一个接收用户态netlink事件的socket:
/*
 * Per-namespace init: create the NETLINK_ROUTE kernel socket whose input
 * callback is rtnetlink_rcv, and store it in net->rtnl.
 *
 * Returns 0 on success, -ENOMEM if the socket could not be created.
 */
static int __net_init rtnetlink_net_init(struct net *net)
{
	struct sock *route_sk;
	struct netlink_kernel_cfg route_cfg = {
		.groups		= RTNLGRP_MAX,
		.input		= rtnetlink_rcv,	/* receive function */
		.cb_mutex	= &rtnl_mutex,
		.flags		= NL_CFG_F_NONROOT_RECV,
	};

	route_sk = netlink_kernel_create(net, NETLINK_ROUTE, &route_cfg);
	if (route_sk == NULL)
		return -ENOMEM;

	net->rtnl = route_sk;
	return 0;
}
rtnetlink_rcv –> netlink_rcv_skb –> rtnetlink_rcv_msg –> rtnl_get_doit –> doit;
其中rtnl_get_doit就是从一个数组中取出消息处理函数,由family和type来做匹配,而这个消息处理函数是在ip_fib_init中添加的;
/* PF_INET route messages: RTM_NEWROUTE -> inet_rtm_newroute, RTM_DELROUTE -> inet_rtm_delroute. */
rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
在fib_rules_init中添加RTM_NEWRULE和RTM_DELRULE的处理函数:
/* Rule messages: RTM_NEWRULE -> fib_nl_newrule, RTM_DELRULE -> fib_nl_delrule. */
rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, NULL);
综上,RTM_NEWRULE、RTM_DELRULE、RTM_NEWROUTE、RTM_DELROUTE对应的kernel处理函数是:
* RTM_NEWRULE –> fib_nl_newrule
* RTM_DELRULE –> fib_nl_delrule
* RTM_NEWROUTE –> inet_rtm_newroute
* RTM_DELROUTE –> inet_rtm_delroute
此外,还有一些用户态消息与内核态处理函数的映射关系:
/* Neighbour subsystem. RTM_GETNEIGH is a dump request, so it registers a
 * dumpit handler (neigh_dump_info) rather than reusing neigh_delete. */
rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info, NULL);
/* IPv4 address handling. */
rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, NULL);
rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, NULL);
rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, NULL);