本文档的Copyleft归yfydz所有,使用GPL发布,可以自由拷贝,转载,转载时请保持文档的完整性,严
5.3 连接
连接通常是针对客户端连接服务器
static int netlink_connect(struct socket *sock, struct sockaddr *addr,
int alen, int flags)
{
int err = 0;
struct sock *sk = sock->sk;
struct netlink_sock *nlk = nlk_sk(sk);
struct sockaddr_nl *nladdr=(struct sockaddr_nl*)addr;
if (addr->sa_family == AF_UNSPEC) {
// 目的地址协议族为AF_UNSPEC(未指定), 简单返回成功
sk->sk_state = NETLINK_UNCONNECTED;
nlk->dst_pid = 0;
nlk->dst_group = 0;
return 0;
}
// 限制目的地址协议族类型为AF_NETLINK
if (addr->sa_family != AF_NETLINK)
return -EINVAL;
/* Only superuser is allowed to send multicasts */
// 只有ROOT权限才能多播
if (nladdr->nl_groups && !netlink_capable(sock, NL_NONROOT_SEND))
return -EPERM;
// 没指定pid的话自动绑定一个pid
if (!nlk->pid)
err = netlink_autobind(sock);
if (err == 0) {
// 已经指定了pid或者自动绑定成功时设置sock的对方参数, 状态为连接成功
sk->sk_state = NETLINK_CONNECTED;
nlk->dst_pid = nladdr->nl_pid;
nlk->dst_group = ffs(nladdr->nl_groups);
}
return err;
}
5.4 获取sock名称
/*
 * netlink_getname - fill a struct sockaddr_nl with the local or peer address.
 * @sock:     socket to query
 * @addr:     output buffer, written as a struct sockaddr_nl
 * @addr_len: receives sizeof(struct sockaddr_nl)
 * @peer:     non-zero selects the stored destination, zero the local side
 *
 * Always returns 0.
 */
static int netlink_getname(struct socket *sock, struct sockaddr *addr,
			   int *addr_len, int peer)
{
	struct netlink_sock *nlk = nlk_sk(sock->sk);
	struct sockaddr_nl *out = (struct sockaddr_nl *)addr;

	*addr_len = sizeof(*out);
	out->nl_family = AF_NETLINK;
	out->nl_pad = 0;

	if (!peer) {
		/* local side: own pid and the first word of the group bitmap */
		out->nl_pid = nlk->pid;
		out->nl_groups = nlk->groups ? nlk->groups[0] : 0;
		return 0;
	}

	/* peer side: stored destination pid and the mask for its group */
	out->nl_pid = nlk->dst_pid;
	out->nl_groups = netlink_group_mask(nlk->dst_group);
	return 0;
}
5.5 poll
poll是用poll(2)或select(2)系统调用选择套接口数据是否准备好时的处理函数,netlink用的是通用
的数据报的poll处理函数datagram_poll(), 说明略。
5.6 setsockopt
设置netlink sock的各种控制参数:
/*
 * netlink_setsockopt - set a SOL_NETLINK option on a netlink socket.
 * @sock:    socket to configure
 * @level:   must be SOL_NETLINK
 * @optname: NETLINK_PKTINFO, NETLINK_ADD_MEMBERSHIP or NETLINK_DROP_MEMBERSHIP
 * @optval:  user pointer to an int argument
 * @optlen:  size of the user argument; when smaller than an int the value
 *           is treated as 0
 *
 * Returns 0 on success, -ENOPROTOOPT for a wrong level or unknown option,
 * -EFAULT if the argument cannot be read, -EPERM if membership changes are
 * not permitted, -EINVAL for a group number out of range, or the error from
 * netlink_alloc_groups().
 */
static int netlink_setsockopt(struct socket *sock, int level, int optname,
			      char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	unsigned int subscriptions;
	int was_member, joining;
	int val = 0;
	int err;

	if (level != SOL_NETLINK)
		return -ENOPROTOOPT;

	/* fetch the integer argument from user space when one was supplied */
	if (optlen >= sizeof(int) && get_user(val, (int __user *)optval))
		return -EFAULT;

	if (optname == NETLINK_PKTINFO) {
		/* non-zero sets NETLINK_RECV_PKTINFO, zero clears it */
		if (val)
			nlk->flags |= NETLINK_RECV_PKTINFO;
		else
			nlk->flags &= ~NETLINK_RECV_PKTINFO;
		return 0;
	}

	if (optname != NETLINK_ADD_MEMBERSHIP &&
	    optname != NETLINK_DROP_MEMBERSHIP)
		return -ENOPROTOOPT;

	/* join (1) or leave (0) the multicast group numbered val */
	joining = (optname == NETLINK_ADD_MEMBERSHIP);

	if (!netlink_capable(sock, NL_NONROOT_RECV))
		return -EPERM;

	/* allocate the group bitmap the first time it is needed */
	if (nlk->groups == NULL) {
		err = netlink_alloc_groups(sk);
		if (err)
			return err;
	}

	/* group numbers are 1-based and bounded by ngroups */
	if (!val || val - 1 >= nlk->ngroups)
		return -EINVAL;

	netlink_table_grab();

	/*
	 * Adjust the subscription count by the change in this one bit:
	 * set -> cleared gives -1, cleared -> set gives +1, otherwise 0.
	 */
	was_member = test_bit(val - 1, nlk->groups);
	subscriptions = nlk->subscriptions - was_member + joining;

	if (joining)
		__set_bit(val - 1, nlk->groups);
	else
		__clear_bit(val - 1, nlk->groups);

	netlink_update_subscriptions(sk, subscriptions);
	netlink_update_listeners(sk);

	netlink_table_ungrab();

	return 0;
}
/*
 * netlink_alloc_groups - allocate the multicast group bitmap of a socket.
 * @sk: netlink socket whose bitmap is allocated
 *
 * The group count is taken from the nl_table entry of the socket's protocol
 * while the table is locked.  On success nlk->groups points to a zeroed
 * buffer of NLGRPSZ(groups) bytes and nlk->ngroups holds the group count.
 *
 * Returns 0 on success, -ENOENT if the protocol is not registered, or
 * -ENOMEM if the allocation fails.
 */
static int netlink_alloc_groups(struct sock *sk)
{
	struct netlink_sock *nlk = nlk_sk(sk);
	unsigned int ngroups;
	int unregistered;

	/* sample both table fields under the table lock */
	netlink_lock_table();
	ngroups = nl_table[sk->sk_protocol].groups;
	unregistered = !nl_table[sk->sk_protocol].registered;
	netlink_unlock_table();

	if (unregistered)
		return -ENOENT;

	nlk->groups = kzalloc(NLGRPSZ(ngroups), GFP_KERNEL);
	if (nlk->groups == NULL)
		return -ENOMEM;

	nlk->ngroups = ngroups;
	return 0;
}
5.7 getsockopt
获取netlink sock的各种控制参数:
/*
 * netlink_getsockopt - read a SOL_NETLINK option of a netlink socket.
 * @sock:    socket to query
 * @level:   must be SOL_NETLINK
 * @optname: only NETLINK_PKTINFO is handled
 * @optval:  user buffer receiving the int result
 * @optlen:  in: size of @optval; out: number of bytes reported (sizeof(int))
 *
 * Returns 0 on success, -ENOPROTOOPT for a wrong level or unknown option,
 * -EFAULT if user memory cannot be accessed, or -EINVAL if the supplied
 * length is negative or smaller than an int.
 */
static int netlink_getsockopt(struct socket *sock, int level, int optname,
			      char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	int len, val, err;

	if (level != SOL_NETLINK)
		return -ENOPROTOOPT;

	/* read the caller's buffer length */
	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case NETLINK_PKTINFO:
		if (len < sizeof(int))
			return -EINVAL;
		len = sizeof(int);
		/* report 1 when NETLINK_RECV_PKTINFO is set, else 0 */
		val = nlk->flags & NETLINK_RECV_PKTINFO ? 1 : 0;
		/*
		 * put_user() stores an object of the pointed-to type, so
		 * writing through the char __user * would store one byte
		 * while *optlen reports sizeof(int).  Store a full int, the
		 * same way netlink_setsockopt() reads one with get_user().
		 */
		if (put_user(len, optlen) ||
		    put_user(val, (int __user *)optval))
			return -EFAULT;
		err = 0;
		break;
	default:
		err = -ENOPROTOOPT;
	}

	return err;
}
5.8 发送消息
从用户层发送数据到内核, 内核的sock是接收方
/*
 * netlink_sendmsg - send one netlink message from a socket.
 * @kiocb: I/O control block; its sock_iocb carries the scm cookie
 * @sock:  sending socket
 * @msg:   message header: optional destination in msg_name, payload in msg_iov
 * @len:   payload length in bytes
 *
 * The destination pid/group come from msg_name when msg_namelen is non-zero,
 * otherwise from the values stored on the socket.  The payload is copied into
 * a freshly allocated skb, which is broadcast first when a destination group
 * is set and then unicast to the destination pid.
 *
 * Returns the result of netlink_unicast() on the normal path, or a negative
 * errno: -EOPNOTSUPP (MSG_OOB), -EINVAL (short or non-netlink address),
 * -EPERM (group send without NL_NONROOT_SEND), -EMSGSIZE, -ENOBUFS, -EFAULT,
 * or an error from scm_send(), netlink_autobind() or security_netlink_send().
 *
 * NOTE(review): the error returns after scm_send() do not release anything
 * scm_send() may have acquired -- confirm whether cleanup is needed there.
 */
static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
			   struct msghdr *msg, size_t len)
{
	/* per-call socket I/O control block */
	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
	struct sock *sk = sock->sk;
	struct netlink_sock *nlk = nlk_sk(sk);
	struct sockaddr_nl *addr = msg->msg_name;
	u32 dst_pid;
	u32 dst_group;
	struct sk_buff *skb;
	int err;
	/* socket-level control message state, used if the iocb has none */
	struct scm_cookie scm;

	/* out-of-band data is not supported */
	if (msg->msg_flags & MSG_OOB)
		return -EOPNOTSUPP;

	if (NULL == siocb->scm)
		siocb->scm = &scm;
	err = scm_send(sock, msg, siocb->scm);
	if (err < 0)
		return err;

	/* choose the destination pid and group */
	if (msg->msg_namelen) {
		/*
		 * The fields read below all live inside struct sockaddr_nl;
		 * refuse an address that is too short to contain them.
		 */
		if (msg->msg_namelen < sizeof(*addr))
			return -EINVAL;
		if (addr->nl_family != AF_NETLINK)
			return -EINVAL;
		dst_pid = addr->nl_pid;
		dst_group = ffs(addr->nl_groups);
		if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND))
			return -EPERM;
	} else {
		dst_pid = nlk->dst_pid;
		dst_group = nlk->dst_group;
	}

	/* the sender needs a pid of its own: autobind one if it is 0 */
	if (!nlk->pid) {
		err = netlink_autobind(sock);
		if (err)
			goto out;
	}

	/* refuse messages larger than sk_sndbuf - 32 */
	err = -EMSGSIZE;
	if (len > sk->sk_sndbuf - 32)
		goto out;

	/* allocate the skb that will carry the message */
	err = -ENOBUFS;
	skb = nlmsg_new(len, GFP_KERNEL);
	if (skb == NULL)
		goto out;

	/* fill in the netlink control block of the skb */
	NETLINK_CB(skb).pid = nlk->pid;
	NETLINK_CB(skb).dst_pid = dst_pid;
	NETLINK_CB(skb).dst_group = dst_group;
	NETLINK_CB(skb).loginuid = audit_get_loginuid(current->audit_context);
	selinux_get_task_sid(current, &(NETLINK_CB(skb).sid));
	memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred));

	/* What can I do? Netlink is asynchronous, so that
	   we will have to save current capabilities to
	   check them, when this message will be delivered
	   to corresponding kernel module.   --ANK (980802)
	 */

	/* copy the payload from the caller's iovec into the skb */
	err = -EFAULT;
	if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) {
		kfree_skb(skb);
		goto out;
	}

	/* @netlink_send:
	 *	Save security information for a netlink message so that permission
	 *	checking can be performed when the message is processed.  The security
	 *	information can be saved using the eff_cap field of the
	 *	netlink_skb_parms structure.  Also may be used to provide fine
	 *	grained control over message transmission.
	 *	@sk associated sock of task sending the message.,
	 *	@skb contains the sk_buff structure for the netlink message.
	 *	Return 0 if the information was successfully saved and message
	 *	is allowed to be transmitted.
	 */
	err = security_netlink_send(sk, skb);
	if (err) {
		kfree_skb(skb);
		goto out;
	}

	/* a destination group is set: broadcast before the unicast */
	if (dst_group) {
		/*
		 * netlink_broadcast() calls kfree_skb() on the skb it is
		 * given; the extra user count keeps the skb alive for the
		 * unicast below.
		 */
		atomic_inc(&skb->users);
		netlink_broadcast(sk, skb, dst_pid, dst_group, GFP_KERNEL);
	}

	/* unicast to dst_pid; its result is what the caller sees */
	err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags & MSG_DONTWAIT);

out:
	return err;
}
/*
 * netlink_broadcast - offer an skb to every socket on the protocol's
 * multicast list.
 * @ssk:        sending socket; it is excluded from delivery
 * @skb:        message to broadcast; this function calls kfree_skb() on it
 * @pid:        pid passed to do_one_broadcast() in the broadcast data
 * @group:      destination group number
 * @allocation: gfp flags used for trimming and cloning
 *
 * Returns 0 if at least one socket received the message, -ENOBUFS if a
 * failure was recorded and nothing was delivered, otherwise -ESRCH.
 */
int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid,
		      u32 group, gfp_t allocation)
{
	struct netlink_broadcast_data bcast;
	struct hlist_node *pos;
	struct sock *target;

	/* let netlink_trim() adjust the skb before it is shared out */
	skb = netlink_trim(skb, allocation);

	bcast.skb = skb;
	bcast.skb2 = NULL;
	bcast.exclude_sk = ssk;
	bcast.pid = pid;
	bcast.group = group;
	bcast.allocation = allocation;
	bcast.failure = 0;
	bcast.congested = 0;
	bcast.delivered = 0;

	/* While we sleep in clone, do not allow to change socket list */
	netlink_lock_table();

	sk_for_each_bound(target, pos, &nl_table[ssk->sk_protocol].mc_list)
		do_one_broadcast(target, &bcast);

	/* drop this function's use of the skb while the table is still locked */
	kfree_skb(skb);

	netlink_unlock_table();

	/* a copy that was prepared but never handed to a socket */
	if (bcast.skb2)
		kfree_skb(bcast.skb2);

	if (!bcast.delivered)
		return bcast.failure ? -ENOBUFS : -ESRCH;

	/* a receiver reported congestion and the caller may wait: yield */
	if (bcast.congested && (allocation & __GFP_WAIT))
		yield();
	return 0;
}
// Try to deliver the broadcast described by *p to one socket, sk.
// Updates p->skb2, p->failure, p->congested and p->delivered as it goes.
// Always returns 0; results are reported through *p.
static inline int do_one_broadcast(struct sock *sk,
struct netlink_broadcast_data *p)
{
struct netlink_sock *nlk = nlk_sk(sk);
int val;
// the excluded socket never receives the broadcast
if (p->exclude_sk == sk)
goto out;
// skip sockets whose pid equals p->pid, whose bitmap does not cover
// p->group, or which have not set the bit for p->group
if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups ||
!test_bit(p->group - 1, nlk->groups))
goto out;
// an earlier clone failed: only record an overrun on this socket
if (p->failure) {
netlink_overrun(sk);
goto out;
}
// hold the socket for the duration of the delivery attempt
sock_hold(sk);
// no copy is pending from a previous iteration: prepare one
if (p->skb2 == NULL) {
if (skb_shared(p->skb)) {
// the skb is shared: clone it
p->skb2 = skb_clone(p->skb, p->allocation);
} else {
// the skb is not shared: use p->skb itself through skb_get()
p->skb2 = skb_get(p->skb);
/*
* skb ownership may have been set when
* delivered to a previous socket.
*/
skb_orphan(p->skb2);
}
}
if (p->skb2 == NULL) {
// still NULL, so the clone failed
netlink_overrun(sk);
/* Clone failed. Notify ALL listeners. */
p->failure = 1;
// otherwise try to deliver skb2; a negative result is an overrun and
// skb2 stays in p for the next socket
} else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) {
netlink_overrun(sk);
} else {
// delivered: accumulate the congestion result and clear skb2, which
// has been handed to the receiving socket
p->congested |= val;
p->delivered = 1;
p->skb2 = NULL;
}
sock_put(sk);
out:
return 0;
}
/*
 * netlink_broadcast_deliver - queue a broadcast skb on one receiving socket.
 * @sk:  receiving socket
 * @skb: message to queue
 *
 * Returns -1 without queueing when the socket's receive allocation already
 * exceeds sk_rcvbuf or bit 0 of nlk->state is set.  Otherwise the skb is
 * queued and the return value is 1 if the receive allocation now exceeds
 * sk_rcvbuf, else 0.
 */
static __inline__ int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb)
{
	struct netlink_sock *nlk = nlk_sk(sk);

	/* receiver has no room, or its state bit 0 is set */
	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
	    test_bit(0, &nlk->state))
		return -1;

	/*
	 * Charge the skb to the receiver, append it to the receiver's own
	 * receive queue, then invoke the socket's sk_data_ready callback.
	 */
	skb_set_owner_r(skb, sk);
	skb_queue_tail(&sk->sk_receive_queue, skb);
	sk->sk_data_ready(sk, skb->len);

	return atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf;
}
/*
 * netlink_unicast - deliver an skb to the socket identified by a pid.
 * @ssk:      sending socket
 * @skb:      message to deliver
 * @pid:      destination pid, resolved with netlink_getsockbypid()
 * @nonblock: non-zero to avoid blocking
 *
 * The lookup is repeated for as long as netlink_attachskb() returns 1.
 *
 * Returns the value of netlink_sendskb() on success, the PTR_ERR() of a
 * failed lookup (the skb is freed here in that case), or the error returned
 * by netlink_attachskb().
 */
int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 pid, int nonblock)
{
	struct sock *dest;
	long timeo;
	int rc;

	skb = netlink_trim(skb, gfp_any());
	timeo = sock_sndtimeo(ssk, nonblock);

	do {
		/* find the destination socket for this pid */
		dest = netlink_getsockbypid(ssk, pid);
		if (IS_ERR(dest)) {
			kfree_skb(skb);
			return PTR_ERR(dest);
		}
		/* attach the skb to the destination; 1 asks for a new lookup */
		rc = netlink_attachskb(dest, skb, nonblock, timeo, ssk);
	} while (rc == 1);

	if (rc)
		return rc;

	return netlink_sendskb(dest, skb, ssk->sk_protocol);
}
/*
* Attach a skb to a netlink socket.
* The caller must hold a reference to the destination socket. On error, the
* reference is dropped. The skb is not sent to the destination, just
* all error checks are performed and memory in the queue is reserved.
* Return values:
* < 0: error. skb freed, reference to sock dropped.
* 0: continue
* 1: repeat lookup - reference dropped while waiting for socket memory.
*
* timeo is received by value: the remaining time stored into it after
* schedule_timeout() is only used locally and is not seen by the caller.
*/
// Note: this function is not static.
int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock,
long timeo, struct sock *ssk)
{
struct netlink_sock *nlk;
nlk = nlk_sk(sk);
// receiver is over its receive buffer limit, or bit 0 of its state is set
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
test_bit(0, &nlk->state)) {
// wait queue entry for the current task
DECLARE_WAITQUEUE(wait, current);
// no timeout left: fail with -EAGAIN; an overrun is recorded on the
// receiver when there is no sender socket or the sender's pid is 0
if (!timeo) {
if (!ssk || nlk_sk(ssk)->pid == 0)
netlink_overrun(sk);
sock_put(sk);
kfree_skb(skb);
return -EAGAIN;
}
// mark the task interruptible before queueing it on the wait queue
__set_current_state(TASK_INTERRUPTIBLE);
// add the current task to the receiver's wait queue
add_wait_queue(&nlk->wait, &wait);
// re-check the condition; sleep only if it still holds and the socket
// is not marked SOCK_DEAD
if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
test_bit(0, &nlk->state)) &&
!sock_flag(sk, SOCK_DEAD))
timeo = schedule_timeout(timeo);
// back to running state
__set_current_state(TASK_RUNNING);
// leave the wait queue
remove_wait_queue(&nlk->wait, &wait);
// drop the caller's reference; the caller must look the socket up again
sock_put(sk);
if (signal_pending(current)) {
// a signal is pending for the current task: free the skb and return
// the errno chosen by sock_intr_errno()
kfree_skb(skb);
return sock_intr_errno(timeo);
}
// return 1 so the caller repeats the socket lookup
return 1;
}
// there is room: make this socket the owner of the skb
skb_set_owner_r(skb, sk);
return 0;
}
/*
 * netlink_sendskb - queue an skb on a socket's receive queue.
 * @sk:       receiving socket; one reference on it is dropped here
 * @skb:      message to queue
 * @protocol: not used by this function
 *
 * Not static.  Returns the skb length, sampled before the skb is queued.
 */
int netlink_sendskb(struct sock *sk, struct sk_buff *skb, int protocol)
{
	/* read the length first: the skb is handed over once it is queued */
	int queued_len = skb->len;

	skb_queue_tail(&sk->sk_receive_queue, skb);
	/* invoke the socket's sk_data_ready callback */
	sk->sk_data_ready(sk, queued_len);
	sock_put(sk);

	return queued_len;
}
5.9 接收消息
数据是内核传向用户空间的
// Receive one netlink message from the socket's receive queue.
// kiocb: I/O control block; its sock_iocb carries the scm cookie
// sock:  receiving socket
// msg:   destination message header (iovec, optional name buffer, flags)
// len:   size of the caller's buffer
// flags: receive flags; MSG_OOB is rejected, MSG_DONTWAIT selects non-blocking
// Returns the error value if one is set, otherwise the number of bytes copied.
static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
struct msghdr *msg, size_t len,
int flags)
{
// per-call socket I/O control block
struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
// local scm cookie, used when the iocb does not provide one
struct scm_cookie scm;
// socket -> sock
struct sock *sk = sock->sk;
// sock -> netlink sock
struct netlink_sock *nlk = nlk_sk(sk);
// non-zero when the caller asked for a non-blocking receive
int noblock = flags&MSG_DONTWAIT;
size_t copied;
struct sk_buff *skb;
int err;
// out-of-band data is not supported
if (flags&MSG_OOB)
return -EOPNOTSUPP;
copied = 0;
// take one datagram; on failure err is filled in by skb_recv_datagram()
skb = skb_recv_datagram(sk,flags,noblock,&err);
if (skb==NULL)
goto out;
msg->msg_namelen = 0;
// length of the received message
copied = skb->len;
// caller's buffer is smaller than the message: flag truncation and
// copy only what fits
if (len < copied) {
msg->msg_flags |= MSG_TRUNC;
copied = len;
}
// point skb->h.raw at the start of the data
skb->h.raw = skb->data;
// copy the data into the caller's iovec; err is not checked here and is
// carried through to the final return
err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
if (msg->msg_name) {
// a name buffer was supplied: fill it from the skb's control block
struct sockaddr_nl *addr = (struct sockaddr_nl*)msg->msg_name;
addr->nl_family = AF_NETLINK;
addr->nl_pad = 0;
addr->nl_pid = NETLINK_CB(skb).pid;
addr->nl_groups = netlink_group_mask(NETLINK_CB(skb).dst_group);
msg->msg_namelen = sizeof(*addr);
}
// NETLINK_RECV_PKTINFO is set: let netlink_cmsg_recv_pktinfo() add the
// packet info to msg
if (nlk->flags & NETLINK_RECV_PKTINFO)
netlink_cmsg_recv_pktinfo(msg, skb);
// fall back to the zeroed local scm cookie when the iocb has none
if (NULL == siocb->scm) {
memset(&scm, 0, sizeof(scm));
siocb->scm = &scm;
}
// copy the credentials out of the skb before it is freed
siocb->scm->creds = *NETLINK_CREDS(skb);
skb_free_datagram(sk, skb);
// a dump callback is set and receive memory is at most half of
// sk_rcvbuf: call netlink_dump()
if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2)
netlink_dump(sk);
scm_recv(sock, msg, siocb->scm, flags);
out:
// receive-side wakeup; also reached when no skb was received
netlink_rcv_wake(sk);
return err ? : copied;
}
6. 结论
netlink处理代码不是很好懂, 毕竟和其他协议不同之处是内核中同时存在服务器和客户端的sock, 因
此接收发送数据要注意数据的流向。不过在实际使用中感觉不是很稳定, 流量大时会发生各种奇异的死机现象。