主要内容:socket发送函数的系统调用、Socket层实现。
内核版本:3.15.2
我的博客:http://blog.csdn.net/zhangskd
以下是send()、sendto()、sendmsg()和sendmmsg()的发送流程图,这四个函数除了在系统调用层面
上有些差别,在Socket层和TCP层的实现都是相同的。
应用层可以使用以下Socket函数来发送数据:
ssize_t write(int fd, const void *buf, size_t count);
ssize_t send(int s, const void *buf, size_t len, int flags);
ssize_t sendto(int s, const void *buf, size_t len, int flags, const struct sockaddr *to, socklen_t tolen);
ssize_t sendmsg(int s, const struct msghdr *msg, int flags);
int sendmmsg(int s, struct mmsghdr *msgvec, unsigned int vlen, unsigned int flags);
这些发送函数有什么区别呢?
当flags为0时,send()和write()功能相同。
send(s, buf, len, flags)和sendto(s, buf, len, flags, NULL, 0)功能相同。
write()和send()在套接字处于连接状态时可以使用,而sendto()、sendmsg()和sendmmsg()在任何时候都可用。
用户层的数据最终都是以消息头来描述的。
/*
 * User-space message header taken by sendmsg(): it names an optional
 * destination address, the data blocks to send and optional ancillary data.
 */
struct msghdr {
	void *msg_name;            /* optional address (the destination address) */
	socklen_t msg_namelen;     /* size of address (length of the destination address) */
	struct iovec *msg_iov;     /* scatter/gather array: the separate data blocks */
	size_t msg_iovlen;         /* number of elements (data blocks) in msg_iov */
	void *msg_control;         /* ancillary (control) data */
	socklen_t msg_controllen;  /* length of the ancillary data buffer */
	int msg_flags;             /* flags on received message */
};
/*
 * Structure for scatter/gather I/O.
 * Each element describes one data block: where it starts and how long it is.
 */
struct iovec {
	void *iov_base;  /* Pointer to data. */
	size_t iov_len;  /* Length of data. */
};
发送默认为阻塞发送,也可以设置为非阻塞发送。
非阻塞标志:O_NONBLOCK、MSG_DONTWAIT
When the message does not fit into the send buffer of the socket, send() normally blocks, unless the
socket has been placed in non-blocking I/O mode.
MSG_DONTWAIT: Enables non-blocking operation; if the operation would block, EAGAIN is returned (this can also be enabled
using the O_NONBLOCK flag with the F_SETFL fcntl(2)).
发送函数是由glibc提供的,声明位于include/sys/socket.h中,Linux下的实现位于sysdeps/unix/sysv/linux/目录中,
主要是用来从用户空间进入名为sys_socketcall的系统调用,并传递参数。在i386等使用socketcall的体系结构上,
sys_socketcall()实际上是所有socket函数进入内核空间的共同入口。
/*
 * Excerpt of the socketcall() multiplexer: dispatches on @call to the
 * individual socket system calls. Only the send-related cases are shown;
 * "..." marks code elided from this excerpt (presumably including the
 * setup of a0, a1, a[] and err -- confirm against the full source).
 */
SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
{
	...
	switch (call) {
	...
	case SYS_SEND:
		err = sys_send(a0, (void __user *)a1, a[2], a[3]);
		break;
	case SYS_SENDTO:
		err = sys_sendto(a0, (void __user *)a1, a[2], a[3],
				 (struct sockaddr __user *)a[4], a[5]);
		break;
	...
	case SYS_SENDMSG:
		err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]);
		break;
	case SYS_SENDMMSG:
		/* sendmmsg() takes a vector of struct mmsghdr, not struct msghdr. */
		err = sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3]);
		break;
	...
	}
}
send()其实是sendto()的一种特殊情况。
/*
 * send() system call: a special case of sendto(). It forwards its four
 * arguments to sys_sendto() with a NULL destination address and an
 * address length of 0, and returns sys_sendto()'s result.
 */
SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len, unsigned, flags)
{
	return sys_sendto(fd, buff, len, flags, NULL, 0);
}
sendto()初始化了消息头,接着就调用sock_sendmsg()来处理。
/*
 * sendto() system call.
 *
 * Builds a one-element message header around the user buffer, optionally
 * copies the destination address into kernel space, and passes the message
 * to sock_sendmsg().
 *
 * @fd:       file descriptor of the socket
 * @buff:     user-space data to send
 * @len:      number of bytes in @buff (clamped to INT_MAX)
 * @flags:    send flags; MSG_DONTWAIT is added when the file is O_NONBLOCK
 * @addr:     optional user-space destination address, may be NULL
 * @addr_len: length of @addr
 *
 * Returns the result of sock_sendmsg(), or the error reported by the
 * socket lookup / address copy.
 */
SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
		unsigned, flags, struct sockaddr __user *, addr,
		int, addr_len)
{
	struct sockaddr_storage address;
	struct socket *sock;
	struct msghdr msg;
	struct iovec iov;
	int fput_needed;
	int err;

	if (len > INT_MAX)
		len = INT_MAX;

	/*
	 * Find the socket instance that belongs to fd: fd indexes the current
	 * process's file descriptor table (files_struct) to reach the file
	 * instance, whose private_data member holds the socket instance.
	 */
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (sock == NULL)
		return err;

	/* The payload is a single data block. */
	iov.iov_base = buff;
	iov.iov_len = len;

	/* Fill in the message header. */
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = NULL;
	msg.msg_controllen = 0;

	if (addr == NULL) {
		msg.msg_name = NULL;
		msg.msg_namelen = 0;
	} else {
		/* Copy the socket address from user space into kernel space. */
		err = move_addr_to_kernel(addr, addr_len, &address);
		if (err < 0)
			goto out_put;
		msg.msg_name = (struct sockaddr *)&address;
		msg.msg_namelen = addr_len;
	}

	/* A non-blocking file makes this send non-blocking too. */
	if (sock->file->f_flags & O_NONBLOCK)
		flags |= MSG_DONTWAIT;
	msg.msg_flags = flags;

	/* Common send entry point. */
	err = sock_sendmsg(sock, &msg, len);

out_put:
	fput_light(sock->file, fput_needed);
	return err;
}
/*
 * Message header as used inside the kernel (lengths are __kernel_size_t);
 * this is the structure the sendto() system call code fills in before
 * calling sock_sendmsg().
 */
struct msghdr {
	void *msg_name;                  /* ptr to socket address structure */
	int msg_namelen;                 /* size of socket address structure */
	struct iovec *msg_iov;           /* scatter/gather array: the separate data blocks */
	__kernel_size_t msg_iovlen;      /* number of elements (data blocks) in msg_iov */
	void *msg_control;               /* ancillary (control) data */
	__kernel_size_t msg_controllen;  /* length of the ancillary data buffer */
	unsigned int msg_flags;          /* flags on received message */
};

/* Structure for scatter/gather I/O: one element per data block. */
struct iovec {
	void *iov_base;           /* Pointer to data. */
	__kernel_size_t iov_len;  /* Length of data. */
};

/* For recvmmsg/sendmmsg: one message header plus a per-message length. */
struct mmsghdr {
	struct msghdr msg_hdr;  /* the message itself */
	unsigned int msg_len;   /* NOTE(review): presumably the byte count transferred for this entry -- confirm against sendmmsg(2) */
};
sock_sendmsg()在初始化异步IO控制块后,调用__sock_sendmsg()。
int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct kiocb iocb; struct sock_iocb siocb; int ret; init_sync_kiocb(&iocb, NULL); iocb.private = &siocb; ret = __sock_sendmsg(&iocb, sock, msg, size); /* iocb queued, will get completion event */ if (-EIOCBQUEUED == ret) ret = wait_on_sync_kiocb(&iocb); return ret; } /* AIO控制块 */ struct kiocb { struct file *ki_filp; struct kioctx *ki_ctx; /* NULL for sync ops,如果是同步的则为NULL */ kiocb_cancel_fn *ki_cancel; void *private; /* 指向sock_iocb */ union { void __user *user; struct task_struct *tsk; /* 执行io的进程 */ } ki_obj; __u64 ki_user_data; /* user's data for completion */ loff_t ki_pos; size_t ki_nbytes; /* copy of iocb->aio_nbytes */ struct list_head ki_list; /* the aio core uses this for cancellation */ /* If the aio_resfd field of the userspace iocb is not zero, * this is the underlying eventfd context to deliver events to. */ struct eventfd_ctx *ki_eventfd; };
__sock_sendmsg()会调用Socket层的发送函数,如果是SOCK_STREAM,
那么接着就调用inet_sendmsg()处理。
/*
 * Record the request in the sock_iocb attached to @iocb and pass the message
 * to the socket's proto_ops sendmsg handler, without any security check.
 *
 * Defined ahead of __sock_sendmsg() so that the static inline helper is
 * declared before its caller uses it.
 *
 * Returns whatever sock->ops->sendmsg() returns.
 */
static inline int __sock_sendmsg_nosec(struct kiocb *iocb, struct socket *sock,
				       struct msghdr *msg, size_t size)
{
	struct sock_iocb *si = kiocb_to_siocb(iocb);

	si->sock = sock;
	si->scm = NULL;
	si->msg = msg;
	si->size = size;

	/*
	 * Call the socket-layer operation. For SOCK_STREAM the proto_ops is
	 * inet_stream_ops, whose sendmsg pointer is inet_sendmsg().
	 */
	return sock->ops->sendmsg(iocb, sock, msg, size);
}

/*
 * Run security_socket_sendmsg() first. A non-zero result is returned as is;
 * otherwise the message is sent through __sock_sendmsg_nosec().
 */
static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock,
				 struct msghdr *msg, size_t size)
{
	int err = security_socket_sendmsg(sock, msg, size);

	return err ?: __sock_sendmsg_nosec(iocb, sock, msg, size);
}
sendmsg()和sendmmsg()在系统调用函数中也是拷贝用户空间的数据到内核消息头,最后调用
Socket层的发送函数inet_sendmsg()进行下一步处理,这里不再赘述。
SOCK_STREAM套接口的socket层操作函数集实例为inet_stream_ops,其中发送函数为inet_sendmsg()。
/*
 * Socket-layer operation table for SOCK_STREAM sockets (excerpt; "..." marks
 * members elided here). Its send handler is inet_sendmsg().
 */
const struct proto_ops inet_stream_ops = {
	.family = PF_INET,
	.owner = THIS_MODULE,
	...
	.sendmsg = inet_sendmsg,  /* send handler reached through sock->ops->sendmsg */
	...
};
inet_sendmsg()主要调用TCP层的发送函数tcp_sendmsg()来处理。
int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; sock_rps_record_flow(sk); /* We may need to bnd the socket. * 如果连接还没有分配本地端口,且允许自动绑定,那么给连接绑定一个本地端口。 * tcp_prot的no_autobaind为true,所以TCP是不允许自动绑定端口的。 */ if (! inet_sk(sk)->inet_num && ! sk->sk_prot->no_autobind && inet_autobind(s)) return -EAGAIN; /* 如果传输层使用的是TCP,则sk_prot为tcp_prot,sendmsg指向tcp_sendmsg() */ return sk->sk_prot->sendmsg(iocb, sk, msg, size); } /* Automatically bind an unbound socket. */ static int inet_autobind(struct sock *sk) { struct inet_sock *inet; /* We may need to bind the socket. */ lock_sock(sk); /* 如果还没有分配本地端口 */ if (! inet->inet_num) { /* SOCK_STREAM套接口的TCP操作函数集为tcp_prot,其中端口绑定函数为 * inet_csk_get_port()。 */ if (sk->sk_prot->get_port(sk, 0)) { release_sock(sk); return -EAGAIN; } inet->inet_sport = htons(inet->inet_num); } release_sock(sk); return 0; }