提升网络I/O 性能
翻译:韩红军。[email protected] ; [email protected]
原文出自:https://www.captech.com.cn
英文原文:http://www.xmailserver.org/linux-patches/nio-improve.html
由于水平有限,错误在所难免,希望各位指正。
07-01-2001 – 初稿 - Davide Libenzi
10-30-2002 – epoll 补丁已成为 Linux 内核的一部分。请参考内核中的这个版本,因为它将成为标准,并得到广泛支持。 - Davide Libenzi
绪论:
本文的工作是分析几种不同的方法,这些方法都用来把网络事件从内核模式高效地传递到用户模式。我们考察了五种方法:作为传统老方法的 poll、标准 /dev/poll 接口、标准 RT 信号、带 one-sig-per-fd 补丁的 RT 信号,以及使用一种很不一样的通知方法的新式 /dev/epoll。本工作由如下四部分组成:
1) 新式 /dev/epoll 内核补丁
2) Provos-Lever 的 /dev/poll 补丁(经过修改,能够在 2.4.6 内核上工作)
3) HTTP server
4) 一个能够产生“dead”连接的 deadconn(tm) 工具。
Httperf被采用作为度量工具,尽管不完美,但也提供了足够的网络负载选项。
新式 /dev/epoll 内核补丁:
这个补丁很简单,它在struct file的数据结构里面添加了通知回调链表项。如下代码段:
代码: |
******* include/linux/fs.h
struct file {
	...
	/* file callback list */
	rwlock_t f_cblock;		/* rwlock taken by the fcblist_*_lock() macros around f_cblist accesses */
	struct list_head f_cblist;	/* head of the list of struct fcb_struct entries registered on this file */
}; |
代码: |
****include/linux/fcblist.h
/* file callback notification events */
#define ION_IN 1
#define ION_OUT 2
#define ION_HUP 3
#define ION_ERR 4

/* number of unsigned long slots in fcb_struct.local */
#define FCB_LOCAL_SIZE 4

/*
 * Lock helpers for a file's callback list. `fp` is the struct file pointer;
 * `fl` is the unsigned long that receives / supplies the saved IRQ flags
 * for the irqsave / irqrestore lock variants.
 */
#define fcblist_read_lock(fp, fl) read_lock_irqsave(&(fp)->f_cblock, fl)
#define fcblist_read_unlock(fp, fl) read_unlock_irqrestore(&(fp)->f_cblock, fl)
#define fcblist_write_lock(fp, fl) write_lock_irqsave(&(fp)->f_cblock, fl)
#define fcblist_write_unlock(fp,fl) write_unlock_irqrestore(&(fp)->f_cblock, fl)

/* One registered callback; entries are linked into a file's f_cblist. */
struct fcb_struct {
	struct list_head lnk;	/* link into file->f_cblist */
	/* callback, invoked by file_notify_event() as cbproc(filep, data, local, event) */
	void (*cbproc)(struct file *, void *, unsigned long *, long *);
	void *data;		/* caller-supplied pointer stored at registration and handed back to cbproc */
	unsigned long local[FCB_LOCAL_SIZE];	/* per-entry storage handed to cbproc; zeroed by file_notify_addcb() */
};

/* Tables indexed with (band - POLL_IN) by sk_wake_async(); their contents are defined elsewhere. */
extern long ion_band_table[];
extern long poll_band_table[];

/* Initialise the callback lock and make the callback list empty for a struct file. */
static inline void file_notify_init(struct file *filep)
{
	rwlock_init(&filep->f_cblock);
	INIT_LIST_HEAD(&filep->f_cblist);
}

/* Call every callback registered on filep, passing `event` to each. */
void file_notify_event(struct file *filep, long *event);
/* Register cbproc/data on filep; returns 0, or -ENOMEM when the entry cannot be allocated. */
int file_notify_addcb(struct file *filep, void (*cbproc)(struct file *, void *, unsigned long *, long *), void *data);
/* Unregister the first entry whose callback equals cbproc; returns 0, or -ENOENT when none matches. */
int file_notify_delcb(struct file *filep, void (*cbproc)(struct file *, void *, unsigned long *, long *));
/* Remove and free all callback entries of filep. */
void file_notify_cleanup(struct file *filep); |
代码: |
****** fs/file_table.c
/* Excerpt: a newly obtained struct file gets its callback lock and list initialised. */
struct file * get_empty_filp(void)
{
	...
	file_notify_init(f);
	...
}

/* Excerpt: the same initialisation is done for a private file. */
int init_private_file(struct file *filp, struct dentry *dentry, int mode)
{
	...
	file_notify_init(filp);
	...
}

/* Excerpt: fput() calls file_notify_cleanup() so the file's callback entries are freed. */
void fput(struct file * file)
{
	...
	file_notify_cleanup(file);
	...
}

****** fs/fcblist.c
/*
 * Walk filep's callback list and invoke each entry's cbproc with the file,
 * the entry's data pointer, its local[] storage and the event array.
 * The read lock (IRQ-saving variant) is held across the whole walk, so the
 * callbacks themselves run with that lock held.
 */
void file_notify_event(struct file *filep, long *event)
{
	unsigned long flags;
	struct list_head *lnk;

	fcblist_read_lock(filep, flags);
	list_for_each(lnk, &filep->f_cblist) {
		struct fcb_struct *fcbp = list_entry(lnk, struct fcb_struct, lnk);

		fcbp->cbproc(filep, fcbp->data, fcbp->local, event);
	}
	fcblist_read_unlock(filep, flags);
}

/*
 * Allocate a zeroed fcb_struct, store cbproc and data in it and append it
 * to the tail of filep's callback list under the write lock.
 * No check is made for an already-registered cbproc.
 * Returns 0 on success, -ENOMEM when kmalloc() fails.
 */
int file_notify_addcb(struct file *filep, void (*cbproc)(struct file *, void *, unsigned long *, long *), void *data)
{
	unsigned long flags;
	struct fcb_struct *fcbp;

	/* allocation happens before the lock is taken */
	if (!(fcbp = (struct fcb_struct *) kmalloc(sizeof(struct fcb_struct), GFP_KERNEL)))
		return -ENOMEM;
	memset(fcbp, 0, sizeof(struct fcb_struct));
	fcbp->cbproc = cbproc;
	fcbp->data = data;
	fcblist_write_lock(filep, flags);
	list_add_tail(&fcbp->lnk, &filep->f_cblist);
	fcblist_write_unlock(filep, flags);
	return 0;
}

/*
 * Find the first entry whose cbproc equals the given one, unlink it and
 * free it. Only that first match is removed.
 * Returns 0 when an entry was removed, -ENOENT when no entry matched.
 */
int file_notify_delcb(struct file *filep, void (*cbproc)(struct file *, void *, unsigned long *, long *))
{
	unsigned long flags;
	struct list_head *lnk;

	fcblist_write_lock(filep, flags);
	list_for_each(lnk, &filep->f_cblist) {
		struct fcb_struct *fcbp = list_entry(lnk, struct fcb_struct, lnk);

		if (fcbp->cbproc == cbproc) {
			/* the walk is abandoned right after list_del(), so the removed node is never advanced from */
			list_del(lnk);
			/* lock is released before the entry is freed */
			fcblist_write_unlock(filep, flags);
			kfree(fcbp);
			return 0;
		}
	}
	fcblist_write_unlock(filep, flags);
	return -ENOENT;
}

/*
 * Unlink and free every entry on filep's callback list. The write lock is
 * dropped around each kfree() and re-taken before the list is examined again.
 * NOTE(review): list_first() is not defined in this excerpt; the loop relies
 * on it yielding NULL once the list is empty - confirm against its definition.
 */
void file_notify_cleanup(struct file *filep)
{
	unsigned long flags;
	struct list_head *lnk;

	fcblist_write_lock(filep, flags);
	while ((lnk = list_first(&filep->f_cblist))) {
		struct fcb_struct *fcbp = list_entry(lnk, struct fcb_struct, lnk);

		list_del(lnk);
		fcblist_write_unlock(filep, flags);
		kfree(fcbp);
		fcblist_write_lock(filep, flags);
	}
	fcblist_write_unlock(filep, flags);
} |
代码: |
****** include/net/sock.h
/*
 * Does nothing when sk->socket is NULL. Otherwise:
 *  - if the socket has a file, builds an event array and passes it to
 *    file_notify_event() so the file's registered callbacks are invoked;
 *  - then, if the socket's fasync_list is set, calls sock_wake_async()
 *    with the same how/band arguments.
 */
static inline void sk_wake_async(struct sock *sk, int how, int band)
{
	if (sk->socket) {
		if (sk->socket->file) {
			/*
			 * event[0] and event[1] are looked up in ion_band_table and
			 * poll_band_table at index (band - POLL_IN); event[2] is -1
			 * (presumably an end marker - confirm against the callbacks).
			 */
			long event[] = { ion_band_table[band - POLL_IN], poll_band_table[band - POLL_IN], -1 };

			file_notify_event(sk->socket->file, event);
		}
		if (sk->socket->fasync_list)
			sock_wake_async(sk->socket, how, band);
	}
} |
代码: |
/* Open the /dev/epoll device read/write; kdpfd is the resulting descriptor. */
if ((kdpfd = open("/dev/epoll", O_RDWR)) == -1) {
	/* taken when open() returned -1; no handling is shown in this excerpt */
}
/* Issue EP_ALLOC with maxfds on the device (presumably sizes it for maxfds descriptors - confirm). */
if (ioctl(kdpfd, EP_ALLOC, maxfds)) {
	/* taken when ioctl() returned non-zero; no handling is shown in this excerpt */
}
/*
 * Map EP_MAP_SIZE(maxfds) bytes of the device read-only and privately;
 * `map` is the char * base later combined with a result offset.
 */
if ((map = (char *) mmap(NULL, EP_MAP_SIZE(maxfds), PROT_READ, MAP_PRIVATE, kdpfd, 0)) == (char *) -1) {
	/* taken when mmap() returned (char *) -1, its failure value; no handling is shown */
} |
代码: |
/*
 * Describe fd in a struct pollfd with the IN/OUT/ERR/HUP event bits set and
 * write that record to kdpfd (presumably this adds fd to the set the device
 * monitors - confirm against the driver).
 */
struct pollfd pfd;

pfd.fd = fd;
pfd.events = POLLIN | POLLOUT | POLLERR | POLLHUP;
pfd.revents = 0;
/* anything other than a full-record write is treated as failure */
if (write(kdpfd, &pfd, sizeof(pfd)) != sizeof(pfd)) {
	...
} |
代码: |
/*
 * Write a struct pollfd for fd with events set to POLLREMOVE to kdpfd
 * (presumably this removes fd from the set the device monitors - confirm
 * against the driver).
 */
struct pollfd pfd;

pfd.fd = fd;
pfd.events = POLLREMOVE;
pfd.revents = 0;
/* anything other than a full-record write is treated as failure */
if (write(kdpfd, &pfd, sizeof(pfd)) != sizeof(pfd)) {
	...
} |
代码: |
/*
 * Event loop: each pass issues EP_POLL on kdpfd, then walks the returned
 * struct pollfd records found inside the mapped area.
 */
struct pollfd *pfds;
struct evpoll evp;

for (;;) {
	evp.ep_timeout = STD_SCHED_TIMEOUT;
	evp.ep_resoff = 0;

	/* the ioctl's return value is used as the number of records to visit */
	nfds = ioctl(kdpfd, EP_POLL, &evp);
	/* ep_resoff is read back after the call and used as a byte offset from `map` */
	pfds = (struct pollfd *) (map + evp.ep_resoff);
	/* the body is skipped when nfds is zero or negative */
	for (ii = 0; ii < nfds; ii++, pfds++) {
		...
	}
} |