epoll的原理和应用- I/O多路复用

网络服务器的传统实现方式是为每个连接分配一个进程或线程来处理,对于大规模的并发访问来说,这种方式的时间和空间效率都很低。因此推荐的方法是:在一个线程中使用非阻塞IO,循环处理IO事件。

epoll系统调用
epoll系统调用在linux2.6开始支持,提供了三个函数。
epoll_create()创建epoll实例。
epoll_ctl()为epoll实例添加或删除描述符。
epoll_wait()用于等待特定的事件。

当描述符添加到epoll实例时可以设置为两种模式,Edge-Triggered和Level-Triggered。两种模式的区别是:
1、采用 Level-Triggered模式时,当IO数据可读时,epoll_wait()会因事件到达而返回。如果你数据没有读完,还可以调用epoll_wait()并马上返回。这是epoll默认采用的模式。

2、采用 Edge-Triggered模式时,当IO数据可读时,事件到达的通知只会发出一次。如果数据没有读完,再次调用epoll_wait()时还会阻塞(因为刚才已经发出过一次事件通知了,需要等到有新事件到达)。

epoll 事件的结构体
/* User data attached to a descriptor registered with epoll.  Because this
   is a union, all members share the same storage, so only one of them is
   meaningful at a time.  The example server below stores the descriptor
   itself in `fd'. */
typedef union epoll_data
{
  void        *ptr;  /* pointer-valued user data */
  int          fd;   /* file descriptor */
  __uint32_t   u32;  /* 32-bit unsigned value */
  __uint64_t   u64;  /* 64-bit unsigned value */
} epoll_data_t;

/* One epoll event: an event bit mask paired with the caller's user data.
   The example server below fills one in before calling epoll_ctl() and
   reads them back from the array filled in by epoll_wait(). */
struct epoll_event
{
  __uint32_t   events; /* Epoll events (bit mask, e.g. EPOLLIN | EPOLLET) */
  epoll_data_t data;   /* User data variable */
};

一般要处理的到达事件:
出错事件
新连接
发送来的数据就绪
要发送出的数据就绪

下面实现一个小型的TCP服务器,能输出所有发送到服务器上的数据。
第一步,写一个函数创建并绑定一个TCP监听套接字;
第二步,将套接字设置为非阻塞;
第三步,事件循环。
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netdb.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/epoll.h>
#include <errno.h>

/* Maximum number of events fetched by a single epoll_wait() call; also the
   number of entries allocated for the events buffer in main(). */
#define MAXEVENTS 64

/* Put descriptor SFD into non-blocking mode by adding O_NONBLOCK to its
   current file status flags.

   Returns 0 on success.  If either fcntl() call fails, the error is
   reported with perror("fcntl") and -1 is returned. */
static int make_socket_non_blocking (int sfd)
{
  int current = fcntl (sfd, F_GETFL, 0);

  /* Only attempt to set the flags when they could be read; the existing
     flags are preserved and O_NONBLOCK is OR-ed in. */
  if (current != -1 && fcntl (sfd, F_SETFL, current | O_NONBLOCK) != -1)
    return 0;

  perror ("fcntl");
  return -1;
}

/* Create a TCP socket and bind it to PORT on the wildcard address.

   PORT is a service name or decimal port number passed to getaddrinfo().
   Every candidate address returned (IPv4 and/or IPv6) is tried in order
   until one can be both created and bound.

   Returns the bound (not yet listening) socket descriptor on success, or
   -1 after printing a diagnostic to stderr.  The caller owns the returned
   descriptor and must close() it. */
static int create_and_bind (char *port)
{
  struct addrinfo hints;
  struct addrinfo *result, *rp;
  int s, sfd = -1;

  memset (&hints, 0, sizeof hints);
  hints.ai_family = AF_UNSPEC;     /* Return IPv4 and IPv6 choices */
  hints.ai_socktype = SOCK_STREAM; /* We want a TCP socket */
  hints.ai_flags = AI_PASSIVE;     /* All interfaces */

  /* A single lookup may yield several candidate addresses. */
  s = getaddrinfo (NULL, port, &hints, &result);
  if (s != 0)
    {
      fprintf (stderr, "getaddrinfo: %s\n", gai_strerror (s));
      return -1;
    }

  /* Walk the candidate list; sfd stays -1 unless a bind succeeds. */
  for (rp = result; rp != NULL; rp = rp->ai_next)
    {
      sfd = socket (rp->ai_family, rp->ai_socktype, rp->ai_protocol);
      if (sfd == -1)
        continue;

      if (bind (sfd, rp->ai_addr, rp->ai_addrlen) == 0)
        {
          /* We managed to bind successfully! */
          break;
        }

      close (sfd);
      sfd = -1;
    }

  /* Release the address list on every path, including the failure path,
     so that it is not leaked when no address could be bound. */
  freeaddrinfo (result);

  if (sfd == -1)
    {
      fprintf (stderr, "Could not bind\n");
      return -1;
    }

  return sfd;
}

/* Accept every connection currently queued on the listening socket SFD,
   switch each one to non-blocking mode and register it with the epoll
   instance EFD for edge-triggered read notifications.

   The listening socket is itself edge-triggered, so a single notification
   may stand for several pending connections; we therefore keep accepting
   until accept() reports that the queue is empty.  Failure to configure or
   register an accepted socket aborts the process. */
static void accept_pending_connections (int efd, int sfd)
{
  while (1)
    {
      /* sockaddr_storage is large enough for both IPv4 and IPv6 peers; a
         plain struct sockaddr would truncate IPv6 addresses. */
      struct sockaddr_storage in_addr;
      socklen_t in_len = sizeof in_addr;
      struct epoll_event event = { 0 };
      char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];
      int infd, s;

      infd = accept (sfd, (struct sockaddr *) &in_addr, &in_len);
      if (infd == -1)
        {
          /* EAGAIN / EWOULDBLOCK: all incoming connections have been
             processed.  Anything else is reported, then we give up on
             this batch. */
          if (errno != EAGAIN && errno != EWOULDBLOCK)
            perror ("accept");
          break;
        }

      s = getnameinfo ((struct sockaddr *) &in_addr, in_len,
                       hbuf, sizeof hbuf,
                       sbuf, sizeof sbuf,
                       NI_NUMERICHOST | NI_NUMERICSERV);
      if (s == 0)
        {
          printf("Accepted connection on descriptor %d "
                 "(host=%s, port=%s)\n", infd, hbuf, sbuf);
        }

      /* Make the incoming socket non-blocking and add it to the
         list of fds to monitor. */
      if (make_socket_non_blocking (infd) == -1)
        abort ();

      event.data.fd = infd;
      event.events = EPOLLIN | EPOLLET;
      if (epoll_ctl (efd, EPOLL_CTL_ADD, infd, &event) == -1)
        {
          perror ("epoll_ctl");
          abort ();
        }
    }
}

/* Read everything currently available on the non-blocking descriptor FD
   and copy it to standard output.

   In edge-triggered mode no further notification arrives for data that is
   already buffered, so the descriptor must be drained until read() would
   block.

   Returns 1 when the connection should be closed (end of file or a read
   error, which is reported with perror), 0 when all available data has
   been consumed and the connection stays open.  A failed write to standard
   output aborts the process. */
static int drain_connection (int fd)
{
  char buf[512];

  while (1)
    {
      ssize_t count, written;

      count = read (fd, buf, sizeof buf);
      if (count == -1)
        {
          if (errno == EINTR)
            continue;           /* interrupted before any data: retry */
          if (errno == EAGAIN || errno == EWOULDBLOCK)
            return 0;           /* all available data has been read */
          perror ("read");
          return 1;
        }
      if (count == 0)
        {
          /* End of file. The remote has closed the connection. */
          return 1;
        }

      /* write() may accept fewer bytes than requested; loop until the
         whole chunk has reached standard output. */
      written = 0;
      while (written < count)
        {
          ssize_t w = write (STDOUT_FILENO, buf + written,
                             (size_t) (count - written));
          if (w == -1)
            {
              if (errno == EINTR)
                continue;
              perror ("write");
              abort ();
            }
          written += w;
        }
    }
}

/* Small TCP server that prints everything sent to it.

   Usage: PROGRAM PORT -- the single argument is the port to bind to.

   The listening socket and every accepted connection are non-blocking and
   registered with one epoll instance in edge-triggered mode; a single
   thread then services all of them from the event loop.  Setup failures
   abort the process.  The loop ends, and EXIT_FAILURE is returned, if
   epoll_wait() fails or the listening socket reports an error. */
int main (int argc, char *argv[])
{
  int sfd, efd, s;
  int status = EXIT_SUCCESS;
  int running = 1;
  struct epoll_event event = { 0 };
  struct epoll_event *events;

  if (argc != 2)
    {
      fprintf (stderr, "Usage: %s [port]\n", argv[0]);
      exit (EXIT_FAILURE);
    }

  sfd = create_and_bind (argv[1]);
  if (sfd == -1)
    abort ();

  s = make_socket_non_blocking (sfd);
  if (s == -1)
    abort ();

  s = listen (sfd, SOMAXCONN);
  if (s == -1)
    {
      perror ("listen");
      abort ();
    }

  /* Create the epoll instance. */
  efd = epoll_create1 (0);
  if (efd == -1)
    {
      perror ("epoll_create");
      abort ();
    }

  /* Watch the listening socket for readability (= pending connections),
     in edge-triggered mode. */
  event.data.fd = sfd;
  event.events = EPOLLIN | EPOLLET;
  s = epoll_ctl (efd, EPOLL_CTL_ADD, sfd, &event);
  if (s == -1)
    {
      perror ("epoll_ctl");
      abort ();
    }

  /* Buffer where events are returned */
  events = calloc (MAXEVENTS, sizeof *events);
  if (events == NULL)
    {
      perror ("calloc");
      abort ();
    }

  /* The event loop */
  while (running)
    {
      int n, i;

      /* n is the number of descriptors that have events pending. */
      n = epoll_wait (efd, events, MAXEVENTS, -1);
      if (n == -1)
        {
          if (errno == EINTR)
            continue;           /* interrupted by a signal: just retry */
          perror ("epoll_wait");
          status = EXIT_FAILURE;
          break;
        }

      for (i = 0; i < n; i++)
        {
          int fd = events[i].data.fd;

          if ((events[i].events & (EPOLLERR | EPOLLHUP)) ||
              !(events[i].events & EPOLLIN))
            {
              /* An error has occurred on this fd, or the socket is not
                 ready for reading (why were we notified then?) */
              fprintf (stderr, "epoll error\n");
              if (fd == sfd)
                {
                  /* Without a working listening socket the server cannot
                     continue; leave the loop and let the cleanup code
                     close it exactly once. */
                  status = EXIT_FAILURE;
                  running = 0;
                  break;
                }
              close (fd);
            }
          else if (fd == sfd)
            {
              /* We have a notification on the listening socket, which
                 means one or more incoming connections. */
              accept_pending_connections (efd, sfd);
            }
          else if (drain_connection (fd))
            {
              printf ("Closed connection on descriptor %d\n", fd);

              /* Closing the descriptor will make epoll remove it
                 from the set of descriptors which are monitored. */
              close (fd);
            }
        }
    }

  free (events);
  close (efd);
  close (sfd);

  return status;
}

epoll的实现原理
select比epoll效率差的原因:select是轮询,epoll是触发式的。

epoll在内核中注册了一个文件系统,用于存储被监控的文件描述符(比如socket描述符)。
epoll拥有自己的内核高速cache区,用于安置每一个我们想监控的描述符,这些描述符会以红黑树的组织形式保存在内核cache里,以支持快速的查找、插入、删除。
在调用epoll_create()时,内核就会创建好用于存储被监控文件描述符的数据结构(红黑树)。调用epoll_ctl()则是往该内核数据结构里添加新的文件描述符。

就绪链表(Ready List)
epoll维护了一个就绪链表。当某个被epoll监控的文件描述符有事件发生时,向内核发出中断信号,内核会将该事件放到就绪链表中,并发出一个事件到达的通知。这样,所有到达的事件都会保留在就绪链表中,等待用户态进程去处理。

Level-Triggered模式下,只要就绪链表中有就绪事件,调用epoll_wait()就会马上返回让你去处理。
在Edge-Triggered模式下,不管就绪链表中是否有就绪事件,只要不来新的事件到达通知,epoll_wait()会一直阻塞。

参考资料:
https://banu.com/blog/2/how-to-use-epoll-a-complete-example-in-c/
http://blog.csdn.net/russell_tao/article/details/7160071
http://man7.org/linux/man-pages/man2/epoll_ctl.2.html

你可能感兴趣的:(网络编程,Linux,操作系统)