linux pipe文件系统(pipefs)

linux的pipe,FIFO(named pipe)都是基于pipe文件系统(pipefs)的。
linux的pipe和FIFO都是半双工的,即数据流向只能是一个方向;
pipe只能在pipe的创建进程及其后代进程(后代进程fork/exec时,通过继承父进程的打开文件描述符表)之间使用,来实现通信;
FIFO是有名称的pipe,即可以通过名称查找到pipe,所以没有“只能在pipe的创建进程及其后代进程之间使用”的限制;通过名称找到pipe文件,创建相应的pipe,就可以实现进程间的通信。

I.数据结构
i.pipe_buffer
管道缓存,用于暂存写入管道的数据;写进程通过管道写入端将数据写入管道缓存中,读进程通过管道读出端将数据从管道缓存中读出

/* include/linux/pipe_fs_i.h */
  8 #define PIPE_BUF_FLAG_LRU       0x01    /* page is on the LRU */
  9 #define PIPE_BUF_FLAG_ATOMIC    0x02    /* was atomically mapped */
 10 #define PIPE_BUF_FLAG_GIFT      0x04    /* page is a gift */
 11 
 12 /**
 13  *      struct pipe_buffer - a linux kernel pipe buffer
 14  *      @page: the page containing the data for the pipe buffer
 15  *      @offset: offset of data inside the @page
 16  *      @len: length of data inside the @page
 17  *      @ops: operations associated with this buffer. See @pipe_buf_operations.
 18  *      @flags: pipe buffer flags. See above.
 19  *      @private: private data owned by the ops.
 20  **/
 21 struct pipe_buffer {
 22         struct page *page;
 23         unsigned int offset, len;
 24         const struct pipe_buf_operations *ops;
 25         unsigned int flags;
 26         unsigned long private;
 27 };

page:页帧,用于存储pipe数据;pipe缓存与页帧是一对一的关系
offset:页内偏移,用于记录有效数据在页帧内的起始位置(只能用偏移,而不能用地址,因为高端内存页帧在内核空间中没有虚拟地址与之对应)
len:有效数据长度
ops:缓存操作集
flags:缓存标识
private:缓存操作私有数据

 

ii. pipe_buf_operations
用于存储管道缓存操作集

/* include/linux/pipe_fs_i.h */
 60 /*
 61  * Note on the nesting of these functions:
 62  *
 63  * ->confirm()
 64  *      ->steal()
 65  *      ...
 66  *      ->map()
 67  *      ...
 68  *      ->unmap()
 69  *
 70  * That is, ->map() must be called on a confirmed buffer,
 71  * same goes for ->steal(). See below for the meaning of each
 72  * operation. Also see kerneldoc in fs/pipe.c for the pipe
 73  * and generic variants of these hooks.
 74  */
 75 struct pipe_buf_operations {
 76         /*
 77          * This is set to 1, if the generic pipe read/write may coalesce
 78          * data into an existing buffer. If this is set to 0, a new pipe
 79          * page segment is always used for new data.
 80          */
 81         int can_merge;
 82 
 83         /*
 84          * ->map() returns a virtual address mapping of the pipe buffer.
 85          * The last integer flag reflects whether this should be an atomic
 86          * mapping or not. The atomic map is faster, however you can't take
 87          * page faults before calling ->unmap() again. So if you need to eg
 88          * access user data through copy_to/from_user(), then you must get
 89          * a non-atomic map. ->map() uses the KM_USER0 atomic slot for
 90          * atomic maps, so you can't map more than one pipe_buffer at once
 91          * and you have to be careful if mapping another page as source
 92          * or destination for a copy (IOW, it has to use something else
 93          * than KM_USER0).
 94          */
 95         void * (*map)(struct pipe_inode_info *, struct pipe_buffer *, int);
 96 
 97         /*
 98          * Undoes ->map(), finishes the virtual mapping of the pipe buffer.
 99          */
100         void (*unmap)(struct pipe_inode_info *, struct pipe_buffer *, void *);
101 
102         /*
103          * ->confirm() verifies that the data in the pipe buffer is there
104          * and that the contents are good. If the pages in the pipe belong
105          * to a file system, we may need to wait for IO completion in this
106          * hook. Returns 0 for good, or a negative error value in case of
107          * error.
108          */
109         int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *);
110 
111         /*
112          * When the contents of this pipe buffer has been completely
113          * consumed by a reader, ->release() is called.
114          */
115         void (*release)(struct pipe_inode_info *, struct pipe_buffer *);
116 
117         /*
118          * Attempt to take ownership of the pipe buffer and its contents.
119          * ->steal() returns 0 for success, in which case the contents
120          * of the pipe (the buf->page) is locked and now completely owned
121          * by the caller. The page may then be transferred to a different
122          * mapping, the most often used case is insertion into different
123          * file address space cache.
124          */
125         int (*steal)(struct pipe_inode_info *, struct pipe_buffer *);
126 
127         /*
128          * Get a reference to the pipe buffer.
129          */
130         void (*get)(struct pipe_inode_info *, struct pipe_buffer *);
131 };

can_merge:合并标识;如果pipe_buffer中有空闲空间,有数据写入时,如果can_merge置位,会先写pipe_buffer的空闲空间;否则重新分配一个pipe_buffer来存储写入数据
map:由于pipe_buffer的page可能是高端内存页帧,而内核空间页表中没有相应的页表项,所以内核不能直接访问page;只有通过map将page映射到内核地址空间后,内核才能访问
unmap:map的逆过程;因为内核地址空间有限,所以page访问完后要释放地址映射
confirm:检验pipe_buffer中的数据
release:当pipe_buffer中的数据被读完后,用于释放pipe_buffer
get:增加pipe_buffer的引用计数器

 

iii.pipe_inode_info
管道描述符,用于表示一个管道,存储管道相应的信息

/* include/linux/pipe_fs_i.h */
 29 /**
 30  *      struct pipe_inode_info - a linux kernel pipe
 31  *      @wait: reader/writer wait point in case of empty/full pipe
 32  *      @nrbufs: the number of non-empty pipe buffers in this pipe
 33  *      @curbuf: the current pipe buffer entry
 34  *      @tmp_page: cached released page
 35  *      @readers: number of current readers of this pipe
 36  *      @writers: number of current writers of this pipe
 37  *      @waiting_writers: number of writers blocked waiting for room
 38  *      @r_counter: reader counter
 39  *      @w_counter: writer counter
 40  *      @fasync_readers: reader side fasync
 41  *      @fasync_writers: writer side fasync
 42  *      @inode: inode this pipe is attached to
 43  *      @bufs: the circular array of pipe buffers
 44  **/
 45 struct pipe_inode_info {
 46         wait_queue_head_t wait;
 47         unsigned int nrbufs, curbuf;
 48         struct page *tmp_page;
 49         unsigned int readers;
 50         unsigned int writers;
 51         unsigned int waiting_writers;
 52         unsigned int r_counter;
 53         unsigned int w_counter;
 54         struct fasync_struct *fasync_readers;
 55         struct fasync_struct *fasync_writers;
 56         struct inode *inode;
 57         struct pipe_buffer bufs[PIPE_BUFFERS];
 58 };

wait:读/写/poll等待队列;由于读/写不可能同时出现在等待的情况,所以可以共用等待队列;poll读与读,poll写与写可以共存出现在等待队列中
nrbufs:非空的pipe_buffer数量
curbuf:数据的起始pipe_buffer
tmp_page:页缓存,可以加速页帧的分配过程;当释放页帧时将页帧记入tmp_page,当分配页帧时,先从tmp_page中获取,如果tmp_page为空才从伙伴系统中获取
readers:当前管道的读者个数;每次以读方式打开时,readers加1;关闭时readers减1
writers:当前管道的写者个数;每次以写方式打开时,writers加1;关闭时writers减1
waiting_writers:被阻塞的管道写者个数;写进程被阻塞时,waiting_writers加1;被唤醒时,waiting_writers减1
r_counter:管道读者计数器;每次以读方式打开管道时,r_counter加1;关闭时不变
w_counter:管道写者计数器;每次以写方式打开管道时,w_counter加1;关闭时不变
fasync_readers:读端异步描述符
fasync_writers:写端异步描述符
inode:pipe对应的inode
bufs:pipe_buffer回环数组(环形缓冲区)

 

iv.以上结构关系图

linux pipe文件系统(pipefs)_第1张图片

II.pipe_buf_operations
pipe_buf_operations主要用于记录pipe缓存的操作集:

/* fs/pipe.c */
 319 static const struct pipe_buf_operations anon_pipe_buf_ops = {
 320         .can_merge = 1,
 321         .map = generic_pipe_buf_map,
 322         .unmap = generic_pipe_buf_unmap,
 323         .confirm = generic_pipe_buf_confirm,
 324         .release = anon_pipe_buf_release,
 325         .steal = generic_pipe_buf_steal,
 326         .get = generic_pipe_buf_get,
 327 };

i.generic_pipe_buf_confirm

 290 /**
 291  * generic_pipe_buf_confirm - verify contents of the pipe buffer
 292  * @info:       the pipe that the buffer belongs to
 293  * @buf:        the buffer to confirm
 294  *
 295  * Description:
 296  *      This function does nothing, because the generic pipe code uses
 297  *      pages that are always good when inserted into the pipe.
 298  */
 299 int generic_pipe_buf_confirm(struct pipe_inode_info *info,
 300                              struct pipe_buffer *buf)
 301 {
 302         return 0;
 303 }

因为pipe使用页作为缓存,所以不会出现错误,confirm也就没有检查操作


ii.generic_pipe_buf_steal

 245 /**
 246  * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
 247  * @pipe:       the pipe that the buffer belongs to
 248  * @buf:        the buffer to attempt to steal
 249  *
 250  * Description:
 251  *      This function attempts to steal the &struct page attached to
 252  *      @buf. If successful, this function returns 0 and returns with
 253  *      the page locked. The caller may then reuse the page for whatever
 254  *      he wishes; the typical use is insertion into a different file
 255  *      page cache.
 256  */
 257 int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
 258                            struct pipe_buffer *buf)
 259 {
 260         struct page *page = buf->page;
 261 
 262         /*
 263          * A reference of one is golden, that means that the owner of this
 264          * page is the only one holding a reference to it. lock the page
 265          * and return OK.
 266          */
 267         if (page_count(page) == 1) {
 268                 lock_page(page);
 269                 return 0;
 270         }
 271 
 272         return 1;
 273 }

steal主要用于获取pipe缓存的控制权,主要就是锁定页描述符;pipefs暂未使用steal

 

iii.generic_pipe_buf_map

 201 /**
 202  * generic_pipe_buf_map - virtually map a pipe buffer
 203  * @pipe:       the pipe that the buffer belongs to
 204  * @buf:        the buffer that should be mapped
 205  * @atomic:     whether to use an atomic map
 206  *
 207  * Description:
 208  *      This function returns a kernel virtual address mapping for the
 209  *      pipe_buffer passed in @buf. If @atomic is set, an atomic map is provided
 210  *      and the caller has to be careful not to fault before calling
 211  *      the unmap function.
 212  *
 213  *      Note that this function occupies KM_USER0 if @atomic != 0.
 214  */
 215 void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
 216                            struct pipe_buffer *buf, int atomic)
 217 {
 218         if (atomic) {
 219                 buf->flags |= PIPE_BUF_FLAG_ATOMIC;
 220                 return kmap_atomic(buf->page, KM_USER0);
 221         }
 222 
 223         return kmap(buf->page);
 224 }

将高端内存页帧映射到内核空间的永久映射区中,以便内核访问该缓存页帧;由于内核不能直接访问物理地址高于highstart_pfn<<PAGE_SHIFT的高端内存(ZONE_HIGHMEM区域中),所以在访问前必须将页帧映射到内核空间后才能访问。

 

iv.generic_pipe_buf_unmap

 226 /**
 227  * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
 228  * @pipe:       the pipe that the buffer belongs to
 229  * @buf:        the buffer that should be unmapped
 230  * @map_data:   the data that the mapping function returned
 231  *
 232  * Description:
 233  *      This function undoes the mapping that ->map() provided.
 234  */
 235 void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
 236                             struct pipe_buffer *buf, void *map_data)
 237 {
 238         if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
 239                 buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
 240                 kunmap_atomic(map_data, KM_USER0);
 241         } else
 242                 kunmap(buf->page);
 243 }

取消页帧在内核空间的映射,因为内核永久映射区大小有限;所以访问完页帧的数据后,立即unmap掉该页帧

 

v.generic_pipe_buf_get

 275 /**
 276  * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 277  * @pipe:       the pipe that the buffer belongs to
 278  * @buf:        the buffer to get a reference to
 279  *
 280  * Description:
 281  *      This function grabs an extra reference to @buf. It's used in
 282  *      in the tee() system call, when we duplicate the buffers in one
 283  *      pipe into another.
 284  */
 285 void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
 286 {
 287         page_cache_get(buf->page);
 288 }

增加pipe缓存的引用计数器,由于pipe缓存与页帧是一对一的关系,所以可以直接增加页帧的引用计数器即可

 

vi.anon_pipe_buf_release

 305 /**
 306  * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 307  * @pipe:       the pipe that the buffer belongs to
 308  * @buf:        the buffer to put a reference to
 309  *
 310  * Description:
 311  *      This function releases a reference to @buf.
 312  */
 313 void generic_pipe_buf_release(struct pipe_inode_info *pipe,
 314                               struct pipe_buffer *buf)
 315 {
 316         page_cache_release(buf->page);
 317 }

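释放pipe缓存;由于pipe缓存描述符是放在pipe描述符中的,所以只需要释放pipe缓存的页帧即可
注:上面列出的代码是generic_pipe_buf_release;而anon_pipe_buf_ops的release成员指向的是anon_pipe_buf_release,它在页帧没有其他引用且pipe的tmp_page为空时,将页帧缓存到tmp_page中,否则才释放页帧的引用(参见前文pipe_inode_info中tmp_page的说明)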

 


III.文件操作集
当创建pipe/FIFO时,内核会分配file,dentry,inode,pipe_inode_info对象;
并将file对象的f_op指向read_pipefifo_fops/write_pipefifo_fops/rdwr_pipefifo_fops;后续的read,write,poll等系统调用,会通过vfs调用f_op中相应的方法。

pipe/FIFO文件操作集如下:

/* fs/pipe.c */
 831 /*
 832  * The file_operations structs are not static because they
 833  * are also used in linux/fs/fifo.c to do operations on FIFOs.
 834  *
 835  * Pipes reuse fifos' file_operations structs.
 836  */
 837 const struct file_operations read_pipefifo_fops = {
 838         .llseek         = no_llseek,
 839         .read           = do_sync_read,
 840         .aio_read       = pipe_read,
 841         .write          = bad_pipe_w,
 842         .poll           = pipe_poll,
 843         .unlocked_ioctl = pipe_ioctl,
 844         .open           = pipe_read_open,
 845         .release        = pipe_read_release,
 846         .fasync         = pipe_read_fasync,
 847 };
 848 
 849 const struct file_operations write_pipefifo_fops = {
 850         .llseek         = no_llseek,
 851         .read           = bad_pipe_r,
 852         .write          = do_sync_write,
 853         .aio_write      = pipe_write,
 854         .poll           = pipe_poll,
 856         .open           = pipe_write_open,
 857         .release        = pipe_write_release,
 858         .fasync         = pipe_write_fasync,
 859 };
 860 
 861 const struct file_operations rdwr_pipefifo_fops = {
 862         .llseek         = no_llseek,
 863         .read           = do_sync_read,
 864         .aio_read       = pipe_read,
 865         .write          = do_sync_write,
 866         .aio_write      = pipe_write,
 867         .poll           = pipe_poll,
 868         .unlocked_ioctl = pipe_ioctl,
 869         .open           = pipe_rdwr_open,
 870         .release        = pipe_rdwr_release,
 871         .fasync         = pipe_rdwr_fasync,
 872 };

read_pipefifo_fops:pipe读端文件操作/FIFO只读方式文件操作
write_pipefifo_fops:pipe写端文件操作/FIFO只写方式文件操作
rdwr_pipefifo_fops:FIFO读写方式文件操作

 

i.open
暂时未发现什么地方触发read_pipefifo_fops/write_pipefifo_fops/rdwr_pipefifo_fops中的open操作;
注:
打开pipe文件(即FIFO)触发的是def_fifo_fops(fs/fifo.c)的fifo_open
ext4文件系统打开pipe文件代码过程如下:
1.取文件操作
  VFS:
    open->do_sys_open->do_filp_open->path_lookup_open->do_path_lookup->path_walk->link_path_walk->__link_path_walk->do_lookup->real_lookup
  ext4:
    ext4_lookup->ext4_iget->init_special_inode->def_fifo_fops
2.open触发
  open->do_sys_open->do_filp_open->nameidata_to_filp->__dentry_open

 

ii.fasync
fasync主要是用于启用异步I/O操作;
以下方式可以启用异步I/O,当然必须得有内核的支持:
1.open文件时,flags中O_ASYNC标识置位;
2.fcntl的F_SETFL命令,设置文件的状态标识O_ASYNC;代码跟踪fcntl->do_fcntl->setfl->fasync
注:
  O_ASYNC可以启动信号驱动I/O,即当文件描述符可读或可写后,内核发送信号给用户进程(信号默认是SIGIO);支持O_ASYNC的只有:终端,虚拟终端,socket,pipe,FIFO

/* fs/pipe.c */
 708 static int
 709 pipe_read_fasync(int fd, struct file *filp, int on)
 710 {
 711         struct inode *inode = filp->f_path.dentry->d_inode;
 712         int retval;
 713 
 714         mutex_lock(&inode->i_mutex);
 715         retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers);
 716         mutex_unlock(&inode->i_mutex);
 717         
 718         return retval;
 719 }       
 720 
 721 
 722 static int
 723 pipe_write_fasync(int fd, struct file *filp, int on)
 724 {
 725         struct inode *inode = filp->f_path.dentry->d_inode;
 726         int retval;
 727         
 728         mutex_lock(&inode->i_mutex);
 729         retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers);
 730         mutex_unlock(&inode->i_mutex);
 731         
 732         return retval;
 733 }
 734 
 735 
 736 static int
 737 pipe_rdwr_fasync(int fd, struct file *filp, int on)
 738 {
 739         struct inode *inode = filp->f_path.dentry->d_inode;
 740         struct pipe_inode_info *pipe = inode->i_pipe;
 741         int retval;
 742 
 743         mutex_lock(&inode->i_mutex);
 744         retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
 745         if (retval >= 0) {
 746                 retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
 747                 if (retval < 0) /* this can happen only if on == T */
 748                         fasync_helper(-1, filp, 0, &pipe->fasync_readers);
 749         }
 750         mutex_unlock(&inode->i_mutex);
 751         return retval;
 752 }

将文件添加到pipe的异步读fasync_readers/异步写fasync_writers的通知链表中;
当写pipe后,会发送信号(默认SIGIO)给fasync_readers通知链表中的文件所属进程;(见pipe_write中的kill_fasync)
当读pipe后,会发送信号(默认SIGIO)给fasync_writers通知链表中的文件所属进程;(见pipe_read中的kill_fasync)

 

iii.ioctl
pipe_ioctl会在系统调用ioctl中调用,ioctl->do_vfs_ioctl->vfs_ioctl->unlocked_ioctl

 628 static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 629 {
 630         struct inode *inode = filp->f_path.dentry->d_inode;
 631         struct pipe_inode_info *pipe;
 632         int count, buf, nrbufs;
 633 
 634         switch (cmd) {
 635                 case FIONREAD:
 636                         mutex_lock(&inode->i_mutex);
 637                         pipe = inode->i_pipe;
 638                         count = 0;
 639                         buf = pipe->curbuf;
 640                         nrbufs = pipe->nrbufs;
 641                         while (--nrbufs >= 0) {
 642                                 count += pipe->bufs[buf].len;
 643                                 buf = (buf+1) & (PIPE_BUFFERS-1);
 644                         }
 645                         mutex_unlock(&inode->i_mutex);
 646 
 647                         return put_user(count, (int __user *)arg);
 648                 default:
 649                         return -EINVAL;
 650         }
 651 }

pipe_ioctl只支持FIONREAD命令,用于取pipe缓存中的数据大小
 
iv.write
pipe_write用于往管道缓存中写数据,当有读者被阻塞时唤醒读者进程;当管道缓存写满时阻塞写进程,直到缓存中有数据被读出(即出现空闲缓存)时,阻塞的写进程才被唤醒。
write->vfs_write->do_sync_write->pipe_write(f_op的write指向do_sync_write,再由它调用aio_write,即pipe_write)

 442 static ssize_t
 443 pipe_write(struct kiocb *iocb, const struct iovec *_iov,
 444             unsigned long nr_segs, loff_t ppos)
 445 {
 446         struct file *filp = iocb->ki_filp;
 447         struct inode *inode = filp->f_path.dentry->d_inode;
 448         struct pipe_inode_info *pipe;
 449         ssize_t ret;
 450         int do_wakeup;
 451         struct iovec *iov = (struct iovec *)_iov;
 452         size_t total_len;
 453         ssize_t chars;
 454 
 455         total_len = iov_length(iov, nr_segs);
 456         /* Null write succeeds. */
 457         if (unlikely(total_len == 0))
 458                 return 0;
 459 
 460         do_wakeup = 0;
 461         ret = 0;
 462         mutex_lock(&inode->i_mutex);
 463         pipe = inode->i_pipe;
 464 
 465         if (!pipe->readers) {
 466                 send_sig(SIGPIPE, current, 0);
 467                 ret = -EPIPE;
 468                 goto out;
 469         }
 470 
 471         /* We try to merge small writes */
 472         chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
 473         if (pipe->nrbufs && chars != 0) {
 474                 int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
 475                                                         (PIPE_BUFFERS-1);
 476                 struct pipe_buffer *buf = pipe->bufs + lastbuf;
 477                 const struct pipe_buf_operations *ops = buf->ops;
 478                 int offset = buf->offset + buf->len;
 479 
 480                 if (ops->can_merge && offset + chars <= PAGE_SIZE) {
 481                         int error, atomic = 1;
 482                         void *addr;
 483 
 484                         error = ops->confirm(pipe, buf);
 485                         if (error)
 486                                 goto out;
 487 
 488                         iov_fault_in_pages_read(iov, chars);
 489 redo1:
 490                         addr = ops->map(pipe, buf, atomic);
 491                         error = pipe_iov_copy_from_user(offset + addr, iov,
 492                                                         chars, atomic);
 493                         ops->unmap(pipe, buf, addr);
 494                         ret = error;
 495                         do_wakeup = 1;
 496                         if (error) {
 497                                 if (atomic) {
 498                                         atomic = 0;
 499                                         goto redo1;
 500                                 }
 501                                 goto out;
 502                         }
 503                         buf->len += chars;
 504                         total_len -= chars;
 505                         ret = chars;
 506                         if (!total_len)
 507                                 goto out;
 508                 }
 509         }
 510 
 511         for (;;) {
 512                 int bufs;
 513 
 514                 if (!pipe->readers) {
 515                         send_sig(SIGPIPE, current, 0);
 516                         if (!ret)
 517                                 ret = -EPIPE;
 518                         break;
 519                 }
 520                 bufs = pipe->nrbufs;
 521                 if (bufs < PIPE_BUFFERS) {
 522                         int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1);
 523                         struct pipe_buffer *buf = pipe->bufs + newbuf;
 524                         struct page *page = pipe->tmp_page;
 525                         char *src;
 526                         int error, atomic = 1;
 527 
 528                         if (!page) {
 529                                 page = alloc_page(GFP_HIGHUSER);
 530                                 if (unlikely(!page)) {
 531                                         ret = ret ? : -ENOMEM;
 532                                         break;
 533                                 }
 534                                 pipe->tmp_page = page;
 535                         }
 536                         /* Always wake up, even if the copy fails. Otherwise
 537                          * we lock up (O_NONBLOCK-)readers that sleep due to
 538                          * syscall merging.
 539                          * FIXME! Is this really true?
 540                          */
 541                         do_wakeup = 1;
 542                         chars = PAGE_SIZE;
 543                         if (chars > total_len)
 544                                 chars = total_len;
 545 
 546                         iov_fault_in_pages_read(iov, chars);
 547 redo2:
 548                         if (atomic)
 549                                 src = kmap_atomic(page, KM_USER0);
 550                         else
 551                                 src = kmap(page);
 552 
 553                         error = pipe_iov_copy_from_user(src, iov, chars,
 554                                                         atomic);
 555                         if (atomic)
 556                                 kunmap_atomic(src, KM_USER0);
 557                         else
 558                                 kunmap(page);
 559 
 560                         if (unlikely(error)) {
 561                                 if (atomic) {
 562                                         atomic = 0;
 563                                         goto redo2;
 564                                 }
 565                                 if (!ret)
 566                                         ret = error;
 567                                 break;
 568                         }
 569                         ret += chars;
 570 
 571                         /* Insert it into the buffer array */
 572                         buf->page = page;
 573                         buf->ops = &anon_pipe_buf_ops;
 574                         buf->offset = 0;
 575                         buf->len = chars;
 576                         pipe->nrbufs = ++bufs;
 577                         pipe->tmp_page = NULL;
 578 
 579                         total_len -= chars;
 580                         if (!total_len)
 581                                 break;
 582                 }
 583                 if (bufs < PIPE_BUFFERS)
 584                         continue;
 585                 if (filp->f_flags & O_NONBLOCK) {
 586                         if (!ret)
 587                                 ret = -EAGAIN;
 588                         break;
 589                 }
 590                 if (signal_pending(current)) {
 591                         if (!ret)
 592                                 ret = -ERESTARTSYS;
 593                         break;
 594                 }
 595                 if (do_wakeup) {
 596                         wake_up_interruptible_sync(&pipe->wait);
 597                         kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 598                         do_wakeup = 0;
 599                 }
 600                 pipe->waiting_writers++;
 601                 pipe_wait(pipe);
 602                 pipe->waiting_writers--;
 603         }
 604 out:
 605         mutex_unlock(&inode->i_mutex);
 606         if (do_wakeup) {
 607                 wake_up_interruptible_sync(&pipe->wait);
 608                 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 609         }
 610         if (ret > 0)
 611                 file_update_time(filp);
 612         return ret;
 613 }

1.计算写数据长度,如果长度为0直接返回
2.获取pipe互斥锁,进入数据复制临界区
3.当没有读者时返回EPIPE错误,并向当前写进程发送SIGPIPE信号
4.计算写数据超过页大小的整数倍的长度(主要用于将余数部分与当前缓存合并,整数页分配新页帧存储)
5.如果当前缓存有空间容纳余数大小的数据,并且缓存可以合并数据,就复制余数长度数据到当前缓存中;如果全部数据复制完成,跳到步骤7
  注:
    由于缓存页帧可能是高端内存页,所以要用confirm,map,写数据,unmap一系列操作;
    由于用户进程写pipe的数据所在页帧可能被swap到硬盘中,内核访问就会出现缺页异常;为了能够原子复制,在复制前先触发缺页异常,主要通过iov_fault_in_pages_read去预触发缺页异常;如果原子复制仍然失败,则改用非原子方式(atomic=0)重试一次。
6.分配新缓存存放数据
  A.当没有读者时返回EPIPE错误,并向当前写进程发送SIGPIPE信号;当pipe缓存被写满后,仍有数据未写,写进程会被阻塞,pipe锁被释放,此时读进程可以获取pipe锁进而读数据,读到数据后可能会关闭pipe的读端,所以每循环一次都会检测读者个数。
  B.当有空闲缓存空间时
    a.从tmp_page中分配页帧,如果tmp_page没有页帧则从伙伴系统中获取分配页帧
    b.iov_fault_in_pages_read进行读用户地址空间缺页异常预触发,以便后面原子复制数据,保证数据从用户空间往内核空间复制时不产生缺页异常。
    c.将页帧映射到内核永久映射区中,获得线性地址,以便内核访问物理页帧
    d.将数据从用户空间复制到页帧中
    e.将页帧从内核永久映射区中移除
    f.初始化pipe缓存,如缓存页帧、偏移、大小、操作等
    g.如果数据复制完,走步骤7,退出;否则走步骤6继续循环
  C.当没有空闲缓存空间时
    a.如果是非阻塞写时,有数据写入则返回写入的数据长度,没有数据写入则走步骤7并返回EAGAIN错误
    b.如果有信号产生,有数据写入则返回写入的数据长度,没有数据写入则走步骤7并返回ERESTARTSYS错误,内核处理完信号后会自动重启系统调用write
    c.如果有数据写入且之前没有唤醒操作,则唤醒被阻塞的读者进程;向设置了O_ASYNC标识的文件所属读者进程发送异步I/O信号SIGIO
    d.阻塞写者计数器加1,释放pipe锁阻塞当前进程;进程被唤醒时获取pipe锁,并将阻塞写者计数器减1
    e.走步骤6继续循环
7.释放pipe互斥锁,退出数据复制临界区
8.如果有数据写入且之前没有唤醒操作,则唤醒被阻塞的读者进程;向设置了O_ASYNC标识的文件所属读者进程发送异步I/O信号SIGIO
9.返回写的数据长度
注:
  当没有pipe缓存空间时,不管是NONBLOCK的写还是阻塞进程因信号而被唤醒,在有数据写入时都会返回实际写入的数据长度;所以需在用户进程中判断实际写入数据的长度是否是预期的写入长度。

 

v.read

pipe_read用于从管道缓存中读数据,当有写者被阻塞时唤醒写者进程;当管道缓存中没有数据时阻塞读进程,直到缓存中有数据被写入时,阻塞的读进程才被唤醒。
read->vfs_read->do_sync_read->pipe_read(f_op的read指向do_sync_read,再由它调用aio_read,即pipe_read)

 329 static ssize_t
 330 pipe_read(struct kiocb *iocb, const struct iovec *_iov,
 331            unsigned long nr_segs, loff_t pos)
 332 {
 333         struct file *filp = iocb->ki_filp;
 334         struct inode *inode = filp->f_path.dentry->d_inode;
 335         struct pipe_inode_info *pipe;
 336         int do_wakeup;
 337         ssize_t ret;
 338         struct iovec *iov = (struct iovec *)_iov;
 339         size_t total_len;
 340 
 341         total_len = iov_length(iov, nr_segs);
 342         /* Null read succeeds. */
 343         if (unlikely(total_len == 0))
 344                 return 0;
 345 
 346         do_wakeup = 0;
 347         ret = 0;
 348         mutex_lock(&inode->i_mutex);
 349         pipe = inode->i_pipe;
 350         for (;;) {
 351                 int bufs = pipe->nrbufs;
 352                 if (bufs) {
 353                         int curbuf = pipe->curbuf;
 354                         struct pipe_buffer *buf = pipe->bufs + curbuf;
 355                         const struct pipe_buf_operations *ops = buf->ops;
 356                         void *addr;
 357                         size_t chars = buf->len;
 358                         int error, atomic;
 359 
 360                         if (chars > total_len)
 361                                 chars = total_len;
 362 
 363                         error = ops->confirm(pipe, buf);
 364                         if (error) {
 365                                 if (!ret)
 366                                         ret = error;
 367                                 break;
 368                         }
 369 
 370                         atomic = !iov_fault_in_pages_write(iov, chars);
 371 redo:
 372                         addr = ops->map(pipe, buf, atomic);
 373                         error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
 374                         ops->unmap(pipe, buf, addr);
 375                         if (unlikely(error)) {
 376                                 /*
 377                                  * Just retry with the slow path if we failed.
 378                                  */
 379                                 if (atomic) {
 380                                         atomic = 0;
 381                                         goto redo;
 382                                 }
 383                                 if (!ret)
 384                                         ret = error;
 385                                 break;
 386                         }
 387                         ret += chars;
 388                         buf->offset += chars;
 389                         buf->len -= chars;
 390                         if (!buf->len) {
 391                                 buf->ops = NULL;
 392                                 ops->release(pipe, buf);
 393                                 curbuf = (curbuf + 1) & (PIPE_BUFFERS-1);
 394                                 pipe->curbuf = curbuf;
 395                                 pipe->nrbufs = --bufs;
 396                                 do_wakeup = 1;
 397                         }
 398                         total_len -= chars;
 399                         if (!total_len)
 400                                 break;  /* common path: read succeeded */
 401                 }
 402                 if (bufs)       /* More to do? */
 403                         continue;
 404                 if (!pipe->writers)
 405                         break;
 406                 if (!pipe->waiting_writers) {
 407                         /* syscall merging: Usually we must not sleep
 408                          * if O_NONBLOCK is set, or if we got some data.
 409                          * But if a writer sleeps in kernel space, then
 410                          * we can wait for that data without violating POSIX.
 411                          */
 412                         if (ret)
 413                                 break;
 414                         if (filp->f_flags & O_NONBLOCK) {
 415                                 ret = -EAGAIN;
 416                                 break;
 417                         }
 418                 }
 419                 if (signal_pending(current)) {
 420                         if (!ret)
 421                                 ret = -ERESTARTSYS;
 422                         break;
 423                 }
 424                 if (do_wakeup) {
 425                         wake_up_interruptible_sync(&pipe->wait);
 426                         kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 427                 }
 428                 pipe_wait(pipe);
 429         }
 430         mutex_unlock(&inode->i_mutex);
 431 
 432         /* Signal writers asynchronously that there is more room. */
 433         if (do_wakeup) {
 434                 wake_up_interruptible_sync(&pipe->wait);
 435                 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 436         }
 437         if (ret > 0)
 438                 file_accessed(filp);
 439         return ret;
 440 }

1.计算读数据长度,如果长度为0直接返回
2.获取pipe互斥锁,进入读数据临界区
3.如果pipe缓存中有数据
  A.通过confirm,map,copy,unmap一系列操作将数据从内核空间的pipe缓存复制到用户进程空间。
  B.如果当前缓存pipe_buffer中数据复制完,则释放当前pipe_buffer;将唤醒标识置1
  C.如果复制完所需的数据,跳转步骤5退出
4.如果pipe缓存中没有数据
  A.如果没有写者了,跳转步骤5退出
  B.如果没有处于等待状态的写进程(waiting_writers为0):已读出部分数据时返回实际读出的数据长度;未读出数据且是NONBLOCK读时返回EAGAIN错误
  C.如果有信号产生:没有读出数据时返回ERESTARTSYS错误(内核处理完信号后会自动重启系统调用read);已读出数据时返回实际读出的数据长度
  D.如果唤醒标识do_wakeup置位,唤醒被阻塞的写者进程;向设置了O_ASYNC标识的文件所属写者进程发送异步I/O信号SIGIO
  E.释放pipe互斥锁,进程被阻塞;如果进程被唤醒,获取pipe互斥锁,跳转步骤3继续循环
5.释放pipe互斥锁,退出读数据临界区
6.如果唤醒标识do_wakeup置位,唤醒被阻塞的写者进程;向设置了O_ASYNC标识的文件所属写者进程发送异步I/O信号SIGIO
7.如果读出了数据(ret > 0),更新文件访问时间;返回实际读出的数据长度(出错时返回相应的错误码)
注:
  由于实际读出的数据长度可能比要求的小,所以要在程序中判断实际读出数据长度

 

vi.poll
pipe_poll主要用于返回文件当前可以进行的poll操作
poll调用路径: poll->do_sys_poll->do_poll->do_pollfd->pipe_poll
select调用路径: select->core_sys_select->do_select->pipe_poll

 653 /* No kernel lock held - fine */
 654 static unsigned int
 655 pipe_poll(struct file *filp, poll_table *wait)
 656 {
 657         unsigned int mask;
 658         struct inode *inode = filp->f_path.dentry->d_inode;
 659         struct pipe_inode_info *pipe = inode->i_pipe;
 660         int nrbufs;
 661 
 662         poll_wait(filp, &pipe->wait, wait);
 663 
 664         /* Reading only -- no need for acquiring the semaphore.  */
 665         nrbufs = pipe->nrbufs;
 666         mask = 0;
 667         if (filp->f_mode & FMODE_READ) {
 668                 mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
 669                 if (!pipe->writers && filp->f_version != pipe->w_counter)
 670                         mask |= POLLHUP;
 671         }
 672 
 673         if (filp->f_mode & FMODE_WRITE) {
 674                 mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0;
 675                 /*
 676                  * Most Unices do not set POLLERR for FIFOs but on Linux they
 677                  * behave exactly like pipes for poll().
 678                  */
 679                 if (!pipe->readers)
 680                         mask |= POLLERR;
 681         }
 682 
 683         return mask;
 684 }

1.将wait添加到pipe文件的等待队列中
2.如果文件以读方式打开(FMODE_READ):当有缓存数据时mask添加POLLIN | POLLRDNORM;如果当前没有写者(writers=0)且f_version!=w_counter,mask添加POLLHUP。打开只读管道时f_version被置为0或当时的w_counter,打开写管道时w_counter会加1,所以该条件表示管道写端打开过(w_counter!=f_version)但又全部关闭了(writers=0)
3.如果管道是写端,当pipe缓存有空闲空间时mask添加POLLOUT | POLLWRNORM;如果没有读者,mask添加POLLERR
4.返回pipe管道文件可以进行的poll操作mask

 

vii.release
当close文件且文件引用计数器减为0时,会调用release操作
close->filp_close->fput->__fput->release

/* fs/pipe.c */
 686 static int
 687 pipe_release(struct inode *inode, int decr, int decw)
 688 {
 689         struct pipe_inode_info *pipe;
 690 
 691         mutex_lock(&inode->i_mutex);
 692         pipe = inode->i_pipe;
 693         pipe->readers -= decr;
 694         pipe->writers -= decw;
 695 
 696         if (!pipe->readers && !pipe->writers) {
 697                 free_pipe_info(inode);
 698         } else {
 699                 wake_up_interruptible_sync(&pipe->wait);
 700                 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 701                 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 702         }
 703         mutex_unlock(&inode->i_mutex);
 704 
 705         return 0;
 706 }
 755 static int
 756 pipe_read_release(struct inode *inode, struct file *filp)
 757 {
 758         return pipe_release(inode, 1, 0);
 759 }
 760 
 761 static int
 762 pipe_write_release(struct inode *inode, struct file *filp)
 763 {
 764         return pipe_release(inode, 0, 1);
 765 }
 766 
 767 static int
 768 pipe_rdwr_release(struct inode *inode, struct file *filp)
 769 {
 770         int decr, decw;
 771 
 772         decr = (filp->f_mode & FMODE_READ) != 0;
 773         decw = (filp->f_mode & FMODE_WRITE) != 0;
 774         return pipe_release(inode, decr, decw);
 775 }

1.获取管道互斥锁,进入管道操作临界区
2.读/写者计数器减1
3.如果管道既没有读者也没有写者,则释放管道缓存及管道描述符
4.否则,唤醒管道等待队列中的阻塞进程,向管道读者&写者发送异步I/O信号SIGIO

 


IV.pipefs

1119 /*
1120  * pipefs should _never_ be mounted by userland - too much of security hassle,
1121  * no real gain from having the whole whorehouse mounted. So we don't need
1122  * any operations on the root directory. However, we need a non-trivial
1123  * d_name - pipe: will go nicely and kill the special-casing in procfs.
1124  */
1125 static int pipefs_get_sb(struct file_system_type *fs_type,
1126                          int flags, const char *dev_name, void *data,
1127                          struct vfsmount *mnt)
1128 {
1129         return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt);
1130 }
1131 
1132 static struct file_system_type pipe_fs_type = {
1133         .name           = "pipefs",
1134         .get_sb         = pipefs_get_sb,
1135         .kill_sb        = kill_anon_super,
1136 };
1137  
1138 static int __init init_pipe_fs(void)
1139 {
1140         int err = register_filesystem(&pipe_fs_type);
1141 
1142         if (!err) {
1143                 pipe_mnt = kern_mount(&pipe_fs_type);
1144                 if (IS_ERR(pipe_mnt)) {
1145                         err = PTR_ERR(pipe_mnt);
1146                         unregister_filesystem(&pipe_fs_type);
1147                 }
1148         }
1149         return err;
1150 }
1151 
1152 static void __exit exit_pipe_fs(void)
1153 {
1154         unregister_filesystem(&pipe_fs_type);
1155         mntput(pipe_mnt);
1156 }

pipefs是一个虚拟的文件系统,只在内核初始化时通过kern_mount挂载在内核内部,而不会被用户挂载到目录树中

你可能感兴趣的:(linux pipe文件系统(pipefs))