linux的pipe,FIFO(named pipe)都是基于pipe文件系统(pipefs)的。
linux的pipe和FIFO都是半双工的,即数据流向只能是一个方向;
pipe只能在pipe的创建进程及其后代进程(后代进程fork/exec时,通过继承父进程的打开文件描述符表)之间使用,来实现通信;
FIFO是有名称的pipe,即可以通过名称查找到pipe,所以没有“只能在pipe的创建进程及其后代进程之间使用”的限制;通过名称找到pipe文件,创建相应的pipe,就可以实现进程间的通信。
I.数据结构
i.pipe_buffer
管道缓存,用于暂存写入管道的数据;写进程通过管道写入端将数据写入管道缓存中,读进程通过管道读出端将数据从管道缓存中读出
/* include/linux/pipe_fs_i.h */ 8 #define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */ 9 #define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ 10 #define PIPE_BUF_FLAG_GIFT 0x04 /* page is a gift */ 11 12 /** 13 * struct pipe_buffer - a linux kernel pipe buffer 14 * @page: the page containing the data for the pipe buffer 15 * @offset: offset of data inside the @page 16 * @len: length of data inside the @page 17 * @ops: operations associated with this buffer. See @pipe_buf_operations. 18 * @flags: pipe buffer flags. See above. 19 * @private: private data owned by the ops. 20 **/ 21 struct pipe_buffer { 22 struct page *page; 23 unsigned int offset, len; 24 const struct pipe_buf_operations *ops; 25 unsigned int flags; 26 unsigned long private; 27 };
page:页帧,用于存储pipe数据;pipe缓存与页帧是一对一的关系
offset:页内偏移,用于记录有效数据在页帧内的起始位置(只能用偏移,而不能用地址,因为高端内存页帧在内核空间中没有固定的虚拟地址与之对应)
len:有效数据长度
ops:缓存操作集
flags:缓存标识
private:缓存操作私有数据
ii. pipe_buf_operations
用于存储管道缓存操作集
/* include/linux/pipe_fs_i.h */ 60 /* 61 * Note on the nesting of these functions: 62 * 63 * ->confirm() 64 * ->steal() 65 * ... 66 * ->map() 67 * ... 68 * ->unmap() 69 * 70 * That is, ->map() must be called on a confirmed buffer, 71 * same goes for ->steal(). See below for the meaning of each 72 * operation. Also see kerneldoc in fs/pipe.c for the pipe 73 * and generic variants of these hooks. 74 */ 75 struct pipe_buf_operations { 76 /* 77 * This is set to 1, if the generic pipe read/write may coalesce 78 * data into an existing buffer. If this is set to 0, a new pipe 79 * page segment is always used for new data. 80 */ 81 int can_merge; 82 83 /* 84 * ->map() returns a virtual address mapping of the pipe buffer. 85 * The last integer flag reflects whether this should be an atomic 86 * mapping or not. The atomic map is faster, however you can't take 87 * page faults before calling ->unmap() again. So if you need to eg 88 * access user data through copy_to/from_user(), then you must get 89 * a non-atomic map. ->map() uses the KM_USER0 atomic slot for 90 * atomic maps, so you can't map more than one pipe_buffer at once 91 * and you have to be careful if mapping another page as source 92 * or destination for a copy (IOW, it has to use something else 93 * than KM_USER0). 94 */ 95 void * (*map)(struct pipe_inode_info *, struct pipe_buffer *, int); 96 97 /* 98 * Undoes ->map(), finishes the virtual mapping of the pipe buffer. 99 */ 100 void (*unmap)(struct pipe_inode_info *, struct pipe_buffer *, void *); 101 102 /* 103 * ->confirm() verifies that the data in the pipe buffer is there 104 * and that the contents are good. If the pages in the pipe belong 105 * to a file system, we may need to wait for IO completion in this 106 * hook. Returns 0 for good, or a negative error value in case of 107 * error. 
108 */ 109 int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *); 110 111 /* 112 * When the contents of this pipe buffer has been completely 113 * consumed by a reader, ->release() is called. 114 */ 115 void (*release)(struct pipe_inode_info *, struct pipe_buffer *); 116 117 /* 118 * Attempt to take ownership of the pipe buffer and its contents. 119 * ->steal() returns 0 for success, in which case the contents 120 * of the pipe (the buf->page) is locked and now completely owned 121 * by the caller. The page may then be transferred to a different 122 * mapping, the most often used case is insertion into different 123 * file address space cache. 124 */ 125 int (*steal)(struct pipe_inode_info *, struct pipe_buffer *); 126 127 /* 128 * Get a reference to the pipe buffer. 129 */ 130 void (*get)(struct pipe_inode_info *, struct pipe_buffer *); 131 };
can_merge:合并标识;如果pipe_buffer中有空闲空间,有数据写入时,如果can_merge置位,会先写pipe_buffer的空闲空间;否则重新分配一个pipe_buffer来存储写入数据
map:由于pipe_buffer的page可能是高内存页帧,由于内核空间页表没有相应的页表项,所以内核不能直接访问page;只有通过map将page映射到内核地址空间后,内核才能访问
unmap:map的逆过程;因为内核地址空间有限,所以page访问完后要释放地址映射
confirm:检验pipe_buffer中的数据
release:当pipe_buffer中的数据被读完后,用于释放pipe_buffer
get:增加pipe_buffer的引用计数器
iii.pipe_inode_info
管道描述符,用于表示一个管道,存储管道相应的信息
/* include/linux/pipe_fs_i.h */ 29 /** 30 * struct pipe_inode_info - a linux kernel pipe 31 * @wait: reader/writer wait point in case of empty/full pipe 32 * @nrbufs: the number of non-empty pipe buffers in this pipe 33 * @curbuf: the current pipe buffer entry 34 * @tmp_page: cached released page 35 * @readers: number of current readers of this pipe 36 * @writers: number of current writers of this pipe 37 * @waiting_writers: number of writers blocked waiting for room 38 * @r_counter: reader counter 39 * @w_counter: writer counter 40 * @fasync_readers: reader side fasync 41 * @fasync_writers: writer side fasync 42 * @inode: inode this pipe is attached to 43 * @bufs: the circular array of pipe buffers 44 **/ 45 struct pipe_inode_info { 46 wait_queue_head_t wait; 47 unsigned int nrbufs, curbuf; 48 struct page *tmp_page; 49 unsigned int readers; 50 unsigned int writers; 51 unsigned int waiting_writers; 52 unsigned int r_counter; 53 unsigned int w_counter; 54 struct fasync_struct *fasync_readers; 55 struct fasync_struct *fasync_writers; 56 struct inode *inode; 57 struct pipe_buffer bufs[PIPE_BUFFERS]; 58 };
wait:读/写/poll等待队列;由于读/写不可能同时出现在等待的情况,所以可以共用等待队列;poll读与读,poll写与写可以共存出现在等待队列中
nrbufs:非空的pipe_buffer数量
curbuf:数据的起始pipe_buffer
tmp_page:页缓存,可以加速页帧的分配过程;当释放页帧时将页帧记入tmp_page,当分配页帧时,先从tmp_page中获取,如果tmp_page为空才从伙伴系统中获取
readers:当前管道的读者个数;每次以读方式打开时,readers加1;关闭时readers减1
writers:当前管道的写者个数;每次以写方式打开时,writers加1;关闭时writers减1
waiting_writers:被阻塞的管道写者个数;写进程被阻塞时,waiting_writers加1;被唤醒时,waiting_writers减1
r_counter:管道读者计数器;每次以读方式打开管道时,r_counter加1;关闭时不变
w_counter:管道写者计数器;每次以写方式打开管道时,w_counter加1;关闭时不变
fasync_readers:读端异步描述符
fasync_writers:写端异步描述符
inode:pipe对应的inode
bufs:pipe_buffer回环数组(环形缓冲区)
iv.以上结构关系图
II.pipe_buf_operations
pipe_buf_operations主要用于记录pipe缓存的操作集:
/* fs/pipe.c */ 319 static const struct pipe_buf_operations anon_pipe_buf_ops = { 320 .can_merge = 1, 321 .map = generic_pipe_buf_map, 322 .unmap = generic_pipe_buf_unmap, 323 .confirm = generic_pipe_buf_confirm, 324 .release = anon_pipe_buf_release, 325 .steal = generic_pipe_buf_steal, 326 .get = generic_pipe_buf_get, 327 };
i.generic_pipe_buf_confirm
290 /** 291 * generic_pipe_buf_confirm - verify contents of the pipe buffer 292 * @info: the pipe that the buffer belongs to 293 * @buf: the buffer to confirm 294 * 295 * Description: 296 * This function does nothing, because the generic pipe code uses 297 * pages that are always good when inserted into the pipe. 298 */ 299 int generic_pipe_buf_confirm(struct pipe_inode_info *info, 300 struct pipe_buffer *buf) 301 { 302 return 0; 303 }
因为匿名pipe缓存的页帧在插入管道时其中的数据就已经是有效的(不像文件系统的页那样可能需要等待IO完成),所以confirm没有检查操作,直接返回0
ii.generic_pipe_buf_steal
245 /** 246 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer 247 * @pipe: the pipe that the buffer belongs to 248 * @buf: the buffer to attempt to steal 249 * 250 * Description: 251 * This function attempts to steal the &struct page attached to 252 * @buf. If successful, this function returns 0 and returns with 253 * the page locked. The caller may then reuse the page for whatever 254 * he wishes; the typical use is insertion into a different file 255 * page cache. 256 */ 257 int generic_pipe_buf_steal(struct pipe_inode_info *pipe, 258 struct pipe_buffer *buf) 259 { 260 struct page *page = buf->page; 261 262 /* 263 * A reference of one is golden, that means that the owner of this 264 * page is the only one holding a reference to it. lock the page 265 * and return OK. 266 */ 267 if (page_count(page) == 1) { 268 lock_page(page); 269 return 0; 270 } 271 272 return 1; 273 }
steal主要用于获取pipe缓存的控制权,主要就是锁定页描述符;pipefs暂未使用steal
iii.generic_pipe_buf_map
201 /** 202 * generic_pipe_buf_map - virtually map a pipe buffer 203 * @pipe: the pipe that the buffer belongs to 204 * @buf: the buffer that should be mapped 205 * @atomic: whether to use an atomic map 206 * 207 * Description: 208 * This function returns a kernel virtual address mapping for the 209 * pipe_buffer passed in @buf. If @atomic is set, an atomic map is provided 210 * and the caller has to be careful not to fault before calling 211 * the unmap function. 212 * 213 * Note that this function occupies KM_USER0 if @atomic != 0. 214 */ 215 void *generic_pipe_buf_map(struct pipe_inode_info *pipe, 216 struct pipe_buffer *buf, int atomic) 217 { 218 if (atomic) { 219 buf->flags |= PIPE_BUF_FLAG_ATOMIC; 220 return kmap_atomic(buf->page, KM_USER0); 221 } 222 223 return kmap(buf->page); 224 }
将缓存页帧映射到内核地址空间,以便内核访问该缓存页帧:atomic置位时通过kmap_atomic使用KM_USER0临时映射槽(在unmap之前不能发生缺页异常),否则通过kmap映射到内核空间的永久映射区中;由于内核不能直接访问物理地址高于highstart_pfn<<PAGE_SHIFT的高端内存(ZONE_HIGHMEM区域中),所以在访问前必须将页帧映射到内核空间后才能访问。
iv.generic_pipe_buf_unmap
226 /** 227 * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer 228 * @pipe: the pipe that the buffer belongs to 229 * @buf: the buffer that should be unmapped 230 * @map_data: the data that the mapping function returned 231 * 232 * Description: 233 * This function undoes the mapping that ->map() provided. 234 */ 235 void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, 236 struct pipe_buffer *buf, void *map_data) 237 { 238 if (buf->flags & PIPE_BUF_FLAG_ATOMIC) { 239 buf->flags &= ~PIPE_BUF_FLAG_ATOMIC; 240 kunmap_atomic(map_data, KM_USER0); 241 } else 242 kunmap(buf->page); 243 }
取消页帧在内核空间的映射(原子映射用kunmap_atomic,否则用kunmap);因为内核中用于映射高端内存的地址空间有限,所以访问完页帧的数据后,立即unmap掉该页帧
v.generic_pipe_buf_get
275 /** 276 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer 277 * @pipe: the pipe that the buffer belongs to 278 * @buf: the buffer to get a reference to 279 * 280 * Description: 281 * This function grabs an extra reference to @buf. It's used in 282 * in the tee() system call, when we duplicate the buffers in one 283 * pipe into another. 284 */ 285 void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) 286 { 287 page_cache_get(buf->page); 288 }
增加pipe缓存的引用计数器,由于pipe缓存与页帧是一对一的关系,所以可以直接增加页帧的引用计数器即可
vi.anon_pipe_buf_release
305 /** 306 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer 307 * @pipe: the pipe that the buffer belongs to 308 * @buf: the buffer to put a reference to 309 * 310 * Description: 311 * This function releases a reference to @buf. 312 */ 313 void generic_pipe_buf_release(struct pipe_inode_info *pipe, 314 struct pipe_buffer *buf) 315 { 316 page_cache_release(buf->page); 317 }
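释放pipe缓存;由于pipe缓存描述符是放在pipe描述符中的,所以只需要释放pipe缓存的页帧即可。注:上面贴出的代码是generic_pipe_buf_release,而anon_pipe_buf_ops中的release指向的是anon_pipe_buf_release;后者在释放时,如果页帧没有其他引用且pipe->tmp_page为空,会将页帧缓存到tmp_page中,否则才释放页帧的引用(请对照内核源码确认)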
III.文件操作集
当创建pipe/FIFO时,内核会分配file,dentry,inode,pipe_inode_info对象;
并将file对象的f_op指向read_pipefifo_fops/write_pipefifo_fops/rdwr_pipefifo_fops;后续的read,write,poll等系统调用,会通过vfs调用f_op中相应的方法。
pipe/FIFO文件操作集如下:
/* fs/pipe.c */ 831 /* 832 * The file_operations structs are not static because they 833 * are also used in linux/fs/fifo.c to do operations on FIFOs. 834 * 835 * Pipes reuse fifos' file_operations structs. 836 */ 837 const struct file_operations read_pipefifo_fops = { 838 .llseek = no_llseek, 839 .read = do_sync_read, 840 .aio_read = pipe_read, 841 .write = bad_pipe_w, 842 .poll = pipe_poll, 843 .unlocked_ioctl = pipe_ioctl, 844 .open = pipe_read_open, 845 .release = pipe_read_release, 846 .fasync = pipe_read_fasync, 847 }; 848 849 const struct file_operations write_pipefifo_fops = { 850 .llseek = no_llseek, 851 .read = bad_pipe_r, 852 .write = do_sync_write, 853 .aio_write = pipe_write, 854 .poll = pipe_poll, 856 .open = pipe_write_open, 857 .release = pipe_write_release, 858 .fasync = pipe_write_fasync, 859 }; 860 861 const struct file_operations rdwr_pipefifo_fops = { 862 .llseek = no_llseek, 863 .read = do_sync_read, 864 .aio_read = pipe_read, 865 .write = do_sync_write, 866 .aio_write = pipe_write, 867 .poll = pipe_poll, 868 .unlocked_ioctl = pipe_ioctl, 869 .open = pipe_rdwr_open, 870 .release = pipe_rdwr_release, 871 .fasync = pipe_rdwr_fasync, 872 };
read_pipefifo_fops:pipe读端文件操作/FIFO只读方式文件操作
write_pipefifo_fops:pipe写端文件操作/FIFO只写方式文件操作
rdwr_pipefifo_fops:FIFO读写方式文件操作
i.open
暂时未发现什么地方触发read_pipefifo_fops/write_pipefifo_fops/rdwr_pipefifo_fops中的open操作;
注:
打开pipe文件(即FIFO)触发的是def_fifo_fops(fs/fifo.c)的fifo_open
ext4文件系统打开pipe文件代码过程如下:
1.取文件操作
VFS:
open->do_sys_open->do_filp_open->path_lookup_open->do_path_lookup->path_walk->link_path_walk->__link_path_walk->do_lookup->real_lookup
ext4:
ext4_lookup->ext4_iget->init_special_inode->def_fifo_fops
2.open触发
open->do_sys_open->do_filp_open->nameidata_to_filp->__dentry_open
ii.fasync
fasync主要是用于启用异步I/O操作;
以下方式可以启用异步I/O,当然必须得有内核的支持:
1.open文件时,flags中O_ASYNC标识置位;
2.fcntl的F_SETFL命令,设置文件的状态标识O_ASYNC;代码跟踪fcntl->do_fcntl->setfl->fasync
注:
O_ASYNC可以启动信号驱动I/O,即当文件描述符可读或可写后,内核发送信号给用户进程(信号默认是SIGIO);支持O_ASYNC的只有:终端,伪终端,socket,pipe,FIFO
/* fs/pipe.c */ 708 static int 709 pipe_read_fasync(int fd, struct file *filp, int on) 710 { 711 struct inode *inode = filp->f_path.dentry->d_inode; 712 int retval; 713 714 mutex_lock(&inode->i_mutex); 715 retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers); 716 mutex_unlock(&inode->i_mutex); 717 718 return retval; 719 } 720 721 722 static int 723 pipe_write_fasync(int fd, struct file *filp, int on) 724 { 725 struct inode *inode = filp->f_path.dentry->d_inode; 726 int retval; 727 728 mutex_lock(&inode->i_mutex); 729 retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers); 730 mutex_unlock(&inode->i_mutex); 731 732 return retval; 733 } 734 735 736 static int 737 pipe_rdwr_fasync(int fd, struct file *filp, int on) 738 { 739 struct inode *inode = filp->f_path.dentry->d_inode; 740 struct pipe_inode_info *pipe = inode->i_pipe; 741 int retval; 742 743 mutex_lock(&inode->i_mutex); 744 retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); 745 if (retval >= 0) { 746 retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); 747 if (retval < 0) /* this can happen only if on == T */ 748 fasync_helper(-1, filp, 0, &pipe->fasync_readers); 749 } 750 mutex_unlock(&inode->i_mutex); 751 return retval; 752 }
将文件添加到pipe的异步读fasync_readers/异步写fasync_writers的通知链表中;
当写pipe后,会发送信号(默认SIGIO)给fasync_readers通知链表中的文件所属进程;(见pipe_write)
当读pipe后,会发送信号(默认SIGIO)给fasync_writers通知链表中的文件所属进程;(见pipe_read)
iii.ioctl
pipe_ioctl会在系统调用ioctl中调用,ioctl->do_vfs_ioctl->vfs_ioctl->unlocked_ioctl
628 static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 629 { 630 struct inode *inode = filp->f_path.dentry->d_inode; 631 struct pipe_inode_info *pipe; 632 int count, buf, nrbufs; 633 634 switch (cmd) { 635 case FIONREAD: 636 mutex_lock(&inode->i_mutex); 637 pipe = inode->i_pipe; 638 count = 0; 639 buf = pipe->curbuf; 640 nrbufs = pipe->nrbufs; 641 while (--nrbufs >= 0) { 642 count += pipe->bufs[buf].len; 643 buf = (buf+1) & (PIPE_BUFFERS-1); 644 } 645 mutex_unlock(&inode->i_mutex); 646 647 return put_user(count, (int __user *)arg); 648 default: 649 return -EINVAL; 650 } 651 }
pipe_ioctl只支持FIONREAD命令,用于取pipe缓存中的数据大小
iv.write
pipe_write用于往管道缓存中写数据,当有读者被阻塞时唤醒读者进程;当管道缓存写满时阻塞写进程,直到缓存中有数据被读出,即有空闲缓存时,阻塞的写进程被唤醒。
write->vfs_write->do_sync_write->pipe_write(文件操作集中.write为do_sync_write,.aio_write为pipe_write)
442 static ssize_t 443 pipe_write(struct kiocb *iocb, const struct iovec *_iov, 444 unsigned long nr_segs, loff_t ppos) 445 { 446 struct file *filp = iocb->ki_filp; 447 struct inode *inode = filp->f_path.dentry->d_inode; 448 struct pipe_inode_info *pipe; 449 ssize_t ret; 450 int do_wakeup; 451 struct iovec *iov = (struct iovec *)_iov; 452 size_t total_len; 453 ssize_t chars; 454 455 total_len = iov_length(iov, nr_segs); 456 /* Null write succeeds. */ 457 if (unlikely(total_len == 0)) 458 return 0; 459 460 do_wakeup = 0; 461 ret = 0; 462 mutex_lock(&inode->i_mutex); 463 pipe = inode->i_pipe; 464 465 if (!pipe->readers) { 466 send_sig(SIGPIPE, current, 0); 467 ret = -EPIPE; 468 goto out; 469 } 470 471 /* We try to merge small writes */ 472 chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ 473 if (pipe->nrbufs && chars != 0) { 474 int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & 475 (PIPE_BUFFERS-1); 476 struct pipe_buffer *buf = pipe->bufs + lastbuf; 477 const struct pipe_buf_operations *ops = buf->ops; 478 int offset = buf->offset + buf->len; 479 480 if (ops->can_merge && offset + chars <= PAGE_SIZE) { 481 int error, atomic = 1; 482 void *addr; 483 484 error = ops->confirm(pipe, buf); 485 if (error) 486 goto out; 487 488 iov_fault_in_pages_read(iov, chars); 489 redo1: 490 addr = ops->map(pipe, buf, atomic); 491 error = pipe_iov_copy_from_user(offset + addr, iov, 492 chars, atomic); 493 ops->unmap(pipe, buf, addr); 494 ret = error; 495 do_wakeup = 1; 496 if (error) { 497 if (atomic) { 498 atomic = 0; 499 goto redo1; 500 } 501 goto out; 502 } 503 buf->len += chars; 504 total_len -= chars; 505 ret = chars; 506 if (!total_len) 507 goto out; 508 } 509 } 510 511 for (;;) { 512 int bufs; 513 514 if (!pipe->readers) { 515 send_sig(SIGPIPE, current, 0); 516 if (!ret) 517 ret = -EPIPE; 518 break; 519 } 520 bufs = pipe->nrbufs; 521 if (bufs < PIPE_BUFFERS) { 522 int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); 523 struct pipe_buffer *buf = pipe->bufs + 
newbuf; 524 struct page *page = pipe->tmp_page; 525 char *src; 526 int error, atomic = 1; 527 528 if (!page) { 529 page = alloc_page(GFP_HIGHUSER); 530 if (unlikely(!page)) { 531 ret = ret ? : -ENOMEM; 532 break; 533 } 534 pipe->tmp_page = page; 535 } 536 /* Always wake up, even if the copy fails. Otherwise 537 * we lock up (O_NONBLOCK-)readers that sleep due to 538 * syscall merging. 539 * FIXME! Is this really true? 540 */ 541 do_wakeup = 1; 542 chars = PAGE_SIZE; 543 if (chars > total_len) 544 chars = total_len; 545 546 iov_fault_in_pages_read(iov, chars); 547 redo2: 548 if (atomic) 549 src = kmap_atomic(page, KM_USER0); 550 else 551 src = kmap(page); 552 553 error = pipe_iov_copy_from_user(src, iov, chars, 554 atomic); 555 if (atomic) 556 kunmap_atomic(src, KM_USER0); 557 else 558 kunmap(page); 559 560 if (unlikely(error)) { 561 if (atomic) { 562 atomic = 0; 563 goto redo2; 564 } 565 if (!ret) 566 ret = error; 567 break; 568 } 569 ret += chars; 570 571 /* Insert it into the buffer array */ 572 buf->page = page; 573 buf->ops = &anon_pipe_buf_ops; 574 buf->offset = 0; 575 buf->len = chars; 576 pipe->nrbufs = ++bufs; 577 pipe->tmp_page = NULL; 578 579 total_len -= chars; 580 if (!total_len) 581 break; 582 } 583 if (bufs < PIPE_BUFFERS) 584 continue; 585 if (filp->f_flags & O_NONBLOCK) { 586 if (!ret) 587 ret = -EAGAIN; 588 break; 589 } 590 if (signal_pending(current)) { 591 if (!ret) 592 ret = -ERESTARTSYS; 593 break; 594 } 595 if (do_wakeup) { 596 wake_up_interruptible_sync(&pipe->wait); 597 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 598 do_wakeup = 0; 599 } 600 pipe->waiting_writers++; 601 pipe_wait(pipe); 602 pipe->waiting_writers--; 603 } 604 out: 605 mutex_unlock(&inode->i_mutex); 606 if (do_wakeup) { 607 wake_up_interruptible_sync(&pipe->wait); 608 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 609 } 610 if (ret > 0) 611 file_update_time(filp); 612 return ret; 613 }
1.计算写数据长度,如果长度为0直接返回
2.获取pipe互斥锁,进入数据复制临界区
3.当没有读者时返回EPIPE错误,并向当前写进程发送SIGPIPE信号
4.计算写数据超过页大小的整数倍的长度(主要用于将余数部分与当前缓存合并,整数页分配新页帧存储)
5.如果当前缓存有空间容纳余数大小的数据,并且缓存可以合并数据,就复制余数长度数据到当前缓存中;如果全部数据复制完成,跳到步骤7
注:
由于缓存页帧可能是高端内存页,所以要用confirm,map,写数据,unmap一系列操作;
由于用户进程写pipe的数据所在页帧可能被swap到硬盘中,内核访问就会出现缺页异常;为了能够原子复制,在复制前先触发缺页异常,主要通过iov_fault_in_pages_read去预触发缺页异常;如果原子复制仍然失败,则改用非原子映射再重试一次。
6.分配新缓存存放数据
A.当没有读者时返回EPIPE错误,并向当前写进程发送SIGPIPE信号;当pipe缓存被写满后,仍有数据未写,写进程会被阻塞,pipe锁被释放,此时读进程可以获取pipe锁进而读数据,读到数据后可能会关闭pipe的读端,所以每循环一次都会检测读者个数。
B.当有空闲缓存空间时
a.从tmp_page中分配页帧,如果tmp_page没有页帧则从伙伴系统中获取分配页帧
b.iov_fault_in_pages_read进行读用户地址空间缺页异常预触发,以便后面原子复制数据,保证数据从用户空间往内核空间复制时不产生缺页异常。
c.将页帧映射到内核地址空间(先用kmap_atomic做原子映射;如果原子复制失败,则改用kmap映射到内核永久映射区后重试),获得线性地址,以便内核访问物理页帧
d.将数据从用户空间复制到页帧中
e.取消页帧在内核空间的映射(kunmap_atomic/kunmap)
f.初始化pipe缓存,如缓存页帧、偏移、大小、操作等
g.如果数据复制完,走步骤7,退出;否则走步骤6继续循环
C.当没有空闲缓存空间时
a.如果是非阻塞写时,有数据写入则返回写入的数据长度,没有数据写入则走步骤7并返回EAGAIN错误
b.如果有信号产生,有数据写入则返回写入的数据长度,没有数据写入则走步骤7并返回ERESTARTSYS错误,内核处理完信号后会自动重启系统调用write
c.如果有数据写入且之前没有唤醒操作,则唤醒被阻塞的读者进程;向设置了O_ASYNC标识的文件所属读者进程发送异步I/O信号SIGIO
d.阻塞写者计数器加1,释放pipe锁阻塞当前进程;进程被唤醒时获取pipe锁,并将阻塞写者计数器减1
e.走步骤6继续循环
7.释放pipe互斥锁,退出数据复制临界区
8.如果有数据写入且之前没有唤醒操作,则唤醒被阻塞的读者进程;向设置了O_ASYNC标识的文件所属读者进程发送异步I/O信号SIGIO
9.返回写的数据长度
注:
当没有pipe缓存空间时,不管是O_NONBLOCK的写还是阻塞进程因信号而被唤醒,在有数据写入时都会返回实际写入的数据长度;所以需在用户进程中判断实际写入数据的长度是否是预期的写入长度。
v.read
pipe_read用于从管道缓存中读数据,当有写者被阻塞时唤醒写者进程;当管道缓存中没有数据时阻塞读进程,直到缓存中有数据被写入,阻塞的读进程被唤醒。
read->vfs_read->do_sync_read->pipe_read(文件操作集中.read为do_sync_read,.aio_read为pipe_read)
329 static ssize_t 330 pipe_read(struct kiocb *iocb, const struct iovec *_iov, 331 unsigned long nr_segs, loff_t pos) 332 { 333 struct file *filp = iocb->ki_filp; 334 struct inode *inode = filp->f_path.dentry->d_inode; 335 struct pipe_inode_info *pipe; 336 int do_wakeup; 337 ssize_t ret; 338 struct iovec *iov = (struct iovec *)_iov; 339 size_t total_len; 340 341 total_len = iov_length(iov, nr_segs); 342 /* Null read succeeds. */ 343 if (unlikely(total_len == 0)) 344 return 0; 345 346 do_wakeup = 0; 347 ret = 0; 348 mutex_lock(&inode->i_mutex); 349 pipe = inode->i_pipe; 350 for (;;) { 351 int bufs = pipe->nrbufs; 352 if (bufs) { 353 int curbuf = pipe->curbuf; 354 struct pipe_buffer *buf = pipe->bufs + curbuf; 355 const struct pipe_buf_operations *ops = buf->ops; 356 void *addr; 357 size_t chars = buf->len; 358 int error, atomic; 359 360 if (chars > total_len) 361 chars = total_len; 362 363 error = ops->confirm(pipe, buf); 364 if (error) { 365 if (!ret) 366 ret = error; 367 break; 368 } 369 370 atomic = !iov_fault_in_pages_write(iov, chars); 371 redo: 372 addr = ops->map(pipe, buf, atomic); 373 error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic); 374 ops->unmap(pipe, buf, addr); 375 if (unlikely(error)) { 376 /* 377 * Just retry with the slow path if we failed. 378 */ 379 if (atomic) { 380 atomic = 0; 381 goto redo; 382 } 383 if (!ret) 384 ret = error; 385 break; 386 } 387 ret += chars; 388 buf->offset += chars; 389 buf->len -= chars; 390 if (!buf->len) { 391 buf->ops = NULL; 392 ops->release(pipe, buf); 393 curbuf = (curbuf + 1) & (PIPE_BUFFERS-1); 394 pipe->curbuf = curbuf; 395 pipe->nrbufs = --bufs; 396 do_wakeup = 1; 397 } 398 total_len -= chars; 399 if (!total_len) 400 break; /* common path: read succeeded */ 401 } 402 if (bufs) /* More to do? */ 403 continue; 404 if (!pipe->writers) 405 break; 406 if (!pipe->waiting_writers) { 407 /* syscall merging: Usually we must not sleep 408 * if O_NONBLOCK is set, or if we got some data. 
409 * But if a writer sleeps in kernel space, then 410 * we can wait for that data without violating POSIX. 411 */ 412 if (ret) 413 break; 414 if (filp->f_flags & O_NONBLOCK) { 415 ret = -EAGAIN; 416 break; 417 } 418 } 419 if (signal_pending(current)) { 420 if (!ret) 421 ret = -ERESTARTSYS; 422 break; 423 } 424 if (do_wakeup) { 425 wake_up_interruptible_sync(&pipe->wait); 426 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 427 } 428 pipe_wait(pipe); 429 } 430 mutex_unlock(&inode->i_mutex); 431 432 /* Signal writers asynchronously that there is more room. */ 433 if (do_wakeup) { 434 wake_up_interruptible_sync(&pipe->wait); 435 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 436 } 437 if (ret > 0) 438 file_accessed(filp); 439 return ret; 440 }
1.计算读数据长度,如果长度为0直接返回
2.获取pipe互斥锁,进入读数据临界区
3.如果pipe缓存中有数据
A.通过confirm,map,copy,unmap一系列操作将数据从内核空间的pipe缓存复制到用户进程空间。
B.如果当前缓存pipe_buffer中数据复制完,则释放当前pipe_buffer;将唤醒标识置1
C.如果复制完所需的数据,跳转步骤5退出
4.如果pipe缓存中没有数据
A.如果没有写者了,跳转步骤5退出
B.如果没有被阻塞的写者进程:已读出部分数据时返回实际读出的数据长度;未读出数据且是O_NONBLOCK读时返回EAGAIN错误
C.如果有信号产生,已读出数据时返回实际读出的数据长度;没有读出数据时,返回ERESTARTSYS错误,内核处理完信号后会自动重启系统调用read
D.如果唤醒标识do_wakeup置位,唤醒被阻塞的写者进程;向设置了O_ASYNC标识的文件所属写者进程发送异步I/O信号SIGIO
E.释放pipe互斥锁,进程被阻塞;如果进程被唤醒,获取pipe互斥锁,跳转步骤3继续循环
5.释放pipe互斥锁,退出读数据临界区
6.如果唤醒标识do_wakeup置位,唤醒被阻塞的写者进程;向设置了O_ASYNC标识的文件所属写者进程发送异步I/O信号SIGIO
7.返回实际读出的数据长度
注:
由于实际读出的数据长度可能比要求的小,所以要在程序中判断实际读出数据长度
vi.poll
pipe_poll主要用于返回文件当前可以进行的poll操作
poll->do_sys_poll->do_poll->do_pollfd->pipe_poll/select->core_sys_select->do_select->pipe_poll
653 /* No kernel lock held - fine */ 654 static unsigned int 655 pipe_poll(struct file *filp, poll_table *wait) 656 { 657 unsigned int mask; 658 struct inode *inode = filp->f_path.dentry->d_inode; 659 struct pipe_inode_info *pipe = inode->i_pipe; 660 int nrbufs; 661 662 poll_wait(filp, &pipe->wait, wait); 663 664 /* Reading only -- no need for acquiring the semaphore. */ 665 nrbufs = pipe->nrbufs; 666 mask = 0; 667 if (filp->f_mode & FMODE_READ) { 668 mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0; 669 if (!pipe->writers && filp->f_version != pipe->w_counter) 670 mask |= POLLHUP; 671 } 672 673 if (filp->f_mode & FMODE_WRITE) { 674 mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0; 675 /* 676 * Most Unices do not set POLLERR for FIFOs but on Linux they 677 * behave exactly like pipes for poll(). 678 */ 679 if (!pipe->readers) 680 mask |= POLLERR; 681 } 682 683 return mask; 684 }
1.将wait添加到pipe文件的等待队列中
2.如果管道是读端,当有缓存数据时mask添加POLLIN | POLLRDNORM;如果没有写者且写者关闭(打开只读管道时f_version=0/w_counter,打开写管道时w_counter会加1),mask添加POLLHUP,即管道写端打开过(w_counter!=f_version)但又关闭了(writers=0)则表示管道读端POLLHUP
3.如果管道是写端,当pipe缓存有空闲空间时mask添加POLLOUT | POLLWRNORM;如果没有读者,mask添加POLLERR
4.返回pipe管道文件可以进行的poll操作mask
vii.release
当close文件时会调用release操作(文件引用计数器为0)
close->filp_close->fput->__fput->release
/* fs/pipe.c */ 686 static int 687 pipe_release(struct inode *inode, int decr, int decw) 688 { 689 struct pipe_inode_info *pipe; 690 691 mutex_lock(&inode->i_mutex); 692 pipe = inode->i_pipe; 693 pipe->readers -= decr; 694 pipe->writers -= decw; 695 696 if (!pipe->readers && !pipe->writers) { 697 free_pipe_info(inode); 698 } else { 699 wake_up_interruptible_sync(&pipe->wait); 700 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 701 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 702 } 703 mutex_unlock(&inode->i_mutex); 704 705 return 0; 706 } 755 static int 756 pipe_read_release(struct inode *inode, struct file *filp) 757 { 758 return pipe_release(inode, 1, 0); 759 } 760 761 static int 762 pipe_write_release(struct inode *inode, struct file *filp) 763 { 764 return pipe_release(inode, 0, 1); 765 } 766 767 static int 768 pipe_rdwr_release(struct inode *inode, struct file *filp) 769 { 770 int decr, decw; 771 772 decr = (filp->f_mode & FMODE_READ) != 0; 773 decw = (filp->f_mode & FMODE_WRITE) != 0; 774 return pipe_release(inode, decr, decw); 775 }
1.获取管道互斥锁,进入管道操作临界区
2.读/写者计数器减1
3.如果管道既没有读者也没有写者,则释放管道缓存及管道描述符
4.否则,唤醒管道等待队列中的阻塞进程,向管道读者&写者发送异步I/O信号SIGIO
IV.pipefs
1119 /* 1120 * pipefs should _never_ be mounted by userland - too much of security hassle, 1121 * no real gain from having the whole whorehouse mounted. So we don't need 1122 * any operations on the root directory. However, we need a non-trivial 1123 * d_name - pipe: will go nicely and kill the special-casing in procfs. 1124 */ 1125 static int pipefs_get_sb(struct file_system_type *fs_type, 1126 int flags, const char *dev_name, void *data, 1127 struct vfsmount *mnt) 1128 { 1129 return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt); 1130 } 1131 1132 static struct file_system_type pipe_fs_type = { 1133 .name = "pipefs", 1134 .get_sb = pipefs_get_sb, 1135 .kill_sb = kill_anon_super, 1136 }; 1137 1138 static int __init init_pipe_fs(void) 1139 { 1140 int err = register_filesystem(&pipe_fs_type); 1141 1142 if (!err) { 1143 pipe_mnt = kern_mount(&pipe_fs_type); 1144 if (IS_ERR(pipe_mnt)) { 1145 err = PTR_ERR(pipe_mnt); 1146 unregister_filesystem(&pipe_fs_type); 1147 } 1148 } 1149 return err; 1150 } 1151 1152 static void __exit exit_pipe_fs(void) 1153 { 1154 unregister_filesystem(&pipe_fs_type); 1155 mntput(pipe_mnt); 1156 }
pipefs是一个虚拟的文件系统,挂载在内核中而不会被挂载到根文件系统中