xv6文件系统采用了分层的实现,下面的每一层都向上提供接口供上层调用,这里并不阐述xv6文件系统的系统细节,仅仅针对每一层需要注意的问题和各种接口的联系做解释,具体的文件系统细节可参考xv6中文文档。
xv6文件系统
xv6将磁盘划分为按顺序编号的块,每块512字节,磁盘读写总是以块为单位。xv6使用结构buf来表示磁盘块数据在内核中的缓存:
// In-kernel representation of one cached disk block.
struct buf {
// Status bits; the code in this file tests and sets B_VALID and B_DIRTY.
int flags;
// Device number and block number identifying the disk block held here.
uint dev;
uint blockno;
// Sleep lock taken by bget() and dropped by brelse().
struct sleeplock lock;
// Reference count; bget() only recycles buffers whose count is zero.
uint refcnt;
struct buf *prev; // LRU cache list
struct buf *next;
struct buf *qnext; // disk queue
// Block contents, BSIZE bytes.
uchar data[BSIZE];
};
xv6设置有内核缓冲区来缓存一定量的块,并用LRU来实现缓存替换。
// The buffer cache: a fixed pool of NBUF buffers threaded onto one
// circular doubly linked list.
struct {
// Spinlock held while the list and the buffers' refcnt are examined or changed.
struct spinlock lock;
// Statically allocated storage for every buffer.
struct buf buf[NBUF];
// Linked list of all buffers, through prev/next.
// head.next is most recently used.
struct buf head;
} bcache;
xv6在内核中分配了静态数组,然后通过head buf将其组织成双向链表。双向链表按照最近使用的顺序排列(head.next是最近使用的块),这样的组织方式能让块的查找和替换更加高效。
块缓冲层提供有binit,bget,bread,bwrite,brelse接口。
binit初始化bcache结构并设置块缓冲区需要使用的锁。
// Prepare the buffer cache for use: initialise the cache spinlock, then
// give every buffer its sleep lock and thread it onto the circular list
// rooted at bcache.head.
void
binit(void)
{
  struct buf *head = &bcache.head;
  struct buf *b;
  int i;

  initlock(&bcache.lock, "bcache");

//PAGEBREAK!
  // Start from an empty ring: the sentinel points at itself both ways.
  head->next = head;
  head->prev = head;

  // Push each buffer in right behind the sentinel.
  for(i = 0; i < NBUF; i++){
    b = &bcache.buf[i];
    initsleeplock(&b->lock, "buffer");
    b->prev = head;
    b->next = head->next;
    head->next->prev = b;
    head->next = b;
  }
}
bread根据参数确定设备号和块编号并调用bget得到块缓冲结构。bget先在块缓冲区中查找对应的缓冲块,找到后增加其引用计数;如果该缓冲块正被其他进程占用,则当前进程在其睡眠锁上睡眠等待。如果bget没有找到相应的块缓冲结构,则从缓冲区中回收一个引用计数为零且不脏的缓冲块并返回,由bread调用iderw将数据读入内核。如果没有可回收的缓冲块,bget简单地panic。
// Find the buffer caching block `blockno` of device `dev`, or claim an
// unused buffer for it. Whichever path is taken, the buffer's sleep lock
// is held on return. Panics when no buffer can be recycled.
static struct buf*
bget(uint dev, uint blockno)
{
  struct buf *cand;

  acquire(&bcache.lock);

  // Pass 1 (from head.next forwards): is the block already cached?
  for(cand = bcache.head.next; cand != &bcache.head; cand = cand->next){
    if(cand->dev != dev || cand->blockno != blockno)
      continue;
    cand->refcnt++;
    release(&bcache.lock);
    acquiresleep(&cand->lock);
    return cand;
  }

  // Pass 2 (from head.prev backwards): recycle a buffer that nobody
  // references and that is clean. A B_DIRTY buffer that is not locked
  // means log.c has not yet committed its changes, so it must be kept.
  for(cand = bcache.head.prev; cand != &bcache.head; cand = cand->prev){
    if(cand->refcnt != 0 || (cand->flags & B_DIRTY) != 0)
      continue;
    cand->refcnt = 1;
    cand->flags = 0;
    cand->blockno = blockno;
    cand->dev = dev;
    release(&bcache.lock);
    acquiresleep(&cand->lock);
    return cand;
  }

  panic("bget: no buffers");
}
// Obtain a locked buffer holding the contents of block `blockno` on
// device `dev`. If the buffer from bget() is not marked B_VALID, it is
// handed to iderw() first.
struct buf*
bread(uint dev, uint blockno)
{
  struct buf *b = bget(dev, blockno);

  if((b->flags & B_VALID) == 0)
    iderw(b);
  return b;
}
bwrite将块缓冲结构写入磁盘
// Mark b dirty and hand it to iderw(). The caller must hold b's sleep
// lock; otherwise this panics.
void
bwrite(struct buf *b)
{
  if(holdingsleep(&b->lock) == 0)
    panic("bwrite");

  b->flags = b->flags | B_DIRTY;
  iderw(b);
}
brelse则减少块的引用次数,并移动块的位置实现LRU
// Give up a locked buffer: drop its sleep lock and one reference. When
// the last reference goes away the buffer is moved to the front of the
// list (right after bcache.head), i.e. the most-recently-used position.
void
brelse(struct buf *b)
{
  struct buf *head = &bcache.head;

  if(holdingsleep(&b->lock) == 0)
    panic("brelse");
  releasesleep(&b->lock);

  acquire(&bcache.lock);
  if(--b->refcnt == 0){
    // Nobody else holds a reference: unlink from the current position...
    b->prev->next = b->next;
    b->next->prev = b->prev;

    // ...and splice back in directly behind the sentinel.
    b->prev = head;
    b->next = head->next;
    head->next->prev = b;
    head->next = b;
  }
  release(&bcache.lock);
}
//PAGEBREAK!
// Blank page.
xv6使用了日志式文件系统来确保写操作不会导致文件系统的破坏,进程的写操作像一种“原子”操作,如果写操作过程中断电崩溃,将很大可能损坏文件系统,例如,在断电后目录有一个指向空闲i节点的项将可能导致严重的问题。
xv6使用了非常严格的日志读写来使写操作要么完全完成,要么完全未完成。所有的写操作首先都会写入磁盘中存放日志的区域,只有当真正的写操作完成后才会使日志失效。这样,就算在任何过程中因断电或者其他原因导致系统崩溃,文件系统的组织结构都不会损坏,结果是要么操作完全完成,要么都未完成。代价是每个写操作都要进行两次,降低了读写效率。
xv6在硬盘中的日志由一个初始块和若干数据块组成。初始块包括一个数组,数组的每一项记录对应数据块的内容应该写入文件系统中的哪一块;初始块还保存当前有效数据块的计数。在内存中同样需要一样的结构来存储这些数据。
// Header describing the logged blocks.
struct logheader {
// Number of entries of block[] currently in use.
int n;
// Block numbers recorded by log_write(), one per logged block.
int block[LOGSIZE];
};
// In-memory state of the log.
struct log {
// Spinlock taken around updates to the fields below.
struct spinlock lock;
// NOTE(review): presumably the first disk block of the log area - confirm.
int start;
// log_write() panics once lh.n reaches size - 1.
int size;
int outstanding; // how many FS sys calls are executing.
int committing; // in commit(), please wait.
// NOTE(review): presumably the device holding the log - confirm.
int dev;
// In-memory copy of the log header.
struct logheader lh;
};
通过这种方式,bwrite可以使用log_write替代。当修改了内存中的块缓冲区后,log_write在block数组中记录这个块需要写到磁盘中的哪一块,但并不立即写入磁盘。当调用commit时,先调用write_log将数据写入日志区域,再调用write_head更新初始块,然后调用install_trans真正地更新文件系统。在此期间发生崩溃,日志中都会留有非零的计数,以便重启后再次进行写操作。最后将计数变量置零并再次更新日志初始块,使日志失效。
通过log_write写入磁盘时,数据并不会立即写入磁盘,只有当调用commit来提交日志时,磁盘操作才会正式开始。
// Commit the current transaction, if the log holds any blocks.
static void
commit()
{
  // Nothing logged: nothing to do.
  if (log.lh.n <= 0)
    return;

  write_log();      // Write modified blocks from cache to log
  write_head();     // Write header to disk -- the real commit
  install_trans();  // Now install writes to home locations

  // Erase the transaction from the log by writing an empty header.
  log.lh.n = 0;
  write_head();
}
// Record that buffer b belongs to the current transaction.
//
// The block number is stored in the in-memory log header (a block already
// recorded is not recorded twice) and the buffer is flagged B_DIRTY so
// that bget() will not recycle it. Nothing is written to disk here.
//
// Panics if the log is full or if no file-system operation is outstanding.
void
log_write(struct buf *b)
{
  int i;

  // log.lh.n and log.outstanding are modified by other code under
  // log.lock, so the sanity checks must also run with the lock held.
  acquire(&log.lock);
  if (log.lh.n >= LOGSIZE || log.lh.n >= log.size - 1)
    panic("too big a transaction");
  if (log.outstanding < 1)
    panic("log_write outside of trans");

  for (i = 0; i < log.lh.n; i++) {
    if (log.lh.block[i] == b->blockno) // log absorption
      break;
  }
  log.lh.block[i] = b->blockno;
  if (i == log.lh.n)
    log.lh.n++; // first time this block is logged
  b->flags |= B_DIRTY; // prevent eviction
  release(&log.lock);
}
xv6日志读写支持并发操作,当要写操作时,调用begin_op,结束时调用end_op,begin_op检查日志是否正在提交,如果正在提交则睡眠当前进程,如果不在提交则增加操作次数,end_op减少操作次数,当没有任何进程正在操作log时,调用commit提交日志。
// Called at the start of each FS system call. Sleeps on &log while a
// commit is in progress or while admitting one more operation could
// exhaust the log space; then counts the caller as outstanding.
void
begin_op(void)
{
  acquire(&log.lock);
  for(;;){
    // Admit the caller only when no commit is running and reserving
    // MAXOPBLOCKS for it still fits within LOGSIZE.
    if(!log.committing &&
       log.lh.n + (log.outstanding+1)*MAXOPBLOCKS <= LOGSIZE)
      break;
    sleep(&log, &log.lock);
  }
  log.outstanding += 1;
  release(&log.lock);
}
// Called at the end of each FS system call. Drops the caller from the
// outstanding count; if it was the last outstanding operation, commits.
void
end_op(void)
{
  int last;

  acquire(&log.lock);
  log.outstanding -= 1;
  if(log.committing)
    panic("log.committing");
  last = (log.outstanding == 0);
  if(last)
    log.committing = 1;
  else
    wakeup(&log);  // begin_op() may be waiting for log space.
  release(&log.lock);

  if(!last)
    return;

  // commit() runs without any lock held, since sleeping with locks
  // held is not allowed.
  commit();

  acquire(&log.lock);
  log.committing = 0;
  wakeup(&log);
  release(&log.lock);
}
// Allocate a zeroed disk block on device `dev` and return its block
// number. Scans the free bitmap one bitmap block (BPB bits) at a time;
// panics when every block is in use.
static uint
balloc(uint dev)
{
  int base, off, mask;
  struct buf *bp;

  for(base = 0; base < sb.size; base += BPB){
    bp = bread(dev, BBLOCK(base, sb));
    for(off = 0; off < BPB && base + off < sb.size; off++){
      mask = 1 << (off % 8);
      if(bp->data[off/8] & mask)
        continue;                 // Block already in use.

      bp->data[off/8] |= mask;    // Mark block in use.
      log_write(bp);
      brelse(bp);
      bzero(dev, base + off);
      return base + off;
    }
    brelse(bp);
  }
  panic("balloc: out of blocks");
}
// Free disk block `b` on device `dev` by clearing its bit in the free
// bitmap. Panics if the bit is already clear.
static void
bfree(int dev, uint b)
{
  struct buf *bitmap;
  int bit, mask;

  readsb(dev, &sb);
  bitmap = bread(dev, BBLOCK(b, sb));

  bit = b % BPB;            // position of b inside this bitmap block
  mask = 1 << (bit % 8);    // bit inside the byte data[bit/8]
  if(!(bitmap->data[bit/8] & mask))
    panic("freeing free block");

  bitmap->data[bit/8] &= ~mask;
  log_write(bitmap);
  brelse(bitmap);
}
i节点分为内核i节点(inode)和磁盘上的i节点(dinode),xv6使用i节点表来缓存i节点
// On-disk inode structure. ialloc() treats type == 0 as a free inode.
struct dinode {
short type; // File type
short major; // Major device number (T_DEV only)
short minor; // Minor device number (T_DEV only)
short nlink; // Number of links to inode in file system
uint size; // Size of file (bytes)
uint addrs[NDIRECT+1]; // Data block addresses
};
// In-memory copy of an inode. The fields from `type` onward mirror
// struct dinode; ilock() fills them from disk and iupdate() writes
// them back.
struct inode {
uint dev; // Device number
uint inum; // Inode number
int ref; // Reference count
struct sleeplock lock; // Taken by ilock(), dropped by iunlock()
int flags; // I_VALID
short type; // copy of disk inode
short major;
short minor;
short nlink;
uint size;
uint addrs[NDIRECT+1];
};
// Table of in-memory inodes; a slot with ref == 0 is free for iget().
struct {
// Spinlock held while slots are searched and while ref is changed.
struct spinlock lock;
// Fixed pool of NINODE in-memory inodes.
struct inode inode[NINODE];
} icache;
iinit负责初始化i节点相关内容
// Initialise the inode cache: set up the cache spinlock and every
// inode's sleep lock, then read the superblock of device `dev` into
// `sb` and print its fields.
void
iinit(int dev)
{
  int i;

  initlock(&icache.lock, "icache");
  for(i = 0; i < NINODE; i++)
    initsleeplock(&icache.inode[i].lock, "inode");

  readsb(dev, &sb);

  // Adjacent string literals are used instead of a backslash-newline
  // inside the literal, so the separator between "logstart %d" and
  // "inodestart" is explicit and independent of source indentation.
  cprintf("sb: size %d nblocks %d ninodes %d nlog %d logstart %d"
          " inodestart %d bmap start %d\n",
          sb.size, sb.nblocks, sb.ninodes, sb.nlog, sb.logstart,
          sb.inodestart, sb.bmapstart);
}
ialloc在磁盘中找到空闲i节点并返回内核i节点
// Allocate an inode of the given type on device `dev`. Scans the on-disk
// inodes from number 1 upwards for one whose type is 0, claims it, and
// returns the matching in-memory inode from iget(). Panics when none is
// free.
struct inode*
ialloc(uint dev, short type)
{
  struct dinode *dip;
  struct buf *bp;
  int inum;

  for(inum = 1; inum < sb.ninodes; inum++){
    bp = bread(dev, IBLOCK(inum, sb));
    dip = &((struct dinode*)bp->data)[inum % IPB];

    if(dip->type != 0){
      // In use; try the next one.
      brelse(bp);
      continue;
    }

    // A free inode: wipe it and stamp the requested type.
    memset(dip, 0, sizeof(*dip));
    dip->type = type;
    log_write(bp);   // mark it allocated on the disk
    brelse(bp);
    return iget(dev, inum);
  }
  panic("ialloc: no inodes");
}
iupdate将内核i节点相关内容写入磁盘i节点
// Copy the in-memory inode's type, device numbers, link count, size and
// block addresses into its on-disk inode and log the containing block.
void
iupdate(struct inode *ip)
{
  struct buf *bp = bread(ip->dev, IBLOCK(ip->inum, sb));
  struct dinode *dip = &((struct dinode*)bp->data)[ip->inum % IPB];

  dip->type  = ip->type;
  dip->major = ip->major;
  dip->minor = ip->minor;
  dip->nlink = ip->nlink;
  dip->size  = ip->size;
  memmove(dip->addrs, ip->addrs, sizeof(ip->addrs));

  log_write(bp);
  brelse(bp);
}
iget返回一个内核i节点
// Return the in-memory inode for (dev, inum). An entry that is already
// cached gets its reference count bumped; otherwise the first free slot
// is claimed with ref = 1 and flags = 0 (so I_VALID is clear). Panics
// when the table has no free slot.
static struct inode*
iget(uint dev, uint inum)
{
  struct inode *slot = 0;
  struct inode *ip;
  int i;

  acquire(&icache.lock);

  for(i = 0; i < NINODE; i++){
    ip = &icache.inode[i];

    if(ip->ref == 0){
      // Free entry: remember only the first one seen.
      if(slot == 0)
        slot = ip;
      continue;
    }

    // Is the inode already cached?
    if(ip->ref > 0 && ip->dev == dev && ip->inum == inum){
      ip->ref++;
      release(&icache.lock);
      return ip;
    }
  }

  // Not cached: recycle the remembered free entry.
  if(slot == 0)
    panic("iget: no inodes");

  slot->flags = 0;
  slot->ref = 1;
  slot->inum = inum;
  slot->dev = dev;
  release(&icache.lock);

  return slot;
}
idup增加i节点的引用计数并返回同一个i节点(并不真正复制i节点)
// Take one more reference on ip, under icache.lock, and return ip.
struct inode*
idup(struct inode *ip)
{
  acquire(&icache.lock);
  ip->ref += 1;
  release(&icache.lock);

  return ip;
}
ilock锁住i节点并在必要的时候读取i节点元数据
// Lock the given inode. If its I_VALID flag is clear, the inode's fields
// are first loaded from the on-disk inode. Panics on a null or
// unreferenced inode, or if the loaded type is 0.
void
ilock(struct inode *ip)
{
  struct dinode *dip;
  struct buf *bp;

  if(ip == 0 || ip->ref < 1)
    panic("ilock");

  acquiresleep(&ip->lock);

  // Already populated from disk: nothing more to do.
  if(ip->flags & I_VALID)
    return;

  bp = bread(ip->dev, IBLOCK(ip->inum, sb));
  dip = &((struct dinode*)bp->data)[ip->inum % IPB];
  ip->type  = dip->type;
  ip->major = dip->major;
  ip->minor = dip->minor;
  ip->nlink = dip->nlink;
  ip->size  = dip->size;
  memmove(ip->addrs, dip->addrs, sizeof(ip->addrs));
  brelse(bp);

  ip->flags |= I_VALID;
  if(ip->type == 0)
    panic("ilock: no type");
}
iunlock解锁i节点
// Release the sleep lock on the given inode. Panics unless ip is
// non-null, its lock is held, and it has at least one reference.
void
iunlock(struct inode *ip)
{
  if(!(ip != 0 && holdingsleep(&ip->lock) && ip->ref >= 1))
    panic("iunlock");

  releasesleep(&ip->lock);
}