linux文件系统写过程简析

Linux写入磁盘的过程依次经历 VFS -> 页缓存(page cache) -> 具体的文件系统(ext2/3/4、XFS、ReiserFS等) -> Block IO -> 设备驱动 -> SCSI指令(或者其他指令)。总体来说,Linux文件写入磁盘的过程比较复杂。

1、VFS(虚拟文件系统)

      Linux中采用VFS的方式屏蔽了多个文件系统之间的差别。当需要访问不同的设备或者其他文件系统时,采用挂载(mount)的方式进行访问(这里可以把文件系统理解为具体的设备)。正是因为使用了VFS,所有的文件系统和设备都通过统一的文件目录树视图访问,整个存储空间由一棵文件系统目录树来管理,屏蔽了底层多个文件系统之间的差别。当然,如果需要把自己编写的文件系统集成到Linux内核并通过VFS访问,可以采用模块加载的方式,相应的文件系统模块文件需要安装到系统目录/lib/modules/your-system-name/kernel/fs当中。VFS的作用远不止这些:通过VFS也可以访问设备,在Linux下所有的对象都是文件,这简化了系统的访问方式。

     1.1 正常情况下,所有的文件操作都通过系统调用进入VFS;特殊情况下也可以绕过文件系统直接操作原始设备。文件写入对应的系统调用为:

  #include <unistd.h>

  ssize_t  write(int fd,  const void * buffer, size_t  count);

    1.2 通过系统调用进入内核后,接下来的处理交给VFS层。处理过程中比较重要的函数是vfs_write和generic_file_aio_write:

  

 /*
  * vfs_write - VFS-level write of @count bytes from the user buffer @buf
  * to @file at the offset pointed to by @pos.
  *
  * Returns -EBADF when FMODE_WRITE is not set in file->f_mode, -EINVAL when
  * the file has neither a ->write nor an ->aio_write method, and -EFAULT
  * when access_ok() rejects the user buffer.  Otherwise returns a negative
  * value from rw_verify_area(), or the result of ->write / do_sync_write().
  */
 1 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
 2 {
 3     ssize_t ret;
 4 
 5     if (!(file->f_mode & FMODE_WRITE))
 6         return -EBADF;
 7     if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
 8         return -EINVAL;
 9     if (unlikely(!access_ok(VERIFY_READ, buf, count))) /* a write only reads from the user buffer */
10         return -EFAULT;
11 
12     ret = rw_verify_area(WRITE, file, pos, count);
13     if (ret >= 0) {
14         count = ret; /* non-negative result replaces count (presumably a possibly-clamped length; confirm) */
15         if (file->f_op->write)
16             ret = file->f_op->write(file, buf, count, pos);
17         else
18             ret = do_sync_write(file, buf, count, pos); /* no ->write method: use do_sync_write() instead */
19         if (ret > 0) { /* notify and account only when something was written */
20             fsnotify_modify(file->f_path.dentry);
21             add_wchar(current, ret);
22         }
23         inc_syscw(current); /* runs for every attempted write, even when ret <= 0 */
24     }
25 
26     return ret;
27 }

 

 1 /**
 2  * generic_file_aio_write - write data to a file
 3  * @iocb:    IO state structure
 4  * @iov:    vector with data to write
 5  * @nr_segs:    number of segments in the vector
 6  * @pos:    position in file where to write
 7  *
 8  * This is a wrapper around __generic_file_aio_write() to be used by most
 9  * filesystems. It takes care of syncing the file in case of O_SYNC file
10  * and acquires i_mutex as needed.
11  */
12 ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
13         unsigned long nr_segs, loff_t pos)
14 {
15     struct file *file = iocb->ki_filp;
16     struct inode *inode = file->f_mapping->host;
17     ssize_t ret;
18 
19     BUG_ON(iocb->ki_pos != pos); /* @pos must equal the position stored in the iocb */
20 
21     mutex_lock(&inode->i_mutex); /* i_mutex is held only around the write itself */
22     ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
23     mutex_unlock(&inode->i_mutex);
24 
25     if (ret > 0 || ret == -EIOCBQUEUED) { /* sync after bytes were written or the request was queued */
26         ssize_t err;
27 
28         err = generic_write_sync(file, pos, ret);
29         if (err < 0 && ret > 0) /* a sync failure replaces a positive byte count */
30             ret = err;
31     }
32     return ret;
33 }

  2、 VFS层的写入有经过page cache和不经过page cache两种方式,下面主要介绍经过page cache的处理。

      在VFS中,每个打开的文件在内核中都对应一个address_space数据结构,该数据结构用来描述该文件在page cache中的缓存内容,并且一个文件只对应一个address_space数据结构。

如下:   

 /*
  * struct address_space - page cache bookkeeping for one owner (@host).
  * The cached pages live in the @page_tree radix tree, guarded by
  * @tree_lock; @a_ops holds the operations used on this mapping.
  */
 1 struct address_space {
 2     struct inode        *host;        /* owner: inode, block_device */
 3     struct radix_tree_root    page_tree;    /* radix tree of all pages */
 4     spinlock_t        tree_lock;    /* and lock protecting it */
 5     unsigned int        i_mmap_writable;/* count VM_SHARED mappings */
 6     struct prio_tree_root    i_mmap;        /* tree of private and shared mappings */
 7     struct list_head    i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
 8     spinlock_t        i_mmap_lock;    /* protect tree, count, list */
 9     unsigned int        truncate_count;    /* Cover race condition with truncate */
10     unsigned long        nrpages;    /* number of total pages */
11     pgoff_t            writeback_index;/* writeback starts here */
12     const struct address_space_operations *a_ops;    /* methods */
13     unsigned long        flags;        /* error bits/gfp mask */
14     struct backing_dev_info *backing_dev_info; /* device readahead, etc */
15     spinlock_t        private_lock;    /* for use by the address_space */
16     struct list_head    private_list;    /* also for use by the address_space */
17     struct address_space    *assoc_mapping;    /* also for use by the address_space */
18     struct mutex        unmap_mutex;    /* to protect unmapping */
19 } __attribute__((aligned(sizeof(long))));

    文件内容的缓存采用基数树(radix tree)的方式保存在成员变量page_tree中,关于基数树的介绍参考[1]和[2]。下面是page cache写处理过程中的几个重要函数:

 /*
  * generic_file_buffered_write - write path that goes through the page cache.
  *
  * Initialises an iov_iter over @iov and passes it to
  * generic_perform_write().  On success the bytes written are added to
  * @written and *@ppos is set to @pos plus that amount.
  *
  * Returns @written when it is non-zero, otherwise the last status value.
  */
 1 ssize_t
 2 generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 3         unsigned long nr_segs, loff_t pos, loff_t *ppos,
 4         size_t count, ssize_t written)
 5 {
 6     struct file *file = iocb->ki_filp;
 7     struct address_space *mapping = file->f_mapping;
 8     ssize_t status;
 9     struct iov_iter i;
10 
11     iov_iter_init(&i, iov, nr_segs, count, written);
12     status = generic_perform_write(file, &i, pos);
13 
14     if (likely(status >= 0)) {
15         written += status; /* accumulate on top of the count passed in by the caller */
16         *ppos = pos + status;
17       }
18     
19     /*
20      * If we get here for O_DIRECT writes then we must have fallen through
21      * to buffered writes (block instantiation inside i_size).  So we sync
22      * the file data here, to try to honour O_DIRECT expectations.
23      */
24     if (unlikely(file->f_flags & O_DIRECT) && written)
25         status = filemap_write_and_wait_range(mapping,
26                     pos, pos + written - 1);
27 
28     return written ? written : status; /* status is returned only when nothing was written */
29 }

    其中generic_perform_write会调用page cache(address_space_operations)中的write_begin和write_end。

    Note: 通过VFS系统调用写文件时,允许在文件中的任意位置写入,其中就包括写入的起始位置不是一个block起始位置的情况,这时需要特殊处理,这些处理都在write_begin的调用过程中完成。

3、ext2/3/4中文件的处理。

   当page cache的写处理进行到write_begin时,对于ext4由ext4_write_begin处理,如下:

 /*
  * ext4_write_begin - ext4's write_begin step for a write of @len bytes
  * at file offset @pos.
  *
  * Starts a journal handle, grabs the page at the target index and lets
  * block_write_begin() (using ext4_get_block) prepare its buffers.  On
  * failure the page is released and blocks instantiated past i_size are
  * trimmed; an -ENOSPC result may restart the whole sequence.
  *
  * Returns ret: a negative errno from a failed step, otherwise the value
  * produced by block_write_begin() / walk_page_buffers().
  * NOTE(review): the listing elides the code that sets needed_blocks.
  */
 1 static int ext4_write_begin(struct file *file, struct address_space *mapping,
 2                 loff_t pos, unsigned len, unsigned flags,
 3                 struct page **pagep, void **fsdata)
 4 {
 5     struct inode *inode = mapping->host;
 6     int ret, needed_blocks;
 7     handle_t *handle;
 8     int retries = 0;
 9     struct page *page;
10     pgoff_t index;
11     unsigned from, to;
12         .........
13 
14     index = pos >> PAGE_CACHE_SHIFT; /* index of the page containing pos */
15     from = pos & (PAGE_CACHE_SIZE - 1); /* offset of pos inside that page */
16     to = from + len; /* end offset of the write inside the page */
17 
18 retry:
19     handle = ext4_journal_start(inode, needed_blocks);
20     if (IS_ERR(handle)) {
21         ret = PTR_ERR(handle);
22         goto out;
23     }
24 
25     /* We cannot recurse into the filesystem as the transaction is already
26      * started */
27     flags |= AOP_FLAG_NOFS;
28 
29     page = grab_cache_page_write_begin(mapping, index, flags);
30     if (!page) {
31         ext4_journal_stop(handle);
32         ret = -ENOMEM;
33         goto out;
34     }
35     *pagep = page; /* non-NULL *pagep: block_write_begin() will use this page */
36 
37     ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
38                 ext4_get_block);
39 
40     if (!ret && ext4_should_journal_data(inode)) {
41         ret = walk_page_buffers(handle, page_buffers(page),
42                 from, to, NULL, do_journal_get_write_access);
43     }
44 
45     if (ret) {
46         unlock_page(page);
47         page_cache_release(page);
48         /*
49          * block_write_begin may have instantiated a few blocks
50          * outside i_size.  Trim these off again. Don't need
51          * i_size_read because we hold i_mutex.
52          *
53          * Add inode to orphan list in case we crash before
54          * truncate finishes
55          */
56         if (pos + len > inode->i_size && ext4_can_truncate(inode))
57             ext4_orphan_add(handle, inode);
58 
59         ext4_journal_stop(handle);
60         if (pos + len > inode->i_size) {
61             ext4_truncate_failed_write(inode);
62             /*
63              * If truncate failed early the inode might
64              * still be on the orphan list; we need to
65              * make sure the inode is removed from the
66              * orphan list in that case.
67              */
68             if (inode->i_nlink)
69                 ext4_orphan_del(NULL, inode);
70         }
71     }
72 
73     if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
74         goto retry; /* restarts from ext4_journal_start() */
75 out:
76     return ret;
77 }

       ext4_write_begin中包含了很多处理功能,包括文件物理块的分配(假设ext4的delayed allocation特性没有开启)、文件块部分写的处理等。下面是ext4_write_begin函数调用过程中比较重要的几个函数。

 1 /*
 2  * block_write_begin takes care of the basic task of block allocation and
 3  * bringing partial write blocks uptodate first.
 4  *
 5  * If *pagep is not NULL, then block_write_begin uses the locked page
 6  * at *pagep rather than allocating its own. In this case, the page will
 7  * not be unlocked or deallocated on failure.
 8  */
 9 int block_write_begin(struct file *file, struct address_space *mapping,
10             loff_t pos, unsigned len, unsigned flags,
11             struct page **pagep, void **fsdata,
12             get_block_t *get_block)
13 {
14     struct inode *inode = mapping->host;
15     int status = 0;
16     struct page *page;
17     pgoff_t index;
18     unsigned start, end;
19     int ownpage = 0; /* set to 1 when the page is grabbed here rather than passed in */
20 
21     index = pos >> PAGE_CACHE_SHIFT; /* index of the page containing pos */
22     start = pos & (PAGE_CACHE_SIZE - 1); /* offset of pos inside that page */
23     end = start + len;
24 
25     page = *pagep;
26     if (page == NULL) {
27         ownpage = 1;
28         page = grab_cache_page_write_begin(mapping, index, flags);
29         if (!page) {
30             status = -ENOMEM;
31             goto out;
32         }
33         *pagep = page;
34     } else
35         BUG_ON(!PageLocked(page)); /* a caller-supplied page must already be locked */
36 
37     status = __block_prepare_write(inode, page, start, end, get_block);
38     if (unlikely(status)) {
39         ClearPageUptodate(page);
40 
41         if (ownpage) { /* cleanup below applies only to a page grabbed in this function */
42             unlock_page(page);
43             page_cache_release(page);
44             *pagep = NULL;
45 
46             /*
47              * prepare_write() may have instantiated a few blocks
48              * outside i_size.  Trim these off again. Don't need
49              * i_size_read because we hold i_mutex.
50              */
51             if (pos + len > inode->i_size)
52                 vmtruncate(inode, inode->i_size);
53         }
54     }
55 
56 out:
57     return status;
58 }

       

 /*
  * __block_prepare_write - prepare the buffers of a locked @page for a
  * write covering the byte range [@from, @to) inside the page.
  *
  * Walks the page's buffer_heads.  A buffer overlapping the range is
  * mapped through @get_block if it is not mapped yet; a newly allocated
  * buffer has its bytes outside the range zeroed; a buffer that is not
  * uptodate and only partly covered by the range is read in.  The
  * function then waits for the reads it issued.
  *
  * Returns 0 on success, the error from @get_block, or -EIO when a read
  * left a buffer not uptodate.  On error page_zero_new_buffers() is called.
  */
 1 static int __block_prepare_write(struct inode *inode, struct page *page,
 2         unsigned from, unsigned to, get_block_t *get_block)
 3 {
 4     unsigned block_start, block_end;
 5     sector_t block;
 6     int err = 0;
 7     unsigned blocksize, bbits;
 8     struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; /* NOTE(review): presumably at most two buffers (first and last) can be partly covered; confirm */
 9 
10     BUG_ON(!PageLocked(page));
11     BUG_ON(from > PAGE_CACHE_SIZE);
12     BUG_ON(to > PAGE_CACHE_SIZE);
13     BUG_ON(from > to);
14 
15     blocksize = 1 << inode->i_blkbits;
16     if (!page_has_buffers(page))
17         create_empty_buffers(page, blocksize, 0);
18     head = page_buffers(page);
19 
20     bbits = inode->i_blkbits;
21     block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); /* number of the first block in this page */
22 
23     for(bh = head, block_start = 0; bh != head || !block_start;
24         block++, block_start=block_end, bh = bh->b_this_page) {
25         block_end = block_start + blocksize;
26         if (block_end <= from || block_start >= to) { /* buffer lies entirely outside the write range */
27             if (PageUptodate(page)) {
28                 if (!buffer_uptodate(bh))
29                     set_buffer_uptodate(bh);
30             }
31             continue;
32         }
33         if (buffer_new(bh))
34             clear_buffer_new(bh);
35         if (!buffer_mapped(bh)) {
36             WARN_ON(bh->b_size != blocksize);
37             err = get_block(inode, block, bh, 1);
38             if (err)
39                 break;
40             if (buffer_new(bh)) {
41                 unmap_underlying_metadata(bh->b_bdev,
42                             bh->b_blocknr);
43                 if (PageUptodate(page)) {
44                     clear_buffer_new(bh);
45                     set_buffer_uptodate(bh);
46                     mark_buffer_dirty(bh);
47                     continue;
48                 }
49                 if (block_end > to || block_start < from) /* new buffer only partly covered: zero the rest */
50                     zero_user_segments(page,
51                         to, block_end,
52                         block_start, from);
53                 continue;
54             }
55         }
56         if (PageUptodate(page)) {
57             if (!buffer_uptodate(bh))
58                 set_buffer_uptodate(bh);
59             continue; 
60         }
61         if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
62             !buffer_unwritten(bh) &&
63              (block_start < from || block_end > to)) { /* partly covered and not uptodate: read it first */
64             ll_rw_block(READ, 1, &bh);
65             *wait_bh++=bh;
66         }
67     }
68     /*
69      * If we issued read requests - let them complete.
70      */
71     while(wait_bh > wait) {
72         wait_on_buffer(*--wait_bh);
73         if (!buffer_uptodate(*wait_bh))
74             err = -EIO;
75     }
76     if (unlikely(err))
77         page_zero_new_buffers(page, from, to);
78     return err;
79 }

     

 1 /**
 2  * ll_rw_block: low-level access to block devices (DEPRECATED)
 3  * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
 4  * @nr: number of &struct buffer_heads in the array
 5  * @bhs: array of pointers to &struct buffer_head
 6  *
 7  * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
 8  * requests an I/O operation on them, either a %READ or a %WRITE.  The third
 9  * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
10  * are sent to disk. The fourth %READA option is described in the documentation
11  * for generic_make_request() which ll_rw_block() calls.
12  *
13  * This function drops any buffer that it cannot get a lock on (with the
14  * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
15  * clean when doing a write request, and any buffer that appears to be
16  * up-to-date when doing read request.  Further it marks as clean buffers that
17  * are processed for writing (the buffer cache won't assume that they are
18  * actually clean until the buffer gets unlocked).
19  *
20  * ll_rw_block sets b_end_io to simple completion handler that marks
21  * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
22  * any waiters. 
23  *
24  * All of the buffers must be for the same device, and must also be a
25  * multiple of the current approved size for the device.
26  */
27 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
28 {
29     int i;
30 
31     for (i = 0; i < nr; i++) {
32         struct buffer_head *bh = bhs[i];
33 
34         if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
35             lock_buffer(bh); /* SWRITE variants take the buffer lock unconditionally */
36         else if (!trylock_buffer(bh))
37             continue; /* other requests skip a buffer whose lock cannot be taken */
38 
39         if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
40             rw == SWRITE_SYNC_PLUG) {
41             if (test_clear_buffer_dirty(bh)) { /* only buffers that were dirty are submitted */
42                 bh->b_end_io = end_buffer_write_sync;
43                 get_bh(bh);
44                 if (rw == SWRITE_SYNC)
45                     submit_bh(WRITE_SYNC, bh);
46                 else
47                     submit_bh(WRITE, bh);
48                 continue;
49             }
50         } else {
51             if (!buffer_uptodate(bh)) { /* only buffers that are not uptodate are read */
52                 bh->b_end_io = end_buffer_read_sync;
53                 get_bh(bh);
54                 submit_bh(rw, bh);
55                 continue;
56             }
57         }
58         unlock_buffer(bh); /* reached only when no I/O was submitted for this buffer */
59     }
60 }

     在ext4中,负责块分配处理的函数实现在fs/ext4/balloc.c和fs/ext4/mballoc.c中。

   4、当page cache中的数据需要刷新到disk上时,处理过程由Block IO层接管。

      在把文件的page cache刷新到disk的过程中,比较重要的数据结构有如下两个:buffer_head和bio。

 /*
  * struct buffer_head - describes one block-sized buffer inside a page:
  * its state bits, the page it belongs to, the block number it maps to
  * and the completion callback used when I/O on it finishes.
  */
 1 struct buffer_head {
 2     unsigned long b_state;        /* buffer state bitmap (see above) */
 3     struct buffer_head *b_this_page;/* circular list of page's buffers */
 4     struct page *b_page;        /* the page this bh is mapped to */
 5 
 6     sector_t b_blocknr;        /* start block number */
 7     size_t b_size;            /* size of mapping */
 8     char *b_data;            /* pointer to data within the page */
 9 
10     struct block_device *b_bdev; /* NOTE(review): presumably the device this buffer maps to; confirm */
11     bh_end_io_t *b_end_io;        /* I/O completion */
12      void *b_private;        /* reserved for b_end_io */
13     struct list_head b_assoc_buffers; /* associated with another mapping */
14     struct address_space *b_assoc_map;    /* mapping this buffer is
15                            associated with */
16     atomic_t b_count;        /* users using this buffer_head */
17 };

   

 1 /*
 2  * main unit of I/O for the block layer and lower layers (ie drivers and
 3  * stacking drivers)
 4  */
 5 struct bio {
 6     sector_t        bi_sector;    /* device address in 512 byte
 7                            sectors */
 8     struct bio        *bi_next;    /* request queue link */
 9     struct block_device    *bi_bdev; /* NOTE(review): presumably the target block device; confirm */
10     unsigned long        bi_flags;    /* status, command, etc */
11     unsigned long        bi_rw;        /* bottom bits READ/WRITE,
12                          * top bits priority
13                          */
14 
15     unsigned short        bi_vcnt;    /* how many bio_vec's */
16     unsigned short        bi_idx;        /* current index into bvl_vec */
17     ...............
18 
19     /*
20      * We can inline a number of vecs at the end of the bio, to avoid
21      * double allocations for a small number of bio_vecs. This member
22      * MUST obviously be kept at the very end of the bio.
23      */
24     struct bio_vec        bi_inline_vecs[0];
25 };

   Block IO层负责基本的IO request合并和调度,这一层由elevator(电梯)管理,具体的调度算法有noop、deadline和anticipatory等多种,现在默认的调度算法是deadline。调度算法是可调的,可以根据系统的实际情况调整为最优的算法。

[1] 基数树(radix tree). http://blog.csdn.net/joker0910/article/details/8250085

[2] Radix Tree. http://en.wikipedia.org/wiki/Radix_tree 

郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。