Linux内核源代码情景分析-文件的写
write对应的系统调用是sys_write,代码如下:
/*
 * write() system call entry point.
 *
 * Looks up the open file for @fd, checks that it was opened for writing and
 * that the target range passes locks_verify_area(), then hands the request to
 * the file's own ->write operation.
 *
 * Returns whatever ->write returns; -EBADF when @fd has no open file or the
 * file is not writable; the locks_verify_area() result when that is non-zero;
 * -EINVAL when the file provides no write operation.
 */
asmlinkage ssize_t sys_write(unsigned int fd, const char * buf, size_t count)
{
	ssize_t (*write_op)(struct file *, const char *, size_t, loff_t *);
	struct inode *inode;
	struct file *filp;
	ssize_t ret = -EBADF;

	filp = fget(fd);
	if (!filp)
		return ret;

	if (!(filp->f_mode & FMODE_WRITE))
		goto release;

	inode = filp->f_dentry->d_inode;
	ret = locks_verify_area(FLOCK_VERIFY_WRITE, inode, filp, filp->f_pos, count);
	if (ret)
		goto release;

	ret = -EINVAL;
	if (filp->f_op && (write_op = filp->f_op->write) != NULL)
		ret = write_op(filp, buf, count, &filp->f_pos);

release:
	/* Only a write that produced a positive result notifies the parent directory. */
	if (ret > 0)
		inode_dir_notify(filp->f_dentry->d_parent->d_inode, DN_MODIFY);
	/* Drop the reference obtained from fget(). */
	fput(filp);
	return ret;
}
fd假设就是Linux内核源代码情景分析-文件的打开,一文中刚刚打开文件/usr/local/hello.c的文件号。fget(fd),根据打开文件号fd找到该已打开文件的file结构。代码如下:
/*
 * Return the file structure the current task has open as @fd, or NULL if
 * fcheck() finds none.  When a file is found, get_file() is called on it
 * before it is returned.  The lookup and get_file() both run with the
 * task's files->file_lock held for reading.
 */
struct file * fget(unsigned int fd)
{
	struct files_struct *open_files = current->files;
	struct file *filp;

	read_lock(&open_files->file_lock);
	filp = fcheck(fd);
	if (filp != NULL)
		get_file(filp);
	read_unlock(&open_files->file_lock);

	return filp;
}
/*
 * Look up @fd in the current task's descriptor table.
 *
 * Returns the table entry files->fd[fd] when @fd is below files->max_fds,
 * otherwise NULL.  No locking is done here and nothing else about the
 * returned file is modified.
 */
static inline struct file * fcheck(unsigned int fd)
{
	struct files_struct *open_files = current->files;

	/* Descriptors at or beyond max_fds have no slot to read. */
	if (fd >= open_files->max_fds)
		return NULL;

	return open_files->fd[fd];
}
ssize_t generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)//file是要写入文件节点的file结构,buf为要写入内容的指针,count是数量,ppos是要写入文件的位置 { struct inode *inode = file->f_dentry->d_inode; struct address_space *mapping = inode->i_mapping; unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur; loff_t pos; struct page *page, *cached_page; unsigned long written; long status; int err; cached_page = NULL; down(&inode->i_sem); pos = *ppos; err = -EINVAL; if (pos < 0) goto out; err = file->f_error; if (err) { file->f_error = 0; goto out; } written = 0; if (file->f_flags & O_APPEND) pos = inode->i_size; /* * Check whether we‘ve reached the file size limit. */ err = -EFBIG; if (limit != RLIM_INFINITY) { if (pos >= limit) { send_sig(SIGXFSZ, current, 0); goto out; } if (count > limit - pos) { send_sig(SIGXFSZ, current, 0); count = limit - pos; } } status = 0; if (count) { remove_suid(inode); inode->i_ctime = inode->i_mtime = CURRENT_TIME; mark_inode_dirty_sync(inode); } while (count) { unsigned long bytes, index, offset; char *kaddr; int deactivate = 1; /* * Try to find the page in the cache. If it isn‘t there, * allocate a free page. */ offset = (pos & (PAGE_CACHE_SIZE -1)); //根据当前位置pos计算出本次循环中要写多的缓冲页面index、在该页面中的起点offset以及写入长度bytes index = pos >> PAGE_CACHE_SHIFT; bytes = PAGE_CACHE_SIZE - offset; if (bytes > count) { bytes = count; deactivate = 0; } /* * Bring in the user page that we will copy from _first_. * Otherwise there‘s a nasty deadlock on copying from the * same page as we‘re writing to, without it being marked * up-to-date. */ { volatile unsigned char dummy; __get_user(dummy, buf); __get_user(dummy, buf+bytes-1); } status = -ENOMEM; /* we‘ll assign it later anyway */ page = __grab_cache_page(mapping, index, &cached_page);//在page_hash_table中找到该缓冲页面,如找不到,就分配、建立一个缓冲页面 if (!page) break; /* We have exclusive IO access to the page.. 
*/ if (!PageLocked(page)) { PAGE_BUG(page); } status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);//预写先读,先把文件在设备上对应位置的数据读到page中 if (status) goto unlock; kaddr = page_address(page); status = copy_from_user(kaddr+offset, buf, bytes);//把数据从用户空间拷贝到page指向的页面中,已经放入了缓冲区 flush_dcache_page(page); if (status) goto fail_write; status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);//真正的写,把缓冲区的数据写到设备上 if (!status) status = bytes; if (status >= 0) { written += status; count -= status; pos += status; buf += status; } unlock: /* Mark it unlocked again and drop the page.. */ UnlockPage(page); if (deactivate) deactivate_page(page); page_cache_release(page); if (status < 0) break; } *ppos = pos; if (cached_page) page_cache_free(cached_page); /* For now, when the user asks for O_SYNC, we‘ll actually * provide O_DSYNC. */ if ((status >= 0) && (file->f_flags & O_SYNC)) status = generic_osync_inode(inode, 1); /* 1 means datasync */ err = written ? written : status; out: up(&inode->i_sem); return err; fail_write: status = -EFAULT; ClearPageUptodate(page); kunmap(page); goto unlock; }inode结构中有个指针i_mapping,指向一个address_space数据结构,其定义如下:
/*
 * Per-object page cache bookkeeping, reached from an inode through
 * inode->i_mapping.  It keeps the cached pages on three lists by state,
 * points at the operations table used to read and write those pages, and
 * links back to the owning object.
 */
struct address_space {
	struct list_head	clean_pages;	/* list of clean pages */
	struct list_head	dirty_pages;	/* list of dirty pages */
	struct list_head	locked_pages;	/* list of locked pages */
	unsigned long		nrpages;	/* number of total pages */
	struct address_space_operations *a_ops;	/* methods */
	struct inode		*host;		/* owner: inode, block_device */
	struct vm_area_struct	*i_mmap;	/* list of private mappings */
	struct vm_area_struct	*i_mmap_shared; /* list of shared mappings */
	spinlock_t		i_shared_lock;  /* and spinlock protecting it */
};
其中a_ops,它指向一个address_space_operations数据结构,就ext2文件系统来说,这个数据结构为ext2_aops,代码如下:
/*
 * Table of function pointers through which the page cache code calls into
 * the owner of an address_space (reached as mapping->a_ops).
 */
struct address_space_operations {
	int (*writepage)(struct page *);
	int (*readpage)(struct file *, struct page *);
	int (*sync_page)(struct page *);
	/*
	 * generic_file_write() calls prepare_write before copying user data
	 * into the page and commit_write after it; both are passed the byte
	 * range inside the page as (from, to).
	 */
	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
	/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
	int (*bmap)(struct address_space *, long);
};
/*
 * Return the cache page for (@mapping, @index) as found by
 * __find_lock_page(), creating and inserting one when it is not cached yet.
 *
 * @cached_page: in/out spare page.  When a new page is needed and
 *               *cached_page is NULL, one is allocated and stored there.
 *               Once that page has been inserted into the cache,
 *               *cached_page is reset to NULL; otherwise the spare is left
 *               for the caller to reuse or free.
 *
 * Returns NULL only when a page had to be allocated and the allocation
 * failed.
 */
static inline struct page * __grab_cache_page(struct address_space *mapping,
				unsigned long index, struct page **cached_page)
{
	struct page *page, **hash = page_hash(mapping, index);
repeat:
	/* Look for the cache page in page_hash_table. */
	page = __find_lock_page(mapping, index, hash);
	if (!page) {
		/* Not cached: make sure there is a spare page to insert. */
		if (!*cached_page) {
			*cached_page = page_cache_alloc();
			if (!*cached_page)
				return NULL;
		}
		page = *cached_page;
		/*
		 * Insert into page_hash_table.  A non-zero result means a
		 * page for this index was already present, so look it up again.
		 */
		if (add_to_page_cache_unique(page, mapping, index, hash))
			goto repeat;
		/* The spare is now owned by the cache. */
		*cached_page = NULL;
	}
	return page;
}
#define page_hash(mapping,index) (page_hash_table+_page_hashfn(mapping,index))add_to_page_cache_unique,加入到page_hash_table中,代码如下:
/*
 * Insert @page into the page cache for (@mapping, @offset) unless a page for
 * that position is already on the hash chain *@hash.
 *
 * The lookup and the insertion are done together under pagecache_lock.
 *
 * Returns 0 when @page was inserted, 1 when an existing page was found and
 * nothing was changed.
 */
static int add_to_page_cache_unique(struct page * page, struct address_space *mapping, unsigned long offset, struct page **hash)
{
	int already_cached = 1;

	spin_lock(&pagecache_lock);
	if (__find_page_nolock(mapping, offset, *hash) == NULL) {
		__add_to_page_cache(page, mapping, offset, hash);
		already_cached = 0;
	}
	spin_unlock(&pagecache_lock);

	return already_cached;
}
static inline void __add_to_page_cache(struct page * page, struct address_space *mapping, unsigned long offset, struct page **hash) { unsigned long flags; if (PageLocked(page)) BUG(); flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced) | (1 << PG_arch_1)); page->flags = flags | (1 << PG_locked); page_cache_get(page); page->index = offset;//就是最初传递进来的页面缓存index add_page_to_inode_queue(mapping, page); add_page_to_hash_queue(page, hash);//加入到page_hash_table表中 lru_cache_add(page); }
回到generic_file_write,接下来执行mapping->a_ops->prepare_write;对ext2文件系统来说,这个函数指针指向ext2_prepare_write,代码如下:
/*
 * ext2 prepare_write operation: delegates to the generic
 * block_prepare_write(), supplying ext2_get_block as the block-mapping
 * callback.  @file is not used.
 */
static int ext2_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to)
{
	int status;

	status = block_prepare_write(page, from, to, ext2_get_block);
	return status;
}
/*
 * Prepare the byte range [@from, @to) of @page for writing by running
 * __block_prepare_write() on behalf of the inode that owns the page's
 * mapping.
 *
 * Returns 0 on success.  On failure the page's uptodate flag is cleared,
 * the page is kunmap()ed, and the error from __block_prepare_write() is
 * returned.
 */
int block_prepare_write(struct page *page, unsigned from, unsigned to, get_block_t *get_block)
{
	int status;

	status = __block_prepare_write(page->mapping->host, page, from, to, get_block);
	if (!status)
		return 0;

	ClearPageUptodate(page);
	kunmap(page);
	return status;
}
static int __block_prepare_write(struct inode *inode, struct page *page, unsigned from, unsigned to, get_block_t *get_block) { unsigned block_start, block_end; unsigned long block; int err = 0; unsigned blocksize, bbits; struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; char *kaddr = kmap(page); blocksize = inode->i_sb->s_blocksize; if (!page->buffers)//说明是新分配的页面,没有buffer_head结构 create_empty_buffers(page, inode->i_dev, blocksize);//为该页面配备好相应的buffer_head结构,并建立起这个队列 head = page->buffers; bbits = inode->i_sb->s_blocksize_bits; block = page->index << (PAGE_CACHE_SHIFT - bbits);//这里用到了page->index for(bh = head, block_start = 0; bh != head || !block_start; block++, block_start=block_end, bh = bh->b_this_page) { if (!bh) BUG(); block_end = block_start+blocksize; if (block_end <= from) continue; if (block_start >= to) break; if (!buffer_mapped(bh)) { err = get_block(inode, block, bh, 1);//bh中存相关的信息,为ll_rw_block准备 if (err) goto out; if (buffer_new(bh)) { unmap_underlying_metadata(bh); if (Page_Uptodate(page)) { set_bit(BH_Uptodate, &bh->b_state); continue; } if (block_end > to) memset(kaddr+to, 0, block_end-to); if (block_start < from) memset(kaddr+block_start, 0, from-block_start); if (block_end > to || block_start < from) flush_dcache_page(page); continue; } } if (Page_Uptodate(page)) { set_bit(BH_Uptodate, &bh->b_state); continue; } if (!buffer_uptodate(bh) && (block_start < from || block_end > to)) {//如果是新分配的页面,一定不一致,如果原有的页面,有可能不一致 ll_rw_block(READ, 1, &bh);//如果不一致,就从设备上读入数据到page中,bh中已经存好了用于从设备中读入数据的相关信息 *wait_bh++=bh; } } /* * If we issued read requests - let them complete. */ while(wait_bh > wait) { wait_on_buffer(*--wait_bh); err = -EIO; if (!buffer_uptodate(*wait_bh)) goto out; } return 0; out: return err; }create_empty_buffers,为该页面配备好相应的buffer_head结构,并建立起这个队列
/*
 * Attach a fresh set of buffer_heads of size @blocksize to @page.
 *
 * The page must not have buffers yet (BUG() otherwise).  The list returned
 * by create_buffers() is given device @dev, block number 0 and no
 * completion handler, closed into a ring through b_this_page, and hung off
 * page->buffers.  page_cache_get() is called on the page afterwards.
 */
static void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
{
	struct buffer_head *bh, *head, *tail;

	head = create_buffers(page, blocksize, 1);
	if (page->buffers)
		BUG();

	bh = head;
	do {
		bh->b_dev = dev;	/* key point: bind the buffer to the device */
		bh->b_blocknr = 0;
		bh->b_end_io = NULL;
		tail = bh;
		bh = bh->b_this_page;
	} while (bh);
	/* Close the NULL-terminated list into a ring. */
	tail->b_this_page = head;
	page->buffers = head;	/* key point: the page now points at its buffers */
	page_cache_get(page);
}
static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async) { struct buffer_head *bh, *head; long offset; try_again: head = NULL; offset = PAGE_SIZE; while ((offset -= size) >= 0) { bh = get_unused_buffer_head(async); if (!bh) goto no_grow; bh->b_dev = B_FREE; /* Flag as unused */ bh->b_this_page = head; head = bh; bh->b_state = 0; bh->b_next_free = NULL; bh->b_pprev = NULL; atomic_set(&bh->b_count, 0); bh->b_size = size;//重要点,block_size set_bh_page(bh, page, offset); bh->b_list = BUF_CLEAN; bh->b_end_io = NULL; } return head; ...... }
void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset) { bh->b_page = page;//重要点 if (offset >= PAGE_SIZE) BUG(); if (PageHighMem(page)) /* * This catches illegal uses and preserves the offset: */ bh->b_data = (char *)(0 + offset); else bh->b_data = page_address(page) + offset;//重要点,页面的实际位置 }返回到generic_file_write,继续执行mapping->a_ops->commit_write,真正的写,把缓冲区的数据写到设备上,对应的指针是generic_commit_write,代码如下:
/*
 * Generic commit_write operation: finish a write of the byte range
 * [@from, @to) of @page.
 *
 * Runs __block_commit_write() on the page's buffers, kunmap()s the page,
 * and, when the write ends beyond the current inode->i_size, grows i_size
 * to the end of the write and marks the inode dirty.  @file is not used.
 *
 * Always returns 0.
 */
int generic_commit_write(struct file *file, struct page *page,
		unsigned from, unsigned to)
{
	struct inode *owner = page->mapping->host;
	/* File offset of the first byte after the written range. */
	loff_t end_pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;

	__block_commit_write(owner, page, from, to);
	kunmap(page);

	if (end_pos <= owner->i_size)
		return 0;

	owner->i_size = end_pos;
	mark_inode_dirty(owner);
	return 0;
}
static int __block_commit_write(struct inode *inode, struct page *page, unsigned from, unsigned to) { unsigned block_start, block_end; int partial = 0, need_balance_dirty = 0; unsigned blocksize; struct buffer_head *bh, *head; blocksize = inode->i_sb->s_blocksize; for(bh = head = page->buffers, block_start = 0;//page->buffers得到buffer_head结构 bh != head || !block_start; block_start=block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (!buffer_uptodate(bh)) partial = 1; } else { set_bit(BH_Uptodate, &bh->b_state); if (!atomic_set_buffer_dirty(bh)) { __mark_dirty(bh); buffer_insert_inode_queue(bh, inode); need_balance_dirty = 1;//只要有记录块缓冲区从"干净"状态变成"脏"状态,need_balance_dirty就置1 } } } if (need_balance_dirty) balance_dirty(bh->b_dev);//如果置1,这个函数看看这样的记录块是否已经积累到一定的数量,如果是,就唤醒bdflushin进行一次"冲刷" /* * is this a partial write that happened to make all buffers * uptodate then we can optimize away a bogus readpage() for * the next read(). Here we ‘discover‘ wether the page went * uptodate as a result of this (potentially partial) write. */ if (!partial) SetPageUptodate(page); return 0; }至此,文件写就分析完了,page和buffer_head同时管理页面,page->buffers指向了buffer_head,bh->b_page指向了page。
郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。