Linux内核源代码情景分析-文件的写

浏览数：26 / 时间：2015年06月20日

write对应的系统调用是sys_write，代码如下：

/*
 * sys_write - entry point of the write() system call.
 * @fd:    descriptor number of the open file to write to
 * @buf:   source buffer, handed unchanged to the file's write method
 * @count: number of bytes requested
 *
 * Returns the result of the file's write method.  Fails with -EBADF when
 * fd does not resolve to a file opened for writing, with -EINVAL when the
 * file provides no write method, or with the non-zero result of
 * locks_verify_area().
 */
asmlinkage ssize_t sys_write(unsigned int fd, const char * buf, size_t count)
{
	ssize_t (*do_write)(struct file *, const char *, size_t, loff_t *);
	struct file *filp;
	ssize_t result = -EBADF;

	filp = fget(fd);
	if (!filp)
		return result;

	/* The descriptor must have been opened for writing. */
	if (!(filp->f_mode & FMODE_WRITE))
		goto done;

	result = locks_verify_area(FLOCK_VERIFY_WRITE, filp->f_dentry->d_inode,
				   filp, filp->f_pos, count);
	if (result)
		goto done;

	/* Dispatch to the file-specific write method, if there is one. */
	result = -EINVAL;
	if (filp->f_op) {
		do_write = filp->f_op->write;
		if (do_write != NULL)
			result = do_write(filp, buf, count, &filp->f_pos);
	}
done:
	/* A positive result is reported on the parent directory's inode. */
	if (result > 0)
		inode_dir_notify(filp->f_dentry->d_parent->d_inode, DN_MODIFY);
	fput(filp);
	return result;
}

假设fd就是《Linux内核源代码情景分析-文件的打开》一文中刚刚打开的文件/usr/local/hello.c的文件号。fget(fd)根据打开文件号fd找到该已打开文件的file结构，代码如下：

struct file * fget(unsigned int fd)
{
	struct file * file;
	struct files_struct *files = current->files;

	read_lock(&files->file_lock);
	file = fcheck(fd);
	if (file)
		get_file(file);
	read_unlock(&files->file_lock);
	return file;
}

/*
 * fcheck - bounds-checked lookup in the current process's descriptor table.
 * @fd: descriptor number to look up
 *
 * Returns files->fd[fd] when fd is below files->max_fds (the slot itself
 * may hold NULL), and NULL otherwise.  No locking is done here.
 */
static inline struct file * fcheck(unsigned int fd)
{
	struct files_struct *table = current->files;

	if (fd >= table->max_fds)
		return NULL;
	return table->fd[fd];
}

还记得在打开文件的时候，file->f_op被设置为f->f_op = fops_get(inode->i_fop)；对于ext2文件系统，f->f_op要么是ext2_file_operations，要么是ext2_dir_operations，视操作的目标为文件或目录而选择其一。对于文件来说，file->f_op->write为generic_file_write。

/*
 * generic_file_write - write user data into a file through its cached pages.
 * @file:  file structure of the file being written
 * @buf:   user-space source buffer (read with __get_user/copy_from_user)
 * @count: number of bytes requested
 * @ppos:  file position to write at; updated after the copy loop
 *
 * Runs with inode->i_sem held from entry to exit.  Returns the number of
 * bytes written when any were written, otherwise the last status or error
 * code (-EINVAL for a negative position, a pending file->f_error, -EFBIG
 * when the position is at or beyond the RLIMIT_FSIZE limit, ...).
 */
ssize_t
generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
{
	struct inode	*inode = file->f_dentry->d_inode; 
	struct address_space *mapping = inode->i_mapping;
	unsigned long	limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
	loff_t		pos;
	struct page	*page, *cached_page;
	unsigned long	written;
	long		status;
	int		err;

	/* Spare page handed to __grab_cache_page(); freed below if unused. */
	cached_page = NULL;

	down(&inode->i_sem);

	pos = *ppos;
	err = -EINVAL;
	if (pos < 0)
		goto out;

	/* Report (and clear) an error recorded earlier on this file. */
	err = file->f_error;
	if (err) {
		file->f_error = 0;
		goto out;
	}

	written = 0;

	/* In append mode the write always starts at the current end of file. */
	if (file->f_flags & O_APPEND)
		pos = inode->i_size;

	/*
	 * Check whether we've reached the file size limit.
	 * At or past the limit: signal SIGXFSZ and fail with -EFBIG.
	 * Crossing the limit: signal SIGXFSZ and clip count to what fits.
	 */
	err = -EFBIG;
	if (limit != RLIM_INFINITY) {
		if (pos >= limit) {
			send_sig(SIGXFSZ, current, 0);
			goto out;
		}
		if (count > limit - pos) {
			send_sig(SIGXFSZ, current, 0);
			count = limit - pos;
		}
	}

	status  = 0;
	/* Only a non-empty write touches the inode's times and suid state. */
	if (count) {
		remove_suid(inode);
		inode->i_ctime = inode->i_mtime = CURRENT_TIME;
		mark_inode_dirty_sync(inode);
	}

	/* One iteration per cache page touched by the write. */
	while (count) {
		unsigned long bytes, index, offset;
		char *kaddr;
		/* Stays 1 unless count is smaller than the room left in the page. */
		int deactivate = 1;

		/*
		 * Try to find the page in the cache. If it isn't there,
		 * allocate a free page.
		 *
		 * From pos derive the cache page to write (index), the start
		 * offset inside that page (offset) and the length to write in
		 * this iteration (bytes).
		 */
		offset = (pos & (PAGE_CACHE_SIZE -1));
		index = pos >> PAGE_CACHE_SHIFT;
		bytes = PAGE_CACHE_SIZE - offset;
		if (bytes > count) {
			bytes = count;
			deactivate = 0;
		}

		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 */
		{ volatile unsigned char dummy;
			__get_user(dummy, buf);
			__get_user(dummy, buf+bytes-1);
		}

		status = -ENOMEM;	/* we'll assign it later anyway */
		/* Look the page up in the page cache, creating it when missing. */
		page = __grab_cache_page(mapping, index, &cached_page);
		if (!page)
			break;

		/* We have exclusive IO access to the page.. */
		if (!PageLocked(page)) {
			PAGE_BUG(page);
		}

		/* Let the mapping get [offset, offset+bytes) of the page ready. */
		status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
		if (status)
			goto unlock;
		kaddr = page_address(page);
		/* Copy the user data into the cache page; non-zero means a fault. */
		status = copy_from_user(kaddr+offset, buf, bytes);
		flush_dcache_page(page);
		if (status)
			goto fail_write;
		/* Commit the copied range; a zero return counts as "all bytes". */
		status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
		if (!status)
			status = bytes;

		if (status >= 0) {
			written += status;
			count -= status;
			pos += status;
			buf += status;
		}
unlock:
		/* Mark it unlocked again and drop the page.. */
		UnlockPage(page);
		if (deactivate)
			deactivate_page(page);
		page_cache_release(page);

		if (status < 0)
			break;
	}
	*ppos = pos;

	if (cached_page)
		page_cache_free(cached_page);

	/* For now, when the user asks for O_SYNC, we'll actually
	 * provide O_DSYNC. */
	if ((status >= 0) && (file->f_flags & O_SYNC))
		status = generic_osync_inode(inode, 1); /* 1 means datasync */
	
	/* Bytes written take precedence over the final status code. */
	err = written ? written : status;
out:

	up(&inode->i_sem);
	return err;
fail_write:
	/*
	 * copy_from_user() left bytes uncopied: report -EFAULT and drop the
	 * page's up-to-date flag, then rejoin the common unlock path.
	 * NOTE(review): the kunmap() presumably balances a kmap() taken in
	 * prepare_write, since commit_write is skipped here - confirm for the
	 * a_ops in use.
	 */
	status = -EFAULT;
	ClearPageUptodate(page);
	kunmap(page);
	goto unlock;
}

inode结构中有个指针i_mapping，指向一个address_space数据结构，其定义如下：

/*
 * Per-object page cache state; an inode points at one through i_mapping.
 * Tracks the cached pages in three lists and carries the method table
 * (a_ops) used to read and write them.
 */
struct address_space {
	struct list_head	clean_pages;	/* list of clean pages */
	struct list_head	dirty_pages;	/* list of dirty pages */
	struct list_head	locked_pages;	/* list of locked pages */
	unsigned long		nrpages;	/* number of total pages */
	struct address_space_operations *a_ops;	/* methods */
	struct inode		*host;		/* owner: inode, block_device */
	struct vm_area_struct	*i_mmap;	/* list of private mappings */
	struct vm_area_struct	*i_mmap_shared; /* list of shared mappings */
	spinlock_t		i_shared_lock;  /* and spinlock protecting it */
};

其中a_ops指向一个address_space_operations数据结构，就ext2文件系统来说，这个数据结构为ext2_aops。address_space_operations的定义如下：

/*
 * Method table reached through address_space->a_ops.
 * generic_file_write() calls prepare_write and then commit_write with
 * (file, page, start offset in page, end offset in page) around the copy
 * of user data into the page.
 */
struct address_space_operations {
	int (*writepage)(struct page *);
	int (*readpage)(struct file *, struct page *);
	int (*sync_page)(struct page *);
	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
	/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
	int (*bmap)(struct address_space *, long);
};

__grab_cache_page，在page_hash_table中找到该缓冲页面，如找不到，就分配、建立一个缓冲页面，代码如下：

/*
 * __grab_cache_page - find the cache page for (mapping, index) or create it.
 * @mapping:     address space the page belongs to
 * @index:       page index within the mapping
 * @cached_page: in/out spare page; allocated here on demand and set back
 *               to NULL once it has been inserted into the cache
 *
 * Returns the page obtained from __find_lock_page() or the newly inserted
 * page, or NULL when a spare page cannot be allocated.  If the insertion
 * finds that the slot is already taken, the lookup is retried and the
 * spare page stays in *cached_page.
 */
static inline struct page * __grab_cache_page(struct address_space *mapping,
				unsigned long index, struct page **cached_page)
{
	struct page **bucket = page_hash(mapping, index);
	struct page *found;

	for (;;) {
		/* Fast path: the page is already in the hash table. */
		found = __find_lock_page(mapping, index, bucket);
		if (found)
			return found;

		/* Make sure there is a spare page to insert. */
		if (!*cached_page) {
			*cached_page = page_cache_alloc();
			if (!*cached_page)
				return NULL;
		}
		found = *cached_page;

		/* Zero means the spare page now lives in the cache. */
		if (!add_to_page_cache_unique(found, mapping, index, bucket)) {
			*cached_page = NULL;
			return found;
		}
		/* The slot was already occupied: look it up again. */
	}
}

/* Address of the page_hash_table slot selected by _page_hashfn(mapping, index). */
#define page_hash(mapping,index) (&page_hash_table[_page_hashfn(mapping,index)])

add_to_page_cache_unique，加入到page_hash_table中，代码如下：

/*
 * add_to_page_cache_unique - insert a page unless the slot is already taken.
 * @page:    page to insert
 * @mapping: address space the page is added to
 * @offset:  page index within the mapping
 * @hash:    hash bucket for (mapping, offset)
 *
 * Lookup and insertion both happen under pagecache_lock.
 * Returns 0 when the page was inserted, 1 when __find_page_nolock()
 * already found a page for (mapping, offset) and nothing was changed.
 */
static int add_to_page_cache_unique(struct page * page,
	struct address_space *mapping, unsigned long offset,
	struct page **hash)
{
	int already_present = 1;

	spin_lock(&pagecache_lock);
	if (!__find_page_nolock(mapping, offset, *hash)) {
		__add_to_page_cache(page, mapping, offset, hash);
		already_present = 0;
	}
	spin_unlock(&pagecache_lock);

	return already_present;
}

/*
 * __add_to_page_cache - attach a page to a mapping at the given index.
 * @page:    page to attach; must not be locked on entry (BUG otherwise)
 * @mapping: address space that will own the page
 * @offset:  page index within the mapping, stored in page->index
 * @hash:    hash bucket the page is queued on
 *
 * Clears the uptodate/error/dirty/referenced/arch_1 flag bits, sets the
 * locked bit, calls page_cache_get() and queues the page on the mapping,
 * the hash bucket and the LRU.
 */
static inline void __add_to_page_cache(struct page * page,
	struct address_space *mapping, unsigned long offset,
	struct page **hash)
{
	if (PageLocked(page))
		BUG();

	/* Drop the stale state bits and set PG_locked in one update. */
	page->flags = (page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced) | (1 << PG_arch_1)))
			| (1 << PG_locked);

	page_cache_get(page);

	/* This is the page cache index originally passed down by the caller. */
	page->index = offset;

	add_page_to_inode_queue(mapping, page);
	add_page_to_hash_queue(page, hash);
	lru_cache_add(page);
}

获取了缓冲页面后，这个页面有可能是个新分配的空白页面。新分配的空白页面与业已存在的缓冲页面除了在内容上有根本性的区别外，在结构上也有重要的区别。那就是前面所讲的，缓冲页面一方面与一个page结构相联系，另一方面又与若干buffer_head结构相联系，而新分配的页面尚无buffer_head结构与之挂钩。所以，对于新分配的空白页面一来要为其配备相应的buffer_head数据结构，二来要将目标页面的内容先从设备中读入(因为写操作未必是整个页面的写入)。不仅如此，就是业已存在的老页面也有个缓冲页面中的内容是否"up to date"，即是否一致的问题。这里所谓"一致"，是指缓冲页面或者缓冲区的内容与设备上的逻辑内容一致。

mapping->a_ops->prepare_write开始执行，指向了ext2_prepare_write，代码如下：

/*
 * ext2_prepare_write - ext2's prepare_write method.
 * @file: unused here
 * @page: cache page about to be written
 * @from: start offset of the write inside the page
 * @to:   end offset of the write inside the page
 *
 * Delegates to block_prepare_write() with ext2_get_block as the block
 * mapping callback and returns its result.
 */
static int ext2_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to)
{
	int rc;

	rc = block_prepare_write(page, from, to, ext2_get_block);
	return rc;
}

/*
 * block_prepare_write - prepare a range of a page for writing.
 * @page:      cache page about to be written
 * @from:      start offset of the write inside the page
 * @to:        end offset of the write inside the page
 * @get_block: callback passed through to __block_prepare_write()
 *
 * Returns 0 on success.  On failure the page's up-to-date flag is cleared,
 * the page is kunmap()ed and the error from __block_prepare_write() is
 * returned.
 */
int block_prepare_write(struct page *page, unsigned from, unsigned to,
			get_block_t *get_block)
{
	int rc;

	rc = __block_prepare_write(page->mapping->host, page, from, to, get_block);
	if (!rc)
		return 0;

	/* Failure: undo the mapping and invalidate the page contents. */
	ClearPageUptodate(page);
	kunmap(page);
	return rc;
}

/*
 * __block_prepare_write - get the buffers under [from, to) ready for a write.
 * @inode:     inode owning the page
 * @page:      cache page about to be written; kmap()ed here
 * @from:      start offset of the write inside the page
 * @to:        end offset of the write inside the page
 * @get_block: callback that fills in a buffer_head for a file block
 *
 * Ensures the page has buffer_heads, maps every block overlapping the
 * range, zeroes the parts of newly allocated blocks that lie outside the
 * range, and reads in blocks that are only partly overwritten and not up
 * to date.  Returns 0 on success, the get_block() error, or -EIO when a
 * read did not leave its buffer up to date.  The page is left kmap()ed on
 * every path.
 */
static int __block_prepare_write(struct inode *inode, struct page *page,
		unsigned from, unsigned to, get_block_t *get_block)
{
	unsigned block_start, block_end;
	unsigned long block;
	int err = 0;
	unsigned blocksize, bbits;
	/* At most two blocks (first and last in range) can be partly covered. */
	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
	char *kaddr = kmap(page);

	blocksize = inode->i_sb->s_blocksize;
	/* No buffer_heads yet: attach a ring of empty ones to the page. */
	if (!page->buffers)
		create_empty_buffers(page, inode->i_dev, blocksize);
	head = page->buffers;

	bbits = inode->i_sb->s_blocksize_bits;
	/* File block number of the page's first block, derived from page->index. */
	block = page->index << (PAGE_CACHE_SHIFT - bbits);

	/*
	 * Walk the circular b_this_page list once; "!block_start" lets the
	 * first iteration through even though bh == head at that point.
	 */
	for(bh = head, block_start = 0; bh != head || !block_start;
	    block++, block_start=block_end, bh = bh->b_this_page) {
		if (!bh)
			BUG();
		block_end = block_start+blocksize;
		/* Block lies entirely before the range: skip it. */
		if (block_end <= from)
			continue;
		/* Block starts at or after the end of the range: done. */
		if (block_start >= to)
			break;
		if (!buffer_mapped(bh)) {
			/*
			 * Fill bh with the mapping for this block, which
			 * ll_rw_block() needs later.  NOTE(review): the final
			 * argument 1 is presumably "create" - confirm against
			 * the get_block_t contract.
			 */
			err = get_block(inode, block, bh, 1);
			if (err)
				goto out;
			if (buffer_new(bh)) {
				unmap_underlying_metadata(bh);
				if (Page_Uptodate(page)) {
					set_bit(BH_Uptodate, &bh->b_state);
					continue;
				}
				/* Zero the parts of the new block outside [from, to). */
				if (block_end > to)
					memset(kaddr+to, 0, block_end-to);
				if (block_start < from)
					memset(kaddr+block_start, 0, from-block_start);
				if (block_end > to || block_start < from)
					flush_dcache_page(page);
				continue;
			}
		}
		if (Page_Uptodate(page)) {
			set_bit(BH_Uptodate, &bh->b_state);
			continue; 
		}
		/*
		 * The block is only partly overwritten and its buffer is not
		 * up to date: start a read and remember the buffer so that it
		 * can be waited for below.
		 */
		if (!buffer_uptodate(bh) &&
		     (block_start < from || block_end > to)) {
			ll_rw_block(READ, 1, &bh);
			*wait_bh++=bh;
		}
	}
	/*
	 * If we issued read requests - let them complete.
	 */
	while(wait_bh > wait) {
		wait_on_buffer(*--wait_bh);
		err = -EIO;
		if (!buffer_uptodate(*wait_bh))
			goto out;
	}
	return 0;
out:
	return err;
}

create_empty_buffers，为该页面配备好相应的buffer_head结构，并建立起这个队列

/*
 * create_empty_buffers - attach a ring of fresh buffer_heads to a page.
 * @page:      page to equip; must not have buffers yet (BUG otherwise)
 * @dev:       device number stored in every buffer_head
 * @blocksize: size of each buffer, passed to create_buffers()
 *
 * The NULL-terminated list returned by create_buffers() is initialised,
 * closed into a circular list through b_this_page, and hooked up as
 * page->buffers.  page_cache_get() is then called on the page.
 */
static void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
{
	struct buffer_head *first, *cur;

	first = create_buffers(page, blocksize, 1);
	if (page->buffers)
		BUG();

	/* Initialise every buffer_head; stop on the last one of the list. */
	for (cur = first; ; cur = cur->b_this_page) {
		cur->b_dev = dev;	/* key point: bind the buffer to the device */
		cur->b_blocknr = 0;
		cur->b_end_io = NULL;
		if (!cur->b_this_page)
			break;
	}

	/* Close the ring and hand it to the page. */
	cur->b_this_page = first;
	page->buffers = first;	/* key point: page now refers to its buffers */
	page_cache_get(page);
}

/*
 * create_buffers - build a list of buffer_heads covering one page.
 * @page:  page the buffers describe
 * @size:  size of each buffer in bytes
 * @async: passed through to get_unused_buffer_head()
 *
 * Returns the head of a NULL-terminated list linked through b_this_page.
 * Offsets are handed out from the end of the page downwards and every new
 * buffer_head is pushed on the front, so the list ends up in ascending
 * offset order.
 */
static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
{
	struct buffer_head *bh, *head;
	long offset;

try_again:
	head = NULL;
	offset = PAGE_SIZE;
	while ((offset -= size) >= 0) {
		bh = get_unused_buffer_head(async);
		if (!bh)
			goto no_grow;

		bh->b_dev = B_FREE;  /* Flag as unused */
		bh->b_this_page = head;
		head = bh;

		bh->b_state = 0;
		bh->b_next_free = NULL;
		bh->b_pprev = NULL;
		atomic_set(&bh->b_count, 0);
		bh->b_size = size;	/* key point: the block size */

		/* Point the buffer at its slice of the page. */
		set_bh_page(bh, page, offset);

		bh->b_list = BUF_CLEAN;
		bh->b_end_io = NULL;
	}
	return head;
	/*
	 * The excerpt elides the rest of the function here, including the
	 * no_grow label targeted above.
	 */
        ......
}

/*
 * set_bh_page - bind a buffer_head to a location inside a page.
 * @bh:     buffer_head to set up
 * @page:   page holding the data
 * @offset: byte offset inside the page; must be below PAGE_SIZE (BUG otherwise)
 *
 * Sets bh->b_page, and sets bh->b_data to the page's address plus offset.
 * For a highmem page b_data holds the bare offset instead.
 */
void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
{
	bh->b_page = page;	/* key point: back-pointer to the page */
	if (offset >= PAGE_SIZE)
		BUG();

	if (!PageHighMem(page)) {
		/* Key point: the actual address of the data inside the page. */
		bh->b_data = page_address(page) + offset;
		return;
	}

	/*
	 * This catches illegal uses and preserves the offset:
	 */
	bh->b_data = (char *)(0 + offset);
}

返回到generic_file_write，继续执行mapping->a_ops->commit_write，即"提交"本次写入：从下面的代码可以看到，它只是把刚写入的缓冲区标记为"up to date"和"脏"，并在需要时通过balance_dirty触发冲刷，数据真正写到设备上是由后续的冲刷完成的。对应的函数是generic_commit_write，代码如下：

int generic_commit_write(struct file *file, struct page *page,
		unsigned from, unsigned to)
{
	struct inode *inode = page->mapping->host;
	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
	__block_commit_write(inode,page,from,to);
	kunmap(page);
	if (pos > inode->i_size) {
		inode->i_size = pos;
		mark_inode_dirty(inode);
	}
	return 0;
}

/*
 * __block_commit_write - mark the buffers under [from, to) up to date and dirty.
 * @inode: inode owning the page
 * @page:  cache page that was written; its buffers come from page->buffers
 * @from:  start offset of the write inside the page
 * @to:    end offset of the write inside the page
 *
 * Buffers overlapping the range are flagged up to date and dirty; those
 * that were clean before are also put on the inode's buffer queue.
 * Buffers outside the range are only inspected.  The page is marked up to
 * date when no buffer outside the range was found stale.  Always returns 0.
 */
static int __block_commit_write(struct inode *inode, struct page *page,
		unsigned from, unsigned to)
{
	struct buffer_head *first = page->buffers;
	struct buffer_head *cur = first;
	unsigned blksz = inode->i_sb->s_blocksize;
	unsigned start = 0, end;
	int stale_outside = 0, newly_dirtied = 0;

	/* One pass over the circular list; "!start" admits the first step. */
	for (; cur != first || !start; start = end, cur = cur->b_this_page) {
		end = start + blksz;

		if (end > from && start < to) {
			/* Block overlaps the written range. */
			set_bit(BH_Uptodate, &cur->b_state);
			if (atomic_set_buffer_dirty(cur))
				continue;	/* it was dirty already */
			__mark_dirty(cur);
			buffer_insert_inode_queue(cur, inode);
			/* At least one buffer went from clean to dirty. */
			newly_dirtied = 1;
		} else if (!buffer_uptodate(cur)) {
			/* Untouched block whose contents are not valid. */
			stale_outside = 1;
		}
	}

	/*
	 * NOTE(review): balance_dirty() presumably checks whether enough
	 * dirty buffers have piled up to start a flush - confirm.
	 * cur is back at the list head here.
	 */
	if (newly_dirtied)
		balance_dirty(cur->b_dev);
	/*
	 * is this a partial write that happened to make all buffers
	 * uptodate then we can optimize away a bogus readpage() for
	 * the next read(). Here we 'discover' whether the page went
	 * uptodate as a result of this (potentially partial) write.
	 */
	if (!stale_outside)
		SetPageUptodate(page);
	return 0;
}

至此，文件写就分析完了，page和buffer_head同时管理页面，page->buffers指向了buffer_head，bh->b_page指向了page。

郑重声明：本站内容如果来自互联网及其他传播媒体，其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享，并不代表本站赞同其观点和对其真实性负责，也不构成任何其他建议。

Linux内核源代码情景分析-文件的写