Linux内核源代码情景分析-文件的写

    write对应的系统调用是sys_write,代码如下:

asmlinkage ssize_t sys_write(unsigned int fd, const char * buf, size_t count)
{
	ssize_t ret;
	struct file * file;

	ret = -EBADF;
	file = fget(fd);
	if (file) {
		if (file->f_mode & FMODE_WRITE) {
			struct inode *inode = file->f_dentry->d_inode;
			ret = locks_verify_area(FLOCK_VERIFY_WRITE, inode, file,
				file->f_pos, count);
			if (!ret) {
				ssize_t (*write)(struct file *, const char *, size_t, loff_t *);
				ret = -EINVAL;
				if (file->f_op && (write = file->f_op->write) != NULL)
					ret = write(file, buf, count, &file->f_pos);
			}
		}
		if (ret > 0)
			inode_dir_notify(file->f_dentry->d_parent->d_inode,
				DN_MODIFY);
		fput(file);
	}
	return ret;
}
    fd假设就是Linux内核源代码情景分析-文件的打开,一文中刚刚打开文件/usr/local/hello.c的文件号。fget(fd),根据打开文件号fd找到该已打开文件的file结构。代码如下:

struct file * fget(unsigned int fd)
{
	struct file * file;
	struct files_struct *files = current->files;

	read_lock(&files->file_lock);
	file = fcheck(fd);
	if (file)
		get_file(file);
	read_unlock(&files->file_lock);
	return file;
}
static inline struct file * fcheck(unsigned int fd)
{
	struct file * file = NULL;
	struct files_struct *files = current->files;

	if (fd < files->max_fds)
		file = files->fd[fd];
	return file;
}


   还记得在打开文件的时候,file->f_op被设置为f->f_op = fops_get(inode->i_fop);对于ext2文件系统,所以f->f_op要么为ext2_file_operations,另一个是ext2_dir_oprations,视操作的目标为文件或目录而选择其一。对于文件来说,file->f_op->write为generic_file_write。

ssize_t
generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)//file是要写入文件节点的file结构,buf为要写入内容的指针,count是数量,ppos是要写入文件的位置
{
	struct inode	*inode = file->f_dentry->d_inode; 
	struct address_space *mapping = inode->i_mapping;
	unsigned long	limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
	loff_t		pos;
	struct page	*page, *cached_page;
	unsigned long	written;
	long		status;
	int		err;

	cached_page = NULL;

	down(&inode->i_sem);

	pos = *ppos;
	err = -EINVAL;
	if (pos < 0)
		goto out;

	err = file->f_error;
	if (err) {
		file->f_error = 0;
		goto out;
	}

	written = 0;

	if (file->f_flags & O_APPEND)
		pos = inode->i_size;

	/*
	 * Check whether we‘ve reached the file size limit.
	 */
	err = -EFBIG;
	if (limit != RLIM_INFINITY) {
		if (pos >= limit) {
			send_sig(SIGXFSZ, current, 0);
			goto out;
		}
		if (count > limit - pos) {
			send_sig(SIGXFSZ, current, 0);
			count = limit - pos;
		}
	}

	status  = 0;
	if (count) {
		remove_suid(inode);
		inode->i_ctime = inode->i_mtime = CURRENT_TIME;
		mark_inode_dirty_sync(inode);
	}

	while (count) {
		unsigned long bytes, index, offset;
		char *kaddr;
		int deactivate = 1;

		/*
		 * Try to find the page in the cache. If it isn‘t there,
		 * allocate a free page.
		 */
		offset = (pos & (PAGE_CACHE_SIZE -1)); //根据当前位置pos计算出本次循环中要写多的缓冲页面index、在该页面中的起点offset以及写入长度bytes
		index = pos >> PAGE_CACHE_SHIFT;
		bytes = PAGE_CACHE_SIZE - offset;
		if (bytes > count) {
			bytes = count;
			deactivate = 0;
		}

		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there‘s a nasty deadlock on copying from the
		 * same page as we‘re writing to, without it being marked
		 * up-to-date.
		 */
		{ volatile unsigned char dummy;
			__get_user(dummy, buf);
			__get_user(dummy, buf+bytes-1);
		}

		status = -ENOMEM;	/* we‘ll assign it later anyway */
		page = __grab_cache_page(mapping, index, &cached_page);//在page_hash_table中找到该缓冲页面,如找不到,就分配、建立一个缓冲页面
		if (!page)
			break;

		/* We have exclusive IO access to the page.. */
		if (!PageLocked(page)) {
			PAGE_BUG(page);
		}

		status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);//预写先读,先把文件在设备上对应位置的数据读到page中
		if (status)
			goto unlock;
		kaddr = page_address(page);
		status = copy_from_user(kaddr+offset, buf, bytes);//把数据从用户空间拷贝到page指向的页面中,已经放入了缓冲区
		flush_dcache_page(page);
		if (status)
			goto fail_write;
		status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);//真正的写,把缓冲区的数据写到设备上
		if (!status)
			status = bytes;

		if (status >= 0) {
			written += status;
			count -= status;
			pos += status;
			buf += status;
		}
unlock:
		/* Mark it unlocked again and drop the page.. */
		UnlockPage(page);
		if (deactivate)
			deactivate_page(page);
		page_cache_release(page);

		if (status < 0)
			break;
	}
	*ppos = pos;

	if (cached_page)
		page_cache_free(cached_page);

	/* For now, when the user asks for O_SYNC, we‘ll actually
	 * provide O_DSYNC. */
	if ((status >= 0) && (file->f_flags & O_SYNC))
		status = generic_osync_inode(inode, 1); /* 1 means datasync */
	
	err = written ? written : status;
out:

	up(&inode->i_sem);
	return err;
fail_write:
	status = -EFAULT;
	ClearPageUptodate(page);
	kunmap(page);
	goto unlock;
}
    inode结构中有个指针i_mapping,指向一个address_space数据结构,其定义如下:

struct address_space {
	struct list_head	clean_pages;	/* list of clean pages */
	struct list_head	dirty_pages;	/* list of dirty pages */
	struct list_head	locked_pages;	/* list of locked pages */
	unsigned long		nrpages;	/* number of total pages */
	struct address_space_operations *a_ops;	/* methods */
	struct inode		*host;		/* owner: inode, block_device */
	struct vm_area_struct	*i_mmap;	/* list of private mappings */
	struct vm_area_struct	*i_mmap_shared; /* list of shared mappings */
	spinlock_t		i_shared_lock;  /* and spinlock protecting it */
};
    其中a_ops,它指向一个address_space_operations数据结构,就ext2文件系统来说,这个数据结构为ext2_aops,代码如下:

struct address_space_operations {
	int (*writepage)(struct page *);
	int (*readpage)(struct file *, struct page *);
	int (*sync_page)(struct page *);
	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
	/* Unfortunately this kludge is needed for FIBMAP. Don‘t use it */
	int (*bmap)(struct address_space *, long);
};


    __grab_cache_page,在page_hash_table中找到该缓冲页面,如找不到,就分配、建立一个缓冲页面,代码如下:

static inline struct page * __grab_cache_page(struct address_space *mapping,
				unsigned long index, struct page **cached_page)
{
	struct page *page, **hash = page_hash(mapping, index);
repeat:
	page = __find_lock_page(mapping, index, hash);//在page_hash_table中寻找该缓冲页面 
	if (!page) {//如果找不到
		if (!*cached_page) {//cached_page为NULL
			*cached_page = page_cache_alloc();//分配一个页面
			if (!*cached_page)
				return NULL;
		}
		page = *cached_page;
		if (add_to_page_cache_unique(page, mapping, index, hash))//加入到page_hash_table中
			goto repeat;
		*cached_page = NULL;
	}
	return page;
}
#define page_hash(mapping,index) (page_hash_table+_page_hashfn(mapping,index))
    add_to_page_cache_unique,加入到page_hash_table中,代码如下:

static int add_to_page_cache_unique(struct page * page,
	struct address_space *mapping, unsigned long offset,
	struct page **hash)
{
	int err;
	struct page *alias;

	spin_lock(&pagecache_lock);
	alias = __find_page_nolock(mapping, offset, *hash);

	err = 1;
	if (!alias) {
		__add_to_page_cache(page,mapping,offset,hash);
		err = 0;
	}

	spin_unlock(&pagecache_lock);
	return err;
}
static inline void __add_to_page_cache(struct page * page,
	struct address_space *mapping, unsigned long offset,
	struct page **hash)
{
	unsigned long flags;

	if (PageLocked(page))
		BUG();

	flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced) | (1 << PG_arch_1));
	page->flags = flags | (1 << PG_locked);
	page_cache_get(page);
	page->index = offset;//就是最初传递进来的页面缓存index
	add_page_to_inode_queue(mapping, page);
	add_page_to_hash_queue(page, hash);//加入到page_hash_table表中
	lru_cache_add(page);
}


    获取了缓冲页面后,这个页面有可能是个新分配的空白页面。新分配 的空白页面与业已存在的缓冲页面除了在内容上有根本性的区别外,在结构上也有重要的区别。那就是前面所讲的,缓冲页面一方面与一个page结构相联系,而新分配的页面尚无buffer_head结构与之挂钩。所以,对于新分配的空白页面一来要为其配备相应的buffer_head数据结构,二来将目标页面的内容先从设备中读入(因为写操作未必是整个页面的写入)。不仅如此,就是业已存在的老页面也有个缓冲页面中的内容是否"up to date",即是否一致的问题。这里所谓"一致",是指缓冲页面火车缓冲区内容与设备上的逻辑内容一致。

    mapping->a_ops->prepare_write开始执行,指向了ext2_prepare_write,代码如下:

static int ext2_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to)
{
	return block_prepare_write(page,from,to,ext2_get_block);
}
int block_prepare_write(struct page *page, unsigned from, unsigned to,
			get_block_t *get_block)
{
	struct inode *inode = page->mapping->host;
	int err = __block_prepare_write(inode, page, from, to, get_block);
	if (err) {
		ClearPageUptodate(page);
		kunmap(page);
	}
	return err;
}
static int __block_prepare_write(struct inode *inode, struct page *page,
		unsigned from, unsigned to, get_block_t *get_block)
{
	unsigned block_start, block_end;
	unsigned long block;
	int err = 0;
	unsigned blocksize, bbits;
	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
	char *kaddr = kmap(page);

	blocksize = inode->i_sb->s_blocksize;
	if (!page->buffers)//说明是新分配的页面,没有buffer_head结构
		create_empty_buffers(page, inode->i_dev, blocksize);//为该页面配备好相应的buffer_head结构,并建立起这个队列
	head = page->buffers;

	bbits = inode->i_sb->s_blocksize_bits;
	block = page->index << (PAGE_CACHE_SHIFT - bbits);//这里用到了page->index

	for(bh = head, block_start = 0; bh != head || !block_start;
	    block++, block_start=block_end, bh = bh->b_this_page) {
		if (!bh)
			BUG();
		block_end = block_start+blocksize;
		if (block_end <= from)
			continue;
		if (block_start >= to)
			break;
		if (!buffer_mapped(bh)) {
			err = get_block(inode, block, bh, 1);//bh中存相关的信息,为ll_rw_block准备
			if (err)
				goto out;
			if (buffer_new(bh)) {
				unmap_underlying_metadata(bh);
				if (Page_Uptodate(page)) {
					set_bit(BH_Uptodate, &bh->b_state);
					continue;
				}
				if (block_end > to)
					memset(kaddr+to, 0, block_end-to);
				if (block_start < from)
					memset(kaddr+block_start, 0, from-block_start);
				if (block_end > to || block_start < from)
					flush_dcache_page(page);
				continue;
			}
		}
		if (Page_Uptodate(page)) {
			set_bit(BH_Uptodate, &bh->b_state);
			continue; 
		}
		if (!buffer_uptodate(bh) &&
		     (block_start < from || block_end > to)) {//如果是新分配的页面,一定不一致,如果原有的页面,有可能不一致
			ll_rw_block(READ, 1, &bh);//如果不一致,就从设备上读入数据到page中,bh中已经存好了用于从设备中读入数据的相关信息
			*wait_bh++=bh;
		}
	}
	/*
	 * If we issued read requests - let them complete.
	 */
	while(wait_bh > wait) {
		wait_on_buffer(*--wait_bh);
		err = -EIO;
		if (!buffer_uptodate(*wait_bh))
			goto out;
	}
	return 0;
out:
	return err;
}
    create_empty_buffers,为该页面配备好相应的buffer_head结构,并建立起这个队列
static void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
{
	struct buffer_head *bh, *head, *tail;

	head = create_buffers(page, blocksize, 1);
	if (page->buffers)
		BUG();

	bh = head;
	do {
		bh->b_dev = dev;//重要点
		bh->b_blocknr = 0;
		bh->b_end_io = NULL;
		tail = bh;
		bh = bh->b_this_page;
	} while (bh);
	tail->b_this_page = head;
	page->buffers = head;//重要点
	page_cache_get(page);
}
static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
{
	struct buffer_head *bh, *head;
	long offset;

try_again:
	head = NULL;
	offset = PAGE_SIZE;
	while ((offset -= size) >= 0) {
		bh = get_unused_buffer_head(async);
		if (!bh)
			goto no_grow;

		bh->b_dev = B_FREE;  /* Flag as unused */
		bh->b_this_page = head;
		head = bh;

		bh->b_state = 0;
		bh->b_next_free = NULL;
		bh->b_pprev = NULL;
		atomic_set(&bh->b_count, 0);
		bh->b_size = size;//重要点,block_size

		set_bh_page(bh, page, offset);

		bh->b_list = BUF_CLEAN;
		bh->b_end_io = NULL;
	}
	return head;
        ......
}
void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
{
	bh->b_page = page;//重要点
	if (offset >= PAGE_SIZE)
		BUG();
	if (PageHighMem(page))
		/*
		 * This catches illegal uses and preserves the offset:
		 */
		bh->b_data = (char *)(0 + offset);
	else
		bh->b_data = page_address(page) + offset;//重要点,页面的实际位置
}
    返回到generic_file_write,继续执行mapping->a_ops->commit_write,真正的写,把缓冲区的数据写到设备上,对应的指针是generic_commit_write,代码如下:

int generic_commit_write(struct file *file, struct page *page,
		unsigned from, unsigned to)
{
	struct inode *inode = page->mapping->host;
	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
	__block_commit_write(inode,page,from,to);
	kunmap(page);
	if (pos > inode->i_size) {
		inode->i_size = pos;
		mark_inode_dirty(inode);
	}
	return 0;
}
static int __block_commit_write(struct inode *inode, struct page *page,
		unsigned from, unsigned to)
{
	unsigned block_start, block_end;
	int partial = 0, need_balance_dirty = 0;
	unsigned blocksize;
	struct buffer_head *bh, *head;

	blocksize = inode->i_sb->s_blocksize;

	for(bh = head = page->buffers, block_start = 0;//page->buffers得到buffer_head结构
	    bh != head || !block_start;
	    block_start=block_end, bh = bh->b_this_page) {
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (!buffer_uptodate(bh))
				partial = 1;
		} else {
			set_bit(BH_Uptodate, &bh->b_state);
			if (!atomic_set_buffer_dirty(bh)) {
				__mark_dirty(bh);
				buffer_insert_inode_queue(bh, inode);
				need_balance_dirty = 1;//只要有记录块缓冲区从"干净"状态变成"脏"状态,need_balance_dirty就置1
			}
		}
	}

	if (need_balance_dirty)
		balance_dirty(bh->b_dev);//如果置1,这个函数看看这样的记录块是否已经积累到一定的数量,如果是,就唤醒bdflushin进行一次"冲刷"
	/*
	 * is this a partial write that happened to make all buffers
	 * uptodate then we can optimize away a bogus readpage() for
	 * the next read(). Here we ‘discover‘ wether the page went
	 * uptodate as a result of this (potentially partial) write.
	 */
	if (!partial)
		SetPageUptodate(page);
	return 0;
}
    至此,文件写就分析完了,page和buffer_head同时管理页面,page->buffers指向了buffer_head,bh->b_page指向了page。

郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。