Linux内核源代码情景分析-特殊文件系统/proc

浏览数：35 / 时间：2015年06月20日

由于proc文件系统并不物理地存在于任何设备上，它的安装过程是特殊的。对proc文件系统不能直接通过mount()来安装，而要先由系统内核在内核初始化时自动地通过一个函数kern_mount()安装一次，然后再由处理系统初始化的进程通过mount()安装，实际上是"重安装"。

一、在内核初始化时调用init_proc_fs()，代码如下：

/* File-system type object for "proc"; read_super is proc_read_super, flags are FS_SINGLE. */
static DECLARE_FSTYPE(proc_fs_type, "proc", proc_read_super, FS_SINGLE);

/*
 * Register the "proc" file-system type and create its kernel-internal
 * mount, remembered in proc_mnt.
 *
 * Returns 0 on success, the register_filesystem() error if registration
 * fails, or the error encoded in the kern_mount() result (the type is
 * unregistered again in that case).
 */
static int __init init_proc_fs(void)
{
	int err;

	/* make the "proc" type known to the system */
	err = register_filesystem(&proc_fs_type);
	if (err)
		return err;

	/* create the kernel-internal mount of one concrete proc instance */
	proc_mnt = kern_mount(&proc_fs_type);
	if (IS_ERR(proc_mnt)) {
		err = PTR_ERR(proc_mnt);
		unregister_filesystem(&proc_fs_type);
		return err;
	}
	return 0;
}

/*
 * Define a struct file_system_type object named `var` whose name,
 * read_super method and fs_flags come from the macro arguments and whose
 * owner is THIS_MODULE.  The expansion carries no trailing semicolon; the
 * user of the macro supplies it.
 */
#define DECLARE_FSTYPE(var,type,read,flags) \
struct file_system_type var = { \
	.name		= type, \
	.read_super	= read, \
	.fs_flags	= flags, \
	.owner		= THIS_MODULE, \
}

register_filesystem，向系统登记"proc"这么一种文件系统，代码如下：

/*
 * Add fs to the list of known file-system types.
 *
 * Returns 0 on success, -EINVAL if fs is NULL, and -EBUSY if fs is
 * already linked (fs->next set) or a type with the same name is found.
 */
int register_filesystem(struct file_system_type * fs)
{
	struct file_system_type **slot;
	int res;

	if (!fs)
		return -EINVAL;
	if (fs->next)
		return -EBUSY;

	write_lock(&file_systems_lock);
	slot = find_filesystem(fs->name);
	if (*slot) {
		/* a type with this name already exists */
		res = -EBUSY;
	} else {
		/* slot is the terminating link of the list: append fs there */
		*slot = fs;
		res = 0;
	}
	write_unlock(&file_systems_lock);
	return res;
}

/*
 * Walk the file_systems list looking for a type called name.
 *
 * Returns the address of the link that points at the matching entry, or,
 * when nothing matches, the address of the terminating NULL link.
 */
static struct file_system_type **find_filesystem(const char *name)
{
	struct file_system_type **link = &file_systems;

	while (*link && strcmp((*link)->name, name) != 0)
		link = &(*link)->next;
	return link;
}

kern_mount，将一个具体的proc文件系统安装到系统中的/proc节点上，代码如下：

/*
 * Create a kernel-internal mount of the given file-system type.
 *
 * Returns the new vfsmount (also stored in type->kern_mnt), or an ERR_PTR:
 * -EMFILE when no unnamed device number is available, -EINVAL when the
 * super block cannot be read, -ENOMEM when add_vfsmnt() fails.
 */
struct vfsmount *kern_mount(struct file_system_type *type)
{
	struct super_block *super;
	struct vfsmount *mount;
	kdev_t devno;

	/* obtain a device number for this device-less file system */
	devno = get_unnamed_dev();
	if (!devno)
		return ERR_PTR(-EMFILE);

	/* set up a super_block through the type's own read_super method */
	super = read_super(devno, NULL, type, 0, NULL, 0);
	if (!super) {
		put_unnamed_dev(devno);
		return ERR_PTR(-EINVAL);
	}

	mount = add_vfsmnt(NULL, super->s_root, NULL);
	if (!mount) {
		kill_super(super, 0);
		return ERR_PTR(-ENOMEM);
	}

	/* remember the internal mount in the type for later lookups */
	type->kern_mnt = mount;
	return mount;
}

read_super，先分配一个空白的super_block数据结构，然后通过由具体文件系统的file_system_type数据结构中的函数指针read_super调用具体的函数来读入超级块。

/*
 * Take an empty super_block, fill in its generic fields and let the
 * file system's own read_super method populate the rest.
 *
 * Returns the super_block on success.  Returns NULL when no empty
 * super_block is available or when type->read_super() returns NULL.
 */
static struct super_block * read_super(kdev_t dev, struct block_device *bdev,
				       struct file_system_type *type, int flags,
				       void *data, int silent)
{
	struct super_block * s;
	s = get_empty_super();
	if (!s)
		goto out;	/* s is NULL here, so "out" returns NULL */
	s->s_dev = dev;
	s->s_bdev = bdev;
	s->s_flags = flags;
	s->s_dirt = 0;
	sema_init(&s->s_vfs_rename_sem,1);
	sema_init(&s->s_nfsd_free_path_sem,1);
	s->s_type = type;
	sema_init(&s->s_dquot.dqio_sem, 1);
	sema_init(&s->s_dquot.dqoff_sem, 1);
	s->s_dquot.flags = 0;
	/* the file-system specific method runs with the super block locked */
	lock_super(s);
	if (!type->read_super(s, data, silent))
		goto out_fail;
	unlock_super(s);
	/* tell bdcache that we are going to keep this one */
	if (bdev)
		atomic_inc(&bdev->bd_count);
out:
	return s;

out_fail:
	/* clear the identifying fields set above before giving up */
	s->s_dev = 0;
	s->s_bdev = 0;
	s->s_type = NULL;
	unlock_super(s);
	return NULL;
}

type->read_super对于proc文件系统来说，这个函数为proc_read_super()。代码如下：

/*
 * read_super method of the proc file system: fills in the super_block
 * fields and builds the root inode and root dentry.
 *
 * Returns s on success, NULL when the root inode or root dentry cannot
 * be obtained.
 */
struct super_block *proc_read_super(struct super_block *s,void *data, 
				    int silent)
{
	struct inode * root_inode;
	struct task_struct *p;

	s->s_blocksize = 1024;
	s->s_blocksize_bits = 10;
	s->s_magic = PROC_SUPER_MAGIC;
	s->s_op = &proc_sops;
	root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root);//build the root inode from the root entry proc_root
	if (!root_inode)
		goto out_no_root;
	/*
	 * Fixup the root inode's nlink value
	 */
	read_lock(&tasklist_lock);
	for_each_task(p) if (p->pid) root_inode->i_nlink++;
	read_unlock(&tasklist_lock);
	s->s_root = d_alloc_root(root_inode);//allocate the root dentry for root_inode and store it in s->s_root
	if (!s->s_root)
		goto out_no_root;
	parse_options(data, &root_inode->i_uid, &root_inode->i_gid);
	return s;

out_no_root:
	printk("proc_read_super: get root inode failed\n");
	/* NOTE(review): root_inode can be NULL on this path -- presumably iput() accepts NULL; confirm */
	iput(root_inode);
	return NULL;
}

读入超级块，实际上是生成超级块，还有super_block结构中的super_operations指针s_op被设置成指向proc_sops，定义如下：

/* Super-block operations installed in s->s_op by proc_read_super(). */
static struct super_operations proc_sops = {
	.read_inode	= proc_read_inode,
	.put_inode	= force_delete,
	.delete_inode	= proc_delete_inode,
	.statfs		= proc_statfs,
};

不仅如此，proc文件系统中的目录项结构，即dentry结构，在设备上也没有对应物，而以内存中的proc_dir_entry数据结构来代替，定义如下：

/*
 * In-memory description of one node of the /proc tree.  Entries form a
 * tree through the next/parent/subdir links.
 */
struct proc_dir_entry {
	unsigned short low_ino;		/* inode number of the entry */
	unsigned short namelen;		/* length of name */
	const char *name;
	mode_t mode;
	nlink_t nlink;
	uid_t uid;
	gid_t gid;
	unsigned long size;
	struct inode_operations * proc_iops;
	struct file_operations * proc_fops;
	get_info_t *get_info;
	struct module *owner;
	struct proc_dir_entry *next, *parent, *subdir;	/* sibling, parent, first child */
	void *data;
	read_proc_t *read_proc;
	write_proc_t *write_proc;
	atomic_t count;		/* use count */
	int deleted;		/* delete flag */
	kdev_t	rdev;
};

最重要的就是/proc节点的proc_dir_entry结构(目录项)proc_root，定义如下：

/*
 * The proc_dir_entry of the /proc root directory.  It is its own parent.
 */
struct proc_dir_entry proc_root = {
	.low_ino	= PROC_ROOT_INO,
	.namelen	= 5,
	.name		= "/proc",
	.mode		= S_IFDIR | S_IRUGO | S_IXUGO,
	.nlink		= 2,
	.proc_iops	= &proc_root_inode_operations,
	.proc_fops	= &proc_root_operations,
	.parent		= &proc_root,
};

proc_get_inode，根据根目录项，得到根节点的inode结构，代码如下：

/*
 * Obtain the inode numbered ino on sb and initialise it from the proc
 * directory entry de.
 *
 * Returns the inode, or NULL when iget() fails; in that case the
 * reference taken on de is dropped again.
 */
struct inode * proc_get_inode(struct super_block * sb, int ino,
				struct proc_dir_entry * de)
{
	struct inode * inode;

	/*
	 * Increment the use count so the dir entry can't disappear.
	 * NOTE(review): de_get() runs before the "de &&" checks below --
	 * presumably it tolerates a NULL de; confirm.
	 */
	de_get(de);
#if 1
/* shouldn't ever happen */
if (de && de->deleted)
printk("proc_iget: using deleted entry %s, count=%d\n", de->name, atomic_read(&de->count));
#endif

	inode = iget(sb, ino);
	if (!inode)
		goto out_fail;
	
	inode->u.generic_ip = (void *) de;//the directory entry is stashed in the inode here
	if (de) {//fill in the inode from the directory entry
		if (de->mode) {
			inode->i_mode = de->mode;
			inode->i_uid = de->uid;
			inode->i_gid = de->gid;
		}
		if (de->size)
			inode->i_size = de->size;
		if (de->nlink)
			inode->i_nlink = de->nlink;
		if (de->owner)
			__MOD_INC_USE_COUNT(de->owner);
		if (S_ISBLK(de->mode)||S_ISCHR(de->mode)||S_ISFIFO(de->mode))
			init_special_inode(inode,de->mode,kdev_t_to_nr(de->rdev));
		else {
			if (de->proc_iops)
				inode->i_op = de->proc_iops;//for the root entry: proc_root_inode_operations
			if (de->proc_fops)
				inode->i_fop = de->proc_fops;//for the root entry: proc_root_operations
		}
	}

out:
	return inode;

out_fail:
	/* iget() failed: drop the reference taken above; inode is NULL */
	de_put(de);
	goto out;
}

返回到proc_read_super，开始执行d_alloc_root，分配根节点的dentry结构，并把根节点的inode结构和dentry结构相连。

/*
 * Allocate a dentry named "/" for root_inode and attach the inode to it.
 *
 * Returns the new dentry, or NULL when root_inode is NULL or d_alloc()
 * fails.
 */
struct dentry * d_alloc_root(struct inode * root_inode)
{
	struct dentry *root;

	if (!root_inode)
		return NULL;

	root = d_alloc(NULL, &(const struct qstr) { "/", 1, 0 });
	if (!root)
		return NULL;

	root->d_sb = root_inode->i_sb;
	/* a root dentry has no level above it: it is its own parent */
	root->d_parent = root;
	/* tie the dentry and the inode together */
	d_instantiate(root, root_inode);
	return root;
}

返回到kern_mount，执行add_vfsmnt，代码如下：

/*
 * Allocate a vfsmount for the tree rooted at root and link it into the
 * mount lists.
 *
 * nd describes the mount point; when nd is NULL (as in the call made by
 * kern_mount()) the mount becomes its own parent and root serves as its
 * own mount point.  dev_name, when given, is copied into mnt_devname.
 *
 * Returns the new vfsmount.  Returns NULL when allocation fails or when
 * nd->dentry is a non-root dentry that is unhashed.
 */
static struct vfsmount *add_vfsmnt(struct nameidata *nd,
				struct dentry *root,
				const char *dev_name)
{
	struct vfsmount *mnt;
	struct super_block *sb = root->d_inode->i_sb;
	char *name;

	mnt = kmalloc(sizeof(struct vfsmount), GFP_KERNEL);
	if (!mnt)
		goto out;
	memset(mnt, 0, sizeof(struct vfsmount));

	if (nd || dev_name)
		mnt->mnt_flags = MNT_VISIBLE;

	/* It may be NULL, but who cares? */
	if (dev_name) {
		name = kmalloc(strlen(dev_name)+1, GFP_KERNEL);
		if (name) {
			strcpy(name, dev_name);
			mnt->mnt_devname = name;
		}
	}
	mnt->mnt_owner = current->uid;
	atomic_set(&mnt->mnt_count,1);
	mnt->mnt_sb = sb;//key point: the mount records its super block

	spin_lock(&dcache_lock);
	if (nd && !IS_ROOT(nd->dentry) && d_unhashed(nd->dentry))
		goto fail;
	mnt->mnt_root = dget(root);//key point: the mount records its root dentry
	mnt->mnt_mountpoint = nd ? dget(nd->dentry) : dget(root);//without nd, root is its own mount-point dentry
	mnt->mnt_parent = nd ? mntget(nd->mnt) : mnt;//without nd, the mount is its own parent

	if (nd) {
		list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts);
		list_add(&mnt->mnt_clash, &nd->dentry->d_vfsmnt);
	} else {
		INIT_LIST_HEAD(&mnt->mnt_child);
		INIT_LIST_HEAD(&mnt->mnt_clash);
	}
	INIT_LIST_HEAD(&mnt->mnt_mounts);
	list_add(&mnt->mnt_instances, &sb->s_mounts);
	list_add(&mnt->mnt_list, vfsmntlist.prev);
	spin_unlock(&dcache_lock);
out:
	return mnt;
fail:
	/* undo the allocations made above; nothing has been linked in yet */
	spin_unlock(&dcache_lock);
	if (mnt->mnt_devname)
		kfree(mnt->mnt_devname);
	kfree(mnt);
	return NULL;
}

二、光是kern_mount()还不够，还得由系统的初始化进程从内核外部通过系统调用mount()再安装一次。通常，这个命令行是：mount -nvt proc /dev/null /proc

前面我们提到过，proc文件系统的file_system_type数据结构中的FS_SINGLE标志位为1，它起着重要的作用。为什么重要呢？因为它使sys_mount()的主体do_mount()通过get_sb_single()，而不是get_sb_bdev()，来取得所安装文件系统的super_block数据结构。相关代码如下：

/*
 * Excerpt from do_mount(): choose how to obtain the super block from the
 * flags of the file-system type.  A type with FS_SINGLE set (and neither
 * FS_NOMOUNT nor FS_REQUIRES_DEV) goes through get_sb_single().
 */
if (fstype->fs_flags & FS_NOMOUNT)
		sb = ERR_PTR(-EINVAL);
	else if (fstype->fs_flags & FS_REQUIRES_DEV)
		sb = get_sb_bdev(fstype, dev_name, flags, data_page);
	else if (fstype->fs_flags & FS_SINGLE)
		sb = get_sb_single(fstype, flags, data_page);
	else
		sb = get_sb_nodev(fstype, flags, data_page);

/*
 * Return the super block of the kernel-internal mount recorded in
 * fs_type->kern_mnt, after taking a reference on fs_type and calling
 * do_remount_sb() with the new flags and data.
 *
 * NOTE(review): mount_sem is taken here and not released in this
 * function -- presumably the caller releases it; confirm.
 */
static struct super_block *get_sb_single(struct file_system_type *fs_type,
	int flags, void *data)
{
	struct super_block * sb;
	/*
	 * Get the superblock of kernel-wide instance, but
	 * keep the reference to fs_type.
	 */
	down(&mount_sem);
	sb = fs_type->kern_mnt->mnt_sb;
	if (!sb)
		BUG();
	get_filesystem(fs_type);
	do_remount_sb(sb, flags, data);
	return sb;
}

取得了proc文件系统的super_block结构以后，回到do_mount()代码中，以后的操作就与普通文件系统的安装无异了。这样就将proc文件系统安装到了节点/proc上。

三、刚才我们看到了/proc节点的proc_dir_entry结构proc_root，现在我们创建/proc节点以下的子节点的proc_dir_entry结构，这是由内核在初始化时调用proc_root_init()完成的，代码如下：

/*
 * Create the entries that live directly under the /proc root: the
 * entries made by proc_misc_init() and proc_tty_init(), plus a set of
 * directories, some of which depend on the kernel configuration.
 */
void __init proc_root_init(void)
{
	proc_misc_init();
	proc_net = proc_mkdir("net", 0);
#ifdef CONFIG_SYSVIPC
	proc_mkdir("sysvipc", 0);
#endif
#ifdef CONFIG_SYSCTL
	proc_sys_root = proc_mkdir("sys", 0);
#endif
	proc_root_fs = proc_mkdir("fs", 0);
	proc_root_driver = proc_mkdir("driver", 0);
#if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE)
	/* just give it a mountpoint */
	proc_mkdir("openprom", 0);
#endif
	proc_tty_init();
#ifdef CONFIG_PROC_DEVICETREE
	proc_device_tree_init();
#endif
	proc_bus = proc_mkdir("bus", 0);
}

proc_misc_init，主要创建/proc节点以下的子节点的proc_dir_entry结构，而且子节点大多是文件，不是目录。

/*
 * Create the miscellaneous entries under /proc.
 *
 * The entries listed in simple_ones[] only need a read_proc callback and
 * are created in a loop; the remaining entries get their own
 * file_operations, size or write_proc callback.
 */
void __init proc_misc_init(void)
{
	struct proc_dir_entry *entry;
	/* name / read_proc pairs, terminated by an entry whose name is NULL */
	static struct {
		char *name;
		int (*read_proc)(char*,char**,off_t,int,int*,void*);
	} *p, simple_ones[] = {
		{"loadavg",     loadavg_read_proc},
		{"uptime",	uptime_read_proc},
		{"meminfo",	meminfo_read_proc},
		{"version",	version_read_proc},
		{"cpuinfo",	cpuinfo_read_proc},
#ifdef CONFIG_PROC_HARDWARE
		{"hardware",	hardware_read_proc},
#endif
#ifdef CONFIG_STRAM_PROC
		{"stram",	stram_read_proc},
#endif
#ifdef CONFIG_DEBUG_MALLOC
		{"malloc",	malloc_read_proc},
#endif
#ifdef CONFIG_MODULES
		{"modules",	modules_read_proc},
		{"ksyms",	ksyms_read_proc},
#endif
		{"stat",	kstat_read_proc},
		{"devices",	devices_read_proc},
		{"partitions",	partitions_read_proc},
#if !defined(CONFIG_ARCH_S390)
		{"interrupts",	interrupts_read_proc},
#endif
		{"filesystems",	filesystems_read_proc},
		{"dma",		dma_read_proc},
		{"ioports",	ioports_read_proc},
		{"cmdline",	cmdline_read_proc},
#ifdef CONFIG_SGI_DS1286
		{"rtc",		ds1286_read_proc},
#endif
		{"locks",	locks_read_proc},
		{"mounts",	mounts_read_proc},
		{"swaps",	swaps_read_proc},
		{"iomem",	memory_read_proc},
		{"execdomains",	execdomains_read_proc},
		{NULL,}
	};
	/* mode 0 and a NULL parent are passed for every simple entry */
	for (p = simple_ones; p->name; p++)
		create_proc_read_entry(p->name, 0, NULL, p->read_proc, NULL);

	/* And now for trickier ones */
	entry = create_proc_entry("kmsg", S_IRUSR, &proc_root);
	if (entry)
		entry->proc_fops = &proc_kmsg_operations;
	proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL);
	if (proc_root_kcore) {
		proc_root_kcore->proc_fops = &proc_kcore_operations;
		proc_root_kcore->size =
				(size_t)high_memory - PAGE_OFFSET + PAGE_SIZE;
	}
	/* the "profile" entry is only created when prof_shift is non-zero */
	if (prof_shift) {
		entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL);
		if (entry) {
			entry->proc_fops = &proc_profile_operations;
			entry->size = (1+prof_len) * sizeof(unsigned int);
		}
	}
#ifdef __powerpc__
	{
		extern struct file_operations ppc_htab_operations;
		entry = create_proc_entry("ppc_htab", S_IRUGO|S_IWUSR, NULL);
		if (entry)
			entry->proc_fops = &ppc_htab_operations;
	}
#endif
	/* slabinfo has both a read_proc and a write_proc callback */
	entry = create_proc_read_entry("slabinfo", S_IWUSR | S_IRUGO, NULL,
				       slabinfo_read_proc, NULL);
	if (entry)
		entry->write_proc = slabinfo_write_proc;
}

create_proc_read_entry，主要创建/proc节点以下的子节点的proc_dir_entry结构，而且子节点大多是文件，不是目录。

/*
 * Create a proc entry via create_proc_entry() and attach a read_proc
 * callback together with its private data.
 *
 * Worked example used in this walkthrough: name "loadavg", mode 0,
 * base NULL, read_proc loadavg_read_proc, data NULL.
 *
 * Returns the new entry, or NULL when create_proc_entry() fails.
 */
extern inline struct proc_dir_entry *create_proc_read_entry(const char *name,
	mode_t mode, struct proc_dir_entry *base, 
	read_proc_t *read_proc, void * data)
{
	struct proc_dir_entry *entry = create_proc_entry(name, mode, base);

	if (!entry)
		return NULL;
	entry->read_proc = read_proc;
	entry->data = data;
	return entry;
}

/*
 * Allocate a proc_dir_entry for name and register it under parent.
 *
 * When parent is NULL, name is taken relative to the /proc root and is
 * split by xlate_proc_name() into a parent entry and a final component.
 * Directories default to r-x for everyone; other entries default to a
 * regular file readable by everyone.
 *
 * Returns the new entry, or NULL when the path cannot be resolved or the
 * allocation fails.  The return value of proc_register() is not checked.
 */
struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
					 struct proc_dir_entry *parent)
{
	struct proc_dir_entry *entry;
	const char *leaf = name;
	char *stored_name;
	int leaf_len;

	if (!parent && xlate_proc_name(name, &parent, &leaf) != 0)
		return NULL;
	leaf_len = strlen(leaf);

	/* one allocation: the structure followed by the NUL-terminated name */
	entry = kmalloc(sizeof(struct proc_dir_entry) + leaf_len + 1, GFP_KERNEL);
	if (!entry)
		return NULL;
	memset(entry, 0, sizeof(struct proc_dir_entry));

	stored_name = ((char *) entry) + sizeof(*entry);
	memcpy(stored_name, leaf, leaf_len + 1);
	entry->name = stored_name;
	entry->namelen = leaf_len;

	if (S_ISDIR(mode)) {
		if ((mode & S_IALLUGO) == 0)
			mode |= S_IRUGO | S_IXUGO;
		entry->proc_fops = &proc_dir_operations;
		entry->proc_iops = &proc_dir_inode_operations;
		entry->nlink = 2;
	} else {
		if ((mode & S_IFMT) == 0)
			mode |= S_IFREG;
		if ((mode & S_IALLUGO) == 0)
			mode |= S_IRUGO;
		entry->nlink = 1;
	}
	entry->mode = mode;

	/* e.g. hook the "loadavg" entry into the root entry's child list */
	proc_register(parent, entry);
	return entry;
}

xlate_proc_name，parent返回的是父节点的proc_dir_entry结构，fn返回当前的节点名，现在name为loadavg，返回的fn还是loadavg，parent是根节点的proc_dir_entry结构proc_root。

/*
 * Split a /proc-relative path into its parent directory entry and its
 * final component.
 *
 * Every '/'-terminated component of name is looked up, starting from
 * proc_root, among the children of the current entry with proc_match().
 *
 * Returns 0 on success, with *ret set to the entry of the last directory
 * walked (proc_root itself when name contains no '/') and *residual
 * pointing at the final component inside name.  Returns -ENOENT when a
 * component has no matching child; *ret and *residual are not written in
 * that case.
 *
 * Worked example: for name "loadavg" there is no '/', so the loop exits
 * at once, *residual is "loadavg" and *ret is &proc_root.
 */
static int xlate_proc_name(const char *name,
			   struct proc_dir_entry **ret, const char **residual)
{
	const char     		*cp = name, *next;
	struct proc_dir_entry	*de;
	int			len;

	de = &proc_root;
	while (1) {
		next = strchr(cp, '/');	/* NULL for "loadavg" */
		if (!next)
			break;

		/* cp..next is one directory component: find it among de's children */
		len = next - cp;
		for (de = de->subdir; de ; de = de->next) {
			if (proc_match(len, cp, de))
				break;
		}
		if (!de)
			return -ENOENT;
		cp += len + 1;	/* step over the component and its '/' */
	}
	*residual = cp;	/* final component, e.g. "loadavg" */
	*ret = de;	/* its parent entry, e.g. &proc_root */
	return 0;
}

proc_register(parent, ent)，把loadavg节点的proc_dir_entry结构登记到根节点的proc_dir_entry结构。

/*
 * Give dp an inode number and link it in as a child of dir, then fill in
 * default operations according to dp's file type.
 *
 * Returns 0 on success, -EAGAIN when no inode number can be obtained.
 */
static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
{
	int ino = make_inode_number();

	if (ino < 0)
		return -EAGAIN;
	dp->low_ino = ino;

	/* push dp onto the head of dir's child list */
	dp->parent = dir;		/* child points at its parent through ->parent */
	dp->next = dir->subdir;
	dir->subdir = dp;		/* parent points at its newest child through ->subdir */

	if (S_ISDIR(dp->mode)) {
		if (dp->proc_iops == NULL) {
			dp->proc_fops = &proc_dir_operations;
			dp->proc_iops = &proc_dir_inode_operations;
		}
		dir->nlink++;
	} else if (S_ISLNK(dp->mode)) {
		if (dp->proc_iops == NULL)
			dp->proc_iops = &proc_link_inode_operations;
	} else if (S_ISREG(dp->mode)) {
		/* "loadavg" is a regular file and takes this branch */
		if (dp->proc_fops == NULL)
			dp->proc_fops = &proc_file_operations;
	}
	return 0;
}

proc_misc_init中的其他类似的代码就不解释了，例如：

/* Further entry-creation calls from proc_misc_init(); they follow the same path as "loadavg". */
entry = create_proc_entry("kmsg", S_IRUSR, &proc_root);

proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL);
    
entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL);

entry = create_proc_read_entry("slabinfo", S_IWUSR | S_IRUGO, NULL,
				       slabinfo_read_proc, NULL);

返回到proc_root_init，执行proc_mkdir("net", 0)，代码如下：

/*
 * Create a directory entry called name and register it under parent.
 *
 * When parent is NULL, name is taken relative to the /proc root and is
 * split by xlate_proc_name() into a parent entry and a final component.
 *
 * Returns the new entry, or NULL when the path cannot be resolved or the
 * allocation fails.  The return value of proc_register() is not checked.
 */
struct proc_dir_entry *proc_mkdir(const char *name, struct proc_dir_entry *parent)
{
	struct proc_dir_entry *dir_ent;
	const char *leaf = name;
	char *stored_name;
	int leaf_len;

	if (!parent && xlate_proc_name(name, &parent, &leaf) != 0)
		return NULL;
	leaf_len = strlen(leaf);

	/* one allocation: the structure followed by the NUL-terminated name */
	dir_ent = kmalloc(sizeof(struct proc_dir_entry) + leaf_len + 1, GFP_KERNEL);
	if (!dir_ent)
		return NULL;
	memset(dir_ent, 0, sizeof(struct proc_dir_entry));

	stored_name = ((char *) dir_ent) + sizeof(*dir_ent);
	memcpy(stored_name, leaf, leaf_len + 1);
	dir_ent->name = stored_name;
	dir_ent->namelen = leaf_len;

	/* unlike create_proc_entry(): always a directory with directory operations */
	dir_ent->mode = S_IFDIR | S_IRUGO | S_IXUGO;
	dir_ent->nlink = 2;
	dir_ent->proc_fops = &proc_dir_operations;
	dir_ent->proc_iops = &proc_dir_inode_operations;

	proc_register(parent, dir_ent);
	return dir_ent;
}

和上面的操作区别在于：

	/* proc_mkdir() installs the directory operations unconditionally */
	ent->proc_fops = &proc_dir_operations;
	ent->proc_iops = &proc_dir_inode_operations;

proc_root_init还有其他类似的操作，就不解释了：

	/* Remaining calls of proc_root_init(), listed without their #ifdef guards */
	proc_mkdir("sysvipc", 0);
	proc_sys_root = proc_mkdir("sys", 0);
	proc_root_fs = proc_mkdir("fs", 0);
	proc_root_driver = proc_mkdir("driver", 0);
	proc_mkdir("openprom", 0);
	proc_tty_init();
	proc_bus = proc_mkdir("bus", 0);

我们主要关心proc_tty_init，代码如下：

/*
 * Create the tty directory under /proc with its "ldisc" and "driver"
 * sub-directories and the "ldiscs" and "drivers" read-only entries.
 * Nothing else is created when the tty directory itself cannot be made.
 */
void __init proc_tty_init(void)
{
	struct proc_dir_entry *tty_root = proc_mkdir("tty", 0);

	if (tty_root == NULL)
		return;

	/* names containing '/' are resolved relative to the /proc root */
	proc_tty_ldisc = proc_mkdir("tty/ldisc", 0);
	proc_tty_driver = proc_mkdir("tty/driver", 0);

	create_proc_read_entry("tty/ldiscs", 0, 0, tty_ldiscs_read_proc,NULL);
	create_proc_read_entry("tty/drivers", 0, 0, tty_drivers_read_proc,NULL);
}

proc_mkdir("tty", 0)和上面的步骤一样，proc_mkdir("tty/ldisc", 0)的执行，比较不同，如下：

/*
 * Create a directory entry called name and register it under parent.
 *
 * Worked example: proc_mkdir("tty/ldisc", 0).  parent is NULL, so
 * xlate_proc_name() resolves the path: parent becomes the entry of
 * "tty" and the final component is "ldisc".
 *
 * Returns the new entry, or NULL when the path cannot be resolved or the
 * allocation fails.  The return value of proc_register() is not checked.
 */
struct proc_dir_entry *proc_mkdir(const char *name, struct proc_dir_entry *parent)
{
	struct proc_dir_entry *dir_ent;
	const char *leaf = name;
	char *stored_name;
	int leaf_len;

	/* "tty/ldisc": parent := entry of "tty", leaf := "ldisc" */
	if (!parent && xlate_proc_name(name, &parent, &leaf) != 0)
		return NULL;
	leaf_len = strlen(leaf);

	dir_ent = kmalloc(sizeof(struct proc_dir_entry) + leaf_len + 1, GFP_KERNEL);
	if (!dir_ent)
		return NULL;
	memset(dir_ent, 0, sizeof(struct proc_dir_entry));

	stored_name = ((char *) dir_ent) + sizeof(*dir_ent);
	memcpy(stored_name, leaf, leaf_len + 1);
	dir_ent->name = stored_name;
	dir_ent->namelen = leaf_len;
	dir_ent->mode = S_IFDIR | S_IRUGO | S_IXUGO;
	dir_ent->nlink = 2;
	dir_ent->proc_fops = &proc_dir_operations;
	dir_ent->proc_iops = &proc_dir_inode_operations;

	/* link the "ldisc" entry into the child list of the "tty" entry */
	proc_register(parent, dir_ent);
	return dir_ent;
}

/*
 * Split a /proc-relative path into its parent directory entry and its
 * final component.
 *
 * Returns 0 on success, with *ret set to the entry of the last directory
 * walked and *residual pointing at the final component inside name.
 * Returns -ENOENT when a component has no matching child; *ret and
 * *residual are not written in that case.
 *
 * Worked example: name is "tty/ldisc".
 */
static int xlate_proc_name(const char *name,
			   struct proc_dir_entry **ret, const char **residual)
{
	const char     		*cp = name, *next;
	struct proc_dir_entry	*de;
	int			len;

	de = &proc_root;
	while (1) {
		next = strchr(cp, '/');	/* first pass: points at the '/' before "ldisc" */
		if (!next)
			break;

		len = next - cp;	/* length of "tty"; cp still points at "tty" */
		for (de = de->subdir; de ; de = de->next) {
			/* scan the children of the current entry until "tty" matches */
			if (proc_match(len, cp, de))
				break;	/* leaves the for loop only */
		}
		if (!de)
			return -ENOENT;
		cp += len + 1;	/* cp now points at "ldisc" */
	}
	*residual = cp;	/* "ldisc" */
	*ret = de;	/* the entry of "tty" */
	return 0;
}

四、这个场景是对/proc/loadavg的访问，这个文件提供有关系统在过去1分钟、5分钟和15分钟内的平均负荷的统计信息。这个文件只支持读操作，其proc_dir_entry结构是在proc_misc_init()中通过create_proc_read_entry()创建的。

首先调用open("/proc/loadavg")，具体过程请参考Linux内核源代码情景分析-文件的打开，open_namei里面这部分会有些不同：

 /* Excerpt from open_namei(): with LOOKUP_PARENT the walk stops at the parent of the last component */
 if (path_init(pathname, LOOKUP_PARENT, nd))  
        error = path_walk(pathname, nd);//locate the parent node

找到"/proc/loadavg"的父节点，也就是/proc节点。参考Linux内核源代码情景分析-文件系统安装后的访问，这里会判断该节点是否是挂载点：while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry))，通过这个循环找到proc文件系统根节点的dentry结构。

然后再调用dentry = lookup_hash(&nd->last, nd->dentry)，nd->last就是下一个节点名"loadavg"。这个函数先通过cached_lookup()看看下一个节点的dentry结构是否已经建立在内存中，如果没有就要通过real_lookup()从设备上读入该节点的目录项(以及索引节点)并在内存中为之创建起它的dentry结构。

/*
 * Look name up in the directory parent with the directory's i_sem held.
 *
 * If the dentry is still not cached once the semaphore is held, a new
 * dentry is allocated and the directory inode's lookup method is asked
 * to fill it in.  Otherwise the cached dentry is revalidated when its
 * d_op provides d_revalidate.
 *
 * Returns the dentry, the non-NULL value returned by the lookup method,
 * ERR_PTR(-ENOMEM) when no dentry can be allocated, or ERR_PTR(-ENOENT)
 * when a cached dentry fails revalidation and d_invalidate() returns 0.
 */
static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags)
{
	struct dentry * result;
	struct inode *dir = parent->d_inode;

	down(&dir->i_sem);
	/*
	 * First re-do the cached lookup just in case it was created
	 * while we waited for the directory semaphore..
	 *
	 * FIXME! This could use version numbering or similar to
	 * avoid unnecessary cache lookups.
	 */
	result = d_lookup(parent, name);
	if (!result) {
		struct dentry * dentry = d_alloc(parent, name);
		result = ERR_PTR(-ENOMEM);
		if (dentry) {
			lock_kernel();
			/* for the /proc root directory this is proc_root_lookup() */
			result = dir->i_op->lookup(dir, dentry);
			unlock_kernel();
			/* a non-NULL result replaces the new dentry, which is dropped */
			if (result)
				dput(dentry);
			else
				result = dentry;
		}
		up(&dir->i_sem);
		return result;
	}

	/*
	 * Uhhuh! Nasty case: the cache was re-populated while
	 * we waited on the semaphore. Need to revalidate.
	 */
	up(&dir->i_sem);
	if (result->d_op && result->d_op->d_revalidate) {
		if (!result->d_op->d_revalidate(result, flags) && !d_invalidate(result)) {
			dput(result);
			result = ERR_PTR(-ENOENT);
		}
	}
	return result;
}

对于/proc根节点的inode结构中的i_op指针指向proc_root_inode_operations，这是在proc_get_inode中设置的，如下：

			/* Excerpt from proc_get_inode(): the inode takes its operations from the directory entry */
			if (de->proc_iops)
				inode->i_op = de->proc_iops;//proc_root_inode_operations
			if (de->proc_fops)
				inode->i_fop = de->proc_fops;//proc_root_operations

/* Inode operations of the /proc root directory: only lookup is provided. */
static struct inode_operations proc_root_inode_operations = {
	.lookup	= proc_root_lookup,
};

dir->i_op->lookup执行的代码是proc_root_lookup，代码如下：

/*
 * Lookup method of the /proc root directory.
 *
 * Refreshes the root's link count from proc_root.nlink plus nr_threads,
 * then tries the registered proc entries through proc_lookup(); when
 * that does not succeed the name is handed to proc_pid_lookup().
 */
static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry)
{
	/* check for safety... */
	if (dir->i_ino == PROC_ROOT_INO)
		dir->i_nlink = proc_root.nlink + nr_threads;

	/* proc_lookup() returns NULL when it found and attached an inode */
	if (proc_lookup(dir, dentry) == NULL)
		return NULL;

	return proc_pid_lookup(dir, dentry);
}

struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry)
{
	struct inode *inode;
	struct proc_dir_entry * de;
	int error;

	error = -ENOENT;
	inode = NULL;
	de = (struct proc_dir_entry *) dir->u.generic_ip;
	if (de) {
		for (de = de->subdir; de ; de = de->next) {
			if (!de || !de->low_ino)
				continue;
			if (de->namelen != dentry->d_name.len)
				continue;
			if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {//找到loadavg节点的proc_dir_entry结构
				int ino = de->low_ino;
				error = -EINVAL;
				inode = proc_get_inode(dir->i_sb, ino, de);//根据loadavg节点的proc_dir_entry结构得到loadavg节点的inode结构
				break;
			}
		}
	}

	if (inode) {
		dentry->d_op = &proc_dentry_operations;
		d_add(dentry, inode);
		return NULL;
	}
	return ERR_PTR(error);
}

/*
 * Obtain the inode numbered ino on sb and initialise it from the proc
 * directory entry de (here: the entry of "loadavg").
 *
 * Returns the inode, or NULL when iget() fails; in that case the
 * reference taken on de is dropped again.
 */
struct inode * proc_get_inode(struct super_block * sb, int ino,
				struct proc_dir_entry * de)
{
	struct inode * inode;

	/*
	 * Increment the use count so the dir entry can't disappear.
	 */
	de_get(de);
#if 1
/* shouldn't ever happen */
if (de && de->deleted)
printk("proc_iget: using deleted entry %s, count=%d\n", de->name, atomic_read(&de->count));
#endif

	inode = iget(sb, ino);
	if (!inode)
		goto out_fail;
	
	inode->u.generic_ip = (void *) de;
	if (de) {
		if (de->mode) {
			inode->i_mode = de->mode;
			inode->i_uid = de->uid;
			inode->i_gid = de->gid;
		}
		if (de->size)
			inode->i_size = de->size;
		if (de->nlink)
			inode->i_nlink = de->nlink;
		if (de->owner)
			__MOD_INC_USE_COUNT(de->owner);
		if (S_ISBLK(de->mode)||S_ISCHR(de->mode)||S_ISFIFO(de->mode))
			init_special_inode(inode,de->mode,kdev_t_to_nr(de->rdev));
		else {
			if (de->proc_iops)//NULL for the "loadavg" entry
				inode->i_op = de->proc_iops;
			if (de->proc_fops)
				inode->i_fop = de->proc_fops;//for "loadavg": &proc_file_operations, the default proc_register() gives regular files
		}
	}

out:
	return inode;

out_fail:
	/* iget() failed: drop the reference taken above; inode is NULL */
	de_put(de);
	goto out;
}

open("/proc/loadavg")，执行完open_namei，继续执行dentry_open。

/*
 * Build a struct file for dentry on mount mnt and call the file's open
 * method when there is one.
 *
 * Returns the file on success.  On failure the references on dentry and
 * mnt are dropped and an ERR_PTR is returned: -ENFILE when no file
 * structure is available, otherwise the error from get_write_access()
 * or from the open method.
 */
struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
{
	struct file * f;
	struct inode *inode;
	int error;

	error = -ENFILE;
	f = get_empty_filp();//obtain an unused file structure
	if (!f)
		goto cleanup_dentry;
	f->f_flags = flags;
	f->f_mode = (flags+1) & O_ACCMODE;
	inode = dentry->d_inode;
	if (f->f_mode & FMODE_WRITE) {
		error = get_write_access(inode);
		if (error)
			goto cleanup_file;
	}

	f->f_dentry = dentry;//dentry of the node being opened
	f->f_vfsmnt = mnt;//vfsmount of the node being opened
	f->f_pos = 0;
	f->f_reada = 0;
	f->f_op = fops_get(inode->i_fop);//f_op comes from inode->i_fop; for /proc/loadavg that is proc_file_operations
	if (inode->i_sb)
		file_move(f, &inode->i_sb->s_files);//move f onto the s_files list of the inode's super_block
	if (f->f_op && f->f_op->open) {
		error = f->f_op->open(inode,f);
		if (error)
			goto cleanup_all;
	}
	/* these flags only matter while opening */
	f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);

	return f;

cleanup_all:
	fops_put(f->f_op);
	if (f->f_mode & FMODE_WRITE)
		put_write_access(inode);
	f->f_dentry = NULL;
	f->f_vfsmnt = NULL;
cleanup_file:
	put_filp(f);
cleanup_dentry:
	dput(dentry);
	mntput(mnt);
	return ERR_PTR(error);
}

然后调用read()，进入到内核态，如下：

/*
 * read() system call: look up the file for fd and pass the request to
 * the read method of its file_operations.
 *
 * Returns the value of the read method.  Returns -EBADF when fd has no
 * file or the file is not open for reading, the locks_verify_area()
 * error when that check fails, and -EINVAL when there is no read method.
 */
asmlinkage ssize_t sys_read(unsigned int fd, char * buf, size_t count)
{
	ssize_t ret;
	struct file * file;

	ret = -EBADF;
	file = fget(fd);
	if (file) {
		if (file->f_mode & FMODE_READ) {
			ret = locks_verify_area(FLOCK_VERIFY_READ, file->f_dentry->d_inode,
						file, file->f_pos, count);
			if (!ret) {
				ssize_t (*read)(struct file *, char *, size_t, loff_t *);
				ret = -EINVAL;
				if (file->f_op && (read = file->f_op->read) != NULL)
					ret = read(file, buf, count, &file->f_pos);//for /proc/loadavg: proc_file_read
			}
		}
		/* a read that returned data is reported on the parent directory */
		if (ret > 0)
			inode_dir_notify(file->f_dentry->d_parent->d_inode,
				DN_ACCESS);
		fput(file);
	}
	return ret;
}

对于proc文件系统来说，file->f_op指向了proc_file_operations结构(见dentry_open里面的说明)，代码如下：

/* Default file operations for regular proc files. */
static struct file_operations proc_file_operations = {
	.llseek	= proc_file_lseek,
	.read	= proc_file_read,
	.write	= proc_file_write,
};

/*
 * read method for regular proc files.
 *
 * Repeatedly asks the entry's get_info or read_proc callback to produce
 * data in a scratch page and copies it to the user buffer, until nbytes
 * have been delivered, the callback signals end of file, or an error
 * occurs.
 *
 * Returns the number of bytes copied, or -ENOMEM when no page is
 * available.  When nothing has been copied yet, a negative callback
 * result is returned as is and a completely failed copy_to_user()
 * yields -EFAULT.
 */
static ssize_t
proc_file_read(struct file * file, char * buf, size_t nbytes, loff_t *ppos)
{
	struct inode * inode = file->f_dentry->d_inode;
	char 	*page;
	ssize_t	retval=0;
	int	eof=0;
	ssize_t	n, count;
	char	*start;
	struct proc_dir_entry * dp;

	dp = (struct proc_dir_entry *) inode->u.generic_ip;//the proc_dir_entry stored in the inode, e.g. that of "loadavg"
	if (!(page = (char*) __get_free_page(GFP_KERNEL)))
		return -ENOMEM;

	while ((nbytes > 0) && !eof)
	{
		count = MIN(PROC_BLOCK_SIZE, nbytes);

		start = NULL;
		if (dp->get_info) {
			/*
			 * Handle backwards compatibility with the old net
			 * routines.
			 */
			n = dp->get_info(page, &start, *ppos, count);
			if (n < count)
				eof = 1;
		} else if (dp->read_proc) {
			n = dp->read_proc(page, &start, *ppos, //for "loadavg": loadavg_read_proc
					  count, &eof, dp->data);//the callback writes its text into page
		} else
			break;

		if (!start) {
			/*
			 * For proc files that are less than 4k
			 */
			start = page + *ppos;
			n -= *ppos;
			if (n <= 0)
				break;
			if (n > count)
				n = count;
		}
		if (n == 0)
			break;	/* End of file */
		if (n < 0) {
			if (retval == 0)
				retval = n;
			break;
		}
		
		/* This is a hack to allow mangling of file pos independent
 		 * of actual bytes read.  Simply place the data at page,
 		 * return the bytes, and set `start' to the desired offset
 		 * as an unsigned int.  (The author's e-mail address was
 		 * lost to the web page's address obfuscation.)
		 */
 		n -= copy_to_user(buf, start < page ? page : start, n);//hand the data to user space; n becomes the number of bytes copied
		if (n == 0) {
			if (retval == 0)
				retval = -EFAULT;
			break;
		}

		*ppos += start < page ? (long)start : n; /* Move down the file */
		nbytes -= n;
		buf += n;
		retval += n;
	}
	free_page((unsigned long) page);
	return retval;
}

在前面代码中，设置了dp->read_proc，如下：

/*
 * Create a proc entry via create_proc_entry() and attach a read_proc
 * callback together with its private data.
 *
 * Worked example used in this walkthrough: name "loadavg", mode 0,
 * base NULL, read_proc loadavg_read_proc, data NULL.
 *
 * Returns the new entry, or NULL when create_proc_entry() fails.
 */
extern inline struct proc_dir_entry *create_proc_read_entry(const char *name,
	mode_t mode, struct proc_dir_entry *base, 
	read_proc_t *read_proc, void * data)
{
	struct proc_dir_entry *entry = create_proc_entry(name, mode, base);

	if (!entry)
		return NULL;
	/* this is where dp->read_proc, used later by proc_file_read(), gets set */
	entry->read_proc = read_proc;
	entry->data = data;
	return entry;
}

所以dp->read_proc，执行代码如下：

/*
 * read_proc callback of /proc/loadavg.
 *
 * Formats the three avenrun[] values (per the surrounding article: the
 * load averages over the last 1, 5 and 15 minutes), followed by
 * nr_running/nr_threads and last_pid, into page, then lets
 * proc_calc_metrics() compute the window requested by off and count.
 */
static int loadavg_read_proc(char *page, char **start, off_t off,
				 int count, int *eof, void *data)
{
	int load1, load5, load15;
	int written;

	/* FIXED_1/200 is added before the values are split by LOAD_INT/LOAD_FRAC */
	load1 = avenrun[0] + (FIXED_1/200);
	load5 = avenrun[1] + (FIXED_1/200);
	load15 = avenrun[2] + (FIXED_1/200);

	written = sprintf(page,"%d.%02d %d.%02d %d.%02d %d/%d %d\n",
		LOAD_INT(load1), LOAD_FRAC(load1),
		LOAD_INT(load5), LOAD_FRAC(load5),
		LOAD_INT(load15), LOAD_FRAC(load15),
		nr_running, nr_threads, last_pid);

	return proc_calc_metrics(page, start, off, count, eof, written);
}

/*
 * Work out which part of a len-byte text held in page answers a read of
 * count bytes at offset off.
 *
 * Sets *eof to 1 when the request reaches the end of the text
 * (len <= off + count) and leaves it untouched otherwise, sets *start to
 * page + off, and returns the number of bytes available from off,
 * clamped to the range 0..count.
 */
static int proc_calc_metrics(char *page, char **start, off_t off,
				 int count, int *eof, int len)
{
	int avail = len - off;

	if (len <= off + count)
		*eof = 1;
	*start = page + off;

	if (avail > count)
		avail = count;
	if (avail < 0)
		avail = 0;
	return avail;
}

它的作用就是将数组avenrun[]中积累的在过去1分钟、5分钟以及15分钟内的系统平均CPU负荷等统计信息通过sprintf()“打印”到缓冲区页面中。这些平均负荷的数值是每隔5秒钟在时钟中断服务程序中进行计算的，统计信息中还包括系统当前处于可运行状态的进程个数nr_running以及系统中进程的总数nr_threads，还有系统中已分配使用的最大进程号last_pid。

郑重声明：本站内容如果来自互联网及其他传播媒体，其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享，并不代表本站赞同其观点和对其真实性负责，也不构成任何其他建议。

Linux内核源代码情景分析-特殊文件系统/proc