Linux Driver Development Study Notes - 4
【Block Device Drivers】
A Linux system mainly contains character devices, network devices, and block devices; within the Linux kernel, I/O devices are divided into two broad categories: character devices and block devices. A block device stores data in fixed-size blocks, each of which has its own address. Block sizes typically range from 512 bytes to 4 KB.
(Figure: the relationship between block devices and the file system.)
The physical structure of a block device consists of sectors, tracks, cylinders, and platters. The sector is the basic unit in which the hardware transfers data; it is usually 512 bytes, though larger sizes (multiples of 512 bytes) also exist. In the Linux kernel, however, the logical sector size has always been fixed at 512 bytes.
Memory is a linear structure that Linux divides into pages; depending on the architecture, a page is between 4 KB and 64 KB. When data is transferred between memory and disk, the data within a page is first packaged into a segment, and the kernel reads and writes the disk with segments as the basic unit. A segment is represented by a bio_vec; data spanning several pages is packaged into several segments, which together form an array of bio_vec elements, bi_io_vec. bi_io_vec is a pointer inside the block I/O structure bio; several bios are combined into a request, and requests are linked into the request queue, request_queue. Finally the request queue is processed and the data is written to disk. (Figure: the relationship among pages, segments, bio, request, and request_queue.)
To summarize: sector (512 bytes) <= block <= segment <= page (typically 4096 bytes), where a block is an integer number of sectors and a segment is an integer number of blocks. For example, with 1 KB blocks, one block spans two 512-byte sectors and a 4 KB page holds four such blocks.
The architecture of a block device driver:
Block device load (registration) sequence:
Allocate the disk with alloc_disk() -----> register the device with register_blkdev() -----> set up a request queue with blk_init_queue() (or, for drivers that bypass the request queue and handle bios directly, allocate a bare queue with blk_alloc_queue()) -----> set the gendisk attributes -----> activate the disk with add_disk().
Block device unload sequence:
Remove the gendisk with del_gendisk() -----> drop the gendisk reference with put_disk() -----> clean up the request queue with blk_cleanup_queue() -----> unregister the block device with unregister_blkdev(). A minimal sketch combining both sequences follows.
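Putting the two sequences together, the following is a minimal, hedged sketch of a module skeleton written against the roughly 2.6.24-era API used throughout these notes. All xxx_* names, the device name "xxx" and the capacity constant are hypothetical placeholders, not part of any real driver; the request function is only a stub here and is expanded in the request-queue section further below.

#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>
#include <linux/spinlock.h>
#include <linux/string.h>

#define XXX_NSECTORS 2048                      /* hypothetical capacity: 2048 * 512 bytes */

static int xxx_major;
static struct gendisk *xxx_disk;
static struct request_queue *xxx_queue;
static DEFINE_SPINLOCK(xxx_lock);

/* Stub request function; a fuller sketch appears after the request_queue discussion. */
static void xxx_request(struct request_queue *q)
{
    struct request *req;

    while ((req = elv_next_request(q)) != NULL)
        end_request(req, 1);                   /* complete everything without doing real I/O */
}

/* Filled in properly in the block_device_operations sketch below. */
static struct block_device_operations xxx_fops = {
    .owner = THIS_MODULE,
};

static int __init xxx_init(void)
{
    xxx_disk = alloc_disk(16);                 /* minor 0 = whole disk, up to 15 partitions */
    if (!xxx_disk)
        return -ENOMEM;

    xxx_major = register_blkdev(0, "xxx");     /* 0: let the kernel pick a major number */
    if (xxx_major <= 0) {
        put_disk(xxx_disk);
        return -EBUSY;
    }

    xxx_queue = blk_init_queue(xxx_request, &xxx_lock);   /* queued-I/O path */
    if (!xxx_queue)
        goto out;

    strcpy(xxx_disk->disk_name, "xxx");
    xxx_disk->major = xxx_major;
    xxx_disk->first_minor = 0;
    xxx_disk->fops = &xxx_fops;
    xxx_disk->queue = xxx_queue;
    set_capacity(xxx_disk, XXX_NSECTORS);      /* capacity in 512-byte sectors */

    add_disk(xxx_disk);                        /* the disk goes live here */
    return 0;

out:
    unregister_blkdev(xxx_major, "xxx");
    put_disk(xxx_disk);
    return -ENOMEM;
}

static void __exit xxx_exit(void)
{
    del_gendisk(xxx_disk);                     /* remove the gendisk */
    put_disk(xxx_disk);                        /* drop its reference count */
    blk_cleanup_queue(xxx_queue);              /* tear down the request queue */
    unregister_blkdev(xxx_major, "xxx");       /* release the major number */
}

module_init(xxx_init);
module_exit(xxx_exit);
MODULE_LICENSE("GPL");

blk_init_queue() is the path that goes through the request queue and the I/O scheduler; a driver that wants to bypass queuing (a RAM disk, for instance) would instead call blk_alloc_queue() and install its own bio handler with blk_queue_make_request().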
The generic block layer is the core of block device driver support; it contains the code shared by all block device drivers.
The generic block layer data structures used during block device loading include gendisk, request_queue, request, bio, and block_device_operations.
In the Linux kernel, a gendisk represents a disk; it can also represent a partition.
struct gendisk {
    int major;                      /* major number of driver */
    int first_minor;
    int minors;                     /* maximum number of minors, =1 for
                                     * disks that can't be partitioned. */
    char disk_name[32];             /* name of major driver */
    struct hd_struct **part;        /* [indexed by minor] */
    int part_uevent_suppress;
    struct block_device_operations *fops;
    struct request_queue *queue;
    void *private_data;
    sector_t capacity;

    int flags;
    struct device *driverfs_dev;
    struct kobject kobj;
    struct kobject *holder_dir;
    struct kobject *slave_dir;

    struct timer_rand_state *random;
    int policy;

    atomic_t sync_io;               /* RAID */
    unsigned long stamp;
    int in_flight;
#ifdef CONFIG_SMP
    struct disk_stats *dkstats;
#else
    struct disk_stats dkstats;
#endif
    struct work_struct async_notify;
};
gendisk is a dynamic structure whose members change with the state of the system, so it must not be allocated statically; the kernel provides the dedicated function alloc_disk() to allocate it:
struct gendisk *alloc_disk(int minors)
{
    return alloc_disk_node(minors, -1);
}

struct gendisk *alloc_disk_node(int minors, int node_id)
{
    struct gendisk *disk;

    disk = kmalloc_node(sizeof(struct gendisk),
                        GFP_KERNEL | __GFP_ZERO, node_id);
    if (disk) {
        if (!init_disk_stats(disk)) {
            kfree(disk);
            return NULL;
        }
        if (minors > 1) {
            int size = (minors - 1) * sizeof(struct hd_struct *);
            disk->part = kmalloc_node(size,
                                      GFP_KERNEL | __GFP_ZERO, node_id);
            if (!disk->part) {
                free_disk_stats(disk);
                kfree(disk);
                return NULL;
            }
        }
        disk->minors = minors;
        kobj_set_kset_s(disk, block_subsys);
        kobject_init(&disk->kobj);
        rand_initialize_disk(disk);
        INIT_WORK(&disk->async_notify,
                  media_change_notify_thread);
    }
    return disk;
}

After allocating a disk with alloc_disk(), the gendisk attributes need to be set:
strcpy(xxx_disk->disk_name, xxx_DISKNAME);  // set the device name
xxx_disk->major = xxx_MAJOR;                // set the major number
xxx_disk->first_minor = 0;                  // set the first minor number
xxx_disk->fops = &xxx_fops;                 // block device operation functions
xxx_disk->queue = xxx_queue;                // attach the request queue
set_capacity(xxx_disk, xxx_BYTES >> 9);     // set the capacity, in 512-byte sectors

After that, add_disk() can be used to activate the disk device in the system:
void add_disk(struct gendisk *disk)
{
    disk->flags |= GENHD_FL_UP;
    blk_register_region(MKDEV(disk->major, disk->first_minor),
                        disk->minors, NULL, exact_match, exact_lock, disk);
    register_disk(disk);
    blk_register_queue(disk);
}

When the disk is no longer needed, del_gendisk() should be used to remove the gendisk structure, followed by put_disk() to decrease the gendisk reference count, and finally unregister_blkdev() to unregister the block device.
Block devices have a structure, block_device_operations, that corresponds to file_operations for character devices; it is the set of functions that operate on the block device.
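Before the structure's definition (shown next), here is a hedged sketch of how a driver might fill it in. The xxx_* handlers and the geometry values are hypothetical; the prototypes follow the 2.6.24 form, where open() and release() still take struct inode * and struct file * arguments.

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>
#include <linux/hdreg.h>

static int xxx_open(struct inode *inode, struct file *filp)
{
    /* driver-private data would have been stored in xxx_disk->private_data */
    filp->private_data = inode->i_bdev->bd_disk->private_data;
    return 0;
}

static int xxx_release(struct inode *inode, struct file *filp)
{
    return 0;                                  /* nothing to clean up in this sketch */
}

/* Invent a geometry so that tools such as fdisk have something to show. */
static int xxx_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
    geo->heads     = 4;
    geo->sectors   = 16;
    /* 4 heads * 16 sectors = 64 sectors per cylinder, so shift instead of dividing */
    geo->cylinders = get_capacity(bdev->bd_disk) >> 6;
    geo->start     = 0;
    return 0;
}

static struct block_device_operations xxx_fops = {
    .owner   = THIS_MODULE,
    .open    = xxx_open,
    .release = xxx_release,
    .getgeo  = xxx_getgeo,
};

The structure itself, as listed in these notes (a 2.6.24-era kernel):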
struct block_device_operations {
    int (*open) (struct inode *, struct file *);
    int (*release) (struct inode *, struct file *);
    int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long);
    long (*unlocked_ioctl) (struct file *, unsigned, unsigned long);
    long (*compat_ioctl) (struct file *, unsigned, unsigned long);
    int (*direct_access) (struct block_device *, sector_t, unsigned long *);
    int (*media_changed) (struct gendisk *);
    int (*revalidate_disk) (struct gendisk *);
    int (*getgeo)(struct block_device *, struct hd_geometry *);
    struct module *owner;
};

Moving data from memory to disk, or from disk to memory, is called an I/O operation. The kernel describes an I/O operation with the bio structure, which contains everything a block device needs to complete one I/O operation. A bio can be thought of as describing data in several contiguous pages of memory; the data within each page is represented by a segment (a bio_vec), so the segments of several pages together form the bi_io_vec array.
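To make the page -> bio_vec -> bio path concrete, here is a hedged sketch of allocating, filling, and submitting a single-page bio against the roughly 2.6.24-era interface (where bi_end_io takes the bio and an error code). xxx_read_page, xxx_bio_done and the already-opened block device passed in are hypothetical, and error handling is kept to a minimum.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/completion.h>

/* Completion callback: runs when the block layer finishes the I/O. */
static void xxx_bio_done(struct bio *bio, int error)
{
    /* a real caller would also look at error / BIO_UPTODATE here */
    complete((struct completion *)bio->bi_private);
}

/* Read PAGE_SIZE bytes starting at 'sector' into 'page'. */
static int xxx_read_page(struct block_device *bdev, sector_t sector,
                         struct page *page)
{
    DECLARE_COMPLETION_ONSTACK(done);
    struct bio *bio;

    bio = bio_alloc(GFP_KERNEL, 1);            /* room for one bio_vec (one segment) */
    if (!bio)
        return -ENOMEM;

    bio->bi_bdev    = bdev;                    /* target block device */
    bio->bi_sector  = sector;                  /* start address in 512-byte sectors */
    bio->bi_end_io  = xxx_bio_done;            /* completion callback */
    bio->bi_private = &done;

    /* attach the page as the bio's single segment (bio_vec) */
    if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE) {
        bio_put(bio);
        return -EIO;
    }

    submit_bio(READ, bio);                     /* hand the bio to the block layer */
    wait_for_completion(&done);                /* woken from xxx_bio_done() */
    bio_put(bio);
    return 0;
}

The bio structure itself, as listed in these notes: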
struct bio {
    sector_t bi_sector;             /* device address in 512 byte sectors */
    struct bio *bi_next;            /* request queue link */
    struct block_device *bi_bdev;
    unsigned long bi_flags;         /* status, command, etc */
    unsigned long bi_rw;            /* bottom bits READ/WRITE,
                                     * top bits priority
                                     */

    unsigned short bi_vcnt;         /* how many bio_vec's */
    unsigned short bi_idx;          /* current index into bvl_vec */

    /* Number of segments in this BIO after
     * physical address coalescing is performed.
     */
    unsigned short bi_phys_segments;

    /* Number of segments after physical and DMA remapping
     * hardware coalescing is performed.
     */
    unsigned short bi_hw_segments;

    unsigned int bi_size;           /* residual I/O count */

    /*
     * To keep track of the max hw size, we account for the
     * sizes of the first and last virtually mergeable segments
     * in this bio
     */
    unsigned int bi_hw_front_size;
    unsigned int bi_hw_back_size;

    unsigned int bi_max_vecs;       /* max bvl_vecs we can hold */

    struct bio_vec *bi_io_vec;      /* the actual vec list */

    bio_end_io_t *bi_end_io;
    atomic_t bi_cnt;                /* pin count */

    void *bi_private;

    bio_destructor_t *bi_destructor; /* destructor */
};

A segment inside a bio is represented by a bio_vec:
struct bio_vec {
    struct page *bv_page;
    unsigned int bv_len;
    unsigned int bv_offset;
};

Several contiguous pages make up one bio structure, and several adjacent bios make up one request structure:
struct request {
    struct list_head queuelist;
    struct list_head donelist;

    struct request_queue *q;

    unsigned int cmd_flags;
    enum rq_cmd_type_bits cmd_type;

    /* Maintain bio traversal state for part by part I/O submission.
     * hard_* are block layer internals, no driver should touch them!
     */

    sector_t sector;                /* next sector to submit */
    sector_t hard_sector;           /* next sector to complete */
    unsigned long nr_sectors;       /* no. of sectors left to submit */
    unsigned long hard_nr_sectors;  /* no. of sectors left to complete */
    /* no. of sectors left to submit in the current segment */
    unsigned int current_nr_sectors;

    /* no. of sectors left to complete in the current segment */
    unsigned int hard_cur_sectors;

    struct bio *bio;
    struct bio *biotail;

    struct hlist_node hash;         /* merge hash */
    /*
     * The rb_node is only used inside the io scheduler, requests
     * are pruned when moved to the dispatch queue. So let the
     * completion_data share space with the rb_node.
     */
    union {
        struct rb_node rb_node;     /* sort/lookup */
        void *completion_data;
    };

    /*
     * two pointers are available for the IO schedulers, if they need
     * more they have to dynamically allocate it.
     */
    void *elevator_private;
    void *elevator_private2;

    struct gendisk *rq_disk;
    unsigned long start_time;

    /* Number of scatter-gather DMA addr+len pairs after
     * physical address coalescing is performed.
     */
    unsigned short nr_phys_segments;

    /* Number of scatter-gather addr+len pairs after
     * physical and DMA remapping hardware coalescing is performed.
     * This is the number of scatter-gather entries the driver
     * will actually have to deal with after DMA mapping is done.
     */
    unsigned short nr_hw_segments;

    unsigned short ioprio;

    void *special;
    char *buffer;

    int tag;
    int errors;

    int ref_count;

    /*
     * when request is used as a packet command carrier
     */
    unsigned int cmd_len;
    unsigned char cmd[BLK_MAX_CDB];

    unsigned int data_len;
    unsigned int sense_len;
    void *data;
    void *sense;

    unsigned int timeout;
    int retries;

    /*
     * completion callback.
     */
    rq_end_io_fn *end_io;
    void *end_io_data;

    /* for bidi */
    struct request *next_rq;
};

Each request contains multiple bio structures. Every block device driver maintains its own request queue, request_queue, whose main job is to link multiple request structures together. The kernel implements the request queue as a doubly linked list in which each element is a request structure; the queue also designates the algorithm (the I/O scheduler) used to sort the requests.
struct request_queue
{
    /*
     * Together with queue_head for cacheline sharing
     */
    struct list_head queue_head;
    struct request *last_merge;
    elevator_t *elevator;

    /*
     * the queue request freelist, one for reads and one for writes
     */
    struct request_list rq;

    request_fn_proc *request_fn;
    make_request_fn *make_request_fn;
    prep_rq_fn *prep_rq_fn;
    unplug_fn *unplug_fn;
    merge_bvec_fn *merge_bvec_fn;
    prepare_flush_fn *prepare_flush_fn;
    softirq_done_fn *softirq_done_fn;

    /*
     * Dispatch queue sorting
     */
    sector_t end_sector;
    struct request *boundary_rq;

    /*
     * Auto-unplugging state
     */
    struct timer_list unplug_timer;
    int unplug_thresh;              /* After this many requests */
    unsigned long unplug_delay;     /* After this many jiffies */
    struct work_struct unplug_work;

    struct backing_dev_info backing_dev_info;

    /*
     * The queue owner gets to use this for whatever they like.
     * ll_rw_blk doesn't touch it.
     */
    void *queuedata;

    /*
     * queue needs bounce pages for pages above this limit
     */
    unsigned long bounce_pfn;
    gfp_t bounce_gfp;

    /*
     * various queue flags, see QUEUE_* below
     */
    unsigned long queue_flags;

    /*
     * protects queue structures from reentrancy. ->__queue_lock should
     * _never_ be used directly, it is queue private. always use
     * ->queue_lock.
     */
    spinlock_t __queue_lock;
    spinlock_t *queue_lock;

    /*
     * queue kobject
     */
    struct kobject kobj;

    /*
     * queue settings
     */
    unsigned long nr_requests;      /* Max # of requests */
    unsigned int nr_congestion_on;
    unsigned int nr_congestion_off;
    unsigned int nr_batching;

    unsigned int max_sectors;
    unsigned int max_hw_sectors;
    unsigned short max_phys_segments;
    unsigned short max_hw_segments;
    unsigned short hardsect_size;
    unsigned int max_segment_size;

    unsigned long seg_boundary_mask;
    unsigned int dma_alignment;

    struct blk_queue_tag *queue_tags;
    struct list_head tag_busy_list;

    unsigned int nr_sorted;
    unsigned int in_flight;

    /*
     * sg stuff
     */
    unsigned int sg_timeout;
    unsigned int sg_reserved_size;
    int node;
#ifdef CONFIG_BLK_DEV_IO_TRACE
    struct blk_trace *blk_trace;
#endif
    /*
     * reserved for flush operations
     */
    unsigned int ordered, next_ordered, ordseq;
    int orderr, ordcolor;
    struct request pre_flush_rq, bar_rq, post_flush_rq;
    struct request *orig_bar_rq;

    struct mutex sysfs_lock;

#if defined(CONFIG_BLK_DEV_BSG)
    struct bsg_class_device bsg_dev;
#endif
};

(Figure: the relationship among the request queue, request structures, and bios.)
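To show the other end of the chain, how a driver drains its request queue, here is a hedged sketch of a request function in the 2.6.24 style, expanding the stub used in the load/unload sketch earlier. xxx_transfer() is a hypothetical helper that would move data between req->buffer and the hardware.

#include <linux/blkdev.h>
#include <linux/fs.h>

/* Hypothetical helper: copy nsect * 512 bytes between the device and buffer. */
static void xxx_transfer(sector_t sector, unsigned long nsect,
                         char *buffer, int write)
{
    /* device-specific data movement would go here */
}

static void xxx_request(struct request_queue *q)
{
    struct request *req;

    while ((req = elv_next_request(q)) != NULL) {
        if (!blk_fs_request(req)) {            /* skip non-filesystem requests */
            end_request(req, 0);
            continue;
        }
        /* req->sector, req->current_nr_sectors and req->buffer describe the
         * current chunk of the request (see struct request above). */
        xxx_transfer(req->sector, req->current_nr_sectors,
                     req->buffer, rq_data_dir(req));
        end_request(req, 1);                   /* 1 = this chunk completed OK */
    }
}

In this era of the API, end_request() completes only the current chunk; if the request has more sectors left, elv_next_request() hands back the same request on the next loop iteration with sector, current_nr_sectors and buffer advanced, which is why such a simple loop is enough for a basic driver.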
For a complete block device driver, refer to the Linux source code: http://lxr.free-electrons.com/source/drivers/block/nbd.c?v=2.6.24
Linux/drivers/block/nbd.c