Linux内核源代码情景分析-交换分区
在Linux内核源代码情景分析-共享内存中,共享内存,当内存紧张时是换出到交换分区。
在Linux内核源代码情景分析-mmap后,文件与虚拟区间建立映射中,文件映射的页面,当内存紧张时是换出到硬盘上的文件中。
这里的交换分区,就是是swap分区,记得给电脑安装ubuntu时,就有一项是swap分区。
交换分区和文件的区别是:
文件是在一个具体的文件系统之下的,交换分区没有这个必要,它可能是一个裸分区,不需要文件系统。
需要说明一点,并不是所有从物理内存中交换出来的数据都会被放到Swap中(如果这样的话,Swap就会不堪重负),有相当一部分数据被直接交换到文件系统。例如,有的程序会打开一些文件,对文件进行读写(其实每个程序都至少要打开一个文件,那就是运行程序本身),当需要将这些程序的内存空间交换出去时,就没有必要将文件部分的数据放到Swap空间中了,而可以直接将其放到文件里去。但是那些用malloc和new函数生成的对象的数据则不同,它们需要Swap空间,因为它们在文件系统中没有相应的“储备”文件,因此被称作“匿名”(Anonymous)内存数据。这类数据还包括堆栈中的一些状态和变量数据等。所以说,Swap空间是“匿名”数据的交换空间。
总结:
当内存紧张时:
共享内存区,malloc和new函数生成的对象的数据,堆栈中的一些状态和变量数据都被换出到swap分区。
文件映射的页面,换出到硬盘上的文件中。
现在来分析最后一种情况,页面中存放的是堆栈中的一些状态和变量数据。
一、堆栈页面的换入
由于pte_none(entry)为true,所以调用do_no_page()。
static inline int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct * vma, unsigned long address, int write_access, pte_t * pte) { pte_t entry; /* * We need the page table lock to synchronize with kswapd * and the SMP-safe atomic PTE updates. */ spin_lock(&mm->page_table_lock); entry = *pte; if (!pte_present(entry)) { /* * If it truly wasn‘t present, we know that kswapd * and the PTE updates will not touch it later. So * drop the lock. */ spin_unlock(&mm->page_table_lock); if (pte_none(entry)) return do_no_page(mm, vma, address, write_access, pte); return do_swap_page(mm, vma, address, pte, pte_to_swp_entry(entry), write_access); } if (write_access) { if (!pte_write(entry)) return do_wp_page(mm, vma, address, pte, entry); entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); establish_pte(vma, address, pte, entry); spin_unlock(&mm->page_table_lock); return 1; }
static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int write_access, pte_t *page_table) { struct page * new_page; pte_t entry; if (!vma->vm_ops || !vma->vm_ops->nopage)//由于nopage为NULL return do_anonymous_page(mm, vma, page_table, write_access, address); /* * The third argument is "no_share", which tells the low-level code * to copy, not share the page even if sharing is possible. It‘s * essentially an early COW detection. */ new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access); if (new_page == NULL) /* no page was available -- SIGBUS */ return 0; if (new_page == NOPAGE_OOM) return -1; ++mm->rss; /* * This silly early PAGE_DIRTY setting removes a race * due to the bad i386 page protection. But it‘s valid * for other architectures too. * * Note that if write_access is true, we either now have * an exclusive copy of the page, or this is a shared mapping, * so we can make it writable and dirty to avoid having to * handle that later. */ flush_page_to_ram(new_page); flush_icache_page(vma, new_page); entry = mk_pte(new_page, vma->vm_page_prot); if (write_access) { entry = pte_mkwrite(pte_mkdirty(entry)); } else if (page_count(new_page) > 1 && !(vma->vm_flags & VM_SHARED)) entry = pte_wrprotect(entry); set_pte(page_table, entry); /* no need to invalidate: a not-present page shouldn‘t be cached */ update_mmu_cache(vma, address, entry); return 2; /* Major fault */ }由于vma->vm_ops->nopage为NULL,所以执行do_anonymous_page,代码如下:
static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr) { struct page *page = NULL; pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); if (write_access) { page = alloc_page(GFP_HIGHUSER);//分配页面 if (!page) return -1; clear_user_highpage(page, addr); entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); mm->rss++; flush_page_to_ram(page); } set_pte(page_table, entry);//并建立映射 /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, entry); return 1; /* Minor fault */ }
换入后page并未加入到任何队列。
refill_inactive_scan和swap_out,把活跃的页面变成不活跃脏的页面。挑选的原则是最近没有被访问,且age小于0。这里的活跃页面是先加入到对应的队列中,再脱离。详见swap_out调用try_to_swap_out的代码分析。
page_launder,把不活跃脏的页面变成不活跃干净的页面。
try_to_swap_out,代码如下:
static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask) { pte_t pte; swp_entry_t entry; struct page * page; int onlist; pte = *page_table; if (!pte_present(pte))//物理页面是否在内存中 goto out_failed; page = pte_page(pte); if ((!VALID_PAGE(page)) || PageReserved(page)) goto out_failed; if (!mm->swap_cnt) return 1; mm->swap_cnt--;//被考察的页面数减1 onlist = PageActive(page); /* Don‘t look at this pte if it‘s been accessed recently. */ if (ptep_test_and_clear_young(page_table)) {//如果页面被访问过,那么直接out_failed age_page_up(page); goto out_failed; } if (!onlist) /* The page is still mapped, so it can‘t be freeable... */ age_page_down_ageonly(page); ...... if (page->age > 0)//如果页面的age不小于0,页out_failed goto out_failed; if (TryLockPage(page)) goto out_failed; ...... pte = ptep_get_and_clear(page_table);//走到这里,说明页面最近没有访问过,且age小于0,清空页目录项 flush_tlb_page(vma, address); ...... if (PageSwapCache(page)) {//page结构不在swapper_space队列中 entry.val = page->index; if (pte_dirty(pte)) set_page_dirty(page); set_swap_pte: swap_duplicate(entry); set_pte(page_table, swp_entry_to_pte(entry)); drop_pte: UnlockPage(page); mm->rss--; deactivate_page(page); page_cache_release(page); out_failed: return 0; } flush_cache_page(vma, address); if (!pte_dirty(pte))//不执行这里 goto drop_pte; /* * Ok, it‘s really dirty. That means that * we should either create a new swap cache * entry for it, or we should write it back * to its own backing store. */ if (page->mapping) {//也不执行这里 set_page_dirty(page); goto drop_pte; } /* * This is a dirty, swappable page. First of all, * get a suitable swap entry for it, and make sure * we have the swap cache set up to associate the * page with that swap entry. */ entry = get_swap_page();//执行这里,分配一个交换分区 if (!entry.val) goto out_unlock_restore; /* No swap space left */ /* Add it to the swap cache and mark it dirty */ add_to_swap_cache(page, entry);//把page加入到对应的队列 set_page_dirty(page); goto set_swap_pte;//跳转到set_swap_pte out_unlock_restore: set_pte(page_table, pte); UnlockPage(page); }add_to_swap_cache,代码如下:
void add_to_swap_cache(struct page *page, swp_entry_t entry) { unsigned long flags; #ifdef SWAP_CACHE_INFO swap_cache_add_total++; #endif if (!PageLocked(page)) BUG(); if (PageTestandSetSwapCache(page)) BUG(); if (page->mapping) BUG(); flags = page->flags & ~((1 << PG_error) | (1 << PG_arch_1)); page->flags = flags | (1 << PG_uptodate); add_to_page_cache_locked(page, &swapper_space, entry.val); }
void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index) { if (!PageLocked(page)) BUG(); page_cache_get(page); spin_lock(&pagecache_lock); page->index = index; add_page_to_inode_queue(mapping, page); add_page_to_hash_queue(page, page_hash(mapping, index)); lru_cache_add(page); spin_unlock(&pagecache_lock); }
此时:
page->list链入mapping->clean_pages;
page->next_hash和page->pprev_hash链入全局的Hash表;
page->lru链入了全局的active_list;
然后返回到try_to_swap_out,继续执行go set_swap_pte,继续执行如下:
set_swap_pte: swap_duplicate(entry); set_pte(page_table, swp_entry_to_pte(entry));//页目录项指向盘上数据块的地址 drop_pte: UnlockPage(page); mm->rss--; deactivate_page(page);//从active_list到inactive_list page_cache_release(page);此时:
page->list链入mapping->dirty_pages/clean_pages;
page->next_hash和page->pprev_hash链入全局的Hash表;
page->lru链入了全局的inactive_dirty_list;
三、堆栈页面恢复映射
1、对于不活跃脏的页面和不活跃干净的页面, 如果发生缺页中断,会调用do_swap_page()。
static inline int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct * vma, unsigned long address, int write_access, pte_t * pte) { pte_t entry; /* * We need the page table lock to synchronize with kswapd * and the SMP-safe atomic PTE updates. */ spin_lock(&mm->page_table_lock); entry = *pte; if (!pte_present(entry)) { /* * If it truly wasn‘t present, we know that kswapd * and the PTE updates will not touch it later. So * drop the lock. */ spin_unlock(&mm->page_table_lock); if (pte_none(entry))//有值 return do_no_page(mm, vma, address, write_access, pte); return do_swap_page(mm, vma, address, pte, pte_to_swp_entry(entry), write_access);//有值,所以会调用这个 } if (write_access) { if (!pte_write(entry)) return do_wp_page(mm, vma, address, pte, entry); entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); establish_pte(vma, address, pte, entry); spin_unlock(&mm->page_table_lock); return 1; }
因为页目录表有值,指向了交换分区,所以开始执行do_swap_page()。
static int do_swap_page(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, pte_t * page_table, swp_entry_t entry, int write_access) { struct page *page = lookup_swap_cache(entry); pte_t pte; if (!page) { lock_kernel(); swapin_readahead(entry); page = read_swap_cache(entry); unlock_kernel(); if (!page) return -1; flush_page_to_ram(page); flush_icache_page(vma, page); } mm->rss++; pte = mk_pte(page, vma->vm_page_prot); /* * Freeze the "shared"ness of the page, ie page_count + swap_count. * Must lock page before transferring our swap count to already * obtained page count. */ lock_page(page); swap_free(entry); if (write_access && !is_page_shared(page)) pte = pte_mkwrite(pte_mkdirty(pte)); UnlockPage(page); set_pte(page_table, pte); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); return 1; /* Minor fault */ }在do_swap_page中,lookup_swap_cache,会在全局的Hash表找到对应的页面,并且引用计数加1,变成2,但还没有移到活跃队列中。什么时候转移到活跃队列中呢?
答案在,page_launder和reclaim_page中。
page_launder:
if (PageTestandClearReferenced(page) || page->age > 0 || //此时引用计数大于1 (!page->buffers && page_count(page) > 1) || page_ramdisk(page)) { del_page_from_inactive_dirty_list(page); add_page_to_active_list(page); continue; }reclaim_page:
if (PageTestandClearReferenced(page) || page->age > 0 || (!page->buffers && page_count(page) > 1)) {//此时引用计数大于1 del_page_from_inactive_clean_list(page); add_page_to_active_list(page); continue; }
由于lookup_swap_cache返回NULL,所以接下来的流程就是我们原来分析过的一篇文章,Linux内核源代码情景分析-内存管理之用户页面的换入。
郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。