kernel-4.9内存回收核心流程
以32位系统为例
内存分配调用流程:
alloc_pages()
-->alloc_pages_node()
-->__alloc_pages_node()
-->__alloc_pages()
-->__alloc_pages_nodemask()
-->get_page_from_freelist()
在get_page_from_freelist()中,会按照HIGHMEM->NORMAL的方向依次遍历各个zone。判断某个zone能否进行内存分配时,首先检查free memory是否满足low water mark水位值;如果不满足,则进行一次快速的内存回收操作,然后再次检测是否满足low water mark;如果还是不能满足,则以相同步骤处理下一个zone。如果两个zone都不满足,get_page_from_freelist()函数返回NULL。
快速内存回收机制:
node_reclaim()
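-->__node_reclaim()-----此处指定每轮进行回收的页面最大值为需要回收的页面数和32(SWAP_CLUSTER_MAX)两者中的较大值。默认情况下(node_reclaim_mode未设置RECLAIM_UNMAP/RECLAIM_WRITE时)快速回收不进行unmap、writeback操作。回收priority初始为4,每调用一次shrink_node后priority减1,直到回收到的页数达到需要分配的内存页数,或者priority减到0以下为止;由于循环条件是--sc.priority >= 0,priority会依次取4、3、2、1、0,即最多调用5次shrink_node,按每轮32页估算,最多能够回收约160页。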
static
int__node_reclaim(struct pglist_data *pgdat,gfp_tgfp_mask,unsignedintorder)
{
constunsignedlongnr_pages =1<< order;
………………
structscan_controlsc= {//内存回收的条件
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
.gfp_mask = memalloc_noio_flags(gfp_mask),
.order = order,
.priority = NODE_RECLAIM_PRIORITY,
.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
.may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
.may_swap =1,
.reclaim_idx = gfp_zone(gfp_mask),
};
……………………
if(node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
/*
* Free memory by calling shrink zone with increasing
* priorities until we have enough memory freed.
*/
do{
shrink_node(pgdat, &sc);
}while(sc.nr_reclaimed < nr_pages && --sc.priority >=0);//每一轮是否满足条件
}
}
-->shrink_node()-----这里会对系统中存在的每一个memcg对应的node进行一次内存回收操作,然后更新这一次vmpressure扫描和回收的值,直到扫描完所有的memcg或者回收到足够的页面。在完成所有memcg的扫描或者回收到最多32页后,会调用vmpressure函数,根据这一轮内存回收扫描的总页数以及回收到的页数来计算当前内存的压力值,再根据扫描的页数是否大于512个,决定是否将压力传递给native进程lmkd,由lmkd确定是否进行进程清理操作。
staticboolshrink_node(pg_data_t*pgdat, struct scan_control *sc)
{
do{
……………………………………
memcg = mem_cgroup_iter(root,NULL, &reclaim);
do{
shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
if(memcg)
shrink_slab(sc->gfp_mask, pgdat->node_id,
memcg, sc->nr_scanned - scanned,
lru_pages);//这里会叫lowmemorykiller起来
/* Record the group's reclaim efficiency */
vmpressure(sc->gfp_mask, memcg,false,
sc->nr_scanned - scanned,
sc->nr_reclaimed - reclaimed);//更新回收时的扫描和已回收的页面数
if(!global_reclaim(sc) &&
sc->nr_reclaimed >= sc->nr_to_reclaim) {
mem_cgroup_iter_break(root, memcg);
break;//回收的页面数达到标准则跳出循环
}
}while((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
if(global_reclaim(sc))
shrink_slab(sc->gfp_mask, pgdat->node_id,NULL,
sc->nr_scanned - nr_scanned,
node_lru_pages);//这里当memcg不存在时会叫起来lowmemorykiller
if(reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab =0;
}
/* Record the subtree's reclaim efficiency */
vmpressure(sc->gfp_mask, sc->target_mem_cgroup,true,
sc->nr_scanned - nr_scanned,
sc->nr_reclaimed - nr_reclaimed);//可能会叫起来lmkd
if(sc->nr_reclaimed - nr_reclaimed)
reclaimable =true;
}while(should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
sc->nr_scanned - nr_scanned, sc));
}
-->shrink_node_memcg()-----调用get_scan_count确定匿名页和文件页各lru需要扫描的页面数,然后按照计算好的页数扫描各lru链表,调用shrink_list进行内存回收,每次最多扫描32个页面。
staticvoidshrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
struct scan_control *sc,unsignedlong*lru_pages)
{
get_scan_count(lruvec, memcg, sc, nr, lru_pages);
…………………………………………
for_each_evictable_lru(lru) {
if(nr[lru]) {
nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);//每次扫描32个页面
nr[lru] -= nr_to_scan;
nr_reclaimed += shrink_list(lru, nr_to_scan,
lruvec, sc);
}
}
……………………………………………
}
-->shrink_list()-----如果是活跃lru,会先根据当前active lru与inactive lru页数的比例关系判断是否需要扫描活跃lru;如果是非活跃lru,则直接进行扫描
-->shrink_active_list()----分成3个临时链表,从lru上面分离下来的存放在l_hold,要放到inactive lru的存放在l_inactive,而要放回active lru的存放在l_active,函数只会将被引用到的可执行文件页放回到活跃lru,其他的全部移动到非活跃lru
static
voidshrink_active_list(unsignedlongnr_to_scan,
struct lruvec *lruvec,
struct scan_control *sc,
enumlru_list lru)
{
………………
LIST_HEAD(l_hold);/* The pages which were snipped off */
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, isolate_mode, lru);//分离页面到l_hode
………………
if(page_referenced(page,0, sc->target_mem_cgroup,
&vm_flags)) {
nr_rotated += hpage_nr_pages(page);
/*
* Identify referenced, file-backed active pages and
* give them one more trip around the active list. So
* that executable code get better chances to stay in
* memory under moderate memory pressure. Anon pages
* are not likely to be evicted by use-once streaming
* IO, plus JVM can create lots of anon VM_EXEC pages,
* so we ignore them here.
*/
if((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
list_add(&page->lru, &l_active);//可执行文件页放回active
continue;
}
}
ClearPageActive(page);/* we are de-activating */
list_add(&page->lru, &l_inactive);其余的全部添加到inactive
move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);//添加到active lru
move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);//添加到inactive lru
free_hot_cold_page_list(&l_hold,true);//剩余的free掉
}