注册kswapd module,kernel启动时调用@kernel/mm:
module_init(kswapd_init)
static int __init kswapd_init(void)
{
int nid;
swap_setup();
for_each_node_state(nid, N_MEMORY)
kswapd_run(nid);
if (kswapd_cpu_mask == NULL)
hotcpu_notifier(cpu_callback, 0);
return 0;
}
/*
* This kswapd start function will be called by init and node-hot-add.
* On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
*/
int kswapd_run(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
int ret = 0;
if (pgdat->kswapd)
return 0;
pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
BUG_ON(system_state == SYSTEM_BOOTING);
pr_err("Failed to start kswapd on node %d\n", nid);
ret = PTR_ERR(pgdat->kswapd);
pgdat->kswapd = NULL;
} else if (kswapd_cpu_mask) {
if (set_kswapd_cpu_mask(pgdat))
pr_warn("error setting kswapd cpu affinity mask\n");
}
return ret;
}
/*
* The background pageout daemon, started as a kernel thread
* from the init process.
*
* This basically trickles out pages so that we have _some_
* free memory available even if there is no other activity
* that frees anything up. This is needed for things like routing
* etc, where we otherwise might have all activity going on in
* asynchronous contexts that cannot page things out.
*
* If there are applications that are active memory-allocators
* (most normal use), this basically shouldn't matter.
*/
static int kswapd(void *p)
{
...
/*
* We can speed up thawing tasks if we don't call balance_pgdat
* after returning from the refrigerator
*/
if (!ret) {
trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
balanced_classzone_idx = classzone_idx;
balanced_order = balance_pgdat(pgdat, order,
&balanced_classzone_idx);
}
}
tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
current->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
return 0;
}
/*
* For kswapd, balance_pgdat() will work across all this node's zones until
* they are all at high_wmark_pages(zone).
*
* Returns the final order kswapd was reclaiming at
*
* There is special handling here for zones which are full of pinned pages.
* This can happen if the pages are all mlocked, or if they are all used by
* device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb.
* What we do is to detect the case where all pages in the zone have been
* scanned twice and there has been zero successful reclaim. Mark the zone as
* dead and from now on, only perform a short scan. Basically we're polling
* the zone for when the problem goes away.
*
* kswapd scans the zones in the highmem->normal->dma direction. It skips
* zones which have free_pages > high_wmark_pages(zone), but once a zone is
* found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
* lower zones regardless of the number of free pages in the lower zones. This
* interoperates with the page allocator fallback scheme to ensure that aging
* of pages is balanced across the zones.
*/
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
int *classzone_idx)
{
...
/*
* There should be no need to raise the scanning
* priority if enough pages are already being scanned
* that that high watermark would be met at 100%
* efficiency.
*/
if (kswapd_shrink_zone(zone, end_zone, &sc,
lru_pages, &nr_attempted))
}
/*
* kswapd shrinks the zone by the number of pages required to reach
* the high watermark.
*
* Returns true if kswapd scanned at least the requested number of pages to
* reclaim or if the lack of progress was due to pages under writeback.
* This is used to determine if the scanning priority needs to be raised.
*/
static bool kswapd_shrink_zone(struct zone *zone,
int classzone_idx,
struct scan_control *sc,
unsigned long lru_pages,
unsigned long *nr_attempted)
{
...
reclaim_state->reclaimed_slab = 0;
shrink_slab(&shrink, sc->nr_scanned, lru_pages);
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
...
}
/*
* Call the shrink functions to age shrinkable caches
*
* Here we assume it costs one seek to replace a lru page and that it also
* takes a seek to recreate a cache object. With this in mind we age equal
* percentages of the lru and ageable caches. This should balance the seeks
* generated by these structures.
*
* If the vm encountered mapped pages on the LRU it increase the pressure on
* slab to avoid swapping.
*
* We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
*
* `lru_pages' represents the number of on-LRU pages in all the zones which
* are eligible for the caller's allocation attempt. It is used for balancing
* slab reclaim versus page reclaim.
*
* Returns the number of slab objects which we shrunk.
*/
unsigned long shrink_slab(struct shrink_control *shrinkctl,
unsigned long nr_pages_scanned,
unsigned long lru_pages)
{
struct shrinker *shrinker;
...
for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
if (node_online(shrinkctl->nid))
freed += shrink_slab_node(shrinkctl, shrinker,
nr_pages_scanned, lru_pages);
}
...
}
static unsigned long
shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
unsigned long nr_pages_scanned, unsigned long lru_pages)
{
...
freeable = shrinker->count_objects(shrinker, shrinkctl);
if (freeable == 0)
return 0;
...
/*
* Normally, we should not scan less than batch_size objects in one
* pass to avoid too frequent shrinker calls, but if the slab has less
* than batch_size objects in total and we are really tight on memory,
* we will try to reclaim all available objects, otherwise we can end
* up failing allocations although there are plenty of reclaimable
* objects spread over several slabs with usage less than the
* batch_size.
*
* We detect the "tight on memory" situations by looking at the total
* number of objects we want to scan (total_scan). If it is greater
* than the total number of objects on slab (freeable), we must be
* scanning at high prio and therefore should try to reclaim as much as
* possible.
*/
while (total_scan > min_cache_size ||
total_scan >= freeable) {
unsigned long ret;
unsigned long nr_to_scan = min(batch_size, total_scan);
shrinkctl->nr_to_scan = nr_to_scan;
ret = shrinker->scan_objects(shrinker, shrinkctl);
if (ret == SHRINK_STOP)
break;
freed += ret;
count_vm_events(SLABS_SCANNED, nr_to_scan);
total_scan -= nr_to_scan;
cond_resched();
}
此处shrink就对应lowmemorykiller 注册的@kernel/drivers/staging/:
static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
{
...
//选择到需要kill的task,kill掉。
set_tsk_thread_flag(selected, TIF_MEMDIE);
send_sig(SIGKILL, selected, 0);
...
}
static unsigned long lowmem_count(struct shrinker *s,
struct shrink_control *sc)
{
return global_page_state(NR_ACTIVE_ANON) +
global_page_state(NR_ACTIVE_FILE) +
global_page_state(NR_INACTIVE_ANON) +
global_page_state(NR_INACTIVE_FILE);
}
static struct shrinker lowmem_shrinker = {
.scan_objects = lowmem_scan,
.count_objects = lowmem_count,
.seeks = DEFAULT_SEEKS * 16
};
static int __init lowmem_init(void)
{
register_shrinker(&lowmem_shrinker);
vmpressure_notifier_register(&lmk_vmpr_nb);
return 0;
}
所以一般在android里遇到kswapd占用cpu资源较多时,可以优化lowmemorykiller里的scan过程,尽量保证更多的memory, 减少kswapd回收不断scan的过程。
AMS在更新oom adj时则是通过下面的流程@frameworks/base/services/core/java/com/android/server/am:
//通过localsocket 通到native的lmkd daemon
private static void writeLmkd(ByteBuffer buf) {
for (int i = 0; i < 3; i++) {
if (sLmkdSocket == null) {
if (openLmkdSocket() == false) {
try {
Thread.sleep(1000);
} catch (InterruptedException ie) {
}
continue;
}
}
try {
sLmkdOutputStream.write(buf.array(), 0, buf.position());
return;
} catch (IOException ex) {
Slog.w(TAG, "Error writing to lowmemorykiller socket");
try {
sLmkdSocket.close();
} catch (IOException ex2) {
}
sLmkdSocket = null;
}
}
}
private static boolean openLmkdSocket() {
try {
sLmkdSocket = new LocalSocket(LocalSocket.SOCKET_SEQPACKET);
sLmkdSocket.connect(
new LocalSocketAddress("lmkd",
LocalSocketAddress.Namespace.RESERVED));
sLmkdOutputStream = sLmkdSocket.getOutputStream();
} catch (IOException ex) {
Slog.w(TAG, "lowmemorykiller daemon socket open failed");
sLmkdSocket = null;
return false;
}
return true;
}
native lmkd daemon:
@system/core/lmkd
static int init(void) {
...
ctrl_lfd = android_get_control_socket("lmkd");
if (ctrl_lfd < 0) {
ALOGE("get lmkd control socket failed");
return -1;
}
...
}
数据处理回调:
static void ctrl_data_handler(uint32_t events) {
if (events & EPOLLHUP) {
ALOGI("ActivityManager disconnected");
if (!ctrl_dfd_reopened)
ctrl_data_close();
} else if (events & EPOLLIN) {
ctrl_command_handler();
}
}
static void ctrl_command_handler(void) {
int ibuf[CTRL_PACKET_MAX / sizeof(int)];
int len;
int cmd = -1;
int nargs;
int targets;
len = ctrl_data_read((char *)ibuf, CTRL_PACKET_MAX);
if (len <= 0)
return;
nargs = len / sizeof(int) - 1;
if (nargs < 0)
goto wronglen;
cmd = ntohl(ibuf[0]);
switch(cmd) {
case LMK_TARGET:
targets = nargs / 2;
if (nargs & 0x1 || targets > (int)ARRAY_SIZE(lowmem_adj))
goto wronglen;
cmd_target(targets, &ibuf[1]);
break;
case LMK_PROCPRIO:
if (nargs != 3)
goto wronglen;
cmd_procprio(ntohl(ibuf[1]), ntohl(ibuf[2]), ntohl(ibuf[3]));
break;
case LMK_PROCREMOVE:
if (nargs != 1)
goto wronglen;
cmd_procremove(ntohl(ibuf[1]));
break;
default:
ALOGE("Received unknown command code %d", cmd);
return;
}
return;
wronglen:
ALOGE("Wrong control socket read length cmd=%d len=%d", cmd, len);
}
最终写到对应proc 节点:
#define INKERNEL_MINFREE_PATH "/sys/module/lowmemorykiller/parameters/minfree"
#define INKERNEL_ADJ_PATH "/sys/module/lowmemorykiller/parameters/adj"
"/proc/%d/oom_score_adj"