Android kswapd-->lowmemorykiller启动和扫描过程

注册 kswapd 模块,kernel 启动时在 @kernel/mm 下调用:

module_init(kswapd_init)

/*
 * kswapd_init - boot-time setup of the kswapd reclaim threads.
 *
 * Starts one kswapd kernel thread per node that has memory, then
 * registers a CPU-hotplug callback unless an explicit CPU affinity
 * mask for kswapd was configured.
 */
static int __init kswapd_init(void)
{
    int nid;

    swap_setup();
    /* One kswapd thread for every node that currently has memory. */
    for_each_node_state(nid, N_MEMORY)
        kswapd_run(nid);
    /* No explicit affinity mask: follow CPU hotplug events instead. */
    if (kswapd_cpu_mask == NULL)
        hotcpu_notifier(cpu_callback, 0);
    return 0;
}

/*
 * kswapd_run - start the kswapd thread for one node.
 *
 * This kswapd start function will be called by init and node-hot-add.
 * On node-hot-add, kswapd will be moved to proper cpus if cpus are hot-added.
 *
 * Returns 0 if the thread is running (or already was), otherwise the
 * negative errno from kthread_run().  A creation failure during boot
 * is treated as fatal.
 */
int kswapd_run(int nid)
{
    pg_data_t *pgdat = NODE_DATA(nid);
    int ret = 0;

    /* Thread already exists for this node: nothing to do. */
    if (pgdat->kswapd)
        return 0;

    pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
    if (IS_ERR(pgdat->kswapd)) {
        /* failure at boot is fatal */
        BUG_ON(system_state == SYSTEM_BOOTING);
        pr_err("Failed to start kswapd on node %d\n", nid);
        ret = PTR_ERR(pgdat->kswapd);
        pgdat->kswapd = NULL;
    } else if (kswapd_cpu_mask) {
        /* Pin the new thread to the configured CPU mask, if any. */
        if (set_kswapd_cpu_mask(pgdat))
            pr_warn("error setting kswapd cpu affinity mask\n");
    }
    return ret;
}

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
static int kswapd(void *p)
{
/* NOTE: excerpt — the article elides the thread's main wait/wake loop here. */
...
        /*
         * We can speed up thawing tasks if we don't call balance_pgdat
         * after returning from the refrigerator
         */
        if (!ret) {
            trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
            balanced_classzone_idx = classzone_idx;
            /* The actual reclaim pass: balance this node's zones up to
             * their high watermarks (see balance_pgdat below). */
            balanced_order = balance_pgdat(pgdat, order,
                        &balanced_classzone_idx);
        }
    }

    /* Thread exit: drop reclaim-related task flags and per-task state. */
    tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
    current->reclaim_state = NULL;
    lockdep_clear_current_reclaim_state();

    return 0;
}

/*
 * For kswapd, balance_pgdat() will work across all this node's zones until
 * they are all at high_wmark_pages(zone).
 *
 * Returns the final order kswapd was reclaiming at
 *
 * There is special handling here for zones which are full of pinned pages.
 * This can happen if the pages are all mlocked, or if they are all used by
 * device drivers (say, ZONE_DMA).  Or if they are all in use by hugetlb.
 * What we do is to detect the case where all pages in the zone have been
 * scanned twice and there has been zero successful reclaim.  Mark the zone as
 * dead and from now on, only perform a short scan.  Basically we're polling
 * the zone for when the problem goes away.
 *
 * kswapd scans the zones in the highmem->normal->dma direction.  It skips
 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
 * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
 * lower zones regardless of the number of free pages in the lower zones. This
 * interoperates with the page allocator fallback scheme to ensure that aging
 * of pages is balanced across the zones.
 */
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                            int *classzone_idx)
{
/* NOTE: excerpt — the zone-scanning loop surrounding this call is elided. */
...
            /*
             * There should be no need to raise the scanning
             * priority if enough pages are already being scanned
             * that the high watermark would be met at 100%
             * efficiency.
             */
            /* NOTE: excerpt — the statement controlled by this "if"
             * (raising the scan priority) is elided in the article. */
            if (kswapd_shrink_zone(zone, end_zone, &sc,
                    lru_pages, &nr_attempted))

}



/*
 * kswapd shrinks the zone by the number of pages required to reach
 * the high watermark.
 *
 * Returns true if kswapd scanned at least the requested number of pages to
 * reclaim or if the lack of progress was due to pages under writeback.
 * This is used to determine if the scanning priority needs to be raised.
 */
static bool kswapd_shrink_zone(struct zone *zone,
                   int classzone_idx,
                   struct scan_control *sc,
                   unsigned long lru_pages,
                   unsigned long *nr_attempted)
{
/* NOTE: excerpt — LRU page reclaim before/after this snippet is elided. */
...
    /* Reclaim slab caches alongside LRU pages.  shrink_slab() is where
     * every registered shrinker — including Android's lowmemorykiller,
     * shown later in this article — gets invoked. */
    reclaim_state->reclaimed_slab = 0;
    shrink_slab(&shrink, sc->nr_scanned, lru_pages);
    sc->nr_reclaimed += reclaim_state->reclaimed_slab;
...
}


/*
 * Call the shrink functions to age shrinkable caches
 *
 * Here we assume it costs one seek to replace a lru page and that it also
 * takes a seek to recreate a cache object.  With this in mind we age equal
 * percentages of the lru and ageable caches.  This should balance the seeks
 * generated by these structures.
 *
 * If the vm encountered mapped pages on the LRU it increase the pressure on
 * slab to avoid swapping.
 *
 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
 *
 * `lru_pages' represents the number of on-LRU pages in all the zones which
 * are eligible for the caller's allocation attempt.  It is used for balancing
 * slab reclaim versus page reclaim.
 *
 * Returns the number of slab objects which we shrunk.
 */
unsigned long shrink_slab(struct shrink_control *shrinkctl,
              unsigned long nr_pages_scanned,
              unsigned long lru_pages)
{
    struct shrinker *shrinker;
/* NOTE: excerpt — the walk over the global shrinker list is elided. */
...
        /* Run this shrinker once for each online node it should scan. */
        for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
            if (node_online(shrinkctl->nid))
                freed += shrink_slab_node(shrinkctl, shrinker,
                        nr_pages_scanned, lru_pages);

        }
...
}

static unsigned long
shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
         unsigned long nr_pages_scanned, unsigned long lru_pages)
{
/* NOTE: excerpt — computation of total_scan/batch_size is elided. */
...
    /* Ask the shrinker how many objects it could free right now;
     * bail out early if there is nothing to reclaim. */
    freeable = shrinker->count_objects(shrinker, shrinkctl);
    if (freeable == 0)
        return 0;
...
    /*
     * Normally, we should not scan less than batch_size objects in one
     * pass to avoid too frequent shrinker calls, but if the slab has less
     * than batch_size objects in total and we are really tight on memory,
     * we will try to reclaim all available objects, otherwise we can end
     * up failing allocations although there are plenty of reclaimable
     * objects spread over several slabs with usage less than the
     * batch_size.
     *
     * We detect the "tight on memory" situations by looking at the total
     * number of objects we want to scan (total_scan). If it is greater
     * than the total number of objects on slab (freeable), we must be
     * scanning at high prio and therefore should try to reclaim as much as
     * possible.
     */
    while (total_scan > min_cache_size ||
           total_scan >= freeable) {
        unsigned long ret;
        unsigned long nr_to_scan = min(batch_size, total_scan);

        shrinkctl->nr_to_scan = nr_to_scan;
        /* SHRINK_STOP means the shrinker cannot make progress now. */
        ret = shrinker->scan_objects(shrinker, shrinkctl);
        if (ret == SHRINK_STOP)
            break;
        freed += ret;

        count_vm_events(SLABS_SCANNED, nr_to_scan);
        total_scan -= nr_to_scan;

        /* Yield between batches so reclaim does not hog the CPU. */
        cond_resched();
    }


此处的 shrinker 就对应 lowmemorykiller 在 @kernel/drivers/staging/ 注册的回调:

/*
 * lowmem_scan - shrinker "scan" callback (excerpt): selects a victim
 * process and kills it to free memory.
 */
static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
{
/* NOTE: excerpt — victim selection by oom adj/size thresholds is elided. */
...
// Pick the task that needs to be killed and send it SIGKILL.
        set_tsk_thread_flag(selected, TIF_MEMDIE);
        send_sig(SIGKILL, selected, 0);
...
}

/*
 * lowmem_count - shrinker "count" callback.
 *
 * Reports the total number of anon + file pages on the active and
 * inactive LRU lists as this shrinker's "freeable object" count.
 */
static unsigned long lowmem_count(struct shrinker *s,
                  struct shrink_control *sc)
{
    unsigned long lru_pages = 0;

    lru_pages += global_page_state(NR_ACTIVE_ANON);
    lru_pages += global_page_state(NR_ACTIVE_FILE);
    lru_pages += global_page_state(NR_INACTIVE_ANON);
    lru_pages += global_page_state(NR_INACTIVE_FILE);
    return lru_pages;
}

/*
 * lowmemorykiller registers itself as a generic slab shrinker so that
 * it is driven from shrink_slab() during page reclaim.
 */
static struct shrinker lowmem_shrinker = {
    .scan_objects = lowmem_scan,    /* pick and kill a victim task */
    .count_objects = lowmem_count,  /* report LRU pages as "objects" */
    .seeks = DEFAULT_SEEKS * 16     /* NOTE(review): presumably the high seek
                                     * cost lowers the scan pressure applied
                                     * to this shrinker — confirm against
                                     * shrink_slab_node()'s scaling math */
};

/*
 * lowmem_init - module init: register the shrinker callbacks above and
 * subscribe to vmpressure notifications.
 */
static int __init lowmem_init(void)
{
    register_shrinker(&lowmem_shrinker);
    vmpressure_notifier_register(&lmk_vmpr_nb);
    return 0;
}

所以一般在 Android 里遇到 kswapd 占用 CPU 资源较多时,可以优化 lowmemorykiller 里的 scan 过程,尽量让系统保留更多的空闲内存,从而减少 kswapd 回收时反复 scan 的开销。

AMS在更新oom adj时则是通过下面的流程@frameworks/base/services/core/java/com/android/server/am:

// Bridges to the native lmkd daemon through a local socket.
    /**
     * Sends one command buffer to lmkd, retrying up to three times.
     * Each attempt (re)opens the socket if needed; a write failure
     * closes the socket so the next attempt reconnects.
     */
    private static void writeLmkd(ByteBuffer buf) {
        int attempt = 0;
        while (attempt < 3) {
            attempt++;

            // No connection yet: try to open one, backing off on failure.
            if (sLmkdSocket == null && !openLmkdSocket()) {
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException ie) {
                }
                continue;
            }

            try {
                sLmkdOutputStream.write(buf.array(), 0, buf.position());
                return;
            } catch (IOException ex) {
                Slog.w(TAG, "Error writing to lowmemorykiller socket");
                try {
                    sLmkdSocket.close();
                } catch (IOException ex2) {
                }
                sLmkdSocket = null;
            }
        }
    }

    /**
     * Opens a SEQPACKET connection to the native lmkd daemon on the
     * reserved local socket named "lmkd" and caches its output stream.
     *
     * @return true on success; false if connecting failed (the socket
     *         field is reset to null in that case).
     */
    private static boolean openLmkdSocket() {
        try {
            sLmkdSocket = new LocalSocket(LocalSocket.SOCKET_SEQPACKET);
            sLmkdSocket.connect(
                new LocalSocketAddress("lmkd",
                        LocalSocketAddress.Namespace.RESERVED));
            sLmkdOutputStream = sLmkdSocket.getOutputStream();
        } catch (IOException ex) {
            Slog.w(TAG, "lowmemorykiller daemon socket open failed");
            sLmkdSocket = null;
            return false;
        }

        return true;
    }

native lmkd daemon:
@system/core/lmkd

/*
 * init - lmkd daemon startup (excerpt): obtain the "lmkd" control
 * socket that Android init created for this service.
 */
static int init(void) {
...
    ctrl_lfd = android_get_control_socket("lmkd");
    if (ctrl_lfd < 0) {
        ALOGE("get lmkd control socket failed");
        return -1;
    }
...
}

数据处理回调:

/*
 * ctrl_data_handler - epoll event callback for the lmkd data socket.
 * EPOLLHUP means ActivityManager dropped the connection; EPOLLIN means
 * a command packet is ready to be read.
 */
static void ctrl_data_handler(uint32_t events) {
    if (events & EPOLLHUP) {
        ALOGI("ActivityManager disconnected");
        /* Only tear down if a new client has not already reopened it. */
        if (!ctrl_dfd_reopened)
            ctrl_data_close();
    } else if (events & EPOLLIN) {
        ctrl_command_handler();
    }
}

/*
 * ctrl_command_handler - read and dispatch one control packet from
 * ActivityManager.
 *
 * Packet layout: an array of network-byte-order ints, the first being
 * the command code and the remainder its arguments.  Packets with a
 * length that does not match the command are rejected and logged.
 */
static void ctrl_command_handler(void) {
    int ibuf[CTRL_PACKET_MAX / sizeof(int)];
    int len;
    int cmd = -1;
    int nargs;
    int targets;

    len = ctrl_data_read((char *)ibuf, CTRL_PACKET_MAX);
    if (len <= 0)
        return;

    /* Number of argument ints after the command word. */
    nargs = len / sizeof(int) - 1;
    if (nargs < 0)
        goto wronglen;

    cmd = ntohl(ibuf[0]);

    switch(cmd) {
    case LMK_TARGET:
        /* Even number of args: pairs of threshold values, bounded by
         * the size of the in-kernel lowmem_adj table. */
        targets = nargs / 2;
        if (nargs & 0x1 || targets > (int)ARRAY_SIZE(lowmem_adj))
            goto wronglen;
        cmd_target(targets, &ibuf[1]);
        break;
    case LMK_PROCPRIO:
        /* Exactly three int args describing one process's priority. */
        if (nargs != 3)
            goto wronglen;
        cmd_procprio(ntohl(ibuf[1]), ntohl(ibuf[2]), ntohl(ibuf[3]));
        break;
    case LMK_PROCREMOVE:
        /* One int arg: the process to stop tracking. */
        if (nargs != 1)
            goto wronglen;
        cmd_procremove(ntohl(ibuf[1]));
        break;
    default:
        ALOGE("Received unknown command code %d", cmd);
        return;
    }

    return;

wronglen:
    ALOGE("Wrong control socket read length cmd=%d len=%d", cmd, len);
}

lmkd 最终把这些参数写到对应的 proc/sys 节点:

#define INKERNEL_MINFREE_PATH "/sys/module/lowmemorykiller/parameters/minfree"
#define INKERNEL_ADJ_PATH "/sys/module/lowmemorykiller/parameters/adj"
"/proc/%d/oom_score_adj"

你可能感兴趣的:(android,android,lmk,kswapd)