linux-mm.kvack.org archive mirror
From: Minchan Kim <minchan.kim@gmail.com>
To: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: LKML <linux-kernel@vger.kernel.org>,
	linux-mm <linux-mm@kvack.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	Rik van Riel <riel@redhat.com>,
	Wu Fengguang <fengguang.wu@intel.com>
Subject: Re: [RFC PATCH 2/2] Don't continue reclaim if the system have plenty free memory
Date: Tue, 7 Jul 2009 22:20:13 +0900	[thread overview]
Message-ID: <28c262360907070620n3e22801egd4493c149a263ecd@mail.gmail.com> (raw)
In-Reply-To: <20090707184714.0C73.A69D9226@jp.fujitsu.com>


Hi, Kosaki.

On Tue, Jul 7, 2009 at 6:48 PM, KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> wrote:
> Subject: [PATCH] Don't continue reclaim if the system have plenty free memory
>
> In a concurrent reclaim situation, if one reclaimer triggers the OOM killer,
> the other reclaimers could stop reclaiming, because the OOM killer frees
> enough memory.
>
> But the current kernel has no such logic, so we can hit the following
> accidental second-OOM scenario.
>
> 1. System memory is used by only one big process.
> 2. A memory shortage occurs and concurrent reclaim starts.
> 3. One reclaimer triggers the OOM killer, which kills the big process above.
> 4. Almost all reclaimable pages are freed.
> 5. Another reclaimer can't find any reclaimable pages because they have
>    already been freed.
> 6. The system then triggers an accidental and unnecessary second OOM kill.
>

Did you actually see this situation?
I ask because we already have a routine for preventing parallel
OOM killing in __alloc_pages_may_oom (rough sketch below).

Couldn't that cover your scenario?
If it can't, could you explain the scenario in more detail?

I think we should first try to make that existing routine more effective.
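
For reference, this is roughly the guard I mean -- a sketch from memory of
the 2.6.30/31-era mm/page_alloc.c with the argument list trimmed, so please
check the actual tree rather than taking it verbatim:

  static struct page *__alloc_pages_may_oom(/* args trimmed */)
  {
          struct page *page;

          /* Only one task at a time may OOM-kill for this zonelist. */
          if (!try_set_zone_oom(zonelist, gfp_mask)) {
                  schedule_timeout_uninterruptible(1);
                  return NULL;
          }

          /*
           * Retry the freelists against the high watermark first: if a
           * parallel OOM kill (or a big exit) already freed enough memory,
           * we must not kill again.
           */
          page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, nodemask,
                                        order, zonelist, high_zoneidx,
                                        ALLOC_WMARK_HIGH | ALLOC_CPUSET,
                                        preferred_zone, migratetype);
          if (!page)
                  out_of_memory(zonelist, gfp_mask, order);

          clear_zonelist_oom(zonelist, gfp_mask);
          return page;
  }

So a task that reaches the OOM path should already recheck the high
watermark before killing again. That's why I wonder whether your scenario
really slips past this check, or whether the check is just not strong enough.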

>
> Plus, nowadays datacenter systems run a watchdog that monitors processes and
> kills whichever one consumes too much memory.
> But that doesn't stop the other reclaimers either, so it causes an accidental
> second OOM for the same reason.
>
>
> This patch has one good side effect: it improves the performance of
> reclaim-dependent benchmarks.
>
> e.g.
> =====
> % ./hackbench 140 process 100
>
> before:
>        Time: 93.361
> after:
>        Time: 28.799
>
>
>
> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
> ---
>  fs/buffer.c          |    2 +-
>  include/linux/swap.h |    3 ++-
>  mm/page_alloc.c      |    3 ++-
>  mm/vmscan.c          |   29 ++++++++++++++++++++++++++++-
>  4 files changed, 33 insertions(+), 4 deletions(-)
>
> Index: b/mm/vmscan.c
> ===================================================================
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -87,6 +87,9 @@ struct scan_control {
>         */
>        nodemask_t      *nodemask;
>
> +       /* Caller's preferred zone. */
> +       struct zone     *preferred_zone;
> +
>        /* Pluggable isolate pages callback */
>        unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
>                        unsigned long *scanned, int order, int mode,
> @@ -1535,6 +1538,10 @@ static void shrink_zone(int priority, st
>        unsigned long nr_reclaimed = sc->nr_reclaimed;
>        unsigned long swap_cluster_max = sc->swap_cluster_max;
>        int noswap = 0;
> +       int classzone_idx = 0;
> +
> +       if (sc->preferred_zone)
> +               classzone_idx = zone_idx(sc->preferred_zone);
>
>        /* If we have no swap space, do not bother scanning anon pages. */
>        if (!sc->may_swap || (nr_swap_pages <= 0)) {
> @@ -1583,6 +1590,20 @@ static void shrink_zone(int priority, st
>                if (nr_reclaimed > swap_cluster_max &&
>                        priority < DEF_PRIORITY && !current_is_kswapd())
>                        break;
> +
> +               /*
> +                * Now we have plenty of free memory; perhaps big processes
> +                * exited or were killed by the OOM killer.
> +                * Continuing reclaim doesn't make any sense.
> +                */
> +               if (zone_page_state(zone, NR_FREE_PAGES) >
> +                   zone_lru_pages(zone) &&
> +                   zone_watermark_ok(zone, sc->order, high_wmark_pages(zone),
> +                                     classzone_idx, 0)) {
> +                       /* fake result for reclaim stop */
> +                       nr_reclaimed += swap_cluster_max;
> +                       break;
> +               }
>        }
>
>        sc->nr_reclaimed = nr_reclaimed;
> @@ -1767,7 +1788,8 @@ out:
>  }
>
>  unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
> -                               gfp_t gfp_mask, nodemask_t *nodemask)
> +                               gfp_t gfp_mask, nodemask_t *nodemask,
> +                               struct zone *preferred_zone)
>  {
>        struct scan_control sc = {
>                .gfp_mask = gfp_mask,
> @@ -1780,6 +1802,7 @@ unsigned long try_to_free_pages(struct z
>                .mem_cgroup = NULL,
>                .isolate_pages = isolate_pages_global,
>                .nodemask = nodemask,
> +               .preferred_zone = preferred_zone,
>        };
>
>        return do_try_to_free_pages(zonelist, &sc);
> @@ -1808,6 +1831,10 @@ unsigned long try_to_free_mem_cgroup_pag
>        sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
>                        (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
>        zonelist = NODE_DATA(numa_node_id())->node_zonelists;
> +       first_zones_zonelist(zonelist,
> +                            gfp_zone(sc.gfp_mask), NULL,
> +                            &sc.preferred_zone);
> +
>        return do_try_to_free_pages(zonelist, &sc);
>  }
>  #endif
> Index: b/fs/buffer.c
> ===================================================================
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -290,7 +290,7 @@ static void free_more_memory(void)
>                                                &zone);
>                if (zone)
>                        try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
> -                                               GFP_NOFS, NULL);
> +                                         GFP_NOFS, NULL, zone);
>        }
>  }
>
> Index: b/include/linux/swap.h
> ===================================================================
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -213,7 +213,8 @@ static inline void lru_cache_add_active_
>
>  /* linux/mm/vmscan.c */
>  extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
> -                                       gfp_t gfp_mask, nodemask_t *mask);
> +                                      gfp_t gfp_mask, nodemask_t *mask,
> +                                      struct zone *preferred_zone);
>  extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
>                                                  gfp_t gfp_mask, bool noswap,
>                                                  unsigned int swappiness);
> Index: b/mm/page_alloc.c
> ===================================================================
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1629,7 +1629,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_m
>        reclaim_state.reclaimed_slab = 0;
>        p->reclaim_state = &reclaim_state;
>
> -       *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
> +       *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask,
> +                                              nodemask, preferred_zone);
>
>        p->reclaim_state = NULL;
>        lockdep_clear_current_reclaim_state();
>
>
>



-- 
Kind regards,
Minchan Kim


Thread overview: 19+ messages
2009-07-07  9:40 [RFC PATCH 0/2] fix unnecessary accidental OOM problem on concurrent reclaim KOSAKI Motohiro
2009-07-07  9:47 ` [RFC PATCH 1/2] vmscan don't isolate too many pages KOSAKI Motohiro
2009-07-07 13:23   ` Wu Fengguang
2009-07-07 18:59   ` Rik van Riel
2009-07-08  3:19     ` Wu Fengguang
2009-07-09  1:51       ` [RFC PATCH 1/2] vmscan don't isolate too many pages in a zone Rik van Riel
2009-07-09  2:47         ` Wu Fengguang
2009-07-09  3:07           ` Wu Fengguang
2009-07-09  7:01             ` KOSAKI Motohiro
2009-07-09  8:42               ` Wu Fengguang
2009-07-09 11:07                 ` Minchan Kim
2009-07-09  6:39         ` KOSAKI Motohiro
2009-07-07 23:39   ` [RFC PATCH 1/2] vmscan don't isolate too many pages Minchan Kim
2009-07-09  3:12     ` KOSAKI Motohiro
2009-07-07  9:48 ` [RFC PATCH 2/2] Don't continue reclaim if the system have plenty free memory KOSAKI Motohiro
2009-07-07 13:20   ` Minchan Kim [this message]
2009-07-09  5:08     ` KOSAKI Motohiro
2009-07-09 10:58       ` Minchan Kim
2009-07-13  0:37         ` KOSAKI Motohiro
