All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] mm: proc: add Sock to /proc/meminfo
@ 2020-10-10 10:38 Muchun Song
  2020-10-10 13:06 ` kernel test robot
                   ` (4 more replies)
  0 siblings, 5 replies; 9+ messages in thread
From: Muchun Song @ 2020-10-10 10:38 UTC (permalink / raw)
  To: gregkh, rafael, mst, jasowang, davem, kuba, adobriyan, akpm,
	edumazet, kuznet, yoshfuji, steffen.klassert, herbert, shakeelb,
	will, mhocko, guro, neilb, rppt, songmuchun, samitolvanen,
	kirill.shutemov, feng.tang, pabeni, willemb, rdunlap, fw,
	gustavoars, pablo, decui, jakub, peterz, christian.brauner,
	ebiederm, tglx, dave, walken, jannh, chenqiwu, christophe.leroy,
	minchan, kafai, ast, daniel, linmiaohe, keescook
  Cc: linux-kernel, virtualization, netdev, linux-fsdevel, linux-mm

The amount of memory allocated to sockets buffer can become significant.
However, we do not display the amount of memory consumed by sockets
buffer. In this case, knowing where the memory is consumed by the kernel
is very difficult. On our server with 500GB RAM, sometimes we can see
25GB disappear through /proc/meminfo. After our analysis, we found the
following memory allocation path which consumes the memory with page_owner
enabled.

  849698 times:
  Page allocated via order 3, mask 0x4052c0(GFP_NOWAIT|__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP)
   __alloc_pages_nodemask+0x11d/0x290
   skb_page_frag_refill+0x68/0xf0
   sk_page_frag_refill+0x19/0x70
   tcp_sendmsg_locked+0x2f4/0xd10
   tcp_sendmsg+0x29/0xa0
   sock_sendmsg+0x30/0x40
   sock_write_iter+0x8f/0x100
   __vfs_write+0x10b/0x190
   vfs_write+0xb0/0x190
   ksys_write+0x5a/0xd0
   do_syscall_64+0x5d/0x110
   entry_SYSCALL_64_after_hwframe+0x44/0xa9

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 drivers/base/node.c      |  2 ++
 drivers/net/virtio_net.c |  3 +--
 fs/proc/meminfo.c        |  1 +
 include/linux/mmzone.h   |  1 +
 include/linux/skbuff.h   | 43 ++++++++++++++++++++++++++++++++++++++--
 kernel/exit.c            |  3 +--
 mm/page_alloc.c          |  7 +++++--
 mm/vmstat.c              |  1 +
 net/core/sock.c          |  8 ++++----
 net/ipv4/tcp.c           |  3 +--
 net/xfrm/xfrm_state.c    |  3 +--
 11 files changed, 59 insertions(+), 16 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 508b80f6329b..6f92775da85c 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -418,6 +418,7 @@ static ssize_t node_read_meminfo(struct device *dev,
 #ifdef CONFIG_SHADOW_CALL_STACK
 		       "Node %d ShadowCallStack:%8lu kB\n"
 #endif
+		       "Node %d Sock:           %8lu kB\n"
 		       "Node %d PageTables:     %8lu kB\n"
 		       "Node %d NFS_Unstable:   %8lu kB\n"
 		       "Node %d Bounce:         %8lu kB\n"
@@ -441,6 +442,7 @@ static ssize_t node_read_meminfo(struct device *dev,
 		       nid, K(node_page_state(pgdat, NR_ANON_MAPPED)),
 		       nid, K(i.sharedram),
 		       nid, node_page_state(pgdat, NR_KERNEL_STACK_KB),
+		       nid, K(node_page_state(pgdat, NR_SOCK)),
 #ifdef CONFIG_SHADOW_CALL_STACK
 		       nid, node_page_state(pgdat, NR_KERNEL_SCS_KB),
 #endif
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 263b005981bd..e7183f67ae4a 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2646,8 +2646,7 @@ static void free_receive_page_frags(struct virtnet_info *vi)
 {
 	int i;
 	for (i = 0; i < vi->max_queue_pairs; i++)
-		if (vi->rq[i].alloc_frag.page)
-			put_page(vi->rq[i].alloc_frag.page);
+		put_page_frag(&vi->rq[i].alloc_frag);
 }
 
 static void free_unused_bufs(struct virtnet_info *vi)
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 887a5532e449..1dcf3120d831 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -106,6 +106,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	seq_printf(m, "ShadowCallStack:%8lu kB\n",
 		   global_node_page_state(NR_KERNEL_SCS_KB));
 #endif
+	show_val_kb(m, "Sock:           ", global_node_page_state(NR_SOCK));
 	show_val_kb(m, "PageTables:     ",
 		    global_zone_page_state(NR_PAGETABLE));
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 31712bb61f7f..1996713d2c6b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -207,6 +207,7 @@ enum node_stat_item {
 #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
 	NR_KERNEL_SCS_KB,	/* measured in KiB */
 #endif
+	NR_SOCK,                /* Count of socket buffer pages */
 	NR_VM_NODE_STAT_ITEMS
 };
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index fcd53f97c186..7e5108da4d84 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -19,7 +19,8 @@
 #include <linux/rbtree.h>
 #include <linux/socket.h>
 #include <linux/refcount.h>
-
+#include <linux/memcontrol.h>
+#include <linux/mm.h>
 #include <linux/atomic.h>
 #include <asm/types.h>
 #include <linux/spinlock.h>
@@ -3003,6 +3004,25 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
 	__skb_frag_ref(&skb_shinfo(skb)->frags[f]);
 }
 
+static inline void inc_sock_node_page_state(struct page *page)
+{
+	mod_node_page_state(page_pgdat(page), NR_SOCK, compound_nr(page));
+	/*
+	 * Indicate that we need to decrease the Sock page state when
+	 * the page freed.
+	 */
+	SetPagePrivate(page);
+}
+
+static inline void dec_sock_node_page_state(struct page *page)
+{
+	if (PagePrivate(page)) {
+		ClearPagePrivate(page);
+		mod_node_page_state(page_pgdat(page), NR_SOCK,
+				    -compound_nr(page));
+	}
+}
+
 /**
  * __skb_frag_unref - release a reference on a paged fragment.
  * @frag: the paged fragment
@@ -3011,7 +3031,12 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f)
  */
 static inline void __skb_frag_unref(skb_frag_t *frag)
 {
-	put_page(skb_frag_page(frag));
+	struct page *page = skb_frag_page(frag);
+
+	if (put_page_testzero(page)) {
+		dec_sock_node_page_state(page);
+		__put_page(page);
+	}
 }
 
 /**
@@ -3091,6 +3116,20 @@ static inline void skb_frag_set_page(struct sk_buff *skb, int f,
 	__skb_frag_set_page(&skb_shinfo(skb)->frags[f], page);
 }
 
+static inline bool put_page_frag(struct page_frag *pfrag)
+{
+	struct page *page = pfrag->page;
+
+	if (page) {
+		if (put_page_testzero(page)) {
+			dec_sock_node_page_state(page);
+			__put_page(page);
+		}
+		return true;
+	}
+	return false;
+}
+
 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio);
 
 /**
diff --git a/kernel/exit.c b/kernel/exit.c
index 62912406d74a..58d373767d16 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -841,8 +841,7 @@ void __noreturn do_exit(long code)
 	if (tsk->splice_pipe)
 		free_pipe_info(tsk->splice_pipe);
 
-	if (tsk->task_frag.page)
-		put_page(tsk->task_frag.page);
+	put_page_frag(&tsk->task_frag);
 
 	validate_creds_for_do_exit(tsk);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cefbef32bf4a..6c543158aa06 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5379,7 +5379,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 		" unevictable:%lu dirty:%lu writeback:%lu\n"
 		" slab_reclaimable:%lu slab_unreclaimable:%lu\n"
 		" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
-		" free:%lu free_pcp:%lu free_cma:%lu\n",
+		" free:%lu free_pcp:%lu free_cma:%lu sock:%lu\n",
 		global_node_page_state(NR_ACTIVE_ANON),
 		global_node_page_state(NR_INACTIVE_ANON),
 		global_node_page_state(NR_ISOLATED_ANON),
@@ -5397,7 +5397,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 		global_zone_page_state(NR_BOUNCE),
 		global_zone_page_state(NR_FREE_PAGES),
 		free_pcp,
-		global_zone_page_state(NR_FREE_CMA_PAGES));
+		global_zone_page_state(NR_FREE_CMA_PAGES),
+		global_node_page_state(NR_SOCK));
 
 	for_each_online_pgdat(pgdat) {
 		if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
@@ -5425,6 +5426,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 #ifdef CONFIG_SHADOW_CALL_STACK
 			" shadow_call_stack:%lukB"
 #endif
+			" sock:%lukB"
 			" all_unreclaimable? %s"
 			"\n",
 			pgdat->node_id,
@@ -5450,6 +5452,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 #ifdef CONFIG_SHADOW_CALL_STACK
 			node_page_state(pgdat, NR_KERNEL_SCS_KB),
 #endif
+			K(node_page_state(pgdat, NR_SOCK)),
 			pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
 				"yes" : "no");
 	}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b05dec387557..ceaf6f85c155 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1220,6 +1220,7 @@ const char * const vmstat_text[] = {
 #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
 	"nr_shadow_call_stack",
 #endif
+	"nr_sock",
 
 	/* enum writeback_stat_item counters */
 	"nr_dirty_threshold",
diff --git a/net/core/sock.c b/net/core/sock.c
index 5972d26f03ae..1661b423802b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1780,10 +1780,8 @@ static void __sk_destruct(struct rcu_head *head)
 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
 			 __func__, atomic_read(&sk->sk_omem_alloc));
 
-	if (sk->sk_frag.page) {
-		put_page(sk->sk_frag.page);
+	if (put_page_frag(&sk->sk_frag))
 		sk->sk_frag.page = NULL;
-	}
 
 	if (sk->sk_peer_cred)
 		put_cred(sk->sk_peer_cred);
@@ -2456,7 +2454,7 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
 		}
 		if (pfrag->offset + sz <= pfrag->size)
 			return true;
-		put_page(pfrag->page);
+		put_page_frag(pfrag);
 	}
 
 	pfrag->offset = 0;
@@ -2469,12 +2467,14 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
 					  SKB_FRAG_PAGE_ORDER);
 		if (likely(pfrag->page)) {
 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
+			inc_sock_node_page_state(pfrag->page);
 			return true;
 		}
 	}
 	pfrag->page = alloc_page(gfp);
 	if (likely(pfrag->page)) {
 		pfrag->size = PAGE_SIZE;
+		inc_sock_node_page_state(pfrag->page);
 		return true;
 	}
 	return false;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 57a568875539..583761844b4f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2751,8 +2751,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 
 	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
 
-	if (sk->sk_frag.page) {
-		put_page(sk->sk_frag.page);
+	if (put_page_frag(&sk->sk_frag)) {
 		sk->sk_frag.page = NULL;
 		sk->sk_frag.offset = 0;
 	}
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 69520ad3d83b..0f7c16679e49 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -495,8 +495,7 @@ static void ___xfrm_state_destroy(struct xfrm_state *x)
 		x->type->destructor(x);
 		xfrm_put_type(x->type);
 	}
-	if (x->xfrag.page)
-		put_page(x->xfrag.page);
+	put_page_frag(&x->xfrag);
 	xfrm_dev_state_free(x);
 	security_xfrm_state_free(x);
 	xfrm_state_free(x);
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] mm: proc: add Sock to /proc/meminfo
  2020-10-10 10:38 Muchun Song
@ 2020-10-10 13:06 ` kernel test robot
  2020-10-10 13:27 ` kernel test robot
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 9+ messages in thread
From: kernel test robot @ 2020-10-10 13:06 UTC (permalink / raw)
  To: kbuild-all

[-- Attachment #1: Type: text/plain, Size: 3348 bytes --]

Hi Muchun,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on ipsec-next/master]
[also build test WARNING on ipsec/master linus/master hnaz-linux-mm/master v5.9-rc8]
[cannot apply to mmotm/master driver-core/driver-core-testing next-20201009]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Muchun-Song/mm-proc-add-Sock-to-proc-meminfo/20201010-184040
base:   https://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git master
config: powerpc-randconfig-r031-20201010 (attached as .config)
compiler: clang version 12.0.0 (https://github.com/llvm/llvm-project 9b5b3050237db3642ed7ab1bdb3ffa2202511b99)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install powerpc cross compiling tool for clang build
        # apt-get install binutils-powerpc-linux-gnu
        # https://github.com/0day-ci/linux/commit/9befda995cb116f94e34d4ffdc8d9dd37a91dc49
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Muchun-Song/mm-proc-add-Sock-to-proc-meminfo/20201010-184040
        git checkout 9befda995cb116f94e34d4ffdc8d9dd37a91dc49
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=powerpc 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> security/keys/encrypted-keys/encrypted.c:52:9: warning: 'HASH_SIZE' macro redefined [-Wmacro-redefined]
   #define HASH_SIZE SHA256_DIGEST_SIZE
           ^
   include/linux/hashtable.h:27:9: note: previous definition is here
   #define HASH_SIZE(name) (ARRAY_SIZE(name))
           ^
   1 warning generated.

vim +/HASH_SIZE +52 security/keys/encrypted-keys/encrypted.c

7e70cb4978507cf security/keys/encrypted_defined.c        Mimi Zohar    2010-11-23  48  
3b1826cebe1d534 security/keys/encrypted_defined.c        Mimi Zohar    2010-12-13  49  #define KEY_TRUSTED_PREFIX_LEN (sizeof (KEY_TRUSTED_PREFIX) - 1)
3b1826cebe1d534 security/keys/encrypted_defined.c        Mimi Zohar    2010-12-13  50  #define KEY_USER_PREFIX_LEN (sizeof (KEY_USER_PREFIX) - 1)
79a73d188726b47 security/keys/encrypted.c                Roberto Sassu 2011-06-27  51  #define KEY_ECRYPTFS_DESC_LEN 16
3b1826cebe1d534 security/keys/encrypted_defined.c        Mimi Zohar    2010-12-13 @52  #define HASH_SIZE SHA256_DIGEST_SIZE
3b1826cebe1d534 security/keys/encrypted_defined.c        Mimi Zohar    2010-12-13  53  #define MAX_DATA_SIZE 4096
3b1826cebe1d534 security/keys/encrypted_defined.c        Mimi Zohar    2010-12-13  54  #define MIN_DATA_SIZE  20
9db67581b91d9e9 security/keys/encrypted-keys/encrypted.c Dave Jiang    2018-12-04  55  #define KEY_ENC32_PAYLOAD_LEN 32
3b1826cebe1d534 security/keys/encrypted_defined.c        Mimi Zohar    2010-12-13  56  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org

[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 29729 bytes --]

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] mm: proc: add Sock to /proc/meminfo
  2020-10-10 10:38 Muchun Song
  2020-10-10 13:06 ` kernel test robot
@ 2020-10-10 13:27 ` kernel test robot
  2020-10-10 16:36   ` Randy Dunlap
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 9+ messages in thread
From: kernel test robot @ 2020-10-10 13:27 UTC (permalink / raw)
  To: kbuild-all

[-- Attachment #1: Type: text/plain, Size: 3391 bytes --]

Hi Muchun,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on ipsec-next/master]
[also build test WARNING on ipsec/master linus/master hnaz-linux-mm/master v5.9-rc8]
[cannot apply to mmotm/master driver-core/driver-core-testing next-20201009]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Muchun-Song/mm-proc-add-Sock-to-proc-meminfo/20201010-184040
base:   https://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git master
config: x86_64-randconfig-m001-20201010 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-15) 9.3.0
reproduce (this is a W=1 build):
        # https://github.com/0day-ci/linux/commit/9befda995cb116f94e34d4ffdc8d9dd37a91dc49
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Muchun-Song/mm-proc-add-Sock-to-proc-meminfo/20201010-184040
        git checkout 9befda995cb116f94e34d4ffdc8d9dd37a91dc49
        # save the attached .config to linux build tree
        make W=1 ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> security/keys/encrypted-keys/encrypted.c:52: warning: "HASH_SIZE" redefined
      52 | #define HASH_SIZE SHA256_DIGEST_SIZE
         | 
   In file included from include/linux/elevator.h:6,
                    from include/linux/blkdev.h:285,
                    from include/linux/blk-cgroup.h:23,
                    from include/linux/writeback.h:14,
                    from include/linux/memcontrol.h:22,
                    from include/linux/skbuff.h:22,
                    from include/crypto/algapi.h:13,
                    from security/keys/encrypted-keys/encrypted.c:30:
   include/linux/hashtable.h:27: note: this is the location of the previous definition
      27 | #define HASH_SIZE(name) (ARRAY_SIZE(name))
         | 

vim +/HASH_SIZE +52 security/keys/encrypted-keys/encrypted.c

7e70cb4978507c security/keys/encrypted_defined.c        Mimi Zohar    2010-11-23  48  
3b1826cebe1d53 security/keys/encrypted_defined.c        Mimi Zohar    2010-12-13  49  #define KEY_TRUSTED_PREFIX_LEN (sizeof (KEY_TRUSTED_PREFIX) - 1)
3b1826cebe1d53 security/keys/encrypted_defined.c        Mimi Zohar    2010-12-13  50  #define KEY_USER_PREFIX_LEN (sizeof (KEY_USER_PREFIX) - 1)
79a73d188726b4 security/keys/encrypted.c                Roberto Sassu 2011-06-27  51  #define KEY_ECRYPTFS_DESC_LEN 16
3b1826cebe1d53 security/keys/encrypted_defined.c        Mimi Zohar    2010-12-13 @52  #define HASH_SIZE SHA256_DIGEST_SIZE
3b1826cebe1d53 security/keys/encrypted_defined.c        Mimi Zohar    2010-12-13  53  #define MAX_DATA_SIZE 4096
3b1826cebe1d53 security/keys/encrypted_defined.c        Mimi Zohar    2010-12-13  54  #define MIN_DATA_SIZE  20
9db67581b91d9e security/keys/encrypted-keys/encrypted.c Dave Jiang    2018-12-04  55  #define KEY_ENC32_PAYLOAD_LEN 32
3b1826cebe1d53 security/keys/encrypted_defined.c        Mimi Zohar    2010-12-13  56  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org

[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 30594 bytes --]

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] mm: proc: add Sock to /proc/meminfo
@ 2020-10-10 13:57 kernel test robot
  0 siblings, 0 replies; 9+ messages in thread
From: kernel test robot @ 2020-10-10 13:57 UTC (permalink / raw)
  To: kbuild

[-- Attachment #1: Type: text/plain, Size: 3738 bytes --]

CC: kbuild-all(a)lists.01.org
In-Reply-To: <20201010103854.66746-1-songmuchun@bytedance.com>
References: <20201010103854.66746-1-songmuchun@bytedance.com>
TO: Muchun Song <songmuchun@bytedance.com>
TO: gregkh(a)linuxfoundation.org
TO: rafael(a)kernel.org
TO: mst(a)redhat.com
TO: jasowang(a)redhat.com
TO: davem(a)davemloft.net
TO: kuba(a)kernel.org
TO: adobriyan(a)gmail.com
TO: akpm(a)linux-foundation.org
TO: edumazet(a)google.com
TO: kuznet(a)ms2.inr.ac.ru

Hi Muchun,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on ipsec-next/master]
[also build test WARNING on ipsec/master linus/master hnaz-linux-mm/master v5.9-rc8]
[cannot apply to mmotm/master driver-core/driver-core-testing next-20201009]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Muchun-Song/mm-proc-add-Sock-to-proc-meminfo/20201010-184040
base:   https://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec-next.git master
:::::: branch date: 3 hours ago
:::::: commit date: 3 hours ago
config: x86_64-randconfig-s022-20201010 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-15) 9.3.0
reproduce:
        # apt-get install sparse
        # sparse version: v0.6.2-229-g0c6896f4-dirty
        # https://github.com/0day-ci/linux/commit/9befda995cb116f94e34d4ffdc8d9dd37a91dc49
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Muchun-Song/mm-proc-add-Sock-to-proc-meminfo/20201010-184040
        git checkout 9befda995cb116f94e34d4ffdc8d9dd37a91dc49
        # save the attached .config to linux build tree
        make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>


"sparse warnings: (new ones prefixed by >>)"
   security/keys/encrypted-keys/encrypted.c:52:9: sparse: sparse: preprocessor token HASH_SIZE redefined
   security/keys/encrypted-keys/encrypted.c: note: in included file (through include/linux/elevator.h, include/linux/blkdev.h, include/linux/blk-cgroup.h, ...):
>> include/linux/hashtable.h:27:9: sparse: this was the original definition

vim +27 include/linux/hashtable.h

d9b482c8ba1973 Sasha Levin  2012-10-30  15  
d9b482c8ba1973 Sasha Levin  2012-10-30  16  #define DEFINE_HASHTABLE(name, bits)						\
d9b482c8ba1973 Sasha Levin  2012-10-30  17  	struct hlist_head name[1 << (bits)] =					\
d9b482c8ba1973 Sasha Levin  2012-10-30  18  			{ [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT }
d9b482c8ba1973 Sasha Levin  2012-10-30  19  
6180d9de61a5c4 Eric Dumazet 2015-11-18  20  #define DEFINE_READ_MOSTLY_HASHTABLE(name, bits)				\
6180d9de61a5c4 Eric Dumazet 2015-11-18  21  	struct hlist_head name[1 << (bits)] __read_mostly =			\
6180d9de61a5c4 Eric Dumazet 2015-11-18  22  			{ [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT }
6180d9de61a5c4 Eric Dumazet 2015-11-18  23  
d9b482c8ba1973 Sasha Levin  2012-10-30  24  #define DECLARE_HASHTABLE(name, bits)                                   	\
d9b482c8ba1973 Sasha Levin  2012-10-30  25  	struct hlist_head name[1 << (bits)]
d9b482c8ba1973 Sasha Levin  2012-10-30  26  
d9b482c8ba1973 Sasha Levin  2012-10-30 @27  #define HASH_SIZE(name) (ARRAY_SIZE(name))
d9b482c8ba1973 Sasha Levin  2012-10-30  28  #define HASH_BITS(name) ilog2(HASH_SIZE(name))
d9b482c8ba1973 Sasha Levin  2012-10-30  29  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 28949 bytes --]

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] mm: proc: add Sock to /proc/meminfo
  2020-10-10 10:38 Muchun Song
@ 2020-10-10 16:36   ` Randy Dunlap
  2020-10-10 13:27 ` kernel test robot
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 9+ messages in thread
From: Randy Dunlap @ 2020-10-10 16:36 UTC (permalink / raw)
  To: Muchun Song, gregkh, rafael, mst, jasowang, davem, kuba,
	adobriyan, akpm, edumazet, kuznet, yoshfuji, steffen.klassert,
	herbert, shakeelb, will, mhocko, guro, neilb, rppt, samitolvanen,
	kirill.shutemov, feng.tang, pabeni, willemb, fw, gustavoars,
	pablo, decui, jakub, peterz, christian.brauner, ebiederm, tglx,
	dave, walken, jannh, chenqiwu, christophe.leroy, minchan, kafai,
	ast, daniel, linmiaohe, keescook
  Cc: linux-fsdevel, netdev, linux-mm, linux-kernel, virtualization

Hi,

On 10/10/20 3:38 AM, Muchun Song wrote:
> The amount of memory allocated to sockets buffer can become significant.
> However, we do not display the amount of memory consumed by sockets
> buffer. In this case, knowing where the memory is consumed by the kernel
> is very difficult. On our server with 500GB RAM, sometimes we can see
> 25GB disappear through /proc/meminfo. After our analysis, we found the
> following memory allocation path which consumes the memory with page_owner
> enabled.
> 
>   849698 times:
>   Page allocated via order 3, mask 0x4052c0(GFP_NOWAIT|__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP)
>    __alloc_pages_nodemask+0x11d/0x290
>    skb_page_frag_refill+0x68/0xf0
>    sk_page_frag_refill+0x19/0x70
>    tcp_sendmsg_locked+0x2f4/0xd10
>    tcp_sendmsg+0x29/0xa0
>    sock_sendmsg+0x30/0x40
>    sock_write_iter+0x8f/0x100
>    __vfs_write+0x10b/0x190
>    vfs_write+0xb0/0x190
>    ksys_write+0x5a/0xd0
>    do_syscall_64+0x5d/0x110
>    entry_SYSCALL_64_after_hwframe+0x44/0xa9
> 
> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> ---
>  drivers/base/node.c      |  2 ++
>  drivers/net/virtio_net.c |  3 +--
>  fs/proc/meminfo.c        |  1 +
>  include/linux/mmzone.h   |  1 +
>  include/linux/skbuff.h   | 43 ++++++++++++++++++++++++++++++++++++++--
>  kernel/exit.c            |  3 +--
>  mm/page_alloc.c          |  7 +++++--
>  mm/vmstat.c              |  1 +
>  net/core/sock.c          |  8 ++++----
>  net/ipv4/tcp.c           |  3 +--
>  net/xfrm/xfrm_state.c    |  3 +--
>  11 files changed, 59 insertions(+), 16 deletions(-)

Thanks for finding that.

Please update Documentation/filesystems/proc.rst "meminfo" section also.

-- 
~Randy

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] mm: proc: add Sock to /proc/meminfo
@ 2020-10-10 16:36   ` Randy Dunlap
  0 siblings, 0 replies; 9+ messages in thread
From: Randy Dunlap @ 2020-10-10 16:36 UTC (permalink / raw)
  To: Muchun Song, gregkh, rafael, mst, jasowang, davem, kuba,
	adobriyan, akpm, edumazet, kuznet, yoshfuji, steffen.klassert,
	herbert, shakeelb, will, mhocko, guro, neilb, rppt, samitolvanen,
	kirill.shutemov, feng.tang, pabeni, willemb, fw, gustavoars,
	pablo, decui, jakub, peterz, christian.brauner, ebiederm, tglx,
	dave, walken, jannh, chenqiwu, christophe.leroy, minchan, kafai,
	ast, daniel, linmiaohe, keescook
  Cc: linux-kernel, virtualization, netdev, linux-fsdevel, linux-mm

Hi,

On 10/10/20 3:38 AM, Muchun Song wrote:
> The amount of memory allocated to sockets buffer can become significant.
> However, we do not display the amount of memory consumed by sockets
> buffer. In this case, knowing where the memory is consumed by the kernel
> is very difficult. On our server with 500GB RAM, sometimes we can see
> 25GB disappear through /proc/meminfo. After our analysis, we found the
> following memory allocation path which consumes the memory with page_owner
> enabled.
> 
>   849698 times:
>   Page allocated via order 3, mask 0x4052c0(GFP_NOWAIT|__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP)
>    __alloc_pages_nodemask+0x11d/0x290
>    skb_page_frag_refill+0x68/0xf0
>    sk_page_frag_refill+0x19/0x70
>    tcp_sendmsg_locked+0x2f4/0xd10
>    tcp_sendmsg+0x29/0xa0
>    sock_sendmsg+0x30/0x40
>    sock_write_iter+0x8f/0x100
>    __vfs_write+0x10b/0x190
>    vfs_write+0xb0/0x190
>    ksys_write+0x5a/0xd0
>    do_syscall_64+0x5d/0x110
>    entry_SYSCALL_64_after_hwframe+0x44/0xa9
> 
> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> ---
>  drivers/base/node.c      |  2 ++
>  drivers/net/virtio_net.c |  3 +--
>  fs/proc/meminfo.c        |  1 +
>  include/linux/mmzone.h   |  1 +
>  include/linux/skbuff.h   | 43 ++++++++++++++++++++++++++++++++++++++--
>  kernel/exit.c            |  3 +--
>  mm/page_alloc.c          |  7 +++++--
>  mm/vmstat.c              |  1 +
>  net/core/sock.c          |  8 ++++----
>  net/ipv4/tcp.c           |  3 +--
>  net/xfrm/xfrm_state.c    |  3 +--
>  11 files changed, 59 insertions(+), 16 deletions(-)

Thanks for finding that.

Please update Documentation/filesystems/proc.rst "meminfo" section also.

-- 
~Randy


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] mm: proc: add Sock to /proc/meminfo
  2020-10-10 10:38 Muchun Song
                   ` (2 preceding siblings ...)
  2020-10-10 16:36   ` Randy Dunlap
@ 2020-10-11 13:52 ` Mike Rapoport
  2020-10-11 18:39   ` Cong Wang
  4 siblings, 0 replies; 9+ messages in thread
From: Mike Rapoport @ 2020-10-11 13:52 UTC (permalink / raw)
  To: Muchun Song
  Cc: gregkh, rafael, mst, jasowang, davem, kuba, adobriyan, akpm,
	edumazet, kuznet, yoshfuji, steffen.klassert, herbert, shakeelb,
	will, mhocko, guro, neilb, samitolvanen, kirill.shutemov,
	feng.tang, pabeni, willemb, rdunlap, fw, gustavoars, pablo, decui,
	jakub, peterz, christian.brauner, ebiederm, tglx, dave, walken,
	jannh, chenqiwu, christophe.leroy, minchan, kafai, ast, daniel,
	linmiaohe, keescook, linux-kernel, virtualization, netdev,
	linux-fsdevel, linux-mm

On Sat, Oct 10, 2020 at 06:38:54PM +0800, Muchun Song wrote:
> The amount of memory allocated to sockets buffer can become significant.
> However, we do not display the amount of memory consumed by sockets
> buffer. In this case, knowing where the memory is consumed by the kernel
> is very difficult. On our server with 500GB RAM, sometimes we can see
> 25GB disappear through /proc/meminfo. After our analysis, we found the
> following memory allocation path which consumes the memory with page_owner
> enabled.
 
I have a high-level question.
There is accounting of the socket memory for memcg that gets called from
the networking layer. Did you check if the same call sites can be used
for the system-wide accounting as well?

>   849698 times:
>   Page allocated via order 3, mask 0x4052c0(GFP_NOWAIT|__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP)
>    __alloc_pages_nodemask+0x11d/0x290
>    skb_page_frag_refill+0x68/0xf0
>    sk_page_frag_refill+0x19/0x70
>    tcp_sendmsg_locked+0x2f4/0xd10
>    tcp_sendmsg+0x29/0xa0
>    sock_sendmsg+0x30/0x40
>    sock_write_iter+0x8f/0x100
>    __vfs_write+0x10b/0x190
>    vfs_write+0xb0/0x190
>    ksys_write+0x5a/0xd0
>    do_syscall_64+0x5d/0x110
>    entry_SYSCALL_64_after_hwframe+0x44/0xa9
> 
> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> ---
>  drivers/base/node.c      |  2 ++
>  drivers/net/virtio_net.c |  3 +--

Is virtio-net the only driver that required an update?

>  fs/proc/meminfo.c        |  1 +
>  include/linux/mmzone.h   |  1 +
>  include/linux/skbuff.h   | 43 ++++++++++++++++++++++++++++++++++++++--
>  kernel/exit.c            |  3 +--
>  mm/page_alloc.c          |  7 +++++--
>  mm/vmstat.c              |  1 +
>  net/core/sock.c          |  8 ++++----
>  net/ipv4/tcp.c           |  3 +--
>  net/xfrm/xfrm_state.c    |  3 +--
>  11 files changed, 59 insertions(+), 16 deletions(-)
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] mm: proc: add Sock to /proc/meminfo
  2020-10-10 10:38 Muchun Song
@ 2020-10-11 18:39   ` Cong Wang
  2020-10-10 13:27 ` kernel test robot
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 9+ messages in thread
From: Cong Wang @ 2020-10-11 18:39 UTC (permalink / raw)
  To: Muchun Song
  Cc: Miaohe Lin, Feng Tang, Michal Hocko, Michael S. Tsirkin,
	Neil Brown, Alexei Starovoitov, LKML, linux-mm, Eric Dumazet,
	Christian Brauner, walken, will, Steffen Klassert, dave,
	Herbert Xu, Daniel Borkmann, rafael, decui, Peter Zijlstra,
	samitolvanen, Alexey Kuznetsov, Paolo Abeni, Alexey Dobriyan,
	Pablo Neira Ayuso, Eric W. Biederman, Kees Cook, Jann Horn,
	shakeelb, Jakub Kicinski, Thomas Gleixner, virtualization,
	chenqiwu, Martin KaFai Lau, Jakub Sitnicki, christophe.leroy,
	Willem de Bruijn, Hideaki YOSHIFUJI, Greg KH, Randy Dunlap,
	Florian Westphal, gustavoars, Roman Gushchin, Minchan Kim, rppt,
	Linux Kernel Network Developers, linux-fsdevel, Andrew Morton,
	David Miller, Kirill A. Shutemov

On Sat, Oct 10, 2020 at 3:39 AM Muchun Song <songmuchun@bytedance.com> wrote:
>
> The amount of memory allocated to sockets buffer can become significant.
> However, we do not display the amount of memory consumed by sockets
> buffer. In this case, knowing where the memory is consumed by the kernel

We do it via `ss -m`. Is it not sufficient? And if not, why not adding it there
rather than /proc/meminfo?

>  static inline void __skb_frag_unref(skb_frag_t *frag)
>  {
> -       put_page(skb_frag_page(frag));
> +       struct page *page = skb_frag_page(frag);
> +
> +       if (put_page_testzero(page)) {
> +               dec_sock_node_page_state(page);
> +               __put_page(page);
> +       }
>  }

You mix socket page frag with skb frag at least, not sure this is exactly
what you want, because clearly skb page frags are frequently used
by network drivers rather than sockets.

Also, which one matches this dec_sock_node_page_state()? Clearly
not skb_fill_page_desc() or __skb_frag_ref().

Thanks.
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] mm: proc: add Sock to /proc/meminfo
@ 2020-10-11 18:39   ` Cong Wang
  0 siblings, 0 replies; 9+ messages in thread
From: Cong Wang @ 2020-10-11 18:39 UTC (permalink / raw)
  To: Muchun Song
  Cc: Greg KH, rafael, Michael S. Tsirkin, Jason Wang, David Miller,
	Jakub Kicinski, Alexey Dobriyan, Andrew Morton, Eric Dumazet,
	Alexey Kuznetsov, Hideaki YOSHIFUJI, Steffen Klassert, Herbert Xu,
	shakeelb, will, Michal Hocko, Roman Gushchin, Neil Brown, rppt,
	samitolvanen, Kirill A. Shutemov, Feng Tang, Paolo Abeni,
	Willem de Bruijn, Randy Dunlap, Florian Westphal, gustavoars,
	Pablo Neira Ayuso, decui, Jakub Sitnicki, Peter Zijlstra,
	Christian Brauner, Eric W. Biederman, Thomas Gleixner, dave,
	walken, Jann Horn, chenqiwu, christophe.leroy, Minchan Kim,
	Martin KaFai Lau, Alexei Starovoitov, Daniel Borkmann, Miaohe Lin,
	Kees Cook, LKML, virtualization, Linux Kernel Network Developers,
	linux-fsdevel, linux-mm

On Sat, Oct 10, 2020 at 3:39 AM Muchun Song <songmuchun@bytedance.com> wrote:
>
> The amount of memory allocated to sockets buffer can become significant.
> However, we do not display the amount of memory consumed by sockets
> buffer. In this case, knowing where the memory is consumed by the kernel

We do it via `ss -m`. Is it not sufficient? And if not, why not adding it there
rather than /proc/meminfo?

>  static inline void __skb_frag_unref(skb_frag_t *frag)
>  {
> -       put_page(skb_frag_page(frag));
> +       struct page *page = skb_frag_page(frag);
> +
> +       if (put_page_testzero(page)) {
> +               dec_sock_node_page_state(page);
> +               __put_page(page);
> +       }
>  }

You mix socket page frag with skb frag at least, not sure this is exactly
what you want, because clearly skb page frags are frequently used
by network drivers rather than sockets.

Also, which one matches this dec_sock_node_page_state()? Clearly
not skb_fill_page_desc() or __skb_frag_ref().

Thanks.

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2020-10-11 18:39 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2020-10-10 13:57 [PATCH] mm: proc: add Sock to /proc/meminfo kernel test robot
  -- strict thread matches above, loose matches on Subject: below --
2020-10-10 10:38 Muchun Song
2020-10-10 13:06 ` kernel test robot
2020-10-10 13:27 ` kernel test robot
2020-10-10 16:36 ` Randy Dunlap
2020-10-10 16:36   ` Randy Dunlap
2020-10-11 13:52 ` Mike Rapoport
2020-10-11 18:39 ` Cong Wang
2020-10-11 18:39   ` Cong Wang

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.