netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 3/4] udp: add udp_mem, udp_rmem_min and udp_wmem_min
  2007-11-28 18:48 [PATCH 0/4] UDP memory accounting and limitation (take 9) Hideo AOKI
@ 2007-11-28 18:53 ` Hideo AOKI
  0 siblings, 0 replies; 16+ messages in thread
From: Hideo AOKI @ 2007-11-28 18:53 UTC (permalink / raw)
  To: Herbert Xu, netdev
  Cc: David Miller, Satoshi Oshima, Bill Fink, Andi Kleen,
	Evgeniy Polyakov, Stephen Hemminger, yoshfuji, Yumiko Sugita,
	haoki

This patch adds sysctl parameters for customizing UDP memory accounting:
     /proc/sys/net/ipv4/udp_mem
     /proc/sys/net/ipv4/udp_rmem_min
     /proc/sys/net/ipv4/udp_wmem_min

Udp_mem indicates number of pages which can be used for all UDP sockets.
Each UDP packet is dropped, when the number of pages for socket buffer is
beyond udp_mem and the socket already consumes minimum buffer.

This patch is also introduced memory_allocated variable for UDP protocol.

Cc: Satoshi Oshima <satoshi.oshima.fk@hitachi.com>
signed-off-by: Hideo Aoki <haoki@redhat.com>
---

 Documentation/networking/ip-sysctl.txt |   18 ++++++++++++++++++
 include/net/udp.h                      |    9 +++++++++
 net/ipv4/af_inet.c                     |    3 +++
 net/ipv4/proc.c                        |    3 ++-
 net/ipv4/sysctl_net_ipv4.c             |   31 +++++++++++++++++++++++++++++++
 net/ipv4/udp.c                         |   27 +++++++++++++++++++++++++++
 6 files changed, 90 insertions(+), 1 deletion(-)

diff -pruN net-2.6-udp-take9a2-p2/Documentation/networking/ip-sysctl.txt net-2.6-udp-take9a2-p3/Documentation/networking/ip-sysctl.txt
--- net-2.6-udp-take9a2-p2/Documentation/networking/ip-sysctl.txt	2007-11-14 10:48:49.000000000 -0500
+++ net-2.6-udp-take9a2-p3/Documentation/networking/ip-sysctl.txt	2007-11-28 12:11:02.000000000 -0500
@@ -446,6 +446,24 @@ tcp_dma_copybreak - INTEGER
 	and CONFIG_NET_DMA is enabled.
 	Default: 4096

+UDP variables:
+
+udp_mem - INTEGER
+	Number of pages allowed for queueing by all UDP sockets.
+	Default is calculated at boot time from amount of available memory.
+
+udp_rmem_min - INTEGER
+	Minimal size of receive buffer used by UDP sockets. Each UDP socket
+	is able to use the size for receiving data, even if total pages of UDP
+	sockets exceed udp_mem. The unit is byte.
+	Default: 4096
+
+udp_wmem_min - INTEGER
+	Minimal size of send buffer used by UDP sockets. Each UDP socket is
+	able to use the size for sending data, even if total pages of UDP
+	sockets exceed udp_mem. The unit is byte.
+	Default: 4096
+
 CIPSOv4 Variables:

 cipso_cache_enable - BOOLEAN
diff -pruN net-2.6-udp-take9a2-p2/include/net/udp.h net-2.6-udp-take9a2-p3/include/net/udp.h
--- net-2.6-udp-take9a2-p2/include/net/udp.h	2007-11-14 10:49:05.000000000 -0500
+++ net-2.6-udp-take9a2-p3/include/net/udp.h	2007-11-28 12:11:02.000000000 -0500
@@ -65,6 +65,13 @@ extern rwlock_t udp_hash_lock;

 extern struct proto udp_prot;

+extern atomic_t udp_memory_allocated;
+
+/* sysctl variables for udp */
+extern int sysctl_udp_mem;
+extern int sysctl_udp_rmem_min;
+extern int sysctl_udp_wmem_min;
+
 struct sk_buff;

 /*
@@ -173,4 +180,6 @@ extern void udp_proc_unregister(struct u
 extern int  udp4_proc_init(void);
 extern void udp4_proc_exit(void);
 #endif
+
+extern void udp_init(void);
 #endif	/* _UDP_H */
diff -pruN net-2.6-udp-take9a2-p2/net/ipv4/af_inet.c net-2.6-udp-take9a2-p3/net/ipv4/af_inet.c
--- net-2.6-udp-take9a2-p2/net/ipv4/af_inet.c	2007-11-14 10:49:06.000000000 -0500
+++ net-2.6-udp-take9a2-p3/net/ipv4/af_inet.c	2007-11-28 12:11:02.000000000 -0500
@@ -1418,6 +1418,9 @@ static int __init inet_init(void)
 	/* Setup TCP slab cache for open requests. */
 	tcp_init();

+	/* Setup UDP memory threshold */
+	udp_init();
+
 	/* Add UDP-Lite (RFC 3828) */
 	udplite4_register();

diff -pruN net-2.6-udp-take9a2-p2/net/ipv4/proc.c net-2.6-udp-take9a2-p3/net/ipv4/proc.c
--- net-2.6-udp-take9a2-p2/net/ipv4/proc.c	2007-11-14 10:49:07.000000000 -0500
+++ net-2.6-udp-take9a2-p3/net/ipv4/proc.c	2007-11-28 12:11:02.000000000 -0500
@@ -56,7 +56,8 @@ static int sockstat_seq_show(struct seq_
 		   sock_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count),
 		   tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated),
 		   atomic_read(&tcp_memory_allocated));
-	seq_printf(seq, "UDP: inuse %d\n", sock_prot_inuse(&udp_prot));
+	seq_printf(seq, "UDP: inuse %d mem %d\n", sock_prot_inuse(&udp_prot),
+		   atomic_read(&udp_memory_allocated));
 	seq_printf(seq, "UDPLITE: inuse %d\n", sock_prot_inuse(&udplite_prot));
 	seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse(&raw_prot));
 	seq_printf(seq,  "FRAG: inuse %d memory %d\n",
diff -pruN net-2.6-udp-take9a2-p2/net/ipv4/sysctl_net_ipv4.c net-2.6-udp-take9a2-p3/net/ipv4/sysctl_net_ipv4.c
--- net-2.6-udp-take9a2-p2/net/ipv4/sysctl_net_ipv4.c	2007-11-20 10:29:40.000000000 -0500
+++ net-2.6-udp-take9a2-p3/net/ipv4/sysctl_net_ipv4.c	2007-11-28 12:11:02.000000000 -0500
@@ -18,6 +18,7 @@
 #include <net/ip.h>
 #include <net/route.h>
 #include <net/tcp.h>
+#include <net/udp.h>
 #include <net/cipso_ipv4.h>
 #include <net/inet_frag.h>

@@ -885,6 +886,36 @@ ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "udp_mem",
+		.data		= &sysctl_udp_mem,
+		.maxlen		= sizeof(sysctl_udp_mem),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "udp_rmem_min",
+		.data		= &sysctl_udp_rmem_min,
+		.maxlen		= sizeof(sysctl_udp_rmem_min),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "udp_wmem_min",
+		.data		= &sysctl_udp_wmem_min,
+		.maxlen		= sizeof(sysctl_udp_wmem_min),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero
+	},
 	{ .ctl_name = 0 }
 };

diff -pruN net-2.6-udp-take9a2-p2/net/ipv4/udp.c net-2.6-udp-take9a2-p3/net/ipv4/udp.c
--- net-2.6-udp-take9a2-p2/net/ipv4/udp.c	2007-11-14 10:49:07.000000000 -0500
+++ net-2.6-udp-take9a2-p3/net/ipv4/udp.c	2007-11-28 12:11:02.000000000 -0500
@@ -82,6 +82,7 @@
 #include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
+#include <linux/bootmem.h>
 #include <linux/types.h>
 #include <linux/fcntl.h>
 #include <linux/module.h>
@@ -114,6 +115,11 @@ DEFINE_SNMP_STAT(struct udp_mib, udp_sta
 struct hlist_head udp_hash[UDP_HTABLE_SIZE];
 DEFINE_RWLOCK(udp_hash_lock);

+atomic_t udp_memory_allocated;
+int sysctl_udp_mem __read_mostly;
+int sysctl_udp_rmem_min __read_mostly;
+int sysctl_udp_wmem_min __read_mostly;
+
 static inline int __udp_lib_lport_inuse(__u16 num,
 					const struct hlist_head udptable[])
 {
@@ -1449,6 +1455,10 @@ struct proto udp_prot = {
 	.hash		   = udp_lib_hash,
 	.unhash		   = udp_lib_unhash,
 	.get_port	   = udp_v4_get_port,
+	.memory_allocated  = &udp_memory_allocated,
+	.sysctl_mem	   = &sysctl_udp_mem,
+	.sysctl_wmem	   = &sysctl_udp_wmem_min,
+	.sysctl_rmem	   = &sysctl_udp_rmem_min,
 	.obj_size	   = sizeof(struct udp_sock),
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_udp_setsockopt,
@@ -1644,6 +1654,23 @@ void udp4_proc_exit(void)
 }
 #endif /* CONFIG_PROC_FS */

+void __init udp_init(void)
+{
+	unsigned long limit;
+
+	/* Set the pressure threshold up by the same strategy of TCP. It is a
+	 * fraction of global memory that is up to 1/2 at 256 MB, decreasing
+	 * toward zero with the amount of memory, with a floor of 128 pages.
+	 */
+	limit = min(nr_all_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
+	limit = (limit * (nr_all_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
+	limit = max(limit, 128UL);
+	sysctl_udp_mem = limit / 2 * 3;
+
+	sysctl_udp_rmem_min = SK_DATAGRAM_MEM_QUANTUM;
+	sysctl_udp_wmem_min = SK_DATAGRAM_MEM_QUANTUM;
+}
+
 EXPORT_SYMBOL(udp_disconnect);
 EXPORT_SYMBOL(udp_hash);
 EXPORT_SYMBOL(udp_hash_lock);
-- 
Hitachi Computer Products (America) Inc.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 3/4] [UDP]: add udp_mem, udp_rmem_min and udp_wmem_min
  2007-12-15  5:07 [PATCH 0/4] [UDP]: memory accounting and limitation (take 10) Hideo AOKI
@ 2007-12-15  5:15 ` Hideo AOKI
  0 siblings, 0 replies; 16+ messages in thread
From: Hideo AOKI @ 2007-12-15  5:15 UTC (permalink / raw)
  To: David Miller, Herbert Xu, netdev
  Cc: Takahiro Yasui, Masami Hiramatsu, Satoshi Oshima, billfink,
	Andi Kleen, Evgeniy Polyakov, Stephen Hemminger, yoshfuji,
	Yumiko Sugita, haoki

This patch adds sysctl parameters for customizing UDP memory accounting:
     /proc/sys/net/ipv4/udp_mem
     /proc/sys/net/ipv4/udp_rmem_min
     /proc/sys/net/ipv4/udp_wmem_min

Udp_mem indicates number of pages which can be used for all UDP sockets.
Each UDP packet is dropped, when the number of pages for socket buffer is
beyond udp_mem and the socket already consumes minimum buffer.

This patch is also introduced memory_allocated variable for UDP protocol.

Cc: Satoshi Oshima <satoshi.oshima.fk@hitachi.com>
signed-off-by: Hideo Aoki <haoki@redhat.com>
---

 Documentation/networking/ip-sysctl.txt |   18 ++++++++++++++++++
 include/net/udp.h                      |    9 +++++++++
 net/ipv4/af_inet.c                     |    3 +++
 net/ipv4/proc.c                        |    3 ++-
 net/ipv4/sysctl_net_ipv4.c             |   31 +++++++++++++++++++++++++++++++
 net/ipv4/udp.c                         |   27 +++++++++++++++++++++++++++
 6 files changed, 90 insertions(+), 1 deletion(-)

diff -pruN net-2.6-udp-take10a4-p2/Documentation/networking/ip-sysctl.txt net-2.6-udp-take10a4-p3/Documentation/networking/ip-sysctl.txt
--- net-2.6-udp-take10a4-p2/Documentation/networking/ip-sysctl.txt	2007-12-11 10:54:41.000000000 -0500
+++ net-2.6-udp-take10a4-p3/Documentation/networking/ip-sysctl.txt	2007-12-14 20:27:54.000000000 -0500
@@ -446,6 +446,24 @@ tcp_dma_copybreak - INTEGER
 	and CONFIG_NET_DMA is enabled.
 	Default: 4096

+UDP variables:
+
+udp_mem - INTEGER
+	Number of pages allowed for queueing by all UDP sockets.
+	Default is calculated at boot time from amount of available memory.
+
+udp_rmem_min - INTEGER
+	Minimal size of receive buffer used by UDP sockets. Each UDP socket
+	is able to use the size for receiving data, even if total pages of UDP
+	sockets exceed udp_mem. The unit is byte.
+	Default: 4096
+
+udp_wmem_min - INTEGER
+	Minimal size of send buffer used by UDP sockets. Each UDP socket is
+	able to use the size for sending data, even if total pages of UDP
+	sockets exceed udp_mem. The unit is byte.
+	Default: 4096
+
 CIPSOv4 Variables:

 cipso_cache_enable - BOOLEAN
diff -pruN net-2.6-udp-take10a4-p2/include/net/udp.h net-2.6-udp-take10a4-p3/include/net/udp.h
--- net-2.6-udp-take10a4-p2/include/net/udp.h	2007-12-11 10:54:53.000000000 -0500
+++ net-2.6-udp-take10a4-p3/include/net/udp.h	2007-12-14 20:27:54.000000000 -0500
@@ -65,6 +65,13 @@ extern rwlock_t udp_hash_lock;

 extern struct proto udp_prot;

+extern atomic_t udp_memory_allocated;
+
+/* sysctl variables for udp */
+extern int sysctl_udp_mem;
+extern int sysctl_udp_rmem_min;
+extern int sysctl_udp_wmem_min;
+
 struct sk_buff;

 /*
@@ -173,4 +180,6 @@ extern void udp_proc_unregister(struct u
 extern int  udp4_proc_init(void);
 extern void udp4_proc_exit(void);
 #endif
+
+extern void udp_init(void);
 #endif	/* _UDP_H */
diff -pruN net-2.6-udp-take10a4-p2/net/ipv4/af_inet.c net-2.6-udp-take10a4-p3/net/ipv4/af_inet.c
--- net-2.6-udp-take10a4-p2/net/ipv4/af_inet.c	2007-12-11 10:54:55.000000000 -0500
+++ net-2.6-udp-take10a4-p3/net/ipv4/af_inet.c	2007-12-14 20:27:54.000000000 -0500
@@ -1418,6 +1418,9 @@ static int __init inet_init(void)
 	/* Setup TCP slab cache for open requests. */
 	tcp_init();

+	/* Setup UDP memory threshold */
+	udp_init();
+
 	/* Add UDP-Lite (RFC 3828) */
 	udplite4_register();

diff -pruN net-2.6-udp-take10a4-p2/net/ipv4/proc.c net-2.6-udp-take10a4-p3/net/ipv4/proc.c
--- net-2.6-udp-take10a4-p2/net/ipv4/proc.c	2007-12-11 10:54:55.000000000 -0500
+++ net-2.6-udp-take10a4-p3/net/ipv4/proc.c	2007-12-14 20:27:54.000000000 -0500
@@ -56,7 +56,8 @@ static int sockstat_seq_show(struct seq_
 		   sock_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count),
 		   tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated),
 		   atomic_read(&tcp_memory_allocated));
-	seq_printf(seq, "UDP: inuse %d\n", sock_prot_inuse(&udp_prot));
+	seq_printf(seq, "UDP: inuse %d mem %d\n", sock_prot_inuse(&udp_prot),
+		   atomic_read(&udp_memory_allocated));
 	seq_printf(seq, "UDPLITE: inuse %d\n", sock_prot_inuse(&udplite_prot));
 	seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse(&raw_prot));
 	seq_printf(seq,  "FRAG: inuse %d memory %d\n",
diff -pruN net-2.6-udp-take10a4-p2/net/ipv4/sysctl_net_ipv4.c net-2.6-udp-take10a4-p3/net/ipv4/sysctl_net_ipv4.c
--- net-2.6-udp-take10a4-p2/net/ipv4/sysctl_net_ipv4.c	2007-12-11 10:54:55.000000000 -0500
+++ net-2.6-udp-take10a4-p3/net/ipv4/sysctl_net_ipv4.c	2007-12-14 20:27:54.000000000 -0500
@@ -18,6 +18,7 @@
 #include <net/ip.h>
 #include <net/route.h>
 #include <net/tcp.h>
+#include <net/udp.h>
 #include <net/cipso_ipv4.h>
 #include <net/inet_frag.h>

@@ -885,6 +886,36 @@ ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "udp_mem",
+		.data		= &sysctl_udp_mem,
+		.maxlen		= sizeof(sysctl_udp_mem),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "udp_rmem_min",
+		.data		= &sysctl_udp_rmem_min,
+		.maxlen		= sizeof(sysctl_udp_rmem_min),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "udp_wmem_min",
+		.data		= &sysctl_udp_wmem_min,
+		.maxlen		= sizeof(sysctl_udp_wmem_min),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero
+	},
 	{ .ctl_name = 0 }
 };

diff -pruN net-2.6-udp-take10a4-p2/net/ipv4/udp.c net-2.6-udp-take10a4-p3/net/ipv4/udp.c
--- net-2.6-udp-take10a4-p2/net/ipv4/udp.c	2007-12-11 10:54:55.000000000 -0500
+++ net-2.6-udp-take10a4-p3/net/ipv4/udp.c	2007-12-14 20:27:54.000000000 -0500
@@ -82,6 +82,7 @@
 #include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
+#include <linux/bootmem.h>
 #include <linux/types.h>
 #include <linux/fcntl.h>
 #include <linux/module.h>
@@ -114,6 +115,11 @@ DEFINE_SNMP_STAT(struct udp_mib, udp_sta
 struct hlist_head udp_hash[UDP_HTABLE_SIZE];
 DEFINE_RWLOCK(udp_hash_lock);

+atomic_t udp_memory_allocated;
+int sysctl_udp_mem __read_mostly;
+int sysctl_udp_rmem_min __read_mostly;
+int sysctl_udp_wmem_min __read_mostly;
+
 static inline int __udp_lib_lport_inuse(__u16 num,
 					const struct hlist_head udptable[])
 {
@@ -1449,6 +1455,10 @@ struct proto udp_prot = {
 	.hash		   = udp_lib_hash,
 	.unhash		   = udp_lib_unhash,
 	.get_port	   = udp_v4_get_port,
+	.memory_allocated  = &udp_memory_allocated,
+	.sysctl_mem	   = &sysctl_udp_mem,
+	.sysctl_wmem	   = &sysctl_udp_wmem_min,
+	.sysctl_rmem	   = &sysctl_udp_rmem_min,
 	.obj_size	   = sizeof(struct udp_sock),
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_udp_setsockopt,
@@ -1644,6 +1654,23 @@ void udp4_proc_exit(void)
 }
 #endif /* CONFIG_PROC_FS */

+void __init udp_init(void)
+{
+	unsigned long limit;
+
+	/* Set the pressure threshold up by the same strategy of TCP. It is a
+	 * fraction of global memory that is up to 1/2 at 256 MB, decreasing
+	 * toward zero with the amount of memory, with a floor of 128 pages.
+	 */
+	limit = min(nr_all_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
+	limit = (limit * (nr_all_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
+	limit = max(limit, 128UL);
+	sysctl_udp_mem = limit / 4 * 3;
+
+	sysctl_udp_rmem_min = SK_DATAGRAM_MEM_QUANTUM;
+	sysctl_udp_wmem_min = SK_DATAGRAM_MEM_QUANTUM;
+}
+
 EXPORT_SYMBOL(udp_disconnect);
 EXPORT_SYMBOL(udp_hash);
 EXPORT_SYMBOL(udp_hash_lock);
-- 
Hitachi Computer Products (America) Inc.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 0/4] [UDP]: memory accounting and limitation (take 11)
@ 2007-12-18  2:33 Hideo AOKI
  2007-12-18  2:38 ` [PATCH 1/4] [UDP]: fix send buffer check Hideo AOKI
                   ` (3 more replies)
  0 siblings, 4 replies; 16+ messages in thread
From: Hideo AOKI @ 2007-12-18  2:33 UTC (permalink / raw)
  To: David Miller, Herbert Xu, netdev
  Cc: haoki, Takahiro Yasui, Masami Hiramatsu, Satoshi Oshima,
	Bill Fink, Andi Kleen, Evgeniy Polyakov, Stephen Hemminger,
	yoshfuji, Yumiko Sugita

Hello,

I updated patch set of  UDP memory accounting and limitation.

The spin lock that I used in previous take was removed from datagram
memory accounting functions. As David commented, I used socket lock
and backlog processing to keep consistency of memory accounting like
TCP. I added socket lock to places where skbuff is freed, since the
locking is needed to change sk_forward_alloc only.

Moreover, I revised memory accounting functions. As Herbert commented,
I stopped using large inline function and tried to reduce amount of
inline functions.

The patch set was tested on net-2.6 tree.


Changelog take 10 -> take 11:
 * stopped using spin lock in memory accounting function
 * socket lock and backlog processing were used to avoid conflict
   between receive system call processing and BH
 * revised memory accounting functions
 * stooped changing sock_queue_rcv_skb() and skb_set_owner_r()
 * added __udp_queue_rcv_skb to set proper destructor
 * removed udp_set_owner_r()
 * removed reclaim in inet_sock_destruct()

Changelog take 9 -> take 10:
 * supported using sk_forward_alloc
 * introduced several memory accounting functions with spin lock
 * changed detagram receive functions to be able to customize
   destructor
 * fixed accounting bugs in previous takes

Changelog take 8 -> take 9:
 * introduced mem_schdeule functions for datargram protocols
 * removed protocol check function, from patch set
 * restructured patch set

Changelog take 7 -> take 8:

 * sk_datagram_pages(): avoided using divide instruction
 * udp_recvmsg(): fixed referring released truesize in accounting

Best regards,
Hideo Aoki

--
Hitachi Computer Products (America) Inc.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 1/4] [UDP]: fix send buffer check
  2007-12-18  2:33 [PATCH 0/4] [UDP]: memory accounting and limitation (take 11) Hideo AOKI
@ 2007-12-18  2:38 ` Hideo AOKI
  2007-12-20 11:31   ` David Miller
  2007-12-18  2:38 ` [PATCH 2/4] [CORE]: datagram: basic memory accounting functions Hideo AOKI
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 16+ messages in thread
From: Hideo AOKI @ 2007-12-18  2:38 UTC (permalink / raw)
  To: David Miller, Herbert Xu, netdev
  Cc: Takahiro Yasui, Masami Hiramatsu, Satoshi Oshima, Bill Fink,
	Andi Kleen, Evgeniy Polyakov, Stephen Hemminger, yoshfuji,
	Yumiko Sugita, haoki

This patch introduces sndbuf size check before memory allocation for
send buffer.

signed-off-by: Satoshi Oshima <satoshi.oshima.fk@hitachi.com>
signed-off-by: Hideo Aoki <haoki@redhat.com>
---

 ip_output.c |    5 +++++
 1 file changed, 5 insertions(+)

diff -pruN net-2.6/net/ipv4/ip_output.c net-2.6-udp-take11a1-p1/net/ipv4/ip_output.c
--- net-2.6/net/ipv4/ip_output.c	2007-12-11 10:54:55.000000000 -0500
+++ net-2.6-udp-take11a1-p1/net/ipv4/ip_output.c	2007-12-17 14:42:31.000000000 -0500
@@ -1004,6 +1004,11 @@ alloc_new_skb:
 					frag = &skb_shinfo(skb)->frags[i];
 				}
 			} else if (i < MAX_SKB_FRAGS) {
+				if (atomic_read(&sk->sk_wmem_alloc) + PAGE_SIZE
+				    > 2 * sk->sk_sndbuf) {
+					err = -ENOBUFS;
+					goto error;
+				}
 				if (copy > PAGE_SIZE)
 					copy = PAGE_SIZE;
 				page = alloc_pages(sk->sk_allocation, 0);
-- 
Hitachi Computer Products (America) Inc.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 2/4] [CORE]: datagram: basic memory accounting functions
  2007-12-18  2:33 [PATCH 0/4] [UDP]: memory accounting and limitation (take 11) Hideo AOKI
  2007-12-18  2:38 ` [PATCH 1/4] [UDP]: fix send buffer check Hideo AOKI
@ 2007-12-18  2:38 ` Hideo AOKI
  2007-12-19  3:21   ` Hideo AOKI
  2007-12-20 11:43   ` David Miller
  2007-12-18  2:38 ` [PATCH 3/4] [UDP]: add udp_mem, udp_rmem_min and udp_wmem_min Hideo AOKI
  2007-12-18  2:38 ` [PATCH 4/4] [UDP]: memory accounting in IPv4 Hideo AOKI
  3 siblings, 2 replies; 16+ messages in thread
From: Hideo AOKI @ 2007-12-18  2:38 UTC (permalink / raw)
  To: David Miller, Herbert Xu, netdev
  Cc: Takahiro Yasui, Masami Hiramatsu, Satoshi Oshima, Bill Fink,
	Andi Kleen, Evgeniy Polyakov, Stephen Hemminger, yoshfuji,
	Yumiko Sugita, haoki

This patch includes changes in network core sub system for memory
accounting.

Memory scheduling, charging, uncharging and reclaiming functions are
added. These functions use sk_forward_alloc to store socket local
accounting. They currently support only datagram protocols.

sk_datagram_rfree() is a receive buffer detractor for datagram
protocols which are capable of protocol specific memory accounting.

Cc: Satoshi Oshima <satoshi.oshima.fk@hitachi.com>
signed-off-by: Takahiro Yasui <tyasui@redhat.com>
signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
signed-off-by: Hideo Aoki <haoki@redhat.com>
---

 include/net/sock.h  |   81 +++++++++++++++++++++++++++++++++++++++++++++++++
 net/core/datagram.c |   85 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 166 insertions(+)

diff -pruN net-2.6-udp-take11a1-p1/include/net/sock.h net-2.6-udp-take11a1-p2/include/net/sock.h
--- net-2.6-udp-take11a1-p1/include/net/sock.h	2007-12-11 10:54:53.000000000 -0500
+++ net-2.6-udp-take11a1-p2/include/net/sock.h	2007-12-17 14:42:39.000000000 -0500
@@ -750,6 +750,9 @@ static inline struct inode *SOCK_INODE(s
 	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
 }

+/*
+ * Functions for memory accounting
+ */
 extern void __sk_stream_mem_reclaim(struct sock *sk);
 extern int sk_stream_mem_schedule(struct sock *sk, int size, int kind);

@@ -778,6 +781,82 @@ static inline int sk_stream_wmem_schedul
 	       sk_stream_mem_schedule(sk, size, 0);
 }

+extern void __sk_datagram_mem_reclaim(struct sock *sk);
+extern int sk_stream_mem_schedule(struct sock *sk, int size, int kind);
+
+#define SK_DATAGRAM_MEM_QUANTUM ((unsigned int)PAGE_SIZE)
+
+static inline int sk_datagram_pages(int amt)
+{
+	/* Cast to unsigned as an optimization, since amt is always positive. */
+	return DIV_ROUND_UP((unsigned int)amt, SK_DATAGRAM_MEM_QUANTUM);
+}
+
+extern int __sk_datagram_account_charge(struct sock *sk, int size, int kind);
+extern void __sk_datagram_mem_reclaim(struct sock *sk);
+extern int sk_datagram_mem_schedule(struct sock *sk, int size, int kind);
+
+static inline void sk_datagram_mem_reclaim(struct sock *sk)
+{
+	if (!sk->sk_prot->memory_allocated)
+		return;
+
+	__sk_datagram_mem_reclaim(sk);
+}
+
+static inline int sk_datagram_rmem_schedule(struct sock *sk, int size)
+{
+	return size <= sk->sk_forward_alloc ||
+		sk_datagram_mem_schedule(sk, size, 1);
+}
+
+static inline int sk_datagram_wmem_schedule(struct sock *sk, int size)
+{
+	return size <= sk->sk_forward_alloc ||
+		sk_datagram_mem_schedule(sk, size, 0);
+}
+
+static inline void sk_mem_reclaim(struct sock *sk)
+{
+	if (sk->sk_type == SOCK_DGRAM)
+		sk_datagram_mem_reclaim(sk);
+}
+
+static inline int sk_wmem_schedule(struct sock *sk, int size)
+{
+	if (sk->sk_type == SOCK_DGRAM)
+		return sk_datagram_wmem_schedule(sk, size);
+	else
+		return 1;
+}
+
+static inline int sk_account_wmem_charge(struct sock *sk, int size)
+{
+	/* account if protocol supports memory accounting. */
+	if (!sk->sk_prot->memory_allocated || sk->sk_type != SOCK_DGRAM)
+		return 1;
+
+	return __sk_datagram_account_charge(sk, size, 0);
+}
+
+static inline int sk_account_rmem_charge(struct sock *sk, int size)
+{
+	/* account if protocol supports memory accounting. */
+	if (!sk->sk_prot->memory_allocated || sk->sk_type != SOCK_DGRAM)
+		return 1;
+
+	return __sk_datagram_account_charge(sk, size, 1);
+}
+
+static inline void sk_account_uncharge(struct sock *sk, int size)
+{
+	/* account if protocol supports memory accounting. */
+	if (!sk->sk_prot->memory_allocated || sk->sk_type != SOCK_DGRAM)
+		return;
+
+	sk->sk_forward_alloc += size;
+}
+
 /* Used by processes to "lock" a socket state, so that
  * interrupts and bottom half handlers won't change it
  * from under us. It essentially blocks any incoming
@@ -1166,6 +1245,8 @@ static inline void skb_set_owner_r(struc
 	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
 }

+extern void sk_datagram_rfree(struct sk_buff *skb);
+
 extern void sk_reset_timer(struct sock *sk, struct timer_list* timer,
 			   unsigned long expires);

diff -pruN net-2.6-udp-take11a1-p1/net/core/datagram.c net-2.6-udp-take11a1-p2/net/core/datagram.c
--- net-2.6-udp-take11a1-p1/net/core/datagram.c	2007-12-11 10:54:55.000000000 -0500
+++ net-2.6-udp-take11a1-p2/net/core/datagram.c	2007-12-17 14:42:39.000000000 -0500
@@ -484,6 +484,91 @@ fault:
 }

 /**
+ *	sk_datagram_rfree - receive buffer detractor for datagram protocls
+ *	@skb: skbuff
+ */
+void sk_datagram_rfree(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+
+	skb_truesize_check(skb);
+	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+	sk_account_uncharge(sk, skb->truesize);
+	sk_datagram_mem_reclaim(sk);
+}
+EXPORT_SYMBOL(sk_datagram_rfree);
+
+
+/**
+ * 	__sk_datagram_account_charge - send buffer for datagram protocls
+ *	@sk: socket
+ *	@size: memory size to charge
+ *	@kind: charge type
+ *
+ *	If kind is 0, it means wmem allocation. Otherwise it means rmem
+ *	allocation.
+ */
+int __sk_datagram_account_charge(struct sock *sk, int size, int kind)
+{
+	if ((kind && sk_datagram_rmem_schedule(sk, size)) ||
+	    (!kind && sk_datagram_wmem_schedule(sk, size))) {
+		sk->sk_forward_alloc -= size;
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(__sk_datagram_account_charge);
+
+/**
+ * 	__sk_datagram_mem_reclaim - send buffer for datagram protocls
+ *	@sk: socket
+ */
+void __sk_datagram_mem_reclaim(struct sock *sk)
+{
+	if (sk->sk_forward_alloc < SK_DATAGRAM_MEM_QUANTUM)
+		return;
+
+	atomic_sub(sk->sk_forward_alloc / SK_DATAGRAM_MEM_QUANTUM,
+		   sk->sk_prot->memory_allocated);
+	sk->sk_forward_alloc &= SK_DATAGRAM_MEM_QUANTUM - 1;
+}
+EXPORT_SYMBOL(__sk_datagram_mem_reclaim);
+
+/**
+ * 	sk_datagram_mem_schedule - memory accounting for datagram protocls
+ *	@sk: socket
+ *	@size: memory size to allocate
+ *	@kind: allocation type
+ *
+ *	If kind is 0, it means wmem allocation. Otherwise it means rmem
+ *	allocation.
+ */
+int sk_datagram_mem_schedule(struct sock *sk, int size, int kind)
+{
+	int amt;
+	struct proto *prot = sk->sk_prot;
+
+	/* Don't account and limit memory if protocol doesn't support. */
+	if (!prot->memory_allocated)
+		return 1;
+
+	amt = sk_datagram_pages(size);
+	if (atomic_add_return(amt, prot->memory_allocated) >
+	    prot->sysctl_mem[0])
+		if ((kind && atomic_read(&sk->sk_rmem_alloc) + size >=
+		     prot->sysctl_rmem[0]) ||
+		    (!kind && atomic_read(&sk->sk_wmem_alloc) + size >=
+		     prot->sysctl_wmem[0])) {
+			/* Undo changes. */
+			atomic_sub(amt, prot->memory_allocated);
+			return 0;
+		}
+	sk->sk_forward_alloc += amt * SK_DATAGRAM_MEM_QUANTUM;
+	return 1;
+}
+EXPORT_SYMBOL(sk_datagram_mem_schedule);
+
+/**
  * 	datagram_poll - generic datagram poll
  *	@file: file struct
  *	@sock: socket
-- 
Hitachi Computer Products (America) Inc.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 3/4] [UDP]: add udp_mem, udp_rmem_min and udp_wmem_min
  2007-12-18  2:33 [PATCH 0/4] [UDP]: memory accounting and limitation (take 11) Hideo AOKI
  2007-12-18  2:38 ` [PATCH 1/4] [UDP]: fix send buffer check Hideo AOKI
  2007-12-18  2:38 ` [PATCH 2/4] [CORE]: datagram: basic memory accounting functions Hideo AOKI
@ 2007-12-18  2:38 ` Hideo AOKI
  2007-12-18  2:38 ` [PATCH 4/4] [UDP]: memory accounting in IPv4 Hideo AOKI
  3 siblings, 0 replies; 16+ messages in thread
From: Hideo AOKI @ 2007-12-18  2:38 UTC (permalink / raw)
  To: David Miller, Herbert Xu, netdev
  Cc: Takahiro Yasui, Masami Hiramatsu, Satoshi Oshima, Bill Fink,
	Andi Kleen, Evgeniy Polyakov, Stephen Hemminger, yoshfuji,
	Yumiko Sugita, haoki

This patch adds sysctl parameters for customizing UDP memory accounting:
     /proc/sys/net/ipv4/udp_mem
     /proc/sys/net/ipv4/udp_rmem_min
     /proc/sys/net/ipv4/udp_wmem_min

Udp_mem indicates number of pages which can be used for all UDP sockets.
Each UDP packet is dropped, when the number of pages for socket buffer is
beyond udp_mem and the socket already consumes minimum buffer.

This patch is also introduced memory_allocated variable for UDP protocol.

Cc: Satoshi Oshima <satoshi.oshima.fk@hitachi.com>
signed-off-by: Hideo Aoki <haoki@redhat.com>
---

 Documentation/networking/ip-sysctl.txt |   18 ++++++++++++++++++
 include/net/udp.h                      |    9 +++++++++
 net/ipv4/af_inet.c                     |    3 +++
 net/ipv4/proc.c                        |    3 ++-
 net/ipv4/sysctl_net_ipv4.c             |   31 +++++++++++++++++++++++++++++++
 net/ipv4/udp.c                         |   27 +++++++++++++++++++++++++++
 6 files changed, 90 insertions(+), 1 deletion(-)

diff -pruN net-2.6-udp-take11a1-p2/Documentation/networking/ip-sysctl.txt net-2.6-udp-take11a1-p3/Documentation/networking/ip-sysctl.txt
--- net-2.6-udp-take11a1-p2/Documentation/networking/ip-sysctl.txt	2007-12-11 10:54:41.000000000 -0500
+++ net-2.6-udp-take11a1-p3/Documentation/networking/ip-sysctl.txt	2007-12-17 14:42:40.000000000 -0500
@@ -446,6 +446,24 @@ tcp_dma_copybreak - INTEGER
 	and CONFIG_NET_DMA is enabled.
 	Default: 4096

+UDP variables:
+
+udp_mem - INTEGER
+	Number of pages allowed for queueing by all UDP sockets.
+	Default is calculated at boot time from amount of available memory.
+
+udp_rmem_min - INTEGER
+	Minimal size of receive buffer used by UDP sockets. Each UDP socket
+	is able to use the size for receiving data, even if total pages of UDP
+	sockets exceed udp_mem. The unit is byte.
+	Default: 4096
+
+udp_wmem_min - INTEGER
+	Minimal size of send buffer used by UDP sockets. Each UDP socket is
+	able to use the size for sending data, even if total pages of UDP
+	sockets exceed udp_mem. The unit is byte.
+	Default: 4096
+
 CIPSOv4 Variables:

 cipso_cache_enable - BOOLEAN
diff -pruN net-2.6-udp-take11a1-p2/include/net/udp.h net-2.6-udp-take11a1-p3/include/net/udp.h
--- net-2.6-udp-take11a1-p2/include/net/udp.h	2007-12-11 10:54:53.000000000 -0500
+++ net-2.6-udp-take11a1-p3/include/net/udp.h	2007-12-17 14:42:40.000000000 -0500
@@ -65,6 +65,13 @@ extern rwlock_t udp_hash_lock;

 extern struct proto udp_prot;

+extern atomic_t udp_memory_allocated;
+
+/* sysctl variables for udp */
+extern int sysctl_udp_mem;
+extern int sysctl_udp_rmem_min;
+extern int sysctl_udp_wmem_min;
+
 struct sk_buff;

 /*
@@ -173,4 +180,6 @@ extern void udp_proc_unregister(struct u
 extern int  udp4_proc_init(void);
 extern void udp4_proc_exit(void);
 #endif
+
+extern void udp_init(void);
 #endif	/* _UDP_H */
diff -pruN net-2.6-udp-take11a1-p2/net/ipv4/af_inet.c net-2.6-udp-take11a1-p3/net/ipv4/af_inet.c
--- net-2.6-udp-take11a1-p2/net/ipv4/af_inet.c	2007-12-11 10:54:55.000000000 -0500
+++ net-2.6-udp-take11a1-p3/net/ipv4/af_inet.c	2007-12-17 14:42:40.000000000 -0500
@@ -1418,6 +1418,9 @@ static int __init inet_init(void)
 	/* Setup TCP slab cache for open requests. */
 	tcp_init();

+	/* Setup UDP memory threshold */
+	udp_init();
+
 	/* Add UDP-Lite (RFC 3828) */
 	udplite4_register();

diff -pruN net-2.6-udp-take11a1-p2/net/ipv4/proc.c net-2.6-udp-take11a1-p3/net/ipv4/proc.c
--- net-2.6-udp-take11a1-p2/net/ipv4/proc.c	2007-12-11 10:54:55.000000000 -0500
+++ net-2.6-udp-take11a1-p3/net/ipv4/proc.c	2007-12-17 14:42:40.000000000 -0500
@@ -56,7 +56,8 @@ static int sockstat_seq_show(struct seq_
 		   sock_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count),
 		   tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated),
 		   atomic_read(&tcp_memory_allocated));
-	seq_printf(seq, "UDP: inuse %d\n", sock_prot_inuse(&udp_prot));
+	seq_printf(seq, "UDP: inuse %d mem %d\n", sock_prot_inuse(&udp_prot),
+		   atomic_read(&udp_memory_allocated));
 	seq_printf(seq, "UDPLITE: inuse %d\n", sock_prot_inuse(&udplite_prot));
 	seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse(&raw_prot));
 	seq_printf(seq,  "FRAG: inuse %d memory %d\n",
diff -pruN net-2.6-udp-take11a1-p2/net/ipv4/sysctl_net_ipv4.c net-2.6-udp-take11a1-p3/net/ipv4/sysctl_net_ipv4.c
--- net-2.6-udp-take11a1-p2/net/ipv4/sysctl_net_ipv4.c	2007-12-11 10:54:55.000000000 -0500
+++ net-2.6-udp-take11a1-p3/net/ipv4/sysctl_net_ipv4.c	2007-12-17 14:42:40.000000000 -0500
@@ -18,6 +18,7 @@
 #include <net/ip.h>
 #include <net/route.h>
 #include <net/tcp.h>
+#include <net/udp.h>
 #include <net/cipso_ipv4.h>
 #include <net/inet_frag.h>

@@ -885,6 +886,36 @@ ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "udp_mem",
+		.data		= &sysctl_udp_mem,
+		.maxlen		= sizeof(sysctl_udp_mem),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "udp_rmem_min",
+		.data		= &sysctl_udp_rmem_min,
+		.maxlen		= sizeof(sysctl_udp_rmem_min),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "udp_wmem_min",
+		.data		= &sysctl_udp_wmem_min,
+		.maxlen		= sizeof(sysctl_udp_wmem_min),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero
+	},
 	{ .ctl_name = 0 }
 };

diff -pruN net-2.6-udp-take11a1-p2/net/ipv4/udp.c net-2.6-udp-take11a1-p3/net/ipv4/udp.c
--- net-2.6-udp-take11a1-p2/net/ipv4/udp.c	2007-12-11 10:54:55.000000000 -0500
+++ net-2.6-udp-take11a1-p3/net/ipv4/udp.c	2007-12-17 14:42:40.000000000 -0500
@@ -82,6 +82,7 @@
 #include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
+#include <linux/bootmem.h>
 #include <linux/types.h>
 #include <linux/fcntl.h>
 #include <linux/module.h>
@@ -114,6 +115,11 @@ DEFINE_SNMP_STAT(struct udp_mib, udp_sta
 struct hlist_head udp_hash[UDP_HTABLE_SIZE];
 DEFINE_RWLOCK(udp_hash_lock);

+atomic_t udp_memory_allocated;
+int sysctl_udp_mem __read_mostly;
+int sysctl_udp_rmem_min __read_mostly;
+int sysctl_udp_wmem_min __read_mostly;
+
 static inline int __udp_lib_lport_inuse(__u16 num,
 					const struct hlist_head udptable[])
 {
@@ -1449,6 +1455,10 @@ struct proto udp_prot = {
 	.hash		   = udp_lib_hash,
 	.unhash		   = udp_lib_unhash,
 	.get_port	   = udp_v4_get_port,
+	.memory_allocated  = &udp_memory_allocated,
+	.sysctl_mem	   = &sysctl_udp_mem,
+	.sysctl_wmem	   = &sysctl_udp_wmem_min,
+	.sysctl_rmem	   = &sysctl_udp_rmem_min,
 	.obj_size	   = sizeof(struct udp_sock),
 #ifdef CONFIG_COMPAT
 	.compat_setsockopt = compat_udp_setsockopt,
@@ -1644,6 +1654,23 @@ void udp4_proc_exit(void)
 }
 #endif /* CONFIG_PROC_FS */

+void __init udp_init(void)
+{
+	unsigned long limit;
+
+	/* Set the pressure threshold up by the same strategy of TCP. It is a
+	 * fraction of global memory that is up to 1/2 at 256 MB, decreasing
+	 * toward zero with the amount of memory, with a floor of 128 pages.
+	 */
+	limit = min(nr_all_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
+	limit = (limit * (nr_all_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
+	limit = max(limit, 128UL);
+	sysctl_udp_mem = limit / 4 * 3;
+
+	sysctl_udp_rmem_min = SK_DATAGRAM_MEM_QUANTUM;
+	sysctl_udp_wmem_min = SK_DATAGRAM_MEM_QUANTUM;
+}
+
 EXPORT_SYMBOL(udp_disconnect);
 EXPORT_SYMBOL(udp_hash);
 EXPORT_SYMBOL(udp_hash_lock);
-- 
Hitachi Computer Products (America) Inc.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 4/4] [UDP]: memory accounting in IPv4
  2007-12-18  2:33 [PATCH 0/4] [UDP]: memory accounting and limitation (take 11) Hideo AOKI
                   ` (2 preceding siblings ...)
  2007-12-18  2:38 ` [PATCH 3/4] [UDP]: add udp_mem, udp_rmem_min and udp_wmem_min Hideo AOKI
@ 2007-12-18  2:38 ` Hideo AOKI
  2007-12-20 11:44   ` David Miller
  3 siblings, 1 reply; 16+ messages in thread
From: Hideo AOKI @ 2007-12-18  2:38 UTC (permalink / raw)
  To: David Miller, Herbert Xu, netdev
  Cc: Takahiro Yasui, Masami Hiramatsu, Satoshi Oshima, Bill Fink,
	Andi Kleen, Evgeniy Polyakov, Stephen Hemminger, yoshfuji,
	Yumiko Sugita, haoki

This patch adds UDP memory usage accounting in IPv4.

Send buffer accounting is performed by IP layer, because skbuff is
allocated in the layer.

Receive buffer is charged, when the buffer successfully received.
Destructor of the buffer does uncharging and reclaiming, when the
buffer is freed. To set destructor at proper place, we use
__udp_queue_rcv_skb() instead of sock_queue_rcv_skb(). To maintain
consistency of memory accounting, socket lock is used to free receive
buffer in udp_recvmsg().

New packet will be add to backlog when the socket is used by user.

Cc: Satoshi Oshima <satoshi.oshima.fk@hitachi.com>
signed-off-by: Takahiro Yasui <tyasui@redhat.com>
signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
signed-off-by: Hideo Aoki <haoki@redhat.com>
---

 ip_output.c |   46 +++++++++++++++++++++++++++++++++--
 udp.c       |   78 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 119 insertions(+), 5 deletions(-)

diff -pruN net-2.6-udp-take11a1-p3/net/ipv4/ip_output.c net-2.6-udp-take11a1-p4/net/ipv4/ip_output.c
--- net-2.6-udp-take11a1-p3/net/ipv4/ip_output.c	2007-12-17 14:42:31.000000000 -0500
+++ net-2.6-udp-take11a1-p4/net/ipv4/ip_output.c	2007-12-17 14:42:49.000000000 -0500
@@ -707,6 +707,7 @@ static inline int ip_ufo_append_data(str
 {
 	struct sk_buff *skb;
 	int err;
+	int first_size, second_size;

 	/* There is support for UDP fragmentation offload by network
 	 * device, so create one single skb packet containing complete
@@ -720,6 +721,11 @@ static inline int ip_ufo_append_data(str
 		if (skb == NULL)
 			return err;

+		if (!sk_account_wmem_charge(sk, skb->truesize)) {
+			err = -ENOBUFS;
+			goto fail;
+		}
+
 		/* reserve space for Hardware header */
 		skb_reserve(skb, hh_len);

@@ -736,6 +742,7 @@ static inline int ip_ufo_append_data(str
 		skb->csum = 0;
 		sk->sk_sndmsg_off = 0;
 	}
+	first_size = skb->truesize;

 	err = skb_append_datato_frags(sk,skb, getfrag, from,
 			       (length - transhdrlen));
@@ -743,6 +750,15 @@ static inline int ip_ufo_append_data(str
 		/* specify the length of each IP datagram fragment*/
 		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
+
+		second_size = skb->truesize - first_size;
+		if (!sk_account_wmem_charge(sk, second_size)) {
+			sk_account_uncharge(sk, first_size);
+			sk_mem_reclaim(sk);
+			err = -ENOBUFS;
+			goto fail;
+		}
+
 		__skb_queue_tail(&sk->sk_write_queue, skb);

 		return 0;
@@ -750,6 +766,7 @@ static inline int ip_ufo_append_data(str
 	/* There is not enough support do UFO ,
 	 * so follow normal path
 	 */
+fail:
 	kfree_skb(skb);
 	return err;
 }
@@ -924,6 +941,11 @@ alloc_new_skb:
 			}
 			if (skb == NULL)
 				goto error;
+			if (!sk_account_wmem_charge(sk, skb->truesize)) {
+				err = -ENOBUFS;
+				kfree_skb(skb);
+				goto error;
+			}

 			/*
 			 *	Fill in the control structures
@@ -954,6 +976,8 @@ alloc_new_skb:
 			copy = datalen - transhdrlen - fraggap;
 			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
 				err = -EFAULT;
+				sk_account_uncharge(sk, skb->truesize);
+				sk_mem_reclaim(sk);
 				kfree_skb(skb);
 				goto error;
 			}
@@ -1023,6 +1047,10 @@ alloc_new_skb:
 				frag = &skb_shinfo(skb)->frags[i];
 				skb->truesize += PAGE_SIZE;
 				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
+				if (!sk_account_wmem_charge(sk, PAGE_SIZE)) {
+					err = -ENOBUFS;
+					goto error;
+				}
 			} else {
 				err = -EMSGSIZE;
 				goto error;
@@ -1124,6 +1152,11 @@ ssize_t	ip_append_page(struct sock *sk,
 				err = -ENOBUFS;
 				goto error;
 			}
+			if (!sk_account_wmem_charge(sk, skb->truesize)) {
+				kfree_skb(skb);
+				err = -ENOBUFS;
+				goto error;
+			}

 			/*
 			 *	Fill in the control structures
@@ -1213,13 +1246,14 @@ int ip_push_pending_frames(struct sock *
 	struct iphdr *iph;
 	__be16 df = 0;
 	__u8 ttl;
-	int err = 0;
+	int err = 0, send_size;

 	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
 		goto out;
 	tail_skb = &(skb_shinfo(skb)->frag_list);

 	/* move skb->data to ip header from ext header */
+	send_size = skb->truesize;
 	if (skb->data < skb_network_header(skb))
 		__skb_pull(skb, skb_network_offset(skb));
 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
@@ -1229,6 +1263,7 @@ int ip_push_pending_frames(struct sock *
 		skb->len += tmp_skb->len;
 		skb->data_len += tmp_skb->len;
 		skb->truesize += tmp_skb->truesize;
+		send_size += tmp_skb->truesize;
 		__sock_put(tmp_skb->sk);
 		tmp_skb->destructor = NULL;
 		tmp_skb->sk = NULL;
@@ -1284,6 +1319,8 @@ int ip_push_pending_frames(struct sock *
 	/* Netfilter gets whole the not fragmented skb. */
 	err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
 		      skb->dst->dev, dst_output);
+	sk_account_uncharge(sk, send_size);
+	sk_mem_reclaim(sk);
 	if (err) {
 		if (err > 0)
 			err = inet->recverr ? net_xmit_errno(err) : 0;
@@ -1306,10 +1343,15 @@ error:
 void ip_flush_pending_frames(struct sock *sk)
 {
 	struct sk_buff *skb;
+	int truesize = 0;

-	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
+	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
+		truesize += skb->truesize;
 		kfree_skb(skb);
+	}

+	sk_account_uncharge(sk, truesize);
+	sk_mem_reclaim(sk);
 	ip_cork_release(inet_sk(sk));
 }

diff -pruN net-2.6-udp-take11a1-p3/net/ipv4/udp.c net-2.6-udp-take11a1-p4/net/ipv4/udp.c
--- net-2.6-udp-take11a1-p3/net/ipv4/udp.c	2007-12-17 14:42:40.000000000 -0500
+++ net-2.6-udp-take11a1-p4/net/ipv4/udp.c	2007-12-17 18:29:06.000000000 -0500
@@ -897,14 +897,18 @@ try_again:
 		err = ulen;

 out_free:
+	lock_sock(sk);
 	skb_free_datagram(sk, skb);
+	release_sock(sk);
 out:
 	return err;

 csum_copy_err:
 	UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_udplite);

+	lock_sock(sk);
 	skb_kill_datagram(sk, skb, flags);
+	release_sock(sk);

 	if (noblock)
 		return -EAGAIN;
@@ -934,6 +938,53 @@ int udp_disconnect(struct sock *sk, int
 	return 0;
 }

+/**
+ * 	__udp_queue_rcv_skb - put new skb to receive queue of socket
+ *	@sk: socket
+ *	@skb: skbuff
+ *
+ *	This function basically does the same thing as sock_queue_rcv_skb().
+ *	The difference to it is to set another destrucfor which is able to
+ *	do memory accouting.
+ */
+int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	int err = 0;
+	int skb_len;
+
+	/* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
+	   number of warnings when compiling with -W --ANK
+	 */
+	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
+	    (unsigned)sk->sk_rcvbuf) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = sk_filter(sk, skb);
+	if (err)
+		goto out;
+
+	skb->dev = NULL;
+	skb->sk = sk;
+	skb->destructor = sk_datagram_rfree;
+	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+
+	/* Cache the SKB length before we tack it onto the receive
+	 * queue.  Once it is added it no longer belongs to us and
+	 * may be freed by other threads of control pulling packets
+	 * from the queue.
+	 */
+	skb_len = skb->len;
+
+	skb_queue_tail(&sk->sk_receive_queue, skb);
+
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_data_ready(sk, skb_len);
+out:
+	return err;
+}
+
 /* returns:
  *  -1: error
  *   0: success
@@ -1022,10 +1073,17 @@ int udp_queue_rcv_skb(struct sock * sk,
 			goto drop;
 	}

-	if ((rc = sock_queue_rcv_skb(sk,skb)) < 0) {
+	if (!sk_account_rmem_charge(sk, skb->truesize)) {
+		UDP_INC_STATS_BH(UDP_MIB_RCVBUFERRORS, up->pcflag);
+		goto drop;
+	}
+
+	if ((rc = __udp_queue_rcv_skb(sk, skb)) < 0) {
 		/* Note that an ENOMEM error is charged twice */
 		if (rc == -ENOMEM)
 			UDP_INC_STATS_BH(UDP_MIB_RCVBUFERRORS, up->pcflag);
+		sk_account_uncharge(sk, skb->truesize);
+		sk_datagram_mem_reclaim(sk);
 		goto drop;
 	}

@@ -1068,7 +1126,15 @@ static int __udp4_lib_mcast_deliver(stru
 				skb1 = skb_clone(skb, GFP_ATOMIC);

 			if (skb1) {
-				int ret = udp_queue_rcv_skb(sk, skb1);
+				int ret = 0;
+
+				bh_lock_sock_nested(sk);
+				if (!sock_owned_by_user(sk))
+					ret = udp_queue_rcv_skb(sk, skb1);
+				else
+					sk_add_backlog(sk, skb1);
+				bh_unlock_sock(sk);
+
 				if (ret > 0)
 					/* we should probably re-process instead
 					 * of dropping packets here. */
@@ -1161,7 +1227,13 @@ int __udp4_lib_rcv(struct sk_buff *skb,
 			       inet_iif(skb), udptable);

 	if (sk != NULL) {
-		int ret = udp_queue_rcv_skb(sk, skb);
+		int ret = 0;
+		bh_lock_sock_nested(sk);
+		if (!sock_owned_by_user(sk))
+			ret = udp_queue_rcv_skb(sk, skb);
+		else
+			sk_add_backlog(sk, skb);
+		bh_unlock_sock(sk);
 		sock_put(sk);

 		/* a return value > 0 means to resubmit the input, but
-- 
Hitachi Computer Products (America) Inc.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/4] [CORE]: datagram: basic memory accounting functions
  2007-12-18  2:38 ` [PATCH 2/4] [CORE]: datagram: basic memory accounting functions Hideo AOKI
@ 2007-12-19  3:21   ` Hideo AOKI
  2007-12-20 11:43   ` David Miller
  1 sibling, 0 replies; 16+ messages in thread
From: Hideo AOKI @ 2007-12-19  3:21 UTC (permalink / raw)
  To: David Miller, Herbert Xu, netdev
  Cc: Takahiro Yasui, Masami Hiramatsu, Satoshi Oshima, Bill Fink,
	Andi Kleen, Evgeniy Polyakov, Stephen Hemminger, yoshfuji,
	Yumiko Sugita, haoki

Hello,

I would like to update this patch since there were redundant extern declarations.

Regards,
Hideo

---

This patch includes changes in network core sub system for memory
accounting.

Memory scheduling, charging, uncharging and reclaiming functions are
added. These functions use sk_forward_alloc to store socket local
accounting. They currently support only datagram protocols.

sk_datagram_rfree() is a receive buffer detractor for datagram
protocols which are capable of protocol specific memory accounting.

Cc: Satoshi Oshima <satoshi.oshima.fk@hitachi.com>
signed-off-by: Takahiro Yasui <tyasui@redhat.com>
signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
signed-off-by: Hideo Aoki <haoki@redhat.com>
---

 include/net/sock.h  |   78 +++++++++++++++++++++++++++++++++++++++++++++++
 net/core/datagram.c |   85 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 163 insertions(+)

diff -pruN net-2.6-udp-take11a1-p1/include/net/sock.h net-2.6-udp-take11a1-p2/include/net/sock.h
--- net-2.6-udp-take11a1-p1/include/net/sock.h	2007-12-11 10:54:53.000000000 -0500
+++ net-2.6-udp-take11a1-p2/include/net/sock.h	2007-12-18 18:57:35.000000000 -0500
@@ -750,6 +750,9 @@ static inline struct inode *SOCK_INODE(s
 	return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
 }

+/*
+ * Functions for memory accounting
+ */
 extern void __sk_stream_mem_reclaim(struct sock *sk);
 extern int sk_stream_mem_schedule(struct sock *sk, int size, int kind);

@@ -778,6 +781,79 @@ static inline int sk_stream_wmem_schedul
 	       sk_stream_mem_schedule(sk, size, 0);
 }

+extern int __sk_datagram_account_charge(struct sock *sk, int size, int kind);
+extern void __sk_datagram_mem_reclaim(struct sock *sk);
+extern int sk_datagram_mem_schedule(struct sock *sk, int size, int kind);
+
+#define SK_DATAGRAM_MEM_QUANTUM ((unsigned int)PAGE_SIZE)
+
+static inline int sk_datagram_pages(int amt)
+{
+	/* Cast to unsigned as an optimization, since amt is always positive. */
+	return DIV_ROUND_UP((unsigned int)amt, SK_DATAGRAM_MEM_QUANTUM);
+}
+
+static inline void sk_datagram_mem_reclaim(struct sock *sk)
+{
+	if (!sk->sk_prot->memory_allocated)
+		return;
+
+	__sk_datagram_mem_reclaim(sk);
+}
+
+static inline int sk_datagram_rmem_schedule(struct sock *sk, int size)
+{
+	return size <= sk->sk_forward_alloc ||
+		sk_datagram_mem_schedule(sk, size, 1);
+}
+
+static inline int sk_datagram_wmem_schedule(struct sock *sk, int size)
+{
+	return size <= sk->sk_forward_alloc ||
+		sk_datagram_mem_schedule(sk, size, 0);
+}
+
+static inline void sk_mem_reclaim(struct sock *sk)
+{
+	if (sk->sk_type == SOCK_DGRAM)
+		sk_datagram_mem_reclaim(sk);
+}
+
+static inline int sk_wmem_schedule(struct sock *sk, int size)
+{
+	if (sk->sk_type == SOCK_DGRAM)
+		return sk_datagram_wmem_schedule(sk, size);
+	else
+		return 1;
+}
+
+static inline int sk_account_rmem_charge(struct sock *sk, int size)
+{
+	/* account if protocol supports memory accounting. */
+	if (!sk->sk_prot->memory_allocated || sk->sk_type != SOCK_DGRAM)
+		return 1;
+
+	return __sk_datagram_account_charge(sk, size, 1);
+}
+
+static inline int sk_account_wmem_charge(struct sock *sk, int size)
+{
+	/* account if protocol supports memory accounting. */
+	if (!sk->sk_prot->memory_allocated || sk->sk_type != SOCK_DGRAM)
+		return 1;
+
+	return __sk_datagram_account_charge(sk, size, 0);
+}
+
+static inline void sk_account_uncharge(struct sock *sk, int size)
+{
+	/* account if protocol supports memory accounting. */
+	if (!sk->sk_prot->memory_allocated || sk->sk_type != SOCK_DGRAM)
+		return;
+
+	sk->sk_forward_alloc += size;
+}
+
 /* Used by processes to "lock" a socket state, so that
  * interrupts and bottom half handlers won't change it
  * from under us. It essentially blocks any incoming
@@ -1166,6 +1242,8 @@ static inline void skb_set_owner_r(struc
 	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
 }

+extern void sk_datagram_rfree(struct sk_buff *skb);
+
 extern void sk_reset_timer(struct sock *sk, struct timer_list* timer,
 			   unsigned long expires);

diff -pruN net-2.6-udp-take11a1-p1/net/core/datagram.c net-2.6-udp-take11a1-p2/net/core/datagram.c
--- net-2.6-udp-take11a1-p1/net/core/datagram.c	2007-12-11 10:54:55.000000000 -0500
+++ net-2.6-udp-take11a1-p2/net/core/datagram.c	2007-12-18 18:49:39.000000000 -0500
@@ -484,6 +484,91 @@ fault:
 }

 /**
+ *	sk_datagram_rfree - receive buffer detractor for datagram protocls
+ *	@skb: skbuff
+ */
+void sk_datagram_rfree(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+
+	skb_truesize_check(skb);
+	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+	sk_account_uncharge(sk, skb->truesize);
+	sk_datagram_mem_reclaim(sk);
+}
+EXPORT_SYMBOL(sk_datagram_rfree);
+
+
+/**
+ * 	__sk_datagram_account_charge - send buffer for datagram protocls
+ *	@sk: socket
+ *	@size: memory size to charge
+ *	@kind: charge type
+ *
+ *	If kind is 0, it means wmem allocation. Otherwise it means rmem
+ *	allocation.
+ */
+int __sk_datagram_account_charge(struct sock *sk, int size, int kind)
+{
+	if ((kind && sk_datagram_rmem_schedule(sk, size)) ||
+	    (!kind && sk_datagram_wmem_schedule(sk, size))) {
+		sk->sk_forward_alloc -= size;
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(__sk_datagram_account_charge);
+
+/**
+ * 	__sk_datagram_mem_reclaim - send buffer for datagram protocls
+ *	@sk: socket
+ */
+void __sk_datagram_mem_reclaim(struct sock *sk)
+{
+	if (sk->sk_forward_alloc < SK_DATAGRAM_MEM_QUANTUM)
+		return;
+
+	atomic_sub(sk->sk_forward_alloc / SK_DATAGRAM_MEM_QUANTUM,
+		   sk->sk_prot->memory_allocated);
+	sk->sk_forward_alloc &= SK_DATAGRAM_MEM_QUANTUM - 1;
+}
+EXPORT_SYMBOL(__sk_datagram_mem_reclaim);
+
+/**
+ * 	sk_datagram_mem_schedule - memory accounting for datagram protocls
+ *	@sk: socket
+ *	@size: memory size to allocate
+ *	@kind: allocation type
+ *
+ *	If kind is 0, it means wmem allocation. Otherwise it means rmem
+ *	allocation.
+ */
+int sk_datagram_mem_schedule(struct sock *sk, int size, int kind)
+{
+	int amt;
+	struct proto *prot = sk->sk_prot;
+
+	/* Don't account and limit memory if protocol doesn't support. */
+	if (!prot->memory_allocated)
+		return 1;
+
+	amt = sk_datagram_pages(size);
+	if (atomic_add_return(amt, prot->memory_allocated) >
+	    prot->sysctl_mem[0])
+		if ((kind && atomic_read(&sk->sk_rmem_alloc) + size >=
+		     prot->sysctl_rmem[0]) ||
+		    (!kind && atomic_read(&sk->sk_wmem_alloc) + size >=
+		     prot->sysctl_wmem[0])) {
+			/* Undo changes. */
+			atomic_sub(amt, prot->memory_allocated);
+			return 0;
+		}
+	sk->sk_forward_alloc += amt * SK_DATAGRAM_MEM_QUANTUM;
+	return 1;
+}
+EXPORT_SYMBOL(sk_datagram_mem_schedule);
+
+/**
  * 	datagram_poll - generic datagram poll
  *	@file: file struct
  *	@sock: socket
-- 
Hitachi Computer Products (America) Inc.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/4] [UDP]: fix send buffer check
  2007-12-18  2:38 ` [PATCH 1/4] [UDP]: fix send buffer check Hideo AOKI
@ 2007-12-20 11:31   ` David Miller
  2007-12-21  3:43     ` Hideo AOKI
  0 siblings, 1 reply; 16+ messages in thread
From: David Miller @ 2007-12-20 11:31 UTC (permalink / raw)
  To: haoki
  Cc: herbert, netdev, tyasui, mhiramat, satoshi.oshima.fk, billfink,
	andi, johnpol, shemminger, yoshfuji, yumiko.sugita.yf

From: Hideo AOKI <haoki@redhat.com>
Date: Mon, 17 Dec 2007 21:38:03 -0500

> This patch introduces sndbuf size check before memory allocation for
> send buffer.
> 
> signed-off-by: Satoshi Oshima <satoshi.oshima.fk@hitachi.com>
> signed-off-by: Hideo Aoki <haoki@redhat.com>
 ...
> diff -pruN net-2.6/net/ipv4/ip_output.c net-2.6-udp-take11a1-p1/net/ipv4/ip_output.c
> --- net-2.6/net/ipv4/ip_output.c	2007-12-11 10:54:55.000000000 -0500
> +++ net-2.6-udp-take11a1-p1/net/ipv4/ip_output.c	2007-12-17 14:42:31.000000000 -0500
> @@ -1004,6 +1004,11 @@ alloc_new_skb:
>  					frag = &skb_shinfo(skb)->frags[i];
>  				}
>  			} else if (i < MAX_SKB_FRAGS) {
> +				if (atomic_read(&sk->sk_wmem_alloc) + PAGE_SIZE
> +				    > 2 * sk->sk_sndbuf) {
> +					err = -ENOBUFS;
> +					goto error;
> +				}
>  				if (copy > PAGE_SIZE)
>  					copy = PAGE_SIZE;
>  				page = alloc_pages(sk->sk_allocation, 0);

If we are going to do this, we need to add the same check to
skb_append_datato_frags() which is invoked via ip_ufo_append_data().

We also have to be very careful in this area.  One problem we had a
long time ago was that we would socket account when fragmenting an
outgoing frame.  This was bogus because even if the socket had enough
space for one full sized frame, the packet send would fail because it
could not fit the space for both the original frame and the
fragmented copy of it.

This situation was cured by simply not enforcing accounting for the
fragmented copy.  It is valid because after we fragment, we keep
the fragmented copy but free the original.

This doesn't apply directly to this specific patch, but it is
something to keep in mind when doing these changes.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/4] [CORE]: datagram: basic memory accounting functions
  2007-12-18  2:38 ` [PATCH 2/4] [CORE]: datagram: basic memory accounting functions Hideo AOKI
  2007-12-19  3:21   ` Hideo AOKI
@ 2007-12-20 11:43   ` David Miller
  2007-12-21  4:18     ` Hideo AOKI
  1 sibling, 1 reply; 16+ messages in thread
From: David Miller @ 2007-12-20 11:43 UTC (permalink / raw)
  To: haoki
  Cc: herbert, netdev, tyasui, mhiramat, satoshi.oshima.fk, billfink,
	andi, johnpol, shemminger, yoshfuji, yumiko.sugita.yf

From: Hideo AOKI <haoki@redhat.com>
Date: Mon, 17 Dec 2007 21:38:17 -0500

Why do we need seperate stream and datagram accounting functions?

Is it just to facilitate things like the following test?

> +static inline int sk_wmem_schedule(struct sock *sk, int size)
> +{
> +	if (sk->sk_type == SOCK_DGRAM)
> +		return sk_datagram_wmem_schedule(sk, size);
> +	else
> +		return 1;
> +}

If so, this can be greatly improved.

All of these other functions are identical copies of the stream
counterparts, they should all be consolidated.

I still see a lot of special casing, instead of large pieces of common
code.

There should be one core set of functions that handle the memory
accounting, regardless of socket type.  Maybe there is one spot where
something like sk->prot->doing_memory_accounting is tested, but that's
it.

I am still very dissatisfied with these changes.  They are
full of special cases, because they mix generic facilities
(the socket memory accounting) with an unrelated issue
(we only support memory accounting for datagram sockets which
are actually UDP).

Also, the memory accounting is done at different parts in
the socket code paths for stream vs. datagram.  This is why
everything is inconsistent, and, a mess.

What's funny is that I absolutely do not care if these changes are
perfect and pass every possible regression test.  Rather, I'm more
concerned that this thing is designed correctly and will allow us to
have one core set of memory accounting functions regardless of socket
type.  As it is coded now, we have two sets of code paths to
fix, two ways of doing the socket accounting, and therefore twice
as much code to maintain and debug.

The whole thing needs to be consistent and without special cases.

The "protocol supports memory accounting" test can be performed, as
you did in this patch, by simply checking if
sk->sk_prot->memory_allocated is non-NULL.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/4] [UDP]: memory accounting in IPv4
  2007-12-18  2:38 ` [PATCH 4/4] [UDP]: memory accounting in IPv4 Hideo AOKI
@ 2007-12-20 11:44   ` David Miller
  2007-12-21  3:58     ` Hideo AOKI
  0 siblings, 1 reply; 16+ messages in thread
From: David Miller @ 2007-12-20 11:44 UTC (permalink / raw)
  To: haoki
  Cc: herbert, netdev, tyasui, mhiramat, satoshi.oshima.fk, billfink,
	andi, johnpol, shemminger, yoshfuji, yumiko.sugita.yf

From: Hideo AOKI <haoki@redhat.com>
Date: Mon, 17 Dec 2007 21:38:47 -0500

> This patch adds UDP memory usage accounting in IPv4.
> 
> Send buffer accounting is performed by IP layer, because skbuff is
> allocated in the layer.
> 
> Receive buffer is charged, when the buffer successfully received.
> Destructor of the buffer does uncharging and reclaiming, when the
> buffer is freed. To set destructor at proper place, we use
> __udp_queue_rcv_skb() instead of sock_queue_rcv_skb(). To maintain
> consistency of memory accounting, socket lock is used to free receive
> buffer in udp_recvmsg().
> 
> New packet will be add to backlog when the socket is used by user.
> 
> Cc: Satoshi Oshima <satoshi.oshima.fk@hitachi.com>
> signed-off-by: Takahiro Yasui <tyasui@redhat.com>
> signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
> signed-off-by: Hideo Aoki <haoki@redhat.com>

We can't accept these changes, even once the other issues
are fixed, until IPV6 is supported as well.

It's pointless to support proper UDP memory accounting only
in IPV4 and not in IPV6 as well.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/4] [UDP]: fix send buffer check
  2007-12-20 11:31   ` David Miller
@ 2007-12-21  3:43     ` Hideo AOKI
  0 siblings, 0 replies; 16+ messages in thread
From: Hideo AOKI @ 2007-12-21  3:43 UTC (permalink / raw)
  To: David Miller
  Cc: herbert, netdev, tyasui, mhiramat, satoshi.oshima.fk, billfink,
	andi, johnpol, shemminger, yoshfuji, yumiko.sugita.yf, haoki

Hello,

David Miller wrote:

>> diff -pruN net-2.6/net/ipv4/ip_output.c net-2.6-udp-take11a1-p1/net/ipv4/ip_output.c
>> --- net-2.6/net/ipv4/ip_output.c	2007-12-11 10:54:55.000000000 -0500
>> +++ net-2.6-udp-take11a1-p1/net/ipv4/ip_output.c	2007-12-17 14:42:31.000000000 -0500
>> @@ -1004,6 +1004,11 @@ alloc_new_skb:
>>  					frag = &skb_shinfo(skb)->frags[i];
>>  				}
>>  			} else if (i < MAX_SKB_FRAGS) {
>> +				if (atomic_read(&sk->sk_wmem_alloc) + PAGE_SIZE
>> +				    > 2 * sk->sk_sndbuf) {
>> +					err = -ENOBUFS;
>> +					goto error;
>> +				}
>>  				if (copy > PAGE_SIZE)
>>  					copy = PAGE_SIZE;
>>  				page = alloc_pages(sk->sk_allocation, 0);
> 
> If we are going to do this, we need to add the same check to
> skb_append_datato_frags() which is invoked via ip_ufo_append_data().
> 
> We also have to be very careful in this area.  One problem we had a
> long time ago was that we would socket account when fragmenting an
> outgoing frame.  This was bogus because even if the socket had enough
> space for one full sized frame, the packet send would fail because it
> could not fit the space for both the original frame and the
> fragmented copy of it.
> 
> This situation was cured by simply not enforcing accounting for the
> fragmented copy.  It is valid because after we fragment, we keep
> the fragmented copy but free the original.
> 
> This doesn't apply directly to this specific patch, but it is
> something to keep in mind when doing these changes.

Hello,

Thank you for sharing your experience.

Let me investigate this code and skb_append_datato_frags().

I'll include the check code in next patch set if it is really needed.

Regards,
Hideo

-- 
Hitachi Computer Products (America) Inc.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/4] [UDP]: memory accounting in IPv4
  2007-12-20 11:44   ` David Miller
@ 2007-12-21  3:58     ` Hideo AOKI
  0 siblings, 0 replies; 16+ messages in thread
From: Hideo AOKI @ 2007-12-21  3:58 UTC (permalink / raw)
  To: David Miller
  Cc: herbert, netdev, tyasui, mhiramat, satoshi.oshima.fk, billfink,
	andi, johnpol, shemminger, yoshfuji, yumiko.sugita.yf, haoki

David Miller wrote:
> From: Hideo AOKI <haoki@redhat.com>
> Date: Mon, 17 Dec 2007 21:38:47 -0500
> 
>> This patch adds UDP memory usage accounting in IPv4.
>>
> We can't accept these changes, even once the other issues
> are fixed, until IPV6 is supported as well.
> 
> It's pointless to support proper UDP memory accounting only
> in IPV4 and not in IPV6 as well.

I understood. I'll develop IPv6 UDP memory accounting as well and
include it in next UDP memory accounting patch set submission.

Many thanks,
Hideo

-- 
Hitachi Computer Products (America) Inc.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/4] [CORE]: datagram: basic memory accounting functions
  2007-12-20 11:43   ` David Miller
@ 2007-12-21  4:18     ` Hideo AOKI
  2007-12-21  4:31       ` David Miller
  0 siblings, 1 reply; 16+ messages in thread
From: Hideo AOKI @ 2007-12-21  4:18 UTC (permalink / raw)
  To: David Miller
  Cc: herbert, netdev, tyasui, mhiramat, satoshi.oshima.fk, billfink,
	andi, johnpol, shemminger, yoshfuji, yumiko.sugita.yf, haoki

Hello,

Thank you so much for your comments.

David Miller wrote:

> All of these other functions are identical copies of the stream
> counterparts, they should all be consolidated.
> 
> I still see a lot of special casing, instead of large pieces of common
> code.
> 
> There should be one core set of functions that handle the memory
> accounting, regardless of socket type.  Maybe there is one spot where
> something like sk->prot->doing_memory_accounting is tested, but that's
> it.

I understood. I'll re-write my patch set to make memory accounting
core functions.

> Also, the memory accounting is done at different parts in
> the socket code paths for stream vs. datagram.  This is why
> everything is inconsistent, and, a mess.

Could you tell me more detailed information?

Does this comment mean interface and usage of memory accounting functions?
If so, I'll consolidate functions like sk_stream_set_owner_r() and
sk_stream_free_skb(). And, I may have to use memory accounting functions in
memory allocating functions like sk_stream_alloc_pskb() as possible
instead of inside of socket operating functions.

Or, does the comment mean that send buffer accounting in IP layer
(e.g. ip_append_data()) is wrong?

Anyway, in next patch set, I'm going to consolidate mem_schedule
functions and mem_reclaim functions. To do so, some of memory
accounting functions for stream protocols will be renamed or
be moved to core/sock.c from core/stream.c.

I would like to know what kind of enhancement must be needed for
memory accounting core functions.

Again, thank you for taking your time to review this feature.

Best regards,
Hideo

-- 
Hitachi Computer Products (America) Inc.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/4] [CORE]: datagram: basic memory accounting functions
  2007-12-21  4:18     ` Hideo AOKI
@ 2007-12-21  4:31       ` David Miller
  2007-12-22  0:22         ` Hideo AOKI
  0 siblings, 1 reply; 16+ messages in thread
From: David Miller @ 2007-12-21  4:31 UTC (permalink / raw)
  To: haoki
  Cc: herbert, netdev, tyasui, mhiramat, satoshi.oshima.fk, billfink,
	andi, johnpol, shemminger, yoshfuji, yumiko.sugita.yf

From: Hideo AOKI <haoki@redhat.com>
Date: Thu, 20 Dec 2007 23:18:54 -0500

> > Also, the memory accounting is done at different parts in
> > the socket code paths for stream vs. datagram.  This is why
> > everything is inconsistent, and, a mess.
> 
> Could you tell me more detailed information?

I think the core thing is that TCP and INET protocols call into
the memory accounting internally, either inside their own code
paths or with inet_*() helpers.

This is versus what we really want is everything happening via generic
sk_foo() helpers.

If that's what's happening already, great, just consolidate the
datagram vs. stream stuff and it should be good.


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/4] [CORE]: datagram: basic memory accounting functions
  2007-12-21  4:31       ` David Miller
@ 2007-12-22  0:22         ` Hideo AOKI
  0 siblings, 0 replies; 16+ messages in thread
From: Hideo AOKI @ 2007-12-22  0:22 UTC (permalink / raw)
  To: David Miller
  Cc: herbert, netdev, tyasui, mhiramat, satoshi.oshima.fk, billfink,
	andi, johnpol, shemminger, yoshfuji, yumiko.sugita.yf

David Miller wrote:
> From: Hideo AOKI <haoki@redhat.com>
> Date: Thu, 20 Dec 2007 23:18:54 -0500
> 
>>> Also, the memory accounting is done at different parts in
>>> the socket code paths for stream vs. datagram.  This is why
>>> everything is inconsistent, and, a mess.
>> Could you tell me more detailed information?
> 
> I think the core thing is that TCP and INET protocols call into
> the memory accounting internally, either inside their own code
> paths or with inet_*() helpers.
> 
> This is versus what we really want is everything happening via generic
> sk_foo() helpers.
> 
> If that's what's happening already, great, just consolidate the
> datagram vs. stream stuff and it should be good.

Thank you for the explanation.

I'll do my best.

Regards,
Hideo

-- 
Hitachi Computer Products (America) Inc.


^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2007-12-22  0:32 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-12-18  2:33 [PATCH 0/4] [UDP]: memory accounting and limitation (take 11) Hideo AOKI
2007-12-18  2:38 ` [PATCH 1/4] [UDP]: fix send buffer check Hideo AOKI
2007-12-20 11:31   ` David Miller
2007-12-21  3:43     ` Hideo AOKI
2007-12-18  2:38 ` [PATCH 2/4] [CORE]: datagram: basic memory accounting functions Hideo AOKI
2007-12-19  3:21   ` Hideo AOKI
2007-12-20 11:43   ` David Miller
2007-12-21  4:18     ` Hideo AOKI
2007-12-21  4:31       ` David Miller
2007-12-22  0:22         ` Hideo AOKI
2007-12-18  2:38 ` [PATCH 3/4] [UDP]: add udp_mem, udp_rmem_min and udp_wmem_min Hideo AOKI
2007-12-18  2:38 ` [PATCH 4/4] [UDP]: memory accounting in IPv4 Hideo AOKI
2007-12-20 11:44   ` David Miller
2007-12-21  3:58     ` Hideo AOKI
  -- strict thread matches above, loose matches on Subject: below --
2007-12-15  5:07 [PATCH 0/4] [UDP]: memory accounting and limitation (take 10) Hideo AOKI
2007-12-15  5:15 ` [PATCH 3/4] [UDP]: add udp_mem, udp_rmem_min and udp_wmem_min Hideo AOKI
2007-11-28 18:48 [PATCH 0/4] UDP memory accounting and limitation (take 9) Hideo AOKI
2007-11-28 18:53 ` [PATCH 3/4] udp: add udp_mem, udp_rmem_min and udp_wmem_min Hideo AOKI

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).