* [PATCH 3/4] udp: add udp_mem, udp_rmem_min and udp_wmem_min
2007-11-28 18:48 [PATCH 0/4] UDP memory accounting and limitation (take 9) Hideo AOKI
@ 2007-11-28 18:53 ` Hideo AOKI
0 siblings, 0 replies; 11+ messages in thread
From: Hideo AOKI @ 2007-11-28 18:53 UTC (permalink / raw)
To: Herbert Xu, netdev
Cc: David Miller, Satoshi Oshima, Bill Fink, Andi Kleen,
Evgeniy Polyakov, Stephen Hemminger, yoshfuji, Yumiko Sugita,
haoki
This patch adds sysctl parameters for customizing UDP memory accounting:
/proc/sys/net/ipv4/udp_mem
/proc/sys/net/ipv4/udp_rmem_min
/proc/sys/net/ipv4/udp_wmem_min
udp_mem indicates the number of pages which can be used by all UDP sockets.
A UDP packet is dropped when the number of pages used for socket buffers is
beyond udp_mem and the socket has already consumed its minimum buffer.
This patch also introduces the memory_allocated variable for the UDP protocol.
Cc: Satoshi Oshima <satoshi.oshima.fk@hitachi.com>
signed-off-by: Hideo Aoki <haoki@redhat.com>
---
Documentation/networking/ip-sysctl.txt | 18 ++++++++++++++++++
include/net/udp.h | 9 +++++++++
net/ipv4/af_inet.c | 3 +++
net/ipv4/proc.c | 3 ++-
net/ipv4/sysctl_net_ipv4.c | 31 +++++++++++++++++++++++++++++++
net/ipv4/udp.c | 27 +++++++++++++++++++++++++++
6 files changed, 90 insertions(+), 1 deletion(-)
diff -pruN net-2.6-udp-take9a2-p2/Documentation/networking/ip-sysctl.txt net-2.6-udp-take9a2-p3/Documentation/networking/ip-sysctl.txt
--- net-2.6-udp-take9a2-p2/Documentation/networking/ip-sysctl.txt 2007-11-14 10:48:49.000000000 -0500
+++ net-2.6-udp-take9a2-p3/Documentation/networking/ip-sysctl.txt 2007-11-28 12:11:02.000000000 -0500
@@ -446,6 +446,24 @@ tcp_dma_copybreak - INTEGER
and CONFIG_NET_DMA is enabled.
Default: 4096
+UDP variables:
+
+udp_mem - INTEGER
+ Number of pages allowed for queueing by all UDP sockets.
+ Default is calculated at boot time from amount of available memory.
+
+udp_rmem_min - INTEGER
+ Minimal size of receive buffer used by UDP sockets. Each UDP socket
+ is able to use the size for receiving data, even if total pages of UDP
+ sockets exceed udp_mem. The unit is byte.
+ Default: 4096
+
+udp_wmem_min - INTEGER
+ Minimal size of send buffer used by UDP sockets. Each UDP socket is
+ able to use the size for sending data, even if total pages of UDP
+ sockets exceed udp_mem. The unit is byte.
+ Default: 4096
+
CIPSOv4 Variables:
cipso_cache_enable - BOOLEAN
diff -pruN net-2.6-udp-take9a2-p2/include/net/udp.h net-2.6-udp-take9a2-p3/include/net/udp.h
--- net-2.6-udp-take9a2-p2/include/net/udp.h 2007-11-14 10:49:05.000000000 -0500
+++ net-2.6-udp-take9a2-p3/include/net/udp.h 2007-11-28 12:11:02.000000000 -0500
@@ -65,6 +65,13 @@ extern rwlock_t udp_hash_lock;
extern struct proto udp_prot;
+extern atomic_t udp_memory_allocated;
+
+/* sysctl variables for udp */
+extern int sysctl_udp_mem;
+extern int sysctl_udp_rmem_min;
+extern int sysctl_udp_wmem_min;
+
struct sk_buff;
/*
@@ -173,4 +180,6 @@ extern void udp_proc_unregister(struct u
extern int udp4_proc_init(void);
extern void udp4_proc_exit(void);
#endif
+
+extern void udp_init(void);
#endif /* _UDP_H */
diff -pruN net-2.6-udp-take9a2-p2/net/ipv4/af_inet.c net-2.6-udp-take9a2-p3/net/ipv4/af_inet.c
--- net-2.6-udp-take9a2-p2/net/ipv4/af_inet.c 2007-11-14 10:49:06.000000000 -0500
+++ net-2.6-udp-take9a2-p3/net/ipv4/af_inet.c 2007-11-28 12:11:02.000000000 -0500
@@ -1418,6 +1418,9 @@ static int __init inet_init(void)
/* Setup TCP slab cache for open requests. */
tcp_init();
+ /* Setup UDP memory threshold */
+ udp_init();
+
/* Add UDP-Lite (RFC 3828) */
udplite4_register();
diff -pruN net-2.6-udp-take9a2-p2/net/ipv4/proc.c net-2.6-udp-take9a2-p3/net/ipv4/proc.c
--- net-2.6-udp-take9a2-p2/net/ipv4/proc.c 2007-11-14 10:49:07.000000000 -0500
+++ net-2.6-udp-take9a2-p3/net/ipv4/proc.c 2007-11-28 12:11:02.000000000 -0500
@@ -56,7 +56,8 @@ static int sockstat_seq_show(struct seq_
sock_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count),
tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated),
atomic_read(&tcp_memory_allocated));
- seq_printf(seq, "UDP: inuse %d\n", sock_prot_inuse(&udp_prot));
+ seq_printf(seq, "UDP: inuse %d mem %d\n", sock_prot_inuse(&udp_prot),
+ atomic_read(&udp_memory_allocated));
seq_printf(seq, "UDPLITE: inuse %d\n", sock_prot_inuse(&udplite_prot));
seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse(&raw_prot));
seq_printf(seq, "FRAG: inuse %d memory %d\n",
diff -pruN net-2.6-udp-take9a2-p2/net/ipv4/sysctl_net_ipv4.c net-2.6-udp-take9a2-p3/net/ipv4/sysctl_net_ipv4.c
--- net-2.6-udp-take9a2-p2/net/ipv4/sysctl_net_ipv4.c 2007-11-20 10:29:40.000000000 -0500
+++ net-2.6-udp-take9a2-p3/net/ipv4/sysctl_net_ipv4.c 2007-11-28 12:11:02.000000000 -0500
@@ -18,6 +18,7 @@
#include <net/ip.h>
#include <net/route.h>
#include <net/tcp.h>
+#include <net/udp.h>
#include <net/cipso_ipv4.h>
#include <net/inet_frag.h>
@@ -885,6 +886,36 @@ ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "udp_mem",
+ .data = &sysctl_udp_mem,
+ .maxlen = sizeof(sysctl_udp_mem),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "udp_rmem_min",
+ .data = &sysctl_udp_rmem_min,
+ .maxlen = sizeof(sysctl_udp_rmem_min),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "udp_wmem_min",
+ .data = &sysctl_udp_wmem_min,
+ .maxlen = sizeof(sysctl_udp_wmem_min),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero
+ },
{ .ctl_name = 0 }
};
diff -pruN net-2.6-udp-take9a2-p2/net/ipv4/udp.c net-2.6-udp-take9a2-p3/net/ipv4/udp.c
--- net-2.6-udp-take9a2-p2/net/ipv4/udp.c 2007-11-14 10:49:07.000000000 -0500
+++ net-2.6-udp-take9a2-p3/net/ipv4/udp.c 2007-11-28 12:11:02.000000000 -0500
@@ -82,6 +82,7 @@
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
+#include <linux/bootmem.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
@@ -114,6 +115,11 @@ DEFINE_SNMP_STAT(struct udp_mib, udp_sta
struct hlist_head udp_hash[UDP_HTABLE_SIZE];
DEFINE_RWLOCK(udp_hash_lock);
+atomic_t udp_memory_allocated;
+int sysctl_udp_mem __read_mostly;
+int sysctl_udp_rmem_min __read_mostly;
+int sysctl_udp_wmem_min __read_mostly;
+
static inline int __udp_lib_lport_inuse(__u16 num,
const struct hlist_head udptable[])
{
@@ -1449,6 +1455,10 @@ struct proto udp_prot = {
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
.get_port = udp_v4_get_port,
+ .memory_allocated = &udp_memory_allocated,
+ .sysctl_mem = &sysctl_udp_mem,
+ .sysctl_wmem = &sysctl_udp_wmem_min,
+ .sysctl_rmem = &sysctl_udp_rmem_min,
.obj_size = sizeof(struct udp_sock),
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_udp_setsockopt,
@@ -1644,6 +1654,23 @@ void udp4_proc_exit(void)
}
#endif /* CONFIG_PROC_FS */
+void __init udp_init(void)
+{
+ unsigned long limit;
+
+ /* Set the pressure threshold up by the same strategy of TCP. It is a
+ * fraction of global memory that is up to 1/2 at 256 MB, decreasing
+ * toward zero with the amount of memory, with a floor of 128 pages.
+ */
+ limit = min(nr_all_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
+ limit = (limit * (nr_all_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
+ limit = max(limit, 128UL);
+ sysctl_udp_mem = limit / 2 * 3;
+
+ sysctl_udp_rmem_min = SK_DATAGRAM_MEM_QUANTUM;
+ sysctl_udp_wmem_min = SK_DATAGRAM_MEM_QUANTUM;
+}
+
EXPORT_SYMBOL(udp_disconnect);
EXPORT_SYMBOL(udp_hash);
EXPORT_SYMBOL(udp_hash_lock);
--
Hitachi Computer Products (America) Inc.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 0/4] [UDP]: memory accounting and limitation (take 10)
@ 2007-12-15 5:07 Hideo AOKI
2007-12-15 5:14 ` [PATCH 1/4] [UDP]: fix send buffer check Hideo AOKI
` (4 more replies)
0 siblings, 5 replies; 11+ messages in thread
From: Hideo AOKI @ 2007-12-15 5:07 UTC (permalink / raw)
To: David Miller, Herbert Xu, netdev
Cc: haoki, Takahiro Yasui, Masami Hiramatsu, Satoshi Oshima, billfink,
Andi Kleen, Evgeniy Polyakov, Stephen Hemminger, yoshfuji,
Yumiko Sugita
Hello,
This is the latest patch set of UDP memory accounting and limitation.
To reduce number of atomic access to global variable, the patch set
supports per socket accounting using sk_forward_alloc like stream
protocols.
My colleagues and I tested the patch set on net-2.6 tree.
Please consider applying.
Changelog take 9 -> take 10:
* supported using sk_forward_alloc
* introduced several memory accounting functions with spin lock
* changed datagram receive functions to be able to customize
destructor
* fixed accounting bugs in previous takes
Changelog take 8 -> take 9:
* introduced mem_schedule functions for datagram protocols
* removed protocol check function from patch set
* restructured patch set
Changelog take 7 -> take 8:
* sk_datagram_pages(): avoided using divide instruction
* udp_recvmsg(): fixed referring released truesize in accounting
Best regards,
Hideo Aoki
--
Hitachi Computer Products (America) Inc.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 1/4] [UDP]: fix send buffer check
2007-12-15 5:07 [PATCH 0/4] [UDP]: memory accounting and limitation (take 10) Hideo AOKI
@ 2007-12-15 5:14 ` Hideo AOKI
2007-12-15 5:15 ` [PATCH 2/4] [CORE]: datagram: mem_scheudle functions Hideo AOKI
` (3 subsequent siblings)
4 siblings, 0 replies; 11+ messages in thread
From: Hideo AOKI @ 2007-12-15 5:14 UTC (permalink / raw)
To: David Miller, Herbert Xu, netdev
Cc: Takahiro Yasui, Masami Hiramatsu, Satoshi Oshima, billfink,
Andi Kleen, Evgeniy Polyakov, Stephen Hemminger, yoshfuji,
Yumiko Sugita, haoki
This patch introduces sndbuf size check before memory allocation for
send buffer.
signed-off-by: Satoshi Oshima <satoshi.oshima.fk@hitachi.com>
signed-off-by: Hideo Aoki <haoki@redhat.com>
---
ip_output.c | 5 +++++
1 file changed, 5 insertions(+)
diff -pruN net-2.6/net/ipv4/ip_output.c net-2.6-udp-take10a4-p1/net/ipv4/ip_output.c
--- net-2.6/net/ipv4/ip_output.c 2007-12-11 10:54:55.000000000 -0500
+++ net-2.6-udp-take10a4-p1/net/ipv4/ip_output.c 2007-12-14 16:42:04.000000000 -0500
@@ -1004,6 +1004,11 @@ alloc_new_skb:
frag = &skb_shinfo(skb)->frags[i];
}
} else if (i < MAX_SKB_FRAGS) {
+ if (atomic_read(&sk->sk_wmem_alloc) + PAGE_SIZE
+ > 2 * sk->sk_sndbuf) {
+ err = -ENOBUFS;
+ goto error;
+ }
if (copy > PAGE_SIZE)
copy = PAGE_SIZE;
page = alloc_pages(sk->sk_allocation, 0);
--
Hitachi Computer Products (America) Inc.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 2/4] [CORE]: datagram: mem_scheudle functions
2007-12-15 5:07 [PATCH 0/4] [UDP]: memory accounting and limitation (take 10) Hideo AOKI
2007-12-15 5:14 ` [PATCH 1/4] [UDP]: fix send buffer check Hideo AOKI
@ 2007-12-15 5:15 ` Hideo AOKI
2007-12-15 15:32 ` Herbert Xu
2007-12-15 5:15 ` [PATCH 3/4] [UDP]: add udp_mem, udp_rmem_min and udp_wmem_min Hideo AOKI
` (2 subsequent siblings)
4 siblings, 1 reply; 11+ messages in thread
From: Hideo AOKI @ 2007-12-15 5:15 UTC (permalink / raw)
To: David Miller, Herbert Xu, netdev
Cc: Takahiro Yasui, Masami Hiramatsu, Satoshi Oshima, billfink,
Andi Kleen, Evgeniy Polyakov, Stephen Hemminger, yoshfuji,
Yumiko Sugita, haoki
This patch includes changes in network core sub system for memory
accounting.
Memory scheduling, charging, uncharging and reclaiming functions are
added. These functions use sk_forward_alloc to store socket local
accounting. They also need to use lock to keep consistency of
sk_forward_alloc and memory_allocated. They currently support only
datagram protocols.
sk_datagram_rfree() is a receive buffer destructor for datagram
protocols which are capable of protocol specific memory accounting.
To enable memory accounting in releasing receive buffer,
sock_queue_rcv_skb() is modified although the interface isn't changed.
The body of the function is implemented in
sock_queue_rcv_skb_with_owner(). Additionally, skb_set_owner_r() is
moved from sock.h to core/datagram.c because we want to use it as a
callback function.
Cc: Satoshi Oshima <satoshi.oshima.fk@hitachi.com>
signed-off-by: Takahiro Yasui <tyasui@redhat.com>
signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
signed-off-by: Hideo Aoki <haoki@redhat.com>
---
include/net/sock.h | 117 +++++++++++++++++++++++++++++++++++++++++++++++++---
net/core/datagram.c | 72 ++++++++++++++++++++++++++++++++
net/core/sock.c | 13 ++++-
3 files changed, 193 insertions(+), 9 deletions(-)
diff -pruN net-2.6-udp-take10a4-p1/include/net/sock.h net-2.6-udp-take10a4-p2/include/net/sock.h
--- net-2.6-udp-take10a4-p1/include/net/sock.h 2007-12-11 10:54:53.000000000 -0500
+++ net-2.6-udp-take10a4-p2/include/net/sock.h 2007-12-14 20:27:40.000000000 -0500
@@ -750,6 +750,9 @@ static inline struct inode *SOCK_INODE(s
return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
}
+/*
+ * Functions for memory accounting
+ */
extern void __sk_stream_mem_reclaim(struct sock *sk);
extern int sk_stream_mem_schedule(struct sock *sk, int size, int kind);
@@ -778,6 +781,107 @@ static inline int sk_stream_wmem_schedul
sk_stream_mem_schedule(sk, size, 0);
}
+extern void __sk_datagram_mem_reclaim(struct sock *sk);
+extern int sk_stream_mem_schedule(struct sock *sk, int size, int kind);
+
+#define SK_DATAGRAM_MEM_QUANTUM ((unsigned int)PAGE_SIZE)
+
+static inline int sk_datagram_pages(int amt)
+{
+ /* Cast to unsigned as an optimization, since amt is always positive. */
+ return DIV_ROUND_UP((unsigned int)amt, SK_DATAGRAM_MEM_QUANTUM);
+}
+
+extern void __sk_datagram_mem_reclaim(struct sock *sk);
+extern int sk_datagram_mem_schedule(struct sock *sk, int size, int kind);
+
+static inline void sk_datagram_mem_reclaim(struct sock *sk)
+{
+ unsigned long flags;
+
+ if (!sk->sk_prot->memory_allocated)
+ return;
+
+ spin_lock_irqsave(&sk->sk_lock.slock, flags);
+ __sk_datagram_mem_reclaim(sk);
+ spin_unlock_irqrestore(&sk->sk_lock.slock, flags);
+}
+
+static inline int sk_datagram_rmem_schedule(struct sock *sk, int size)
+{
+ return size <= sk->sk_forward_alloc ||
+ sk_datagram_mem_schedule(sk, size, 1);
+}
+
+static inline int sk_datagram_wmem_schedule(struct sock *sk, int size)
+{
+ return size <= sk->sk_forward_alloc ||
+ sk_datagram_mem_schedule(sk, size, 0);
+}
+
+static inline void sk_mem_reclaim(struct sock *sk)
+{
+ if (sk->sk_type == SOCK_DGRAM)
+ sk_datagram_mem_reclaim(sk);
+}
+
+static inline int sk_wmem_schedule(struct sock *sk, int size)
+{
+ if (sk->sk_type == SOCK_DGRAM)
+ return sk_datagram_wmem_schedule(sk, size);
+ else
+ return 1;
+}
+
+static inline int sk_account_wmem_charge(struct sock *sk, int size)
+{
+ unsigned long flags;
+
+ /* account if protocol supports memory accounting. */
+ if (!sk->sk_prot->memory_allocated || sk->sk_type != SOCK_DGRAM)
+ return 1;
+
+ spin_lock_irqsave(&sk->sk_lock.slock, flags);
+ if (sk_datagram_wmem_schedule(sk, size)) {
+ sk->sk_forward_alloc -= size;
+ spin_unlock_irqrestore(&sk->sk_lock.slock, flags);
+ return 1;
+ }
+ spin_unlock_irqrestore(&sk->sk_lock.slock, flags);
+ return 0;
+}
+
+static inline int sk_account_rmem_charge(struct sock *sk, int size)
+{
+ unsigned long flags;
+
+ /* account if protocol supports memory accounting. */
+ if (!sk->sk_prot->memory_allocated || sk->sk_type != SOCK_DGRAM)
+ return 1;
+
+ spin_lock_irqsave(&sk->sk_lock.slock, flags);
+ if (sk_datagram_rmem_schedule(sk, size)) {
+ sk->sk_forward_alloc -= size;
+ spin_unlock_irqrestore(&sk->sk_lock.slock, flags);
+ return 1;
+ }
+ spin_unlock_irqrestore(&sk->sk_lock.slock, flags);
+ return 0;
+}
+
+static inline void sk_account_uncharge(struct sock *sk, int size)
+{
+ unsigned long flags;
+
+ /* account if protocol supports memory accounting. */
+ if (!sk->sk_prot->memory_allocated || sk->sk_type != SOCK_DGRAM)
+ return;
+
+ spin_lock_irqsave(&sk->sk_lock.slock, flags);
+ sk->sk_forward_alloc += size;
+ spin_unlock_irqrestore(&sk->sk_lock.slock, flags);
+}
+
/* Used by processes to "lock" a socket state, so that
* interrupts and bottom half handlers won't change it
* from under us. It essentially blocks any incoming
@@ -1159,18 +1263,19 @@ static inline void skb_set_owner_w(struc
atomic_add(skb->truesize, &sk->sk_wmem_alloc);
}
-static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
-{
- skb->sk = sk;
- skb->destructor = sock_rfree;
- atomic_add(skb->truesize, &sk->sk_rmem_alloc);
-}
+extern void skb_set_owner_r(struct sk_buff *skb, struct sock *sk);
+
+void sk_datagram_rfree(struct sk_buff *skb);
extern void sk_reset_timer(struct sock *sk, struct timer_list* timer,
unsigned long expires);
extern void sk_stop_timer(struct sock *sk, struct timer_list* timer);
+extern int sock_queue_rcv_skb_with_owner(struct sock *sk, struct sk_buff *skb,
+ void set_owner_r(struct sk_buff *nskb,
+ struct sock* nsk));
+
extern int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
static inline int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
diff -pruN net-2.6-udp-take10a4-p1/net/core/datagram.c net-2.6-udp-take10a4-p2/net/core/datagram.c
--- net-2.6-udp-take10a4-p1/net/core/datagram.c 2007-12-11 10:54:55.000000000 -0500
+++ net-2.6-udp-take10a4-p2/net/core/datagram.c 2007-12-14 20:26:18.000000000 -0500
@@ -200,6 +200,14 @@ void skb_free_datagram(struct sock *sk,
kfree_skb(skb);
}
+void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
+{
+ skb->sk = sk;
+ skb->destructor = sock_rfree;
+ atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+}
+EXPORT_SYMBOL(skb_set_owner_r);
+
/**
* skb_kill_datagram - Free a datagram skbuff forcibly
* @sk: socket
@@ -484,6 +492,70 @@ fault:
}
/**
+ * sk_datagram_rfree - receive buffer destructor for datagram protocols
+ * @skb: skbuff
+ */
+void sk_datagram_rfree(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+
+ skb_truesize_check(skb);
+ atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+ sk_account_uncharge(sk, skb->truesize);
+ sk_datagram_mem_reclaim(sk);
+}
+EXPORT_SYMBOL(sk_datagram_rfree);
+
+/**
+ * __sk_datagram_mem_reclaim - reclaim forward alloc for datagram protocols
+ * @sk: socket
+ */
+void __sk_datagram_mem_reclaim(struct sock *sk)
+{
+ if (sk->sk_forward_alloc < SK_DATAGRAM_MEM_QUANTUM)
+ return;
+
+ atomic_sub(sk->sk_forward_alloc / SK_DATAGRAM_MEM_QUANTUM,
+ sk->sk_prot->memory_allocated);
+ sk->sk_forward_alloc &= SK_DATAGRAM_MEM_QUANTUM - 1;
+}
+EXPORT_SYMBOL(__sk_datagram_mem_reclaim);
+
+/**
+ * sk_datagram_mem_schedule - memory accounting for datagram protocols
+ * @sk: socket
+ * @size: memory size to allocate
+ * @kind: allocation type
+ *
+ * If kind is 0, it means wmem allocation. Otherwise it means rmem
+ * allocation.
+ */
+int sk_datagram_mem_schedule(struct sock *sk, int size, int kind)
+{
+ int amt;
+ struct proto *prot = sk->sk_prot;
+
+ /* Don't account and limit memory if protocol doesn't support. */
+ if (!prot->memory_allocated)
+ return 1;
+
+ amt = sk_datagram_pages(size);
+ if (atomic_add_return(amt, prot->memory_allocated) >
+ prot->sysctl_mem[0])
+ if ((kind && atomic_read(&sk->sk_rmem_alloc) + size >=
+ prot->sysctl_rmem[0]) ||
+ (!kind && atomic_read(&sk->sk_wmem_alloc) + size >=
+ prot->sysctl_wmem[0])) {
+ /* Undo changes. */
+ atomic_sub(amt, prot->memory_allocated);
+ return 0;
+ }
+ sk->sk_forward_alloc += amt * SK_DATAGRAM_MEM_QUANTUM;
+ return 1;
+}
+EXPORT_SYMBOL(sk_datagram_mem_schedule);
+
+/**
* datagram_poll - generic datagram poll
* @file: file struct
* @sock: socket
diff -pruN net-2.6-udp-take10a4-p1/net/core/sock.c net-2.6-udp-take10a4-p2/net/core/sock.c
--- net-2.6-udp-take10a4-p1/net/core/sock.c 2007-12-11 10:54:55.000000000 -0500
+++ net-2.6-udp-take10a4-p2/net/core/sock.c 2007-12-14 16:42:06.000000000 -0500
@@ -263,8 +263,9 @@ static void sock_disable_timestamp(struc
}
}
-
-int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+int sock_queue_rcv_skb_with_owner(struct sock *sk, struct sk_buff *skb,
+ void set_owner_r(struct sk_buff *nskb,
+ struct sock* nsk))
{
int err = 0;
int skb_len;
@@ -283,7 +284,7 @@ int sock_queue_rcv_skb(struct sock *sk,
goto out;
skb->dev = NULL;
- skb_set_owner_r(skb, sk);
+ set_owner_r(skb, sk);
/* Cache the SKB length before we tack it onto the receive
* queue. Once it is added it no longer belongs to us and
@@ -299,6 +300,12 @@ int sock_queue_rcv_skb(struct sock *sk,
out:
return err;
}
+EXPORT_SYMBOL(sock_queue_rcv_skb_with_owner);
+
+int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ return sock_queue_rcv_skb_with_owner(sk, skb, skb_set_owner_r);
+}
EXPORT_SYMBOL(sock_queue_rcv_skb);
int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
--
Hitachi Computer Products (America) Inc.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 3/4] [UDP]: add udp_mem, udp_rmem_min and udp_wmem_min
2007-12-15 5:07 [PATCH 0/4] [UDP]: memory accounting and limitation (take 10) Hideo AOKI
2007-12-15 5:14 ` [PATCH 1/4] [UDP]: fix send buffer check Hideo AOKI
2007-12-15 5:15 ` [PATCH 2/4] [CORE]: datagram: mem_scheudle functions Hideo AOKI
@ 2007-12-15 5:15 ` Hideo AOKI
2007-12-15 5:15 ` [PATCH 4/4] [UDP]: memory accounting in IPv4 Hideo AOKI
2007-12-16 5:34 ` [PATCH 0/4] [UDP]: memory accounting and limitation (take 10) David Miller
4 siblings, 0 replies; 11+ messages in thread
From: Hideo AOKI @ 2007-12-15 5:15 UTC (permalink / raw)
To: David Miller, Herbert Xu, netdev
Cc: Takahiro Yasui, Masami Hiramatsu, Satoshi Oshima, billfink,
Andi Kleen, Evgeniy Polyakov, Stephen Hemminger, yoshfuji,
Yumiko Sugita, haoki
This patch adds sysctl parameters for customizing UDP memory accounting:
/proc/sys/net/ipv4/udp_mem
/proc/sys/net/ipv4/udp_rmem_min
/proc/sys/net/ipv4/udp_wmem_min
udp_mem indicates the number of pages which can be used by all UDP sockets.
A UDP packet is dropped when the number of pages used for socket buffers is
beyond udp_mem and the socket has already consumed its minimum buffer.
This patch also introduces the memory_allocated variable for the UDP protocol.
Cc: Satoshi Oshima <satoshi.oshima.fk@hitachi.com>
signed-off-by: Hideo Aoki <haoki@redhat.com>
---
Documentation/networking/ip-sysctl.txt | 18 ++++++++++++++++++
include/net/udp.h | 9 +++++++++
net/ipv4/af_inet.c | 3 +++
net/ipv4/proc.c | 3 ++-
net/ipv4/sysctl_net_ipv4.c | 31 +++++++++++++++++++++++++++++++
net/ipv4/udp.c | 27 +++++++++++++++++++++++++++
6 files changed, 90 insertions(+), 1 deletion(-)
diff -pruN net-2.6-udp-take10a4-p2/Documentation/networking/ip-sysctl.txt net-2.6-udp-take10a4-p3/Documentation/networking/ip-sysctl.txt
--- net-2.6-udp-take10a4-p2/Documentation/networking/ip-sysctl.txt 2007-12-11 10:54:41.000000000 -0500
+++ net-2.6-udp-take10a4-p3/Documentation/networking/ip-sysctl.txt 2007-12-14 20:27:54.000000000 -0500
@@ -446,6 +446,24 @@ tcp_dma_copybreak - INTEGER
and CONFIG_NET_DMA is enabled.
Default: 4096
+UDP variables:
+
+udp_mem - INTEGER
+ Number of pages allowed for queueing by all UDP sockets.
+ Default is calculated at boot time from amount of available memory.
+
+udp_rmem_min - INTEGER
+ Minimal size of receive buffer used by UDP sockets. Each UDP socket
+ is able to use the size for receiving data, even if total pages of UDP
+ sockets exceed udp_mem. The unit is byte.
+ Default: 4096
+
+udp_wmem_min - INTEGER
+ Minimal size of send buffer used by UDP sockets. Each UDP socket is
+ able to use the size for sending data, even if total pages of UDP
+ sockets exceed udp_mem. The unit is byte.
+ Default: 4096
+
CIPSOv4 Variables:
cipso_cache_enable - BOOLEAN
diff -pruN net-2.6-udp-take10a4-p2/include/net/udp.h net-2.6-udp-take10a4-p3/include/net/udp.h
--- net-2.6-udp-take10a4-p2/include/net/udp.h 2007-12-11 10:54:53.000000000 -0500
+++ net-2.6-udp-take10a4-p3/include/net/udp.h 2007-12-14 20:27:54.000000000 -0500
@@ -65,6 +65,13 @@ extern rwlock_t udp_hash_lock;
extern struct proto udp_prot;
+extern atomic_t udp_memory_allocated;
+
+/* sysctl variables for udp */
+extern int sysctl_udp_mem;
+extern int sysctl_udp_rmem_min;
+extern int sysctl_udp_wmem_min;
+
struct sk_buff;
/*
@@ -173,4 +180,6 @@ extern void udp_proc_unregister(struct u
extern int udp4_proc_init(void);
extern void udp4_proc_exit(void);
#endif
+
+extern void udp_init(void);
#endif /* _UDP_H */
diff -pruN net-2.6-udp-take10a4-p2/net/ipv4/af_inet.c net-2.6-udp-take10a4-p3/net/ipv4/af_inet.c
--- net-2.6-udp-take10a4-p2/net/ipv4/af_inet.c 2007-12-11 10:54:55.000000000 -0500
+++ net-2.6-udp-take10a4-p3/net/ipv4/af_inet.c 2007-12-14 20:27:54.000000000 -0500
@@ -1418,6 +1418,9 @@ static int __init inet_init(void)
/* Setup TCP slab cache for open requests. */
tcp_init();
+ /* Setup UDP memory threshold */
+ udp_init();
+
/* Add UDP-Lite (RFC 3828) */
udplite4_register();
diff -pruN net-2.6-udp-take10a4-p2/net/ipv4/proc.c net-2.6-udp-take10a4-p3/net/ipv4/proc.c
--- net-2.6-udp-take10a4-p2/net/ipv4/proc.c 2007-12-11 10:54:55.000000000 -0500
+++ net-2.6-udp-take10a4-p3/net/ipv4/proc.c 2007-12-14 20:27:54.000000000 -0500
@@ -56,7 +56,8 @@ static int sockstat_seq_show(struct seq_
sock_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count),
tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated),
atomic_read(&tcp_memory_allocated));
- seq_printf(seq, "UDP: inuse %d\n", sock_prot_inuse(&udp_prot));
+ seq_printf(seq, "UDP: inuse %d mem %d\n", sock_prot_inuse(&udp_prot),
+ atomic_read(&udp_memory_allocated));
seq_printf(seq, "UDPLITE: inuse %d\n", sock_prot_inuse(&udplite_prot));
seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse(&raw_prot));
seq_printf(seq, "FRAG: inuse %d memory %d\n",
diff -pruN net-2.6-udp-take10a4-p2/net/ipv4/sysctl_net_ipv4.c net-2.6-udp-take10a4-p3/net/ipv4/sysctl_net_ipv4.c
--- net-2.6-udp-take10a4-p2/net/ipv4/sysctl_net_ipv4.c 2007-12-11 10:54:55.000000000 -0500
+++ net-2.6-udp-take10a4-p3/net/ipv4/sysctl_net_ipv4.c 2007-12-14 20:27:54.000000000 -0500
@@ -18,6 +18,7 @@
#include <net/ip.h>
#include <net/route.h>
#include <net/tcp.h>
+#include <net/udp.h>
#include <net/cipso_ipv4.h>
#include <net/inet_frag.h>
@@ -885,6 +886,36 @@ ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "udp_mem",
+ .data = &sysctl_udp_mem,
+ .maxlen = sizeof(sysctl_udp_mem),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "udp_rmem_min",
+ .data = &sysctl_udp_rmem_min,
+ .maxlen = sizeof(sysctl_udp_rmem_min),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "udp_wmem_min",
+ .data = &sysctl_udp_wmem_min,
+ .maxlen = sizeof(sysctl_udp_wmem_min),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero
+ },
{ .ctl_name = 0 }
};
diff -pruN net-2.6-udp-take10a4-p2/net/ipv4/udp.c net-2.6-udp-take10a4-p3/net/ipv4/udp.c
--- net-2.6-udp-take10a4-p2/net/ipv4/udp.c 2007-12-11 10:54:55.000000000 -0500
+++ net-2.6-udp-take10a4-p3/net/ipv4/udp.c 2007-12-14 20:27:54.000000000 -0500
@@ -82,6 +82,7 @@
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
+#include <linux/bootmem.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
@@ -114,6 +115,11 @@ DEFINE_SNMP_STAT(struct udp_mib, udp_sta
struct hlist_head udp_hash[UDP_HTABLE_SIZE];
DEFINE_RWLOCK(udp_hash_lock);
+atomic_t udp_memory_allocated;
+int sysctl_udp_mem __read_mostly;
+int sysctl_udp_rmem_min __read_mostly;
+int sysctl_udp_wmem_min __read_mostly;
+
static inline int __udp_lib_lport_inuse(__u16 num,
const struct hlist_head udptable[])
{
@@ -1449,6 +1455,10 @@ struct proto udp_prot = {
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
.get_port = udp_v4_get_port,
+ .memory_allocated = &udp_memory_allocated,
+ .sysctl_mem = &sysctl_udp_mem,
+ .sysctl_wmem = &sysctl_udp_wmem_min,
+ .sysctl_rmem = &sysctl_udp_rmem_min,
.obj_size = sizeof(struct udp_sock),
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_udp_setsockopt,
@@ -1644,6 +1654,23 @@ void udp4_proc_exit(void)
}
#endif /* CONFIG_PROC_FS */
+void __init udp_init(void)
+{
+ unsigned long limit;
+
+ /* Set the pressure threshold up by the same strategy of TCP. It is a
+ * fraction of global memory that is up to 1/2 at 256 MB, decreasing
+ * toward zero with the amount of memory, with a floor of 128 pages.
+ */
+ limit = min(nr_all_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
+ limit = (limit * (nr_all_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
+ limit = max(limit, 128UL);
+ sysctl_udp_mem = limit / 4 * 3;
+
+ sysctl_udp_rmem_min = SK_DATAGRAM_MEM_QUANTUM;
+ sysctl_udp_wmem_min = SK_DATAGRAM_MEM_QUANTUM;
+}
+
EXPORT_SYMBOL(udp_disconnect);
EXPORT_SYMBOL(udp_hash);
EXPORT_SYMBOL(udp_hash_lock);
--
Hitachi Computer Products (America) Inc.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 4/4] [UDP]: memory accounting in IPv4
2007-12-15 5:07 [PATCH 0/4] [UDP]: memory accounting and limitation (take 10) Hideo AOKI
` (2 preceding siblings ...)
2007-12-15 5:15 ` [PATCH 3/4] [UDP]: add udp_mem, udp_rmem_min and udp_wmem_min Hideo AOKI
@ 2007-12-15 5:15 ` Hideo AOKI
2007-12-16 5:34 ` [PATCH 0/4] [UDP]: memory accounting and limitation (take 10) David Miller
4 siblings, 0 replies; 11+ messages in thread
From: Hideo AOKI @ 2007-12-15 5:15 UTC (permalink / raw)
To: David Miller, Herbert Xu, netdev
Cc: Takahiro Yasui, Masami Hiramatsu, Satoshi Oshima, billfink,
Andi Kleen, Evgeniy Polyakov, Stephen Hemminger, yoshfuji,
Yumiko Sugita, haoki
This patch adds UDP memory usage accounting in IPv4.
Send buffer accounting is performed by IP layer, because skbuff is
allocated in the layer.
The receive buffer is charged when the buffer is successfully received.
The destructor of the buffer does the uncharging and reclaiming when the
buffer is freed. To set the destructor at the proper place, we introduce
udp_set_owner_r().
In addition, to make sure that sk_forward_alloc is totally uncharged
in socket destruction, a reclaiming is added to inet_sock_destruct().
Cc: Satoshi Oshima <satoshi.oshima.fk@hitachi.com>
signed-off-by: Takahiro Yasui <tyasui@redhat.com>
signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
signed-off-by: Hideo Aoki <haoki@redhat.com>
---
af_inet.c | 2 ++
ip_output.c | 46 ++++++++++++++++++++++++++++++++++++++++++++--
udp.c | 16 +++++++++++++++-
3 files changed, 61 insertions(+), 3 deletions(-)
diff -pruN net-2.6-udp-take10a4-p3/net/ipv4/af_inet.c net-2.6-udp-take10a4-p4/net/ipv4/af_inet.c
--- net-2.6-udp-take10a4-p3/net/ipv4/af_inet.c 2007-12-14 20:27:54.000000000 -0500
+++ net-2.6-udp-take10a4-p4/net/ipv4/af_inet.c 2007-12-14 21:06:54.000000000 -0500
@@ -144,6 +144,8 @@ void inet_sock_destruct(struct sock *sk)
printk("Attempt to release alive inet socket %p\n", sk);
return;
}
+ if (sk->sk_type == SOCK_DGRAM)
+ sk_datagram_mem_reclaim(sk);
BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
diff -pruN net-2.6-udp-take10a4-p3/net/ipv4/ip_output.c net-2.6-udp-take10a4-p4/net/ipv4/ip_output.c
--- net-2.6-udp-take10a4-p3/net/ipv4/ip_output.c 2007-12-14 16:42:04.000000000 -0500
+++ net-2.6-udp-take10a4-p4/net/ipv4/ip_output.c 2007-12-14 21:06:54.000000000 -0500
@@ -707,6 +707,7 @@ static inline int ip_ufo_append_data(str
{
struct sk_buff *skb;
int err;
+ int first_size, second_size;
/* There is support for UDP fragmentation offload by network
* device, so create one single skb packet containing complete
@@ -720,6 +721,11 @@ static inline int ip_ufo_append_data(str
if (skb == NULL)
return err;
+ if (!sk_account_wmem_charge(sk, skb->truesize)) {
+ err = -ENOBUFS;
+ goto fail;
+ }
+
/* reserve space for Hardware header */
skb_reserve(skb, hh_len);
@@ -736,6 +742,7 @@ static inline int ip_ufo_append_data(str
skb->csum = 0;
sk->sk_sndmsg_off = 0;
}
+ first_size = skb->truesize;
err = skb_append_datato_frags(sk,skb, getfrag, from,
(length - transhdrlen));
@@ -743,6 +750,15 @@ static inline int ip_ufo_append_data(str
/* specify the length of each IP datagram fragment*/
skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
+
+ second_size = skb->truesize - first_size;
+ if (!sk_account_wmem_charge(sk, second_size)) {
+ sk_account_uncharge(sk, first_size);
+ sk_mem_reclaim(sk);
+ err = -ENOBUFS;
+ goto fail;
+ }
+
__skb_queue_tail(&sk->sk_write_queue, skb);
return 0;
@@ -750,6 +766,7 @@ static inline int ip_ufo_append_data(str
/* There is not enough support do UFO ,
* so follow normal path
*/
+fail:
kfree_skb(skb);
return err;
}
@@ -924,6 +941,11 @@ alloc_new_skb:
}
if (skb == NULL)
goto error;
+ if (!sk_account_wmem_charge(sk, skb->truesize)) {
+ err = -ENOBUFS;
+ kfree_skb(skb);
+ goto error;
+ }
/*
* Fill in the control structures
@@ -954,6 +976,8 @@ alloc_new_skb:
copy = datalen - transhdrlen - fraggap;
if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
err = -EFAULT;
+ sk_account_uncharge(sk, skb->truesize);
+ sk_mem_reclaim(sk);
kfree_skb(skb);
goto error;
}
@@ -1023,6 +1047,10 @@ alloc_new_skb:
frag = &skb_shinfo(skb)->frags[i];
skb->truesize += PAGE_SIZE;
atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
+ if (!sk_account_wmem_charge(sk, PAGE_SIZE)) {
+ err = -ENOBUFS;
+ goto error;
+ }
} else {
err = -EMSGSIZE;
goto error;
@@ -1124,6 +1152,11 @@ ssize_t ip_append_page(struct sock *sk,
err = -ENOBUFS;
goto error;
}
+ if (!sk_account_wmem_charge(sk, skb->truesize)) {
+ kfree_skb(skb);
+ err = -ENOBUFS;
+ goto error;
+ }
/*
* Fill in the control structures
@@ -1213,13 +1246,14 @@ int ip_push_pending_frames(struct sock *
struct iphdr *iph;
__be16 df = 0;
__u8 ttl;
- int err = 0;
+ int err = 0, send_size;
if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
goto out;
tail_skb = &(skb_shinfo(skb)->frag_list);
/* move skb->data to ip header from ext header */
+ send_size = skb->truesize;
if (skb->data < skb_network_header(skb))
__skb_pull(skb, skb_network_offset(skb));
while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
@@ -1229,6 +1263,7 @@ int ip_push_pending_frames(struct sock *
skb->len += tmp_skb->len;
skb->data_len += tmp_skb->len;
skb->truesize += tmp_skb->truesize;
+ send_size += tmp_skb->truesize;
__sock_put(tmp_skb->sk);
tmp_skb->destructor = NULL;
tmp_skb->sk = NULL;
@@ -1284,6 +1319,8 @@ int ip_push_pending_frames(struct sock *
/* Netfilter gets whole the not fragmented skb. */
err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
skb->dst->dev, dst_output);
+ sk_account_uncharge(sk, send_size);
+ sk_mem_reclaim(sk);
if (err) {
if (err > 0)
err = inet->recverr ? net_xmit_errno(err) : 0;
@@ -1306,10 +1343,15 @@ error:
void ip_flush_pending_frames(struct sock *sk)
{
struct sk_buff *skb;
+ int truesize = 0;
- while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
+ while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
+ truesize += skb->truesize;
kfree_skb(skb);
+ }
+ sk_account_uncharge(sk, truesize);
+ sk_mem_reclaim(sk);
ip_cork_release(inet_sk(sk));
}
diff -pruN net-2.6-udp-take10a4-p3/net/ipv4/udp.c net-2.6-udp-take10a4-p4/net/ipv4/udp.c
--- net-2.6-udp-take10a4-p3/net/ipv4/udp.c 2007-12-14 20:27:54.000000000 -0500
+++ net-2.6-udp-take10a4-p4/net/ipv4/udp.c 2007-12-14 21:06:54.000000000 -0500
@@ -934,6 +934,13 @@ int udp_disconnect(struct sock *sk, int
return 0;
}
+void udp_set_owner_r(struct sk_buff *skb, struct sock *sk)
+{
+ skb->sk = sk;
+ skb->destructor = sk_datagram_rfree;
+ atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+}
+
/* returns:
* -1: error
* 0: success
@@ -1022,10 +1029,17 @@ int udp_queue_rcv_skb(struct sock * sk,
goto drop;
}
- if ((rc = sock_queue_rcv_skb(sk,skb)) < 0) {
+ if (!sk_account_rmem_charge(sk, skb->truesize)) {
+ UDP_INC_STATS_BH(UDP_MIB_RCVBUFERRORS, up->pcflag);
+ goto drop;
+ }
+
+ if ((rc = sock_queue_rcv_skb_with_owner(sk, skb, udp_set_owner_r)) < 0) {
/* Note that an ENOMEM error is charged twice */
if (rc == -ENOMEM)
UDP_INC_STATS_BH(UDP_MIB_RCVBUFERRORS, up->pcflag);
+ sk_account_uncharge(sk, skb->truesize);
+ sk_datagram_mem_reclaim(sk);
goto drop;
}
--
Hitachi Computer Products (America) Inc.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 2/4] [CORE]: datagram: mem_scheudle functions
2007-12-15 5:15 ` [PATCH 2/4] [CORE]: datagram: mem_scheudle functions Hideo AOKI
@ 2007-12-15 15:32 ` Herbert Xu
2007-12-16 21:20 ` Hideo AOKI
0 siblings, 1 reply; 11+ messages in thread
From: Herbert Xu @ 2007-12-15 15:32 UTC (permalink / raw)
To: Hideo AOKI
Cc: David Miller, netdev, Takahiro Yasui, Masami Hiramatsu,
Satoshi Oshima, billfink, Andi Kleen, Evgeniy Polyakov,
Stephen Hemminger, yoshfuji, Yumiko Sugita
On Sat, Dec 15, 2007 at 12:15:04AM -0500, Hideo AOKI wrote:
> This patch includes changes in network core sub system for memory
> accounting.
>
> Memory scheduling, charging, uncharging and reclaiming functions are
> added. These functions use sk_forward_alloc to store socket local
> accounting. They also need to use lock to keep consistency of
> sk_forward_alloc and memory_allocated. They currently support only
> datagram protocols.
Thanks for the patch. I think it's generally on the right track
but there are still a few issues with the implementation.
> + spin_lock_irqsave(&sk->sk_lock.slock, flags);
Please use bh_lock_sock since this must never be used from an
IRQ handler.
> +static inline void sk_mem_reclaim(struct sock *sk)
> +{
> + if (sk->sk_type == SOCK_DGRAM)
> + sk_datagram_mem_reclaim(sk);
> +}
Please get rid of these wrappers since we should only get here
for datagram protocols.
> +static inline int sk_account_wmem_charge(struct sock *sk, int size)
> +{
> + unsigned long flags;
> +
> + /* account if protocol supports memory accounting. */
> + if (!sk->sk_prot->memory_allocated || sk->sk_type != SOCK_DGRAM)
> + return 1;
> +
> + spin_lock_irqsave(&sk->sk_lock.slock, flags);
> + if (sk_datagram_wmem_schedule(sk, size)) {
> + sk->sk_forward_alloc -= size;
> + spin_unlock_irqrestore(&sk->sk_lock.slock, flags);
> + return 1;
> + }
> + spin_unlock_irqrestore(&sk->sk_lock.slock, flags);
> + return 0;
> +}
This is probably too big to inline.
> +static inline int sk_account_rmem_charge(struct sock *sk, int size)
> +{
> + unsigned long flags;
> +
> + /* account if protocol supports memory accounting. */
> + if (!sk->sk_prot->memory_allocated || sk->sk_type != SOCK_DGRAM)
> + return 1;
> +
> + spin_lock_irqsave(&sk->sk_lock.slock, flags);
> + if (sk_datagram_rmem_schedule(sk, size)) {
> + sk->sk_forward_alloc -= size;
> + spin_unlock_irqrestore(&sk->sk_lock.slock, flags);
> + return 1;
> + }
> + spin_unlock_irqrestore(&sk->sk_lock.slock, flags);
> + return 0;
> +}
Why are we duplicating the rmem/wmem versions when they're identical?
> -int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
> +int sock_queue_rcv_skb_with_owner(struct sock *sk, struct sk_buff *skb,
> + void set_owner_r(struct sk_buff *nskb,
> + struct sock* nsk))
Just make a new function for this rather than playing with function
pointers.
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 0/4] [UDP]: memory accounting and limitation (take 10)
2007-12-15 5:07 [PATCH 0/4] [UDP]: memory accounting and limitation (take 10) Hideo AOKI
` (3 preceding siblings ...)
2007-12-15 5:15 ` [PATCH 4/4] [UDP]: memory accounting in IPv4 Hideo AOKI
@ 2007-12-16 5:34 ` David Miller
2007-12-16 21:21 ` Hideo AOKI
4 siblings, 1 reply; 11+ messages in thread
From: David Miller @ 2007-12-16 5:34 UTC (permalink / raw)
To: haoki
Cc: herbert, netdev, tyasui, mhiramat, satoshi.oshima.fk, billfink,
andi, johnpol, shemminger, yoshfuji, yumiko.sugita.yf
From: Hideo AOKI <haoki@redhat.com>
Date: Sat, 15 Dec 2007 00:07:44 -0500
> Changelog take 9 -> take 10:
> * supported using sk_forward_alloc
> * introduced several memory accounting functions with spin lock
> * changed datagram receive functions to be able to customize
> destructor
> * fixed accounting bugs in previous takes
This is not what Herbert and I meant with our suggestion.
We meant to convert all of UDP and datagram handling to lock
sockets precisely like TCP does, by calling lock_sock()
on entry to functions like udp_recvmsg() and release_sock()
on exit from those functions.
Then in the packet input processing, a sequence, just like
TCP, such as:
bh_lock_sock_nested(sk);
if (!sock_owned_by_user(sk)) {
udp_do_rcv(sk, skb);
} else
sk_add_backlog(sk, skb);
Then a suitably defined ->backlog_rcv is hooked up for these
protocols as well.
Again, use TCP as a guide.
There is much more work involved to implement this properly,
and make the accounting code sharable with TCP, than the
simplistic and minimal spin lock code you added here.
Please do this correctly, thank you.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 2/4] [CORE]: datagram: mem_scheudle functions
2007-12-15 15:32 ` Herbert Xu
@ 2007-12-16 21:20 ` Hideo AOKI
0 siblings, 0 replies; 11+ messages in thread
From: Hideo AOKI @ 2007-12-16 21:20 UTC (permalink / raw)
To: Herbert Xu
Cc: David Miller, netdev, Takahiro Yasui, Masami Hiramatsu,
Satoshi Oshima, billfink, Andi Kleen, Evgeniy Polyakov,
Stephen Hemminger, yoshfuji, Yumiko Sugita
Hello,
Thank you for your quick comments.
Herbert Xu wrote:
>> + spin_lock_irqsave(&sk->sk_lock.slock, flags);
>
> Please use bh_lock_sock since this must never be used from an
> IRQ handler.
I'll try to re-implement this locking mechanism as David suggested.
>> +static inline void sk_mem_reclaim(struct sock *sk)
>> +{
>> + if (sk->sk_type == SOCK_DGRAM)
>> + sk_datagram_mem_reclaim(sk);
>> +}
>
> Please get rid of these wrappers since we should only get here
> for datagram protocols.
In my understanding, TCP also uses ip_append_data() and
ip_ufo_append_data() via ip_send_reply(). So I thought TCP could
reach the datagram memory accounting functions, and I used the wrappers
since I didn't want to change TCP's sk_forward_alloc and
memory_allocated.
I'll try to remove these wrappers in the next patch set. But I'd
appreciate it if you could let me know whether we can keep the wrappers
in case TCP memory accounting doesn't work well due to accounting
in the IP layer.
>> +static inline int sk_account_wmem_charge(struct sock *sk, int size)
>> +{
>> + unsigned long flags;
>> +
>> + /* account if protocol supports memory accounting. */
>> + if (!sk->sk_prot->memory_allocated || sk->sk_type != SOCK_DGRAM)
>> + return 1;
>> +
>> + spin_lock_irqsave(&sk->sk_lock.slock, flags);
>> + if (sk_datagram_wmem_schedule(sk, size)) {
>> + sk->sk_forward_alloc -= size;
>> + spin_unlock_irqrestore(&sk->sk_lock.slock, flags);
>> + return 1;
>> + }
>> + spin_unlock_irqrestore(&sk->sk_lock.slock, flags);
>> + return 0;
>> +}
>
> This is probably too big to inline.
>
>> +static inline int sk_account_rmem_charge(struct sock *sk, int size)
>> +{
>> + unsigned long flags;
>> +
>> + /* account if protocol supports memory accounting. */
>> + if (!sk->sk_prot->memory_allocated || sk->sk_type != SOCK_DGRAM)
>> + return 1;
>> +
>> + spin_lock_irqsave(&sk->sk_lock.slock, flags);
>> + if (sk_datagram_rmem_schedule(sk, size)) {
>> + sk->sk_forward_alloc -= size;
>> + spin_unlock_irqrestore(&sk->sk_lock.slock, flags);
>> + return 1;
>> + }
>> + spin_unlock_irqrestore(&sk->sk_lock.slock, flags);
>> + return 0;
>> +}
>
> Why are we duplicating the rmem/wmem versions when they're identical?
Good catch. I'll merge the body of these functions into one function.
Furthermore, I'll stop using inline if the code is large.
>> -int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
>> +int sock_queue_rcv_skb_with_owner(struct sock *sk, struct sk_buff *skb,
>> + void set_owner_r(struct sk_buff *nskb,
>> + struct sock* nsk))
>
> Just make a new function for this rather than playing with function
> pointers.
I understood. I'll fix it.
Again, many thanks for the review.
Regards,
Hideo
--
Hitachi Computer Products (America) Inc.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 0/4] [UDP]: memory accounting and limitation (take 10)
2007-12-16 5:34 ` [PATCH 0/4] [UDP]: memory accounting and limitation (take 10) David Miller
@ 2007-12-16 21:21 ` Hideo AOKI
0 siblings, 0 replies; 11+ messages in thread
From: Hideo AOKI @ 2007-12-16 21:21 UTC (permalink / raw)
To: David Miller
Cc: herbert, netdev, tyasui, mhiramat, satoshi.oshima.fk, billfink,
andi, johnpol, shemminger, yoshfuji, yumiko.sugita.yf, haoki
David Miller wrote:
> From: Hideo AOKI <haoki@redhat.com>
> Date: Sat, 15 Dec 2007 00:07:44 -0500
>
>> Changelog take 9 -> take 10:
>> * supported using sk_forward_alloc
>> * introduced several memory accounting functions with spin lock
>> * changed datagram receive functions to be able to customize
>> destructor
>> * fixed accounting bugs in previous takes
>
> This is not what Herbert and I meant with our suggestion.
>
> We meant to convert all of UDP and datagram handling to lock
> sockets precisely like TCP does, by calling lock_sock()
> on entry to functions like udp_recvmsg() and release_sock()
> on exit from those functions.
>
> Then in the packet input processing, a sequence, just like
> TCP, such as:
>
> bh_lock_sock_nested(sk);
> if (!sock_owned_by_user(sk)) {
> udp_do_rcv(sk, skb);
> } else
> sk_add_backlog(sk, skb);
>
> Then a suitably defined ->backlog_rcv is hooked up for these
> protocols as well.
>
> Again, use TCP as a guide.
>
> There is much more work involved to implement this properly,
> and make the accounting code sharable with TCP, than the
> simplistic and minimal spin lock code you added here.
>
> Please do this correctly, thank you.
Hello,
I appreciate your suggestions.
I'll try to re-implement the locking mechanism like TCP.
Regards,
Hideo
--
Hitachi Computer Products (America) Inc.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 3/4] [UDP]: add udp_mem, udp_rmem_min and udp_wmem_min
2007-12-18 2:33 [PATCH 0/4] [UDP]: memory accounting and limitation (take 11) Hideo AOKI
@ 2007-12-18 2:38 ` Hideo AOKI
0 siblings, 0 replies; 11+ messages in thread
From: Hideo AOKI @ 2007-12-18 2:38 UTC (permalink / raw)
To: David Miller, Herbert Xu, netdev
Cc: Takahiro Yasui, Masami Hiramatsu, Satoshi Oshima, Bill Fink,
Andi Kleen, Evgeniy Polyakov, Stephen Hemminger, yoshfuji,
Yumiko Sugita, haoki
This patch adds sysctl parameters for customizing UDP memory accounting:
/proc/sys/net/ipv4/udp_mem
/proc/sys/net/ipv4/udp_rmem_min
/proc/sys/net/ipv4/udp_wmem_min
udp_mem indicates the number of pages which can be used by all UDP sockets.
A UDP packet is dropped when the number of pages used for socket buffers
exceeds udp_mem and the socket has already consumed its minimum buffer size.
This patch also introduces a memory_allocated variable for the UDP protocol.
Cc: Satoshi Oshima <satoshi.oshima.fk@hitachi.com>
Signed-off-by: Hideo Aoki <haoki@redhat.com>
---
Documentation/networking/ip-sysctl.txt | 18 ++++++++++++++++++
include/net/udp.h | 9 +++++++++
net/ipv4/af_inet.c | 3 +++
net/ipv4/proc.c | 3 ++-
net/ipv4/sysctl_net_ipv4.c | 31 +++++++++++++++++++++++++++++++
net/ipv4/udp.c | 27 +++++++++++++++++++++++++++
6 files changed, 90 insertions(+), 1 deletion(-)
diff -pruN net-2.6-udp-take11a1-p2/Documentation/networking/ip-sysctl.txt net-2.6-udp-take11a1-p3/Documentation/networking/ip-sysctl.txt
--- net-2.6-udp-take11a1-p2/Documentation/networking/ip-sysctl.txt 2007-12-11 10:54:41.000000000 -0500
+++ net-2.6-udp-take11a1-p3/Documentation/networking/ip-sysctl.txt 2007-12-17 14:42:40.000000000 -0500
@@ -446,6 +446,24 @@ tcp_dma_copybreak - INTEGER
and CONFIG_NET_DMA is enabled.
Default: 4096
+UDP variables:
+
+udp_mem - INTEGER
+ Number of pages allowed for queueing by all UDP sockets.
+ Default is calculated at boot time from amount of available memory.
+
+udp_rmem_min - INTEGER
+ Minimal size of receive buffer used by UDP sockets. Each UDP socket
+ is able to use the size for receiving data, even if total pages of UDP
+ sockets exceed udp_mem. The unit is byte.
+ Default: 4096
+
+udp_wmem_min - INTEGER
+ Minimal size of send buffer used by UDP sockets. Each UDP socket is
+ able to use the size for sending data, even if total pages of UDP
+ sockets exceed udp_mem. The unit is byte.
+ Default: 4096
+
CIPSOv4 Variables:
cipso_cache_enable - BOOLEAN
diff -pruN net-2.6-udp-take11a1-p2/include/net/udp.h net-2.6-udp-take11a1-p3/include/net/udp.h
--- net-2.6-udp-take11a1-p2/include/net/udp.h 2007-12-11 10:54:53.000000000 -0500
+++ net-2.6-udp-take11a1-p3/include/net/udp.h 2007-12-17 14:42:40.000000000 -0500
@@ -65,6 +65,13 @@ extern rwlock_t udp_hash_lock;
extern struct proto udp_prot;
+extern atomic_t udp_memory_allocated;
+
+/* sysctl variables for udp */
+extern int sysctl_udp_mem;
+extern int sysctl_udp_rmem_min;
+extern int sysctl_udp_wmem_min;
+
struct sk_buff;
/*
@@ -173,4 +180,6 @@ extern void udp_proc_unregister(struct u
extern int udp4_proc_init(void);
extern void udp4_proc_exit(void);
#endif
+
+extern void udp_init(void);
#endif /* _UDP_H */
diff -pruN net-2.6-udp-take11a1-p2/net/ipv4/af_inet.c net-2.6-udp-take11a1-p3/net/ipv4/af_inet.c
--- net-2.6-udp-take11a1-p2/net/ipv4/af_inet.c 2007-12-11 10:54:55.000000000 -0500
+++ net-2.6-udp-take11a1-p3/net/ipv4/af_inet.c 2007-12-17 14:42:40.000000000 -0500
@@ -1418,6 +1418,9 @@ static int __init inet_init(void)
/* Setup TCP slab cache for open requests. */
tcp_init();
+ /* Setup UDP memory threshold */
+ udp_init();
+
/* Add UDP-Lite (RFC 3828) */
udplite4_register();
diff -pruN net-2.6-udp-take11a1-p2/net/ipv4/proc.c net-2.6-udp-take11a1-p3/net/ipv4/proc.c
--- net-2.6-udp-take11a1-p2/net/ipv4/proc.c 2007-12-11 10:54:55.000000000 -0500
+++ net-2.6-udp-take11a1-p3/net/ipv4/proc.c 2007-12-17 14:42:40.000000000 -0500
@@ -56,7 +56,8 @@ static int sockstat_seq_show(struct seq_
sock_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count),
tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated),
atomic_read(&tcp_memory_allocated));
- seq_printf(seq, "UDP: inuse %d\n", sock_prot_inuse(&udp_prot));
+ seq_printf(seq, "UDP: inuse %d mem %d\n", sock_prot_inuse(&udp_prot),
+ atomic_read(&udp_memory_allocated));
seq_printf(seq, "UDPLITE: inuse %d\n", sock_prot_inuse(&udplite_prot));
seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse(&raw_prot));
seq_printf(seq, "FRAG: inuse %d memory %d\n",
diff -pruN net-2.6-udp-take11a1-p2/net/ipv4/sysctl_net_ipv4.c net-2.6-udp-take11a1-p3/net/ipv4/sysctl_net_ipv4.c
--- net-2.6-udp-take11a1-p2/net/ipv4/sysctl_net_ipv4.c 2007-12-11 10:54:55.000000000 -0500
+++ net-2.6-udp-take11a1-p3/net/ipv4/sysctl_net_ipv4.c 2007-12-17 14:42:40.000000000 -0500
@@ -18,6 +18,7 @@
#include <net/ip.h>
#include <net/route.h>
#include <net/tcp.h>
+#include <net/udp.h>
#include <net/cipso_ipv4.h>
#include <net/inet_frag.h>
@@ -885,6 +886,36 @@ ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "udp_mem",
+ .data = &sysctl_udp_mem,
+ .maxlen = sizeof(sysctl_udp_mem),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "udp_rmem_min",
+ .data = &sysctl_udp_rmem_min,
+ .maxlen = sizeof(sysctl_udp_rmem_min),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "udp_wmem_min",
+ .data = &sysctl_udp_wmem_min,
+ .maxlen = sizeof(sysctl_udp_wmem_min),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero
+ },
{ .ctl_name = 0 }
};
diff -pruN net-2.6-udp-take11a1-p2/net/ipv4/udp.c net-2.6-udp-take11a1-p3/net/ipv4/udp.c
--- net-2.6-udp-take11a1-p2/net/ipv4/udp.c 2007-12-11 10:54:55.000000000 -0500
+++ net-2.6-udp-take11a1-p3/net/ipv4/udp.c 2007-12-17 14:42:40.000000000 -0500
@@ -82,6 +82,7 @@
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
+#include <linux/bootmem.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
@@ -114,6 +115,11 @@ DEFINE_SNMP_STAT(struct udp_mib, udp_sta
struct hlist_head udp_hash[UDP_HTABLE_SIZE];
DEFINE_RWLOCK(udp_hash_lock);
+atomic_t udp_memory_allocated;
+int sysctl_udp_mem __read_mostly;
+int sysctl_udp_rmem_min __read_mostly;
+int sysctl_udp_wmem_min __read_mostly;
+
static inline int __udp_lib_lport_inuse(__u16 num,
const struct hlist_head udptable[])
{
@@ -1449,6 +1455,10 @@ struct proto udp_prot = {
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
.get_port = udp_v4_get_port,
+ .memory_allocated = &udp_memory_allocated,
+ .sysctl_mem = &sysctl_udp_mem,
+ .sysctl_wmem = &sysctl_udp_wmem_min,
+ .sysctl_rmem = &sysctl_udp_rmem_min,
.obj_size = sizeof(struct udp_sock),
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_udp_setsockopt,
@@ -1644,6 +1654,23 @@ void udp4_proc_exit(void)
}
#endif /* CONFIG_PROC_FS */
+void __init udp_init(void)
+{
+ unsigned long limit;
+
+ /* Set the pressure threshold up by the same strategy of TCP. It is a
+ * fraction of global memory that is up to 1/2 at 256 MB, decreasing
+ * toward zero with the amount of memory, with a floor of 128 pages.
+ */
+ limit = min(nr_all_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
+ limit = (limit * (nr_all_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
+ limit = max(limit, 128UL);
+ sysctl_udp_mem = limit / 4 * 3;
+
+ sysctl_udp_rmem_min = SK_DATAGRAM_MEM_QUANTUM;
+ sysctl_udp_wmem_min = SK_DATAGRAM_MEM_QUANTUM;
+}
+
EXPORT_SYMBOL(udp_disconnect);
EXPORT_SYMBOL(udp_hash);
EXPORT_SYMBOL(udp_hash_lock);
--
Hitachi Computer Products (America) Inc.
^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2007-12-18 2:45 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-12-15 5:07 [PATCH 0/4] [UDP]: memory accounting and limitation (take 10) Hideo AOKI
2007-12-15 5:14 ` [PATCH 1/4] [UDP]: fix send buffer check Hideo AOKI
2007-12-15 5:15 ` [PATCH 2/4] [CORE]: datagram: mem_scheudle functions Hideo AOKI
2007-12-15 15:32 ` Herbert Xu
2007-12-16 21:20 ` Hideo AOKI
2007-12-15 5:15 ` [PATCH 3/4] [UDP]: add udp_mem, udp_rmem_min and udp_wmem_min Hideo AOKI
2007-12-15 5:15 ` [PATCH 4/4] [UDP]: memory accounting in IPv4 Hideo AOKI
2007-12-16 5:34 ` [PATCH 0/4] [UDP]: memory accounting and limitation (take 10) David Miller
2007-12-16 21:21 ` Hideo AOKI
-- strict thread matches above, loose matches on Subject: below --
2007-12-18 2:33 [PATCH 0/4] [UDP]: memory accounting and limitation (take 11) Hideo AOKI
2007-12-18 2:38 ` [PATCH 3/4] [UDP]: add udp_mem, udp_rmem_min and udp_wmem_min Hideo AOKI
2007-11-28 18:48 [PATCH 0/4] UDP memory accounting and limitation (take 9) Hideo AOKI
2007-11-28 18:53 ` [PATCH 3/4] udp: add udp_mem, udp_rmem_min and udp_wmem_min Hideo AOKI
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).