* [PATCH 1/3] [UDP]: add udp_mem, udp_rmem_min and udp_wmem_min
From: Hideo AOKI @ 2007-12-30 9:01 UTC (permalink / raw)
To: David Miller, Herbert Xu, netdev
Cc: Takahiro Yasui, Masami Hiramatsu, Satoshi Oshima, billfink,
Andi Kleen, Evgeniy Polyakov, Stephen Hemminger, yoshfuji,
Yumiko Sugita, haoki
In-Reply-To: <47775D8C.5010104@redhat.com>
This patch adds sysctl parameters for customizing UDP memory accounting:
/proc/sys/net/ipv4/udp_mem
/proc/sys/net/ipv4/udp_rmem_min
/proc/sys/net/ipv4/udp_wmem_min
udp_mem indicates the number of pages which can be used by all UDP sockets.
A UDP packet is dropped when the number of pages used for socket buffers
exceeds udp_mem and the socket already consumes its minimum buffer.
Cc: Satoshi Oshima <satoshi.oshima.fk@hitachi.com>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
signed-off-by: Takahiro Yasui <tyasui@redhat.com>
signed-off-by: Hideo Aoki <haoki@redhat.com>
---
Documentation/networking/ip-sysctl.txt | 27 +++++++++++++++++++++++++++
include/net/udp.h | 7 +++++++
net/ipv4/af_inet.c | 3 +++
net/ipv4/proc.c | 3 ++-
net/ipv4/sysctl_net_ipv4.c | 31 +++++++++++++++++++++++++++++++
net/ipv4/udp.c | 31 +++++++++++++++++++++++++++++++
6 files changed, 101 insertions(+), 1 deletion(-)
diff -pruN net-2.6.25-t12t19m-p4/Documentation/networking/ip-sysctl.txt net-2.6.25-t12t19m-p5/Documentation/networking/ip-sysctl.txt
--- net-2.6.25-t12t19m-p4/Documentation/networking/ip-sysctl.txt 2007-12-27 10:18:41.000000000 -0500
+++ net-2.6.25-t12t19m-p5/Documentation/networking/ip-sysctl.txt 2007-12-29 21:09:21.000000000 -0500
@@ -446,6 +446,33 @@ tcp_dma_copybreak - INTEGER
and CONFIG_NET_DMA is enabled.
Default: 4096
+UDP variables:
+
+udp_mem - vector of 3 INTEGERs: min, pressure, max
+ Number of pages allowed for queueing by all UDP sockets.
+
+ min: Below this number of pages UDP is not bothered about its
+ memory appetite. When amount of memory allocated by UDP exceeds
+ this number, UDP starts to moderate memory usage.
+
+ pressure: This value was introduced to follow format of tcp_mem.
+
+ max: Number of pages allowed for queueing by all UDP sockets.
+
+ Default is calculated at boot time from amount of available memory.
+
+udp_rmem_min - INTEGER
+ Minimal size of receive buffer used by UDP sockets in moderation.
+ Each UDP socket is able to use the size for receiving data, even if
+ total pages of UDP sockets exceed udp_mem pressure. The unit is byte.
+ Default: 4096
+
+udp_wmem_min - INTEGER
+ Minimal size of send buffer used by UDP sockets in moderation.
+ Each UDP socket is able to use the size for sending data, even if
+ total pages of UDP sockets exceed udp_mem pressure. The unit is byte.
+ Default: 4096
+
CIPSOv4 Variables:
cipso_cache_enable - BOOLEAN
diff -pruN net-2.6.25-t12t19m-p4/include/net/udp.h net-2.6.25-t12t19m-p5/include/net/udp.h
--- net-2.6.25-t12t19m-p4/include/net/udp.h 2007-12-27 10:18:58.000000000 -0500
+++ net-2.6.25-t12t19m-p5/include/net/udp.h 2007-12-29 21:10:48.000000000 -0500
@@ -65,6 +65,11 @@ extern rwlock_t udp_hash_lock;
extern struct proto udp_prot;
+/* sysctl variables for udp */
+extern int sysctl_udp_mem[3];
+extern int sysctl_udp_rmem_min;
+extern int sysctl_udp_wmem_min;
+
struct sk_buff;
/*
@@ -198,4 +203,6 @@ extern void udp_proc_unregister(struct u
extern int udp4_proc_init(void);
extern void udp4_proc_exit(void);
#endif
+
+extern void udp_init(void);
#endif /* _UDP_H */
diff -pruN net-2.6.25-t12t19m-p4/net/ipv4/af_inet.c net-2.6.25-t12t19m-p5/net/ipv4/af_inet.c
--- net-2.6.25-t12t19m-p4/net/ipv4/af_inet.c 2007-12-27 10:19:02.000000000 -0500
+++ net-2.6.25-t12t19m-p5/net/ipv4/af_inet.c 2007-12-29 21:09:21.000000000 -0500
@@ -1417,6 +1417,9 @@ static int __init inet_init(void)
/* Setup TCP slab cache for open requests. */
tcp_init();
+ /* Setup UDP memory threshold */
+ udp_init();
+
/* Add UDP-Lite (RFC 3828) */
udplite4_register();
diff -pruN net-2.6.25-t12t19m-p4/net/ipv4/proc.c net-2.6.25-t12t19m-p5/net/ipv4/proc.c
--- net-2.6.25-t12t19m-p4/net/ipv4/proc.c 2007-12-27 10:19:02.000000000 -0500
+++ net-2.6.25-t12t19m-p5/net/ipv4/proc.c 2007-12-29 21:09:21.000000000 -0500
@@ -56,7 +56,8 @@ static int sockstat_seq_show(struct seq_
sock_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count),
tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated),
atomic_read(&tcp_memory_allocated));
- seq_printf(seq, "UDP: inuse %d\n", sock_prot_inuse(&udp_prot));
+ seq_printf(seq, "UDP: inuse %d mem %d\n", sock_prot_inuse(&udp_prot),
+ atomic_read(&udp_memory_allocated));
seq_printf(seq, "UDPLITE: inuse %d\n", sock_prot_inuse(&udplite_prot));
seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse(&raw_prot));
seq_printf(seq, "FRAG: inuse %d memory %d\n",
diff -pruN net-2.6.25-t12t19m-p4/net/ipv4/sysctl_net_ipv4.c net-2.6.25-t12t19m-p5/net/ipv4/sysctl_net_ipv4.c
--- net-2.6.25-t12t19m-p4/net/ipv4/sysctl_net_ipv4.c 2007-12-27 10:19:02.000000000 -0500
+++ net-2.6.25-t12t19m-p5/net/ipv4/sysctl_net_ipv4.c 2007-12-29 21:09:21.000000000 -0500
@@ -19,6 +19,7 @@
#include <net/ip.h>
#include <net/route.h>
#include <net/tcp.h>
+#include <net/udp.h>
#include <net/cipso_ipv4.h>
#include <net/inet_frag.h>
@@ -812,6 +813,36 @@ static struct ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "udp_mem",
+ .data = &sysctl_udp_mem,
+ .maxlen = sizeof(sysctl_udp_mem),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "udp_rmem_min",
+ .data = &sysctl_udp_rmem_min,
+ .maxlen = sizeof(sysctl_udp_rmem_min),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "udp_wmem_min",
+ .data = &sysctl_udp_wmem_min,
+ .maxlen = sizeof(sysctl_udp_wmem_min),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero
+ },
{ .ctl_name = 0 }
};
diff -pruN net-2.6.25-t12t19m-p4/net/ipv4/udp.c net-2.6.25-t12t19m-p5/net/ipv4/udp.c
--- net-2.6.25-t12t19m-p4/net/ipv4/udp.c 2007-12-27 10:19:02.000000000 -0500
+++ net-2.6.25-t12t19m-p5/net/ipv4/udp.c 2007-12-29 21:12:03.000000000 -0500
@@ -82,6 +82,7 @@
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
+#include <linux/bootmem.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
@@ -118,6 +119,14 @@ EXPORT_SYMBOL(udp_stats_in6);
struct hlist_head udp_hash[UDP_HTABLE_SIZE];
DEFINE_RWLOCK(udp_hash_lock);
+int sysctl_udp_mem[3] __read_mostly;
+int sysctl_udp_rmem_min __read_mostly;
+int sysctl_udp_wmem_min __read_mostly;
+
+EXPORT_SYMBOL(sysctl_udp_mem);
+EXPORT_SYMBOL(sysctl_udp_rmem_min);
+EXPORT_SYMBOL(sysctl_udp_wmem_min);
+
static inline int __udp_lib_lport_inuse(__u16 num,
const struct hlist_head udptable[])
{
@@ -1460,6 +1469,9 @@ struct proto udp_prot = {
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
.get_port = udp_v4_get_port,
+ .sysctl_mem = sysctl_udp_mem,
+ .sysctl_wmem = &sysctl_udp_wmem_min,
+ .sysctl_rmem = &sysctl_udp_rmem_min,
.obj_size = sizeof(struct udp_sock),
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_udp_setsockopt,
@@ -1655,6 +1667,25 @@ void udp4_proc_exit(void)
}
#endif /* CONFIG_PROC_FS */
+void __init udp_init(void)
+{
+ unsigned long limit;
+
+ /* Set the pressure threshold up by the same strategy of TCP. It is a
+ * fraction of global memory that is up to 1/2 at 256 MB, decreasing
+ * toward zero with the amount of memory, with a floor of 128 pages.
+ */
+ limit = min(nr_all_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
+ limit = (limit * (nr_all_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
+ limit = max(limit, 128UL);
+ sysctl_udp_mem[0] = limit / 4 * 3;
+ sysctl_udp_mem[1] = limit;
+ sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2;
+
+ sysctl_udp_rmem_min = SK_MEM_QUANTUM;
+ sysctl_udp_wmem_min = SK_MEM_QUANTUM;
+}
+
EXPORT_SYMBOL(udp_disconnect);
EXPORT_SYMBOL(udp_hash);
EXPORT_SYMBOL(udp_hash_lock);
--
Hitachi Computer Products (America) Inc.
^ permalink raw reply
* [PATCH 2/3] [UDP]: memory accounting in IPv4
From: Hideo AOKI @ 2007-12-30 9:02 UTC (permalink / raw)
To: David Miller, Herbert Xu, netdev
Cc: Takahiro Yasui, Masami Hiramatsu, Satoshi Oshima, billfink,
Andi Kleen, Evgeniy Polyakov, Stephen Hemminger, yoshfuji,
Yumiko Sugita, haoki
In-Reply-To: <47775D8C.5010104@redhat.com>
This patch adds UDP memory usage accounting in IPv4. Currently,
only receive buffer accounting is supported.
This patch also introduces the memory_allocated variable for the UDP protocol.
Cc: Satoshi Oshima <satoshi.oshima.fk@hitachi.com>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
signed-off-by: Takahiro Yasui <tyasui@redhat.com>
signed-off-by: Hideo Aoki <haoki@redhat.com>
---
include/net/udp.h | 2 ++
net/ipv4/af_inet.c | 2 ++
net/ipv4/udp.c | 26 ++++++++++++++++++++++++--
3 files changed, 28 insertions(+), 2 deletions(-)
diff -pruN net-2.6.25-t12t19m-p5/include/net/udp.h net-2.6.25-t12t19m-p6/include/net/udp.h
--- net-2.6.25-t12t19m-p5/include/net/udp.h 2007-12-29 21:10:48.000000000 -0500
+++ net-2.6.25-t12t19m-p6/include/net/udp.h 2007-12-29 21:48:33.000000000 -0500
@@ -65,6 +65,8 @@ extern rwlock_t udp_hash_lock;
extern struct proto udp_prot;
+extern atomic_t udp_memory_allocated;
+
/* sysctl variables for udp */
extern int sysctl_udp_mem[3];
extern int sysctl_udp_rmem_min;
diff -pruN net-2.6.25-t12t19m-p5/net/ipv4/af_inet.c net-2.6.25-t12t19m-p6/net/ipv4/af_inet.c
--- net-2.6.25-t12t19m-p5/net/ipv4/af_inet.c 2007-12-29 21:09:21.000000000 -0500
+++ net-2.6.25-t12t19m-p6/net/ipv4/af_inet.c 2007-12-29 21:47:31.000000000 -0500
@@ -139,6 +139,8 @@ void inet_sock_destruct(struct sock *sk)
__skb_queue_purge(&sk->sk_receive_queue);
__skb_queue_purge(&sk->sk_error_queue);
+ sk_mem_reclaim(sk);
+
if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) {
printk("Attempt to release TCP socket in state %d %p\n",
sk->sk_state, sk);
diff -pruN net-2.6.25-t12t19m-p5/net/ipv4/udp.c net-2.6.25-t12t19m-p6/net/ipv4/udp.c
--- net-2.6.25-t12t19m-p5/net/ipv4/udp.c 2007-12-29 21:12:03.000000000 -0500
+++ net-2.6.25-t12t19m-p6/net/ipv4/udp.c 2007-12-29 21:51:17.000000000 -0500
@@ -127,6 +127,9 @@ EXPORT_SYMBOL(sysctl_udp_mem);
EXPORT_SYMBOL(sysctl_udp_rmem_min);
EXPORT_SYMBOL(sysctl_udp_wmem_min);
+atomic_t udp_memory_allocated;
+EXPORT_SYMBOL(udp_memory_allocated);
+
static inline int __udp_lib_lport_inuse(__u16 num,
const struct hlist_head udptable[])
{
@@ -910,13 +913,17 @@ try_again:
err = ulen;
out_free:
+ lock_sock(sk);
skb_free_datagram(sk, skb);
+ release_sock(sk);
out:
return err;
csum_copy_err:
+ lock_sock(sk);
if (!skb_kill_datagram(sk, skb, flags))
UDP_INC_STATS_USER(UDP_MIB_INERRORS, is_udplite);
+ release_sock(sk);
if (noblock)
return -EAGAIN;
@@ -1081,7 +1088,15 @@ static int __udp4_lib_mcast_deliver(stru
skb1 = skb_clone(skb, GFP_ATOMIC);
if (skb1) {
- int ret = udp_queue_rcv_skb(sk, skb1);
+ int ret = 0;
+
+ bh_lock_sock_nested(sk);
+ if (!sock_owned_by_user(sk))
+ ret = udp_queue_rcv_skb(sk, skb1);
+ else
+ sk_add_backlog(sk, skb1);
+ bh_unlock_sock(sk);
+
if (ret > 0)
/* we should probably re-process instead
* of dropping packets here. */
@@ -1174,7 +1189,13 @@ int __udp4_lib_rcv(struct sk_buff *skb,
inet_iif(skb), udptable);
if (sk != NULL) {
- int ret = udp_queue_rcv_skb(sk, skb);
+ int ret = 0;
+ bh_lock_sock_nested(sk);
+ if (!sock_owned_by_user(sk))
+ ret = udp_queue_rcv_skb(sk, skb);
+ else
+ sk_add_backlog(sk, skb);
+ bh_unlock_sock(sk);
sock_put(sk);
/* a return value > 0 means to resubmit the input, but
@@ -1469,6 +1490,7 @@ struct proto udp_prot = {
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
.get_port = udp_v4_get_port,
+ .memory_allocated = &udp_memory_allocated,
.sysctl_mem = sysctl_udp_mem,
.sysctl_wmem = &sysctl_udp_wmem_min,
.sysctl_rmem = &sysctl_udp_rmem_min,
--
Hitachi Computer Products (America) Inc.
^ permalink raw reply
* [PATCH 0/3] UDP memory accounting and limitation (take 12)
From: Hideo AOKI @ 2007-12-30 8:57 UTC (permalink / raw)
To: David Miller, Herbert Xu, netdev
Cc: haoki, Takahiro Yasui, Masami Hiramatsu, Satoshi Oshima, billfink,
Andi Kleen, Evgeniy Polyakov, Stephen Hemminger, yoshfuji,
Yumiko Sugita
Hello,
This is the latest patch set of UDP memory accounting and limitation.
I split my patch set into accounting interface consolidation and UDP
memory accounting support. This patch set implements UDP memory
accounting support.
To use consolidated accounting functions, I matched format of udp_mem
sysctl with tcp_mem. UDP IPv6 is finally supported.
However, currently, only receive buffer accounting is implemented.
The patch set was tested on net-2.6.25 tree.
Changelog take 11 -> take 12:
* split into accounting interface consolidation and UDP memory
accounting support.
* used new accounting interface
* supported IPv6
* dropped send buffer accounting
Changelog take 10 -> take 11:
* stopped using spin lock in memory accounting function
* socket lock and backlog processing were used to avoid conflict
between receive system call processing and BH
* revised memory accounting functions
* stopped changing sock_queue_rcv_skb() and skb_set_owner_r()
* added __udp_queue_rcv_skb to set proper destructor
* removed udp_set_owner_r()
* removed reclaim in inet_sock_destruct()
Changelog take 9 -> take 10:
* supported using sk_forward_alloc
* introduced several memory accounting functions with spin lock
* changed datagram receive functions to be able to customize
destructor
* fixed accounting bugs in previous takes
Best regards,
Hideo Aoki
--
Hitachi Computer Products (America) Inc.
^ permalink raw reply
* [PATCH 3/4] [TCP]: using new interface
From: Hideo AOKI @ 2007-12-30 8:53 UTC (permalink / raw)
To: David Miller, Herbert Xu, vladislav.yasevich, netdev
Cc: lksctp-developers, Takahiro Yasui, Masami Hiramatsu,
Satoshi Oshima, billfink, Andi Kleen, Evgeniy Polyakov,
Stephen Hemminger, yoshfuji, Yumiko Sugita, haoki
In-Reply-To: <47775B25.7020401@redhat.com>
This patch replaces present memory accounting calls with new interface
in TCP.
Cc: Satoshi Oshima <satoshi.oshima.fk@hitachi.com>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
signed-off-by: Takahiro Yasui <tyasui@redhat.com>
signed-off-by: Hideo Aoki <haoki@redhat.com>
---
include/net/tcp.h | 4 ++--
net/ipv4/tcp.c | 23 ++++++++++++-----------
net/ipv4/tcp_input.c | 26 +++++++++++++-------------
net/ipv4/tcp_output.c | 26 ++++++++++++++++----------
net/ipv4/tcp_timer.c | 8 ++++----
5 files changed, 47 insertions(+), 40 deletions(-)
diff -pruN net-2.6.25-t12t19m-p2/include/net/tcp.h net-2.6.25-t12t19m-p3/include/net/tcp.h
--- net-2.6.25-t12t19m-p2/include/net/tcp.h 2007-12-27 10:18:58.000000000 -0500
+++ net-2.6.25-t12t19m-p3/include/net/tcp.h 2007-12-29 20:49:59.000000000 -0500
@@ -1196,8 +1196,8 @@ static inline void tcp_write_queue_purge
struct sk_buff *skb;
while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL)
- sk_stream_free_skb(sk, skb);
- sk_stream_mem_reclaim(sk);
+ sk_wmem_free_skb(sk, skb);
+ sk_mem_reclaim(sk);
}
static inline struct sk_buff *tcp_write_queue_head(struct sock *sk)
diff -pruN net-2.6.25-t12t19m-p2/net/ipv4/tcp.c net-2.6.25-t12t19m-p3/net/ipv4/tcp.c
--- net-2.6.25-t12t19m-p2/net/ipv4/tcp.c 2007-12-27 10:19:02.000000000 -0500
+++ net-2.6.25-t12t19m-p3/net/ipv4/tcp.c 2007-12-29 20:49:59.000000000 -0500
@@ -308,7 +308,7 @@ struct tcp_splice_state {
/*
* Pressure flag: try to collapse.
* Technical note: it is used by multiple contexts non atomically.
- * All the sk_stream_mem_schedule() is of this nature: accounting
+ * All the __sk_mem_schedule() is of this nature: accounting
* is strict, actions are advisory and have some latency.
*/
int tcp_memory_pressure __read_mostly;
@@ -485,7 +485,8 @@ static inline void skb_entail(struct soc
tcb->sacked = 0;
skb_header_release(skb);
tcp_add_write_queue_tail(sk, skb);
- sk_charge_skb(sk, skb);
+ sk->sk_wmem_queued += skb->truesize;
+ sk_mem_charge(sk, skb->truesize);
if (tp->nonagle & TCP_NAGLE_PUSH)
tp->nonagle &= ~TCP_NAGLE_PUSH;
}
@@ -638,7 +639,7 @@ struct sk_buff *sk_stream_alloc_skb(stru
skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
if (skb) {
- if (sk_stream_wmem_schedule(sk, skb->truesize)) {
+ if (sk_wmem_schedule(sk, skb->truesize)) {
/*
* Make sure that we have exactly size bytes
* available to the caller, no more, no less.
@@ -707,7 +708,7 @@ new_segment:
tcp_mark_push(tp, skb);
goto new_segment;
}
- if (!sk_stream_wmem_schedule(sk, copy))
+ if (!sk_wmem_schedule(sk, copy))
goto wait_for_memory;
if (can_coalesce) {
@@ -721,7 +722,7 @@ new_segment:
skb->data_len += copy;
skb->truesize += copy;
sk->sk_wmem_queued += copy;
- sk->sk_forward_alloc -= copy;
+ sk_mem_charge(sk, copy);
skb->ip_summed = CHECKSUM_PARTIAL;
tp->write_seq += copy;
TCP_SKB_CB(skb)->end_seq += copy;
@@ -928,7 +929,7 @@ new_segment:
if (copy > PAGE_SIZE - off)
copy = PAGE_SIZE - off;
- if (!sk_stream_wmem_schedule(sk, copy))
+ if (!sk_wmem_schedule(sk, copy))
goto wait_for_memory;
if (!page) {
@@ -1019,7 +1020,7 @@ do_fault:
* reset, where we can be unlinking the send_head.
*/
tcp_check_send_head(sk, skb);
- sk_stream_free_skb(sk, skb);
+ sk_wmem_free_skb(sk, skb);
}
do_error:
@@ -1738,7 +1739,7 @@ void tcp_close(struct sock *sk, long tim
__kfree_skb(skb);
}
- sk_stream_mem_reclaim(sk);
+ sk_mem_reclaim(sk);
/* As outlined in RFC 2525, section 2.17, we send a RST here because
* data was lost. To witness the awful effects of the old behavior of
@@ -1841,7 +1842,7 @@ adjudge_to_death:
}
}
if (sk->sk_state != TCP_CLOSE) {
- sk_stream_mem_reclaim(sk);
+ sk_mem_reclaim(sk);
if (tcp_too_many_orphans(sk,
atomic_read(sk->sk_prot->orphan_count))) {
if (net_ratelimit())
@@ -2658,11 +2659,11 @@ void __init tcp_init(void)
limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
max_share = min(4UL*1024*1024, limit);
- sysctl_tcp_wmem[0] = SK_STREAM_MEM_QUANTUM;
+ sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
sysctl_tcp_wmem[1] = 16*1024;
sysctl_tcp_wmem[2] = max(64*1024, max_share);
- sysctl_tcp_rmem[0] = SK_STREAM_MEM_QUANTUM;
+ sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
sysctl_tcp_rmem[1] = 87380;
sysctl_tcp_rmem[2] = max(87380, max_share);
diff -pruN net-2.6.25-t12t19m-p2/net/ipv4/tcp_input.c net-2.6.25-t12t19m-p3/net/ipv4/tcp_input.c
--- net-2.6.25-t12t19m-p2/net/ipv4/tcp_input.c 2007-12-27 10:19:02.000000000 -0500
+++ net-2.6.25-t12t19m-p3/net/ipv4/tcp_input.c 2007-12-29 20:49:59.000000000 -0500
@@ -591,7 +591,7 @@ static void tcp_event_data_recv(struct s
* restart window, so that we send ACKs quickly.
*/
tcp_incr_quickack(sk);
- sk_stream_mem_reclaim(sk);
+ sk_mem_reclaim(sk);
}
}
icsk->icsk_ack.lrcvtime = now;
@@ -2848,7 +2848,7 @@ static int tcp_clean_rtx_queue(struct so
break;
tcp_unlink_write_queue(skb, sk);
- sk_stream_free_skb(sk, skb);
+ sk_wmem_free_skb(sk, skb);
tcp_clear_all_retrans_hints(tp);
}
@@ -3564,7 +3564,7 @@ static void tcp_fin(struct sk_buff *skb,
__skb_queue_purge(&tp->out_of_order_queue);
if (tcp_is_sack(tp))
tcp_sack_reset(&tp->rx_opt);
- sk_stream_mem_reclaim(sk);
+ sk_mem_reclaim(sk);
if (!sock_flag(sk, SOCK_DEAD)) {
sk->sk_state_change(sk);
@@ -3847,12 +3847,12 @@ static void tcp_data_queue(struct sock *
queue_and_out:
if (eaten < 0 &&
(atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
- !sk_stream_rmem_schedule(sk, skb))) {
+ !sk_rmem_schedule(sk, skb->truesize))) {
if (tcp_prune_queue(sk) < 0 ||
- !sk_stream_rmem_schedule(sk, skb))
+ !sk_rmem_schedule(sk, skb->truesize))
goto drop;
}
- sk_stream_set_owner_r(skb, sk);
+ skb_set_owner_r(skb, sk);
__skb_queue_tail(&sk->sk_receive_queue, skb);
}
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
@@ -3921,9 +3921,9 @@ drop:
TCP_ECN_check_ce(tp, skb);
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
- !sk_stream_rmem_schedule(sk, skb)) {
+ !sk_rmem_schedule(sk, skb->truesize)) {
if (tcp_prune_queue(sk) < 0 ||
- !sk_stream_rmem_schedule(sk, skb))
+ !sk_rmem_schedule(sk, skb->truesize))
goto drop;
}
@@ -3934,7 +3934,7 @@ drop:
SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
- sk_stream_set_owner_r(skb, sk);
+ skb_set_owner_r(skb, sk);
if (!skb_peek(&tp->out_of_order_queue)) {
/* Initial out of order segment, build 1 SACK. */
@@ -4076,7 +4076,7 @@ tcp_collapse(struct sock *sk, struct sk_
memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
__skb_insert(nskb, skb->prev, skb, list);
- sk_stream_set_owner_r(nskb, sk);
+ skb_set_owner_r(nskb, sk);
/* Copy data, releasing collapsed skbs. */
while (copy > 0) {
@@ -4174,7 +4174,7 @@ static int tcp_prune_queue(struct sock *
sk->sk_receive_queue.next,
(struct sk_buff*)&sk->sk_receive_queue,
tp->copied_seq, tp->rcv_nxt);
- sk_stream_mem_reclaim(sk);
+ sk_mem_reclaim(sk);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
return 0;
@@ -4194,7 +4194,7 @@ static int tcp_prune_queue(struct sock *
*/
if (tcp_is_sack(tp))
tcp_sack_reset(&tp->rx_opt);
- sk_stream_mem_reclaim(sk);
+ sk_mem_reclaim(sk);
}
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
@@ -4696,7 +4696,7 @@ int tcp_rcv_established(struct sock *sk,
/* Bulk data transfer: receiver */
__skb_pull(skb,tcp_header_len);
__skb_queue_tail(&sk->sk_receive_queue, skb);
- sk_stream_set_owner_r(skb, sk);
+ skb_set_owner_r(skb, sk);
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
}
diff -pruN net-2.6.25-t12t19m-p2/net/ipv4/tcp_output.c net-2.6.25-t12t19m-p3/net/ipv4/tcp_output.c
--- net-2.6.25-t12t19m-p2/net/ipv4/tcp_output.c 2007-12-27 10:19:02.000000000 -0500
+++ net-2.6.25-t12t19m-p3/net/ipv4/tcp_output.c 2007-12-29 20:49:59.000000000 -0500
@@ -637,7 +637,8 @@ static void tcp_queue_skb(struct sock *s
tp->write_seq = TCP_SKB_CB(skb)->end_seq;
skb_header_release(skb);
tcp_add_write_queue_tail(sk, skb);
- sk_charge_skb(sk, skb);
+ sk->sk_wmem_queued += skb->truesize;
+ sk_mem_charge(sk, skb->truesize);
}
static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
@@ -701,7 +702,8 @@ int tcp_fragment(struct sock *sk, struct
if (buff == NULL)
return -ENOMEM; /* We'll just try again later. */
- sk_charge_skb(sk, buff);
+ sk->sk_wmem_queued += buff->truesize;
+ sk_mem_charge(sk, buff->truesize);
nlen = skb->len - len - nsize;
buff->truesize += nlen;
skb->truesize -= nlen;
@@ -825,7 +827,7 @@ int tcp_trim_head(struct sock *sk, struc
skb->truesize -= len;
sk->sk_wmem_queued -= len;
- sk->sk_forward_alloc += len;
+ sk_mem_uncharge(sk, len);
sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
/* Any change of skb->len requires recalculation of tso
@@ -1197,7 +1199,8 @@ static int tso_fragment(struct sock *sk,
if (unlikely(buff == NULL))
return -ENOMEM;
- sk_charge_skb(sk, buff);
+ sk->sk_wmem_queued += buff->truesize;
+ sk_mem_charge(sk, buff->truesize);
buff->truesize += nlen;
skb->truesize -= nlen;
@@ -1350,7 +1353,8 @@ static int tcp_mtu_probe(struct sock *sk
/* We're allowed to probe. Build it now. */
if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
return -1;
- sk_charge_skb(sk, nskb);
+ sk->sk_wmem_queued += nskb->truesize;
+ sk_mem_charge(sk, nskb->truesize);
skb = tcp_send_head(sk);
@@ -1377,7 +1381,7 @@ static int tcp_mtu_probe(struct sock *sk
* Throw it away. */
TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
tcp_unlink_write_queue(skb, sk);
- sk_stream_free_skb(sk, skb);
+ sk_wmem_free_skb(sk, skb);
} else {
TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
@@ -1744,7 +1748,7 @@ static void tcp_retrans_try_collapse(str
/* changed transmit queue under us so clear hints */
tcp_clear_retrans_hints_partial(tp);
- sk_stream_free_skb(sk, next_skb);
+ sk_wmem_free_skb(sk, next_skb);
}
}
@@ -2139,8 +2143,9 @@ int tcp_send_synack(struct sock *sk)
tcp_unlink_write_queue(skb, sk);
skb_header_release(nskb);
__tcp_add_write_queue_head(sk, nskb);
- sk_stream_free_skb(sk, skb);
- sk_charge_skb(sk, nskb);
+ sk_wmem_free_skb(sk, skb);
+ sk->sk_wmem_queued += nskb->truesize;
+ sk_mem_charge(sk, nskb->truesize);
skb = nskb;
}
@@ -2343,7 +2348,8 @@ int tcp_connect(struct sock *sk)
tp->retrans_stamp = TCP_SKB_CB(buff)->when;
skb_header_release(buff);
__tcp_add_write_queue_tail(sk, buff);
- sk_charge_skb(sk, buff);
+ sk->sk_wmem_queued += buff->truesize;
+ sk_mem_charge(sk, buff->truesize);
tp->packets_out += tcp_skb_pcount(buff);
tcp_transmit_skb(sk, buff, 1, GFP_KERNEL);
diff -pruN net-2.6.25-t12t19m-p2/net/ipv4/tcp_timer.c net-2.6.25-t12t19m-p3/net/ipv4/tcp_timer.c
--- net-2.6.25-t12t19m-p2/net/ipv4/tcp_timer.c 2007-12-27 10:19:02.000000000 -0500
+++ net-2.6.25-t12t19m-p3/net/ipv4/tcp_timer.c 2007-12-29 20:49:59.000000000 -0500
@@ -186,7 +186,7 @@ static void tcp_delack_timer(unsigned lo
goto out_unlock;
}
- sk_stream_mem_reclaim(sk);
+ sk_mem_reclaim(sk);
if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
goto out;
@@ -226,7 +226,7 @@ static void tcp_delack_timer(unsigned lo
out:
if (tcp_memory_pressure)
- sk_stream_mem_reclaim(sk);
+ sk_mem_reclaim(sk);
out_unlock:
bh_unlock_sock(sk);
sock_put(sk);
@@ -420,7 +420,7 @@ static void tcp_write_timer(unsigned lon
TCP_CHECK_TIMER(sk);
out:
- sk_stream_mem_reclaim(sk);
+ sk_mem_reclaim(sk);
out_unlock:
bh_unlock_sock(sk);
sock_put(sk);
@@ -514,7 +514,7 @@ static void tcp_keepalive_timer (unsigne
}
TCP_CHECK_TIMER(sk);
- sk_stream_mem_reclaim(sk);
+ sk_mem_reclaim(sk);
resched:
inet_csk_reset_keepalive_timer (sk, elapsed);
--
Hitachi Computer Products (America) Inc.
^ permalink raw reply
* [PATCH 1/4] [CORE]: introducing new memory accounting interface
From: Hideo AOKI @ 2007-12-30 8:51 UTC (permalink / raw)
To: David Miller, Herbert Xu, vladislav.yasevich, netdev
Cc: lksctp-developers, Takahiro Yasui, Masami Hiramatsu,
Satoshi Oshima, billfink, Andi Kleen, Evgeniy Polyakov,
Stephen Hemminger, yoshfuji, Yumiko Sugita, haoki
In-Reply-To: <47775B25.7020401@redhat.com>
This patch introduces new memory accounting functions for each network
protocol. Most of them are renamed from memory accounting functions
for stream protocols. At the same time, some stream memory accounting
functions are removed since other functions do the same thing.
Renaming:
sk_stream_free_skb() -> sk_wmem_free_skb()
__sk_stream_mem_reclaim() -> __sk_mem_reclaim()
sk_stream_mem_reclaim() -> sk_mem_reclaim()
sk_stream_mem_schedule() -> __sk_mem_schedule()
sk_stream_pages() -> sk_mem_pages()
sk_stream_rmem_schedule() -> sk_rmem_schedule()
sk_stream_wmem_schedule() -> sk_wmem_schedule()
sk_charge_skb() -> sk_mem_charge()
Removing:
sk_stream_rfree(): consolidates into sock_rfree()
sk_stream_set_owner_r(): consolidates into skb_set_owner_r()
sk_stream_mem_schedule()
The following functions are added.
sk_has_account(): check if the protocol supports accounting
sk_mem_uncharge(): do the opposite of sk_mem_charge()
In addition, to achieve consolidation, updating sk_wmem_queued is
removed from sk_mem_charge().
Cc: Satoshi Oshima <satoshi.oshima.fk@hitachi.com>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
signed-off-by: Takahiro Yasui <tyasui@redhat.com>
signed-off-by: Hideo Aoki <haoki@redhat.com>
---
include/net/sock.h | 95 ++++++++++++++++++++++++++++++---------------------
net/core/sock.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++
net/core/stream.c | 82 --------------------------------------------
3 files changed, 152 insertions(+), 122 deletions(-)
diff -pruN net-2.6.25/include/net/sock.h net-2.6.25-t12t19m-p1/include/net/sock.h
--- net-2.6.25/include/net/sock.h 2007-12-27 10:18:58.000000000 -0500
+++ net-2.6.25-t12t19m-p1/include/net/sock.h 2007-12-29 20:16:31.000000000 -0500
@@ -460,25 +460,6 @@ static inline int sk_stream_memory_free(
return sk->sk_wmem_queued < sk->sk_sndbuf;
}
-extern void sk_stream_rfree(struct sk_buff *skb);
-
-static inline void sk_stream_set_owner_r(struct sk_buff *skb, struct sock *sk)
-{
- skb->sk = sk;
- skb->destructor = sk_stream_rfree;
- atomic_add(skb->truesize, &sk->sk_rmem_alloc);
- sk->sk_forward_alloc -= skb->truesize;
-}
-
-static inline void sk_stream_free_skb(struct sock *sk, struct sk_buff *skb)
-{
- skb_truesize_check(skb);
- sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
- sk->sk_wmem_queued -= skb->truesize;
- sk->sk_forward_alloc += skb->truesize;
- __kfree_skb(skb);
-}
-
/* The per-socket spinlock must be held here. */
static inline void sk_add_backlog(struct sock *sk, struct sk_buff *skb)
{
@@ -576,7 +557,7 @@ struct proto {
/*
* Pressure flag: try to collapse.
* Technical note: it is used by multiple contexts non atomically.
- * All the sk_stream_mem_schedule() is of this nature: accounting
+ * All the __sk_mem_schedule() is of this nature: accounting
* is strict, actions are advisory and have some latency.
*/
int *memory_pressure;
@@ -712,33 +693,73 @@ static inline struct inode *SOCK_INODE(s
return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
}
-extern void __sk_stream_mem_reclaim(struct sock *sk);
-extern int sk_stream_mem_schedule(struct sock *sk, int size, int kind);
+/*
+ * Functions for memory accounting
+ */
+extern int __sk_mem_schedule(struct sock *sk, int size, int kind);
+extern void __sk_mem_reclaim(struct sock *sk);
-#define SK_STREAM_MEM_QUANTUM ((int)PAGE_SIZE)
-#define SK_STREAM_MEM_QUANTUM_SHIFT ilog2(SK_STREAM_MEM_QUANTUM)
+#define SK_MEM_QUANTUM ((int)PAGE_SIZE)
+#define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM)
+#define SK_MEM_SEND 0
+#define SK_MEM_RECV 1
-static inline int sk_stream_pages(int amt)
+static inline int sk_mem_pages(int amt)
{
- return (amt + SK_STREAM_MEM_QUANTUM - 1) >> SK_STREAM_MEM_QUANTUM_SHIFT;
+ return (amt + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT;
}
-static inline void sk_stream_mem_reclaim(struct sock *sk)
+static inline int sk_has_account(struct sock *sk)
{
- if (sk->sk_forward_alloc >= SK_STREAM_MEM_QUANTUM)
- __sk_stream_mem_reclaim(sk);
+ /* return true if protocol supports memory accounting */
+ return !!sk->sk_prot->memory_allocated;
}
-static inline int sk_stream_rmem_schedule(struct sock *sk, struct sk_buff *skb)
+static inline int sk_wmem_schedule(struct sock *sk, int size)
{
- return (int)skb->truesize <= sk->sk_forward_alloc ||
- sk_stream_mem_schedule(sk, skb->truesize, 1);
+ if (!sk_has_account(sk))
+ return 1;
+ return size <= sk->sk_forward_alloc ||
+ __sk_mem_schedule(sk, size, SK_MEM_SEND);
}
-static inline int sk_stream_wmem_schedule(struct sock *sk, int size)
+static inline int sk_rmem_schedule(struct sock *sk, int size)
{
+ if (!sk_has_account(sk))
+ return 1;
return size <= sk->sk_forward_alloc ||
- sk_stream_mem_schedule(sk, size, 0);
+ __sk_mem_schedule(sk, size, SK_MEM_RECV);
+}
+
+static inline void sk_mem_reclaim(struct sock *sk)
+{
+ if (!sk_has_account(sk))
+ return;
+ if (sk->sk_forward_alloc >= SK_MEM_QUANTUM)
+ __sk_mem_reclaim(sk);
+}
+
+static inline void sk_mem_charge(struct sock *sk, int size)
+{
+ if (!sk_has_account(sk))
+ return;
+ sk->sk_forward_alloc -= size;
+}
+
+static inline void sk_mem_uncharge(struct sock *sk, int size)
+{
+ if (!sk_has_account(sk))
+ return;
+ sk->sk_forward_alloc += size;
+}
+
+static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
+{
+ skb_truesize_check(skb);
+ sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
+ sk->sk_wmem_queued -= skb->truesize;
+ sk_mem_uncharge(sk, skb->truesize);
+ __kfree_skb(skb);
}
/* Used by processes to "lock" a socket state, so that
@@ -1076,12 +1097,6 @@ static inline int sk_can_gso(const struc
extern void sk_setup_caps(struct sock *sk, struct dst_entry *dst);
-static inline void sk_charge_skb(struct sock *sk, struct sk_buff *skb)
-{
- sk->sk_wmem_queued += skb->truesize;
- sk->sk_forward_alloc -= skb->truesize;
-}
-
static inline int skb_copy_to_page(struct sock *sk, char __user *from,
struct sk_buff *skb, struct page *page,
int off, int copy)
diff -pruN net-2.6.25/net/core/sock.c net-2.6.25-t12t19m-p1/net/core/sock.c
--- net-2.6.25/net/core/sock.c 2007-12-27 10:19:02.000000000 -0500
+++ net-2.6.25-t12t19m-p1/net/core/sock.c 2007-12-29 20:16:31.000000000 -0500
@@ -1384,6 +1384,103 @@ int sk_wait_data(struct sock *sk, long *
EXPORT_SYMBOL(sk_wait_data);
+/**
+ * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
+ * @sk: socket
+ * @size: memory size to allocate
+ * @kind: allocation type
+ *
+ * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
+ * rmem allocation. This function assumes that protocols which have
+ * memory_pressure use sk_wmem_queued as write buffer accounting.
+ */
+int __sk_mem_schedule(struct sock *sk, int size, int kind)
+{
+ struct proto *prot = sk->sk_prot;
+ int amt = sk_mem_pages(size);
+ int allocated;
+
+ sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
+ allocated = atomic_add_return(amt, prot->memory_allocated);
+
+ /* Under limit. */
+ if (allocated <= prot->sysctl_mem[0]) {
+ if (prot->memory_pressure && *prot->memory_pressure)
+ *prot->memory_pressure = 0;
+ return 1;
+ }
+
+ /* Under pressure. */
+ if (allocated > prot->sysctl_mem[1])
+ if (prot->enter_memory_pressure)
+ prot->enter_memory_pressure();
+
+ /* Over hard limit. */
+ if (allocated > prot->sysctl_mem[2])
+ goto suppress_allocation;
+
+ /* guarantee minimum buffer size under pressure */
+ if (kind == SK_MEM_RECV) {
+ if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
+ return 1;
+ } else { /* SK_MEM_SEND */
+ if (sk->sk_type == SOCK_STREAM) {
+ if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
+ return 1;
+ } else if (atomic_read(&sk->sk_wmem_alloc) <
+ prot->sysctl_wmem[0])
+ return 1;
+ }
+
+ if (prot->memory_pressure) {
+ if (!*prot->memory_pressure ||
+ prot->sysctl_mem[2] > atomic_read(prot->sockets_allocated) *
+ sk_mem_pages(sk->sk_wmem_queued +
+ atomic_read(&sk->sk_rmem_alloc) +
+ sk->sk_forward_alloc))
+ return 1;
+ }
+
+suppress_allocation:
+
+ if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
+ sk_stream_moderate_sndbuf(sk);
+
+ /* Fail only if socket is _under_ its sndbuf.
+ * In this case we cannot block, so that we have to fail.
+ */
+ if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
+ return 1;
+ }
+
+ /* Alas. Undo changes. */
+ sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
+ atomic_sub(amt, prot->memory_allocated);
+ return 0;
+}
+
+EXPORT_SYMBOL(__sk_mem_schedule);
+
+/**
+ * __sk_mem_reclaim - reclaim memory_allocated
+ * @sk: socket
+ */
+void __sk_mem_reclaim(struct sock *sk)
+{
+ struct proto *prot = sk->sk_prot;
+
+ atomic_sub(sk->sk_forward_alloc / SK_MEM_QUANTUM,
+ prot->memory_allocated);
+ sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
+
+ if (prot->memory_pressure && *prot->memory_pressure &&
+ (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
+ *prot->memory_pressure = 0;
+}
+
+EXPORT_SYMBOL(__sk_mem_reclaim);
+
+
/*
* Set of default routines for initialising struct proto_ops when
* the protocol does not support a particular function. In certain
diff -pruN net-2.6.25/net/core/stream.c net-2.6.25-t12t19m-p1/net/core/stream.c
--- net-2.6.25/net/core/stream.c 2007-12-27 10:19:02.000000000 -0500
+++ net-2.6.25-t12t19m-p1/net/core/stream.c 2007-12-29 20:16:31.000000000 -0500
@@ -172,17 +172,6 @@ do_interrupted:
EXPORT_SYMBOL(sk_stream_wait_memory);
-void sk_stream_rfree(struct sk_buff *skb)
-{
- struct sock *sk = skb->sk;
-
- skb_truesize_check(skb);
- atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
- sk->sk_forward_alloc += skb->truesize;
-}
-
-EXPORT_SYMBOL(sk_stream_rfree);
-
int sk_stream_error(struct sock *sk, int flags, int err)
{
if (err == -EPIPE)
@@ -194,77 +183,6 @@ int sk_stream_error(struct sock *sk, int
EXPORT_SYMBOL(sk_stream_error);
-void __sk_stream_mem_reclaim(struct sock *sk)
-{
- atomic_sub(sk->sk_forward_alloc >> SK_STREAM_MEM_QUANTUM_SHIFT,
- sk->sk_prot->memory_allocated);
- sk->sk_forward_alloc &= SK_STREAM_MEM_QUANTUM - 1;
- if (*sk->sk_prot->memory_pressure &&
- (atomic_read(sk->sk_prot->memory_allocated) <
- sk->sk_prot->sysctl_mem[0]))
- *sk->sk_prot->memory_pressure = 0;
-}
-
-EXPORT_SYMBOL(__sk_stream_mem_reclaim);
-
-int sk_stream_mem_schedule(struct sock *sk, int size, int kind)
-{
- int amt = sk_stream_pages(size);
- struct proto *prot = sk->sk_prot;
-
- sk->sk_forward_alloc += amt * SK_STREAM_MEM_QUANTUM;
- atomic_add(amt, prot->memory_allocated);
-
- /* Under limit. */
- if (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]) {
- if (*prot->memory_pressure)
- *prot->memory_pressure = 0;
- return 1;
- }
-
- /* Over hard limit. */
- if (atomic_read(prot->memory_allocated) > prot->sysctl_mem[2]) {
- prot->enter_memory_pressure();
- goto suppress_allocation;
- }
-
- /* Under pressure. */
- if (atomic_read(prot->memory_allocated) > prot->sysctl_mem[1])
- prot->enter_memory_pressure();
-
- if (kind) {
- if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
- return 1;
- } else if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
- return 1;
-
- if (!*prot->memory_pressure ||
- prot->sysctl_mem[2] > atomic_read(prot->sockets_allocated) *
- sk_stream_pages(sk->sk_wmem_queued +
- atomic_read(&sk->sk_rmem_alloc) +
- sk->sk_forward_alloc))
- return 1;
-
-suppress_allocation:
-
- if (!kind) {
- sk_stream_moderate_sndbuf(sk);
-
- /* Fail only if socket is _under_ its sndbuf.
- * In this case we cannot block, so that we have to fail.
- */
- if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
- return 1;
- }
-
- /* Alas. Undo changes. */
- sk->sk_forward_alloc -= amt * SK_STREAM_MEM_QUANTUM;
- atomic_sub(amt, prot->memory_allocated);
- return 0;
-}
-
-EXPORT_SYMBOL(sk_stream_mem_schedule);
-
void sk_stream_kill_queues(struct sock *sk)
{
/* First the read buffer. */
--
Hitachi Computer Products (America) Inc.
^ permalink raw reply
* [PATCH 2/4] [CORE]: adding memory accounting points
From: Hideo AOKI @ 2007-12-30 8:51 UTC (permalink / raw)
To: David Miller, Herbert Xu, vladislav.yasevich, netdev
Cc: lksctp-developers, Takahiro Yasui, Masami Hiramatsu,
Satoshi Oshima, billfink, Andi Kleen, Evgeniy Polyakov,
Stephen Hemminger, yoshfuji, Yumiko Sugita, haoki
In-Reply-To: <47775B25.7020401@redhat.com>
To consolidate memory accounting functions, this patch adds memory
accounting calls to network core functions. Moreover, the existing
memory accounting call is renamed to the new accounting call.
Cc: Satoshi Oshima <satoshi.oshima.fk@hitachi.com>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
signed-off-by: Takahiro Yasui <tyasui@redhat.com>
signed-off-by: Hideo Aoki <haoki@redhat.com>
---
include/net/sock.h | 3 ++-
net/core/datagram.c | 2 ++
net/core/sock.c | 7 +++++++
net/core/stream.c | 2 +-
4 files changed, 12 insertions(+), 2 deletions(-)
diff -pruN net-2.6.25-t12t19m-p1/include/net/sock.h net-2.6.25-t12t19m-p2/include/net/sock.h
--- net-2.6.25-t12t19m-p1/include/net/sock.h 2007-12-29 20:16:31.000000000 -0500
+++ net-2.6.25-t12t19m-p2/include/net/sock.h 2007-12-29 20:28:15.000000000 -0500
@@ -1116,7 +1116,7 @@ static inline int skb_copy_to_page(struc
skb->data_len += copy;
skb->truesize += copy;
sk->sk_wmem_queued += copy;
- sk->sk_forward_alloc -= copy;
+ sk_mem_charge(sk, copy);
return 0;
}
@@ -1142,6 +1142,7 @@ static inline void skb_set_owner_r(struc
skb->sk = sk;
skb->destructor = sock_rfree;
atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+ sk_mem_charge(sk, skb->truesize);
}
extern void sk_reset_timer(struct sock *sk, struct timer_list* timer,
diff -pruN net-2.6.25-t12t19m-p1/net/core/datagram.c net-2.6.25-t12t19m-p2/net/core/datagram.c
--- net-2.6.25-t12t19m-p1/net/core/datagram.c 2007-12-27 10:19:02.000000000 -0500
+++ net-2.6.25-t12t19m-p2/net/core/datagram.c 2007-12-29 20:28:15.000000000 -0500
@@ -209,6 +209,7 @@ struct sk_buff *skb_recv_datagram(struct
void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
kfree_skb(skb);
+ sk_mem_reclaim(sk);
}
/**
@@ -248,6 +249,7 @@ int skb_kill_datagram(struct sock *sk, s
}
kfree_skb(skb);
+ sk_mem_reclaim(sk);
return err;
}
diff -pruN net-2.6.25-t12t19m-p1/net/core/sock.c net-2.6.25-t12t19m-p2/net/core/sock.c
--- net-2.6.25-t12t19m-p1/net/core/sock.c 2007-12-29 20:16:31.000000000 -0500
+++ net-2.6.25-t12t19m-p2/net/core/sock.c 2007-12-29 20:28:15.000000000 -0500
@@ -282,6 +282,11 @@ int sock_queue_rcv_skb(struct sock *sk,
if (err)
goto out;
+ if (!sk_rmem_schedule(sk, skb->truesize)) {
+ err = -ENOBUFS;
+ goto out;
+ }
+
skb->dev = NULL;
skb_set_owner_r(skb, sk);
@@ -1107,7 +1112,9 @@ void sock_rfree(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
+ skb_truesize_check(skb);
atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+ sk_mem_uncharge(skb->sk, skb->truesize);
}
diff -pruN net-2.6.25-t12t19m-p1/net/core/stream.c net-2.6.25-t12t19m-p2/net/core/stream.c
--- net-2.6.25-t12t19m-p1/net/core/stream.c 2007-12-29 20:16:31.000000000 -0500
+++ net-2.6.25-t12t19m-p2/net/core/stream.c 2007-12-29 20:28:15.000000000 -0500
@@ -195,7 +195,7 @@ void sk_stream_kill_queues(struct sock *
BUG_TRAP(skb_queue_empty(&sk->sk_write_queue));
/* Account for returned memory. */
- sk_stream_mem_reclaim(sk);
+ sk_mem_reclaim(sk);
BUG_TRAP(!sk->sk_wmem_queued);
BUG_TRAP(!sk->sk_forward_alloc);
--
Hitachi Computer Products (America) Inc.
^ permalink raw reply
* [PATCH 4/4] [SCTP]: using new interface
From: Hideo AOKI @ 2007-12-30 8:54 UTC (permalink / raw)
To: David Miller, Herbert Xu, vladislav.yasevich, netdev,
lksctp-developers
Cc: Takahiro Yasui, Masami Hiramatsu, Satoshi Oshima, billfink,
Andi Kleen, Evgeniy Polyakov, Stephen Hemminger, yoshfuji,
Yumiko Sugita, haoki
In-Reply-To: <47775B25.7020401@redhat.com>
This patch replaces the existing memory accounting calls in SCTP with
the new interface.
Cc: Satoshi Oshima <satoshi.oshima.fk@hitachi.com>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
signed-off-by: Takahiro Yasui <tyasui@redhat.com>
signed-off-by: Hideo Aoki <haoki@redhat.com>
---
include/net/sctp/sctp.h | 3 +--
net/sctp/protocol.c | 2 +-
net/sctp/sm_statefuns.c | 2 +-
net/sctp/socket.c | 11 ++++++-----
net/sctp/ulpevent.c | 2 +-
net/sctp/ulpqueue.c | 2 +-
6 files changed, 11 insertions(+), 11 deletions(-)
diff -pruN net-2.6.25-t12t19m-p3/include/net/sctp/sctp.h net-2.6.25-t12t19m-p4/include/net/sctp/sctp.h
--- net-2.6.25-t12t19m-p3/include/net/sctp/sctp.h 2007-12-27 10:18:58.000000000 -0500
+++ net-2.6.25-t12t19m-p4/include/net/sctp/sctp.h 2007-12-29 20:59:06.000000000 -0500
@@ -463,8 +463,7 @@ static inline void sctp_skb_set_owner_r(
skb->destructor = sctp_sock_rfree;
atomic_add(event->rmem_len, &sk->sk_rmem_alloc);
/*
- * This mimics the behavior of
- * sk_stream_set_owner_r
+ * This mimics the behavior of skb_set_owner_r
*/
sk->sk_forward_alloc -= event->rmem_len;
}
diff -pruN net-2.6.25-t12t19m-p3/net/sctp/protocol.c net-2.6.25-t12t19m-p4/net/sctp/protocol.c
--- net-2.6.25-t12t19m-p3/net/sctp/protocol.c 2007-12-27 10:19:02.000000000 -0500
+++ net-2.6.25-t12t19m-p4/net/sctp/protocol.c 2007-12-29 20:59:06.000000000 -0500
@@ -1109,7 +1109,7 @@ SCTP_STATIC __init int sctp_init(void)
sysctl_sctp_rmem[1] = (1500 *(sizeof(struct sk_buff) + 1));
sysctl_sctp_rmem[2] = max(sysctl_sctp_rmem[1], max_share);
- sysctl_sctp_wmem[0] = SK_STREAM_MEM_QUANTUM;
+ sysctl_sctp_wmem[0] = SK_MEM_QUANTUM;
sysctl_sctp_wmem[1] = 16*1024;
sysctl_sctp_wmem[2] = max(64*1024, max_share);
diff -pruN net-2.6.25-t12t19m-p3/net/sctp/sm_statefuns.c net-2.6.25-t12t19m-p4/net/sctp/sm_statefuns.c
--- net-2.6.25-t12t19m-p3/net/sctp/sm_statefuns.c 2007-12-27 10:19:02.000000000 -0500
+++ net-2.6.25-t12t19m-p4/net/sctp/sm_statefuns.c 2007-12-29 20:59:06.000000000 -0500
@@ -5866,7 +5866,7 @@ static int sctp_eat_data(const struct sc
/*
* Also try to renege to limit our memory usage in the event that
* we are under memory pressure
- * If we can't renege, don't worry about it, the sk_stream_rmem_schedule
+ * If we can't renege, don't worry about it, the sk_rmem_schedule
* in sctp_ulpevent_make_rcvmsg will drop the frame if we grow our
* memory usage too much
*/
diff -pruN net-2.6.25-t12t19m-p3/net/sctp/socket.c net-2.6.25-t12t19m-p4/net/sctp/socket.c
--- net-2.6.25-t12t19m-p3/net/sctp/socket.c 2007-12-27 10:19:02.000000000 -0500
+++ net-2.6.25-t12t19m-p4/net/sctp/socket.c 2007-12-29 20:59:06.000000000 -0500
@@ -174,7 +174,8 @@ static inline void sctp_set_owner_w(stru
sizeof(struct sctp_chunk);
atomic_add(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc);
- sk_charge_skb(sk, chunk->skb);
+ sk->sk_wmem_queued += chunk->skb->truesize;
+ sk_mem_charge(sk, chunk->skb->truesize);
}
/* Verify that this is a valid address. */
@@ -6035,10 +6036,10 @@ static void sctp_wfree(struct sk_buff *s
atomic_sub(sizeof(struct sctp_chunk), &sk->sk_wmem_alloc);
/*
- * This undoes what is done via sk_charge_skb
+ * This undoes what is done via sctp_set_owner_w and sk_mem_charge
*/
sk->sk_wmem_queued -= skb->truesize;
- sk->sk_forward_alloc += skb->truesize;
+ sk_mem_uncharge(sk, skb->truesize);
sock_wfree(skb);
__sctp_write_space(asoc);
@@ -6059,9 +6060,9 @@ void sctp_sock_rfree(struct sk_buff *skb
atomic_sub(event->rmem_len, &sk->sk_rmem_alloc);
/*
- * Mimic the behavior of sk_stream_rfree
+ * Mimic the behavior of sock_rfree
*/
- sk->sk_forward_alloc += event->rmem_len;
+ sk_mem_uncharge(sk, event->rmem_len);
}
diff -pruN net-2.6.25-t12t19m-p3/net/sctp/ulpevent.c net-2.6.25-t12t19m-p4/net/sctp/ulpevent.c
--- net-2.6.25-t12t19m-p3/net/sctp/ulpevent.c 2007-12-27 10:19:02.000000000 -0500
+++ net-2.6.25-t12t19m-p4/net/sctp/ulpevent.c 2007-12-29 20:59:06.000000000 -0500
@@ -700,7 +700,7 @@ struct sctp_ulpevent *sctp_ulpevent_make
if (rx_count >= asoc->base.sk->sk_rcvbuf) {
if ((asoc->base.sk->sk_userlocks & SOCK_RCVBUF_LOCK) ||
- (!sk_stream_rmem_schedule(asoc->base.sk, chunk->skb)))
+ (!sk_rmem_schedule(asoc->base.sk, chunk->skb->truesize)))
goto fail;
}
diff -pruN net-2.6.25-t12t19m-p3/net/sctp/ulpqueue.c net-2.6.25-t12t19m-p4/net/sctp/ulpqueue.c
--- net-2.6.25-t12t19m-p3/net/sctp/ulpqueue.c 2007-12-27 10:19:02.000000000 -0500
+++ net-2.6.25-t12t19m-p4/net/sctp/ulpqueue.c 2007-12-29 20:59:06.000000000 -0500
@@ -1046,7 +1046,7 @@ void sctp_ulpq_renege(struct sctp_ulpq *
sctp_ulpq_partial_delivery(ulpq, chunk, gfp);
}
- sk_stream_mem_reclaim(asoc->base.sk);
+ sk_mem_reclaim(asoc->base.sk);
return;
}
--
Hitachi Computer Products (America) Inc.
^ permalink raw reply
* [PATCH 0/4] New interface for memory accounting (take 1)
From: Hideo AOKI @ 2007-12-30 8:47 UTC (permalink / raw)
To: David Miller, Herbert Xu, vladislav.yasevich, netdev
Cc: lksctp-developers, Takahiro Yasui, Masami Hiramatsu,
Satoshi Oshima, billfink, Andi Kleen, Evgeniy Polyakov,
Stephen Hemminger, yoshfuji, Yumiko Sugita, Hideo AOKI
Hello,
This patch set introduces a new memory accounting interface.
The current interface is written for stream protocols only.
To enable memory accounting in other protocols (e.g. UDP),
I enhanced the interface and updated TCP and SCTP memory
accounting.
The patch set consists of the following 4 patches.
[1/4] introducing new memory accounting interface
[2/4] adding memory accounting points to consolidate functions
[3/4] updating TCP to use new interface
[4/4] updating SCTP to use new interface
The patch set was tested on net-2.6.25 tree.
Best regards,
Hideo Aoki
--
Hitachi Computer Products (America) Inc.
^ permalink raw reply
* Re: [RFC/PATCH] e100 driver didn't support any MII-less PHYs...
From: Kok, Auke @ 2007-12-30 5:54 UTC (permalink / raw)
To: andi; +Cc: e1000-devel, netdev, auke-jan.h.kok, bunk, linux-kernel
In-Reply-To: <20071228235118.GA31059@rhlx01.hs-esslingen.de>
Andreas Mohr wrote:
> Hi all,
>
> I was mildly annoyed when rebooting my _headless_ internet gateway after a
> hotplug -> udev migration and witnessing it not coming up again,
> which turned out to be due to an eepro100 / e100 loading conflict
> since eepro100 supported both of my Intel-based network cards,
> whereas e100 only supported the "newer" one and entirely failed on ifup...
> (udev had somehow managed to tweak loading sequence as compared to
> a hotplug setup, which caused the drivers to probe differently)
>
> After investigating this e100 failure for half an hour it was obvious
> that it was failing in e100_hw_init() -> e100_phy_init() since the driver was
> prepared to handle MII-capable PHYs only, not certain older(?) MII-less
> PHYs such as 80c24 or i82503.
> Investigating some FreeBSD etc. drivers it became terribly clear that there
> are also some MII-less PHYs and that one would have to handle them properly.
>
> Thus I decided to add support for those:
> - after PHY init failure, try to detect whether the EEPROM lists one of
> the MII-less PHYs
> - if so, don't fatally fail PHY init function
> - avoid touching MII in various utility functions in case of MII-less
> PHY (FIXME: this may need review, it was a quick hack in some places)
> - add some proper logging on init failure
>
> Note that this is an initial, semi-rough patch only, would love to have
> it corrected/improved by the e1000 team.
> (I also added some spelling updates for good measure, these would have
> to be committed separately obviously)
>
> Frankly I'm quite uncertain as to why one would try to actively deprecate
> a driver which works for many cards with a newer one which fails to work
> for several card types and doesn't seem clearly superiour in hindsight
> after going through it...
> Oh, right, that's in order to brute-force people to report any
> nagging problems with the new driver, which is... errm... very
> understandable after all ;)
> (I hope that me "reporting" this problem via a patch is ok ;)
>
> For reference, I'm using a BNC/AUI/TP PCI combo card
> Intel 82557 645477-004 FCC ID EJMNPDEPR10PCTPCI
>
> This mail written using a reassuringly stable connection over the newly
> adapted driver...
ok, barely glanced over the patch but it might just be fine. Can you split up this
patch and send a separate patch for the spelling mistakes? I'll then have some
quick testing done on the result and do a bit deeper review after newyears.
Cheers,
Auke
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply
* Re: 2.6.24-rc6-mm1
From: Randy Dunlap @ 2007-12-30 5:41 UTC (permalink / raw)
To: Torsten Kaiser
Cc: Herbert Xu, Andrew Morton, linux-kernel, Neil Brown,
J. Bruce Fields, netdev
In-Reply-To: <64bb37e0712291934o77a3d365h56c9c31ac8437469@mail.gmail.com>
On Sun, 30 Dec 2007 04:34:36 +0100 Torsten Kaiser wrote:
> On Dec 30, 2007 2:30 AM, Herbert Xu <herbert@gondor.apana.org.au> wrote:
> > On Sat, Dec 29, 2007 at 05:51:13PM +0100, Torsten Kaiser wrote:
> > >
> > > > > The cause, why I am resending this: I just got a crash with
> > > > > 2.6.24-rc6-mm1, again looking network related:
> > > > >
> > > > > [93436.933356] WARNING: at include/net/dst.h:165 dst_release()
> > > > > [93436.936685] Pid: 8079, comm: konqueror Not tainted 2.6.24-rc6-mm1 #11
> > > > > [93436.939292]
> > > > > [93436.939293] Call Trace:
> > > > > [93436.939304] [<ffffffff80531d2d>] skb_release_all+0xdd/0x110
> > > > > [93436.939307] [<ffffffff80531311>] __kfree_skb+0x11/0xa0
> > > > > [93436.939309] [<ffffffff805313b7>] kfree_skb+0x17/0x30
> > > > > [93436.939312] [<ffffffff805a0b48>] unix_release_sock+0x128/0x250
> > > > > [93436.939315] [<ffffffff805a0c91>] unix_release+0x21/0x30
> > > > > [93436.939318] [<ffffffff8052b144>] sock_release+0x24/0x90
> > > > > [93436.939320] [<ffffffff8052b656>] sock_close+0x26/0x50
> > > > > [93436.939324] [<ffffffff8029f921>] __fput+0xc1/0x230
> > > > > [93436.939327] [<ffffffff8029fe46>] fput+0x16/0x20
> > > > > [93436.939329] [<ffffffff8029c576>] filp_close+0x56/0x90
> > > > > [93436.939331] [<ffffffff8029de46>] sys_close+0xa6/0x110
> > > > > [93436.939335] [<ffffffff8020b57b>] system_call_after_swapgs+0x7b/0x80
> > >
> > > >From code inspection I would blame the patch "[SKBUFF]: Free old skb
> > > properly in skb_morph" from Herbert Xu. (CC added)
> >
> > I doubt it. skb_morph is only used on IP fragments so I don't see how
> > you could attribute an error from a Unix domain socket to this patch.
>
> That's why I wrote that I do not know much about the network core...
>
> > In any case, Unix socket packets should not have a dst at all so the
> > very fact that you're in that path means that you have some sort of
> > memory corruption.
>
> ... I did not know about the fact that there should not have been an dst.
>
> Its just that this warning was the first nice clue about the memory
> corruption related to networking that I see since 2.6.24-rc3-mm2.
> The time of the patch (Mon, 26 Nov 2007 15:11:19) even fits into the
> window between -rc3-mm1 and -rc3-mm2.
>
> I doubt that the memory corruption is a hardware problem, because the
> system in question is using ECC ram and I did not see any messages
> about corrected/detected errors.
>
> > Is this the very first OOPS/warning that you see? If not you should
> > ignore all but the very first one as that may have left your system
> > in an inconsistent state which may render all subsequent OOPSes and
> > warnings useless.
>
> I looked into the log in question and the only other warning was a
> circular locking dependency that lockdep detected around 1.5 hour
> before this warning.
>
> As reported in my original mail immeadeatly after the warning the
> system OOPSed and hang:
> [93436.947241] general protection fault: 0000 [1] SMP
> -> first OOPS ^
FYI, that's what this counter is... -----^
> [93436.947243] last sysfs file:
> /sys/devices/pci0000:00/0000:00:0f.0/0000:01:00.1/irq
> [93436.947245] CPU 1
> [93436.947246] Modules linked in: radeon drm nfsd exportfs w83792d
> ipv6 tuner tea5767 tda8290 tuner_xc2
> 028 tda9887 tuner_simple mt20xx tea5761 tvaudio msp3400 bttv ir_common
> compat_ioctl32 videobuf_dma_sg v
> ideobuf_core btcx_risc tveeprom usbhid videodev v4l2_common hid
> v4l1_compat pata_amd sg i2c_nforce2
> [93436.947257] Pid: 8079, comm: konqueror Not tainted 2.6.24-rc6-mm1 #11
> -> not tainted by a previous OOPS
> [93436.947259] RIP: 0010:[<ffffffff80531438>] [<ffffffff80531438>]
> skb_drop_list+0x18/0x30
> [93436.947262] RSP: 0018:ffff810005f4fda8 EFLAGS: 00010286
> [93436.947263] RAX: ab1ed5ca5b74e7de RBX: ab1ed5ca5b74e7de RCX: 000000000000d135
> [93436.947265] RDX: ffff81011d089a80 RSI: 0000000000000001 RDI: ffff81011d089a88
> [93436.947266] RBP: ffff810005f4fdb8 R08: 0000000000000001 R09: 0000000000000006
> [93436.947268] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8100de02c500
> [93436.947269] R13: ffff81011c188a00 R14: 0000000000000001 R15: ffff81011c189198
> [93436.947271] FS: 00007fb5bde0d700(0000) GS:ffff81007ff22000(0000)
> knlGS:0000000000000000
> [93436.947273] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> [93436.947274] CR2: 00007fb5bdd76000 CR3: 00000000664d5000 CR4: 00000000000006e0
> [93436.947276] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [93436.947277] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
> [93436.947279] Process konqueror (pid: 8079, threadinfo
> ffff810005f4e000, task ffff8100a1dec000)
> [93436.947281] Stack: ffff810005f4fdd8 ffff810116c86140
> ffff810005f4fdd8 ffffffff805314ae
> [93436.947284] ffff810116c86140 ffff8100de02c500 ffff810005f4fdf8
> ffffffff80531cf0
> [93436.947286] ffff8100de02c500 ffff81011c188b48 ffff810005f4fe18
> ffffffff80531311
> [93436.947288] Call Trace:
> [93436.947290] [<ffffffff805314ae>] skb_release_data+0x5e/0xa0
> [93436.947293] [<ffffffff80531cf0>] skb_release_all+0xa0/0x110
> [93436.947295] [<ffffffff80531311>] __kfree_skb+0x11/0xa0
> [93436.947297] [<ffffffff805313b7>] kfree_skb+0x17/0x30
> [93436.947299] [<ffffffff805a0b48>] unix_release_sock+0x128/0x250
> [93436.947302] [<ffffffff805a0c91>] unix_release+0x21/0x30
> [93436.947304] [<ffffffff8052b144>] sock_release+0x24/0x90
> [93436.947307] [<ffffffff8052b656>] sock_close+0x26/0x50
> [93436.947309] [<ffffffff8029f921>] __fput+0xc1/0x230
> [93436.947312] [<ffffffff8029fe46>] fput+0x16/0x20
> [93436.947314] [<ffffffff8029c576>] filp_close+0x56/0x90
> [93436.947316] [<ffffffff8029de46>] sys_close+0xa6/0x110
> [93436.947319] [<ffffffff8020b57b>] system_call_after_swapgs+0x7b/0x80
> [93436.947322]
> [93436.947322]
> [93436.947323] Code: 48 8b 18 48 89 c7 e8 5d ff ff ff 48 85 db 75 ed 48 83 c4 08
> [93436.947328] RIP [<ffffffff80531438>] skb_drop_list+0x18/0x30
> [93436.947330] RSP <ffff810005f4fda8>
> [93436.947332] ---[ end trace befb7cc3528ab3b1 ]---
>
> Your patch just fit so "good" to my problems:
> * it had the correct time frame for 2.6.24-rc3-mm2
> * it looked guilty at changing the refcounting of __refcnt because of
> the added dst_release()
> * it added other release / freeing operations so that a use-after-free
> memory corruption seemed possible
>
> I just have no better idea to what caused this OOPS and the other
> hangs in -rc3-mm2.
---
~Randy
desserts: http://www.xenotime.net/linux/recipes/
^ permalink raw reply
* Re: 2.6.24-rc6-mm1
From: Torsten Kaiser @ 2007-12-30 3:34 UTC (permalink / raw)
To: Herbert Xu
Cc: Andrew Morton, linux-kernel, Neil Brown, J. Bruce Fields, netdev
In-Reply-To: <20071230013021.GA13603@gondor.apana.org.au>
On Dec 30, 2007 2:30 AM, Herbert Xu <herbert@gondor.apana.org.au> wrote:
> On Sat, Dec 29, 2007 at 05:51:13PM +0100, Torsten Kaiser wrote:
> >
> > > > The cause, why I am resending this: I just got a crash with
> > > > 2.6.24-rc6-mm1, again looking network related:
> > > >
> > > > [93436.933356] WARNING: at include/net/dst.h:165 dst_release()
> > > > [93436.936685] Pid: 8079, comm: konqueror Not tainted 2.6.24-rc6-mm1 #11
> > > > [93436.939292]
> > > > [93436.939293] Call Trace:
> > > > [93436.939304] [<ffffffff80531d2d>] skb_release_all+0xdd/0x110
> > > > [93436.939307] [<ffffffff80531311>] __kfree_skb+0x11/0xa0
> > > > [93436.939309] [<ffffffff805313b7>] kfree_skb+0x17/0x30
> > > > [93436.939312] [<ffffffff805a0b48>] unix_release_sock+0x128/0x250
> > > > [93436.939315] [<ffffffff805a0c91>] unix_release+0x21/0x30
> > > > [93436.939318] [<ffffffff8052b144>] sock_release+0x24/0x90
> > > > [93436.939320] [<ffffffff8052b656>] sock_close+0x26/0x50
> > > > [93436.939324] [<ffffffff8029f921>] __fput+0xc1/0x230
> > > > [93436.939327] [<ffffffff8029fe46>] fput+0x16/0x20
> > > > [93436.939329] [<ffffffff8029c576>] filp_close+0x56/0x90
> > > > [93436.939331] [<ffffffff8029de46>] sys_close+0xa6/0x110
> > > > [93436.939335] [<ffffffff8020b57b>] system_call_after_swapgs+0x7b/0x80
> >
> > >From code inspection I would blame the patch "[SKBUFF]: Free old skb
> > properly in skb_morph" from Herbert Xu. (CC added)
>
> I doubt it. skb_morph is only used on IP fragments so I don't see how
> you could attribute an error from a Unix domain socket to this patch.
That's why I wrote that I do not know much about the network core...
> In any case, Unix socket packets should not have a dst at all so the
> very fact that you're in that path means that you have some sort of
> memory corruption.
... I did not know about the fact that there should not have been a dst.
It's just that this warning was the first nice clue about the memory
corruption related to networking that I see since 2.6.24-rc3-mm2.
The time of the patch (Mon, 26 Nov 2007 15:11:19) even fits into the
window between -rc3-mm1 and -rc3-mm2.
I doubt that the memory corruption is a hardware problem, because the
system in question is using ECC ram and I did not see any messages
about corrected/detected errors.
> Is this the very first OOPS/warning that you see? If not you should
> ignore all but the very first one as that may have left your system
> in an inconsistent state which may render all subsequent OOPSes and
> warnings useless.
I looked into the log in question and the only other warning was a
circular locking dependency that lockdep detected around 1.5 hour
before this warning.
As reported in my original mail, immediately after the warning the
system OOPSed and hung:
[93436.947241] general protection fault: 0000 [1] SMP
-> first OOPS
[93436.947243] last sysfs file:
/sys/devices/pci0000:00/0000:00:0f.0/0000:01:00.1/irq
[93436.947245] CPU 1
[93436.947246] Modules linked in: radeon drm nfsd exportfs w83792d
ipv6 tuner tea5767 tda8290 tuner_xc2
028 tda9887 tuner_simple mt20xx tea5761 tvaudio msp3400 bttv ir_common
compat_ioctl32 videobuf_dma_sg v
ideobuf_core btcx_risc tveeprom usbhid videodev v4l2_common hid
v4l1_compat pata_amd sg i2c_nforce2
[93436.947257] Pid: 8079, comm: konqueror Not tainted 2.6.24-rc6-mm1 #11
-> not tainted by a previous OOPS
[93436.947259] RIP: 0010:[<ffffffff80531438>] [<ffffffff80531438>]
skb_drop_list+0x18/0x30
[93436.947262] RSP: 0018:ffff810005f4fda8 EFLAGS: 00010286
[93436.947263] RAX: ab1ed5ca5b74e7de RBX: ab1ed5ca5b74e7de RCX: 000000000000d135
[93436.947265] RDX: ffff81011d089a80 RSI: 0000000000000001 RDI: ffff81011d089a88
[93436.947266] RBP: ffff810005f4fdb8 R08: 0000000000000001 R09: 0000000000000006
[93436.947268] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8100de02c500
[93436.947269] R13: ffff81011c188a00 R14: 0000000000000001 R15: ffff81011c189198
[93436.947271] FS: 00007fb5bde0d700(0000) GS:ffff81007ff22000(0000)
knlGS:0000000000000000
[93436.947273] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[93436.947274] CR2: 00007fb5bdd76000 CR3: 00000000664d5000 CR4: 00000000000006e0
[93436.947276] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[93436.947277] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[93436.947279] Process konqueror (pid: 8079, threadinfo
ffff810005f4e000, task ffff8100a1dec000)
[93436.947281] Stack: ffff810005f4fdd8 ffff810116c86140
ffff810005f4fdd8 ffffffff805314ae
[93436.947284] ffff810116c86140 ffff8100de02c500 ffff810005f4fdf8
ffffffff80531cf0
[93436.947286] ffff8100de02c500 ffff81011c188b48 ffff810005f4fe18
ffffffff80531311
[93436.947288] Call Trace:
[93436.947290] [<ffffffff805314ae>] skb_release_data+0x5e/0xa0
[93436.947293] [<ffffffff80531cf0>] skb_release_all+0xa0/0x110
[93436.947295] [<ffffffff80531311>] __kfree_skb+0x11/0xa0
[93436.947297] [<ffffffff805313b7>] kfree_skb+0x17/0x30
[93436.947299] [<ffffffff805a0b48>] unix_release_sock+0x128/0x250
[93436.947302] [<ffffffff805a0c91>] unix_release+0x21/0x30
[93436.947304] [<ffffffff8052b144>] sock_release+0x24/0x90
[93436.947307] [<ffffffff8052b656>] sock_close+0x26/0x50
[93436.947309] [<ffffffff8029f921>] __fput+0xc1/0x230
[93436.947312] [<ffffffff8029fe46>] fput+0x16/0x20
[93436.947314] [<ffffffff8029c576>] filp_close+0x56/0x90
[93436.947316] [<ffffffff8029de46>] sys_close+0xa6/0x110
[93436.947319] [<ffffffff8020b57b>] system_call_after_swapgs+0x7b/0x80
[93436.947322]
[93436.947322]
[93436.947323] Code: 48 8b 18 48 89 c7 e8 5d ff ff ff 48 85 db 75 ed 48 83 c4 08
[93436.947328] RIP [<ffffffff80531438>] skb_drop_list+0x18/0x30
[93436.947330] RSP <ffff810005f4fda8>
[93436.947332] ---[ end trace befb7cc3528ab3b1 ]---
Your patch just fit so "good" to my problems:
* it had the correct time frame for 2.6.24-rc3-mm2
* it looked guilty at changing the refcounting of __refcnt because of
the added dst_release()
* it added other release / freeing operations so that a use-after-free
memory corruption seemed possible
I just have no better idea as to what caused this OOPS and the other
hangs in -rc3-mm2.
Torsten
^ permalink raw reply
* Re: [PATCH] net: santize headers for iproute2
From: David Miller @ 2007-12-30 3:22 UTC (permalink / raw)
To: shemminger; +Cc: vgusev, netdev
In-Reply-To: <20071225171310.1c43a0e2@deepthought>
From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Tue, 25 Dec 2007 17:13:10 -0800
> Well changing netinet/tcp.h is just not a realistic proposition, it takes
> too long to filter from glibc through distro's to be worth it.
But you have to, the ifdef mess you are suggesting is so much
worse.
Look at the reason you even have to add the ifdefs: it's because,
lo and behold, the congestion control defines have already propagated
properly into netinet/tcp.h.
And this alone proves that your argument against putting this stuff in
the right place has no basis in reality.
Please just submit a patch to the appropriate place to get the
interfaces you need into netinet/tcp.h included instead of crapping
all over the kernel headers.
In the interim you can put a special header into the iproute2
distribution to handle this until it's sorted.
^ permalink raw reply
* Re: [PATCH][ROSE][AX25] af_ax25: possible circular locking
From: David Miller @ 2007-12-30 3:14 UTC (permalink / raw)
To: jarkao2; +Cc: f6bvp, ralf, adobriyan, netdev
In-Reply-To: <20071228214857.GA3290@ami.dom.local>
From: Jarek Poplawski <jarkao2@gmail.com>
Date: Fri, 28 Dec 2007 22:48:57 +0100
> lockdep is worried about the different order here:
>
> #1 (rose_neigh_list_lock){-+..}:
> #3 (ax25_list_lock){-+..}:
>
> #0 (linkfail_lock){-+..}:
> #1 (rose_neigh_list_lock){-+..}:
>
> #3 (ax25_list_lock){-+..}:
> #0 (linkfail_lock){-+..}:
>
> So, ax25_list_lock could be taken before and after linkfail_lock.
> I don't know if this three-thread clutch is very probable (or
> possible at all), but it seems another bug reported by Bernard
> ("[...] system impossible to reboot with linux-2.6.24-rc5")
> could have similar source - namely ax25_list_lock held by
> ax25_kill_by_device() during ax25_disconnect(). It looks like the
> only place which calls ax25_disconnect() this way, so I guess, it
> isn't necessary.
>
> This patch is breaking the lock for ax25_disconnect(), with some
> failsafe and debugging added to detect unforeseen problems.
>
>
> Reported-and-tested-by: Bernard Pidoux <f6bvp@free.fr>
> Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
I can't apply this fix, sorry.
You can't just drop this linked list lock and expect it to stay
consistent like that.
Once you drop it, any thread of control can get in there and delete
entries from the list.
Since we know it can happen, using a WARN_ON_ONCE(1) is not
appropriate. And if it triggers it will do the wrong thing, because
by branching back to "again" we can call ax25_disconnect() multiple
times on the same entry which isn't right.
You'll thus need to resolve this locking conflict more properly.
I know it's hard, but your current fix is worse because it adds
a new known bug.
^ permalink raw reply
* Re: [PATCH/RFC] [v3] TCP: use non-delayed ACK for congestion control RTT
From: David Miller @ 2007-12-30 3:09 UTC (permalink / raw)
To: Gavin.McCullagh; +Cc: ilpo.jarvinen, netdev
In-Reply-To: <20071230012549.GB30997@nuim.ie>
Never mind about making the relative patch, I didn't want to have
to wait for you to send me that and have it block my merge of
fixes with Linus this evening.
The following is what I applied on top of your other patch:
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 6fb7989..cbba288 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2660,6 +2660,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p,
u32 packets_acked;
u8 sacked = scb->sacked;
+ /* Determine how many packets and what bytes were acked, tso and else */
if (after(scb->end_seq, tp->snd_una)) {
if (tcp_skb_pcount(skb) == 1 ||
!after(tp->snd_una, scb->seq))
@@ -2694,10 +2695,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p,
flag |= FLAG_NONHEAD_RETRANS_ACKED;
} else {
ca_seq_rtt = now - scb->when;
+ last_ackt = skb->tstamp;
if (seq_rtt < 0) {
seq_rtt = ca_seq_rtt;
- if (fully_acked)
- last_ackt = skb->tstamp;
}
if (!(sacked & TCPCB_SACKED_ACKED))
reord = min(cnt, reord);
@@ -2713,10 +2713,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p,
tp->urg_mode = 0;
} else {
ca_seq_rtt = now - scb->when;
+ last_ackt = skb->tstamp;
if (seq_rtt < 0) {
seq_rtt = ca_seq_rtt;
- if (fully_acked)
- last_ackt = skb->tstamp;
}
reord = min(cnt, reord);
}
^ permalink raw reply related
* Re: [PATCH/RFC] [v3] TCP: use non-delayed ACK for congestion control RTT
From: David Miller @ 2007-12-30 3:06 UTC (permalink / raw)
To: Gavin.McCullagh; +Cc: ilpo.jarvinen, netdev
In-Reply-To: <20071230011500.GA30997@nuim.ie>
From: Gavin McCullagh <Gavin.McCullagh@nuim.ie>
Date: Sun, 30 Dec 2007 01:15:00 +0000
> A combined patch will follow this mail.
Please send a relative patch, I've already applied your
original patch to:
kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6.git
and will be sending that to Linus shortly.
^ permalink raw reply
* Re: [Bugme-new] [Bug 9661] New: Booting from nfsroot fails
From: David Miller @ 2007-12-30 3:05 UTC (permalink / raw)
To: akpm; +Cc: bugs, bugme-daemon, netdev, horms
In-Reply-To: <20071229151401.ee32b8f7.akpm@linux-foundation.org>
From: Andrew Morton <akpm@linux-foundation.org>
Date: Sat, 29 Dec 2007 15:14:01 -0800
> Thanks for the fix, but please send patches via email and not via bugzilla?
>
> Suitable recipients for this one would be
>
> netdev@vger.kernel.org
> Simon Horman <horms@verge.net.au>
> "David S. Miller" <davem@davemloft.net>
Simon already sent me a fix for this a few days ago, it's
in my tree and will be sent to Linus in due time.
^ permalink raw reply
* Re: [PATCH] Fix broken ip= parsing
From: David Miller @ 2007-12-30 2:59 UTC (permalink / raw)
To: tsbogend; +Cc: linux-kernel, netdev, horms
In-Reply-To: <20071229170849.C2E68C2EEE@solo.franken.de>
From: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Date: Sat, 29 Dec 2007 18:08:49 +0100 (CET)
> Commit a6c05c3d064dbb83be88cba3189beb5db9d2dfc3 breaks ip= parsing
> completely, because ic_enable is never set. The patch below puts
> back the way ic_enable was set before.
>
> Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
I already have this exact patch in my net-2.6 tree from Simon.
Thanks.
^ permalink raw reply
* Re: 2.6.24-rc6-mm1
From: Herbert Xu @ 2007-12-30 1:30 UTC (permalink / raw)
To: Torsten Kaiser
Cc: Andrew Morton, linux-kernel, Neil Brown, J. Bruce Fields, netdev
In-Reply-To: <64bb37e0712290851r6d41768dk270e47884713a3de@mail.gmail.com>
On Sat, Dec 29, 2007 at 05:51:13PM +0100, Torsten Kaiser wrote:
>
> > > The cause, why I am resending this: I just got a crash with
> > > 2.6.24-rc6-mm1, again looking network related:
> > >
> > > [93436.933356] WARNING: at include/net/dst.h:165 dst_release()
> > > [93436.936685] Pid: 8079, comm: konqueror Not tainted 2.6.24-rc6-mm1 #11
> > > [93436.939292]
> > > [93436.939293] Call Trace:
> > > [93436.939304] [<ffffffff80531d2d>] skb_release_all+0xdd/0x110
> > > [93436.939307] [<ffffffff80531311>] __kfree_skb+0x11/0xa0
> > > [93436.939309] [<ffffffff805313b7>] kfree_skb+0x17/0x30
> > > [93436.939312] [<ffffffff805a0b48>] unix_release_sock+0x128/0x250
> > > [93436.939315] [<ffffffff805a0c91>] unix_release+0x21/0x30
> > > [93436.939318] [<ffffffff8052b144>] sock_release+0x24/0x90
> > > [93436.939320] [<ffffffff8052b656>] sock_close+0x26/0x50
> > > [93436.939324] [<ffffffff8029f921>] __fput+0xc1/0x230
> > > [93436.939327] [<ffffffff8029fe46>] fput+0x16/0x20
> > > [93436.939329] [<ffffffff8029c576>] filp_close+0x56/0x90
> > > [93436.939331] [<ffffffff8029de46>] sys_close+0xa6/0x110
> > > [93436.939335] [<ffffffff8020b57b>] system_call_after_swapgs+0x7b/0x80
>
> From code inspection I would blame the patch "[SKBUFF]: Free old skb
> properly in skb_morph" from Herbert Xu. (CC added)
I doubt it. skb_morph is only used on IP fragments so I don't see how
you could attribute an error from a Unix domain socket to this patch.
In any case, Unix socket packets should not have a dst at all so the
very fact that you're in that path means that you have some sort of
memory corruption.
Is this the very first OOPS/warning that you see? If not you should
ignore all but the very first one as that may have left your system
in an inconsistent state which may render all subsequent OOPSes and
warnings useless.
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu 许志壬 <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply
* Re: [PATCH/RFC] [v3] TCP: use non-delayed ACK for congestion control RTT
From: Gavin McCullagh @ 2007-12-30 1:25 UTC (permalink / raw)
To: Ilpo Järvinen; +Cc: David Miller, Netdev
In-Reply-To: <20071230011500.GA30997@nuim.ie>
When a delayed ACK representing two packets arrives, there are two RTT
samples available, one for each packet. The first (in order of seq number)
will be artificially long due to the delay waiting for the second packet,
the second will trigger the ACK and so will not itself be delayed.
According to rfc1323, the SRTT used for RTO calculation should use the
first rtt, so receivers echo the timestamp from the first packet in the
delayed ack. For congestion control however, it seems measuring delayed
ack delay is not desirable as it varies independently of congestion.
The patch below causes seq_rtt and last_ackt to be updated with any
available later packet rtts which should have less (and hopefully zero)
delack delay. The rtt value then gets passed to ca_ops->pkts_acked().
Where TCP_CONG_RTT_STAMP was set, effort was made to suppress RTTs from
within a TSO chunk (!fully_acked), using only the final ACK (which includes
any TSO delay) to generate RTTs. This patch removes these checks so RTTs
are passed for each ACK to ca_ops->pkts_acked().
For non-delay based congestion control (cubic, h-tcp), rtt is sometimes
used for rtt-scaling. In shortening the RTT, this may make them a little
less aggressive. Delay-based schemes (eg vegas, veno, illinois) should get
a cleaner, more accurate congestion signal, particularly for small cwnds.
The congestion control module can potentially also filter out bad RTTs due
to the delayed ack alarm by looking at the associated cnt which (where
delayed acking is in use) should probably be 1 if the alarm went off or
greater if the ACK was triggered by a packet.
Signed-off-by: Gavin McCullagh <gavin.mccullagh@nuim.ie>
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 889c893..cbba288 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2651,6 +2651,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p,
u32 cnt = 0;
u32 reord = tp->packets_out;
s32 seq_rtt = -1;
+ s32 ca_seq_rtt = -1;
ktime_t last_ackt = net_invalid_timestamp();
while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
@@ -2659,6 +2660,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p,
u32 packets_acked;
u8 sacked = scb->sacked;
+ /* Determine how many packets and what bytes were acked, tso and else */
if (after(scb->end_seq, tp->snd_una)) {
if (tcp_skb_pcount(skb) == 1 ||
!after(tp->snd_una, scb->seq))
@@ -2686,15 +2688,16 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p,
if (sacked & TCPCB_SACKED_RETRANS)
tp->retrans_out -= packets_acked;
flag |= FLAG_RETRANS_DATA_ACKED;
+ ca_seq_rtt = -1;
seq_rtt = -1;
if ((flag & FLAG_DATA_ACKED) ||
(packets_acked > 1))
flag |= FLAG_NONHEAD_RETRANS_ACKED;
} else {
+ ca_seq_rtt = now - scb->when;
+ last_ackt = skb->tstamp;
if (seq_rtt < 0) {
- seq_rtt = now - scb->when;
- if (fully_acked)
- last_ackt = skb->tstamp;
+ seq_rtt = ca_seq_rtt;
}
if (!(sacked & TCPCB_SACKED_ACKED))
reord = min(cnt, reord);
@@ -2709,10 +2712,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p,
!before(end_seq, tp->snd_up))
tp->urg_mode = 0;
} else {
+ ca_seq_rtt = now - scb->when;
+ last_ackt = skb->tstamp;
if (seq_rtt < 0) {
- seq_rtt = now - scb->when;
- if (fully_acked)
- last_ackt = skb->tstamp;
+ seq_rtt = ca_seq_rtt;
}
reord = min(cnt, reord);
}
@@ -2772,8 +2775,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p,
net_invalid_timestamp()))
rtt_us = ktime_us_delta(ktime_get_real(),
last_ackt);
- else if (seq_rtt > 0)
- rtt_us = jiffies_to_usecs(seq_rtt);
+ else if (ca_seq_rtt > 0)
+ rtt_us = jiffies_to_usecs(ca_seq_rtt);
}
ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
^ permalink raw reply related
* Re: [PATCH/RFC] [v3] TCP: use non-delayed ACK for congestion control RTT
From: Gavin McCullagh @ 2007-12-30 1:15 UTC (permalink / raw)
To: Ilpo Järvinen; +Cc: David Miller, Netdev
In-Reply-To: <Pine.LNX.4.64.0712211545580.7301@kivilampi-30.cs.helsinki.fi>
Hi,
On Fri, 21 Dec 2007, Ilpo Järvinen wrote:
> > I need to re-read properly, but I think the same problem affects the
> > microsecond values where TCP_CONG_RTT_STAMP is set (used by vegas, veno,
> > yeah, illinois). I might follow up with another patch which changes the
> > behaviour where TCP_CONG_RTT_STAMP when I'm more sure of that.
>
> Please do, you might have to remove fully_acked checks to do that right
> though so it won't be as straight-forward change as this one and requires
> some amount of thinking to result in a right thing.
The TCP_CONG_RTT_STAMP code does need to be fixed similarly. A combined
patch will follow this mail. As I understand it, the fully_acked checks
kick in where the ACK is a portion of a TSO chunk and doesn't completely
ACK that chunk. Leaving the checks in place means you get one rtt for each
TSO chunk, based on the ACK for the last byte of the chunk. This rtt
therefore is the maximum available and includes the time-lag between the
first and last chunk being acked. Removing the tests gives you an RTT
value for each ACK in a tso chunk, including the minimum and maximum. It
seems the minimum rtt is the best indicator of congestion. On the other
hand having all available RTTs gives the congestion avoidance greater
knowledge of how the RTT is evolving (albeit somewhat coloured by TSO
delays which don't seem too severe in my tests).
The patch I'll suggest for now takes all RTT values available, rather than
the old maximum or adding extra logic to pluck out the minimum.
I've captured some rtt values from timestamps and from
tcp_clean_rtx_queue() to demonstrate the effect of this patch:
http://www.hamilton.ie/gavinmc/linux/tcp_clean_rtx_queue.html#usec
As always, comments are most welcome,
Gavin
^ permalink raw reply
* [PATCH net-2.6.25 7/7][ATM]: [he] fixing compilation when you define USE_RBPS_POOL/USE_RBPL_POOL
From: chas williams - CONTRACTOR @ 2007-12-30 1:08 UTC (permalink / raw)
To: netdev; +Cc: davem
commit 62024377f1da8a3f5b49cfd60d892c778f5a5741
Author: Jorge Boncompte <jorge@dti2.net>
Date: Thu Dec 13 16:14:40 2007 -0500
[ATM]: [he] fixing compilation when you define USE_RBPS_POOL/USE_RBPL_POOL
Signed-off-by: Jorge Boncompte <jorge@dti2.net>
Signed-off-by: Chas Williams <chas@cmf.nrl.navy.mil>
diff --git a/drivers/atm/he.c b/drivers/atm/he.c
index 3b64a99..20df4bb 100644
--- a/drivers/atm/he.c
+++ b/drivers/atm/he.c
@@ -1643,6 +1643,8 @@ he_stop(struct he_dev *he_dev)
if (he_dev->rbpl_base) {
#ifdef USE_RBPL_POOL
+ int i;
+
for (i = 0; i < CONFIG_RBPL_SIZE; ++i) {
void *cpuaddr = he_dev->rbpl_virt[i].virt;
dma_addr_t dma_handle = he_dev->rbpl_base[i].phys;
@@ -1665,6 +1667,8 @@ he_stop(struct he_dev *he_dev)
#ifdef USE_RBPS
if (he_dev->rbps_base) {
#ifdef USE_RBPS_POOL
+ int i;
+
for (i = 0; i < CONFIG_RBPS_SIZE; ++i) {
void *cpuaddr = he_dev->rbps_virt[i].virt;
dma_addr_t dma_handle = he_dev->rbps_base[i].phys;
^ permalink raw reply related
* [PATCH net-2.6.25 6/7][ATM]: [ambassador] kmalloc + memset conversion to kzalloc
From: chas williams - CONTRACTOR @ 2007-12-30 1:08 UTC (permalink / raw)
To: netdev; +Cc: davem
commit 87c1811be8f1c6587ed3b57d23f93c7e91afdde7
Author: Joonwoo Park <joonwpark81@gmail.com>
Date: Tue Nov 27 09:43:04 2007 -0500
[ATM]: [ambassador] kmalloc + memset conversion to kzalloc
Signed-off-by: Joonwoo Park <joonwpark81@gmail.com>
Signed-off-by: Chas Williams <chas@cmf.nrl.navy.mil>
diff --git a/drivers/atm/ambassador.c b/drivers/atm/ambassador.c
index b34b382..7b44a59 100644
--- a/drivers/atm/ambassador.c
+++ b/drivers/atm/ambassador.c
@@ -2163,7 +2163,6 @@ static int __devinit amb_init (amb_dev * dev)
static void setup_dev(amb_dev *dev, struct pci_dev *pci_dev)
{
unsigned char pool;
- memset (dev, 0, sizeof(amb_dev));
// set up known dev items straight away
dev->pci_dev = pci_dev;
@@ -2253,7 +2252,7 @@ static int __devinit amb_probe(struct pci_dev *pci_dev, const struct pci_device_
goto out_disable;
}
- dev = kmalloc (sizeof(amb_dev), GFP_KERNEL);
+ dev = kzalloc(sizeof(amb_dev), GFP_KERNEL);
if (!dev) {
PRINTK (KERN_ERR, "out of memory!");
err = -ENOMEM;
^ permalink raw reply related
* [PATCH net-2.6.25 5/7][ATM]: [br2684] whitespace cleanup
From: chas williams - CONTRACTOR @ 2007-12-30 1:07 UTC (permalink / raw)
To: netdev; +Cc: davem
commit 6b11f93a718dc916198feb1099ae0cef39ce3936
Author: Chas Williams <chas@vger.cmf.nrl.navy.mil>
Date: Sat Oct 27 08:33:40 2007 -0400
[ATM]: [br2684] whitespace cleanup
Signed-off-by: Chas Williams <chas@cmf.nrl.navy.mil>
diff --git a/include/linux/atmbr2684.h b/include/linux/atmbr2684.h
index ccdab6c..52bf72a 100644
--- a/include/linux/atmbr2684.h
+++ b/include/linux/atmbr2684.h
@@ -15,7 +15,7 @@
#define BR2684_MEDIA_802_6 (4) /* 802.6 */
/* used only at device creation: */
-#define BR2684_FLAG_ROUTED (1<<16) /* payload is routed, not bridged */
+#define BR2684_FLAG_ROUTED (1<<16) /* payload is routed, not bridged */
/*
* Is there FCS inbound on this VC? This currently isn't supported.
@@ -45,17 +45,16 @@
#define BR2684_PAYLOAD_ROUTED (0)
#define BR2684_PAYLOAD_BRIDGED (1)
-
/*
* This is for the ATM_NEWBACKENDIF call - these are like socket families:
* the first element of the structure is the backend number and the rest
* is per-backend specific
*/
struct atm_newif_br2684 {
- atm_backend_t backend_num; /* ATM_BACKEND_BR2684 */
- int media; /* BR2684_MEDIA_*, flags in upper bits */
- char ifname[IFNAMSIZ];
- int mtu;
+ atm_backend_t backend_num; /* ATM_BACKEND_BR2684 */
+ int media; /* BR2684_MEDIA_*, flags in upper bits */
+ char ifname[IFNAMSIZ];
+ int mtu;
};
/*
@@ -66,10 +65,10 @@ struct atm_newif_br2684 {
#define BR2684_FIND_BYNUM (1)
#define BR2684_FIND_BYIFNAME (2)
struct br2684_if_spec {
- int method; /* BR2684_FIND_* */
+ int method; /* BR2684_FIND_* */
union {
- char ifname[IFNAMSIZ];
- int devnum;
+ char ifname[IFNAMSIZ];
+ int devnum;
} spec;
};
@@ -79,16 +78,16 @@ struct br2684_if_spec {
* is per-backend specific
*/
struct atm_backend_br2684 {
- atm_backend_t backend_num; /* ATM_BACKEND_BR2684 */
+ atm_backend_t backend_num; /* ATM_BACKEND_BR2684 */
struct br2684_if_spec ifspec;
- int fcs_in; /* BR2684_FCSIN_* */
- int fcs_out; /* BR2684_FCSOUT_* */
- int fcs_auto; /* 1: fcs_{in,out} disabled if no FCS rx'ed */
- int encaps; /* BR2684_ENCAPS_* */
- int has_vpiid; /* 1: use vpn_id - Unsupported */
- __u8 vpn_id[7];
- int send_padding; /* unsupported */
- int min_size; /* we will pad smaller packets than this */
+ int fcs_in; /* BR2684_FCSIN_* */
+ int fcs_out; /* BR2684_FCSOUT_* */
+ int fcs_auto; /* 1: fcs_{in,out} disabled if no FCS rx'ed */
+ int encaps; /* BR2684_ENCAPS_* */
+ int has_vpiid; /* 1: use vpn_id - Unsupported */
+ __u8 vpn_id[7];
+ int send_padding; /* unsupported */
+ int min_size; /* we will pad smaller packets than this */
};
/*
@@ -97,8 +96,8 @@ struct atm_backend_br2684 {
* efficient per-if in/out filters, this support will be removed
*/
struct br2684_filter {
- __be32 prefix; /* network byte order */
- __be32 netmask; /* 0 = disable filter */
+ __be32 prefix; /* network byte order */
+ __be32 netmask; /* 0 = disable filter */
};
struct br2684_filter_set {
@@ -107,8 +106,8 @@ struct br2684_filter_set {
};
enum br2684_payload {
- p_routed = BR2684_PAYLOAD_ROUTED,
- p_bridged = BR2684_PAYLOAD_BRIDGED,
+ p_routed = BR2684_PAYLOAD_ROUTED,
+ p_bridged = BR2684_PAYLOAD_BRIDGED,
};
#define BR2684_SETFILT _IOW( 'a', ATMIOC_BACKEND + 0, \
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index d9bb2a1..f5a3794 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -1,9 +1,10 @@
/*
-Ethernet netdevice using ATM AAL5 as underlying carrier
-(RFC1483 obsoleted by RFC2684) for Linux
-Authors: Marcell GAL, 2000, XDSL Ltd, Hungary
- Eric Kinzie, 2006-2007, US Naval Research Laboratory
-*/
+ * Ethernet netdevice using ATM AAL5 as underlying carrier
+ * (RFC1483 obsoleted by RFC2684) for Linux
+ *
+ * Authors: Marcell GAL, 2000, XDSL Ltd, Hungary
+ * Eric Kinzie, 2006-2007, US Naval Research Laboratory
+ */
#include <linux/module.h>
#include <linux/init.h>
@@ -51,28 +52,24 @@ static void skb_debug(const struct sk_buff *skb)
#define ETHERTYPE_IPV6 0x86, 0xdd
#define PAD_BRIDGED 0x00, 0x00
-static unsigned char ethertype_ipv4[] =
- { ETHERTYPE_IPV4 };
-static unsigned char ethertype_ipv6[] =
- { ETHERTYPE_IPV6 };
+static unsigned char ethertype_ipv4[] = { ETHERTYPE_IPV4 };
+static unsigned char ethertype_ipv6[] = { ETHERTYPE_IPV6 };
static unsigned char llc_oui_pid_pad[] =
- { LLC, SNAP_BRIDGED, PID_ETHERNET, PAD_BRIDGED };
-static unsigned char llc_oui_ipv4[] =
- { LLC, SNAP_ROUTED, ETHERTYPE_IPV4 };
-static unsigned char llc_oui_ipv6[] =
- { LLC, SNAP_ROUTED, ETHERTYPE_IPV6 };
+ { LLC, SNAP_BRIDGED, PID_ETHERNET, PAD_BRIDGED };
+static unsigned char llc_oui_ipv4[] = { LLC, SNAP_ROUTED, ETHERTYPE_IPV4 };
+static unsigned char llc_oui_ipv6[] = { LLC, SNAP_ROUTED, ETHERTYPE_IPV6 };
enum br2684_encaps {
- e_vc = BR2684_ENCAPS_VC,
+ e_vc = BR2684_ENCAPS_VC,
e_llc = BR2684_ENCAPS_LLC,
};
struct br2684_vcc {
- struct atm_vcc *atmvcc;
+ struct atm_vcc *atmvcc;
struct net_device *device;
- /* keep old push,pop functions for chaining */
- void (*old_push)(struct atm_vcc *vcc,struct sk_buff *skb);
- /* void (*old_pop)(struct atm_vcc *vcc,struct sk_buff *skb); */
+ /* keep old push, pop functions for chaining */
+ void (*old_push) (struct atm_vcc * vcc, struct sk_buff * skb);
+ /* void (*old_pop)(struct atm_vcc *vcc, struct sk_buff *skb); */
enum br2684_encaps encaps;
struct list_head brvccs;
#ifdef CONFIG_ATM_BR2684_IPFILTER
@@ -85,7 +82,7 @@ struct br2684_dev {
struct net_device *net_dev;
struct list_head br2684_devs;
int number;
- struct list_head brvccs; /* one device <=> one vcc (before xmas) */
+ struct list_head brvccs; /* one device <=> one vcc (before xmas) */
struct net_device_stats stats;
int mac_was_set;
enum br2684_payload payload;
@@ -104,7 +101,7 @@ static LIST_HEAD(br2684_devs);
static inline struct br2684_dev *BRPRIV(const struct net_device *net_dev)
{
- return (struct br2684_dev *) net_dev->priv;
+ return (struct br2684_dev *)net_dev->priv;
}
static inline struct net_device *list_entry_brdev(const struct list_head *le)
@@ -114,7 +111,7 @@ static inline struct net_device *list_entry_brdev(const struct list_head *le)
static inline struct br2684_vcc *BR2684_VCC(const struct atm_vcc *atmvcc)
{
- return (struct br2684_vcc *) (atmvcc->user_back);
+ return (struct br2684_vcc *)(atmvcc->user_back);
}
static inline struct br2684_vcc *list_entry_brvcc(const struct list_head *le)
@@ -152,7 +149,7 @@ static struct net_device *br2684_find_dev(const struct br2684_if_spec *s)
* otherwise false
*/
static int br2684_xmit_vcc(struct sk_buff *skb, struct br2684_dev *brdev,
- struct br2684_vcc *brvcc)
+ struct br2684_vcc *brvcc)
{
struct atm_vcc *atmvcc;
int minheadroom = (brvcc->encaps == e_llc) ? 10 : 2;
@@ -171,21 +168,24 @@ static int br2684_xmit_vcc(struct sk_buff *skb, struct br2684_dev *brdev,
if (brvcc->encaps == e_llc) {
if (brdev->payload == p_bridged) {
skb_push(skb, sizeof(llc_oui_pid_pad));
- skb_copy_to_linear_data(skb, llc_oui_pid_pad, sizeof(llc_oui_pid_pad));
+ skb_copy_to_linear_data(skb, llc_oui_pid_pad,
+ sizeof(llc_oui_pid_pad));
} else if (brdev->payload == p_routed) {
unsigned short prot = ntohs(skb->protocol);
skb_push(skb, sizeof(llc_oui_ipv4));
switch (prot) {
- case ETH_P_IP:
- skb_copy_to_linear_data(skb, llc_oui_ipv4, sizeof(llc_oui_ipv4));
- break;
- case ETH_P_IPV6:
- skb_copy_to_linear_data(skb, llc_oui_ipv6, sizeof(llc_oui_ipv6));
- break;
- default:
- dev_kfree_skb(skb);
- return 0;
+ case ETH_P_IP:
+ skb_copy_to_linear_data(skb, llc_oui_ipv4,
+ sizeof(llc_oui_ipv4));
+ break;
+ case ETH_P_IPV6:
+ skb_copy_to_linear_data(skb, llc_oui_ipv6,
+ sizeof(llc_oui_ipv6));
+ break;
+ default:
+ dev_kfree_skb(skb);
+ return 0;
}
}
} else {
@@ -198,13 +198,14 @@ static int br2684_xmit_vcc(struct sk_buff *skb, struct br2684_dev *brdev,
ATM_SKB(skb)->vcc = atmvcc = brvcc->atmvcc;
pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n", skb, atmvcc, atmvcc->dev);
if (!atm_may_send(atmvcc, skb->truesize)) {
- /* we free this here for now, because we cannot know in a higher
- layer whether the skb point it supplied wasn't freed yet.
- now, it always is.
- */
+ /*
+ * We free this here for now, because we cannot know in a higher
+ * layer whether the skb pointer it supplied wasn't freed yet.
+ * Now, it always is.
+ */
dev_kfree_skb(skb);
return 0;
- }
+ }
atomic_add(skb->truesize, &sk_atm(atmvcc)->sk_wmem_alloc);
ATM_SKB(skb)->atm_options = atmvcc->atm_options;
brdev->stats.tx_packets++;
@@ -214,10 +215,9 @@ static int br2684_xmit_vcc(struct sk_buff *skb, struct br2684_dev *brdev,
}
static inline struct br2684_vcc *pick_outgoing_vcc(struct sk_buff *skb,
- struct br2684_dev *brdev)
+ struct br2684_dev *brdev)
{
- return list_empty(&brdev->brvccs) ? NULL :
- list_entry_brvcc(brdev->brvccs.next); /* 1 vcc/dev right now */
+ return list_empty(&brdev->brvccs) ? NULL : list_entry_brvcc(brdev->brvccs.next); /* 1 vcc/dev right now */
}
static int br2684_start_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -241,11 +241,10 @@ static int br2684_start_xmit(struct sk_buff *skb, struct net_device *dev)
/*
* We should probably use netif_*_queue() here, but that
* involves added complication. We need to walk before
- * we can run
+ * we can run.
+ *
+ * Don't free here! this pointer might be no longer valid!
*/
- /* don't free here! this pointer might be no longer valid!
- dev_kfree_skb(skb);
- */
brdev->stats.tx_errors++;
brdev->stats.tx_fifo_errors++;
}
@@ -259,12 +258,11 @@ static struct net_device_stats *br2684_get_stats(struct net_device *dev)
return &BRPRIV(dev)->stats;
}
-
/*
* We remember when the MAC gets set, so we don't override it later with
* the ESI of the ATM card of the first VC
*/
-static int (*my_eth_mac_addr)(struct net_device *, void *);
+static int (*my_eth_mac_addr) (struct net_device *, void *);
static int br2684_mac_addr(struct net_device *dev, void *p)
{
int err = my_eth_mac_addr(dev, p);
@@ -275,7 +273,7 @@ static int br2684_mac_addr(struct net_device *dev, void *p)
#ifdef CONFIG_ATM_BR2684_IPFILTER
/* this IOCTL is experimental. */
-static int br2684_setfilt(struct atm_vcc *atmvcc, void __user *arg)
+static int br2684_setfilt(struct atm_vcc *atmvcc, void __user * arg)
{
struct br2684_vcc *brvcc;
struct br2684_filter_set fs;
@@ -285,13 +283,12 @@ static int br2684_setfilt(struct atm_vcc *atmvcc, void __user *arg)
if (fs.ifspec.method != BR2684_FIND_BYNOTHING) {
/*
* This is really a per-vcc thing, but we can also search
- * by device
+ * by device.
*/
struct br2684_dev *brdev;
read_lock(&devs_lock);
brdev = BRPRIV(br2684_find_dev(&fs.ifspec));
- if (brdev == NULL || list_empty(&brdev->brvccs) ||
- brdev->brvccs.next != brdev->brvccs.prev) /* >1 VCC */
+ if (brdev == NULL || list_empty(&brdev->brvccs) || brdev->brvccs.next != brdev->brvccs.prev) /* >1 VCC */
brvcc = NULL;
else
brvcc = list_entry_brvcc(brdev->brvccs.next);
@@ -309,15 +306,16 @@ static inline int
packet_fails_filter(__be16 type, struct br2684_vcc *brvcc, struct sk_buff *skb)
{
if (brvcc->filter.netmask == 0)
- return 0; /* no filter in place */
+ return 0; /* no filter in place */
if (type == htons(ETH_P_IP) &&
- (((struct iphdr *) (skb->data))->daddr & brvcc->filter.
+ (((struct iphdr *)(skb->data))->daddr & brvcc->filter.
netmask) == brvcc->filter.prefix)
return 0;
if (type == htons(ETH_P_ARP))
return 0;
- /* TODO: we should probably filter ARPs too.. don't want to have
- * them returning values that don't make sense, or is that ok?
+ /*
+ * TODO: we should probably filter ARPs too.. don't want to have
+ * them returning values that don't make sense, or is that ok?
*/
return 1; /* drop */
}
@@ -367,10 +365,17 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
/* accept packets that have "ipv[46]" in the snap header */
if ((skb->len >= (sizeof(llc_oui_ipv4)))
- && (memcmp(skb->data, llc_oui_ipv4, sizeof(llc_oui_ipv4) - BR2684_ETHERTYPE_LEN) == 0)) {
- if (memcmp(skb->data + 6, ethertype_ipv6, sizeof(ethertype_ipv6)) == 0)
+ &&
+ (memcmp
+ (skb->data, llc_oui_ipv4,
+ sizeof(llc_oui_ipv4) - BR2684_ETHERTYPE_LEN) == 0)) {
+ if (memcmp
+ (skb->data + 6, ethertype_ipv6,
+ sizeof(ethertype_ipv6)) == 0)
skb->protocol = __constant_htons(ETH_P_IPV6);
- else if (memcmp(skb->data + 6, ethertype_ipv4, sizeof(ethertype_ipv4)) == 0)
+ else if (memcmp
+ (skb->data + 6, ethertype_ipv4,
+ sizeof(ethertype_ipv4)) == 0)
skb->protocol = __constant_htons(ETH_P_IP);
else {
brdev->stats.rx_errors++;
@@ -380,12 +385,13 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
skb_pull(skb, sizeof(llc_oui_ipv4));
skb_reset_network_header(skb);
skb->pkt_type = PACKET_HOST;
-
- /* let us waste some time for checking the encapsulation.
- Note, that only 7 char is checked so frames with a valid FCS
- are also accepted (but FCS is not checked of course) */
+ /*
+ * Let us waste some time for checking the encapsulation.
+ * Note, that only 7 char is checked so frames with a valid FCS
+ * are also accepted (but FCS is not checked of course).
+ */
} else if ((skb->len >= sizeof(llc_oui_pid_pad)) &&
- (memcmp(skb->data, llc_oui_pid_pad, 7) == 0)) {
+ (memcmp(skb->data, llc_oui_pid_pad, 7) == 0)) {
skb_pull(skb, sizeof(llc_oui_pid_pad));
skb->protocol = eth_type_trans(skb, net_dev);
} else {
@@ -401,7 +407,7 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
dev_kfree_skb(skb);
return;
}
- skb_pull(skb, BR2684_PAD_LEN + ETH_HLEN); /* pad, dstmac, srcmac, ethtype */
+ skb_pull(skb, BR2684_PAD_LEN + ETH_HLEN); /* pad, dstmac, srcmac, ethtype */
skb->protocol = eth_type_trans(skb, net_dev);
}
@@ -428,11 +434,12 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
netif_rx(skb);
}
-static int br2684_regvcc(struct atm_vcc *atmvcc, void __user *arg)
+/*
+ * Assign a vcc to a dev
+ * Note: we do not have explicit unassign, but look at _push()
+ */
+static int br2684_regvcc(struct atm_vcc *atmvcc, void __user * arg)
{
-/* assign a vcc to a dev
-Note: we do not have explicit unassign, but look at _push()
-*/
int err;
struct br2684_vcc *brvcc;
struct sk_buff *skb;
@@ -451,7 +458,7 @@ Note: we do not have explicit unassign, but look at _push()
net_dev = br2684_find_dev(&be.ifspec);
if (net_dev == NULL) {
printk(KERN_ERR
- "br2684: tried to attach to non-existant device\n");
+ "br2684: tried to attach to non-existant device\n");
err = -ENXIO;
goto error;
}
@@ -467,13 +474,15 @@ Note: we do not have explicit unassign, but look at _push()
}
if (be.fcs_in != BR2684_FCSIN_NO || be.fcs_out != BR2684_FCSOUT_NO ||
be.fcs_auto || be.has_vpiid || be.send_padding || (be.encaps !=
- BR2684_ENCAPS_VC && be.encaps != BR2684_ENCAPS_LLC) ||
- be.min_size != 0) {
+ BR2684_ENCAPS_VC
+ && be.encaps !=
+ BR2684_ENCAPS_LLC)
+ || be.min_size != 0) {
err = -EINVAL;
goto error;
}
- pr_debug("br2684_regvcc vcc=%p, encaps=%d, brvcc=%p\n", atmvcc, be.encaps,
- brvcc);
+ pr_debug("br2684_regvcc vcc=%p, encaps=%d, brvcc=%p\n", atmvcc,
+ be.encaps, brvcc);
if (list_empty(&brdev->brvccs) && !brdev->mac_was_set) {
unsigned char *esi = atmvcc->dev->esi;
if (esi[0] | esi[1] | esi[2] | esi[3] | esi[4] | esi[5])
@@ -486,7 +495,7 @@ Note: we do not have explicit unassign, but look at _push()
brvcc->device = net_dev;
brvcc->atmvcc = atmvcc;
atmvcc->user_back = brvcc;
- brvcc->encaps = (enum br2684_encaps) be.encaps;
+ brvcc->encaps = (enum br2684_encaps)be.encaps;
brvcc->old_push = atmvcc->push;
barrier();
atmvcc->push = br2684_push;
@@ -517,7 +526,7 @@ Note: we do not have explicit unassign, but look at _push()
}
__module_get(THIS_MODULE);
return 0;
- error:
+ error:
write_unlock_irq(&devs_lock);
kfree(brvcc);
return err;
@@ -556,7 +565,7 @@ static void br2684_setup_routed(struct net_device *netdev)
INIT_LIST_HEAD(&brdev->brvccs);
}
-static int br2684_create(void __user *arg)
+static int br2684_create(void __user * arg)
{
int err;
struct net_device *netdev;
@@ -574,7 +583,7 @@ static int br2684_create(void __user *arg)
payload = p_routed;
else
payload = p_bridged;
- ni.media &= 0xffff; /* strip flags */
+ ni.media &= 0xffff; /* strip flags */
if (ni.media != BR2684_MEDIA_ETHERNET || ni.mtu != 1500) {
return -EINVAL;
@@ -583,7 +592,7 @@ static int br2684_create(void __user *arg)
netdev = alloc_netdev(sizeof(struct br2684_dev),
ni.ifname[0] ? ni.ifname : "nas%d",
(payload == p_routed) ?
- br2684_setup_routed : br2684_setup);
+ br2684_setup_routed : br2684_setup);
if (!netdev)
return -ENOMEM;
@@ -612,16 +621,16 @@ static int br2684_create(void __user *arg)
* -ENOIOCTLCMD for any unrecognized ioctl
*/
static int br2684_ioctl(struct socket *sock, unsigned int cmd,
- unsigned long arg)
+ unsigned long arg)
{
struct atm_vcc *atmvcc = ATM_SD(sock);
void __user *argp = (void __user *)arg;
+ atm_backend_t b;
int err;
- switch(cmd) {
+ switch (cmd) {
case ATM_SETBACKEND:
- case ATM_NEWBACKENDIF: {
- atm_backend_t b;
+ case ATM_NEWBACKENDIF:
err = get_user(b, (atm_backend_t __user *) argp);
if (err)
return -EFAULT;
@@ -633,7 +642,6 @@ static int br2684_ioctl(struct socket *sock, unsigned int cmd,
return br2684_regvcc(atmvcc, argp);
else
return br2684_create(argp);
- }
#ifdef CONFIG_ATM_BR2684_IPFILTER
case BR2684_SETFILT:
if (atmvcc->push != br2684_push)
@@ -641,6 +649,7 @@ static int br2684_ioctl(struct socket *sock, unsigned int cmd,
if (!capable(CAP_NET_ADMIN))
return -EPERM;
err = br2684_setfilt(atmvcc, argp);
+
return err;
#endif /* CONFIG_ATM_BR2684_IPFILTER */
}
@@ -648,19 +657,18 @@ static int br2684_ioctl(struct socket *sock, unsigned int cmd,
}
static struct atm_ioctl br2684_ioctl_ops = {
- .owner = THIS_MODULE,
- .ioctl = br2684_ioctl,
+ .owner = THIS_MODULE,
+ .ioctl = br2684_ioctl,
};
-
#ifdef CONFIG_PROC_FS
-static void *br2684_seq_start(struct seq_file *seq, loff_t *pos)
+static void *br2684_seq_start(struct seq_file *seq, loff_t * pos)
{
read_lock(&devs_lock);
return seq_list_start(&br2684_devs, *pos);
}
-static void *br2684_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+static void *br2684_seq_next(struct seq_file *seq, void *v, loff_t * pos)
{
return seq_list_next(v, &br2684_devs, pos);
}
@@ -673,7 +681,7 @@ static void br2684_seq_stop(struct seq_file *seq, void *v)
static int br2684_seq_show(struct seq_file *seq, void *v)
{
const struct br2684_dev *brdev = list_entry(v, struct br2684_dev,
- br2684_devs);
+ br2684_devs);
const struct net_device *net_dev = brdev->net_dev;
const struct br2684_vcc *brvcc;
DECLARE_MAC_BUF(mac);
@@ -686,20 +694,18 @@ static int br2684_seq_show(struct seq_file *seq, void *v)
list_for_each_entry(brvcc, &brdev->brvccs, brvccs) {
seq_printf(seq, " vcc %d.%d.%d: encaps=%s payload=%s"
- ", failed copies %u/%u"
- "\n", brvcc->atmvcc->dev->number,
- brvcc->atmvcc->vpi, brvcc->atmvcc->vci,
- (brvcc->encaps == e_llc) ? "LLC" : "VC",
- (brdev->payload == p_bridged) ? "bridged" : "routed",
- brvcc->copies_failed,
- brvcc->copies_needed);
+ ", failed copies %u/%u"
+ "\n", brvcc->atmvcc->dev->number,
+ brvcc->atmvcc->vpi, brvcc->atmvcc->vci,
+ (brvcc->encaps == e_llc) ? "LLC" : "VC",
+ (brdev->payload == p_bridged) ? "bridged" : "routed",
+ brvcc->copies_failed, brvcc->copies_needed);
#ifdef CONFIG_ATM_BR2684_IPFILTER
#define b1(var, byte) ((u8 *) &brvcc->filter.var)[byte]
#define bs(var) b1(var, 0), b1(var, 1), b1(var, 2), b1(var, 3)
- if (brvcc->filter.netmask != 0)
- seq_printf(seq, " filter=%d.%d.%d.%d/"
- "%d.%d.%d.%d\n",
- bs(prefix), bs(netmask));
+ if (brvcc->filter.netmask != 0)
+ seq_printf(seq, " filter=%d.%d.%d.%d/"
+ "%d.%d.%d.%d\n", bs(prefix), bs(netmask));
#undef bs
#undef b1
#endif /* CONFIG_ATM_BR2684_IPFILTER */
@@ -709,9 +715,9 @@ static int br2684_seq_show(struct seq_file *seq, void *v)
static const struct seq_operations br2684_seq_ops = {
.start = br2684_seq_start,
- .next = br2684_seq_next,
- .stop = br2684_seq_stop,
- .show = br2684_seq_show,
+ .next = br2684_seq_next,
+ .stop = br2684_seq_stop,
+ .show = br2684_seq_show,
};
static int br2684_proc_open(struct inode *inode, struct file *file)
@@ -720,15 +726,15 @@ static int br2684_proc_open(struct inode *inode, struct file *file)
}
static const struct file_operations br2684_proc_ops = {
- .owner = THIS_MODULE,
- .open = br2684_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
+ .owner = THIS_MODULE,
+ .open = br2684_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
.release = seq_release,
};
extern struct proc_dir_entry *atm_proc_root; /* from proc.c */
-#endif
+#endif /* CONFIG_PROC_FS */
static int __init br2684_init(void)
{
^ permalink raw reply related
* [PATCH net-2.6.25 4/7][ATM]: [br2684] routed support
From: chas williams - CONTRACTOR @ 2007-12-30 1:07 UTC (permalink / raw)
To: netdev; +Cc: davem
commit fea6b121bcc150fc91186e5012466c91944ce64d
Author: Eric Kinzie <ekinzie@cmf.nrl.navy.mil>
Date: Fri Oct 26 08:05:08 2007 -0400
[ATM]: [br2684] routed support
From: Eric Kinzie <ekinzie@cmf.nrl.navy.mil>
Signed-off-by: Chas Williams <chas@cmf.nrl.navy.mil>
diff --git a/include/linux/atmbr2684.h b/include/linux/atmbr2684.h
index 969fb6c..ccdab6c 100644
--- a/include/linux/atmbr2684.h
+++ b/include/linux/atmbr2684.h
@@ -14,6 +14,9 @@
#define BR2684_MEDIA_FDDI (3)
#define BR2684_MEDIA_802_6 (4) /* 802.6 */
+ /* used only at device creation: */
+#define BR2684_FLAG_ROUTED (1<<16) /* payload is routed, not bridged */
+
/*
* Is there FCS inbound on this VC? This currently isn't supported.
*/
@@ -36,13 +39,21 @@
#define BR2684_ENCAPS_AUTODETECT (2) /* Unsuported */
/*
+ * Is this VC bridged or routed?
+ */
+
+#define BR2684_PAYLOAD_ROUTED (0)
+#define BR2684_PAYLOAD_BRIDGED (1)
+
+
+/*
* This is for the ATM_NEWBACKENDIF call - these are like socket families:
* the first element of the structure is the backend number and the rest
* is per-backend specific
*/
struct atm_newif_br2684 {
atm_backend_t backend_num; /* ATM_BACKEND_BR2684 */
- int media; /* BR2684_MEDIA_* */
+ int media; /* BR2684_MEDIA_*, flags in upper bits */
char ifname[IFNAMSIZ];
int mtu;
};
@@ -95,6 +106,11 @@ struct br2684_filter_set {
struct br2684_filter filter;
};
+enum br2684_payload {
+ p_routed = BR2684_PAYLOAD_ROUTED,
+ p_bridged = BR2684_PAYLOAD_BRIDGED,
+};
+
#define BR2684_SETFILT _IOW( 'a', ATMIOC_BACKEND + 0, \
struct br2684_filter_set)
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index ba6428f..d9bb2a1 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -1,7 +1,8 @@
/*
-Experimental ethernet netdevice using ATM AAL5 as underlying carrier
-(RFC1483 obsoleted by RFC2684) for Linux 2.4
-Author: Marcell GAL, 2000, XDSL Ltd, Hungary
+Ethernet netdevice using ATM AAL5 as underlying carrier
+(RFC1483 obsoleted by RFC2684) for Linux
+Authors: Marcell GAL, 2000, XDSL Ltd, Hungary
+ Eric Kinzie, 2006-2007, US Naval Research Laboratory
*/
#include <linux/module.h>
@@ -39,9 +40,27 @@ static void skb_debug(const struct sk_buff *skb)
#define skb_debug(skb) do {} while (0)
#endif
+#define BR2684_ETHERTYPE_LEN 2
+#define BR2684_PAD_LEN 2
+
+#define LLC 0xaa, 0xaa, 0x03
+#define SNAP_BRIDGED 0x00, 0x80, 0xc2
+#define SNAP_ROUTED 0x00, 0x00, 0x00
+#define PID_ETHERNET 0x00, 0x07
+#define ETHERTYPE_IPV4 0x08, 0x00
+#define ETHERTYPE_IPV6 0x86, 0xdd
+#define PAD_BRIDGED 0x00, 0x00
+
+static unsigned char ethertype_ipv4[] =
+ { ETHERTYPE_IPV4 };
+static unsigned char ethertype_ipv6[] =
+ { ETHERTYPE_IPV6 };
static unsigned char llc_oui_pid_pad[] =
- { 0xAA, 0xAA, 0x03, 0x00, 0x80, 0xC2, 0x00, 0x07, 0x00, 0x00 };
-#define PADLEN (2)
+ { LLC, SNAP_BRIDGED, PID_ETHERNET, PAD_BRIDGED };
+static unsigned char llc_oui_ipv4[] =
+ { LLC, SNAP_ROUTED, ETHERTYPE_IPV4 };
+static unsigned char llc_oui_ipv6[] =
+ { LLC, SNAP_ROUTED, ETHERTYPE_IPV6 };
enum br2684_encaps {
e_vc = BR2684_ENCAPS_VC,
@@ -69,6 +88,7 @@ struct br2684_dev {
struct list_head brvccs; /* one device <=> one vcc (before xmas) */
struct net_device_stats stats;
int mac_was_set;
+ enum br2684_payload payload;
};
/*
@@ -136,6 +156,7 @@ static int br2684_xmit_vcc(struct sk_buff *skb, struct br2684_dev *brdev,
{
struct atm_vcc *atmvcc;
int minheadroom = (brvcc->encaps == e_llc) ? 10 : 2;
+
if (skb_headroom(skb) < minheadroom) {
struct sk_buff *skb2 = skb_realloc_headroom(skb, minheadroom);
brvcc->copies_needed++;
@@ -146,11 +167,32 @@ static int br2684_xmit_vcc(struct sk_buff *skb, struct br2684_dev *brdev,
}
skb = skb2;
}
- skb_push(skb, minheadroom);
- if (brvcc->encaps == e_llc)
- skb_copy_to_linear_data(skb, llc_oui_pid_pad, 10);
- else
- memset(skb->data, 0, 2);
+
+ if (brvcc->encaps == e_llc) {
+ if (brdev->payload == p_bridged) {
+ skb_push(skb, sizeof(llc_oui_pid_pad));
+ skb_copy_to_linear_data(skb, llc_oui_pid_pad, sizeof(llc_oui_pid_pad));
+ } else if (brdev->payload == p_routed) {
+ unsigned short prot = ntohs(skb->protocol);
+
+ skb_push(skb, sizeof(llc_oui_ipv4));
+ switch (prot) {
+ case ETH_P_IP:
+ skb_copy_to_linear_data(skb, llc_oui_ipv4, sizeof(llc_oui_ipv4));
+ break;
+ case ETH_P_IPV6:
+ skb_copy_to_linear_data(skb, llc_oui_ipv6, sizeof(llc_oui_ipv6));
+ break;
+ default:
+ dev_kfree_skb(skb);
+ return 0;
+ }
+ }
+ } else {
+ skb_push(skb, 2);
+ if (brdev->payload == p_bridged)
+ memset(skb->data, 0, 2);
+ }
skb_debug(skb);
ATM_SKB(skb)->vcc = atmvcc = brvcc->atmvcc;
@@ -299,7 +341,6 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
struct br2684_vcc *brvcc = BR2684_VCC(atmvcc);
struct net_device *net_dev = brvcc->device;
struct br2684_dev *brdev = BRPRIV(net_dev);
- int plen = sizeof(llc_oui_pid_pad) + ETH_HLEN;
pr_debug("br2684_push\n");
@@ -320,35 +361,50 @@ static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
atm_return(atmvcc, skb->truesize);
pr_debug("skb from brdev %p\n", brdev);
if (brvcc->encaps == e_llc) {
+
+ if (skb->len > 7 && skb->data[7] == 0x01)
+ __skb_trim(skb, skb->len - 4);
+
+ /* accept packets that have "ipv[46]" in the snap header */
+ if ((skb->len >= (sizeof(llc_oui_ipv4)))
+ && (memcmp(skb->data, llc_oui_ipv4, sizeof(llc_oui_ipv4) - BR2684_ETHERTYPE_LEN) == 0)) {
+ if (memcmp(skb->data + 6, ethertype_ipv6, sizeof(ethertype_ipv6)) == 0)
+ skb->protocol = __constant_htons(ETH_P_IPV6);
+ else if (memcmp(skb->data + 6, ethertype_ipv4, sizeof(ethertype_ipv4)) == 0)
+ skb->protocol = __constant_htons(ETH_P_IP);
+ else {
+ brdev->stats.rx_errors++;
+ dev_kfree_skb(skb);
+ return;
+ }
+ skb_pull(skb, sizeof(llc_oui_ipv4));
+ skb_reset_network_header(skb);
+ skb->pkt_type = PACKET_HOST;
+
/* let us waste some time for checking the encapsulation.
Note, that only 7 char is checked so frames with a valid FCS
are also accepted (but FCS is not checked of course) */
- if (memcmp(skb->data, llc_oui_pid_pad, 7)) {
+ } else if ((skb->len >= sizeof(llc_oui_pid_pad)) &&
+ (memcmp(skb->data, llc_oui_pid_pad, 7) == 0)) {
+ skb_pull(skb, sizeof(llc_oui_pid_pad));
+ skb->protocol = eth_type_trans(skb, net_dev);
+ } else {
brdev->stats.rx_errors++;
dev_kfree_skb(skb);
return;
}
- /* Strip FCS if present */
- if (skb->len > 7 && skb->data[7] == 0x01)
- __skb_trim(skb, skb->len - 4);
} else {
- plen = PADLEN + ETH_HLEN; /* pad, dstmac,srcmac, ethtype */
/* first 2 chars should be 0 */
if (*((u16 *) (skb->data)) != 0) {
brdev->stats.rx_errors++;
dev_kfree_skb(skb);
return;
}
- }
- if (skb->len < plen) {
- brdev->stats.rx_errors++;
- dev_kfree_skb(skb); /* dev_ not needed? */
- return;
+ skb_pull(skb, BR2684_PAD_LEN + ETH_HLEN); /* pad, dstmac, srcmac, ethtype */
+ skb->protocol = eth_type_trans(skb, net_dev);
}
- skb_pull(skb, plen - ETH_HLEN);
- skb->protocol = eth_type_trans(skb, net_dev);
#ifdef CONFIG_ATM_BR2684_IPFILTER
if (unlikely(packet_fails_filter(skb->protocol, brvcc, skb))) {
brdev->stats.rx_dropped++;
@@ -482,25 +538,52 @@ static void br2684_setup(struct net_device *netdev)
INIT_LIST_HEAD(&brdev->brvccs);
}
+static void br2684_setup_routed(struct net_device *netdev)
+{
+ struct br2684_dev *brdev = BRPRIV(netdev);
+ brdev->net_dev = netdev;
+
+ netdev->hard_header_len = 0;
+ my_eth_mac_addr = netdev->set_mac_address;
+ netdev->set_mac_address = br2684_mac_addr;
+ netdev->hard_start_xmit = br2684_start_xmit;
+ netdev->get_stats = br2684_get_stats;
+ netdev->addr_len = 0;
+ netdev->mtu = 1500;
+ netdev->type = ARPHRD_PPP;
+ netdev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
+ netdev->tx_queue_len = 100;
+ INIT_LIST_HEAD(&brdev->brvccs);
+}
+
static int br2684_create(void __user *arg)
{
int err;
struct net_device *netdev;
struct br2684_dev *brdev;
struct atm_newif_br2684 ni;
+ enum br2684_payload payload;
pr_debug("br2684_create\n");
if (copy_from_user(&ni, arg, sizeof ni)) {
return -EFAULT;
}
+
+ if (ni.media & BR2684_FLAG_ROUTED)
+ payload = p_routed;
+ else
+ payload = p_bridged;
+ ni.media &= 0xffff; /* strip flags */
+
if (ni.media != BR2684_MEDIA_ETHERNET || ni.mtu != 1500) {
return -EINVAL;
}
netdev = alloc_netdev(sizeof(struct br2684_dev),
ni.ifname[0] ? ni.ifname : "nas%d",
- br2684_setup);
+ (payload == p_routed) ?
+ br2684_setup_routed : br2684_setup);
if (!netdev)
return -ENOMEM;
@@ -516,6 +599,7 @@ static int br2684_create(void __user *arg)
}
write_lock_irq(&devs_lock);
+ brdev->payload = payload;
brdev->number = list_empty(&br2684_devs) ? 1 :
BRPRIV(list_entry_brdev(br2684_devs.prev))->number + 1;
list_add_tail(&brdev->br2684_devs, &br2684_devs);
@@ -601,14 +685,14 @@ static int br2684_seq_show(struct seq_file *seq, void *v)
brdev->mac_was_set ? "set" : "auto");
list_for_each_entry(brvcc, &brdev->brvccs, brvccs) {
- seq_printf(seq, " vcc %d.%d.%d: encaps=%s"
- ", failed copies %u/%u"
- "\n", brvcc->atmvcc->dev->number,
- brvcc->atmvcc->vpi, brvcc->atmvcc->vci,
- (brvcc->encaps == e_llc) ? "LLC" : "VC"
- , brvcc->copies_failed
- , brvcc->copies_needed
- );
+ seq_printf(seq, " vcc %d.%d.%d: encaps=%s payload=%s"
+ ", failed copies %u/%u"
+ "\n", brvcc->atmvcc->dev->number,
+ brvcc->atmvcc->vpi, brvcc->atmvcc->vci,
+ (brvcc->encaps == e_llc) ? "LLC" : "VC",
+ (brdev->payload == p_bridged) ? "bridged" : "routed",
+ brvcc->copies_failed,
+ brvcc->copies_needed);
#ifdef CONFIG_ATM_BR2684_IPFILTER
#define b1(var, byte) ((u8 *) &brvcc->filter.var)[byte]
#define bs(var) b1(var, 0), b1(var, 1), b1(var, 2), b1(var, 3)
^ permalink raw reply related
* [PATCH net-2.6.25 3/7][ATM]: [he] This patch removes the ancient version string.
From: chas williams - CONTRACTOR @ 2007-12-30 1:06 UTC (permalink / raw)
To: netdev; +Cc: davem
commit 41d6b7e20dd6d57bec4f14697f654c8211ff5d57
Author: Adrian Bunk <bunk@stusta.de>
Date: Fri Oct 26 07:29:47 2007 -0400
[ATM]: [he] This patch removes the ancient version string.
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Chas Williams <chas@cmf.nrl.navy.mil>
diff --git a/drivers/atm/he.c b/drivers/atm/he.c
index d33aba6..4f1c7e6 100644
--- a/drivers/atm/he.c
+++ b/drivers/atm/he.c
@@ -1,5 +1,3 @@
-/* $Id: he.c,v 1.18 2003/05/06 22:57:15 chas Exp $ */
-
/*
he.c
@@ -99,10 +97,6 @@
#define HPRINTK(fmt,args...) do { } while (0)
#endif /* HE_DEBUG */
-/* version definition */
-
-static char *version = "$Id: he.c,v 1.18 2003/05/06 22:57:15 chas Exp $";
-
/* declarations */
static int he_open(struct atm_vcc *vcc);
@@ -366,7 +360,7 @@ he_init_one(struct pci_dev *pci_dev, const struct pci_device_id *pci_ent)
struct he_dev *he_dev = NULL;
int err = 0;
- printk(KERN_INFO "he: %s\n", version);
+ printk(KERN_INFO "ATM he driver\n");
if (pci_enable_device(pci_dev))
return -EIO;
@@ -2933,7 +2927,7 @@ he_proc_read(struct atm_dev *dev, loff_t *pos, char *page)
left = *pos;
if (!left--)
- return sprintf(page, "%s\n", version);
+ return sprintf(page, "ATM he driver\n");
if (!left--)
return sprintf(page, "%s%s\n\n",
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox