* [PATCH 10/12] mm: Micro-optimise slab to avoid a function call
From: Mel Gorman @ 2011-04-14 10:41 UTC (permalink / raw)
To: Linux-MM, Linux-Netdev; +Cc: LKML, Peter Zijlstra, Mel Gorman
In-Reply-To: <1302777698-28237-1-git-send-email-mgorman@suse.de>
Getting and putting objects in SLAB currently requires a function call
but the bulk of the work is related to PFMEMALLOC reserves which are
only consumed when network-backed storage is critical. Use an inline
function to determine if the function call is required.
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
mm/slab.c | 28 ++++++++++++++++++++++++++--
1 files changed, 26 insertions(+), 2 deletions(-)
diff --git a/mm/slab.c b/mm/slab.c
index 8f81d17..0e9980b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -116,6 +116,8 @@
#include <linux/kmemcheck.h>
#include <linux/memory.h>
+#include <net/sock.h>
+
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/page.h>
@@ -941,7 +943,7 @@ static void check_ac_pfmemalloc(struct kmem_cache *cachep,
ac->pfmemalloc = false;
}
-static void *ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
+static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
gfp_t flags, bool force_refill)
{
int i;
@@ -988,7 +990,20 @@ static void *ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
return objp;
}
-static void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
+static inline void *ac_get_obj(struct kmem_cache *cachep,
+ struct array_cache *ac, gfp_t flags, bool force_refill)
+{
+ void *objp;
+
+ if (unlikely(sk_memalloc_socks()))
+ objp = __ac_get_obj(cachep, ac, flags, force_refill);
+ else
+ objp = ac->entry[--ac->avail];
+
+ return objp;
+}
+
+static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
void *objp)
{
struct slab *slabp;
@@ -1001,6 +1016,15 @@ static void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
set_obj_pfmemalloc(&objp);
}
+ return objp;
+}
+
+static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
+ void *objp)
+{
+ if (unlikely(sk_memalloc_socks()))
+ objp = __ac_put_obj(cachep, ac, objp);
+
ac->entry[ac->avail++] = objp;
}
--
1.7.3.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related
* [PATCH 09/12] netvm: Set PF_MEMALLOC as appropriate during SKB processing
From: Mel Gorman @ 2011-04-14 10:41 UTC (permalink / raw)
To: Linux-MM, Linux-Netdev; +Cc: LKML, Peter Zijlstra, Mel Gorman
In-Reply-To: <1302777698-28237-1-git-send-email-mgorman@suse.de>
In order to make sure pfmemalloc packets receive all memory needed to proceed,
ensure processing of pfmemalloc SKBs happens under PF_MEMALLOC. This is
limited to a subset of protocols that are expected to be used for writing
to swap. Taps are not allowed to use PF_MEMALLOC as these are expected to
communicate with userspace processes which could be paged out.
[a.p.zijlstra@chello.nl: Ideas taken from various patches]
[jslaby@suse.cz: Lock imbalance fix]
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
include/net/sock.h | 5 +++++
net/core/dev.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++----
net/core/sock.c | 16 ++++++++++++++++
3 files changed, 69 insertions(+), 4 deletions(-)
diff --git a/include/net/sock.h b/include/net/sock.h
index 1d8a26b..3ea9c2d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -667,8 +667,13 @@ static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *s
return 0;
}
+extern int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb);
+
static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
+ if (skb_pfmemalloc(skb))
+ return __sk_backlog_rcv(sk, skb);
+
return sk->sk_backlog_rcv(sk, skb);
}
diff --git a/net/core/dev.c b/net/core/dev.c
index 6561021..6ab41f6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3012,6 +3012,27 @@ int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
}
EXPORT_SYMBOL(__skb_bond_should_drop);
+/*
+ * Limit which protocols can use the PFMEMALLOC reserves to those that are
+ * expected to be used for communication with swap.
+ */
+static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
+{
+ if (skb_pfmemalloc(skb))
+ switch (skb->protocol) {
+ case __constant_htons(ETH_P_ARP):
+ case __constant_htons(ETH_P_IP):
+ case __constant_htons(ETH_P_IPV6):
+ case __constant_htons(ETH_P_8021Q):
+ break;
+
+ default:
+ return false;
+ }
+
+ return true;
+}
+
static int __netif_receive_skb(struct sk_buff *skb)
{
struct packet_type *ptype, *pt_prev;
@@ -3022,15 +3043,28 @@ static int __netif_receive_skb(struct sk_buff *skb)
struct net_device *orig_or_bond;
int ret = NET_RX_DROP;
__be16 type;
+ unsigned long pflags = current->flags;
if (!netdev_tstamp_prequeue)
net_timestamp_check(skb);
trace_netif_receive_skb(skb);
+ /*
+ * PFMEMALLOC skbs are special, they should
+ * - be delivered to SOCK_MEMALLOC sockets only
+ * - stay away from userspace
+ * - have bounded memory usage
+ *
+ * Use PF_MEMALLOC as this saves us from propagating the allocation
+ * context down to all allocation sites.
+ */
+ if (skb_pfmemalloc(skb))
+ current->flags |= PF_MEMALLOC;
+
/* if we've gotten here through NAPI, check netpoll */
if (netpoll_receive_skb(skb))
- return NET_RX_DROP;
+ goto out;
if (!skb->skb_iif)
skb->skb_iif = skb->dev->ifindex;
@@ -3071,6 +3105,9 @@ static int __netif_receive_skb(struct sk_buff *skb)
}
#endif
+ if (skb_pfmemalloc(skb))
+ goto skip_taps;
+
list_for_each_entry_rcu(ptype, &ptype_all, list) {
if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
ptype->dev == orig_dev) {
@@ -3080,13 +3117,17 @@ static int __netif_receive_skb(struct sk_buff *skb)
}
}
+skip_taps:
#ifdef CONFIG_NET_CLS_ACT
skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
if (!skb)
- goto out;
+ goto unlock;
ncls:
#endif
+ if (!skb_pfmemalloc_protocol(skb))
+ goto drop;
+
/* Handle special case of bridge or macvlan */
rx_handler = rcu_dereference(skb->dev->rx_handler);
if (rx_handler) {
@@ -3096,7 +3137,7 @@ ncls:
}
skb = rx_handler(skb);
if (!skb)
- goto out;
+ goto unlock;
}
if (vlan_tx_tag_present(skb)) {
@@ -3138,6 +3179,7 @@ ncls:
if (pt_prev) {
ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
} else {
+drop:
atomic_long_inc(&skb->dev->rx_dropped);
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
@@ -3146,8 +3188,10 @@ ncls:
ret = NET_RX_DROP;
}
-out:
+unlock:
rcu_read_unlock();
+out:
+ tsk_restore_flags(current, pflags, PF_MEMALLOC);
return ret;
}
diff --git a/net/core/sock.c b/net/core/sock.c
index 7aac82b..eb38fbc 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -250,6 +250,22 @@ void sk_clear_memalloc(struct sock *sk)
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);
+int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ int ret;
+ unsigned long pflags = current->flags;
+
+ /* these should have been dropped before queueing */
+ BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
+
+ current->flags |= PF_MEMALLOC;
+ ret = sk->sk_backlog_rcv(sk, skb);
+ tsk_restore_flags(current, pflags, PF_MEMALLOC);
+
+ return ret;
+}
+EXPORT_SYMBOL(__sk_backlog_rcv);
+
#if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP)
int net_cls_subsys_id = -1;
EXPORT_SYMBOL_GPL(net_cls_subsys_id);
--
1.7.3.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related
* [PATCH 08/12] netvm: Allow skb allocation to use PFMEMALLOC reserves
From: Mel Gorman @ 2011-04-14 10:41 UTC (permalink / raw)
To: Linux-MM, Linux-Netdev; +Cc: LKML, Peter Zijlstra, Mel Gorman
In-Reply-To: <1302777698-28237-1-git-send-email-mgorman@suse.de>
Change the skb allocation API to indicate RX usage and use this to fall back
to the PFMEMALLOC reserve when needed. SKBs allocated from the reserve are
tagged in skb->pfmemalloc. If an SKB is allocated from the reserve and
the socket is later found to be unrelated to page reclaim, the packet is
dropped so that the memory remains available for page reclaim. Network
protocols are expected to recover from this packet loss.
[a.p.zijlstra@chello.nl: Ideas taken from various patches]
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
include/linux/gfp.h | 3 ++
include/linux/skbuff.h | 19 ++++++++--
include/net/sock.h | 6 +++
mm/internal.h | 3 --
net/core/filter.c | 8 ++++
net/core/skbuff.c | 95 ++++++++++++++++++++++++++++++++++++++++--------
net/core/sock.c | 9 +++++
7 files changed, 121 insertions(+), 22 deletions(-)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index d54eb02..ebf4d4f 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -374,6 +374,9 @@ void drain_local_pages(void *dummy);
extern gfp_t gfp_allowed_mask;
+/* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
+
extern void pm_restrict_gfp_mask(void);
extern void pm_restore_gfp_mask(void);
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index bf221d6..5cd4d23 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -393,6 +393,7 @@ struct sk_buff {
#else
__u8 deliver_no_wcard:1;
#endif
+ __u8 pfmemalloc:1;
__u8 ooo_okay:1;
kmemcheck_bitfield_end(flags2);
@@ -431,6 +432,15 @@ struct sk_buff {
#include <asm/system.h>
+#define SKB_ALLOC_FCLONE 0x01
+#define SKB_ALLOC_RX 0x02
+
+/* Returns true if the skb was allocated from PFMEMALLOC reserves */
+static inline bool skb_pfmemalloc(struct sk_buff *skb)
+{
+ return unlikely(skb->pfmemalloc);
+}
+
/*
* skb might have a dst pointer attached, refcounted or not.
* _skb_refdst low order bit is set if refcount was _not_ taken
@@ -488,7 +498,7 @@ extern void kfree_skb(struct sk_buff *skb);
extern void consume_skb(struct sk_buff *skb);
extern void __kfree_skb(struct sk_buff *skb);
extern struct sk_buff *__alloc_skb(unsigned int size,
- gfp_t priority, int fclone, int node);
+ gfp_t priority, int flags, int node);
static inline struct sk_buff *alloc_skb(unsigned int size,
gfp_t priority)
{
@@ -498,7 +508,7 @@ static inline struct sk_buff *alloc_skb(unsigned int size,
static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
gfp_t priority)
{
- return __alloc_skb(size, priority, 1, NUMA_NO_NODE);
+ return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE);
}
extern bool skb_recycle_check(struct sk_buff *skb, int skb_size);
@@ -1524,7 +1534,8 @@ static inline void __skb_queue_purge(struct sk_buff_head *list)
static inline struct sk_buff *__dev_alloc_skb(unsigned int length,
gfp_t gfp_mask)
{
- struct sk_buff *skb = alloc_skb(length + NET_SKB_PAD, gfp_mask);
+ struct sk_buff *skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask,
+ SKB_ALLOC_RX, NUMA_NO_NODE);
if (likely(skb))
skb_reserve(skb, NET_SKB_PAD);
return skb;
@@ -1575,7 +1586,7 @@ static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev,
*/
static inline struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask)
{
- return alloc_pages_node(NUMA_NO_NODE, gfp_mask, 0);
+ return alloc_pages_node(NUMA_NO_NODE, gfp_mask | __GFP_MEMALLOC, 0);
}
/**
diff --git a/include/net/sock.h b/include/net/sock.h
index e6ead99..1d8a26b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -585,6 +585,12 @@ static inline int sock_flag(struct sock *sk, enum sock_flags flag)
return test_bit(flag, &sk->sk_flags);
}
+extern int memalloc_socks;
+static inline int sk_memalloc_socks(void)
+{
+ return memalloc_socks;
+}
+
static inline gfp_t sk_allocation(struct sock *sk, gfp_t gfp_mask)
{
return gfp_mask | (sk->sk_allocation & __GFP_MEMALLOC);
diff --git a/mm/internal.h b/mm/internal.h
index 110c9a2..6948820 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -189,9 +189,6 @@ static inline struct page *mem_map_next(struct page *iter,
#define __paginginit __init
#endif
-/* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */
-bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
-
/* Memory initialisation debug and verification */
enum mminit_level {
MMINIT_WARNING,
diff --git a/net/core/filter.c b/net/core/filter.c
index afc5837..5223b48 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -138,6 +138,14 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
int err;
struct sk_filter *filter;
+ /*
+ * If the skb was allocated from pfmemalloc reserves, only
+ * allow SOCK_MEMALLOC sockets to use it as this socket is
+ * helping free memory
+ */
+ if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
+ return -ENOMEM;
+
err = security_sock_rcv_skb(sk, skb);
if (err)
return err;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d883dcc..f3702ae 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -146,6 +146,43 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
BUG();
}
+
+/*
+ * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
+ * the caller if emergency pfmemalloc reserves are being used. If it is and
+ * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
+ * may be used. Otherwise, the packet data may be discarded until enough
+ * memory is free
+ */
+#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
+ __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)
+void *__kmalloc_reserve(size_t size, gfp_t flags, int node, unsigned long ip,
+ bool *pfmemalloc)
+{
+ void *obj;
+ bool ret_pfmemalloc = false;
+
+ /*
+ * Try a regular allocation, when that fails and we're not entitled
+ * to the reserves, fail.
+ */
+ obj = kmalloc_node_track_caller(size,
+ flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
+ node);
+ if (obj || !(gfp_pfmemalloc_allowed(flags)))
+ goto out;
+
+ /* Try again but now we are using pfmemalloc reserves */
+ ret_pfmemalloc = true;
+ obj = kmalloc_node_track_caller(size, flags, node);
+
+out:
+ if (pfmemalloc)
+ *pfmemalloc = ret_pfmemalloc;
+
+ return obj;
+}
+
/* Allocate a new skbuff. We do this ourselves so we can fill in a few
* 'private' fields and also do memory statistics to find all the
* [BEEP] leaks.
@@ -156,8 +193,10 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
* __alloc_skb - allocate a network buffer
* @size: size to allocate
* @gfp_mask: allocation mask
- * @fclone: allocate from fclone cache instead of head cache
- * and allocate a cloned (child) skb
+ * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
+ * instead of head cache and allocate a cloned (child) skb.
+ * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
+ * allocations in case the data is required for writeback
* @node: numa node to allocate memory on
*
* Allocate a new &sk_buff. The returned buffer has no headroom and a
@@ -168,14 +207,19 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
* %GFP_ATOMIC.
*/
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
- int fclone, int node)
+ int flags, int node)
{
struct kmem_cache *cache;
struct skb_shared_info *shinfo;
struct sk_buff *skb;
u8 *data;
+ bool pfmemalloc;
+
+ cache = (flags & SKB_ALLOC_FCLONE)
+ ? skbuff_fclone_cache : skbuff_head_cache;
- cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
+ if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
+ gfp_mask |= __GFP_MEMALLOC;
/* Get the HEAD */
skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
@@ -184,8 +228,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
prefetchw(skb);
size = SKB_DATA_ALIGN(size);
- data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
- gfp_mask, node);
+ data = kmalloc_reserve(size + sizeof(struct skb_shared_info),
+ gfp_mask, node, &pfmemalloc);
if (!data)
goto nodata;
prefetchw(data + size);
@@ -196,6 +240,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
* the tail pointer in struct sk_buff!
*/
memset(skb, 0, offsetof(struct sk_buff, tail));
+ skb->pfmemalloc = pfmemalloc;
skb->truesize = size + sizeof(struct sk_buff);
atomic_set(&skb->users, 1);
skb->head = data;
@@ -212,7 +257,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
atomic_set(&shinfo->dataref, 1);
kmemcheck_annotate_variable(shinfo->destructor_arg);
- if (fclone) {
+ if (flags & SKB_ALLOC_FCLONE) {
struct sk_buff *child = skb + 1;
atomic_t *fclone_ref = (atomic_t *) (child + 1);
@@ -222,6 +267,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
atomic_set(fclone_ref, 1);
child->fclone = SKB_FCLONE_UNAVAILABLE;
+ child->pfmemalloc = pfmemalloc;
}
out:
return skb;
@@ -250,7 +296,8 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
{
struct sk_buff *skb;
- skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE);
+ skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask,
+ SKB_ALLOC_RX, NUMA_NO_NODE);
if (likely(skb)) {
skb_reserve(skb, NET_SKB_PAD);
skb->dev = dev;
@@ -527,6 +574,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
new->ipvs_property = old->ipvs_property;
#endif
+ new->pfmemalloc = old->pfmemalloc;
new->protocol = old->protocol;
new->mark = old->mark;
new->skb_iif = old->skb_iif;
@@ -621,6 +669,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
n->fclone = SKB_FCLONE_CLONE;
atomic_inc(fclone_ref);
} else {
+ if (skb_pfmemalloc(skb))
+ gfp_mask |= __GFP_MEMALLOC;
+
n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
if (!n)
return NULL;
@@ -657,6 +708,13 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
}
+static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
+{
+ if (skb_pfmemalloc((struct sk_buff *)skb))
+ return SKB_ALLOC_RX;
+ return 0;
+}
+
/**
* skb_copy - create private copy of an sk_buff
* @skb: buffer to copy
@@ -678,7 +736,8 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
{
int headerlen = skb_headroom(skb);
unsigned int size = (skb_end_pointer(skb) - skb->head) + skb->data_len;
- struct sk_buff *n = alloc_skb(size, gfp_mask);
+ struct sk_buff *n = __alloc_skb(size, gfp_mask,
+ skb_alloc_rx_flag(skb), NUMA_NO_NODE);
if (!n)
return NULL;
@@ -712,7 +771,8 @@ EXPORT_SYMBOL(skb_copy);
struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
{
unsigned int size = skb_end_pointer(skb) - skb->head;
- struct sk_buff *n = alloc_skb(size, gfp_mask);
+ struct sk_buff *n = __alloc_skb(size, gfp_mask,
+ skb_alloc_rx_flag(skb), NUMA_NO_NODE);
if (!n)
goto out;
@@ -803,7 +863,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
goto adjust_others;
}
- data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
+ if (skb_pfmemalloc(skb))
+ gfp_mask |= __GFP_MEMALLOC;
+ data = kmalloc_reserve(size + sizeof(struct skb_shared_info), gfp_mask,
+ NUMA_NO_NODE, NULL);
if (!data)
goto nodata;
@@ -904,8 +967,9 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
/*
* Allocate the copy buffer
*/
- struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom,
- gfp_mask);
+ struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
+ gfp_mask, skb_alloc_rx_flag(skb),
+ NUMA_NO_NODE);
int oldheadroom = skb_headroom(skb);
int head_copy_len, head_copy_off;
int off;
@@ -2555,8 +2619,9 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features)
skb_release_head_state(nskb);
__skb_push(nskb, doffset);
} else {
- nskb = alloc_skb(hsize + doffset + headroom,
- GFP_ATOMIC);
+ nskb = __alloc_skb(hsize + doffset + headroom,
+ GFP_ATOMIC, skb_alloc_rx_flag(skb),
+ NUMA_NO_NODE);
if (unlikely(!nskb))
goto err;
diff --git a/net/core/sock.c b/net/core/sock.c
index 1deb48b..7aac82b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -219,6 +219,9 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);
+static DEFINE_MUTEX(memalloc_socks_lock);
+int memalloc_socks __read_mostly;
+
/**
* sk_set_memalloc - sets %SOCK_MEMALLOC
* @sk: socket to set it on
@@ -231,6 +234,9 @@ void sk_set_memalloc(struct sock *sk)
{
sock_set_flag(sk, SOCK_MEMALLOC);
sk->sk_allocation |= __GFP_MEMALLOC;
+ mutex_lock(&memalloc_socks_lock);
+ memalloc_socks++;
+ mutex_unlock(&memalloc_socks_lock);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);
@@ -238,6 +244,9 @@ void sk_clear_memalloc(struct sock *sk)
{
sock_reset_flag(sk, SOCK_MEMALLOC);
sk->sk_allocation &= ~__GFP_MEMALLOC;
+ mutex_lock(&memalloc_socks_lock);
+ memalloc_socks--;
+ mutex_unlock(&memalloc_socks_lock);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);
--
1.7.3.4
^ permalink raw reply related
* [PATCH 07/12] netvm: Allow the use of __GFP_MEMALLOC by specific sockets
From: Mel Gorman @ 2011-04-14 10:41 UTC (permalink / raw)
To: Linux-MM, Linux-Netdev; +Cc: LKML, Peter Zijlstra, Mel Gorman
In-Reply-To: <1302777698-28237-1-git-send-email-mgorman@suse.de>
Allow specific sockets to be tagged SOCK_MEMALLOC and use __GFP_MEMALLOC
for their allocations. These sockets will be able to go below watermarks
and allocate from the emergency reserve. Such sockets are to be used
to service the VM (in other words, to swap over). They must be handled
kernel-side; exposing such a socket to user-space is a bug.
There is a risk that the reserves will be depleted so, for now, the administrator is
responsible for increasing min_free_kbytes as necessary to prevent deadlock
for their workloads.
[a.p.zijlstra@chello.nl: Original patches]
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
include/net/sock.h | 5 ++++-
net/core/sock.c | 22 ++++++++++++++++++++++
2 files changed, 26 insertions(+), 1 deletions(-)
diff --git a/include/net/sock.h b/include/net/sock.h
index f687e0b..e6ead99 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -553,6 +553,7 @@ enum sock_flags {
SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */
SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */
+ SOCK_MEMALLOC, /* VM depends on this socket for swapping */
SOCK_TIMESTAMPING_TX_HARDWARE, /* %SOF_TIMESTAMPING_TX_HARDWARE */
SOCK_TIMESTAMPING_TX_SOFTWARE, /* %SOF_TIMESTAMPING_TX_SOFTWARE */
SOCK_TIMESTAMPING_RX_HARDWARE, /* %SOF_TIMESTAMPING_RX_HARDWARE */
@@ -586,7 +587,7 @@ static inline int sock_flag(struct sock *sk, enum sock_flags flag)
static inline gfp_t sk_allocation(struct sock *sk, gfp_t gfp_mask)
{
- return gfp_mask;
+ return gfp_mask | (sk->sk_allocation & __GFP_MEMALLOC);
}
static inline void sk_acceptq_removed(struct sock *sk)
@@ -716,6 +717,8 @@ extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p);
extern void sk_stream_wait_close(struct sock *sk, long timeo_p);
extern int sk_stream_error(struct sock *sk, int flags, int err);
extern void sk_stream_kill_queues(struct sock *sk);
+extern void sk_set_memalloc(struct sock *sk);
+extern void sk_clear_memalloc(struct sock *sk);
extern int sk_wait_data(struct sock *sk, long *timeo);
diff --git a/net/core/sock.c b/net/core/sock.c
index 7dfed79..1deb48b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -219,6 +219,28 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);
+/**
+ * sk_set_memalloc - sets %SOCK_MEMALLOC
+ * @sk: socket to set it on
+ *
+ * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
+ * It's the responsibility of the admin to adjust min_free_kbytes
+ * to meet the requirements
+ */
+void sk_set_memalloc(struct sock *sk)
+{
+ sock_set_flag(sk, SOCK_MEMALLOC);
+ sk->sk_allocation |= __GFP_MEMALLOC;
+}
+EXPORT_SYMBOL_GPL(sk_set_memalloc);
+
+void sk_clear_memalloc(struct sock *sk)
+{
+ sock_reset_flag(sk, SOCK_MEMALLOC);
+ sk->sk_allocation &= ~__GFP_MEMALLOC;
+}
+EXPORT_SYMBOL_GPL(sk_clear_memalloc);
+
#if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP)
int net_cls_subsys_id = -1;
EXPORT_SYMBOL_GPL(net_cls_subsys_id);
--
1.7.3.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related
* [PATCH 06/12] net: Introduce sk_allocation() to allow addition of GFP flags depending on the individual socket
From: Mel Gorman @ 2011-04-14 10:41 UTC (permalink / raw)
To: Linux-MM, Linux-Netdev; +Cc: LKML, Peter Zijlstra, Mel Gorman
In-Reply-To: <1302777698-28237-1-git-send-email-mgorman@suse.de>
Introduce sk_allocation(). This function allows sock-specific flags to be
injected into each sock-related allocation. It is only used on allocation
paths that may be required for writing pages back to network storage.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
include/net/sock.h | 5 +++++
net/ipv4/tcp.c | 3 ++-
net/ipv4/tcp_output.c | 13 +++++++------
net/ipv6/tcp_ipv6.c | 12 +++++++++---
4 files changed, 23 insertions(+), 10 deletions(-)
diff --git a/include/net/sock.h b/include/net/sock.h
index bc1cf7d8..f687e0b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -584,6 +584,11 @@ static inline int sock_flag(struct sock *sk, enum sock_flags flag)
return test_bit(flag, &sk->sk_flags);
}
+static inline gfp_t sk_allocation(struct sock *sk, gfp_t gfp_mask)
+{
+ return gfp_mask;
+}
+
static inline void sk_acceptq_removed(struct sock *sk)
{
sk->sk_ack_backlog--;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6c11eec..8939804 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -689,7 +689,8 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
/* The TCP header must be at least 32-bit aligned. */
size = ALIGN(size, 4);
- skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
+ skb = alloc_skb_fclone(size + sk->sk_prot->max_header,
+ sk_allocation(sk, gfp));
if (skb) {
if (sk_wmem_schedule(sk, skb->truesize)) {
/*
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index dfa5beb..550c701 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2323,7 +2323,7 @@ void tcp_send_fin(struct sock *sk)
/* Socket is locked, keep trying until memory is available. */
for (;;) {
skb = alloc_skb_fclone(MAX_TCP_HEADER,
- sk->sk_allocation);
+ sk_allocation(sk, GFP_KERNEL));
if (skb)
break;
yield();
@@ -2349,7 +2349,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
struct sk_buff *skb;
/* NOTE: No TCP options attached and we never retransmit this. */
- skb = alloc_skb(MAX_TCP_HEADER, priority);
+ skb = alloc_skb(MAX_TCP_HEADER, sk_allocation(sk, priority));
if (!skb) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
return;
@@ -2422,7 +2422,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
s_data_desired = cvp->s_data_desired;
- skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC);
+ skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1,
+ sk_allocation(sk, GFP_ATOMIC));
if (skb == NULL)
return NULL;
@@ -2718,7 +2719,7 @@ void tcp_send_ack(struct sock *sk)
* tcp_transmit_skb() will set the ownership to this
* sock.
*/
- buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
+ buff = alloc_skb(MAX_TCP_HEADER, sk_allocation(sk, GFP_ATOMIC));
if (buff == NULL) {
inet_csk_schedule_ack(sk);
inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
@@ -2733,7 +2734,7 @@ void tcp_send_ack(struct sock *sk)
/* Send it off, this clears delayed acks for us. */
TCP_SKB_CB(buff)->when = tcp_time_stamp;
- tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
+ tcp_transmit_skb(sk, buff, 0, sk_allocation(sk, GFP_ATOMIC));
}
/* This routine sends a packet with an out of date sequence
@@ -2753,7 +2754,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
struct sk_buff *skb;
/* We don't queue it, tcp_transmit_skb() sets ownership. */
- skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
+ skb = alloc_skb(MAX_TCP_HEADER, sk_allocation(sk, GFP_ATOMIC));
if (skb == NULL)
return -1;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 20aa95e..3a555de 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -610,7 +610,8 @@ static int tcp_v6_md5_do_add(struct sock *sk, struct in6_addr *peer,
} else {
/* reallocate new list if current one is full. */
if (!tp->md5sig_info) {
- tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info), GFP_ATOMIC);
+ tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
+ sk_allocation(sk, GFP_ATOMIC));
if (!tp->md5sig_info) {
kfree(newkey);
return -ENOMEM;
@@ -623,7 +624,8 @@ static int tcp_v6_md5_do_add(struct sock *sk, struct in6_addr *peer,
}
if (tp->md5sig_info->alloced6 == tp->md5sig_info->entries6) {
keys = kmalloc((sizeof (tp->md5sig_info->keys6[0]) *
- (tp->md5sig_info->entries6 + 1)), GFP_ATOMIC);
+ (tp->md5sig_info->entries6 + 1)),
+ sk_allocation(sk, GFP_ATOMIC));
if (!keys) {
tcp_free_md5sig_pool();
@@ -747,7 +749,8 @@ static int tcp_v6_parse_md5_keys (struct sock *sk, char __user *optval,
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_info *p;
- p = kzalloc(sizeof(struct tcp_md5sig_info), GFP_KERNEL);
+ p = kzalloc(sizeof(struct tcp_md5sig_info),
+ sk_allocation(sk, GFP_KERNEL));
if (!p)
return -ENOMEM;
@@ -1098,6 +1101,7 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
struct tcphdr *th = tcp_hdr(skb);
u32 seq = 0, ack_seq = 0;
struct tcp_md5sig_key *key = NULL;
+ gfp_t gfp_mask = GFP_ATOMIC;
if (th->rst)
return;
@@ -1109,6 +1113,8 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
if (sk)
key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr);
#endif
+ if (sk)
+ gfp_mask = sk_allocation(sk, gfp_mask);
if (th->ack)
seq = ntohl(th->ack_seq);
--
1.7.3.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related
* [PATCH 05/12] mm: Ignore mempolicies when using ALLOC_NO_WATERMARK
From: Mel Gorman @ 2011-04-14 10:41 UTC (permalink / raw)
To: Linux-MM, Linux-Netdev; +Cc: LKML, Peter Zijlstra, Mel Gorman
In-Reply-To: <1302777698-28237-1-git-send-email-mgorman@suse.de>
The reserve is proportionally distributed over all !highmem zones in the
system. So we need to allow an emergency allocation access to all zones.
In order to do that we need to break out of any mempolicy boundaries we
might have.
In my opinion that does not break mempolicies as those are user oriented
and not system oriented. That is, system allocations are not guaranteed to
be within mempolicy boundaries. For instance IRQs don't even have a mempolicy.
So breaking out of mempolicy boundaries for 'rare' emergency allocations,
which are always system allocations (as opposed to user) is ok.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
mm/page_alloc.c | 7 +++++++
1 files changed, 7 insertions(+), 0 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8318cf2..2b87dfd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2064,6 +2064,13 @@ restart:
rebalance:
/* Allocate without watermarks if the context allows */
if (alloc_flags & ALLOC_NO_WATERMARKS) {
+ /*
+ * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
+ * the allocation is high priority and these type of
+ * allocations are system rather than user orientated
+ */
+ zonelist = node_zonelist(numa_node_id(), gfp_mask);
+
page = __alloc_pages_high_priority(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
--
1.7.3.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related
* [PATCH 04/12] mm: allow PF_MEMALLOC from softirq context
From: Mel Gorman @ 2011-04-14 10:41 UTC (permalink / raw)
To: Linux-MM, Linux-Netdev; +Cc: LKML, Peter Zijlstra, Mel Gorman
In-Reply-To: <1302777698-28237-1-git-send-email-mgorman@suse.de>
This is needed to allow network softirq packet processing to make use
of PF_MEMALLOC.
Currently softirq context cannot use PF_MEMALLOC because it is not
associated with a task and therefore has no task flags to fiddle with -
thus the gfp to alloc flag mapping ignores the task flags when in interrupt
(hard or soft) context.
Allowing softirqs to make use of PF_MEMALLOC therefore requires some trickery.
We basically borrow the task flags from whatever process happens to be
preempted by the softirq.
So we modify the gfp to alloc flags mapping to not exclude task flags in
softirq context, and modify the softirq code to save, clear and restore
the PF_MEMALLOC flag.
The save and clear ensure the preempted task's PF_MEMALLOC flag doesn't
leak into the softirq. The restore ensures a softirq's PF_MEMALLOC flag
cannot leak back into the preempted process.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
include/linux/sched.h | 7 +++++++
kernel/softirq.c | 3 +++
mm/page_alloc.c | 5 ++++-
3 files changed, 14 insertions(+), 1 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 777d8a5..b753de6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1799,6 +1799,13 @@ static inline void rcu_copy_process(struct task_struct *p)
#endif
+static inline void tsk_restore_flags(struct task_struct *p,
+ unsigned long pflags, unsigned long mask)
+{
+ p->flags &= ~mask;
+ p->flags |= pflags & mask;
+}
+
#ifdef CONFIG_SMP
extern int set_cpus_allowed_ptr(struct task_struct *p,
const struct cpumask *new_mask);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 68eb5ef..cfa9ba4 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -210,6 +210,8 @@ asmlinkage void __do_softirq(void)
__u32 pending;
int max_restart = MAX_SOFTIRQ_RESTART;
int cpu;
+ unsigned long pflags = current->flags;
+ current->flags &= ~PF_MEMALLOC;
pending = local_softirq_pending();
account_system_vtime(current);
@@ -265,6 +267,7 @@ restart:
account_system_vtime(current);
__local_bh_enable(SOFTIRQ_OFFSET);
+ tsk_restore_flags(current, pflags, PF_MEMALLOC);
}
#ifndef __ARCH_HAS_DO_SOFTIRQ
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2f897c3..8318cf2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1984,7 +1984,10 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
if (gfp_mask & __GFP_MEMALLOC)
alloc_flags |= ALLOC_NO_WATERMARKS;
- else if (likely(!(gfp_mask & __GFP_NOMEMALLOC)) && !in_interrupt())
+ else if (!in_irq() && (current->flags & PF_MEMALLOC))
+ alloc_flags |= ALLOC_NO_WATERMARKS;
+ else if (!in_interrupt() &&
+ unlikely(test_thread_flag(TIF_MEMDIE)))
alloc_flags |= ALLOC_NO_WATERMARKS;
}
--
1.7.3.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related
* [PATCH 03/12] mm: Introduce __GFP_MEMALLOC to allow access to emergency reserves
From: Mel Gorman @ 2011-04-14 10:41 UTC (permalink / raw)
To: Linux-MM, Linux-Netdev; +Cc: LKML, Peter Zijlstra, Mel Gorman
In-Reply-To: <1302777698-28237-1-git-send-email-mgorman@suse.de>
__GFP_MEMALLOC will allow the allocation to disregard the watermarks,
much like PF_MEMALLOC. It allows one to pass along the memalloc state in
object related allocation flags as opposed to task related flags, such
as sk->sk_allocation. This removes the need for ALLOC_PFMEMALLOC as
callers using __GFP_MEMALLOC can get the ALLOC_NO_WATERMARKS flag which
is now enough to identify allocations related to page reclaim.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
include/linux/gfp.h | 4 +++-
include/linux/mm_types.h | 2 +-
mm/page_alloc.c | 14 ++++++--------
mm/slab.c | 2 +-
4 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index dca3176..d54eb02 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -23,6 +23,7 @@ struct vm_area_struct;
#define ___GFP_REPEAT 0x400u
#define ___GFP_NOFAIL 0x800u
#define ___GFP_NORETRY 0x1000u
+#define ___GFP_MEMALLOC 0x2000u
#define ___GFP_COMP 0x4000u
#define ___GFP_ZERO 0x8000u
#define ___GFP_NOMEMALLOC 0x10000u
@@ -74,6 +75,7 @@ struct vm_area_struct;
#define __GFP_REPEAT ((__force gfp_t)___GFP_REPEAT) /* See above */
#define __GFP_NOFAIL ((__force gfp_t)___GFP_NOFAIL) /* See above */
#define __GFP_NORETRY ((__force gfp_t)___GFP_NORETRY) /* See above */
+#define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC)/* Allow access to emergency reserves */
#define __GFP_COMP ((__force gfp_t)___GFP_COMP) /* Add compound page metadata */
#define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) /* Return zeroed page on success */
#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /* Don't use emergency reserves */
@@ -125,7 +127,7 @@ struct vm_area_struct;
/* Control page allocator reclaim behavior */
#define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\
__GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\
- __GFP_NORETRY|__GFP_NOMEMALLOC)
+ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC)
/* Control slab gfp mask during early boot */
#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS))
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 1a5e14b..d166b21 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -72,7 +72,7 @@ struct page {
pgoff_t index; /* Our offset within mapping. */
void *freelist; /* SLUB: freelist req. slab lock */
bool pfmemalloc; /* If set by the page allocator,
- * ALLOC_PFMEMALLOC was set and the
+ * ALLOC_NO_WATERMARKS was set and the
* low watermark was not met implying
* that the system is under some
* pressure. The caller should try
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fb34549..2f897c3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1360,7 +1360,6 @@ failed:
#define ALLOC_HARDER 0x10 /* try to alloc harder */
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
-#define ALLOC_PFMEMALLOC 0x80 /* Caller has PF_MEMALLOC set */
#ifdef CONFIG_FAIL_PAGE_ALLOC
@@ -1982,11 +1981,10 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
} else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
- if ((current->flags & PF_MEMALLOC) ||
- unlikely(test_thread_flag(TIF_MEMDIE))) {
- alloc_flags |= ALLOC_PFMEMALLOC;
-
- if (likely(!(gfp_mask & __GFP_NOMEMALLOC)) && !in_interrupt())
+ if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
+ if (gfp_mask & __GFP_MEMALLOC)
+ alloc_flags |= ALLOC_NO_WATERMARKS;
+ else if (likely(!(gfp_mask & __GFP_NOMEMALLOC)) && !in_interrupt())
alloc_flags |= ALLOC_NO_WATERMARKS;
}
@@ -1995,7 +1993,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
{
- return gfp_to_alloc_flags(gfp_mask) & ALLOC_PFMEMALLOC;
+ return gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS;
}
static inline struct page *
@@ -2183,7 +2181,7 @@ got_pg:
* steps that will free more memory. The caller should avoid the
* page being used for !PFMEMALLOC purposes.
*/
- page->pfmemalloc = (alloc_flags & ALLOC_PFMEMALLOC);
+ page->pfmemalloc = (alloc_flags & ALLOC_NO_WATERMARKS);
return page;
}
diff --git a/mm/slab.c b/mm/slab.c
index 953e6263..8f81d17 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2977,7 +2977,7 @@ static int cache_grow(struct kmem_cache *cachep,
if (!slabp)
goto opps1;
- /* Record if ALLOC_PFMEMALLOC was set when allocating the slab */
+ /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
if (pfmemalloc) {
struct array_cache *ac = cpu_cache_get(cachep);
slabp->pfmemalloc = true;
--
1.7.3.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related
* [PATCH 02/12] mm: sl[au]b: Add knowledge of PFMEMALLOC reserve pages
From: Mel Gorman @ 2011-04-14 10:41 UTC (permalink / raw)
To: Linux-MM, Linux-Netdev; +Cc: LKML, Peter Zijlstra, Mel Gorman
In-Reply-To: <1302777698-28237-1-git-send-email-mgorman@suse.de>
Allocations of pages below the min watermark run a risk of the machine
hanging due to lack of memory. To prevent this, only callers who
have PF_MEMALLOC or TIF_MEMDIE set and are not processing an interrupt are
allowed to allocate with ALLOC_NO_WATERMARKS. Once such pages are allocated
to a slab though, nothing prevents other callers consuming free objects
within those slabs. This patch limits access to slab pages that were
allocated from the PFMEMALLOC reserves.
Pages allocated from the reserve are returned with page->pfmemalloc
set and it's up to the caller to determine how the page should be
protected. SLAB restricts access to any page with page->pfmemalloc set
to callers which are known to be able to access the PFMEMALLOC reserve. If
one is not available, an attempt is made to allocate a new page rather
than use a reserve. SLUB is a bit more relaxed in that it only records
if the current per-CPU page was allocated from PFMEMALLOC reserve and
uses another partial slab if the caller does not have the necessary
GFP or process flags. This was found to be sufficient in tests to
avoid hangs due to SLUB generally maintaining smaller lists than SLAB.
In low-memory conditions it does mean that !PFMEMALLOC allocators
can fail a slab allocation even though free objects are available
because they are being preserved for callers that are freeing pages.
[a.p.zijlstra@chello.nl: Original implementation]
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
include/linux/mm_types.h | 8 ++
include/linux/slub_def.h | 1 +
mm/internal.h | 3 +
mm/page_alloc.c | 27 +++++-
mm/slab.c | 216 +++++++++++++++++++++++++++++++++++++++-------
mm/slub.c | 35 ++++++--
6 files changed, 246 insertions(+), 44 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 26bc4e2..1a5e14b 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -71,6 +71,14 @@ struct page {
union {
pgoff_t index; /* Our offset within mapping. */
void *freelist; /* SLUB: freelist req. slab lock */
+ bool pfmemalloc; /* If set by the page allocator,
+ * ALLOC_PFMEMALLOC was set and the
+ * low watermark was not met implying
+ * that the system is under some
+ * pressure. The caller should try
+ * ensure this page is only used to
+ * free other pages.
+ */
};
struct list_head lru; /* Pageout list, eg. active_list
* protected by zone->lru_lock !
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 8b6e8ae..f6cdbce 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -38,6 +38,7 @@ struct kmem_cache_cpu {
void **freelist; /* Pointer to first free per cpu object */
struct page *page; /* The slab from which we are allocating */
int node; /* The node of the page (or -1 for debug) */
+ bool pfmemalloc; /* Slab page had pfmemalloc set */
#ifdef CONFIG_SLUB_STATS
unsigned stat[NR_SLUB_STAT_ITEMS];
#endif
diff --git a/mm/internal.h b/mm/internal.h
index 6948820..110c9a2 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -189,6 +189,9 @@ static inline struct page *mem_map_next(struct page *iter,
#define __paginginit __init
#endif
+/* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
+
/* Memory initialisation debug and verification */
enum mminit_level {
MMINIT_WARNING,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 93afea3..fb34549 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -647,6 +647,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
trace_mm_page_free_direct(page, order);
kmemcheck_free_shadow(page, order);
+ page->pfmemalloc = false;
if (PageAnon(page))
page->mapping = NULL;
for (i = 0; i < (1 << order); i++)
@@ -1165,6 +1166,7 @@ void free_hot_cold_page(struct page *page, int cold)
migratetype = get_pageblock_migratetype(page);
set_page_private(page, migratetype);
+ page->pfmemalloc = false;
local_irq_save(flags);
if (unlikely(wasMlocked))
free_page_mlock(page);
@@ -1358,6 +1360,7 @@ failed:
#define ALLOC_HARDER 0x10 /* try to alloc harder */
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
+#define ALLOC_PFMEMALLOC 0x80 /* Caller has PF_MEMALLOC set */
#ifdef CONFIG_FAIL_PAGE_ALLOC
@@ -1979,16 +1982,22 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
} else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
- if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
- if (!in_interrupt() &&
- ((current->flags & PF_MEMALLOC) ||
- unlikely(test_thread_flag(TIF_MEMDIE))))
+ if ((current->flags & PF_MEMALLOC) ||
+ unlikely(test_thread_flag(TIF_MEMDIE))) {
+ alloc_flags |= ALLOC_PFMEMALLOC;
+
+ if (likely(!(gfp_mask & __GFP_NOMEMALLOC)) && !in_interrupt())
alloc_flags |= ALLOC_NO_WATERMARKS;
}
return alloc_flags;
}
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+{
+ return gfp_to_alloc_flags(gfp_mask) & ALLOC_PFMEMALLOC;
+}
+
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
@@ -2167,8 +2176,16 @@ nopage:
got_pg:
if (kmemcheck_enabled)
kmemcheck_pagealloc_alloc(page, order, gfp_mask);
- return page;
+ /*
+ * page->pfmemalloc is set when the caller had PFMEMALLOC set or is
+ * been OOM killed. The expectation is that the caller is taking
+ * steps that will free more memory. The caller should avoid the
+ * page being used for !PFMEMALLOC purposes.
+ */
+ page->pfmemalloc = (alloc_flags & ALLOC_PFMEMALLOC);
+
+ return page;
}
/*
diff --git a/mm/slab.c b/mm/slab.c
index 37961d1f..953e6263 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -120,6 +120,8 @@
#include <asm/tlbflush.h>
#include <asm/page.h>
+#include "internal.h"
+
/*
* DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
* 0 for faster, smaller code (especially in the critical paths).
@@ -204,6 +206,7 @@ struct slab {
unsigned int inuse; /* num of objs active in slab */
kmem_bufctl_t free;
unsigned short nodeid;
+ bool pfmemalloc; /* Slab page had pfmemalloc set */
};
/*
@@ -244,15 +247,37 @@ struct array_cache {
unsigned int avail;
unsigned int limit;
unsigned int batchcount;
- unsigned int touched;
+ bool touched;
+ bool pfmemalloc;
spinlock_t lock;
void *entry[]; /*
* Must have this definition in here for the proper
* alignment of array_cache. Also simplifies accessing
* the entries.
+ *
+ * Entries should not be directly dereferenced as
+ * entries belonging to slabs marked pfmemalloc will
+ * have the lower bits set SLAB_OBJ_PFMEMALLOC
*/
};
+#define SLAB_OBJ_PFMEMALLOC 1
+static inline bool is_obj_pfmemalloc(void *objp)
+{
+ return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
+}
+
+static inline void set_obj_pfmemalloc(void **objp)
+{
+ *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
+ return;
+}
+
+static inline void clear_obj_pfmemalloc(void **objp)
+{
+ *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
+}
+
/*
* bootstrap: The caches do not work without cpuarrays anymore, but the
* cpuarrays are allocated from the generic caches...
@@ -885,12 +910,100 @@ static struct array_cache *alloc_arraycache(int node, int entries,
nc->avail = 0;
nc->limit = entries;
nc->batchcount = batchcount;
- nc->touched = 0;
+ nc->touched = false;
spin_lock_init(&nc->lock);
}
return nc;
}
+/* Clears ac->pfmemalloc if no slabs have pfmalloc set */
+static void check_ac_pfmemalloc(struct kmem_cache *cachep,
+ struct array_cache *ac)
+{
+ struct kmem_list3 *l3 = cachep->nodelists[numa_mem_id()];
+ struct slab *slabp;
+
+ if (!ac->pfmemalloc)
+ return;
+
+ list_for_each_entry(slabp, &l3->slabs_full, list)
+ if (slabp->pfmemalloc)
+ return;
+
+ list_for_each_entry(slabp, &l3->slabs_partial, list)
+ if (slabp->pfmemalloc)
+ return;
+
+ list_for_each_entry(slabp, &l3->slabs_free, list)
+ if (slabp->pfmemalloc)
+ return;
+
+ ac->pfmemalloc = false;
+}
+
+static void *ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
+ gfp_t flags, bool force_refill)
+{
+ int i;
+ void *objp = ac->entry[--ac->avail];
+
+ /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
+ if (unlikely(is_obj_pfmemalloc(objp))) {
+ struct kmem_list3 *l3;
+
+ if (gfp_pfmemalloc_allowed(flags)) {
+ clear_obj_pfmemalloc(&objp);
+ return objp;
+ }
+
+ /* The caller cannot use PFMEMALLOC objects, find another one */
+ for (i = 1; i < ac->avail; i++) {
+ /* If a !PFMEMALLOC object is found, swap them */
+ if (!is_obj_pfmemalloc(ac->entry[i])) {
+ objp = ac->entry[i];
+ ac->entry[i] = ac->entry[ac->avail];
+ ac->entry[ac->avail] = objp;
+ return objp;
+ }
+ }
+
+ /*
+ * If there are full empty slabs and we were not forced to
+ * allocate a slab, mark this one !pfmemalloc
+ */
+ l3 = cachep->nodelists[numa_mem_id()];
+ if (!list_empty(&l3->slabs_free) && force_refill) {
+ struct slab *slabp = virt_to_slab(objp);
+ slabp->pfmemalloc = false;
+ clear_obj_pfmemalloc(&objp);
+ check_ac_pfmemalloc(cachep, ac);
+ return objp;
+ }
+
+ /* No !PFMEMALLOC objects available */
+ ac->avail++;
+ objp = NULL;
+ }
+
+ return objp;
+}
+
+static void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
+ void *objp)
+{
+ struct slab *slabp;
+
+ /* If there are pfmemalloc slabs, check if the object is part of one */
+ if (unlikely(ac->pfmemalloc)) {
+ slabp = virt_to_slab(objp);
+
+ if (slabp->pfmemalloc)
+ set_obj_pfmemalloc(&objp);
+ }
+
+ ac->entry[ac->avail++] = objp;
+}
+
/*
* Transfer objects in one arraycache to another.
* Locking must be handled by the caller.
@@ -1067,7 +1180,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
STATS_INC_ACOVERFLOW(cachep);
__drain_alien_cache(cachep, alien, nodeid);
}
- alien->entry[alien->avail++] = objp;
+ ac_put_obj(cachep, alien, objp);
spin_unlock(&alien->lock);
} else {
spin_lock(&(cachep->nodelists[nodeid])->list_lock);
@@ -1674,7 +1787,8 @@ __initcall(cpucache_init);
* did not request dmaable memory, we might get it, but that
* would be relatively rare and ignorable.
*/
-static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid,
+ bool *pfmemalloc)
{
struct page *page;
int nr_pages;
@@ -1695,6 +1809,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
if (!page)
return NULL;
+ *pfmemalloc = page->pfmemalloc;
nr_pages = (1 << cachep->gfporder);
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
@@ -2127,7 +2242,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
cpu_cache_get(cachep)->avail = 0;
cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
cpu_cache_get(cachep)->batchcount = 1;
- cpu_cache_get(cachep)->touched = 0;
+ cpu_cache_get(cachep)->touched = false;
cachep->batchcount = 1;
cachep->limit = BOOT_CPUCACHE_ENTRIES;
return 0;
@@ -2676,6 +2791,7 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
slabp->s_mem = objp + colour_off;
slabp->nodeid = nodeid;
slabp->free = 0;
+ slabp->pfmemalloc = false;
return slabp;
}
@@ -2807,7 +2923,7 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
* kmem_cache_alloc() when there are no active objs left in a cache.
*/
static int cache_grow(struct kmem_cache *cachep,
- gfp_t flags, int nodeid, void *objp)
+ gfp_t flags, int nodeid, void *objp, bool pfmemalloc)
{
struct slab *slabp;
size_t offset;
@@ -2851,7 +2967,7 @@ static int cache_grow(struct kmem_cache *cachep,
* 'nodeid'.
*/
if (!objp)
- objp = kmem_getpages(cachep, local_flags, nodeid);
+ objp = kmem_getpages(cachep, local_flags, nodeid, &pfmemalloc);
if (!objp)
goto failed;
@@ -2861,6 +2977,13 @@ static int cache_grow(struct kmem_cache *cachep,
if (!slabp)
goto opps1;
+ /* Record if ALLOC_PFMEMALLOC was set when allocating the slab */
+ if (pfmemalloc) {
+ struct array_cache *ac = cpu_cache_get(cachep);
+ slabp->pfmemalloc = true;
+ ac->pfmemalloc = 1;
+ }
+
slab_map_pages(cachep, slabp, objp);
cache_init_objs(cachep, slabp);
@@ -3002,16 +3125,19 @@ bad:
#define check_slabp(x,y) do { } while(0)
#endif
-static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
+static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
+ bool force_refill)
{
int batchcount;
struct kmem_list3 *l3;
struct array_cache *ac;
int node;
-retry:
check_irq_off();
node = numa_mem_id();
+ if (unlikely(force_refill))
+ goto force_grow;
+retry:
ac = cpu_cache_get(cachep);
batchcount = ac->batchcount;
if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -3029,7 +3155,7 @@ retry:
/* See if we can refill from the shared array */
if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
- l3->shared->touched = 1;
+ l3->shared->touched = true;
goto alloc_done;
}
@@ -3061,8 +3187,8 @@ retry:
STATS_INC_ACTIVE(cachep);
STATS_SET_HIGH(cachep);
- ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
- node);
+ ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
+ node));
}
check_slabp(cachep, slabp);
@@ -3081,18 +3207,25 @@ alloc_done:
if (unlikely(!ac->avail)) {
int x;
- x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
+force_grow:
+ x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL, false);
/* cache_grow can reenable interrupts, then ac could change. */
ac = cpu_cache_get(cachep);
- if (!x && ac->avail == 0) /* no objects in sight? abort */
+
+ /* no objects in sight? abort */
+ if (!x && (ac->avail == 0 || force_refill))
return NULL;
- if (!ac->avail) /* objects refilled by interrupt? */
+ /* objects refilled by interrupt? */
+ if (!ac->avail) {
+ node = numa_node_id();
goto retry;
+ }
}
- ac->touched = 1;
- return ac->entry[--ac->avail];
+ ac->touched = true;
+
+ return ac_get_obj(cachep, ac, flags, force_refill);
}
static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
@@ -3175,23 +3308,35 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
void *objp;
struct array_cache *ac;
+ bool force_refill = false;
check_irq_off();
ac = cpu_cache_get(cachep);
if (likely(ac->avail)) {
- STATS_INC_ALLOCHIT(cachep);
- ac->touched = 1;
- objp = ac->entry[--ac->avail];
- } else {
- STATS_INC_ALLOCMISS(cachep);
- objp = cache_alloc_refill(cachep, flags);
+ ac->touched = true;
+ objp = ac_get_obj(cachep, ac, flags, false);
+
/*
- * the 'ac' may be updated by cache_alloc_refill(),
- * and kmemleak_erase() requires its correct value.
+ * Allow for the possibility all avail objects are not allowed
+ * by the current flags
*/
- ac = cpu_cache_get(cachep);
+ if (objp) {
+ STATS_INC_ALLOCHIT(cachep);
+ goto out;
+ }
+ force_refill = true;
}
+
+ STATS_INC_ALLOCMISS(cachep);
+ objp = cache_alloc_refill(cachep, flags, force_refill);
+ /*
+ * the 'ac' may be updated by cache_alloc_refill(),
+ * and kmemleak_erase() requires its correct value.
+ */
+ ac = cpu_cache_get(cachep);
+
+out:
/*
* To avoid a false negative, if an object that is in one of the
* per-CPU caches is leaked, we need to make sure kmemleak doesn't
@@ -3244,6 +3389,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
enum zone_type high_zoneidx = gfp_zone(flags);
void *obj = NULL;
int nid;
+ bool pfmemalloc;
if (flags & __GFP_THISNODE)
return NULL;
@@ -3280,7 +3426,8 @@ retry:
if (local_flags & __GFP_WAIT)
local_irq_enable();
kmem_flagcheck(cache, flags);
- obj = kmem_getpages(cache, local_flags, numa_mem_id());
+ obj = kmem_getpages(cache, local_flags, numa_mem_id(),
+ &pfmemalloc);
if (local_flags & __GFP_WAIT)
local_irq_disable();
if (obj) {
@@ -3288,7 +3435,7 @@ retry:
* Insert into the appropriate per node queues
*/
nid = page_to_nid(virt_to_page(obj));
- if (cache_grow(cache, flags, nid, obj)) {
+ if (cache_grow(cache, flags, nid, obj, pfmemalloc)) {
obj = ____cache_alloc_node(cache,
flags | GFP_THISNODE, nid);
if (!obj)
@@ -3360,7 +3507,7 @@ retry:
must_grow:
spin_unlock(&l3->list_lock);
- x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
+ x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL, false);
if (x)
goto retry;
@@ -3510,9 +3657,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
struct kmem_list3 *l3;
for (i = 0; i < nr_objects; i++) {
- void *objp = objpp[i];
+ void *objp;
struct slab *slabp;
+ clear_obj_pfmemalloc(&objpp[i]);
+ objp = objpp[i];
+
slabp = virt_to_slab(objp);
l3 = cachep->nodelists[node];
list_del(&slabp->list);
@@ -3624,12 +3774,12 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
if (likely(ac->avail < ac->limit)) {
STATS_INC_FREEHIT(cachep);
- ac->entry[ac->avail++] = objp;
+ ac_put_obj(cachep, ac, objp);
return;
} else {
STATS_INC_FREEMISS(cachep);
cache_flusharray(cachep, ac);
- ac->entry[ac->avail++] = objp;
+ ac_put_obj(cachep, ac, objp);
}
}
@@ -4061,7 +4211,7 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
if (!ac || !ac->avail)
return;
if (ac->touched && !force) {
- ac->touched = 0;
+ ac->touched = false;
} else {
spin_lock_irq(&l3->list_lock);
if (ac->avail) {
diff --git a/mm/slub.c b/mm/slub.c
index e15aa7f..24aed12 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -30,6 +30,8 @@
#include <trace/events/kmem.h>
+#include "internal.h"
+
/*
* Lock order:
* 1. slab_lock(page)
@@ -1183,7 +1185,8 @@ static void setup_object(struct kmem_cache *s, struct page *page,
s->ctor(object);
}
-static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
+static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node,
+ bool *pfmemalloc)
{
struct page *page;
void *start;
@@ -1198,6 +1201,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
goto out;
inc_slabs_node(s, page_to_nid(page), page->objects);
+ *pfmemalloc = page->pfmemalloc;
page->slab = s;
page->flags |= 1 << PG_slab;
@@ -1629,6 +1633,16 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
}
}
+#define SLAB_PAGE_PFMEMALLOC 1
+
+static inline bool pfmemalloc_match(struct kmem_cache_cpu *c, gfp_t gfpflags)
+{
+ if (unlikely(c->pfmemalloc))
+ return gfp_pfmemalloc_allowed(gfpflags);
+
+ return true;
+}
+
/*
* Slow path. The lockless freelist is empty or we need to perform
* debugging duties.
@@ -1652,6 +1666,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
{
void **object;
struct page *new;
+ bool pfmemalloc = false;
/* We handle __GFP_ZERO in the caller */
gfpflags &= ~__GFP_ZERO;
@@ -1660,7 +1675,13 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
goto new_slab;
slab_lock(c->page);
- if (unlikely(!node_match(c, node)))
+
+ /*
+ * By rights, we should be searching for a slab page that was
+ * PFMEMALLOC but right now, we are losing the pfmemalloc
+ * information when the page leaves the per-cpu allocator
+ */
+ if (unlikely(!pfmemalloc_match(c, gfpflags) || !node_match(c, node)))
goto another_slab;
stat(s, ALLOC_REFILL);
@@ -1696,7 +1717,7 @@ new_slab:
if (gfpflags & __GFP_WAIT)
local_irq_enable();
- new = new_slab(s, gfpflags, node);
+ new = new_slab(s, gfpflags, node, &pfmemalloc);
if (gfpflags & __GFP_WAIT)
local_irq_disable();
@@ -1709,6 +1730,7 @@ new_slab:
slab_lock(new);
__SetPageSlubFrozen(new);
c->page = new;
+ c->pfmemalloc = pfmemalloc;
goto load_freelist;
}
if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
@@ -1747,8 +1769,8 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
local_irq_save(flags);
c = __this_cpu_ptr(s->cpu_slab);
object = c->freelist;
- if (unlikely(!object || !node_match(c, node)))
-
+ if (unlikely(!object || !node_match(c, node) ||
+ !pfmemalloc_match(c, gfpflags)))
object = __slab_alloc(s, gfpflags, node, addr, c);
else {
@@ -2131,10 +2153,11 @@ static void early_kmem_cache_node_alloc(int node)
struct page *page;
struct kmem_cache_node *n;
unsigned long flags;
+ bool pfmemalloc; /* Ignore this early in boot */
BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
- page = new_slab(kmem_cache_node, GFP_NOWAIT, node);
+ page = new_slab(kmem_cache_node, GFP_NOWAIT, node, &pfmemalloc);
BUG_ON(!page);
if (page_to_nid(page) != node) {
--
1.7.3.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related
* [PATCH 01/12] mm: Serialize access to min_free_kbytes
From: Mel Gorman @ 2011-04-14 10:41 UTC (permalink / raw)
To: Linux-MM, Linux-Netdev; +Cc: LKML, Peter Zijlstra, Mel Gorman
In-Reply-To: <1302777698-28237-1-git-send-email-mgorman@suse.de>
There is a race between the min_free_kbytes sysctl, memory hotplug
and transparent hugepage support enablement. Memory hotplug uses a
zonelists_mutex to avoid a race when building zonelists. Reuse it to
serialise watermark updates.
[a.p.zijlstra@chello.nl: Older patch fixed the race with spinlock]
Signed-off-by: Mel Gorman <mgorman@suse.de>
---
mm/page_alloc.c | 23 +++++++++++++++--------
1 files changed, 15 insertions(+), 8 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cdef1d4..93afea3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4928,14 +4928,7 @@ static void setup_per_zone_lowmem_reserve(void)
calculate_totalreserve_pages();
}
-/**
- * setup_per_zone_wmarks - called when min_free_kbytes changes
- * or when memory is hot-{added|removed}
- *
- * Ensures that the watermark[min,low,high] values for each zone are set
- * correctly with respect to min_free_kbytes.
- */
-void setup_per_zone_wmarks(void)
+static void __setup_per_zone_wmarks(void)
{
unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
@@ -4990,6 +4983,20 @@ void setup_per_zone_wmarks(void)
calculate_totalreserve_pages();
}
+/**
+ * setup_per_zone_wmarks - called when min_free_kbytes changes
+ * or when memory is hot-{added|removed}
+ *
+ * Ensures that the watermark[min,low,high] values for each zone are set
+ * correctly with respect to min_free_kbytes.
+ */
+void setup_per_zone_wmarks(void)
+{
+ mutex_lock(&zonelists_mutex);
+ __setup_per_zone_wmarks();
+ mutex_unlock(&zonelists_mutex);
+}
+
/*
* The inactive anon list should be small enough that the VM never has to
* do too much work, but large enough that each inactive page has a chance
--
1.7.3.4
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply related
* [PATCH 00/12] Swap-over-NBD without deadlocking v1
From: Mel Gorman @ 2011-04-14 10:41 UTC (permalink / raw)
To: Linux-MM, Linux-Netdev; +Cc: LKML, Peter Zijlstra, Mel Gorman
Swapping over NBD is something that is technically possible but not
often advised. While there are a number of guides on the internet
on how to configure it and nbd-client supports a -swap switch to
"prevent deadlocks", the fact of the matter is a machine using NBD
for swap can be locked up within minutes if swap is used intensively.
The problem is that network block devices do not use mempools like
normal block devices do. As the host cannot control where they receive
packets from, they cannot reliably work out in advance how much memory
they might need.
Some years ago, Peter Zijlstra developed a series of patches that
supported swap over an NFS that some distributions are carrying in
their kernels. This patch series borrows very heavily from Peter's work
to support swapping over NBD (the relatively straight-forward case)
and uses throttling instead of dynamically resized memory reserves
so the series is not too unwieldy for review.
Patch 1 serialises access to min_free_kbytes. It's not strictly needed
by this series but as the series cares about watermarks in
general, it's a harmless fix. It could be merged independently.
Patch 2 adds knowledge of the PFMEMALLOC reserves to SLAB and SLUB to
preserve access to pages allocated under low memory situations
to callers that are freeing memory.
Patch 3 introduces __GFP_MEMALLOC to allow access to the PFMEMALLOC
reserves without setting PFMEMALLOC.
Patch 4 opens the possibility for softirqs to use PFMEMALLOC reserves
for later use by network packet processing.
Patch 5 ignores memory policies when ALLOC_NO_WATERMARKS is set.
Patches 6-9 allow network processing to use PFMEMALLOC reserves when
the socket has been marked as being used by the VM to clean
pages. If packets are received and stored in pages that were
allocated under low-memory situations and are unrelated to
the VM, the packets are dropped.
Patch 10 is a micro-optimisation to avoid a function call in the
common case.
Patch 11 tags NBD sockets as being SOCK_MEMALLOC so they can use
PFMEMALLOC if necessary.
Patch 12 notes that it is still possible for the PFMEMALLOC reserve
to be depleted. To prevent this, direct reclaimers get
throttled on a waitqueue if 50% of the PFMEMALLOC reserves are
depleted. It is expected that kswapd and the direct reclaimers
already running will clean enough pages for the low watermark
to be reached and the throttled processes are woken up.
Some basic performance testing was run using kernel builds, netperf
on loopback for UDP and TCP, hackbench (pipes and sockets), iozone
and sysbench. Each of them was expected to use the sl*b allocators
reasonably heavily but there did not appear to be significant
performance variances. Here are the results from netperf using
slab as an example
NETPERF UDP
netperf-udp udp-swapnbd
vanilla-slab v1r17-slab
64 178.06 ( 0.00%)* 189.46 ( 6.02%)
1.02% 1.00%
128 355.06 ( 0.00%) 370.75 ( 4.23%)
256 662.47 ( 0.00%) 721.62 ( 8.20%)
1024 2229.39 ( 0.00%) 2567.04 (13.15%)
2048 3974.20 ( 0.00%) 4114.70 ( 3.41%)
3312 5619.89 ( 0.00%) 5800.09 ( 3.11%)
4096 6460.45 ( 0.00%) 6702.45 ( 3.61%)
8192 9580.24 ( 0.00%) 9927.97 ( 3.50%)
16384 13259.14 ( 0.00%) 13493.88 ( 1.74%)
MMTests Statistics: duration
User/Sys Time Running Test (seconds) 2960.17 2540.14
Total Elapsed Time (seconds) 3554.10 3050.10
NETPERF TCP
netperf-tcp tcp-swapnbd
vanilla-slab v1r17-slab
64 1230.29 ( 0.00%) 1273.17 ( 3.37%)
128 2309.97 ( 0.00%) 2375.22 ( 2.75%)
256 3659.32 ( 0.00%) 3704.87 ( 1.23%)
1024 7267.80 ( 0.00%) 7251.02 (-0.23%)
2048 8358.26 ( 0.00%) 8204.74 (-1.87%)
3312 8631.07 ( 0.00%) 8637.62 ( 0.08%)
4096 8770.95 ( 0.00%) 8704.08 (-0.77%)
8192 9749.33 ( 0.00%) 9769.06 ( 0.20%)
16384 11151.71 ( 0.00%) 11135.32 (-0.15%)
MMTests Statistics: duration
User/Sys Time Running Test (seconds) 1245.04 1619.89
Total Elapsed Time (seconds) 1250.66 1622.18
Here is the equivalent test for SLUB
NETPERF UDP
netperf-udp udp-swapnbd
vanilla-slub v1r17-slub
64 180.83 ( 0.00%) 183.68 ( 1.55%)
128 357.29 ( 0.00%) 367.11 ( 2.67%)
256 679.64 ( 0.00%)* 724.03 ( 6.13%)
1.15% 1.00%
1024 2343.40 ( 0.00%)* 2610.63 (10.24%)
1.68% 1.00%
2048 3971.53 ( 0.00%) 4102.21 ( 3.19%)*
1.00% 1.40%
3312 5677.04 ( 0.00%) 5748.69 ( 1.25%)
4096 6436.75 ( 0.00%) 6549.41 ( 1.72%)
8192 9698.56 ( 0.00%) 9808.84 ( 1.12%)
16384 13337.06 ( 0.00%) 13404.38 ( 0.50%)
MMTests Statistics: duration
User/Sys Time Running Test (seconds) 2880.15 2180.13
Total Elapsed Time (seconds) 3458.10 2618.09
NETPERF TCP
netperf-tcp tcp-swapnbd
vanilla-slub v1r17-slub
64 1256.79 ( 0.00%) 1287.32 ( 2.37%)
128 2308.71 ( 0.00%) 2371.09 ( 2.63%)
256 3672.03 ( 0.00%) 3771.05 ( 2.63%)
1024 7245.08 ( 0.00%) 7261.60 ( 0.23%)
2048 8315.17 ( 0.00%) 8244.14 (-0.86%)
3312 8611.43 ( 0.00%) 8616.90 ( 0.06%)
4096 8711.64 ( 0.00%) 8695.97 (-0.18%)
8192 9795.71 ( 0.00%) 9774.11 (-0.22%)
16384 11145.48 ( 0.00%) 11225.70 ( 0.71%)
MMTests Statistics: duration
User/Sys Time Running Test (seconds) 1345.05 1425.06
Total Elapsed Time (seconds) 1350.61 1430.66
Time to completion varied a lot but this can happen with netperf as
it tries to find results within a sufficiently high confidence. I
wouldn't read too much into the performance gains of netperf-udp
as it can sometimes be affected by code just shuffling around for
whatever reason.
For testing swap-over-NBD, a machine was booted with 2G of RAM with a
swapfile backed by NBD. 16*NUM_CPU processes were started that create
anonymous memory mappings and read them linearly in a loop. The total
size of the mappings was 4*PHYSICAL_MEMORY to use swap heavily under
memory pressure. Without the patches, the machine locks up within
minutes; with them applied, the test runs to completion.
Comments?
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply
* Re: [PATCH] net: ipv4: add IPPROTO_ICMP socket kind
From: Alexey Dobriyan @ 2011-04-14 9:16 UTC (permalink / raw)
To: Vasiliy Kulikov
Cc: linux-kernel, netdev, Pavel Kankovsky, Solar Designer, Kees Cook,
Dan Rosenberg, Eugene Teo, Nelson Elhage, David S. Miller,
Alexey Kuznetsov, Pekka Savola, James Morris, Hideaki YOSHIFUJI,
Patrick McHardy
In-Reply-To: <20110413113204.GB6948@albatros>
On Wed, Apr 13, 2011 at 2:32 PM, Vasiliy Kulikov <segoon@openwall.com> wrote:
> On Wed, Apr 13, 2011 at 13:29 +0300, Alexey Dobriyan wrote:
>> On Sat, Apr 9, 2011 at 1:15 PM, Vasiliy Kulikov <segoon@openwall.com> wrote:
>> > This patch adds IPPROTO_ICMP socket kind.
>>
>> > + seq_printf(f, "%5d: %08X:%04X %08X:%04X"
>> > + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n",
>> > + bucket, src, srcp, dest, destp, sp->sk_state,
>> > + sk_wmem_alloc_get(sp),
>> > + sk_rmem_alloc_get(sp),
>> > + 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
>>
>> These zeroes can be embedded into format string for slightly faster printing.
>
> Is it really needed? I mean, it is not a fast path, so such a small
> overhead is not very bad. But embedding them into the string makes it a
> bit more difficult to read.
In fact, if field is always zero, it can be removed altogether.
Also, there was big discussion re exposing kernel socket pointers,
which this file continues to do.
> + atomic_read(&sp->sk_refcnt), sp,
> + atomic_read(&sp->sk_drops), len);
^ permalink raw reply
* Re: [Bugme-new] [Bug 33042] New: Marvell 88E1145 phy configured incorrectly in fiber mode
From: Alex Dubov @ 2011-04-14 7:59 UTC (permalink / raw)
To: Andy Fleming
Cc: Andrew Morton, David Daney, netdev, bugzilla-daemon, bugme-daemon,
Grant Likely, Andy Fleming
In-Reply-To: <BANLkTikH09jMZZk1ZgeQi+HOoeFqgTpdBA@mail.gmail.com>
--- On Thu, 14/4/11, Andy Fleming <afleming@gmail.com> wrote:
>
> I've just rewritten the U-Boot code for PHY management, so
> I'd be
> interested in hearing if this breaks your board. But
> what's
> interesting to me is that, in order for U-Boot to report
> that the link
> is a "fiber" link, something had to set the TSEC_FIBER
> flag, and only
> one PHY in the public source did. This implies to me
> that your board
> isn't supported by mainline U-Boot, and suggests that
> someone may have
> modified the 88e1145 driver. Otherwise, I don't see any
> fiber-related
> differences between the U-Boot 1145 driver, and the Linux
> one.
I had not seen any difference, that's true. But the problem somehow
creeps in.
The u-boot is standard stock u-boot pulled from the recent git,
no special configuration involved.
I actually managed to make kernel transmit stuff by playing with register
values from other marvell phy varieties, but it keeps receiving garbage,
so the link is still not operational.
I tried to prevent kernel from reconfiguring the phy, but to no avail.
It seems very weird to me, because I did quite a lot of testing with
u-boot and network just works on that interface. However, when kernel
starts booting it suddenly loses the ability to talk to it.
I have a copper link attached to the same transceiver and it works fine
all along.
>
>
>
> Reading the configuration from U-Boot is
> straightforward. use the
> "mii" command to read the registers. But don't forget
> to set register
> 22 (16 - mii command only reads hex) to 1, and read all of
> the
> registers that way, too.
I have no recourse but to keep investigating.
>
> You will either need to add some code to detect when the
> PHY is using
> fiber, and change the phydev->port to PORT_FIBRE, or you
> will need to
> add a board-level "fixup" to change the port to PORT_FIBRE
> on your
> board.
>
> Then the PHY driver should use that information to do the
> right configuration.
>
> Andy
>
^ permalink raw reply
* Re: [PATCH] Clean up 'FLAG_POINTTOPOINT' and 'FLAG_MULTI_PACKET' overlaps in usbnet.h
From: David Miller @ 2011-04-14 7:22 UTC (permalink / raw)
To: huajun.li.lee; +Cc: gottfried.haider, netdev
In-Reply-To: <BANLkTinyWLjVRLKTZB09BZaH2o0gB0p8JA@mail.gmail.com>
From: huajun li <huajun.li.lee@gmail.com>
Date: Thu, 14 Apr 2011 09:43:32 +0800
> USB tethering does not work anymore since 2.6.39-rc2, but it's okay in
> -rc1. The root cause is the new added mask code 'FLAG_POINTTOPOINT'
> overlaps 'FLAG_MULTI_PACKET' in include/linux/usb/usbnet.h, this
> causes logic issue in rx_process(). This patch cleans up the overlap.
>
> Reported-and-Tested-by: Gottfried Haider <gottfried.haider@gmail.com>
> Signed-off-by: Huajun Li <huajun.li.lee@gmail.com>
Applied, thanks.
^ permalink raw reply
* Re: [PATCH] NET: AX.25, NETROM, ROSE: Remove SOCK_DEBUG calls
From: David Miller @ 2011-04-14 7:20 UTC (permalink / raw)
To: ralf; +Cc: netdev, linux-hams
In-Reply-To: <20110414064917.GA5344@linux-mips.org>
From: Ralf Baechle <ralf@linux-mips.org>
Date: Thu, 14 Apr 2011 08:49:17 +0200
> Nobody alive seems to recall when they last were useful.
>
> Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
:-) Applied, thanks Ralf.
^ permalink raw reply
* Re: [net-next-2.6 00/26 v2][pull request] Intel Wired LAN Driver Update
From: David Miller @ 2011-04-14 7:18 UTC (permalink / raw)
To: jeffrey.t.kirsher; +Cc: netdev, gospo, bphilips
In-Reply-To: <1302748718-16927-1-git-send-email-jeffrey.t.kirsher@intel.com>
From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Wed, 13 Apr 2011 19:38:38 -0700
> The following series contains updates to e1000e, igb and ixgbe.
...
> -v2 Drop the e1000 patch to convert to the new set_phys_id interface
Pulled, thanks Jeff.
^ permalink raw reply
* [PATCH 1/1] ipv6: RTA_PREFSRC support for ipv6 route source address selection
From: Daniel Walter @ 2011-04-14 7:10 UTC (permalink / raw)
To: netdev; +Cc: linux-kernel, davem
[ipv6] Add support for RTA_PREFSRC
This patch allows a user to select the preferred source address
for a specific IPv6-Route. It can be set via a netlink message
setting RTA_PREFSRC to a valid IPv6 address which must be
up on the device the route will be bound to.
Signed-off-by: Daniel Walter <dwalter@barracuda.com>
---
Repost patch, after fixing some warnings pointed out on netdev@
applies clean against current linux-2.6 HEAD
include/net/ip6_fib.h | 2 +
include/net/ip6_route.h | 7 ++++
net/ipv6/addrconf.c | 2 +
net/ipv6/ip6_output.c | 8 ++--
net/ipv6/route.c | 72 +++++++++++++++++++++++++++++++++++++++++++++--
5 files changed, 84 insertions(+), 7 deletions(-)
---
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index bc3cde0..98348d5 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -42,6 +42,7 @@ struct fib6_config {
struct in6_addr fc_dst;
struct in6_addr fc_src;
+ struct in6_addr fc_prefsrc;
struct in6_addr fc_gateway;
unsigned long fc_expires;
@@ -107,6 +108,7 @@ struct rt6_info {
struct rt6key rt6i_dst ____cacheline_aligned_in_smp;
u32 rt6i_flags;
struct rt6key rt6i_src;
+ struct rt6key rt6i_prefsrc;
u32 rt6i_metric;
u32 rt6i_peer_genid;
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index c850e5f..86b1cb4 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -84,6 +84,12 @@ extern int ip6_route_add(struct fib6_config *cfg);
extern int ip6_ins_rt(struct rt6_info *);
extern int ip6_del_rt(struct rt6_info *);
+extern int ip6_route_get_saddr(struct net *net,
+ struct rt6_info *rt,
+ struct in6_addr *daddr,
+ unsigned int prefs,
+ struct in6_addr *saddr);
+
extern struct rt6_info *rt6_lookup(struct net *net,
const struct in6_addr *daddr,
const struct in6_addr *saddr,
@@ -141,6 +147,7 @@ struct rt6_rtnl_dump_arg {
extern int rt6_dump_route(struct rt6_info *rt, void *p_arg);
extern void rt6_ifdown(struct net *net, struct net_device *dev);
extern void rt6_mtu_change(struct net_device *dev, unsigned mtu);
+extern void rt6_remove_prefsrc(struct inet6_ifaddr *ifp);
/*
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 1493534..129d7e1 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -825,6 +825,8 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
dst_release(&rt->dst);
}
+ /* clean up prefsrc entries */
+ rt6_remove_prefsrc(ifp);
out:
in6_ifa_put(ifp);
}
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 46cf7be..1f4c096 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -930,10 +930,10 @@ static int ip6_dst_lookup_tail(struct sock *sk,
goto out_err_release;
if (ipv6_addr_any(&fl6->saddr)) {
- err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
- &fl6->daddr,
- sk ? inet6_sk(sk)->srcprefs : 0,
- &fl6->saddr);
+ struct rt6_info *rt = (struct rt6_info *) *dst;
+ err = ip6_route_get_saddr(net, rt, &fl6->daddr,
+ sk ? inet6_sk(sk)->srcprefs : 0,
+ &fl6->saddr);
if (err)
goto out_err_release;
}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 843406f..af26cc10 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1325,6 +1325,16 @@ int ip6_route_add(struct fib6_config *cfg)
if (dev == NULL)
goto out;
+ if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
+ if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
+ err = -EINVAL;
+ goto out;
+ }
+ ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
+ rt->rt6i_prefsrc.plen = 128;
+ } else
+ rt->rt6i_prefsrc.plen = 0;
+
if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
if (IS_ERR(rt->rt6i_nexthop)) {
@@ -2037,6 +2047,55 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
return rt;
}
+int ip6_route_get_saddr(struct net *net,
+ struct rt6_info *rt,
+ struct in6_addr *daddr,
+ unsigned int prefs,
+ struct in6_addr *saddr)
+{
+ struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
+ int err = 0;
+ if (rt->rt6i_prefsrc.plen)
+ ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
+ else
+ err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
+ daddr, prefs, saddr);
+ return err;
+}
+
+/* remove deleted ip from prefsrc entries */
+struct arg_dev_net_ip {
+ struct net_device *dev;
+ struct net *net;
+ struct in6_addr *addr;
+};
+
+static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
+{
+ struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
+ struct net *net = ((struct arg_dev_net_ip *)arg)->net;
+ struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
+
+ if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
+ rt != net->ipv6.ip6_null_entry &&
+ ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
+ /* remove prefsrc entry */
+ rt->rt6i_prefsrc.plen = 0;
+ }
+ return 0;
+}
+
+void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
+{
+ struct net *net = dev_net(ifp->idev->dev);
+ struct arg_dev_net_ip adni = {
+ .dev = ifp->idev->dev,
+ .net = net,
+ .addr = &ifp->addr,
+ };
+ fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
+}
+
struct arg_dev_net {
struct net_device *dev;
struct net *net;
@@ -2183,6 +2242,9 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
}
+ if (tb[RTA_PREFSRC])
+ nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
+
if (tb[RTA_OIF])
cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
@@ -2325,13 +2387,17 @@ static int rt6_fill_node(struct net *net,
#endif
NLA_PUT_U32(skb, RTA_IIF, iif);
} else if (dst) {
- struct inet6_dev *idev = ip6_dst_idev(&rt->dst);
struct in6_addr saddr_buf;
- if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
- dst, 0, &saddr_buf) == 0)
+ if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
}
+ if (rt->rt6i_prefsrc.plen) {
+ struct in6_addr saddr_buf;
+ ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
+ NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
+ }
+
if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
goto nla_put_failure;
^ permalink raw reply related
* [PATCH 1/1] ipv6: ignore looped-back NA while dad is running
From: Daniel Walter @ 2011-04-14 7:09 UTC (permalink / raw)
To: netdev; +Cc: linux-kernel, davem
[ipv6] Ignore looped-back NAs while in Duplicate Address Detection
If we send an unsolicited NA shortly after bringing up an
IPv6 address, the duplicate address detection algorithm
fails and the ip stays in tentative mode forever.
This is due to a missing check for whether the NA is looped back to us.
Signed-off-by: Daniel Walter <dwalter@barracuda.com>
---
apply against linux-2.6 HEAD
net/ipv6/ndisc.c | 7 ++++---
1 files changed, 4 insertions(+), 3 deletions(-)
---
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 92f952d..f057ff3 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -945,9 +945,10 @@ static void ndisc_recv_na(struct sk_buff *skb)
}
ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1);
if (ifp) {
- if (ifp->flags & IFA_F_TENTATIVE) {
- addrconf_dad_failure(ifp);
- return;
+ if (skb->pkt_type != PACKET_LOOPBACK
+ && (ifp->flags & IFA_F_TENTATIVE)) {
+ addrconf_dad_failure(ifp);
+ return;
}
/* What should we make now? The advertisement
is invalid, but ndisc specs say nothing
^ permalink raw reply related
* RE: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel
From: Wei Gu @ 2011-04-14 6:58 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Alexander Duyck, Peter Zijlstra, netdev, Kirsher, Jeffrey T
In-Reply-To: <1302762810.3549.233.camel@edumazet-laptop>
I did the single-flow test; it shows no rx errors at 300kpps. When I started multiple flows with the same 300Kpps traffic, it looked really bad, with a high rx_missing_error count.
Multiple Flow:
SUM: 191925 ETH8: 0 ETH10: 191925 ETH6: 0 ETH4: 0
SUM: 214634 ETH8: 0 ETH10: 214634 ETH6: 0 ETH4: 0
SUM: 237600 ETH8: 0 ETH10: 237600 ETH6: 0 ETH4: 0
SUM: 198925 ETH8: 0 ETH10: 198925 ETH6: 0 ETH4: 0
SUM: 249290 ETH8: 0 ETH10: 249290 ETH6: 0 ETH4: 0
Single Flow:
SUM: 302018 ETH8: 0 ETH10: 302018 ETH6: 0 ETH4: 0
SUM: 301849 ETH8: 0 ETH10: 301849 ETH6: 0 ETH4: 0
SUM: 302163 ETH8: 0 ETH10: 302163 ETH6: 0 ETH4: 0
Thanks
WeiGu
-----Original Message-----
From: Eric Dumazet [mailto:eric.dumazet@gmail.com]
Sent: Thursday, April 14, 2011 2:34 PM
To: Wei Gu
Cc: Alexander Duyck; Peter Zijlstra; netdev; Kirsher, Jeffrey T
Subject: RE: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel
Le jeudi 14 avril 2011 à 08:07 +0200, Eric Dumazet a écrit :
> Le jeudi 14 avril 2011 à 13:42 +0800, Wei Gu a écrit :
> > Hi guys,
> > Do you think it was a bug in the kernel from 2.6.35.2 with Intel 10GE ixgbe driver?
> > If so shall I issue a Bug on the bugzilla, and which category? Cause I'm not sure it was driver problem Or sched problem.
>
> This makes no sense to me.
>
> What is the maximum throughput you can get in pps before having packet
> drops ?
>
> Please try with a single flow (to hit one queue, one cpu)
>
> Thanks
>
Also, please try to check if using smaller or bigger packets makes any change in this max throughput
^ permalink raw reply
* [PATCH] NET: AX.25, NETROM, ROSE: Remove SOCK_DEBUG calls
From: Ralf Baechle @ 2011-04-14 6:49 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev, linux-hams
Nobody alive seems to recall when they last were useful.
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
net/ax25/af_ax25.c | 16 +---------------
net/netrom/af_netrom.c | 12 +-----------
net/rose/af_rose.c | 16 ++--------------
3 files changed, 4 insertions(+), 40 deletions(-)
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 6da5dae..e7c69f4 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1538,8 +1538,6 @@ static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock,
}
/* Build a packet */
- SOCK_DEBUG(sk, "AX.25: sendto: Addresses built. Building packet.\n");
-
/* Assume the worst case */
size = len + ax25->ax25_dev->dev->hard_header_len;
@@ -1549,8 +1547,6 @@ static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock,
skb_reserve(skb, size - len);
- SOCK_DEBUG(sk, "AX.25: Appending user data\n");
-
/* User data follows immediately after the AX.25 data */
if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) {
err = -EFAULT;
@@ -1564,8 +1560,6 @@ static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock,
if (!ax25->pidincl)
*skb_push(skb, 1) = sk->sk_protocol;
- SOCK_DEBUG(sk, "AX.25: Transmitting buffer\n");
-
if (sk->sk_type == SOCK_SEQPACKET) {
/* Connected mode sockets go via the LAPB machine */
if (sk->sk_state != TCP_ESTABLISHED) {
@@ -1583,22 +1577,14 @@ static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock,
skb_push(skb, 1 + ax25_addr_size(dp));
- SOCK_DEBUG(sk, "Building AX.25 Header (dp=%p).\n", dp);
-
- if (dp != NULL)
- SOCK_DEBUG(sk, "Num digipeaters=%d\n", dp->ndigi);
+ /* Building AX.25 Header */
/* Build an AX.25 header */
lv = ax25_addr_build(skb->data, &ax25->source_addr, &sax.sax25_call,
dp, AX25_COMMAND, AX25_MODULUS);
- SOCK_DEBUG(sk, "Built header (%d bytes)\n",lv);
-
skb_set_transport_header(skb, lv);
- SOCK_DEBUG(sk, "base=%p pos=%p\n",
- skb->data, skb_transport_header(skb));
-
*skb_transport_header(skb) = AX25_UI;
/* Datagram frames go straight out of the door as UI */
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 06cb027..732152f 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -591,7 +591,6 @@ static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
return -EINVAL;
}
if ((dev = nr_dev_get(&addr->fsa_ax25.sax25_call)) == NULL) {
- SOCK_DEBUG(sk, "NET/ROM: bind failed: invalid node callsign\n");
release_sock(sk);
return -EADDRNOTAVAIL;
}
@@ -632,7 +631,7 @@ static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
sock_reset_flag(sk, SOCK_ZAPPED);
dev_put(dev);
release_sock(sk);
- SOCK_DEBUG(sk, "NET/ROM: socket is bound\n");
+
return 0;
}
@@ -1082,8 +1081,6 @@ static int nr_sendmsg(struct kiocb *iocb, struct socket *sock,
sax.sax25_call = nr->dest_addr;
}
- SOCK_DEBUG(sk, "NET/ROM: sendto: Addresses built.\n");
-
/* Build a packet - the conventional user limit is 236 bytes. We can
do ludicrously large NetROM frames but must not overflow */
if (len > 65536) {
@@ -1091,7 +1088,6 @@ static int nr_sendmsg(struct kiocb *iocb, struct socket *sock,
goto out;
}
- SOCK_DEBUG(sk, "NET/ROM: sendto: building packet.\n");
size = len + NR_NETWORK_LEN + NR_TRANSPORT_LEN;
if ((skb = sock_alloc_send_skb(sk, size, msg->msg_flags & MSG_DONTWAIT, &err)) == NULL)
@@ -1105,7 +1101,6 @@ static int nr_sendmsg(struct kiocb *iocb, struct socket *sock,
*/
asmptr = skb_push(skb, NR_TRANSPORT_LEN);
- SOCK_DEBUG(sk, "Building NET/ROM Header.\n");
/* Build a NET/ROM Transport header */
@@ -1114,15 +1109,12 @@ static int nr_sendmsg(struct kiocb *iocb, struct socket *sock,
*asmptr++ = 0; /* To be filled in later */
*asmptr++ = 0; /* Ditto */
*asmptr++ = NR_INFO;
- SOCK_DEBUG(sk, "Built header.\n");
/*
* Put the data on the end
*/
skb_put(skb, len);
- SOCK_DEBUG(sk, "NET/ROM: Appending user data\n");
-
/* User data follows immediately after the NET/ROM transport header */
if (memcpy_fromiovec(skb_transport_header(skb), msg->msg_iov, len)) {
kfree_skb(skb);
@@ -1130,8 +1122,6 @@ static int nr_sendmsg(struct kiocb *iocb, struct socket *sock,
goto out;
}
- SOCK_DEBUG(sk, "NET/ROM: Transmitting buffer\n");
-
if (sk->sk_state != TCP_ESTABLISHED) {
kfree_skb(skb);
err = -ENOTCONN;
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index a80aef6..f9ea925 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -682,10 +682,8 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
if ((unsigned int) addr->srose_ndigis > ROSE_MAX_DIGIS)
return -EINVAL;
- if ((dev = rose_dev_get(&addr->srose_addr)) == NULL) {
- SOCK_DEBUG(sk, "ROSE: bind failed: invalid address\n");
+ if ((dev = rose_dev_get(&addr->srose_addr)) == NULL)
return -EADDRNOTAVAIL;
- }
source = &addr->srose_call;
@@ -716,7 +714,7 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
rose_insert_socket(sk);
sock_reset_flag(sk, SOCK_ZAPPED);
- SOCK_DEBUG(sk, "ROSE: socket is bound\n");
+
return 0;
}
@@ -1109,10 +1107,7 @@ static int rose_sendmsg(struct kiocb *iocb, struct socket *sock,
srose.srose_digis[n] = rose->dest_digis[n];
}
- SOCK_DEBUG(sk, "ROSE: sendto: Addresses built.\n");
-
/* Build a packet */
- SOCK_DEBUG(sk, "ROSE: sendto: building packet.\n");
/* Sanity check the packet size */
if (len > 65535)
return -EMSGSIZE;
@@ -1127,7 +1122,6 @@ static int rose_sendmsg(struct kiocb *iocb, struct socket *sock,
/*
* Put the data on the end
*/
- SOCK_DEBUG(sk, "ROSE: Appending user data\n");
skb_reset_transport_header(skb);
skb_put(skb, len);
@@ -1152,8 +1146,6 @@ static int rose_sendmsg(struct kiocb *iocb, struct socket *sock,
*/
asmptr = skb_push(skb, ROSE_MIN_LEN);
- SOCK_DEBUG(sk, "ROSE: Building Network Header.\n");
-
/* Build a ROSE Network header */
asmptr[0] = ((rose->lci >> 8) & 0x0F) | ROSE_GFI;
asmptr[1] = (rose->lci >> 0) & 0xFF;
@@ -1162,10 +1154,6 @@ static int rose_sendmsg(struct kiocb *iocb, struct socket *sock,
if (qbit)
asmptr[0] |= ROSE_Q_BIT;
- SOCK_DEBUG(sk, "ROSE: Built header.\n");
-
- SOCK_DEBUG(sk, "ROSE: Transmitting buffer\n");
-
if (sk->sk_state != TCP_ESTABLISHED) {
kfree_skb(skb);
return -ENOTCONN;
^ permalink raw reply related
* RE: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel
From: Eric Dumazet @ 2011-04-14 6:33 UTC (permalink / raw)
To: Wei Gu; +Cc: Alexander Duyck, Peter Zijlstra, netdev, Kirsher, Jeffrey T
In-Reply-To: <1302761251.3549.198.camel@edumazet-laptop>
Le jeudi 14 avril 2011 à 08:07 +0200, Eric Dumazet a écrit :
> Le jeudi 14 avril 2011 à 13:42 +0800, Wei Gu a écrit :
> > Hi guys,
> > Do you think it was a bug in the kernel from 2.6.35.2 with Intel 10GE ixgbe driver?
> > If so shall I issue a Bug on the bugzilla, and which category? Cause I'm not sure it was driver problem Or sched problem.
>
> This makes no sense to me.
>
> What is the maximum throughput you can get in pps before having packet
> drops ?
>
> Please try with a single flow (to hit one queue, one cpu)
>
> Thanks
>
Also, please try to check if using smaller or bigger packets makes any
change in this max throughput
^ permalink raw reply
* RE: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel
From: Eric Dumazet @ 2011-04-14 6:07 UTC (permalink / raw)
To: Wei Gu; +Cc: Alexander Duyck, Peter Zijlstra, netdev, Kirsher, Jeffrey T
In-Reply-To: <D12839161ADD3A4B8DA63D1A134D084026E49535F2@ESGSCCMS0001.eapac.ericsson.se>
Le jeudi 14 avril 2011 à 13:42 +0800, Wei Gu a écrit :
> Hi guys,
> Do you think it was a bug in the kernel from 2.6.35.2 with Intel 10GE ixgbe driver?
> If so shall I issue a Bug on the bugzilla, and which category? Cause I'm not sure it was driver problem Or sched problem.
This makes no sense to me.
What is the maximum throughput you can get in pps before having packet
drops ?
Please try with a single flow (to hit one queue, one cpu)
Thanks
^ permalink raw reply
* Re: [Bugme-new] [Bug 32832] New: shutdown(2) does not fully shut down socket any more
From: Simon Horman @ 2011-04-14 2:34 UTC (permalink / raw)
To: Eric Dumazet
Cc: David Miller, akpm, netdev, bugzilla-daemon, bugme-daemon, kees
In-Reply-To: <1302663327.2811.55.camel@edumazet-laptop>
On Wed, Apr 13, 2011 at 04:55:27AM +0200, Eric Dumazet wrote:
> Le mardi 12 avril 2011 à 16:17 -0700, David Miller a écrit :
> > From: Andrew Morton <akpm@linux-foundation.org>
> > Date: Tue, 12 Apr 2011 16:15:56 -0700
> >
> > >
> > > (switched to email. Please respond via emailed reply-to-all, not via the
> > > bugzilla web interface).
> >
> > Stephen Hemminger forwarded this to the list last week, and Eric
> > Dumazet is actively working on a fix.
>
> I worked on it this weekend and discovered that FreeBSD 8.1 would not allow
> several CLOSE sockets bound to same port even with REUSEADDR.
>
> So haproxy's claim is a bit wrong (its trick doesn't work on FreeBSD), and
> it used an undocumented Linux feature.
>
> I feel this case is a call for SO_REUSEPORT, eventually.
>
> http://www.unixguide.net/network/socketfaq/4.11.shtml
>
> SO_REUSEADDR allows your server to bind to an address which is in a
> TIME_WAIT state. It does not allow more than one server to bind to
> the same address. It was mentioned that use of this flag can create a
> security risk because another server can bind to the same port, by
> binding to a specific address as opposed to INADDR_ANY. The
> SO_REUSEPORT flag allows multiple processes to bind to the same
> address provided all of them use the SO_REUSEPORT option.
>
>
> Since SO_REUSEPORT is not a 'stable fix', I suggest we revert the patch,
> and eventually work on SO_REUSEPORT on net-next-2.6
>
> What do you think ?
Not entirely related, but FWIW I think that SO_REUSEPORT would
be rather useful for haproxy.
I've been working on allowing haproxy to be reconfigured without dropping
or reusing connections. I have done this by re-using open sockets. But it
would have been rather a lot easier to achieve using SO_REUSEPORT -
assuming that I understand SO_REUSEPORT correctly.
^ permalink raw reply
* RE: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel
From: Wei Gu @ 2011-04-14 5:42 UTC (permalink / raw)
To: Eric Dumazet, Alexander Duyck, Peter Zijlstra; +Cc: netdev, Kirsher, Jeffrey T
In-Reply-To: <1302536577.4605.1.camel@edumazet-laptop>
Hi guys,
Do you think this is a bug in the kernel, starting from 2.6.35.2, with the Intel 10GE ixgbe driver?
If so, shall I file a bug in Bugzilla, and under which category? I ask because I'm not sure whether it is a driver problem or a sched problem.
Thanks
WeiGu
-----Original Message-----
From: Wei Gu
Sent: Tuesday, April 12, 2011 12:40 PM
To: 'Eric Dumazet'; 'Alexander Duyck'; 'Peter Zijlstra'
Cc: 'netdev'; 'Kirsher, Jeffrey T'
Subject: RE: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel
Hi,
I found that the problem was introduced by this revert patch: "2010-08-13 Peter Zijlstra sched: Revert nohz_ratelimit() for now".
I tried removing this patch from 2.6.35.2 and then built the application again; after that the ixgbe driver appears to work fine.
I don't know why reverting nohz_ratelimit() this time would cause the problem with the ixgbe driver, since nohz_ratelimit was first introduced on "2010-03-11", and before that, the 2.6.32 kernel did not have this problem with the ixgbe driver either.
Some log from git:
=========================================================================================
2.6.35.2
2010-08-13 Peter Zijlstra sched: Revert nohz_ratelimit() for now
2.6.35.1
2010-08-01 Linus Torvalds Linux 2.6.35 v2.6.35
2010-06-17 Peter Zijlstra nohz: Fix nohz ratelimit
2.6.35-rc3
2010-03-11 Mike Galbraith sched: Rate-limit nohz
Thanks
WeiGu
-----Original Message-----
From: Wei Gu
Sent: Tuesday, April 12, 2011 9:23 AM
To: 'Eric Dumazet'
Cc: Alexander Duyck; netdev; Kirsher, Jeffrey T
Subject: RE: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel
I was not stressing the NIC/CPU, since I only sent 290 Kpps of 400-byte packets towards eth10; the CPU was almost 100% idle.
BTW, there is some problem with the perf tool on 2.6.35.2; I will try to get you the top offenders if possible.
Thanks
WeiGu
-----Original Message-----
From: Eric Dumazet [mailto:eric.dumazet@gmail.com]
Sent: Monday, April 11, 2011 11:43 PM
To: Wei Gu
Cc: Alexander Duyck; netdev; Kirsher, Jeffrey T
Subject: RE: Low performance Intel 10GE NIC (3.2.10) on 2.6.38 Kernel
Le lundi 11 avril 2011 à 23:14 +0800, Wei Gu a écrit :
> I tried ixgbe-3.3.8 (insmod ixgbe.ko RSS=8,8,8,8,8,8,8,8 FdirMode=0,0,0,0,0,0,0,0 Node=0,0,1,1,2,2,3,3) from e1000.sf.net on both 2.6.35.1 and 2.6.35.2, with the same observation as with the 3.2.10 ixgbe driver: on 2.6.35.2 it has high rx errors:
> Ethtool -S eth10 |grep error
> rx_errors: 0
> tx_errors: 0
> rx_over_errors: 0
> rx_crc_errors: 0
> rx_frame_errors: 0
> rx_fifo_errors: 0
> rx_missed_errors: 2263088
> tx_aborted_errors: 0
> tx_carrier_errors: 0
> tx_fifo_errors: 0
> tx_heartbeat_errors: 0
> rx_long_length_errors: 0
> rx_short_length_errors: 0
> rx_csum_offload_errors: 0
> fcoe_last_errors: 0
>
It would be nice if you posted perf record / perf report results.
During your stress test, do:
perf record -a -g sleep 10
perf report
And post "top offenders"
Thanks
^ permalink raw reply
* RE: [stable] [PATCH NET-2.6 1/1] qlcnic: limit skb frags for non tso packet
From: Amit Salecha @ 2011-04-14 5:22 UTC (permalink / raw)
To: Greg KH; +Cc: David Miller, netdev, Anirban Chakraborty, stable, Ameen Rahman
In-Reply-To: <20110413161534.GA20578@kroah.com>
> > The footer will be present in my reply to this email, but the footer should not
> > be there in patches sent by me.
> > Can you verify patch version 2 again? Here,
> > http://patchwork.ozlabs.org/patch/90938/, I don't see any footer.
> > If you see a footer with patch version 2, please send me that.
>
> Your footer was not in your patch, correct. But it was in this email.
>
> And that's the issue, you can't have that footer on emails you send to a
> public list where you are going to be collaborating on a public project,
> otherwise no one can use anything you say.
>
> Now if you only think that people will just accept your patches, without
> being able to have you participate in development and maintenance of those
> patches (which is required to be done through email), you are mistaken.
>
> So please fix your email issue, otherwise it is not going to work.
>
> Note, other people at qualcomm have fixed this, so you are not alone.
>
Ok.
Our IT has fixed the footer issue. Sending this email to verify.
-Amit
This message and any attached documents contain information from QLogic Corporation or its wholly-owned subsidiaries that may be confidential. If you are not the intended recipient, you may not read, copy, distribute, or use this information. If you have received this transmission in error, please notify the sender immediately by reply e-mail and then delete this message.
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox