Netdev List

Netdev List
 help / color / mirror / Atom feed

* [RFC PATCH 0/3] route cache deletion and cleanups
From: David Miller @ 2011-02-18  0:34 UTC (permalink / raw)
  To: netdev


Here is a respin of the route cache deletion patch, along with some minor
cleanups that only become possible once the routing cache has been removed.

Enjoy.

^ permalink raw reply

* [RFC PATCH 1/3] ipv4: Delete routing cache.
From: David Miller @ 2011-02-18  0:34 UTC (permalink / raw)
  To: netdev


Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h     |    1 -
 net/ipv4/fib_frontend.c |    5 -
 net/ipv4/route.c        |  903 ++---------------------------------------------
 3 files changed, 24 insertions(+), 885 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index bf790c1..fcf1b11 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -117,7 +117,6 @@ extern int		ip_rt_init(void);
 extern void		ip_rt_redirect(__be32 old_gw, __be32 dst, __be32 new_gw,
 				       __be32 src, struct net_device *dev);
 extern void		rt_cache_flush(struct net *net, int how);
-extern void		rt_cache_flush_batch(struct net *net);
 extern int		__ip_route_output_key(struct net *, struct rtable **, const struct flowi *flp);
 extern int		ip_route_output_key(struct net *, struct rtable **, struct flowi *flp);
 extern int		ip_route_output_flow(struct net *, struct rtable **rp, struct flowi *flp, struct sock *sk, int flags);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 2a49c06..694145c 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -978,11 +978,6 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 		rt_cache_flush(dev_net(dev), 0);
 		break;
 	case NETDEV_UNREGISTER_BATCH:
-		/* The batch unregister is only called on the first
-		 * device in the list of devices being unregistered.
-		 * Therefore we should not pass dev_net(dev) in here.
-		 */
-		rt_cache_flush_batch(NULL);
 		break;
 	}
 	return NOTIFY_DONE;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 2facde0..f74149c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -129,7 +129,6 @@ static int ip_rt_gc_elasticity __read_mostly	= 8;
 static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
 static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
 static int ip_rt_min_advmss __read_mostly	= 256;
-static int rt_chain_length_max __read_mostly	= 20;
 
 /*
  *	Interface to generic destination cache.
@@ -222,184 +221,30 @@ const __u8 ip_tos2prio[16] = {
 };
 
 
-/*
- * Route cache.
- */
-
-/* The locking scheme is rather straight forward:
- *
- * 1) Read-Copy Update protects the buckets of the central route hash.
- * 2) Only writers remove entries, and they hold the lock
- *    as they look at rtable reference counts.
- * 3) Only readers acquire references to rtable entries,
- *    they do so with atomic increments and with the
- *    lock held.
- */
-
-struct rt_hash_bucket {
-	struct rtable __rcu	*chain;
-};
-
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
-	defined(CONFIG_PROVE_LOCKING)
-/*
- * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
- * The size of this table is a power of two and depends on the number of CPUS.
- * (on lockdep we have a quite big spinlock_t, so keep the size down there)
- */
-#ifdef CONFIG_LOCKDEP
-# define RT_HASH_LOCK_SZ	256
-#else
-# if NR_CPUS >= 32
-#  define RT_HASH_LOCK_SZ	4096
-# elif NR_CPUS >= 16
-#  define RT_HASH_LOCK_SZ	2048
-# elif NR_CPUS >= 8
-#  define RT_HASH_LOCK_SZ	1024
-# elif NR_CPUS >= 4
-#  define RT_HASH_LOCK_SZ	512
-# else
-#  define RT_HASH_LOCK_SZ	256
-# endif
-#endif
-
-static spinlock_t	*rt_hash_locks;
-# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
-
-static __init void rt_hash_lock_init(void)
-{
-	int i;
-
-	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
-			GFP_KERNEL);
-	if (!rt_hash_locks)
-		panic("IP: failed to allocate rt_hash_locks\n");
-
-	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
-		spin_lock_init(&rt_hash_locks[i]);
-}
-#else
-# define rt_hash_lock_addr(slot) NULL
-
-static inline void rt_hash_lock_init(void)
-{
-}
-#endif
-
-static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
-static unsigned			rt_hash_mask __read_mostly;
-static unsigned int		rt_hash_log  __read_mostly;
-
 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 
-static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
-				   int genid)
-{
-	return jhash_3words((__force u32)daddr, (__force u32)saddr,
-			    idx, genid)
-		& rt_hash_mask;
-}
-
 static inline int rt_genid(struct net *net)
 {
 	return atomic_read(&net->ipv4.rt_genid);
 }
 
 #ifdef CONFIG_PROC_FS
-struct rt_cache_iter_state {
-	struct seq_net_private p;
-	int bucket;
-	int genid;
-};
-
-static struct rtable *rt_cache_get_first(struct seq_file *seq)
-{
-	struct rt_cache_iter_state *st = seq->private;
-	struct rtable *r = NULL;
-
-	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
-		if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
-			continue;
-		rcu_read_lock_bh();
-		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
-		while (r) {
-			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
-			    r->rt_genid == st->genid)
-				return r;
-			r = rcu_dereference_bh(r->dst.rt_next);
-		}
-		rcu_read_unlock_bh();
-	}
-	return r;
-}
-
-static struct rtable *__rt_cache_get_next(struct seq_file *seq,
-					  struct rtable *r)
-{
-	struct rt_cache_iter_state *st = seq->private;
-
-	r = rcu_dereference_bh(r->dst.rt_next);
-	while (!r) {
-		rcu_read_unlock_bh();
-		do {
-			if (--st->bucket < 0)
-				return NULL;
-		} while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
-		rcu_read_lock_bh();
-		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
-	}
-	return r;
-}
-
-static struct rtable *rt_cache_get_next(struct seq_file *seq,
-					struct rtable *r)
-{
-	struct rt_cache_iter_state *st = seq->private;
-	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
-		if (dev_net(r->dst.dev) != seq_file_net(seq))
-			continue;
-		if (r->rt_genid == st->genid)
-			break;
-	}
-	return r;
-}
-
-static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
-{
-	struct rtable *r = rt_cache_get_first(seq);
-
-	if (r)
-		while (pos && (r = rt_cache_get_next(seq, r)))
-			--pos;
-	return pos ? NULL : r;
-}
-
 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	struct rt_cache_iter_state *st = seq->private;
 	if (*pos)
-		return rt_cache_get_idx(seq, *pos - 1);
-	st->genid = rt_genid(seq_file_net(seq));
+		return NULL;
 	return SEQ_START_TOKEN;
 }
 
 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	struct rtable *r;
-
-	if (v == SEQ_START_TOKEN)
-		r = rt_cache_get_first(seq);
-	else
-		r = rt_cache_get_next(seq, v);
 	++*pos;
-	return r;
+	return NULL;
 }
 
 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 {
-	if (v && v != SEQ_START_TOKEN)
-		rcu_read_unlock_bh();
 }
 
 static int rt_cache_seq_show(struct seq_file *seq, void *v)
@@ -409,29 +254,6 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
 			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 			   "HHUptod\tSpecDst");
-	else {
-		struct rtable *r = v;
-		int len;
-
-		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
-			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
-			r->dst.dev ? r->dst.dev->name : "*",
-			(__force u32)r->rt_dst,
-			(__force u32)r->rt_gateway,
-			r->rt_flags, atomic_read(&r->dst.__refcnt),
-			r->dst.__use, 0, (__force u32)r->rt_src,
-			dst_metric_advmss(&r->dst) + 40,
-			dst_metric(&r->dst, RTAX_WINDOW),
-			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
-			      dst_metric(&r->dst, RTAX_RTTVAR)),
-			r->fl.fl4_tos,
-			r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
-			r->dst.hh ? (r->dst.hh->hh_output ==
-				       dev_queue_xmit) : 0,
-			r->rt_spec_dst, &len);
-
-		seq_printf(seq, "%*s\n", 127 - len, "");
-	}
 	return 0;
 }
 
@@ -444,8 +266,7 @@ static const struct seq_operations rt_cache_seq_ops = {
 
 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 {
-	return seq_open_net(inode, file, &rt_cache_seq_ops,
-			sizeof(struct rt_cache_iter_state));
+	return seq_open_net(inode, file, &rt_cache_seq_ops, 0);
 }
 
 static const struct file_operations rt_cache_seq_fops = {
@@ -643,184 +464,12 @@ static inline int ip_rt_proc_init(void)
 }
 #endif /* CONFIG_PROC_FS */
 
-static inline void rt_free(struct rtable *rt)
-{
-	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
-}
-
-static inline void rt_drop(struct rtable *rt)
-{
-	ip_rt_put(rt);
-	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
-}
-
-static inline int rt_fast_clean(struct rtable *rth)
-{
-	/* Kill broadcast/multicast entries very aggresively, if they
-	   collide in hash table with more useful entries */
-	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
-		rt_is_input_route(rth) && rth->dst.rt_next;
-}
-
-static inline int rt_valuable(struct rtable *rth)
-{
-	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
-		(rth->peer && rth->peer->pmtu_expires);
-}
-
-static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
-{
-	unsigned long age;
-	int ret = 0;
-
-	if (atomic_read(&rth->dst.__refcnt))
-		goto out;
-
-	age = jiffies - rth->dst.lastuse;
-	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
-	    (age <= tmo2 && rt_valuable(rth)))
-		goto out;
-	ret = 1;
-out:	return ret;
-}
-
-/* Bits of score are:
- * 31: very valuable
- * 30: not quite useless
- * 29..0: usage counter
- */
-static inline u32 rt_score(struct rtable *rt)
-{
-	u32 score = jiffies - rt->dst.lastuse;
-
-	score = ~score & ~(3<<30);
-
-	if (rt_valuable(rt))
-		score |= (1<<31);
-
-	if (rt_is_output_route(rt) ||
-	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
-		score |= (1<<30);
-
-	return score;
-}
-
-static inline bool rt_caching(const struct net *net)
-{
-	return net->ipv4.current_rt_cache_rebuild_count <=
-		net->ipv4.sysctl_rt_cache_rebuild_count;
-}
-
-static inline bool compare_hash_inputs(const struct flowi *fl1,
-					const struct flowi *fl2)
-{
-	return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
-		((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
-		(fl1->iif ^ fl2->iif)) == 0);
-}
-
-static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
-{
-	return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
-		((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
-		(fl1->mark ^ fl2->mark) |
-		(*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) |
-		(fl1->oif ^ fl2->oif) |
-		(fl1->iif ^ fl2->iif)) == 0;
-}
-
-static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
-{
-	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
-}
-
 static inline int rt_is_expired(struct rtable *rth)
 {
 	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 }
 
 /*
- * Perform a full scan of hash table and free all entries.
- * Can be called by a softirq or a process.
- * In the later case, we want to be reschedule if necessary
- */
-static void rt_do_flush(struct net *net, int process_context)
-{
-	unsigned int i;
-	struct rtable *rth, *next;
-
-	for (i = 0; i <= rt_hash_mask; i++) {
-		struct rtable __rcu **pprev;
-		struct rtable *list;
-
-		if (process_context && need_resched())
-			cond_resched();
-		rth = rcu_dereference_raw(rt_hash_table[i].chain);
-		if (!rth)
-			continue;
-
-		spin_lock_bh(rt_hash_lock_addr(i));
-
-		list = NULL;
-		pprev = &rt_hash_table[i].chain;
-		rth = rcu_dereference_protected(*pprev,
-			lockdep_is_held(rt_hash_lock_addr(i)));
-
-		while (rth) {
-			next = rcu_dereference_protected(rth->dst.rt_next,
-				lockdep_is_held(rt_hash_lock_addr(i)));
-
-			if (!net ||
-			    net_eq(dev_net(rth->dst.dev), net)) {
-				rcu_assign_pointer(*pprev, next);
-				rcu_assign_pointer(rth->dst.rt_next, list);
-				list = rth;
-			} else {
-				pprev = &rth->dst.rt_next;
-			}
-			rth = next;
-		}
-
-		spin_unlock_bh(rt_hash_lock_addr(i));
-
-		for (; list; list = next) {
-			next = rcu_dereference_protected(list->dst.rt_next, 1);
-			rt_free(list);
-		}
-	}
-}
-
-/*
- * While freeing expired entries, we compute average chain length
- * and standard deviation, using fixed-point arithmetic.
- * This to have an estimation of rt_chain_length_max
- *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
- * We use 3 bits for frational part, and 29 (or 61) for magnitude.
- */
-
-#define FRACT_BITS 3
-#define ONE (1UL << FRACT_BITS)
-
-/*
- * Given a hash chain and an item in this hash chain,
- * find if a previous entry has the same hash_inputs
- * (but differs on tos, mark or oif)
- * Returns 0 if an alias is found.
- * Returns ONE if rth has no alias before itself.
- */
-static int has_noalias(const struct rtable *head, const struct rtable *rth)
-{
-	const struct rtable *aux = head;
-
-	while (aux != rth) {
-		if (compare_hash_inputs(&aux->fl, &rth->fl))
-			return 0;
-		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
-	}
-	return ONE;
-}
-
-/*
  * Pertubation of rt_genid by a small quantity [1..256]
  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
  * many times (2^24) without giving recent rt_genid.
@@ -841,366 +490,32 @@ static void rt_cache_invalidate(struct net *net)
 void rt_cache_flush(struct net *net, int delay)
 {
 	rt_cache_invalidate(net);
-	if (delay >= 0)
-		rt_do_flush(net, !in_softirq());
 }
 
-/* Flush previous cache invalidated entries from the cache */
-void rt_cache_flush_batch(struct net *net)
-{
-	rt_do_flush(net, !in_softirq());
-}
-
-static void rt_emergency_hash_rebuild(struct net *net)
-{
-	if (net_ratelimit())
-		printk(KERN_WARNING "Route hash chain too long!\n");
-	rt_cache_invalidate(net);
-}
-
-/*
-   Short description of GC goals.
-
-   We want to build algorithm, which will keep routing cache
-   at some equilibrium point, when number of aged off entries
-   is kept approximately equal to newly generated ones.
-
-   Current expiration strength is variable "expire".
-   We try to adjust it dynamically, so that if networking
-   is idle expires is large enough to keep enough of warm entries,
-   and when load increases it reduces to limit cache size.
- */
-
 static int rt_garbage_collect(struct dst_ops *ops)
 {
-	static unsigned long expire = RT_GC_TIMEOUT;
-	static unsigned long last_gc;
-	static int rover;
-	static int equilibrium;
-	struct rtable *rth;
-	struct rtable __rcu **rthp;
-	unsigned long now = jiffies;
-	int goal;
-	int entries = dst_entries_get_fast(&ipv4_dst_ops);
-
-	/*
-	 * Garbage collection is pretty expensive,
-	 * do not make it too frequently.
-	 */
-
 	RT_CACHE_STAT_INC(gc_total);
-
-	if (now - last_gc < ip_rt_gc_min_interval &&
-	    entries < ip_rt_max_size) {
-		RT_CACHE_STAT_INC(gc_ignored);
-		goto out;
-	}
-
-	entries = dst_entries_get_slow(&ipv4_dst_ops);
-	/* Calculate number of entries, which we want to expire now. */
-	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
-	if (goal <= 0) {
-		if (equilibrium < ipv4_dst_ops.gc_thresh)
-			equilibrium = ipv4_dst_ops.gc_thresh;
-		goal = entries - equilibrium;
-		if (goal > 0) {
-			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
-			goal = entries - equilibrium;
-		}
-	} else {
-		/* We are in dangerous area. Try to reduce cache really
-		 * aggressively.
-		 */
-		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
-		equilibrium = entries - goal;
-	}
-
-	if (now - last_gc >= ip_rt_gc_min_interval)
-		last_gc = now;
-
-	if (goal <= 0) {
-		equilibrium += goal;
-		goto work_done;
-	}
-
-	do {
-		int i, k;
-
-		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
-			unsigned long tmo = expire;
-
-			k = (k + 1) & rt_hash_mask;
-			rthp = &rt_hash_table[k].chain;
-			spin_lock_bh(rt_hash_lock_addr(k));
-			while ((rth = rcu_dereference_protected(*rthp,
-					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
-				if (!rt_is_expired(rth) &&
-					!rt_may_expire(rth, tmo, expire)) {
-					tmo >>= 1;
-					rthp = &rth->dst.rt_next;
-					continue;
-				}
-				*rthp = rth->dst.rt_next;
-				rt_free(rth);
-				goal--;
-			}
-			spin_unlock_bh(rt_hash_lock_addr(k));
-			if (goal <= 0)
-				break;
-		}
-		rover = k;
-
-		if (goal <= 0)
-			goto work_done;
-
-		/* Goal is not achieved. We stop process if:
-
-		   - if expire reduced to zero. Otherwise, expire is halfed.
-		   - if table is not full.
-		   - if we are called from interrupt.
-		   - jiffies check is just fallback/debug loop breaker.
-		     We will not spin here for long time in any case.
-		 */
-
-		RT_CACHE_STAT_INC(gc_goal_miss);
-
-		if (expire == 0)
-			break;
-
-		expire >>= 1;
-#if RT_CACHE_DEBUG >= 2
-		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
-				dst_entries_get_fast(&ipv4_dst_ops), goal, i);
-#endif
-
-		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
-			goto out;
-	} while (!in_softirq() && time_before_eq(jiffies, now));
-
-	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
-		goto out;
-	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
-		goto out;
-	if (net_ratelimit())
-		printk(KERN_WARNING "dst cache overflow\n");
-	RT_CACHE_STAT_INC(gc_dst_overflow);
-	return 1;
-
-work_done:
-	expire += ip_rt_gc_min_interval;
-	if (expire > ip_rt_gc_timeout ||
-	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
-	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
-		expire = ip_rt_gc_timeout;
-#if RT_CACHE_DEBUG >= 2
-	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
-			dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
-#endif
-out:	return 0;
-}
-
-/*
- * Returns number of entries in a hash chain that have different hash_inputs
- */
-static int slow_chain_length(const struct rtable *head)
-{
-	int length = 0;
-	const struct rtable *rth = head;
-
-	while (rth) {
-		length += has_noalias(head, rth);
-		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
-	}
-	return length >> FRACT_BITS;
+	return 0;
 }
 
-static int rt_intern_hash(unsigned hash, struct rtable *rt,
-			  struct rtable **rp, struct sk_buff *skb, int ifindex)
+static int rt_finalize(struct rtable *rt, struct rtable **rp, struct sk_buff *skb)
 {
-	struct rtable	*rth, *cand;
-	struct rtable __rcu **rthp, **candp;
-	unsigned long	now;
-	u32 		min_score;
-	int		chain_length;
-	int attempts = !in_softirq();
-
-restart:
-	chain_length = 0;
-	min_score = ~(u32)0;
-	cand = NULL;
-	candp = NULL;
-	now = jiffies;
-
-	if (!rt_caching(dev_net(rt->dst.dev))) {
-		/*
-		 * If we're not caching, just tell the caller we
-		 * were successful and don't touch the route.  The
-		 * caller hold the sole reference to the cache entry, and
-		 * it will be released when the caller is done with it.
-		 * If we drop it here, the callers have no way to resolve routes
-		 * when we're not caching.  Instead, just point *rp at rt, so
-		 * the caller gets a single use out of the route
-		 * Note that we do rt_free on this new route entry, so that
-		 * once its refcount hits zero, we are still able to reap it
-		 * (Thanks Alexey)
-		 * Note: To avoid expensive rcu stuff for this uncached dst,
-		 * we set DST_NOCACHE so that dst_release() can free dst without
-		 * waiting a grace period.
-		 */
-
-		rt->dst.flags |= DST_NOCACHE;
-		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
-			int err = arp_bind_neighbour(&rt->dst);
-			if (err) {
-				if (net_ratelimit())
-					printk(KERN_WARNING
-					    "Neighbour table failure & not caching routes.\n");
-				ip_rt_put(rt);
-				return err;
-			}
-		}
-
-		goto skip_hashing;
-	}
-
-	rthp = &rt_hash_table[hash].chain;
-
-	spin_lock_bh(rt_hash_lock_addr(hash));
-	while ((rth = rcu_dereference_protected(*rthp,
-			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
-		if (rt_is_expired(rth)) {
-			*rthp = rth->dst.rt_next;
-			rt_free(rth);
-			continue;
-		}
-		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
-			/* Put it first */
-			*rthp = rth->dst.rt_next;
-			/*
-			 * Since lookup is lockfree, the deletion
-			 * must be visible to another weakly ordered CPU before
-			 * the insertion at the start of the hash chain.
-			 */
-			rcu_assign_pointer(rth->dst.rt_next,
-					   rt_hash_table[hash].chain);
-			/*
-			 * Since lookup is lockfree, the update writes
-			 * must be ordered for consistency on SMP.
-			 */
-			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
-
-			dst_use(&rth->dst, now);
-			spin_unlock_bh(rt_hash_lock_addr(hash));
-
-			rt_drop(rt);
-			if (rp)
-				*rp = rth;
-			else
-				skb_dst_set(skb, &rth->dst);
-			return 0;
-		}
-
-		if (!atomic_read(&rth->dst.__refcnt)) {
-			u32 score = rt_score(rth);
-
-			if (score <= min_score) {
-				cand = rth;
-				candp = rthp;
-				min_score = score;
-			}
-		}
-
-		chain_length++;
-
-		rthp = &rth->dst.rt_next;
-	}
-
-	if (cand) {
-		/* ip_rt_gc_elasticity used to be average length of chain
-		 * length, when exceeded gc becomes really aggressive.
-		 *
-		 * The second limit is less certain. At the moment it allows
-		 * only 2 entries per bucket. We will see.
-		 */
-		if (chain_length > ip_rt_gc_elasticity) {
-			*candp = cand->dst.rt_next;
-			rt_free(cand);
-		}
-	} else {
-		if (chain_length > rt_chain_length_max &&
-		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
-			struct net *net = dev_net(rt->dst.dev);
-			int num = ++net->ipv4.current_rt_cache_rebuild_count;
-			if (!rt_caching(net)) {
-				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
-					rt->dst.dev->name, num);
-			}
-			rt_emergency_hash_rebuild(net);
-			spin_unlock_bh(rt_hash_lock_addr(hash));
-
-			hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
-					ifindex, rt_genid(net));
-			goto restart;
-		}
-	}
-
-	/* Try to bind route to arp only if it is output
-	   route or unicast forwarding path.
+	/* To avoid expensive rcu stuff for this uncached dst, we set
+	 * DST_NOCACHE so that dst_release() can free dst without
+	 * waiting a grace period.
 	 */
+	rt->dst.flags |= DST_NOCACHE;
 	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
 		int err = arp_bind_neighbour(&rt->dst);
 		if (err) {
-			spin_unlock_bh(rt_hash_lock_addr(hash));
-
-			if (err != -ENOBUFS) {
-				rt_drop(rt);
-				return err;
-			}
-
-			/* Neighbour tables are full and nothing
-			   can be released. Try to shrink route cache,
-			   it is most likely it holds some neighbour records.
-			 */
-			if (attempts-- > 0) {
-				int saved_elasticity = ip_rt_gc_elasticity;
-				int saved_int = ip_rt_gc_min_interval;
-				ip_rt_gc_elasticity	= 1;
-				ip_rt_gc_min_interval	= 0;
-				rt_garbage_collect(&ipv4_dst_ops);
-				ip_rt_gc_min_interval	= saved_int;
-				ip_rt_gc_elasticity	= saved_elasticity;
-				goto restart;
-			}
-
 			if (net_ratelimit())
-				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
-			rt_drop(rt);
-			return -ENOBUFS;
+				printk(KERN_WARNING
+				       "Neighbour table failure & not caching routes.\n");
+			ip_rt_put(rt);
+			return err;
 		}
 	}
 
-	rt->dst.rt_next = rt_hash_table[hash].chain;
-
-#if RT_CACHE_DEBUG >= 2
-	if (rt->dst.rt_next) {
-		struct rtable *trt;
-		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
-		       hash, &rt->rt_dst);
-		for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
-			printk(" . %pI4", &trt->rt_dst);
-		printk("\n");
-	}
-#endif
-	/*
-	 * Since lookup is lockfree, we must make sure
-	 * previous writes to rt are comitted to memory
-	 * before making rt visible to other CPUS.
-	 */
-	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
-
-	spin_unlock_bh(rt_hash_lock_addr(hash));
-
-skip_hashing:
 	if (rp)
 		*rp = rt;
 	else
@@ -1270,26 +585,6 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 }
 EXPORT_SYMBOL(__ip_select_ident);
 
-static void rt_del(unsigned hash, struct rtable *rt)
-{
-	struct rtable __rcu **rthp;
-	struct rtable *aux;
-
-	rthp = &rt_hash_table[hash].chain;
-	spin_lock_bh(rt_hash_lock_addr(hash));
-	ip_rt_put(rt);
-	while ((aux = rcu_dereference_protected(*rthp,
-			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
-		if (aux == rt || rt_is_expired(aux)) {
-			*rthp = aux->dst.rt_next;
-			rt_free(aux);
-			continue;
-		}
-		rthp = &aux->dst.rt_next;
-	}
-	spin_unlock_bh(rt_hash_lock_addr(hash));
-}
-
 /* called in rcu_read_lock() section */
 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 		    __be32 saddr, struct net_device *dev)
@@ -1348,14 +643,11 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 			ip_rt_put(rt);
 			ret = NULL;
 		} else if (rt->rt_flags & RTCF_REDIRECTED) {
-			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
-						rt->fl.oif,
-						rt_genid(dev_net(dst->dev)));
 #if RT_CACHE_DEBUG >= 1
 			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
-				&rt->rt_dst, rt->fl.fl4_tos);
+			       &rt->rt_dst, rt->fl.fl4_tos);
 #endif
-			rt_del(hash, rt);
+			ip_rt_put(rt);
 			ret = NULL;
 		} else if (rt->peer &&
 			   rt->peer->pmtu_expires &&
@@ -1833,7 +1125,6 @@ static struct rtable *rt_dst_alloc(bool nopolicy, bool noxfrm)
 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 				u8 tos, struct net_device *dev, int our)
 {
-	unsigned int hash;
 	struct rtable *rth;
 	__be32 spec_dst;
 	struct in_device *in_dev = __in_dev_get_rcu(dev);
@@ -1895,8 +1186,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 #endif
 	RT_CACHE_STAT_INC(in_slow_mc);
 
-	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
-	return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
+	return rt_finalize(rth, NULL, skb);
 
 e_nobufs:
 	return -ENOBUFS;
@@ -2036,7 +1326,6 @@ static int ip_mkroute_input(struct sk_buff *skb,
 {
 	struct rtable* rth = NULL;
 	int err;
-	unsigned hash;
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
@@ -2049,9 +1338,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
 		return err;
 
 	/* put it into the cache */
-	hash = rt_hash(daddr, saddr, fl->iif,
-		       rt_genid(dev_net(rth->dst.dev)));
-	return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
+	return rt_finalize(rth, NULL, skb);
 }
 
 /*
@@ -2079,7 +1366,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	unsigned	flags = 0;
 	u32		itag = 0;
 	struct rtable * rth;
-	unsigned	hash;
 	__be32		spec_dst;
 	int		err = -EINVAL;
 	struct net    * net = dev_net(dev);
@@ -2193,8 +1479,7 @@ local_input:
 		rth->rt_flags 	&= ~RTCF_LOCAL;
 	}
 	rth->rt_type	= res.type;
-	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
-	err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
+	err = rt_finalize(rth, NULL, skb);
 	goto out;
 
 no_route:
@@ -2238,47 +1523,10 @@ martian_source_keep_err:
 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 			   u8 tos, struct net_device *dev, bool noref)
 {
-	struct rtable * rth;
-	unsigned	hash;
-	int iif = dev->ifindex;
-	struct net *net;
 	int res;
 
-	net = dev_net(dev);
-
 	rcu_read_lock();
 
-	if (!rt_caching(net))
-		goto skip_cache;
-
-	tos &= IPTOS_RT_MASK;
-	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
-
-	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
-	     rth = rcu_dereference(rth->dst.rt_next)) {
-		if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
-		     ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
-		     (rth->fl.iif ^ iif) |
-		     rth->fl.oif |
-		     (rth->fl.fl4_tos ^ tos)) == 0 &&
-		    rth->fl.mark == skb->mark &&
-		    net_eq(dev_net(rth->dst.dev), net) &&
-		    !rt_is_expired(rth)) {
-			if (noref) {
-				dst_use_noref(&rth->dst, jiffies);
-				skb_dst_set_noref(skb, &rth->dst);
-			} else {
-				dst_use(&rth->dst, jiffies);
-				skb_dst_set(skb, &rth->dst);
-			}
-			RT_CACHE_STAT_INC(in_hit);
-			rcu_read_unlock();
-			return 0;
-		}
-		RT_CACHE_STAT_INC(in_hlist_search);
-	}
-
-skip_cache:
 	/* Multicast recognition logic is moved from route cache to here.
 	   The problem was that too many Ethernet cards have broken/missing
 	   hardware multicast filters :-( As result the host on multicasting
@@ -2419,11 +1667,10 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 
 /*
  * Major route resolver routine.
- * called with rcu_read_lock();
  */
 
-static int ip_route_output_slow(struct net *net, struct rtable **rp,
-				const struct flowi *oldflp)
+int __ip_route_output_key(struct net *net, struct rtable **rp,
+			  const struct flowi *oldflp)
 {
 	u32 tos	= RT_FL_TOS(oldflp);
 	struct flowi fl = { .fl4_dst = oldflp->fl4_dst,
@@ -2600,55 +1847,13 @@ make_route:
 	rth = __mkroute_output(&res, &fl, oldflp, dev_out, flags);
 	if (IS_ERR(rth))
 		err = PTR_ERR(rth);
-	else {
-		unsigned int hash;
-
-		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
-			       rt_genid(dev_net(dev_out)));
-		err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
-	}
+	else
+		err = rt_finalize(rth, rp, NULL);
 
 out:
 	rcu_read_unlock();
 	return err;
 }
-
-int __ip_route_output_key(struct net *net, struct rtable **rp,
-			  const struct flowi *flp)
-{
-	struct rtable *rth;
-	unsigned int hash;
-
-	if (!rt_caching(net))
-		goto slow_output;
-
-	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
-
-	rcu_read_lock_bh();
-	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
-		rth = rcu_dereference_bh(rth->dst.rt_next)) {
-		if (rth->fl.fl4_dst == flp->fl4_dst &&
-		    rth->fl.fl4_src == flp->fl4_src &&
-		    rt_is_output_route(rth) &&
-		    rth->fl.oif == flp->oif &&
-		    rth->fl.mark == flp->mark &&
-		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
-			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
-		    net_eq(dev_net(rth->dst.dev), net) &&
-		    !rt_is_expired(rth)) {
-			dst_use(&rth->dst, jiffies);
-			RT_CACHE_STAT_INC(out_hit);
-			rcu_read_unlock_bh();
-			*rp = rth;
-			return 0;
-		}
-		RT_CACHE_STAT_INC(out_hlist_search);
-	}
-	rcu_read_unlock_bh();
-
-slow_output:
-	return ip_route_output_slow(net, rp, flp);
-}
 EXPORT_SYMBOL_GPL(__ip_route_output_key);
 
 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
@@ -2942,43 +2147,6 @@ errout_free:
 
 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
 {
-	struct rtable *rt;
-	int h, s_h;
-	int idx, s_idx;
-	struct net *net;
-
-	net = sock_net(skb->sk);
-
-	s_h = cb->args[0];
-	if (s_h < 0)
-		s_h = 0;
-	s_idx = idx = cb->args[1];
-	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
-		if (!rt_hash_table[h].chain)
-			continue;
-		rcu_read_lock_bh();
-		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
-		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
-			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
-				continue;
-			if (rt_is_expired(rt))
-				continue;
-			skb_dst_set_noref(skb, &rt->dst);
-			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
-					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
-					 1, NLM_F_MULTI) <= 0) {
-				skb_dst_drop(skb);
-				rcu_read_unlock_bh();
-				goto done;
-			}
-			skb_dst_drop(skb);
-		}
-		rcu_read_unlock_bh();
-	}
-
-done:
-	cb->args[0] = h;
-	cb->args[1] = idx;
 	return skb->len;
 }
 
@@ -3211,16 +2379,6 @@ static __net_initdata struct pernet_operations rt_genid_ops = {
 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
 #endif /* CONFIG_IP_ROUTE_CLASSID */
 
-static __initdata unsigned long rhash_entries;
-static int __init set_rhash_entries(char *str)
-{
-	if (!str)
-		return 0;
-	rhash_entries = simple_strtoul(str, &str, 0);
-	return 1;
-}
-__setup("rhash_entries=", set_rhash_entries);
-
 int __init ip_rt_init(void)
 {
 	int rc = 0;
@@ -3243,21 +2401,8 @@ int __init ip_rt_init(void)
 	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
 		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
 
-	rt_hash_table = (struct rt_hash_bucket *)
-		alloc_large_system_hash("IP route cache",
-					sizeof(struct rt_hash_bucket),
-					rhash_entries,
-					(totalram_pages >= 128 * 1024) ?
-					15 : 17,
-					0,
-					&rt_hash_log,
-					&rt_hash_mask,
-					rhash_entries ? 0 : 512 * 1024);
-	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
-	rt_hash_lock_init();
-
-	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
-	ip_rt_max_size = (rt_hash_mask + 1) * 16;
+	ipv4_dst_ops.gc_thresh = ~0;
+	ip_rt_max_size = INT_MAX;
 
 	devinet_init();
 	ip_fib_init();
-- 
1.7.4.1


^ permalink raw reply related

* [RFC PATCH 2/3] ipv4: Kill ip_route_input_noref().
From: David Miller @ 2011-02-18  0:34 UTC (permalink / raw)
  To: netdev


The "noref" argument to ip_route_input_common() is now always ignored
because we do not cache routes, and in that case we must always grab
a reference to the resulting 'dst'.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/route.h    |   16 ++--------------
 net/ipv4/arp.c         |    2 +-
 net/ipv4/ip_input.c    |    4 ++--
 net/ipv4/route.c       |    6 +++---
 net/ipv4/xfrm4_input.c |    4 ++--
 5 files changed, 10 insertions(+), 22 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index fcf1b11..c403a69 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -121,20 +121,8 @@ extern int		__ip_route_output_key(struct net *, struct rtable **, const struct f
 extern int		ip_route_output_key(struct net *, struct rtable **, struct flowi *flp);
 extern int		ip_route_output_flow(struct net *, struct rtable **rp, struct flowi *flp, struct sock *sk, int flags);
 
-extern int ip_route_input_common(struct sk_buff *skb, __be32 dst, __be32 src,
-				 u8 tos, struct net_device *devin, bool noref);
-
-static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
-				 u8 tos, struct net_device *devin)
-{
-	return ip_route_input_common(skb, dst, src, tos, devin, false);
-}
-
-static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src,
-				       u8 tos, struct net_device *devin)
-{
-	return ip_route_input_common(skb, dst, src, tos, devin, true);
-}
+extern int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
+			  u8 tos, struct net_device *devin);
 
 extern unsigned short	ip_rt_frag_needed(struct net *net, struct iphdr *iph, unsigned short new_mtu, struct net_device *dev);
 extern void		ip_rt_send_redirect(struct sk_buff *skb);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 7927589..555b412 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -873,7 +873,7 @@ static int arp_process(struct sk_buff *skb)
 	}
 
 	if (arp->ar_op == htons(ARPOP_REQUEST) &&
-	    ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
+	    ip_route_input(skb, tip, sip, 0, dev) == 0) {
 
 		rt = skb_rtable(skb);
 		addr_type = rt->rt_type;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index d7b2b09..577eb45 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -324,8 +324,8 @@ static int ip_rcv_finish(struct sk_buff *skb)
 	 *	how the packet travels inside Linux networking.
 	 */
 	if (skb_dst(skb) == NULL) {
-		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
-					       iph->tos, skb->dev);
+		int err = ip_route_input(skb, iph->daddr, iph->saddr,
+					 iph->tos, skb->dev);
 		if (unlikely(err)) {
 			if (err == -EHOSTUNREACH)
 				IP_INC_STATS_BH(dev_net(skb->dev),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f74149c..488094d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1520,8 +1520,8 @@ martian_source_keep_err:
 	goto out;
 }
 
-int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-			   u8 tos, struct net_device *dev, bool noref)
+int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+		   u8 tos, struct net_device *dev)
 {
 	int res;
 
@@ -1564,7 +1564,7 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rcu_read_unlock();
 	return res;
 }
-EXPORT_SYMBOL(ip_route_input_common);
+EXPORT_SYMBOL(ip_route_input);
 
 /* called with rcu_read_lock() */
 static struct rtable *__mkroute_output(const struct fib_result *res,
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 06814b6..58d23a5 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -27,8 +27,8 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
 	if (skb_dst(skb) == NULL) {
 		const struct iphdr *iph = ip_hdr(skb);
 
-		if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
-					 iph->tos, skb->dev))
+		if (ip_route_input(skb, iph->daddr, iph->saddr,
+				   iph->tos, skb->dev))
 			goto drop;
 	}
 	return dst_input(skb);
-- 
1.7.4.1


^ permalink raw reply related

* [RFC PATCH 3/3] ipv4: Set DST_NOCACHE in rt_dst_alloc().
From: David Miller @ 2011-02-18  0:34 UTC (permalink / raw)
  To: netdev


Instead of using a read/modify/write in rt_finalize().

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c |   11 +++++------
 1 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 488094d..01b27ff 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -500,11 +500,6 @@ static int rt_garbage_collect(struct dst_ops *ops)
 
 static int rt_finalize(struct rtable *rt, struct rtable **rp, struct sk_buff *skb)
 {
-	/* To avoid expensive rcu stuff for this uncached dst, we set
-	 * DST_NOCACHE so that dst_release() can free dst without
-	 * waiting a grace period.
-	 */
-	rt->dst.flags |= DST_NOCACHE;
 	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
 		int err = arp_bind_neighbour(&rt->dst);
 		if (err) {
@@ -1114,7 +1109,11 @@ static struct rtable *rt_dst_alloc(bool nopolicy, bool noxfrm)
 	if (rt) {
 		rt->dst.obsolete = -1;
 
-		rt->dst.flags = DST_HOST |
+		/* To avoid expensive rcu stuff for this uncached dst, we set
+		 * DST_NOCACHE so that dst_release() can free dst without
+		 * waiting a grace period.
+		 */
+		rt->dst.flags = DST_NOCACHE | DST_HOST |
 			(nopolicy ? DST_NOPOLICY : 0) |
 			(noxfrm ? DST_NOXFRM : 0);
 	}
-- 
1.7.4.1


^ permalink raw reply related

* linux-next: manual merge of the net tree with the net-current tree
From: Stephen Rothwell @ 2011-02-18  1:20 UTC (permalink / raw)
  To: David Miller, netdev
  Cc: linux-next, linux-kernel, Jesse Brandeburg, Jeff Kirsher

Hi all,

Today's linux-next merge of the net tree got a conflict in
drivers/net/e1000e/netdev.c between commit
713b3c9e4c1a6da6b45da6474ed554ed0a48de69 ("e1000e: flush all writebacks
before unload") from the net-current tree and commit
67fd4fcb78a7ced369a6bd8a131ec8c65ebd2bbb ("e1000e: convert to stats64")
from the net tree.

Just context changes.  I fixed it up (see below) and can carry the fix as
necessary.
-- 
Cheers,
Stephen Rothwell                    sfr@canb.auug.org.au

diff --cc drivers/net/e1000e/netdev.c
index 3fa110d,7cedfeb..0000000
--- a/drivers/net/e1000e/netdev.c
+++ b/drivers/net/e1000e/netdev.c
@@@ -3344,21 -3335,8 +3341,23 @@@ int e1000e_up(struct e1000_adapter *ada
  	return 0;
  }
  
 +static void e1000e_flush_descriptors(struct e1000_adapter *adapter)
 +{
 +	struct e1000_hw *hw = &adapter->hw;
 +
 +	if (!(adapter->flags2 & FLAG2_DMA_BURST))
 +		return;
 +
 +	/* flush pending descriptor writebacks to memory */
 +	ew32(TIDV, adapter->tx_int_delay | E1000_TIDV_FPD);
 +	ew32(RDTR, adapter->rx_int_delay | E1000_RDTR_FPD);
 +
 +	/* execute the writes immediately */
 +	e1e_flush();
 +}
 +
+ static void e1000e_update_stats(struct e1000_adapter *adapter);
+ 
  void e1000e_down(struct e1000_adapter *adapter)
  {
  	struct net_device *netdev = adapter->netdev;
@@@ -4179,11 -4154,7 +4186,10 @@@ static void e1000_watchdog_task(struct 
  	struct e1000_ring *tx_ring = adapter->tx_ring;
  	struct e1000_hw *hw = &adapter->hw;
  	u32 link, tctl;
- 	int tx_pending = 0;
  
 +	if (test_bit(__E1000_DOWN, &adapter->state))
 +		return;
 +
  	link = e1000e_has_link(adapter);
  	if ((netif_carrier_ok(netdev)) && link) {
  		/* Cancel scheduled suspend requests. */

^ permalink raw reply

* Netconsole crash on 2.6.38-rc3
From: Sarah Sharp @ 2011-02-18  1:28 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Greg KH

[-- Attachment #1: Type: text/plain, Size: 803 bytes --]

I'm trying to debug an xHCI driver crash on 2.6.38-rc3, and netconsole
is crashing when I try to load it.  I will try to update to 2.6.38-rc5,
but I'm sort of stuck on rc3 since Greg KH's USB tree is based on that.

Attached are the two scripts I use to set up my box and call netconsole.
netconsole-on-network.sh is called first, followed by
netconsole-ending-on-network.sh.

When I invoked the netconsole-ending-on-network.sh script, netconsole
failed to load with an error about having the wrong ethernet device.  My
ethernet device apparently migrated from eth1 to eth0 on that box.

After modifying my script, unloading the netconsole driver, and
re-running netconsole-ending-on-network.sh, I got a "Killed" message
with the attached trace in dmesg.

Is this a known bug on 2.6.38-rc3?

Sarah Sharp

[-- Attachment #2: netconsole-crash.txt --]
[-- Type: text/plain, Size: 3918 bytes --]

[   30.336011] eth0: no IPv6 routers present
[   62.165508] netconsole: local port 6665
[   62.165512] netconsole: local IP 0.0.0.0
[   62.165513] netconsole: interface 'eth1'
[   62.165514] netconsole: remote port 6666
[   62.165515] netconsole: remote IP 192.168.1.138
[   62.165517] netconsole: remote ethernet address ff:ff:ff:ff:ff:ff
[   62.165518] netconsole: eth1 doesn't exist, aborting.
[   62.165520] netconsole: cleaning up
[   98.791662] netconsole: local port 6665
[   98.791666] netconsole: local IP 0.0.0.0
[   98.791667] netconsole: interface 'eth0'
[   98.791668] netconsole: remote port 6666
[   98.791669] netconsole: remote IP 192.168.1.138
[   98.791671] netconsole: remote ethernet address ff:ff:ff:ff:ff:ff
[   98.791673] netconsole: local IP 192.168.1.8
[   98.791690] BUG: unable to handle kernel NULL pointer dereference at           (null)
[   98.791693] IP: [<ffffffff81131977>] d_delete+0x47/0x180
[   98.791698] PGD 221b45067 PUD 221bce067 PMD 0 
[   98.791701] Oops: 0000 [#1] SMP 
[   98.791702] last sysfs file: /sys/devices/pci0000:00/0000:00:1f.2/host1/target1:0:0/1:0:0:0/block/sda/sda1/stat
[   98.791705] CPU 0 
[   98.791706] Modules linked in: netconsole(+) i915 drm_kms_helper drm binfmt_misc i2c_algo_bit ppdev bridge stp bnep video lp parport snd_hda_codec_hdmi snd_hda_codec_realtek snd_hda_intel snd_hda_codec snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi snd_rawmidi snd_seq_midi_event snd_seq snd_timer snd_seq_device snd soundcore usbhid pcspkr snd_page_alloc intel_agp intel_gtt iTCO_wdt iTCO_vendor_support ehci_hcd uhci_hcd usbcore floppy
[   98.791726] 
[   98.791728] Pid: 3337, comm: modprobe Not tainted 2.6.38-rc3+ #179 P5Q-EM/System Product Name
[   98.791730] RIP: 0010:[<ffffffff81131977>]  [<ffffffff81131977>] d_delete+0x47/0x180
[   98.791732] RSP: 0018:ffff880221b8fe68  EFLAGS: 00010246
[   98.791734] RAX: 0000000000000202 RBX: ffff8802255e2180 RCX: ffffffff81ababe0
[   98.791735] RDX: 0000000000000000 RSI: ffff8802255e21b8 RDI: ffff8802255e21dc
[   98.791737] RBP: ffff880221b8fe88 R08: ffff8800cda14420 R09: 0000000000000000
[   98.791738] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000
[   98.791740] R13: ffff8802255e21dc R14: 0000000000000000 R15: ffff880221b8fee8
[   98.791742] FS:  00007fa4795896f0(0000) GS:ffff8800cda00000(0000) knlGS:0000000000000000
[   98.791744] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[   98.791745] CR2: 0000000000000000 CR3: 0000000221b98000 CR4: 00000000000406b0
[   98.791747] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   98.791749] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[   98.791750] Process modprobe (pid: 3337, threadinfo ffff880221b8e000, task ffff88021cc243b0)
[   98.791752] Stack:
[   98.791752]  ffff8802255e2180 00000000ffffffef ffffffffa01b80a0 ffff880221bd9c68
[   98.791755]  ffff880221b8fec8 ffffffff8118bf33 0000000a685497ea ffffffffa01b80a8
[   98.791758]  ffff880221b8fec8 ffff880221bd9c00 0000000000000000 0000000000000000
[   98.791760] Call Trace:
[   98.791765]  [<ffffffff8118bf33>] configfs_register_subsystem+0x103/0x1c0
[   98.791768]  [<ffffffffa0006260>] init_netconsole+0x260/0x1000 [netconsole]
[   98.791771]  [<ffffffffa0006000>] ? init_netconsole+0x0/0x1000 [netconsole]
[   98.791775]  [<ffffffff810001de>] do_one_initcall+0x3e/0x170
[   98.791778]  [<ffffffff81086e26>] sys_init_module+0x106/0x280
[   98.791780]  [<ffffffff81002dbb>] system_call_fastpath+0x16/0x1b
[   98.791781] Code: 84 83 00 00 00 49 8d 7c 24 20 e8 45 c4 3f 00 85 c0 0f 1f 00 75 6e 41 fe 45 00 f3 90 4c 89 ef 45 31 f6 e8 5d c4 3f 00 4c 8b 63 30 <41> 0f b7 04 24 25 00 f0 00 00 3d 00 40 00 00 41 0f 94 c6 83 7b 
[   98.791799] RIP  [<ffffffff81131977>] d_delete+0x47/0x180
[   98.791801]  RSP <ffff880221b8fe68>
[   98.791802] CR2: 0000000000000000
[   98.791804] ---[ end trace 0bbba7195aad7eaa ]---

[-- Attachment #3: netconsole-on-network.sh --]
[-- Type: application/x-sh, Size: 146 bytes --]

[-- Attachment #4: netconsole-ending-on-network.sh --]
[-- Type: application/x-sh, Size: 88 bytes --]

^ permalink raw reply

* [PATCH] bonding: bond_select_queue off by one
From: Phil Oester @ 2011-02-18  2:07 UTC (permalink / raw)
  To: netdev

[-- Attachment #1: Type: text/plain, Size: 506 bytes --]

The bonding driver's bond_select_queue function simply returns
skb->queue_mapping.  However queue_mapping could be == 16
for queue #16.  This causes the following message to be flooded
to syslog:

kernel: bondx selects TX queue 16, but real number of TX queues is 16

ndo_select_queue wants a zero-based number, so bonding driver needs
to subtract one to return the proper queue number.  Also fix grammar in
a comment while in the vicinity.

Phil Oester

Signed-off-by: Phil Oester <kernel@linuxace.com>



[-- Attachment #2: patch-bond-txq --]
[-- Type: text/plain, Size: 691 bytes --]

--- linux-2.6/drivers/net/bonding/bond_main.c.orig	2011-01-30 09:15:09.813843817 -0800
+++ linux-2.6/drivers/net/bonding/bond_main.c	2011-02-17 18:02:46.919050909 -0800
@@ -4537,11 +4537,11 @@
 {
 	/*
 	 * This helper function exists to help dev_pick_tx get the correct
-	 * destination queue.  Using a helper function skips the a call to
+	 * destination queue.  Using a helper function skips a call to
 	 * skb_tx_hash and will put the skbs in the queue we expect on their
 	 * way down to the bonding driver.
 	 */
-	return skb->queue_mapping;
+	return skb->queue_mapping ? skb->queue_mapping - 1 : 0;
 }
 
 static netdev_tx_t bond_start_xmit(struct sk_buff *skb, struct net_device *dev)

^ permalink raw reply

* Re: Mass udp flow reboot linux with RealTek RTL-8169 Gigabit
From: Seblu @ 2011-02-18  2:54 UTC (permalink / raw)
  To: Francois Romieu; +Cc: Eric Dumazet, lkml, netdev, Ivan Vecera
In-Reply-To: <20110213203417.GA11442@electric-eye.fr.zoreil.com>

[-- Attachment #1: Type: text/plain, Size: 571 bytes --]

On Sun, Feb 13, 2011 at 9:34 PM, Francois Romieu <romieu@fr.zoreil.com> wrote:
> Seblu <seblu@seblu.net> :
> [...]
>> > NIC seems to be reset frequently but host stop rebooting. \o//
>> ok after about 1 hour of iperf, host reboot.
>
> Can you apply the patch below on top of 2.6.38-rc4 ?
>

I've applied your patch on 2.6.38-rc5. The host rebooted 2 minutes after the UDP flow started.
After this reboot, the host is still up after 2 hours under a 1Gbit/s UDP flow.

I attached the dmesg output captured before the reboot. Do you need anything else?

-- 
Sébastien Luttringer
www.seblu.net

[-- Attachment #2: dmesg.2.6.38-rc5-seblu.xz --]
[-- Type: application/x-xz, Size: 12392 bytes --]

^ permalink raw reply

* Re: Netconsole crash on 2.6.38-rc3
From: Greg KH @ 2011-02-18  3:02 UTC (permalink / raw)
  To: Sarah Sharp; +Cc: David S. Miller, netdev
In-Reply-To: <20110218012847.GA8980@xanatos>

On Thu, Feb 17, 2011 at 05:28:47PM -0800, Sarah Sharp wrote:
> I'm trying to debug an xHCI driver crash on 2.6.38-rc3, and netconsole
> is crashing when I try to load it.  I will try to update to 2.6.38-rc5,
> but I'm sort of stuck on rc3 since Greg KH's USB tree is based on that.

No it isn't, it's synced up with 2.6.38-rc5 at the moment, which was
required to handle some merge conflicts.  You might want to update your
version :)

thanks,

greg k-h

^ permalink raw reply

* [PATCH 08/29] timberdale: mfd_cell is now implicitly available to drivers (v2)
From: Andres Salomon @ 2011-02-18  3:07 UTC (permalink / raw)
  To: Samuel Ortiz
  Cc: David Brownell, richard.rojfors-gfIc91nka+FZroRs9YW3xA,
	David S. Miller, netdev-u79uwXL29TY76Z2rM5mHXA, Mark Brown,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Mauro Carvalho Chehab,
	linux-i2c-u79uwXL29TY76Z2rM5mHXA, Ben Dooks (embedded platforms),
	spi-devel-general-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	Jean Delvare (PC drivers, core), Peter Korsgaard, Dan Williams,
	Andrew Morton, linux-media-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1297998456-7615-1-git-send-email-dilinger-pFFUokh25LWsTnJN9+BGXg@public.gmane.org>

The cell's platform_data is now accessed with a helper function;
change clients to use that, and remove the now-unused data_size.

Note that the mfd's platform_data is marked __devinitdata.  This
is still correct in all cases except for the timbgpio driver, whose
remove hook has been changed to no longer reference the pdata.

v2: add some missing mfd/core.h includes.

Signed-off-by: Andres Salomon <dilinger-pFFUokh25LWsTnJN9+BGXg@public.gmane.org>
---
 drivers/dma/timb_dma.c           |    3 ++-
 drivers/gpio/timbgpio.c          |    6 +++---
 drivers/i2c/busses/i2c-ocores.c  |    3 ++-
 drivers/i2c/busses/i2c-xiic.c    |    3 ++-
 drivers/media/radio/radio-timb.c |    3 ++-
 drivers/media/video/timblogiw.c  |    3 ++-
 drivers/mfd/timberdale.c         |   27 ---------------------------
 drivers/net/ks8842.c             |    3 ++-
 drivers/spi/xilinx_spi.c         |    3 ++-
 9 files changed, 17 insertions(+), 37 deletions(-)

diff --git a/drivers/dma/timb_dma.c b/drivers/dma/timb_dma.c
index 3b88a4e..ea8705b 100644
--- a/drivers/dma/timb_dma.c
+++ b/drivers/dma/timb_dma.c
@@ -27,6 +27,7 @@
 #include <linux/io.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
+#include <linux/mfd/core.h>
 #include <linux/slab.h>
 
 #include <linux/timb_dma.h>
@@ -684,7 +685,7 @@ static irqreturn_t td_irq(int irq, void *devid)
 
 static int __devinit td_probe(struct platform_device *pdev)
 {
-	struct timb_dma_platform_data *pdata = pdev->dev.platform_data;
+	struct timb_dma_platform_data *pdata = mfd_get_data(pdev);
 	struct timb_dma *td;
 	struct resource *iomem;
 	int irq;
diff --git a/drivers/gpio/timbgpio.c b/drivers/gpio/timbgpio.c
index 58c8f30..ffcd815 100644
--- a/drivers/gpio/timbgpio.c
+++ b/drivers/gpio/timbgpio.c
@@ -23,6 +23,7 @@
 #include <linux/module.h>
 #include <linux/gpio.h>
 #include <linux/platform_device.h>
+#include <linux/mfd/core.h>
 #include <linux/irq.h>
 #include <linux/io.h>
 #include <linux/timb_gpio.h>
@@ -228,7 +229,7 @@ static int __devinit timbgpio_probe(struct platform_device *pdev)
 	struct gpio_chip *gc;
 	struct timbgpio *tgpio;
 	struct resource *iomem;
-	struct timbgpio_platform_data *pdata = pdev->dev.platform_data;
+	struct timbgpio_platform_data *pdata = mfd_get_data(pdev);
 	int irq = platform_get_irq(pdev, 0);
 
 	if (!pdata || pdata->nr_pins > 32) {
@@ -319,14 +320,13 @@ err_mem:
 static int __devexit timbgpio_remove(struct platform_device *pdev)
 {
 	int err;
-	struct timbgpio_platform_data *pdata = pdev->dev.platform_data;
 	struct timbgpio *tgpio = platform_get_drvdata(pdev);
 	struct resource *iomem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	int irq = platform_get_irq(pdev, 0);
 
 	if (irq >= 0 && tgpio->irq_base > 0) {
 		int i;
-		for (i = 0; i < pdata->nr_pins; i++) {
+		for (i = 0; i < tgpio->gpio.ngpio; i++) {
 			set_irq_chip(tgpio->irq_base + i, NULL);
 			set_irq_chip_data(tgpio->irq_base + i, NULL);
 		}
diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c
index ef3bcb1..a3f4799 100644
--- a/drivers/i2c/busses/i2c-ocores.c
+++ b/drivers/i2c/busses/i2c-ocores.c
@@ -49,6 +49,7 @@
 #include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/platform_device.h>
+#include <linux/mfd/core.h>
 #include <linux/i2c.h>
 #include <linux/interrupt.h>
 #include <linux/wait.h>
@@ -305,7 +306,7 @@ static int __devinit ocores_i2c_probe(struct platform_device *pdev)
 		return -EIO;
 	}
 
-	pdata = pdev->dev.platform_data;
+	pdata = mfd_get_data(pdev);
 	if (pdata) {
 		i2c->regstep = pdata->regstep;
 		i2c->clock_khz = pdata->clock_khz;
diff --git a/drivers/i2c/busses/i2c-xiic.c b/drivers/i2c/busses/i2c-xiic.c
index a9c419e..9fbd7e6 100644
--- a/drivers/i2c/busses/i2c-xiic.c
+++ b/drivers/i2c/busses/i2c-xiic.c
@@ -34,6 +34,7 @@
 #include <linux/errno.h>
 #include <linux/delay.h>
 #include <linux/platform_device.h>
+#include <linux/mfd/core.h>
 #include <linux/i2c.h>
 #include <linux/interrupt.h>
 #include <linux/wait.h>
@@ -704,7 +705,7 @@ static int __devinit xiic_i2c_probe(struct platform_device *pdev)
 	if (irq < 0)
 		goto resource_missing;
 
-	pdata = (struct xiic_i2c_platform_data *) pdev->dev.platform_data;
+	pdata = mfd_get_data(pdev);
 	if (!pdata)
 		return -EINVAL;
 
diff --git a/drivers/media/radio/radio-timb.c b/drivers/media/radio/radio-timb.c
index a185610..1e3a8dd 100644
--- a/drivers/media/radio/radio-timb.c
+++ b/drivers/media/radio/radio-timb.c
@@ -21,6 +21,7 @@
 #include <media/v4l2-ioctl.h>
 #include <media/v4l2-device.h>
 #include <linux/platform_device.h>
+#include <linux/mfd/core.h>
 #include <linux/interrupt.h>
 #include <linux/slab.h>
 #include <linux/i2c.h>
@@ -148,7 +149,7 @@ static const struct v4l2_file_operations timbradio_fops = {
 
 static int __devinit timbradio_probe(struct platform_device *pdev)
 {
-	struct timb_radio_platform_data *pdata = pdev->dev.platform_data;
+	struct timb_radio_platform_data *pdata = mfd_get_data(pdev);
 	struct timbradio *tr;
 	int err;
 
diff --git a/drivers/media/video/timblogiw.c b/drivers/media/video/timblogiw.c
index fc611eb..84d4c7c 100644
--- a/drivers/media/video/timblogiw.c
+++ b/drivers/media/video/timblogiw.c
@@ -24,6 +24,7 @@
 #include <linux/platform_device.h>
 #include <linux/slab.h>
 #include <linux/dmaengine.h>
+#include <linux/mfd/core.h>
 #include <linux/scatterlist.h>
 #include <linux/interrupt.h>
 #include <linux/list.h>
@@ -790,7 +791,7 @@ static int __devinit timblogiw_probe(struct platform_device *pdev)
 {
 	int err;
 	struct timblogiw *lw = NULL;
-	struct timb_video_platform_data *pdata = pdev->dev.platform_data;
+	struct timb_video_platform_data *pdata = mfd_get_data(pdev);
 
 	if (!pdata) {
 		dev_err(&pdev->dev, "No platform data\n");
diff --git a/drivers/mfd/timberdale.c b/drivers/mfd/timberdale.c
index 6ad8a7f..6353921 100644
--- a/drivers/mfd/timberdale.c
+++ b/drivers/mfd/timberdale.c
@@ -385,7 +385,6 @@ static __devinitdata struct mfd_cell timberdale_cells_bar0_cfg0[] = {
 		.num_resources = ARRAY_SIZE(timberdale_dma_resources),
 		.resources = timberdale_dma_resources,
 		.platform_data = &timb_dma_platform_data,
-		.data_size = sizeof(timb_dma_platform_data),
 	},
 	{
 		.name = "timb-uart",
@@ -397,42 +396,36 @@ static __devinitdata struct mfd_cell timberdale_cells_bar0_cfg0[] = {
 		.num_resources = ARRAY_SIZE(timberdale_xiic_resources),
 		.resources = timberdale_xiic_resources,
 		.platform_data = &timberdale_xiic_platform_data,
-		.data_size = sizeof(timberdale_xiic_platform_data),
 	},
 	{
 		.name = "timb-gpio",
 		.num_resources = ARRAY_SIZE(timberdale_gpio_resources),
 		.resources = timberdale_gpio_resources,
 		.platform_data = &timberdale_gpio_platform_data,
-		.data_size = sizeof(timberdale_gpio_platform_data),
 	},
 	{
 		.name = "timb-video",
 		.num_resources = ARRAY_SIZE(timberdale_video_resources),
 		.resources = timberdale_video_resources,
 		.platform_data = &timberdale_video_platform_data,
-		.data_size = sizeof(timberdale_video_platform_data),
 	},
 	{
 		.name = "timb-radio",
 		.num_resources = ARRAY_SIZE(timberdale_radio_resources),
 		.resources = timberdale_radio_resources,
 		.platform_data = &timberdale_radio_platform_data,
-		.data_size = sizeof(timberdale_radio_platform_data),
 	},
 	{
 		.name = "xilinx_spi",
 		.num_resources = ARRAY_SIZE(timberdale_spi_resources),
 		.resources = timberdale_spi_resources,
 		.platform_data = &timberdale_xspi_platform_data,
-		.data_size = sizeof(timberdale_xspi_platform_data),
 	},
 	{
 		.name = "ks8842",
 		.num_resources = ARRAY_SIZE(timberdale_eth_resources),
 		.resources = timberdale_eth_resources,
 		.platform_data = &timberdale_ks8842_platform_data,
-		.data_size = sizeof(timberdale_ks8842_platform_data)
 	},
 };
 
@@ -442,7 +435,6 @@ static __devinitdata struct mfd_cell timberdale_cells_bar0_cfg1[] = {
 		.num_resources = ARRAY_SIZE(timberdale_dma_resources),
 		.resources = timberdale_dma_resources,
 		.platform_data = &timb_dma_platform_data,
-		.data_size = sizeof(timb_dma_platform_data),
 	},
 	{
 		.name = "timb-uart",
@@ -459,14 +451,12 @@ static __devinitdata struct mfd_cell timberdale_cells_bar0_cfg1[] = {
 		.num_resources = ARRAY_SIZE(timberdale_xiic_resources),
 		.resources = timberdale_xiic_resources,
 		.platform_data = &timberdale_xiic_platform_data,
-		.data_size = sizeof(timberdale_xiic_platform_data),
 	},
 	{
 		.name = "timb-gpio",
 		.num_resources = ARRAY_SIZE(timberdale_gpio_resources),
 		.resources = timberdale_gpio_resources,
 		.platform_data = &timberdale_gpio_platform_data,
-		.data_size = sizeof(timberdale_gpio_platform_data),
 	},
 	{
 		.name = "timb-mlogicore",
@@ -478,28 +468,24 @@ static __devinitdata struct mfd_cell timberdale_cells_bar0_cfg1[] = {
 		.num_resources = ARRAY_SIZE(timberdale_video_resources),
 		.resources = timberdale_video_resources,
 		.platform_data = &timberdale_video_platform_data,
-		.data_size = sizeof(timberdale_video_platform_data),
 	},
 	{
 		.name = "timb-radio",
 		.num_resources = ARRAY_SIZE(timberdale_radio_resources),
 		.resources = timberdale_radio_resources,
 		.platform_data = &timberdale_radio_platform_data,
-		.data_size = sizeof(timberdale_radio_platform_data),
 	},
 	{
 		.name = "xilinx_spi",
 		.num_resources = ARRAY_SIZE(timberdale_spi_resources),
 		.resources = timberdale_spi_resources,
 		.platform_data = &timberdale_xspi_platform_data,
-		.data_size = sizeof(timberdale_xspi_platform_data),
 	},
 	{
 		.name = "ks8842",
 		.num_resources = ARRAY_SIZE(timberdale_eth_resources),
 		.resources = timberdale_eth_resources,
 		.platform_data = &timberdale_ks8842_platform_data,
-		.data_size = sizeof(timberdale_ks8842_platform_data)
 	},
 };
 
@@ -509,7 +495,6 @@ static __devinitdata struct mfd_cell timberdale_cells_bar0_cfg2[] = {
 		.num_resources = ARRAY_SIZE(timberdale_dma_resources),
 		.resources = timberdale_dma_resources,
 		.platform_data = &timb_dma_platform_data,
-		.data_size = sizeof(timb_dma_platform_data),
 	},
 	{
 		.name = "timb-uart",
@@ -521,35 +506,30 @@ static __devinitdata struct mfd_cell timberdale_cells_bar0_cfg2[] = {
 		.num_resources = ARRAY_SIZE(timberdale_xiic_resources),
 		.resources = timberdale_xiic_resources,
 		.platform_data = &timberdale_xiic_platform_data,
-		.data_size = sizeof(timberdale_xiic_platform_data),
 	},
 	{
 		.name = "timb-gpio",
 		.num_resources = ARRAY_SIZE(timberdale_gpio_resources),
 		.resources = timberdale_gpio_resources,
 		.platform_data = &timberdale_gpio_platform_data,
-		.data_size = sizeof(timberdale_gpio_platform_data),
 	},
 	{
 		.name = "timb-video",
 		.num_resources = ARRAY_SIZE(timberdale_video_resources),
 		.resources = timberdale_video_resources,
 		.platform_data = &timberdale_video_platform_data,
-		.data_size = sizeof(timberdale_video_platform_data),
 	},
 	{
 		.name = "timb-radio",
 		.num_resources = ARRAY_SIZE(timberdale_radio_resources),
 		.resources = timberdale_radio_resources,
 		.platform_data = &timberdale_radio_platform_data,
-		.data_size = sizeof(timberdale_radio_platform_data),
 	},
 	{
 		.name = "xilinx_spi",
 		.num_resources = ARRAY_SIZE(timberdale_spi_resources),
 		.resources = timberdale_spi_resources,
 		.platform_data = &timberdale_xspi_platform_data,
-		.data_size = sizeof(timberdale_xspi_platform_data),
 	},
 };
 
@@ -559,7 +539,6 @@ static __devinitdata struct mfd_cell timberdale_cells_bar0_cfg3[] = {
 		.num_resources = ARRAY_SIZE(timberdale_dma_resources),
 		.resources = timberdale_dma_resources,
 		.platform_data = &timb_dma_platform_data,
-		.data_size = sizeof(timb_dma_platform_data),
 	},
 	{
 		.name = "timb-uart",
@@ -571,42 +550,36 @@ static __devinitdata struct mfd_cell timberdale_cells_bar0_cfg3[] = {
 		.num_resources = ARRAY_SIZE(timberdale_ocores_resources),
 		.resources = timberdale_ocores_resources,
 		.platform_data = &timberdale_ocores_platform_data,
-		.data_size = sizeof(timberdale_ocores_platform_data),
 	},
 	{
 		.name = "timb-gpio",
 		.num_resources = ARRAY_SIZE(timberdale_gpio_resources),
 		.resources = timberdale_gpio_resources,
 		.platform_data = &timberdale_gpio_platform_data,
-		.data_size = sizeof(timberdale_gpio_platform_data),
 	},
 	{
 		.name = "timb-video",
 		.num_resources = ARRAY_SIZE(timberdale_video_resources),
 		.resources = timberdale_video_resources,
 		.platform_data = &timberdale_video_platform_data,
-		.data_size = sizeof(timberdale_video_platform_data),
 	},
 	{
 		.name = "timb-radio",
 		.num_resources = ARRAY_SIZE(timberdale_radio_resources),
 		.resources = timberdale_radio_resources,
 		.platform_data = &timberdale_radio_platform_data,
-		.data_size = sizeof(timberdale_radio_platform_data),
 	},
 	{
 		.name = "xilinx_spi",
 		.num_resources = ARRAY_SIZE(timberdale_spi_resources),
 		.resources = timberdale_spi_resources,
 		.platform_data = &timberdale_xspi_platform_data,
-		.data_size = sizeof(timberdale_xspi_platform_data),
 	},
 	{
 		.name = "ks8842",
 		.num_resources = ARRAY_SIZE(timberdale_eth_resources),
 		.resources = timberdale_eth_resources,
 		.platform_data = &timberdale_ks8842_platform_data,
-		.data_size = sizeof(timberdale_ks8842_platform_data)
 	},
 };
 
diff --git a/drivers/net/ks8842.c b/drivers/net/ks8842.c
index 928b2b8..efd44af 100644
--- a/drivers/net/ks8842.c
+++ b/drivers/net/ks8842.c
@@ -26,6 +26,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
+#include <linux/mfd/core.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/ethtool.h>
@@ -1145,7 +1146,7 @@ static int __devinit ks8842_probe(struct platform_device *pdev)
 	struct resource *iomem;
 	struct net_device *netdev;
 	struct ks8842_adapter *adapter;
-	struct ks8842_platform_data *pdata = pdev->dev.platform_data;
+	struct ks8842_platform_data *pdata = mfd_get_data(pdev);
 	u16 id;
 	unsigned i;
 
diff --git a/drivers/spi/xilinx_spi.c b/drivers/spi/xilinx_spi.c
index 7adaef6..c9bf074 100644
--- a/drivers/spi/xilinx_spi.c
+++ b/drivers/spi/xilinx_spi.c
@@ -18,6 +18,7 @@
 #include <linux/interrupt.h>
 #include <linux/of.h>
 #include <linux/platform_device.h>
+#include <linux/mfd/core.h>
 #include <linux/spi/spi.h>
 #include <linux/spi/spi_bitbang.h>
 #include <linux/spi/xilinx_spi.h>
@@ -474,7 +475,7 @@ static int __devinit xilinx_spi_probe(struct platform_device *dev)
 	struct spi_master *master;
 	u8 i;
 
-	pdata = dev->dev.platform_data;
+	pdata = mfd_get_data(dev);
 	if (pdata) {
 		num_cs = pdata->num_chipselect;
 		little_endian = pdata->little_endian;
-- 
1.7.2.3


------------------------------------------------------------------------------
The ultimate all-in-one performance toolkit: Intel(R) Parallel Studio XE:
Pinpoint memory and threading errors before they happen.
Find and fix more than 250 security defects in the development cycle.
Locate bottlenecks in serial and parallel code that limit performance.
http://p.sf.net/sfu/intel-dev2devfeb

^ permalink raw reply related

* [PATCH 15/29] janz: mfd_cell is now implicitly available to drivers (v2)
From: Andres Salomon @ 2011-02-18  3:07 UTC (permalink / raw)
  To: Samuel Ortiz
  Cc: socketcan-core-0fE9KPoRgkgATYTw5x5z8w,
	netdev-u79uwXL29TY76Z2rM5mHXA, Mark Brown,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Wolfgang Grandegger
In-Reply-To: <1297998456-7615-1-git-send-email-dilinger-pFFUokh25LWsTnJN9+BGXg@public.gmane.org>

The cell's platform_data is now accessed with a helper function;
change clients to use that, and remove the now-unused data_size.

v2: add some missing mfd/core.h includes.

Signed-off-by: Andres Salomon <dilinger-pFFUokh25LWsTnJN9+BGXg@public.gmane.org>
---
 drivers/gpio/janz-ttl.c      |    3 ++-
 drivers/mfd/janz-cmodio.c    |    1 -
 drivers/net/can/janz-ican3.c |    3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/gpio/janz-ttl.c b/drivers/gpio/janz-ttl.c
index 813ac07..2514fb0 100644
--- a/drivers/gpio/janz-ttl.c
+++ b/drivers/gpio/janz-ttl.c
@@ -15,6 +15,7 @@
 #include <linux/interrupt.h>
 #include <linux/delay.h>
 #include <linux/platform_device.h>
+#include <linux/mfd/core.h>
 #include <linux/io.h>
 #include <linux/gpio.h>
 #include <linux/slab.h>
@@ -149,7 +150,7 @@ static int __devinit ttl_probe(struct platform_device *pdev)
 	struct resource *res;
 	int ret;
 
-	pdata = pdev->dev.platform_data;
+	pdata = mfd_get_data(pdev);
 	if (!pdata) {
 		dev_err(dev, "no platform data\n");
 		ret = -ENXIO;
diff --git a/drivers/mfd/janz-cmodio.c b/drivers/mfd/janz-cmodio.c
index 36a166b..58de1e2 100644
--- a/drivers/mfd/janz-cmodio.c
+++ b/drivers/mfd/janz-cmodio.c
@@ -87,7 +87,6 @@ static int __devinit cmodio_setup_subdevice(struct cmodio_device *priv,
 	/* Add platform data */
 	pdata->modno = modno;
 	cell->platform_data = pdata;
-	cell->data_size = sizeof(*pdata);
 
 	/* MODULbus registers -- PCI BAR3 is big-endian MODULbus access */
 	res->flags = IORESOURCE_MEM;
diff --git a/drivers/net/can/janz-ican3.c b/drivers/net/can/janz-ican3.c
index 366f5cc..102b16c 100644
--- a/drivers/net/can/janz-ican3.c
+++ b/drivers/net/can/janz-ican3.c
@@ -15,6 +15,7 @@
 #include <linux/interrupt.h>
 #include <linux/delay.h>
 #include <linux/platform_device.h>
+#include <linux/mfd/core.h>
 
 #include <linux/netdevice.h>
 #include <linux/can.h>
@@ -1643,7 +1644,7 @@ static int __devinit ican3_probe(struct platform_device *pdev)
 	struct device *dev;
 	int ret;
 
-	pdata = pdev->dev.platform_data;
+	pdata = mfd_get_data(pdev);
 	if (!pdata)
 		return -ENXIO;
 
-- 
1.7.2.3

^ permalink raw reply related

* Re: IGMP and rwlock: Dead ocurred again on TILEPro
From: Cypher Wu @ 2011-02-18  3:16 UTC (permalink / raw)
  To: Chris Metcalf
  Cc: David Miller, xiyou.wangcong, linux-kernel, eric.dumazet, netdev
In-Reply-To: <4D5DACC5.60105@tilera.com>

On Fri, Feb 18, 2011 at 7:18 AM, Chris Metcalf <cmetcalf@tilera.com> wrote:
> On 2/17/2011 6:11 PM, David Miller wrote:
>> From: Chris Metcalf <cmetcalf@tilera.com>
>> Date: Thu, 17 Feb 2011 18:04:13 -0500
>>
>>> On 2/17/2011 5:53 PM, David Miller wrote:
>>>> From: Chris Metcalf <cmetcalf@tilera.com>
>>>> Date: Thu, 17 Feb 2011 17:49:46 -0500
>>>>
>>>>> The fix is to disable interrupts for the arch_read_lock family of methods.
>>>> How does that help handle the race when it happens between different
>>>> cpus, instead of between IRQ and non-IRQ context on the same CPU?
>>> There's no race in that case, since the lock code properly backs off and
>>> retries until the other cpu frees it.  The distinction here is that the
>>> non-IRQ context is "wedged" by the IRQ context.
>>>
>>>> Why don't you just use the generic spinlock based rwlock code on Tile,
>>>> since that is all that your atomic instructions can handle
>>>> sufficiently?
>>> The tile-specific code encodes reader/writer information in the same 32-bit
>>> word that the test-and-set instruction manipulates, so it's more efficient
>>> both in space and time.  This may not really matter for rwlocks, since no
>>> one cares much about them any more, but that was the motivation.
>> Ok, but IRQ disabling is going to be very expensive.
>
> The interrupt architecture on Tile allows a write to a special-purpose
> register to put you into a "critical section" where no interrupts or faults
> are delivered.  So we just need to bracket the read_lock operations with
> two SPR writes; each takes six machine cycles, so we're only adding 12
> cycles to the total cost of taking or releasing a read lock on an rwlock.
>
> --
> Chris Metcalf, Tilera Corp.
> http://www.tilera.com
>
>

I agree that just disabling interrupts for read operations should be
enough, but read_unlock() is also a place where we should disable
interrupts, right? If an interrupt occurs while it holds lock-val after
the TNS, a deadlock can still occur.

When will you release that patch? Time is tight, so I may have to
fix it myself. Though the problem is clear now, I still
have two questions to confirm:

1. If we use SPR_INTERRUPT_CRITICAL_SECTION it will disable all the
interrupts which claim 'CM', is that right? Do we have to save
its original value and restore it later?
There is some code in Linux:
int __tns_atomic_acquire(atomic_t *lock)
{
	int ret;
	u32 iterations = 0;

	BUG_ON(__insn_mfspr(SPR_INTERRUPT_CRITICAL_SECTION));
	__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1);

	while((ret = __insn_tns(&lock->counter)) == 1)
		delay_backoff(iterations++);
	return ret;
}
It just uses BUG_ON to check that SPR_INTERRUPT_CRITICAL_SECTION is
0; does that mean the SPR is only used in special situations like
that?

2. Should we disable interrupts for the whole operation of
read_lock()/read_unlock(), or should we leave the interrupt critical
section if it runs into __raw_read_lock_slow() and has to
delay_backoff() for some time, and re-enter the interrupt critical
section again before the TNS?




-- 
Cyberman Wu

^ permalink raw reply

* Re: IGMP and rwlock: Dead ocurred again on TILEPro
From: Cypher Wu @ 2011-02-18  3:19 UTC (permalink / raw)
  To: Chris Metcalf
  Cc: David Miller, xiyou.wangcong, linux-kernel, eric.dumazet, netdev
In-Reply-To: <4D5DACC5.60105@tilera.com>

On Fri, Feb 18, 2011 at 7:18 AM, Chris Metcalf <cmetcalf@tilera.com> wrote:
> On 2/17/2011 6:11 PM, David Miller wrote:
>> From: Chris Metcalf <cmetcalf@tilera.com>
>> Date: Thu, 17 Feb 2011 18:04:13 -0500
>>
>>> On 2/17/2011 5:53 PM, David Miller wrote:
>>>> From: Chris Metcalf <cmetcalf@tilera.com>
>>>> Date: Thu, 17 Feb 2011 17:49:46 -0500
>>>>
>>>>> The fix is to disable interrupts for the arch_read_lock family of methods.
>>>> How does that help handle the race when it happens between different
>>>> cpus, instead of between IRQ and non-IRQ context on the same CPU?
>>> There's no race in that case, since the lock code properly backs off and
>>> retries until the other cpu frees it.  The distinction here is that the
>>> non-IRQ context is "wedged" by the IRQ context.
>>>
>>>> Why don't you just use the generic spinlock based rwlock code on Tile,
>>>> since that is all that your atomic instructions can handle
>>>> sufficiently?
>>> The tile-specific code encodes reader/writer information in the same 32-bit
>>> word that the test-and-set instruction manipulates, so it's more efficient
>>> both in space and time.  This may not really matter for rwlocks, since no
>>> one cares much about them any more, but that was the motivation.
>> Ok, but IRQ disabling is going to be very expensive.
>
> The interrupt architecture on Tile allows a write to a special-purpose
> register to put you into a "critical section" where no interrupts or faults
> are delivered.  So we just need to bracket the read_lock operations with
> two SPR writes; each takes six machine cycles, so we're only adding 12
> cycles to the total cost of taking or releasing a read lock on an rwlock.
>
> --
> Chris Metcalf, Tilera Corp.
> http://www.tilera.com
>
>


By the way, other RISC platforms, say ARM and MIPS, use store
conditional rather than a TNS of a temp value for lock-val; does Fx have
similar instructions?


-- 
Cyberman Wu

^ permalink raw reply

* Re: KIND
From: Mr.David Gurupatham @ 2011-02-18  1:20 UTC (permalink / raw)
  To: netdev@vger.kernel.org

My name is David Gurupatham a legal practitioner with David Gurupatham  & Associates

in Kuala Lumpur Msia.

I found your contact/profile some where over the Internet and it gave me the

greatest joy, that you are the one I have been looking for. Whom I strongly believe could

execute this project with me. Kindly get back to me for more information

Best regards,

Mr David Gurupatham{Esq}

^ permalink raw reply

* Re: [PATCH] bonding: bond_select_queue off by one
From: Ben Hutchings @ 2011-02-18  3:46 UTC (permalink / raw)
  To: Phil Oester; +Cc: netdev
In-Reply-To: <20110218020713.GA9696@linuxace.com>

On Thu, 2011-02-17 at 18:07 -0800, Phil Oester wrote:
> The bonding driver's bond_select_queue function simply returns
> skb->queue_mapping.  However queue_mapping could be == 16
> for queue #16.  This causes the following message to be flooded
> to syslog:
> 
> kernel: bondx selects TX queue 16, but real number of TX queues is 16
> 
> ndo_select_queue wants a zero-based number, so bonding driver needs
> to subtract one to return the proper queue number.  Also fix grammar in
> a comment while in the vicinity.
> 
> Phil Oester
> 
> Signed-off-by: Phil Oester <kernel@linuxace.com>

> --- linux-2.6/drivers/net/bonding/bond_main.c.orig      2011-01-30 09:15:09.813843817 -0800
> +++ linux-2.6/drivers/net/bonding/bond_main.c   2011-02-17 18:02:46.919050909 -0800
> @@ -4537,11 +4537,11 @@
>  {
>         /*
>          * This helper function exists to help dev_pick_tx get the correct
> -        * destination queue.  Using a helper function skips the a call to
> +        * destination queue.  Using a helper function skips a call to
>          * skb_tx_hash and will put the skbs in the queue we expect on their
>          * way down to the bonding driver.
>          */
> -       return skb->queue_mapping;
> +       return skb->queue_mapping ? skb->queue_mapping - 1 : 0;
>  }
>  
>  static netdev_tx_t bond_start_xmit(struct sk_buff *skb, struct net_device *dev) 

This looks basically correct, but it should use the proper functions:

	skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0;

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: [PATCH] bonding: bond_select_queue off by one
From: Jay Vosburgh @ 2011-02-18  4:41 UTC (permalink / raw)
  To: Andy Gospodarek, Phil Oester; +Cc: netdev
In-Reply-To: <20110218020713.GA9696@linuxace.com>

Phil Oester <kernel@linuxace.com> wrote:

>The bonding driver's bond_select_queue function simply returns
>skb->queue_mapping.  However queue_mapping could be == 16
>for queue #16.  This causes the following message to be flooded
>to syslog:
>
>kernel: bondx selects TX queue 16, but real number of TX queues is 16
>
>ndo_select_queue wants a zero-based number, so bonding driver needs
>to subtract one to return the proper queue number.  Also fix grammar in
>a comment while in the vicinity.

	Andy, can you comment on this?

	If memory serves, the omission of queue ID zero was on purpose;
is this patch going to break any of the functionality added by:

commit bb1d912323d5dd50e1079e389f4e964be14f0ae3
Author: Andy Gospodarek <andy@greyhouse.net>
Date:   Wed Jun 2 08:40:18 2010 +0000

    bonding: allow user-controlled output slave selection


Ben Hutchings <bhutchings@solarflare.com> wrote:

>This looks basically correct, but it should use the proper functions:
>
>	skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0;

	As Ben points out, skb_rx_queue_recorded, skb_record_rx_queue,
et al, do the offset by one internally, but the bond_slave_override
function is comparing the slave's queue_id to the skb->queue_mapping.

	That makes me wonder if this patch is going to mess things up,
and if bond_slave_override should also use the skb_rx_queue_recorded, et
al, functions.

	-J

>Phil Oester
>
>Signed-off-by: Phil Oester <kernel@linuxace.com>
>
>
>--- linux-2.6/drivers/net/bonding/bond_main.c.orig	2011-01-30 09:15:09.813843817 -0800
>+++ linux-2.6/drivers/net/bonding/bond_main.c	2011-02-17 18:02:46.919050909 -0800
>@@ -4537,11 +4537,11 @@
> {
> 	/*
> 	 * This helper function exists to help dev_pick_tx get the correct
>-	 * destination queue.  Using a helper function skips the a call to
>+	 * destination queue.  Using a helper function skips a call to
> 	 * skb_tx_hash and will put the skbs in the queue we expect on their
> 	 * way down to the bonding driver.
> 	 */
>-	return skb->queue_mapping;
>+	return skb->queue_mapping ? skb->queue_mapping - 1 : 0;
> }
>
> static netdev_tx_t bond_start_xmit(struct sk_buff *skb, struct net_device *dev)

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply

* [PATCH 0/2] Add ipv4 if addr hash.
From: David Miller @ 2011-02-18  4:46 UTC (permalink / raw)
  To: netdev

These changes are based upon something I brought up the other day,
namely that we do a full FIB lookup just to find whether a particular
unicast ipv4 address is attached to an interface.

Like ipv6, I added an RCU/RTNL protected hash table and reimplemented
__ip_dev_find() in terms of it.

These patches apply both with and without the route cache deletion
patch applied, but with the route cache removal applied the udpflood
test gets faster by ~13 seconds:

real	2m57.500s
user	0m10.640s
sys	2m46.910s

We can probably use this new table for other things as well.

^ permalink raw reply

* [PATCH 1/2] ipv4: Add hash table of interface addresses.
From: David Miller @ 2011-02-18  4:46 UTC (permalink / raw)
  To: netdev


This will be used to optimize __ip_dev_find() and friends.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/inetdevice.h |    1 +
 net/ipv4/devinet.c         |   44 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 0 deletions(-)

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index ae8fdc5..5f81466 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -144,6 +144,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
 #define IN_DEV_ARP_NOTIFY(in_dev)	IN_DEV_MAXCONF((in_dev), ARP_NOTIFY)
 
 struct in_ifaddr {
+	struct hlist_node	hash;
 	struct in_ifaddr	*ifa_next;
 	struct in_device	*ifa_dev;
 	struct rcu_head		rcu_head;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 748cb5b..c1f2552 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -92,6 +92,38 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
 	[IFA_LABEL]     	= { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
 };
 
+/* inet_addr_hash's shifting is dependent upon this IN4_ADDR_HSIZE
+ * value.  So if you change this define, make appropriate changes to
+ * inet_addr_hash as well.
+ */
+#define IN4_ADDR_HSIZE	256
+static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
+static DEFINE_SPINLOCK(inet_addr_hash_lock);
+
+static inline unsigned int inet_addr_hash(__be32 addr)
+{
+	u32 val = (__force u32) addr;
+
+	return ((val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) &
+		(IN4_ADDR_HSIZE - 1));
+}
+
+static void inet_hash_insert(struct in_ifaddr *ifa)
+{
+	unsigned int hash = inet_addr_hash(ifa->ifa_address);
+
+	spin_lock(&inet_addr_hash_lock);
+	hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]);
+	spin_unlock(&inet_addr_hash_lock);
+}
+
+static void inet_hash_remove(struct in_ifaddr *ifa)
+{
+	spin_lock(&inet_addr_hash_lock);
+	hlist_del_init_rcu(&ifa->hash);
+	spin_unlock(&inet_addr_hash_lock);
+}
+
 static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);
 
 static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
@@ -265,6 +297,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 			}
 
 			if (!do_promote) {
+				inet_hash_remove(ifa);
 				*ifap1 = ifa->ifa_next;
 
 				rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid);
@@ -281,6 +314,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 	/* 2. Unlink it */
 
 	*ifap = ifa1->ifa_next;
+	inet_hash_remove(ifa1);
 
 	/* 3. Announce address deletion */
 
@@ -368,6 +402,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
 	ifa->ifa_next = *ifap;
 	*ifap = ifa;
 
+	inet_hash_insert(ifa);
+
 	/* Send message first, then call notifier.
 	   Notifier will trigger FIB update, so that
 	   listeners of netlink will know about new ifaddr */
@@ -521,6 +557,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh)
 	if (tb[IFA_ADDRESS] == NULL)
 		tb[IFA_ADDRESS] = tb[IFA_LOCAL];
 
+	INIT_HLIST_NODE(&ifa->hash);
 	ifa->ifa_prefixlen = ifm->ifa_prefixlen;
 	ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
 	ifa->ifa_flags = ifm->ifa_flags;
@@ -728,6 +765,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 		if (!ifa) {
 			ret = -ENOBUFS;
 			ifa = inet_alloc_ifa();
 			if (!ifa)
 				break;
+			INIT_HLIST_NODE(&ifa->hash);
 			if (colon)
@@ -1069,6 +1107,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
 			struct in_ifaddr *ifa = inet_alloc_ifa();
 
 			if (ifa) {
+				INIT_HLIST_NODE(&ifa->hash);
 				ifa->ifa_local =
 				  ifa->ifa_address = htonl(INADDR_LOOPBACK);
 				ifa->ifa_prefixlen = 8;
@@ -1710,6 +1749,11 @@ static struct rtnl_af_ops inet_af_ops = {
 
 void __init devinet_init(void)
 {
+	int i;
+
+	for (i = 0; i < IN4_ADDR_HSIZE; i++)
+		INIT_HLIST_HEAD(&inet_addr_lst[i]);
+
 	register_pernet_subsys(&devinet_ops);
 
 	register_gifconf(PF_INET, inet_gifconf);
-- 
1.7.4.1


^ permalink raw reply related

* [PATCH 2/2] ipv4: Implement __ip_dev_find using new interface address hash.
From: David Miller @ 2011-02-18  4:46 UTC (permalink / raw)
  To: netdev


Much quicker than going through the FIB tables.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/devinet.c      |   33 +++++++++++++++++++++++++++++++++
 net/ipv4/fib_frontend.c |   40 ----------------------------------------
 2 files changed, 33 insertions(+), 40 deletions(-)

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index c1f2552..1550881 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -124,6 +124,39 @@ static void inet_hash_remove(struct in_ifaddr *ifa)
 	spin_unlock(&inet_addr_hash_lock);
 }
 
+/**
+ * __ip_dev_find - find the first device with a given source address.
+ * @net: the net namespace
+ * @addr: the source address
+ * @devref: if true, take a reference on the found device
+ *
+ * If a caller uses devref=false, it should be protected by RCU, or RTNL
+ */
+struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
+{
+	unsigned int hash = inet_addr_hash(addr);
+	struct net_device *result = NULL;
+	struct in_ifaddr *ifa;
+	struct hlist_node *node;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) {
+		struct net_device *dev = ifa->ifa_dev->dev;
+
+		if (!net_eq(dev_net(dev), net))
+			continue;
+		if (ifa->ifa_address == addr) {
+			result = dev;
+			break;
+		}
+	}
+	if (result && devref)
+		dev_hold(result);
+	rcu_read_unlock();
+	return result;
+}
+EXPORT_SYMBOL(__ip_dev_find);
+
 static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);
 
 static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 2a49c06..ad0778a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -132,46 +132,6 @@ static void fib_flush(struct net *net)
 		rt_cache_flush(net, -1);
 }
 
-/**
- * __ip_dev_find - find the first device with a given source address.
- * @net: the net namespace
- * @addr: the source address
- * @devref: if true, take a reference on the found device
- *
- * If a caller uses devref=false, it should be protected by RCU, or RTNL
- */
-struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
-{
-	struct flowi fl = {
-		.fl4_dst = addr,
-	};
-	struct fib_result res = { 0 };
-	struct net_device *dev = NULL;
-	struct fib_table *local_table;
-
-#ifdef CONFIG_IP_MULTIPLE_TABLES
-	res.r = NULL;
-#endif
-
-	rcu_read_lock();
-	local_table = fib_get_table(net, RT_TABLE_LOCAL);
-	if (!local_table ||
-	    fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
-		rcu_read_unlock();
-		return NULL;
-	}
-	if (res.type != RTN_LOCAL)
-		goto out;
-	dev = FIB_RES_DEV(res);
-
-	if (dev && devref)
-		dev_hold(dev);
-out:
-	rcu_read_unlock();
-	return dev;
-}
-EXPORT_SYMBOL(__ip_dev_find);
-
 /*
  * Find address type as if only "dev" was present in the system. If
  * on_dev is NULL then all interfaces are taken into consideration.
-- 
1.7.4.1


^ permalink raw reply related

* Re: [PATCH 1/2] ipv4: Add hash table of interface addresses.
From: Eric Dumazet @ 2011-02-18  6:22 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20110217.204638.104041410.davem@davemloft.net>

Le jeudi 17 février 2011 à 20:46 -0800, David Miller a écrit :
> This will be used to optimize __ip_dev_find() and friends.
> 
> Signed-off-by: David S. Miller <davem@davemloft.net>
> ---
>  include/linux/inetdevice.h |    1 +
>  net/ipv4/devinet.c         |   44 ++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 45 insertions(+), 0 deletions(-)
> 
> diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
> index ae8fdc5..5f81466 100644
> --- a/include/linux/inetdevice.h
> +++ b/include/linux/inetdevice.h
> @@ -144,6 +144,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
>  #define IN_DEV_ARP_NOTIFY(in_dev)	IN_DEV_MAXCONF((in_dev), ARP_NOTIFY)
>  
>  struct in_ifaddr {
> +	struct hlist_node	hash;
>  	struct in_ifaddr	*ifa_next;
>  	struct in_device	*ifa_dev;
>  	struct rcu_head		rcu_head;
> diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
> index 748cb5b..c1f2552 100644
> --- a/net/ipv4/devinet.c
> +++ b/net/ipv4/devinet.c
> @@ -92,6 +92,38 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
>  	[IFA_LABEL]     	= { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
>  };
>  
> +/* inet_addr_hash's shifting is dependent upon this IN4_ADDR_HSIZE
> + * value.  So if you change this define, make appropriate changes to
> + * inet_addr_hash as well.
> + */
> +#define IN4_ADDR_HSIZE	256
> +static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
> +static DEFINE_SPINLOCK(inet_addr_hash_lock);
> +
> +static inline unsigned int inet_addr_hash(__be32 addr)
> +{
> +	u32 val = (__force u32) addr;
> +
> +	return ((val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) &
> +		(IN4_ADDR_HSIZE - 1));
> +}

Maybe you should take into account net pointer here, or machines with
many net namespaces will hash collide for 127.0.0.1




^ permalink raw reply

* Re: 2.6.38-rc5 tcp_connect oops: EIP = 0x0
From: Eric Dumazet @ 2011-02-18  6:32 UTC (permalink / raw)
  To: George Spelvin; +Cc: linux-kernel, netdev
In-Reply-To: <20110218050321.10415.qmail@science.horizon.com>

Le vendredi 18 février 2011 à 00:03 -0500, George Spelvin a écrit :
> But wonder of wonders, kernel mode switching worked so I got to see the
> oops on the text-mode console.  It happened just as I tried to ssh out.
> 

CC netdev (removed linux-netdev)

I'll take a look this morning, thanks for the report.

> It's a Core 2 duo laptop (Dell E1405), 2 GB RAM, running a 32-bit kernel.
> It's worth noting that I was using wired internet (b44 driver) and
> not wireless.
> 
> I've been having a lot of weird lockups with 2.6.38-rcX, quite a change
> from the very stable 2.6.36, but this is the first time I booted -rc5.
> Also, the symptoms are very different; before the lockup did not seen
> correlated with any particular activity, but the "lockup" was more like
> something getting wedged in the kernel that more and more tasks would
> get stuck on until everything stopped responding.
> 
> I should mention that this is transcribed by hand from the screen.
> Oh, and also, it is far from the first time I ran ssh this boot.
> (I re-tested it ater rebooting, just to be sure.  Not a consistent
> crash.)
> 
> Anyway, jumping to address 0 looks "interesting", so it seems worth reporting.
> 
> BUG: unable to handle kernel NULL pointer dereference at   (null)
> IP: [<  (null)>]   (null)
> *pde = 00000000
> Oops: 0000 [#1] SMP
> last sysfs file: /sys/devices/pci0000:00/0000:00:1f.2/host0/target0:0:0/0:0:0:0/block/sda/uevent
> Modules linked in: rfcomm btusb sco l2cap crc16 bluetooth b43 mac80211 cfg80211 [last unloaded: sha256_generic]
> 
> Pid: 12178, comm: ssh Not tainted 2.6.38-rc5 #227 Dell Inc. MXC061                          /0MG532
> EIP: 0060:[<00000000>] EFLAGS: 0021246 CPU: 0
> EIP is at 0x0
> EAX: f5947e00 EBX: f5982f80 ECX: 00000024 EDX: c148fe00
> ESI: f5947e00 EDI: ebf29e54 EBP: ebf29ee0 ESP: ebf29dd0
>  DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
> Process ssh (pid: 12178, ti=ebf28000 task=ebefc6c0 task.ti = ebef28000)
> Stack:
>  c12c83ea 07e2427d 02000000 7b018f79 c117b827 0b996609 7b018f79 5f5f6644
>  f5982f80 00000000 ebf29e58 ebf29ee0 c12cc0e0 00001600 00000000 00000001
>  03641600 00000000 00000000 00000000 00000000 036423c0 3e6423c0 00000000
> Call Trace:
>  [<c12c83ea>] ? tcp_connect+0xdd/0x3fd
>  [<c117b827>] ? secure_tcp_sequence_number+0x4f/0x65
>  [<c12cc0e0>] ? tcp_v4_connect+0x3c1/0x417
>  [<c12d6726>] ? inet_stream_connect+0x88/0x1fc
>  [<c1110705>] ? _copy_from_user+0x2b/0x10e
>  [<c129307d>] ? sys_connect+0x70/0x98
>  [<c108df0f>] ? get_empty_filp+0x9f/0x121
>  [<c108dfa0>] ? alloc_file+0xf/0x85
>  [<c1293287>] ? sock_alloc_file+0x97/0xeb
>  [<c108b758>] ? fd_install+0x1b/0x38
>  [<c12932f6>] ? sock_map_fd+0x1b/0x20
>  [<c1293bdf>] ? sys_socketcall+0x9d/0x291
>  [<c1002750>] ? sysenter_do_call+0x12/0x26
> Code:  Bad EIP value
> EIP: [<00000000>] 0x0 SS:ESP 0068:ebf29dd0
> CR2: 0000000000000000

^ permalink raw reply

* Re: [PATCH v2 09/13] can: pruss CAN driver.
From: Subhasish Ghosh @ 2011-02-18  7:07 UTC (permalink / raw)
  To: Kurt Van Dijck
  Cc: sachi-EvXpCiN+lbve9wHmmfpqLFaTQe2KTcn/,
	davinci-linux-open-source-VycZQUHpC/PFrsHnngEfi1aTQe2KTcn/,
	open list:CAN NETWORK DRIVERS, nsekhar-l0cyMroinI0, open list,
	open list:CAN NETWORK DRIVERS,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	m-watkins-l0cyMroinI0, Wolfgang Grandegger
In-Reply-To: <20110211152026.GC373-MxZ6Iy/zr/UdbCeoMzGj59i2O/JbrIOy@public.gmane.org>

--------------------------------------------------
From: "Kurt Van Dijck" <kurt.van.dijck-/BeEPy95v10@public.gmane.org>
Sent: Friday, February 11, 2011 8:50 PM
To: "Subhasish Ghosh" <subhasish-EvXpCiN+lbve9wHmmfpqLFaTQe2KTcn/@public.gmane.org>
Cc: <davinci-linux-open-source-VycZQUHpC/PFrsHnngEfi1aTQe2KTcn/@public.gmane.org>; 
<linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r@public.gmane.org>; <m-watkins-l0cyMroinI0@public.gmane.org>; 
<nsekhar-l0cyMroinI0@public.gmane.org>; <sachi-EvXpCiN+lbve9wHmmfpqLFaTQe2KTcn/@public.gmane.org>; "Wolfgang Grandegger" 
<wg-5Yr1BZd7O62+XT7JhA+gdA@public.gmane.org>; "open list:CAN NETWORK DRIVERS" 
<socketcan-core-0fE9KPoRgkgATYTw5x5z8w@public.gmane.org>; "open list:CAN NETWORK DRIVERS" 
<netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org>; "open list" <linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org>
Subject: Re: [PATCH v2 09/13] can: pruss CAN driver.

> Hi,
>
> I looked a bit at the TX path:
>
> On Fri, Feb 11, 2011 at 08:21:28PM +0530, Subhasish Ghosh wrote:
>> +static int omapl_pru_can_set_bittiming(struct net_device *ndev)
>> +{
>> + struct omapl_pru_can_priv *priv = netdev_priv(ndev);
>> + struct can_bittiming *bt = &priv->can.bittiming;
>> + long bit_error = 0;
>> +
>> + if (priv->can.ctrlmode & CAN_CTRLMODE_3_SAMPLES) {
>> + dev_warn(priv->dev, "WARN: Triple"
>> + "sampling not set due to h/w limitations");
> You should not have enabled CAN_CTRLMODE_3_SAMPLES in the first place?

SG - Ok Will remove.
>> + }
>> + if (pru_can_calc_timing(priv->dev, priv->can.clock.freq,
>> + bt->bitrate) != 0)
>> + return -EINVAL;
>> + bit_error =
>> +     (((priv->timer_freq / (priv->timer_freq / bt->bitrate)) -
>> +       bt->bitrate) * 1000) / bt->bitrate;
>> + if (bit_error) {
>> + bit_error =
>> +     (((priv->timer_freq / (priv->timer_freq / bt->bitrate)) -
>> +       bt->bitrate) * 1000000) / bt->bitrate;
>> + printk(KERN_INFO "\nBitrate error %ld.%ld%%\n",
>> + bit_error / 10000, bit_error % 1000);
>> + } else
>> + printk(KERN_INFO "\nBitrate error 0.0%%\n");
>> +
>> + return 0;
>> +}
> I wonder how much of this code is duplicated from drivers/net/can/dev.c ?
SG - Well, I just followed ti_hecc.c :-)

>
>> +static netdev_tx_t omapl_pru_can_start_xmit(struct sk_buff *skb,
>> +     struct net_device *ndev)
>> +{
>> + struct omapl_pru_can_priv *priv = netdev_priv(ndev);
>> + struct can_frame *cf = (struct can_frame *)skb->data;
>> + int count;
>> + u8 *data = cf->data;
>> + u8 dlc = cf->can_dlc;
>> + u8 *ptr8data = NULL;
>> +
> most drivers start with:
> if (can_dropped_invalid_skb(dev, skb))
> return NETDEV_TX_OK;

SG - Will do.
>
>> + netif_stop_queue(ndev);
> why would you stop when you just resumed the queue?

SG - I do not want more than one transmit request at one time. Hence, on
entering the transmit path, I am using netif_stop_queue to disable tx.

>> + if (cf->can_id & CAN_EFF_FLAG) /* Extended frame format */
>> + *((u32 *) &priv->can_tx_hndl.strcanmailbox) =
>> +     (cf->can_id & CAN_EFF_MASK) | PRU_CANMID_IDE;
>> + else /* Standard frame format */
>> + *((u32 *) &priv->can_tx_hndl.strcanmailbox) =
>> +     (cf->can_id & CAN_SFF_MASK) << 18;
>> +
>> + if (cf->can_id & CAN_RTR_FLAG) /* Remote transmission request */
>> + *((u32 *) &priv->can_tx_hndl.strcanmailbox) |= CAN_RTR_FLAG;
>> +
>> + ptr8data = &priv->can_tx_hndl.strcanmailbox.u8data7 + (dlc - 1);
>> + for (count = 0; count < (u8) dlc; count++) {
>> + *ptr8data-- = *data++;
>> + }
>> + *((u32 *) &priv->can_tx_hndl.strcanmailbox.u16datalength) = (u32) dlc;
>> +/*
>> + * search for the next available mbx
>> + * if the next mbx is busy, then try the next + 1
>> + * do this until the head is reached.
>> + * if still unable to tx, stop accepting any packets
>> + * if able to tx and the head is reached, then reset next to tail, i.e 
>> mbx0
>> + * if head is not reached, then just point to the next mbx
>> + */
>> + for (; priv->tx_next <= priv->tx_head; priv->tx_next++) {
>> + priv->can_tx_hndl.ecanmailboxnumber =
>> +     (can_mailbox_number) priv->tx_next;
>> + if (-1 == pru_can_write_data_to_mailbox(priv->dev,
>> + &priv->can_tx_hndl)) {
>> + if (priv->tx_next == priv->tx_head) {
>> + priv->tx_next = priv->tx_tail;
>> + if (!netif_queue_stopped(ndev))
> If you get here, the queue is not stopped. This test is therefore useless.

SG -Ok, will remove
>> + netif_stop_queue(ndev); /* IF stalled */
>> + dev_err(priv->dev,
>> + "%s: no tx mbx available", __func__);
>> + return NETDEV_TX_BUSY;
>> + } else
>> + continue;
>> + } else {
>> + /* set transmit request */
>> + pru_can_tx(priv->dev, priv->tx_next, CAN_TX_PRU_1);
>> + pru_can_tx_mode_set(priv->dev, false, ecanreceive);
>> + pru_can_tx_mode_set(priv->dev, true, ecantransmit);
>> + pru_can_start_abort_tx(priv->dev, PRU_CAN_START);
>> + priv->tx_next++;
>> + can_put_echo_skb(skb, ndev, 0);
>> + break;
>> + }
>> + }
>> + if (priv->tx_next > priv->tx_head) {
>> + priv->tx_next = priv->tx_tail;
>> + }
>> + return NETDEV_TX_OK;
>> +}
>> +
>> +
>
>> +irqreturn_t omapl_tx_can_intr(int irq, void *dev_id)
>> +{
>> + struct net_device *ndev = dev_id;
>> + struct omapl_pru_can_priv *priv = netdev_priv(ndev);
>> + struct net_device_stats *stats = &ndev->stats;
>> + u32 bit_set, mbxno;
>> +
>> + pru_can_get_intr_status(priv->dev, &priv->can_tx_hndl);
>> + if ((PRU_CAN_ISR_BIT_CCI & priv->can_tx_hndl.u32interruptstatus)
>> +     || (PRU_CAN_ISR_BIT_SRDI & priv->can_tx_hndl.u32interruptstatus)) {
>> + __can_debug("tx_int_status = 0x%X\n",
>> +     priv->can_tx_hndl.u32interruptstatus);
>> + can_free_echo_skb(ndev, 0);
>> + } else {
>> + for (bit_set = 0; ((priv->can_tx_hndl.u32interruptstatus & 0xFF)
>> + >> bit_set != 0); bit_set++)
>> + ;
>> + if (0 == bit_set) {
>> + __can_err("%s: invalid mailbox number\n", __func__);
>> + can_free_echo_skb(ndev, 0);
>> + } else {
>> + mbxno = bit_set - 1; /* mail box numbering starts from 0 */
>> + if (PRU_CAN_ISR_BIT_ESI & priv->can_tx_hndl.
>> +     u32interruptstatus) {
>> + /* read gsr and ack pru */
>> + pru_can_get_global_status(priv->dev, &priv->can_tx_hndl);
>> + omapl_pru_can_err(ndev,
>> +   priv->can_tx_hndl.
>> +   u32interruptstatus,
>> +   priv->can_tx_hndl.
>> +   u32globalstatus);
>> + } else {
>> + stats->tx_packets++;
>> + /* stats->tx_bytes += dlc; */
>> + /*can_get_echo_skb(ndev, 0);*/
>> + }
>> + }
>> + }
>> + if (netif_queue_stopped(ndev))
> you can call netif_wake_queue(ndev) multiple times, so there is no need
> for netif_queue_stopped()

SG -Ok, will remove

>> + netif_wake_queue(ndev);
>> +
>> + can_get_echo_skb(ndev, 0);
>> + pru_can_tx_mode_set(priv->dev, true, ecanreceive);
>> + return IRQ_HANDLED;
>> +}
>> +
>> +static int omapl_pru_can_open(struct net_device *ndev)
>> +{
>> + struct omapl_pru_can_priv *priv = netdev_priv(ndev);
>> + int err;
>> +
>> + /* register interrupt handler */
>> + err = request_irq(priv->trx_irq, &omapl_rx_can_intr, IRQF_SHARED,
>> +   "pru_can_irq", ndev);
> you're doing a lot of work _in_ the irq handler. Maybe threaded irq?
>
SG -Ok, will do

>> +static int omapl_pru_can_close(struct net_device *ndev)
>> +{
>> + struct omapl_pru_can_priv *priv = netdev_priv(ndev);
>> +
>> + if (!netif_queue_stopped(ndev))
> check is not needed.

SG -Ok, will remove

>> + netif_stop_queue(ndev);
>> +
>> + close_candev(ndev);
>> +
>> + free_irq(priv->trx_irq, ndev);
>> + return 0;
>> +}
>> +
>
> Regards,
> Kurt 

^ permalink raw reply

* Re: IGMP and rwlock: Dead occurred again on TILEPro
From: Cypher Wu @ 2011-02-18  7:08 UTC (permalink / raw)
  To: Chris Metcalf
  Cc: David Miller, xiyou.wangcong, linux-kernel, eric.dumazet, netdev
In-Reply-To: <4D5DACC5.60105@tilera.com>

On Fri, Feb 18, 2011 at 7:18 AM, Chris Metcalf <cmetcalf@tilera.com> wrote:
> On 2/17/2011 6:11 PM, David Miller wrote:
>> From: Chris Metcalf <cmetcalf@tilera.com>
>> Date: Thu, 17 Feb 2011 18:04:13 -0500
>>
>>> On 2/17/2011 5:53 PM, David Miller wrote:
>>>> From: Chris Metcalf <cmetcalf@tilera.com>
>>>> Date: Thu, 17 Feb 2011 17:49:46 -0500
>>>>
>>>>> The fix is to disable interrupts for the arch_read_lock family of methods.
>>>> How does that help handle the race when it happens between different
>>>> cpus, instead of between IRQ and non-IRQ context on the same CPU?
>>> There's no race in that case, since the lock code properly backs off and
>>> retries until the other cpu frees it.  The distinction here is that the
>>> non-IRQ context is "wedged" by the IRQ context.
>>>
>>>> Why don't you just use the generic spinlock based rwlock code on Tile,
>>>> since that is all that your atomic instructions can handle
>>>> sufficiently?
>>> The tile-specific code encodes reader/writer information in the same 32-bit
>>> word that the test-and-set instruction manipulates, so it's more efficient
>>> both in space and time.  This may not really matter for rwlocks, since no
>>> one cares much about them any more, but that was the motivation.
>> Ok, but IRQ disabling is going to be very expensive.
>
> The interrupt architecture on Tile allows a write to a special-purpose
> register to put you into a "critical section" where no interrupts or faults
> are delivered.  So we just need to bracket the read_lock operations with
> two SPR writes; each takes six machine cycles, so we're only adding 12
> cycles to the total cost of taking or releasing a read lock on an rwlock.
>
> --
> Chris Metcalf, Tilera Corp.
> http://www.tilera.com
>
>

Adding those two SPR writes should be fine, but might it delay interrupts
a little more than read_lock() does on other platforms?

Another question: what does NMI in the earlier mail mean?

Looking forward to your patch.

Regards

-- 
Cyberman Wu

^ permalink raw reply

* Re: [PATCH v2 09/13] can: pruss CAN driver.
From: Wolfgang Grandegger @ 2011-02-18  7:53 UTC (permalink / raw)
  To: Subhasish Ghosh
  Cc: sachi-EvXpCiN+lbve9wHmmfpqLFaTQe2KTcn/,
	davinci-linux-open-source-VycZQUHpC/PFrsHnngEfi1aTQe2KTcn/,
	CAN NETWORK DRIVERS, nsekhar-l0cyMroinI0, open list,
	CAN NETWORK DRIVERS, m-watkins-l0cyMroinI0,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r
In-Reply-To: <32A5399EB727427C98185089E5DBFA65@subhasishg>

On 02/18/2011 08:07 AM, Subhasish Ghosh wrote:
> --------------------------------------------------
> From: "Kurt Van Dijck" <kurt.van.dijck-/BeEPy95v10@public.gmane.org>

...
>>> + /* register interrupt handler */
>>> + err = request_irq(priv->trx_irq, &omapl_rx_can_intr, IRQF_SHARED,
>>> +   "pru_can_irq", ndev);
>> you're doing a lot of work _in_ the irq handler. Maybe threaded irq?
>>
> SG -Ok, will do

No, please use NAPI instead.

Wolfgang

^ permalink raw reply

* Re: [PATCH v2 09/13] can: pruss CAN driver.
From: Subhasish Ghosh @ 2011-02-18  8:15 UTC (permalink / raw)
  To: Wolfgang Grandegger
  Cc: sachi-EvXpCiN+lbve9wHmmfpqLFaTQe2KTcn/,
	davinci-linux-open-source-VycZQUHpC/PFrsHnngEfi1aTQe2KTcn/,
	open list:CAN NETWORK DRIVERS, nsekhar-l0cyMroinI0, open list,
	open list:CAN NETWORK DRIVERS, m-watkins-l0cyMroinI0,
	linux-arm-kernel-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r
In-Reply-To: <4D5E2570.10108-5Yr1BZd7O62+XT7JhA+gdA@public.gmane.org>

> On 02/18/2011 08:07 AM, Subhasish Ghosh wrote:
>> --------------------------------------------------
>> From: "Kurt Van Dijck" <kurt.van.dijck-/BeEPy95v10@public.gmane.org>
>
> ...
>>>> + /* register interrupt handler */
>>>> + err = request_irq(priv->trx_irq, &omapl_rx_can_intr, IRQF_SHARED,
>>>> +   "pru_can_irq", ndev);
>>> you're doing a lot of work _in_ the irq handler. Maybe threaded irq?
>>>
>> SG -Ok, will do
>
> No, please use NAPI instead.

We are using h/w filters, so the number of interrupts coming into the
processor is not hogging it.
I feel that we may not require interrupt mitigation.

-Subhasish 

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox