* [PATCH v3 03/10] net: Add netdev interfaces recording send/compl
From: Tom Herbert @ 2011-11-23 5:52 UTC (permalink / raw)
To: davem, netdev
Add interfaces for drivers to call for recording number of packets and
bytes at send time and transmit completion. Also, added a function to
"reset" a queue. These will be used by Byte Queue Limits.
Signed-off-by: Tom Herbert <therbert@google.com>
---
include/linux/netdevice.h | 29 +++++++++++++++++++++++++++++
1 files changed, 29 insertions(+), 0 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index dfb50ed..8b3eb8a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1924,6 +1924,35 @@ static inline int netif_xmit_frozen_or_stopped(const struct netdev_queue *dev_qu
return dev_queue->state & QUEUE_STATE_ANY_XOFF_OR_FROZEN;
}
+static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue,
+ unsigned int pkts, unsigned int bytes)
+{
+}
+
+static inline void netdev_sent_queue(struct net_device *dev,
+ unsigned int pkts, unsigned int bytes)
+{
+ netdev_tx_sent_queue(netdev_get_tx_queue(dev, 0), pkts, bytes);
+}
+
+static inline void netdev_tx_completed_queue(struct netdev_queue *dev_queue,
+ unsigned pkts, unsigned bytes)
+{
+}
+
+static inline void netdev_completed_queue(struct net_device *dev,
+ unsigned pkts, unsigned bytes)
+{
+ netdev_tx_completed_queue(netdev_get_tx_queue(dev, 0), pkts, bytes);
+}
+
+static inline void netdev_tx_reset_queue(struct netdev_queue *q)
+{
+}
+
+static inline void netdev_reset_queue(struct net_device *dev_queue)
+{
+ netdev_tx_reset_queue(netdev_get_tx_queue(dev_queue, 0));
}
/**
--
1.7.3.1
^ permalink raw reply related
* [PATCH v3 02/10] net: Add queue state xoff flag for stack
From: Tom Herbert @ 2011-11-23 5:52 UTC (permalink / raw)
To: davem, netdev
Create separate queue state flags so that either the stack or drivers
can turn on XOFF. Added a set of functions used in the stack to determine
if a queue is really stopped (either by stack or driver)
Signed-off-by: Tom Herbert <therbert@google.com>
---
include/linux/netdevice.h | 41 ++++++++++++++++++++++++++++++-----------
net/core/dev.c | 4 ++--
net/core/netpoll.c | 4 ++--
net/core/pktgen.c | 2 +-
net/sched/sch_generic.c | 8 ++++----
net/sched/sch_multiq.c | 6 ++++--
net/sched/sch_teql.c | 6 +++---
7 files changed, 46 insertions(+), 25 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cbeb586..dfb50ed 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -516,11 +516,23 @@ static inline void napi_synchronize(const struct napi_struct *n)
#endif
enum netdev_queue_state_t {
- __QUEUE_STATE_XOFF,
+ __QUEUE_STATE_DRV_XOFF,
+ __QUEUE_STATE_STACK_XOFF,
__QUEUE_STATE_FROZEN,
-#define QUEUE_STATE_XOFF_OR_FROZEN ((1 << __QUEUE_STATE_XOFF) | \
- (1 << __QUEUE_STATE_FROZEN))
+#define QUEUE_STATE_ANY_XOFF ((1 << __QUEUE_STATE_DRV_XOFF) | \
+ (1 << __QUEUE_STATE_STACK_XOFF))
+#define QUEUE_STATE_ANY_XOFF_OR_FROZEN (QUEUE_STATE_ANY_XOFF | \
+ (1 << __QUEUE_STATE_FROZEN))
};
+/*
+ * __QUEUE_STATE_DRV_XOFF is used by drivers to stop the transmit queue. The
+ * netif_tx_* functions below are used to manipulate this flag. The
+ * __QUEUE_STATE_STACK_XOFF flag is used by the stack to stop the transmit
+ * queue independently. The netif_xmit_*stopped functions below are called
+ * to check if the queue has been stopped by the driver or stack (either
+ * of the XOFF bits are set in the state). Drivers should not need to call
+ * netif_xmit*stopped functions, they should only be using netif_tx_*.
+ */
struct netdev_queue {
/*
@@ -1783,7 +1795,7 @@ extern void __netif_schedule(struct Qdisc *q);
static inline void netif_schedule_queue(struct netdev_queue *txq)
{
- if (!test_bit(__QUEUE_STATE_XOFF, &txq->state))
+ if (!(txq->state & QUEUE_STATE_ANY_XOFF))
__netif_schedule(txq->qdisc);
}
@@ -1797,7 +1809,7 @@ static inline void netif_tx_schedule_all(struct net_device *dev)
static inline void netif_tx_start_queue(struct netdev_queue *dev_queue)
{
- clear_bit(__QUEUE_STATE_XOFF, &dev_queue->state);
+ clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);
}
/**
@@ -1829,7 +1841,7 @@ static inline void netif_tx_wake_queue(struct netdev_queue *dev_queue)
return;
}
#endif
- if (test_and_clear_bit(__QUEUE_STATE_XOFF, &dev_queue->state))
+ if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state))
__netif_schedule(dev_queue->qdisc);
}
@@ -1861,7 +1873,7 @@ static inline void netif_tx_stop_queue(struct netdev_queue *dev_queue)
pr_info("netif_stop_queue() cannot be called before register_netdev()\n");
return;
}
- set_bit(__QUEUE_STATE_XOFF, &dev_queue->state);
+ set_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);
}
/**
@@ -1888,7 +1900,7 @@ static inline void netif_tx_stop_all_queues(struct net_device *dev)
static inline int netif_tx_queue_stopped(const struct netdev_queue *dev_queue)
{
- return test_bit(__QUEUE_STATE_XOFF, &dev_queue->state);
+ return test_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);
}
/**
@@ -1902,9 +1914,16 @@ static inline int netif_queue_stopped(const struct net_device *dev)
return netif_tx_queue_stopped(netdev_get_tx_queue(dev, 0));
}
-static inline int netif_tx_queue_frozen_or_stopped(const struct netdev_queue *dev_queue)
+static inline int netif_xmit_stopped(const struct netdev_queue *dev_queue)
{
- return dev_queue->state & QUEUE_STATE_XOFF_OR_FROZEN;
+ return dev_queue->state & QUEUE_STATE_ANY_XOFF;
+}
+
+static inline int netif_xmit_frozen_or_stopped(const struct netdev_queue *dev_queue)
+{
+ return dev_queue->state & QUEUE_STATE_ANY_XOFF_OR_FROZEN;
+}
+
}
/**
@@ -1991,7 +2010,7 @@ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
if (netpoll_trap())
return;
#endif
- if (test_and_clear_bit(__QUEUE_STATE_XOFF, &txq->state))
+ if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state))
__netif_schedule(txq->qdisc);
}
diff --git a/net/core/dev.c b/net/core/dev.c
index 6ba50a1..8ca56c0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2256,7 +2256,7 @@ gso:
return rc;
}
txq_trans_update(txq);
- if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
+ if (unlikely(netif_xmit_stopped(txq) && skb->next))
return NETDEV_TX_BUSY;
} while (skb->next);
@@ -2530,7 +2530,7 @@ int dev_queue_xmit(struct sk_buff *skb)
HARD_TX_LOCK(dev, txq, cpu);
- if (!netif_tx_queue_stopped(txq)) {
+ if (!netif_xmit_stopped(txq)) {
__this_cpu_inc(xmit_recursion);
rc = dev_hard_start_xmit(skb, dev, txq);
__this_cpu_dec(xmit_recursion);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index cf64c1f..f1667f8 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -76,7 +76,7 @@ static void queue_process(struct work_struct *work)
local_irq_save(flags);
__netif_tx_lock(txq, smp_processor_id());
- if (netif_tx_queue_frozen_or_stopped(txq) ||
+ if (netif_xmit_frozen_or_stopped(txq) ||
ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) {
skb_queue_head(&npinfo->txq, skb);
__netif_tx_unlock(txq);
@@ -317,7 +317,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
tries > 0; --tries) {
if (__netif_tx_trylock(txq)) {
- if (!netif_tx_queue_stopped(txq)) {
+ if (!netif_xmit_stopped(txq)) {
status = ops->ndo_start_xmit(skb, dev);
if (status == NETDEV_TX_OK)
txq_trans_update(txq);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 0001c24..de2f017 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3345,7 +3345,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
__netif_tx_lock_bh(txq);
- if (unlikely(netif_tx_queue_frozen_or_stopped(txq))) {
+ if (unlikely(netif_xmit_frozen_or_stopped(txq))) {
ret = NETDEV_TX_BUSY;
pkt_dev->last_ok = 0;
goto unlock;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 69fca27..7c84f08 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -60,7 +60,7 @@ static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
/* check the reason of requeuing without tx lock first */
txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
- if (!netif_tx_queue_frozen_or_stopped(txq)) {
+ if (!netif_xmit_frozen_or_stopped(txq)) {
q->gso_skb = NULL;
q->q.qlen--;
} else
@@ -121,7 +121,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
spin_unlock(root_lock);
HARD_TX_LOCK(dev, txq, smp_processor_id());
- if (!netif_tx_queue_frozen_or_stopped(txq))
+ if (!netif_xmit_frozen_or_stopped(txq))
ret = dev_hard_start_xmit(skb, dev, txq);
HARD_TX_UNLOCK(dev, txq);
@@ -143,7 +143,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
ret = dev_requeue_skb(skb, q);
}
- if (ret && netif_tx_queue_frozen_or_stopped(txq))
+ if (ret && netif_xmit_frozen_or_stopped(txq))
ret = 0;
return ret;
@@ -242,7 +242,7 @@ static void dev_watchdog(unsigned long arg)
* old device drivers set dev->trans_start
*/
trans_start = txq->trans_start ? : dev->trans_start;
- if (netif_tx_queue_stopped(txq) &&
+ if (netif_xmit_stopped(txq) &&
time_after(jiffies, (trans_start +
dev->watchdog_timeo))) {
some_queue_timedout = 1;
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index edc1950..49131d7 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -107,7 +107,8 @@ static struct sk_buff *multiq_dequeue(struct Qdisc *sch)
/* Check that target subqueue is available before
* pulling an skb to avoid head-of-line blocking.
*/
- if (!__netif_subqueue_stopped(qdisc_dev(sch), q->curband)) {
+ if (!netif_xmit_stopped(
+ netdev_get_tx_queue(qdisc_dev(sch), q->curband))) {
qdisc = q->queues[q->curband];
skb = qdisc->dequeue(qdisc);
if (skb) {
@@ -138,7 +139,8 @@ static struct sk_buff *multiq_peek(struct Qdisc *sch)
/* Check that target subqueue is available before
* pulling an skb to avoid head-of-line blocking.
*/
- if (!__netif_subqueue_stopped(qdisc_dev(sch), curband)) {
+ if (!netif_xmit_stopped(
+ netdev_get_tx_queue(qdisc_dev(sch), curband))) {
qdisc = q->queues[curband];
skb = qdisc->ops->peek(qdisc);
if (skb)
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index a3b7120..283bfe3 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -301,7 +301,7 @@ restart:
if (slave_txq->qdisc_sleeping != q)
continue;
- if (__netif_subqueue_stopped(slave, subq) ||
+ if (netif_xmit_stopped(netdev_get_tx_queue(slave, subq)) ||
!netif_running(slave)) {
busy = 1;
continue;
@@ -312,7 +312,7 @@ restart:
if (__netif_tx_trylock(slave_txq)) {
unsigned int length = qdisc_pkt_len(skb);
- if (!netif_tx_queue_frozen_or_stopped(slave_txq) &&
+ if (!netif_xmit_frozen_or_stopped(slave_txq) &&
slave_ops->ndo_start_xmit(skb, slave) == NETDEV_TX_OK) {
txq_trans_update(slave_txq);
__netif_tx_unlock(slave_txq);
@@ -324,7 +324,7 @@ restart:
}
__netif_tx_unlock(slave_txq);
}
- if (netif_queue_stopped(dev))
+ if (netif_xmit_stopped(netdev_get_tx_queue(dev, 0)))
busy = 1;
break;
case 1:
--
1.7.3.1
^ permalink raw reply related
* [PATCH v3 01/10] dql: Dynamic queue limits
From: Tom Herbert @ 2011-11-23 5:52 UTC (permalink / raw)
To: davem, netdev
Implementation of dynamic queue limits (dql). This is a libary which
allows a queue limit to be dynamically managed. The goal of dql is
to set the queue limit, number of objects to the queue, to be minimized
without allowing the queue to be starved.
dql would be used with a queue which has these properties:
1) Objects are queued up to some limit which can be expressed as a
count of objects.
2) Periodically a completion process executes which retires consumed
objects.
3) Starvation occurs when limit has been reached, all queued data has
actually been consumed but completion processing has not yet run,
so queuing new data is blocked.
4) Minimizing the amount of queued data is desirable.
A canonical example of such a queue would be a NIC HW transmit queue.
The queue limit is dynamic, it will increase or decrease over time
depending on the workload. The queue limit is recalculated each time
completion processing is done. Increases occur when the queue is
starved and can exponentially increase over successive intervals.
Decreases occur when more data is being maintained in the queue than
needed to prevent starvation. The number of extra objects, or "slack",
is measured over successive intervals, and to avoid hysteresis the
limit is only reduced by the miminum slack seen over a configurable
time period.
dql API provides routines to manage the queue:
- dql_init is called to intialize the dql structure
- dql_reset is called to reset dynamic values
- dql_queued called when objects are being enqueued
- dql_avail returns availability in the queue
- dql_completed is called when objects have be consumed in the queue
Configuration consists of:
- max_limit, maximum limit
- min_limit, minimum limit
- slack_hold_time, time to measure instances of slack before reducing
queue limit
Signed-off-by: Tom Herbert <therbert@google.com>
---
include/linux/dynamic_queue_limits.h | 81 +++++++++++++++++++++
lib/Kconfig | 3 +
lib/Makefile | 2 +
lib/dynamic_queue_limits.c | 132 ++++++++++++++++++++++++++++++++++
4 files changed, 218 insertions(+), 0 deletions(-)
create mode 100644 include/linux/dynamic_queue_limits.h
create mode 100644 lib/dynamic_queue_limits.c
diff --git a/include/linux/dynamic_queue_limits.h b/include/linux/dynamic_queue_limits.h
new file mode 100644
index 0000000..8953187
--- /dev/null
+++ b/include/linux/dynamic_queue_limits.h
@@ -0,0 +1,81 @@
+/*
+ * Dynamic queue limits (dql) - Definitions
+ *
+ * Copyright (c) 2011, Tom Herbert <therbert@google.com>
+ *
+ * This header file contains the definitions for dynamic queue limits (dql).
+ * dql would be used in conjunction with a producer/consumer type queue
+ * (possibly a HW queue). Such a queue would have these general properties:
+ *
+ * 1) Objects are queued up to some limit specified as number of objects.
+ * 2) Periodically a completion process executes which retires consumed
+ * objects.
+ * 3) Starvation occurs when limit has been reached, all queued data has
+ * actually been consumed, but completion processing has not yet run
+ * so queuing new data is blocked.
+ * 4) Minimizing the amount of queued data is desirable.
+ *
+ * The goal of dql is to calculate the limit as the minimum number of objects
+ * needed to prevent starvation.
+ *
+ * The dql implementation does not implement any locking for the dql data
+ * structures, the higher layer should provide this.
+ */
+
+#ifndef _LINUX_DQL_H
+#define _LINUX_DQL_H
+
+#ifdef __KERNEL__
+
+struct dql {
+ unsigned long num_queued; /* Total ever queued */
+ unsigned long last_obj_cnt; /* Count at last queuing */
+
+ unsigned long limit ____cacheline_aligned_in_smp; /* Current limit */
+ unsigned long prev_ovlimit; /* Previous over limit */
+
+ unsigned long prev_num_queued; /* Previous queue total */
+ unsigned long num_completed; /* Total ever completed */
+
+ unsigned long prev_last_obj_cnt; /* Previous queuing cnt */
+
+ unsigned long lowest_slack; /* Lowest slack found */
+ unsigned long slack_start_time; /* Time slacks seen */
+
+ unsigned long max_limit ____cacheline_aligned_in_smp; /* Max limit */
+ unsigned long min_limit; /* Minimum limit */
+ unsigned slack_hold_time; /* Time to measure slack */
+};
+
+/* Set some static maximums */
+#define DQL_MAX_OBJECT (-1UL / 16)
+#define DQL_MAX_LIMIT ((-1UL / 2) - DQL_MAX_OBJECT)
+
+/* Record number of objects queued. */
+static inline void dql_queued(struct dql *dql, unsigned long count)
+{
+ BUG_ON(count > DQL_MAX_OBJECT);
+ BUG_ON(dql->num_queued - dql->num_completed > DQL_MAX_LIMIT);
+
+ dql->num_queued += count;
+ dql->last_obj_cnt = count;
+}
+
+/* Returns how many objects can be queued, < 0 indicates over limit. */
+static inline long dql_avail(struct dql *dql)
+{
+ return dql->limit - (dql->num_queued - dql->num_completed);
+}
+
+/* Record number of completed objects and recalculate the limit. */
+extern void dql_completed(struct dql *dql, unsigned long count);
+
+/* Reset dql state */
+extern void dql_reset(struct dql *dql);
+
+/* Initialize dql state */
+extern int dql_init(struct dql *dql, unsigned hold_time);
+
+#endif /* _KERNEL_ */
+
+#endif /* _LINUX_DQL_H */
diff --git a/lib/Kconfig b/lib/Kconfig
index 32f3e5a..63b5782 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -244,6 +244,9 @@ config CPU_RMAP
bool
depends on SMP
+config DQL
+ bool
+
#
# Netlink attribute parsing support is select'ed if needed
#
diff --git a/lib/Makefile b/lib/Makefile
index a4da283..ff00d4d 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -115,6 +115,8 @@ obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o
obj-$(CONFIG_CORDIC) += cordic.o
+obj-$(CONFIG_DQL) += dynamic_queue_limits.o
+
hostprogs-y := gen_crc32table
clean-files := crc32table.h
diff --git a/lib/dynamic_queue_limits.c b/lib/dynamic_queue_limits.c
new file mode 100644
index 0000000..9b9edb0
--- /dev/null
+++ b/lib/dynamic_queue_limits.c
@@ -0,0 +1,132 @@
+/*
+ * Dynamic byte queue limits. See include/linux/dynamic_queue_limits.h
+ *
+ * Copyright (c) 2011, Tom Herbert <therbert@google.com>
+ */
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/kernel.h>
+#include <linux/dynamic_queue_limits.h>
+
+#define POSDIFF(A, B) ((A) > (B) ? (A) - (B) : 0)
+
+/* Records completed count and recalculates the queue limit */
+void dql_completed(struct dql *dql, unsigned long count)
+{
+ unsigned long inprogress, prev_inprogress, limit;
+ unsigned long ovlimit, all_prev_completed, completed;
+
+ /* Can't complete more than what's in queue */
+ BUG_ON(count > dql->num_queued - dql->num_completed);
+
+ completed = dql->num_completed + count;
+ limit = dql->limit;
+ ovlimit = POSDIFF(dql->num_queued - dql->num_completed, limit);
+ inprogress = dql->num_queued - completed;
+ prev_inprogress = dql->prev_num_queued - dql->num_completed;
+ all_prev_completed = POSDIFF(completed, dql->prev_num_queued);
+
+ if ((ovlimit && !inprogress) ||
+ (dql->prev_ovlimit && all_prev_completed)) {
+ /*
+ * Queue considered starved if:
+ * - The queue was over-limit in the last interval,
+ * and there is no more data in the queue.
+ * OR
+ * - The queue was over-limit in the previous interval and
+ * when enqueuing it was possible that all queued data
+ * had been consumed. This covers the case when queue
+ * may have becomes starved between completion processing
+ * running and next time enqueue was scheduled.
+ *
+ * When queue is starved increase the limit by the amount
+ * of bytes both sent and completed in the last interval,
+ * plus any previous over-limit.
+ */
+ limit += POSDIFF(completed, dql->prev_num_queued) +
+ dql->prev_ovlimit;
+ dql->slack_start_time = jiffies;
+ dql->lowest_slack = -1UL;
+ } else if (inprogress && prev_inprogress && !all_prev_completed) {
+ /*
+ * Queue was not starved, check if the limit can be decreased.
+ * A decrease is only considered if the queue has been busy in
+ * the whole interval (the check above).
+ *
+ * If there is slack, the amount of execess data queued above
+ * the the amount needed to prevent starvation, the queue limit
+ * can be decreased. To avoid hysteresis we consider the
+ * minimum amount of slack found over several iterations of the
+ * completion routine.
+ */
+ unsigned long slack, slack_last_objs;
+
+ /*
+ * Slack is the maximum of
+ * - The queue limit plus previous over-limit minus twice
+ * the number of objects completed. Note that two times
+ * number of completed bytes is a basis for an upper bound
+ * of the limit.
+ * - Portion of objects in the last queuing operation that
+ * was not part of non-zero previous over-limit. That is
+ * "round down" by non-overlimit portion of the last
+ * queueing operation.
+ */
+ slack = POSDIFF(limit + dql->prev_ovlimit,
+ 2 * (completed - dql->num_completed));
+ slack_last_objs = dql->prev_ovlimit ?
+ POSDIFF(dql->prev_last_obj_cnt, dql->prev_ovlimit) : 0;
+
+ slack = max(slack, slack_last_objs);
+
+ if (slack < dql->lowest_slack)
+ dql->lowest_slack = slack;
+
+ if (time_after(jiffies,
+ dql->slack_start_time + dql->slack_hold_time)) {
+ limit = POSDIFF(limit, dql->lowest_slack);
+ dql->slack_start_time = jiffies;
+ dql->lowest_slack = -1UL;
+ }
+ }
+
+ /* Enforce bounds on limit */
+ limit = clamp(limit, dql->min_limit, dql->max_limit);
+
+ if (limit != dql->limit) {
+ dql->limit = limit;
+ ovlimit = 0;
+ }
+
+ dql->prev_ovlimit = ovlimit;
+ dql->prev_last_obj_cnt = dql->last_obj_cnt;
+ dql->num_completed = completed;
+ dql->prev_num_queued = dql->num_queued;
+}
+EXPORT_SYMBOL(dql_completed);
+
+void dql_reset(struct dql *dql)
+{
+ /* Reset all dynamic values */
+ dql->limit = 0;
+ dql->num_queued = 0;
+ dql->num_completed = 0;
+ dql->last_obj_cnt = 0;
+ dql->prev_num_queued = 0;
+ dql->prev_last_obj_cnt = 0;
+ dql->prev_ovlimit = 0;
+ dql->lowest_slack = -1UL;
+ dql->slack_start_time = jiffies;
+}
+EXPORT_SYMBOL(dql_reset);
+
+int dql_init(struct dql *dql, unsigned hold_time)
+{
+ dql->max_limit = DQL_MAX_LIMIT;
+ dql->min_limit = 0;
+ dql->slack_hold_time = hold_time;
+ dql_reset(dql);
+ return 0;
+}
+EXPORT_SYMBOL(dql_init);
--
1.7.3.1
^ permalink raw reply related
* Re: WARNING: at mm/slub.c:3357, kernel BUG at mm/slub.c:3413
From: Christian Kujau @ 2011-11-23 5:51 UTC (permalink / raw)
To: Benjamin Herrenschmidt
Cc: Eric Dumazet, Christoph Lameter, Markus Trippelsdorf, Alex,Shi,
linux-kernel@vger.kernel.org, linux-mm@kvack.org, Pekka Enberg,
Matt Mackall, netdev@vger.kernel.org, Tejun Heo
In-Reply-To: <1322007501.14573.15.camel@pasglop>
[-- Attachment #1: Type: TEXT/PLAIN, Size: 1132 bytes --]
On Wed, 23 Nov 2011 at 11:18, Benjamin Herrenschmidt wrote:
> I'd say only this one liner for now, just don't do slabinfo :-) I just
> want to see whether your network + heavy IO load problem goes away with
> that one patch.
OK, with Christoph's patch applied, 3.2.0-rc2-00274-g6fe4c6d-dirty survives
on this machine, with the disk & cpu workload that caused the machine to
panic w/o the patch. Load was at 4-5 this time, which is expected for this
box. I'll run a few more tests later on, but it seems ok for now.
I couldn't resist and ran "slabinfo" anyway (after the workload!) - the
box survived, nothing was printed in syslog either. Output attached.
Thanks!
Christian.
--- linux-2.6.orig/mm/slub.c 2011-11-21 21:15:41.575673204 -0600
+++ linux-2.6/mm/slub.c 2011-11-21 21:16:33.442336849 -0600
@@ -1969,7 +1969,7 @@
page->pobjects = pobjects;
page->next = oldpage;
- } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
+ } while (irqsafe_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
stat(s, CPU_PARTIAL_FREE);
return pobjects;
}
--
BOFH excuse #6:
global warming
[-- Attachment #2: Type: TEXT/plain, Size: 6091 bytes --]
Name Objects Objsize Space Slabs/Part/Cpu O/S O %Fr %Ef Flg
:at-0000040 102 36 4.0K 0/0/1 102 0 0 89 *a
:at-0000064 41288 64 4.0M 964/565/15 64 0 57 65 *a
:t-0000008 2048 8 16.3K 4294967078/0/222 512 0 0 100 *
:t-0000016 2624 16 94.2K 4294967069/15/250 256 0 65 44 *
:t-0000024 3279 24 98.3K 19/17/5 170 0 70 80 *
:t-0000032 750 32 28.6K 4294967175/4/128 128 0 57 83 *
:t-0000056 771 56 69.6K 17/7/0 73 0 41 62 *
:t-0000064 5226 64 610.3K 137/101/12 64 0 67 54 *
:t-0000072 112 72 8.1K 4294967253/0/45 56 0 0 98 *
:t-0000088 4013 88 360.4K 69/3/19 46 0 3 97 *
:t-0000096 9184 96 1.0M 239/72/23 42 0 27 82 *
:t-0000104 26 104 4.0K 1/1/0 39 0 100 66 *
:t-0000128 2127 128 512.0K 106/91/19 32 0 72 53 *
:t-0000144 56 144 8.1K 4294967269/0/29 28 0 0 98 *
:t-0000152 430 152 147.4K 11/29/25 26 0 80 44 *
:t-0000160 7000 160 1.1M 268/0/12 25 0 0 97 *A
:t-0000192 2959 192 638.9K 147/24/9 21 0 15 88 *
:t-0000320 9482 320 3.6M 879/167/0 12 0 18 84 *A
:t-0000352 31 352 24.5K 4/4/2 11 0 66 44 *A
:t-0000480 36 480 20.4K 5/1/0 8 0 20 84 *A
:t-0000736 2 728 8.1K 1/1/0 11 1 100 17 *A
:t-0000768 268 768 253.9K 25/9/6 10 1 29 81 *A
:t-0002048 256 2048 573.4K 31/6/4 8 2 17 91 *
:t-0004096 168 4096 720.8K 20/1/2 8 3 4 95 *
anon_vma 2117 88 217.0K 51/4/2 42 0 7 85
arp_cache 6 380 4.0K 1/1/0 10 0 100 55 A
bdev_cache 25 744 24.5K 3/1/0 10 1 33 75 Aa
biovec-128 20 1536 32.7K 4294967287/0/11 10 2 0 93 A
biovec-256 98 3072 360.4K 11/2/0 10 3 18 83 A
blkdev_queue 15 1240 32.7K 2/1/0 13 2 50 56
blkdev_requests 62 216 24.5K 4294967273/3/29 18 0 50 54
dentry 219259 160 36.3M 8842/224/21 25 0 2 96 a
eventpoll_pwq 92 36 20.4K 5/5/0 102 0 100 16
ext4_allocation_context 0 108 4.0K 1/1/0 36 0 100 0 a
ext4_groupinfo_4k 7498 148 1.1M 289/1/0 26 0 0 93 a
ext4_inode_cache 206222 936 199.6M 12177/159/8 17 2 1 10 a
ext4_io_end 13 592 8.1K 0/0/1 13 1 0 93 a
ext4_io_page 512 8 4.0K 0/0/1 512 0 0 100 a
ext4_prealloc_space 17 96 4.0K 1/1/0 42 0 100 39 a
files_cache 90 208 20.4K 4294967288/0/13 18 0 0 91 A
inode_cache 3735 512 2.0M 248/0/1 15 1 0 93 a
jbd2_revoke_record 0 24 4.0K 1/1/0 128 0 100 0 Aa
jbd2_revoke_table 4 12 4.0K 1/1/0 256 0 100 1 a
kmalloc-1024 310 1024 352.2K 36/7/7 8 1 16 90
kmalloc-256 110 256 65.5K 4/13/12 16 0 81 42
kmalloc-512 532 512 282.6K 58/5/11 8 0 7 96
kmalloc-8192 7 8192 98.3K 3/2/0 4 3 66 58
kmem_cache 28 116 4.0K 1/1/0 32 0 100 79 *A
kmem_cache_node 100 60 8.1K 2/1/0 64 0 50 73 *A
mm_struct 70 540 49.1K 4/4/2 15 1 66 76 A
mqueue_inode_cache 1 744 8.1K 1/1/0 10 1 100 9 A
ndisc_cache 2 392 8.1K 1/1/0 19 1 100 9 A
posix_timers_cache 0 136 4.0K 1/1/0 30 0 100 0
proc_inode_cache 1194 544 753.6K 79/24/13 14 1 26 86 a
radix_tree_node 17934 296 6.8M 1669/962/14 13 0 57 77 a
RAWv6 5 896 8.1K 1/1/0 9 1 100 54 A
rpc_inode_cache 12 680 16.3K 2/1/0 11 1 50 49 Aa
sgpool-128 2 2560 32.7K 1/1/0 12 3 100 15 A
sgpool-32 2 640 8.1K 1/1/0 12 1 100 15 A
sgpool-64 2 1280 16.3K 1/1/0 12 2 100 15 A
shmem_inode_cache 700 576 409.6K 42/0/8 14 1 0 98
sighand_cache 141 1356 229.3K 3/3/4 23 3 42 83 A
sock_inode_cache 225 544 155.6K 1/7/18 14 1 36 78 Aa
task_delay_info 159 112 20.4K 4294967295/1/6 36 0 20 86
task_struct 143 2992 524.2K 11/5/5 10 3 31 81
taskstats 1 328 4.0K 1/1/0 12 0 100 8
TCP 59 1480 262.1K 2/6/6 21 3 75 33 A
TCPv6 6 1616 16.3K 1/1/0 10 2 100 59 A
tw_sock_TCP 10 104 4.0K 1/1/0 32 0 100 25 A
UDP 30 744 24.5K 2/0/1 10 1 0 90 A
UDPv6 9 880 8.1K 0/0/1 9 1 0 96 A
xfs_da_state 0 392 4.0K 1/1/0 10 0 100 0
xfs_inode 192270 984 200.9M 24533/1199/0 8 1 4 8 Aa
xfs_log_ticket 1 208 4.0K 1/1/0 19 0 100 5
^ permalink raw reply
* Re: [PATCH net-next 4/4] net: Add Open vSwitch kernel components.
From: Chris Wright @ 2011-11-23 5:37 UTC (permalink / raw)
To: John Fastabend
Cc: dev-yBygre7rU0TnMu66kgdUjQ@public.gmane.org,
netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Jamal Hadi Salim,
David S. Miller
In-Reply-To: <4ECC5AAB.8000605-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
* John Fastabend (john.r.fastabend-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org) wrote:
> He is pushing and popping entire tags off 802.1Q for now but
> you can easily imagine MPLS tags and all sorts of other things
> people will _need_.
Right, already doing some generic encap/decap for tunnelling.
> Do we want tc and likely the skbedit action to explode into a
> packet mangling tool? Would it make sense to plug into ebtables
> perhaps with a new family, NFPROTO_OPENFLOW or even on the
> existing NFPROTO_BRIDGE.
Of course, ovs is not limited to layer2 switching.
> Although doing it with classifiers and more actions would flush
> out that TODO in act_mirred, and get us an mq_ingress among
> other things.
^ permalink raw reply
* Re: [GIT PULL v2] Open vSwitch
From: Chris Wright @ 2011-11-23 5:34 UTC (permalink / raw)
To: Stephen Hemminger
Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA,
David Miller
In-Reply-To: <20111122151854.198da33d-We1ePj4FEcvRI77zikRAJc56i+j3xesD0e7PPNI6Mm0@public.gmane.org>
* Stephen Hemminger (shemminger-ZtmgI6mnKB3QT0dZR+AlfA@public.gmane.org) wrote:
> Maybe someone with more insight than me can explain the relationship
> between Openflow and Open vSwitch. It maybe that the portability
> of Openflow makes the old qdisc, classifiers to use/implement.
I'm sure I can't answer the last bit as well as you'd like. But openflow
is the control plane protocol between a controller and a switch.
The controller's job is to program the switch to enforce the controller's
view of network policy. For ovs, the protocol termination is essentially
in userspace.
The switch's flow table is managed via the controller and obviously
consulted on the datapath in the kernel. I think Jesse was already clear
that portability concerns were constrained to userspace. You could
imagine all kinds of funky ways that userspace could in turn ask the
kernel to enforce the flow table actions that the controller requested.
Your and Jamal's questions seem pretty clear...what does ovs do that
tc can't/doesn't and is that a fundamental gap, a cumbersome interface,
or a need to port existing functionality.
The only part I was unclear on in that question is whether you're
talking about the internals only, or also the netlink interface?
thanks,
-chris
^ permalink raw reply
* Re: Missing TCP SYN on loopback, retransmits after 1s
From: Eric Dumazet @ 2011-11-23 5:24 UTC (permalink / raw)
To: John Heffner; +Cc: Jesse Young, netdev
In-Reply-To: <CABrhC0m75OHmdDD-crr=tvTiRq_hnh7WpHtLjM9TKgTa24MSOA@mail.gmail.com>
Le mardi 22 novembre 2011 à 21:06 -0500, John Heffner a écrit :
> Offhand, I'd guess you're overflowing the TCP SYN queue. (You can try
> tuning tcp_max_syn_backlog.)
>
There is one litle thing called "netstat -s", a very useful tool,
included in many distros :)
^ permalink raw reply
* [PATCH 2/2] ax25: integer overflows in ax25_ctl_ioctl()
From: Xi Wang @ 2011-11-23 4:35 UTC (permalink / raw)
To: linux-kernel; +Cc: Joerg Reuter, Ralf Baechle, David Miller, linux-hams, netdev
ax25_ctl_ioctl() misses several bound checks on the user-controlled value.
Signed-off-by: Xi Wang <xi.wang@gmail.com>
---
net/ax25/af_ax25.c | 8 ++++++--
1 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index be6a8cf..bd47e22 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -402,14 +402,14 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg)
break;
case AX25_T1:
- if (ax25_ctl.arg < 1)
+ if (ax25_ctl.arg < 1 || ax25_ctl.arg > 30)
goto einval_put;
ax25->rtt = (ax25_ctl.arg * HZ) / 2;
ax25->t1 = ax25_ctl.arg * HZ;
break;
case AX25_T2:
- if (ax25_ctl.arg < 1)
+ if (ax25_ctl.arg < 1 || ax25_ctl.arg > 20)
goto einval_put;
ax25->t2 = ax25_ctl.arg * HZ;
break;
@@ -422,10 +422,14 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg)
break;
case AX25_T3:
+ if (ax25_ctl.arg > 3600)
+ goto einval_put;
ax25->t3 = ax25_ctl.arg * HZ;
break;
case AX25_IDLE:
+ if (ax25_ctl.arg > 65535)
+ goto einval_put;
ax25->idle = ax25_ctl.arg * 60 * HZ;
break;
--
1.7.5.4
^ permalink raw reply related
* [PATCH 1/2] ax25: integer overflows in ax25_setsockopt()
From: Xi Wang @ 2011-11-23 4:28 UTC (permalink / raw)
To: linux-kernel; +Cc: Joerg Reuter, Ralf Baechle, David Miller, linux-hams, netdev
ax25_setsockopt() misses several upper-bound checks on the
user-controlled value.
Reported-by: Fan Long <longfancn@gmail.com>
Signed-off-by: Xi Wang <xi.wang@gmail.com>
---
net/ax25/af_ax25.c | 8 ++++----
1 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index e7c69f4..be6a8cf 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -571,7 +571,7 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname,
break;
case AX25_T1:
- if (opt < 1) {
+ if (opt < 1 || opt > 30) {
res = -EINVAL;
break;
}
@@ -580,7 +580,7 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname,
break;
case AX25_T2:
- if (opt < 1) {
+ if (opt < 1 || opt > 20) {
res = -EINVAL;
break;
}
@@ -596,7 +596,7 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname,
break;
case AX25_T3:
- if (opt < 1) {
+ if (opt < 0 || opt > 3600) {
res = -EINVAL;
break;
}
@@ -604,7 +604,7 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname,
break;
case AX25_IDLE:
- if (opt < 0) {
+ if (opt < 0 || opt > 65535) {
res = -EINVAL;
break;
}
--
1.7.5.4
^ permalink raw reply related
* linux-next: build failure after merge of the final tree (net-next tree related)
From: Stephen Rothwell @ 2011-11-23 4:00 UTC (permalink / raw)
To: David Miller, netdev; +Cc: linux-next, linux-kernel, Neil Horman
[-- Attachment #1: Type: text/plain, Size: 1966 bytes --]
Hi all,
After merging the final tree, today's linux-next build (powerpc allnoconfig)
failed like this:
In file included from include/linux/netdevice.h:53:0,
from include/linux/icmpv6.h:173,
from include/linux/ipv6.h:220,
from include/net/ipv6.h:16,
from include/linux/sunrpc/clnt.h:26,
from include/linux/nfs_fs.h:50,
from init/do_mounts.c:20:
include/net/netprio_cgroup.h:23:29: error: field 'css' has incomplete type
And several more similar.
Caused by commit 5bc1421e34ec ("net: add network priority cgroup
infrastructure (v4)").
I have added the following (minimal, but probably not optimal) patch for
today:
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Wed, 23 Nov 2011 14:49:49 +1100
Subject: [PATCH] net: fix build error in network priority cgroup
infrastructure
Fixes this error:
In file included from include/linux/netdevice.h:53:0,
from include/linux/icmpv6.h:173,
from include/linux/ipv6.h:220,
from include/net/ipv6.h:16,
from include/linux/sunrpc/clnt.h:26,
from include/linux/nfs_fs.h:50,
from init/do_mounts.c:20:
include/net/netprio_cgroup.h:23:29: error: field 'css' has incomplete type
When CONFIG_CGROUPS is not set.
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
---
include/net/netprio_cgroup.h | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)
diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h
index c432e99..da71b91 100644
--- a/include/net/netprio_cgroup.h
+++ b/include/net/netprio_cgroup.h
@@ -20,7 +20,9 @@
struct cgroup_netprio_state
{
+#ifdef CONFIG_CGROUPS
struct cgroup_subsys_state css;
+#endif
u32 prioidx;
};
--
1.7.7.3
--
Cheers,
Stephen Rothwell sfr@canb.auug.org.au
[-- Attachment #2: Type: application/pgp-signature, Size: 836 bytes --]
^ permalink raw reply related
* Re: linux-next: manual merge of the akpm tree with the net-next tree
From: Neil Horman @ 2011-11-23 3:40 UTC (permalink / raw)
To: Stephen Rothwell
Cc: Andrew Morton, linux-next, linux-kernel, Frederic Weisbecker,
David Miller, netdev
In-Reply-To: <20111123142742.9b4fb79fd3748325753a896a@canb.auug.org.au>
On Wed, Nov 23, 2011 at 02:27:42PM +1100, Stephen Rothwell wrote:
> Hi Andrew,
>
> Today's linux-next merge of the akpm got a conflict in
> include/linux/cgroup_subsys.h between commit 5bc1421e34ec ("net: add
> network priority cgroup infrastructure (v4)") from the net-next tree and
> commit 22a20a26f21c ("cgroups: add a task counter subsystem") from the
> akpm tree.
>
> Just overlapping additions. I fixed it up (see below) and can carry the
> fix as necessary.
ACK, thanks Stephen.
Neil
^ permalink raw reply
* linux-next: manual merge of the akpm tree with the net-next tree
From: Stephen Rothwell @ 2011-11-23 3:27 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-next, linux-kernel, Neil Horman, Frederic Weisbecker,
David Miller, netdev
[-- Attachment #1: Type: text/plain, Size: 827 bytes --]
Hi Andrew,
Today's linux-next merge of the akpm got a conflict in
include/linux/cgroup_subsys.h between commit 5bc1421e34ec ("net: add
network priority cgroup infrastructure (v4)") from the net-next tree and
commit 22a20a26f21c ("cgroups: add a task counter subsystem") from the
akpm tree.
Just overlapping additions. I fixed it up (see below) and can carry the
fix as necessary.
--
Cheers,
Stephen Rothwell sfr@canb.auug.org.au
diff --cc include/linux/cgroup_subsys.h
index 0bd390c,5425822..0000000
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@@ -67,8 -67,8 +67,14 @@@ SUBSYS(perf
/* */
+#ifdef CONFIG_NETPRIO_CGROUP
+SUBSYS(net_prio)
+#endif
+
+/* */
++
+ #ifdef CONFIG_CGROUP_TASK_COUNTER
+ SUBSYS(tasks)
+ #endif
+
+ /* */
[-- Attachment #2: Type: application/pgp-signature, Size: 836 bytes --]
^ permalink raw reply
* [PATCH v3] ipv4 : igmp : fix error handle in ip_mc_add_src()
From: Jun Zhao @ 2011-11-23 3:19 UTC (permalink / raw)
To: davem; +Cc: eric.dumazet, dlstevens, netdev, Jun Zhao
from: Jun Zhao <mypopydev@gmail.com>
When add sources to interface failure, need to roll back the sfcount[MODE]
to before state. We need to match it corresponding.
Acked-by: David L Stevens <dlstevens@us.ibm.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Jun Zhao <mypopydev@gmail.com>
---
net/ipv4/igmp.c | 3 ++-
1 files changed, 2 insertions(+), 1 deletions(-)
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index c7472ef..b2ca095 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1716,7 +1716,8 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
if (err) {
int j;
- pmc->sfcount[sfmode]--;
+ if (!delta)
+ pmc->sfcount[sfmode]--;
for (j=0; j<i; j++)
(void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]);
} else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) {
--
1.7.2.5
^ permalink raw reply related
* [PATCH v4] net: add calxeda xgmac ethernet driver
From: Rob Herring @ 2011-11-23 3:18 UTC (permalink / raw)
To: netdev, devicetree-discuss
Cc: joe, saeed.bishara, davem, bhutchings, Rob Herring
From: Rob Herring <rob.herring@calxeda.com>
Add support for the XGMAC 10Gb ethernet device in the Calxeda Highbank
SOC.
Signed-off-by: Rob Herring <rob.herring@calxeda.com>
---
v4:
- remove use of SZ_8K
- Implement review comments from Ben Hutchings:
- clean-up/rename of some statistics counters
- freeze h/w counters on reading to avoid rollovers and protect with
spinlock. Also, fixed the size on some counters which are only 32-bit.
- Return CHECKSUM_NONE on IP receive errors
- s/mc_filter/hash_filter/
- Rework xgmac_release function to not hang if called multiple times.
- Rename xgmac_release to xgmac_stop.
- fix some ethtool return values
- add netif_napi_del calls on probe err path and remove function.
v3:
- rebase to 3.2-rc2
- add dma_unmap_page for frags
- use module_platform_driver
v2:
- use __le32 for descriptor fields and cpu_to_le32/le32_to_cpu to access
- use u32 instead of dma_addr_t for descriptor phys addresses
- improve allocation error handling for descriptor ring allocations
- convert all prints to netdev_XXX
- rebase to current Linus master
- move into drivers/net/ethernet
.../devicetree/bindings/net/calxeda-xgmac.txt | 16 +
drivers/net/ethernet/Kconfig | 1 +
drivers/net/ethernet/Makefile | 1 +
drivers/net/ethernet/calxeda/Kconfig | 7 +
drivers/net/ethernet/calxeda/Makefile | 2 +
drivers/net/ethernet/calxeda/xgmac.c | 1928 ++++++++++++++++++++
6 files changed, 1955 insertions(+), 0 deletions(-)
create mode 100644 Documentation/devicetree/bindings/net/calxeda-xgmac.txt
create mode 100644 drivers/net/ethernet/calxeda/Kconfig
create mode 100644 drivers/net/ethernet/calxeda/Makefile
create mode 100644 drivers/net/ethernet/calxeda/xgmac.c
diff --git a/Documentation/devicetree/bindings/net/calxeda-xgmac.txt b/Documentation/devicetree/bindings/net/calxeda-xgmac.txt
new file mode 100644
index 0000000..c03a7bc
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/calxeda-xgmac.txt
@@ -0,0 +1,16 @@
+* Calxeda Highbank 10Gb XGMAC Ethernet
+
+Required properties:
+- compatible : Should be "calxeda,hb-xgmac"
+- reg : Address and length of the register set for the device
+- interrupts : Should contain 3 xgmac interrupts. The 1st is main interrupt.
+ The 2nd is pwr mgt interrupt. The 3rd is low power state interrupt.
+
+Example:
+
+ethernet@fff50000 {
+ compatible = "calxeda,hb-xgmac";
+ reg = <0xfff50000 0x1000>;
+ interrupts = <0 77 4 0 78 4 0 79 4>;
+};
+
diff --git a/drivers/net/ethernet/Kconfig b/drivers/net/ethernet/Kconfig
index 597f4d4..3474a61 100644
--- a/drivers/net/ethernet/Kconfig
+++ b/drivers/net/ethernet/Kconfig
@@ -28,6 +28,7 @@ source "drivers/net/ethernet/cadence/Kconfig"
source "drivers/net/ethernet/adi/Kconfig"
source "drivers/net/ethernet/broadcom/Kconfig"
source "drivers/net/ethernet/brocade/Kconfig"
+source "drivers/net/ethernet/calxeda/Kconfig"
source "drivers/net/ethernet/chelsio/Kconfig"
source "drivers/net/ethernet/cirrus/Kconfig"
source "drivers/net/ethernet/cisco/Kconfig"
diff --git a/drivers/net/ethernet/Makefile b/drivers/net/ethernet/Makefile
index be5dde0..cd6d69a 100644
--- a/drivers/net/ethernet/Makefile
+++ b/drivers/net/ethernet/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_NET_ATMEL) += cadence/
obj-$(CONFIG_NET_BFIN) += adi/
obj-$(CONFIG_NET_VENDOR_BROADCOM) += broadcom/
obj-$(CONFIG_NET_VENDOR_BROCADE) += brocade/
+obj-$(CONFIG_NET_CALXEDA_XGMAC) += calxeda/
obj-$(CONFIG_NET_VENDOR_CHELSIO) += chelsio/
obj-$(CONFIG_NET_VENDOR_CIRRUS) += cirrus/
obj-$(CONFIG_NET_VENDOR_CISCO) += cisco/
diff --git a/drivers/net/ethernet/calxeda/Kconfig b/drivers/net/ethernet/calxeda/Kconfig
new file mode 100644
index 0000000..a52e725
--- /dev/null
+++ b/drivers/net/ethernet/calxeda/Kconfig
@@ -0,0 +1,7 @@
+config NET_CALXEDA_XGMAC
+ tristate "Calxeda 1G/10G XGMAC Ethernet driver"
+
+ select CRC32
+ help
+ This is the driver for the XGMAC Ethernet IP block found on Calxeda
+ Highbank platforms.
diff --git a/drivers/net/ethernet/calxeda/Makefile b/drivers/net/ethernet/calxeda/Makefile
new file mode 100644
index 0000000..5057cd2
--- /dev/null
+++ b/drivers/net/ethernet/calxeda/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_NET_CALXEDA_XGMAC) += xgmac.o
+
diff --git a/drivers/net/ethernet/calxeda/xgmac.c b/drivers/net/ethernet/calxeda/xgmac.c
new file mode 100644
index 0000000..481c781
--- /dev/null
+++ b/drivers/net/ethernet/calxeda/xgmac.c
@@ -0,0 +1,1928 @@
+/*
+ * Copyright 2010-2011 Calxeda, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/circ_buf.h>
+#include <linux/interrupt.h>
+#include <linux/etherdevice.h>
+#include <linux/platform_device.h>
+#include <linux/skbuff.h>
+#include <linux/ethtool.h>
+#include <linux/if.h>
+#include <linux/crc32.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+
+/* XGMAC Register definitions */
+#define XGMAC_CONTROL 0x00000000 /* MAC Configuration */
+#define XGMAC_FRAME_FILTER 0x00000004 /* MAC Frame Filter */
+#define XGMAC_FLOW_CTRL 0x00000018 /* MAC Flow Control */
+#define XGMAC_VLAN_TAG 0x0000001C /* VLAN Tags */
+#define XGMAC_VERSION 0x00000020 /* Version */
+#define XGMAC_VLAN_INCL 0x00000024 /* VLAN tag for tx frames */
+#define XGMAC_LPI_CTRL 0x00000028 /* LPI Control and Status */
+#define XGMAC_LPI_TIMER 0x0000002C /* LPI Timers Control */
+#define XGMAC_TX_PACE 0x00000030 /* Transmit Pace and Stretch */
+#define XGMAC_VLAN_HASH 0x00000034 /* VLAN Hash Table */
+#define XGMAC_DEBUG 0x00000038 /* Debug */
+#define XGMAC_INT_STAT 0x0000003C /* Interrupt and Control */
+#define XGMAC_ADDR_HIGH(reg) (0x00000040 + ((reg) * 8))
+#define XGMAC_ADDR_LOW(reg) (0x00000044 + ((reg) * 8))
+#define XGMAC_HASH(n) (0x00000300 + (n) * 4) /* HASH table regs */
+#define XGMAC_NUM_HASH 16
+#define XGMAC_OMR 0x00000400
+#define XGMAC_REMOTE_WAKE 0x00000700 /* Remote Wake-Up Frm Filter */
+#define XGMAC_PMT 0x00000704 /* PMT Control and Status */
+#define XGMAC_MMC_CTRL 0x00000800 /* XGMAC MMC Control */
+#define XGMAC_MMC_INTR_RX 0x00000804 /* Recieve Interrupt */
+#define XGMAC_MMC_INTR_TX 0x00000808 /* Transmit Interrupt */
+#define XGMAC_MMC_INTR_MASK_RX 0x0000080c /* Recieve Interrupt Mask */
+#define XGMAC_MMC_INTR_MASK_TX 0x00000810 /* Transmit Interrupt Mask */
+
+/* Hardware TX Statistics Counters */
+#define XGMAC_MMC_TXOCTET_GB_LO 0x00000814
+#define XGMAC_MMC_TXOCTET_GB_HI 0x00000818
+#define XGMAC_MMC_TXFRAME_GB_LO 0x0000081C
+#define XGMAC_MMC_TXFRAME_GB_HI 0x00000820
+#define XGMAC_MMC_TXBCFRAME_G 0x00000824
+#define XGMAC_MMC_TXMCFRAME_G 0x0000082C
+#define XGMAC_MMC_TXUCFRAME_GB 0x00000864
+#define XGMAC_MMC_TXMCFRAME_GB 0x0000086C
+#define XGMAC_MMC_TXBCFRAME_GB 0x00000874
+#define XGMAC_MMC_TXUNDERFLOW 0x0000087C
+#define XGMAC_MMC_TXOCTET_G_LO 0x00000884
+#define XGMAC_MMC_TXOCTET_G_HI 0x00000888
+#define XGMAC_MMC_TXFRAME_G_LO 0x0000088C
+#define XGMAC_MMC_TXFRAME_G_HI 0x00000890
+#define XGMAC_MMC_TXPAUSEFRAME 0x00000894
+#define XGMAC_MMC_TXVLANFRAME 0x0000089C
+
+/* Hardware RX Statistics Counters */
+#define XGMAC_MMC_RXFRAME_GB_LO 0x00000900
+#define XGMAC_MMC_RXFRAME_GB_HI 0x00000904
+#define XGMAC_MMC_RXOCTET_GB_LO 0x00000908
+#define XGMAC_MMC_RXOCTET_GB_HI 0x0000090C
+#define XGMAC_MMC_RXOCTET_G_LO 0x00000910
+#define XGMAC_MMC_RXOCTET_G_HI 0x00000914
+#define XGMAC_MMC_RXBCFRAME_G 0x00000918
+#define XGMAC_MMC_RXMCFRAME_G 0x00000920
+#define XGMAC_MMC_RXCRCERR 0x00000928
+#define XGMAC_MMC_RXRUNT 0x00000930
+#define XGMAC_MMC_RXJABBER 0x00000934
+#define XGMAC_MMC_RXUCFRAME_G 0x00000970
+#define XGMAC_MMC_RXLENGTHERR 0x00000978
+#define XGMAC_MMC_RXPAUSEFRAME 0x00000988
+#define XGMAC_MMC_RXOVERFLOW 0x00000990
+#define XGMAC_MMC_RXVLANFRAME 0x00000998
+#define XGMAC_MMC_RXWATCHDOG 0x000009a0
+
+/* DMA Control and Status Registers */
+#define XGMAC_DMA_BUS_MODE 0x00000f00 /* Bus Mode */
+#define XGMAC_DMA_TX_POLL 0x00000f04 /* Transmit Poll Demand */
+#define XGMAC_DMA_RX_POLL 0x00000f08 /* Received Poll Demand */
+#define XGMAC_DMA_RX_BASE_ADDR 0x00000f0c /* Receive List Base */
+#define XGMAC_DMA_TX_BASE_ADDR 0x00000f10 /* Transmit List Base */
+#define XGMAC_DMA_STATUS 0x00000f14 /* Status Register */
+#define XGMAC_DMA_CONTROL 0x00000f18 /* Ctrl (Operational Mode) */
+#define XGMAC_DMA_INTR_ENA 0x00000f1c /* Interrupt Enable */
+#define XGMAC_DMA_MISS_FRAME_CTR 0x00000f20 /* Missed Frame Counter */
+#define XGMAC_DMA_RI_WDOG_TIMER 0x00000f24 /* RX Intr Watchdog Timer */
+#define XGMAC_DMA_AXI_BUS 0x00000f28 /* AXI Bus Mode */
+#define XGMAC_DMA_AXI_STATUS 0x00000f2C /* AXI Status */
+#define XGMAC_DMA_HW_FEATURE 0x00000f58 /* Enabled Hardware Features */
+
+#define XGMAC_ADDR_AE 0x80000000
+#define XGMAC_MAX_FILTER_ADDR 31
+
+/* PMT Control and Status */
+#define XGMAC_PMT_POINTER_RESET 0x80000000
+#define XGMAC_PMT_GLBL_UNICAST 0x00000200
+#define XGMAC_PMT_WAKEUP_RX_FRM 0x00000040
+#define XGMAC_PMT_MAGIC_PKT 0x00000020
+#define XGMAC_PMT_WAKEUP_FRM_EN 0x00000004
+#define XGMAC_PMT_MAGIC_PKT_EN 0x00000002
+#define XGMAC_PMT_POWERDOWN 0x00000001
+
+#define XGMAC_CONTROL_SPD 0x40000000 /* Speed control */
+#define XGMAC_CONTROL_SPD_MASK 0x60000000
+#define XGMAC_CONTROL_SPD_1G 0x60000000
+#define XGMAC_CONTROL_SPD_2_5G 0x40000000
+#define XGMAC_CONTROL_SPD_10G 0x00000000
+#define XGMAC_CONTROL_SARC 0x10000000 /* Source Addr Insert/Replace */
+#define XGMAC_CONTROL_SARK_MASK 0x18000000
+#define XGMAC_CONTROL_CAR 0x04000000 /* CRC Addition/Replacement */
+#define XGMAC_CONTROL_CAR_MASK 0x06000000
+#define XGMAC_CONTROL_DP 0x01000000 /* Disable Padding */
+#define XGMAC_CONTROL_WD 0x00800000 /* Disable Watchdog on rx */
+#define XGMAC_CONTROL_JD 0x00400000 /* Jabber disable */
+#define XGMAC_CONTROL_JE 0x00100000 /* Jumbo frame */
+#define XGMAC_CONTROL_LM 0x00001000 /* Loop-back mode */
+#define XGMAC_CONTROL_IPC 0x00000400 /* Checksum Offload */
+#define XGMAC_CONTROL_ACS 0x00000080 /* Automatic Pad/FCS Strip */
+#define XGMAC_CONTROL_DDIC 0x00000010 /* Disable Deficit Idle Count */
+#define XGMAC_CONTROL_TE 0x00000008 /* Transmitter Enable */
+#define XGMAC_CONTROL_RE 0x00000004 /* Receiver Enable */
+
+/* XGMAC Frame Filter defines */
+#define XGMAC_FRAME_FILTER_PR 0x00000001 /* Promiscuous Mode */
+#define XGMAC_FRAME_FILTER_HUC 0x00000002 /* Hash Unicast */
+#define XGMAC_FRAME_FILTER_HMC 0x00000004 /* Hash Multicast */
+#define XGMAC_FRAME_FILTER_DAIF 0x00000008 /* DA Inverse Filtering */
+#define XGMAC_FRAME_FILTER_PM 0x00000010 /* Pass all multicast */
+#define XGMAC_FRAME_FILTER_DBF 0x00000020 /* Disable Broadcast frames */
+#define XGMAC_FRAME_FILTER_SAIF 0x00000100 /* Inverse Filtering */
+#define XGMAC_FRAME_FILTER_SAF 0x00000200 /* Source Address Filter */
+#define XGMAC_FRAME_FILTER_HPF 0x00000400 /* Hash or perfect Filter */
+#define XGMAC_FRAME_FILTER_VHF 0x00000800 /* VLAN Hash Filter */
+#define XGMAC_FRAME_FILTER_VPF 0x00001000 /* VLAN Perfect Filter */
+#define XGMAC_FRAME_FILTER_RA 0x80000000 /* Receive all mode */
+
+/* XGMAC FLOW CTRL defines */
+#define XGMAC_FLOW_CTRL_PT_MASK 0xffff0000 /* Pause Time Mask */
+#define XGMAC_FLOW_CTRL_PT_SHIFT 16
+#define XGMAC_FLOW_CTRL_DZQP 0x00000080 /* Disable Zero-Quanta Phase */
+#define XGMAC_FLOW_CTRL_PLT 0x00000020 /* Pause Low Threshhold */
+#define XGMAC_FLOW_CTRL_PLT_MASK 0x00000030 /* PLT MASK */
+#define XGMAC_FLOW_CTRL_UP 0x00000008 /* Unicast Pause Frame Detect */
+#define XGMAC_FLOW_CTRL_RFE 0x00000004 /* Rx Flow Control Enable */
+#define XGMAC_FLOW_CTRL_TFE 0x00000002 /* Tx Flow Control Enable */
+#define XGMAC_FLOW_CTRL_FCB_BPA 0x00000001 /* Flow Control Busy ... */
+
+/* XGMAC_INT_STAT reg */
+#define XGMAC_INT_STAT_PMT 0x0080 /* PMT Interrupt Status */
+#define XGMAC_INT_STAT_LPI 0x0040 /* LPI Interrupt Status */
+
+/* DMA Bus Mode register defines */
+#define DMA_BUS_MODE_SFT_RESET 0x00000001 /* Software Reset */
+#define DMA_BUS_MODE_DSL_MASK 0x0000007c /* Descriptor Skip Length */
+#define DMA_BUS_MODE_DSL_SHIFT 2 /* (in DWORDS) */
+#define DMA_BUS_MODE_ATDS 0x00000080 /* Alternate Descriptor Size */
+
+/* Programmable burst length */
+#define DMA_BUS_MODE_PBL_MASK 0x00003f00 /* Programmable Burst Len */
+#define DMA_BUS_MODE_PBL_SHIFT 8
+#define DMA_BUS_MODE_FB 0x00010000 /* Fixed burst */
+#define DMA_BUS_MODE_RPBL_MASK 0x003e0000 /* Rx-Programmable Burst Len */
+#define DMA_BUS_MODE_RPBL_SHIFT 17
+#define DMA_BUS_MODE_USP 0x00800000
+#define DMA_BUS_MODE_8PBL 0x01000000
+#define DMA_BUS_MODE_AAL 0x02000000
+
+/* DMA Bus Mode register defines */
+#define DMA_BUS_PR_RATIO_MASK 0x0000c000 /* Rx/Tx priority ratio */
+#define DMA_BUS_PR_RATIO_SHIFT 14
+#define DMA_BUS_FB 0x00010000 /* Fixed Burst */
+
+/* DMA Control register defines */
+#define DMA_CONTROL_ST 0x00002000 /* Start/Stop Transmission */
+#define DMA_CONTROL_SR 0x00000002 /* Start/Stop Receive */
+#define DMA_CONTROL_DFF 0x01000000 /* Disable flush of rx frames */
+
+/* DMA Normal interrupt */
+#define DMA_INTR_ENA_NIE 0x00010000 /* Normal Summary */
+#define DMA_INTR_ENA_AIE 0x00008000 /* Abnormal Summary */
+#define DMA_INTR_ENA_ERE 0x00004000 /* Early Receive */
+#define DMA_INTR_ENA_FBE 0x00002000 /* Fatal Bus Error */
+#define DMA_INTR_ENA_ETE 0x00000400 /* Early Transmit */
+#define DMA_INTR_ENA_RWE 0x00000200 /* Receive Watchdog */
+#define DMA_INTR_ENA_RSE 0x00000100 /* Receive Stopped */
+#define DMA_INTR_ENA_RUE 0x00000080 /* Receive Buffer Unavailable */
+#define DMA_INTR_ENA_RIE 0x00000040 /* Receive Interrupt */
+#define DMA_INTR_ENA_UNE 0x00000020 /* Tx Underflow */
+#define DMA_INTR_ENA_OVE 0x00000010 /* Receive Overflow */
+#define DMA_INTR_ENA_TJE 0x00000008 /* Transmit Jabber */
+#define DMA_INTR_ENA_TUE 0x00000004 /* Transmit Buffer Unavail */
+#define DMA_INTR_ENA_TSE 0x00000002 /* Transmit Stopped */
+#define DMA_INTR_ENA_TIE 0x00000001 /* Transmit Interrupt */
+
+#define DMA_INTR_NORMAL (DMA_INTR_ENA_NIE | DMA_INTR_ENA_RIE | \
+ DMA_INTR_ENA_TUE)
+
+#define DMA_INTR_ABNORMAL (DMA_INTR_ENA_AIE | DMA_INTR_ENA_FBE | \
+ DMA_INTR_ENA_RWE | DMA_INTR_ENA_RSE | \
+ DMA_INTR_ENA_RUE | DMA_INTR_ENA_UNE | \
+ DMA_INTR_ENA_OVE | DMA_INTR_ENA_TJE | \
+ DMA_INTR_ENA_TSE)
+
+/* DMA default interrupt mask */
+#define DMA_INTR_DEFAULT_MASK (DMA_INTR_NORMAL | DMA_INTR_ABNORMAL)
+
+/* DMA Status register defines */
+#define DMA_STATUS_GMI 0x08000000 /* MMC interrupt */
+#define DMA_STATUS_GLI 0x04000000 /* GMAC Line interface int */
+#define DMA_STATUS_EB_MASK 0x00380000 /* Error Bits Mask */
+#define DMA_STATUS_EB_TX_ABORT 0x00080000 /* Error Bits - TX Abort */
+#define DMA_STATUS_EB_RX_ABORT 0x00100000 /* Error Bits - RX Abort */
+#define DMA_STATUS_TS_MASK 0x00700000 /* Transmit Process State */
+#define DMA_STATUS_TS_SHIFT 20
+#define DMA_STATUS_RS_MASK 0x000e0000 /* Receive Process State */
+#define DMA_STATUS_RS_SHIFT 17
+#define DMA_STATUS_NIS 0x00010000 /* Normal Interrupt Summary */
+#define DMA_STATUS_AIS 0x00008000 /* Abnormal Interrupt Summary */
+#define DMA_STATUS_ERI 0x00004000 /* Early Receive Interrupt */
+#define DMA_STATUS_FBI 0x00002000 /* Fatal Bus Error Interrupt */
+#define DMA_STATUS_ETI 0x00000400 /* Early Transmit Interrupt */
+#define DMA_STATUS_RWT 0x00000200 /* Receive Watchdog Timeout */
+#define DMA_STATUS_RPS 0x00000100 /* Receive Process Stopped */
+#define DMA_STATUS_RU 0x00000080 /* Receive Buffer Unavailable */
+#define DMA_STATUS_RI 0x00000040 /* Receive Interrupt */
+#define DMA_STATUS_UNF 0x00000020 /* Transmit Underflow */
+#define DMA_STATUS_OVF 0x00000010 /* Receive Overflow */
+#define DMA_STATUS_TJT 0x00000008 /* Transmit Jabber Timeout */
+#define DMA_STATUS_TU 0x00000004 /* Transmit Buffer Unavail */
+#define DMA_STATUS_TPS 0x00000002 /* Transmit Process Stopped */
+#define DMA_STATUS_TI 0x00000001 /* Transmit Interrupt */
+
+/* Common MAC defines */
+#define MAC_ENABLE_TX 0x00000008 /* Transmitter Enable */
+#define MAC_ENABLE_RX 0x00000004 /* Receiver Enable */
+
+/* XGMAC Operation Mode Register */
+#define XGMAC_OMR_TSF 0x00200000 /* TX FIFO Store and Forward */
+#define XGMAC_OMR_FTF 0x00100000 /* Flush Transmit FIFO */
+#define XGMAC_OMR_TTC 0x00020000 /* Transmit Threshhold Ctrl */
+#define XGMAC_OMR_TTC_MASK 0x00030000
+#define XGMAC_OMR_RFD 0x00006000 /* FC Deactivation Threshhold */
+#define XGMAC_OMR_RFD_MASK 0x00007000 /* FC Deact Threshhold MASK */
+#define XGMAC_OMR_RFA 0x00000600 /* FC Activation Threshhold */
+#define XGMAC_OMR_RFA_MASK 0x00000E00 /* FC Act Threshhold MASK */
+#define XGMAC_OMR_EFC 0x00000100 /* Enable Hardware FC */
+#define XGMAC_OMR_FEF 0x00000080 /* Forward Error Frames */
+#define XGMAC_OMR_DT 0x00000040 /* Drop TCP/IP csum Errors */
+#define XGMAC_OMR_RSF 0x00000020 /* RX FIFO Store and Forward */
+#define XGMAC_OMR_RTC 0x00000010 /* RX Threshhold Ctrl */
+#define XGMAC_OMR_RTC_MASK 0x00000018 /* RX Threshhold Ctrl MASK */
+
+/* XGMAC HW Features Register */
+#define DMA_HW_FEAT_TXCOESEL 0x00010000 /* TX Checksum offload */
+
+#define XGMAC_MMC_CTRL_CNT_FRZ 0x00000008
+
+/* XGMAC Descriptor Defines */
+#define MAX_DESC_BUF_SZ (0x2000 - 8)
+
+#define RXDESC_EXT_STATUS 0x00000001
+#define RXDESC_CRC_ERR 0x00000002
+#define RXDESC_RX_ERR 0x00000008
+#define RXDESC_RX_WDOG 0x00000010
+#define RXDESC_FRAME_TYPE 0x00000020
+#define RXDESC_GIANT_FRAME 0x00000080
+#define RXDESC_LAST_SEG 0x00000100
+#define RXDESC_FIRST_SEG 0x00000200
+#define RXDESC_VLAN_FRAME 0x00000400
+#define RXDESC_OVERFLOW_ERR 0x00000800
+#define RXDESC_LENGTH_ERR 0x00001000
+#define RXDESC_SA_FILTER_FAIL 0x00002000
+#define RXDESC_DESCRIPTOR_ERR 0x00004000
+#define RXDESC_ERROR_SUMMARY 0x00008000
+#define RXDESC_FRAME_LEN_OFFSET 16
+#define RXDESC_FRAME_LEN_MASK 0x3fff0000
+#define RXDESC_DA_FILTER_FAIL 0x40000000
+
+#define RXDESC1_END_RING 0x00008000
+
+#define RXDESC_IP_PAYLOAD_MASK 0x00000003
+#define RXDESC_IP_PAYLOAD_UDP 0x00000001
+#define RXDESC_IP_PAYLOAD_TCP 0x00000002
+#define RXDESC_IP_PAYLOAD_ICMP 0x00000003
+#define RXDESC_IP_HEADER_ERR 0x00000008
+#define RXDESC_IP_PAYLOAD_ERR 0x00000010
+#define RXDESC_IPV4_PACKET 0x00000040
+#define RXDESC_IPV6_PACKET 0x00000080
+#define TXDESC_UNDERFLOW_ERR 0x00000001
+#define TXDESC_JABBER_TIMEOUT 0x00000002
+#define TXDESC_LOCAL_FAULT 0x00000004
+#define TXDESC_REMOTE_FAULT 0x00000008
+#define TXDESC_VLAN_FRAME 0x00000010
+#define TXDESC_FRAME_FLUSHED 0x00000020
+#define TXDESC_IP_HEADER_ERR 0x00000040
+#define TXDESC_PAYLOAD_CSUM_ERR 0x00000080
+#define TXDESC_ERROR_SUMMARY 0x00008000
+#define TXDESC_SA_CTRL_INSERT 0x00040000
+#define TXDESC_SA_CTRL_REPLACE 0x00080000
+#define TXDESC_2ND_ADDR_CHAINED 0x00100000
+#define TXDESC_END_RING 0x00200000
+#define TXDESC_CSUM_IP 0x00400000
+#define TXDESC_CSUM_IP_PAYLD 0x00800000
+#define TXDESC_CSUM_ALL 0x00C00000
+#define TXDESC_CRC_EN_REPLACE 0x01000000
+#define TXDESC_CRC_EN_APPEND 0x02000000
+#define TXDESC_DISABLE_PAD 0x04000000
+#define TXDESC_FIRST_SEG 0x10000000
+#define TXDESC_LAST_SEG 0x20000000
+#define TXDESC_INTERRUPT 0x40000000
+
+#define DESC_OWN 0x80000000
+#define DESC_BUFFER1_SZ_MASK 0x00001fff
+#define DESC_BUFFER2_SZ_MASK 0x1fff0000
+#define DESC_BUFFER2_SZ_OFFSET 16
+
+struct xgmac_dma_desc {
+ __le32 flags;
+ __le32 buf_size;
+ __le32 buf1_addr; /* Buffer 1 Address Pointer */
+ __le32 buf2_addr; /* Buffer 2 Address Pointer */
+ __le32 ext_status;
+ __le32 res[3];
+};
+
+struct xgmac_extra_stats {
+ /* Transmit errors */
+ unsigned long tx_jabber;
+ unsigned long tx_frame_flushed;
+ unsigned long tx_payload_error;
+ unsigned long tx_ip_header_error;
+ unsigned long tx_local_fault;
+ unsigned long tx_remote_fault;
+ /* Receive errors */
+ unsigned long rx_watchdog;
+ unsigned long rx_da_filter_fail;
+ unsigned long rx_sa_filter_fail;
+ unsigned long rx_payload_error;
+ unsigned long rx_ip_header_error;
+ /* Tx/Rx IRQ errors */
+ unsigned long tx_undeflow;
+ unsigned long tx_process_stopped;
+ unsigned long rx_buf_unav;
+ unsigned long rx_process_stopped;
+ unsigned long tx_early;
+ unsigned long fatal_bus_error;
+};
+
+struct xgmac_priv {
+ struct xgmac_dma_desc *dma_rx;
+ struct sk_buff **rx_skbuff;
+ unsigned int rx_tail;
+ unsigned int rx_head;
+
+ struct xgmac_dma_desc *dma_tx;
+ struct sk_buff **tx_skbuff;
+ unsigned int tx_head;
+ unsigned int tx_tail;
+
+ void __iomem *base;
+ struct sk_buff_head rx_recycle;
+ unsigned int dma_buf_sz;
+ dma_addr_t dma_rx_phy;
+ dma_addr_t dma_tx_phy;
+
+ struct net_device *dev;
+ struct device *device;
+ struct napi_struct napi;
+
+ struct xgmac_extra_stats xstats;
+
+ spinlock_t stats_lock;
+ int pmt_irq;
+ char rx_pause;
+ char tx_pause;
+ int wolopts;
+};
+
+/* XGMAC Configuration Settings */
+#define MAX_MTU 9000
+#define PAUSE_TIME 0x400
+
+#define DMA_RX_RING_SZ 256
+#define DMA_TX_RING_SZ 128
+/* minimum number of free TX descriptors required to wake up TX process */
+#define TX_THRESH (DMA_TX_RING_SZ/4)
+
+/* DMA descriptor ring helpers */
+#define dma_ring_incr(n, s) (((n) + 1) & ((s) - 1))
+#define dma_ring_space(h, t, s) CIRC_SPACE(h, t, s)
+#define dma_ring_cnt(h, t, s) CIRC_CNT(h, t, s)
+
+/* XGMAC Descriptor Access Helpers */
+static inline void desc_set_buf_len(struct xgmac_dma_desc *p, u32 buf_sz)
+{
+ if (buf_sz > MAX_DESC_BUF_SZ)
+ p->buf_size = cpu_to_le32(MAX_DESC_BUF_SZ |
+ (buf_sz - MAX_DESC_BUF_SZ) << DESC_BUFFER2_SZ_OFFSET);
+ else
+ p->buf_size = cpu_to_le32(buf_sz);
+}
+
+static inline int desc_get_buf_len(struct xgmac_dma_desc *p)
+{
+ u32 len = cpu_to_le32(p->flags);
+ return (len & DESC_BUFFER1_SZ_MASK) +
+ ((len & DESC_BUFFER2_SZ_MASK) >> DESC_BUFFER2_SZ_OFFSET);
+}
+
+static inline void desc_init_rx_desc(struct xgmac_dma_desc *p, int ring_size,
+ int buf_sz)
+{
+ struct xgmac_dma_desc *end = p + ring_size - 1;
+
+ memset(p, 0, sizeof(*p) * ring_size);
+
+ for (; p <= end; p++)
+ desc_set_buf_len(p, buf_sz);
+
+ end->buf_size |= cpu_to_le32(RXDESC1_END_RING);
+}
+
+static inline void desc_init_tx_desc(struct xgmac_dma_desc *p, u32 ring_size)
+{
+ memset(p, 0, sizeof(*p) * ring_size);
+ p[ring_size - 1].flags = cpu_to_le32(TXDESC_END_RING);
+}
+
+static inline int desc_get_owner(struct xgmac_dma_desc *p)
+{
+ return le32_to_cpu(p->flags) & DESC_OWN;
+}
+
+static inline void desc_set_rx_owner(struct xgmac_dma_desc *p)
+{
+ /* Clear all fields and set the owner */
+ p->flags = cpu_to_le32(DESC_OWN);
+}
+
+static inline void desc_set_tx_owner(struct xgmac_dma_desc *p, u32 flags)
+{
+ u32 tmpflags = le32_to_cpu(p->flags);
+ tmpflags &= TXDESC_END_RING;
+ tmpflags |= flags | DESC_OWN;
+ p->flags = cpu_to_le32(tmpflags);
+}
+
+static inline int desc_get_tx_ls(struct xgmac_dma_desc *p)
+{
+ return le32_to_cpu(p->flags) & TXDESC_LAST_SEG;
+}
+
+static inline u32 desc_get_buf_addr(struct xgmac_dma_desc *p)
+{
+ return le32_to_cpu(p->buf1_addr);
+}
+
+static inline void desc_set_buf_addr(struct xgmac_dma_desc *p,
+ u32 paddr, int len)
+{
+ p->buf1_addr = cpu_to_le32(paddr);
+ if (len > MAX_DESC_BUF_SZ)
+ p->buf2_addr = cpu_to_le32(paddr + MAX_DESC_BUF_SZ);
+}
+
+static inline void desc_set_buf_addr_and_size(struct xgmac_dma_desc *p,
+ u32 paddr, int len)
+{
+ desc_set_buf_len(p, len);
+ desc_set_buf_addr(p, paddr, len);
+}
+
+static inline int desc_get_rx_frame_len(struct xgmac_dma_desc *p)
+{
+ u32 data = le32_to_cpu(p->flags);
+ u32 len = (data & RXDESC_FRAME_LEN_MASK) >> RXDESC_FRAME_LEN_OFFSET;
+ if (data & RXDESC_FRAME_TYPE)
+ len -= ETH_FCS_LEN;
+
+ return len;
+}
+
+static void xgmac_dma_flush_tx_fifo(void __iomem *ioaddr)
+{
+ int timeout = 1000;
+ u32 reg = readl(ioaddr + XGMAC_OMR);
+ writel(reg | XGMAC_OMR_FTF, ioaddr + XGMAC_OMR);
+
+ while ((timeout-- > 0) && readl(ioaddr + XGMAC_OMR) & XGMAC_OMR_FTF)
+ udelay(1);
+}
+
+static int desc_get_tx_status(struct xgmac_priv *priv, struct xgmac_dma_desc *p)
+{
+ struct xgmac_extra_stats *x = &priv->xstats;
+ u32 status = le32_to_cpu(p->flags);
+
+ if (!(status & TXDESC_ERROR_SUMMARY))
+ return 0;
+
+ netdev_dbg(priv->dev, "tx desc error = 0x%08x\n", status);
+ if (status & TXDESC_JABBER_TIMEOUT)
+ x->tx_jabber++;
+ if (status & TXDESC_FRAME_FLUSHED)
+ x->tx_frame_flushed++;
+ if (status & TXDESC_UNDERFLOW_ERR)
+ xgmac_dma_flush_tx_fifo(priv->base);
+ if (status & TXDESC_IP_HEADER_ERR)
+ x->tx_ip_header_error++;
+ if (status & TXDESC_LOCAL_FAULT)
+ x->tx_local_fault++;
+ if (status & TXDESC_REMOTE_FAULT)
+ x->tx_remote_fault++;
+ if (status & TXDESC_PAYLOAD_CSUM_ERR)
+ x->tx_payload_error++;
+
+ return -1;
+}
+
+static int desc_get_rx_status(struct xgmac_priv *priv, struct xgmac_dma_desc *p)
+{
+ struct xgmac_extra_stats *x = &priv->xstats;
+ int ret = CHECKSUM_UNNECESSARY;
+ u32 status = le32_to_cpu(p->flags);
+ u32 ext_status = le32_to_cpu(p->ext_status);
+
+ if (status & RXDESC_DA_FILTER_FAIL) {
+ netdev_dbg(priv->dev, "XGMAC RX : Dest Address filter fail\n");
+ x->rx_da_filter_fail++;
+ return -1;
+ }
+
+ /* Check if packet has checksum already */
+ if ((status & RXDESC_FRAME_TYPE) && (status & RXDESC_EXT_STATUS) &&
+ !(ext_status & RXDESC_IP_PAYLOAD_MASK))
+ ret = CHECKSUM_NONE;
+
+ netdev_dbg(priv->dev, "rx status - frame type=%d, csum = %d, ext stat %08x\n",
+ (status & RXDESC_FRAME_TYPE) ? 1 : 0, ret, ext_status);
+
+ if (!(status & RXDESC_ERROR_SUMMARY))
+ return ret;
+
+ /* Handle any errors */
+ if (status & (RXDESC_DESCRIPTOR_ERR | RXDESC_OVERFLOW_ERR |
+ RXDESC_GIANT_FRAME | RXDESC_LENGTH_ERR | RXDESC_CRC_ERR))
+ return -1;
+
+ if (status & RXDESC_EXT_STATUS) {
+ if (ext_status & RXDESC_IP_HEADER_ERR)
+ x->rx_ip_header_error++;
+ if (ext_status & RXDESC_IP_PAYLOAD_ERR)
+ x->rx_payload_error++;
+ netdev_dbg(priv->dev, "IP checksum error - stat %08x\n",
+ ext_status);
+ return CHECKSUM_NONE;
+ }
+
+ return ret;
+}
+
+static inline void xgmac_mac_enable(void __iomem *ioaddr)
+{
+ u32 value = readl(ioaddr + XGMAC_CONTROL);
+ value |= MAC_ENABLE_RX | MAC_ENABLE_TX;
+ writel(value, ioaddr + XGMAC_CONTROL);
+
+ value = readl(ioaddr + XGMAC_DMA_CONTROL);
+ value |= DMA_CONTROL_ST | DMA_CONTROL_SR;
+ writel(value, ioaddr + XGMAC_DMA_CONTROL);
+}
+
+static inline void xgmac_mac_disable(void __iomem *ioaddr)
+{
+ u32 value = readl(ioaddr + XGMAC_DMA_CONTROL);
+ value &= ~(DMA_CONTROL_ST | DMA_CONTROL_SR);
+ writel(value, ioaddr + XGMAC_DMA_CONTROL);
+
+ value = readl(ioaddr + XGMAC_CONTROL);
+ value &= ~(MAC_ENABLE_TX | MAC_ENABLE_RX);
+ writel(value, ioaddr + XGMAC_CONTROL);
+}
+
+static void xgmac_set_mac_addr(void __iomem *ioaddr, unsigned char *addr,
+ int num)
+{
+ u32 data;
+
+ data = (addr[5] << 8) | addr[4] | (num ? XGMAC_ADDR_AE : 0);
+ writel(data, ioaddr + XGMAC_ADDR_HIGH(num));
+ data = (addr[3] << 24) | (addr[2] << 16) | (addr[1] << 8) | addr[0];
+ writel(data, ioaddr + XGMAC_ADDR_LOW(num));
+}
+
+static void xgmac_get_mac_addr(void __iomem *ioaddr, unsigned char *addr,
+ int num)
+{
+ u32 hi_addr, lo_addr;
+
+ /* Read the MAC address from the hardware */
+ hi_addr = readl(ioaddr + XGMAC_ADDR_HIGH(num));
+ lo_addr = readl(ioaddr + XGMAC_ADDR_LOW(num));
+
+ /* Extract the MAC address from the high and low words */
+ addr[0] = lo_addr & 0xff;
+ addr[1] = (lo_addr >> 8) & 0xff;
+ addr[2] = (lo_addr >> 16) & 0xff;
+ addr[3] = (lo_addr >> 24) & 0xff;
+ addr[4] = hi_addr & 0xff;
+ addr[5] = (hi_addr >> 8) & 0xff;
+}
+
+static int xgmac_set_flow_ctrl(struct xgmac_priv *priv, int rx, int tx)
+{
+ u32 reg;
+ unsigned int flow = 0;
+
+ priv->rx_pause = rx;
+ priv->tx_pause = tx;
+
+ if (rx || tx) {
+ if (rx)
+ flow |= XGMAC_FLOW_CTRL_RFE;
+ if (tx)
+ flow |= XGMAC_FLOW_CTRL_TFE;
+
+ flow |= XGMAC_FLOW_CTRL_PLT | XGMAC_FLOW_CTRL_UP;
+ flow |= (PAUSE_TIME << XGMAC_FLOW_CTRL_PT_SHIFT);
+
+ writel(flow, priv->base + XGMAC_FLOW_CTRL);
+
+ reg = readl(priv->base + XGMAC_OMR);
+ reg |= XGMAC_OMR_EFC;
+ writel(reg, priv->base + XGMAC_OMR);
+ } else {
+ writel(0, priv->base + XGMAC_FLOW_CTRL);
+
+ reg = readl(priv->base + XGMAC_OMR);
+ reg &= ~XGMAC_OMR_EFC;
+ writel(reg, priv->base + XGMAC_OMR);
+ }
+
+ return 0;
+}
+
+static void xgmac_rx_refill(struct xgmac_priv *priv)
+{
+ struct xgmac_dma_desc *p;
+ dma_addr_t paddr;
+
+ while (dma_ring_space(priv->rx_head, priv->rx_tail, DMA_RX_RING_SZ) > 1) {
+ int entry = priv->rx_head;
+ struct sk_buff *skb;
+
+ p = priv->dma_rx + entry;
+
+ if (priv->rx_skbuff[entry] != NULL)
+ continue;
+
+ skb = __skb_dequeue(&priv->rx_recycle);
+ if (skb == NULL)
+ skb = netdev_alloc_skb(priv->dev, priv->dma_buf_sz);
+ if (unlikely(skb == NULL))
+ break;
+
+ priv->rx_skbuff[entry] = skb;
+ paddr = dma_map_single(priv->device, skb->data,
+ priv->dma_buf_sz, DMA_FROM_DEVICE);
+ desc_set_buf_addr(p, paddr, priv->dma_buf_sz);
+
+ netdev_dbg(priv->dev, "rx ring: head %d, tail %d\n",
+ priv->rx_head, priv->rx_tail);
+
+ priv->rx_head = dma_ring_incr(priv->rx_head, DMA_RX_RING_SZ);
+ /* Ensure descriptor is in memory before handing to h/w */
+ wmb();
+ desc_set_rx_owner(p);
+ }
+}
+
+/**
+ * init_xgmac_dma_desc_rings - init the RX/TX descriptor rings
+ * @dev: net device structure
+ * Description: this function initializes the DMA RX/TX descriptors
+ * and allocates the socket buffers.
+ */
+static int xgmac_dma_desc_rings_init(struct net_device *dev)
+{
+ struct xgmac_priv *priv = netdev_priv(dev);
+ unsigned int bfsize;
+
+ /* Set the Buffer size according to the MTU;
+ * indeed, in case of jumbo we need to bump-up the buffer sizes.
+ */
+ bfsize = ALIGN(dev->mtu + ETH_HLEN + ETH_FCS_LEN + NET_IP_ALIGN + 64,
+ 64);
+
+ netdev_dbg(priv->dev, "mtu [%d] bfsize [%d]\n", dev->mtu, bfsize);
+
+ priv->rx_skbuff = kzalloc(sizeof(struct sk_buff *) * DMA_RX_RING_SZ,
+ GFP_KERNEL);
+ if (!priv->rx_skbuff)
+ return -ENOMEM;
+
+ priv->dma_rx = dma_alloc_coherent(priv->device,
+ DMA_RX_RING_SZ *
+ sizeof(struct xgmac_dma_desc),
+ &priv->dma_rx_phy,
+ GFP_KERNEL);
+ if (!priv->dma_rx)
+ goto err_dma_rx;
+
+ priv->tx_skbuff = kzalloc(sizeof(struct sk_buff *) * DMA_TX_RING_SZ,
+ GFP_KERNEL);
+ if (!priv->tx_skbuff)
+ goto err_tx_skb;
+
+ priv->dma_tx = dma_alloc_coherent(priv->device,
+ DMA_TX_RING_SZ *
+ sizeof(struct xgmac_dma_desc),
+ &priv->dma_tx_phy,
+ GFP_KERNEL);
+ if (!priv->dma_tx)
+ goto err_dma_tx;
+
+ netdev_dbg(priv->dev, "DMA desc rings: virt addr (Rx %p, "
+ "Tx %p)\n\tDMA phy addr (Rx 0x%08x, Tx 0x%08x)\n",
+ priv->dma_rx, priv->dma_tx,
+ (unsigned int)priv->dma_rx_phy, (unsigned int)priv->dma_tx_phy);
+
+ priv->rx_tail = 0;
+ priv->rx_head = 0;
+ priv->dma_buf_sz = bfsize;
+ desc_init_rx_desc(priv->dma_rx, DMA_RX_RING_SZ, priv->dma_buf_sz);
+ xgmac_rx_refill(priv);
+
+ priv->tx_tail = 0;
+ priv->tx_head = 0;
+ desc_init_tx_desc(priv->dma_tx, DMA_TX_RING_SZ);
+
+ writel(priv->dma_tx_phy, priv->base + XGMAC_DMA_TX_BASE_ADDR);
+ writel(priv->dma_rx_phy, priv->base + XGMAC_DMA_RX_BASE_ADDR);
+
+ return 0;
+
+err_dma_tx:
+ kfree(priv->tx_skbuff);
+err_tx_skb:
+ dma_free_coherent(priv->device,
+ DMA_RX_RING_SZ * sizeof(struct xgmac_dma_desc),
+ priv->dma_rx, priv->dma_rx_phy);
+err_dma_rx:
+ kfree(priv->rx_skbuff);
+ return -ENOMEM;
+}
+
+static void xgmac_free_rx_skbufs(struct xgmac_priv *priv)
+{
+ int i;
+ struct xgmac_dma_desc *p;
+
+ if (!priv->rx_skbuff)
+ return;
+
+ for (i = 0; i < DMA_RX_RING_SZ; i++) {
+ if (priv->rx_skbuff[i] == NULL)
+ continue;
+
+ p = priv->dma_rx + i;
+ dma_unmap_single(priv->device, desc_get_buf_addr(p),
+ priv->dma_buf_sz, DMA_FROM_DEVICE);
+ dev_kfree_skb_any(priv->rx_skbuff[i]);
+ priv->rx_skbuff[i] = NULL;
+ }
+}
+
+static void xgmac_free_tx_skbufs(struct xgmac_priv *priv)
+{
+ int i, f;
+ struct xgmac_dma_desc *p;
+
+ if (!priv->tx_skbuff)
+ return;
+
+ for (i = 0; i < DMA_TX_RING_SZ; i++) {
+ if (priv->tx_skbuff[i] == NULL)
+ continue;
+
+ p = priv->dma_tx + i;
+ dma_unmap_single(priv->device, desc_get_buf_addr(p),
+ desc_get_buf_len(p), DMA_TO_DEVICE);
+
+ for (f = 0; f < skb_shinfo(priv->tx_skbuff[i])->nr_frags; f++) {
+ p = priv->dma_tx + i++;
+ dma_unmap_page(priv->device, desc_get_buf_addr(p),
+ desc_get_buf_len(p), DMA_TO_DEVICE);
+ }
+
+ dev_kfree_skb_any(priv->tx_skbuff[i]);
+ priv->tx_skbuff[i] = NULL;
+ }
+}
+
+static void xgmac_free_dma_desc_rings(struct xgmac_priv *priv)
+{
+ /* Release the DMA TX/RX socket buffers */
+ xgmac_free_rx_skbufs(priv);
+ xgmac_free_tx_skbufs(priv);
+
+ /* Free the consistent memory allocated for descriptor rings */
+ if (priv->dma_tx) {
+ dma_free_coherent(priv->device,
+ DMA_TX_RING_SZ * sizeof(struct xgmac_dma_desc),
+ priv->dma_tx, priv->dma_tx_phy);
+ priv->dma_tx = NULL;
+ }
+ if (priv->dma_rx) {
+ dma_free_coherent(priv->device,
+ DMA_RX_RING_SZ * sizeof(struct xgmac_dma_desc),
+ priv->dma_rx, priv->dma_rx_phy);
+ priv->dma_rx = NULL;
+ }
+ kfree(priv->rx_skbuff);
+ priv->rx_skbuff = NULL;
+ kfree(priv->tx_skbuff);
+ priv->tx_skbuff = NULL;
+}
+
+/**
+ * xgmac_tx:
+ * @priv: private driver structure
+ * Description: it reclaims resources after transmission completes.
+ */
+static void xgmac_tx_complete(struct xgmac_priv *priv)
+{
+ int i;
+ void __iomem *ioaddr = priv->base;
+
+ writel(DMA_STATUS_TU | DMA_STATUS_NIS, ioaddr + XGMAC_DMA_STATUS);
+
+ while (dma_ring_cnt(priv->tx_head, priv->tx_tail, DMA_TX_RING_SZ)) {
+ unsigned int entry = priv->tx_tail;
+ struct sk_buff *skb = priv->tx_skbuff[entry];
+ struct xgmac_dma_desc *p = priv->dma_tx + entry;
+
+ /* Check if the descriptor is owned by the DMA. */
+ if (desc_get_owner(p))
+ break;
+
+ /* Verify tx error by looking at the last segment */
+ if (desc_get_tx_ls(p))
+ desc_get_tx_status(priv, p);
+
+ netdev_dbg(priv->dev, "tx ring: curr %d, dirty %d\n",
+ priv->tx_head, priv->tx_tail);
+
+ dma_unmap_single(priv->device, desc_get_buf_addr(p),
+ desc_get_buf_len(p), DMA_TO_DEVICE);
+
+ priv->tx_skbuff[entry] = NULL;
+ priv->tx_tail = dma_ring_incr(entry, DMA_TX_RING_SZ);
+
+ if (!skb) {
+ continue;
+ }
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ entry = priv->tx_tail = dma_ring_incr(priv->tx_tail,
+ DMA_TX_RING_SZ);
+ p = priv->dma_tx + priv->tx_tail;
+
+ dma_unmap_page(priv->device, desc_get_buf_addr(p),
+ desc_get_buf_len(p), DMA_TO_DEVICE);
+ }
+
+ /*
+ * If there's room in the queue (limit it to size)
+ * we add this skb back into the pool,
+ * if it's the right size.
+ */
+ if ((skb_queue_len(&priv->rx_recycle) <
+ DMA_RX_RING_SZ) &&
+ skb_recycle_check(skb, priv->dma_buf_sz))
+ __skb_queue_head(&priv->rx_recycle, skb);
+ else
+ dev_kfree_skb(skb);
+ }
+
+ if (dma_ring_space(priv->tx_head, priv->tx_tail, DMA_TX_RING_SZ) >
+ TX_THRESH)
+ netif_wake_queue(priv->dev);
+}
+
+/**
+ * xgmac_tx_err:
+ * @priv: pointer to the private device structure
+ * Description: it cleans the descriptors and restarts the transmission
+ * in case of errors.
+ */
+static void xgmac_tx_err(struct xgmac_priv *priv)
+{
+ u32 reg, value, inten;
+
+ netif_stop_queue(priv->dev);
+
+ inten = readl(priv->base + XGMAC_DMA_INTR_ENA);
+ writel(0, priv->base + XGMAC_DMA_INTR_ENA);
+
+ reg = readl(priv->base + XGMAC_DMA_CONTROL);
+ writel(reg & ~DMA_CONTROL_ST, priv->base + XGMAC_DMA_CONTROL);
+ do {
+ value = readl(priv->base + XGMAC_DMA_STATUS) & 0x700000;
+ } while (value && (value != 0x600000));
+
+ xgmac_free_tx_skbufs(priv);
+ desc_init_tx_desc(priv->dma_tx, DMA_TX_RING_SZ);
+ priv->tx_tail = 0;
+ priv->tx_head = 0;
+ writel(reg | DMA_CONTROL_ST, priv->base + XGMAC_DMA_CONTROL);
+
+ writel(DMA_STATUS_TU | DMA_STATUS_TPS | DMA_STATUS_NIS | DMA_STATUS_AIS,
+ priv->base + XGMAC_DMA_STATUS);
+ writel(inten, priv->base + XGMAC_DMA_INTR_ENA);
+
+ netif_wake_queue(priv->dev);
+}
+
+static int xgmac_hw_init(struct net_device *dev)
+{
+ u32 value, ctrl;
+ int limit;
+ struct xgmac_priv *priv = netdev_priv(dev);
+ void __iomem *ioaddr = priv->base;
+
+ /* Save the ctrl register value */
+ ctrl = readl(ioaddr + XGMAC_CONTROL) & XGMAC_CONTROL_SPD_MASK;
+
+ /* SW reset */
+ value = DMA_BUS_MODE_SFT_RESET;
+ writel(value, ioaddr + XGMAC_DMA_BUS_MODE);
+ limit = 15000;
+ while (limit-- &&
+ (readl(ioaddr + XGMAC_DMA_BUS_MODE) & DMA_BUS_MODE_SFT_RESET))
+ cpu_relax();
+ if (limit < 0)
+ return -EBUSY;
+
+ value = (0x10 << DMA_BUS_MODE_PBL_SHIFT) |
+ (0x10 << DMA_BUS_MODE_RPBL_SHIFT) |
+ DMA_BUS_MODE_FB | DMA_BUS_MODE_ATDS | DMA_BUS_MODE_AAL;
+ writel(value, ioaddr + XGMAC_DMA_BUS_MODE);
+
+ /* Enable interrupts */
+ writel(DMA_INTR_DEFAULT_MASK, ioaddr + XGMAC_DMA_STATUS);
+ writel(DMA_INTR_DEFAULT_MASK, ioaddr + XGMAC_DMA_INTR_ENA);
+
+ /* XGMAC requires AXI bus init. This is a 'magic number' for now */
+ writel(0x000100E, ioaddr + XGMAC_DMA_AXI_BUS);
+
+ ctrl |= XGMAC_CONTROL_DDIC | XGMAC_CONTROL_JE | XGMAC_CONTROL_ACS |
+ XGMAC_CONTROL_CAR;
+ if (dev->features & NETIF_F_RXCSUM)
+ ctrl |= XGMAC_CONTROL_IPC;
+ writel(ctrl, ioaddr + XGMAC_CONTROL);
+
+ value = DMA_CONTROL_DFF;
+ writel(value, ioaddr + XGMAC_DMA_CONTROL);
+
+ /* Set the HW DMA mode and the COE */
+ writel(XGMAC_OMR_TSF | XGMAC_OMR_RSF | XGMAC_OMR_RFD | XGMAC_OMR_RFA,
+ ioaddr + XGMAC_OMR);
+
+ /* Reset the MMC counters */
+ writel(1, ioaddr + XGMAC_MMC_CTRL);
+ return 0;
+}
+
+/**
+ * xgmac_open - open entry point of the driver
+ * @dev : pointer to the device structure.
+ * Description:
+ * This function is the open entry point of the driver.
+ * Return value:
+ * 0 on success and an appropriate (-)ve integer as defined in errno.h
+ * file on failure.
+ */
+static int xgmac_open(struct net_device *dev)
+{
+ int ret;
+ struct xgmac_priv *priv = netdev_priv(dev);
+ void __iomem *ioaddr = priv->base;
+
+ /* Check that the MAC address is valid. If its not, refuse
+ * to bring the device up. The user must specify an
+ * address using the following linux command:
+ * ifconfig eth0 hw ether xx:xx:xx:xx:xx:xx */
+ if (!is_valid_ether_addr(dev->dev_addr)) {
+ random_ether_addr(dev->dev_addr);
+ netdev_dbg(priv->dev, "generated random MAC address %pM\n",
+ dev->dev_addr);
+ }
+
+ skb_queue_head_init(&priv->rx_recycle);
+ memset(&priv->xstats, 0, sizeof(struct xgmac_extra_stats));
+
+ /* Initialize the XGMAC and descriptors */
+ xgmac_hw_init(dev);
+ xgmac_set_mac_addr(ioaddr, dev->dev_addr, 0);
+ xgmac_set_flow_ctrl(priv, priv->rx_pause, priv->tx_pause);
+
+ ret = xgmac_dma_desc_rings_init(dev);
+ if (ret < 0)
+ return ret;
+
+ /* Enable the MAC Rx/Tx */
+ xgmac_mac_enable(ioaddr);
+
+ napi_enable(&priv->napi);
+ netif_start_queue(dev);
+
+ return 0;
+}
+
+/**
+ * xgmac_release - close entry point of the driver
+ * @dev : device pointer.
+ * Description:
+ * This is the stop entry point of the driver.
+ */
+static int xgmac_stop(struct net_device *dev)
+{
+ struct xgmac_priv *priv = netdev_priv(dev);
+
+ netif_stop_queue(dev);
+
+ if (readl(priv->base + XGMAC_DMA_INTR_ENA))
+ napi_disable(&priv->napi);
+
+ writel(0, priv->base + XGMAC_DMA_INTR_ENA);
+ skb_queue_purge(&priv->rx_recycle);
+
+ /* Disable the MAC core */
+ xgmac_mac_disable(priv->base);
+
+ /* Release and free the Rx/Tx resources */
+ xgmac_free_dma_desc_rings(priv);
+
+ return 0;
+}
+
+/**
+ * xgmac_xmit:
+ * @skb : the socket buffer
+ * @dev : device pointer
+ * Description : Tx entry point of the driver.
+ */
+static netdev_tx_t xgmac_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct xgmac_priv *priv = netdev_priv(dev);
+ unsigned int entry;
+ int i;
+ int nfrags = skb_shinfo(skb)->nr_frags;
+ struct xgmac_dma_desc *desc, *first;
+ unsigned int desc_flags;
+ unsigned int len;
+ dma_addr_t paddr;
+
+ if (dma_ring_space(priv->tx_head, priv->tx_tail, DMA_TX_RING_SZ) <
+ (nfrags + 1)) {
+ writel(DMA_INTR_DEFAULT_MASK | DMA_INTR_ENA_TIE,
+ priv->base + XGMAC_DMA_INTR_ENA);
+ netif_stop_queue(dev);
+ return NETDEV_TX_BUSY;
+ }
+
+ desc_flags = (skb->ip_summed == CHECKSUM_PARTIAL) ?
+ TXDESC_CSUM_ALL : 0;
+ entry = priv->tx_head;
+ desc = priv->dma_tx + entry;
+ first = desc;
+
+ len = skb_headlen(skb);
+ paddr = dma_map_single(priv->device, skb->data, len, DMA_TO_DEVICE);
+ if (dma_mapping_error(priv->device, paddr)) {
+ dev_kfree_skb(skb);
+ return -EIO;
+ }
+ priv->tx_skbuff[entry] = skb;
+ desc_set_buf_addr_and_size(desc, paddr, len);
+
+ for (i = 0; i < nfrags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ len = frag->size;
+
+ paddr = skb_frag_dma_map(priv->device, frag, 0, len,
+ DMA_TO_DEVICE);
+ if (dma_mapping_error(priv->device, paddr)) {
+ dev_kfree_skb(skb);
+ return -EIO;
+ }
+
+ entry = dma_ring_incr(entry, DMA_TX_RING_SZ);
+ desc = priv->dma_tx + entry;
+ priv->tx_skbuff[entry] = NULL;
+
+ desc_set_buf_addr_and_size(desc, paddr, len);
+ if (i < (nfrags - 1))
+ desc_set_tx_owner(desc, desc_flags);
+ }
+
+ /* Interrupt on completition only for the latest segment */
+ if (desc != first)
+ desc_set_tx_owner(desc, desc_flags |
+ TXDESC_LAST_SEG | TXDESC_INTERRUPT);
+ else
+ desc_flags |= TXDESC_LAST_SEG | TXDESC_INTERRUPT;
+
+ /* Set owner on first desc last to avoid race condition */
+ wmb();
+ desc_set_tx_owner(first, desc_flags | TXDESC_FIRST_SEG);
+
+ priv->tx_head = dma_ring_incr(entry, DMA_TX_RING_SZ);
+
+ writel(1, priv->base + XGMAC_DMA_TX_POLL);
+
+ return NETDEV_TX_OK;
+}
+
+static int xgmac_rx(struct xgmac_priv *priv, int limit)
+{
+ unsigned int entry;
+ unsigned int count = 0;
+ struct xgmac_dma_desc *p;
+
+ while (count < limit) {
+ int ip_checksum;
+ struct sk_buff *skb;
+ int frame_len;
+
+ writel(DMA_STATUS_RI | DMA_STATUS_NIS,
+ priv->base + XGMAC_DMA_STATUS);
+
+ entry = priv->rx_tail;
+ p = priv->dma_rx + entry;
+ if (desc_get_owner(p))
+ break;
+
+ count++;
+ priv->rx_tail = dma_ring_incr(priv->rx_tail, DMA_RX_RING_SZ);
+
+ /* read the status of the incoming frame */
+ ip_checksum = desc_get_rx_status(priv, p);
+ if (ip_checksum < 0)
+ continue;
+
+ skb = priv->rx_skbuff[entry];
+ if (unlikely(!skb)) {
+ netdev_err(priv->dev, "Inconsistent Rx descriptor chain\n");
+ break;
+ }
+ priv->rx_skbuff[entry] = NULL;
+
+ frame_len = desc_get_rx_frame_len(p);
+ netdev_dbg(priv->dev, "RX frame size %d, COE status: %d\n",
+ frame_len, ip_checksum);
+
+ skb_put(skb, frame_len);
+ dma_unmap_single(priv->device, desc_get_buf_addr(p),
+ frame_len, DMA_FROM_DEVICE);
+
+ skb->protocol = eth_type_trans(skb, priv->dev);
+ skb->ip_summed = ip_checksum;
+ if (ip_checksum == CHECKSUM_NONE)
+ netif_receive_skb(skb);
+ else
+ napi_gro_receive(&priv->napi, skb);
+ }
+
+ xgmac_rx_refill(priv);
+
+ writel(1, priv->base + XGMAC_DMA_RX_POLL);
+
+ return count;
+}
+
+/**
+ * xgmac_poll - xgmac poll method (NAPI)
+ * @napi : pointer to the napi structure.
+ * @budget : maximum number of packets that the current CPU can receive from
+ * all interfaces.
+ * Description :
+ * This function implements the the reception process.
+ * Also it runs the TX completion thread
+ */
+static int xgmac_poll(struct napi_struct *napi, int budget)
+{
+ struct xgmac_priv *priv = container_of(napi,
+ struct xgmac_priv, napi);
+ int work_done = 0;
+
+ xgmac_tx_complete(priv);
+ work_done = xgmac_rx(priv, budget);
+
+ if (work_done < budget) {
+ napi_complete(napi);
+ writel(DMA_INTR_DEFAULT_MASK, priv->base + XGMAC_DMA_INTR_ENA);
+ }
+ return work_done;
+}
+
+/**
+ * xgmac_tx_timeout
+ * @dev : Pointer to net device structure
+ * Description: this function is called when a packet transmission fails to
+ * complete within a reasonable tmrate. The driver will mark the error in the
+ * netdev structure and arrange for the device to be reset to a sane state
+ * in order to transmit a new packet.
+ */
+static void xgmac_tx_timeout(struct net_device *dev)
+{
+ struct xgmac_priv *priv = netdev_priv(dev);
+
+ /* Clear Tx resources and restart transmitting again */
+ xgmac_tx_err(priv);
+}
+
+/**
+ * xgmac_set_rx_mode - entry point for multicast addressing
+ * @dev : pointer to the device structure
+ * Description:
+ * This function is a driver entry point which gets called by the kernel
+ * whenever multicast addresses must be enabled/disabled.
+ * Return value:
+ * void.
+ */
+static void xgmac_set_rx_mode(struct net_device *dev)
+{
+ int i;
+ struct xgmac_priv *priv = netdev_priv(dev);
+ void __iomem *ioaddr = priv->base;
+ unsigned int value = 0;
+ u32 hash_filter[XGMAC_NUM_HASH];
+ int reg = 1;
+ struct netdev_hw_addr *ha;
+ bool use_hash = false;
+
+ netdev_dbg(priv->dev, "# mcasts %d, # unicast %d\n",
+ netdev_mc_count(dev), netdev_uc_count(dev));
+
+ if (dev->flags & IFF_PROMISC) {
+ writel(XGMAC_FRAME_FILTER_PR, ioaddr + XGMAC_FRAME_FILTER);
+ return;
+ }
+
+ memset(hash_filter, 0, sizeof(hash_filter));
+
+ if (netdev_uc_count(dev) > XGMAC_MAX_FILTER_ADDR) {
+ use_hash = true;
+ value |= XGMAC_FRAME_FILTER_HUC | XGMAC_FRAME_FILTER_HPF;
+ }
+ netdev_for_each_uc_addr(ha, dev) {
+ if (use_hash) {
+ u32 bit_nr = ~ether_crc(ETH_ALEN, ha->addr) >> 23;
+
+ /* The most significant 4 bits determine the register to
+ * use (H/L) while the other 5 bits determine the bit
+ * within the register. */
+ hash_filter[bit_nr >> 5] |= 1 << (bit_nr & 31);
+ } else {
+ xgmac_set_mac_addr(ioaddr, ha->addr, reg);
+ reg++;
+ }
+ }
+
+ if (dev->flags & IFF_ALLMULTI) {
+ value |= XGMAC_FRAME_FILTER_PM;
+ goto out;
+ }
+
+ if ((netdev_mc_count(dev) + reg - 1) > XGMAC_MAX_FILTER_ADDR) {
+ use_hash = true;
+ value |= XGMAC_FRAME_FILTER_HMC | XGMAC_FRAME_FILTER_HPF;
+ }
+ netdev_for_each_mc_addr(ha, dev) {
+ if (use_hash) {
+ u32 bit_nr = ~ether_crc(ETH_ALEN, ha->addr) >> 23;
+
+ /* The most significant 4 bits determine the register to
+ * use (H/L) while the other 5 bits determine the bit
+ * within the register. */
+ hash_filter[bit_nr >> 5] |= 1 << (bit_nr & 31);
+ } else {
+ xgmac_set_mac_addr(ioaddr, ha->addr, reg);
+ reg++;
+ }
+ }
+
+out:
+ for (i = 0; i < XGMAC_NUM_HASH; i++)
+ writel(hash_filter[i], ioaddr + XGMAC_HASH(i));
+
+ writel(value, ioaddr + XGMAC_FRAME_FILTER);
+}
+
+/**
+ * xgmac_change_mtu - entry point to change MTU size for the device.
+ * @dev : device pointer.
+ * @new_mtu : the new MTU size for the device.
+ * Description: the Maximum Transfer Unit (MTU) is used by the network layer
+ * to drive packet transmission. Ethernet has an MTU of 1500 octets
+ * (ETH_DATA_LEN). This value can be changed with ifconfig.
+ * Return value:
+ * 0 on success and an appropriate (-)ve integer as defined in errno.h
+ * file on failure.
+ */
+static int xgmac_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct xgmac_priv *priv = netdev_priv(dev);
+ int old_mtu;
+
+ if ((new_mtu < 46) || (new_mtu > MAX_MTU)) {
+ netdev_err(priv->dev, "invalid MTU, max MTU is: %d\n", MAX_MTU);
+ return -EINVAL;
+ }
+
+ old_mtu = dev->mtu;
+ dev->mtu = new_mtu;
+
+ /* return early if the buffer sizes will not change */
+ if (old_mtu <= ETH_DATA_LEN && new_mtu <= ETH_DATA_LEN)
+ return 0;
+ if (old_mtu == new_mtu)
+ return 0;
+
+ /* Stop everything, get ready to change the MTU */
+ if (!netif_running(dev))
+ return 0;
+
+ /* Bring the interface down and then back up */
+ xgmac_stop(dev);
+ return xgmac_open(dev);
+}
+
+static irqreturn_t xgmac_pmt_interrupt(int irq, void *dev_id)
+{
+ u32 intr_status;
+ struct net_device *dev = (struct net_device *)dev_id;
+ struct xgmac_priv *priv = netdev_priv(dev);
+ void __iomem *ioaddr = priv->base;
+
+ intr_status = readl(ioaddr + XGMAC_INT_STAT);
+ if (intr_status & XGMAC_INT_STAT_PMT) {
+ netdev_dbg(priv->dev, "received Magic frame\n");
+ /* clear the PMT bits 5 and 6 by reading the PMT */
+ readl(ioaddr + XGMAC_PMT);
+ }
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t xgmac_interrupt(int irq, void *dev_id)
+{
+ u32 intr_status;
+ bool tx_err = false;
+ struct net_device *dev = (struct net_device *)dev_id;
+ struct xgmac_priv *priv = netdev_priv(dev);
+ struct xgmac_extra_stats *x = &priv->xstats;
+
+ /* read the status register (CSR5) */
+ intr_status = readl(priv->base + XGMAC_DMA_STATUS);
+ intr_status &= readl(priv->base + XGMAC_DMA_INTR_ENA);
+ writel(intr_status, priv->base + XGMAC_DMA_STATUS);
+
+ /* It displays the DMA process states (CSR5 register) */
+ /* ABNORMAL interrupts */
+ if (unlikely(intr_status & DMA_STATUS_AIS)) {
+ if (intr_status & DMA_STATUS_TJT) {
+ netdev_err(priv->dev, "transmit jabber\n");
+ x->tx_jabber++;
+ }
+ if (intr_status & DMA_STATUS_RU)
+ x->rx_buf_unav++;
+ if (intr_status & DMA_STATUS_RPS) {
+ netdev_err(priv->dev, "receive process stopped\n");
+ x->rx_process_stopped++;
+ }
+ if (intr_status & DMA_STATUS_ETI) {
+ netdev_err(priv->dev, "transmit early interrupt\n");
+ x->tx_early++;
+ }
+ if (intr_status & DMA_STATUS_TPS) {
+ netdev_err(priv->dev, "transmit process stopped\n");
+ x->tx_process_stopped++;
+ tx_err = true;
+ }
+ if (intr_status & DMA_STATUS_FBI) {
+ netdev_err(priv->dev, "fatal bus error\n");
+ x->fatal_bus_error++;
+ tx_err = true;
+ }
+
+ if (tx_err)
+ xgmac_tx_err(priv);
+ }
+
+ /* TX/RX NORMAL interrupts */
+ if (intr_status & (DMA_STATUS_RI | DMA_STATUS_TU)) {
+ writel(DMA_INTR_ABNORMAL, priv->base + XGMAC_DMA_INTR_ENA);
+ napi_schedule(&priv->napi);
+ }
+
+ return IRQ_HANDLED;
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+/* Polling receive - used by NETCONSOLE and other diagnostic tools
+ * to allow network I/O with interrupts disabled. */
+static void xgmac_poll_controller(struct net_device *dev)
+{
+ disable_irq(dev->irq);
+ xgmac_interrupt(dev->irq, dev);
+ enable_irq(dev->irq);
+}
+#endif
+
+struct rtnl_link_stats64 *
+xgmac_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *storage)
+{
+ struct xgmac_priv *priv = netdev_priv(dev);
+ void __iomem *base = priv->base;
+ u32 count;
+
+ spin_lock_bh(&priv->stats_lock);
+ writel(XGMAC_MMC_CTRL_CNT_FRZ, base + XGMAC_MMC_CTRL);
+
+ storage->rx_bytes = readl(base + XGMAC_MMC_RXOCTET_G_LO);
+ storage->rx_bytes |= (u64)(readl(base + XGMAC_MMC_RXOCTET_G_HI)) << 32;
+
+ storage->rx_packets = readl(base + XGMAC_MMC_RXFRAME_GB_LO);
+ storage->multicast = readl(base + XGMAC_MMC_RXMCFRAME_G);
+ storage->rx_crc_errors = readl(base + XGMAC_MMC_RXCRCERR);
+ storage->rx_length_errors = readl(base + XGMAC_MMC_RXLENGTHERR);
+ storage->rx_missed_errors = readl(base + XGMAC_MMC_RXOVERFLOW);
+
+ storage->tx_bytes = readl(base + XGMAC_MMC_TXOCTET_G_LO);
+ storage->tx_bytes |= (u64)(readl(base + XGMAC_MMC_TXOCTET_G_HI)) << 32;
+
+ count = readl(base + XGMAC_MMC_TXFRAME_GB_LO);
+ storage->tx_errors = count - readl(base + XGMAC_MMC_TXFRAME_G_LO);
+ storage->tx_packets = count;
+ storage->tx_fifo_errors = readl(base + XGMAC_MMC_TXUNDERFLOW);
+
+ writel(0, base + XGMAC_MMC_CTRL);
+ spin_unlock_bh(&priv->stats_lock);
+ return storage;
+}
+
+static int xgmac_set_mac_address(struct net_device *dev, void *p)
+{
+ struct xgmac_priv *priv = netdev_priv(dev);
+ void __iomem *ioaddr = priv->base;
+ struct sockaddr *addr = p;
+
+ if (!is_valid_ether_addr(addr->sa_data))
+ return -EADDRNOTAVAIL;
+
+ memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
+
+ xgmac_set_mac_addr(ioaddr, dev->dev_addr, 0);
+
+ return 0;
+}
+
+static int xgmac_set_features(struct net_device *dev, u32 features)
+{
+ u32 ctrl;
+ struct xgmac_priv *priv = netdev_priv(dev);
+ void __iomem *ioaddr = priv->base;
+ u32 changed = dev->features ^ features;
+
+ if (!(changed & NETIF_F_RXCSUM))
+ return 0;
+
+ ctrl = readl(ioaddr + XGMAC_CONTROL);
+ if (features & NETIF_F_RXCSUM)
+ ctrl |= XGMAC_CONTROL_IPC;
+ else
+ ctrl &= ~XGMAC_CONTROL_IPC;
+ writel(ctrl, ioaddr + XGMAC_CONTROL);
+
+ return 0;
+}
+
+static const struct net_device_ops xgmac_netdev_ops = {
+ .ndo_open = xgmac_open,
+ .ndo_start_xmit = xgmac_xmit,
+ .ndo_stop = xgmac_stop,
+ .ndo_change_mtu = xgmac_change_mtu,
+ .ndo_set_rx_mode = xgmac_set_rx_mode,
+ .ndo_tx_timeout = xgmac_tx_timeout,
+ .ndo_get_stats64 = xgmac_get_stats64,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+ .ndo_poll_controller = xgmac_poll_controller,
+#endif
+ .ndo_set_mac_address = xgmac_set_mac_address,
+ .ndo_set_features = xgmac_set_features,
+};
+
+static int xgmac_ethtool_getsettings(struct net_device *dev,
+ struct ethtool_cmd *cmd)
+{
+ cmd->autoneg = 0;
+ cmd->duplex = DUPLEX_FULL;
+ ethtool_cmd_speed_set(cmd, 10000);
+ cmd->supported = 0;
+ cmd->advertising = 0;
+ cmd->transceiver = XCVR_INTERNAL;
+ return 0;
+}
+
+static void xgmac_get_pauseparam(struct net_device *netdev,
+ struct ethtool_pauseparam *pause)
+{
+ struct xgmac_priv *priv = netdev_priv(netdev);
+
+ pause->rx_pause = priv->rx_pause;
+ pause->tx_pause = priv->tx_pause;
+}
+
+static int xgmac_set_pauseparam(struct net_device *netdev,
+ struct ethtool_pauseparam *pause)
+{
+ struct xgmac_priv *priv = netdev_priv(netdev);
+
+ if (pause->autoneg)
+ return -EINVAL;
+
+ return xgmac_set_flow_ctrl(priv, pause->rx_pause, pause->tx_pause);
+}
+
+struct xgmac_stats {
+ char stat_string[ETH_GSTRING_LEN];
+ int stat_offset;
+ bool is_reg;
+};
+
+#define XGMAC_STAT(m) \
+ { #m, offsetof(struct xgmac_priv, xstats.m), false }
+#define XGMAC_HW_STAT(m, reg_offset) \
+ { #m, reg_offset, true }
+
+static const struct xgmac_stats xgmac_gstrings_stats[] = {
+ XGMAC_STAT(tx_frame_flushed),
+ XGMAC_STAT(tx_payload_error),
+ XGMAC_STAT(tx_ip_header_error),
+ XGMAC_STAT(tx_local_fault),
+ XGMAC_STAT(tx_remote_fault),
+ XGMAC_STAT(tx_early),
+ XGMAC_STAT(tx_process_stopped),
+ XGMAC_STAT(tx_jabber),
+ XGMAC_STAT(rx_buf_unav),
+ XGMAC_STAT(rx_process_stopped),
+ XGMAC_STAT(rx_payload_error),
+ XGMAC_STAT(rx_ip_header_error),
+ XGMAC_STAT(rx_da_filter_fail),
+ XGMAC_STAT(rx_sa_filter_fail),
+ XGMAC_STAT(fatal_bus_error),
+ XGMAC_HW_STAT(rx_watchdog, XGMAC_MMC_RXWATCHDOG),
+ XGMAC_HW_STAT(tx_vlan, XGMAC_MMC_TXVLANFRAME),
+ XGMAC_HW_STAT(rx_vlan, XGMAC_MMC_RXVLANFRAME),
+ XGMAC_HW_STAT(tx_pause, XGMAC_MMC_TXPAUSEFRAME),
+ XGMAC_HW_STAT(rx_pause, XGMAC_MMC_RXPAUSEFRAME),
+};
+#define XGMAC_STATS_LEN ARRAY_SIZE(xgmac_gstrings_stats)
+
+static void xgmac_get_ethtool_stats(struct net_device *dev,
+ struct ethtool_stats *dummy,
+ u64 *data)
+{
+ struct xgmac_priv *priv = netdev_priv(dev);
+ void *p = priv;
+ int i;
+
+ for (i = 0; i < XGMAC_STATS_LEN; i++) {
+ if (xgmac_gstrings_stats[i].is_reg)
+ *data++ = readl(priv->base +
+ xgmac_gstrings_stats[i].stat_offset);
+ else
+ *data++ = *(u32 *)(p +
+ xgmac_gstrings_stats[i].stat_offset);
+ }
+}
+
+static int xgmac_get_sset_count(struct net_device *netdev, int sset)
+{
+ switch (sset) {
+ case ETH_SS_STATS:
+ return XGMAC_STATS_LEN;
+ default:
+ return -EINVAL;
+ }
+}
+
+static void xgmac_get_strings(struct net_device *dev, u32 stringset,
+ u8 *data)
+{
+ int i;
+ u8 *p = data;
+
+ switch (stringset) {
+ case ETH_SS_STATS:
+ for (i = 0; i < XGMAC_STATS_LEN; i++) {
+ memcpy(p, xgmac_gstrings_stats[i].stat_string,
+ ETH_GSTRING_LEN);
+ p += ETH_GSTRING_LEN;
+ }
+ break;
+ default:
+ WARN_ON(1);
+ break;
+ }
+}
+
+static void xgmac_get_wol(struct net_device *dev,
+ struct ethtool_wolinfo *wol)
+{
+ struct xgmac_priv *priv = netdev_priv(dev);
+
+ if (device_can_wakeup(priv->device)) {
+ wol->supported = WAKE_MAGIC | WAKE_UCAST;
+ wol->wolopts = priv->wolopts;
+ }
+}
+
+static int xgmac_set_wol(struct net_device *dev,
+ struct ethtool_wolinfo *wol)
+{
+ struct xgmac_priv *priv = netdev_priv(dev);
+ u32 support = WAKE_MAGIC | WAKE_UCAST;
+
+ if (!device_can_wakeup(priv->device))
+ return -ENOTSUPP;
+
+ if (wol->wolopts & ~support)
+ return -EINVAL;
+
+ priv->wolopts = wol->wolopts;
+
+ if (wol->wolopts) {
+ device_set_wakeup_enable(priv->device, 1);
+ enable_irq_wake(dev->irq);
+ } else {
+ device_set_wakeup_enable(priv->device, 0);
+ disable_irq_wake(dev->irq);
+ }
+
+ return 0;
+}
+
+static struct ethtool_ops xgmac_ethtool_ops = {
+ .get_settings = xgmac_ethtool_getsettings,
+ .get_link = ethtool_op_get_link,
+ .get_pauseparam = xgmac_get_pauseparam,
+ .set_pauseparam = xgmac_set_pauseparam,
+ .get_ethtool_stats = xgmac_get_ethtool_stats,
+ .get_strings = xgmac_get_strings,
+ .get_wol = xgmac_get_wol,
+ .set_wol = xgmac_set_wol,
+ .get_sset_count = xgmac_get_sset_count,
+};
+
+/**
+ * xgmac_probe
+ * @pdev: platform device pointer
+ * Description: the driver is initialized through platform_device.
+ */
+static int xgmac_probe(struct platform_device *pdev)
+{
+ int ret = 0;
+ struct resource *res;
+ struct net_device *ndev = NULL;
+ struct xgmac_priv *priv = NULL;
+ u32 uid;
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (!res)
+ return -ENODEV;
+
+ if (!request_mem_region(res->start, resource_size(res), pdev->name))
+ return -EBUSY;
+
+ ndev = alloc_etherdev(sizeof(struct xgmac_priv));
+ if (!ndev) {
+ ret = -ENOMEM;
+ goto err_alloc;
+ }
+
+ SET_NETDEV_DEV(ndev, &pdev->dev);
+ priv = netdev_priv(ndev);
+ platform_set_drvdata(pdev, ndev);
+ ether_setup(ndev);
+ ndev->netdev_ops = &xgmac_netdev_ops;
+ SET_ETHTOOL_OPS(ndev, &xgmac_ethtool_ops);
+ spin_lock_init(&priv->stats_lock);
+
+ priv->device = &pdev->dev;
+ priv->dev = ndev;
+ priv->rx_pause = 1;
+ priv->tx_pause = 1;
+
+ priv->base = ioremap(res->start, resource_size(res));
+ if (!priv->base) {
+ netdev_err(ndev, "ioremap failed\n");
+ ret = -ENOMEM;
+ goto err_io;
+ }
+
+ uid = readl(priv->base + XGMAC_VERSION);
+ netdev_info(ndev, "h/w version is 0x%x\n", uid);
+
+ writel(0, priv->base + XGMAC_DMA_INTR_ENA);
+ ndev->irq = platform_get_irq(pdev, 0);
+ if (ndev->irq == -ENXIO) {
+ netdev_err(ndev, "No irq resource\n");
+ ret = ndev->irq;
+ goto err_irq;
+ }
+
+ ret = request_irq(ndev->irq, xgmac_interrupt, 0,
+ dev_name(&pdev->dev), ndev);
+ if (ret < 0) {
+ netdev_err(ndev, "Could not request irq %d - ret %d)\n",
+ ndev->irq, ret);
+ goto err_irq;
+ }
+
+ priv->pmt_irq = platform_get_irq(pdev, 1);
+ if (priv->pmt_irq == -ENXIO) {
+ netdev_err(ndev, "No pmt irq resource\n");
+ ret = priv->pmt_irq;
+ goto err_pmt_irq;
+ }
+
+ ret = request_irq(priv->pmt_irq, xgmac_pmt_interrupt, 0,
+ dev_name(&pdev->dev), ndev);
+ if (ret < 0) {
+ netdev_err(ndev, "Could not request irq %d - ret %d)\n",
+ priv->pmt_irq, ret);
+ goto err_pmt_irq;
+ }
+
+ device_set_wakeup_capable(&pdev->dev, 1);
+ if (device_can_wakeup(priv->device))
+ priv->wolopts = WAKE_MAGIC; /* Magic Frame as default */
+
+ ndev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA;
+ if (readl(priv->base + XGMAC_DMA_HW_FEATURE) & DMA_HW_FEAT_TXCOESEL)
+ ndev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
+ NETIF_F_RXCSUM;
+ ndev->features |= ndev->hw_features;
+ ndev->priv_flags |= IFF_UNICAST_FLT;
+
+ /* Get the MAC address */
+ xgmac_get_mac_addr(priv->base, ndev->dev_addr, 0);
+ if (!is_valid_ether_addr(ndev->dev_addr))
+ netdev_warn(ndev, "MAC address %pM not valid",
+ ndev->dev_addr);
+
+ netif_napi_add(ndev, &priv->napi, xgmac_poll, 64);
+ ret = register_netdev(ndev);
+ if (ret)
+ goto err_reg;
+
+ return 0;
+
+err_reg:
+ netif_napi_del(&priv->napi);
+ free_irq(priv->pmt_irq, ndev);
+err_pmt_irq:
+ free_irq(ndev->irq, ndev);
+err_irq:
+ iounmap(priv->base);
+err_io:
+ free_netdev(ndev);
+err_alloc:
+ release_mem_region(res->start, resource_size(res));
+ platform_set_drvdata(pdev, NULL);
+ return ret;
+}
+
+/**
+ * xgmac_dvr_remove
+ * @pdev: platform device pointer
+ * Description: this function resets the TX/RX processes, disables the MAC RX/TX
+ * changes the link status, releases the DMA descriptor rings,
+ * unregisters the MDIO bus and unmaps the allocated memory.
+ */
+static int xgmac_remove(struct platform_device *pdev)
+{
+ struct net_device *ndev = platform_get_drvdata(pdev);
+ struct xgmac_priv *priv = netdev_priv(ndev);
+ struct resource *res;
+
+ xgmac_mac_disable(priv->base);
+
+ /* Free the IRQ lines */
+ free_irq(ndev->irq, ndev);
+ free_irq(priv->pmt_irq, ndev);
+
+ platform_set_drvdata(pdev, NULL);
+ unregister_netdev(ndev);
+ netif_napi_del(&priv->napi);
+
+ iounmap(priv->base);
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ release_mem_region(res->start, resource_size(res));
+
+ free_netdev(ndev);
+
+ return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static void xgmac_pmt(void __iomem *ioaddr, unsigned long mode)
+{
+ unsigned int pmt = 0;
+
+ if (mode & WAKE_MAGIC)
+ pmt |= XGMAC_PMT_POWERDOWN | XGMAC_PMT_MAGIC_PKT;
+ if (mode & WAKE_UCAST)
+ pmt |= XGMAC_PMT_POWERDOWN | XGMAC_PMT_GLBL_UNICAST;
+
+ writel(pmt, ioaddr + XGMAC_PMT);
+}
+
+static int xgmac_suspend(struct device *dev)
+{
+ struct net_device *ndev = platform_get_drvdata(to_platform_device(dev));
+ struct xgmac_priv *priv = netdev_priv(ndev);
+ u32 value;
+
+ if (!ndev || !netif_running(ndev))
+ return 0;
+
+ netif_device_detach(ndev);
+ napi_disable(&priv->napi);
+ writel(0, priv->base + XGMAC_DMA_INTR_ENA);
+
+ if (device_may_wakeup(priv->device)) {
+ /* Stop TX/RX DMA Only */
+ value = readl(priv->base + XGMAC_DMA_CONTROL);
+ value &= ~(DMA_CONTROL_ST | DMA_CONTROL_SR);
+ writel(value, priv->base + XGMAC_DMA_CONTROL);
+
+ xgmac_pmt(priv->base, priv->wolopts);
+ } else
+ xgmac_mac_disable(priv->base);
+
+ return 0;
+}
+
+static int xgmac_resume(struct device *dev)
+{
+ struct net_device *ndev = platform_get_drvdata(to_platform_device(dev));
+ struct xgmac_priv *priv = netdev_priv(ndev);
+ void __iomem *ioaddr = priv->base;
+
+ if (!netif_running(ndev))
+ return 0;
+
+ xgmac_pmt(ioaddr, 0);
+
+ /* Enable the MAC and DMA */
+ xgmac_mac_enable(ioaddr);
+ writel(DMA_INTR_DEFAULT_MASK, ioaddr + XGMAC_DMA_STATUS);
+ writel(DMA_INTR_DEFAULT_MASK, ioaddr + XGMAC_DMA_INTR_ENA);
+
+ netif_device_attach(ndev);
+ napi_enable(&priv->napi);
+
+ return 0;
+}
+
+static SIMPLE_DEV_PM_OPS(xgmac_pm_ops, xgmac_suspend, xgmac_resume);
+#define XGMAC_PM_OPS (&xgmac_pm_ops)
+#else
+#define XGMAC_PM_OPS NULL
+#endif /* CONFIG_PM_SLEEP */
+
+static const struct of_device_id xgmac_of_match[] = {
+ { .compatible = "calxeda,hb-xgmac", },
+ {},
+};
+MODULE_DEVICE_TABLE(of, xgmac_of_match);
+
+static struct platform_driver xgmac_driver = {
+ .driver = {
+ .name = "calxedaxgmac",
+ .of_match_table = xgmac_of_match,
+ },
+ .probe = xgmac_probe,
+ .remove = xgmac_remove,
+ .driver.pm = XGMAC_PM_OPS,
+};
+
+module_platform_driver(xgmac_driver);
+
+MODULE_AUTHOR("Calxeda, Inc.");
+MODULE_DESCRIPTION("Calxeda 10G XGMAC driver");
+MODULE_LICENSE("GPL v2");
--
1.7.5.4
^ permalink raw reply related
* Re: [PATCH net-next 4/4] net: Add Open vSwitch kernel components.
From: Jamal Hadi Salim @ 2011-11-23 3:07 UTC (permalink / raw)
To: John Fastabend
Cc: dev-yBygre7rU0TnMu66kgdUjQ@public.gmane.org,
netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, David S. Miller
In-Reply-To: <4ECC5AAB.8000605-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
On Tue, 2011-11-22 at 18:30 -0800, John Fastabend wrote:
> He is pushing and popping entire tags off 802.1Q for now but
> you can easily imagine MPLS tags and all sorts of other things
> people will _need_.
Lots of packet munging already happening with actions.
We can pedit/nat/iptables/checksum/etc/. Works very well.
> Do we want tc and likely the skbedit action to explode into a
> packet mangling tool?
skbedit is the wrong action to use. I would write one with
a cutnpaste of the code they have to do vlan manipulation.
> Would it make sense to plug into ebtables
> perhaps with a new family, NFPROTO_OPENFLOW or even on the
> existing NFPROTO_BRIDGE.
There is _nothing_ that openflow needs that cant be done
in classifier-action piece.
A good number of the actions they need exist already.
> Although doing it with classifiers and more actions would flush
> out that TODO in act_mirred, and get us an mq_ingress among
> other things.
The packet redirect to user space is achieveable in many other
ways, thats why it was not added.
cheers,
jamal
^ permalink raw reply
* Re: [PATCH net-next 4/4] net: Add Open vSwitch kernel components.
From: John Fastabend @ 2011-11-23 2:30 UTC (permalink / raw)
To: Jamal Hadi Salim
Cc: dev-yBygre7rU0TnMu66kgdUjQ@public.gmane.org,
netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, David S. Miller
In-Reply-To: <1322012755.2039.36.camel@mojatatu>
On 11/22/2011 5:45 PM, Jamal Hadi Salim wrote:
> On Tue, 2011-11-22 at 15:11 -0800, Jesse Gross wrote:
>
>
>> As you mention, one of the biggest benefits of Open vSwitch is how
>> simple the kernel portions are (it's less than 6000 lines).
>
> I said that was the reason _you_ were using to justify things
> and i argue that is not accurate.
> You will be adding more actions and more classification fields to
> the datapath - and you are going to add them to that monolithic
> "simple" code. And it is going to grow.
>
> BTW, you _are using some of the actions_ already (the policer for
> example to do rate control; no disrespect intended but in a terrible
> way).
> Eventually you will canibalize that in your code because it is "simpler"
> to do that.
> So to be explicit: I dont think this is a good arguement.
>
>> It's
>> existed as an out-of-tree project for several years now so it's
>> actually fairly mature already and unlikely that there will be a
>> sudden influx of new code over the coming months. There's already
>> quite a bit of functionality that has been implemented on top of it
>> and it's been mentioned that several other components can be written
>> in terms of it
>
> I very much empathize with this point. But that is not a technical
> issue.
>
>> so I think that it's fairly generic infrastructure that
>> can be used in many ways. Over time, I think it will result in a net
>> reduction of code in the kernel as the design is heavily focused on
>> delegating work to userspace.
>
> Both your goal and the Linux qos/filtering/action code is to be be
> modular and move policy control out of the kernel. In our case,
> any of the actions, classifiers, qos schedulers can be experimented
> with out of tree with zero patch needs and when ready pushed into the
> kernel with zero code changes to the core. So nothing in what we have
> says the policy control sits in the kernel.
>
>> I would view it as similar in many ways to the recently added team
>> device, which is based on the idea of keeping simple things simple.
>
> Good analogy, but wrong direction: Bonding is a monolithic christmas
> tree which people kept adding code to because it was "simpler" to do
> so.
> Your code is heading that way because as openflow progresses or some
> new thing comes along (I notice capwap) you'll be adding more code for
> more classifiers and more actions and maybe more schedulers and will
> have to replicate things we provide. And they all go into this
> monolithic code because it is "simpler".
>
> Is there anything we do that makes it hard for you to use the
> infrastructure provided? Is there anything you do that we cant
> provide via the classifier-action-scheduler infrastructure?
> If you need help let me know.
>
>
> cheers,
> jamal
>
He is pushing and popping entire tags off 802.1Q for now but
you can easily imagine MPLS tags and all sorts of other things
people will _need_.
Do we want tc and likely the skbedit action to explode into a
packet mangling tool? Would it make sense to plug into ebtables
perhaps with a new family, NFPROTO_OPENFLOW or even on the
existing NFPROTO_BRIDGE.
Although doing it with classifiers and more actions would flush
out that TODO in act_mirred, and get us an mq_ingress among
other things.
.John
^ permalink raw reply
* Re: Missing TCP SYN on loopback, retransmits after 1s
From: John Heffner @ 2011-11-23 2:06 UTC (permalink / raw)
To: Jesse Young; +Cc: netdev
In-Reply-To: <20111122181320.38a70cf8@telperion.jlyo.org>
Offhand, I'd guess you're overflowing the TCP SYN queue. (You can try
tuning tcp_max_syn_backlog.)
-John
On Tue, Nov 22, 2011 at 7:13 PM, Jesse Young <jlyo@jlyo.org> wrote:
> Hi all,
>
> I am experiencing packet loss over TCP/IPv[46], which causes 1 second
> delays when connect()ing to a socket. This happens even on loopback, and
> on multiple kernels. On the older kernels, the connect() time is nearly
> 3 seconds, I believe this is due to a recent TCP connect retrasmit
> parameter changed in the kernel.
>
> 1. Linux dc-s1000-2114 2.6.32-35-server #78-Ubuntu SMP Tue Oct 11
> 16:26:12 UTC 2011 x86_64 GNU/Linux
> 2. Linux dc-a1000-2131.cleversafelabs.com 2.6.39.4-2-clevos+ #1 SMP
> Tue Nov 8 09:06:49 CST 2011 x86_64 x86_64 x86_64 GNU/Linux
> 3. Linux telperion.jlyo.org 3.1.0-4-ARCH #1 SMP PREEMPT Mon Nov 7
> 22:47:18 CET 2011 x86_64 Intel(R) Core(TM) i7-2630QM CPU @ 2.00GHz
> GenuineIntel GNU/Linux
>
> I have created some test cases which reify this problem, the first set
> of tests use select() multiplexing, and have some problems, however,
> they exhibit odd behavior as well, especially in the difference between
> tcp4 and tcp6.
>
> Please note: these tests will quickly exaust the amount of available
> ephemeral TCP ports on your system, which will cause any TCP connect()
> calls in other processes to return with EADDRNOTAVAIL. However, ports
> will become available after a short while.
>
> The first test fails super quick, while the others haven't timed out
> so far. NOTE: The second test requires /proc/sys/net/ipv6/bindv6only
> to be set to 1.
>
> ./packetloss :: ::1
> ./packetloss :: 127.0.0.1
> ./packetloss 0.0.0.0 127.0.0.1
>
> The other tests run a client and server in different processes.
> Run the "close" daemon using one of:
> ./closed ::
> ./closed 0.0.0.0
>
> And flood connect() pings against 8009, the port closed listens on.
> ./tcping -f -p8009 ::1
> ./tcping -f -p8009 127.0.0.1
>
> Wait for a pause, then ^C, and notice the max statistic is ~1000ms.
>
> These tests have been rn between machines on a relativley noiseless
> ethernet LAN with similar results.
>
> What's also puzzling, is that I see no packet drop reporting in
> $ ifconfig lo
> lo: flags=73<UP,LOOPBACK,RUNNING> mtu 16436 metric 1
> inet 127.0.0.1 netmask 255.0.0.0
> inet6 ::1 prefixlen 128 scopeid 0x10<host>
> loop txqueuelen 0 (Local Loopback)
> RX packets 276411482 bytes 15822880567 (14.7 GiB)
> RX errors 0 dropped 0 overruns 0 frame 0
> TX packets 276411482 bytes 15822880567 (14.7 GiB)
> TX errors 0 dropped 0 overruns 0 carrier 0 collisions
>
> I'm thinking this may be a bug in the TCP/IP stack, however, I'm not
> certain if I'm missing a socket option, or some other configuration
> that may elimiate this behavior.
>
> If there's anything else I can help you with, please don't hesitate
> to Cc me.
>
> Thanks,
> Jesse
>
> Attached: syndrop.pcap
>
> Get the code here
> https://github.com/jlyo/packetloss
> git clone git://github.com/jlyo/packetloss.git
>
> https://github.com/jlyo/tcping
> git clone git://github.com/jlyo/tcping.git
>
> https://github.com/jlyo/closed
> git clone git://github.com/jlyo/closed.git
^ permalink raw reply
* Re: [PATCH net-next v2 0/4] e1000e: ethtool setfeatures fixes + loopback
From: Maciej Żenczykowski @ 2011-11-23 1:56 UTC (permalink / raw)
To: David Decotigny
Cc: jeffrey.t.kirsher, Brandeburg, Jesse, Allan, Bruce W,
Wyborny, Carolyn, Skidmore, Donald C, Rose, Gregory V,
Waskiewicz Jr, Peter P, Duyck, Alexander H, Ronciak, John,
e1000-devel@lists.sourceforge.net, netdev@vger.kernel.org,
linux-kernel@vger.kernel.org, David S. Miller, Eric Dumazet,
Ian Campbell, Paul Gortmaker, Mahesh Bandewar
In-Reply-To: <CAG88wWY_6o_1Ecxe6+XiA8ga6+t4ua+L6fZ8AOOFyrKgZ8LDMQ@mail.gmail.com>
David, could you test the trivial-forward-port patch I sent out and
verify that it continues to work correctly (was there anything you had
to fix in my patch?), a quick glance at the driver doesn't seem to
show any changes between 2.6.34 and now that would cause it to break
in unexpected ways... but who knows...?
There's a few features of it that I prefer to your version of the patch:
- it should be more robust to e1e_rphy timeout failures
- it checks loopback later, which makes that more robust, and can
check it multiple times.
- fixed up comments, variable renames, etc. better readability (IMHO)
Obviously it should probably be split into a separate patch to fix
bugs, and a separate one to implement loopback carrier faking.
The basic problem is that normally e1e_rphy() succeeds very quickly,
but in actuality the code is kind of more like:
e1e_rphy() {
for (i = 0; i < MAX_ITER; ++i) {
try to read register
on success return value
pause
}
return failure;
}
There is potential for the read to fail due to the firmware having
locked you out, hence the delay loop. Unfortunately in certain cases
this fails for longer then MAX_ITER * pause_time, causing e1e_rphy()
as a whole to fail.
However, once it succeeds, subsequent reads are also very likely to
succeed and be very fast as well (ie. failures are _very_
time-correlated).
Hence, you cannot check for anything only once and expect it to
reliably behave. This is the reason why in my patch loopback
detection was deeper in the loop instead of the first thing one does
(this also causes loopback detection code to not trigger on the normal
case of link up, so it's faster in the normal case).
There's also a lot of places where intervals == 0 on call, which is
why my patch has the extra read before the loop - it makes the
intervals == 0 case just slightly more likely to succeed.
Remember e1e_rphy() has 3 cases:
timeout
success after time < timeout
immediate success.
And in practice, failed reads are more likely to be back-to-back than random.
A relatively likely scenario is 3 reads behaving like so:
timeout, success after time < timeout, immediate success.
^ permalink raw reply
* [PATCH] sctp: integer overflow in sctp_auth_create_key()
From: Xi Wang @ 2011-11-23 1:55 UTC (permalink / raw)
To: linux-kernel
Cc: Vlad Yasevich, Sridhar Samudrala, David S. Miller, linux-sctp,
netdev, security
The previous commit 30c2235c is incomplete and cannot prevent integer
overflows. For example, when key_len is 0x80000000 (INT_MAX + 1), the
left-hand side of the check, (INT_MAX - key_len), which is unsigned,
becomes 0xffffffff (UINT_MAX) and bypasses the check.
Signed-off-by: Xi Wang <xi.wang@gmail.com>
---
net/sctp/auth.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index 865e68f..989e0fd 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -82,7 +82,7 @@ static struct sctp_auth_bytes *sctp_auth_create_key(__u32 key_len, gfp_t gfp)
struct sctp_auth_bytes *key;
/* Verify that we are not going to overflow INT_MAX */
- if ((INT_MAX - key_len) < sizeof(struct sctp_auth_bytes))
+ if (key_len > INT_MAX - sizeof(struct sctp_auth_bytes))
return NULL;
/* Allocate the shared key */
--
1.7.5.4
^ permalink raw reply related
* Re: [PATCH net-next 4/4] net: Add Open vSwitch kernel components.
From: Jamal Hadi Salim @ 2011-11-23 1:45 UTC (permalink / raw)
To: Jesse Gross
Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA,
David S. Miller
In-Reply-To: <CAEP_g=8puZh8hihoyoHTc4f6cBu4jiDJQ6tqk6suQxR=dchyjA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
On Tue, 2011-11-22 at 15:11 -0800, Jesse Gross wrote:
> As you mention, one of the biggest benefits of Open vSwitch is how
> simple the kernel portions are (it's less than 6000 lines).
I said that was the reason _you_ were using to justify things
and i argue that is not accurate.
You will be adding more actions and more classification fields to
the datapath - and you are going to add them to that monolithic
"simple" code. And it is going to grow.
BTW, you _are using some of the actions_ already (the policer for
example to do rate control; no disrespect intended but in a terrible
way).
Eventually you will canibalize that in your code because it is "simpler"
to do that.
So to be explicit: I dont think this is a good arguement.
> It's
> existed as an out-of-tree project for several years now so it's
> actually fairly mature already and unlikely that there will be a
> sudden influx of new code over the coming months. There's already
> quite a bit of functionality that has been implemented on top of it
> and it's been mentioned that several other components can be written
> in terms of it
I very much empathize with this point. But that is not a technical
issue.
> so I think that it's fairly generic infrastructure that
> can be used in many ways. Over time, I think it will result in a net
> reduction of code in the kernel as the design is heavily focused on
> delegating work to userspace.
Both your goal and the Linux qos/filtering/action code is to be be
modular and move policy control out of the kernel. In our case,
any of the actions, classifiers, qos schedulers can be experimented
with out of tree with zero patch needs and when ready pushed into the
kernel with zero code changes to the core. So nothing in what we have
says the policy control sits in the kernel.
> I would view it as similar in many ways to the recently added team
> device, which is based on the idea of keeping simple things simple.
Good analogy, but wrong direction: Bonding is a monolithic christmas
tree which people kept adding code to because it was "simpler" to do
so.
Your code is heading that way because as openflow progresses or some
new thing comes along (I notice capwap) you'll be adding more code for
more classifiers and more actions and maybe more schedulers and will
have to replicate things we provide. And they all go into this
monolithic code because it is "simpler".
Is there anything we do that makes it hard for you to use the
infrastructure provided? Is there anything you do that we cant
provide via the classifier-action-scheduler infrastructure?
If you need help let me know.
cheers,
jamal
^ permalink raw reply
* Re: Missing TCP SYN on loopback, retransmits after 1s
From: Hagen Paul Pfeifer @ 2011-11-23 1:44 UTC (permalink / raw)
To: Jesse Young; +Cc: David Miller, netdev
In-Reply-To: <20111122183727.01ab6f04@telperion.jlyo.org>
* Jesse Young | 2011-11-22 18:37:27 [-0600]:
>I presume that the drop is occuring in between the NET layer, and the sys
>call interface, do you agree? Where should I begin looking?
perf script net_dropmonitor.py from Neil should do the job.
HGN
^ permalink raw reply
* Re: WARNING: at mm/slub.c:3357, kernel BUG at mm/slub.c:3413
From: Benjamin Herrenschmidt @ 2011-11-23 1:43 UTC (permalink / raw)
To: Christian Kujau
Cc: Eric Dumazet, Christoph Lameter, Markus Trippelsdorf, Alex,Shi,
linux-kernel@vger.kernel.org, linux-mm@kvack.org, Pekka Enberg,
Matt Mackall, netdev@vger.kernel.org, Tejun Heo
In-Reply-To: <alpine.DEB.2.01.1111221711410.8000@trent.utfs.org>
> > I just want to see whether your network + heavy IO load problem goes
> > away with that one patch.
>
> Sorry, I should have been clearer in that mail: the high "load" value
> isn't a problem - the intermittent panics are. What I meant to say was:
> the panics usually occur when lots of disk & cpu IO is in progress (rsync
> to an external but local disk over firewire). While doing this the load is
> usally at 3-5, but that's "normal" and expected for a machine of that age.
No, I understand your problem. What I meant above is to see whether you
reproduce the crash caused by network + heavy IO :-)
> But then the machine crashes with recent kernels. After setting the
> cpu_partial files to 0 I tried to reproduce the same I/O pattern, *plus* a
> bit more, to really stress the machine, so load went up to 6-7 and the
> machine did not crash. So the load of 6-7 was expected and I'm glad that
> the machine did not crash with that workaround. I don't know of the
> implications of setting cpu_partial to 0 though.
Right. Now we want to check if that patch from Christoph fixes cpu
partial.
> As soon as the build with Christoph's one-liner is done I'll test w/o
> setting cpu_partial to 0 and see what it gives.
Thanks !
Cheers,
Ben.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply
* Re: [PATCH net-next 4/4] net: Add Open vSwitch kernel components.
From: David Miller @ 2011-11-23 1:42 UTC (permalink / raw)
To: jesse-l0M0P4e3n4LQT0dZR+AlfA
Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA,
jhs-jkUAjuhPggJWk0Htik3J/w
In-Reply-To: <CAEP_g=8EqkfRq4XmGy_8-f1FrXrvjHNBT6u70a_2mqDnyp2Miw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
From: Jesse Gross <jesse-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org>
Date: Tue, 22 Nov 2011 17:34:57 -0800
> On Tue, Nov 22, 2011 at 3:19 PM, David Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org> wrote:
>> From: Jesse Gross <jesse-l0M0P4e3n4LQT0dZR+AlfA@public.gmane.org>
>> Date: Tue, 22 Nov 2011 15:11:33 -0800
>>
>>> As you mention, one of the biggest benefits of Open vSwitch is how
>>> simple the kernel portions are (it's less than 6000 lines). It's
>>> existed as an out-of-tree project for several years now so it's
>>> actually fairly mature already and unlikely that there will be a
>>> sudden influx of new code over the coming months.
>>
>> The packet scheduler classification and packet action infrastructure
>> has been around 5 times longer.
>
> The only point that I was trying to make with that comment is that
> it's unlikely that Open vSwitch will suddenly get more complex due to
> the introduction of new code.
And it will get instantly less complex if it uses existing
infrastructure.
There is no reasonable argument against using what we have already.
^ permalink raw reply
* Re: [PATCH net-next 4/4] net: Add Open vSwitch kernel components.
From: Jesse Gross @ 2011-11-23 1:34 UTC (permalink / raw)
To: David Miller
Cc: dev-yBygre7rU0TnMu66kgdUjQ, netdev-u79uwXL29TY76Z2rM5mHXA,
jhs-jkUAjuhPggJWk0Htik3J/w
In-Reply-To: <20111122.181930.1186109067515095173.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
On Tue, Nov 22, 2011 at 3:19 PM, David Miller <davem@davemloft.net> wrote:
> From: Jesse Gross <jesse@nicira.com>
> Date: Tue, 22 Nov 2011 15:11:33 -0800
>
>> As you mention, one of the biggest benefits of Open vSwitch is how
>> simple the kernel portions are (it's less than 6000 lines). It's
>> existed as an out-of-tree project for several years now so it's
>> actually fairly mature already and unlikely that there will be a
>> sudden influx of new code over the coming months.
>
> The packet scheduler classification and packet action infrastructure
> has been around 5 times longer.
The only point that I was trying to make with that comment is that
it's unlikely that Open vSwitch will suddenly get more complex due to
the introduction of new code.
_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev
^ permalink raw reply
* Re: [PATCH 7/8] net/mlx4_en: adding loopback support
From: Mahesh Bandewar @ 2011-11-23 1:34 UTC (permalink / raw)
To: Yevgeny Petrilin; +Cc: davem, netdev, ogerlitz, oren, amirv
In-Reply-To: <4ECA0F4A.30308@mellanox.co.il>
On Mon, Nov 21, 2011 at 12:43 AM, Yevgeny Petrilin
<yevgenyp@mellanox.co.il> wrote:
>
> From: Amir Vadai <amirv@mellanox.co.il>
>
> Device must be in promiscious mode or DMAC must be same as the host MAC, or
> else packet will be dropped by the HW rx filtering.
>
> Signed-off-by: Amir Vadai <amirv@mellanox.co.il>
> ---
> drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 1 +
> drivers/net/ethernet/mellanox/mlx4/en_tx.c | 3 +++
> include/linux/mlx4/qp.h | 1 +
> 3 files changed, 5 insertions(+), 0 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> index 78d776b..55fdbce 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> @@ -1088,6 +1088,7 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
> dev->features = dev->hw_features | NETIF_F_HIGHDMA |
> NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX |
> NETIF_F_HW_VLAN_FILTER;
> + dev->hw_features |= NETIF_F_LOOPBACK;
>
> mdev->pndev[port] = dev;
>
> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
> index 3094f94..f9093b5 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
> @@ -681,6 +681,9 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
> tx_desc->ctrl.fence_size = (real_size / 16) & 0x3f;
> tx_desc->ctrl.srcrb_flags = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE |
> MLX4_WQE_CTRL_SOLICITED);
> + if (dev->features & NETIF_F_LOOPBACK)
> + tx_desc->ctrl.srcrb_flags |=
> + cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK);
This is re-calculated for every xmit. Is it required to be that way?
May be you can pre-calculate it and just assign / add it to the
control-flags here.
> if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) {
> tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM |
> MLX4_WQE_CTRL_TCP_UDP_CSUM);
> diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h
> index 6562ff6..bee8fa2 100644
> --- a/include/linux/mlx4/qp.h
> +++ b/include/linux/mlx4/qp.h
> @@ -210,6 +210,7 @@ struct mlx4_wqe_ctrl_seg {
> * [4] IP checksum
> * [3:2] C (generate completion queue entry)
> * [1] SE (solicited event)
> + * [0] FL (force loopback)
> */
> __be32 srcrb_flags;
> /*
> --
> 1.7.7
>
>
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox