Netdev List
 help / color / mirror / Atom feed
* [PATCH RESEND 03/10] cxgb4: DB Drop Recovery for RDMA and LLD queues.
From: Vipul Pandya @ 2011-10-20 17:11 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: roland-BHEL68pLQRGGvPXPguhicg, davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	divy-ut6Up61K2wZBDgjK7y7TUQ, dm-ut6Up61K2wZBDgjK7y7TUQ,
	kumaras-ut6Up61K2wZBDgjK7y7TUQ,
	swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW, Vipul Pandya

    - recover LLD EQs for DB drop interrupts.  This includes adding a new
    db_lock, a spin lock disabling BH too, used by the recovery thread and
    the ring_tx_db() paths to allow db drop recovery.

    - cleaned up initial db avoidance code.

    - add read_eq_indices() - allows the LLD to use the pcie mw to efficiently
    read hw eq contexts.

    - add cxgb4_sync_txq_pidx() - called by iw_cxgb4 to sync up the sw/hw pidx
    value.

    - add flush_eq_cache() and cxgb4_flush_eq_cache().  This allows iw_cxgb4
    to flush the sge eq context cache before beginning db drop recovery.

    - add module parameter, dbfifo_int_thresh, to allow tuning the db
    interrupt threshold value.

    - add dbfifo_int_thresh to cxgb4_lld_info so iw_cxgb4 knows the threshold.

    - add module parameter, dbfifo_drain_delay, to allow tuning the amount
    of time delay between DB FULL and EMPTY upcalls to iw_cxgb4.

Signed-off-by: Vipul Pandya <vipul-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
Signed-off-by: Steve Wise <swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h      |    7 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c |  214 +++++++++++++++++++----
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h  |    4 +
 drivers/net/ethernet/chelsio/cxgb4/sge.c        |   20 ++-
 drivers/net/ethernet/chelsio/cxgb4/t4_regs.h    |   53 ++++++
 drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h   |   23 +++
 6 files changed, 279 insertions(+), 42 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index e18b5ad..f202cb9 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -51,6 +51,8 @@
 #define FW_VERSION_MINOR 1
 #define FW_VERSION_MICRO 0
 
+#define CH_WARN(adap, fmt, ...) dev_warn(adap->pdev_dev, fmt, ## __VA_ARGS__)
+
 enum {
 	MAX_NPORTS = 4,     /* max # of ports */
 	SERNUM_LEN = 24,    /* Serial # length */
@@ -403,6 +405,9 @@ struct sge_txq {
 	struct tx_sw_desc *sdesc;   /* address of SW Tx descriptor ring */
 	struct sge_qstat *stat;     /* queue status entry */
 	dma_addr_t    phys_addr;    /* physical address of the ring */
+	spinlock_t db_lock;
+	int db_disabled;
+	unsigned short db_pidx;
 };
 
 struct sge_eth_txq {                /* state for an SGE Ethernet Tx queue */
@@ -475,6 +480,7 @@ struct adapter {
 	void __iomem *regs;
 	struct pci_dev *pdev;
 	struct device *pdev_dev;
+	unsigned int mbox;
 	unsigned int fn;
 	unsigned int flags;
 
@@ -607,6 +613,7 @@ irqreturn_t t4_sge_intr_msix(int irq, void *cookie);
 void t4_sge_init(struct adapter *adap);
 void t4_sge_start(struct adapter *adap);
 void t4_sge_stop(struct adapter *adap);
+extern int dbfifo_int_thresh;
 
 #define for_each_port(adapter, iter) \
 	for (iter = 0; iter < (adapter)->params.nports; ++iter)
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 870c320..64ad1c8 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -149,15 +149,6 @@ static unsigned int pfvfres_pmask(struct adapter *adapter,
 #endif
 
 enum {
-	MEMWIN0_APERTURE = 65536,
-	MEMWIN0_BASE     = 0x30000,
-	MEMWIN1_APERTURE = 32768,
-	MEMWIN1_BASE     = 0x28000,
-	MEMWIN2_APERTURE = 2048,
-	MEMWIN2_BASE     = 0x1b800,
-};
-
-enum {
 	MAX_TXQ_ENTRIES      = 16384,
 	MAX_CTRL_TXQ_ENTRIES = 1024,
 	MAX_RSPQ_ENTRIES     = 16384,
@@ -369,6 +360,15 @@ static int set_addr_filters(const struct net_device *dev, bool sleep)
 				uhash | mhash, sleep);
 }
 
+int dbfifo_int_thresh = 10; /* 10 == 640 entry threshold */
+module_param(dbfifo_int_thresh, int, 0644);
+MODULE_PARM_DESC(dbfifo_int_thresh, "doorbell fifo interrupt threshold");
+
+int dbfifo_drain_delay = 1000; /* usecs to sleep while draining the dbfifo */
+module_param(dbfifo_drain_delay, int, 0644);
+MODULE_PARM_DESC(dbfifo_drain_delay,
+		 "usecs to sleep while draining the dbfifo");
+
 /*
  * Set Rx properties of a port, such as promiscruity, address filters, and MTU.
  * If @mtu is -1 it is left unchanged.
@@ -387,6 +387,8 @@ static int set_rxmode(struct net_device *dev, int mtu, bool sleep_ok)
 	return ret;
 }
 
+static struct workqueue_struct *workq;
+
 /**
  *	link_start - enable a port
  *	@dev: the port to enable
@@ -2201,7 +2203,7 @@ static void cxgb4_queue_tid_release(struct tid_info *t, unsigned int chan,
 	adap->tid_release_head = (void **)((uintptr_t)p | chan);
 	if (!adap->tid_release_task_busy) {
 		adap->tid_release_task_busy = true;
-		schedule_work(&adap->tid_release_task);
+		queue_work(workq, &adap->tid_release_task);
 	}
 	spin_unlock_bh(&adap->tid_release_lock);
 }
@@ -2428,6 +2430,59 @@ void cxgb4_iscsi_init(struct net_device *dev, unsigned int tag_mask,
 }
 EXPORT_SYMBOL(cxgb4_iscsi_init);
 
+int cxgb4_flush_eq_cache(struct net_device *dev)
+{
+	struct adapter *adap = netdev2adap(dev);
+	int ret;
+
+	ret = t4_fwaddrspace_write(adap, adap->mbox,
+				   0xe1000000 + A_SGE_CTXT_CMD, 0x20000000);
+	return ret;
+}
+EXPORT_SYMBOL(cxgb4_flush_eq_cache);
+
+static int read_eq_indices(struct adapter *adap, u16 qid, u16 *pidx, u16 *cidx)
+{
+	u32 addr = t4_read_reg(adap, A_SGE_DBQ_CTXT_BADDR) + 24 * qid + 8;
+	__be64 indices;
+	int ret;
+
+	ret = t4_mem_win_read_len(adap, addr, (__be32 *)&indices, 8);
+	if (!ret) {
+		indices = be64_to_cpu(indices);
+		*cidx = (indices >> 25) & 0xffff;
+		*pidx = (indices >> 9) & 0xffff;
+	}
+	return ret;
+}
+
+int cxgb4_sync_txq_pidx(struct net_device *dev, u16 qid, u16 pidx,
+			u16 size)
+{
+	struct adapter *adap = netdev2adap(dev);
+	u16 hw_pidx, hw_cidx;
+	int ret;
+
+	ret = read_eq_indices(adap, qid, &hw_pidx, &hw_cidx);
+	if (ret)
+		goto out;
+
+	if (pidx != hw_pidx) {
+		u16 delta;
+
+		if (pidx >= hw_pidx)
+			delta = pidx - hw_pidx;
+		else
+			delta = size - hw_pidx + pidx;
+		wmb();
+		t4_write_reg(adap, MYPF_REG(A_SGE_PF_KDOORBELL),
+			     V_QID(qid) | V_PIDX(delta));
+	}
+out:
+	return ret;
+}
+EXPORT_SYMBOL(cxgb4_sync_txq_pidx);
+
 static struct pci_driver cxgb4_driver;
 
 static void check_neigh_update(struct neighbour *neigh)
@@ -2461,6 +2516,95 @@ static struct notifier_block cxgb4_netevent_nb = {
 	.notifier_call = netevent_cb
 };
 
+static void drain_db_fifo(struct adapter *adap, int usecs)
+{
+	u32 v;
+
+	do {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(usecs_to_jiffies(usecs));
+		v = t4_read_reg(adap, A_SGE_DBFIFO_STATUS);
+		if (G_LP_COUNT(v) == 0 && G_HP_COUNT(v) == 0)
+			break;
+	} while (1);
+}
+
+static void disable_txq_db(struct sge_txq *q)
+{
+	spin_lock_irq(&q->db_lock);
+	q->db_disabled = 1;
+	spin_unlock_irq(&q->db_lock);
+}
+
+static void enable_txq_db(struct sge_txq *q)
+{
+	spin_lock_irq(&q->db_lock);
+	q->db_disabled = 0;
+	spin_unlock_irq(&q->db_lock);
+}
+
+static void disable_dbs(struct adapter *adap)
+{
+	int i;
+
+	for_each_ethrxq(&adap->sge, i)
+		disable_txq_db(&adap->sge.ethtxq[i].q);
+	for_each_ofldrxq(&adap->sge, i)
+		disable_txq_db(&adap->sge.ofldtxq[i].q);
+	for_each_port(adap, i)
+		disable_txq_db(&adap->sge.ctrlq[i].q);
+}
+
+static void enable_dbs(struct adapter *adap)
+{
+	int i;
+
+	for_each_ethrxq(&adap->sge, i)
+		enable_txq_db(&adap->sge.ethtxq[i].q);
+	for_each_ofldrxq(&adap->sge, i)
+		enable_txq_db(&adap->sge.ofldtxq[i].q);
+	for_each_port(adap, i)
+		enable_txq_db(&adap->sge.ctrlq[i].q);
+}
+
+static void sync_txq_pidx(struct adapter *adap, struct sge_txq *q)
+{
+	u16 hw_pidx, hw_cidx;
+	int ret;
+
+	spin_lock_bh(&q->db_lock);
+	ret = read_eq_indices(adap, (u16)q->cntxt_id, &hw_pidx, &hw_cidx);
+	if (ret)
+		goto out;
+	if (q->db_pidx != hw_pidx) {
+		u16 delta;
+
+		if (q->db_pidx >= hw_pidx)
+			delta = q->db_pidx - hw_pidx;
+		else
+			delta = q->size - hw_pidx + q->db_pidx;
+		wmb();
+		t4_write_reg(adap, MYPF_REG(A_SGE_PF_KDOORBELL),
+				V_QID(q->cntxt_id) | V_PIDX(delta));
+	}
+out:
+	q->db_disabled = 0;
+	spin_unlock_bh(&q->db_lock);
+	if (ret)
+		CH_WARN(adap, "DB drop recovery failed.\n");
+}
+static void recover_all_queues(struct adapter *adap)
+{
+	int i;
+
+	for_each_ethrxq(&adap->sge, i)
+		sync_txq_pidx(adap, &adap->sge.ethtxq[i].q);
+	for_each_ofldrxq(&adap->sge, i)
+		sync_txq_pidx(adap, &adap->sge.ofldtxq[i].q);
+	for_each_port(adap, i)
+		sync_txq_pidx(adap, &adap->sge.ctrlq[i].q);
+}
+
 static void notify_rdma_uld(struct adapter *adap, enum cxgb4_control cmd)
 {
 	mutex_lock(&uld_mutex);
@@ -2473,55 +2617,41 @@ static void notify_rdma_uld(struct adapter *adap, enum cxgb4_control cmd)
 static void process_db_full(struct work_struct *work)
 {
 	struct adapter *adap;
-	static int delay = 1000;
-	u32 v;
 
 	adap = container_of(work, struct adapter, db_full_task);
 
-
-	/* stop LLD queues */
-
 	notify_rdma_uld(adap, CXGB4_CONTROL_DB_FULL);
-	do {
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		schedule_timeout(usecs_to_jiffies(delay));
-		v = t4_read_reg(adap, A_SGE_DBFIFO_STATUS);
-		if (G_LP_COUNT(v) == 0 && G_HP_COUNT(v) == 0)
-			break;
-	} while (1);
+	drain_db_fifo(adap, dbfifo_drain_delay);
+	t4_set_reg_field(adap, A_SGE_INT_ENABLE3,
+			F_DBFIFO_HP_INT | F_DBFIFO_LP_INT,
+			F_DBFIFO_HP_INT | F_DBFIFO_LP_INT);
 	notify_rdma_uld(adap, CXGB4_CONTROL_DB_EMPTY);
-
-
-	/*
-	 * The more we get db full interrupts, the more we'll delay
-	 * in re-enabling db rings on queues, capped off at 200ms.
-	 */
-	delay = min(delay << 1, 200000);
-
-	/* resume LLD queues */
 }
 
 static void process_db_drop(struct work_struct *work)
 {
 	struct adapter *adap;
-	adap = container_of(work, struct adapter, db_drop_task);
 
+	adap = container_of(work, struct adapter, db_drop_task);
 
-	/*
-	 * sync the PIDX values in HW and SW for LLD queues.
-	 */
-
+	t4_set_reg_field(adap, A_SGE_DOORBELL_CONTROL, F_DROPPED_DB, 0);
+	disable_dbs(adap);
 	notify_rdma_uld(adap, CXGB4_CONTROL_DB_DROP);
+	drain_db_fifo(adap, 1);
+	recover_all_queues(adap);
+	enable_dbs(adap);
 }
 
 void t4_db_full(struct adapter *adap)
 {
-	schedule_work(&adap->db_full_task);
+	t4_set_reg_field(adap, A_SGE_INT_ENABLE3,
+			F_DBFIFO_HP_INT | F_DBFIFO_LP_INT, 0);
+	queue_work(workq, &adap->db_full_task);
 }
 
 void t4_db_dropped(struct adapter *adap)
 {
-	schedule_work(&adap->db_drop_task);
+	queue_work(workq, &adap->db_drop_task);
 }
 
 static void uld_attach(struct adapter *adap, unsigned int uld)
@@ -2557,6 +2687,7 @@ static void uld_attach(struct adapter *adap, unsigned int uld)
 	lli.gts_reg = adap->regs + MYPF_REG(SGE_PF_GTS);
 	lli.db_reg = adap->regs + MYPF_REG(SGE_PF_KDOORBELL);
 	lli.fw_vers = adap->params.fw_vers;
+	lli.dbfifo_int_thresh = dbfifo_int_thresh;
 
 	handle = ulds[uld].add(&lli);
 	if (IS_ERR(handle)) {
@@ -3673,6 +3804,7 @@ static int __devinit init_one(struct pci_dev *pdev,
 
 	adapter->pdev = pdev;
 	adapter->pdev_dev = &pdev->dev;
+	adapter->mbox = func;
 	adapter->fn = func;
 	adapter->msg_enable = dflt_msg_enable;
 	memset(adapter->chan_map, 0xff, sizeof(adapter->chan_map));
@@ -3868,6 +4000,10 @@ static int __init cxgb4_init_module(void)
 {
 	int ret;
 
+	workq = create_singlethread_workqueue("cxgb4");
+	if (!workq)
+		return -ENOMEM;
+
 	/* Debugfs support is optional, just warn if this fails */
 	cxgb4_debugfs_root = debugfs_create_dir(KBUILD_MODNAME, NULL);
 	if (!cxgb4_debugfs_root)
@@ -3883,6 +4019,8 @@ static void __exit cxgb4_cleanup_module(void)
 {
 	pci_unregister_driver(&cxgb4_driver);
 	debugfs_remove(cxgb4_debugfs_root);  /* NULL ok */
+	flush_workqueue(workq);
+	destroy_workqueue(workq);
 }
 
 module_init(cxgb4_init_module);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
index 5cc2f27..d79980c 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
@@ -218,6 +218,7 @@ struct cxgb4_lld_info {
 	unsigned short ucq_density;          /* # of user CQs/page */
 	void __iomem *gts_reg;               /* address of GTS register */
 	void __iomem *db_reg;                /* address of kernel doorbell */
+	int dbfifo_int_thresh;		     /* doorbell fifo int threshold */
 };
 
 struct cxgb4_uld_info {
@@ -226,6 +227,7 @@ struct cxgb4_uld_info {
 	int (*rx_handler)(void *handle, const __be64 *rsp,
 			  const struct pkt_gl *gl);
 	int (*state_change)(void *handle, enum cxgb4_state new_state);
+	int (*control)(void *handle, enum cxgb4_control control, ...);
 };
 
 int cxgb4_register_uld(enum cxgb4_uld type, const struct cxgb4_uld_info *p);
@@ -243,4 +245,6 @@ void cxgb4_iscsi_init(struct net_device *dev, unsigned int tag_mask,
 		      const unsigned int *pgsz_order);
 struct sk_buff *cxgb4_pktgl_to_skb(const struct pkt_gl *gl,
 				   unsigned int skb_len, unsigned int pull_len);
+int cxgb4_sync_txq_pidx(struct net_device *dev, u16 qid, u16 pidx, u16 size);
+int cxgb4_flush_eq_cache(struct net_device *dev);
 #endif  /* !__CXGB4_OFLD_H */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c
index 3631fbb..65ecf1e 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c
@@ -767,8 +767,13 @@ static void write_sgl(const struct sk_buff *skb, struct sge_txq *q,
 static inline void ring_tx_db(struct adapter *adap, struct sge_txq *q, int n)
 {
 	wmb();            /* write descriptors before telling HW */
-	t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL),
-		     QID(q->cntxt_id) | PIDX(n));
+	spin_lock(&q->db_lock);
+	if (!q->db_disabled) {
+		t4_write_reg(adap, MYPF_REG(A_SGE_PF_KDOORBELL),
+			     V_QID(q->cntxt_id) | V_PIDX(n));
+	}
+	q->db_pidx = q->pidx;
+	spin_unlock(&q->db_lock);
 }
 
 /**
@@ -2080,6 +2085,7 @@ static void init_txq(struct adapter *adap, struct sge_txq *q, unsigned int id)
 	q->stops = q->restarts = 0;
 	q->stat = (void *)&q->desc[q->size];
 	q->cntxt_id = id;
+	spin_lock_init(&q->db_lock);
 	adap->sge.egr_map[id - adap->sge.egr_start] = q;
 }
 
@@ -2414,9 +2420,15 @@ void t4_sge_init(struct adapter *adap)
 			 RXPKTCPLMODE |
 			 (STAT_LEN == 128 ? EGRSTATUSPAGESIZE : 0));
 
+	/*
+	 * Set up to drop DOORBELL writes when the DOORBELL FIFO overflows
+	 * and generate an interrupt when this occurs so we can recover.
+	 */
 	t4_set_reg_field(adap, A_SGE_DBFIFO_STATUS,
-			V_HP_INT_THRESH(5) | V_LP_INT_THRESH(5),
-			V_HP_INT_THRESH(5) | V_LP_INT_THRESH(5));
+			V_HP_INT_THRESH(M_HP_INT_THRESH) |
+			V_LP_INT_THRESH(M_LP_INT_THRESH),
+			V_HP_INT_THRESH(dbfifo_int_thresh) |
+			V_LP_INT_THRESH(dbfifo_int_thresh));
 	t4_set_reg_field(adap, A_SGE_DOORBELL_CONTROL, F_ENABLE_DROP,
 			F_ENABLE_DROP);
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
index 0adc5bc..111fc32 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
@@ -190,6 +190,59 @@
 #define SGE_DEBUG_DATA_LOW 0x10d4
 #define SGE_INGRESS_QUEUES_PER_PAGE_PF 0x10f4
 
+#define S_LP_INT_THRESH    12
+#define V_LP_INT_THRESH(x) ((x) << S_LP_INT_THRESH)
+#define S_HP_INT_THRESH    28
+#define V_HP_INT_THRESH(x) ((x) << S_HP_INT_THRESH)
+#define A_SGE_DBFIFO_STATUS 0x10a4
+
+#define S_ENABLE_DROP    13
+#define V_ENABLE_DROP(x) ((x) << S_ENABLE_DROP)
+#define F_ENABLE_DROP    V_ENABLE_DROP(1U)
+#define A_SGE_DOORBELL_CONTROL 0x10a8
+
+#define A_SGE_CTXT_CMD 0x11fc
+#define A_SGE_DBQ_CTXT_BADDR 0x1084
+
+#define A_SGE_PF_KDOORBELL 0x0
+
+#define S_QID 15
+#define V_QID(x) ((x) << S_QID)
+
+#define S_PIDX 0
+#define V_PIDX(x) ((x) << S_PIDX)
+
+#define M_LP_COUNT 0x7ffU
+#define S_LP_COUNT 0
+#define G_LP_COUNT(x) (((x) >> S_LP_COUNT) & M_LP_COUNT)
+
+#define M_HP_COUNT 0x7ffU
+#define S_HP_COUNT 16
+#define G_HP_COUNT(x) (((x) >> S_HP_COUNT) & M_HP_COUNT)
+
+#define A_SGE_INT_ENABLE3 0x1040
+
+#define S_DBFIFO_HP_INT 8
+#define V_DBFIFO_HP_INT(x) ((x) << S_DBFIFO_HP_INT)
+#define F_DBFIFO_HP_INT V_DBFIFO_HP_INT(1U)
+
+#define S_DBFIFO_LP_INT 7
+#define V_DBFIFO_LP_INT(x) ((x) << S_DBFIFO_LP_INT)
+#define F_DBFIFO_LP_INT V_DBFIFO_LP_INT(1U)
+
+#define S_DROPPED_DB 0
+#define V_DROPPED_DB(x) ((x) << S_DROPPED_DB)
+#define F_DROPPED_DB V_DROPPED_DB(1U)
+
+#define S_ERR_DROPPED_DB 18
+#define V_ERR_DROPPED_DB(x) ((x) << S_ERR_DROPPED_DB)
+#define F_ERR_DROPPED_DB V_ERR_DROPPED_DB(1U)
+
+#define A_PCIE_MEM_ACCESS_OFFSET 0x306c
+
+#define M_HP_INT_THRESH 0xfU
+#define M_LP_INT_THRESH 0xfU
+
 #define PCIE_PF_CLI 0x44
 #define PCIE_INT_CAUSE 0x3004
 #define  UNXSPLCPLERR  0x20000000U
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
index 83ca454..0579e98 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
@@ -1625,4 +1625,27 @@ int t4_mem_win_read_len(struct adapter *adap, u32 addr, __be32 *data, int len);
 int t4_fwaddrspace_write(struct adapter *adap, unsigned int mbox,
 			  u32 addr, u32 val);
 
+#define S_FW_CMD_OP 24
+#define V_FW_CMD_OP(x) ((x) << S_FW_CMD_OP)
+
+#define S_FW_CMD_REQUEST 23
+#define V_FW_CMD_REQUEST(x) ((x) << S_FW_CMD_REQUEST)
+#define F_FW_CMD_REQUEST V_FW_CMD_REQUEST(1U)
+
+#define S_FW_CMD_WRITE 21
+#define V_FW_CMD_WRITE(x) ((x) << S_FW_CMD_WRITE)
+#define F_FW_CMD_WRITE V_FW_CMD_WRITE(1U)
+
+#define S_FW_LDST_CMD_ADDRSPACE 0
+#define V_FW_LDST_CMD_ADDRSPACE(x) ((x) << S_FW_LDST_CMD_ADDRSPACE)
+
+enum {
+	MEMWIN0_APERTURE = 65536,
+	MEMWIN0_BASE     = 0x30000,
+	MEMWIN1_APERTURE = 32768,
+	MEMWIN1_BASE     = 0x28000,
+	MEMWIN2_APERTURE = 2048,
+	MEMWIN2_BASE     = 0x1b800,
+};
+
 #endif /* _T4FW_INTERFACE_H_ */
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* Re: [PATCH 07/10] RDMA/cxgb4: DB Drop Recovery for RDMA and LLD queues.
From: Roland Dreier @ 2011-10-20 17:17 UTC (permalink / raw)
  To: Steve Wise
  Cc: Vipul Pandya, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	divy-ut6Up61K2wZBDgjK7y7TUQ, dm-ut6Up61K2wZBDgjK7y7TUQ,
	kumaras-ut6Up61K2wZBDgjK7y7TUQ
In-Reply-To: <4EA030F5.3000007-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>

> I believe 5 and 7 have build dependencies.

Right, missed that one too.

But it seems 4,6,8,9,10 are independent of the rest of the series?

ie I can trivially apply them and then worry about working out
the drivers/net / drivers/infiniband interdependency a bit later?

 - R.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* RE: [PATCH V2 02/10] cxgb4: Common platform specific changes for DB Drop Recovery
From: Felix Marti @ 2011-10-20 17:18 UTC (permalink / raw)
  To: Vipul Pandya, linux-rdma, netdev
  Cc: roland, davem, Divy Le Ray, Dimitrios Michailidis, Kumar A S,
	Steve Wise, Vipul Pandya
In-Reply-To: <1319130620-9240-1-git-send-email-vipul@chelsio.com>

Don't add stuff to the t4fw_interface.h, that is owned by firmware.

> -----Original Message-----
> From: linux-rdma-owner@vger.kernel.org [mailto:linux-rdma-
> owner@vger.kernel.org] On Behalf Of Vipul Pandya
> Sent: Thursday, October 20, 2011 10:10 AM
> To: linux-rdma@vger.kernel.org; netdev@vger.kernel.org
> Cc: roland@purestorage.com; davem@davemloft.net; Divy Le Ray;
Dimitrios
> Michailidis; Kumar A S; Steve Wise; Vipul Pandya
> Subject: [PATCH V2 02/10] cxgb4: Common platform specific changes for
DB
> Drop Recovery
> 
>     - Add platform-specific callback functions for interrupts.  This
is
>     needed to do a single read-clear of the CAUSE register and then
call
>     out to platform specific functions for DB threshold interrupts and
DB
>     drop interrupts.
> 
>     - Add t4_mem_win_read_len() - mem-window reads for arbitrary
lengths.
>     This is used to read the CIDX/PIDX values from EC contexts during
DB
>     drop recovery.
> 
>     - Add t4_fwaddrspace_write() - sends addrspace write cmds to the
fw.
>     Needed to flush the sge eq context cache.
> 
> Signed-off-by: Vipul Pandya <vipul@chelsio.com>
> Signed-off-by: Steve Wise <swise@opengridcomputing.com>
> ---
> Changes:
> V2: Corrected the subject for patch.
> 
>  drivers/net/ethernet/chelsio/cxgb4/t4_hw.c    |   69
> +++++++++++++++++++++----
>  drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h |    5 ++
>  2 files changed, 63 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
> b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
> index 13609bf..32e1dd5 100644
> --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
> +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
> @@ -868,11 +868,14 @@ int t4_restart_aneg(struct adapter *adap,
unsigned
> int mbox, unsigned int port)
>  	return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL);  }
> 
> +typedef void (*int_handler_t)(struct adapter *adap);
> +
>  struct intr_info {
>  	unsigned int mask;       /* bits to check in interrupt status */
>  	const char *msg;         /* message to print or NULL */
>  	short stat_idx;          /* stat counter to increment or -1 */
>  	unsigned short fatal;    /* whether the condition reported is
fatal */
> +	int_handler_t int_handler; /* platform-specific int handler */
>  };
> 
>  /**
> @@ -905,6 +908,8 @@ static int t4_handle_intr_status(struct adapter
> *adapter, unsigned int reg,
>  		} else if (acts->msg && printk_ratelimit())
>  			dev_warn(adapter->pdev_dev, "%s (0x%x)\n", acts-
> >msg,
>  				 status & acts->mask);
> +		if (acts->int_handler)
> +			acts->int_handler(adapter);
>  		mask |= acts->mask;
>  	}
>  	status &= mask;
> @@ -1013,9 +1018,9 @@ static void sge_intr_handler(struct adapter
> *adapter)
>  		{ ERR_INVALID_CIDX_INC,
>  		  "SGE GTS CIDX increment too large", -1, 0 },
>  		{ ERR_CPL_OPCODE_0, "SGE received 0-length CPL", -1, 0
},
> -		{ F_DBFIFO_LP_INT, NULL, -1, 0 },
> -		{ F_DBFIFO_HP_INT, NULL, -1, 0 },
> -		{ ERR_DROPPED_DB, "SGE doorbell dropped", -1, 0 },
> +		{ F_DBFIFO_LP_INT, NULL, -1, 0, t4_db_full },
> +		{ F_DBFIFO_HP_INT, NULL, -1, 0, t4_db_full },
> +		{ F_ERR_DROPPED_DB, NULL, -1, 0, t4_db_dropped },
>  		{ ERR_DATA_CPL_ON_HIGH_QID1 |
> ERR_DATA_CPL_ON_HIGH_QID0,
>  		  "SGE IQID > 1023 received CPL for FL", -1, 0 },
>  		{ ERR_BAD_DB_PIDX3, "SGE DBP 3 pidx increment too
large",
> -1, @@ -1036,20 +1041,14 @@ static void sge_intr_handler(struct
adapter
> *adapter)
>  	};
> 
>  	v = (u64)t4_read_reg(adapter, SGE_INT_CAUSE1) |
> -	    ((u64)t4_read_reg(adapter, SGE_INT_CAUSE2) << 32);
> +		((u64)t4_read_reg(adapter, SGE_INT_CAUSE2) << 32);
>  	if (v) {
>  		dev_alert(adapter->pdev_dev, "SGE parity error
(%#llx)\n",
> -			 (unsigned long long)v);
> +				(unsigned long long)v);
>  		t4_write_reg(adapter, SGE_INT_CAUSE1, v);
>  		t4_write_reg(adapter, SGE_INT_CAUSE2, v >> 32);
>  	}
> 
> -	err = t4_read_reg(adapter, A_SGE_INT_CAUSE3);
> -	if (err & (F_DBFIFO_HP_INT|F_DBFIFO_LP_INT))
> -		t4_db_full(adapter);
> -	if (err & F_ERR_DROPPED_DB)
> -		t4_db_dropped(adapter);
> -
>  	if (t4_handle_intr_status(adapter, SGE_INT_CAUSE3,
sge_intr_info)
> ||
>  	    v != 0)
>  		t4_fatal_err(adapter);
> @@ -1995,6 +1994,54 @@ int t4_wol_pat_enable(struct adapter *adap,
> unsigned int port, unsigned int map,
>  	(var).retval_len16 = htonl(FW_LEN16(var)); \  } while (0)
> 
> +int t4_fwaddrspace_write(struct adapter *adap, unsigned int mbox,
> +			  u32 addr, u32 val)
> +{
> +	struct fw_ldst_cmd c;
> +
> +	memset(&c, 0, sizeof(c));
> +	c.op_to_addrspace = htonl(V_FW_CMD_OP(FW_LDST_CMD) |
> F_FW_CMD_REQUEST |
> +			    F_FW_CMD_WRITE |
> +
> V_FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_FIRMWARE));
> +	c.cycles_to_len16 = htonl(FW_LEN16(c));
> +	c.u.addrval.addr = htonl(addr);
> +	c.u.addrval.val = htonl(val);
> +
> +	return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL); }
> +
> +/*
> + *     t4_mem_win_read_len - read memory through PCIE memory window
> + *     @adap: the adapter
> + *     @addr: address of first byte requested aligned on 32b.
> + *     @data: len bytes to hold the data read
> + *     @len: amount of data to read from window.  Must be <=
> + *            MEMWIN0_APERATURE after adjusting for 16B alignment
> + *            requirements of the the memory window.
> + *
> + *     Read len bytes of data from MC starting at @addr.
> + */
> +int t4_mem_win_read_len(struct adapter *adap, u32 addr, __be32 *data,
> +int len) {
> +	int i;
> +	int off;
> +
> +	/*
> +	 * Align on a 16B boundary.
> +	 */
> +	off = addr & 15;
> +	if ((addr & 3) || (len + off) > MEMWIN0_APERTURE)
> +		return -EINVAL;
> +
> +	t4_write_reg(adap, A_PCIE_MEM_ACCESS_OFFSET, addr & ~15);
> +	t4_read_reg(adap, A_PCIE_MEM_ACCESS_OFFSET);
> +
> +	for (i = 0; i < len; i += 4)
> +		*data++ = t4_read_reg(adap, (MEMWIN0_BASE + off + i));
> +
> +	return 0;
> +}
> +
>  /**
>   *	t4_mdio_rd - read a PHY register through MDIO
>   *	@adap: the adapter
> diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
> b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
> index edcfd7e..83ca454 100644
> --- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
> +++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
> @@ -1620,4 +1620,9 @@ struct fw_hdr {
>  #define FW_HDR_FW_VER_MINOR_GET(x) (((x) >> 16) & 0xff)  #define
> FW_HDR_FW_VER_MICRO_GET(x) (((x) >> 8) & 0xff)  #define
> FW_HDR_FW_VER_BUILD_GET(x) (((x) >> 0) & 0xff)
> +
> +int t4_mem_win_read_len(struct adapter *adap, u32 addr, __be32 *data,
> +int len); int t4_fwaddrspace_write(struct adapter *adap, unsigned int
> mbox,
> +			  u32 addr, u32 val);
> +
>  #endif /* _T4FW_INTERFACE_H_ */
> --
> 1.7.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma"
in the
> body of a message to majordomo@vger.kernel.org More majordomo info at
> http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH][RFC] ipconfig: Add new kernel parameter to force MAC address
From: Naohiro Aota @ 2011-10-20 17:27 UTC (permalink / raw)
  To: David S. Miller, Alexey Kuznetsov, James Morris,
	Hideaki YOSHIFUJI, Patrick McHardy <kabe
  Cc: Naohiro Aota, masami.hiramatsu.pt, linux-kernel, netdev

Some boards (such as the Pandaboard) do not have a persistent MAC
address, so the kernel generates a random MAC address for them.
Usually you can configure the device to use a static MAC address with
userland tools. However, that is not possible when you are using an
NFS root, since the kernel has already set the random MAC address and
you cannot change it after the system boot-up process.

This patch adds a new kernel parameter, "mac". When this parameter is
used, the kernel auto-configuration routine forces the network device
to use the specified MAC address.

Signed-off-by: Naohiro Aota <naota@elisp.net>
---

Here is Pandaboard block diagram. http://pandaboard.org/node/223/

As you can see, the ethernet device is connected via USB, so the
kernel uses smsc95xx to configure this device. Since the device lacks
an EEPROM, it doesn't have its own MAC address and the kernel assigns
a random MAC address to it.

I've also considered a way to force random_ether_addr() to use a MAC
address specified by a kernel parameter. However, there are two problems
with it.

1. The function is used in many places across the Linux kernel tree, so
   this would be a difficult approach to take.

2. If more than one ethernet device is available, one cannot
   tell which device gets the MAC address.

 Documentation/filesystems/nfs/nfsroot.txt |    8 +++
 Documentation/kernel-parameters.txt       |    3 +
 net/ipv4/Kconfig                          |   13 ++++
 net/ipv4/ipconfig.c                       |   91 +++++++++++++++++++++++++++++
 4 files changed, 115 insertions(+), 0 deletions(-)

diff --git a/Documentation/filesystems/nfs/nfsroot.txt b/Documentation/filesystems/nfs/nfsroot.txt
index ffdd9d8..fa2bea3 100644
--- a/Documentation/filesystems/nfs/nfsroot.txt
+++ b/Documentation/filesystems/nfs/nfsroot.txt
@@ -159,6 +159,14 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>
                 Default: any
 
 
+mac=<mac-address>
+
+  This parameter tells the kernel to try using the specified MAC
+  address. This is useful when your network device does not have its
+  own MAC address and so the kernel selects some random address for the
+  device.
+
+
 nfsrootdebug
 
   This parameter enables debugging messages to appear in the kernel
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index d6e6724..4bc7d84 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1333,6 +1333,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 	ltpc=		[NET]
 			Format: <io>,<irq>,<dma>
 
+	mac=		[IP_PNP]
+			See Documentation/filesystems/nfs/nfsroot.txt.
+
 	machvec=	[IA-64] Force the use of a particular machine-vector
 			(machvec) in a generic kernel.
 			Example: machvec=hpzx1_swiotlb
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index cbb505b..ac4da23 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -163,6 +163,19 @@ config IP_PNP_RARP
 	  operating on your network. Read
 	  <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
 
+config IP_PNP_SPECIFIC_MAC
+       bool "IP: Specific MAC address support"
+       depends on IP_PNP
+       help
+	 If you want your Linux box to mount its whole root file system (the
+	 one containing the directory /) from some other computer over the net
+	 via NFS and you want the IP address of your computer to be discovered
+	 automatically at boot time using some protocols like DHCP, BOOTP or
+	 RARP but your network device does not have a static MAC address, you
+	 may want to say Y here. This option adds the kernel boot parameter
+	 mac= to specify the MAC address to be used for automatic IP address
+	 configuration.
+
 # not yet ready..
 #   bool '    IP: ARP support' CONFIG_IP_PNP_ARP
 config NET_IPIP
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 472a8c4..9075abc 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -82,6 +82,9 @@
 #if defined(CONFIG_IP_PNP_RARP)
 #define IPCONFIG_RARP
 #endif
+#if defined(CONFIG_IP_PNP_SPECIFIC_MAC)
+#define IPCONFIG_MAC
+#endif
 #if defined(IPCONFIG_BOOTP) || defined(IPCONFIG_RARP)
 #define IPCONFIG_DYNAMIC
 #endif
@@ -145,6 +148,11 @@ u32 ic_dev_xid;		/* Device under configuration */
 /* vendor class identifier */
 static char vendor_class_identifier[253] __initdata;
 
+#ifdef IPCONFIG_MAC
+static int ic_mac_addr_set __initdata;
+static u8 ic_mac[ETH_ALEN] __initdata;
+#endif
+
 /* Persistent data: */
 
 static int ic_proto_used;			/* Protocol used, if any */
@@ -1120,6 +1128,26 @@ drop:
 
 #ifdef IPCONFIG_DYNAMIC
 
+#ifdef IPCONFIG_MAC
+static int __init change_mac(struct ic_device *d, u8 *new_mac)
+{
+	struct sockaddr s_addr;
+	int err;
+
+	memcpy(s_addr.sa_data, new_mac, ETH_ALEN);
+	s_addr.sa_family = d->dev->type;
+
+	rtnl_lock();
+	dev_change_flags(d->dev, d->flags);
+	err = dev_set_mac_address(d->dev, &s_addr);
+	dev_change_flags(d->dev, d->flags | IFF_UP);
+	rtnl_unlock();
+	msleep(CONF_POST_OPEN);
+
+	return err;
+}
+#endif
+
 static int __init ic_dynamic(void)
 {
 	int retries;
@@ -1127,6 +1155,10 @@ static int __init ic_dynamic(void)
 	unsigned long start_jiffies, timeout, jiff;
 	int do_bootp = ic_proto_have_if & IC_BOOTP;
 	int do_rarp = ic_proto_have_if & IC_RARP;
+#ifdef IPCONFIG_MAC
+	u8 mac_before[ETH_ALEN];
+	struct ic_device *mac_changed_dev = NULL;
+#endif
 
 	/*
 	 * If none of DHCP/BOOTP/RARP was selected, return with an error.
@@ -1183,6 +1215,31 @@ static int __init ic_dynamic(void)
 	get_random_bytes(&timeout, sizeof(timeout));
 	timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM);
 	for (;;) {
+#ifdef IPCONFIG_MAC
+		if (ic_mac_addr_set && d != mac_changed_dev) {
+			/* restore MAC */
+			if (mac_changed_dev) {
+				if (change_mac(mac_changed_dev,
+					       mac_before) == 0) {
+					printk(KERN_INFO "MAC address on %s restored to %pM\n",
+					       mac_changed_dev->dev->name,
+					       mac_before);
+				}
+			}
+
+			/* backup MAC */
+			memcpy(mac_before, d->dev->dev_addr, ETH_ALEN);
+
+			mac_changed_dev = NULL;
+			/* set MAC address to specified one */
+			if (change_mac(d, ic_mac) == 0) {
+				printk(KERN_INFO "MAC address on %s changed to %pM\n",
+				       d->dev->name, ic_mac);
+				mac_changed_dev = d;
+			}
+		}
+#endif
+
 		/* Track the device we are configuring */
 		ic_dev_xid = d->xid;
 
@@ -1242,6 +1299,15 @@ static int __init ic_dynamic(void)
 
 	if (!ic_got_reply) {
 		ic_myaddr = NONE;
+#ifdef IPCONFIG_MAC
+		/* restore MAC */
+		if (mac_changed_dev) {
+			if (change_mac(mac_changed_dev, mac_before) == 0) {
+				printk(KERN_INFO "MAC address on %s restored to %pM\n",
+				       mac_changed_dev->dev->name, mac_before);
+			}
+		}
+#endif
 		return -1;
 	}
 
@@ -1630,6 +1696,32 @@ static int __init vendor_class_identifier_setup(char *addrs)
 	return 1;
 }
 
+#ifdef IPCONFIG_MAC
+static int __init mac_config_setup(char *addrs)
+{
+	char buf[ETH_ALEN*3];
+	if (strlcpy(buf, addrs, sizeof(buf)) >= sizeof(buf))
+		goto fail;
+
+	buf[sizeof(buf)-1] = '\0';
+	if (mac_pton(buf, ic_mac) == 0)
+		goto fail;
+
+	printk(KERN_INFO "Using specified MAC address %s to set Auto-IP\n",
+	       addrs);
+
+	ic_mac_addr_set = 1;
+	return 1;
+fail:
+	printk(KERN_WARNING "Invalid MAC address\n");
+	return 1;
+}
+#endif
+
 __setup("ip=", ip_auto_config_setup);
 __setup("nfsaddrs=", nfsaddrs_config_setup);
 __setup("dhcpclass=", vendor_class_identifier_setup);
+#ifdef IPCONFIG_MAC
+__setup("mac=", mac_config_setup);
+#endif
+
-- 
1.7.6.1

^ permalink raw reply related

* Re: [PATCH 07/10] RDMA/cxgb4: DB Drop Recovery for RDMA and LLD queues.
From: Steve Wise @ 2011-10-20 17:28 UTC (permalink / raw)
  To: Roland Dreier
  Cc: Vipul Pandya, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	divy-ut6Up61K2wZBDgjK7y7TUQ, dm-ut6Up61K2wZBDgjK7y7TUQ,
	kumaras-ut6Up61K2wZBDgjK7y7TUQ
In-Reply-To: <CAL1RGDUDk_MAM+S+MaEcPQzug8TvE-mFAfNqX3FqRApLQu9H2g-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On 10/20/2011 12:17 PM, Roland Dreier wrote:
>> I believe 5 and 7 have build dependencies.
> Right, missed that one too.
>
> But it seems 4,6,8,9,10 are independent of the rest of the series?
>
> ie I can trivially apply them and then worry about working out
> the drivers/net / drivers/infiniband interdependency a bit later?
>

Some of these might be dependent on prior patches in the series.   But if they aren't, yes, you could do that.

Stevo
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] route: fix ICMP redirect validation
From: Flavio Leitner @ 2011-10-20 17:47 UTC (permalink / raw)
  To: Flavio Leitner; +Cc: David Miller, netdev
In-Reply-To: <20111019160537.4aeedef8@asterix.rh>

On Wed, 19 Oct 2011 16:05:37 -0200
Flavio Leitner <fbl@redhat.com> wrote:

> On Mon, 17 Oct 2011 19:43:44 -0400 (EDT)
> David Miller <davem@davemloft.net> wrote:
> 
> > From: Flavio Leitner <fbl@redhat.com>
> > Date: Wed,  5 Oct 2011 11:20:04 -0300
> > 
> > > The commit f39925dbde7788cfb96419c0f092b086aa325c0f
> > > (ipv4: Cache learned redirect information in inetpeer.)
> > > removed some ICMP packet validations which are required by
> > > RFC 1122, section 3.2.2.2:
> > 
> > The reason for putting this into the inetpeer cache was so that we
> > didn't need to consult the routing cache at all.  We're working to
> > remove it at some point, so every dependency matters.
> > 
> > Can you implement this such that only an inetpeer cache probe is
> > necessary?
> > 
> 
> Sure, I have reviewed your patch series to remove the routing
> cache and I believe this version works with and without it, though
> I have tested only with current net-next code.
> 
> Thanks for your time reviewing, I appreciate it.
...
> @@ -1331,13 +1337,40 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
>  			goto reject_redirect;
>  	}
>  
> -	peer = inet_getpeer_v4(daddr, 1);
> -	if (peer) {
> -		peer->redirect_learned.a4 = new_gw;
> +	memset(&fl4, 0, sizeof(fl4));
> +	fl4.daddr = daddr;
> +	for (s = 0; s < 2; s++) {
> +		for (i = 0; i < 2; i++) {
> +			fl4.flowi4_oif = ikeys[i];
> +			fl4.saddr = skeys[s];
> +			rt = __ip_route_output_key(net, &fl4);
> +			if (IS_ERR(rt))
> +				continue;
>  
> -		inet_putpeer(peer);
> +			if (rt->dst.error || rt->dst.dev != dev ||
> +			    rt->rt_gateway != old_gw) {
> +				ip_rt_put(rt);
> +				continue;
> +			}
>  
> -		atomic_inc(&__rt_peer_genid);
> +			peer = rt->peer;
> +			if (!peer) {
> +				peer = inet_getpeer_v4(daddr, 1);
> +				putpeer = true;
> +			}

I was reviewing this again and instead of doing the above, it would
be better to use rt_bind_peer() to update rt->peer as well.

                        if (!rt->peer)
                                rt_bind_peer(rt, rt->rt_dst, 1);

                        peer = rt->peer;
                        if (peer) {
                                peer->redirect_learned.a4 = new_gw;
                                atomic_inc(&__rt_peer_genid);
                        }


but I am not sure if I understood you completely when you say
to do such that only an inetpeer cache probe is necessary.

thanks again,
fbl

^ permalink raw reply

* Re: iwlagn: WARN_ON() in iwl_get_idle_rx_chain_count()
From: wwguy @ 2011-10-20 18:54 UTC (permalink / raw)
  To: Michał Mirosław
  Cc: Intel Linux Wireless,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <20111020185756.GA24185-CoA6ZxLDdyEEUmgCuDUIdw@public.gmane.org>

On Thu, 2011-10-20 at 11:57 -0700, Michał Mirosław wrote:
> On Fri, Oct 14, 2011 at 09:21:05PM +0200, Michał Mirosław wrote:
> > On Fri, Oct 14, 2011 at 08:29:18AM -0700, wwguy wrote:
> > > Could you try the attached patch and see if it fixes your problem.
> > [attached patch removed]
> > Backported and applied. I'll test it for a couple of days.
> 
> I haven't tripped on the warnings in those last days with your
> patch applied. I think the backported version should be included
> in 3.1.
> 
Thank you for testing it, I will push it upstream into 3.1

Best Regards
Wey


--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: iwlagn: WARN_ON() in iwl_get_idle_rx_chain_count()
From: Michał Mirosław @ 2011-10-20 18:57 UTC (permalink / raw)
  To: wwguy
  Cc: Intel Linux Wireless, linux-wireless@vger.kernel.org,
	netdev@vger.kernel.org
In-Reply-To: <20111014192105.GA23640@rere.qmqm.pl>

On Fri, Oct 14, 2011 at 09:21:05PM +0200, Michał Mirosław wrote:
> On Fri, Oct 14, 2011 at 08:29:18AM -0700, wwguy wrote:
> > Could you try the attached patch and see if it fixes your problem.
> [attached patch removed]
> Backported and applied. I'll test it for a couple of days.

I haven't tripped on the warnings in those last days with your
patch applied. I think the backported version should be included
in 3.1.

Best Regards,
Michał Mirosław

^ permalink raw reply

* [PATCH net-next] tcp: use TCP_DEFAULT_INIT_RCVWND in tcp_fixup_rcvbuf()
From: Eric Dumazet @ 2011-10-20 19:16 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

Since commit 356f039822b (TCP: increase default initial receive
window.), we allow sender to send 10 (TCP_DEFAULT_INIT_RCVWND) segments.

Change tcp_fixup_rcvbuf() to reflect this change, even if no real change 
is expected, since sysctl_tcp_rmem[1] = 87380 and this value
is bigger than tcp_fixup_rcvbuf() computed rcvmem (~23720)

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/ipv4/tcp_input.c |   16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1e848b2..5a29ecc 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -345,17 +345,15 @@ static void tcp_grow_window(struct sock *sk, struct sk_buff *skb)
 
 static void tcp_fixup_rcvbuf(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-	int rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
+	const struct tcp_sock *tp = tcp_sk(sk);
+	unsigned int mss = min_t(unsigned int, tp->advmss, 1460);
+	int rcvmem = SKB_TRUESIZE(mss + MAX_TCP_HEADER);
 
-	/* Try to select rcvbuf so that 4 mss-sized segments
-	 * will fit to window and corresponding skbs will fit to our rcvbuf.
-	 * (was 3; 4 is minimum to allow fast retransmit to work.)
-	 */
-	while (tcp_win_from_space(rcvmem) < tp->advmss)
+	while (tcp_win_from_space(rcvmem) < mss)
 		rcvmem += 128;
-	if (sk->sk_rcvbuf < 4 * rcvmem)
-		sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]);
+	rcvmem *= TCP_DEFAULT_INIT_RCVWND;
+	if (sk->sk_rcvbuf < rcvmem)
+		sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
 }
 
 /* 4. Try to fixup all. It is made immediately after connection enters

^ permalink raw reply related

* [PATCH net-next] igbvf: fix truesize underestimation
From: Eric Dumazet @ 2011-10-20 19:22 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Jeff Kirsher

igbvf allocates half a page per skb fragment. We must account
PAGE_SIZE/2 increments on skb->truesize, not the actual frag length.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igbvf/netdev.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/igbvf/netdev.c b/drivers/net/ethernet/intel/igbvf/netdev.c
index 1bd9abd..db29817 100644
--- a/drivers/net/ethernet/intel/igbvf/netdev.c
+++ b/drivers/net/ethernet/intel/igbvf/netdev.c
@@ -312,7 +312,7 @@ static bool igbvf_clean_rx_irq(struct igbvf_adapter *adapter,
 
 			skb->len += length;
 			skb->data_len += length;
-			skb->truesize += length;
+			skb->truesize += PAGE_SIZE / 2;
 		}
 send_up:
 		i++;

^ permalink raw reply related

* Re: Kernel panic from tg3 net driver
From: Ari Savolainen @ 2011-10-20 19:30 UTC (permalink / raw)
  To: David S. Miller, Richard Cochran, netdev, linux-kernel
In-Reply-To: <CAEbykaXz+J8abVzU3yjqZG2+yhHNrQWsQy2c8wNyoERWPpxWKQ@mail.gmail.com>

I finally got time to continue bisecting. The commit that causes the
kernel panic is:  2669069aacc9 "tg3: enable transmit time stamping."

Ari

2011/10/15 Ari Savolainen <ari.m.savolainen@gmail.com>:
> Hi,
>
> I get this panic when I try to print from a virtual machine:
>
> https://docs.google.com/leaf?id=0B7LPWLwa6EUaODIxYTY2YmQtNWJlZS00M2ViLTk5ZmEtNDM2ZTZmNzE2MDEz&hl=fi
>
> I tried to bisect it, but couldn't finish, because after the last step
> the boot process got stuck right after selecting the kernel in grub
> and I ran out of time:
>
> git bisect start
> # bad: [322a8b034003c0d46d39af85bf24fee27b902f48] Linux 3.1-rc1
> git bisect bad 322a8b034003c0d46d39af85bf24fee27b902f48
> # good: [02f8c6aee8df3cdc935e9bdd4f2d020306035dbe] Linux 3.0
> git bisect good 02f8c6aee8df3cdc935e9bdd4f2d020306035dbe
> # bad: [0003230e8200699860f0b10af524dc47bf8aecad] Merge branch
> 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6
> git bisect bad 0003230e8200699860f0b10af524dc47bf8aecad
> # bad: [72f96e0e38d7e29ba16dcfd824ecaebe38b8293e] Merge branch
> 'for-linus-core' of
> git://git.kernel.org/pub/scm/linux/kernel/git/nab/target-pending
> git bisect bad 72f96e0e38d7e29ba16dcfd824ecaebe38b8293e
> # good: [204d1641d200709c759d8c269458cbc7de378c40] Merge branch
> 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-next-2.6
> into for-davem
> git bisect good 204d1641d200709c759d8c269458cbc7de378c40
> # bad: [415b3334a21aa67806c52d1acf4e72e14f7f402f] icmp: Fix regression
> in nexthop resolution during replies.
> git bisect bad 415b3334a21aa67806c52d1acf4e72e14f7f402f
> # bad: [95a943c162d74b20d869917bdf5df11293c35b63] Merge branch
> 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-next-2.6
> into for-davem
> git bisect bad 95a943c162d74b20d869917bdf5df11293c35b63
>
> In the first bad kernel (3.1-rc1) there was this in the log:
>
> [  105.612095]
> [  105.612096] ===================================================
> [  105.612100] [ INFO: suspicious rcu_dereference_check() usage. ]
> [  105.612101] ---------------------------------------------------
> [  105.612103] include/net/dst.h:91 invoked rcu_dereference_check()
> without protection!
> [  105.612105]
> [  105.612106] other info that might help us debug this:
> [  105.612106]
> [  105.612108]
> [  105.612108] rcu_scheduler_active = 1, debug_locks = 0
> [  105.612110] 1 lock held by dnsmasq/2618:
> [  105.612111]  #0:  (rtnl_mutex){+.+.+.}, at: [<ffffffff815df8c7>]
> rtnl_lock+0x17/0x20
> [  105.612120]
> [  105.612121] stack backtrace:
> [  105.612123] Pid: 2618, comm: dnsmasq Not tainted 3.1.0-rc1 #41
> [  105.612125] Call Trace:
> [  105.612129]  [<ffffffff810ccdcb>] lockdep_rcu_dereference+0xbb/0xc0
> [  105.612132]  [<ffffffff815dc5a9>] neigh_update+0x4f9/0x5f0
> [  105.612135]  [<ffffffff815da001>] ? neigh_lookup+0xe1/0x220
> [  105.612139]  [<ffffffff81639298>] arp_req_set+0xb8/0x230
> [  105.612142]  [<ffffffff8163a59f>] arp_ioctl+0x1bf/0x310
> [  105.612146]  [<ffffffff810baa40>] ? lock_hrtimer_base.isra.26+0x30/0x60
> [  105.612150]  [<ffffffff8163fb75>] inet_ioctl+0x85/0x90
> [  105.612154]  [<ffffffff815b5520>] sock_do_ioctl+0x30/0x70
> [  105.612157]  [<ffffffff815b55d3>] sock_ioctl+0x73/0x280
> [  105.612162]  [<ffffffff811b7698>] do_vfs_ioctl+0x98/0x570
> [  105.612165]  [<ffffffff811a5c40>] ? fget_light+0x340/0x3a0
> [  105.612168]  [<ffffffff811b7bbf>] sys_ioctl+0x4f/0x80
> [  105.612172]  [<ffffffff816fdcab>] system_call_fastpath+0x16/0x1b
>

^ permalink raw reply

* Re: [PATCH net-next] tcp: use TCP_DEFAULT_INIT_RCVWND in tcp_fixup_rcvbuf()
From: David Miller @ 2011-10-20 19:50 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1319138186.2854.5.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 20 Oct 2011 21:16:26 +0200

> Since commit 356f039822b (TCP: increase default initial receive
> window.), we allow sender to send 10 (TCP_DEFAULT_INIT_RCVWND) segments.
> 
> Change tcp_fixup_rcvbuf() to reflect this change, even if no real change 
> is expected, since sysctl_tcp_rmem[1] = 87380 and this value
> is bigger than tcp_fixup_rcvbuf() computed rcvmem (~23720)
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
 ...
> +	unsigned int mss = min_t(unsigned int, tp->advmss, 1460);

I don't understand where this calculation comes from, and even if it
should be obvious it isn't to me and deserves a mention in the commit
message at a minimum.

^ permalink raw reply

* Re: Kernel panic from tg3 net driver
From: David Miller @ 2011-10-20 19:56 UTC (permalink / raw)
  To: ari.m.savolainen; +Cc: richardcochran, netdev, linux-kernel
In-Reply-To: <CAEbykaX4UPDBxOHmNr=dodaquSPjhPr-pLCcUn5hSu4-xLZy-g@mail.gmail.com>

From: Ari Savolainen <ari.m.savolainen@gmail.com>
Date: Thu, 20 Oct 2011 22:30:44 +0300

> I finally got time to continue bisecting. The commit that causes the
> kernel panic is:  2669069aacc9 "tg3: enable transmit time stamping."

I thought initially that the issue might be that we have to do the
skb_tx_timestamp() call before we advance the mailbox transmit
descriptor pointer.

But that shouldn't matter, we run with a lock held, and TX reclaim takes
that same lock.

So I'm sort of stumped at the moment.

^ permalink raw reply

* RE: [net-next 5/6] ixgbe: add hardware timestamping support
From: Keller, Jacob E @ 2011-10-20 19:57 UTC (permalink / raw)
  To: Richard Cochran, Jacob Keller
  Cc: Kirsher, Jeffrey T, davem@davemloft.net, netdev@vger.kernel.org,
	gospo@redhat.com, sassmann@redhat.com
In-Reply-To: <20111020145637.GC1949@netboy.at.omicron.at>



> -----Original Message-----
> From: Richard Cochran [mailto:richardcochran@gmail.com]
> Sent: Thursday, October 20, 2011 7:57 AM
> To: Jacob Keller
> Cc: Kirsher, Jeffrey T; davem@davemloft.net; Keller, Jacob E;
> netdev@vger.kernel.org; gospo@redhat.com; sassmann@redhat.com
> Subject: Re: [net-next 5/6] ixgbe: add hardware timestamping support
> 
> On Wed, Oct 19, 2011 at 10:04:33AM -0700, Jacob Keller wrote:
> > On Mon, Oct 17, 2011 at 9:44 AM, Richard Cochran
> <richardcochran@gmail.com> wrote:
> > > So, is this wrap around due to the fact that you are tied to the
> > > system time via time_compare? Or, putting it another way, can't you
> > > program the hardware time stamping unit so that the registers have
> > > some reasonable resolution (like 64 bits worth of nanoseconds) and
> > > just offer RAW timestamps?
> >
> > The wrap around is due to hardware limitations. The ixgbe devices
> > cannot support 64bits worth of nanoseconds and still have the ability
> > to adjust the frequency in parts per billion. A larger increment
> > increases the resolution available for frequency adjustments, but
> > decreases the time it takes for the cycle counter to wrap around.
> 
> Oh, well. That stinks.
> 
> I think you do want to offer ppb adjustment.
> 
Correct, which is why the cycle counter wraps around every 35 seconds.

> > > I would really like to move away from the timecompare hacks and
> > > towards a proper PHC->SYS PPS solution.
> > >
> >
> > I agree that this is the correct approach. The timecompare
> > functionality does have issues.
> 
> And these cards are highlighting timecompare weaknesses I had not even
> thought of.
> 
> I expect that if you offer the RAW time stamps, then it should be
> possible to have the time stamp values always correct (or nearly so)
> even with a changing link speed. If the link speed change gives an
> interrupt, then the ISR can reprogram the frequency compensation
> registers and let the counter continue.
> 

The cyclecounter is based off of the DMA clock on the NIC which changes frequency with the link speed. So at 10G link, the DMA ticks once every 6.4ns. The cycle counter gets a value (specified in the TIMINCA register) added to it every DMA tick. In order to allow for ppb adjustments to the cycle counter, I have the TIMINCA value be as many bits wide as possible. Then I use the cyclecounter/timecounter structures to detect wraparound and convert to a ns value.

If we returned the raw cycle counter stamps directly, they would not be measured in nanoseconds, but in a fraction of the DMA clock tick (DMA clock tick / TIMINCA value). This means that, for the values I chose, we are somewhere in the range of femtoseconds or so. The problem is that all of the upper stack expects values as nanoseconds. We wouldn't be able to frequency adjust in ppb to get down to nanoseconds again.

This is due to a limitation in the way the hardware was designed. (Ideally, it would allow for precise adjustments but still provide a nanosecond counter; the 82580 igb device does this.)

> > > Again, doing the update thing on every packet won't work for real
> > > world PTP scenarios.
> > >
> > Which is why the PHC solution is better. Work on implementing this
> > support is in progress. Out of curiosity, what is the sync rate for
> > the scenario that breaks this? I would like to try that rate out on
> my
> > setup.
> 
> For the audio/video profile, they have a max of 32 sync packets per
> second. Not sure about delay request rate, maybe 16 per second.
> 

Thanks :)

> Thanks,
> Richard

^ permalink raw reply

* Re: [PATCH net-next] tcp: use TCP_DEFAULT_INIT_RCVWND in tcp_fixup_rcvbuf()
From: Eric Dumazet @ 2011-10-20 20:02 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20111020.155002.555965434596006787.davem@davemloft.net>

Le jeudi 20 octobre 2011 à 15:50 -0400, David Miller a écrit :
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Thu, 20 Oct 2011 21:16:26 +0200
> 
> > Since commit 356f039822b (TCP: increase default initial receive
> > window.), we allow sender to send 10 (TCP_DEFAULT_INIT_RCVWND) segments.
> > 
> > Change tcp_fixup_rcvbuf() to reflect this change, even if no real change 
> > is expected, since sysctl_tcp_rmem[1] = 87380 and this value
> > is bigger than tcp_fixup_rcvbuf() computed rcvmem (~23720)
> > 
> > Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
>  ...
> > +	unsigned int mss = min_t(unsigned int, tp->advmss, 1460);
> 
> I don't understand where this calculation comes from, and even if it
> should be obvious it isn't to me and deserves a mention in the commit
> message at a minimum.

This is the calculation done in commit 356f039822b as well.

The window is 10*MSS, but no more than 14600

On loopback, this matters, because we could end with rcvmem=219680

^ permalink raw reply

* Re: Kernel panic from tg3 net driver
From: Eric Dumazet @ 2011-10-20 20:05 UTC (permalink / raw)
  To: David Miller; +Cc: ari.m.savolainen, richardcochran, netdev, linux-kernel
In-Reply-To: <20111020.155659.486754557434415381.davem@davemloft.net>

Le jeudi 20 octobre 2011 à 15:56 -0400, David Miller a écrit :
> From: Ari Savolainen <ari.m.savolainen@gmail.com>
> Date: Thu, 20 Oct 2011 22:30:44 +0300
> 
> > I finally got time to continue bisecting. The commit that causes the
> > kernel panic is:  2669069aacc9 "tg3: enable transmit time stamping."
> 
> I thought initially that the issue might be that we have to do the
> skb_tx_timestamp() call before we advance the mailbox transmit
> descriptor pointer.
> 
> But that shouldn't matter, we run with a lock held, and TX reclaim takes
> that same lock.
> 
> So I'm sort of stumped at the moment.

But it's not a panic, it's an RCU splat?

> [  105.612129]  [<ffffffff810ccdcb>] lockdep_rcu_dereference+0xbb/0xc0
> [  105.612132]  [<ffffffff815dc5a9>] neigh_update+0x4f9/0x5f0
> [  105.612135]  [<ffffffff815da001>] ? neigh_lookup+0xe1/0x220
> [  105.612139]  [<ffffffff81639298>] arp_req_set+0xb8/0x230
> [  105.612142]  [<ffffffff8163a59f>] arp_ioctl+0x1bf/0x310
> [  105.612146]  [<ffffffff810baa40>] ? lock_hrtimer_base.isra.26+0x30/0x60
> [  105.612150]  [<ffffffff8163fb75>] inet_ioctl+0x85/0x90
> [  105.612154]  [<ffffffff815b5520>] sock_do_ioctl+0x30/0x70
> [  105.612157]  [<ffffffff815b55d3>] sock_ioctl+0x73/0x280
> [  105.612162]  [<ffffffff811b7698>] do_vfs_ioctl+0x98/0x570
> [  105.612165]  [<ffffffff811a5c40>] ? fget_light+0x340/0x3a0
> [  105.612168]  [<ffffffff811b7bbf>] sys_ioctl+0x4f/0x80
> [  105.612172]  [<ffffffff816fdcab>] system_call_fastpath+0x16/0x1b

And I think this was fixed yesterday ?

De: 	roy.qing.li@gmail.com
À: 	ari.m.savolainen@gmail.com, netdev@vger.kernel.org
Sujet: 	[PATCH net-next] neigh: fix rcu splat in neigh_update()
Date: 	Tue, 18 Oct 2011 16:32:42 +0800 (18/10/2011 10:32:42)

^ permalink raw reply

* [PATCH net-next] myri10ge: fix truesize underestimation
From: Eric Dumazet @ 2011-10-20 20:10 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Jon Mason

skb->truesize must account for allocated memory, not the used part of
it. Doing this work is important to avoid unexpected OOM situations.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Jon Mason <mason@myri.com>
---
 drivers/net/ethernet/myricom/myri10ge/myri10ge.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
index c970a48..0778edc 100644
--- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
+++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
@@ -1210,7 +1210,6 @@ myri10ge_rx_skb_build(struct sk_buff *skb, u8 * va,
 	struct skb_frag_struct *skb_frags;
 
 	skb->len = skb->data_len = len;
-	skb->truesize = len + sizeof(struct sk_buff);
 	/* attach the page(s) */
 
 	skb_frags = skb_shinfo(skb)->frags;
@@ -1385,6 +1384,8 @@ myri10ge_rx_done(struct myri10ge_slice_state *ss, int len, __wsum csum,
 	if (skb_frag_size(&skb_shinfo(skb)->frags[0]) <= 0) {
 		skb_frag_unref(skb, 0);
 		skb_shinfo(skb)->nr_frags = 0;
+	} else {
+		skb->truesize += bytes * skb_shinfo(skb)->nr_frags;
 	}
 	skb->protocol = eth_type_trans(skb, dev);
 	skb_record_rx_queue(skb, ss - &mgp->ss[0]);

^ permalink raw reply related

* Re: Kernel panic from tg3 net driver
From: David Miller @ 2011-10-20 20:11 UTC (permalink / raw)
  To: eric.dumazet; +Cc: ari.m.savolainen, richardcochran, netdev, linux-kernel
In-Reply-To: <1319141125.2854.14.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 20 Oct 2011 22:05:25 +0200

> And I think this was fixed yesterday ?
> 
> De: 	roy.qing.li@gmail.com
> À: 	ari.m.savolainen@gmail.com, netdev@vger.kernel.org
> Sujet: 	[PATCH net-next] neigh: fix rcu splat in neigh_update()
> Date: 	Tue, 18 Oct 2011 16:32:42 +0800 (18/10/2011 10:32:42)
> 

Good catch, it seems to be this bug.

^ permalink raw reply

* Re: [PATCH net-next] tcp: use TCP_DEFAULT_INIT_RCVWND in tcp_fixup_rcvbuf()
From: David Miller @ 2011-10-20 20:13 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1319140954.2854.12.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 20 Oct 2011 22:02:34 +0200

> Le jeudi 20 octobre 2011 à 15:50 -0400, David Miller a écrit :
>> From: Eric Dumazet <eric.dumazet@gmail.com>
>> Date: Thu, 20 Oct 2011 21:16:26 +0200
>> 
>> > Since commit 356f039822b (TCP: increase default initial receive
>> > window.), we allow sender to send 10 (TCP_DEFAULT_INIT_RCVWND) segments.
>> > 
>> > Change tcp_fixup_rcvbuf() to reflect this change, even if no real change 
>> > is expected, since sysctl_tcp_rmem[1] = 87380 and this value
>> > is bigger than tcp_fixup_rcvbuf() computed rcvmem (~23720)
>> > 
>> > Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
>>  ...
>> > +	unsigned int mss = min_t(unsigned int, tp->advmss, 1460);
>> 
>> I don't understand where this calculation comes from, and even if it
>> should be obvious it isn't to me and deserves a mention in the commit
>> message at a minimum.
> 
> This is the calculation done in commit 356f039822b as well.
> 
> The window is 10*MSS, but no more than 14600
> 
> On loopback, this matters, because we could end with rcvmem=219680

Thanks, please help weak brains like mine by adding this to the commit message.
:-)

^ permalink raw reply

* Re: [PATCH] dev: use name hash for dev_seq_ops
From: David Miller @ 2011-10-20 20:17 UTC (permalink / raw)
  To: mihai.maruseac
  Cc: shemminger, eric.dumazet, mirq-linux, therbert, jpirko, netdev,
	linux-kernel, dbaluta, mmaruseac
In-Reply-To: <1319097717-14910-1-git-send-email-mmaruseac@ixiacom.com>

From: Mihai Maruseac <mihai.maruseac@gmail.com>
Date: Thu, 20 Oct 2011 11:01:57 +0300

> Instead of using the dev->next chain and trying to resync at each call to
> dev_seq_start, use the name hash, keeping the bucket and the offset in
> seq->private field.

I'm totally fine with this patch from a technical perspective, but I'd
like one small thing tidied up before I apply this.

> +	unsigned int pos; /* bucket << 24 + offset */

Please don't mention this as a constant in the comment, if we ever
change NETDEV_HASHBITS this comment will be inaccurate.

I'd suggest putting the BUCKET_SPACE define before the dev_iter_state
definition, and using BUCKET_SPACE in the comment instead of 24.

Thanks.

^ permalink raw reply

* Re: Kernel panic from tg3 net driver
From: Eric Dumazet @ 2011-10-20 20:17 UTC (permalink / raw)
  To: David Miller; +Cc: ari.m.savolainen, richardcochran, netdev, linux-kernel
In-Reply-To: <20111020.161147.33259825921677777.davem@davemloft.net>

Le jeudi 20 octobre 2011 à 16:11 -0400, David Miller a écrit :
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Thu, 20 Oct 2011 22:05:25 +0200
> 
> > And I think this was fixed yesterday ?
> > 
> > De: 	roy.qing.li@gmail.com
> > À: 	ari.m.savolainen@gmail.com, netdev@vger.kernel.org
> > Sujet: 	[PATCH net-next] neigh: fix rcu splat in neigh_update()
> > Date: 	Tue, 18 Oct 2011 16:32:42 +0800 (18/10/2011 10:32:42)
> > 
> 
> Good catch, it seems to be this bug.

Oh well, sorry, it seems it was one bug hit during bisection, but maybe
it's completely unrelated to the real problem.

^ permalink raw reply

* Re: [PATCH] route: fix ICMP redirect validation
From: David Miller @ 2011-10-20 20:19 UTC (permalink / raw)
  To: fbl; +Cc: netdev
In-Reply-To: <20111020154702.13f69021@asterix.rh>

From: Flavio Leitner <fbl@redhat.com>
Date: Thu, 20 Oct 2011 15:47:02 -0200

> I was reviewing this again and instead of doing the above, it would
> be better to use rt_bind_peer() to update rt->peer as well.
> 
>                         if (!rt->peer)
>                                 rt_bind_peer(rt, rt->rt_dst, 1);
> 
>                         peer = rt->peer;
>                         if (peer) {
>                                 peer->redirect_learned.a4 = new_gw;
>                                 atomic_inc(&__rt_peer_genid);
>                         }
> 
> 
> but I am not sure if I understood you completely when you say
> to do such that only an inetpeer cache probe is necessary.

If you have the route entry available already and you're doing the
inetpeer lookup anyways, you might as well use rt_bind_peer() since
all of the expensive work has to be done anyways.

So yes, using rt_bind_peer() would be the best thing to do here.

^ permalink raw reply

* Re: PROBLEM: System call 'sendmsg' of process ospfd (quagga) causes kernel oops
From: David Miller @ 2011-10-20 20:21 UTC (permalink / raw)
  To: herbert; +Cc: eric.dumazet, evonlanthen, linux-kernel, netdev, timo.teras
In-Reply-To: <20111020093541.GA3024@gondor.apana.org.au>

From: Herbert Xu <herbert@gondor.hengli.com.au>
Date: Thu, 20 Oct 2011 11:35:41 +0200

> On Thu, Oct 20, 2011 at 05:30:50AM -0400, David Miller wrote:
>>
>> So I'm a little confused what your suggestion for rc10 really
>> is :-)
> 
> I meant his first initial patch :)
> 
> While it is suboptimal in the sense that should the value of
> needed_headroom increase we'll end up constantly reallocating
> skbs, I believe that it is at least semantically correct.

Ok, I applied Eric's patch which removes the dynamic changing of the
needed_headroom in IP_GRE.

Thanks everyone!

^ permalink raw reply

* Re: [patch] pktgen: bug when calling ndelay in x86 architectures
From: David Miller @ 2011-10-20 20:24 UTC (permalink / raw)
  To: eric.dumazet
  Cc: bhutchings, daniel.turull, netdev, robert, voravit, jens.laas
In-Reply-To: <1318949264.2657.97.camel@edumazet-HP-Compaq-6005-Pro-SFF-PC>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 18 Oct 2011 16:47:44 +0200

> Le mardi 18 octobre 2011 à 15:00 +0100, Ben Hutchings a écrit :
> 
>> AIUI, the reason for limits on delays is not that it's bad practice to
>> spin for so long, but that the delay calculations may overflow or
>> otherwise become inaccurate.
> 
> OK, I can understand that, then a more appropriate patch would be :

I think doing the udelay/ndelay thing is the way to go for 'net' and
-stable.  We can do something sophisticated with ktime et al. in
'net-next'.

Eric, could you please formally submit this patch with proper
changelog etc.?

Thanks.

^ permalink raw reply

* Re: [patch net-next]alx: Atheros AR8131/AR8151/AR8152/AR8161 Ethernet driver
From: Luis R. Rodriguez @ 2011-10-20 20:33 UTC (permalink / raw)
  To: Ren, Cloud
  Cc: David Miller, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <6349D7A510622448B1BA0967850A8438011CC2A0@nasanexd02d.na.qualcomm.com>

On Thu, Oct 20, 2011 at 2:48 AM, Ren, Cloud <cjren@qca.qualcomm.com> wrote:
>
>>From: "Ren, Cloud" <cjren@qca.qualcomm.com>
>>Date: Thu, 20 Oct 2011 09:23:07 +0000
>>
>>> As you saw, should I do the two following steps?
>>> 1. I firstly try to submit code to linux-staging.git.
>>> 2. After the driver have been accepted by  linux-staging.git, I submit to net-
>>next.git again.
>>
>>You submit and get it into staging so that it can sit there for some time and get
>>reviewed and improved by others.
>>
>>One doesn't submit directly to net-next right after it gets into staging, staging
>>is a place where your driver lives while it still smelly funky and needs more
>>work.
>
> The driver will support the next generation NICs of Atheros. Meanwhile, the driver can
> also have better optimization for AR8131 and AR8151 than atl1c. For some reason, we
> don't plan to patch atl1c driver to support our new NIC, such as AR8161. So I hope the driver
> can stay in net-next in the end. Of course, I will be responsible for modify source code and
> let it match kernel requirements.

Cloud,

If you want to skip staging (which I recommend) then you need to
address all upstream concerns expressed. Given that you indicate that
you will be working on following up with the driver until it's
acceptable upstream, my recommendation is either to clean up the driver
very well and review it internally at Atheros prior to a public
submission *or* just dump into staging and get the benefit of
community cleanup and eventually wait until it is ready for proper
upstream. If you want internal private review at Atheros you can use
the internal private ath9k-devel list.

Also are you going to maintain the older atlx drivers? While at it can
you clear up who maintains what as far as Atheros is concerned for
Ethernet?

  Luis

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox