Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH RESEND 03/10] cxgb4: DB Drop Recovery for RDMA and LLD queues.
From: Vipul Pandya @ 2011-10-20 17:11 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: roland-BHEL68pLQRGGvPXPguhicg, davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	divy-ut6Up61K2wZBDgjK7y7TUQ, dm-ut6Up61K2wZBDgjK7y7TUQ,
	kumaras-ut6Up61K2wZBDgjK7y7TUQ,
	swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW, Vipul Pandya

    - recover LLD EQs for DB drop interrupts.  This includes adding a new
    db_lock, a spin lock disabling BH too, used by the recovery thread and
    the ring_tx_db() paths to allow db drop recovery.

    - cleaned up initial db avoidance code.

    - add read_eq_indices() - allows the LLD to use the pcie mw to efficiently
    read hw eq contexts.

    - add cxgb4_sync_txq_pidx() - called by iw_cxgb4 to sync up the sw/hw pidx
    value.

    - add flush_eq_cache() and cxgb4_flush_eq_cache().  This allows iw_cxgb4
    to flush the sge eq context cache before beginning db drop recovery.

    - add module parameter, dbfoifo_int_thresh, to allow tuning the db
    interrupt threshold value.

    - add dbfifo_int_thresh to cxgb4_lld_info so iw_cxgb4 knows the threshold.

    - add module parameter, dbfoifo_drain_delay, to allow tuning the amount
    of time delay between DB FULL and EMPTY upcalls to iw_cxgb4.

Signed-off-by: Vipul Pandya <vipul-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
Signed-off-by: Steve Wise <swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h      |    7 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c |  214 +++++++++++++++++++----
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h  |    4 +
 drivers/net/ethernet/chelsio/cxgb4/sge.c        |   20 ++-
 drivers/net/ethernet/chelsio/cxgb4/t4_regs.h    |   53 ++++++
 drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h   |   23 +++
 6 files changed, 279 insertions(+), 42 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index e18b5ad..f202cb9 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -51,6 +51,8 @@
 #define FW_VERSION_MINOR 1
 #define FW_VERSION_MICRO 0
 
+#define CH_WARN(adap, fmt, ...) dev_warn(adap->pdev_dev, fmt, ## __VA_ARGS__)
+
 enum {
 	MAX_NPORTS = 4,     /* max # of ports */
 	SERNUM_LEN = 24,    /* Serial # length */
@@ -403,6 +405,9 @@ struct sge_txq {
 	struct tx_sw_desc *sdesc;   /* address of SW Tx descriptor ring */
 	struct sge_qstat *stat;     /* queue status entry */
 	dma_addr_t    phys_addr;    /* physical address of the ring */
+	spinlock_t db_lock;
+	int db_disabled;
+	unsigned short db_pidx;
 };
 
 struct sge_eth_txq {                /* state for an SGE Ethernet Tx queue */
@@ -475,6 +480,7 @@ struct adapter {
 	void __iomem *regs;
 	struct pci_dev *pdev;
 	struct device *pdev_dev;
+	unsigned int mbox;
 	unsigned int fn;
 	unsigned int flags;
 
@@ -607,6 +613,7 @@ irqreturn_t t4_sge_intr_msix(int irq, void *cookie);
 void t4_sge_init(struct adapter *adap);
 void t4_sge_start(struct adapter *adap);
 void t4_sge_stop(struct adapter *adap);
+extern int dbfifo_int_thresh;
 
 #define for_each_port(adapter, iter) \
 	for (iter = 0; iter < (adapter)->params.nports; ++iter)
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 870c320..64ad1c8 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -149,15 +149,6 @@ static unsigned int pfvfres_pmask(struct adapter *adapter,
 #endif
 
 enum {
-	MEMWIN0_APERTURE = 65536,
-	MEMWIN0_BASE     = 0x30000,
-	MEMWIN1_APERTURE = 32768,
-	MEMWIN1_BASE     = 0x28000,
-	MEMWIN2_APERTURE = 2048,
-	MEMWIN2_BASE     = 0x1b800,
-};
-
-enum {
 	MAX_TXQ_ENTRIES      = 16384,
 	MAX_CTRL_TXQ_ENTRIES = 1024,
 	MAX_RSPQ_ENTRIES     = 16384,
@@ -369,6 +360,15 @@ static int set_addr_filters(const struct net_device *dev, bool sleep)
 				uhash | mhash, sleep);
 }
 
+int dbfifo_int_thresh = 10; /* 10 == 640 entry threshold */
+module_param(dbfifo_int_thresh, int, 0644);
+MODULE_PARM_DESC(dbfifo_int_thresh, "doorbell fifo interrupt threshold");
+
+int dbfifo_drain_delay = 1000; /* usecs to sleep while draining the dbfifo */
+module_param(dbfifo_drain_delay, int, 0644);
+MODULE_PARM_DESC(dbfifo_drain_delay,
+		 "usecs to sleep while draining the dbfifo");
+
 /*
  * Set Rx properties of a port, such as promiscruity, address filters, and MTU.
  * If @mtu is -1 it is left unchanged.
@@ -387,6 +387,8 @@ static int set_rxmode(struct net_device *dev, int mtu, bool sleep_ok)
 	return ret;
 }
 
+static struct workqueue_struct *workq;
+
 /**
  *	link_start - enable a port
  *	@dev: the port to enable
@@ -2201,7 +2203,7 @@ static void cxgb4_queue_tid_release(struct tid_info *t, unsigned int chan,
 	adap->tid_release_head = (void **)((uintptr_t)p | chan);
 	if (!adap->tid_release_task_busy) {
 		adap->tid_release_task_busy = true;
-		schedule_work(&adap->tid_release_task);
+		queue_work(workq, &adap->tid_release_task);
 	}
 	spin_unlock_bh(&adap->tid_release_lock);
 }
@@ -2428,6 +2430,59 @@ void cxgb4_iscsi_init(struct net_device *dev, unsigned int tag_mask,
 }
 EXPORT_SYMBOL(cxgb4_iscsi_init);
 
+int cxgb4_flush_eq_cache(struct net_device *dev)
+{
+	struct adapter *adap = netdev2adap(dev);
+	int ret;
+
+	ret = t4_fwaddrspace_write(adap, adap->mbox,
+				   0xe1000000 + A_SGE_CTXT_CMD, 0x20000000);
+	return ret;
+}
+EXPORT_SYMBOL(cxgb4_flush_eq_cache);
+
+static int read_eq_indices(struct adapter *adap, u16 qid, u16 *pidx, u16 *cidx)
+{
+	u32 addr = t4_read_reg(adap, A_SGE_DBQ_CTXT_BADDR) + 24 * qid + 8;
+	__be64 indices;
+	int ret;
+
+	ret = t4_mem_win_read_len(adap, addr, (__be32 *)&indices, 8);
+	if (!ret) {
+		indices = be64_to_cpu(indices);
+		*cidx = (indices >> 25) & 0xffff;
+		*pidx = (indices >> 9) & 0xffff;
+	}
+	return ret;
+}
+
+int cxgb4_sync_txq_pidx(struct net_device *dev, u16 qid, u16 pidx,
+			u16 size)
+{
+	struct adapter *adap = netdev2adap(dev);
+	u16 hw_pidx, hw_cidx;
+	int ret;
+
+	ret = read_eq_indices(adap, qid, &hw_pidx, &hw_cidx);
+	if (ret)
+		goto out;
+
+	if (pidx != hw_pidx) {
+		u16 delta;
+
+		if (pidx >= hw_pidx)
+			delta = pidx - hw_pidx;
+		else
+			delta = size - hw_pidx + pidx;
+		wmb();
+		t4_write_reg(adap, MYPF_REG(A_SGE_PF_KDOORBELL),
+			     V_QID(qid) | V_PIDX(delta));
+	}
+out:
+	return ret;
+}
+EXPORT_SYMBOL(cxgb4_sync_txq_pidx);
+
 static struct pci_driver cxgb4_driver;
 
 static void check_neigh_update(struct neighbour *neigh)
@@ -2461,6 +2516,95 @@ static struct notifier_block cxgb4_netevent_nb = {
 	.notifier_call = netevent_cb
 };
 
+static void drain_db_fifo(struct adapter *adap, int usecs)
+{
+	u32 v;
+
+	do {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(usecs_to_jiffies(usecs));
+		v = t4_read_reg(adap, A_SGE_DBFIFO_STATUS);
+		if (G_LP_COUNT(v) == 0 && G_HP_COUNT(v) == 0)
+			break;
+	} while (1);
+}
+
+static void disable_txq_db(struct sge_txq *q)
+{
+	spin_lock_irq(&q->db_lock);
+	q->db_disabled = 1;
+	spin_unlock_irq(&q->db_lock);
+}
+
+static void enable_txq_db(struct sge_txq *q)
+{
+	spin_lock_irq(&q->db_lock);
+	q->db_disabled = 0;
+	spin_unlock_irq(&q->db_lock);
+}
+
+static void disable_dbs(struct adapter *adap)
+{
+	int i;
+
+	for_each_ethrxq(&adap->sge, i)
+		disable_txq_db(&adap->sge.ethtxq[i].q);
+	for_each_ofldrxq(&adap->sge, i)
+		disable_txq_db(&adap->sge.ofldtxq[i].q);
+	for_each_port(adap, i)
+		disable_txq_db(&adap->sge.ctrlq[i].q);
+}
+
+static void enable_dbs(struct adapter *adap)
+{
+	int i;
+
+	for_each_ethrxq(&adap->sge, i)
+		enable_txq_db(&adap->sge.ethtxq[i].q);
+	for_each_ofldrxq(&adap->sge, i)
+		enable_txq_db(&adap->sge.ofldtxq[i].q);
+	for_each_port(adap, i)
+		enable_txq_db(&adap->sge.ctrlq[i].q);
+}
+
+static void sync_txq_pidx(struct adapter *adap, struct sge_txq *q)
+{
+	u16 hw_pidx, hw_cidx;
+	int ret;
+
+	spin_lock_bh(&q->db_lock);
+	ret = read_eq_indices(adap, (u16)q->cntxt_id, &hw_pidx, &hw_cidx);
+	if (ret)
+		goto out;
+	if (q->db_pidx != hw_pidx) {
+		u16 delta;
+
+		if (q->db_pidx >= hw_pidx)
+			delta = q->db_pidx - hw_pidx;
+		else
+			delta = q->size - hw_pidx + q->db_pidx;
+		wmb();
+		t4_write_reg(adap, MYPF_REG(A_SGE_PF_KDOORBELL),
+				V_QID(q->cntxt_id) | V_PIDX(delta));
+	}
+out:
+	q->db_disabled = 0;
+	spin_unlock_bh(&q->db_lock);
+	if (ret)
+		CH_WARN(adap, "DB drop recovery failed.\n");
+}
+static void recover_all_queues(struct adapter *adap)
+{
+	int i;
+
+	for_each_ethrxq(&adap->sge, i)
+		sync_txq_pidx(adap, &adap->sge.ethtxq[i].q);
+	for_each_ofldrxq(&adap->sge, i)
+		sync_txq_pidx(adap, &adap->sge.ofldtxq[i].q);
+	for_each_port(adap, i)
+		sync_txq_pidx(adap, &adap->sge.ctrlq[i].q);
+}
+
 static void notify_rdma_uld(struct adapter *adap, enum cxgb4_control cmd)
 {
 	mutex_lock(&uld_mutex);
@@ -2473,55 +2617,41 @@ static void notify_rdma_uld(struct adapter *adap, enum cxgb4_control cmd)
 static void process_db_full(struct work_struct *work)
 {
 	struct adapter *adap;
-	static int delay = 1000;
-	u32 v;
 
 	adap = container_of(work, struct adapter, db_full_task);
 
-
-	/* stop LLD queues */
-
 	notify_rdma_uld(adap, CXGB4_CONTROL_DB_FULL);
-	do {
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		schedule_timeout(usecs_to_jiffies(delay));
-		v = t4_read_reg(adap, A_SGE_DBFIFO_STATUS);
-		if (G_LP_COUNT(v) == 0 && G_HP_COUNT(v) == 0)
-			break;
-	} while (1);
+	drain_db_fifo(adap, dbfifo_drain_delay);
+	t4_set_reg_field(adap, A_SGE_INT_ENABLE3,
+			F_DBFIFO_HP_INT | F_DBFIFO_LP_INT,
+			F_DBFIFO_HP_INT | F_DBFIFO_LP_INT);
 	notify_rdma_uld(adap, CXGB4_CONTROL_DB_EMPTY);
-
-
-	/*
-	 * The more we get db full interrupts, the more we'll delay
-	 * in re-enabling db rings on queues, capped off at 200ms.
-	 */
-	delay = min(delay << 1, 200000);
-
-	/* resume LLD queues */
 }
 
 static void process_db_drop(struct work_struct *work)
 {
 	struct adapter *adap;
-	adap = container_of(work, struct adapter, db_drop_task);
 
+	adap = container_of(work, struct adapter, db_drop_task);
 
-	/*
-	 * sync the PIDX values in HW and SW for LLD queues.
-	 */
-
+	t4_set_reg_field(adap, A_SGE_DOORBELL_CONTROL, F_DROPPED_DB, 0);
+	disable_dbs(adap);
 	notify_rdma_uld(adap, CXGB4_CONTROL_DB_DROP);
+	drain_db_fifo(adap, 1);
+	recover_all_queues(adap);
+	enable_dbs(adap);
 }
 
 void t4_db_full(struct adapter *adap)
 {
-	schedule_work(&adap->db_full_task);
+	t4_set_reg_field(adap, A_SGE_INT_ENABLE3,
+			F_DBFIFO_HP_INT | F_DBFIFO_LP_INT, 0);
+	queue_work(workq, &adap->db_full_task);
 }
 
 void t4_db_dropped(struct adapter *adap)
 {
-	schedule_work(&adap->db_drop_task);
+	queue_work(workq, &adap->db_drop_task);
 }
 
 static void uld_attach(struct adapter *adap, unsigned int uld)
@@ -2557,6 +2687,7 @@ static void uld_attach(struct adapter *adap, unsigned int uld)
 	lli.gts_reg = adap->regs + MYPF_REG(SGE_PF_GTS);
 	lli.db_reg = adap->regs + MYPF_REG(SGE_PF_KDOORBELL);
 	lli.fw_vers = adap->params.fw_vers;
+	lli.dbfifo_int_thresh = dbfifo_int_thresh;
 
 	handle = ulds[uld].add(&lli);
 	if (IS_ERR(handle)) {
@@ -3673,6 +3804,7 @@ static int __devinit init_one(struct pci_dev *pdev,
 
 	adapter->pdev = pdev;
 	adapter->pdev_dev = &pdev->dev;
+	adapter->mbox = func;
 	adapter->fn = func;
 	adapter->msg_enable = dflt_msg_enable;
 	memset(adapter->chan_map, 0xff, sizeof(adapter->chan_map));
@@ -3868,6 +4000,10 @@ static int __init cxgb4_init_module(void)
 {
 	int ret;
 
+	workq = create_singlethread_workqueue("cxgb4");
+	if (!workq)
+		return -ENOMEM;
+
 	/* Debugfs support is optional, just warn if this fails */
 	cxgb4_debugfs_root = debugfs_create_dir(KBUILD_MODNAME, NULL);
 	if (!cxgb4_debugfs_root)
@@ -3883,6 +4019,8 @@ static void __exit cxgb4_cleanup_module(void)
 {
 	pci_unregister_driver(&cxgb4_driver);
 	debugfs_remove(cxgb4_debugfs_root);  /* NULL ok */
+	flush_workqueue(workq);
+	destroy_workqueue(workq);
 }
 
 module_init(cxgb4_init_module);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
index 5cc2f27..d79980c 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
@@ -218,6 +218,7 @@ struct cxgb4_lld_info {
 	unsigned short ucq_density;          /* # of user CQs/page */
 	void __iomem *gts_reg;               /* address of GTS register */
 	void __iomem *db_reg;                /* address of kernel doorbell */
+	int dbfifo_int_thresh;		     /* doorbell fifo int threshold */
 };
 
 struct cxgb4_uld_info {
@@ -226,6 +227,7 @@ struct cxgb4_uld_info {
 	int (*rx_handler)(void *handle, const __be64 *rsp,
 			  const struct pkt_gl *gl);
 	int (*state_change)(void *handle, enum cxgb4_state new_state);
+	int (*control)(void *handle, enum cxgb4_control control, ...);
 };
 
 int cxgb4_register_uld(enum cxgb4_uld type, const struct cxgb4_uld_info *p);
@@ -243,4 +245,6 @@ void cxgb4_iscsi_init(struct net_device *dev, unsigned int tag_mask,
 		      const unsigned int *pgsz_order);
 struct sk_buff *cxgb4_pktgl_to_skb(const struct pkt_gl *gl,
 				   unsigned int skb_len, unsigned int pull_len);
+int cxgb4_sync_txq_pidx(struct net_device *dev, u16 qid, u16 pidx, u16 size);
+int cxgb4_flush_eq_cache(struct net_device *dev);
 #endif  /* !__CXGB4_OFLD_H */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c
index 3631fbb..65ecf1e 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c
@@ -767,8 +767,13 @@ static void write_sgl(const struct sk_buff *skb, struct sge_txq *q,
 static inline void ring_tx_db(struct adapter *adap, struct sge_txq *q, int n)
 {
 	wmb();            /* write descriptors before telling HW */
-	t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL),
-		     QID(q->cntxt_id) | PIDX(n));
+	spin_lock(&q->db_lock);
+	if (!q->db_disabled) {
+		t4_write_reg(adap, MYPF_REG(A_SGE_PF_KDOORBELL),
+			     V_QID(q->cntxt_id) | V_PIDX(n));
+	}
+	q->db_pidx = q->pidx;
+	spin_unlock(&q->db_lock);
 }
 
 /**
@@ -2080,6 +2085,7 @@ static void init_txq(struct adapter *adap, struct sge_txq *q, unsigned int id)
 	q->stops = q->restarts = 0;
 	q->stat = (void *)&q->desc[q->size];
 	q->cntxt_id = id;
+	spin_lock_init(&q->db_lock);
 	adap->sge.egr_map[id - adap->sge.egr_start] = q;
 }
 
@@ -2414,9 +2420,15 @@ void t4_sge_init(struct adapter *adap)
 			 RXPKTCPLMODE |
 			 (STAT_LEN == 128 ? EGRSTATUSPAGESIZE : 0));
 
+	/*
+	 * Set up to drop DOORBELL writes when the DOORBELL FIFO overflows
+	 * and generate an interrupt when this occurs so we can recover.
+	 */
 	t4_set_reg_field(adap, A_SGE_DBFIFO_STATUS,
-			V_HP_INT_THRESH(5) | V_LP_INT_THRESH(5),
-			V_HP_INT_THRESH(5) | V_LP_INT_THRESH(5));
+			V_HP_INT_THRESH(M_HP_INT_THRESH) |
+			V_LP_INT_THRESH(M_LP_INT_THRESH),
+			V_HP_INT_THRESH(dbfifo_int_thresh) |
+			V_LP_INT_THRESH(dbfifo_int_thresh));
 	t4_set_reg_field(adap, A_SGE_DOORBELL_CONTROL, F_ENABLE_DROP,
 			F_ENABLE_DROP);
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
index 0adc5bc..111fc32 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h
@@ -190,6 +190,59 @@
 #define SGE_DEBUG_DATA_LOW 0x10d4
 #define SGE_INGRESS_QUEUES_PER_PAGE_PF 0x10f4
 
+#define S_LP_INT_THRESH    12
+#define V_LP_INT_THRESH(x) ((x) << S_LP_INT_THRESH)
+#define S_HP_INT_THRESH    28
+#define V_HP_INT_THRESH(x) ((x) << S_HP_INT_THRESH)
+#define A_SGE_DBFIFO_STATUS 0x10a4
+
+#define S_ENABLE_DROP    13
+#define V_ENABLE_DROP(x) ((x) << S_ENABLE_DROP)
+#define F_ENABLE_DROP    V_ENABLE_DROP(1U)
+#define A_SGE_DOORBELL_CONTROL 0x10a8
+
+#define A_SGE_CTXT_CMD 0x11fc
+#define A_SGE_DBQ_CTXT_BADDR 0x1084
+
+#define A_SGE_PF_KDOORBELL 0x0
+
+#define S_QID 15
+#define V_QID(x) ((x) << S_QID)
+
+#define S_PIDX 0
+#define V_PIDX(x) ((x) << S_PIDX)
+
+#define M_LP_COUNT 0x7ffU
+#define S_LP_COUNT 0
+#define G_LP_COUNT(x) (((x) >> S_LP_COUNT) & M_LP_COUNT)
+
+#define M_HP_COUNT 0x7ffU
+#define S_HP_COUNT 16
+#define G_HP_COUNT(x) (((x) >> S_HP_COUNT) & M_HP_COUNT)
+
+#define A_SGE_INT_ENABLE3 0x1040
+
+#define S_DBFIFO_HP_INT 8
+#define V_DBFIFO_HP_INT(x) ((x) << S_DBFIFO_HP_INT)
+#define F_DBFIFO_HP_INT V_DBFIFO_HP_INT(1U)
+
+#define S_DBFIFO_LP_INT 7
+#define V_DBFIFO_LP_INT(x) ((x) << S_DBFIFO_LP_INT)
+#define F_DBFIFO_LP_INT V_DBFIFO_LP_INT(1U)
+
+#define S_DROPPED_DB 0
+#define V_DROPPED_DB(x) ((x) << S_DROPPED_DB)
+#define F_DROPPED_DB V_DROPPED_DB(1U)
+
+#define S_ERR_DROPPED_DB 18
+#define V_ERR_DROPPED_DB(x) ((x) << S_ERR_DROPPED_DB)
+#define F_ERR_DROPPED_DB V_ERR_DROPPED_DB(1U)
+
+#define A_PCIE_MEM_ACCESS_OFFSET 0x306c
+
+#define M_HP_INT_THRESH 0xfU
+#define M_LP_INT_THRESH 0xfU
+
 #define PCIE_PF_CLI 0x44
 #define PCIE_INT_CAUSE 0x3004
 #define  UNXSPLCPLERR  0x20000000U
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
index 83ca454..0579e98 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
@@ -1625,4 +1625,27 @@ int t4_mem_win_read_len(struct adapter *adap, u32 addr, __be32 *data, int len);
 int t4_fwaddrspace_write(struct adapter *adap, unsigned int mbox,
 			  u32 addr, u32 val);
 
+#define S_FW_CMD_OP 24
+#define V_FW_CMD_OP(x) ((x) << S_FW_CMD_OP)
+
+#define S_FW_CMD_REQUEST 23
+#define V_FW_CMD_REQUEST(x) ((x) << S_FW_CMD_REQUEST)
+#define F_FW_CMD_REQUEST V_FW_CMD_REQUEST(1U)
+
+#define S_FW_CMD_WRITE 21
+#define V_FW_CMD_WRITE(x) ((x) << S_FW_CMD_WRITE)
+#define F_FW_CMD_WRITE V_FW_CMD_WRITE(1U)
+
+#define S_FW_LDST_CMD_ADDRSPACE 0
+#define V_FW_LDST_CMD_ADDRSPACE(x) ((x) << S_FW_LDST_CMD_ADDRSPACE)
+
+enum {
+	MEMWIN0_APERTURE = 65536,
+	MEMWIN0_BASE     = 0x30000,
+	MEMWIN1_APERTURE = 32768,
+	MEMWIN1_BASE     = 0x28000,
+	MEMWIN2_APERTURE = 2048,
+	MEMWIN2_BASE     = 0x1b800,
+};
+
 #endif /* _T4FW_INTERFACE_H_ */
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH V2 02/10] cxgb4: Common platform specific changes for DB Drop Recovery
From: Vipul Pandya @ 2011-10-20 17:10 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: roland-BHEL68pLQRGGvPXPguhicg, davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	divy-ut6Up61K2wZBDgjK7y7TUQ, dm-ut6Up61K2wZBDgjK7y7TUQ,
	kumaras-ut6Up61K2wZBDgjK7y7TUQ,
	swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW, Vipul Pandya

    - Add platform-specific callback functions for interrupts.  This is
    needed to do a single read-clear of the CAUSE register and then call
    out to platform specific functions for DB threshold interrupts and DB
    drop interrupts.

    - Add t4_mem_win_read_len() - mem-window reads for arbitrary lengths.
    This is used to read the CIDX/PIDX values from EC contexts during DB
    drop recovery.

    - Add t4_fwaddrspace_write() - sends addrspace write cmds to the fw.
    Needed to flush the sge eq context cache.

Signed-off-by: Vipul Pandya <vipul-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
Signed-off-by: Steve Wise <swise-7bPotxP6k4+P2YhJcF5u+vpXobYPEAuW@public.gmane.org>
---
Changes:
V2: Corrected the subject for patch.

 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c    |   69 +++++++++++++++++++++----
 drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h |    5 ++
 2 files changed, 63 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index 13609bf..32e1dd5 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -868,11 +868,14 @@ int t4_restart_aneg(struct adapter *adap, unsigned int mbox, unsigned int port)
 	return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL);
 }
 
+typedef void (*int_handler_t)(struct adapter *adap);
+
 struct intr_info {
 	unsigned int mask;       /* bits to check in interrupt status */
 	const char *msg;         /* message to print or NULL */
 	short stat_idx;          /* stat counter to increment or -1 */
 	unsigned short fatal;    /* whether the condition reported is fatal */
+	int_handler_t int_handler; /* platform-specific int handler */
 };
 
 /**
@@ -905,6 +908,8 @@ static int t4_handle_intr_status(struct adapter *adapter, unsigned int reg,
 		} else if (acts->msg && printk_ratelimit())
 			dev_warn(adapter->pdev_dev, "%s (0x%x)\n", acts->msg,
 				 status & acts->mask);
+		if (acts->int_handler)
+			acts->int_handler(adapter);
 		mask |= acts->mask;
 	}
 	status &= mask;
@@ -1013,9 +1018,9 @@ static void sge_intr_handler(struct adapter *adapter)
 		{ ERR_INVALID_CIDX_INC,
 		  "SGE GTS CIDX increment too large", -1, 0 },
 		{ ERR_CPL_OPCODE_0, "SGE received 0-length CPL", -1, 0 },
-		{ F_DBFIFO_LP_INT, NULL, -1, 0 },
-		{ F_DBFIFO_HP_INT, NULL, -1, 0 },
-		{ ERR_DROPPED_DB, "SGE doorbell dropped", -1, 0 },
+		{ F_DBFIFO_LP_INT, NULL, -1, 0, t4_db_full },
+		{ F_DBFIFO_HP_INT, NULL, -1, 0, t4_db_full },
+		{ F_ERR_DROPPED_DB, NULL, -1, 0, t4_db_dropped },
 		{ ERR_DATA_CPL_ON_HIGH_QID1 | ERR_DATA_CPL_ON_HIGH_QID0,
 		  "SGE IQID > 1023 received CPL for FL", -1, 0 },
 		{ ERR_BAD_DB_PIDX3, "SGE DBP 3 pidx increment too large", -1,
@@ -1036,20 +1041,14 @@ static void sge_intr_handler(struct adapter *adapter)
 	};
 
 	v = (u64)t4_read_reg(adapter, SGE_INT_CAUSE1) |
-	    ((u64)t4_read_reg(adapter, SGE_INT_CAUSE2) << 32);
+		((u64)t4_read_reg(adapter, SGE_INT_CAUSE2) << 32);
 	if (v) {
 		dev_alert(adapter->pdev_dev, "SGE parity error (%#llx)\n",
-			 (unsigned long long)v);
+				(unsigned long long)v);
 		t4_write_reg(adapter, SGE_INT_CAUSE1, v);
 		t4_write_reg(adapter, SGE_INT_CAUSE2, v >> 32);
 	}
 
-	err = t4_read_reg(adapter, A_SGE_INT_CAUSE3);
-	if (err & (F_DBFIFO_HP_INT|F_DBFIFO_LP_INT))
-		t4_db_full(adapter);
-	if (err & F_ERR_DROPPED_DB)
-		t4_db_dropped(adapter);
-
 	if (t4_handle_intr_status(adapter, SGE_INT_CAUSE3, sge_intr_info) ||
 	    v != 0)
 		t4_fatal_err(adapter);
@@ -1995,6 +1994,54 @@ int t4_wol_pat_enable(struct adapter *adap, unsigned int port, unsigned int map,
 	(var).retval_len16 = htonl(FW_LEN16(var)); \
 } while (0)
 
+int t4_fwaddrspace_write(struct adapter *adap, unsigned int mbox,
+			  u32 addr, u32 val)
+{
+	struct fw_ldst_cmd c;
+
+	memset(&c, 0, sizeof(c));
+	c.op_to_addrspace = htonl(V_FW_CMD_OP(FW_LDST_CMD) | F_FW_CMD_REQUEST |
+			    F_FW_CMD_WRITE |
+			    V_FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_FIRMWARE));
+	c.cycles_to_len16 = htonl(FW_LEN16(c));
+	c.u.addrval.addr = htonl(addr);
+	c.u.addrval.val = htonl(val);
+
+	return t4_wr_mbox(adap, mbox, &c, sizeof(c), NULL);
+}
+
+/*
+ *     t4_mem_win_read_len - read memory through PCIE memory window
+ *     @adap: the adapter
+ *     @addr: address of first byte requested aligned on 32b.
+ *     @data: len bytes to hold the data read
+ *     @len: amount of data to read from window.  Must be <=
+ *            MEMWIN0_APERATURE after adjusting for 16B alignment
+ *            requirements of the the memory window.
+ *
+ *     Read len bytes of data from MC starting at @addr.
+ */
+int t4_mem_win_read_len(struct adapter *adap, u32 addr, __be32 *data, int len)
+{
+	int i;
+	int off;
+
+	/*
+	 * Align on a 16B boundary.
+	 */
+	off = addr & 15;
+	if ((addr & 3) || (len + off) > MEMWIN0_APERTURE)
+		return -EINVAL;
+
+	t4_write_reg(adap, A_PCIE_MEM_ACCESS_OFFSET, addr & ~15);
+	t4_read_reg(adap, A_PCIE_MEM_ACCESS_OFFSET);
+
+	for (i = 0; i < len; i += 4)
+		*data++ = t4_read_reg(adap, (MEMWIN0_BASE + off + i));
+
+	return 0;
+}
+
 /**
  *	t4_mdio_rd - read a PHY register through MDIO
  *	@adap: the adapter
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
index edcfd7e..83ca454 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
@@ -1620,4 +1620,9 @@ struct fw_hdr {
 #define FW_HDR_FW_VER_MINOR_GET(x) (((x) >> 16) & 0xff)
 #define FW_HDR_FW_VER_MICRO_GET(x) (((x) >> 8) & 0xff)
 #define FW_HDR_FW_VER_BUILD_GET(x) (((x) >> 0) & 0xff)
+
+int t4_mem_win_read_len(struct adapter *adap, u32 addr, __be32 *data, int len);
+int t4_fwaddrspace_write(struct adapter *adap, unsigned int mbox,
+			  u32 addr, u32 val);
+
 #endif /* _T4FW_INTERFACE_H_ */
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* Re:
From: Linda Williams @ 2011-10-20 16:53 UTC (permalink / raw)
  To: mail1

[-- Attachment #1: Type: text/plain, Size: 0 bytes --]



[-- Attachment #2: cb1120.pdf --]
[-- Type: application/pdf, Size: 24007 bytes --]

^ permalink raw reply

* Re: [PATCH net-next 1/1] net: validate HWTSTAMP ioctl parameters
From: Richard Cochran @ 2011-10-20 16:35 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: netdev, David Miller
In-Reply-To: <1319059016.2829.68.camel@bwh-desktop>

On Wed, Oct 19, 2011 at 10:16:56PM +0100, Ben Hutchings wrote:
> On Fri, 2011-10-14 at 11:37 +0200, Richard Cochran wrote:
> > This patch adds a sanity check on the values provided by user space for
> > the hardware time stamping configuration. If the values lie outside of
> > the absolute limits, then the ioctl request will be denied.
> [...]
> 
> What does this validation buy us?  The driver still has to copy the
> values into kernel space again, at which point they may have been
> changed to be invalid.  Depending on how the driver uses them (perhaps
> as array indices), it may have to validate them again to avoid a
> security vulnerability.

Oops, you are right.

The drivers will really need to check the configuration again in any
case, since no driver will support every option.

I understood David's request as simply a sanity check on the absolute
limits.

> I think that either SIOCHWTSTAMP should be handled through a discrete
> device operation (not ndo_ioctl) which receives a pointer to the
> validated structure in kernel memory,

Okay, I'll take a stab at this.

> or a validation function should be
> exported to drivers so that they can call it from their ndo_ioctl
> implementations after copying the structure into kernel memory.

I think it better to do the sanity check in one place, to guard
against lazy or sloppy drivers.

Thanks,
Richard

^ permalink raw reply

* RE: [v3 PATCH 0/2] NETFILTER new target module, HMARK
From: Hans Schillström @ 2011-10-20 15:16 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: kaber@trash.net, jengelh@medozas.de,
	netfilter-devel@vger.kernel.org, netdev@vger.kernel.org,
	hans@schillstrom.com
In-Reply-To: <20111020085411.GA19050@1984>

Hello
>Hi Hans,
>
>On Thu, Oct 13, 2011 at 09:02:08PM +0200, Hans Schillstrom wrote:
>> The target allows you to create rules in the "raw" and "mangle" tables
>> which alter the netfilter mark (nfmark) field within a given range.
>> First a 32 bit hash value is generated then modulus by <limit> and
>> finally an offset is added before it's written to nfmark.
>> Prior to routing, the nfmark can influence the routing method (see
>> "Use netfilter MARK value as routing key") and can also be used by
>> other subsystems to change their behaviour.
[snip]
>> Example:
>>                                       App Server (Real Server)
>>
>>                                            +---------+
>>                                         -->| Service |
>>      Gateway A                             +---------+
>>                           /
>>             +----------+ /     +----+      +---------+
>> --- if -A---| selector |---->  |ipvs|  --->| Service |
>>             +----------+ \     +----+      +---------+
>>                           \
>>                                +----+      +---------+
>>                                |ipvs|   -->| Service |
>>                                +----+      +---------+
>>       Gateway C
>>             +----------+ /     +----+
>> --- if-B ---| selector | --->  |ipvs|
>>             +----------+ \     +----+      +---------+
>>                                            | Service |
>>                                            +---------+
>>                           /
>>             +----------+ /     +----+     ..
>> --- if-B ---| selector | --->  |ipvs|      +---------+
>>             +----------+ \     +----+      | Service |
>>                           \                +---------+
>> #
>> # Example with four ipvs loadbalancers
>> #
>> iptables -t mangle -I PREROUTING -d $IPADDR -j HMARK --hmark-mod 4 --hmark-offs 100
>
>I think you can replace this rule by:
>
>iptables -t mangle -I PREROUTING -d $IPADDR -m cluster \
>        --cluster-total-nodes 4 --cluster-local-node 1
>        --cluster-hash-seed 0xdeadbeef -j MARK --set-mark 100
>iptables -t mangle -I PREROUTING -d $IPADDR -m cluster \
>        --cluster-total-nodes 4 --cluster-local-node 2
>        --cluster-hash-seed 0xdeadbeef -j MARK --set-mark 101
>iptables -t mangle -I PREROUTING -d $IPADDR -m cluster \
>        --cluster-total-nodes 4 --cluster-local-node 3
>        --cluster-hash-seed 0xdeadbeef -j MARK --set-mark 103
>iptables -t mangle -I PREROUTING -d $IPADDR -m cluster \
>        --cluster-total-nodes 4 --cluster-local-node 4
>        --cluster-hash-seed 0xdeadbeef -j MARK --set-mark 104
>
>The hashing is done by the cluster match, which is currently based on
>the source address.
>
>This match currently depends on the connection tracking system, so you
>could save the ctmark with CONNMARK. Thus, you only has to hash the
>first packet of the flow, instead of hashing every single packet.
>

In our case we have 256 destinations and 1-20 different HMARK targets so, the no of iptables rules will be > 5000
(20 x 256 rules ...) with cluster target.

One other important difference is that HMARK can spread the packages based on a 5 tuple not only source
and of course skip any field.  That is usually important for load distribution.

I also dig into ICMP errors so the packet can be marked based on the original packet
i.e. routed to the right machine.

I some way the cluster target and hmark targets are similar, in other where different.
One of the main ideas with hmark is to place it in-front of a a clusters (and load balancers) where you have multiple 10G lines in.
(and in that case contrack is not our friend...) i.e. do partitioning of the load balancers.

Regards
Hans




^ permalink raw reply

* Re: [IEEE802.15.4][6LoWPAN] draft for fragmentation support
From: Alexander Smirnov @ 2011-10-20 15:11 UTC (permalink / raw)
  To: Dmitry Eremin-Solenikov
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-zigbee-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
In-Reply-To: <4EA02818.9000206-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

2011/10/20 Dmitry Eremin-Solenikov <dbaryshkov-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>:
> On 10/20/2011 03:17 PM, Alexander Smirnov wrote:
>> Hello everybody,
>>
>> below is the patch which adds support for fragmentation in 6LoWPAN
>
> Thanks for the patch!
>
>> point to point networks. This activity needs because of difference
>> in MTU: 1280 ipv6 and 128 ieee802.15.4
>
> 127.
>
>>
>> This patch is just a draft. Could anyone please look at
>> it and let me know your opinion.
>>
>> The most doubtful moments for me are:
>> 1. Should the list 'frag_list' and the mutex 'flist_lock' be
>> included into dev private data?
>
> I'd also think about being lock-free here via using RCU.
>
>> 2. Can I use 'dev_queue_xmit' to send fragments to queue?
>
> Yes.
>
> It seems I see the source of your problems. You try to fragment skb from the
> header_create function. It is not designed for this task. Please, don't do
> this! You are not the owner of the skb ATM. You can't just drop it from that
> function. Strictly speaking you can't be sure that this skb will really hit
> the device queue.
>
> You really should push this part into queue processing on the device.
>
>
>> From 48472bae269b7b1a4047967ec21eadb217c4fd6d Mon Sep 17 00:00:00 2001
>> From: Alexander Smirnov <alex.bluesman.smirnov-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>> Date: Thu, 20 Oct 2011 15:02:36 +0400
>> Subject: [PATCH] 6LoWPAN fragmentation support
>>
>> Signed-off-by: Alexander Smirnov <alex.bluesman.smirnov-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>> ---
>>  net/ieee802154/6lowpan.c |  286
>> +++++++++++++++++++++++++++++++++++++++++++++-
>>  net/ieee802154/6lowpan.h |    3 +
>>  2 files changed, 288 insertions(+), 1 deletions(-)
>>
>> diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c
>> index 96877bd..1923ec7 100644
>> --- a/net/ieee802154/6lowpan.c
>> +++ b/net/ieee802154/6lowpan.c
>> @@ -113,6 +113,24 @@ struct lowpan_dev_record {
>>      struct list_head list;
>>  };
>>
>> +struct lowpan_fragment {
>> +    u8 in_progress;            /* assembling is in progress */
>> +    struct sk_buff *skb;        /* skb to be assembled */
>> +    u8 *data;            /* data to be stored */
>> +    struct mutex lock;        /* concurency lock */
>> +    u16 length;            /* frame length to be assemled */
>> +    u32 bytes_rcv;            /* bytes received */
>> +    u16 tag;            /* current fragment tag */
>> +    struct timer_list timer;    /* assembling timer */
>> +    struct list_head list;        /* fragments list handler */
>> +};
>> +
>> +static unsigned short fragment_tag;
>
> What is this? Is it a part of 6lowpan standard? There is a long history
> behind being able to predict various packet/stream parameters. Please
> rethink and adjust this.
>
> Ideally (if it's not contra the standard) this could be a part of a hash
> (probably even 16 bits from CRC32 could work) calculated from a set of
> values like jiffies, cpu number, some other variables.
>
>> +
>> +/* TODO: bind mutex and list to device */
>> +static LIST_HEAD(lowpan_fragments);
>> +struct mutex flist_lock;
>> +
>>  static inline struct
>>  lowpan_dev_info *lowpan_dev_info(const struct net_device *dev)
>>  {
>> @@ -244,6 +262,18 @@ static u8 lowpan_fetch_skb_u8(struct sk_buff *skb)
>>      return ret;
>>  }
>>
>> +static u16 lowpan_fetch_skb_u16(struct sk_buff *skb)
>> +{
>> +    u16 ret;
>> +
>> +    BUG_ON(skb->len < 2);
>> +
>> +    ret = skb->data[0] | (skb->data[1] << 8);
>> +    skb_pull(skb, 2);
>> +    return ret;
>> +}
>> +
>> +static netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device
>> *dev);
>>  static int lowpan_header_create(struct sk_buff *skb,
>>                 struct net_device *dev,
>>                 unsigned short type, const void *_daddr,
>> @@ -467,9 +497,102 @@ static int lowpan_header_create(struct sk_buff *skb,
>>          memcpy(&(sa.hwaddr), saddr, 8);
>>
>>          mac_cb(skb)->flags = IEEE802154_FC_TYPE_DATA;
>> +
>> +        /* frame fragmentation */
>> +
>> +        /*
>> +         * if payload + mac header doesn't fit MTU-sized frame
>> +         * we need to fragment it.
>> +         */
>> +        if (skb->len > (127 - 24)) { /* MTU - MAC_HEADER_LENGTH */
>
> Magic constants. And the statement will have to be adjusted  after adding
> security handling. Does 6lowpan specify the maximum fragment size? IIRC
> there is a setting in the standard which exactly describes
> which should be the maximum data size: either 127 - max_header -
> max_security_header or just 'data + MPDU headers should fit into 127'.
>
> Could you please recheck this in both standards?
>
>> +            struct sk_buff *fr_skb;
>> +            u16 b_sent = 0;
>> +            unsigned short payload_len = skb->len;
>> +            int stat = 0;
>> +
>> +            pr_debug("%s: the frame is too big (0x%x),"
>> +                 "fragmentation needed, using tag = 0x%x\n",
>> +                 __func__, payload_len, fragment_tag);
>> +
>> +            fr_skb = skb_copy(skb, GFP_KERNEL);
>> +            if (!fr_skb)
>> +                goto error;
>> +
>> +            /* 40-bit - fragment dispatch size */
>> +            head = kzalloc(5, GFP_KERNEL);
>
> No real need to kzalloc. Could you please allocate it on stack?
>
>> +            if (!head)
>> +                goto error;
>> +
>> +            /* first fagment header */
>> +            head[0] = LOWPAN_DISPATCH_FRAG1 | (payload_len & 0x7);
>> +            head[1] = (payload_len >> 3) & 0xff;
>> +            head[2] = fragment_tag & 0xff;
>> +            head[3] = fragment_tag >> 8;
>> +
>
> This is not atomic!!! You should get the fragment tag value once for the
> whole skb and then use the same value in the whole function.
>
>> +
>> +            lowpan_raw_dump_inline(__func__, "first header",
>> +                            head, 4);
>> +
>> +            memcpy(skb_push(fr_skb, 4), head, 4);
>
> And what if there is no 4-byte space for the header?
>
>> +            skb_trim(fr_skb, LOWPAN_FRAG_SIZE);
>> +
>> +            dev_hard_header(fr_skb, lowpan_dev_info(dev)->real_dev,
>> +                type, (void *)&da, (void *)&sa, fr_skb->len);
>> +
>> +            /* send fragment to dev queue */
>> +            dev_queue_xmit(fr_skb);
>> +
>> +            /* next fragments headers */
>> +            head[0] |= 0x20;
>
> Magic value
>
>> +
>> +            lowpan_raw_dump_inline(__func__, "next headers",
>> +                            head, 5);
>> +
>> +            while (b_sent < payload_len) {
>> +                /* not the first fragment */
>> +                if (b_sent)
>> +                    skb_pull(skb, LOWPAN_FRAG_SIZE);
>
> Are you the owner of the original skb here? Seems you are not. So you can't
> change the original skb.
>
>> +
>> +                pr_debug("%s: preparing fragment %d\n",
>> +                    __func__, b_sent / LOWPAN_FRAG_SIZE);
>> +
>> +                /*
>> +                 * create copy of current buffer and trim it
>> +                 * down to fragment size
>> +                 */
>> +                fr_skb = skb_copy(skb, GFP_KERNEL);
>> +                if (!fr_skb)
>> +                    goto error;
>> +
>> +                skb_trim(fr_skb, LOWPAN_FRAG_SIZE);
>> +
>> +                /* add fragment header */
>> +                head[4] = b_sent / 8;
>> +                memcpy(skb_push(fr_skb, 5), head, 5);
>> +
>> +                b_sent += LOWPAN_FRAG_SIZE;
>> +
>> +                lowpan_raw_dump_table(__func__,
>> +                   "fragment data", fr_skb->data, fr_skb->len);
>> +
>> +                stat = dev_hard_header(fr_skb,
>> +                    lowpan_dev_info(dev)->real_dev, type,
>> +                    (void *)&da, (void *)&sa, fr_skb->len);
>> +
>> +                dev_queue_xmit(fr_skb);
>> +            }
>
> I don't like this piece of code. Please refactor it to a separate function
> that can send both first and next fragments.
>
> Also I don't see a point in copying original skb again and again. It would
> be wiser to allocate fragment skb's via dev_alloc_skb, reserve header space,
> push fragmentation header, then memcpy the rest of the data.
> Or you can use non-linear skb's referencing fragments from the data part of
> the original skb.
>
>
>> +
>> +            /* TODO: what's the correct way to skip default skb? */
>> +
>> +            fragment_tag++;
>
> Hmmm.
>
>> +            return stat;
>> +        } else
>>          return dev_hard_header(skb, lowpan_dev_info(dev)->real_dev,
>>                  type, (void *)&da, (void *)&sa, skb->len);
>>      }
>> +error:
>> +    kfree_skb(skb);
>> +    return -ENOMEM;
>>  }
>>
>>  static int lowpan_skb_deliver(struct sk_buff *skb, struct ipv6hdr *hdr)
>> @@ -511,6 +634,23 @@ static int lowpan_skb_deliver(struct sk_buff *skb,
>> struct ipv6hdr *hdr)
>>      return stat;
>>  }
>>
>> +static void lowpan_fragment_timer_expired(unsigned long tag)
>> +{
>> +    struct lowpan_fragment *entry, *tmp;
>> +
>> +    pr_debug("%s: timer expired for frame with tag %lu\n", __func__,
>> tag);
>> +
>> +    mutex_lock(&flist_lock);
>> +    list_for_each_entry_safe(entry, tmp, &lowpan_fragments, list)
>> +        if (entry->tag == tag) {
>> +            list_del(&entry->list);
>> +            kfree(entry->data);
>> +            kfree(entry);
>> +            break;
>> +        }
>> +    mutex_unlock(&flist_lock);
>> +}
>
> Rather than using a timer here, I'd use a delayed job (that can drop several
> fragmentated packets at once, if it's execution is delayed) plus a sorted
> fragments list to ease the calculation of the next fragment time out wipe.
>
>> +
>>  static int
>>  lowpan_process_data(struct sk_buff *skb)
>>  {
>> @@ -525,6 +665,139 @@ lowpan_process_data(struct sk_buff *skb)
>>      if (skb->len < 2)
>>          goto drop;
>>      iphc0 = lowpan_fetch_skb_u8(skb);
>> +
>> +    /* fragments assmebling */
>> +    switch (iphc0 & 0xf8) {
>> +    /* first fragment of the frame */
>> +    case LOWPAN_DISPATCH_FRAG1:
>> +    {
>> +        struct lowpan_fragment *entry, *frame;
>> +        u16 tag;
>> +
>> +        lowpan_raw_dump_inline(__func__, "first frame fragment header",
>> +                                skb->data, 3);
>> +
>> +        tmp = lowpan_fetch_skb_u8(skb);
>> +        tag = lowpan_fetch_skb_u16(skb);
>> +
>> +        /*
>> +         * check if frame assembling with the same tag is
>> +         * already in progress
>> +         */
>> +        rcu_read_lock();
>> +        list_for_each_entry_rcu(entry, &lowpan_fragments, list)
>> +            if (entry->tag == tag) {
>> +                pr_debug("%s ERROR: frame with this tag is"
>> +                     "alredy in assembling", __func__);
>> +                goto drop_rcu;
>> +            }
>> +        rcu_read_unlock();
>
> I'm not quite sure that your RCU/locking usage is correct.
>
>> +
>> +        /* alloc new frame structure */
>> +        frame = kzalloc(sizeof(struct lowpan_fragment), GFP_KERNEL);
>> +        if (!frame)
>> +            goto drop;
>> +
>> +        INIT_LIST_HEAD(&frame->list);
>> +
>> +        frame->bytes_rcv = 0;
>> +        frame->length = (iphc0 & 7) | (tmp << 3);
>> +        frame->tag = tag;
>> +        /* allocate buffer for frame assembling */
>> +        frame->data = kzalloc(frame->length, GFP_KERNEL);
>
> Why not allocate an skb here? You can do all fragments processing on the top
> of one skb + ranges handling.
>
> BTW: Did you study the skb reassembly code of IPv4?
>
>> +        if (!frame->data) {
>> +            kfree(frame);
>> +            goto drop;
>> +        }
>> +
>> +        pr_debug("%s: frame to be assembled: length = 0x%x, "
>> +             "tag = 0x%x\n", __func__, frame->length, frame->tag);
>> +
>> +        init_timer(&frame->timer);
>> +        /* (number of fragments) * (fragment processing time-out) */
>> +        frame->timer.expires = jiffies +
>> +          (frame->length / LOWPAN_FRAG_SIZE + 1) * LOWPAN_FRAG_TIMEOUT;
>> +        frame->timer.data = tag;
>> +        frame->timer.function = lowpan_fragment_timer_expired;
>> +
>> +        add_timer(&frame->timer);
>> +
>> +        mutex_lock(&flist_lock);
>> +        list_add_tail(&frame->list, &lowpan_fragments);
>> +        mutex_unlock(&flist_lock);
>> +
>> +        return kfree_skb(skb), 0;
>> +    }
>> +    /* second and next fragment of the frame */
>> +    case LOWPAN_DISPATCH_FRAGN:
>> +    {
>> +        u16 tag;
>> +        struct lowpan_fragment *entry, *t;
>> +
>> +        lowpan_raw_dump_inline(__func__, "next fragment header",
>> +                    skb->data, 4);
>> +
>> +        lowpan_fetch_skb_u8(skb); /* skip frame length byte */
>> +        tag = lowpan_fetch_skb_u16(skb);
>> +
>> +        rcu_read_lock();
>> +        list_for_each_entry_rcu(entry, &lowpan_fragments, list)
>> +            if (entry->tag == tag)
>> +                break;
>> +        rcu_read_unlock();
>> +
>> +        if (entry->tag != tag) {
>> +            pr_debug("%s ERROR: no frame structure found for this"
>> +                 "fragment", __func__);
>> +            goto drop;
>> +        }
>
> Can you be sure that you won't receive fragments out of order? No, you can
> not!
>
>> +
>> +        tmp = lowpan_fetch_skb_u8(skb); /* fetch offset */
>> +
>> +        lowpan_raw_dump_table(__func__, "next fragment payload",
>> +                    skb->data, skb->len);
>> +
>> +        /* if payload fits buffer, copy it */
>> +        if ((tmp * 8 + skb->len) <= entry->length) /* TODO: likely? */
>> +            memcpy(entry->data + tmp * 8, skb->data, skb->len);
>> +        else
>> +            goto drop;
>> +
>> +        entry->bytes_rcv += skb->len;
>> +
>> +        pr_debug("%s: frame length = 0x%x, bytes received = 0x%x\n",
>> +             __func__, entry->length, entry->bytes_rcv);
>> +
>> +        /* frame assembling complete */
>> +        if (entry->bytes_rcv == entry->length) {
>> +            struct sk_buff *tmp = skb;
>
>
> WTF?
>
>> +
>> +            mutex_lock(&flist_lock);
>> +            list_for_each_entry_safe(entry, t, &lowpan_fragments, list)
>> +                if (entry->tag == tag) {
>> +                    list_del(&entry->list);
>> +                    /* copy and clear skb */
>> +                    skb = skb_copy_expand(skb, entry->length,
>> skb_tailroom(skb), GFP_KERNEL);
>> +                    skb_pull(skb, skb->len);
>> +                    /* copy new data to skb */
>> +                    memcpy(skb_push(skb, entry->length), entry->data,
>> entry->length);
>> +                    kfree_skb(tmp);
>> +                    del_timer(&entry->timer);
>> +                    kfree(entry->data);
>> +                    kfree(entry);
>
> This is not the optimal way to code this. Consider reading about string
> concatenation in Java or Python.
>
>> +
>> +                    iphc0 = lowpan_fetch_skb_u8(skb);
>> +                    break;
>> +                }
>> +            mutex_unlock(&flist_lock);
>> +            break;
>> +        }
>> +        return kfree_skb(skb), 0;
>> +    }
>> +    default:
>> +        break;
>> +    }
>> +
>>      iphc1 = lowpan_fetch_skb_u8(skb);
>>
>>      _saddr = mac_cb(skb)->sa.hwaddr;
>> @@ -674,6 +947,8 @@ lowpan_process_data(struct sk_buff *skb)
>>      lowpan_raw_dump_table(__func__, "raw header dump", (u8 *)&hdr,
>>                              sizeof(hdr));
>>      return lowpan_skb_deliver(skb, &hdr);
>> +drop_rcu:
>> +    rcu_read_unlock();
>>  drop:
>>      kfree(skb);
>>      return -EINVAL;
>> @@ -765,8 +1040,15 @@ static int lowpan_rcv(struct sk_buff *skb, struct
>> net_device *dev,
>>          goto drop;
>>
>>      /* check that it's our buffer */
>> -    if ((skb->data[0] & 0xe0) == 0x60)
>> +    switch (skb->data[0] & 0xe0) {
>> +    case 0x60:        /* ipv6 datagram */
>> +    case 0xc0:        /* first fragment header */
>> +    case 0xe0:        /* next fragments headers */
>>          lowpan_process_data(skb);
>> +        break;
>> +    default:
>> +        break;
>> +    }
>>
>>      return NET_RX_SUCCESS;
>>
>> @@ -793,6 +1075,8 @@ static int lowpan_newlink(struct net *src_net, struct
>> net_device *dev,
>>      lowpan_dev_info(dev)->real_dev = real_dev;
>>      mutex_init(&lowpan_dev_info(dev)->dev_list_mtx);
>>
>> +    mutex_init(&flist_lock);
>> +
>>      entry = kzalloc(sizeof(struct lowpan_dev_record), GFP_KERNEL);
>>      if (!entry)
>>          return -ENOMEM;
>> diff --git a/net/ieee802154/6lowpan.h b/net/ieee802154/6lowpan.h
>> index 5d8cf80..e8e57c4 100644
>> --- a/net/ieee802154/6lowpan.h
>> +++ b/net/ieee802154/6lowpan.h
>> @@ -159,6 +159,9 @@
>>  #define LOWPAN_DISPATCH_FRAG1    0xc0 /* 11000xxx */
>>  #define LOWPAN_DISPATCH_FRAGN    0xe0 /* 11100xxx */
>>
>> +#define LOWPAN_FRAG_SIZE    40        /* fragment payload size */
>> +#define LOWPAN_FRAG_TIMEOUT    (HZ * 2)    /* processing time: 2 sec */
>
> Is it a standard defined interval?
>
>
> --
> With best wishes
> Dmitry
>

Hi Dmitry,

please don't judge so strictly. This patch doesn't pretend to be
merged. It's just a draft and I only asked some help regarding how to
implement some doubtful moments. And you gave it to me ;-)

But in any case thanks a lot to everybody!

With best regards,
Alexander

------------------------------------------------------------------------------
The demand for IT networking professionals continues to grow, and the
demand for specialized networking skills is growing even more rapidly.
Take a complimentary Learning@Ciosco Self-Assessment and learn 
about Cisco certifications, training, and career opportunities. 
http://p.sf.net/sfu/cisco-dev2dev

^ permalink raw reply

* Re: [net-next 5/6] ixgbe: add hardware timestamping support
From: Richard Cochran @ 2011-10-20 14:56 UTC (permalink / raw)
  To: Jacob Keller; +Cc: Jeff Kirsher, davem, Jacob Keller, netdev, gospo, sassmann
In-Reply-To: <CA+P7+xoP7VE0FEKnLCbzygw499xLr1PFynPn6W0+ud6KwZHgEg@mail.gmail.com>

On Wed, Oct 19, 2011 at 10:04:33AM -0700, Jacob Keller wrote:
> On Mon, Oct 17, 2011 at 9:44 AM, Richard Cochran <richardcochran@gmail.com> wrote:
> > So, is this wrap around due to the fact that you are tied to the
> > system time via time_compare? Or, putting it another way, can't you
> > program the hardware time stamping unit so that the registers have
> > some reasonable resolution (like 64 bits worth of nanoseconds) and
> > just offer RAW timestamps?
> 
> The wrap around is due to hardware limitations. The ixgbe devices
> cannot support 64bits worth of nanoseconds and still have the ability
> to adjust the frequency in parts per billion. A larger increment
> increases the resolution available for frequency adjustments, but
> decreases the time it takes for the cycle counter to wrap around.

Oh, well. That stinks.

I think you do want to offer ppb adjustment.

> > I would really like to move away from the timecompare hacks and
> > towards a proper PHC->SYS PPS solution.
> >
> 
> I agree that this is the correct approach. The timecompare
> functionality does have issues.

And these cards are highlighting timecompare weaknesses I had not even
thought of.

I expect that if you offer the RAW time stamps, then it should be
possible to have the time stamp values always correct (or nearly so)
even with a changing link speed. If the link speed change gives an
interrupt, then the ISR can reprogram the frequency compensation
registers and let the counter continue.

> > Again, doing the update thing on every packet won't work for real
> > world PTP scenarios.
> >
> Which is why the PHC solution is better. Work on implementing this
> support is in progress. Out of curiosity, what is the sync rate for
> the scenario that breaks this? I would like to try that rate out on my
> setup.

For the audio/video profile, they have a max of 32 sync packets per
second. Not sure about delay request rate, maybe 16 per second.

Thanks,
Richard

^ permalink raw reply

* Re: [PATCH 07/10] RDMA/cxgb4: DB Drop Recovery for RDMA and LLD queues.
From: Steve Wise @ 2011-10-20 14:32 UTC (permalink / raw)
  To: Roland Dreier
  Cc: Vipul Pandya, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	divy-ut6Up61K2wZBDgjK7y7TUQ, dm-ut6Up61K2wZBDgjK7y7TUQ,
	kumaras-ut6Up61K2wZBDgjK7y7TUQ
In-Reply-To: <CAL1RGDWCfJHD+PjXQ3STk4uc-fEyQ2GsQR5ALU6i4-f7w2bXUA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On 10/19/2011 05:12 PM, Roland Dreier wrote:
> This looks like the only drivers/infiniband patch that depends on the
> drivers/net changes?
>
>   - R.

I believe 5 and 7 have build dependencies.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH 5/5] macvtap: Fix the minor device number allocation
From: Eric W. Biederman @ 2011-10-20 14:29 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, Arnd Bergmann, Jason Wang, Michael S. Tsirkin,
	Ian Campbell, Shirly Ma
In-Reply-To: <m1botbu481.fsf_-_@fess.ebiederm.org>


On systems that create and delete lots of dynamic devices the
31bit linux ifindex fails to fit in the 16bit macvtap minor,
resulting in unusable macvtap devices.  I have systems running
automated tests that that hit this condition in just a few days.

Use a linux idr allocator to track which mavtap minor numbers
are available and and to track the association between macvtap
minor numbers and macvtap network devices.

Remove the unnecessary unneccessary check to see if the network
device we have found is indeed a macvtap device.  With macvtap
specific data structures it is impossible to find any other
kind of networking device.

Increase the macvtap minor range from 65536 to the full 20 bits
that is supported by linux device numbers.  It doesn't solve the
original problem but there is no penalty for a larger minor
device range.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 drivers/net/macvtap.c      |   87 ++++++++++++++++++++++++++++++++++++--------
 include/linux/if_macvlan.h |    1 +
 2 files changed, 72 insertions(+), 16 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 25689e9..1b7082d 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -51,15 +51,13 @@ static struct proto macvtap_proto = {
 };
 
 /*
- * Minor number matches netdev->ifindex, so need a potentially
- * large value. This also makes it possible to split the
- * tap functionality out again in the future by offering it
- * from other drivers besides macvtap. As long as every device
- * only has one tap, the interface numbers assure that the
- * device nodes are unique.
+ * Variables for dealing with macvtaps device numbers.
  */
 static dev_t macvtap_major;
-#define MACVTAP_NUM_DEVS 65536
+#define MACVTAP_NUM_DEVS (1U << MINORBITS)
+static DEFINE_MUTEX(minor_lock);
+static DEFINE_IDR(minor_idr);
+
 #define GOODCOPY_LEN 128
 static struct class *macvtap_class;
 static struct cdev macvtap_cdev;
@@ -275,6 +273,58 @@ static int macvtap_receive(struct sk_buff *skb)
 	return macvtap_forward(skb->dev, skb);
 }
 
+static int macvtap_get_minor(struct macvlan_dev *vlan)
+{
+	int retval = -ENOMEM;
+	int id;
+
+	mutex_lock(&minor_lock);
+	if (idr_pre_get(&minor_idr, GFP_KERNEL) == 0)
+		goto exit;
+
+	retval = idr_get_new_above(&minor_idr, vlan, 1, &id);
+	if (retval < 0) {
+		if (retval == -EAGAIN)
+			retval = -ENOMEM;
+		goto exit;
+	}
+	if (id < MACVTAP_NUM_DEVS) {
+		vlan->minor = id;
+	} else {
+		printk(KERN_ERR "too many macvtap devices\n");
+		retval = -EINVAL;
+		idr_remove(&minor_idr, id);
+	}
+exit:
+	mutex_unlock(&minor_lock);
+	return retval;
+}
+
+static void macvtap_free_minor(struct macvlan_dev *vlan)
+{
+	mutex_lock(&minor_lock);
+	if (vlan->minor) {
+		idr_remove(&minor_idr, vlan->minor);
+		vlan->minor = 0;
+	}
+	mutex_unlock(&minor_lock);
+}
+
+static struct net_device *dev_get_by_macvtap_minor(int minor)
+{
+	struct net_device *dev = NULL;
+	struct macvlan_dev *vlan;
+
+	mutex_lock(&minor_lock);
+	vlan = idr_find(&minor_idr, minor);
+	if (vlan) {
+		dev = vlan->dev;
+		dev_hold(dev);
+	}
+	mutex_unlock(&minor_lock);
+	return dev;
+}
+
 static int macvtap_newlink(struct net *src_net,
 			   struct net_device *dev,
 			   struct nlattr *tb[],
@@ -329,7 +379,7 @@ static void macvtap_sock_destruct(struct sock *sk)
 static int macvtap_open(struct inode *inode, struct file *file)
 {
 	struct net *net = current->nsproxy->net_ns;
-	struct net_device *dev = dev_get_by_index(net, iminor(inode));
+	struct net_device *dev = dev_get_by_macvtap_minor(iminor(inode));
 	struct macvtap_queue *q;
 	int err;
 
@@ -337,11 +387,6 @@ static int macvtap_open(struct inode *inode, struct file *file)
 	if (!dev)
 		goto out;
 
-	/* check if this is a macvtap device */
-	err = -EINVAL;
-	if (dev->rtnl_link_ops != &macvtap_link_ops)
-		goto out;
-
 	err = -ENOMEM;
 	q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
 					     &macvtap_proto);
@@ -961,12 +1006,15 @@ static int macvtap_device_event(struct notifier_block *unused,
 				unsigned long event, void *ptr)
 {
 	struct net_device *dev = ptr;
+	struct macvlan_dev *vlan;
 	struct device *classdev;
 	dev_t devt;
+	int err;
 
 	if (dev->rtnl_link_ops != &macvtap_link_ops)
 		return NOTIFY_DONE;
 
+	vlan = netdev_priv(dev);
 
 	switch (event) {
 	case NETDEV_REGISTER:
@@ -974,15 +1022,22 @@ static int macvtap_device_event(struct notifier_block *unused,
 		 * been registered but before register_netdevice has
 		 * finished running.
 		 */
-		devt = MKDEV(MAJOR(macvtap_major), dev->ifindex);
+		err = macvtap_get_minor(vlan);
+		if (err)
+			return notifier_from_errno(err);
+
+		devt = MKDEV(MAJOR(macvtap_major), vlan->minor);
 		classdev = device_create(macvtap_class, &dev->dev, devt,
 					 dev, "tap%d", dev->ifindex);
-		if (IS_ERR(classdev))
+		if (IS_ERR(classdev)) {
+			macvtap_free_minor(vlan);
 			return notifier_from_errno(PTR_ERR(classdev));
+		}
 		break;
 	case NETDEV_UNREGISTER:
-		devt = MKDEV(MAJOR(macvtap_major), dev->ifindex);
+		devt = MKDEV(MAJOR(macvtap_major), vlan->minor);
 		device_destroy(macvtap_class, devt);
+		macvtap_free_minor(vlan);
 		break;
 	}
 
diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h
index e28b2e4..d103dca 100644
--- a/include/linux/if_macvlan.h
+++ b/include/linux/if_macvlan.h
@@ -64,6 +64,7 @@ struct macvlan_dev {
 	int (*forward)(struct net_device *dev, struct sk_buff *skb);
 	struct macvtap_queue	*taps[MAX_MACVTAP_QUEUES];
 	int			numvtaps;
+	int			minor;
 };
 
 static inline void macvlan_count_rx(const struct macvlan_dev *vlan,
-- 
1.7.2.5

^ permalink raw reply related

* [PATCH 4/5] macvtap: Rewrite macvtap_newlink so the error handling works.
From: Eric W. Biederman @ 2011-10-20 14:28 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, Arnd Bergmann, Jason Wang, Michael S. Tsirkin,
	Ian Campbell, Shirly Ma
In-Reply-To: <m1hb33u4ab.fsf_-_@fess.ebiederm.org>


Place macvlan_common_newlink at the end of macvtap_newlink because
failing in newlink after registering your network device is not
supported.

Move device_create into a netdevice creation notifier.   The network device
notifier is the only hook that is called after the network device has been
registered with the device layer and before register_network_device returns
success.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 drivers/net/macvtap.c |   73 +++++++++++++++++++++++++++++++++----------------
 1 files changed, 49 insertions(+), 24 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 515aa87..25689e9 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -280,34 +280,16 @@ static int macvtap_newlink(struct net *src_net,
 			   struct nlattr *tb[],
 			   struct nlattr *data[])
 {
-	struct device *classdev;
-	dev_t devt;
-	int err;
-
-	err = macvlan_common_newlink(src_net, dev, tb, data,
-				     macvtap_receive, macvtap_forward);
-	if (err)
-		goto out;
-
-	devt = MKDEV(MAJOR(macvtap_major), dev->ifindex);
-
-	classdev = device_create(macvtap_class, &dev->dev, devt,
-				 dev, "tap%d", dev->ifindex);
-	if (IS_ERR(classdev)) {
-		err = PTR_ERR(classdev);
-		macvtap_del_queues(dev);
-	}
-
-out:
-	return err;
+	/* Don't put anything that may fail after macvlan_common_newlink
+	 * because we can't undo what it does.
+	 */
+	return macvlan_common_newlink(src_net, dev, tb, data,
+				      macvtap_receive, macvtap_forward);
 }
 
 static void macvtap_dellink(struct net_device *dev,
 			    struct list_head *head)
 {
-	device_destroy(macvtap_class,
-		       MKDEV(MAJOR(macvtap_major), dev->ifindex));
-
 	macvtap_del_queues(dev);
 	macvlan_dellink(dev, head);
 }
@@ -975,6 +957,42 @@ struct socket *macvtap_get_socket(struct file *file)
 }
 EXPORT_SYMBOL_GPL(macvtap_get_socket);
 
+static int macvtap_device_event(struct notifier_block *unused,
+				unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct device *classdev;
+	dev_t devt;
+
+	if (dev->rtnl_link_ops != &macvtap_link_ops)
+		return NOTIFY_DONE;
+
+
+	switch (event) {
+	case NETDEV_REGISTER:
+		/* Create the device node here after the network device has
+		 * been registered but before register_netdevice has
+		 * finished running.
+		 */
+		devt = MKDEV(MAJOR(macvtap_major), dev->ifindex);
+		classdev = device_create(macvtap_class, &dev->dev, devt,
+					 dev, "tap%d", dev->ifindex);
+		if (IS_ERR(classdev))
+			return notifier_from_errno(PTR_ERR(classdev));
+		break;
+	case NETDEV_UNREGISTER:
+		devt = MKDEV(MAJOR(macvtap_major), dev->ifindex);
+		device_destroy(macvtap_class, devt);
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block macvtap_notifier_block __read_mostly = {
+	.notifier_call	= macvtap_device_event,
+};
+
 static int macvtap_init(void)
 {
 	int err;
@@ -995,12 +1013,18 @@ static int macvtap_init(void)
 		goto out3;
 	}
 
-	err = macvlan_link_register(&macvtap_link_ops);
+	err = register_netdevice_notifier(&macvtap_notifier_block);
 	if (err)
 		goto out4;
 
+	err = macvlan_link_register(&macvtap_link_ops);
+	if (err)
+		goto out5;
+
 	return 0;
 
+out5:
+	unregister_netdevice_notifier(&macvtap_notifier_block);
 out4:
 	class_unregister(macvtap_class);
 out3:
@@ -1015,6 +1039,7 @@ module_init(macvtap_init);
 static void macvtap_exit(void)
 {
 	rtnl_link_unregister(&macvtap_link_ops);
+	unregister_netdevice_notifier(&macvtap_notifier_block);
 	class_unregister(macvtap_class);
 	cdev_del(&macvtap_cdev);
 	unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS);
-- 
1.7.2.5

^ permalink raw reply related

* [PATCH 3/5] macvtap: Don't leak unreceived packets when we delete a macvtap device.
From: Eric W. Biederman @ 2011-10-20 14:27 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, Arnd Bergmann, Jason Wang, Michael S. Tsirkin,
	Ian Campbell, Shirly Ma
In-Reply-To: <m1mxcvu4bk.fsf_-_@fess.ebiederm.org>


To avoid leaking packets in the receive queue.  Add a socket destructor
that will run whenever destroy a macvtap socket.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 drivers/net/macvtap.c |    6 ++++++
 1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 1d9c9c2..515aa87 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -339,6 +339,11 @@ static void macvtap_sock_write_space(struct sock *sk)
 		wake_up_interruptible_poll(wqueue, POLLOUT | POLLWRNORM | POLLWRBAND);
 }
 
+static void macvtap_sock_destruct(struct sock *sk)
+{
+	skb_queue_purge(&sk->sk_receive_queue);
+}
+
 static int macvtap_open(struct inode *inode, struct file *file)
 {
 	struct net *net = current->nsproxy->net_ns;
@@ -369,6 +374,7 @@ static int macvtap_open(struct inode *inode, struct file *file)
 	q->sock.ops = &macvtap_socket_ops;
 	sock_init_data(&q->sock, &q->sk);
 	q->sk.sk_write_space = macvtap_sock_write_space;
+	q->sk.sk_destruct = macvtap_sock_destruct;
 	q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
 	q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
 
-- 
1.7.2.5

^ permalink raw reply related

* [PATCH 2/5] macvtap: Fix macvtap_open races in the zero copy enable code.
From: Eric W. Biederman @ 2011-10-20 14:26 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, Arnd Bergmann, Jason Wang, Michael S. Tsirkin,
	Ian Campbell, Shirly Ma
In-Reply-To: <m1sjmnu4cm.fsf@fess.ebiederm.org>


To see if it is appropriate to enable the macvtap zero copy feature
don't test the lowerdev network device flags.   Instead test the
macvtap network device flags which are a direct copy of the lowerdev
flags.  This is important because nothing holds a reference to lowerdev
and on a very bad day we lowerdev could be a pointer to stale memory.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 drivers/net/macvtap.c |   11 +++++------
 1 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 70aa628..1d9c9c2 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -343,7 +343,6 @@ static int macvtap_open(struct inode *inode, struct file *file)
 {
 	struct net *net = current->nsproxy->net_ns;
 	struct net_device *dev = dev_get_by_index(net, iminor(inode));
-	struct macvlan_dev *vlan = netdev_priv(dev);
 	struct macvtap_queue *q;
 	int err;
 
@@ -376,12 +375,12 @@ static int macvtap_open(struct inode *inode, struct file *file)
 	/*
 	 * so far only KVM virtio_net uses macvtap, enable zero copy between
 	 * guest kernel and host kernel when lower device supports zerocopy
+	 *
+	 * The macvlan supports zerocopy iff the lower device supports zero
+	 * copy so we don't have to look at the lower device directly.
 	 */
-	if (vlan) {
-		if ((vlan->lowerdev->features & NETIF_F_HIGHDMA) &&
-		    (vlan->lowerdev->features & NETIF_F_SG))
-			sock_set_flag(&q->sk, SOCK_ZEROCOPY);
-	}
+	if ((dev->features & NETIF_F_HIGHDMA) && (dev->features & NETIF_F_SG))
+		sock_set_flag(&q->sk, SOCK_ZEROCOPY);
 
 	err = macvtap_set_queue(dev, file, q);
 	if (err)
-- 
1.7.2.5

^ permalink raw reply related

* Re: [patch] pktgen: bug when calling ndelay in x86 architectures
From: Daniel Turull @ 2011-10-20 14:26 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Ben Hutchings, David Miller, netdev, Robert Olsson,
	Voravit Tanyingyong, Jens Laas
In-Reply-To: <1319118253.7735.9.camel@edumazet-HP-Compaq-6005-Pro-SFF-PC>

On 10/20/2011 03:44 PM, Eric Dumazet wrote:
> Le jeudi 20 octobre 2011 à 15:22 +0200, Daniel Turull a écrit :
>> Hi,
>>
>> I tested the patch and it works well.
>>
> 
> Thanks !
> 
> 
>>
>> I think if we increase the constant to 1ms, we will reduce the jitter if we have
>> a rate between 1kpps and 10 kpps, but I guess is not a big deal.
>>
> 
> 
>> I've plot this new graph with this patch:
>> http://tslab.ssvl.kth.se/pktgen/img/inter_eric1.eps
> 
> Unfortunately, the sender cpu might be preempted by timer irq or other
> expensive irq, so the Min/Max values are not very different I guess.
> 
> I dont understand your Min values.
> 
> At 100 pps, how is it possible to have a Min value of ~5000 ns ?
> 
> 
My assumption is that for low rate, the min value is caused in the 
beginning of the test. When we start the transmission in pktgen_run(),
we set the pkt_dev->next_tx to the current time but the are
more operation to do, so the first transmission is a bit delayed. 
Even more if the cpu is preempted.
For the second packet, we are taking the pkt_dev->next_tx as a reference
and add the delay, in order to decide when to send.
So, my guess is that the first packet is delayed
and then we send the second packet only after a short time, in order
to keep the average rate in the transmission.

Daniel

^ permalink raw reply

* [PATCH 1/5] macvtap: Close a race between macvtap_open and macvtap_dellink.
From: Eric W. Biederman @ 2011-10-20 14:26 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, Arnd Bergmann, Jason Wang, Michael S. Tsirkin,
	Ian Campbell, Shirly Ma
In-Reply-To: <m1wrbzu4ec.fsf@fess.ebiederm.org>


There is a small window in macvtap_open between looking up a
networking device and calling macvtap_set_queue in which
macvtap_del_queues called from macvtap_dellink.   After
calling macvtap_del_queues it is totally incorrect to
allow macvtap_set_queue to proceed so prevent success by
reporting that all of the available queues are in use.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 drivers/net/macvtap.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 3da5578..70aa628 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -231,6 +231,8 @@ static void macvtap_del_queues(struct net_device *dev)
 		}
 	}
 	BUG_ON(vlan->numvtaps != 0);
+	/* guarantee that any future macvtap_set_queue will fail */
+	vlan->numvtaps = MAX_MACVTAP_QUEUES;
 	spin_unlock(&macvtap_lock);
 
 	synchronize_rcu();
-- 
1.7.2.5

^ permalink raw reply related

* [PATCH 0/5] macvtap fixes.
From: Eric W. Biederman @ 2011-10-20 14:24 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, Arnd Bergmann, Jason Wang, Michael S. Tsirkin,
	Ian Campbell, Shirly Ma


This series of patches fixes a series of minor bugs in the macvtap code.

The fixes to handle failures in newlink and the change in how we handle
minor device number allocations are particularly significant.

Eric W. Biederman (5):
      macvtap: Close a race between macvtap_open and macvtap_dellink.
      macvtap: Fix macvtap_open races in the zero copy enable code.
      macvtap: Don't leak unreceived packets when we delete a macvtap device.
      macvtap: Rewrite macvtap_newlink so the error handling works.
      macvtap: Fix the minor device number allocation

^ permalink raw reply

* Re: [PATCH 9/9] make net/core/scm.c uid comparisons user namespace aware
From: Serge E. Hallyn @ 2011-10-20 14:24 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: linux-kernel, akpm, oleg, richard, mikevs, segoon, gregkh,
	dhowells, eparis, Serge E. Hallyn, netdev
In-Reply-To: <m1fwinvl8u.fsf@fess.ebiederm.org>

Quoting Eric W. Biederman (ebiederm@xmission.com):
> "Serge E. Hallyn" <serge@hallyn.com> writes:
> 
> > Quoting Eric W. Biederman (ebiederm@xmission.com):
> >> Serge Hallyn <serge@hallyn.com> writes:
> >> 
> >> > From: "Serge E. Hallyn" <serge.hallyn@canonical.com>
> >> >
> >> > Currently uids are compared without regard for the user namespace.
> >> > Fix that to prevent tasks in a different user namespace from
> >> > wrongly matching on SCM_CREDENTIALS.
> >> >
> >> > In the past, either your uids had to match, or you had to have
> >> > CAP_SETXID.  In a namespaced world, you must either (both be in the
> >> > same user namespace and have your uids match), or you must have
> >> > CAP_SETXID targeted at the other user namespace.  The latter can
> >> > happen for instance if uid 500 created a new user namespace and
> >> > now interacts with uid 0 in it.
> >> 
> >> Serge this approach is wrong.
> >
> > Thanks for looking, Eric.
> >
> >> Because we pass the cred and the pid through the socket socket itself
> >> is just a conduit and should be ignored in this context.
> >
> > Ok, that makes sense, but
> >
> >> The only interesting test should be are you allowed to impersonate other
> >> users in your current userk namespace.
> >
> > Why in your current user namespace?  Shouldn't it be in the
> > target user ns?  I understand it could be wrong to tie the
> > user ns owning the socket to the target userns (though I still
> > kind of like it), but just because I have CAP_SETUID in my
> > own user_ns doesn't mean I should be able to pose as another
> > uid in your user_ns.
> 
> First and foremost it is important that you be able if you have the
> capability to impersonate other users in your current user namespace.
> That is what the capability actually controls.
> 
> None of this allows you to impersonate any user in any other user
> namespace.  The translation between users prevents that.
> 
> > (Now I also see that cred_to_ucred() translates to the current
> > user_ns, so that should have been a hint to me before about
> > your intent, but I'm not convinced I agree with your intent).
> >
> > And you do the same with the pid.  Why is that a valid assumption?
> 
> Yes.  Basically all the code is allow you to impersonate people you
> would have been able to impersonate before.  If your target is in
> another namespace you can not fool them.
> 
> With pids the logic should be a lot clearer.  Pretend to be a pid you can
> see in your current pid namespace.  Lookup and convert to struct pid aka
> the namespace agnostic object.  On output return the pid value that
> the target process will know you as.
> 
> Ultimately I think we need a ns_capable for the current user namespace
> instead of a global one.  But I don't see any rush to introduce
> ns_capable here.

I think I agree - I was mistakenly thinking that without this patch
there is an opportunity for a less privileged task in child user ns
to impersonate, but that's not possible, so let's drop this patch
for now!

thanks,
-serge

^ permalink raw reply

* Re: [PATCH 9/9] make net/core/scm.c uid comparisons user namespace aware
From: Serge E. Hallyn @ 2011-10-20 14:14 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Serge E. Hallyn, linux-kernel, akpm, oleg, richard, mikevs,
	segoon, gregkh, dhowells, eparis, netdev
In-Reply-To: <m1fwinvl8u.fsf@fess.ebiederm.org>

Quoting Eric W. Biederman (ebiederm@xmission.com):
> "Serge E. Hallyn" <serge@hallyn.com> writes:
> 
> > Quoting Eric W. Biederman (ebiederm@xmission.com):
> >> Serge Hallyn <serge@hallyn.com> writes:
> >> 
> >> > From: "Serge E. Hallyn" <serge.hallyn@canonical.com>
> >> >
> >> > Currently uids are compared without regard for the user namespace.
> >> > Fix that to prevent tasks in a different user namespace from
> >> > wrongly matching on SCM_CREDENTIALS.
> >> >
> >> > In the past, either your uids had to match, or you had to have
> >> > CAP_SETXID.  In a namespaced world, you must either (both be in the
> >> > same user namespace and have your uids match), or you must have
> >> > CAP_SETXID targeted at the other user namespace.  The latter can
> >> > happen for instance if uid 500 created a new user namespace and
> >> > now interacts with uid 0 in it.
> >> 
> >> Serge this approach is wrong.
> >
> > Thanks for looking, Eric.
> >
> >> Because we pass the cred and the pid through the socket socket itself
> >> is just a conduit and should be ignored in this context.
> >
> > Ok, that makes sense, but
> >
> >> The only interesting test should be are you allowed to impersonate other
> >> users in your current userk namespace.
> >
> > Why in your current user namespace?  Shouldn't it be in the
> > target user ns?  I understand it could be wrong to tie the
> > user ns owning the socket to the target userns (though I still
> > kind of like it), but just because I have CAP_SETUID in my
> > own user_ns doesn't mean I should be able to pose as another
> > uid in your user_ns.
> 
> First and foremost it is important that you be able if you have the
> capability to impersonate other users in your current user namespace.
> That is what the capability actually controls.
> 
> None of this allows you to impersonate any user in any other user
> namespace.  The translation between users prevents that.
> 
> > (Now I also see that cred_to_ucred() translates to the current
> > user_ns, so that should have been a hint to me before about
> > your intent, but I'm not convinced I agree with your intent).
> >
> > And you do the same with the pid.  Why is that a valid assumption?
> 
> Yes.  Basically all the code is allow you to impersonate people you
> would have been able to impersonate before.  If your target is in
> another namespace you can not fool them.
> 
> With pids the logic should be a lot clearer.  Pretend to be a pid you can
> see in your current pid namespace.  Lookup and convert to struct pid aka
> the namespace agnostic object.  On output return the pid value that

No.  That conversion is happending before the user-specified pid is
set.

> the target process will know you as.
> 
> Ultimately I think we need a ns_capable for the current user namespace
> instead of a global one.  But I don't see any rush to introduce
> ns_capable here.
> 
> Eric
> 

^ permalink raw reply

* Re: [IEEE802.15.4][6LoWPAN] draft for fragmentation support
From: Dmitry Eremin-Solenikov @ 2011-10-20 13:54 UTC (permalink / raw)
  To: Alexander Smirnov
  Cc: eric.dumazet-Re5JQEeQqe8AvxtiuMwx3w,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	kernel-janitors-u79uwXL29TY76Z2rM5mHXA,
	linux-zigbee-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q
In-Reply-To: <20111020111718.GA32181-AUGNqIMGY+aR2kOLt6zJ8ErlnG4Plg33XqFh9Ls21Oc@public.gmane.org>

On 10/20/2011 03:17 PM, Alexander Smirnov wrote:
 > Hello everybody,
 >
 > below is the patch which adds support for fragmentation in 6LoWPAN

Thanks for the patch!

 > point to point networks. This activity needs because of difference
 > in MTU: 1280 ipv6 and 128 ieee802.15.4

127.

 >
 > This patch is just a draft. Could anyone please look at
 > it and let me know your opinion.
 >
 > The most doubtful moments for me are:
 > 1. Should the list 'frag_list' and the mutex 'flist_lock' be
 > included into dev private data?

I'd also think about being lock-free here via using RCU.

 > 2. Can I use 'dev_queue_xmit' to send fragments to queue?

Yes.

It seems I see the source of your problems. You try to fragment skb from 
the header_create function. It is not designed for this task. Please, 
don't do this! You are not the owner of the skb ATM. You can't just drop 
it from that function. Strictly speaking you can't be sure that this skb 
will really hit the device queue.

You really should push this part into queue processing on the device.


 > From 48472bae269b7b1a4047967ec21eadb217c4fd6d Mon Sep 17 00:00:00 2001
 > From: Alexander Smirnov <alex.bluesman.smirnov-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
 > Date: Thu, 20 Oct 2011 15:02:36 +0400
 > Subject: [PATCH] 6LoWPAN fragmentation support
 >
 > Signed-off-by: Alexander Smirnov <alex.bluesman.smirnov-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
 > ---
 >  net/ieee802154/6lowpan.c |  286 
+++++++++++++++++++++++++++++++++++++++++++++-
 >  net/ieee802154/6lowpan.h |    3 +
 >  2 files changed, 288 insertions(+), 1 deletions(-)
 >
 > diff --git a/net/ieee802154/6lowpan.c b/net/ieee802154/6lowpan.c
 > index 96877bd..1923ec7 100644
 > --- a/net/ieee802154/6lowpan.c
 > +++ b/net/ieee802154/6lowpan.c
 > @@ -113,6 +113,24 @@ struct lowpan_dev_record {
 >      struct list_head list;
 >  };
 >
 > +struct lowpan_fragment {
 > +    u8 in_progress;            /* assembling is in progress */
 > +    struct sk_buff *skb;        /* skb to be assembled */
 > +    u8 *data;            /* data to be stored */
 > +    struct mutex lock;        /* concurency lock */
 > +    u16 length;            /* frame length to be assemled */
 > +    u32 bytes_rcv;            /* bytes received */
 > +    u16 tag;            /* current fragment tag */
 > +    struct timer_list timer;    /* assembling timer */
 > +    struct list_head list;        /* fragments list handler */
 > +};
 > +
 > +static unsigned short fragment_tag;

What is this? Is it a part of 6lowpan standard? There is a long history 
behind being able to predict various packet/stream parameters. Please 
rethink and adjust this.

Ideally (if it's not contra the standard) this could be a part of a hash 
(probably even 16 bits from CRC32 could work) calculated from a set of 
values like jiffies, cpu number, some other variables.

 > +
 > +/* TODO: bind mutex and list to device */
 > +static LIST_HEAD(lowpan_fragments);
 > +struct mutex flist_lock;
 > +
 >  static inline struct
 >  lowpan_dev_info *lowpan_dev_info(const struct net_device *dev)
 >  {
 > @@ -244,6 +262,18 @@ static u8 lowpan_fetch_skb_u8(struct sk_buff *skb)
 >      return ret;
 >  }
 >
 > +static u16 lowpan_fetch_skb_u16(struct sk_buff *skb)
 > +{
 > +    u16 ret;
 > +
 > +    BUG_ON(skb->len < 2);
 > +
 > +    ret = skb->data[0] | (skb->data[1] << 8);
 > +    skb_pull(skb, 2);
 > +    return ret;
 > +}
 > +
 > +static netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct 
net_device *dev);
 >  static int lowpan_header_create(struct sk_buff *skb,
 >                 struct net_device *dev,
 >                 unsigned short type, const void *_daddr,
 > @@ -467,9 +497,102 @@ static int lowpan_header_create(struct sk_buff 
*skb,
 >          memcpy(&(sa.hwaddr), saddr, 8);
 >
 >          mac_cb(skb)->flags = IEEE802154_FC_TYPE_DATA;
 > +
 > +        /* frame fragmentation */
 > +
 > +        /*
 > +         * if payload + mac header doesn't fit MTU-sized frame
 > +         * we need to fragment it.
 > +         */
 > +        if (skb->len > (127 - 24)) { /* MTU - MAC_HEADER_LENGTH */

Magic constants. And the statement will have to be adjusted  after 
adding security handling. Does 6lowpan specify the maximum fragment 
size? IIRC there is a setting in the standard which exactly describes
which should be the maximum data size: either 127 - max_header - 
max_security_header or just 'data + MPDU headers should fit into 127'.

Could you please recheck this in both standards?

 > +            struct sk_buff *fr_skb;
 > +            u16 b_sent = 0;
 > +            unsigned short payload_len = skb->len;
 > +            int stat = 0;
 > +
 > +            pr_debug("%s: the frame is too big (0x%x),"
 > +                 "fragmentation needed, using tag = 0x%x\n",
 > +                 __func__, payload_len, fragment_tag);
 > +
 > +            fr_skb = skb_copy(skb, GFP_KERNEL);
 > +            if (!fr_skb)
 > +                goto error;
 > +
 > +            /* 40-bit - fragment dispatch size */
 > +            head = kzalloc(5, GFP_KERNEL);

No real need to kzalloc. Could you please allocate it on stack?

 > +            if (!head)
 > +                goto error;
 > +
 > +            /* first fagment header */
 > +            head[0] = LOWPAN_DISPATCH_FRAG1 | (payload_len & 0x7);
 > +            head[1] = (payload_len >> 3) & 0xff;
 > +            head[2] = fragment_tag & 0xff;
 > +            head[3] = fragment_tag >> 8;
 > +

This is not atomic!!! You should get the fragment tag value once for the 
whole skb and then use the same value in the whole function.

 > +
 > +            lowpan_raw_dump_inline(__func__, "first header",
 > +                            head, 4);
 > +
 > +            memcpy(skb_push(fr_skb, 4), head, 4);

And what if there is no 4-byte space for the header?

 > +            skb_trim(fr_skb, LOWPAN_FRAG_SIZE);
 > +
 > +            dev_hard_header(fr_skb, lowpan_dev_info(dev)->real_dev,
 > +                type, (void *)&da, (void *)&sa, fr_skb->len);
 > +
 > +            /* send fragment to dev queue */
 > +            dev_queue_xmit(fr_skb);
 > +
 > +            /* next fragments headers */
 > +            head[0] |= 0x20;

Magic value

 > +
 > +            lowpan_raw_dump_inline(__func__, "next headers",
 > +                            head, 5);
 > +
 > +            while (b_sent < payload_len) {
 > +                /* not the first fragment */
 > +                if (b_sent)
 > +                    skb_pull(skb, LOWPAN_FRAG_SIZE);

Are you the owner of the original skb here? Seems you are not. So you 
can't change the original skb.

 > +
 > +                pr_debug("%s: preparing fragment %d\n",
 > +                    __func__, b_sent / LOWPAN_FRAG_SIZE);
 > +
 > +                /*
 > +                 * create copy of current buffer and trim it
 > +                 * down to fragment size
 > +                 */
 > +                fr_skb = skb_copy(skb, GFP_KERNEL);
 > +                if (!fr_skb)
 > +                    goto error;
 > +
 > +                skb_trim(fr_skb, LOWPAN_FRAG_SIZE);
 > +
 > +                /* add fragment header */
 > +                head[4] = b_sent / 8;
 > +                memcpy(skb_push(fr_skb, 5), head, 5);
 > +
 > +                b_sent += LOWPAN_FRAG_SIZE;
 > +
 > +                lowpan_raw_dump_table(__func__,
 > +                   "fragment data", fr_skb->data, fr_skb->len);
 > +
 > +                stat = dev_hard_header(fr_skb,
 > +                    lowpan_dev_info(dev)->real_dev, type,
 > +                    (void *)&da, (void *)&sa, fr_skb->len);
 > +
 > +                dev_queue_xmit(fr_skb);
 > +            }

I don't like this piece of code. Please refactor it to a separate 
function that can send both first and next fragments.

Also I don't see a point in copying original skb again and again. It 
would be wiser to allocate fragment skb's via dev_alloc_skb, reserve 
header space, push fragmentation header, then memcpy the rest of the data.
Or you can use non-linear skb's referencing fragments from the data part 
of the original skb.


 > +
 > +            /* TODO: what's the correct way to skip default skb? */
 > +
 > +            fragment_tag++;

Hmmm.

 > +            return stat;
 > +        } else
 >          return dev_hard_header(skb, lowpan_dev_info(dev)->real_dev,
 >                  type, (void *)&da, (void *)&sa, skb->len);
 >      }
 > +error:
 > +    kfree_skb(skb);
 > +    return -ENOMEM;
 >  }
 >
 >  static int lowpan_skb_deliver(struct sk_buff *skb, struct ipv6hdr *hdr)
 > @@ -511,6 +634,23 @@ static int lowpan_skb_deliver(struct sk_buff 
*skb, struct ipv6hdr *hdr)
 >      return stat;
 >  }
 >
 > +static void lowpan_fragment_timer_expired(unsigned long tag)
 > +{
 > +    struct lowpan_fragment *entry, *tmp;
 > +
 > +    pr_debug("%s: timer expired for frame with tag %lu\n", __func__, 
tag);
 > +
 > +    mutex_lock(&flist_lock);
 > +    list_for_each_entry_safe(entry, tmp, &lowpan_fragments, list)
 > +        if (entry->tag == tag) {
 > +            list_del(&entry->list);
 > +            kfree(entry->data);
 > +            kfree(entry);
 > +            break;
 > +        }
 > +    mutex_unlock(&flist_lock);
 > +}

Rather than using a timer here, I'd use a delayed job (that can drop 
several fragmentated packets at once, if it's execution is delayed) plus 
a sorted fragments list to ease the calculation of the next fragment 
time out wipe.

 > +
 >  static int
 >  lowpan_process_data(struct sk_buff *skb)
 >  {
 > @@ -525,6 +665,139 @@ lowpan_process_data(struct sk_buff *skb)
 >      if (skb->len < 2)
 >          goto drop;
 >      iphc0 = lowpan_fetch_skb_u8(skb);
 > +
 > +    /* fragments assmebling */
 > +    switch (iphc0 & 0xf8) {
 > +    /* first fragment of the frame */
 > +    case LOWPAN_DISPATCH_FRAG1:
 > +    {
 > +        struct lowpan_fragment *entry, *frame;
 > +        u16 tag;
 > +
 > +        lowpan_raw_dump_inline(__func__, "first frame fragment header",
 > +                                skb->data, 3);
 > +
 > +        tmp = lowpan_fetch_skb_u8(skb);
 > +        tag = lowpan_fetch_skb_u16(skb);
 > +
 > +        /*
 > +         * check if frame assembling with the same tag is
 > +         * already in progress
 > +         */
 > +        rcu_read_lock();
 > +        list_for_each_entry_rcu(entry, &lowpan_fragments, list)
 > +            if (entry->tag == tag) {
 > +                pr_debug("%s ERROR: frame with this tag is"
 > +                     "alredy in assembling", __func__);
 > +                goto drop_rcu;
 > +            }
 > +        rcu_read_unlock();

I'm not quite sure that your RCU/locking usage is correct.

 > +
 > +        /* alloc new frame structure */
 > +        frame = kzalloc(sizeof(struct lowpan_fragment), GFP_KERNEL);
 > +        if (!frame)
 > +            goto drop;
 > +
 > +        INIT_LIST_HEAD(&frame->list);
 > +
 > +        frame->bytes_rcv = 0;
 > +        frame->length = (iphc0 & 7) | (tmp << 3);
 > +        frame->tag = tag;
 > +        /* allocate buffer for frame assembling */
 > +        frame->data = kzalloc(frame->length, GFP_KERNEL);

Why not allocate an skb here? You can do all fragments processing on the 
top of one skb + ranges handling.

BTW: Did you study the skb reassembly code of IPv4?

 > +        if (!frame->data) {
 > +            kfree(frame);
 > +            goto drop;
 > +        }
 > +
 > +        pr_debug("%s: frame to be assembled: length = 0x%x, "
 > +             "tag = 0x%x\n", __func__, frame->length, frame->tag);
 > +
 > +        init_timer(&frame->timer);
 > +        /* (number of fragments) * (fragment processing time-out) */
 > +        frame->timer.expires = jiffies +
 > +          (frame->length / LOWPAN_FRAG_SIZE + 1) * LOWPAN_FRAG_TIMEOUT;
 > +        frame->timer.data = tag;
 > +        frame->timer.function = lowpan_fragment_timer_expired;
 > +
 > +        add_timer(&frame->timer);
 > +
 > +        mutex_lock(&flist_lock);
 > +        list_add_tail(&frame->list, &lowpan_fragments);
 > +        mutex_unlock(&flist_lock);
 > +
 > +        return kfree_skb(skb), 0;
 > +    }
 > +    /* second and next fragment of the frame */
 > +    case LOWPAN_DISPATCH_FRAGN:
 > +    {
 > +        u16 tag;
 > +        struct lowpan_fragment *entry, *t;
 > +
 > +        lowpan_raw_dump_inline(__func__, "next fragment header",
 > +                    skb->data, 4);
 > +
 > +        lowpan_fetch_skb_u8(skb); /* skip frame length byte */
 > +        tag = lowpan_fetch_skb_u16(skb);
 > +
 > +        rcu_read_lock();
 > +        list_for_each_entry_rcu(entry, &lowpan_fragments, list)
 > +            if (entry->tag == tag)
 > +                break;
 > +        rcu_read_unlock();
 > +
 > +        if (entry->tag != tag) {
 > +            pr_debug("%s ERROR: no frame structure found for this"
 > +                 "fragment", __func__);
 > +            goto drop;
 > +        }

Can you be sure that you won't receive fragments out of order? No, you 
can not!

 > +
 > +        tmp = lowpan_fetch_skb_u8(skb); /* fetch offset */
 > +
 > +        lowpan_raw_dump_table(__func__, "next fragment payload",
 > +                    skb->data, skb->len);
 > +
 > +        /* if payload fits buffer, copy it */
 > +        if ((tmp * 8 + skb->len) <= entry->length) /* TODO: likely? */
 > +            memcpy(entry->data + tmp * 8, skb->data, skb->len);
 > +        else
 > +            goto drop;
 > +
 > +        entry->bytes_rcv += skb->len;
 > +
 > +        pr_debug("%s: frame length = 0x%x, bytes received = 0x%x\n",
 > +             __func__, entry->length, entry->bytes_rcv);
 > +
 > +        /* frame assembling complete */
 > +        if (entry->bytes_rcv == entry->length) {
 > +            struct sk_buff *tmp = skb;


WTF?

 > +
 > +            mutex_lock(&flist_lock);
 > +            list_for_each_entry_safe(entry, t, &lowpan_fragments, list)
 > +                if (entry->tag == tag) {
 > +                    list_del(&entry->list);
 > +                    /* copy and clear skb */
 > +                    skb = skb_copy_expand(skb, entry->length, 
skb_tailroom(skb), GFP_KERNEL);
 > +                    skb_pull(skb, skb->len);
 > +                    /* copy new data to skb */
 > +                    memcpy(skb_push(skb, entry->length), 
entry->data, entry->length);
 > +                    kfree_skb(tmp);
 > +                    del_timer(&entry->timer);
 > +                    kfree(entry->data);
 > +                    kfree(entry);

This is not the optimal way to code this. Consider reading about string 
concatenation in Java or Python.

 > +
 > +                    iphc0 = lowpan_fetch_skb_u8(skb);
 > +                    break;
 > +                }
 > +            mutex_unlock(&flist_lock);
 > +            break;
 > +        }
 > +        return kfree_skb(skb), 0;
 > +    }
 > +    default:
 > +        break;
 > +    }
 > +
 >      iphc1 = lowpan_fetch_skb_u8(skb);
 >
 >      _saddr = mac_cb(skb)->sa.hwaddr;
 > @@ -674,6 +947,8 @@ lowpan_process_data(struct sk_buff *skb)
 >      lowpan_raw_dump_table(__func__, "raw header dump", (u8 *)&hdr,
 >                              sizeof(hdr));
 >      return lowpan_skb_deliver(skb, &hdr);
 > +drop_rcu:
 > +    rcu_read_unlock();
 >  drop:
 >      kfree(skb);
 >      return -EINVAL;
 > @@ -765,8 +1040,15 @@ static int lowpan_rcv(struct sk_buff *skb, 
struct net_device *dev,
 >          goto drop;
 >
 >      /* check that it's our buffer */
 > -    if ((skb->data[0] & 0xe0) == 0x60)
 > +    switch (skb->data[0] & 0xe0) {
 > +    case 0x60:        /* ipv6 datagram */
 > +    case 0xc0:        /* first fragment header */
 > +    case 0xe0:        /* next fragments headers */
 >          lowpan_process_data(skb);
 > +        break;
 > +    default:
 > +        break;
 > +    }
 >
 >      return NET_RX_SUCCESS;
 >
 > @@ -793,6 +1075,8 @@ static int lowpan_newlink(struct net *src_net, 
struct net_device *dev,
 >      lowpan_dev_info(dev)->real_dev = real_dev;
 >      mutex_init(&lowpan_dev_info(dev)->dev_list_mtx);
 >
 > +    mutex_init(&flist_lock);
 > +
 >      entry = kzalloc(sizeof(struct lowpan_dev_record), GFP_KERNEL);
 >      if (!entry)
 >          return -ENOMEM;
 > diff --git a/net/ieee802154/6lowpan.h b/net/ieee802154/6lowpan.h
 > index 5d8cf80..e8e57c4 100644
 > --- a/net/ieee802154/6lowpan.h
 > +++ b/net/ieee802154/6lowpan.h
 > @@ -159,6 +159,9 @@
 >  #define LOWPAN_DISPATCH_FRAG1    0xc0 /* 11000xxx */
 >  #define LOWPAN_DISPATCH_FRAGN    0xe0 /* 11100xxx */
 >
 > +#define LOWPAN_FRAG_SIZE    40        /* fragment payload size */
 > +#define LOWPAN_FRAG_TIMEOUT    (HZ * 2)    /* processing time: 2 sec */

Is it a standard defined interval?


-- 
With best wishes
Dmitry

------------------------------------------------------------------------------
The demand for IT networking professionals continues to grow, and the
demand for specialized networking skills is growing even more rapidly.
Take a complimentary Learning@Ciosco Self-Assessment and learn 
about Cisco certifications, training, and career opportunities. 
http://p.sf.net/sfu/cisco-dev2dev

^ permalink raw reply

* Re: [IEEE802.15.4][6LoWPAN] draft for fragmentation support
From: Eric Dumazet @ 2011-10-20 13:51 UTC (permalink / raw)
  To: Dmitry Eremin-Solenikov
  Cc: Alexander Smirnov, davem, slapin, linux-zigbee-devel, netdev,
	jonsmirl
In-Reply-To: <4EA01E77.8040300@gmail.com>

Le jeudi 20 octobre 2011 à 17:13 +0400, Dmitry Eremin-Solenikov a
écrit :
> Hi, Alexander, colleagues,
> 
> On 10/20/2011 04:50 PM, Alexander Smirnov wrote:
> > Hi Eric,
> >
> > thank you for the replies. And another question I forgot to ask:
> >
> > when I send fragments, I still have original skb buffer. What should I
> > do with it, is there any
> > "proper/good" ways to drop it? Because I've already fragmented it and
> > do not need to send
> > original skb to queue.
> 
> You might want to check the TCP/IP fragmentation code path. I think you 
> can drop it with kfree_skb, but I ain't sure ATM.

In the TCP/IP frag code path, we own the skb and can do many things,
like using the skb to store one of the fragment.

In a driver ndo_start_xmit(), things are a bit different.

Special care must be taken if skb_cloned(skb) is true...

^ permalink raw reply

* Re: [patch] pktgen: bug when calling ndelay in x86 architectures
From: Eric Dumazet @ 2011-10-20 13:44 UTC (permalink / raw)
  To: Daniel Turull
  Cc: Ben Hutchings, David Miller, netdev, Robert Olsson,
	Voravit Tanyingyong, Jens Laas
In-Reply-To: <4EA020B3.4030208@gmail.com>

Le jeudi 20 octobre 2011 à 15:22 +0200, Daniel Turull a écrit :
> Hi,
> 
> I tested the patch and it works well.
> 

Thanks !


> 
> I think if we increase the constant to 1ms, we will reduce the jitter if we have
> a rate between 1kpps and 10 kpps, but I guess is not a big deal.
> 


> I've plot this new graph with this patch:
> http://tslab.ssvl.kth.se/pktgen/img/inter_eric1.eps

Unfortunately, the sender cpu might be preempted by timer irq or other
expensive irq, so the Min/Max values are not very different I guess.

I dont understand your Min values.

At 100 pps, how is it possible to have a Min value of ~5000 ns ?

^ permalink raw reply

* Re: [PATCH 9/9] make net/core/scm.c uid comparisons user namespace aware
From: Eric W. Biederman @ 2011-10-20 13:35 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: linux-kernel, akpm, oleg, richard, mikevs, segoon, gregkh,
	dhowells, eparis, Serge E. Hallyn, netdev
In-Reply-To: <20111020125801.GA1315@hallyn.com>

"Serge E. Hallyn" <serge@hallyn.com> writes:

> Quoting Eric W. Biederman (ebiederm@xmission.com):
>> Serge Hallyn <serge@hallyn.com> writes:
>> 
>> > From: "Serge E. Hallyn" <serge.hallyn@canonical.com>
>> >
>> > Currently uids are compared without regard for the user namespace.
>> > Fix that to prevent tasks in a different user namespace from
>> > wrongly matching on SCM_CREDENTIALS.
>> >
>> > In the past, either your uids had to match, or you had to have
>> > CAP_SETXID.  In a namespaced world, you must either (both be in the
>> > same user namespace and have your uids match), or you must have
>> > CAP_SETXID targeted at the other user namespace.  The latter can
>> > happen for instance if uid 500 created a new user namespace and
>> > now interacts with uid 0 in it.
>> 
>> Serge this approach is wrong.
>
> Thanks for looking, Eric.
>
>> Because we pass the cred and the pid through the socket socket itself
>> is just a conduit and should be ignored in this context.
>
> Ok, that makes sense, but
>
>> The only interesting test should be are you allowed to impersonate other
>> users in your current userk namespace.
>
> Why in your current user namespace?  Shouldn't it be in the
> target user ns?  I understand it could be wrong to tie the
> user ns owning the socket to the target userns (though I still
> kind of like it), but just because I have CAP_SETUID in my
> own user_ns doesn't mean I should be able to pose as another
> uid in your user_ns.

First and foremost it is important that you be able if you have the
capability to impersonate other users in your current user namespace.
That is what the capability actually controls.

None of this allows you to impersonate any user in any other user
namespace.  The translation between users prevents that.

> (Now I also see that cred_to_ucred() translates to the current
> user_ns, so that should have been a hint to me before about
> your intent, but I'm not convinced I agree with your intent).
>
> And you do the same with the pid.  Why is that a valid assumption?

Yes.  Basically all the code is allow you to impersonate people you
would have been able to impersonate before.  If your target is in
another namespace you can not fool them.

With pids the logic should be a lot clearer.  Pretend to be a pid you can
see in your current pid namespace.  Lookup and convert to struct pid aka
the namespace agnostic object.  On output return the pid value that
the target process will know you as.

Ultimately I think we need a ns_capable for the current user namespace
instead of a global one.  But I don't see any rush to introduce
ns_capable here.

Eric

^ permalink raw reply

* Re: [patch] pktgen: bug when calling ndelay in x86 architectures
From: Daniel Turull @ 2011-10-20 13:22 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Ben Hutchings, David Miller, netdev, Robert Olsson,
	Voravit Tanyingyong, Jens Laas
In-Reply-To: <1319019235.3103.10.camel@edumazet-laptop>

Hi,

I tested the patch and it works well.


On 10/19/2011 12:13 PM, Eric Dumazet wrote:
> Le mercredi 19 octobre 2011 à 11:33 +0200, Daniel Turull a écrit :
>> Hi,
>> then if we want to use the spin more often.
>> maybe we can increase the constant from 100000 (0.1ms) to 1000000 (1ms)?
>> How was the current value chosen?
>>
> 
> Based on user needs ;)

I think if we increase the constant to 1ms, we will reduce the jitter if we have
a rate between 1kpps and 10 kpps, but I guess is not a big deal.

I've plot this new graph with this patch:
http://tslab.ssvl.kth.se/pktgen/img/inter_eric1.eps

> 
>> I did some measurements of the inter-arrival time between packets
>> and with bigger values the maximal is reduced in the rates between
>> 2kpps and 20kpps.
>>
> 
> ndelay()/udelay() have some inaccuracies, for 'long' values, because of
> rounding errors.
> 
> If we spin, just call ktime_now() in a loop until spin_until is
> reached...
> 
> That way you get max possible resolution, given kernel time service
> constraints.
> 
> Untested patch :
> 
> diff --git a/net/core/pktgen.c b/net/core/pktgen.c
> index 38d6577..5c7e900 100644
> --- a/net/core/pktgen.c
> +++ b/net/core/pktgen.c
> @@ -2145,9 +2145,11 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
>  	}
>  
>  	start_time = ktime_now();
> -	if (remaining < 100000)
> -		ndelay(remaining);	/* really small just spin */
> -	else {
> +	if (remaining < 100000) {
> +		do {
> +			end_time = ktime_now();
> +		} while (ktime_lt(end_time, spin_until));
> +	} else {
>  		/* see do_nanosleep */
>  		hrtimer_init_sleeper(&t, current);
>  		do {
> @@ -2162,8 +2164,8 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
>  			hrtimer_cancel(&t.timer);
>  		} while (t.task && pkt_dev->running && !signal_pending(current));
>  		__set_current_state(TASK_RUNNING);
> +		end_time = ktime_now();
>  	}
> -	end_time = ktime_now();
>  
>  	pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time));
>  	pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay);
> 
> 

Daniel

^ permalink raw reply

* Re: [IEEE802.15.4][6LoWPAN] draft for fragmentation support
From: Dmitry Eremin-Solenikov @ 2011-10-20 13:13 UTC (permalink / raw)
  To: Alexander Smirnov
  Cc: Eric Dumazet, davem, slapin, linux-zigbee-devel, netdev, jonsmirl
In-Reply-To: <CAJmB2rCe3BJKD07TOSyAT0vbDq_K1VHLOtECqeOMzaTsg3DokA@mail.gmail.com>

Hi, Alexander, colleagues,

On 10/20/2011 04:50 PM, Alexander Smirnov wrote:
> Hi Eric,
>
> thank you for the replies. And another question I forgot to ask:
>
> when I send fragments, I still have original skb buffer. What should I
> do with it, is there any
> "proper/good" ways to drop it? Because I've already fragmented it and
> do not need to send
> original skb to queue.

You might want to check the TCP/IP fragmentation code path. I think you 
can drop it with kfree_skb, but I ain't sure ATM.

>
> Thank you,
> Alexander

P.S. Top posting is really a bad style. And it's now that welcome in the 
MLs.

-- 
With best wishes
Dmitry

^ permalink raw reply

* Requesting for your partnership
From: Park Zihao @ 2011-10-20 12:42 UTC (permalink / raw)




I am Mr Park Zihao, an Account Officer with the International bank of Taipei,I need your assistance in a business deal and you will be paid 30% for your Management Fees". Please reply for details

^ permalink raw reply

* Re: [IEEE802.15.4][6LoWPAN] draft for fragmentation support
From: Eric Dumazet @ 2011-10-20 13:11 UTC (permalink / raw)
  To: Alexander Smirnov
  Cc: davem, dbaryshkov, slapin, linux-zigbee-devel, netdev, jonsmirl
In-Reply-To: <CAJmB2rCe3BJKD07TOSyAT0vbDq_K1VHLOtECqeOMzaTsg3DokA@mail.gmail.com>

Le jeudi 20 octobre 2011 à 16:50 +0400, Alexander Smirnov a écrit :
> Hi Eric,
> 
> thank you for the replies. And another question I forgot to ask:
> 
> when I send fragments, I still have original skb buffer. What should I
> do with it, is there any
> "proper/good" ways to drop it? Because I've already fragmented it and
> do not need to send
> original skb to queue.

I dont quite understand. Once your xmits are done, you must free the
original skb.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox