Netdev List
 help / color / mirror / Atom feed
* [PATCH net v3 5/7] net/rds: Set fr_state only to FRMR_IS_FREE if IB_WR_LOCAL_INV had been successful
From: Gerd Rausch @ 2019-07-16 22:29 UTC (permalink / raw)
  To: Santosh Shilimkar, netdev, linux-rdma, rds-devel; +Cc: David Miller

Fix a bug where fr_state first goes to FRMR_IS_STALE, because of a failure
of operation IB_WR_LOCAL_INV, but then gets set back to "FRMR_IS_FREE"
uncoditionally, even though the operation failed.

Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
---
 net/rds/ib_frmr.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index 708c553c3da5..adaa8e99e5a9 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -309,7 +309,8 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
 	}
 
 	if (frmr->fr_inv) {
-		frmr->fr_state = FRMR_IS_FREE;
+		if (frmr->fr_state == FRMR_IS_INUSE)
+			frmr->fr_state = FRMR_IS_FREE;
 		frmr->fr_inv = false;
 		wake_up(&frmr->fr_inv_done);
 	}
-- 
2.22.0



^ permalink raw reply related

* [PATCH net v3 4/7] net/rds: Fix NULL/ERR_PTR inconsistency
From: Gerd Rausch @ 2019-07-16 22:29 UTC (permalink / raw)
  To: Santosh Shilimkar, netdev, linux-rdma, rds-devel; +Cc: David Miller

Make function "rds_ib_try_reuse_ibmr" return NULL in case
memory region could not be allocated, since callers
simply check if the return value is not NULL.

Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
---
 net/rds/ib_rdma.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 6b047e63a769..c8c1e3ae8d84 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -450,7 +450,7 @@ struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool)
 				rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted);
 			else
 				rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted);
-			return ERR_PTR(-EAGAIN);
+			break;
 		}
 
 		/* We do have some empty MRs. Flush them out. */
@@ -464,7 +464,7 @@ struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool)
 			return ibmr;
 	}
 
-	return ibmr;
+	return NULL;
 }
 
 static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
-- 
2.22.0



^ permalink raw reply related

* [PATCH net v3 3/7] net/rds: Wait for the FRMR_IS_FREE (or FRMR_IS_STALE) transition after posting IB_WR_LOCAL_INV
From: Gerd Rausch @ 2019-07-16 22:29 UTC (permalink / raw)
  To: Santosh Shilimkar, netdev, linux-rdma, rds-devel; +Cc: David Miller

In order to:
1) avoid a silly bouncing between "clean_list" and "drop_list"
   triggered by function "rds_ib_reg_frmr" as it is releases frmr
   regions whose state is not "FRMR_IS_FREE" right away.

2) prevent an invalid access error in a race from a pending
   "IB_WR_LOCAL_INV" operation with a teardown ("dma_unmap_sg", "put_page")
   and de-registration ("ib_dereg_mr") of the corresponding
   memory region.

Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
---
 net/rds/ib_frmr.c | 65 +++++++++++++++++++++++++++--------------------
 net/rds/ib_mr.h   |  2 ++
 2 files changed, 40 insertions(+), 27 deletions(-)

diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index 6038138d6e38..708c553c3da5 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -76,6 +76,7 @@ static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev,
 
 	frmr->fr_state = FRMR_IS_FREE;
 	init_waitqueue_head(&frmr->fr_inv_done);
+	init_waitqueue_head(&frmr->fr_reg_done);
 	return ibmr;
 
 out_no_cigar:
@@ -124,6 +125,7 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
 	 */
 	ib_update_fast_reg_key(frmr->mr, ibmr->remap_count++);
 	frmr->fr_state = FRMR_IS_INUSE;
+	frmr->fr_reg = true;
 
 	memset(&reg_wr, 0, sizeof(reg_wr));
 	reg_wr.wr.wr_id = (unsigned long)(void *)ibmr;
@@ -144,7 +146,17 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
 		if (printk_ratelimit())
 			pr_warn("RDS/IB: %s returned error(%d)\n",
 				__func__, ret);
+		goto out;
 	}
+
+	/* Wait for the registration to complete in order to prevent an invalid
+	 * access error resulting from a race between the memory region already
+	 * being accessed while registration is still pending.
+	 */
+	wait_event(frmr->fr_reg_done, !frmr->fr_reg);
+
+out:
+
 	return ret;
 }
 
@@ -262,6 +274,19 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
 		pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret);
 		goto out;
 	}
+
+	/* Wait for the FRMR_IS_FREE (or FRMR_IS_STALE) transition in order to
+	 * 1) avoid a silly bouncing between "clean_list" and "drop_list"
+	 *    triggered by function "rds_ib_reg_frmr" as it is releases frmr
+	 *    regions whose state is not "FRMR_IS_FREE" right away.
+	 * 2) prevents an invalid access error in a race
+	 *    from a pending "IB_WR_LOCAL_INV" operation
+	 *    with a teardown ("dma_unmap_sg", "put_page")
+	 *    and de-registration ("ib_dereg_mr") of the corresponding
+	 *    memory region.
+	 */
+	wait_event(frmr->fr_inv_done, frmr->fr_state != FRMR_IS_INUSE);
+
 out:
 	return ret;
 }
@@ -289,6 +314,11 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
 		wake_up(&frmr->fr_inv_done);
 	}
 
+	if (frmr->fr_reg) {
+		frmr->fr_reg = false;
+		wake_up(&frmr->fr_reg_done);
+	}
+
 	atomic_inc(&ic->i_fastreg_wrs);
 }
 
@@ -297,14 +327,18 @@ void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed,
 {
 	struct rds_ib_mr *ibmr, *next;
 	struct rds_ib_frmr *frmr;
-	int ret = 0;
+	int ret = 0, ret2;
 	unsigned int freed = *nfreed;
 
 	/* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
 	list_for_each_entry(ibmr, list, unmap_list) {
-		if (ibmr->sg_dma_len)
-			ret |= rds_ib_post_inv(ibmr);
+		if (ibmr->sg_dma_len) {
+			ret2 = rds_ib_post_inv(ibmr);
+			if (ret2 && !ret)
+				ret = ret2;
+		}
 	}
+
 	if (ret)
 		pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, ret);
 
@@ -347,31 +381,8 @@ struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev,
 	}
 
 	do {
-		if (ibmr) {
-			/* Memory regions make it onto the "clean_list" via
-			 * "rds_ib_flush_mr_pool", after the memory region has
-			 * been posted for invalidation via "rds_ib_post_inv".
-			 *
-			 * At that point in time, "fr_state" may still be
-			 * in state "FRMR_IS_INUSE", since the only place where
-			 * "fr_state" transitions to "FRMR_IS_FREE" is in
-			 * is in "rds_ib_mr_cqe_handler", which is
-			 * triggered by a tasklet.
-			 *
-			 * So we wait for "fr_inv_done" to trigger
-			 * and only put memory regions onto the drop_list
-			 * that failed (i.e. not marked "FRMR_IS_FREE").
-			 *
-			 * This avoids the problem of memory-regions bouncing
-			 * between "clean_list" and "drop_list" before they
-			 * even have a chance to be properly invalidated.
-			 */
-			frmr = &ibmr->u.frmr;
-			wait_event(frmr->fr_inv_done, frmr->fr_state != FRMR_IS_INUSE);
-			if (frmr->fr_state == FRMR_IS_FREE)
-				break;
+		if (ibmr)
 			rds_ib_free_frmr(ibmr, true);
-		}
 		ibmr = rds_ib_alloc_frmr(rds_ibdev, nents);
 		if (IS_ERR(ibmr))
 			return ibmr;
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
index ab26c20ed66f..9045a8c0edff 100644
--- a/net/rds/ib_mr.h
+++ b/net/rds/ib_mr.h
@@ -58,6 +58,8 @@ struct rds_ib_frmr {
 	enum rds_ib_fr_state	fr_state;
 	bool			fr_inv;
 	wait_queue_head_t	fr_inv_done;
+	bool			fr_reg;
+	wait_queue_head_t	fr_reg_done;
 	struct ib_send_wr	fr_wr;
 	unsigned int		dma_npages;
 	unsigned int		sg_byte_len;
-- 
2.22.0



^ permalink raw reply related

* [PATCH net v3 2/7] net/rds: Get rid of "wait_clean_list_grace" and add locking
From: Gerd Rausch @ 2019-07-16 22:28 UTC (permalink / raw)
  To: Santosh Shilimkar, netdev, linux-rdma, rds-devel; +Cc: David Miller

Waiting for activity on the "clean_list" to quiesce is no substitute
for proper locking.

We can have multiple threads competing for "llist_del_first"
via "rds_ib_reuse_mr", and a single thread competing
for "llist_del_all" and "llist_del_first" via "rds_ib_flush_mr_pool".

Since "llist_del_first" depends on "list->first->next" not to change
in the midst of the operation, simply waiting for all current calls
to "rds_ib_reuse_mr" to quiesce across all CPUs is woefully inadequate:

By the time "wait_clean_list_grace" is done iterating over all CPUs to see
that there is no concurrent caller to "rds_ib_reuse_mr", a new caller may
have just shown up on the first CPU.

Furthermore, <linux/llist.h> explicitly calls out the need for locking:
 * Cases where locking is needed:
 * If we have multiple consumers with llist_del_first used in one consumer,
 * and llist_del_first or llist_del_all used in other consumers,
 * then a lock is needed.

Also, while at it, drop the unused "pool" parameter
from "list_to_llist_nodes".

Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
---
 net/rds/ib_mr.h   |  1 +
 net/rds/ib_rdma.c | 56 +++++++++++++++--------------------------------
 2 files changed, 19 insertions(+), 38 deletions(-)

diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
index 42daccb7b5eb..ab26c20ed66f 100644
--- a/net/rds/ib_mr.h
+++ b/net/rds/ib_mr.h
@@ -98,6 +98,7 @@ struct rds_ib_mr_pool {
 	struct llist_head	free_list;	/* unused MRs */
 	struct llist_head	clean_list;	/* unused & unmapped MRs */
 	wait_queue_head_t	flush_wait;
+	spinlock_t		clean_lock;	/* "clean_list" concurrency */
 
 	atomic_t		free_pinned;	/* memory pinned by free MRs */
 	unsigned long		max_items;
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 0b347f46b2f4..6b047e63a769 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -40,9 +40,6 @@
 
 struct workqueue_struct *rds_ib_mr_wq;
 
-static DEFINE_PER_CPU(unsigned long, clean_list_grace);
-#define CLEAN_LIST_BUSY_BIT 0
-
 static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
 {
 	struct rds_ib_device *rds_ibdev;
@@ -195,12 +192,11 @@ struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
 {
 	struct rds_ib_mr *ibmr = NULL;
 	struct llist_node *ret;
-	unsigned long *flag;
+	unsigned long flags;
 
-	preempt_disable();
-	flag = this_cpu_ptr(&clean_list_grace);
-	set_bit(CLEAN_LIST_BUSY_BIT, flag);
+	spin_lock_irqsave(&pool->clean_lock, flags);
 	ret = llist_del_first(&pool->clean_list);
+	spin_unlock_irqrestore(&pool->clean_lock, flags);
 	if (ret) {
 		ibmr = llist_entry(ret, struct rds_ib_mr, llnode);
 		if (pool->pool_type == RDS_IB_MR_8K_POOL)
@@ -209,23 +205,9 @@ struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
 			rds_ib_stats_inc(s_ib_rdma_mr_1m_reused);
 	}
 
-	clear_bit(CLEAN_LIST_BUSY_BIT, flag);
-	preempt_enable();
 	return ibmr;
 }
 
-static inline void wait_clean_list_grace(void)
-{
-	int cpu;
-	unsigned long *flag;
-
-	for_each_online_cpu(cpu) {
-		flag = &per_cpu(clean_list_grace, cpu);
-		while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
-			cpu_relax();
-	}
-}
-
 void rds_ib_sync_mr(void *trans_private, int direction)
 {
 	struct rds_ib_mr *ibmr = trans_private;
@@ -324,8 +306,7 @@ static unsigned int llist_append_to_list(struct llist_head *llist,
  * of clusters.  Each cluster has linked llist nodes of
  * MR_CLUSTER_SIZE mrs that are ready for reuse.
  */
-static void list_to_llist_nodes(struct rds_ib_mr_pool *pool,
-				struct list_head *list,
+static void list_to_llist_nodes(struct list_head *list,
 				struct llist_node **nodes_head,
 				struct llist_node **nodes_tail)
 {
@@ -402,8 +383,13 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
 	 */
 	dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list);
 	dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list);
-	if (free_all)
+	if (free_all) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&pool->clean_lock, flags);
 		llist_append_to_list(&pool->clean_list, &unmap_list);
+		spin_unlock_irqrestore(&pool->clean_lock, flags);
+	}
 
 	free_goal = rds_ib_flush_goal(pool, free_all);
 
@@ -416,27 +402,20 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
 		rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal);
 
 	if (!list_empty(&unmap_list)) {
-		/* we have to make sure that none of the things we're about
-		 * to put on the clean list would race with other cpus trying
-		 * to pull items off.  The llist would explode if we managed to
-		 * remove something from the clean list and then add it back again
-		 * while another CPU was spinning on that same item in llist_del_first.
-		 *
-		 * This is pretty unlikely, but just in case  wait for an llist grace period
-		 * here before adding anything back into the clean list.
-		 */
-		wait_clean_list_grace();
-
-		list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail);
+		unsigned long flags;
+
+		list_to_llist_nodes(&unmap_list, &clean_nodes, &clean_tail);
 		if (ibmr_ret) {
 			*ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode);
 			clean_nodes = clean_nodes->next;
 		}
 		/* more than one entry in llist nodes */
-		if (clean_nodes)
+		if (clean_nodes) {
+			spin_lock_irqsave(&pool->clean_lock, flags);
 			llist_add_batch(clean_nodes, clean_tail,
 					&pool->clean_list);
-
+			spin_unlock_irqrestore(&pool->clean_lock, flags);
+		}
 	}
 
 	atomic_sub(unpinned, &pool->free_pinned);
@@ -610,6 +589,7 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
 	init_llist_head(&pool->free_list);
 	init_llist_head(&pool->drop_list);
 	init_llist_head(&pool->clean_list);
+	spin_lock_init(&pool->clean_lock);
 	mutex_init(&pool->flush_lock);
 	init_waitqueue_head(&pool->flush_wait);
 	INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
-- 
2.22.0



^ permalink raw reply related

* [PATCH net v3 1/7] net/rds: Give fr_state a chance to transition to FRMR_IS_FREE
From: Gerd Rausch @ 2019-07-16 22:28 UTC (permalink / raw)
  To: Santosh Shilimkar, netdev, linux-rdma, rds-devel; +Cc: David Miller

In the context of FRMR (ib_frmr.c):

Memory regions make it onto the "clean_list" via "rds_ib_flush_mr_pool",
after the memory region has been posted for invalidation via
"rds_ib_post_inv".

At that point in time, "fr_state" may still be in state "FRMR_IS_INUSE",
since the only place where "fr_state" transitions to "FRMR_IS_FREE"
is in "rds_ib_mr_cqe_handler", which is triggered by a tasklet.

So in case we notice that "fr_state != FRMR_IS_FREE" (see below),
we wait for "fr_inv_done" to trigger with a maximum of 10msec.
Then we check again, and only put the memory region onto the drop_list
(via "rds_ib_free_frmr") in case the situation remains unchanged.

This avoids the problem of memory-regions bouncing between "clean_list"
and "drop_list" before they even have a chance to be properly invalidated.

Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
---
 net/rds/ib_frmr.c | 27 ++++++++++++++++++++++++++-
 net/rds/ib_mr.h   |  1 +
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index 32ae26ed58a0..6038138d6e38 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -75,6 +75,7 @@ static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev,
 		pool->max_items_soft = pool->max_items;
 
 	frmr->fr_state = FRMR_IS_FREE;
+	init_waitqueue_head(&frmr->fr_inv_done);
 	return ibmr;
 
 out_no_cigar:
@@ -285,6 +286,7 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
 	if (frmr->fr_inv) {
 		frmr->fr_state = FRMR_IS_FREE;
 		frmr->fr_inv = false;
+		wake_up(&frmr->fr_inv_done);
 	}
 
 	atomic_inc(&ic->i_fastreg_wrs);
@@ -345,8 +347,31 @@ struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev,
 	}
 
 	do {
-		if (ibmr)
+		if (ibmr) {
+			/* Memory regions make it onto the "clean_list" via
+			 * "rds_ib_flush_mr_pool", after the memory region has
+			 * been posted for invalidation via "rds_ib_post_inv".
+			 *
+			 * At that point in time, "fr_state" may still be
+			 * in state "FRMR_IS_INUSE", since the only place where
+			 * "fr_state" transitions to "FRMR_IS_FREE" is in
+			 * is in "rds_ib_mr_cqe_handler", which is
+			 * triggered by a tasklet.
+			 *
+			 * So we wait for "fr_inv_done" to trigger
+			 * and only put memory regions onto the drop_list
+			 * that failed (i.e. not marked "FRMR_IS_FREE").
+			 *
+			 * This avoids the problem of memory-regions bouncing
+			 * between "clean_list" and "drop_list" before they
+			 * even have a chance to be properly invalidated.
+			 */
+			frmr = &ibmr->u.frmr;
+			wait_event(frmr->fr_inv_done, frmr->fr_state != FRMR_IS_INUSE);
+			if (frmr->fr_state == FRMR_IS_FREE)
+				break;
 			rds_ib_free_frmr(ibmr, true);
+		}
 		ibmr = rds_ib_alloc_frmr(rds_ibdev, nents);
 		if (IS_ERR(ibmr))
 			return ibmr;
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
index 5da12c248431..42daccb7b5eb 100644
--- a/net/rds/ib_mr.h
+++ b/net/rds/ib_mr.h
@@ -57,6 +57,7 @@ struct rds_ib_frmr {
 	struct ib_mr		*mr;
 	enum rds_ib_fr_state	fr_state;
 	bool			fr_inv;
+	wait_queue_head_t	fr_inv_done;
 	struct ib_send_wr	fr_wr;
 	unsigned int		dma_npages;
 	unsigned int		sg_byte_len;
-- 
2.22.0



^ permalink raw reply related

* [PATCH net v3 0/7] net/rds: RDMA fixes
From: Gerd Rausch @ 2019-07-16 22:28 UTC (permalink / raw)
  To: Santosh Shilimkar, netdev, linux-rdma, rds-devel; +Cc: David Miller

A number of net/rds fixes necessary to make "rds_rdma.ko"
pass some basic Oracle internal tests.

Gerd Rausch (7):
  net/rds: Give fr_state a chance to transition to FRMR_IS_FREE
  net/rds: Get rid of "wait_clean_list_grace" and add locking
  net/rds: Wait for the FRMR_IS_FREE (or FRMR_IS_STALE) transition after
    posting IB_WR_LOCAL_INV
  net/rds: Fix NULL/ERR_PTR inconsistency
  net/rds: Set fr_state only to FRMR_IS_FREE if IB_WR_LOCAL_INV had been
    successful
  net/rds: Keep track of and wait for FRWR segments in use upon shutdown
  net/rds: Initialize ic->i_fastreg_wrs upon allocation

 net/rds/ib.h      |  1 +
 net/rds/ib_cm.c   |  9 ++++-
 net/rds/ib_frmr.c | 84 ++++++++++++++++++++++++++++++++++++++++++-----
 net/rds/ib_mr.h   |  4 +++
 net/rds/ib_rdma.c | 60 +++++++++++----------------------
 5 files changed, 109 insertions(+), 49 deletions(-)

-- 

Changes in submitted patch v3:
* Use "wait_event" instead of "wait_event_timeout" in order to
  not have a deadline for the HCA firmware to respond.

^ permalink raw reply

* Re: [PATCH RFC 0/4] Add support to directly attach BPF program to ftrace
From: Alexei Starovoitov @ 2019-07-16 22:26 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: linux-kernel, Adrian Ratiu, Alexei Starovoitov, bpf,
	Brendan Gregg, connoro, Daniel Borkmann, duyuchao, Ingo Molnar,
	jeffv, Karim Yaghmour, kernel-team, linux-kselftest,
	Manali Shukla, Manjo Raja Rao, Martin KaFai Lau, Masami Hiramatsu,
	Matt Mullins, Michal Gregorczyk, Michal Gregorczyk,
	Mohammad Husain, namhyung, namhyung, netdev, paul.chaignon,
	primiano, Qais Yousef, Shuah Khan, Song Liu, Srinivas Ramana,
	Steven Rostedt, Tamir Carmeli, Yonghong Song
In-Reply-To: <20190716213050.GA161922@google.com>

On Tue, Jul 16, 2019 at 05:30:50PM -0400, Joel Fernandes wrote:
> 
> I also thought about the pinning idea before, but we also want to add support
> for not just raw tracepoints, but also regular tracepoints (events if you
> will). I am hesitant to add a new BPF API just for creating regular
> tracepoints and then pinning those as well.

and they should be done through the pinning as well.

> I don't see why a new bpf node for a trace event is a bad idea, really.

See the patches for kprobe/uprobe FD-based api and the reasons behind it.
tldr: text is racy, doesn't scale, poor security, etc.

> tracefs is how we deal with trace events on Android. 

android has made mistakes in the past as well.

> This is a natural extension to that and fits with the security model
> well.

I think it's the opposite.
I'm absolutely against text based apis.


^ permalink raw reply

* [PATCH net] bonding: Force slave speed check after link state recovery for 802.3ad
From: Thomas Falcon @ 2019-07-16 22:25 UTC (permalink / raw)
  To: netdev
  Cc: bjking1, pradeep, Thomas Falcon, Jarod Wilson, Jay Vosburgh,
	Veaceslav Falico, Andy Gospodarek

The following scenario was encountered during testing of logical
partition mobility on pseries partitions with bonded ibmvnic
adapters in LACP mode.

1. Driver receives a signal that the device has been
   swapped, and it needs to reset to initialize the new
   device.

2. Driver reports loss of carrier and begins initialization.

3. Bonding driver receives NETDEV_CHANGE notifier and checks
   the slave's current speed and duplex settings. Because these
   are unknown at the time, the bond sets its link state to
   BOND_LINK_FAIL and handles the speed update, clearing
   AD_PORT_LACP_ENABLE.

4. Driver finishes recovery and reports that the carrier is on.

5. Bond receives a new notification and checks the speed again.
   The speeds are valid but miimon has not altered the link
   state yet.  AD_PORT_LACP_ENABLE remains off.

Because the slave's link state is still BOND_LINK_FAIL,
no further port checks are made when it recovers. Though
the slave devices are operational and have valid speed
and duplex settings, the bond will not send LACPDU's. The
simplest fix I can see is to force another speed check
in bond_miimon_commit. This way the bond will update
AD_PORT_LACP_ENABLE if needed when transitioning from
BOND_LINK_FAIL to BOND_LINK_UP.

CC: Jarod Wilson <jarod@redhat.com>
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <andy@greyhouse.net>
Signed-off-by: Thomas Falcon <tlfalcon@linux.ibm.com>
---
 drivers/net/bonding/bond_main.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 9b7016a..02fd782 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2196,6 +2196,15 @@ static void bond_miimon_commit(struct bonding *bond)
 	bond_for_each_slave(bond, slave, iter) {
 		switch (slave->new_link) {
 		case BOND_LINK_NOCHANGE:
+			/* For 802.3ad mode, check current slave speed and
+			 * duplex again in case its port was disabled after
+			 * invalid speed/duplex reporting but recovered before
+			 * link monitoring could make a decision on the actual
+			 * link status
+			 */
+			if (BOND_MODE(bond) == BOND_MODE_8023AD &&
+			    slave->link == BOND_LINK_UP)
+				bond_3ad_adapter_speed_duplex_changed(slave);
 			continue;
 
 		case BOND_LINK_UP:
-- 
1.8.3.1


^ permalink raw reply related

* Re: [RFC bpf-next 0/8] bpf: accelerate insn patching speed
From: Jakub Kicinski @ 2019-07-16 22:12 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Jiong Wang, Andrii Nakryiko, Daniel Borkmann, Edward Cree,
	Naveen N. Rao, Andrii Nakryiko, bpf, Networking, oss-drivers,
	Yonghong Song
In-Reply-To: <20190716161701.mk5ye47aj2slkdjp@ast-mbp.dhcp.thefacebook.com>

On Tue, 16 Jul 2019 09:17:03 -0700, Alexei Starovoitov wrote:
> I don't think we have a test for such 'dead prog only due to verifier walk'
> situation. I wonder what happens :)

FWIW we do have verifier and BTF self tests for dead code removal
of entire subprogs! :)

^ permalink raw reply

* [PATCH 3/3] ipconfig: Handle CONFIG_CIFS_ROOT option
From: Paulo Alcantara (SUSE) @ 2019-07-16 22:04 UTC (permalink / raw)
  To: linux-kernel, netdev, linux-cifs, samba-technical
  Cc: Paulo Alcantara (SUSE), David S. Miller, Alexey Kuznetsov,
	Hideaki YOSHIFUJI
In-Reply-To: <20190716220452.3382-1-paulo@paulo.ac>

The experimental root file system support in cifs.ko relies on
ipconfig to set up the network stack and then accessing the SMB share
that contains the rootfs files.

Cc: "David S. Miller" <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Signed-off-by: Paulo Alcantara (SUSE) <paulo@paulo.ac>
---
 net/ipv4/ipconfig.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 9bcca08efec9..32e20b758b68 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1483,10 +1483,10 @@ static int __init ip_auto_config(void)
 	 * missing values.
 	 */
 	if (ic_myaddr == NONE ||
-#ifdef CONFIG_ROOT_NFS
+#if defined(CONFIG_ROOT_NFS) || defined(CONFIG_CIFS_ROOT)
 	    (root_server_addr == NONE &&
 	     ic_servaddr == NONE &&
-	     ROOT_DEV == Root_NFS) ||
+	     (ROOT_DEV == Root_NFS || ROOT_DEV == Root_CIFS)) ||
 #endif
 	    ic_first_dev->next) {
 #ifdef IPCONFIG_DYNAMIC
@@ -1513,6 +1513,12 @@ static int __init ip_auto_config(void)
 				goto try_try_again;
 			}
 #endif
+#ifdef CONFIG_CIFS_ROOT
+			if (ROOT_DEV == Root_CIFS) {
+				pr_err("IP-Config: Retrying forever (CIFS root)...\n");
+				goto try_try_again;
+			}
+#endif
 
 			if (--retries) {
 				pr_err("IP-Config: Reopening network devices...\n");
-- 
2.22.0


^ permalink raw reply related

* [PATCH 2/3] init: Support mounting root file systems over SMB
From: Paulo Alcantara (SUSE) @ 2019-07-16 22:04 UTC (permalink / raw)
  To: linux-kernel, netdev, linux-cifs, samba-technical
  Cc: Paulo Alcantara (SUSE), Andrew Morton
In-Reply-To: <20190716220452.3382-1-paulo@paulo.ac>

Add a new virtual device named /dev/cifs (0xfe) to tell the kernel to
mount the root file system over the network by using SMB protocol.

cifs_root_data() will be responsible to retrieve the parsed
information of the new command-line option (cifsroot=) and then call
do_mount_root() with the appropriate mount options for cifs.ko.

Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Paulo Alcantara (SUSE) <paulo@paulo.ac>
---
 init/do_mounts.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/init/do_mounts.c b/init/do_mounts.c
index 2d1ea3028454..28c5b583afef 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -212,6 +212,7 @@ static int match_dev_by_label(struct device *dev, const void *data)
  *	   a colon.
  *	9) PARTLABEL=<name> with name being the GPT partition label.
  *	   MSDOS partitions do not support labels!
+ *	10) /dev/cifs represents Root_CIFS (0xfe)
  *
  *	If name doesn't have fall into the categories above, we return (0,0).
  *	block_class is used to check if something is a disk name. If the disk
@@ -268,6 +269,9 @@ dev_t name_to_dev_t(const char *name)
 	res = Root_NFS;
 	if (strcmp(name, "nfs") == 0)
 		goto done;
+	res = Root_CIFS;
+	if (strcmp(name, "cifs") == 0)
+		goto done;
 	res = Root_RAM0;
 	if (strcmp(name, "ram") == 0)
 		goto done;
@@ -501,6 +505,42 @@ static int __init mount_nfs_root(void)
 }
 #endif
 
+#ifdef CONFIG_CIFS_ROOT
+
+extern int cifs_root_data(char **dev, char **opts);
+
+#define CIFSROOT_TIMEOUT_MIN	5
+#define CIFSROOT_TIMEOUT_MAX	30
+#define CIFSROOT_RETRY_MAX	5
+
+static int __init mount_cifs_root(void)
+{
+	char *root_dev, *root_data;
+	unsigned int timeout;
+	int try, err;
+
+	err = cifs_root_data(&root_dev, &root_data);
+	if (err != 0)
+		return 0;
+
+	timeout = CIFSROOT_TIMEOUT_MIN;
+	for (try = 1; ; try++) {
+		err = do_mount_root(root_dev, "cifs", root_mountflags,
+				    root_data);
+		if (err == 0)
+			return 1;
+		if (try > CIFSROOT_RETRY_MAX)
+			break;
+
+		ssleep(timeout);
+		timeout <<= 1;
+		if (timeout > CIFSROOT_TIMEOUT_MAX)
+			timeout = CIFSROOT_TIMEOUT_MAX;
+	}
+	return 0;
+}
+#endif
+
 #if defined(CONFIG_BLK_DEV_RAM) || defined(CONFIG_BLK_DEV_FD)
 void __init change_floppy(char *fmt, ...)
 {
@@ -542,6 +582,15 @@ void __init mount_root(void)
 		ROOT_DEV = Root_FD0;
 	}
 #endif
+#ifdef CONFIG_CIFS_ROOT
+	if (ROOT_DEV == Root_CIFS) {
+		if (mount_cifs_root())
+			return;
+
+		printk(KERN_ERR "VFS: Unable to mount root fs via SMB, trying floppy.\n");
+		ROOT_DEV = Root_FD0;
+	}
+#endif
 #ifdef CONFIG_BLK_DEV_FD
 	if (MAJOR(ROOT_DEV) == FLOPPY_MAJOR) {
 		/* rd_doload is 2 for a dual initrd/ramload setup */
-- 
2.22.0


^ permalink raw reply related

* [PATCH 1/3] cifs: Add support for root file systems
From: Paulo Alcantara (SUSE) @ 2019-07-16 22:04 UTC (permalink / raw)
  To: linux-kernel, netdev, linux-cifs, samba-technical
  Cc: Paulo Alcantara (SUSE), Steve French

Introduce a new CONFIG_CIFS_ROOT option to handle root file systems
over a SMB share.

In order to mount the root file system during the init process, make
cifs.ko perform non-blocking socket operations while mounting and
accessing it.

Cc: Steve French <smfrench@gmail.com>
Signed-off-by: Paulo Alcantara (SUSE) <paulo@paulo.ac>
---
 Documentation/filesystems/cifs/cifsroot.txt | 97 +++++++++++++++++++++
 fs/cifs/Kconfig                             |  8 ++
 fs/cifs/Makefile                            |  2 +
 fs/cifs/cifsglob.h                          |  2 +
 fs/cifs/cifsroot.c                          | 83 ++++++++++++++++++
 fs/cifs/connect.c                           | 17 +++-
 include/linux/root_dev.h                    |  1 +
 7 files changed, 207 insertions(+), 3 deletions(-)
 create mode 100644 Documentation/filesystems/cifs/cifsroot.txt
 create mode 100644 fs/cifs/cifsroot.c

diff --git a/Documentation/filesystems/cifs/cifsroot.txt b/Documentation/filesystems/cifs/cifsroot.txt
new file mode 100644
index 000000000000..0fa1a2c36a40
--- /dev/null
+++ b/Documentation/filesystems/cifs/cifsroot.txt
@@ -0,0 +1,97 @@
+Mounting root file system via SMB (cifs.ko)
+===========================================
+
+Written 2019 by Paulo Alcantara <palcantara@suse.de>
+Written 2019 by Aurelien Aptel <aaptel@suse.com>
+
+The CONFIG_CIFS_ROOT option enables experimental root file system
+support over the SMB protocol via cifs.ko.
+
+It introduces a new kernel command-line option called 'cifsroot='
+which will tell the kernel to mount the root file system over the
+network by utilizing SMB or CIFS protocol.
+
+In order to mount, the network stack will also need to be set up by
+using 'ip=' config option. For more details, see
+Documentation/filesystems/nfs/nfsroot.txt.
+
+A CIFS root mount currently requires the use of SMB1+UNIX Extensions
+which is only supported by the Samba server. SMB1 is the older
+deprecated version of the protocol but it has been extended to support
+POSIX features (See [1]). The equivalent extensions for the newer
+recommended version of the protocol (SMB3) have not been fully
+implemented yet which means SMB3 doesn't support some required POSIX
+file system objects (e.g. block devices, pipes, sockets).
+
+As a result, a CIFS root will default to SMB1 for now but the version
+to use can nonetheless be changed via the 'vers=' mount option.  This
+default will change once the SMB3 POSIX extensions are fully
+implemented.
+
+Server configuration
+====================
+
+To enable SMB1+UNIX extensions you will need to set these global
+settings in Samba smb.conf:
+
+    [global]
+    server min protocol = NT1
+    unix extension = yes        # default
+
+Kernel command line
+===================
+
+root=/dev/cifs
+
+This is just a virtual device that basically tells the kernel to mount
+the root file system via SMB protocol.
+
+cifsroot=//<server-ip>/<share>[,options]
+
+Enables the kernel to mount the root file system via SMB that are
+located in the <server-ip> and <share> specified in this option.
+
+The default mount options are set in fs/cifs/cifsroot.c.
+
+server-ip
+	IPv4 address of the server.
+
+share
+	Path to SMB share (rootfs).
+
+options
+	Optional mount options. For more information, see mount.cifs(8).
+
+Examples
+========
+
+Export root file system as a Samba share in smb.conf file.
+
+...
+[linux]
+	path = /path/to/rootfs
+	read only = no
+	guest ok = yes
+	force user = root
+	force group = root
+	browseable = yes
+	writeable = yes
+	admin users = root
+	public = yes
+	create mask = 0777
+	directory mask = 0777
+...
+
+Restart smb service.
+
+# systemctl restart smb
+
+Test it under QEMU on a kernel built with CONFIG_CIFS_ROOT and
+CONFIG_IP_PNP options enabled.
+
+# qemu-system-x86_64 -enable-kvm -cpu host -m 1024 \
+  -kernel /path/to/linux/arch/x86/boot/bzImage -nographic \
+  -append "root=/dev/cifs rw ip=dhcp cifsroot=//10.0.2.2/linux,username=foo,password=bar console=ttyS0 3"
+
+
+1: https://wiki.samba.org/index.php/UNIX_Extensions
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 523e9ea78a28..d28d62532318 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -217,3 +217,11 @@ config CIFS_FSCACHE
 	  Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
 	  to be cached locally on disk through the general filesystem cache
 	  manager. If unsure, say N.
+
+config CIFS_ROOT
+	bool "SMB root file system (Experimental)"
+	depends on CIFS=y && IP_PNP
+	help
+	  Enables root file system support over SMB protocol.
+
+	  Most people say N here.
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 51af69a1a328..45cf67f37487 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -22,3 +22,5 @@ cifs-$(CONFIG_CIFS_DFS_UPCALL) += dns_resolve.o cifs_dfs_ref.o dfs_cache.o
 cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o
 
 cifs-$(CONFIG_CIFS_SMB_DIRECT) += smbdirect.o
+
+cifs-$(CONFIG_CIFS_ROOT) += cifsroot.o
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 4777b3c4a92c..ac7e433a5b94 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -600,6 +600,7 @@ struct smb_vol {
 	__u64 snapshot_time; /* needed for timewarp tokens */
 	__u32 handle_timeout; /* persistent and durable handle timeout in ms */
 	unsigned int max_credits; /* smb3 max_credits 10 < credits < 60000 */
+	bool rootfs:1; /* if it's a SMB root file system */
 };
 
 /**
@@ -752,6 +753,7 @@ struct TCP_Server_Info {
 	 * reconnect.
 	 */
 	int nr_targets;
+	bool noblockcnt; /* use non-blocking connect() */
 };
 
 struct cifs_credits {
diff --git a/fs/cifs/cifsroot.c b/fs/cifs/cifsroot.c
new file mode 100644
index 000000000000..8760d9cbf25e
--- /dev/null
+++ b/fs/cifs/cifsroot.c
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * SMB root file system support
+ *
+ * Copyright (c) 2019 Paulo Alcantara <palcantara@suse.de>
+ */
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <linux/root_dev.h>
+#include <linux/kernel.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <net/ipconfig.h>
+
+#define DEFAULT_MNT_OPTS \
+	"vers=1.0,cifsacl,mfsymlinks,rsize=1048576,wsize=65536,uid=0,gid=0," \
+	"hard,rootfs"
+
+static char root_dev[2048] __initdata = "";
+static char root_opts[1024] __initdata = DEFAULT_MNT_OPTS;
+
+static __be32 __init parse_srvaddr(char *start, char *end)
+{
+	char addr[sizeof("aaa.bbb.ccc.ddd")];
+	int i = 0;
+
+	while (start < end && i < sizeof(addr) - 1) {
+		if (isdigit(*start) || *start == '.')
+			addr[i++] = *start;
+		start++;
+	}
+	addr[i] = '\0';
+	return in_aton(addr);
+}
+
+/* cifsroot=//<server-ip>/<share>[,options] */
+static int __init cifs_root_setup(char *line)
+{
+	char *s;
+	int len;
+	__be32 srvaddr = htonl(INADDR_NONE);
+
+	ROOT_DEV = Root_CIFS;
+
+	if (strlen(line) > 3 && line[0] == '/' && line[1] == '/') {
+		s = strchr(&line[2], '/');
+		if (!s || s[1] == '\0')
+			return 1;
+
+		s = strchrnul(s, ',');
+		len = s - line + 1;
+		if (len <= sizeof(root_dev)) {
+			strlcpy(root_dev, line, len);
+			srvaddr = parse_srvaddr(&line[2], s);
+			if (*s) {
+				snprintf(root_opts, sizeof(root_opts), "%s,%s",
+					 DEFAULT_MNT_OPTS, s + 1);
+			}
+		}
+	}
+
+	root_server_addr = srvaddr;
+
+	return 1;
+}
+
+__setup("cifsroot=", cifs_root_setup);
+
+int __init cifs_root_data(char **dev, char **opts)
+{
+	if (!root_dev[0] || root_server_addr == htonl(INADDR_NONE)) {
+		printk(KERN_ERR "Root-CIFS: no SMB server address\n");
+		return -1;
+	}
+
+	*dev = root_dev;
+	*opts = root_opts;
+
+	return 0;
+}
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 714a359c7c8d..ca6b04af727f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -96,7 +96,7 @@ enum {
 	Opt_multiuser, Opt_sloppy, Opt_nosharesock,
 	Opt_persistent, Opt_nopersistent,
 	Opt_resilient, Opt_noresilient,
-	Opt_domainauto, Opt_rdma,
+	Opt_domainauto, Opt_rdma, Opt_rootfs,
 
 	/* Mount options which take numeric value */
 	Opt_backupuid, Opt_backupgid, Opt_uid,
@@ -259,6 +259,7 @@ static const match_table_t cifs_mount_option_tokens = {
 	{ Opt_ignore, "nomand" },
 	{ Opt_ignore, "relatime" },
 	{ Opt_ignore, "_netdev" },
+	{ Opt_rootfs, "rootfs" },
 
 	{ Opt_err, NULL }
 };
@@ -1744,6 +1745,11 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 		case Opt_nodfs:
 			vol->nodfs = 1;
 			break;
+		case Opt_rootfs:
+#ifdef CONFIG_CIFS_ROOT
+			vol->rootfs = true;
+#endif
+			break;
 		case Opt_posixpaths:
 			vol->posix_paths = 1;
 			break;
@@ -2662,7 +2668,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 		goto out_err_crypto_release;
 	}
 
-	tcp_ses->noblocksnd = volume_info->noblocksnd;
+	tcp_ses->noblockcnt = volume_info->rootfs;
+	tcp_ses->noblocksnd = volume_info->noblocksnd || volume_info->rootfs;
 	tcp_ses->noautotune = volume_info->noautotune;
 	tcp_ses->tcp_nodelay = volume_info->sockopt_tcp_nodelay;
 	tcp_ses->rdma = volume_info->rdma;
@@ -3768,7 +3775,11 @@ generic_ip_connect(struct TCP_Server_Info *server)
 		 socket->sk->sk_sndbuf,
 		 socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo);
 
-	rc = socket->ops->connect(socket, saddr, slen, 0);
+	rc = socket->ops->connect(socket, saddr, slen,
+				  server->noblockcnt ? O_NONBLOCK : 0);
+
+	if (rc == -EINPROGRESS)
+		rc = 0;
 	if (rc < 0) {
 		cifs_dbg(FYI, "Error %d connecting to server\n", rc);
 		sock_release(socket);
diff --git a/include/linux/root_dev.h b/include/linux/root_dev.h
index bab671b0782f..4e78651371ba 100644
--- a/include/linux/root_dev.h
+++ b/include/linux/root_dev.h
@@ -8,6 +8,7 @@
 
 enum {
 	Root_NFS = MKDEV(UNNAMED_MAJOR, 255),
+	Root_CIFS = MKDEV(UNNAMED_MAJOR, 254),
 	Root_RAM0 = MKDEV(RAMDISK_MAJOR, 0),
 	Root_RAM1 = MKDEV(RAMDISK_MAJOR, 1),
 	Root_FD0 = MKDEV(FLOPPY_MAJOR, 0),
-- 
2.22.0


^ permalink raw reply related

* Re: [PATCH ghak90 V6 02/10] audit: add container id
From: Richard Guy Briggs @ 2019-07-16 22:03 UTC (permalink / raw)
  To: Paul Moore
  Cc: Serge E. Hallyn, Tycho Andersen, containers, linux-api,
	Linux-Audit Mailing List, linux-fsdevel, LKML, netdev,
	netfilter-devel, sgrubb, omosnace, dhowells, simo, Eric Paris,
	ebiederm, nhorman
In-Reply-To: <CAHC9VhRTT7JWqNnynvK04wKerjc-3UJ6R1uPtjCAPVr_tW-7MA@mail.gmail.com>

On 2019-07-15 17:04, Paul Moore wrote:
> On Mon, Jul 8, 2019 at 2:06 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > On 2019-05-30 15:29, Paul Moore wrote:
> 
> ...
> 
> > > [REMINDER: It is an "*audit* container ID" and not a general
> > > "container ID" ;)  Smiley aside, I'm not kidding about that part.]
> > >
> > > I'm not interested in supporting/merging something that isn't useful;
> > > if this doesn't work for your use case then we need to figure out what
> > > would work.  It sounds like nested containers are much more common in
> > > the lxc world, can you elaborate a bit more on this?
> > >
> > > As far as the possible solutions you mention above, I'm not sure I
> > > like the per-userns audit container IDs, I'd much rather just emit the
> > > necessary tracking information via the audit record stream and let the
> > > log analysis tools figure it out.  However, the bigger question is how
> > > to limit (re)setting the audit container ID when you are in a non-init
> > > userns.  For reasons already mentioned, using capable() is a non
> > > starter for everything but the initial userns, and using ns_capable()
> > > is equally poor as it essentially allows any userns the ability to
> > > munge it's audit container ID (obviously not good).  It appears we
> > > need a different method for controlling access to the audit container
> > > ID.
> >
> > We're not quite ready yet for multiple audit daemons and possibly not
> > yet for audit namespaces, but this is starting to look a lot like the
> > latter.
> 
> A few quick comments on audit namespaces: the audit container ID is
> not envisioned as a new namespace (even in nested form) and neither do
> I consider running multiple audit daemons to be a new namespace.

I can picture either one.

> From my perspective we create namespaces to allow us to redefine a
> global resource for some subset of the system, e.g. providing a unique
> /tmp for some number of processes on the system.  While it may be
> tempting to think of the audit container ID as something we could
> "namespace", especially when multiple audit daemons are concerned, in
> some ways this would be counter productive; the audit container ID is
> intended to be a global ID that can be used to associate audit event
> records with a "container" where the "container" is defined by an
> orchestrator outside the audit subsystem.  The global nature of the
> audit container ID allows us to maintain a sane(ish) view of the
> system in the audit log, if we were to "namespace" the audit container
> ID such that the value was no longer guaranteed to be unique
> throughout the system, we would need to additionally track the audit
> namespace along with the audit container ID which starts to border on
> insanity IMHO.

Understood.  And mostly agree.  Any audit namespace would have to be a
hybrid anyways, since only the init one would have full access to audit
resources.  All the others would be somewhat neutered.  And in the case
of checking for previous usage of a contid, if it was not already in use
in the hypothetical audit namespace but was in use elsewhere in the
system and we blocked its usage in this namespace, it would leak that
information by blocking it.

I saw it as a way of permitting layering with the natural descendancy
structure showing that hierarchy.  The potential flaw with my reasoning
is that a parent could exit and its children would get re-parented onto
its next ancestor, so the intermediate task with an intermediate contid
would break that contid documentation chain.

> > If we can't trust ns_capable() then why are we passing on
> > CAP_AUDIT_CONTROL?  It is being passed down and not stripped purposely
> > by the orchestrator/engine.  If ns_capable() isn't inherited how is it
> > gained otherwise?  Can it be inserted by cotainer image?  I think the
> > answer is "no".  Either we trust ns_capable() or we have audit
> > namespaces (recommend based on user namespace) (or both).
> 
> My thinking is that since ns_capable() checks the credentials with
> respect to the current user namespace we can't rely on it to control
> access since it would be possible for a privileged process running
> inside an unprivileged container to manipulate the audit container ID
> (containerized process has CAP_AUDIT_CONTROL, e.g. running as root in
> the container, while the container itself does not).

What makes an unprivileged container unprivileged?  "root", or "CAP_*"?

If CAP_AUDIT_CONTROL is granted, does "root" matter?  Does it matter
what user namespace it is in?  I understand that root is *gained* in an
unprivileged user namespace, but capabilities are inherited or permitted
and that process either has it or it doesn't and an unprivileged user
namespace can't gain a capability that has been rescinded.  Different
subsystems use the userid or capabilities or both to determine
privileges.  In this case, is the userid relevant?

> > At this point I would say we are at an impasse unless we trust
> > ns_capable() or we implement audit namespaces.
> 
> I'm not sure how we can trust ns_capable(), but if you can think of a
> way I would love to hear it.  I'm also not sure how namespacing audit
> is helpful (see my above comments), but if you think it is please
> explain.

So if we are not namespacing, why do we not trust capabilities?

> > I don't think another mechanism to trust nested orchestrators/engines
> > will buy us anything.
> >
> > Am I missing something?
> 
> Based on your questions/comments above it looks like your
> understanding of ns_capable() does not match mine; if I'm thinking
> about ns_capable() incorrectly, please educate me.
> 
> -- 
> paul moore
> www.paul-moore.com

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635

^ permalink raw reply

* Re: [PATCH 2/9] rcu: Add support for consolidated-RCU reader checking (v3)
From: Joel Fernandes @ 2019-07-16 22:02 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, Alexey Kuznetsov, Bjorn Helgaas, Borislav Petkov,
	c0d1n61at3, David S. Miller, edumazet, Greg Kroah-Hartman,
	Hideaki YOSHIFUJI, H. Peter Anvin, Ingo Molnar, Jonathan Corbet,
	Josh Triplett, keescook, kernel-hardening, kernel-team,
	Lai Jiangshan, Len Brown, linux-acpi, linux-doc, linux-pci,
	linux-pm, Mathieu Desnoyers, neilb, netdev, Oleg Nesterov,
	Pavel Machek, peterz, Rafael J. Wysocki, Rasmus Villemoes, rcu,
	Steven Rostedt, Tejun Heo, Thomas Gleixner, will,
	maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT)
In-Reply-To: <20190716185303.GM14271@linux.ibm.com>

On Tue, Jul 16, 2019 at 11:53:03AM -0700, Paul E. McKenney wrote:
[snip]
> > > A few more things below.
> > > > ---
> > > >  include/linux/rculist.h  | 28 ++++++++++++++++++++-----
> > > >  include/linux/rcupdate.h |  7 +++++++
> > > >  kernel/rcu/Kconfig.debug | 11 ++++++++++
> > > >  kernel/rcu/update.c      | 44 ++++++++++++++++++++++++----------------
> > > >  4 files changed, 67 insertions(+), 23 deletions(-)
> > > > 
> > > > diff --git a/include/linux/rculist.h b/include/linux/rculist.h
> > > > index e91ec9ddcd30..1048160625bb 100644
> > > > --- a/include/linux/rculist.h
> > > > +++ b/include/linux/rculist.h
> > > > @@ -40,6 +40,20 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
> > > >   */
> > > >  #define list_next_rcu(list)	(*((struct list_head __rcu **)(&(list)->next)))
> > > >  
> > > > +/*
> > > > + * Check during list traversal that we are within an RCU reader
> > > > + */
> > > > +
> > > > +#ifdef CONFIG_PROVE_RCU_LIST
> > > 
> > > This new Kconfig option is OK temporarily, but unless there is reason to
> > > fear malfunction that a few weeks of rcutorture, 0day, and -next won't
> > > find, it would be better to just use CONFIG_PROVE_RCU.  The overall goal
> > > is to reduce the number of RCU knobs rather than grow them, must though
> > > history might lead one to believe otherwise.  :-/
> > 
> > If you want, we can try to drop this option and just use PROVE_RCU however I
> > must say there may be several warnings that need to be fixed in a short
> > period of time (even a few weeks may be too short) considering the 1000+
> > uses of RCU lists.
> Do many people other than me build with CONFIG_PROVE_RCU?  If so, then
> that would be a good reason for a temporary CONFIG_PROVE_RCU_LIST,
> as in going away in a release or two once the warnings get fixed.

PROVE_RCU is enabled by default with PROVE_LOCKING, so it is used quite
heavilty.

> > But I don't mind dropping it and it may just accelerate the fixing up of all
> > callers.
> 
> I will let you decide based on the above question.  But if you have
> CONFIG_PROVE_RCU_LIST, as noted below, it needs to depend on RCU_EXPERT.

Ok, will make it depend. But yes for temporary purpose, I will leave it as a
config and remove it later.

thanks,

 - Joel
 

^ permalink raw reply

* OOM triggered by SCTP
From: Marek Majkowski @ 2019-07-16 21:47 UTC (permalink / raw)
  To: vyasevich, nhorman, marcelo.leitner, linux-sctp; +Cc: netdev, kernel-team

Morning,

My poor man's fuzzer found something interesting in SCTP. It seems
like creating large number of SCTP sockets + some magic dance, upsets
a memory subsystem related to SCTP. The sequence:

 - create SCTP socket
 - call setsockopts (SCTP_EVENTS)
 - call bind(::1, port)
 - call sendmsg(long buffer, MSG_CONFIRM, ::1, port)
 - close SCTP socket
 - repeat couple thousand times

Full code:
https://gist.github.com/majek/bd083dae769804d39134ce01f4f802bb#file-test_sctp-c

I'm running it on virtme the simplest way:
$ virtme-run --show-boot-console --rw --pwd --kimg bzImage --memory
512M --script-sh ./test_sctp

Originally I was running it inside net namespace, and just having a
localhost interface is sufficient to trigger the problem.

Kernel is 5.2.1 (with KASAN and such, but that shouldn't be a factor).
In some tests I saw a message that might indicate something funny
hitting neighbor table:

neighbour: ndisc_cache: neighbor table overflow!

I'm not addr-decoding the stack trace, since it seems unrelated to the
root cause.

Cheers,
    Marek

^ permalink raw reply

* Re: [Patch net v2 2/2] selftests: add a test case for rp filter
From: Cong Wang @ 2019-07-16 21:42 UTC (permalink / raw)
  To: Linux Kernel Network Developers; +Cc: David Ahern
In-Reply-To: <20190716205804.19775-2-xiyou.wangcong@gmail.com>

On Tue, Jul 16, 2019 at 1:58 PM Cong Wang <xiyou.wangcong@gmail.com> wrote:
>
> Add a test case to simulate the loopback packet case fixed
> in the previous patch.

Well, I actually have to create a dummy1 to simulate it, as dummy0
is the gateway interface assigned to an IP address.

I will send v3.

Thanks.

^ permalink raw reply

* Re: [PATCH bpf 1/2] selftests/bpf: fix test_verifier/test_maps make dependencies
From: Andrii Nakryiko @ 2019-07-16 21:40 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Andrii Nakryiko, bpf, Networking, Daniel Borkmann,
	Alexei Starovoitov, Kernel Team, Ilya Leoshkevich,
	Stanislav Fomichev, Martin KaFai Lau
In-Reply-To: <20190716195544.GB14834@mini-arch>

On Tue, Jul 16, 2019 at 12:55 PM Stanislav Fomichev <sdf@fomichev.me> wrote:
>
> On 07/16, Andrii Nakryiko wrote:
> > e46fc22e60a4 ("selftests/bpf: make directory prerequisites order-only")
> > exposed existing problem in Makefile for test_verifier and test_maps tests:
> > their dependency on auto-generated header file with a list of all tests wasn't
> > recorded explicitly. This patch fixes these issues.
> Why adding it explicitly fixes it? At least for test_verifier, we have
> the following rule:
>
>         test_verifier.c: $(VERIFIER_TESTS_H)
>
> And there should be implicit/builtin test_verifier -> test_verifier.c
> dependency rule.
>
> Same for maps, I guess:
>
>         $(OUTPUT)/test_maps: map_tests/*.c
>         test_maps.c: $(MAP_TESTS_H)
>
> So why is it not working as is? What I'm I missing?

I don't know exactly why it's not working, but it's clearly because of
that. It's the only difference between how test_progs are set up,
which didn't break, and test_maps/test_verifier, which did.

Feel free to figure it out through a maze of Makefiles why it didn't
work as expected, but this definitely fixed a breakage (at least for
me).

^ permalink raw reply

* Re: [PATCH ghak90 V6 02/10] audit: add container id
From: Paul Moore @ 2019-07-16 21:39 UTC (permalink / raw)
  To: Richard Guy Briggs
  Cc: Tycho Andersen, containers, linux-api, Linux-Audit Mailing List,
	linux-fsdevel, LKML, netdev, netfilter-devel, sgrubb, omosnace,
	dhowells, simo, Eric Paris, Serge Hallyn, ebiederm, nhorman
In-Reply-To: <20190716193828.xvm67iv5jyypvvxp@madcap2.tricolour.ca>

On Tue, Jul 16, 2019 at 3:38 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> On 2019-07-15 16:38, Paul Moore wrote:
> > On Mon, Jul 8, 2019 at 1:51 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > On 2019-05-29 11:29, Paul Moore wrote:
> >
> > ...
> >
> > > > The idea is that only container orchestrators should be able to
> > > > set/modify the audit container ID, and since setting the audit
> > > > container ID can have a significant effect on the records captured
> > > > (and their routing to multiple daemons when we get there) modifying
> > > > the audit container ID is akin to modifying the audit configuration
> > > > which is why it is gated by CAP_AUDIT_CONTROL.  The current thinking
> > > > is that you would only change the audit container ID from one
> > > > set/inherited value to another if you were nesting containers, in
> > > > which case the nested container orchestrator would need to be granted
> > > > CAP_AUDIT_CONTROL (which everyone to date seems to agree is a workable
> > > > compromise).  We did consider allowing for a chain of nested audit
> > > > container IDs, but the implications of doing so are significant
> > > > (implementation mess, runtime cost, etc.) so we are leaving that out
> > > > of this effort.
> > >
> > > We had previously discussed the idea of restricting
> > > orchestrators/engines from only being able to set the audit container
> > > identifier on their own descendants, but it was discarded.  I've added a
> > > check to ensure this is now enforced.
> >
> > When we weren't allowing nested orchestrators it wasn't necessary, but
> > with the move to support nesting I believe this will be a requirement.
> > We might also need/want to restrict audit container ID changes if a
> > descendant is acting as a container orchestrator and managing one or
> > more audit container IDs; although I'm less certain of the need for
> > this.
>
> I was of the opinion it was necessary before with single-layer parallel
> orchestrators/engines.

One of the many things we've disagreed on, but it doesn't really
matter at this point.

> > > I've also added a check to ensure that a process can't set its own audit
> > > container identifier ...
> >
> > What does this protect against, or what problem does this solve?
> > Considering how easy it is to fork/exec, it seems like this could be
> > trivially bypassed.
>
> Well, for starters, it would remove one layer of nesting.  It would
> separate the functional layers of processes.

This doesn't seem like something we need to protect against, what's
the harm?  My opinion at this point is that we should only add
restrictions to protect against problematic or dangerous situations; I
don't believe one extra layer of nesting counts as either.

Perhaps the container folks on the To/CC line can comment on this?  If
there is a valid reason for this restriction, great, let's do it,
otherwise it seems like an unnecessary hard coded policy to me.

> Other than that, it seems
> like a gut feeling that it is just wrong to allow it.  It seems like a
> layer violation that one container orchestrator/engine could set its own
> audit container identifier and then set its children as well.  It would
> be its own parent.

I suspect you are right that the current crop of container engines
won't do this, but who knows what we'll be doing with "containers" 5,
or even 10, years from now.  With that in mind, let me ask the
question again: is allowing an orchestrator the ability to set its own
audit container ID problematic and/or dangerous?

> It would make it harder to verify adherance to descendancy and inheritance rules.

The audit log should contain all the information needed to track that,
right?  If it doesn't, then I think we have a problem with the
information we are logging.  Right?

> > > ... and that if the identifier is already set, then the
> > > orchestrator/engine must be in a descendant user namespace from the
> > > orchestrator that set the previously inherited audit container
> > > identifier.
> >
> > You lost me here ... although I don't like the idea of relying on X
> > namespace inheritance for a hard coded policy on setting the audit
> > container ID; we've worked hard to keep this independent of any
> > definition of a "container" and it would sadden me greatly if we had
> > to go back on that.
>
> This would seem to be the one concession I'm reluctantly making to try
> to solve this nested container orchestrator/engine challenge.

As I said, you lost me on this - how does this help?  A more detailed
explanation of how this helps resolve the nesting problem would be
useful.

> Would backing off on that descendant user namespace requirement and only
> require that a nested audit container identifier only be permitted on a
> descendant task be sufficient?  It may for this use case, but I suspect
> not for additional audit daemons (we're not there yet) and message
> routing to those daemons.
>
> The one difference here is that it does not depend on this if the audit
> container identifier has not already been set.

-- 
paul moore
www.paul-moore.com

^ permalink raw reply

* Re: [PATCH RFC 0/4] Add support to directly attach BPF program to ftrace
From: Joel Fernandes @ 2019-07-16 21:30 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: linux-kernel, Adrian Ratiu, Alexei Starovoitov, bpf,
	Brendan Gregg, connoro, Daniel Borkmann, duyuchao, Ingo Molnar,
	jeffv, Karim Yaghmour, kernel-team, linux-kselftest,
	Manali Shukla, Manjo Raja Rao, Martin KaFai Lau, Masami Hiramatsu,
	Matt Mullins, Michal Gregorczyk, Michal Gregorczyk,
	Mohammad Husain, namhyung, namhyung, netdev, paul.chaignon,
	primiano, Qais Yousef, Shuah Khan, Song Liu, Srinivas Ramana,
	Steven Rostedt, Tamir Carmeli, Yonghong Song
In-Reply-To: <20190716205455.iimn3pqpvsc3k4ry@ast-mbp.dhcp.thefacebook.com>

On Tue, Jul 16, 2019 at 01:54:57PM -0700, Alexei Starovoitov wrote:
> On Wed, Jul 10, 2019 at 10:15:44AM -0400, Joel Fernandes (Google) wrote:
> > Hi,
> 
> why are you cc-ing the whole world for this patch set?

Well, the whole world happens to be interested in BPF on Android.

> I'll reply to all as well, but I suspect a bunch of folks consider it spam.
> Please read Documentation/bpf/bpf_devel_QA.rst

Ok, I'll read it.

> Also, I think, netdev@vger rejects emails with 80+ characters in cc as spam,
> so I'm not sure this set reached public mailing lists.

Certainly the CC list here is not added to folks who consider it spam. All
the folks added have been interested in BPF on Android at various points of
time. Is this CC list really that large? It has around 24 email addresses or
so. I can trim it a bit if needed. Also, you sound like as if people are
screaming at me to stop emailing them, certainly that's not the case and no
one has told me it is spam.

And, it did reach the public archive btw:
https://lore.kernel.org/netdev/20190716205455.iimn3pqpvsc3k4ry@ast-mbp.dhcp.thefacebook.com/T/#m1460ba463b78312e38b68b8c118f673d2ead9446

> > These patches make it possible to attach BPF programs directly to tracepoints
> > using ftrace (/sys/kernel/debug/tracing) without needing the process doing the
> > attach to be alive. This has the following benefits:
> > 
> > 1. Simplified Security: In Android, we have finer-grained security controls to
> > specific ftrace trace events using SELinux labels. We control precisely who is
> > allowed to enable an ftrace event already. By adding a node to ftrace for
> > attaching BPF programs, we can use the same mechanism to further control who is
> > allowed to attach to a trace event.
> > 
> > 2. Process lifetime: In Android we are adding usecases where a tracing program
> > needs to be attached all the time to a tracepoint, for the full life time of
> > the system. Such as to gather statistics where there no need for a detach for
> > the full system lifetime. With perf or bpf(2)'s BPF_RAW_TRACEPOINT_OPEN, this
> > means keeping a process alive all the time.  However, in Android our BPF loader
> > currently (for hardeneded security) involves just starting a process at boot
> > time, doing the BPF program loading, and then pinning them to /sys/fs/bpf.  We
> > don't keep this process alive all the time. It is more suitable to do a
> > one-shot attach of the program using ftrace and not need to have a process
> > alive all the time anymore for this. Such process also needs elevated
> > privileges since tracepoint program loading currently requires CAP_SYS_ADMIN
> > anyway so by design Android's bpfloader runs once at init and exits.
> > 
> > This series add a new bpf file to /sys/kernel/debug/tracing/events/X/Y/bpf
> > The following commands can be written into it:
> > attach:<fd>     Attaches BPF prog fd to tracepoint
> > detach:<fd>     Detaches BPF prog fd to tracepoint
> 
> Looks like, to detach a program the user needs to read a text file,
> parse bpf prog id from text into binary. Then call fd_from_id bpf syscall,
> get a binary FD, convert it back to text and write as a text back into this file.
> I think this is just a single example why text based apis are not accepted
> in bpf anymore.

This can also be considered a tracefs API.

And we can certainly change the detach to accept program ids as well if
that's easier. 'detach:prog:<prog_id>' and 'detach:fd:<fd>'.

By the way, I can also list the set of cumbersome steps needed to attach a
BPF program using perf and I bet it will be longer ;-)

> Through the patch set you call it ftrace. As far as I can see, this set
> has zero overlap with ftrace. There is no ftrace-bpf connection here at all
> that we discussed in the past Steven. It's all quite confusing.

It depends on what you mean by ftrace, may be I can call it 'trace events' or
something if it is less ambiguious. All of this has been collectively called
ftrace before.

I am not sure if you you are making sense actually, trace_events mechanism is
a part of ftrace. See the documentation: Documentation/trace/ftrace.rst. Even
the documentation file name has the word ftrace in it.

I have also spoken to Steven before about this, I don't think he ever told me
there is no connection so again I am a bit lost at your comments.

> I suggest android to solve sticky raw_tracepoint problem with user space deamon.
> The reasons, you point out why user daemon cannot be used, sound weak to me.

I don't think it is weak. It seems overkill to have a daemon for a trace
event that is say supposed to be attached to all the time for the lifetime of
the system. Why should there be a daemon consuming resources if it is active
all the time?

In Android, we are very careful about spawning useless processes and leaving
them alive for the lifetime of the system - for no good reason. Our security
teams also don't like this, and they can comment more.

> Another acceptable solution would be to introduce pinning of raw_tp objects.
> bpf progs and maps can be pinned in bpffs already. Pinning raw_tp would
> be natural extension.

I don't think the pinning solves the security problem, it just solves the
process lifetime problem. Currently, attaching trace events through perf
requires CAP_SYS_ADMIN. However, with ftrace events, we already control
security of events by labeling the nodes in tracefs and granting access to
the labeled context through the selinux policies. Having a 'bpf' node in
tracefs for events, and granting access to the labels is a natural extension.

I also thought about the pinning idea before, but we also want to add support
for not just raw tracepoints, but also regular tracepoints (events if you
will). I am hesitant to add a new BPF API just for creating regular
tracepoints and then pinning those as well.

I don't see why a new bpf node for a trace event is a bad idea, really.
tracefs is how we deal with trace events on Android. We do it in production
systems. This is a natural extension to that and fits with the security model
well.

thanks,

 - Joel





^ permalink raw reply

* Re: [PATCH net-next v1] fix: taprio: Change type of txtime-delay parameter to u32
From: David Miller @ 2019-07-16 21:19 UTC (permalink / raw)
  To: vedang.patel
  Cc: netdev, jeffrey.t.kirsher, jhs, xiyou.wangcong, jiri,
	intel-wired-lan, vinicius.gomes, l, jakub.kicinski, m-karicheri2,
	sergei.shtylyov, eric.dumazet, aaron.f.brown, stephen
In-Reply-To: <1563306738-2779-1-git-send-email-vedang.patel@intel.com>

From: Vedang Patel <vedang.patel@intel.com>
Date: Tue, 16 Jul 2019 12:52:18 -0700

> During the review of the iproute2 patches for txtime-assist mode, it was
> pointed out that it does not make sense for the txtime-delay parameter to
> be negative. So, change the type of the parameter from s32 to u32.
> 
> Fixes: 4cfd5779bd6e ("taprio: Add support for txtime-assist mode")
> Reported-by: Stephen Hemminger <stephen@networkplumber.org>
> Signed-off-by: Vedang Patel <vedang.patel@intel.com>

You should have targetted this at 'net' as that's the only tree open
right now.

I'll apply this.

^ permalink raw reply

* Re: [PATCH -next] iwlwifi: dbg: work around clang bug by marking debug strings static
From: Nick Desaulniers @ 2019-07-16 21:18 UTC (permalink / raw)
  To: Luca Coelho
  Cc: Joe Perches, Kalle Valo, Arnd Bergmann, Nathan Chancellor,
	Johannes Berg, Emmanuel Grumbach, Intel Linux Wireless,
	David S. Miller, Shahar S Matityahu, Sara Sharon, linux-wireless,
	netdev, LKML, clang-built-linux
In-Reply-To: <da053a97d771eff0ad8db37e644108ed2fad25a3.camel@coelho.fi>

On Tue, Jul 16, 2019 at 2:15 PM Luca Coelho <luca@coelho.fi> wrote:
>
> On Tue, 2019-07-16 at 10:28 -0700, Nick Desaulniers wrote:
> > On Thu, Jul 11, 2019 at 7:15 PM Joe Perches <joe@perches.com> wrote:
> > > Better still would be to use the format string directly
> > > in both locations instead of trying to deduplicate it
> > > via storing it into a separate pointer.
> > >
> > > Let the compiler/linker consolidate the format.
> > > It's smaller object code, allows format/argument verification,
> > > and is simpler for humans to understand.
> >
> > Whichever Kalle prefers, I just want my CI green again.
>
> We already changed this in a later internal patch, which will reach the
> mainline (-next) soon.  So let's skip this for now.

Ok, but this has now regressed into mainline and is blocking Linaro's
ToolChain Working Group's CI, so if you could send a bugfix ASAP we'd
appreciate it.
-- 
Thanks,
~Nick Desaulniers

^ permalink raw reply

* Re: [PATCH -next] iwlwifi: dbg: work around clang bug by marking debug strings static
From: Luca Coelho @ 2019-07-16 21:14 UTC (permalink / raw)
  To: Nick Desaulniers, Joe Perches, Kalle Valo
  Cc: Arnd Bergmann, Nathan Chancellor, Johannes Berg,
	Emmanuel Grumbach, Intel Linux Wireless, David S. Miller,
	Shahar S Matityahu, Sara Sharon, linux-wireless, netdev, LKML,
	clang-built-linux
In-Reply-To: <CAKwvOdkD_r2YBqRDy-uTGMG1YeRF8KokxjikR0XLkXLsdJca0g@mail.gmail.com>

On Tue, 2019-07-16 at 10:28 -0700, Nick Desaulniers wrote:
> On Thu, Jul 11, 2019 at 7:15 PM Joe Perches <joe@perches.com> wrote:
> > On Thu, 2019-07-11 at 17:17 -0700, Nick Desaulniers wrote:
> > > Commit r353569 in prerelease Clang-9 is producing a linkage failure:
> > > 
> > > ld: drivers/net/wireless/intel/iwlwifi/fw/dbg.o:
> > > in function `_iwl_fw_dbg_apply_point':
> > > dbg.c:(.text+0x827a): undefined reference to `__compiletime_assert_2387'
> > > 
> > > when the following configs are enabled:
> > > - CONFIG_IWLWIFI
> > > - CONFIG_IWLMVM
> > > - CONFIG_KASAN
> > > 
> > > Work around the issue for now by marking the debug strings as `static`,
> > > which they probably should be any ways.
> > > 
> > > Link: https://bugs.llvm.org/show_bug.cgi?id=42580
> > > Link: https://github.com/ClangBuiltLinux/linux/issues/580
> > > Reported-by: Arnd Bergmann <arnd@arndb.de>
> > > Reported-by: Nathan Chancellor <natechancellor@gmail.com>
> > > Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
> > > ---
> > >  drivers/net/wireless/intel/iwlwifi/fw/dbg.c | 4 ++--
> > >  1 file changed, 2 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/drivers/net/wireless/intel/iwlwifi/fw/dbg.c b/drivers/net/wireless/intel/iwlwifi/fw/dbg.c
> > > index e411ac98290d..f8c90ea4e9b4 100644
> > > --- a/drivers/net/wireless/intel/iwlwifi/fw/dbg.c
> > > +++ b/drivers/net/wireless/intel/iwlwifi/fw/dbg.c
> > > @@ -2438,7 +2438,7 @@ static void iwl_fw_dbg_info_apply(struct iwl_fw_runtime *fwrt,
> > >  {
> > >       u32 img_name_len = le32_to_cpu(dbg_info->img_name_len);
> > >       u32 dbg_cfg_name_len = le32_to_cpu(dbg_info->dbg_cfg_name_len);
> > > -     const char err_str[] =
> > > +     static const char err_str[] =
> > >               "WRT: ext=%d. Invalid %s name length %d, expected %d\n";
> > 
> > Better still would be to use the format string directly
> > in both locations instead of trying to deduplicate it
> > via storing it into a separate pointer.
> > 
> > Let the compiler/linker consolidate the format.
> > It's smaller object code, allows format/argument verification,
> > and is simpler for humans to understand.
> 
> Whichever Kalle prefers, I just want my CI green again.

We already changed this in a later internal patch, which will reach the
mainline (-next) soon.  So let's skip this for now.

--
Cheers,
Luca.


^ permalink raw reply

* Re: [PATCH 4/9] ipv4: add lockdep condition to fix for_each_entry (v1)
From: David Miller @ 2019-07-16 21:12 UTC (permalink / raw)
  To: paulmck
  Cc: joel, linux-kernel, kuznet, bhelgaas, bp, c0d1n61at3, edumazet,
	gregkh, yoshfuji, hpa, mingo, corbet, josh, keescook,
	kernel-hardening, kernel-team, jiangshanlai, lenb, linux-acpi,
	linux-doc, linux-pci, linux-pm, mathieu.desnoyers, neilb, netdev,
	oleg, pavel, peterz, rjw, rasmus.villemoes, rcu, rostedt, tj,
	tglx, will, x86
In-Reply-To: <20190716183955.GF14271@linux.ibm.com>

From: "Paul E. McKenney" <paulmck@linux.ibm.com>
Date: Tue, 16 Jul 2019 11:39:55 -0700

> On Mon, Jul 15, 2019 at 10:37:00AM -0400, Joel Fernandes (Google) wrote:
>> Using the previous support added, use it for adding lockdep conditions
>> to list usage here.
>> 
>> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> 
> We need an ack or better from the subsystem maintainer for this one.

Acked-by: David S. Miller <davem@davemloft.net>

^ permalink raw reply

* Re: [PATCH net v2] skbuff: fix compilation warnings in skb_dump()
From: David Miller @ 2019-07-16 21:12 UTC (permalink / raw)
  To: cai; +Cc: willemb, joe, clang-built-linux, netdev, linux-kernel
In-Reply-To: <1563291785-6545-1-git-send-email-cai@lca.pw>

From: Qian Cai <cai@lca.pw>
Date: Tue, 16 Jul 2019 11:43:05 -0400

> The commit 6413139dfc64 ("skbuff: increase verbosity when dumping skb
> data") introduced a few compilation warnings.
> 
> net/core/skbuff.c:766:32: warning: format specifies type 'unsigned
> short' but the argument has type 'unsigned int' [-Wformat]
>                        level, sk->sk_family, sk->sk_type,
> sk->sk_protocol);
>                                              ^~~~~~~~~~~
> net/core/skbuff.c:766:45: warning: format specifies type 'unsigned
> short' but the argument has type 'unsigned int' [-Wformat]
>                        level, sk->sk_family, sk->sk_type,
> sk->sk_protocol);
> ^~~~~~~~~~~~~~~
> 
> Fix them by using the proper types.
> 
> Fixes: 6413139dfc64 ("skbuff: increase verbosity when dumping skb data")
> Signed-off-by: Qian Cai <cai@lca.pw>

Applied, thank you.

^ permalink raw reply

* [Patch net v2 2/2] selftests: add a test case for rp filter
From: Cong Wang @ 2019-07-16 20:58 UTC (permalink / raw)
  To: netdev; +Cc: Cong Wang, David Ahern

Add a test case to simulate the loopback packet case fixed
in the previous patch.

This test gets passed after the fix:

IPv4 rp_filter tests
    TEST: rp_filter passes local packets                                [ OK ]
    TEST: rp_filter passes loopback packets                             [ OK ]

Cc: David Ahern <dsahern@gmail.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
 tools/testing/selftests/net/fib_tests.sh | 30 +++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh
index 9457aaeae092..a9e45471edfe 100755
--- a/tools/testing/selftests/net/fib_tests.sh
+++ b/tools/testing/selftests/net/fib_tests.sh
@@ -9,12 +9,13 @@ ret=0
 ksft_skip=4
 
 # all tests in this script. Can be overridden with -t option
-TESTS="unregister down carrier nexthop ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics ipv4_route_metrics ipv4_route_v6_gw"
+TESTS="unregister down carrier nexthop ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics ipv4_route_metrics ipv4_route_v6_gw rp_filter"
 
 VERBOSE=0
 PAUSE_ON_FAIL=no
 PAUSE=no
 IP="ip -netns ns1"
+NETNS="ip netns exec ns1"
 
 log_test()
 {
@@ -433,6 +434,32 @@ fib_carrier_test()
 	fib_carrier_unicast_test
 }
 
+fib_rp_filter_test()
+{
+	echo
+	echo "IPv4 rp_filter tests"
+
+	setup
+
+	$IP link set dev lo address 52:54:00:6a:c7:5e
+	$IP link set dev dummy0 address 52:54:00:6a:c7:5e
+	echo 1 | $NETNS tee /proc/sys/net/ipv4/conf/all/rp_filter > /dev/null
+	echo 1 | $NETNS tee /proc/sys/net/ipv4/conf/all/accept_local > /dev/null
+	echo 1 | $NETNS tee /proc/sys/net/ipv4/conf/all/route_localnet > /dev/null
+
+	$NETNS tc qd add dev dummy0 parent root handle 1: fq_codel
+	$NETNS tc filter add dev dummy0 parent 1: protocol arp basic action mirred egress redirect dev lo
+	$NETNS tc filter add dev dummy0 parent 1: protocol ip basic action mirred egress redirect dev lo
+
+	run_cmd "ip netns exec ns1 ping -I dummy0 -w1 -c1 198.51.100.1"
+	log_test $? 0 "rp_filter passes local packets"
+
+	run_cmd "ip netns exec ns1 ping -I dummy0 -w1 -c1 127.0.0.1"
+	log_test $? 0 "rp_filter passes loopback packets"
+
+	cleanup
+}
+
 ################################################################################
 # Tests on nexthop spec
 
@@ -1557,6 +1584,7 @@ do
 	fib_unreg_test|unregister)	fib_unreg_test;;
 	fib_down_test|down)		fib_down_test;;
 	fib_carrier_test|carrier)	fib_carrier_test;;
+	fib_rp_filter_test|rp_filter)	fib_rp_filter_test;;
 	fib_nexthop_test|nexthop)	fib_nexthop_test;;
 	ipv6_route_test|ipv6_rt)	ipv6_route_test;;
 	ipv4_route_test|ipv4_rt)	ipv4_route_test;;
-- 
2.21.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox