Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next 11/15] brcmfmac: Use __skb_peek().
From: David Miller @ 2018-09-08 20:10 UTC (permalink / raw)
  To: netdev


Instead of direct SKB list pointer accesses.

In these situations, we absolutely know that the SKB queue in question
is non-empty.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c | 2 +-
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
index d2f788d88668..3e37c8cf82c6 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c
@@ -576,7 +576,7 @@ int brcmf_sdiod_recv_chain(struct brcmf_sdio_dev *sdiodev,
 
 	if (pktq->qlen == 1)
 		err = brcmf_sdiod_skbuff_read(sdiodev, sdiodev->func2, addr,
-					      pktq->next);
+					      __skb_peek(pktq));
 	else if (!sdiodev->sg_support) {
 		glom_skb = brcmu_pkt_buf_get_skb(totlen);
 		if (!glom_skb)
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
index a907d7b065fa..1e2fd289323a 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
@@ -2189,7 +2189,7 @@ brcmf_sdio_txpkt_prep(struct brcmf_sdio *bus, struct sk_buff_head *pktq,
 	 * length of the chain (including padding)
 	 */
 	if (bus->txglom)
-		brcmf_sdio_update_hwhdr(pktq->next->data, total_len);
+		brcmf_sdio_update_hwhdr(__skb_peek(pktq)->data, total_len);
 	return 0;
 }
 
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 12/15] net: Add and use skb_mark_not_on_list().
From: David Miller @ 2018-09-08 20:10 UTC (permalink / raw)
  To: netdev


An SKB is not on a list if skb->next is NULL.

Codify this convention into a helper function and use it
where we are dequeueing an SKB and need to mark it as such.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h                  | 5 +++++
 net/core/dev.c                          | 8 ++++----
 net/core/sock.c                         | 2 +-
 net/ieee802154/6lowpan/reassembly.c     | 2 +-
 net/ipv4/ip_fragment.c                  | 2 +-
 net/ipv4/ip_input.c                     | 2 +-
 net/ipv4/ip_output.c                    | 4 ++--
 net/ipv6/ip6_output.c                   | 2 +-
 net/ipv6/netfilter/nf_conntrack_reasm.c | 2 +-
 net/ipv6/reassembly.c                   | 2 +-
 net/netfilter/nfnetlink_queue.c         | 2 +-
 net/rxrpc/input.c                       | 2 +-
 net/sched/sch_cake.c                    | 6 +++---
 net/sched/sch_fq.c                      | 2 +-
 net/sched/sch_fq_codel.c                | 2 +-
 net/sched/sch_generic.c                 | 4 ++--
 net/sched/sch_hhf.c                     | 2 +-
 net/sched/sch_netem.c                   | 2 +-
 net/sched/sch_tbf.c                     | 2 +-
 net/tipc/bearer.c                       | 2 +-
 net/xfrm/xfrm_device.c                  | 2 +-
 net/xfrm/xfrm_output.c                  | 2 +-
 22 files changed, 33 insertions(+), 28 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 89283b77294d..c4c9e3f5cd9a 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1339,6 +1339,11 @@ static inline void skb_zcopy_abort(struct sk_buff *skb)
 	}
 }
 
+static inline void skb_mark_not_on_list(struct sk_buff *skb)
+{
+	skb->next = NULL;
+}
+
 /**
  *	skb_queue_empty - check if a queue is empty
  *	@list: queue head
diff --git a/net/core/dev.c b/net/core/dev.c
index ca78dc5a79a3..f76dd7e14dd6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3231,7 +3231,7 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *de
 	while (skb) {
 		struct sk_buff *next = skb->next;
 
-		skb->next = NULL;
+		skb_mark_not_on_list(skb);
 		rc = xmit_one(skb, dev, txq, next != NULL);
 		if (unlikely(!dev_xmit_complete(rc))) {
 			skb->next = next;
@@ -3331,7 +3331,7 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d
 
 	for (; skb != NULL; skb = next) {
 		next = skb->next;
-		skb->next = NULL;
+		skb_mark_not_on_list(skb);
 
 		/* in case skb wont be segmented, point to itself */
 		skb->prev = skb;
@@ -5296,7 +5296,7 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
 		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
 			return;
 		list_del(&skb->list);
-		skb->next = NULL;
+		skb_mark_not_on_list(skb);
 		napi_gro_complete(skb);
 		napi->gro_hash[index].count--;
 	}
@@ -5482,7 +5482,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 
 	if (pp) {
 		list_del(&pp->list);
-		pp->next = NULL;
+		skb_mark_not_on_list(pp);
 		napi_gro_complete(pp);
 		napi->gro_hash[hash].count--;
 	}
diff --git a/net/core/sock.c b/net/core/sock.c
index 3730eb855095..8537b6ca72c5 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2332,7 +2332,7 @@ static void __release_sock(struct sock *sk)
 			next = skb->next;
 			prefetch(next);
 			WARN_ON_ONCE(skb_dst_is_noref(skb));
-			skb->next = NULL;
+			skb_mark_not_on_list(skb);
 			sk_backlog_rcv(sk, skb);
 
 			cond_resched();
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index e7857a8ac86d..09ffbf5ce8fa 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -260,7 +260,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev,
 	}
 	sub_frag_mem_limit(fq->q.net, sum_truesize);
 
-	head->next = NULL;
+	skb_mark_not_on_list(head);
 	head->dev = ldev;
 	head->tstamp = fq->q.stamp;
 
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 330f62353b11..cab3e4a5124b 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -623,7 +623,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
 	sub_frag_mem_limit(qp->q.net, head->truesize);
 
 	*nextp = NULL;
-	head->next = NULL;
+	skb_mark_not_on_list(head);
 	head->prev = NULL;
 	head->dev = dev;
 	head->tstamp = qp->q.stamp;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 3196cf58f418..eba7f3883230 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -535,7 +535,7 @@ static void ip_sublist_rcv_finish(struct list_head *head)
 		/* Handle ip{6}_forward case, as sch_direct_xmit have
 		 * another kind of SKB-list usage (see validate_xmit_skb_list)
 		 */
-		skb->next = NULL;
+		skb_mark_not_on_list(skb);
 		dst_input(skb);
 	}
 }
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 9c4e72e9c60a..c09219e7f230 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -278,7 +278,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
 		struct sk_buff *nskb = segs->next;
 		int err;
 
-		segs->next = NULL;
+		skb_mark_not_on_list(segs);
 		err = ip_fragment(net, sk, segs, mtu, ip_finish_output2);
 
 		if (err && ret == 0)
@@ -684,7 +684,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 
 			skb = frag;
 			frag = skb->next;
-			skb->next = NULL;
+			skb_mark_not_on_list(skb);
 		}
 
 		if (err == 0) {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 16f200f06500..9a8934ac053b 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -727,7 +727,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 
 			skb = frag;
 			frag = skb->next;
-			skb->next = NULL;
+			skb_mark_not_on_list(skb);
 		}
 
 		kfree(tmp_hdr);
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 2a14d8b65924..00e20004d241 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -449,7 +449,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev,  struct net_devic
 	sub_frag_mem_limit(fq->q.net, head->truesize);
 
 	head->ignore_df = 1;
-	head->next = NULL;
+	skb_mark_not_on_list(head);
 	head->dev = dev;
 	head->tstamp = fq->q.stamp;
 	ipv6_hdr(head)->payload_len = htons(payload_len);
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 5c5b4f79296e..f1b1ff30fe5b 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -388,7 +388,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 	}
 	sub_frag_mem_limit(fq->q.net, sum_truesize);
 
-	head->next = NULL;
+	skb_mark_not_on_list(head);
 	head->dev = dev;
 	head->tstamp = fq->q.stamp;
 	ipv6_hdr(head)->payload_len = htons(payload_len);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index ea4ba551abb2..5207eb8a5864 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -764,7 +764,7 @@ __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
 		return ret;
 	}
 
-	skb->next = NULL;
+	skb_mark_not_on_list(skb);
 
 	entry_seg = nf_queue_entry_dup(entry);
 	if (entry_seg) {
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index cfdc199c6351..ee8e7e1d5c0f 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -259,7 +259,7 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
 	while (list) {
 		skb = list;
 		list = skb->next;
-		skb->next = NULL;
+		skb_mark_not_on_list(skb);
 		rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
 	}
 }
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index c07c30b916d5..dc539295ae65 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -812,7 +812,7 @@ static struct sk_buff *dequeue_head(struct cake_flow *flow)
 
 	if (skb) {
 		flow->head = skb->next;
-		skb->next = NULL;
+		skb_mark_not_on_list(skb);
 	}
 
 	return skb;
@@ -1252,7 +1252,7 @@ static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
 	else
 		flow->head = elig_ack->next;
 
-	elig_ack->next = NULL;
+	skb_mark_not_on_list(elig_ack);
 
 	return elig_ack;
 }
@@ -1675,7 +1675,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 
 		while (segs) {
 			nskb = segs->next;
-			segs->next = NULL;
+			skb_mark_not_on_list(segs);
 			qdisc_skb_cb(segs)->pkt_len = segs->len;
 			cobalt_set_enqueue_time(segs, now);
 			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 4808713c73b9..b27ba36a269c 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -319,7 +319,7 @@ static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
 
 	if (skb) {
 		flow->head = skb->next;
-		skb->next = NULL;
+		skb_mark_not_on_list(skb);
 		flow->qlen--;
 		qdisc_qstats_backlog_dec(sch, skb);
 		sch->q.qlen--;
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 6c0a9d5dbf94..cd04d40c30b6 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -124,7 +124,7 @@ static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow)
 	struct sk_buff *skb = flow->head;
 
 	flow->head = skb->next;
-	skb->next = NULL;
+	skb_mark_not_on_list(skb);
 	return skb;
 }
 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 69078c82963e..a64132a5db36 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -184,7 +184,7 @@ static void try_bulk_dequeue_skb(struct Qdisc *q,
 		skb = nskb;
 		(*packets)++; /* GSO counts as one pkt */
 	}
-	skb->next = NULL;
+	skb_mark_not_on_list(skb);
 }
 
 /* This variant of try_bulk_dequeue_skb() makes sure
@@ -210,7 +210,7 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
 		skb = nskb;
 	} while (++cnt < 8);
 	(*packets) += cnt;
-	skb->next = NULL;
+	skb_mark_not_on_list(skb);
 }
 
 /* Note that dequeue_skb can possibly return a SKB list (via skb->next).
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index c3a8388dcdf6..9d6a47697406 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -330,7 +330,7 @@ static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket)
 	struct sk_buff *skb = bucket->head;
 
 	bucket->head = skb->next;
-	skb->next = NULL;
+	skb_mark_not_on_list(skb);
 	return skb;
 }
 
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index b9541ce4d672..506e1960ed7f 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -568,7 +568,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	if (segs) {
 		while (segs) {
 			skb2 = segs->next;
-			segs->next = NULL;
+			skb_mark_not_on_list(segs);
 			qdisc_skb_cb(segs)->pkt_len = segs->len;
 			last_len = segs->len;
 			rc = qdisc_enqueue(segs, sch, to_free);
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 6f74a426f159..a4530e85bd02 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -162,7 +162,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
 	nb = 0;
 	while (segs) {
 		nskb = segs->next;
-		segs->next = NULL;
+		skb_mark_not_on_list(segs);
 		qdisc_skb_cb(segs)->pkt_len = segs->len;
 		len += segs->len;
 		ret = qdisc_enqueue(segs, q->qdisc, to_free);
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 418f03d0be90..91891041e5e1 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -577,7 +577,7 @@ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev,
 		rcu_dereference_rtnl(orig_dev->tipc_ptr);
 	if (likely(b && test_bit(0, &b->up) &&
 		   (skb->pkt_type <= PACKET_MULTICAST))) {
-		skb->next = NULL;
+		skb_mark_not_on_list(skb);
 		tipc_rcv(dev_net(b->pt.dev), skb, b);
 		rcu_read_unlock();
 		return NET_RX_SUCCESS;
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 5611b7521020..260fbba4f03e 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -99,7 +99,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
 
 	do {
 		struct sk_buff *nskb = skb2->next;
-		skb2->next = NULL;
+		skb_mark_not_on_list(skb2);
 
 		xo = xfrm_offload(skb2);
 		xo->flags |= XFRM_DEV_RESUME;
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 45ba07ab3e4f..2d42cb0c94b8 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -189,7 +189,7 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb
 		struct sk_buff *nskb = segs->next;
 		int err;
 
-		segs->next = NULL;
+		skb_mark_not_on_list(segs);
 		err = xfrm_output2(net, sk, segs);
 
 		if (unlikely(err)) {
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 13/15] net: Add and use skb_list_del_init().
From: David Miller @ 2018-09-08 20:11 UTC (permalink / raw)
  To: netdev


It documents what is happening, and eliminates the spurious list
pointer poisoning.

In the long term, in order to get proper list head debugging, we
might want to use the list poinson value as the indicator that
an SKB is a singleton and not on a list.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 6 ++++++
 net/core/dev.c         | 6 ++----
 net/ipv4/ip_input.c    | 6 +-----
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c4c9e3f5cd9a..e3a53ca4a9b5 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1344,6 +1344,12 @@ static inline void skb_mark_not_on_list(struct sk_buff *skb)
 	skb->next = NULL;
 }
 
+static inline void skb_list_del_init(struct sk_buff *skb)
+{
+	__list_del_entry(&skb->list);
+	skb_mark_not_on_list(skb);
+}
+
 /**
  *	skb_queue_empty - check if a queue is empty
  *	@list: queue head
diff --git a/net/core/dev.c b/net/core/dev.c
index f76dd7e14dd6..0b2d777e5b9e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5295,8 +5295,7 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
 	list_for_each_entry_safe_reverse(skb, p, head, list) {
 		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
 			return;
-		list_del(&skb->list);
-		skb_mark_not_on_list(skb);
+		skb_list_del_init(skb);
 		napi_gro_complete(skb);
 		napi->gro_hash[index].count--;
 	}
@@ -5481,8 +5480,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
 
 	if (pp) {
-		list_del(&pp->list);
-		skb_mark_not_on_list(pp);
+		skb_list_del_init(pp);
 		napi_gro_complete(pp);
 		napi->gro_hash[hash].count--;
 	}
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index eba7f3883230..35a786c0aaa0 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -531,11 +531,7 @@ static void ip_sublist_rcv_finish(struct list_head *head)
 	struct sk_buff *skb, *next;
 
 	list_for_each_entry_safe(skb, next, head, list) {
-		list_del(&skb->list);
-		/* Handle ip{6}_forward case, as sch_direct_xmit have
-		 * another kind of SKB-list usage (see validate_xmit_skb_list)
-		 */
-		skb_mark_not_on_list(skb);
+		skb_list_del_init(skb);
 		dst_input(skb);
 	}
 }
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 14/15] can: Remove SKB list assumptions in rx-offload.c
From: David Miller @ 2018-09-08 20:11 UTC (permalink / raw)
  To: netdev


Eliminate code which assumes that SKBs and skb_queue_head objects
can be cast to eachother during list processing.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/can/rx-offload.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/can/rx-offload.c b/drivers/net/can/rx-offload.c
index d94dae216820..c7d05027a7a0 100644
--- a/drivers/net/can/rx-offload.c
+++ b/drivers/net/can/rx-offload.c
@@ -79,7 +79,7 @@ static int can_rx_offload_napi_poll(struct napi_struct *napi, int quota)
 static inline void __skb_queue_add_sort(struct sk_buff_head *head, struct sk_buff *new,
 					int (*compare)(struct sk_buff *a, struct sk_buff *b))
 {
-	struct sk_buff *pos, *insert = (struct sk_buff *)head;
+	struct sk_buff *pos, *insert = NULL;
 
 	skb_queue_reverse_walk(head, pos) {
 		const struct can_rx_offload_cb *cb_pos, *cb_new;
@@ -99,8 +99,10 @@ static inline void __skb_queue_add_sort(struct sk_buff_head *head, struct sk_buf
 		insert = pos;
 		break;
 	}
-
-	__skb_queue_after(head, insert, new);
+	if (!insert)
+		__skb_queue_head(head, new);
+	else
+		__skb_queue_after(head, insert, new);
 }
 
 static int can_rx_offload_compare(struct sk_buff *a, struct sk_buff *b)
-- 
2.17.1

^ permalink raw reply related

* [PATCH net-next 15/15] rtl818x: Remove SKB list assumptions.
From: David Miller @ 2018-09-08 20:11 UTC (permalink / raw)
  To: netdev


Eliminate the assumption that SKBs and SKB list heads can
be cast to eachother in SKB list handling code.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c b/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c
index 9a1d15b3ce45..cec37787ecf8 100644
--- a/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c
+++ b/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c
@@ -499,7 +499,7 @@ static void rtl8187b_status_cb(struct urb *urb)
 	if (cmd_type == 1) {
 		unsigned int pkt_rc, seq_no;
 		bool tok;
-		struct sk_buff *skb;
+		struct sk_buff *skb, *iter;
 		struct ieee80211_hdr *ieee80211hdr;
 		unsigned long flags;
 
@@ -508,8 +508,9 @@ static void rtl8187b_status_cb(struct urb *urb)
 		seq_no = (val >> 16) & 0xFFF;
 
 		spin_lock_irqsave(&priv->b_tx_status.queue.lock, flags);
-		skb_queue_reverse_walk(&priv->b_tx_status.queue, skb) {
-			ieee80211hdr = (struct ieee80211_hdr *)skb->data;
+		skb = NULL;
+		skb_queue_reverse_walk(&priv->b_tx_status.queue, iter) {
+			ieee80211hdr = (struct ieee80211_hdr *)iter->data;
 
 			/*
 			 * While testing, it was discovered that the seq_no
@@ -522,10 +523,12 @@ static void rtl8187b_status_cb(struct urb *urb)
 			 * it's unlikely we wrongly ack some sent data
 			 */
 			if ((le16_to_cpu(ieee80211hdr->seq_ctrl)
-			    & 0xFFF) == seq_no)
+			     & 0xFFF) == seq_no) {
+				skb = iter;
 				break;
+			}
 		}
-		if (skb != (struct sk_buff *) &priv->b_tx_status.queue) {
+		if (skb) {
 			struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 
 			__skb_unlink(skb, &priv->b_tx_status.queue);
-- 
2.17.1

^ permalink raw reply related

* [PATCH net 01/13] net: sched: cls_u32: fix hnode refcounting
From: Al Viro @ 2018-09-09  1:31 UTC (permalink / raw)
  To: netdev; +Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko, stable
In-Reply-To: <20180909013132.3222-1-viro@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

cls_u32.c misuses refcounts for struct tc_u_hnode - it counts references via
->hlist and via ->tp_root together.  u32_destroy() drops the former and, in
case when there had been links, leaves the sucker on the list.  As the result,
there's nothing to protect it from getting freed once links are dropped.
That also makes the "is it busy" check incapable of catching the root hnode -
it *is* busy (there's a reference from tp), but we don't see it as something
separate.  "Is it our root?" check partially covers that, but the problem
exists for others' roots as well.

AFAICS, the minimal fix preserving the existing behaviour (where it doesn't
include oopsen, that is) would be this:
        * count tp->root and tp_c->hlist as separate references.  I.e.
have u32_init() set refcount to 2, not 1.
	* in u32_destroy() we always drop the former; in u32_destroy_hnode() -
the latter.

	That way we have *all* references contributing to refcount.  List
removal happens in u32_destroy_hnode() (called only when ->refcnt is 1)
an in u32_destroy() in case of tc_u_common going away, along with everything
reachable from it.  IOW, that way we know that u32_destroy_key() won't
free something still on the list (or pointed to by someone's ->root).

Cc: stable@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/sched/cls_u32.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index f218ccf1e2d9..b2c3406a2cf2 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -398,6 +398,7 @@ static int u32_init(struct tcf_proto *tp)
 	rcu_assign_pointer(tp_c->hlist, root_ht);
 	root_ht->tp_c = tp_c;
 
+	root_ht->refcnt++;
 	rcu_assign_pointer(tp->root, root_ht);
 	tp->data = tp_c;
 	return 0;
@@ -610,7 +611,7 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
 	struct tc_u_hnode __rcu **hn;
 	struct tc_u_hnode *phn;
 
-	WARN_ON(ht->refcnt);
+	WARN_ON(--ht->refcnt);
 
 	u32_clear_hnode(tp, ht, extack);
 
@@ -649,7 +650,7 @@ static void u32_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 
 	WARN_ON(root_ht == NULL);
 
-	if (root_ht && --root_ht->refcnt == 0)
+	if (root_ht && --root_ht->refcnt == 1)
 		u32_destroy_hnode(tp, root_ht, extack);
 
 	if (--tp_c->refcnt == 0) {
@@ -698,7 +699,6 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last,
 	}
 
 	if (ht->refcnt == 1) {
-		ht->refcnt--;
 		u32_destroy_hnode(tp, ht, extack);
 	} else {
 		NL_SET_ERR_MSG_MOD(extack, "Can not delete in-use filter");
@@ -708,11 +708,11 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last,
 out:
 	*last = true;
 	if (root_ht) {
-		if (root_ht->refcnt > 1) {
+		if (root_ht->refcnt > 2) {
 			*last = false;
 			goto ret;
 		}
-		if (root_ht->refcnt == 1) {
+		if (root_ht->refcnt == 2) {
 			if (!ht_empty(root_ht)) {
 				*last = false;
 				goto ret;
-- 
2.11.0

^ permalink raw reply related

* [PATCH v2 net-next 1/2] tcp: show number of network segments in some SNMP counters
From: Yafang Shao @ 2018-09-09  3:14 UTC (permalink / raw)
  To: edumazet, davem; +Cc: netdev, linux-kernel, Yafang Shao

It is better to show the number of network segments in bellow SNMP
counters, because that could be more useful for the user.
For example, the user could easily figure out how mant packets are
dropped and how many packets are queued in the out-of-oder queue.

- LINUX_MIB_TCPRCVQDROP
- LINUX_MIB_TCPZEROWINDOWDROP
- LINUX_MIB_TCPBACKLOGDROP
- LINUX_MIB_TCPMINTTLDROP
- LINUX_MIB_TCPOFODROP
- LINUX_MIB_TCPOFOQUEUE

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
 net/ipv4/tcp_input.c | 18 ++++++++++++------
 net/ipv4/tcp_ipv4.c  |  9 ++++++---
 net/ipv6/tcp_ipv6.c  |  6 ++++--
 3 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 62508a2..c2ce334 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4496,7 +4496,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	tcp_ecn_check_ce(sk, skb);
 
 	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
-		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
+		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP,
+			      max_t(u16, 1, skb_shinfo(skb)->gso_segs));
 		tcp_drop(sk, skb);
 		return;
 	}
@@ -4505,7 +4506,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	tp->pred_flags = 0;
 	inet_csk_schedule_ack(sk);
 
-	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
+	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE,
+		      max_t(u16, 1, skb_shinfo(skb)->gso_segs));
 	seq = TCP_SKB_CB(skb)->seq;
 	end_seq = TCP_SKB_CB(skb)->end_seq;
 	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
@@ -4666,7 +4668,8 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
 	skb->len = size;
 
 	if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
-		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
+		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP,
+			      max_t(u16, 1, skb_shinfo(skb)->gso_segs));
 		goto err_free;
 	}
 
@@ -4725,7 +4728,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 	 */
 	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
 		if (tcp_receive_window(tp) == 0) {
-			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
+			NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP,
+				      max_t(u16, 1, skb_shinfo(skb)->gso_segs));
 			goto out_of_window;
 		}
 
@@ -4734,7 +4738,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 		if (skb_queue_len(&sk->sk_receive_queue) == 0)
 			sk_forced_mem_schedule(sk, skb->truesize);
 		else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
-			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
+			NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP,
+				      max_t(u16, 1, skb_shinfo(skb)->gso_segs));
 			goto drop;
 		}
 
@@ -4796,7 +4801,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 		 * remembering D-SACK for its head made in previous line.
 		 */
 		if (!tcp_receive_window(tp)) {
-			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
+			NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP,
+				      max_t(u16, 1, skb_shinfo(skb)->gso_segs));
 			goto out_of_window;
 		}
 		goto queue_and_out;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 09547ef..23d7cb5 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -475,7 +475,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 		goto out;
 
 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
-		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
+		__NET_ADD_STATS(net, LINUX_MIB_TCPMINTTLDROP,
+				max_t(u16, 1, skb_shinfo(skb)->gso_segs));
 		goto out;
 	}
 
@@ -1633,7 +1634,8 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
 
 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
 		bh_unlock_sock(sk);
-		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
+		__NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP,
+				max_t(u16, 1, skb_shinfo(skb)->gso_segs));
 		return true;
 	}
 	return false;
@@ -1790,7 +1792,8 @@ int tcp_v4_rcv(struct sk_buff *skb)
 		}
 	}
 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
-		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
+		__NET_ADD_STATS(net, LINUX_MIB_TCPMINTTLDROP,
+				max_t(u16, 1, skb_shinfo(skb)->gso_segs));
 		goto discard_and_relse;
 	}
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 03e6b7a..bbf7667 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -391,7 +391,8 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 		goto out;
 
 	if (ipv6_hdr(skb)->hop_limit < inet6_sk(sk)->min_hopcount) {
-		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
+		__NET_ADD_STATS(net, LINUX_MIB_TCPMINTTLDROP,
+				max_t(u16, 1, skb_shinfo(skb)->gso_segs));
 		goto out;
 	}
 
@@ -1523,7 +1524,8 @@ static int tcp_v6_rcv(struct sk_buff *skb)
 		}
 	}
 	if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) {
-		__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
+		__NET_ADD_STATS(net, LINUX_MIB_TCPMINTTLDROP,
+				max_t(u16, 1, skb_shinfo(skb)->gso_segs));
 		goto discard_and_relse;
 	}
 
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v2 net-next 2/2] tcp: fix the error count of tcpInSegs
From: Yafang Shao @ 2018-09-09  3:14 UTC (permalink / raw)
  To: edumazet, davem; +Cc: netdev, linux-kernel, Yafang Shao
In-Reply-To: <1536462862-11767-1-git-send-email-laoar.shao@gmail.com>

In RFC1213, the tcpInSegs is the total number of segments received.
While currently it is the total number of SKBs received.
The number of SKBs may be not equal with the numer of segments because of
GRO.
So fix this error count.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
 include/net/tcp.h   | 2 ++
 net/ipv4/tcp_ipv4.c | 3 ++-
 net/ipv6/tcp_ipv6.c | 3 ++-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 770917d..66578f4 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -310,6 +310,8 @@ static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
 #define __TCP_INC_STATS(net, field)	__SNMP_INC_STATS((net)->mib.tcp_statistics, field)
 #define TCP_DEC_STATS(net, field)	SNMP_DEC_STATS((net)->mib.tcp_statistics, field)
 #define TCP_ADD_STATS(net, field, val)	SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)
+#define __TCP_ADD_STATS(net, field, val)	\
+	__SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)
 
 void tcp_tasklet_init(void);
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 23d7cb5..2b98242 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1703,7 +1703,8 @@ int tcp_v4_rcv(struct sk_buff *skb)
 		goto discard_it;
 
 	/* Count it even if it's bad */
-	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
+	__TCP_ADD_STATS(net, TCP_MIB_INSEGS,
+			max_t(u16, 1, skb_shinfo(skb)->gso_segs));
 
 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
 		goto discard_it;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index bbf7667..8d4ef46 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1441,7 +1441,8 @@ static int tcp_v6_rcv(struct sk_buff *skb)
 	/*
 	 *	Count it even if it's bad.
 	 */
-	__TCP_INC_STATS(net, TCP_MIB_INSEGS);
+	__TCP_ADD_STATS(net, TCP_MIB_INSEGS,
+			max_t(u16, 1, skb_shinfo(skb)->gso_segs));
 
 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
 		goto discard_it;
-- 
1.8.3.1

^ permalink raw reply related

* Re: Why not use all the syn queues? in the function "tcp_conn_request", I have some questions.
From: Ttttabcd @ 2018-09-09  1:14 UTC (permalink / raw)
  To: Neal Cardwell; +Cc: Netdev
In-Reply-To: <CADVnQymHJ5VGpawQMWtcvO6YUTC-tV9bWcb0meu=c0MKmQdyhQ@mail.gmail.com>

Sent with ProtonMail Secure Email.

‐‐‐‐‐‐‐ Original Message ‐‐‐‐‐‐‐
On Sunday, 9 September 2018 02:24, Neal Cardwell <ncardwell@google.com> wrote:

> By default, and essentially always in practice (AFAIK), Linux
> installations enable syncookies. With syncookies, there is essentially
> no limit on the syn queue, or number of incomplete passive connections
> (as the man page you quoted notes). So in practice the listen()
> parameter usually controls only the accept queue.
>

>
> That discussion pertains to a code path that is relevant if syncookies
> are disabled, which is very uncommon (see above).
>

Yes, when I tested, I disabled syncookies. I want to know how the kernel will handle syn attacks if syncookies are disabled.

> Keep in mind that the semantics of the listen() argument and the
> /proc/sys/net/ipv4/tcp_max_syn_backlog sysctl knob, as described in
> the man page, are part of the Linux kernel's user-visible API. So, in
> essence, they cannot be changed. Changing the semantics of system
> calls and sysctl knobs breaks applications and system configuration
> scripts. :-)

So, as you said

Is there a historical issue with two variables controlling the syn queue?

^ permalink raw reply

* Re: [PATCH net-next] veth: report NEWLINK event when moving the peer device in a new namespace
From: David Ahern @ 2018-09-09  1:16 UTC (permalink / raw)
  To: Thomas Haller, Lorenzo Bianconi; +Cc: David S. Miller, Network Development
In-Reply-To: <8eb1f295bbfe0d662e0df19448e9719be24348f5.camel@redhat.com>

Hi Thomas:

On 9/7/18 12:52 PM, Thomas Haller wrote:
> Hi David,
> 
> 
> On Mon, 2018-09-03 at 20:54 -0600, David Ahern wrote:
> 
>> From init_net:
>> $ ip monitor all-nsid
> 
> I thought the concern of the patch is the overhead of sending one
> additional RTM_NEWLINK message. This workaround has likely higher
> overhead. More importantly, it's so cumbersome, that I doubt anybody
> would implementing such a solution.

Yes, the concern is additional messages. Each one adds non-negligible
overhead to build and send, something that is measurable in a system at
scale. For example, consider an interface manager creating 4k VLANs on
one or more bridges. The RTM_NEWLINK messages (there are 2) adds 33% to
the overhead and time of creating the vlans. Just in some quick tests I
see times vary from as little as 15 usec to over 1 msec - per message
with the RTNL held. The norm seems to be somewhere between 35 and 40
usecs, but those add up when it is per link and per action.

We have to more disciplined about the size and frequency of these messages.

> 
> When the events of one namespace are not sufficient to get all relevant
> information (local to the namespace itself), the solution is not
> monitor all-nsid.
> 
> You might save complexity and performance overhead in kernel. But what
> you save here is just moved to user-space, which faces higher
> complexity (at multiple places/projects, where developers are not
> experts in netlink) and higher overhead.

First, this is one use case that seems to care about a message coming
back on the veth pairs. The message is sent for everyone, always. Adding
a 3rd message puts additional load on all use cases.

Second, I am questioning this use case driving a kernel change. You say
a userspace app cares about a veth pair but only wants to monitor events
for one end of it. That's a userspace choice. Another argument brought
up is that the other end of the pair can change namespaces faster than
the app can track it. Sure. But the monitored end can move as well - and
faster than the app can track it. e.g.,

1. veth1 and veth2 are a pair.
2. app filters events for veth1 only
3. veth2 is moved to namespace foo
4. app is sent notification of move
5. veth2 is moved to namespace bar
6. app processes notification of the event in step 3, looks in namespace
foo and link is gone.

That seems to be the fundamentals of your request for this new message, yes?

What happens when veth1 is moved to another namespace and then another
namespace - faster than the app can not track it? You have the same
problem. Your use case may not have this problem, but generically it can
happen and this solution does not cover it.

The movement of links is provided to userspace, and userspace needs to
be smart about what it wants to do.

Alternatively, you could provide an API for your interface manager --
whatever is handling the link movement. It know where it is moving
interfaces. You could have it respond to inquiries of where device X is
at any given moment.

^ permalink raw reply

* Re: Containers and checkpoint/restart micro-conference at LPC2018
From: Christian Brauner @ 2018-09-09  1:31 UTC (permalink / raw)
  To: Amir Goldstein
  Cc: Stephane Graber, containers, Miklos Szeredi, Netdev, overlayfs,
	lxc-users, LSM List, lxc-devel, linux-fsdevel
In-Reply-To: <CAOQ4uxiQEpofdS97kxnii8LtVW2QiKAGvjjaH0Px-Bj3eHVCFA@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 3905 bytes --]

On Sat, Sep 08, 2018 at 10:41:41AM +0300, Amir Goldstein wrote:
> On Sat, Sep 8, 2018 at 8:00 AM Stéphane Graber <stgraber@ubuntu.com> wrote:
> >
> > On Mon, Aug 13, 2018 at 12:10:15PM -0400, Stéphane Graber wrote:
> > > Hello,
> > >
> > > This year's edition of the Linux Plumbers Conference will once again
> > > have a containers micro-conference but this time around we'll have twice
> > > the usual amount of time and will include the content that would
> > > traditionally go into the checkpoint/restore micro-conference.
> > >
> > > LPC2018 will be held in Vancouver, Canada from the 13th to the 15th of
> > > November, co-located with the Linux Kernel Summit.
> > >
> > >
> > > We're looking for discussion topics around kernel work related to
> > > containers and namespacing, resource control, access control,
> > > checkpoint/restore of kernel structures, filesystem/mount handling for
> > > containers and any related userspace work.
> > >
> > >
> > > The format of the event will mostly be discussions where someone
> > > introduces a given topic/problem and it then gets discussed for 20-30min
> > > before moving on to something else. There will also be limited room for
> > > short demos of recent work with shorter 15min slots.
> > >
> > >
> > > Details can be found here:
> > >
> > >   https://discuss.linuxcontainers.org/t/containers-micro-conference-at-linux-plumbers-2018/2417
> > >
> > >
> > > Looking forward to seeing you in Vancouver!
> >
> > Hello,
> >
> > We've added an extra week to the CFP, new deadline is Friday 14th of September.
> >
> > If you were thinking about sending something bug then forgot or just
> > missed the deadline, now is your chance to send it!
> >
> 
> [cc: overlayfs developers]
> 
> Hi Stéphane!

Hey Amir,

I'm one of the co-organizers of the microconf.

> 
> I am not planing to travel to LPC this year, so this is more of an FYI than
> a CFP, but maybe another overlayfs developer can pick up this glove??

Sure, that would be great.

> 
> For the past two years I have participated in the effort to fix overlayfs
> "non-standard" behavior:
> https://github.com/amir73il/overlayfs/wiki/Overlayfs-non-standard-behavior

Yes, this is an issue that we were aware of for a long time and it
something that has made overlayfs somewhat more difficult to use than it
should be.

> 
> Allegedly, this effort went underway to improve the experience of overlayfs
> users, who are mostly applications running inside containers. For backward
> compatibility reasons, container runtimes will need to opt-in for fixing some
> of the legacy behavior.
> 
> In reality, I have seen very little cross list interaction between linux-unionfs
> and containers mailing lists. The only interaction I recall in the
> past two years
> ended up in a fix in overlayfs to require opt-in for fixing yet another backward
> compatible bad behavior, although docker did follow up shortly after to fix
> bad practice in container runtime:
> https://github.com/moby/moby/issues/34672
> 
> So the questions I would like to relay to the micro-conf participants w.r.t the
> new opt-in overlayfs behavior:
> 1. Did you know?

I personally did not know about the new opt-in behavior. More reason to
give a talk! :)

> 2. Do you care?

Yes, we do care. However - speaking as LXC upstream now - we have
recently focused on getting shiftfs to work rather than overlayfs.

We are more than happy to have a overlayfs talk at the microconf. If
someone were to talk about:
- What non-standard behavior has already been fixed?
- How has it been fixed?
- What non-standard behavior still needs to be fixed?
- Outstanding problems that either still need a solution or
  are solved but one would like feedback on the implementation. This way
  we can have a good discussion.

Thanks!
Christian

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* [PATCH net 00/13] cls_u32 cleanups and fixes.
From: Al Viro @ 2018-09-09  1:31 UTC (permalink / raw)
  To: netdev; +Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko

From: Al Viro <viro@zeniv.linux.org.uk>

A series of net/sched/cls_u32.c cleanups and fixes:
	1) fix hnode refcounting.  Refcounting for tc_u_hnode is broken;
it's not hard to trigger oopsen (including one inside an interrupt handler,
with resulting panic) as well as memory corruption.  Definitely -stable
fodder.
	2) mark root hnode explicitly.  Consistent errors on attempts to
delete root hnodes.  Less serious than 1/13.
	3) disallow linking to root hnode.  Prohibit creating links to
root hnodes; not critical (nothing actually breaks if we do allow those),
but gets rid of surprising cases.
	4) make sure that divisor is a power of 2.  Missing validation -
divisor is documented as power of 2, but that's not actually enforced.
Results are moderately bogus (i.e. the kernel doesn't break), but rather
surprising.

Those are fixes, or at least can be argued to be such.  The rest are pure
cleanups:
	5) get rid of unused argument of u32_destroy_key()
	6) get rid of tc_u_knode ->tp
	7) get rid of tc_u_common ->rcu
Eliminate some unused fields.
	8) clean tc_u_common hashtable.
Hash lookups are best done with minimum of calculations per chain
element - comparing the field in each candidate with f(key) where
f() is actually a pure function is not nice, especially when
compiler doesn't see f() as such...  Better calculate f(key) once,
especially since we need its value to choose the hash chain in
the first place.
	9) pass tc_u_common to u32_set_parms() instead of tc_u_hnode
	10) the tp_c argument of u32_set_parms() is always tp->data
	11) get rid of hnode ->tp_c and tp_c argument of u32_set_parms()
Massage that ends with getting rid of a redundant field.
	12) keep track of knodes count in tc_u_common
	13) simplify the hell out u32_delete() emptiness check
Checking if a filter needs to be killed after u32_delete() can be
done much easier - the test is equivalent to "filter doesn't
have ->data shared with anyone else and it has no knodes left
in it" and keeping track of the number of knodes is trivial.

^ permalink raw reply

* [PATCH net 02/13] net: sched: cls_u32: mark root hnode explicitly
From: Al Viro @ 2018-09-09  1:31 UTC (permalink / raw)
  To: netdev; +Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko
In-Reply-To: <20180909013132.3222-1-viro@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

... and produce consistent error on attempt to delete such.
Existing check in u32_delete() is inconsistent - after

tc qdisc add dev eth0 ingress
tc filter add dev eth0 parent ffff: protocol ip prio 100 handle 1: u32 divisor 1
tc filter add dev eth0 parent ffff: protocol ip prio 200 handle 2: u32 divisor 1

both

tc filter delete dev eth0 parent ffff: protocol ip prio 100 handle 801: u32

and

tc filter delete dev eth0 parent ffff: protocol ip prio 100 handle 800: u32

will fail (at least with refcounting fixes), but the former will complain
about an attempt to remove a busy table, while the latter will recognize
it as root and yield "Not allowed to delete root node" instead.

The problem with the existing check is that several tcf_proto instances might
share the same tp->data and handle-to-hnode lookup will be the same for all
of them.  So comparing an hnode to be deleted with tp->root won't catch the
case when one tp is used to try deleting the root of another.  Solution is
trivial - mark the root hnodes explicitly upon allocation and check for that.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/sched/cls_u32.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index b2c3406a2cf2..c4782aa808c7 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -84,6 +84,7 @@ struct tc_u_hnode {
 	int			refcnt;
 	unsigned int		divisor;
 	struct idr		handle_idr;
+	bool			is_root;
 	struct rcu_head		rcu;
 	u32			flags;
 	/* The 'ht' field MUST be the last field in structure to allow for
@@ -377,6 +378,7 @@ static int u32_init(struct tcf_proto *tp)
 	root_ht->refcnt++;
 	root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : 0x80000000;
 	root_ht->prio = tp->prio;
+	root_ht->is_root = true;
 	idr_init(&root_ht->handle_idr);
 
 	if (tp_c == NULL) {
@@ -693,7 +695,7 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last,
 		goto out;
 	}
 
-	if (root_ht == ht) {
+	if (ht->is_root) {
 		NL_SET_ERR_MSG_MOD(extack, "Not allowed to delete root node");
 		return -EINVAL;
 	}
-- 
2.11.0

^ permalink raw reply related

* [PATCH net 03/13] net: sched: cls_u32: disallow linking to root hnode
From: Al Viro @ 2018-09-09  1:31 UTC (permalink / raw)
  To: netdev; +Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko
In-Reply-To: <20180909013132.3222-1-viro@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

Operation makes no sense.  Nothing will actually break if we do so
(depth limit in u32_classify() will prevent infinite loops), but
according to maintainers it's best prohibited outright.

NOTE: doing so guarantees that u32_destroy() will trigger the call
of u32_destroy_hnode(); we might want to make that unconditional.

Test:
tc qdisc add dev eth0 ingress
tc filter add dev eth0 parent ffff: protocol ip prio 100 u32 \
	link 800: offset at 0 mask 0f00 shift 6 plus 0 eat match ip protocol 6 ff
should fail with
Error: cls_u32: Not linking to root node

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/sched/cls_u32.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index c4782aa808c7..72459b09d910 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -797,6 +797,10 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp,
 				NL_SET_ERR_MSG_MOD(extack, "Link hash table not found");
 				return -EINVAL;
 			}
+			if (ht_down->is_root) {
+				NL_SET_ERR_MSG_MOD(extack, "Not linking to root node");
+				return -EINVAL;
+			}
 			ht_down->refcnt++;
 		}
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH net 04/13] net: sched: cls_u32: make sure that divisor is a power of 2
From: Al Viro @ 2018-09-09  1:31 UTC (permalink / raw)
  To: netdev; +Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko
In-Reply-To: <20180909013132.3222-1-viro@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/sched/cls_u32.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 72459b09d910..d9923d474b65 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -994,7 +994,11 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	if (tb[TCA_U32_DIVISOR]) {
 		unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]);
 
-		if (--divisor > 0x100) {
+		if (!is_power_of_2(divisor)) {
+			NL_SET_ERR_MSG_MOD(extack, "Divisor is not a power of 2");
+			return -EINVAL;
+		}
+		if (divisor-- > 0x100) {
 			NL_SET_ERR_MSG_MOD(extack, "Exceeded maximum 256 hash buckets");
 			return -EINVAL;
 		}
-- 
2.11.0

^ permalink raw reply related

* [PATCH net 05/13] net: sched: cls_u32: get rid of unused argument of u32_destroy_key()
From: Al Viro @ 2018-09-09  1:31 UTC (permalink / raw)
  To: netdev; +Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko
In-Reply-To: <20180909013132.3222-1-viro@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/sched/cls_u32.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index d9923d474b65..d11862823911 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -406,8 +406,7 @@ static int u32_init(struct tcf_proto *tp)
 	return 0;
 }
 
-static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n,
-			   bool free_pf)
+static int u32_destroy_key(struct tc_u_knode *n, bool free_pf)
 {
 	struct tc_u_hnode *ht = rtnl_dereference(n->ht_down);
 
@@ -441,7 +440,7 @@ static void u32_delete_key_work(struct work_struct *work)
 					      struct tc_u_knode,
 					      rwork);
 	rtnl_lock();
-	u32_destroy_key(key->tp, key, false);
+	u32_destroy_key(key, false);
 	rtnl_unlock();
 }
 
@@ -458,7 +457,7 @@ static void u32_delete_key_freepf_work(struct work_struct *work)
 					      struct tc_u_knode,
 					      rwork);
 	rtnl_lock();
-	u32_destroy_key(key->tp, key, true);
+	u32_destroy_key(key, true);
 	rtnl_unlock();
 }
 
@@ -601,7 +600,7 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
 			if (tcf_exts_get_net(&n->exts))
 				tcf_queue_work(&n->rwork, u32_delete_key_freepf_work);
 			else
-				u32_destroy_key(n->tp, n, true);
+				u32_destroy_key(n, true);
 		}
 	}
 }
@@ -971,13 +970,13 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 				    tca[TCA_RATE], ovr, extack);
 
 		if (err) {
-			u32_destroy_key(tp, new, false);
+			u32_destroy_key(new, false);
 			return err;
 		}
 
 		err = u32_replace_hw_knode(tp, new, flags, extack);
 		if (err) {
-			u32_destroy_key(tp, new, false);
+			u32_destroy_key(new, false);
 			return err;
 		}
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH net 06/13] net: sched: cls_u32: get rid of tc_u_knode ->tp
From: Al Viro @ 2018-09-09  1:31 UTC (permalink / raw)
  To: netdev; +Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko
In-Reply-To: <20180909013132.3222-1-viro@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

not used anymore

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/sched/cls_u32.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index d11862823911..281ac954511c 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -68,7 +68,6 @@ struct tc_u_knode {
 	u32			mask;
 	u32 __percpu		*pcpu_success;
 #endif
-	struct tcf_proto	*tp;
 	struct rcu_work		rwork;
 	/* The 'sel' field MUST be the last field in structure to allow for
 	 * tc_u32_keys allocated at end of structure.
@@ -896,7 +895,6 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp,
 	/* Similarly success statistics must be moved as pointers */
 	new->pcpu_success = n->pcpu_success;
 #endif
-	new->tp = tp;
 	memcpy(&new->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key));
 
 	if (tcf_exts_init(&new->exts, TCA_U32_ACT, TCA_U32_POLICE)) {
@@ -1112,7 +1110,6 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	n->handle = handle;
 	n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
 	n->flags = flags;
-	n->tp = tp;
 
 	err = tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);
 	if (err < 0)
-- 
2.11.0

^ permalink raw reply related

* [PATCH net 07/13] net: sched: cls_u32: get rid of tc_u_common ->rcu
From: Al Viro @ 2018-09-09  1:31 UTC (permalink / raw)
  To: netdev; +Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko
In-Reply-To: <20180909013132.3222-1-viro@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

unused

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/sched/cls_u32.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 281ac954511c..1bfbfcab7260 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -98,7 +98,6 @@ struct tc_u_common {
 	int			refcnt;
 	struct idr		handle_idr;
 	struct hlist_node	hnode;
-	struct rcu_head		rcu;
 };
 
 static inline unsigned int u32_hash_fold(__be32 key,
-- 
2.11.0

^ permalink raw reply related

* [PATCH net 08/13] net: sched: cls_u32: clean tc_u_common hashtable
From: Al Viro @ 2018-09-09  1:31 UTC (permalink / raw)
  To: netdev; +Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko
In-Reply-To: <20180909013132.3222-1-viro@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

* calculate key *once*, not for each hash chain element
* let tc_u_hash() return the pointer to chain head rather than index -
callers are cleaner that way.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/sched/cls_u32.c | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 1bfbfcab7260..af05113c1212 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -343,19 +343,16 @@ static void *tc_u_common_ptr(const struct tcf_proto *tp)
 		return block->q;
 }
 
-static unsigned int tc_u_hash(const struct tcf_proto *tp)
+static struct hlist_head *tc_u_hash(void *key)
 {
-	return hash_ptr(tc_u_common_ptr(tp), U32_HASH_SHIFT);
+	return tc_u_common_hash + hash_ptr(key, U32_HASH_SHIFT);
 }
 
-static struct tc_u_common *tc_u_common_find(const struct tcf_proto *tp)
+static struct tc_u_common *tc_u_common_find(void *key)
 {
 	struct tc_u_common *tc;
-	unsigned int h;
-
-	h = tc_u_hash(tp);
-	hlist_for_each_entry(tc, &tc_u_common_hash[h], hnode) {
-		if (tc->ptr == tc_u_common_ptr(tp))
+	hlist_for_each_entry(tc, tc_u_hash(key), hnode) {
+		if (tc->ptr == key)
 			return tc;
 	}
 	return NULL;
@@ -364,10 +361,8 @@ static struct tc_u_common *tc_u_common_find(const struct tcf_proto *tp)
 static int u32_init(struct tcf_proto *tp)
 {
 	struct tc_u_hnode *root_ht;
-	struct tc_u_common *tp_c;
-	unsigned int h;
-
-	tp_c = tc_u_common_find(tp);
+	void *key = tc_u_common_ptr(tp);
+	struct tc_u_common *tp_c = tc_u_common_find(key);
 
 	root_ht = kzalloc(sizeof(*root_ht), GFP_KERNEL);
 	if (root_ht == NULL)
@@ -385,12 +380,11 @@ static int u32_init(struct tcf_proto *tp)
 			kfree(root_ht);
 			return -ENOBUFS;
 		}
-		tp_c->ptr = tc_u_common_ptr(tp);
+		tp_c->ptr = key;
 		INIT_HLIST_NODE(&tp_c->hnode);
 		idr_init(&tp_c->handle_idr);
 
-		h = tc_u_hash(tp);
-		hlist_add_head(&tp_c->hnode, &tc_u_common_hash[h]);
+		hlist_add_head(&tp_c->hnode, tc_u_hash(key));
 	}
 
 	tp_c->refcnt++;
-- 
2.11.0

^ permalink raw reply related

* [PATCH net 09/13] net: sched: cls_u32: pass tc_u_common to u32_set_parms() instead of tc_u_hnode
From: Al Viro @ 2018-09-09  1:31 UTC (permalink / raw)
  To: netdev; +Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko
In-Reply-To: <20180909013132.3222-1-viro@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

the only thing we used ht for was ht->tp_c and callers can get that
without going through ->tp_c at all; start with lifting that into
the callers, next commits will massage those, eventually removing
->tp_c altogether.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/sched/cls_u32.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index af05113c1212..221ce532b241 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -761,7 +761,7 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
 };
 
 static int u32_set_parms(struct net *net, struct tcf_proto *tp,
-			 unsigned long base, struct tc_u_hnode *ht,
+			 unsigned long base, struct tc_u_common *tp_c,
 			 struct tc_u_knode *n, struct nlattr **tb,
 			 struct nlattr *est, bool ovr,
 			 struct netlink_ext_ack *extack)
@@ -782,7 +782,7 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp,
 		}
 
 		if (handle) {
-			ht_down = u32_lookup_ht(ht->tp_c, handle);
+			ht_down = u32_lookup_ht(tp_c, handle);
 
 			if (!ht_down) {
 				NL_SET_ERR_MSG_MOD(extack, "Link hash table not found");
@@ -957,7 +957,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 			return -ENOMEM;
 
 		err = u32_set_parms(net, tp, base,
-				    rtnl_dereference(n->ht_up), new, tb,
+				    rtnl_dereference(n->ht_up)->tp_c, new, tb,
 				    tca[TCA_RATE], ovr, extack);
 
 		if (err) {
@@ -1124,7 +1124,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	}
 #endif
 
-	err = u32_set_parms(net, tp, base, ht, n, tb, tca[TCA_RATE], ovr,
+	err = u32_set_parms(net, tp, base, ht->tp_c, n, tb, tca[TCA_RATE], ovr,
 			    extack);
 	if (err == 0) {
 		struct tc_u_knode __rcu **ins;
-- 
2.11.0

^ permalink raw reply related

* [PATCH net 10/13] net: sched: cls_u32: the tp_c argument of u32_set_parms() is always tp->data
From: Al Viro @ 2018-09-09  1:31 UTC (permalink / raw)
  To: netdev; +Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko
In-Reply-To: <20180909013132.3222-1-viro@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

It must be tc_u_common associated with that tp (i.e. tp->data).
Proof:
	* both ->ht_up and ->tp_c are assign-once
	* ->tp_c of anything inserted into tp_c->hlist is tp_c
	* hnodes never get reinserted into the lists or moved
between those, so anything found by u32_lookup_ht(tp->data, ...)
will have ->tp_c equal to tp->data.
	* tp->root->tp_c == tp->data.
	* ->ht_up of anything inserted into hnode->ht[...] is
equal to hnode.
	* knodes never get reinserted into hash chains or moved
between those, so anything returned by u32_lookup_key(ht, ...)
will have ->ht_up equal to ht.
	* any knode returned by u32_get(tp, ...) will have ->ht_up->tp_c
point to tp->data

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/sched/cls_u32.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 221ce532b241..12757e3ec8d8 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -956,8 +956,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		if (!new)
 			return -ENOMEM;
 
-		err = u32_set_parms(net, tp, base,
-				    rtnl_dereference(n->ht_up)->tp_c, new, tb,
+		err = u32_set_parms(net, tp, base, tp_c, new, tb,
 				    tca[TCA_RATE], ovr, extack);
 
 		if (err) {
@@ -1124,7 +1123,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	}
 #endif
 
-	err = u32_set_parms(net, tp, base, ht->tp_c, n, tb, tca[TCA_RATE], ovr,
+	err = u32_set_parms(net, tp, base, tp_c, n, tb, tca[TCA_RATE], ovr,
 			    extack);
 	if (err == 0) {
 		struct tc_u_knode __rcu **ins;
-- 
2.11.0

^ permalink raw reply related

* [PATCH net 11/13] net: sched: cls_u32: get rid of hnode ->tp_c and tp_c argument of u32_set_parms()
From: Al Viro @ 2018-09-09  1:31 UTC (permalink / raw)
  To: netdev; +Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko
In-Reply-To: <20180909013132.3222-1-viro@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

the latter is redundant, the former - never read...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/sched/cls_u32.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 12757e3ec8d8..f6bb3885598d 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -79,7 +79,6 @@ struct tc_u_hnode {
 	struct tc_u_hnode __rcu	*next;
 	u32			handle;
 	u32			prio;
-	struct tc_u_common	*tp_c;
 	int			refcnt;
 	unsigned int		divisor;
 	struct idr		handle_idr;
@@ -390,7 +389,6 @@ static int u32_init(struct tcf_proto *tp)
 	tp_c->refcnt++;
 	RCU_INIT_POINTER(root_ht->next, tp_c->hlist);
 	rcu_assign_pointer(tp_c->hlist, root_ht);
-	root_ht->tp_c = tp_c;
 
 	root_ht->refcnt++;
 	rcu_assign_pointer(tp->root, root_ht);
@@ -761,7 +759,7 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
 };
 
 static int u32_set_parms(struct net *net, struct tcf_proto *tp,
-			 unsigned long base, struct tc_u_common *tp_c,
+			 unsigned long base,
 			 struct tc_u_knode *n, struct nlattr **tb,
 			 struct nlattr *est, bool ovr,
 			 struct netlink_ext_ack *extack)
@@ -782,7 +780,7 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp,
 		}
 
 		if (handle) {
-			ht_down = u32_lookup_ht(tp_c, handle);
+			ht_down = u32_lookup_ht(tp->data, handle);
 
 			if (!ht_down) {
 				NL_SET_ERR_MSG_MOD(extack, "Link hash table not found");
@@ -956,7 +954,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		if (!new)
 			return -ENOMEM;
 
-		err = u32_set_parms(net, tp, base, tp_c, new, tb,
+		err = u32_set_parms(net, tp, base, new, tb,
 				    tca[TCA_RATE], ovr, extack);
 
 		if (err) {
@@ -1012,7 +1010,6 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 				return err;
 			}
 		}
-		ht->tp_c = tp_c;
 		ht->refcnt = 1;
 		ht->divisor = divisor;
 		ht->handle = handle;
@@ -1123,8 +1120,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	}
 #endif
 
-	err = u32_set_parms(net, tp, base, tp_c, n, tb, tca[TCA_RATE], ovr,
-			    extack);
+	err = u32_set_parms(net, tp, base, n, tb, tca[TCA_RATE], ovr, extack);
 	if (err == 0) {
 		struct tc_u_knode __rcu **ins;
 		struct tc_u_knode *pins;
-- 
2.11.0

^ permalink raw reply related

* [PATCH net 12/13] net: sched: cls_u32: keep track of knodes count in tc_u_common
From: Al Viro @ 2018-09-09  1:31 UTC (permalink / raw)
  To: netdev; +Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko
In-Reply-To: <20180909013132.3222-1-viro@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

allows to simplify u32_delete() considerably

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/sched/cls_u32.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index f6bb3885598d..86cbe4f5800e 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -97,6 +97,7 @@ struct tc_u_common {
 	int			refcnt;
 	struct idr		handle_idr;
 	struct hlist_node	hnode;
+	long			knodes;
 };
 
 static inline unsigned int u32_hash_fold(__be32 key,
@@ -453,6 +454,7 @@ static void u32_delete_key_freepf_work(struct work_struct *work)
 
 static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
 {
+	struct tc_u_common *tp_c = tp->data;
 	struct tc_u_knode __rcu **kp;
 	struct tc_u_knode *pkp;
 	struct tc_u_hnode *ht = rtnl_dereference(key->ht_up);
@@ -463,6 +465,7 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
 		     kp = &pkp->next, pkp = rtnl_dereference(*kp)) {
 			if (pkp == key) {
 				RCU_INIT_POINTER(*kp, key->next);
+				tp_c->knodes--;
 
 				tcf_unbind_filter(tp, &key->res);
 				idr_remove(&ht->handle_idr, key->handle);
@@ -577,6 +580,7 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
 static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
 			    struct netlink_ext_ack *extack)
 {
+	struct tc_u_common *tp_c = tp->data;
 	struct tc_u_knode *n;
 	unsigned int h;
 
@@ -584,6 +588,7 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
 		while ((n = rtnl_dereference(ht->ht[h])) != NULL) {
 			RCU_INIT_POINTER(ht->ht[h],
 					 rtnl_dereference(n->next));
+			tp_c->knodes--;
 			tcf_unbind_filter(tp, &n->res);
 			u32_remove_hw_knode(tp, n, extack);
 			idr_remove(&ht->handle_idr, n->handle);
@@ -1140,6 +1145,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 
 		RCU_INIT_POINTER(n->next, pins);
 		rcu_assign_pointer(*ins, n);
+		tp_c->knodes++;
 		*arg = n;
 		return 0;
 	}
-- 
2.11.0

^ permalink raw reply related

* [PATCH net 13/13] net: sched: cls_u32: simplify the hell out u32_delete() emptiness check
From: Al Viro @ 2018-09-09  1:31 UTC (permalink / raw)
  To: netdev; +Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko
In-Reply-To: <20180909013132.3222-1-viro@ZenIV.linux.org.uk>

From: Al Viro <viro@zeniv.linux.org.uk>

Now that we have the knode count, we can instantly check if
any hnodes are non-empty.  And that kills the check for extra
references to root hnode - those could happen only if there was
a knode to carry such a link.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/sched/cls_u32.c | 48 +-----------------------------------------------
 1 file changed, 1 insertion(+), 47 deletions(-)

diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 86cbe4f5800e..ef786ec24843 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -628,17 +628,6 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
 	return -ENOENT;
 }
 
-static bool ht_empty(struct tc_u_hnode *ht)
-{
-	unsigned int h;
-
-	for (h = 0; h <= ht->divisor; h++)
-		if (rcu_access_pointer(ht->ht[h]))
-			return false;
-
-	return true;
-}
-
 static void u32_destroy(struct tcf_proto *tp, struct netlink_ext_ack *extack)
 {
 	struct tc_u_common *tp_c = tp->data;
@@ -676,13 +665,9 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last,
 		      struct netlink_ext_ack *extack)
 {
 	struct tc_u_hnode *ht = arg;
-	struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);
 	struct tc_u_common *tp_c = tp->data;
 	int ret = 0;
 
-	if (ht == NULL)
-		goto out;
-
 	if (TC_U32_KEY(ht->handle)) {
 		u32_remove_hw_knode(tp, (struct tc_u_knode *)ht, extack);
 		ret = u32_delete_key(tp, (struct tc_u_knode *)ht);
@@ -702,38 +687,7 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last,
 	}
 
 out:
-	*last = true;
-	if (root_ht) {
-		if (root_ht->refcnt > 2) {
-			*last = false;
-			goto ret;
-		}
-		if (root_ht->refcnt == 2) {
-			if (!ht_empty(root_ht)) {
-				*last = false;
-				goto ret;
-			}
-		}
-	}
-
-	if (tp_c->refcnt > 1) {
-		*last = false;
-		goto ret;
-	}
-
-	if (tp_c->refcnt == 1) {
-		struct tc_u_hnode *ht;
-
-		for (ht = rtnl_dereference(tp_c->hlist);
-		     ht;
-		     ht = rtnl_dereference(ht->next))
-			if (!ht_empty(ht)) {
-				*last = false;
-				break;
-			}
-	}
-
-ret:
+	*last = tp_c->refcnt == 1 && tp_c->knodes == 0;
 	return ret;
 }
 
-- 
2.11.0

^ permalink raw reply related

* Re: [PATCH net] ipv6: use rt6_info members when dst is set in rt6_fill_node
From: David Ahern @ 2018-09-09  1:44 UTC (permalink / raw)
  To: Xin Long, network dev; +Cc: davem, Roopa Prabhu
In-Reply-To: <840f0c017c670ae9cc8fd8330ca24fcca1207918.1536398641.git.lucien.xin@gmail.com>

On 9/8/18 3:24 AM, Xin Long wrote:
> In inet6_rtm_getroute, since Commit 93531c674315 ("net/ipv6: separate
> handling of FIB entries from dst based routes"), it has used rt->from
> to dump route info instead of rt.
> 
> However for some route like cache, its information is not the same as
> that of the 'from' one. It caused 'ip -6 route get' to dump the wrong
> route information.
> 
> In Jianlin's testing, the output information even lost the expiration
> time for a pmtu route cache due to the wrong fib6_flags.

you are right about the flags ...

> 
> So change to use rt6_info members when it tries to dump a route entry
> without fibmatch set.

but not the src, dst and prefsrc.

> 
> Note that we will fix the gw/nh dump in another patch.

And only the gateway can change do to a redirect and redirects do not
change the device - only the gateway.

Let's do both changes in a single patch.

> 
> Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst based routes")
> Reported-by: Jianlin Shi <jishi@redhat.com>
> Signed-off-by: Xin Long <lucien.xin@gmail.com>
> ---
>  net/ipv6/route.c | 39 ++++++++++++++++++++++++++-------------
>  1 file changed, 26 insertions(+), 13 deletions(-)
> 
> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index 18e00ce..e554922 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -4670,20 +4670,33 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
>  			 int iif, int type, u32 portid, u32 seq,
>  			 unsigned int flags)
>  {
> -	struct rtmsg *rtm;
> +	struct rt6key *fib6_prefsrc, *fib6_dst, *fib6_src;
> +	struct rt6_info *rt6 = (struct rt6_info *)dst;
> +	u32 *pmetrics, table, fib6_flags;
>  	struct nlmsghdr *nlh;
> +	struct rtmsg *rtm;
>  	long expires = 0;
> -	u32 *pmetrics;
> -	u32 table;
>  
>  	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
>  	if (!nlh)
>  		return -EMSGSIZE;
>  
> +	if (rt6) {
> +		fib6_dst = &rt6->rt6i_dst;
> +		fib6_src = &rt6->rt6i_src;
> +		fib6_flags = rt6->rt6i_flags;
> +		fib6_prefsrc = &rt6->rt6i_prefsrc;
> +	} else {
> +		fib6_dst = &rt->fib6_dst;
> +		fib6_src = &rt->fib6_src;
> +		fib6_flags = rt->fib6_flags;
> +		fib6_prefsrc = &rt->fib6_prefsrc;
> +	}

Unless I am missing something at the moment, an rt6_info can only have
the same dst, src and prefsrc as the fib6_info on which it is based.
Thus, only the flags is needed above. That simplifies this patch a lot.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox