Netdev List
 help / color / mirror / Atom feed
* [RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue.
From: Sridhar Samudrala @ 2017-08-31 23:27 UTC (permalink / raw)
  To: alexander.h.duyck, netdev

This patch introduces a new socket option SO_SYMMETRIC_QUEUES that can be used
to enable symmetric tx and rx queues on a socket.

This option is specifically useful for epoll based multi threaded workloads
where each thread handles packets received on a single RX queue . In this model,
we have noticed that it helps to send the packets on the same TX queue
corresponding to the queue-pair associated with the RX queue specifically when
busy poll is enabled with epoll().

Two new fields are added to struct sock_common to cache the last rx ifindex and
the rx queue in the receive path of an SKB. __netdev_pick_tx() returns the cached
rx queue when this option is enabled and the TX is happening on the same device.

Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
---
 include/net/request_sock.h        |  1 +
 include/net/sock.h                | 17 +++++++++++++++++
 include/uapi/asm-generic/socket.h |  2 ++
 net/core/dev.c                    |  8 +++++++-
 net/core/sock.c                   | 10 ++++++++++
 net/ipv4/tcp_input.c              |  1 +
 net/ipv4/tcp_ipv4.c               |  1 +
 net/ipv4/tcp_minisocks.c          |  1 +
 8 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 23e2205..c3bc12e 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -100,6 +100,7 @@ static inline struct sock *req_to_sk(struct request_sock *req)
 	req_to_sk(req)->sk_prot = sk_listener->sk_prot;
 	sk_node_init(&req_to_sk(req)->sk_node);
 	sk_tx_queue_clear(req_to_sk(req));
+	req_to_sk(req)->sk_symmetric_queues = sk_listener->sk_symmetric_queues;
 	req->saved_syn = NULL;
 	refcount_set(&req->rsk_refcnt, 0);
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 03a3625..3421809 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -138,11 +138,14 @@ void SOCK_DEBUG(const struct sock *sk, const char *msg, ...)
  *	@skc_node: main hash linkage for various protocol lookup tables
  *	@skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
  *	@skc_tx_queue_mapping: tx queue number for this connection
+ *	@skc_rx_queue_mapping: rx queue number for this connection
+ *	@skc_rx_ifindex: rx ifindex for this connection
  *	@skc_flags: place holder for sk_flags
  *		%SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
  *		%SO_OOBINLINE settings, %SO_TIMESTAMPING settings
  *	@skc_incoming_cpu: record/match cpu processing incoming packets
  *	@skc_refcnt: reference count
+ *	@skc_symmetric_queues: symmetric tx/rx queues
  *
  *	This is the minimal network layer representation of sockets, the header
  *	for struct sock and struct inet_timewait_sock.
@@ -177,6 +180,7 @@ struct sock_common {
 	unsigned char		skc_reuseport:1;
 	unsigned char		skc_ipv6only:1;
 	unsigned char		skc_net_refcnt:1;
+	unsigned char		skc_symmetric_queues:1;
 	int			skc_bound_dev_if;
 	union {
 		struct hlist_node	skc_bind_node;
@@ -214,6 +218,8 @@ struct sock_common {
 		struct hlist_nulls_node skc_nulls_node;
 	};
 	int			skc_tx_queue_mapping;
+	int			skc_rx_queue_mapping;
+	int			skc_rx_ifindex;
 	union {
 		int		skc_incoming_cpu;
 		u32		skc_rcv_wnd;
@@ -324,6 +330,8 @@ struct sock {
 #define sk_nulls_node		__sk_common.skc_nulls_node
 #define sk_refcnt		__sk_common.skc_refcnt
 #define sk_tx_queue_mapping	__sk_common.skc_tx_queue_mapping
+#define sk_rx_queue_mapping	__sk_common.skc_rx_queue_mapping
+#define sk_rx_ifindex		__sk_common.skc_rx_ifindex
 
 #define sk_dontcopy_begin	__sk_common.skc_dontcopy_begin
 #define sk_dontcopy_end		__sk_common.skc_dontcopy_end
@@ -340,6 +348,7 @@ struct sock {
 #define sk_reuseport		__sk_common.skc_reuseport
 #define sk_ipv6only		__sk_common.skc_ipv6only
 #define sk_net_refcnt		__sk_common.skc_net_refcnt
+#define sk_symmetric_queues	__sk_common.skc_symmetric_queues
 #define sk_bound_dev_if		__sk_common.skc_bound_dev_if
 #define sk_bind_node		__sk_common.skc_bind_node
 #define sk_prot			__sk_common.skc_prot
@@ -1676,6 +1685,14 @@ static inline int sk_tx_queue_get(const struct sock *sk)
 	return sk ? sk->sk_tx_queue_mapping : -1;
 }
 
+static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff *skb)
+{
+	if (sk->sk_symmetric_queues) {
+		sk->sk_rx_ifindex = skb->skb_iif;
+		sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
+	}
+}
+
 static inline void sk_set_socket(struct sock *sk, struct socket *sock)
 {
 	sk_tx_queue_clear(sk);
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index e47c9e4..f6b416e 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -106,4 +106,6 @@
 
 #define SO_ZEROCOPY		60
 
+#define SO_SYMMETRIC_QUEUES	61
+
 #endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index 270b547..d96cda8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3322,7 +3322,13 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
 
 	if (queue_index < 0 || skb->ooo_okay ||
 	    queue_index >= dev->real_num_tx_queues) {
-		int new_index = get_xps_queue(dev, skb);
+		int new_index = -1;
+
+		if (sk && sk->sk_symmetric_queues && dev->ifindex == sk->sk_rx_ifindex)
+			new_index = sk->sk_rx_queue_mapping;
+
+		if (new_index < 0 || new_index >= dev->real_num_tx_queues)
+			new_index = get_xps_queue(dev, skb);
 
 		if (new_index < 0)
 			new_index = skb_tx_hash(dev, skb);
diff --git a/net/core/sock.c b/net/core/sock.c
index 9b7b6bb..3876cce 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1059,6 +1059,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 			sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
 		break;
 
+	case SO_SYMMETRIC_QUEUES:
+		sk->sk_symmetric_queues = valbool;
+		break;
+
 	default:
 		ret = -ENOPROTOOPT;
 		break;
@@ -1391,6 +1395,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
 		break;
 
+	case SO_SYMMETRIC_QUEUES:
+		v.val = sk->sk_symmetric_queues;
+		break;
+
 	default:
 		/* We implement the SO_SNDLOWAT etc to not be settable
 		 * (1003.1g 7).
@@ -2738,6 +2746,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_max_pacing_rate = ~0U;
 	sk->sk_pacing_rate = ~0U;
 	sk->sk_incoming_cpu = -1;
+	sk->sk_rx_ifindex = -1;
+	sk->sk_rx_queue_mapping = -1;
 	/*
 	 * Before updating sk_refcnt, we must commit prior changes to memory
 	 * (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c5d7656..12381e0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6356,6 +6356,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	tcp_rsk(req)->snt_isn = isn;
 	tcp_rsk(req)->txhash = net_tx_rndhash();
 	tcp_openreq_init_rwin(req, sk, dst);
+	sk_mark_rx_queue(req_to_sk(req), skb);
 	if (!want_cookie) {
 		tcp_reqsk_record_syn(sk, req, skb);
 		fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a63486a..82f9af4 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1450,6 +1450,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 
 		sock_rps_save_rxhash(sk, skb);
 		sk_mark_napi_id(sk, skb);
+		sk_mark_rx_queue(sk, skb);
 		if (dst) {
 			if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
 			    !dst->ops->check(dst, 0)) {
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 188a6f3..2b5efd5 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -809,6 +809,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
 
 	/* record NAPI ID of child */
 	sk_mark_napi_id(child, skb);
+	sk_mark_rx_queue(child, skb);
 
 	tcp_segs_in(tcp_sk(child), skb);
 	if (!sock_owned_by_user(child)) {
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH 13/31] timer: Remove meaningless .data/.function assignments
From: Kees Cook @ 2017-08-31 23:29 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Kees Cook, Krzysztof Halasa, Aditya Shankar, Ganesh Krishna,
	Greg Kroah-Hartman, Jens Axboe, netdev, linux-wireless, devel,
	linux-kernel
In-Reply-To: <1504222183-61202-1-git-send-email-keescook@chromium.org>

Several timer users needlessly reset their .function/.data fields during
their timer callback, but nothing else changes them. Some users do not
use their .data field at all. Each instance is removed here.

Cc: Krzysztof Halasa <khc@pm.waw.pl>
Cc: Aditya Shankar <aditya.shankar@microchip.com>
Cc: Ganesh Krishna <ganesh.krishna@microchip.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Jens Axboe <axboe@fb.com>
Cc: netdev@vger.kernel.org
Cc: linux-wireless@vger.kernel.org
Cc: devel@driverdev.osuosl.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 drivers/block/amiflop.c                           | 3 +--
 drivers/net/wan/hdlc_cisco.c                      | 2 --
 drivers/net/wan/hdlc_fr.c                         | 2 --
 drivers/staging/wilc1000/wilc_wfi_cfgoperations.c | 4 +---
 4 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index c4b1cba27178..6680d75bc857 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -323,7 +323,7 @@ static void fd_deselect (int drive)
 
 }
 
-static void motor_on_callback(unsigned long nr)
+static void motor_on_callback(unsigned long ignored)
 {
 	if (!(ciaa.pra & DSKRDY) || --on_attempts == 0) {
 		complete_all(&motor_on_completion);
@@ -344,7 +344,6 @@ static int fd_motor_on(int nr)
 		fd_select(nr);
 
 		reinit_completion(&motor_on_completion);
-		motor_on_timer.data = nr;
 		mod_timer(&motor_on_timer, jiffies + HZ/2);
 
 		on_attempts = 10;
diff --git a/drivers/net/wan/hdlc_cisco.c b/drivers/net/wan/hdlc_cisco.c
index c696d42f4502..6c98d85f2773 100644
--- a/drivers/net/wan/hdlc_cisco.c
+++ b/drivers/net/wan/hdlc_cisco.c
@@ -276,8 +276,6 @@ static void cisco_timer(unsigned long arg)
 	spin_unlock(&st->lock);
 
 	st->timer.expires = jiffies + st->settings.interval * HZ;
-	st->timer.function = cisco_timer;
-	st->timer.data = arg;
 	add_timer(&st->timer);
 }
 
diff --git a/drivers/net/wan/hdlc_fr.c b/drivers/net/wan/hdlc_fr.c
index de42faca076a..7da2424c28a4 100644
--- a/drivers/net/wan/hdlc_fr.c
+++ b/drivers/net/wan/hdlc_fr.c
@@ -644,8 +644,6 @@ static void fr_timer(unsigned long arg)
 			state(hdlc)->settings.t391 * HZ;
 	}
 
-	state(hdlc)->timer.function = fr_timer;
-	state(hdlc)->timer.data = arg;
 	add_timer(&state(hdlc)->timer);
 }
 
diff --git a/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c b/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c
index 68fd5b3b8b2d..2fca2b017093 100644
--- a/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c
+++ b/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c
@@ -275,7 +275,7 @@ static void update_scan_time(void)
 		last_scanned_shadow[i].time_scan = jiffies;
 }
 
-static void remove_network_from_shadow(unsigned long arg)
+static void remove_network_from_shadow(unsigned long unused)
 {
 	unsigned long now = jiffies;
 	int i, j;
@@ -296,7 +296,6 @@ static void remove_network_from_shadow(unsigned long arg)
 	}
 
 	if (last_scanned_cnt != 0) {
-		hAgingTimer.data = arg;
 		mod_timer(&hAgingTimer, jiffies + msecs_to_jiffies(AGING_TIME));
 	}
 }
@@ -313,7 +312,6 @@ static int is_network_in_shadow(struct network_info *pstrNetworkInfo,
 	int i;
 
 	if (last_scanned_cnt == 0) {
-		hAgingTimer.data = (unsigned long)user_void;
 		mod_timer(&hAgingTimer, jiffies + msecs_to_jiffies(AGING_TIME));
 		state = -1;
 	} else {
-- 
2.7.4

^ permalink raw reply related

* [PATCH 19/31] timer: Remove open-coded casts for .data and .function
From: Kees Cook @ 2017-08-31 23:29 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Samuel Ortiz, James E.J. Bottomley, Kees Cook, Martin K. Petersen,
	linux-kernel, linux-scsi, netdev, Paul Mackerras, Tyrel Datwyler,
	linuxppc-dev
In-Reply-To: <1504222183-61202-1-git-send-email-keescook@chromium.org>

This standardizes the callback and data prototypes in several places that
perform casting, in an effort to remove more open-coded .data and
.function uses in favor of setup_timer().

Cc: Samuel Ortiz <samuel@sortiz.org>
Cc: Tyrel Datwyler <tyreld@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "James E.J. Bottomley" <jejb@linux.vnet.ibm.com>
Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
Cc: netdev@vger.kernel.org
Cc: linux-scsi@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 drivers/net/irda/bfin_sir.c      |  5 +++--
 drivers/scsi/ibmvscsi/ibmvfc.c   | 14 ++++++--------
 drivers/scsi/ibmvscsi/ibmvscsi.c |  8 ++++----
 3 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/drivers/net/irda/bfin_sir.c b/drivers/net/irda/bfin_sir.c
index 3151b580dbd6..c9413bd580a7 100644
--- a/drivers/net/irda/bfin_sir.c
+++ b/drivers/net/irda/bfin_sir.c
@@ -317,8 +317,9 @@ static void bfin_sir_dma_rx_chars(struct net_device *dev)
 		async_unwrap_char(dev, &self->stats, &self->rx_buff, port->rx_dma_buf.buf[i]);
 }
 
-void bfin_sir_rx_dma_timeout(struct net_device *dev)
+void bfin_sir_rx_dma_timeout(unsigned long data)
 {
+	struct net_device *dev = (struct net_device *)data;
 	struct bfin_sir_self *self = netdev_priv(dev);
 	struct bfin_sir_port *port = self->sir_port;
 	int x_pos, pos;
@@ -406,7 +407,7 @@ static int bfin_sir_startup(struct bfin_sir_port *port, struct net_device *dev)
 	enable_dma(port->rx_dma_channel);
 
 	port->rx_dma_timer.data = (unsigned long)(dev);
-	port->rx_dma_timer.function = (void *)bfin_sir_rx_dma_timeout;
+	port->rx_dma_timer.function = bfin_sir_rx_dma_timeout;
 
 #else
 
diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index cc4e05be8d4a..1be20688dd1f 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -1393,8 +1393,9 @@ static int ibmvfc_map_sg_data(struct scsi_cmnd *scmd,
  *
  * Called when an internally generated command times out
  **/
-static void ibmvfc_timeout(struct ibmvfc_event *evt)
+static void ibmvfc_timeout(unsigned long data)
 {
+	struct ibmvfc_event *evt = (struct ibmvfc_event *)data;
 	struct ibmvfc_host *vhost = evt->vhost;
 	dev_err(vhost->dev, "Command timed out (%p). Resetting connection\n", evt);
 	ibmvfc_reset_host(vhost);
@@ -1424,12 +1425,10 @@ static int ibmvfc_send_event(struct ibmvfc_event *evt,
 		BUG();
 
 	list_add_tail(&evt->queue, &vhost->sent);
-	init_timer(&evt->timer);
+	setup_timer(&evt->timer, ibmvfc_timeout, (unsigned long)evt);
 
 	if (timeout) {
-		evt->timer.data = (unsigned long) evt;
 		evt->timer.expires = jiffies + (timeout * HZ);
-		evt->timer.function = (void (*)(unsigned long))ibmvfc_timeout;
 		add_timer(&evt->timer);
 	}
 
@@ -3696,8 +3695,9 @@ static void ibmvfc_tgt_adisc_cancel_done(struct ibmvfc_event *evt)
  * out, reset the CRQ. When the ADISC comes back as cancelled,
  * log back into the target.
  **/
-static void ibmvfc_adisc_timeout(struct ibmvfc_target *tgt)
+static void ibmvfc_adisc_timeout(unsigned long data)
 {
+	struct ibmvfc_target *tgt = (struct ibmvfc_target *)data;
 	struct ibmvfc_host *vhost = tgt->vhost;
 	struct ibmvfc_event *evt;
 	struct ibmvfc_tmf *tmf;
@@ -3782,9 +3782,7 @@ static void ibmvfc_tgt_adisc(struct ibmvfc_target *tgt)
 	if (timer_pending(&tgt->timer))
 		mod_timer(&tgt->timer, jiffies + (IBMVFC_ADISC_TIMEOUT * HZ));
 	else {
-		tgt->timer.data = (unsigned long) tgt;
 		tgt->timer.expires = jiffies + (IBMVFC_ADISC_TIMEOUT * HZ);
-		tgt->timer.function = (void (*)(unsigned long))ibmvfc_adisc_timeout;
 		add_timer(&tgt->timer);
 	}
 
@@ -3916,7 +3914,7 @@ static int ibmvfc_alloc_target(struct ibmvfc_host *vhost, u64 scsi_id)
 	tgt->vhost = vhost;
 	tgt->need_login = 1;
 	tgt->cancel_key = vhost->task_set++;
-	init_timer(&tgt->timer);
+	setup_timer(&tgt->timer, ibmvfc_adisc_timeout, (unsigned long)tgt);
 	kref_init(&tgt->kref);
 	ibmvfc_init_tgt(tgt, ibmvfc_tgt_implicit_logout);
 	spin_lock_irqsave(vhost->host->host_lock, flags);
diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c
index da22b3665cb0..44ae85903a00 100644
--- a/drivers/scsi/ibmvscsi/ibmvscsi.c
+++ b/drivers/scsi/ibmvscsi/ibmvscsi.c
@@ -837,8 +837,9 @@ static void ibmvscsi_reset_host(struct ibmvscsi_host_data *hostdata)
  *
  * Called when an internally generated command times out
 */
-static void ibmvscsi_timeout(struct srp_event_struct *evt_struct)
+static void ibmvscsi_timeout(unsigned long data)
 {
+	struct srp_event_struct *evt_struct = (struct srp_event_struct *)data;
 	struct ibmvscsi_host_data *hostdata = evt_struct->hostdata;
 
 	dev_err(hostdata->dev, "Command timed out (%x). Resetting connection\n",
@@ -927,11 +928,10 @@ static int ibmvscsi_send_srp_event(struct srp_event_struct *evt_struct,
 	 */
 	list_add_tail(&evt_struct->list, &hostdata->sent);
 
-	init_timer(&evt_struct->timer);
+	setup_timer(&evt_struct->timer, ibmvscsi_timeout,
+		    (unsigned long)evt_struct);
 	if (timeout) {
-		evt_struct->timer.data = (unsigned long) evt_struct;
 		evt_struct->timer.expires = jiffies + (timeout * HZ);
-		evt_struct->timer.function = (void (*)(unsigned long))ibmvscsi_timeout;
 		add_timer(&evt_struct->timer);
 	}
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH 20/31] net/core: Collapse redundant sk_timer callback data assignments
From: Kees Cook @ 2017-08-31 23:29 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Kees Cook, David S. Miller, Ralf Baechle, Andrew Hendry,
	Eric Dumazet, Paolo Abeni, David Howells, Colin Ian King,
	Ingo Molnar, linzhang, netdev, linux-hams, linux-x25,
	linux-kernel
In-Reply-To: <1504222183-61202-1-git-send-email-keescook@chromium.org>

The core sk_timer initializer can provide the common .data assignment
instead of it being set separately in users.

Cc: "David S. Miller" <davem@davemloft.net>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Andrew Hendry <andrew.hendry@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Colin Ian King <colin.king@canonical.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: linzhang <xiaolou4617@gmail.com>
Cc: netdev@vger.kernel.org
Cc: linux-hams@vger.kernel.org
Cc: linux-x25@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 net/core/sock.c       | 2 +-
 net/netrom/nr_timer.c | 1 -
 net/rose/rose_timer.c | 1 -
 net/x25/af_x25.c      | 1 -
 net/x25/x25_timer.c   | 1 -
 5 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index ac2a404c73eb..5281a998ab32 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2623,7 +2623,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk_init_common(sk);
 	sk->sk_send_head	=	NULL;
 
-	init_timer(&sk->sk_timer);
+	setup_timer(&sk->sk_timer, NULL, (unsigned long)sk);
 
 	sk->sk_allocation	=	GFP_KERNEL;
 	sk->sk_rcvbuf		=	sysctl_rmem_default;
diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c
index 94d05806a9a2..f84ce71f1f5f 100644
--- a/net/netrom/nr_timer.c
+++ b/net/netrom/nr_timer.c
@@ -45,7 +45,6 @@ void nr_init_timers(struct sock *sk)
 	setup_timer(&nr->idletimer, nr_idletimer_expiry, (unsigned long)sk);
 
 	/* initialized by sock_init_data */
-	sk->sk_timer.data     = (unsigned long)sk;
 	sk->sk_timer.function = &nr_heartbeat_expiry;
 }
 
diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c
index bc5469d6d9cb..6baa415b199a 100644
--- a/net/rose/rose_timer.c
+++ b/net/rose/rose_timer.c
@@ -36,7 +36,6 @@ void rose_start_heartbeat(struct sock *sk)
 {
 	del_timer(&sk->sk_timer);
 
-	sk->sk_timer.data     = (unsigned long)sk;
 	sk->sk_timer.function = &rose_heartbeat_expiry;
 	sk->sk_timer.expires  = jiffies + 5 * HZ;
 
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 5a1a98df3499..a5ac385b9120 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -414,7 +414,6 @@ static void __x25_destroy_socket(struct sock *sk)
 		/* Defer: outstanding buffers */
 		sk->sk_timer.expires  = jiffies + 10 * HZ;
 		sk->sk_timer.function = x25_destroy_timer;
-		sk->sk_timer.data = (unsigned long)sk;
 		add_timer(&sk->sk_timer);
 	} else {
 		/* drop last reference so sock_put will free */
diff --git a/net/x25/x25_timer.c b/net/x25/x25_timer.c
index 5c5db1a36399..de5cec41d100 100644
--- a/net/x25/x25_timer.c
+++ b/net/x25/x25_timer.c
@@ -36,7 +36,6 @@ void x25_init_timers(struct sock *sk)
 	setup_timer(&x25->timer, x25_timer_expiry, (unsigned long)sk);
 
 	/* initialized by sock_init_data */
-	sk->sk_timer.data     = (unsigned long)sk;
 	sk->sk_timer.function = &x25_heartbeat_expiry;
 }
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH 30/31] appletalk: Remove unneeded synchronization
From: Kees Cook @ 2017-08-31 23:29 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Kees Cook, David Howells, netdev, linux-kernel
In-Reply-To: <1504222183-61202-1-git-send-email-keescook@chromium.org>

The use of del_timer_sync() will make sure a timer is not rescheduled.
As such, there is no need to add external signals to kill timers. In
preparation for switching the timer callback argument to the timer
pointer, this drops the .data argument since it doesn't serve a meaningful
purpose here.

Cc: David Howells <dhowells@redhat.com>
Cc: netdev@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 drivers/net/appletalk/ltpc.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/drivers/net/appletalk/ltpc.c b/drivers/net/appletalk/ltpc.c
index e4aa374caa4d..cc3dc9337eae 100644
--- a/drivers/net/appletalk/ltpc.c
+++ b/drivers/net/appletalk/ltpc.c
@@ -880,14 +880,10 @@ static void ltpc_poll(unsigned long l)
 		}
 		ltpc_poll_counter--;
 	}
-  
-	if (!dev)
-		return;  /* we've been downed */
 
 	/* poll 20 times per second */
 	idle(dev);
 	ltpc_timer.expires = jiffies + HZ/20;
-	
 	add_timer(&ltpc_timer);
 }
 
@@ -1252,8 +1248,6 @@ static void __exit ltpc_cleanup(void)
 	if(debug & DEBUG_VERBOSE) printk("unregister_netdev\n");
 	unregister_netdev(dev_ltpc);
 
-	ltpc_timer.data = 0;  /* signal the poll routine that we're done */
-
 	del_timer_sync(&ltpc_timer);
 
 	if(debug & DEBUG_VERBOSE) printk("freeing irq\n");
-- 
2.7.4

^ permalink raw reply related

* [PATCH 31/31] timer: Switch to testing for .function instead of .data
From: Kees Cook @ 2017-08-31 23:29 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Kees Cook, Rafael J. Wysocki, Pavel Machek, Len Brown,
	Greg Kroah-Hartman, Mike Marciniszyn, Dennis Dalessandro,
	Doug Ledford, Sean Hefty, Hal Rosenstock, Dmitry Torokhov,
	Jeff Kirsher, linux-pm, linux-rdma, linux-input, intel-wired-lan,
	netdev, linux-kernel
In-Reply-To: <1504222183-61202-1-git-send-email-keescook@chromium.org>

In several places, .data is checked for initialization to gate early
calls to del_timer_sync(). Checking for .function is equally valid, so
switch to this in all callers.

Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Len Brown <len.brown@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Mike Marciniszyn <mike.marciniszyn@intel.com>
Cc: Dennis Dalessandro <dennis.dalessandro@intel.com>
Cc: Doug Ledford <dledford@redhat.com>
Cc: Sean Hefty <sean.hefty@intel.com>
Cc: Hal Rosenstock <hal.rosenstock@gmail.com>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: linux-pm@vger.kernel.org
Cc: linux-rdma@vger.kernel.org
Cc: linux-input@vger.kernel.org
Cc: intel-wired-lan@lists.osuosl.org
Cc: netdev@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 drivers/base/power/wakeup.c                 |  3 +--
 drivers/infiniband/hw/hfi1/chip.c           |  6 ++----
 drivers/infiniband/hw/hfi1/init.c           |  2 +-
 drivers/infiniband/hw/qib/qib_iba7220.c     |  2 +-
 drivers/infiniband/hw/qib/qib_iba7322.c     |  2 +-
 drivers/infiniband/hw/qib/qib_init.c        | 14 +++++---------
 drivers/infiniband/hw/qib/qib_mad.c         |  2 +-
 drivers/input/input.c                       |  5 ++---
 drivers/net/ethernet/intel/i40e/i40e_main.c |  2 +-
 9 files changed, 15 insertions(+), 23 deletions(-)

diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index 144e6d8fafc8..79a3c1b204af 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -479,8 +479,7 @@ static bool wakeup_source_not_registered(struct wakeup_source *ws)
 	 * Use timer struct to check if the given source is initialized
 	 * by wakeup_source_add.
 	 */
-	return ws->timer.function != pm_wakeup_timer_fn ||
-		   ws->timer.data != (unsigned long)ws;
+	return ws->timer.function != pm_wakeup_timer_fn;
 }
 
 /*
diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index 94b54850ec75..53a6596cd7d6 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -5513,9 +5513,8 @@ static int init_rcverr(struct hfi1_devdata *dd)
 
 static void free_rcverr(struct hfi1_devdata *dd)
 {
-	if (dd->rcverr_timer.data)
+	if (dd->rcverr_timer.function)
 		del_timer_sync(&dd->rcverr_timer);
-	dd->rcverr_timer.data = 0;
 }
 
 static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
@@ -11992,9 +11991,8 @@ static void free_cntrs(struct hfi1_devdata *dd)
 	struct hfi1_pportdata *ppd;
 	int i;
 
-	if (dd->synth_stats_timer.data)
+	if (dd->synth_stats_timer.function)
 		del_timer_sync(&dd->synth_stats_timer);
-	dd->synth_stats_timer.data = 0;
 	ppd = (struct hfi1_pportdata *)(dd + 1);
 	for (i = 0; i < dd->num_pports; i++, ppd++) {
 		kfree(ppd->cntrs);
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index 4a11d4da4c92..bc2af709c111 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -839,7 +839,7 @@ static void stop_timers(struct hfi1_devdata *dd)
 
 	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
 		ppd = dd->pport + pidx;
-		if (ppd->led_override_timer.data) {
+		if (ppd->led_override_timer.function) {
 			del_timer_sync(&ppd->led_override_timer);
 			atomic_set(&ppd->led_override_timer_active, 0);
 		}
diff --git a/drivers/infiniband/hw/qib/qib_iba7220.c b/drivers/infiniband/hw/qib/qib_iba7220.c
index b1d512c7ff4b..22fd65fe7193 100644
--- a/drivers/infiniband/hw/qib/qib_iba7220.c
+++ b/drivers/infiniband/hw/qib/qib_iba7220.c
@@ -1662,7 +1662,7 @@ static void qib_7220_quiet_serdes(struct qib_pportdata *ppd)
 		       dd->control | QLOGIC_IB_C_FREEZEMODE);
 
 	ppd->cpspec->chase_end = 0;
-	if (ppd->cpspec->chase_timer.data) /* if initted */
+	if (ppd->cpspec->chase_timer.function) /* if initted */
 		del_timer_sync(&ppd->cpspec->chase_timer);
 
 	if (ppd->cpspec->ibsymdelta || ppd->cpspec->iblnkerrdelta ||
diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c
index bb2439fff8fa..471aaf6bcbf2 100644
--- a/drivers/infiniband/hw/qib/qib_iba7322.c
+++ b/drivers/infiniband/hw/qib/qib_iba7322.c
@@ -2531,7 +2531,7 @@ static void qib_7322_mini_quiet_serdes(struct qib_pportdata *ppd)
 		cancel_delayed_work_sync(&ppd->cpspec->ipg_work);
 
 	ppd->cpspec->chase_end = 0;
-	if (ppd->cpspec->chase_timer.data) /* if initted */
+	if (ppd->cpspec->chase_timer.function) /* if initted */
 		del_timer_sync(&ppd->cpspec->chase_timer);
 
 	/*
diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c
index 6c16ba1107ba..66fb0318660b 100644
--- a/drivers/infiniband/hw/qib/qib_init.c
+++ b/drivers/infiniband/hw/qib/qib_init.c
@@ -815,23 +815,19 @@ static void qib_stop_timers(struct qib_devdata *dd)
 	struct qib_pportdata *ppd;
 	int pidx;
 
-	if (dd->stats_timer.data) {
+	if (dd->stats_timer.function)
 		del_timer_sync(&dd->stats_timer);
-		dd->stats_timer.data = 0;
-	}
-	if (dd->intrchk_timer.data) {
+	if (dd->intrchk_timer.function)
 		del_timer_sync(&dd->intrchk_timer);
-		dd->intrchk_timer.data = 0;
-	}
 	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
 		ppd = dd->pport + pidx;
-		if (ppd->hol_timer.data)
+		if (ppd->hol_timer.function)
 			del_timer_sync(&ppd->hol_timer);
-		if (ppd->led_override_timer.data) {
+		if (ppd->led_override_timer.function) {
 			del_timer_sync(&ppd->led_override_timer);
 			atomic_set(&ppd->led_override_timer_active, 0);
 		}
-		if (ppd->symerr_clear_timer.data)
+		if (ppd->symerr_clear_timer.function)
 			del_timer_sync(&ppd->symerr_clear_timer);
 	}
 }
diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c
index f5a2aed31a89..37483f32935c 100644
--- a/drivers/infiniband/hw/qib/qib_mad.c
+++ b/drivers/infiniband/hw/qib/qib_mad.c
@@ -2496,7 +2496,7 @@ void qib_notify_free_mad_agent(struct rvt_dev_info *rdi, int port_idx)
 	struct qib_devdata *dd = container_of(ibdev,
 					      struct qib_devdata, verbs_dev);
 
-	if (dd->pport[port_idx].cong_stats.timer.data)
+	if (dd->pport[port_idx].cong_stats.timer.function)
 		del_timer_sync(&dd->pport[port_idx].cong_stats.timer);
 
 	if (dd->pport[port_idx].ibport_data.smi_ah)
diff --git a/drivers/input/input.c b/drivers/input/input.c
index 7e6842bd525c..a91fbbfc1b32 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -76,7 +76,7 @@ static void input_start_autorepeat(struct input_dev *dev, int code)
 {
 	if (test_bit(EV_REP, dev->evbit) &&
 	    dev->rep[REP_PERIOD] && dev->rep[REP_DELAY] &&
-	    dev->timer.data) {
+	    dev->timer.function) {
 		dev->repeat_key = code;
 		mod_timer(&dev->timer,
 			  jiffies + msecs_to_jiffies(dev->rep[REP_DELAY]));
@@ -1790,7 +1790,7 @@ struct input_dev *input_allocate_device(void)
 		device_initialize(&dev->dev);
 		mutex_init(&dev->mutex);
 		spin_lock_init(&dev->event_lock);
-		init_timer(&dev->timer);
+		setup_timer(&dev->timer, NULL, (unsigned long)dev);
 		INIT_LIST_HEAD(&dev->h_list);
 		INIT_LIST_HEAD(&dev->node);
 
@@ -2053,7 +2053,6 @@ static void devm_input_device_unregister(struct device *dev, void *res)
  */
 void input_enable_softrepeat(struct input_dev *dev, int delay, int period)
 {
-	dev->timer.data = (unsigned long) dev;
 	dev->timer.function = input_repeat_key;
 	dev->rep[REP_DELAY] = delay;
 	dev->rep[REP_PERIOD] = period;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 2db93d3f6d23..b9a4c1a6e4ba 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -11797,7 +11797,7 @@ static void i40e_remove(struct pci_dev *pdev)
 	/* no more scheduling of any task */
 	set_bit(__I40E_SUSPENDED, pf->state);
 	set_bit(__I40E_DOWN, pf->state);
-	if (pf->service_timer.data)
+	if (pf->service_timer.function)
 		del_timer_sync(&pf->service_timer);
 	if (pf->service_task.func)
 		cancel_work_sync(&pf->service_task);
-- 
2.7.4

^ permalink raw reply related

* Re: [RFC net-next 1/8] net: dsa: Allow switch drivers to indicate number of RX/TX queues
From: Andrew Lunn @ 2017-08-31 23:44 UTC (permalink / raw)
  To: Florian Fainelli; +Cc: netdev, jiri, jhs, davem, xiyou.wangcong, vivien.didelot
In-Reply-To: <1504138732-65383-2-git-send-email-f.fainelli@gmail.com>

On Wed, Aug 30, 2017 at 05:18:45PM -0700, Florian Fainelli wrote:
> Let switch drivers indicate how many RX and TX queues they support. Some
> switches, such as Broadcom Starfighter 2 are resigned with 8 egress
> queues.

Marvell switches also have egress queue.

Does the SF2 have ingress queues? Marvel don't as far as i known.  So
i wounder if num_rx_queues is useful?

Do switches in general have ingress queues? 

   Andrew

^ permalink raw reply

* Re: [PATCH net] ipv4: Don't override return code from ip_route_input_noref()
From: Sabrina Dubroca @ 2017-08-31 23:45 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: David S . Miller, netdev, Wei Wang
In-Reply-To: <c4d8970306e614ac139cf2f33ab5e6bb73b30154.1504194307.git.sbrivio@redhat.com>

2017-08-31, 18:11:41 +0200, Stefano Brivio wrote:
> After ip_route_input() calls ip_route_input_noref(), another
> check on skb_dst() is done, but if this fails, we shouldn't
> override the return code from ip_route_input_noref(), as it
> could have been more specific (i.e. -EHOSTUNREACH).
> 
> This also saves one call to skb_dst_force_safe() and one to
> skb_dst() in case the ip_route_input_noref() check fails.
> 
> Reported-by: Sabrina Dubroca <sdubroca@redhat.com>
> Fixes: ad65a2f05695 ("ipv4: call dst_hold_safe() properly")

That should be instead:

Fixes: 9df16efadd2a ("ipv4: call dst_hold_safe() properly")

Acked-by: Sabrina Dubroca <sd@queasysnail.net>

-- 
Sabrina

^ permalink raw reply

* Re: [PATCH 31/31] timer: Switch to testing for .function instead of .data
From: Dmitry Torokhov @ 2017-08-31 23:45 UTC (permalink / raw)
  To: Kees Cook
  Cc: Thomas Gleixner, Rafael J. Wysocki, Pavel Machek, Len Brown,
	Greg Kroah-Hartman, Mike Marciniszyn, Dennis Dalessandro,
	Doug Ledford, Sean Hefty, Hal Rosenstock, Jeff Kirsher,
	linux-pm@vger.kernel.org, linux-rdma, linux-input@vger.kernel.org,
	intel-wired-lan, netdev, lkml
In-Reply-To: <1504222183-61202-32-git-send-email-keescook@chromium.org>

On Thu, Aug 31, 2017 at 4:29 PM, Kees Cook <keescook@chromium.org> wrote:
> In several places, .data is checked for initialization to gate early
> calls to del_timer_sync(). Checking for .function is equally valid, so
> switch to this in all callers.

Not seeing the rest of patches it is unclear from the patch
description why this is needed/wanted.

Thanks.

-- 
Dmitry

^ permalink raw reply

* [PATCH v2 net-next 0/2] net: ubuf_info.refcnt conversion
From: Eric Dumazet @ 2017-08-31 23:48 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Willem de Bruijn, Eric Dumazet, Eric Dumazet

Yet another atomic_t -> refcount_t conversion, split in two patches.

First patch prepares the automatic conversion done in the second patch.

Eric Dumazet (2):
  net: prepare (struct ubuf_info)->refcnt conversion
  net: convert (struct ubuf_info)->refcnt to refcount_t

 drivers/vhost/net.c    |  2 +-
 include/linux/skbuff.h |  5 +++--
 net/core/skbuff.c      | 14 ++++----------
 net/ipv4/tcp.c         |  2 --
 4 files changed, 8 insertions(+), 15 deletions(-)

-- 
2.14.1.581.gf28d330327-goog

^ permalink raw reply

* [PATCH v2 net-next 1/2] net: prepare (struct ubuf_info)->refcnt conversion
From: Eric Dumazet @ 2017-08-31 23:48 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Willem de Bruijn, Eric Dumazet, Eric Dumazet
In-Reply-To: <20170831234822.26612-1-edumazet@google.com>

In order to convert this atomic_t refcnt to refcount_t,
we need to init the refcount to one to not trigger
a 0 -> 1 transition.

This also removes one atomic operation in fast path.

v2: removed dead code in sock_zerocopy_put_abort()
as suggested by Willem.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/core/skbuff.c | 10 ++--------
 net/ipv4/tcp.c    |  2 --
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 917da73d3ab3b82163cf0a9ee944da09cb5a391f..1a754d7896ceac1eb85e3b13c1422b4d6a88fedf 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -963,7 +963,7 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
 	uarg->len = 1;
 	uarg->bytelen = size;
 	uarg->zerocopy = 1;
-	atomic_set(&uarg->refcnt, 0);
+	atomic_set(&uarg->refcnt, 1);
 	sock_hold(sk);
 
 	return uarg;
@@ -1005,6 +1005,7 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
 			uarg->len++;
 			uarg->bytelen = bytelen;
 			atomic_set(&sk->sk_zckey, ++next);
+			sock_zerocopy_get(uarg);
 			return uarg;
 		}
 	}
@@ -1102,13 +1103,6 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg)
 		atomic_dec(&sk->sk_zckey);
 		uarg->len--;
 
-		/* sock_zerocopy_put expects a ref. Most sockets take one per
-		 * skb, which is zero on abort. tcp_sendmsg holds one extra, to
-		 * avoid an skb send inside the main loop triggering uarg free.
-		 */
-		if (sk->sk_type != SOCK_STREAM)
-			atomic_inc(&uarg->refcnt);
-
 		sock_zerocopy_put(uarg);
 	}
 }
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 21ca2df274c5130a13d31a391a1408d779af34af..fff4d7bc3b8b46174e7bd0a04d7c212307e55cb5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1190,8 +1190,6 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 			goto out_err;
 		}
 
-		/* skb may be freed in main loop, keep extra ref on uarg */
-		sock_zerocopy_get(uarg);
 		if (!(sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG))
 			uarg->zerocopy = 0;
 	}
-- 
2.14.1.581.gf28d330327-goog

^ permalink raw reply related

* [PATCH v2 net-next 2/2] net: convert (struct ubuf_info)->refcnt to refcount_t
From: Eric Dumazet @ 2017-08-31 23:48 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Willem de Bruijn, Eric Dumazet, Eric Dumazet
In-Reply-To: <20170831234822.26612-1-edumazet@google.com>

refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

v2: added the change in drivers/vhost/net.c as spotted
by Willem.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 drivers/vhost/net.c    | 2 +-
 include/linux/skbuff.h | 5 +++--
 net/core/skbuff.c      | 6 +++---
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index ba08b78ed630c429ccf415af69bf27f2433af4f0..8d2bcae53a2ec9ea1c876625e581bcd429abe365 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -533,7 +533,7 @@ static void handle_tx(struct vhost_net *net)
 			ubuf->callback = vhost_zerocopy_callback;
 			ubuf->ctx = nvq->ubufs;
 			ubuf->desc = nvq->upend_idx;
-			atomic_set(&ubuf->refcnt, 1);
+			refcount_set(&ubuf->refcnt, 1);
 			msg.msg_control = ubuf;
 			msg.msg_controllen = sizeof(ubuf);
 			ubufs = nvq->ubufs;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7594e19bce622a38dc39c054093c3da15b99b67b..316a92b45351f53709886ee0099cbc83b66f1b15 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -22,6 +22,7 @@
 #include <linux/cache.h>
 #include <linux/rbtree.h>
 #include <linux/socket.h>
+#include <linux/refcount.h>
 
 #include <linux/atomic.h>
 #include <asm/types.h>
@@ -456,7 +457,7 @@ struct ubuf_info {
 			u32 bytelen;
 		};
 	};
-	atomic_t refcnt;
+	refcount_t refcnt;
 
 	struct mmpin {
 		struct user_struct *user;
@@ -472,7 +473,7 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
 
 static inline void sock_zerocopy_get(struct ubuf_info *uarg)
 {
-	atomic_inc(&uarg->refcnt);
+	refcount_inc(&uarg->refcnt);
 }
 
 void sock_zerocopy_put(struct ubuf_info *uarg);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 1a754d7896ceac1eb85e3b13c1422b4d6a88fedf..06b9ddca3188442d1d1d2052fc060cf5b1e2a6f4 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -963,7 +963,7 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
 	uarg->len = 1;
 	uarg->bytelen = size;
 	uarg->zerocopy = 1;
-	atomic_set(&uarg->refcnt, 1);
+	refcount_set(&uarg->refcnt, 1);
 	sock_hold(sk);
 
 	return uarg;
@@ -1086,7 +1086,7 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_callback);
 
 void sock_zerocopy_put(struct ubuf_info *uarg)
 {
-	if (uarg && atomic_dec_and_test(&uarg->refcnt)) {
+	if (uarg && refcount_dec_and_test(&uarg->refcnt)) {
 		if (uarg->callback)
 			uarg->callback(uarg, uarg->zerocopy);
 		else
@@ -1483,7 +1483,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 		if (skb_orphan_frags(skb, gfp_mask))
 			goto nofrags;
 		if (skb_zcopy(skb))
-			atomic_inc(&skb_uarg(skb)->refcnt);
+			refcount_inc(&skb_uarg(skb)->refcnt);
 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
 			skb_frag_ref(skb, i);
 
-- 
2.14.1.581.gf28d330327-goog

^ permalink raw reply related

* [PATCH 25/31] net/atm/mpc: Use separate static data field with with static timer
From: Kees Cook @ 2017-08-31 23:29 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Kees Cook, David S. Miller, Andrew Morton, Alexey Dobriyan,
	Reshetova, Elena, netdev, linux-kernel
In-Reply-To: <1504222183-61202-1-git-send-email-keescook@chromium.org>

In preparation for changing the timer callback argument to the timer
pointer, move to a separate static data variable.

Cc: "David S. Miller" <davem@davemloft.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: "Reshetova, Elena" <elena.reshetova@intel.com>
Cc: netdev@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 net/atm/mpc.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index 5baa31b8500c..fc24a46500ae 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -95,7 +95,7 @@ static netdev_tx_t mpc_send_packet(struct sk_buff *skb,
 static int mpoa_event_listener(struct notifier_block *mpoa_notifier,
 			       unsigned long event, void *dev);
 static void mpc_timer_refresh(void);
-static void mpc_cache_check(unsigned long checking_time);
+static void mpc_cache_check(unsigned long unused);
 
 static struct llc_snap_hdr llc_snap_mpoa_ctrl = {
 	0xaa, 0xaa, 0x03,
@@ -121,7 +121,8 @@ static struct notifier_block mpoa_notifier = {
 
 struct mpoa_client *mpcs = NULL; /* FIXME */
 static struct atm_mpoa_qos *qos_head = NULL;
-static DEFINE_TIMER(mpc_timer, NULL);
+static DEFINE_TIMER(mpc_timer, mpc_cache_check);
+static unsigned long checking_time;
 
 
 static struct mpoa_client *find_mpc_by_itfnum(int itf)
@@ -1411,12 +1412,11 @@ static void clean_up(struct k_message *msg, struct mpoa_client *mpc, int action)
 static void mpc_timer_refresh(void)
 {
 	mpc_timer.expires = jiffies + (MPC_P2 * HZ);
-	mpc_timer.data = mpc_timer.expires;
-	mpc_timer.function = mpc_cache_check;
+	checking_time = mpc_timer.expires;
 	add_timer(&mpc_timer);
 }
 
-static void mpc_cache_check(unsigned long checking_time)
+static void mpc_cache_check(unsigned long unused)
 {
 	struct mpoa_client *mpc = mpcs;
 	static unsigned long previous_resolving_check_time;
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH v2 net-next 1/2] net: prepare (struct ubuf_info)->refcnt conversion
From: Willem de Bruijn @ 2017-08-31 23:54 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S . Miller, netdev, Willem de Bruijn, Eric Dumazet
In-Reply-To: <20170831234822.26612-2-edumazet@google.com>

On Thu, Aug 31, 2017 at 7:48 PM, Eric Dumazet <edumazet@google.com> wrote:
> In order to convert this atomic_t refcnt to refcount_t,
> we need to init the refcount to one to not trigger
> a 0 -> 1 transition.
>
> This also removes one atomic operation in fast path.
>
> v2: removed dead code in sock_zerocopy_put_abort()
> as suggested by Willem.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Acked-by: Willem de Bruijn <willemb@google.com>

^ permalink raw reply

* Re: [PATCH v2 net-next 2/2] net: convert (struct ubuf_info)->refcnt to refcount_t
From: Willem de Bruijn @ 2017-08-31 23:55 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S . Miller, netdev, Willem de Bruijn, Eric Dumazet
In-Reply-To: <20170831234822.26612-3-edumazet@google.com>

On Thu, Aug 31, 2017 at 7:48 PM, Eric Dumazet <edumazet@google.com> wrote:
> refcount_t type and corresponding API should be
> used instead of atomic_t when the variable is used as
> a reference counter. This allows to avoid accidental
> refcounter overflows that might lead to use-after-free
> situations.
>
> v2: added the change in drivers/vhost/net.c as spotted
> by Willem.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Acked-by: Willem de Bruijn <willemb@google.com>

^ permalink raw reply

* Re: [PATCH 31/31] timer: Switch to testing for .function instead of .data
From: Kees Cook @ 2017-08-31 23:59 UTC (permalink / raw)
  To: Dmitry Torokhov
  Cc: Thomas Gleixner, Rafael J. Wysocki, Pavel Machek, Len Brown,
	Greg Kroah-Hartman, Mike Marciniszyn, Dennis Dalessandro,
	Doug Ledford, Sean Hefty, Hal Rosenstock, Jeff Kirsher,
	linux-pm@vger.kernel.org, linux-rdma, linux-input@vger.kernel.org,
	intel-wired-lan, netdev, lkml
In-Reply-To: <CAKdAkRTSgs+JTKRa2nvcSPD+zQ5OhsLx_aM8WaZyg304QQN3hA@mail.gmail.com>

On Thu, Aug 31, 2017 at 4:45 PM, Dmitry Torokhov
<dmitry.torokhov@gmail.com> wrote:
> On Thu, Aug 31, 2017 at 4:29 PM, Kees Cook <keescook@chromium.org> wrote:
>> In several places, .data is checked for initialization to gate early
>> calls to del_timer_sync(). Checking for .function is equally valid, so
>> switch to this in all callers.
>
> Not seeing the rest of patches it is unclear from the patch
> description why this is needed/wanted.

The CC list would have been really giant, but here is the first patch
and the earlier series list:

https://lkml.org/lkml/2017/8/31/904
https://lkml.org/lkml/2017/8/30/760

tl;dr: We're going to switch all struct timer_list callbacks to get
the timer pointer as the argument instead of from the .data field.
This patch is one step in removing open-coded users of the .data
field.

-Kees

-- 
Kees Cook
Pixel Security

^ permalink raw reply

* Re: [PATCH v2 net-next 0/2] net: ubuf_info.refcnt conversion
From: Eric Dumazet @ 2017-09-01  0:04 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David S . Miller, netdev, Willem de Bruijn
In-Reply-To: <20170831234822.26612-1-edumazet@google.com>

On Thu, 2017-08-31 at 16:48 -0700, Eric Dumazet wrote:
> Yet another atomic_t -> refcount_t conversion, split in two patches.
> 
> First patch prepares the automatic conversion done in the second patch.
> 
> Eric Dumazet (2):
>   net: prepare (struct ubuf_info)->refcnt conversion
>   net: convert (struct ubuf_info)->refcnt to refcount_t
> 
>  drivers/vhost/net.c    |  2 +-
>  include/linux/skbuff.h |  5 +++--
>  net/core/skbuff.c      | 14 ++++----------
>  net/ipv4/tcp.c         |  2 --
>  4 files changed, 8 insertions(+), 15 deletions(-)
> 

David please ignore this series, I will send a V3 :)

^ permalink raw reply

* Re: [RFC net-next 0/8] net: dsa: Multi-queue awareness
From: Andrew Lunn @ 2017-09-01  0:05 UTC (permalink / raw)
  To: Florian Fainelli; +Cc: netdev, jiri, jhs, davem, xiyou.wangcong, vivien.didelot
In-Reply-To: <1504138732-65383-1-git-send-email-f.fainelli@gmail.com>

On Wed, Aug 30, 2017 at 05:18:44PM -0700, Florian Fainelli wrote:
> This patch series is sent as reference, especially because the last patch
> is trying not to be creating too many layer violations, but clearly there
> are a little bit being created here anyways.
> 
> Essentially what I am trying to achieve is that you have a stacked device which
> is multi-queue aware, that applications will be using, and for which they can
> control the queue selection (using mq) the way they want. Each of each stacked
> network devices are created for each port of the switch (this is what DSA
> does). When a skb is submitted from say net_device X, we can derive its port
> number and look at the queue_mapping value to determine which port of the
> switch and queue we should be sending this to. The information is embedded in a
> tag (4 bytes) and is used by the switch to steer the transmission.
> 
> These stacked devices will actually transmit using a "master" or conduit
> network device which has a number of queues as well. In one version of the
> hardware that I work with, we have up to 4 ports, each with 8 queues, and the
> master device has a total of 32 hardware queues, so a 1:1 mapping is easy. With
> another version of the hardware, same number of ports and queues, but only 16
> hardware queues, so only a 2:1 mapping is possible.
> 
> In order for congestion information to work properly, I need to establish a
> mapping, preferably before transmission starts (but reconfiguration while
> interfaces are running would be possible too) between these stacked device's
> queue and the conduit interface's queue.
> 
> Comments, flames, rotten tomatoes, anything!

Right, i think i understand.

This works just for traffic between the host and ports.  The host can
set the egress queue. And i assume the queues are priorities, either
absolute or weighted round robin, etc.

But this has no effect on traffic going from port to port. At some
point, i expect you will want to offload TC for that.

How will the two interact? Could the TC rules also act on traffic from
the host to a port? Would it be simpler in the long run to just
implement TC rules?

	  Andrew

^ permalink raw reply

* Re: [PATCH net-next v5 2/2] tcp_diag: report TCP MD5 signing keys and addresses
From: Ivan Delalande @ 2017-09-01  0:21 UTC (permalink / raw)
  To: Sabrina Dubroca; +Cc: David Miller, Eric Dumazet, netdev
In-Reply-To: <20170831232633.GA678@bistromath.localdomain>

On Fri, Sep 01, 2017 at 01:26:33AM +0200, Sabrina Dubroca wrote:
> 2017-08-31, 09:59:39 -0700, Ivan Delalande wrote:
> > diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
> > index a748c74aa8b7..abbf0edcf6c2 100644
> > --- a/net/ipv4/tcp_diag.c
> > +++ b/net/ipv4/tcp_diag.c
> [...]
> > +static int tcp_diag_get_aux(struct sock *sk, bool net_admin,
> > +			    struct sk_buff *skb)
> > +{
> > +#ifdef CONFIG_TCP_MD5SIG
> > +	if (net_admin) {
> 
> In tcp_diag_get_aux_size() you put a check for sk_fullsock. I don't
> see anything preventing you from reaching this with a !fullsock?

Currently handler->idiag_get_aux is only called from inet_sk_diag_fill
which has a `BUG_ON(!sk_fullsock(sk));`, but I could add another
explicit check in that function if you think it's more consistent.

Actually, I wasn't sure when adding this idiag_get_aux in v2 if it
should be called from inet_twsk_diag_fill, inet_req_diag_fill and
inet_csk_diag_fill, or just the last one. I chose that simpler approach
for now to avoid duplicating these state checks in the idiag_get_aux
defined by protocols and because we didn't need for INET_DIAG_MD5SIG,
but it shouldn't be too hard to change. Do you think this could be
useful for other protocols or attributes?

Thank you,
-- 
Ivan Delalande
Arista Networks

^ permalink raw reply

* Re: netdev carrier changes is one even after ethernet link up.
From: Florian Fainelli @ 2017-09-01  0:23 UTC (permalink / raw)
  To: Bhadram Varka, andrew@lunn.ch; +Cc: linux-netdev
In-Reply-To: <974898714b3e4e59b933983ded977ce2@bgmail102.nvidia.com>

On 08/30/2017 10:53 PM, Bhadram Varka wrote:
> Hi,
> 
>  
> 
> I have observed that carrier_changes is one even in case of the ethernet
> link is up.
> 
>  
> 
> After investigating the code below is my observation –
> 
>  
> 
> ethernet_driver_probe()
> 
> +--->phy_connect()
> 
> |     +--->phy_attach_direct()
> 
> |           +---> netif_carrier_off()    : which increments
> carrier_changes to one.
> 
> +--->register_netdevice() : will the carrier_changes becomes zero here ?
> 
> +--->netif_carrier_off(): not increment the carrier_changes since
> __LINK_STATE_NOCARRIER already set.
> 
>  
> 
> From ethernet driver open will start the PHY and trigger the
> phy_state_machine.
> 
> Phy_state_machine workqueue calling netif_carrier_on() once the link is UP.
> 
> netif_carrier_on() increments the carrier_changes by one.

If the call trace is correct, then there is at least two problems here:

- phy_connect() does start the PHY machine which means that as soon as
it detects a link state of any kind (up or down) it can call
netif_carrier_off() respectively netif_carrier_on()

- as soon as you call register_netdevice() notifiers run and other parts
of the kernel or user-space programs can see an inconsistent link state

I would suggest doing the following sequence instead:

netif_carrier_off()
register_netdevice()
phy_connect()

Which should result in a consistent link state and carrier value.

> 
>  
> 
> After link is UP if we check the carrier_changes sysfs node - it will be
> one only.
> 
>  
> 
> $ cat /sys/class/net/eth0/carrier_changes
> 
> 1
> 
>  
> 
> After reverting the change - https://lkml.org/lkml/2016/1/9/173 (net:
> phy: turn carrier off on phy attach) then I could see the carrier
> changes incremented to 2 after Link UP.
> 
> $ cat /sys/class/net/eth0/carrier_changes
> 
> 2
> 
>  
> 
> Thanks,
> 
> Bhadram.
> 
> ------------------------------------------------------------------------
> This email message is for the sole use of the intended recipient(s) and
> may contain confidential information.  Any unauthorized review, use,
> disclosure or distribution is prohibited.  If you are not the intended
> recipient, please contact the sender by reply email and destroy all
> copies of the original message.
> ------------------------------------------------------------------------


-- 
Florian

^ permalink raw reply

* Re: [PATCH 19/31] timer: Remove open-coded casts for .data and .function
From: Tyrel Datwyler @ 2017-09-01  0:28 UTC (permalink / raw)
  To: Kees Cook, Thomas Gleixner
  Cc: Samuel Ortiz, James E.J. Bottomley, Martin K. Petersen,
	linux-kernel, linux-scsi, netdev, Paul Mackerras, Tyrel Datwyler,
	linuxppc-dev
In-Reply-To: <1504222183-61202-20-git-send-email-keescook@chromium.org>

On 08/31/2017 04:29 PM, Kees Cook wrote:
> This standardizes the callback and data prototypes in several places that
> perform casting, in an effort to remove more open-coded .data and
> .function uses in favor of setup_timer().
> 
> Cc: Samuel Ortiz <samuel@sortiz.org>
> Cc: Tyrel Datwyler <tyreld@linux.vnet.ibm.com>
> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Cc: Paul Mackerras <paulus@samba.org>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: "James E.J. Bottomley" <jejb@linux.vnet.ibm.com>
> Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
> Cc: netdev@vger.kernel.org
> Cc: linux-scsi@vger.kernel.org
> Cc: linuxppc-dev@lists.ozlabs.org
> Signed-off-by: Kees Cook <keescook@chromium.org>
> ---
>  drivers/net/irda/bfin_sir.c      |  5 +++--
>  drivers/scsi/ibmvscsi/ibmvfc.c   | 14 ++++++--------
>  drivers/scsi/ibmvscsi/ibmvscsi.c |  8 ++++----
>  3 files changed, 13 insertions(+), 14 deletions(-)
For ibmvfc & ibmvscsi portions:

Acked-by: Tyrel Datwyler <tyreld@linux.vnet.ibm.com>

^ permalink raw reply

* Re: [PATCH 19/31] timer: Remove open-coded casts for .data and .function
From: Tyrel Datwyler @ 2017-09-01  0:29 UTC (permalink / raw)
  To: Kees Cook, Thomas Gleixner
  Cc: Samuel Ortiz, James E.J. Bottomley, Martin K. Petersen,
	linux-kernel, linux-scsi, netdev, Paul Mackerras, Tyrel Datwyler,
	linuxppc-dev
In-Reply-To: <1504222183-61202-20-git-send-email-keescook@chromium.org>

On 08/31/2017 04:29 PM, Kees Cook wrote:
> This standardizes the callback and data prototypes in several places that
> perform casting, in an effort to remove more open-coded .data and
> .function uses in favor of setup_timer().
> 
> Cc: Samuel Ortiz <samuel@sortiz.org>
> Cc: Tyrel Datwyler <tyreld@linux.vnet.ibm.com>
> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Cc: Paul Mackerras <paulus@samba.org>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: "James E.J. Bottomley" <jejb@linux.vnet.ibm.com>
> Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
> Cc: netdev@vger.kernel.org
> Cc: linux-scsi@vger.kernel.org
> Cc: linuxppc-dev@lists.ozlabs.org
> Signed-off-by: Kees Cook <keescook@chromium.org>
> ---
>  drivers/net/irda/bfin_sir.c      |  5 +++--
>  drivers/scsi/ibmvscsi/ibmvfc.c   | 14 ++++++--------
>  drivers/scsi/ibmvscsi/ibmvscsi.c |  8 ++++----
>  3 files changed, 13 insertions(+), 14 deletions(-)

Resending from correct email address.

For ibmvfc & ibmvscsi portions:

Acked-by: Tyrel Datwyler <tyreld@linux.vnet.ibm.com>

^ permalink raw reply

* Re: [PATCH 2/3] security: bpf: Add eBPF LSM hooks and security field to eBPF map
From: Chenbo Feng @ 2017-09-01  0:29 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Chenbo Feng, linux-security-module, Jeffrey Vander Stoep, netdev,
	SELinux, Alexei Starovoitov, Lorenzo Colitti
In-Reply-To: <59A88FF0.9010605@iogearbox.net>

On Thu, Aug 31, 2017 at 3:38 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> On 08/31/2017 10:56 PM, Chenbo Feng wrote:
>>
>> From: Chenbo Feng <fengc@google.com>
>>
>> Introduce a pointer into struct bpf_map to hold the security information
>> about the map. The actual security struct varies based on the security
>> models implemented. Place the LSM hooks before each of the unrestricted
>> eBPF operations, the map_update_elem and map_delete_elem operations are
>> checked by security_map_modify. The map_lookup_elem and map_get_next_key
>> operations are checked by securtiy_map_read.
>>
>> Signed-off-by: Chenbo Feng <fengc@google.com>
>
>
> Against which tree is this by the way, net-next? There are
> changes here which require a rebase of your set.
>

This patch series is rebased on security subsystem since patch 1/3 is
a patch for
security system I assume. But I am not sure where this specific patch
should go in.
If I should submit this one to net-next, I will rebase it against
net-next and submit again.
>> ---
>>   include/linux/bpf.h  |  3 +++
>>   kernel/bpf/syscall.c | 28 ++++++++++++++++++++++++++++
>>   2 files changed, 31 insertions(+)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index b69e7a5869ff..ca3e6ff7091d 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -53,6 +53,9 @@ struct bpf_map {
>>         struct work_struct work;
>>         atomic_t usercnt;
>>         struct bpf_map *inner_map_meta;
>> +#ifdef CONFIG_SECURITY
>> +       void *security;
>> +#endif
>>   };
>>
>>   /* function argument constraints */
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index 045646da97cc..b15580bcf3b1 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -279,6 +279,10 @@ static int map_create(union bpf_attr *attr)
>>         if (err)
>>                 return -EINVAL;
>>
>> +       err = security_map_create();
>
>
> Seems a bit limited to me, don't you want to be able to
> also differentiate between different map types? Same goes
> for security_prog_load() wrt prog types, no?
>
I don't want to introduce extra complexity into it if not necessary.
so I only included the
thing needed for the selinux implementation for now.  But I agree that the map
and program type information could be useful when people developing more complex
security checks. I will add a union bpf_attr *attr pointer into the
security hook functions
for future needs.
>> +       if (err)
>> +               return -EACCES;
>> +
>>         /* find map type and init map: hashtable vs rbtree vs bloom vs ...
>> */
>>         map = find_and_alloc_map(attr);
>>         if (IS_ERR(map))
>> @@ -291,6 +295,10 @@ static int map_create(union bpf_attr *attr)
>>         if (err)
>>                 goto free_map_nouncharge;
>>
>> +       err = security_post_create(map);
>> +       if (err < 0)
>> +               goto free_map;
>> +
>
>
> So the hook you implement in patch 3/3 does:
>
> +static int selinux_bpf_post_create(struct bpf_map *map)
> +{
> +       struct bpf_security_struct *bpfsec;
> +
> +       bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL);
> +       if (!bpfsec)
> +               return -ENOMEM;
> +
> +       bpfsec->sid = current_sid();
> +       map->security = bpfsec;
> +
> +       return 0;
> +}
>
> Where do you kfree() bpfsec when the map gets released
> normally or in error path?
>
Will add it in next version. Thanks for point it out.
>>         err = bpf_map_alloc_id(map);
>>         if (err)
>>                 goto free_map;
>> @@ -410,6 +418,10 @@ static int map_lookup_elem(union bpf_attr *attr)
>>         if (IS_ERR(map))
>>                 return PTR_ERR(map);
>>
>> +       err = security_map_read(map);
>> +       if (err)
>> +               return -EACCES;
>
>
> How about actually dropping ref?
>
May bad, thanks again.
>> +
>>         key = memdup_user(ukey, map->key_size);
>>         if (IS_ERR(key)) {
>>                 err = PTR_ERR(key);
>> @@ -490,6 +502,10 @@ static int map_update_elem(union bpf_attr *attr)
>>         if (IS_ERR(map))
>>                 return PTR_ERR(map);
>>
>> +       err = security_map_modify(map);
>> +       if (err)
>> +               return -EACCES;
>
>
> Ditto ...
>
>>         key = memdup_user(ukey, map->key_size);
>>         if (IS_ERR(key)) {
>>                 err = PTR_ERR(key);
>> @@ -573,6 +589,10 @@ static int map_delete_elem(union bpf_attr *attr)
>>         if (IS_ERR(map))
>>                 return PTR_ERR(map);
>>
>> +       err = security_map_modify(map);
>> +       if (err)
>> +               return -EACCES;
>
>
> Ditto ...
>
>>         key = memdup_user(ukey, map->key_size);
>>         if (IS_ERR(key)) {
>>                 err = PTR_ERR(key);
>> @@ -616,6 +636,10 @@ static int map_get_next_key(union bpf_attr *attr)
>>         if (IS_ERR(map))
>>                 return PTR_ERR(map);
>>
>> +       err = security_map_read(map);
>> +       if (err)
>> +               return -EACCES;
>
>
> And once again here ... :(
>
>
>>         if (ukey) {
>>                 key = memdup_user(ukey, map->key_size);
>>                 if (IS_ERR(key)) {
>> @@ -935,6 +959,10 @@ static int bpf_prog_load(union bpf_attr *attr)
>>         if (CHECK_ATTR(BPF_PROG_LOAD))
>>                 return -EINVAL;
>>
>> +       err = security_prog_load();
>> +       if (err)
>> +               return -EACCES;
>> +
>>         if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT)
>>                 return -EINVAL;
>>
>>
>

^ permalink raw reply

* Re: [PATCH 31/31] timer: Switch to testing for .function instead of .data
From: Dmitry Torokhov @ 2017-09-01  1:06 UTC (permalink / raw)
  To: Kees Cook
  Cc: Thomas Gleixner, Rafael J. Wysocki, Pavel Machek, Len Brown,
	Greg Kroah-Hartman, Mike Marciniszyn, Dennis Dalessandro,
	Doug Ledford, Sean Hefty, Hal Rosenstock, Jeff Kirsher,
	linux-pm-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, linux-rdma,
	linux-input-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	intel-wired-lan-qjLDD68F18P21nG7glBr7A, netdev, lkml
In-Reply-To: <CAGXu5jK84cN9MdjfzaipD7GN8a37JMfD8X0Em4mk2_aFGuaOUg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

On Thu, Aug 31, 2017 at 4:59 PM, Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org> wrote:
> On Thu, Aug 31, 2017 at 4:45 PM, Dmitry Torokhov
> <dmitry.torokhov-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
>> On Thu, Aug 31, 2017 at 4:29 PM, Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org> wrote:
>>> In several places, .data is checked for initialization to gate early
>>> calls to del_timer_sync(). Checking for .function is equally valid, so
>>> switch to this in all callers.
>>
>> Not seeing the rest of patches it is unclear from the patch
>> description why this is needed/wanted.
>
> The CC list would have been really giant, but here is the first patch
> and the earlier series list:
>
> https://lkml.org/lkml/2017/8/31/904
> https://lkml.org/lkml/2017/8/30/760
>
> tl;dr: We're going to switch all struct timer_list callbacks to get
> the timer pointer as the argument instead of from the .data field.
> This patch is one step in removing open-coded users of the .data
> field.
>

And that is exactly what should have been in the patch description.

FWIW for input bits:

Acked-by: Dmitry Torokhov <dmitry.torokhov-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

Thanks.

-- 
Dmitry
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [RFC net-next 0/5] TSN: Add qdisc-based config interfaces for traffic shapers
From: Vinicius Costa Gomes @ 2017-09-01  1:26 UTC (permalink / raw)
  To: netdev
  Cc: Vinicius Costa Gomes, jhs, xiyou.wangcong, jiri, intel-wired-lan,
	andre.guedes, ivan.briano, jesus.sanchez-palencia, boon.leong.ong,
	richardcochran

Hi,

This patchset is an RFC on a proposal of how the Traffic Control subsystem can
be used to offload the configuration of traffic shapers into network devices
that provide support for them in HW. Our goal here is to start upstreaming
support for features related to the Time-Sensitive Networking (TSN) set of
standards into the kernel.

As part of this work, we've assessed previous public discussions related to TSN
enabling: patches from Henrik Austad (Cisco), the presentation from Eric Mann
at Linux Plumbers 2012, patches from Gangfeng Huang (National Instruments) and
the current state of the OpenAVNU project (https://github.com/AVnu/OpenAvnu/).

Please note that the patches provided as part of this RFC are implementing what
is needed only for 802.1Qav (FQTSS) only, but we'd like to take advantage of
this discussion and share our WIP ideas for the 802.1Qbv and 802.1Qbu interfaces
as well. The current patches are only providing support for HW offload of the
configs.


Overview
========

Time-sensitive Networking (TSN) is a set of standards that aim to address
resources availability for providing bandwidth reservation and bounded latency
on Ethernet based LANs. The proposal described here aims to cover mainly what is
needed to enable the following standards: 802.1Qat, 802.1Qav, 802.1Qbv and
802.1Qbu.

The initial target of this work is the Intel i210 NIC, but other controllers'
datasheet were also taken into account, like the Renesas RZ/A1H RZ/A1M group and
the Synopsis DesignWare Ethernet QoS controller.


Proposal
========

Feature-wise, what is covered here are configuration interfaces for HW
implementations of the Credit-Based shaper (CBS, 802.1Qav), Time-Aware shaper
(802.1Qbv) and Frame Preemption (802.1Qbu). CBS is a per-queue shaper, while
Qbv and Qbu must be configured per port, with the configuration covering all
queues. Given that these features are related to traffic shaping, and that the
traffic control subsystem already provides a queueing discipline that offloads
config into the device driver (i.e. mqprio), designing new qdiscs for the
specific purpose of offloading the config for each shaper seemed like a good
fit.

For steering traffic into the correct queues, we use the socket option
SO_PRIORITY and then a mechanism to map priority to traffic classes / Tx queues.
The qdisc mqprio is currently used in our tests.

As for the shapers config interface:

 * CBS (802.1Qav)

   This patchset is proposing a new qdisc called 'cbs'. Its 'tc' cmd line is:
   $ tc qdisc add dev IFACE parent ID cbs locredit N hicredit M sendslope S \
     idleslope I

   Note that the parameters for this qdisc are the ones defined by the
   802.1Q-2014 spec, so no hardware specific functionality is exposed here.


 * Time-aware shaper (802.1Qbv):

   The idea we are currently exploring is to add a "time-aware", priority based
   qdisc, that also exposes the Tx queues available and provides a mechanism for
   mapping priority <-> traffic class <-> Tx queues in a similar fashion as
   mqprio. We are calling this qdisc 'taprio', and its 'tc' cmd line would be:

   $ $ tc qdisc add dev ens4 parent root handle 100 taprio num_tc 4    \
     	   map 2 2 1 0 3 3 3 3 3 3 3 3 3 3 3 3                         \
	   queues 0 1 2 3                                              \
     	   sched-file gates.sched [base-time <interval>]               \
           [cycle-time <interval>] [extension-time <interval>]

   <file> is multi-line, with each line being of the following format:
   <cmd> <gate mask> <interval in nanoseconds>

   Qbv only defines one <cmd>: "S" for 'SetGates'

   For example:

   S 0x01 300
   S 0x03 500

   This means that there are two intervals, the first will have the gate
   for traffic class 0 open for 300 nanoseconds, the second will have
   both traffic classes open for 500 nanoseconds.

   Additionally, an option to set just one entry of the gate control list will
   also be provided by 'taprio':

   $ tc qdisc (...) \
        sched-row <row number> <cmd> <gate mask> <interval>  \
        [base-time <interval>] [cycle-time <interval>] \
        [extension-time <interval>]


 * Frame Preemption (802.1Qbu):

   To control even further the latency, it may prove useful to signal which
   traffic classes are marked as preemptable. For that, 'taprio' provides the
   preemption command so you set each traffic class as preemptable or not:

   $ tc qdisc (...) \
        preemption 0 1 1 1


 * Time-aware shaper + Preemption:

   As an example of how Qbv and Qbu can be used together, we may specify
   both the schedule and the preempt-mask, and this way we may also
   specify the Set-Gates-and-Hold and Set-Gates-and-Release commands as
   specified in the Qbu spec:

   $ tc qdisc add dev ens4 parent root handle 100 taprio num_tc 4 \
     	   map 2 2 1 0 3 3 3 3 3 3 3 3 3 3 3 3                    \
	   queues 0 1 2 3                                         \
     	   preemption 0 1 1 1                                     \
	   sched-file preempt_gates.sched

    <file> is multi-line, with each line being of the following format:
    <cmd> <gate mask> <interval in nanoseconds>

    For this case, two new commands are introduced:

    "H" for 'set gates and hold'
    "R" for 'set gates and release'

    H 0x01 300
    R 0x03 500



Testing this RFC
================

For testing the patches of this RFC only, you can refer to the samples and
helper script being added to samples/tsn/ and the use the 'mqprio' qdisc to
setup the priorities to Tx queues mapping, together with the 'cbs' qdisc to
configure the HW shaper of the i210 controller:

1) Setup priorities to traffic classes to hardware queues mapping
$ tc qdisc replace dev enp3s0 parent root mqprio num_tc 3 \
     map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 queues 1@0 1@1 2@2 hw 0

2) Check scheme. You want to get the inner qdiscs ID from the bottom up
$ tc -g  class show dev enp3s0

Ex.:
+---(802a:3) mqprio
|    +---(802a:6) mqprio
|    +---(802a:7) mqprio
|
+---(802a:2) mqprio
|    +---(802a:5) mqprio
|
+---(802a:1) mqprio
     +---(802a:4) mqprio

 * Here '802a:4' is Tx Queue #0 and '802a:5' is Tx Queue #1.

3) Calculate CBS parameters for classes A and B. i.e. BW for A is 20Mbps and
   for B is 10Mbps:
$ ./samples/tsn/calculate_cbs_params.py -A 20000 -a 1500 -B 10000 -b 1500

4) Configure CBS for traffic class A (priority 3) as provided by the script:
$ tc qdisc replace dev enp3s0 parent 802a:4 cbs locredit -1470 \
     hicredit 30 sendslope -980000 idleslope 20000

5) Configure CBS for traffic class B (priority 2):
$ tc qdisc replace dev enp3s0 parent 802a:5 cbs \
     locredit -1485 hicredit 31 sendslope -990000 idleslope 10000

6) Run Listener, compiled from samples/tsn/listener.c
$ ./listener -i enp3s0

7) Run Talker for class A (prio 3 here), compiled from samples/tsn/talker.c
$ ./talker -i enp3s0 -p 3

 * The bandwidth displayed on the listener output at this stage should be very
   close to the one configured for class A.

8) You can also run a Talker for class B (prio 2 here)
$ ./talker -i enp3s0 -p 2

 * The bandwidth displayed on the listener output now should increase to very
   close to the one configured for class A + class B.

Authors
=======
 - Andre Guedes <andre.guedes@intel.com>
 - Ivan Briano <ivan.briano@intel.com>
 - Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
 - Vinicius Gomes <vinicius.gomes@intel.com>


Andre Guedes (2):
  igb: Add support for CBS offload
  samples/tsn: Add script for calculating CBS config

Jesus Sanchez-Palencia (1):
  sample: Add TSN Talker and Listener examples

Vinicius Costa Gomes (2):
  net/sched: Introduce the user API for the CBS shaper
  net/sched: Introduce Credit Based Shaper (CBS) qdisc

 drivers/net/ethernet/intel/igb/e1000_defines.h |  23 ++
 drivers/net/ethernet/intel/igb/e1000_regs.h    |   8 +
 drivers/net/ethernet/intel/igb/igb.h           |   6 +
 drivers/net/ethernet/intel/igb/igb_main.c      | 349 +++++++++++++++++++++++++
 include/linux/netdevice.h                      |   1 +
 include/uapi/linux/pkt_sched.h                 |  29 ++
 net/sched/Kconfig                              |  11 +
 net/sched/Makefile                             |   1 +
 net/sched/sch_cbs.c                            | 286 ++++++++++++++++++++
 samples/tsn/calculate_cbs_params.py            | 112 ++++++++
 samples/tsn/listener.c                         | 254 ++++++++++++++++++
 samples/tsn/talker.c                           | 136 ++++++++++
 12 files changed, 1216 insertions(+)
 create mode 100644 net/sched/sch_cbs.c
 create mode 100755 samples/tsn/calculate_cbs_params.py
 create mode 100644 samples/tsn/listener.c
 create mode 100644 samples/tsn/talker.c

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox