Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next 3/4] l2tp: remove useless NULL check in __l2tp_ip*_bind_lookup()
From: Guillaume Nault @ 2017-01-06 19:03 UTC (permalink / raw)
  To: netdev; +Cc: James Chapman, Chris Elston
In-Reply-To: <cover.1483727525.git.g.nault@alphalink.fr>

If "l2tp" was NULL, that'd mean "sk" is NULL too. This can't happen
since "sk" is returned by sk_for_each_bound().

Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
---
 net/l2tp/l2tp_ip.c  | 3 ---
 net/l2tp/l2tp_ip6.c | 3 ---
 2 files changed, 6 deletions(-)

diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index e5686bf898f7..499d3cdbfbc8 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -56,9 +56,6 @@ static struct sock *__l2tp_ip_bind_lookup(const struct net *net, __be32 laddr,
 		const struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk);
 		const struct inet_sock *inet = inet_sk(sk);
 
-		if (l2tp == NULL)
-			continue;
-
 		if ((l2tp->conn_id == tunnel_id) &&
 		    net_eq(sock_net(sk), net) &&
 		    !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 2f6be6ddc8cb..7165b06d7b25 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -69,9 +69,6 @@ static struct sock *__l2tp_ip6_bind_lookup(const struct net *net,
 		const struct in6_addr *sk_raddr = &sk->sk_v6_daddr;
 		const struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk);
 
-		if (l2tp == NULL)
-			continue;
-
 		if ((l2tp->conn_id == tunnel_id) &&
 		    net_eq(sock_net(sk), net) &&
 		    (!sk_laddr || ipv6_addr_any(sk_laddr) || ipv6_addr_equal(sk_laddr, laddr)) &&
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 2/4] l2tp: make __l2tp_ip*_bind_lookup() parameters 'const'
From: Guillaume Nault @ 2017-01-06 19:03 UTC (permalink / raw)
  To: netdev; +Cc: James Chapman, Chris Elston
In-Reply-To: <cover.1483727525.git.g.nault@alphalink.fr>

Add const qualifier wherever possible for __l2tp_ip_bind_lookup() and
__l2tp_ip6_bind_lookup().

Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
---
 net/l2tp/l2tp_ip.c  | 4 ++--
 net/l2tp/l2tp_ip6.c | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 992761e721af..e5686bf898f7 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -53,8 +53,8 @@ static struct sock *__l2tp_ip_bind_lookup(const struct net *net, __be32 laddr,
 	struct sock *sk;
 
 	sk_for_each_bound(sk, &l2tp_ip_bind_table) {
-		struct inet_sock *inet = inet_sk(sk);
-		struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk);
+		const struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk);
+		const struct inet_sock *inet = inet_sk(sk);
 
 		if (l2tp == NULL)
 			continue;
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 331ccf5a7bad..2f6be6ddc8cb 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -57,8 +57,8 @@ static inline struct l2tp_ip6_sock *l2tp_ip6_sk(const struct sock *sk)
 	return (struct l2tp_ip6_sock *)sk;
 }
 
-static struct sock *__l2tp_ip6_bind_lookup(struct net *net,
-					   struct in6_addr *laddr,
+static struct sock *__l2tp_ip6_bind_lookup(const struct net *net,
+					   const struct in6_addr *laddr,
 					   const struct in6_addr *raddr,
 					   int dif, u32 tunnel_id)
 {
@@ -67,7 +67,7 @@ static struct sock *__l2tp_ip6_bind_lookup(struct net *net,
 	sk_for_each_bound(sk, &l2tp_ip6_bind_table) {
 		const struct in6_addr *sk_laddr = inet6_rcv_saddr(sk);
 		const struct in6_addr *sk_raddr = &sk->sk_v6_daddr;
-		struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk);
+		const struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk);
 
 		if (l2tp == NULL)
 			continue;
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 1/4] l2tp: remove redundant addr_len check in l2tp_ip_bind()
From: Guillaume Nault @ 2017-01-06 19:03 UTC (permalink / raw)
  To: netdev; +Cc: James Chapman, Chris Elston
In-Reply-To: <cover.1483727525.git.g.nault@alphalink.fr>

addr_len's value has already been verified at this point.

Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
---
 net/l2tp/l2tp_ip.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 3d73278b86ca..992761e721af 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -258,7 +258,7 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	if (!sock_flag(sk, SOCK_ZAPPED))
 		goto out;
 
-	if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_l2tpip))
+	if (sk->sk_state != TCP_CLOSE)
 		goto out;
 
 	chk_addr_ret = inet_addr_type(net, addr->l2tp_addr.s_addr);
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 0/4] l2tp: cleanup socket lookup code in l2tp_ip and l2tp_ip6
From: Guillaume Nault @ 2017-01-06 19:03 UTC (permalink / raw)
  To: netdev; +Cc: James Chapman, Chris Elston

First three patches remove redundant tests and add missing "const"
qualifiers.

Fourth patch splits the conditionals found in __l2tp_ip*_bind_lookup(),
to make these functions easier to review. In the process, I found that
some corner cases were still not handled properly. So I've added the
missing tests in this patch too, because they're pretty simple and the
whole "if" statements are modified anyway.

I expect it to be easier to review this way. If not, I can split up
patch #4, post the missing tests separately to -net, and later repost
this series as pure cleanup. Just let me know.

Guillaume Nault (4):
  l2tp: remove redundant addr_len check in l2tp_ip_bind()
  l2tp: make __l2tp_ip*_bind_lookup() parameters 'const'
  l2tp: remove useless NULL check in __l2tp_ip{,6}_bind_lookup()
  l2tp: rework socket comparison in __l2tp_ip*_bind_lookup()

 net/l2tp/l2tp_ip.c  | 29 ++++++++++++++++++-----------
 net/l2tp/l2tp_ip6.c | 30 +++++++++++++++++++-----------
 2 files changed, 37 insertions(+), 22 deletions(-)

-- 
2.11.0

^ permalink raw reply

* Re: [PATCH/RFC v2 net-next] ravb: unmap descriptors when freeing rings
From: Sergei Shtylyov @ 2017-01-06 19:02 UTC (permalink / raw)
  To: Simon Horman, David Miller; +Cc: Magnus Damm, netdev, linux-renesas-soc
In-Reply-To: <1483613000-537-1-git-send-email-horms+renesas@verge.net.au>

Hello!

On 01/05/2017 01:43 PM, Simon Horman wrote:

> From: Kazuya Mizuguchi <kazuya.mizuguchi.ks@renesas.com>
>
> "swiotlb buffer is full" errors occur after repeated initialisation of a
> device - f.e. suspend/resume or ip link set up/down. This is because memory
> mapped using dma_map_single() in ravb_ring_format() and ravb_start_xmit()
> is not released.  Resolve this problem by unmapping descriptors when
> freeing rings.
>
> Note, ravb_tx_free() is moved but not otherwise modified by this patch.
>
> Signed-off-by: Kazuya Mizuguchi <kazuya.mizuguchi.ks@renesas.com>
> [simon: reworked]
> Signed-off-by: Simon Horman <horms+renesas@verge.net.au>
> --
> v1 [Kazuya Mizuguchi]
>
> v2 [Simon Horman]
> * As suggested by Sergei Shtylyov
>   - Use dma_mapping_error() and rx_desc->ds_cc when unmapping RX descriptors;
>     this is consistent with the way that they are mapped
>   - Use ravb_tx_free() to clear TX descriptors

    Not sure that was a good idea (sorry)... ravb_tx_free() only unmaps the
transmitted buffers, while we need to unmap everything...

> * Reduce scope of new local variable
> ---
>  drivers/net/ethernet/renesas/ravb_main.c | 89 ++++++++++++++++++--------------
>  1 file changed, 51 insertions(+), 38 deletions(-)
>
> diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c
> index 92d7692c840d..1797c48e3176 100644
> --- a/drivers/net/ethernet/renesas/ravb_main.c
> +++ b/drivers/net/ethernet/renesas/ravb_main.c
> @@ -179,6 +179,44 @@ static struct mdiobb_ops bb_ops = {
>  	.get_mdio_data = ravb_get_mdio_data,
>  };
>
> +/* Free TX skb function for AVB-IP */
> +static int ravb_tx_free(struct net_device *ndev, int q)
> +{
> +	struct ravb_private *priv = netdev_priv(ndev);
> +	struct net_device_stats *stats = &priv->stats[q];
> +	struct ravb_tx_desc *desc;
> +	int free_num = 0;
> +	int entry;
> +	u32 size;
> +
> +	for (; priv->cur_tx[q] - priv->dirty_tx[q] > 0; priv->dirty_tx[q]++) {
> +		entry = priv->dirty_tx[q] % (priv->num_tx_ring[q] *
> +					     NUM_TX_DESC);
> +		desc = &priv->tx_ring[q][entry];
> +		if (desc->die_dt != DT_FEMPTY)

    Here, it stops once an untransmitted buffer is encountered...

> +			break;
> +		/* Descriptor type must be checked before all other reads */
> +		dma_rmb();
> +		size = le16_to_cpu(desc->ds_tagl) & TX_DS;
> +		/* Free the original skb. */
> +		if (priv->tx_skb[q][entry / NUM_TX_DESC]) {
> +			dma_unmap_single(ndev->dev.parent, le32_to_cpu(desc->dptr),
> +					 size, DMA_TO_DEVICE);
> +			/* Last packet descriptor? */
> +			if (entry % NUM_TX_DESC == NUM_TX_DESC - 1) {
> +				entry /= NUM_TX_DESC;
> +				dev_kfree_skb_any(priv->tx_skb[q][entry]);
> +				priv->tx_skb[q][entry] = NULL;
> +				stats->tx_packets++;
> +			}
> +			free_num++;
> +		}
> +		stats->tx_bytes += size;
> +		desc->die_dt = DT_EEMPTY;
> +	}
> +	return free_num;
> +}
> +
>  /* Free skb's and DMA buffers for Ethernet AVB */
>  static void ravb_ring_free(struct net_device *ndev, int q)
>  {
> @@ -207,6 +245,18 @@ static void ravb_ring_free(struct net_device *ndev, int q)
>  	priv->tx_align[q] = NULL;
>
>  	if (priv->rx_ring[q]) {
> +		for (i = 0; i < priv->num_rx_ring[q]; i++) {
> +			struct ravb_ex_rx_desc *rx_desc = &priv->rx_ring[q][i];
> +
> +			if (!dma_mapping_error(ndev->dev.parent,
> +					       rx_desc->dptr)) {

   You forgot le32_to_cpu() here, we can't use the raw descriptor fields.

> +				dma_unmap_single(ndev->dev.parent,
> +						 le32_to_cpu(rx_desc->dptr),
> +						 PKT_BUF_SZ,
> +						 DMA_FROM_DEVICE);
> +				rx_desc->ds_cc = cpu_to_le16(0);

    You don't check it anyway, not sure what that buys...

[...]

MBR, Sergei

^ permalink raw reply

* [net-next][PATCH] RDS: validate the requested traces user input against max supported
From: Santosh Shilimkar @ 2017-01-06 18:44 UTC (permalink / raw)
  To: netdev, davem; +Cc: linux-kernel, Santosh Shilimkar

A larger-than-supported value can lead to an array read/write overflow.

Reported-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
---
 net/rds/af_rds.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index fd821740..b405f77 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -310,6 +310,9 @@ static int rds_recv_track_latency(struct rds_sock *rs, char __user *optval,
 	if (copy_from_user(&trace, optval, sizeof(trace)))
 		return -EFAULT;
 
+	if (trace.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX)
+		return -EFAULT;
+
 	rs->rs_rx_traces = trace.rx_traces;
 	for (i = 0; i < rs->rs_rx_traces; i++) {
 		if (trace.rx_trace_pos[i] > RDS_MSG_RX_DGRAM_TRACE_MAX) {
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCHv3 net-next] sctp: prepare asoc stream for stream reconf
From: Marcelo Ricardo Leitner @ 2017-01-06 18:42 UTC (permalink / raw)
  To: Xin Long; +Cc: network dev, linux-sctp, Neil Horman, Vlad Yasevich, davem
In-Reply-To: <efd6462731ca0b18a3039f9537dda61e0ed72430.1483712313.git.lucien.xin@gmail.com>

On Fri, Jan 06, 2017 at 10:18:33PM +0800, Xin Long wrote:
> sctp stream reconf, described in RFC 6525, needs a structure to
> save per stream information in assoc, like stream state.
> 
> In the future, sctp stream scheduler also needs it to save some
> stream scheduler params and queues.
> 
> This patchset is to prepare the stream array in assoc for stream
> reconf. It defines sctp_stream that includes stream arrays inside
> to replace ssnmap.
> 
> Note that we use different structures for IN and OUT streams, as
> the members in per OUT stream will get more and more different
> from per IN stream.
> 
> v1->v2:
>   - put these patches into a smaller group.
> v2->v3:
>   - define sctp_stream to contain stream arrays, and create stream.c
>     to put stream-related functions.
>   - merge 3 patches into 1, as new sctp_stream has the same name
>     with before.
> 
> Signed-off-by: Xin Long <lucien.xin@gmail.com>

Looks good to me but I cannot build-test it now, thus
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>

> ---
>  include/net/sctp/sctp.h    |   1 -
>  include/net/sctp/structs.h |  76 +++++++++++----------------
>  net/sctp/Makefile          |   2 +-
>  net/sctp/associola.c       |  13 +++--
>  net/sctp/objcnt.c          |   2 -
>  net/sctp/sm_make_chunk.c   |  10 ++--
>  net/sctp/sm_statefuns.c    |   3 +-
>  net/sctp/ssnmap.c          | 125 ---------------------------------------------
>  net/sctp/stream.c          |  85 ++++++++++++++++++++++++++++++
>  net/sctp/ulpqueue.c        |  36 ++++++-------
>  10 files changed, 147 insertions(+), 206 deletions(-)
>  delete mode 100644 net/sctp/ssnmap.c
>  create mode 100644 net/sctp/stream.c
> 
> diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
> index d8833a8..598d938 100644
> --- a/include/net/sctp/sctp.h
> +++ b/include/net/sctp/sctp.h
> @@ -283,7 +283,6 @@ extern atomic_t sctp_dbg_objcnt_chunk;
>  extern atomic_t sctp_dbg_objcnt_bind_addr;
>  extern atomic_t sctp_dbg_objcnt_bind_bucket;
>  extern atomic_t sctp_dbg_objcnt_addr;
> -extern atomic_t sctp_dbg_objcnt_ssnmap;
>  extern atomic_t sctp_dbg_objcnt_datamsg;
>  extern atomic_t sctp_dbg_objcnt_keys;
>  
> diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
> index 87d56cc..4741ec2 100644
> --- a/include/net/sctp/structs.h
> +++ b/include/net/sctp/structs.h
> @@ -82,7 +82,6 @@ struct sctp_outq;
>  struct sctp_bind_addr;
>  struct sctp_ulpq;
>  struct sctp_ep_common;
> -struct sctp_ssnmap;
>  struct crypto_shash;
>  
>  
> @@ -377,54 +376,22 @@ typedef struct sctp_sender_hb_info {
>  	__u64 hb_nonce;
>  } __packed sctp_sender_hb_info_t;
>  
> -/*
> - *  RFC 2960 1.3.2 Sequenced Delivery within Streams
> - *
> - *  The term "stream" is used in SCTP to refer to a sequence of user
> - *  messages that are to be delivered to the upper-layer protocol in
> - *  order with respect to other messages within the same stream.  This is
> - *  in contrast to its usage in TCP, where it refers to a sequence of
> - *  bytes (in this document a byte is assumed to be eight bits).
> - *  ...
> - *
> - *  This is the structure we use to track both our outbound and inbound
> - *  SSN, or Stream Sequence Numbers.
> - */
> -
> -struct sctp_stream {
> -	__u16 *ssn;
> -	unsigned int len;
> -};
> -
> -struct sctp_ssnmap {
> -	struct sctp_stream in;
> -	struct sctp_stream out;
> -};
> -
> -struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out,
> -				    gfp_t gfp);
> -void sctp_ssnmap_free(struct sctp_ssnmap *map);
> -void sctp_ssnmap_clear(struct sctp_ssnmap *map);
> +struct sctp_stream *sctp_stream_new(__u16 incnt, __u16 outcnt, gfp_t gfp);
> +void sctp_stream_free(struct sctp_stream *stream);
> +void sctp_stream_clear(struct sctp_stream *stream);
>  
>  /* What is the current SSN number for this stream? */
> -static inline __u16 sctp_ssn_peek(struct sctp_stream *stream, __u16 id)
> -{
> -	return stream->ssn[id];
> -}
> +#define sctp_ssn_peek(stream, type, sid) \
> +	((stream)->type[sid].ssn)
>  
>  /* Return the next SSN number for this stream.	*/
> -static inline __u16 sctp_ssn_next(struct sctp_stream *stream, __u16 id)
> -{
> -	return stream->ssn[id]++;
> -}
> +#define sctp_ssn_next(stream, type, sid) \
> +	((stream)->type[sid].ssn++)
>  
>  /* Skip over this ssn and all below. */
> -static inline void sctp_ssn_skip(struct sctp_stream *stream, __u16 id, 
> -				 __u16 ssn)
> -{
> -	stream->ssn[id] = ssn+1;
> -}
> -              
> +#define sctp_ssn_skip(stream, type, sid, ssn) \
> +	((stream)->type[sid].ssn = ssn + 1)
> +
>  /*
>   * Pointers to address related SCTP functions.
>   * (i.e. things that depend on the address family.)
> @@ -1331,6 +1298,25 @@ struct sctp_inithdr_host {
>  	__u32 initial_tsn;
>  };
>  
> +struct sctp_stream_out {
> +	__u16	ssn;
> +	__u8	state;
> +};
> +
> +struct sctp_stream_in {
> +	__u16	ssn;
> +};
> +
> +struct sctp_stream {
> +	struct sctp_stream_out *out;
> +	struct sctp_stream_in *in;
> +	__u16 outcnt;
> +	__u16 incnt;
> +};
> +
> +#define SCTP_STREAM_CLOSED		0x00
> +#define SCTP_STREAM_OPEN		0x01
> +
>  /* SCTP_GET_ASSOC_STATS counters */
>  struct sctp_priv_assoc_stats {
>  	/* Maximum observed rto in the association during subsequent
> @@ -1746,8 +1732,8 @@ struct sctp_association {
>  	/* Default receive parameters */
>  	__u32 default_rcv_context;
>  
> -	/* This tracks outbound ssn for a given stream.	 */
> -	struct sctp_ssnmap *ssnmap;
> +	/* Stream arrays */
> +	struct sctp_stream *stream;
>  
>  	/* All outbound chunks go through this structure.  */
>  	struct sctp_outq outqueue;
> diff --git a/net/sctp/Makefile b/net/sctp/Makefile
> index 6c4f749..70f1b57 100644
> --- a/net/sctp/Makefile
> +++ b/net/sctp/Makefile
> @@ -11,7 +11,7 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
>  	  transport.o chunk.o sm_make_chunk.o ulpevent.o \
>  	  inqueue.o outqueue.o ulpqueue.o \
>  	  tsnmap.o bind_addr.o socket.o primitive.o \
> -	  output.o input.o debug.o ssnmap.o auth.o \
> +	  output.o input.o debug.o stream.o auth.o \
>  	  offload.o
>  
>  sctp_probe-y := probe.o
> diff --git a/net/sctp/associola.c b/net/sctp/associola.c
> index d3cc30c..36294f7 100644
> --- a/net/sctp/associola.c
> +++ b/net/sctp/associola.c
> @@ -358,8 +358,8 @@ void sctp_association_free(struct sctp_association *asoc)
>  
>  	sctp_tsnmap_free(&asoc->peer.tsn_map);
>  
> -	/* Free ssnmap storage. */
> -	sctp_ssnmap_free(asoc->ssnmap);
> +	/* Free stream information. */
> +	sctp_stream_free(asoc->stream);
>  
>  	/* Clean up the bound address list. */
>  	sctp_bind_addr_free(&asoc->base.bind_addr);
> @@ -1137,7 +1137,7 @@ void sctp_assoc_update(struct sctp_association *asoc,
>  		/* Reinitialize SSN for both local streams
>  		 * and peer's streams.
>  		 */
> -		sctp_ssnmap_clear(asoc->ssnmap);
> +		sctp_stream_clear(asoc->stream);
>  
>  		/* Flush the ULP reassembly and ordered queue.
>  		 * Any data there will now be stale and will
> @@ -1162,10 +1162,9 @@ void sctp_assoc_update(struct sctp_association *asoc,
>  
>  		asoc->ctsn_ack_point = asoc->next_tsn - 1;
>  		asoc->adv_peer_ack_point = asoc->ctsn_ack_point;
> -		if (!asoc->ssnmap) {
> -			/* Move the ssnmap. */
> -			asoc->ssnmap = new->ssnmap;
> -			new->ssnmap = NULL;
> +		if (!asoc->stream) {
> +			asoc->stream = new->stream;
> +			new->stream = NULL;
>  		}
>  
>  		if (!asoc->assoc_id) {
> diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c
> index 40e7fac..105ac33 100644
> --- a/net/sctp/objcnt.c
> +++ b/net/sctp/objcnt.c
> @@ -51,7 +51,6 @@ SCTP_DBG_OBJCNT(bind_addr);
>  SCTP_DBG_OBJCNT(bind_bucket);
>  SCTP_DBG_OBJCNT(chunk);
>  SCTP_DBG_OBJCNT(addr);
> -SCTP_DBG_OBJCNT(ssnmap);
>  SCTP_DBG_OBJCNT(datamsg);
>  SCTP_DBG_OBJCNT(keys);
>  
> @@ -67,7 +66,6 @@ static sctp_dbg_objcnt_entry_t sctp_dbg_objcnt[] = {
>  	SCTP_DBG_OBJCNT_ENTRY(bind_addr),
>  	SCTP_DBG_OBJCNT_ENTRY(bind_bucket),
>  	SCTP_DBG_OBJCNT_ENTRY(addr),
> -	SCTP_DBG_OBJCNT_ENTRY(ssnmap),
>  	SCTP_DBG_OBJCNT_ENTRY(datamsg),
>  	SCTP_DBG_OBJCNT_ENTRY(keys),
>  };
> diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
> index 9e9690b..a15d824 100644
> --- a/net/sctp/sm_make_chunk.c
> +++ b/net/sctp/sm_make_chunk.c
> @@ -1536,7 +1536,7 @@ void sctp_chunk_assign_ssn(struct sctp_chunk *chunk)
>  
>  	/* All fragments will be on the same stream */
>  	sid = ntohs(chunk->subh.data_hdr->stream);
> -	stream = &chunk->asoc->ssnmap->out;
> +	stream = chunk->asoc->stream;
>  
>  	/* Now assign the sequence number to the entire message.
>  	 * All fragments must have the same stream sequence number.
> @@ -1547,9 +1547,9 @@ void sctp_chunk_assign_ssn(struct sctp_chunk *chunk)
>  			ssn = 0;
>  		} else {
>  			if (lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG)
> -				ssn = sctp_ssn_next(stream, sid);
> +				ssn = sctp_ssn_next(stream, out, sid);
>  			else
> -				ssn = sctp_ssn_peek(stream, sid);
> +				ssn = sctp_ssn_peek(stream, out, sid);
>  		}
>  
>  		lchunk->subh.data_hdr->ssn = htons(ssn);
> @@ -2444,9 +2444,9 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk,
>  	if (!asoc->temp) {
>  		int error;
>  
> -		asoc->ssnmap = sctp_ssnmap_new(asoc->c.sinit_max_instreams,
> +		asoc->stream = sctp_stream_new(asoc->c.sinit_max_instreams,
>  					       asoc->c.sinit_num_ostreams, gfp);
> -		if (!asoc->ssnmap)
> +		if (!asoc->stream)
>  			goto clean_up;
>  
>  		error = sctp_assoc_set_id(asoc, gfp);
> diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
> index 3382ef2..0ceded3 100644
> --- a/net/sctp/sm_statefuns.c
> +++ b/net/sctp/sm_statefuns.c
> @@ -6274,9 +6274,8 @@ static int sctp_eat_data(const struct sctp_association *asoc,
>  	 * and is invalid.
>  	 */
>  	ssn = ntohs(data_hdr->ssn);
> -	if (ordered && SSN_lt(ssn, sctp_ssn_peek(&asoc->ssnmap->in, sid))) {
> +	if (ordered && SSN_lt(ssn, sctp_ssn_peek(asoc->stream, in, sid)))
>  		return SCTP_IERROR_PROTO_VIOLATION;
> -	}
>  
>  	/* Send the data up to the user.  Note:  Schedule  the
>  	 * SCTP_CMD_CHUNK_ULP cmd before the SCTP_CMD_GEN_SACK, as the SACK
> diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c
> deleted file mode 100644
> index b9c8521..0000000
> --- a/net/sctp/ssnmap.c
> +++ /dev/null
> @@ -1,125 +0,0 @@
> -/* SCTP kernel implementation
> - * Copyright (c) 2003 International Business Machines, Corp.
> - *
> - * This file is part of the SCTP kernel implementation
> - *
> - * These functions manipulate sctp SSN tracker.
> - *
> - * This SCTP implementation is free software;
> - * you can redistribute it and/or modify it under the terms of
> - * the GNU General Public License as published by
> - * the Free Software Foundation; either version 2, or (at your option)
> - * any later version.
> - *
> - * This SCTP implementation is distributed in the hope that it
> - * will be useful, but WITHOUT ANY WARRANTY; without even the implied
> - *                 ************************
> - * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
> - * See the GNU General Public License for more details.
> - *
> - * You should have received a copy of the GNU General Public License
> - * along with GNU CC; see the file COPYING.  If not, see
> - * <http://www.gnu.org/licenses/>.
> - *
> - * Please send any bug reports or fixes you make to the
> - * email address(es):
> - *    lksctp developers <linux-sctp@vger.kernel.org>
> - *
> - * Written or modified by:
> - *    Jon Grimm             <jgrimm@us.ibm.com>
> - */
> -
> -#include <linux/types.h>
> -#include <linux/slab.h>
> -#include <net/sctp/sctp.h>
> -#include <net/sctp/sm.h>
> -
> -static struct sctp_ssnmap *sctp_ssnmap_init(struct sctp_ssnmap *map, __u16 in,
> -					    __u16 out);
> -
> -/* Storage size needed for map includes 2 headers and then the
> - * specific needs of in or out streams.
> - */
> -static inline size_t sctp_ssnmap_size(__u16 in, __u16 out)
> -{
> -	return sizeof(struct sctp_ssnmap) + (in + out) * sizeof(__u16);
> -}
> -
> -
> -/* Create a new sctp_ssnmap.
> - * Allocate room to store at least 'len' contiguous TSNs.
> - */
> -struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out,
> -				    gfp_t gfp)
> -{
> -	struct sctp_ssnmap *retval;
> -	int size;
> -
> -	size = sctp_ssnmap_size(in, out);
> -	if (size <= KMALLOC_MAX_SIZE)
> -		retval = kmalloc(size, gfp);
> -	else
> -		retval = (struct sctp_ssnmap *)
> -			  __get_free_pages(gfp, get_order(size));
> -	if (!retval)
> -		goto fail;
> -
> -	if (!sctp_ssnmap_init(retval, in, out))
> -		goto fail_map;
> -
> -	SCTP_DBG_OBJCNT_INC(ssnmap);
> -
> -	return retval;
> -
> -fail_map:
> -	if (size <= KMALLOC_MAX_SIZE)
> -		kfree(retval);
> -	else
> -		free_pages((unsigned long)retval, get_order(size));
> -fail:
> -	return NULL;
> -}
> -
> -
> -/* Initialize a block of memory as a ssnmap.  */
> -static struct sctp_ssnmap *sctp_ssnmap_init(struct sctp_ssnmap *map, __u16 in,
> -					    __u16 out)
> -{
> -	memset(map, 0x00, sctp_ssnmap_size(in, out));
> -
> -	/* Start 'in' stream just after the map header. */
> -	map->in.ssn = (__u16 *)&map[1];
> -	map->in.len = in;
> -
> -	/* Start 'out' stream just after 'in'. */
> -	map->out.ssn = &map->in.ssn[in];
> -	map->out.len = out;
> -
> -	return map;
> -}
> -
> -/* Clear out the ssnmap streams.  */
> -void sctp_ssnmap_clear(struct sctp_ssnmap *map)
> -{
> -	size_t size;
> -
> -	size = (map->in.len + map->out.len) * sizeof(__u16);
> -	memset(map->in.ssn, 0x00, size);
> -}
> -
> -/* Dispose of a ssnmap.  */
> -void sctp_ssnmap_free(struct sctp_ssnmap *map)
> -{
> -	int size;
> -
> -	if (unlikely(!map))
> -		return;
> -
> -	size = sctp_ssnmap_size(map->in.len, map->out.len);
> -	if (size <= KMALLOC_MAX_SIZE)
> -		kfree(map);
> -	else
> -		free_pages((unsigned long)map, get_order(size));
> -
> -	SCTP_DBG_OBJCNT_DEC(ssnmap);
> -}
> diff --git a/net/sctp/stream.c b/net/sctp/stream.c
> new file mode 100644
> index 0000000..f86de43
> --- /dev/null
> +++ b/net/sctp/stream.c
> @@ -0,0 +1,85 @@
> +/* SCTP kernel implementation
> + * (C) Copyright IBM Corp. 2001, 2004
> + * Copyright (c) 1999-2000 Cisco, Inc.
> + * Copyright (c) 1999-2001 Motorola, Inc.
> + * Copyright (c) 2001 Intel Corp.
> + *
> + * This file is part of the SCTP kernel implementation
> + *
> + * These functions manipulate sctp tsn mapping array.
> + *
> + * This SCTP implementation is free software;
> + * you can redistribute it and/or modify it under the terms of
> + * the GNU General Public License as published by
> + * the Free Software Foundation; either version 2, or (at your option)
> + * any later version.
> + *
> + * This SCTP implementation is distributed in the hope that it
> + * will be useful, but WITHOUT ANY WARRANTY; without even the implied
> + *                 ************************
> + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
> + * See the GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with GNU CC; see the file COPYING.  If not, see
> + * <http://www.gnu.org/licenses/>.
> + *
> + * Please send any bug reports or fixes you make to the
> + * email address(es):
> + *    lksctp developers <linux-sctp@vger.kernel.org>
> + *
> + * Written or modified by:
> + *    Xin Long <lucien.xin@gmail.com>
> + */
> +
> +#include <net/sctp/sctp.h>
> +
> +struct sctp_stream *sctp_stream_new(__u16 incnt, __u16 outcnt, gfp_t gfp)
> +{
> +	struct sctp_stream *stream;
> +	int i;
> +
> +	stream = kzalloc(sizeof(*stream), gfp);
> +	if (!stream)
> +		return NULL;
> +
> +	stream->outcnt = outcnt;
> +	stream->out = kcalloc(stream->outcnt, sizeof(*stream->out), gfp);
> +	if (!stream->out) {
> +		kfree(stream);
> +		return NULL;
> +	}
> +	for (i = 0; i < stream->outcnt; i++)
> +		stream->out[i].state = SCTP_STREAM_OPEN;
> +
> +	stream->incnt = incnt;
> +	stream->in = kcalloc(stream->incnt, sizeof(*stream->in), gfp);
> +	if (!stream->in) {
> +		kfree(stream->out);
> +		kfree(stream);
> +		return NULL;
> +	}
> +
> +	return stream;
> +}
> +
> +void sctp_stream_free(struct sctp_stream *stream)
> +{
> +	if (unlikely(!stream))
> +		return;
> +
> +	kfree(stream->out);
> +	kfree(stream->in);
> +	kfree(stream);
> +}
> +
> +void sctp_stream_clear(struct sctp_stream *stream)
> +{
> +	int i;
> +
> +	for (i = 0; i < stream->outcnt; i++)
> +		stream->out[i].ssn = 0;
> +
> +	for (i = 0; i < stream->incnt; i++)
> +		stream->in[i].ssn = 0;
> +}
> diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
> index 84d0fda..aa3624d 100644
> --- a/net/sctp/ulpqueue.c
> +++ b/net/sctp/ulpqueue.c
> @@ -760,11 +760,11 @@ static void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
>  	struct sk_buff_head *event_list;
>  	struct sk_buff *pos, *tmp;
>  	struct sctp_ulpevent *cevent;
> -	struct sctp_stream *in;
> +	struct sctp_stream *stream;
>  	__u16 sid, csid, cssn;
>  
>  	sid = event->stream;
> -	in  = &ulpq->asoc->ssnmap->in;
> +	stream  = ulpq->asoc->stream;
>  
>  	event_list = (struct sk_buff_head *) sctp_event2skb(event)->prev;
>  
> @@ -782,11 +782,11 @@ static void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq,
>  		if (csid < sid)
>  			continue;
>  
> -		if (cssn != sctp_ssn_peek(in, sid))
> +		if (cssn != sctp_ssn_peek(stream, in, sid))
>  			break;
>  
> -		/* Found it, so mark in the ssnmap. */
> -		sctp_ssn_next(in, sid);
> +		/* Found it, so mark in the stream. */
> +		sctp_ssn_next(stream, in, sid);
>  
>  		__skb_unlink(pos, &ulpq->lobby);
>  
> @@ -849,7 +849,7 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
>  					     struct sctp_ulpevent *event)
>  {
>  	__u16 sid, ssn;
> -	struct sctp_stream *in;
> +	struct sctp_stream *stream;
>  
>  	/* Check if this message needs ordering.  */
>  	if (SCTP_DATA_UNORDERED & event->msg_flags)
> @@ -858,10 +858,10 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
>  	/* Note: The stream ID must be verified before this routine.  */
>  	sid = event->stream;
>  	ssn = event->ssn;
> -	in  = &ulpq->asoc->ssnmap->in;
> +	stream  = ulpq->asoc->stream;
>  
>  	/* Is this the expected SSN for this stream ID?  */
> -	if (ssn != sctp_ssn_peek(in, sid)) {
> +	if (ssn != sctp_ssn_peek(stream, in, sid)) {
>  		/* We've received something out of order, so find where it
>  		 * needs to be placed.  We order by stream and then by SSN.
>  		 */
> @@ -870,7 +870,7 @@ static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq,
>  	}
>  
>  	/* Mark that the next chunk has been found.  */
> -	sctp_ssn_next(in, sid);
> +	sctp_ssn_next(stream, in, sid);
>  
>  	/* Go find any other chunks that were waiting for
>  	 * ordering.
> @@ -888,12 +888,12 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
>  	struct sk_buff *pos, *tmp;
>  	struct sctp_ulpevent *cevent;
>  	struct sctp_ulpevent *event;
> -	struct sctp_stream *in;
> +	struct sctp_stream *stream;
>  	struct sk_buff_head temp;
>  	struct sk_buff_head *lobby = &ulpq->lobby;
>  	__u16 csid, cssn;
>  
> -	in  = &ulpq->asoc->ssnmap->in;
> +	stream = ulpq->asoc->stream;
>  
>  	/* We are holding the chunks by stream, by SSN.  */
>  	skb_queue_head_init(&temp);
> @@ -912,7 +912,7 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
>  			continue;
>  
>  		/* see if this ssn has been marked by skipping */
> -		if (!SSN_lt(cssn, sctp_ssn_peek(in, csid)))
> +		if (!SSN_lt(cssn, sctp_ssn_peek(stream, in, csid)))
>  			break;
>  
>  		__skb_unlink(pos, lobby);
> @@ -932,8 +932,8 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
>  		csid = cevent->stream;
>  		cssn = cevent->ssn;
>  
> -		if (csid == sid && cssn == sctp_ssn_peek(in, csid)) {
> -			sctp_ssn_next(in, csid);
> +		if (csid == sid && cssn == sctp_ssn_peek(stream, in, csid)) {
> +			sctp_ssn_next(stream, in, csid);
>  			__skb_unlink(pos, lobby);
>  			__skb_queue_tail(&temp, pos);
>  			event = sctp_skb2event(pos);
> @@ -955,17 +955,17 @@ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid)
>   */
>  void sctp_ulpq_skip(struct sctp_ulpq *ulpq, __u16 sid, __u16 ssn)
>  {
> -	struct sctp_stream *in;
> +	struct sctp_stream *stream;
>  
>  	/* Note: The stream ID must be verified before this routine.  */
> -	in  = &ulpq->asoc->ssnmap->in;
> +	stream  = ulpq->asoc->stream;
>  
>  	/* Is this an old SSN?  If so ignore. */
> -	if (SSN_lt(ssn, sctp_ssn_peek(in, sid)))
> +	if (SSN_lt(ssn, sctp_ssn_peek(stream, in, sid)))
>  		return;
>  
>  	/* Mark that we are no longer expecting this SSN or lower. */
> -	sctp_ssn_skip(in, sid, ssn);
> +	sctp_ssn_skip(stream, in, sid, ssn);
>  
>  	/* Go find any other chunks that were waiting for
>  	 * ordering and deliver them if needed.
> -- 
> 2.1.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-sctp" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply

* [PATCH net-next 4/4] syncookies: use SipHash in place of SHA1
From: Jason A. Donenfeld @ 2017-01-06 18:37 UTC (permalink / raw)
  To: davem, netdev, linux-kernel; +Cc: Jason A. Donenfeld, Eric Dumazet
In-Reply-To: <20170106183716.5567-1-Jason@zx2c4.com>

SHA1 is slower and less secure than SipHash, and so replacing syncookie
generation with SipHash makes natural sense. Some BSDs have been doing
this for several years in fact.

The speedup should be similar to -- and even more impressive than -- the
speedup from the sequence number fix in this series.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: David Miller <davem@davemloft.net>
---
 net/ipv4/syncookies.c | 20 ++++----------------
 net/ipv6/syncookies.c | 37 ++++++++++++++++---------------------
 2 files changed, 20 insertions(+), 37 deletions(-)

diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 3e88467d70ee..03bb068f8888 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -13,13 +13,13 @@
 #include <linux/tcp.h>
 #include <linux/slab.h>
 #include <linux/random.h>
-#include <linux/cryptohash.h>
+#include <linux/siphash.h>
 #include <linux/kernel.h>
 #include <linux/export.h>
 #include <net/tcp.h>
 #include <net/route.h>
 
-static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
+static siphash_key_t syncookie_secret[2] __read_mostly;
 
 #define COOKIEBITS 24	/* Upper bits store count */
 #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
@@ -48,24 +48,12 @@ static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
 #define TSBITS	6
 #define TSMASK	(((__u32)1 << TSBITS) - 1)
 
-static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv4_cookie_scratch);
-
 static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
 		       u32 count, int c)
 {
-	__u32 *tmp;
-
 	net_get_random_once(syncookie_secret, sizeof(syncookie_secret));
-
-	tmp  = this_cpu_ptr(ipv4_cookie_scratch);
-	memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c]));
-	tmp[0] = (__force u32)saddr;
-	tmp[1] = (__force u32)daddr;
-	tmp[2] = ((__force u32)sport << 16) + (__force u32)dport;
-	tmp[3] = count;
-	sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
-
-	return tmp[17];
+	return siphash_4u32(saddr, daddr, (u32)sport << 16 | dport, count,
+			    syncookie_secret[c]);
 }
 
 
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index a4d49760bf43..be51fc0d99ad 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -16,7 +16,7 @@
 
 #include <linux/tcp.h>
 #include <linux/random.h>
-#include <linux/cryptohash.h>
+#include <linux/siphash.h>
 #include <linux/kernel.h>
 #include <net/ipv6.h>
 #include <net/tcp.h>
@@ -24,7 +24,7 @@
 #define COOKIEBITS 24	/* Upper bits store count */
 #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
 
-static u32 syncookie6_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
+static siphash_key_t syncookie6_secret[2] __read_mostly;
 
 /* RFC 2460, Section 8.3:
  * [ipv6 tcp] MSS must be computed as the maximum packet size minus 60 [..]
@@ -41,30 +41,25 @@ static __u16 const msstab[] = {
 	9000 - 60,
 };
 
-static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv6_cookie_scratch);
-
 static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr,
 		       __be16 sport, __be16 dport, u32 count, int c)
 {
-	__u32 *tmp;
+	const struct {
+		struct in6_addr saddr;
+		struct in6_addr daddr;
+		u32 count;
+		u16 sport;
+		u16 dport;
+	} __aligned(SIPHASH_ALIGNMENT) combined = {
+		.saddr = *saddr,
+		.daddr = *daddr,
+		.count = count,
+		.sport = sport,
+		.dport = dport
+	};
 
 	net_get_random_once(syncookie6_secret, sizeof(syncookie6_secret));
-
-	tmp  = this_cpu_ptr(ipv6_cookie_scratch);
-
-	/*
-	 * we have 320 bits of information to hash, copy in the remaining
-	 * 192 bits required for sha_transform, from the syncookie6_secret
-	 * and overwrite the digest with the secret
-	 */
-	memcpy(tmp + 10, syncookie6_secret[c], 44);
-	memcpy(tmp, saddr, 16);
-	memcpy(tmp + 4, daddr, 16);
-	tmp[8] = ((__force u32)sport << 16) + (__force u32)dport;
-	tmp[9] = count;
-	sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
-
-	return tmp[17];
+	return siphash(&combined, offsetofend(typeof(combined), dport), syncookie6_secret[c]);
 }
 
 static __u32 secure_tcp_syn_cookie(const struct in6_addr *saddr,
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 2/4] siphash: implement HalfSipHash1-3 for hash tables
From: Jason A. Donenfeld @ 2017-01-06 18:37 UTC (permalink / raw)
  To: davem, netdev, linux-kernel; +Cc: Jason A. Donenfeld, Jean-Philippe Aumasson
In-Reply-To: <20170106183716.5567-1-Jason@zx2c4.com>

HalfSipHash, or hsiphash, is a shortened version of SipHash, which
generates 32-bit outputs using a weaker 64-bit key. It has *much* lower
security margins, and shouldn't be used for anything too sensitive, but
it could be used as a hashtable key function replacement, if the output
is never exposed, and if the security requirement is not too high.

The goal is to make this something that performance-critical jhash users
would be willing to use.

On 64-bit machines, HalfSipHash1-3 is slower than SipHash1-3, so we alias
hsiphash() to SipHash1-3 on those systems.

64-bit x86_64:
[    0.509409] test_siphash:     SipHash2-4 cycles: 4049181
[    0.510650] test_siphash:     SipHash1-3 cycles: 2512884
[    0.512205] test_siphash: HalfSipHash1-3 cycles: 3429920
[    0.512904] test_siphash:    JenkinsHash cycles:  978267
So, we map hsiphash() -> SipHash1-3

32-bit x86:
[    0.509868] test_siphash:     SipHash2-4 cycles: 14812892
[    0.513601] test_siphash:     SipHash1-3 cycles:  9510710
[    0.515263] test_siphash: HalfSipHash1-3 cycles:  3856157
[    0.515952] test_siphash:    JenkinsHash cycles:  1148567
So, we map hsiphash() -> HalfSipHash1-3

hsiphash() is roughly 3 times slower than jhash(), but comes with a
considerable security improvement.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
---
 Documentation/siphash.txt |  75 +++++++++++
 include/linux/siphash.h   |  56 +++++++-
 lib/siphash.c             | 318 +++++++++++++++++++++++++++++++++++++++++++++-
 lib/test_siphash.c        | 139 ++++++++++++++++----
 4 files changed, 561 insertions(+), 27 deletions(-)

diff --git a/Documentation/siphash.txt b/Documentation/siphash.txt
index 39ff7f0438e7..f93c1d7104c4 100644
--- a/Documentation/siphash.txt
+++ b/Documentation/siphash.txt
@@ -77,3 +77,78 @@ Linux implements the "2-4" variant of SipHash.
 
 Read the SipHash paper if you're interested in learning more:
 https://131002.net/siphash/siphash.pdf
+
+
+~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~=~
+
+HalfSipHash - SipHash's insecure younger cousin
+-----------------------------------------------
+Written by Jason A. Donenfeld <jason@zx2c4.com>
+
+On the off-chance that SipHash is not fast enough for your needs, you might be
+able to justify using HalfSipHash, a terrifying but potentially useful
+possibility. HalfSipHash cuts SipHash's rounds down from "2-4" to "1-3" and,
+even scarier, uses an easily brute-forceable 64-bit key (with a 32-bit output)
+instead of SipHash's 128-bit key. However, this may appeal to some
+high-performance `jhash` users.
+
+Danger!
+
+Do not ever use HalfSipHash except as a hashtable key function, and only
+then when you can be absolutely certain that the outputs will never be
+transmitted out of the kernel. This is only remotely useful over `jhash` as a
+means of mitigating hashtable flooding denial of service attacks.
+
+1. Generating a key
+
+Keys should always be generated from a cryptographically secure source of
+random numbers, either using get_random_bytes or get_random_once:
+
+hsiphash_key_t key;
+get_random_bytes(key, sizeof(key));
+
+If you're not deriving your key from here, you're doing it wrong.
+
+2. Using the functions
+
+There are two variants of the function, one that takes a list of integers, and
+one that takes a buffer:
+
+u32 hsiphash(const void *data, size_t len, hsiphash_key_t key);
+
+And:
+
+u32 hsiphash_1u32(u32, hsiphash_key_t key);
+u32 hsiphash_2u32(u32, u32, hsiphash_key_t key);
+u32 hsiphash_3u32(u32, u32, u32, hsiphash_key_t key);
+u32 hsiphash_4u32(u32, u32, u32, u32, hsiphash_key_t key);
+
+If you pass the generic hsiphash function something of a constant length, it
+will constant fold at compile-time and automatically choose one of the
+optimized functions.
+
+3. Hashtable key function usage:
+
+struct some_hashtable {
+	DECLARE_HASHTABLE(hashtable, 8);
+	hsiphash_key_t key;
+};
+
+void init_hashtable(struct some_hashtable *table)
+{
+	get_random_bytes(table->key, sizeof(table->key));
+}
+
+static inline hlist_head *some_hashtable_bucket(struct some_hashtable *table, struct interesting_input *input)
+{
+	return &table->hashtable[hsiphash(input, sizeof(*input), table->key) & (HASH_SIZE(table->hashtable) - 1)];
+}
+
+You may then iterate like usual over the returned hash bucket.
+
+4. Performance
+
+HalfSipHash is roughly 3 times slower than JenkinsHash. For many replacements,
+this will not be a problem, as the hashtable lookup isn't the bottleneck. And
+in general, this is probably a good sacrifice to make for the security and DoS
+resistance of HalfSipHash.
diff --git a/include/linux/siphash.h b/include/linux/siphash.h
index 7aa666eb00d9..efab44c654f3 100644
--- a/include/linux/siphash.h
+++ b/include/linux/siphash.h
@@ -5,7 +5,9 @@
  * SipHash: a fast short-input PRF
  * https://131002.net/siphash/
  *
- * This implementation is specifically for SipHash2-4.
+ * This implementation is specifically for SipHash2-4 for a secure PRF
+ * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for
+ * hashtables.
  */
 
 #ifndef _LINUX_SIPHASH_H
@@ -76,4 +78,56 @@ static inline u64 siphash(const void *data, size_t len, const siphash_key_t key)
 	return ___siphash_aligned(data, len, key);
 }
 
+#if BITS_PER_LONG == 64
+typedef siphash_key_t hsiphash_key_t;
+#define HSIPHASH_ALIGNMENT SIPHASH_ALIGNMENT
+#else
+typedef u32 hsiphash_key_t[2];
+#define HSIPHASH_ALIGNMENT __alignof__(u32)
+#endif
+
+u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t key);
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t key);
+#endif
+
+u32 hsiphash_1u32(const u32 a, const hsiphash_key_t key);
+u32 hsiphash_2u32(const u32 a, const u32 b, const hsiphash_key_t key);
+u32 hsiphash_3u32(const u32 a, const u32 b, const u32 c,
+		  const hsiphash_key_t key);
+u32 hsiphash_4u32(const u32 a, const u32 b, const u32 c, const u32 d,
+		  const hsiphash_key_t key);
+
+static inline u32 ___hsiphash_aligned(const __le32 *data, size_t len, const hsiphash_key_t key)
+{
+	if (__builtin_constant_p(len) && len == 4)
+		return hsiphash_1u32(le32_to_cpu(data[0]), key);
+	if (__builtin_constant_p(len) && len == 8)
+		return hsiphash_2u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]),
+				     key);
+	if (__builtin_constant_p(len) && len == 12)
+		return hsiphash_3u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]),
+				     le32_to_cpu(data[2]), key);
+	if (__builtin_constant_p(len) && len == 16)
+		return hsiphash_4u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]),
+				     le32_to_cpu(data[2]), le32_to_cpu(data[3]),
+				     key);
+	return __hsiphash_aligned(data, len, key);
+}
+
+/**
+ * hsiphash - compute 32-bit hsiphash PRF value
+ * @data: buffer to hash
+ * @len: size of @data
+ * @key: the hsiphash key
+ */
+static inline u32 hsiphash(const void *data, size_t len, const hsiphash_key_t key)
+{
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	if (!IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT))
+		return __hsiphash_unaligned(data, len, key);
+#endif
+	return ___hsiphash_aligned(data, len, key);
+}
+
 #endif /* _LINUX_SIPHASH_H */
diff --git a/lib/siphash.c b/lib/siphash.c
index ff2151313667..e2481226d96c 100644
--- a/lib/siphash.c
+++ b/lib/siphash.c
@@ -5,7 +5,9 @@
  * SipHash: a fast short-input PRF
  * https://131002.net/siphash/
  *
- * This implementation is specifically for SipHash2-4.
+ * This implementation is specifically for SipHash2-4 for a secure PRF
+ * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for
+ * hashtables.
  */
 
 #include <linux/siphash.h>
@@ -230,3 +232,317 @@ u64 siphash_3u32(const u32 first, const u32 second, const u32 third,
 	POSTAMBLE
 }
 EXPORT_SYMBOL(siphash_3u32);
+
+#if BITS_PER_LONG == 64
+/* Note that this HalfSipHash1-3 implementation on 64-bit
+ * isn't actually HalfSipHash1-3 but rather SipHash1-3. */
+
+#define HSIPROUND SIPROUND
+#define HPREAMBLE(len) PREAMBLE(len)
+#define HPOSTAMBLE \
+	v3 ^= b; \
+	HSIPROUND; \
+	v0 ^= b; \
+	v2 ^= 0xff; \
+	HSIPROUND; \
+	HSIPROUND; \
+	HSIPROUND; \
+	return (v0 ^ v1) ^ (v2 ^ v3);
+
+u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t key)
+{
+	const u8 *end = data + len - (len % sizeof(u64));
+	const u8 left = len & (sizeof(u64) - 1);
+	u64 m;
+	HPREAMBLE(len)
+	for (; data != end; data += sizeof(u64)) {
+		m = le64_to_cpup(data);
+		v3 ^= m;
+		HSIPROUND;
+		v0 ^= m;
+	}
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+	if (left)
+		b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
+						  bytemask_from_count(left)));
+#else
+	switch (left) {
+	case 7: b |= ((u64)end[6]) << 48;
+	case 6: b |= ((u64)end[5]) << 40;
+	case 5: b |= ((u64)end[4]) << 32;
+	case 4: b |= le32_to_cpup(data); break;
+	case 3: b |= ((u64)end[2]) << 16;
+	case 2: b |= le16_to_cpup(data); break;
+	case 1: b |= end[0];
+	}
+#endif
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(__hsiphash_aligned);
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t key)
+{
+	const u8 *end = data + len - (len % sizeof(u64));
+	const u8 left = len & (sizeof(u64) - 1);
+	u64 m;
+	HPREAMBLE(len)
+	for (; data != end; data += sizeof(u64)) {
+		m = get_unaligned_le64(data);
+		v3 ^= m;
+		HSIPROUND;
+		v0 ^= m;
+	}
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+	if (left)
+		b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
+						  bytemask_from_count(left)));
+#else
+	switch (left) {
+	case 7: b |= ((u64)end[6]) << 48;
+	case 6: b |= ((u64)end[5]) << 40;
+	case 5: b |= ((u64)end[4]) << 32;
+	case 4: b |= get_unaligned_le32(end); break;
+	case 3: b |= ((u64)end[2]) << 16;
+	case 2: b |= get_unaligned_le16(end); break;
+	case 1: b |= end[0];
+	}
+#endif
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(__hsiphash_unaligned);
+#endif
+
+/**
+ * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32
+ * @first: first u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_1u32(const u32 first, const hsiphash_key_t key)
+{
+	HPREAMBLE(4)
+	b |= first;
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_1u32);
+
+/**
+ * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32
+ * @first: first u32
+ * @second: second u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t key)
+{
+	u64 combined = (u64)second << 32 | first;
+	HPREAMBLE(8)
+	v3 ^= combined;
+	HSIPROUND;
+	v0 ^= combined;
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_2u32);
+
+/**
+ * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32
+ * @first: first u32
+ * @second: second u32
+ * @third: third u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third,
+		  const hsiphash_key_t key)
+{
+	u64 combined = (u64)second << 32 | first;
+	HPREAMBLE(12)
+	v3 ^= combined;
+	HSIPROUND;
+	v0 ^= combined;
+	b |= third;
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_3u32);
+
+/**
+ * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32
+ * @first: first u32
+ * @second: second u32
+ * @third: third u32
+ * @forth: fourth u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third,
+		  const u32 forth, const hsiphash_key_t key)
+{
+	u64 combined = (u64)second << 32 | first;
+	HPREAMBLE(16)
+	v3 ^= combined;
+	HSIPROUND;
+	v0 ^= combined;
+	combined = (u64)forth << 32 | third;
+	v3 ^= combined;
+	HSIPROUND;
+	v0 ^= combined;
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_4u32);
+#else
+#define HSIPROUND \
+	do { \
+	v0 += v1; v1 = rol32(v1, 5); v1 ^= v0; v0 = rol32(v0, 16); \
+	v2 += v3; v3 = rol32(v3, 8); v3 ^= v2; \
+	v0 += v3; v3 = rol32(v3, 7); v3 ^= v0; \
+	v2 += v1; v1 = rol32(v1, 13); v1 ^= v2; v2 = rol32(v2, 16); \
+	} while(0)
+
+#define HPREAMBLE(len) \
+	u32 v0 = 0; \
+	u32 v1 = 0; \
+	u32 v2 = 0x6c796765U; \
+	u32 v3 = 0x74656462U; \
+	u32 b = ((u32)len) << 24; \
+	v3 ^= key[1]; \
+	v2 ^= key[0]; \
+	v1 ^= key[1]; \
+	v0 ^= key[0];
+
+#define HPOSTAMBLE \
+	v3 ^= b; \
+	HSIPROUND; \
+	v0 ^= b; \
+	v2 ^= 0xff; \
+	HSIPROUND; \
+	HSIPROUND; \
+	HSIPROUND; \
+	return v1 ^ v3;
+
+u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t key)
+{
+	const u8 *end = data + len - (len % sizeof(u32));
+	const u8 left = len & (sizeof(u32) - 1);
+	u32 m;
+	HPREAMBLE(len)
+	for (; data != end; data += sizeof(u32)) {
+		m = le32_to_cpup(data);
+		v3 ^= m;
+		HSIPROUND;
+		v0 ^= m;
+	}
+	switch (left) {
+	case 3: b |= ((u32)end[2]) << 16;
+	case 2: b |= le16_to_cpup(data); break;
+	case 1: b |= end[0];
+	}
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(__hsiphash_aligned);
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t key)
+{
+	const u8 *end = data + len - (len % sizeof(u32));
+	const u8 left = len & (sizeof(u32) - 1);
+	u32 m;
+	HPREAMBLE(len)
+	for (; data != end; data += sizeof(u32)) {
+		m = get_unaligned_le32(data);
+		v3 ^= m;
+		HSIPROUND;
+		v0 ^= m;
+	}
+	switch (left) {
+	case 3: b |= ((u32)end[2]) << 16;
+	case 2: b |= get_unaligned_le16(end); break;
+	case 1: b |= end[0];
+	}
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(__hsiphash_unaligned);
+#endif
+
+/**
+ * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32
+ * @first: first u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_1u32(const u32 first, const hsiphash_key_t key)
+{
+	HPREAMBLE(4)
+	v3 ^= first;
+	HSIPROUND;
+	v0 ^= first;
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_1u32);
+
+/**
+ * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32
+ * @first: first u32
+ * @second: second u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t key)
+{
+	HPREAMBLE(8)
+	v3 ^= first;
+	HSIPROUND;
+	v0 ^= first;
+	v3 ^= second;
+	HSIPROUND;
+	v0 ^= second;
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_2u32);
+
+/**
+ * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32
+ * @first: first u32
+ * @second: second u32
+ * @third: third u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third,
+		  const hsiphash_key_t key)
+{
+	HPREAMBLE(12)
+	v3 ^= first;
+	HSIPROUND;
+	v0 ^= first;
+	v3 ^= second;
+	HSIPROUND;
+	v0 ^= second;
+	v3 ^= third;
+	HSIPROUND;
+	v0 ^= third;
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_3u32);
+
+/**
+ * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32
+ * @first: first u32
+ * @second: second u32
+ * @third: third u32
+ * @forth: fourth u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third,
+		  const u32 forth, const hsiphash_key_t key)
+{
+	HPREAMBLE(16)
+	v3 ^= first;
+	HSIPROUND;
+	v0 ^= first;
+	v3 ^= second;
+	HSIPROUND;
+	v0 ^= second;
+	v3 ^= third;
+	HSIPROUND;
+	v0 ^= third;
+	v3 ^= forth;
+	HSIPROUND;
+	v0 ^= forth;
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_4u32);
+#endif
diff --git a/lib/test_siphash.c b/lib/test_siphash.c
index e0ba2cf8dc67..ac291ec27fb6 100644
--- a/lib/test_siphash.c
+++ b/lib/test_siphash.c
@@ -7,7 +7,9 @@
  * SipHash: a fast short-input PRF
  * https://131002.net/siphash/
  *
- * This implementation is specifically for SipHash2-4.
+ * This implementation is specifically for SipHash2-4 for a secure PRF
+ * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for
+ * hashtables.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -18,10 +20,16 @@
 #include <linux/errno.h>
 #include <linux/module.h>
 
-/* Test vectors taken from official reference source available at:
- *     https://131002.net/siphash/siphash24.c
+/* Test vectors taken from reference source available at:
+ *     https://github.com/veorq/SipHash
  */
-static const u64 test_vectors[64] = {
+
+
+
+static const siphash_key_t test_key_siphash =
+	{ 0x0706050403020100ULL , 0x0f0e0d0c0b0a0908ULL };
+
+static const u64 test_vectors_siphash[64] = {
 	0x726fdb47dd0e0e31ULL, 0x74f839c593dc67fdULL, 0x0d6c8009d9a94f5aULL,
 	0x85676696d7fb7e2dULL, 0xcf2794e0277187b7ULL, 0x18765564cd99a68dULL,
 	0xcbc9466e58fee3ceULL, 0xab0200f58b01d137ULL, 0x93f5f5799a932462ULL,
@@ -45,9 +53,64 @@ static const u64 test_vectors[64] = {
 	0x6ca4ecb15c5f91e1ULL, 0x9f626da15c9625f3ULL, 0xe51b38608ef25f57ULL,
 	0x958a324ceb064572ULL
 };
-static const siphash_key_t test_key =
+#if BITS_PER_LONG == 64
+static const hsiphash_key_t test_key_hsiphash =
 	{ 0x0706050403020100ULL , 0x0f0e0d0c0b0a0908ULL };
 
+static const u32 test_vectors_hsiphash[64] = {
+	0x050fc4dcU, 0x7d57ca93U, 0x4dc7d44dU,
+	0xe7ddf7fbU, 0x88d38328U, 0x49533b67U,
+	0xc59f22a7U, 0x9bb11140U, 0x8d299a8eU,
+	0x6c063de4U, 0x92ff097fU, 0xf94dc352U,
+	0x57b4d9a2U, 0x1229ffa7U, 0xc0f95d34U,
+	0x2a519956U, 0x7d908b66U, 0x63dbd80cU,
+	0xb473e63eU, 0x8d297d1cU, 0xa6cce040U,
+	0x2b45f844U, 0xa320872eU, 0xdae6c123U,
+	0x67349c8cU, 0x705b0979U, 0xca9913a5U,
+	0x4ade3b35U, 0xef6cd00dU, 0x4ab1e1f4U,
+	0x43c5e663U, 0x8c21d1bcU, 0x16a7b60dU,
+	0x7a8ff9bfU, 0x1f2a753eU, 0xbf186b91U,
+	0xada26206U, 0xa3c33057U, 0xae3a36a1U,
+	0x7b108392U, 0x99e41531U, 0x3f1ad944U,
+	0xc8138825U, 0xc28949a6U, 0xfaf8876bU,
+	0x9f042196U, 0x68b1d623U, 0x8b5114fdU,
+	0xdf074c46U, 0x12cc86b3U, 0x0a52098fU,
+	0x9d292f9aU, 0xa2f41f12U, 0x43a71ed0U,
+	0x73f0bce6U, 0x70a7e980U, 0x243c6d75U,
+	0xfdb71513U, 0xa67d8a08U, 0xb7e8f148U,
+	0xf7a644eeU, 0x0f1837f2U, 0x4b6694e0U,
+	0xb7bbb3a8U
+};
+#else
+static const hsiphash_key_t test_key_hsiphash =
+	{ 0x03020100U, 0x07060504U };
+
+static const u32 test_vectors_hsiphash[64] = {
+	0x5814c896U, 0xe7e864caU, 0xbc4b0e30U,
+	0x01539939U, 0x7e059ea6U, 0x88e3d89bU,
+	0xa0080b65U, 0x9d38d9d6U, 0x577999b1U,
+	0xc839caedU, 0xe4fa32cfU, 0x959246eeU,
+	0x6b28096cU, 0x66dd9cd6U, 0x16658a7cU,
+	0xd0257b04U, 0x8b31d501U, 0x2b1cd04bU,
+	0x06712339U, 0x522aca67U, 0x911bb605U,
+	0x90a65f0eU, 0xf826ef7bU, 0x62512debU,
+	0x57150ad7U, 0x5d473507U, 0x1ec47442U,
+	0xab64afd3U, 0x0a4100d0U, 0x6d2ce652U,
+	0x2331b6a3U, 0x08d8791aU, 0xbc6dda8dU,
+	0xe0f6c934U, 0xb0652033U, 0x9b9851ccU,
+	0x7c46fb7fU, 0x732ba8cbU, 0xf142997aU,
+	0xfcc9aa1bU, 0x05327eb2U, 0xe110131cU,
+	0xf9e5e7c0U, 0xa7d708a6U, 0x11795ab1U,
+	0x65671619U, 0x9f5fff91U, 0xd89c5267U,
+	0x007783ebU, 0x95766243U, 0xab639262U,
+	0x9c7e1390U, 0xc368dda6U, 0x38ddc455U,
+	0xfa13d379U, 0x979ea4e8U, 0x53ecd77eU,
+	0x2ee80657U, 0x33dbb66aU, 0xae3f0577U,
+	0x88b4c4ccU, 0x3e7f480bU, 0x74c1ebf8U,
+	0x87178304U
+};
+#endif
+
 static int __init siphash_test_init(void)
 {
 	u8 in[64] __aligned(SIPHASH_ALIGNMENT);
@@ -58,49 +121,75 @@ static int __init siphash_test_init(void)
 	for (i = 0; i < 64; ++i) {
 		in[i] = i;
 		in_unaligned[i + 1] = i;
-		if (siphash(in, i, test_key) != test_vectors[i]) {
-			pr_info("self-test aligned %u: FAIL\n", i + 1);
+		if (siphash(in, i, test_key_siphash) != test_vectors_siphash[i]) {
+			pr_info("siphash self-test aligned %u: FAIL\n", i + 1);
+			ret = -EINVAL;
+		}
+		if (siphash(in_unaligned + 1, i, test_key_siphash) != test_vectors_siphash[i]) {
+			pr_info("siphash self-test unaligned %u: FAIL\n", i + 1);
 			ret = -EINVAL;
 		}
-		if (siphash(in_unaligned + 1, i, test_key) != test_vectors[i]) {
-			pr_info("self-test unaligned %u: FAIL\n", i + 1);
+		if (hsiphash(in, i, test_key_hsiphash) != test_vectors_hsiphash[i]) {
+			pr_info("hsiphash self-test aligned %u: FAIL\n", i + 1);
+			ret = -EINVAL;
+		}
+		if (hsiphash(in_unaligned + 1, i, test_key_hsiphash) != test_vectors_hsiphash[i]) {
+			pr_info("hsiphash self-test unaligned %u: FAIL\n", i + 1);
 			ret = -EINVAL;
 		}
 	}
-	if (siphash_1u64(0x0706050403020100ULL, test_key) != test_vectors[8]) {
-		pr_info("self-test 1u64: FAIL\n");
+	if (siphash_1u64(0x0706050403020100ULL, test_key_siphash) != test_vectors_siphash[8]) {
+		pr_info("siphash self-test 1u64: FAIL\n");
 		ret = -EINVAL;
 	}
-	if (siphash_2u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL, test_key) != test_vectors[16]) {
-		pr_info("self-test 2u64: FAIL\n");
+	if (siphash_2u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL, test_key_siphash) != test_vectors_siphash[16]) {
+		pr_info("siphash self-test 2u64: FAIL\n");
 		ret = -EINVAL;
 	}
 	if (siphash_3u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL,
-			 0x1716151413121110ULL, test_key) != test_vectors[24]) {
-		pr_info("self-test 3u64: FAIL\n");
+			 0x1716151413121110ULL, test_key_siphash) != test_vectors_siphash[24]) {
+		pr_info("siphash self-test 3u64: FAIL\n");
 		ret = -EINVAL;
 	}
 	if (siphash_4u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL,
-			 0x1716151413121110ULL, 0x1f1e1d1c1b1a1918ULL, test_key) != test_vectors[32]) {
-		pr_info("self-test 4u64: FAIL\n");
+			 0x1716151413121110ULL, 0x1f1e1d1c1b1a1918ULL, test_key_siphash) != test_vectors_siphash[32]) {
+		pr_info("siphash self-test 4u64: FAIL\n");
 		ret = -EINVAL;
 	}
-	if (siphash_1u32(0x03020100U, test_key) != test_vectors[4]) {
-		pr_info("self-test 1u32: FAIL\n");
+	if (siphash_1u32(0x03020100U, test_key_siphash) != test_vectors_siphash[4]) {
+		pr_info("siphash self-test 1u32: FAIL\n");
 		ret = -EINVAL;
 	}
-	if (siphash_2u32(0x03020100U, 0x07060504U, test_key) != test_vectors[8]) {
-		pr_info("self-test 2u32: FAIL\n");
+	if (siphash_2u32(0x03020100U, 0x07060504U, test_key_siphash) != test_vectors_siphash[8]) {
+		pr_info("siphash self-test 2u32: FAIL\n");
 		ret = -EINVAL;
 	}
 	if (siphash_3u32(0x03020100U, 0x07060504U,
-			 0x0b0a0908U, test_key) != test_vectors[12]) {
-		pr_info("self-test 3u32: FAIL\n");
+			 0x0b0a0908U, test_key_siphash) != test_vectors_siphash[12]) {
+		pr_info("siphash self-test 3u32: FAIL\n");
 		ret = -EINVAL;
 	}
 	if (siphash_4u32(0x03020100U, 0x07060504U,
-			 0x0b0a0908U, 0x0f0e0d0cU, test_key) != test_vectors[16]) {
-		pr_info("self-test 4u32: FAIL\n");
+			 0x0b0a0908U, 0x0f0e0d0cU, test_key_siphash) != test_vectors_siphash[16]) {
+		pr_info("siphash self-test 4u32: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (hsiphash_1u32(0x03020100U, test_key_hsiphash) != test_vectors_hsiphash[4]) {
+		pr_info("hsiphash self-test 1u32: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (hsiphash_2u32(0x03020100U, 0x07060504U, test_key_hsiphash) != test_vectors_hsiphash[8]) {
+		pr_info("hsiphash self-test 2u32: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (hsiphash_3u32(0x03020100U, 0x07060504U,
+			  0x0b0a0908U, test_key_hsiphash) != test_vectors_hsiphash[12]) {
+		pr_info("hsiphash self-test 3u32: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (hsiphash_4u32(0x03020100U, 0x07060504U,
+			  0x0b0a0908U, 0x0f0e0d0cU, test_key_hsiphash) != test_vectors_hsiphash[16]) {
+		pr_info("hsiphash self-test 4u32: FAIL\n");
 		ret = -EINVAL;
 	}
 	if (!ret)
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 3/4] secure_seq: use SipHash in place of MD5
From: Jason A. Donenfeld @ 2017-01-06 18:37 UTC (permalink / raw)
  To: davem, netdev, linux-kernel
  Cc: Jason A. Donenfeld, Andi Kleen, David Laight, Tom Herbert,
	Hannes Frederic Sowa, Eric Dumazet
In-Reply-To: <20170106183716.5567-1-Jason@zx2c4.com>

This gives a clear speed and security improvement. SipHash is both
faster and more solid crypto than the aging MD5.

Rather than manually filling MD5 buffers, for IPv6, we simply create
a layout by a simple anonymous struct, for which gcc generates
rather efficient code. For IPv4, we pass the values directly to the
short input convenience functions.

64-bit x86_64:
[    1.683628] secure_tcpv6_sequence_number_md5# cycles: 99563527
[    1.717350] secure_tcp_sequence_number_md5# cycles: 92890502
[    1.741968] secure_tcpv6_sequence_number_siphash# cycles: 67825362
[    1.762048] secure_tcp_sequence_number_siphash# cycles: 67485526

32-bit x86:
[    1.600012] secure_tcpv6_sequence_number_md5# cycles: 103227892
[    1.634219] secure_tcp_sequence_number_md5# cycles: 94732544
[    1.669102] secure_tcpv6_sequence_number_siphash# cycles: 96299384
[    1.700165] secure_tcp_sequence_number_siphash# cycles: 86015473

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: David Miller <davem@davemloft.net>
Cc: David Laight <David.Laight@aculab.com>
Cc: Tom Herbert <tom@herbertland.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/core/secure_seq.c | 135 ++++++++++++++++++++------------------------------
 1 file changed, 54 insertions(+), 81 deletions(-)

diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 88a8e429fc3e..3dc2689bcc64 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -1,3 +1,5 @@
+/* Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */
+
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/cryptohash.h>
@@ -8,14 +10,14 @@
 #include <linux/ktime.h>
 #include <linux/string.h>
 #include <linux/net.h>
-
+#include <linux/siphash.h>
 #include <net/secure_seq.h>
 
 #if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET)
+#include <linux/in6.h>
 #include <net/tcp.h>
-#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4)
 
-static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned;
+static siphash_key_t net_secret;
 
 static __always_inline void net_secret_init(void)
 {
@@ -44,80 +46,65 @@ static u32 seq_scale(u32 seq)
 u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
 				 __be16 sport, __be16 dport, u32 *tsoff)
 {
-	u32 secret[MD5_MESSAGE_BYTES / 4];
-	u32 hash[MD5_DIGEST_WORDS];
-	u32 i;
-
+	const struct {
+		struct in6_addr saddr;
+		struct in6_addr daddr;
+		__be16 sport;
+		__be16 dport;
+	} __aligned(SIPHASH_ALIGNMENT) combined = {
+		.saddr = *(struct in6_addr *)saddr,
+		.daddr = *(struct in6_addr *)daddr,
+		.sport = sport,
+		.dport = dport
+	};
+	u64 hash;
 	net_secret_init();
-	memcpy(hash, saddr, 16);
-	for (i = 0; i < 4; i++)
-		secret[i] = net_secret[i] + (__force u32)daddr[i];
-	secret[4] = net_secret[4] +
-		(((__force u16)sport << 16) + (__force u16)dport);
-	for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
-		secret[i] = net_secret[i];
-
-	md5_transform(hash, secret);
-
-	*tsoff = sysctl_tcp_timestamps == 1 ? hash[1] : 0;
-	return seq_scale(hash[0]);
+	hash = siphash(&combined, offsetofend(typeof(combined), dport), net_secret);
+	*tsoff = sysctl_tcp_timestamps == 1 ? (hash >> 32) : 0;
+	return seq_scale(hash);
 }
 EXPORT_SYMBOL(secure_tcpv6_sequence_number);
 
 u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
 			       __be16 dport)
 {
-	u32 secret[MD5_MESSAGE_BYTES / 4];
-	u32 hash[MD5_DIGEST_WORDS];
-	u32 i;
-
+	const struct {
+		struct in6_addr saddr;
+		struct in6_addr daddr;
+		__be16 dport;
+	} __aligned(SIPHASH_ALIGNMENT) combined = {
+		.saddr = *(struct in6_addr *)saddr,
+		.daddr = *(struct in6_addr *)daddr,
+		.dport = dport
+	};
 	net_secret_init();
-	memcpy(hash, saddr, 16);
-	for (i = 0; i < 4; i++)
-		secret[i] = net_secret[i] + (__force u32) daddr[i];
-	secret[4] = net_secret[4] + (__force u32)dport;
-	for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
-		secret[i] = net_secret[i];
-
-	md5_transform(hash, secret);
-
-	return hash[0];
+	return siphash(&combined, offsetofend(typeof(combined), dport), net_secret);
 }
 EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
 #endif
 
 #ifdef CONFIG_INET
 
+/* secure_tcp_sequence_number(a, b, 0, d) == secure_ipv4_port_ephemeral(a, b, d),
+ * but fortunately, `sport' cannot be 0 in any circumstances. If this changes,
+ * it would be easy enough to have the former function use siphash_4u32, passing
+ * the arguments as separate u32.
+ */
+
 u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
 			       __be16 sport, __be16 dport, u32 *tsoff)
 {
-	u32 hash[MD5_DIGEST_WORDS];
-
+	u64 hash;
 	net_secret_init();
-	hash[0] = (__force u32)saddr;
-	hash[1] = (__force u32)daddr;
-	hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
-	hash[3] = net_secret[15];
-
-	md5_transform(hash, net_secret);
-
-	*tsoff = sysctl_tcp_timestamps == 1 ? hash[1] : 0;
-	return seq_scale(hash[0]);
+	hash = siphash_3u32(saddr, daddr, (u32)sport << 16 | dport, net_secret);
+	*tsoff = sysctl_tcp_timestamps == 1 ? (hash >> 32) : 0;
+	return seq_scale(hash);
 }
 
 u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
 {
-	u32 hash[MD5_DIGEST_WORDS];
-
 	net_secret_init();
-	hash[0] = (__force u32)saddr;
-	hash[1] = (__force u32)daddr;
-	hash[2] = (__force u32)dport ^ net_secret[14];
-	hash[3] = net_secret[15];
-
-	md5_transform(hash, net_secret);
-
-	return hash[0];
+	return siphash_3u32(saddr, daddr, dport, net_secret);
 }
 EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
 #endif
@@ -126,21 +113,11 @@ EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
 u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
 				__be16 sport, __be16 dport)
 {
-	u32 hash[MD5_DIGEST_WORDS];
 	u64 seq;
-
 	net_secret_init();
-	hash[0] = (__force u32)saddr;
-	hash[1] = (__force u32)daddr;
-	hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
-	hash[3] = net_secret[15];
-
-	md5_transform(hash, net_secret);
-
-	seq = hash[0] | (((u64)hash[1]) << 32);
+	seq = siphash_3u32(saddr, daddr, (u32)sport << 16 | dport, net_secret);
 	seq += ktime_get_real_ns();
 	seq &= (1ull << 48) - 1;
-
 	return seq;
 }
 EXPORT_SYMBOL(secure_dccp_sequence_number);
@@ -149,26 +126,22 @@ EXPORT_SYMBOL(secure_dccp_sequence_number);
 u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
 				  __be16 sport, __be16 dport)
 {
-	u32 secret[MD5_MESSAGE_BYTES / 4];
-	u32 hash[MD5_DIGEST_WORDS];
+	const struct {
+		struct in6_addr saddr;
+		struct in6_addr daddr;
+		__be16 sport;
+		__be16 dport;
+	} __aligned(SIPHASH_ALIGNMENT) combined = {
+		.saddr = *(struct in6_addr *)saddr,
+		.daddr = *(struct in6_addr *)daddr,
+		.sport = sport,
+		.dport = dport
+	};
 	u64 seq;
-	u32 i;
-
 	net_secret_init();
-	memcpy(hash, saddr, 16);
-	for (i = 0; i < 4; i++)
-		secret[i] = net_secret[i] + (__force u32)daddr[i];
-	secret[4] = net_secret[4] +
-		(((__force u16)sport << 16) + (__force u16)dport);
-	for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
-		secret[i] = net_secret[i];
-
-	md5_transform(hash, secret);
-
-	seq = hash[0] | (((u64)hash[1]) << 32);
+	seq = siphash(&combined, offsetofend(typeof(combined), dport), net_secret);
 	seq += ktime_get_real_ns();
 	seq &= (1ull << 48) - 1;
-
 	return seq;
 }
 EXPORT_SYMBOL(secure_dccpv6_sequence_number);
-- 
2.11.0

^ permalink raw reply related

* [PATCH net-next 1/4] siphash: add cryptographically secure PRF
From: Jason A. Donenfeld @ 2017-01-06 18:37 UTC (permalink / raw)
  To: davem, netdev, linux-kernel
  Cc: Jason A. Donenfeld, Jean-Philippe Aumasson, Linus Torvalds,
	Eric Biggers, David Laight, Eric Dumazet

SipHash is a 64-bit keyed hash function that is actually a
cryptographically secure PRF, like HMAC. Except SipHash is super fast,
and is meant to be used as a hashtable keyed lookup function, or as a
general PRF for short input use cases, such as sequence numbers or RNG
chaining.

For the first usage:

There are a variety of attacks known as "hashtable poisoning" in which an
attacker forms some data such that the hash of that data will be the
same, and then proceeds to fill up all entries of a hashbucket. This is
a realistic and well-known denial-of-service vector. Currently
hashtables use jhash, which is fast but not secure, and some kind of
rotating key scheme (or none at all, which isn't good). SipHash is meant
as a replacement for jhash in these cases.

There are a modicum of places in the kernel that are vulnerable to
hashtable poisoning attacks, either via userspace vectors or network
vectors, and there's not a reliable mechanism inside the kernel at the
moment to fix it. The first step toward fixing these issues is actually
getting a secure primitive into the kernel for developers to use. Then
we can, bit by bit, port things over to it as deemed appropriate.

While SipHash is extremely fast for a cryptographically secure function,
it is likely a bit slower than the insecure jhash, and so replacements
will be evaluated on a case-by-case basis based on whether or not the
difference in speed is negligible and whether or not the current jhash usage
poses a real security risk.

For the second usage:

A few places in the kernel are using MD5 or SHA1 for creating secure
sequence numbers, syn cookies, port numbers, or fast random numbers.
SipHash is a faster, more fitting, and more secure replacement for MD5
in those situations. Replacing MD5 and SHA1 with SipHash for these uses is
obvious and straight-forward, and so is submitted along with this patch
series. There shouldn't be much of a debate over its efficacy.

Dozens of languages are already using this internally for their hash
tables and PRFs. Some of the BSDs already use this in their kernels.
SipHash is a widely known high-speed solution to a widely known set of
problems, and it's time we catch up.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: David Laight <David.Laight@aculab.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
---
 Documentation/siphash.txt |  79 ++++++++++++++++
 MAINTAINERS               |   7 ++
 include/linux/siphash.h   |  79 ++++++++++++++++
 lib/Kconfig.debug         |   6 +-
 lib/Makefile              |   5 +-
 lib/siphash.c             | 232 ++++++++++++++++++++++++++++++++++++++++++++++
 lib/test_siphash.c        | 119 ++++++++++++++++++++++++
 7 files changed, 522 insertions(+), 5 deletions(-)
 create mode 100644 Documentation/siphash.txt
 create mode 100644 include/linux/siphash.h
 create mode 100644 lib/siphash.c
 create mode 100644 lib/test_siphash.c

diff --git a/Documentation/siphash.txt b/Documentation/siphash.txt
new file mode 100644
index 000000000000..39ff7f0438e7
--- /dev/null
+++ b/Documentation/siphash.txt
@@ -0,0 +1,79 @@
+         SipHash - a short input PRF
+-----------------------------------------------
+Written by Jason A. Donenfeld <jason@zx2c4.com>
+
+SipHash is a cryptographically secure PRF -- a keyed hash function -- that
+performs very well for short inputs, hence the name. It was designed by
+cryptographers Daniel J. Bernstein and Jean-Philippe Aumasson. It is intended
+as a replacement for some uses of: `jhash`, `md5_transform`, `sha_transform`,
+and so forth.
+
+SipHash takes a secret key filled with randomly generated numbers and either
+an input buffer or several input integers. It spits out an integer that is
+indistinguishable from random. You may then use that integer as part of secure
+sequence numbers, secure cookies, or mask it off for use in a hash table.
+
+1. Generating a key
+
+Keys should always be generated from a cryptographically secure source of
+random numbers, either using get_random_bytes or get_random_once:
+
+siphash_key_t key;
+get_random_bytes(key, sizeof(key));
+
+If you're not deriving your key from here, you're doing it wrong.
+
+2. Using the functions
+
+There are two variants of the function, one that takes a list of integers, and
+one that takes a buffer:
+
+u64 siphash(const void *data, size_t len, siphash_key_t key);
+
+And:
+
+u64 siphash_1u64(u64, siphash_key_t key);
+u64 siphash_2u64(u64, u64, siphash_key_t key);
+u64 siphash_3u64(u64, u64, u64, siphash_key_t key);
+u64 siphash_4u64(u64, u64, u64, u64, siphash_key_t key);
+u64 siphash_1u32(u32, siphash_key_t key);
+u64 siphash_2u32(u32, u32, siphash_key_t key);
+u64 siphash_3u32(u32, u32, u32, siphash_key_t key);
+u64 siphash_4u32(u32, u32, u32, u32, siphash_key_t key);
+
+If you pass the generic siphash function something of a constant length, it
+will constant fold at compile-time and automatically choose one of the
+optimized functions.
+
+3. Hashtable key function usage:
+
+struct some_hashtable {
+	DECLARE_HASHTABLE(hashtable, 8);
+	siphash_key_t key;
+};
+
+void init_hashtable(struct some_hashtable *table)
+{
+	get_random_bytes(table->key, sizeof(table->key));
+}
+
+static inline hlist_head *some_hashtable_bucket(struct some_hashtable *table, struct interesting_input *input)
+{
+	return &table->hashtable[siphash(input, sizeof(*input), table->key) & (HASH_SIZE(table->hashtable) - 1)];
+}
+
+You may then iterate like usual over the returned hash bucket.
+
+4. Security
+
+SipHash has a very high security margin, with its 128-bit key. So long as the
+key is kept secret, it is impossible for an attacker to guess the outputs of
+the function, even when able to observe many outputs, since 2^128 outputs
+is significant.
+
+Linux implements the "2-4" variant of SipHash.
+
+5. Resources
+
+Read the SipHash paper if you're interested in learning more:
+https://131002.net/siphash/siphash.pdf
diff --git a/MAINTAINERS b/MAINTAINERS
index cfff2c9e3d94..e1384ae37344 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11291,6 +11291,13 @@ F:	arch/arm/mach-s3c24xx/mach-bast.c
 F:	arch/arm/mach-s3c24xx/bast-ide.c
 F:	arch/arm/mach-s3c24xx/bast-irq.c
 
+SIPHASH PRF ROUTINES
+M:	Jason A. Donenfeld <Jason@zx2c4.com>
+S:	Maintained
+F:	lib/siphash.c
+F:	lib/test_siphash.c
+F:	include/linux/siphash.h
+
 TI DAVINCI MACHINE SUPPORT
 M:	Sekhar Nori <nsekhar@ti.com>
 M:	Kevin Hilman <khilman@kernel.org>
diff --git a/include/linux/siphash.h b/include/linux/siphash.h
new file mode 100644
index 000000000000..7aa666eb00d9
--- /dev/null
+++ b/include/linux/siphash.h
@@ -0,0 +1,79 @@
+/* Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.
+ *
+ * SipHash: a fast short-input PRF
+ * https://131002.net/siphash/
+ *
+ * This implementation is specifically for SipHash2-4.
+ */
+
+#ifndef _LINUX_SIPHASH_H
+#define _LINUX_SIPHASH_H
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+
+#define SIPHASH_ALIGNMENT __alignof__(u64)
+typedef u64 siphash_key_t[2];
+
+u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t key);
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t key);
+#endif
+
+u64 siphash_1u64(const u64 a, const siphash_key_t key);
+u64 siphash_2u64(const u64 a, const u64 b, const siphash_key_t key);
+u64 siphash_3u64(const u64 a, const u64 b, const u64 c,
+		 const siphash_key_t key);
+u64 siphash_4u64(const u64 a, const u64 b, const u64 c, const u64 d,
+		 const siphash_key_t key);
+u64 siphash_1u32(const u32 a, const siphash_key_t key);
+u64 siphash_3u32(const u32 a, const u32 b, const u32 c, const siphash_key_t key);
+
+static inline u64 siphash_2u32(const u32 a, const u32 b, const siphash_key_t key)
+{
+	return siphash_1u64((u64)b << 32 | a, key);
+}
+static inline u64 siphash_4u32(const u32 a, const u32 b, const u32 c, const u32 d,
+			       const siphash_key_t key)
+{
+	return siphash_2u64((u64)b << 32 | a, (u64)d << 32 | c, key);
+}
+
+
+static inline u64 ___siphash_aligned(const __le64 *data, size_t len, const siphash_key_t key)
+{
+	if (__builtin_constant_p(len) && len == 4)
+		return siphash_1u32(le32_to_cpu(data[0]), key);
+	if (__builtin_constant_p(len) && len == 8)
+		return siphash_1u64(le64_to_cpu(data[0]), key);
+	if (__builtin_constant_p(len) && len == 16)
+		return siphash_2u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]),
+				    key);
+	if (__builtin_constant_p(len) && len == 24)
+		return siphash_3u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]),
+				    le64_to_cpu(data[2]), key);
+	if (__builtin_constant_p(len) && len == 32)
+		return siphash_4u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]),
+				    le64_to_cpu(data[2]), le64_to_cpu(data[3]),
+				    key);
+	return __siphash_aligned(data, len, key);
+}
+
+/**
+ * siphash - compute 64-bit siphash PRF value
+ * @data: buffer to hash
+ * @len: size of @data
+ * @key: the siphash key
+ */
+static inline u64 siphash(const void *data, size_t len, const siphash_key_t key)
+{
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	if (!IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT))
+		return __siphash_unaligned(data, len, key);
+#endif
+	return ___siphash_aligned(data, len, key);
+}
+
+#endif /* _LINUX_SIPHASH_H */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index b06848a104e6..3d2515a770c3 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1819,9 +1819,9 @@ config TEST_HASH
 	tristate "Perform selftest on hash functions"
 	default n
 	help
-	  Enable this option to test the kernel's integer (<linux/hash,h>)
-	  and string (<linux/stringhash.h>) hash functions on boot
-	  (or module load).
+	  Enable this option to test the kernel's integer (<linux/hash.h>),
+	  string (<linux/stringhash.h>), and siphash (<linux/siphash.h>)
+	  hash functions on boot (or module load).
 
 	  This is intended to help people writing architecture-specific
 	  optimized versions.  If unsure, say N.
diff --git a/lib/Makefile b/lib/Makefile
index bc4073a8cd08..7b3008d58600 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -22,7 +22,8 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 sha1.o chacha20.o md5.o irq_regs.o argv_split.o \
 	 flex_proportions.o ratelimit.o show_mem.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
-	 earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o win_minmax.o
+	 earlycpio.o seq_buf.o siphash.o \
+	 nmi_backtrace.o nodemask.o win_minmax.o
 
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
@@ -44,7 +45,7 @@ obj-$(CONFIG_TEST_HEXDUMP) += test_hexdump.o
 obj-y += kstrtox.o
 obj-$(CONFIG_TEST_BPF) += test_bpf.o
 obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o
-obj-$(CONFIG_TEST_HASH) += test_hash.o
+obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o
 obj-$(CONFIG_TEST_KASAN) += test_kasan.o
 obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o
 obj-$(CONFIG_TEST_LKM) += test_module.o
diff --git a/lib/siphash.c b/lib/siphash.c
new file mode 100644
index 000000000000..ff2151313667
--- /dev/null
+++ b/lib/siphash.c
@@ -0,0 +1,232 @@
+/* Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.
+ *
+ * SipHash: a fast short-input PRF
+ * https://131002.net/siphash/
+ *
+ * This implementation is specifically for SipHash2-4.
+ */
+
+#include <linux/siphash.h>
+#include <asm/unaligned.h>
+
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+#include <linux/dcache.h>
+#include <asm/word-at-a-time.h>
+#endif
+
+#define SIPROUND \
+	do { \
+	v0 += v1; v1 = rol64(v1, 13); v1 ^= v0; v0 = rol64(v0, 32); \
+	v2 += v3; v3 = rol64(v3, 16); v3 ^= v2; \
+	v0 += v3; v3 = rol64(v3, 21); v3 ^= v0; \
+	v2 += v1; v1 = rol64(v1, 17); v1 ^= v2; v2 = rol64(v2, 32); \
+	} while(0)
+
+#define PREAMBLE(len) \
+	u64 v0 = 0x736f6d6570736575ULL; \
+	u64 v1 = 0x646f72616e646f6dULL; \
+	u64 v2 = 0x6c7967656e657261ULL; \
+	u64 v3 = 0x7465646279746573ULL; \
+	u64 b = ((u64)len) << 56; \
+	v3 ^= key[1]; \
+	v2 ^= key[0]; \
+	v1 ^= key[1]; \
+	v0 ^= key[0];
+
+#define POSTAMBLE \
+	v3 ^= b; \
+	SIPROUND; \
+	SIPROUND; \
+	v0 ^= b; \
+	v2 ^= 0xff; \
+	SIPROUND; \
+	SIPROUND; \
+	SIPROUND; \
+	SIPROUND; \
+	return (v0 ^ v1) ^ (v2 ^ v3);
+
+u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t key)
+{
+	const u8 *end = data + len - (len % sizeof(u64));
+	const u8 left = len & (sizeof(u64) - 1);
+	u64 m;
+	PREAMBLE(len)
+	for (; data != end; data += sizeof(u64)) {
+		m = le64_to_cpup(data);
+		v3 ^= m;
+		SIPROUND;
+		SIPROUND;
+		v0 ^= m;
+	}
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+	if (left)
+		b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
+						  bytemask_from_count(left)));
+#else
+	switch (left) {
+	case 7: b |= ((u64)end[6]) << 48;
+	case 6: b |= ((u64)end[5]) << 40;
+	case 5: b |= ((u64)end[4]) << 32;
+	case 4: b |= le32_to_cpup(data); break;
+	case 3: b |= ((u64)end[2]) << 16;
+	case 2: b |= le16_to_cpup(data); break;
+	case 1: b |= end[0];
+	}
+#endif
+	POSTAMBLE
+}
+EXPORT_SYMBOL(__siphash_aligned);
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t key)
+{
+	const u8 *end = data + len - (len % sizeof(u64));
+	const u8 left = len & (sizeof(u64) - 1);
+	u64 m;
+	PREAMBLE(len)
+	for (; data != end; data += sizeof(u64)) {
+		m = get_unaligned_le64(data);
+		v3 ^= m;
+		SIPROUND;
+		SIPROUND;
+		v0 ^= m;
+	}
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+	if (left)
+		b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
+						  bytemask_from_count(left)));
+#else
+	switch (left) {
+	case 7: b |= ((u64)end[6]) << 48;
+	case 6: b |= ((u64)end[5]) << 40;
+	case 5: b |= ((u64)end[4]) << 32;
+	case 4: b |= get_unaligned_le32(end); break;
+	case 3: b |= ((u64)end[2]) << 16;
+	case 2: b |= get_unaligned_le16(end); break;
+	case 1: b |= end[0];
+	}
+#endif
+	POSTAMBLE
+}
+EXPORT_SYMBOL(__siphash_unaligned);
+#endif
+
+/**
+ * siphash_1u64 - compute 64-bit siphash PRF value of a u64
+ * @first: first u64
+ * @key: the siphash key
+ */
+u64 siphash_1u64(const u64 first, const siphash_key_t key)
+{
+	PREAMBLE(8)
+	v3 ^= first;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= first;
+	POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_1u64);
+
+/**
+ * siphash_2u64 - compute 64-bit siphash PRF value of 2 u64
+ * @first: first u64
+ * @second: second u64
+ * @key: the siphash key
+ */
+u64 siphash_2u64(const u64 first, const u64 second, const siphash_key_t key)
+{
+	PREAMBLE(16)
+	v3 ^= first;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= first;
+	v3 ^= second;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= second;
+	POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_2u64);
+
+/**
+ * siphash_3u64 - compute 64-bit siphash PRF value of 3 u64
+ * @first: first u64
+ * @second: second u64
+ * @third: third u64
+ * @key: the siphash key
+ */
+u64 siphash_3u64(const u64 first, const u64 second, const u64 third,
+		 const siphash_key_t key)
+{
+	PREAMBLE(24)
+	v3 ^= first;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= first;
+	v3 ^= second;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= second;
+	v3 ^= third;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= third;
+	POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_3u64);
+
+/**
+ * siphash_4u64 - compute 64-bit siphash PRF value of 4 u64
+ * @first: first u64
+ * @second: second u64
+ * @third: third u64
+ * @forth: forth u64
+ * @key: the siphash key
+ */
+u64 siphash_4u64(const u64 first, const u64 second, const u64 third,
+		 const u64 forth, const siphash_key_t key)
+{
+	PREAMBLE(32)
+	v3 ^= first;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= first;
+	v3 ^= second;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= second;
+	v3 ^= third;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= third;
+	v3 ^= forth;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= forth;
+	POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_4u64);
+
+u64 siphash_1u32(const u32 first, const siphash_key_t key)
+{
+	PREAMBLE(4)
+	b |= first;
+	POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_1u32);
+
+u64 siphash_3u32(const u32 first, const u32 second, const u32 third,
+		 const siphash_key_t key)
+{
+	u64 combined = (u64)second << 32 | first;
+	PREAMBLE(12)
+	v3 ^= combined;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= combined;
+	b |= third;
+	POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_3u32);
diff --git a/lib/test_siphash.c b/lib/test_siphash.c
new file mode 100644
index 000000000000..e0ba2cf8dc67
--- /dev/null
+++ b/lib/test_siphash.c
@@ -0,0 +1,119 @@
+/* Test cases for siphash.c
+ *
+ * Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.
+ *
+ * SipHash: a fast short-input PRF
+ * https://131002.net/siphash/
+ *
+ * This implementation is specifically for SipHash2-4.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/siphash.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+
+/* Test vectors taken from official reference source available at:
+ *     https://131002.net/siphash/siphash24.c
+ */
+static const u64 test_vectors[64] = {
+	0x726fdb47dd0e0e31ULL, 0x74f839c593dc67fdULL, 0x0d6c8009d9a94f5aULL,
+	0x85676696d7fb7e2dULL, 0xcf2794e0277187b7ULL, 0x18765564cd99a68dULL,
+	0xcbc9466e58fee3ceULL, 0xab0200f58b01d137ULL, 0x93f5f5799a932462ULL,
+	0x9e0082df0ba9e4b0ULL, 0x7a5dbbc594ddb9f3ULL, 0xf4b32f46226bada7ULL,
+	0x751e8fbc860ee5fbULL, 0x14ea5627c0843d90ULL, 0xf723ca908e7af2eeULL,
+	0xa129ca6149be45e5ULL, 0x3f2acc7f57c29bdbULL, 0x699ae9f52cbe4794ULL,
+	0x4bc1b3f0968dd39cULL, 0xbb6dc91da77961bdULL, 0xbed65cf21aa2ee98ULL,
+	0xd0f2cbb02e3b67c7ULL, 0x93536795e3a33e88ULL, 0xa80c038ccd5ccec8ULL,
+	0xb8ad50c6f649af94ULL, 0xbce192de8a85b8eaULL, 0x17d835b85bbb15f3ULL,
+	0x2f2e6163076bcfadULL, 0xde4daaaca71dc9a5ULL, 0xa6a2506687956571ULL,
+	0xad87a3535c49ef28ULL, 0x32d892fad841c342ULL, 0x7127512f72f27cceULL,
+	0xa7f32346f95978e3ULL, 0x12e0b01abb051238ULL, 0x15e034d40fa197aeULL,
+	0x314dffbe0815a3b4ULL, 0x027990f029623981ULL, 0xcadcd4e59ef40c4dULL,
+	0x9abfd8766a33735cULL, 0x0e3ea96b5304a7d0ULL, 0xad0c42d6fc585992ULL,
+	0x187306c89bc215a9ULL, 0xd4a60abcf3792b95ULL, 0xf935451de4f21df2ULL,
+	0xa9538f0419755787ULL, 0xdb9acddff56ca510ULL, 0xd06c98cd5c0975ebULL,
+	0xe612a3cb9ecba951ULL, 0xc766e62cfcadaf96ULL, 0xee64435a9752fe72ULL,
+	0xa192d576b245165aULL, 0x0a8787bf8ecb74b2ULL, 0x81b3e73d20b49b6fULL,
+	0x7fa8220ba3b2eceaULL, 0x245731c13ca42499ULL, 0xb78dbfaf3a8d83bdULL,
+	0xea1ad565322a1a0bULL, 0x60e61c23a3795013ULL, 0x6606d7e446282b93ULL,
+	0x6ca4ecb15c5f91e1ULL, 0x9f626da15c9625f3ULL, 0xe51b38608ef25f57ULL,
+	0x958a324ceb064572ULL
+};
+static const siphash_key_t test_key =
+	{ 0x0706050403020100ULL , 0x0f0e0d0c0b0a0908ULL };
+
+static int __init siphash_test_init(void)
+{
+	u8 in[64] __aligned(SIPHASH_ALIGNMENT);
+	u8 in_unaligned[65];
+	u8 i;
+	int ret = 0;
+
+	for (i = 0; i < 64; ++i) {
+		in[i] = i;
+		in_unaligned[i + 1] = i;
+		if (siphash(in, i, test_key) != test_vectors[i]) {
+			pr_info("self-test aligned %u: FAIL\n", i + 1);
+			ret = -EINVAL;
+		}
+		if (siphash(in_unaligned + 1, i, test_key) != test_vectors[i]) {
+			pr_info("self-test unaligned %u: FAIL\n", i + 1);
+			ret = -EINVAL;
+		}
+	}
+	if (siphash_1u64(0x0706050403020100ULL, test_key) != test_vectors[8]) {
+		pr_info("self-test 1u64: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (siphash_2u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL, test_key) != test_vectors[16]) {
+		pr_info("self-test 2u64: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (siphash_3u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL,
+			 0x1716151413121110ULL, test_key) != test_vectors[24]) {
+		pr_info("self-test 3u64: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (siphash_4u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL,
+			 0x1716151413121110ULL, 0x1f1e1d1c1b1a1918ULL, test_key) != test_vectors[32]) {
+		pr_info("self-test 4u64: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (siphash_1u32(0x03020100U, test_key) != test_vectors[4]) {
+		pr_info("self-test 1u32: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (siphash_2u32(0x03020100U, 0x07060504U, test_key) != test_vectors[8]) {
+		pr_info("self-test 2u32: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (siphash_3u32(0x03020100U, 0x07060504U,
+			 0x0b0a0908U, test_key) != test_vectors[12]) {
+		pr_info("self-test 3u32: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (siphash_4u32(0x03020100U, 0x07060504U,
+			 0x0b0a0908U, 0x0f0e0d0cU, test_key) != test_vectors[16]) {
+		pr_info("self-test 4u32: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (!ret)
+		pr_info("self-tests: pass\n");
+	return ret;
+}
+
+static void __exit siphash_test_exit(void)
+{
+}
+
+module_init(siphash_test_init);
+module_exit(siphash_test_exit);
+
+MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
+MODULE_LICENSE("Dual BSD/GPL");
-- 
2.11.0

^ permalink raw reply related

* Re: [PATCH net] net: Fix inconsistent rtnl_lock usage on dev_get_stats().
From: David Miller @ 2017-01-06 18:01 UTC (permalink / raw)
  To: eric.dumazet; +Cc: michael.chan, netdev
In-Reply-To: <1483723976.9712.19.camel@edumazet-glaptop3.roam.corp.google.com>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 06 Jan 2017 09:32:56 -0800

> This makes no sense to me.
> 
> RTNL is absolutely not needed to get device stats.
> 
> We try to not add RTNL, especially when not required.
> 
> Sure, RTNETLINK dumps currently hold RTNL, but we had various attempts
> in the past to get rid of this behavior.
> 
> If a device driver expects RTNL being locked, it is clearly a bug that
> needs a fix anyway.

This is extremely problematic when the driver has to synchronize some
piece of state between the get stats method and open/close.  It is
exactly the case we are trying to solve in tg3, and lots of drivers
end up hitting the same exact issue.

If open/close can happen asynchronously to get stats, it is very hard
to make dynamically allocated data structures or DMA buffers usable
from the stats call.

Drivers in this situation will just add a mutex specifically for this
situation if we don't consistently apply RTNL locking here.

^ permalink raw reply

* [net-next PATCH] net: reduce cycles spend on ICMP replies that gets rate limited
From: Jesper Dangaard Brouer @ 2017-01-06 17:39 UTC (permalink / raw)
  To: netdev; +Cc: Eric Dumazet, Jesper Dangaard Brouer

This patch splits the global and per-(inet)peer ICMP-reply limiter
code, and moves the global limit check earlier in the packet
processing path.  This avoids spending cycles on ICMP replies that
get limited/suppressed anyhow.

The global ICMP rate limiter icmp_global_allow() is a good solution,
it just happens too late in the process.  The kernel goes through
allocating memory and route lookup (return path) for the ICMP message,
before taking the rate limit decision of not sending the ICMP reply.

Details: The kernel's global rate limiter for ICMP messages was added
in commit 4cdf507d5452 ("icmp: add a global rate limitation").  It is
a token bucket limiter with a global lock.  It brilliantly avoids
lock contention by only updating when 20ms (HZ/50) have elapsed.  It
can then avoid taking the lock when credit is exhausted (when under
pressure) and the time constraint for refill is not yet met.

Use-case: The specific case where I experienced this being a bottleneck
is sending UDP packets to a port with no listener, which obviously
results in the kernel replying with ICMP Destination Unreachable
(type:3), Port Unreachable (code:3), which causes the bottleneck.
 After Eric and Paolo optimized the UDP socket code, the kernel's PPS
processing capability is lower for no-listen ports than for normal UDP
sockets.  This is bad for capacity planning when restarting a service.

UDP no-listen benchmark 8xCPUs using pktgen_sample04_many_flows.sh:
 Baseline: 6.6 Mpps
 Patch:   14.5 Mpps

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 net/ipv4/icmp.c |   87 +++++++++++++++++++++++++++++++++++--------------------
 net/ipv6/icmp.c |   49 +++++++++++++++++++++----------
 2 files changed, 90 insertions(+), 46 deletions(-)

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 0777ea949223..3d7b447c8b72 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -282,6 +282,33 @@ bool icmp_global_allow(void)
 }
 EXPORT_SYMBOL(icmp_global_allow);
 
+static bool icmpv4_mask_allow(struct net *net, int type, int code)
+{
+	if (type > NR_ICMP_TYPES)
+		return true;
+
+	/* Don't limit PMTU discovery. */
+	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+		return true;
+
+	/* Limit if icmp type is enabled in ratemask. */
+	if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
+		return true;
+
+	return false;
+}
+
+static bool icmpv4_global_allow(struct net *net, int type, int code)
+{
+	if (icmpv4_mask_allow(net, type, code))
+		return true;
+
+	if (icmp_global_allow())
+		return true;
+
+	return false;
+}
+
 /*
  *	Send an ICMP frame.
  */
@@ -290,34 +317,22 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
 			       struct flowi4 *fl4, int type, int code)
 {
 	struct dst_entry *dst = &rt->dst;
+	struct inet_peer *peer;
 	bool rc = true;
+	int vif;
 
-	if (type > NR_ICMP_TYPES)
-		goto out;
-
-	/* Don't limit PMTU discovery. */
-	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+	if (icmpv4_mask_allow(net, type, code))
 		goto out;
 
 	/* No rate limit on loopback */
 	if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
 		goto out;
 
-	/* Limit if icmp type is enabled in ratemask. */
-	if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
-		goto out;
-
-	rc = false;
-	if (icmp_global_allow()) {
-		int vif = l3mdev_master_ifindex(dst->dev);
-		struct inet_peer *peer;
-
-		peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
-		rc = inet_peer_xrlim_allow(peer,
-					   net->ipv4.sysctl_icmp_ratelimit);
-		if (peer)
-			inet_putpeer(peer);
-	}
+	vif = l3mdev_master_ifindex(dst->dev);
+	peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
+	rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit);
+	if (peer)
+		inet_putpeer(peer);
 out:
 	return rc;
 }
@@ -396,6 +411,8 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	struct inet_sock *inet;
 	__be32 daddr, saddr;
 	u32 mark = IP4_REPLY_MARK(net, skb->mark);
+	int type = icmp_param->data.icmph.type;
+	int code = icmp_param->data.icmph.code;
 
 	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb))
 		return;
@@ -405,6 +422,10 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 		return;
 	inet = inet_sk(sk);
 
+	/* global icmp_msgs_per_sec */
+	if (!icmpv4_global_allow(net, type, code))
+		goto out_unlock;
+
 	icmp_param->data.icmph.checksum = 0;
 
 	inet->tos = ip_hdr(skb)->tos;
@@ -433,8 +454,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	rt = ip_route_output_key(net, &fl4);
 	if (IS_ERR(rt))
 		goto out_unlock;
-	if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type,
-			       icmp_param->data.icmph.code))
+	if (icmpv4_xrlim_allow(net, rt, &fl4, type, code))
 		icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
 	ip_rt_put(rt);
 out_unlock:
@@ -648,13 +668,17 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 		}
 	}
 
-	icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC);
-	if (!icmp_param)
-		return;
-
 	sk = icmp_xmit_lock(net);
 	if (!sk)
-		goto out_free;
+		goto out;
+
+	/* Check global sysctl_icmp_msgs_per_sec ratelimit */
+	if (!icmpv4_global_allow(net, type, code))
+		goto out_unlock;
+
+	icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC);
+	if (!icmp_param)
+		goto out_unlock;
 
 	/*
 	 *	Construct source address and options.
@@ -682,7 +706,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	mark = IP4_REPLY_MARK(net, skb_in->mark);
 
 	if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in))
-		goto out_unlock;
+		goto out_free;
 
 
 	/*
@@ -706,8 +730,9 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
 			       type, code, icmp_param);
 	if (IS_ERR(rt))
-		goto out_unlock;
+		goto out_free;
 
+	/* peer icmp_ratelimit */
 	if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
 		goto ende;
 
@@ -727,10 +752,10 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
 ende:
 	ip_rt_put(rt);
-out_unlock:
-	icmp_xmit_unlock(sk);
 out_free:
 	kfree(icmp_param);
+out_unlock:
+	icmp_xmit_unlock(sk);
 out:;
 }
 EXPORT_SYMBOL(icmp_send);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 3036f665e6c8..b26ae8b5c1ce 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -168,6 +168,30 @@ static bool is_ineligible(const struct sk_buff *skb)
 	return false;
 }
 
+static bool icmpv6_mask_allow(int type)
+{
+	/* Informational messages are not limited. */
+	if (type & ICMPV6_INFOMSG_MASK)
+		return true;
+
+	/* Do not limit pmtu discovery, it would break it. */
+	if (type == ICMPV6_PKT_TOOBIG)
+		return true;
+
+	return false;
+}
+
+static bool icmpv6_global_allow(int type)
+{
+	if (icmpv6_mask_allow(type))
+		return true;
+
+	if (icmp_global_allow())
+		return true;
+
+	return false;
+}
+
 /*
  * Check the ICMP output rate limit
  */
@@ -178,12 +202,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
 	struct dst_entry *dst;
 	bool res = false;
 
-	/* Informational messages are not limited. */
-	if (type & ICMPV6_INFOMSG_MASK)
-		return true;
-
-	/* Do not limit pmtu discovery, it would break it. */
-	if (type == ICMPV6_PKT_TOOBIG)
+	if (icmpv6_mask_allow(type))
 		return true;
 
 	/*
@@ -200,20 +219,16 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
 	} else {
 		struct rt6_info *rt = (struct rt6_info *)dst;
 		int tmo = net->ipv6.sysctl.icmpv6_time;
+		struct inet_peer *peer;
 
 		/* Give more bandwidth to wider prefixes. */
 		if (rt->rt6i_dst.plen < 128)
 			tmo >>= ((128 - rt->rt6i_dst.plen)>>5);
 
-		if (icmp_global_allow()) {
-			struct inet_peer *peer;
-
-			peer = inet_getpeer_v6(net->ipv6.peers,
-					       &fl6->daddr, 1);
-			res = inet_peer_xrlim_allow(peer, tmo);
-			if (peer)
-				inet_putpeer(peer);
-		}
+		peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr, 1);
+		res = inet_peer_xrlim_allow(peer, tmo);
+		if (peer)
+			inet_putpeer(peer);
 	}
 	dst_release(dst);
 	return res;
@@ -493,6 +508,10 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 	sk = icmpv6_xmit_lock(net);
 	if (!sk)
 		return;
+
+	if (!icmpv6_global_allow(type))
+		goto out;
+
 	sk->sk_mark = mark;
 	np = inet6_sk(sk);
 

^ permalink raw reply related

* Re: [PATCH net] net: Fix inconsistent rtnl_lock usage on dev_get_stats().
From: Eric Dumazet @ 2017-01-06 17:32 UTC (permalink / raw)
  To: Michael Chan; +Cc: davem, netdev
In-Reply-To: <1483676478-14228-1-git-send-email-michael.chan@broadcom.com>

On Thu, 2017-01-05 at 23:21 -0500, Michael Chan wrote:
> Some callers take rtnl_lock() before calling dev_get_stats() and some
> don't.  Most network drivers expect the ndo_get_stats64() to be called
> under rtnl_lock() to avoid race conditions with device close or ethtool
> reconfigurations.  Fix it so that all callers take rtnl_lock().
> 
> Rename the original dev_get_stats() as __dev_get_stats() and add a new
> dev_get_stats() that takes rtnl_lock() before calling __dev_get_stats().
> Modify all callers that already take rtnl_lock() to call __dev_get_stats().

This makes no sense to me.

RTNL is absolutely not needed to get device stats.

We try to not add RTNL, especially when not required.

Sure, RTNETLINK dumps currently hold RTNL, but we had various attempts
in the past to get rid of this behavior.

If a device driver expects RTNL being locked, it is clearly a bug that
needs a fix anyway.

^ permalink raw reply

* Re: [net-next PATCH v2 5/6] i40e: Add TX and RX support in switchdev mode.
From: Jiri Pirko @ 2017-01-06 17:30 UTC (permalink / raw)
  To: Samudrala, Sridhar
  Cc: alexander.h.duyck, john.r.fastabend, anjali.singhai,
	jakub.kicinski, davem, intel-wired-lan, netdev
In-Reply-To: <586EE461.2000801@intel.com>

Fri, Jan 06, 2017 at 01:27:13AM CET, sridhar.samudrala@intel.com wrote:
>
>
>On 1/5/2017 4:56 AM, Jiri Pirko wrote:
>> Tue, Jan 03, 2017 at 07:07:53PM CET, sridhar.samudrala@intel.com wrote:
>> > In switchdev mode, broadcast filter is not enabled on VFs. The broadcasts and
>> > unknown frames from VFs are received by the PF and passed to corresponding VF
>> > port representor netdev.
>> > A host based switching entity like a linux bridge or OVS redirects these frames
>> > to the right VFs via VFPR netdevs. Any frames sent via VFPR netdevs are sent as
>> > directed transmits to the corresponding VFs. To enable directed transmit, skb
>> > metadata dst is used to pass the VF id and the frame is requeued to call the PFs
>> > transmit routine.
>> > 
>> > Small script to demonstrate inter VF pings in switchdev mode.
>> > PF: enp5s0f0, VFs: enp5s2,enp5s2f1 VFPRs:enp5s0f0-vf0, enp5s0f0-vf1
>> > 
>> > # rmmod i40e; modprobe i40e
>> > # devlink dev eswitch set pci/0000:05:00.0 mode switchdev
>> > # echo 2 > /sys/class/net/enp5s0f0/device/sriov_numvfs
>> > # ip link set enp5s0f0 vf 0 mac 00:11:22:33:44:55
>> > # ip link set enp5s0f0 vf 1 mac 00:11:22:33:44:56
>> > # rmmod i40evf; modprobe i40evf
>> > 
>> > /* Create 2 namespaces and move the VFs to the corresponding ns. */
>> > # ip netns add ns0
>> > # ip link set enp5s2 netns ns0
>> > # ip netns exec ns0 ip addr add 192.168.1.10/24 dev enp5s2
>> > # ip netns exec ns0 ip link set enp5s2 up
>> > # ip netns add ns1
>> > # ip link set enp5s2f1 netns ns1
>> > # ip netns exec ns1 ip addr add 192.168.1.11/24 dev enp5s2f1
>> > # ip netns exec ns1 ip link set enp5s2f1 up
>> > 
>> > /* bring up pf and vfpr netdevs */
>> > # ip link set enp5s0f0 up
>> > # ip link set enp5s0f0-vf0 up
>> > # ip link set enp5s0f0-vf1 up
>> > 
>> > /* Create a linux bridge and add vfpr netdevs to it. */
>> > # ip link add vfpr-br type bridge
>> > # ip link set enp5s0f0-vf0 master vfpr-br
>> > # ip link set enp5s0f0-vf1 master vfpr-br
>> > # ip addr add 192.168.1.1/24 dev vfpr-br
>> > # ip link set vfpr-br up
>> > 
>> > # ip netns exec ns0 ping -c3 192.168.1.11
>> > # ip netns exec ns1 ping -c3 192.168.1.10
>> > 
>> > Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>> [...]
>> 
>> > diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>> > index 352cf7c..b46ddaa 100644
>> > --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>> > +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
>> > @@ -1176,16 +1176,37 @@ static bool i40e_alloc_mapped_page(struct i40e_ring *rx_ring,
>> >   * @rx_ring:  rx ring in play
>> >   * @skb: packet to send up
>> >   * @vlan_tag: vlan tag for packet
>> > + * @lpbk: is it a loopback frame?
>> >   **/
>> > static void i40e_receive_skb(struct i40e_ring *rx_ring,
>> > -			     struct sk_buff *skb, u16 vlan_tag)
>> > +			     struct sk_buff *skb, u16 vlan_tag, bool lpbk)
>> > {
>> > 	struct i40e_q_vector *q_vector = rx_ring->q_vector;
>> > +	struct i40e_pf *pf = rx_ring->vsi->back;
>> > +	struct i40e_vf *vf;
>> > +	struct ethhdr *eth;
>> > +	int vf_id;
>> > 
>> > 	if ((rx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) &&
>> > 	    (vlan_tag & VLAN_VID_MASK))
>> > 		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag);
>> > 
>> > +	if ((pf->eswitch_mode == DEVLINK_ESWITCH_MODE_LEGACY) || !lpbk)
>> > +		goto gro_receive;
>> > +
>> > +	/* If a loopback packet is received from a VF in switchdev mode, pass the
>> > +	 * frame to the corresponding VFPR netdev based on the source MAC in the frame.
>> > +	 */
>> > +	eth = (struct ethhdr *)skb_mac_header(skb);
>> > +	for (vf_id = 0; vf_id < pf->num_alloc_vfs; vf_id++) {
>> > +		vf = &pf->vf[vf_id];
>> > +		if (ether_addr_equal(eth->h_source, vf->default_lan_addr.addr)) {
>> > +			skb->dev = vf->vfpr_netdev;
>> This sucks :( Isn't there any identification coming from rx ring that
>> would tell you what vf this is? To match vfpr according to a single MAC
>> address seems a bit awkward. What if there is a macvlan configured
>> on the VF?
>Unfortunately, with the current HW, RX descriptor only indicates if it is a
>loopback packet from a VF,
>but not the specific id of the VF.

Is it a FW limitation? If so, I believe that you should consider
implementing it.


>Multiple macs on VF are not supported with the current patchset.
>At this point we are not making switchdev as the default mode because of
>these limitations.
>
>> 
>> 
>> 
>> > +			break;
>> > +		}
>> > +	}
>> > +
>> > +gro_receive:
>> > 	napi_gro_receive(&q_vector->napi, skb);
>> > }
>> > 
>> [...]
>> 
>> > @@ -2998,3 +3064,19 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
>> > 
>> > 	return i40e_xmit_frame_ring(skb, tx_ring);
>> > }
>> > +
>> > +netdev_tx_t i40e_vfpr_netdev_start_xmit(struct sk_buff *skb,
>> > +				        struct net_device *dev)
>> > +{
>> > +	struct i40e_vfpr_netdev_priv *priv = netdev_priv(dev);
>> > +	struct i40e_vf *vf = priv->vf;
>> > +	struct i40e_pf *pf = vf->pf;
>> > +	struct i40e_vsi *vsi = pf->vsi[pf->lan_vsi];
>> > +
>> > +	skb_dst_drop(skb);
>> > +	dst_hold(&priv->vfpr_dst->dst);
>> > +	skb_dst_set(skb, &priv->vfpr_dst->dst);
>> > +	skb->dev = vsi->netdev;
>> This dst dance seems a bit odd to me. Why don't you just call
>> i40e_xmit_frame_ring with an extra arg holding the needed metadata?
>
>We don't have TX/RX queues associated with VFPR netdevs, so we need to set
>the dev to PF netdev and requeue the skb.

Still, you eventually call a function within same .c file. Using dst
does not look right to me.


>
>> 
>> 
>> > +
>> > +	return dev_queue_xmit(skb);
>> > +}
>> > diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
>> > index e065321..850723f 100644
>> > --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
>> > +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
>> > @@ -366,6 +366,7 @@ struct i40e_ring_container {
>> > 
>> > bool i40e_alloc_rx_buffers(struct i40e_ring *rxr, u16 cleaned_count);
>> > netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev);
>> > +netdev_tx_t i40e_vfpr_netdev_start_xmit(struct sk_buff *skb, struct net_device *netdev);
>> > void i40e_clean_tx_ring(struct i40e_ring *tx_ring);
>> > void i40e_clean_rx_ring(struct i40e_ring *rx_ring);
>> > int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring);
>> > diff --git a/drivers/net/ethernet/intel/i40e/i40e_type.h b/drivers/net/ethernet/intel/i40e/i40e_type.h
>> > index edc0abd..c136cc9 100644
>> > --- a/drivers/net/ethernet/intel/i40e/i40e_type.h
>> > +++ b/drivers/net/ethernet/intel/i40e/i40e_type.h
>> > @@ -728,6 +728,9 @@ enum i40e_rx_desc_status_bits {
>> > #define I40E_RXD_QW1_STATUS_TSYNVALID_SHIFT  I40E_RX_DESC_STATUS_TSYNVALID_SHIFT
>> > #define I40E_RXD_QW1_STATUS_TSYNVALID_MASK \
>> > 				    BIT_ULL(I40E_RXD_QW1_STATUS_TSYNVALID_SHIFT)
>> > +#define I40E_RXD_QW1_STATUS_LPBK_SHIFT  I40E_RX_DESC_STATUS_LPBK_SHIFT
>> > +#define I40E_RXD_QW1_STATUS_LPBK_MASK \
>> > +				BIT_ULL(I40E_RXD_QW1_STATUS_LPBK_SHIFT)
>> > 
>> > enum i40e_rx_desc_fltstat_values {
>> > 	I40E_RX_DESC_FLTSTAT_NO_DATA	= 0,
>> > diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
>> > index 0c8687d..f0860ef 100644
>> > --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
>> > +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
>> > @@ -1062,6 +1062,7 @@ static int i40e_vfpr_netdev_stop(struct net_device *dev)
>> > static const struct net_device_ops i40e_vfpr_netdev_ops = {
>> > 	.ndo_open       	= i40e_vfpr_netdev_open,
>> > 	.ndo_stop       	= i40e_vfpr_netdev_stop,
>> > +	.ndo_start_xmit		= i40e_vfpr_netdev_start_xmit,
>> > };
>> > 
>> > /**
>> > @@ -1121,16 +1122,22 @@ int i40e_alloc_vfpr_netdev(struct i40e_vf *vf, u16 vf_num)
>> > 
>> > 	priv = netdev_priv(vfpr_netdev);
>> > 	priv->vf = &(pf->vf[vf_num]);
>> > +	priv->vfpr_dst = metadata_dst_alloc(0, METADATA_HW_PORT_MUX, GFP_KERNEL);
>> I'm missing a patch here. In net-next, metadata_dst_alloc has 2 args.
>
>Somehow that patch didn't make it to netdev. You can find it here on
>intel-wired-lan archives.
>http://lists.osuosl.org/pipermail/intel-wired-lan/Week-of-Mon-20170102/007723.html
>I will CC you when i submit V3.

Thanks.

^ permalink raw reply

* Re: [PATCH] net: phy: dp83867: fix irq generation
From: Grygorii Strashko @ 2017-01-06 17:19 UTC (permalink / raw)
  To: Florian Fainelli, netdev, Dan Murphy, Mugunthan V N
  Cc: Sekhar Nori, linux-kernel, linux-omap, linux-arm-kernel
In-Reply-To: <de3c3fb3-1244-76a5-ea5e-8ef1b83f25d4@gmail.com>



On 01/05/2017 04:10 PM, Florian Fainelli wrote:
> On 01/05/2017 12:48 PM, Grygorii Strashko wrote:
>> For proper IRQ generation by DP83867 phy the INT/PWDN pin has to be
>> programmed as an interrupt output instead of a Powerdown input in
>> Configuration Register 3 (CFG3), Address 0x001E, bit 7 INT_OE = 1. The
>> current driver doesn't do this and as a result IRQs will not be generated by
>> DP83867 phy even if they are properly configured in DT.
>>
>> Hence, fix IRQ generation by properly configuring CFG3.INT_OE bit and
>> ensure that Link Status Change (LINK_STATUS_CHNG_INT) and Auto-Negotiation
>> Complete (AUTONEG_COMP_INT) interrupts are enabled. After this the DP83867
>> driver will work properly in interrupt enabled mode.
>>
>> Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
>> ---
>>  drivers/net/phy/dp83867.c | 10 ++++++++++
>>  1 file changed, 10 insertions(+)
>>
>> diff --git a/drivers/net/phy/dp83867.c b/drivers/net/phy/dp83867.c
>> index 1b63924..e84ae08 100644
>> --- a/drivers/net/phy/dp83867.c
>> +++ b/drivers/net/phy/dp83867.c
>> @@ -29,6 +29,7 @@
>>  #define MII_DP83867_MICR	0x12
>>  #define MII_DP83867_ISR		0x13
>>  #define DP83867_CTRL		0x1f
>> +#define DP83867_CFG3		0x1e
>>
>>  /* Extended Registers */
>>  #define DP83867_RGMIICTL	0x0032
>> @@ -98,6 +99,8 @@ static int dp83867_config_intr(struct phy_device *phydev)
>>  		micr_status |=
>>  			(MII_DP83867_MICR_AN_ERR_INT_EN |
>>  			MII_DP83867_MICR_SPEED_CHNG_INT_EN |
>> +			MII_DP83867_MICR_AUTONEG_COMP_INT_EN |
>> +			MII_DP83867_MICR_LINK_STS_CHNG_INT_EN |
>>  			MII_DP83867_MICR_DUP_MODE_CHNG_INT_EN |
>>  			MII_DP83867_MICR_SLEEP_MODE_CHNG_INT_EN);
>>
>> @@ -214,6 +217,13 @@ static int dp83867_config_init(struct phy_device *phydev)
>>  		}
>>  	}
>>
>> +	/* Enable Interrupt output INT_OE in CFG3 register */
>> +	if (phy_interrupt_is_valid(phydev)) {
>> +		val = phy_read(phydev, DP83867_CFG3);
>> +		val |= BIT(7);
>> +		phy_write(phydev, DP83867_CFG3, val);
>> +	}
>
> Don't you need to clear that bit in the case phy_interrupt_is_valid()
> returns false?

Not sure I need to touch it in this case - default value is 0
and Linux will not configure any IRQ.

>
> Other than that:
>
> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
>

-- 
regards,
-grygorii

^ permalink raw reply

* Re: [PATCH net] dccp: fix option range in dccp_parse_options()
From: Eric Dumazet @ 2017-01-06 17:14 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Andrey Konovalov, Gerrit Renker
In-Reply-To: <1483721009.9712.14.camel@edumazet-glaptop3.roam.corp.google.com>

On Fri, 2017-01-06 at 08:43 -0800, Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
> 
> dccp_parse_options() improperly parses 12 or 16 bytes in excess,
> because it forgets to subtract DCCP header len.
> 
> This causes various issues, since these 12/16 bytes are part of the
> payload and this might not even be present in skb->head, as
> dccp_invalid_packet() only pulled everything but payload.
> 
> KASAN complains since we might access uninitialized data.

Scratch that.

Never send a patch before first coffee in the morning ;)

^ permalink raw reply

* [PATCH net-next 2/2] afs: Make afs_readpages() fetch data in bulk
From: David Howells @ 2017-01-06 17:09 UTC (permalink / raw)
  To: netdev; +Cc: dhowells, linux-afs, linux-kernel
In-Reply-To: <148372251997.8578.5117118854142528477.stgit@warthog.procyon.org.uk>

Make afs_readpages() use afs_vnode_fetch_data()'s new ability to take a
list of pages and do a bulk fetch.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 fs/afs/file.c   |  131 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/afs/volume.c |    1 
 2 files changed, 130 insertions(+), 2 deletions(-)

diff --git a/fs/afs/file.c b/fs/afs/file.c
index 6c262ceef32d..82897a78abc7 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -16,6 +16,7 @@
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
 #include <linux/gfp.h>
+#include <linux/task_io_accounting_ops.h>
 #include "internal.h"
 
 static int afs_readpage(struct file *file, struct page *page);
@@ -262,6 +263,129 @@ static int afs_readpage(struct file *file, struct page *page)
 }
 
 /*
+ * Make pages available as they're filled.
+ */
+static void afs_readpages_page_done(struct afs_call *call, struct afs_read *req)
+{
+	struct afs_vnode *vnode = call->reply;
+	struct page *page = req->pages[req->index];
+
+	req->pages[req->index] = NULL;
+	SetPageUptodate(page);
+
+	/* send the page to the cache */
+#ifdef CONFIG_AFS_FSCACHE
+	if (PageFsCache(page) &&
+	    fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) {
+		fscache_uncache_page(vnode->cache, page);
+		BUG_ON(PageFsCache(page));
+	}
+#endif
+	unlock_page(page);
+	put_page(page);
+}
+
+/*
+ * Read a contiguous set of pages.
+ */
+static int afs_readpages_one(struct file *file, struct address_space *mapping,
+			     struct list_head *pages)
+{
+	struct afs_vnode *vnode = AFS_FS_I(mapping->host);
+	struct afs_read *req;
+	struct list_head *p;
+	struct page *first, *page;
+	struct key *key = file->private_data;
+	pgoff_t index;
+	int ret, n, i;
+
+	/* Count the number of contiguous pages at the front of the list.  Note
+	 * that the list goes prev-wards rather than next-wards.
+	 */
+	first = list_entry(pages->prev, struct page, lru);
+	index = first->index + 1;
+	n = 1;
+	for (p = first->lru.prev; p != pages; p = p->prev) {
+		page = list_entry(p, struct page, lru);
+		if (page->index != index)
+			break;
+		index++;
+		n++;
+	}
+
+	req = kzalloc(sizeof(struct afs_read) + sizeof(struct page *) * n,
+		      GFP_NOFS);
+	if (!req)
+		return -ENOMEM;
+
+	atomic_set(&req->usage, 1);
+	req->page_done = afs_readpages_page_done;
+	req->pos = first->index;
+	req->pos <<= PAGE_SHIFT;
+
+	/* Transfer the pages to the request.  We add them in until one fails
+	 * to add to the LRU and then we stop (as that'll make a hole in the
+	 * contiguous run).
+	 *
+	 * Note that it's possible for the file size to change whilst we're
+	 * doing this, but we rely on the server returning less than we asked
+	 * for if the file shrank.  We also rely on this to deal with a partial
+	 * page at the end of the file.
+	 */
+	do {
+		page = list_entry(pages->prev, struct page, lru);
+		list_del(&page->lru);
+		index = page->index;
+		if (add_to_page_cache_lru(page, mapping, index,
+					  readahead_gfp_mask(mapping))) {
+#ifdef CONFIG_AFS_FSCACHE
+			fscache_uncache_page(vnode->cache, page);
+#endif
+			put_page(page);
+			break;
+		}
+
+		req->pages[req->nr_pages++] = page;
+		req->len += PAGE_SIZE;
+	} while (req->nr_pages < n);
+
+	if (req->nr_pages == 0) {
+		kfree(req);
+		return 0;
+	}
+
+	ret = afs_vnode_fetch_data(vnode, key, req);
+	if (ret < 0)
+		goto error;
+
+	task_io_account_read(PAGE_SIZE * req->nr_pages);
+	afs_put_read(req);
+	return 0;
+
+error:
+	if (ret == -ENOENT) {
+		_debug("got NOENT from server"
+		       " - marking file deleted and stale");
+		set_bit(AFS_VNODE_DELETED, &vnode->flags);
+		ret = -ESTALE;
+	}
+
+	for (i = 0; i < req->nr_pages; i++) {
+		page = req->pages[i];
+		if (page) {
+#ifdef CONFIG_AFS_FSCACHE
+			fscache_uncache_page(vnode->cache, page);
+#endif
+			SetPageError(page);
+			unlock_page(page);
+		}
+	}
+
+	afs_put_read(req);
+	return ret;
+}
+
+/*
  * read a set of pages
  */
 static int afs_readpages(struct file *file, struct address_space *mapping,
@@ -314,8 +438,11 @@ static int afs_readpages(struct file *file, struct address_space *mapping,
 		return ret;
 	}
 
-	/* load the missing pages from the network */
-	ret = read_cache_pages(mapping, pages, afs_page_filler, key);
+	while (!list_empty(pages)) {
+		ret = afs_readpages_one(file, mapping, pages);
+		if (ret < 0)
+			break;
+	}
 
 	_leave(" = %d [netting]", ret);
 	return ret;
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index d142a2449e65..546f9d01710b 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -106,6 +106,7 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
 	volume->cell		= params->cell;
 	volume->vid		= vlocation->vldb.vid[params->type];
 
+	volume->bdi.ra_pages	= VM_MAX_READAHEAD*1024/PAGE_SIZE; 
 	ret = bdi_setup_and_register(&volume->bdi, "afs");
 	if (ret)
 		goto error_bdi;

^ permalink raw reply related

* [PATCH net-next 1/2] afs: Make afs_fs_fetch_data() take a list of pages
From: David Howells @ 2017-01-06 17:08 UTC (permalink / raw)
  To: netdev; +Cc: dhowells, linux-afs, linux-kernel
In-Reply-To: <148372251997.8578.5117118854142528477.stgit@warthog.procyon.org.uk>

Make afs_fs_fetch_data() take a list of pages for bulk data transfer.  This
will allow afs_readpages() to be made more efficient.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 fs/afs/file.c     |   37 ++++++++++++++---
 fs/afs/fsclient.c |  117 ++++++++++++++++++++++++++++++++++-------------------
 fs/afs/internal.h |   21 +++++++++-
 fs/afs/vnode.c    |    6 +--
 fs/afs/write.c    |   19 +++++++--
 5 files changed, 145 insertions(+), 55 deletions(-)

diff --git a/fs/afs/file.c b/fs/afs/file.c
index 6344aee4ac4b..6c262ceef32d 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -101,6 +101,21 @@ int afs_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+/*
+ * Dispose of a ref to a read record.
+ */
+void afs_put_read(struct afs_read *req)
+{
+	int i;
+
+	if (atomic_dec_and_test(&req->usage)) {
+		for (i = 0; i < req->nr_pages; i++)
+			if (req->pages[i])
+				put_page(req->pages[i]);
+		kfree(req);
+	}
+}
+
 #ifdef CONFIG_AFS_FSCACHE
 /*
  * deal with notification that a page was read from the cache
@@ -126,9 +141,8 @@ int afs_page_filler(void *data, struct page *page)
 {
 	struct inode *inode = page->mapping->host;
 	struct afs_vnode *vnode = AFS_FS_I(inode);
+	struct afs_read *req;
 	struct key *key = data;
-	size_t len;
-	off_t offset;
 	int ret;
 
 	_enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
@@ -164,12 +178,23 @@ int afs_page_filler(void *data, struct page *page)
 		_debug("cache said ENOBUFS");
 	default:
 	go_on:
-		offset = page->index << PAGE_SHIFT;
-		len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE);
+		req = kzalloc(sizeof(struct afs_read) + sizeof(struct page *),
+			      GFP_KERNEL);
+		if (!req)
+			goto enomem;
+
+		atomic_set(&req->usage, 1);
+		req->pos = (loff_t)page->index << PAGE_SHIFT;
+		req->len = min_t(size_t, i_size_read(inode) - req->pos,
+				 PAGE_SIZE);
+		req->nr_pages = 1;
+		req->pages[0] = page;
+		get_page(page);
 
 		/* read the contents of the file from the server into the
 		 * page */
-		ret = afs_vnode_fetch_data(vnode, key, offset, len, page);
+		ret = afs_vnode_fetch_data(vnode, key, req);
+		afs_put_read(req);
 		if (ret < 0) {
 			if (ret == -ENOENT) {
 				_debug("got NOENT from server"
@@ -201,6 +226,8 @@ int afs_page_filler(void *data, struct page *page)
 	_leave(" = 0");
 	return 0;
 
+enomem:
+	ret = -ENOMEM;
 error:
 	SetPageError(page);
 	unlock_page(page);
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 31c616ab9b40..7dc1f6fb3661 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -309,15 +309,19 @@ int afs_fs_fetch_file_status(struct afs_server *server,
 static int afs_deliver_fs_fetch_data(struct afs_call *call)
 {
 	struct afs_vnode *vnode = call->reply;
+	struct afs_read *req = call->reply3;
 	const __be32 *bp;
-	struct page *page;
+	unsigned int size;
 	void *buffer;
 	int ret;
 
-	_enter("{%u}", call->unmarshall);
+	_enter("{%u,%zu/%u;%u/%llu}",
+	       call->unmarshall, call->offset, call->count,
+	       req->remain, req->actual_len);
 
 	switch (call->unmarshall) {
 	case 0:
+		req->actual_len = 0;
 		call->offset = 0;
 		call->unmarshall++;
 		if (call->operation_ID != FSFETCHDATA64) {
@@ -334,10 +338,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
 		if (ret < 0)
 			return ret;
 
-		call->count = ntohl(call->tmp);
-		_debug("DATA length MSW: %u", call->count);
-		if (call->count > 0)
-			return -EBADMSG;
+		req->actual_len = ntohl(call->tmp);
+		req->actual_len <<= 32;
 		call->offset = 0;
 		call->unmarshall++;
 
@@ -349,26 +351,52 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
 		if (ret < 0)
 			return ret;
 
-		call->count = ntohl(call->tmp);
-		_debug("DATA length: %u", call->count);
-		if (call->count > PAGE_SIZE)
+		req->actual_len |= ntohl(call->tmp);
+		_debug("DATA length: %llu", req->actual_len);
+		/* Check that the server didn't want to send us extra.  We
+		 * might want to just discard instead, but that requires
+		 * cooperation from AF_RXRPC.
+		 */
+		if (req->actual_len > req->len)
 			return -EBADMSG;
-		call->offset = 0;
+
+		req->remain = req->actual_len;
+		call->offset = req->pos & (PAGE_SIZE - 1);
+		req->index = 0;
+		if (req->actual_len == 0)
+			goto no_more_data;
 		call->unmarshall++;
 
+	begin_page:
+		if (req->remain > PAGE_SIZE - call->offset)
+			size = PAGE_SIZE - call->offset;
+		else
+			size = req->remain;
+		call->count = call->offset + size;
+		ASSERTCMP(call->count, <=, PAGE_SIZE);
+		req->remain -= size;
+
 		/* extract the returned data */
 	case 3:
-		_debug("extract data");
-		if (call->count > 0) {
-			page = call->reply3;
-			buffer = kmap(page);
-			ret = afs_extract_data(call, buffer,
-					       call->count, true);
-			kunmap(page);
-			if (ret < 0)
-				return ret;
+		_debug("extract data %u/%llu %zu/%u",
+		       req->remain, req->actual_len, call->offset, call->count);
+
+		buffer = kmap(req->pages[req->index]);
+		ret = afs_extract_data(call, buffer, call->count, true);
+		kunmap(req->pages[req->index]);
+		if (ret < 0)
+			return ret;
+		if (call->offset == PAGE_SIZE) {
+			if (req->page_done)
+				req->page_done(call, req);
+			if (req->remain > 0) {
+				req->index++;
+				call->offset = 0;
+				goto begin_page;
+			}
 		}
 
+	no_more_data:
 		call->offset = 0;
 		call->unmarshall++;
 
@@ -393,17 +421,25 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
 	}
 
 	if (call->count < PAGE_SIZE) {
-		_debug("clear");
-		page = call->reply3;
-		buffer = kmap(page);
+		buffer = kmap(req->pages[req->index]);
 		memset(buffer + call->count, 0, PAGE_SIZE - call->count);
-		kunmap(page);
+		kunmap(req->pages[req->index]);
+		if (req->page_done)
+			req->page_done(call, req);
 	}
 
 	_leave(" = 0 [done]");
 	return 0;
 }
 
+static void afs_fetch_data_destructor(struct afs_call *call)
+{
+	struct afs_read *req = call->reply3;
+
+	afs_put_read(req);
+	afs_flat_call_destructor(call);
+}
+
 /*
  * FS.FetchData operation type
  */
@@ -411,14 +447,14 @@ static const struct afs_call_type afs_RXFSFetchData = {
 	.name		= "FS.FetchData",
 	.deliver	= afs_deliver_fs_fetch_data,
 	.abort_to_error	= afs_abort_to_error,
-	.destructor	= afs_flat_call_destructor,
+	.destructor	= afs_fetch_data_destructor,
 };
 
 static const struct afs_call_type afs_RXFSFetchData64 = {
 	.name		= "FS.FetchData64",
 	.deliver	= afs_deliver_fs_fetch_data,
 	.abort_to_error	= afs_abort_to_error,
-	.destructor	= afs_flat_call_destructor,
+	.destructor	= afs_fetch_data_destructor,
 };
 
 /*
@@ -427,8 +463,7 @@ static const struct afs_call_type afs_RXFSFetchData64 = {
 static int afs_fs_fetch_data64(struct afs_server *server,
 			       struct key *key,
 			       struct afs_vnode *vnode,
-			       off_t offset, size_t length,
-			       struct page *buffer,
+			       struct afs_read *req,
 			       const struct afs_wait_mode *wait_mode)
 {
 	struct afs_call *call;
@@ -436,8 +471,6 @@ static int afs_fs_fetch_data64(struct afs_server *server,
 
 	_enter("");
 
-	ASSERTCMP(length, <, ULONG_MAX);
-
 	call = afs_alloc_flat_call(&afs_RXFSFetchData64, 32, (21 + 3 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
@@ -445,7 +478,7 @@ static int afs_fs_fetch_data64(struct afs_server *server,
 	call->key = key;
 	call->reply = vnode;
 	call->reply2 = NULL; /* volsync */
-	call->reply3 = buffer;
+	call->reply3 = req;
 	call->service_id = FS_SERVICE;
 	call->port = htons(AFS_FS_PORT);
 	call->operation_ID = FSFETCHDATA64;
@@ -456,11 +489,12 @@ static int afs_fs_fetch_data64(struct afs_server *server,
 	bp[1] = htonl(vnode->fid.vid);
 	bp[2] = htonl(vnode->fid.vnode);
 	bp[3] = htonl(vnode->fid.unique);
-	bp[4] = htonl(upper_32_bits(offset));
-	bp[5] = htonl((u32) offset);
+	bp[4] = htonl(upper_32_bits(req->pos));
+	bp[5] = htonl(lower_32_bits(req->pos));
 	bp[6] = 0;
-	bp[7] = htonl((u32) length);
+	bp[7] = htonl(lower_32_bits(req->len));
 
+	atomic_inc(&req->usage);
 	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
 }
 
@@ -470,16 +504,16 @@ static int afs_fs_fetch_data64(struct afs_server *server,
 int afs_fs_fetch_data(struct afs_server *server,
 		      struct key *key,
 		      struct afs_vnode *vnode,
-		      off_t offset, size_t length,
-		      struct page *buffer,
+		      struct afs_read *req,
 		      const struct afs_wait_mode *wait_mode)
 {
 	struct afs_call *call;
 	__be32 *bp;
 
-	if (upper_32_bits(offset) || upper_32_bits(offset + length))
-		return afs_fs_fetch_data64(server, key, vnode, offset, length,
-					   buffer, wait_mode);
+	if (upper_32_bits(req->pos) ||
+	    upper_32_bits(req->len) ||
+	    upper_32_bits(req->pos + req->len))
+		return afs_fs_fetch_data64(server, key, vnode, req, wait_mode);
 
 	_enter("");
 
@@ -490,7 +524,7 @@ int afs_fs_fetch_data(struct afs_server *server,
 	call->key = key;
 	call->reply = vnode;
 	call->reply2 = NULL; /* volsync */
-	call->reply3 = buffer;
+	call->reply3 = req;
 	call->service_id = FS_SERVICE;
 	call->port = htons(AFS_FS_PORT);
 	call->operation_ID = FSFETCHDATA;
@@ -501,9 +535,10 @@ int afs_fs_fetch_data(struct afs_server *server,
 	bp[1] = htonl(vnode->fid.vid);
 	bp[2] = htonl(vnode->fid.vnode);
 	bp[3] = htonl(vnode->fid.unique);
-	bp[4] = htonl(offset);
-	bp[5] = htonl(length);
+	bp[4] = htonl(lower_32_bits(req->pos));
+	bp[5] = htonl(lower_32_bits(req->len));
 
+	atomic_inc(&req->usage);
 	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
 }
 
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 535a38d2c1d0..6f7a9638ba1a 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -134,6 +134,22 @@ struct afs_call_type {
 };
 
 /*
+ * Record of an outstanding read operation on a vnode.
+ */
+struct afs_read {
+	loff_t			pos;		/* Where to start reading */
+	loff_t			len;		/* How much to read */
+	loff_t			actual_len;	/* How much we're actually getting */
+	atomic_t		usage;
+	unsigned int		remain;		/* Amount remaining */
+	unsigned int		index;		/* Which page we're reading into */
+	unsigned int		pg_offset;	/* Offset in page we're at */
+	unsigned int		nr_pages;
+	void (*page_done)(struct afs_call *, struct afs_read *);
+	struct page		*pages[];
+};
+
+/*
  * record of an outstanding writeback on a vnode
  */
 struct afs_writeback {
@@ -494,6 +510,7 @@ extern const struct file_operations afs_file_operations;
 extern int afs_open(struct inode *, struct file *);
 extern int afs_release(struct inode *, struct file *);
 extern int afs_page_filler(void *, struct page *);
+extern void afs_put_read(struct afs_read *);
 
 /*
  * flock.c
@@ -513,7 +530,7 @@ extern int afs_fs_fetch_file_status(struct afs_server *, struct key *,
 extern int afs_fs_give_up_callbacks(struct afs_server *,
 				    const struct afs_wait_mode *);
 extern int afs_fs_fetch_data(struct afs_server *, struct key *,
-			     struct afs_vnode *, off_t, size_t, struct page *,
+			     struct afs_vnode *, struct afs_read *,
 			     const struct afs_wait_mode *);
 extern int afs_fs_create(struct afs_server *, struct key *,
 			 struct afs_vnode *, const char *, umode_t,
@@ -699,7 +716,7 @@ extern void afs_vnode_finalise_status_update(struct afs_vnode *,
 extern int afs_vnode_fetch_status(struct afs_vnode *, struct afs_vnode *,
 				  struct key *);
 extern int afs_vnode_fetch_data(struct afs_vnode *, struct key *,
-				off_t, size_t, struct page *);
+				struct afs_read *);
 extern int afs_vnode_create(struct afs_vnode *, struct key *, const char *,
 			    umode_t, struct afs_fid *, struct afs_file_status *,
 			    struct afs_callback *, struct afs_server **);
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index 25cf4c3f4ff7..45aa874f5d32 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -393,7 +393,7 @@ int afs_vnode_fetch_status(struct afs_vnode *vnode,
  * - TODO implement caching
  */
 int afs_vnode_fetch_data(struct afs_vnode *vnode, struct key *key,
-			 off_t offset, size_t length, struct page *page)
+			 struct afs_read *desc)
 {
 	struct afs_server *server;
 	int ret;
@@ -420,8 +420,8 @@ int afs_vnode_fetch_data(struct afs_vnode *vnode, struct key *key,
 
 		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
 
-		ret = afs_fs_fetch_data(server, key, vnode, offset, length,
-					page, &afs_sync_call);
+		ret = afs_fs_fetch_data(server, key, vnode, desc,
+					&afs_sync_call);
 
 	} while (!afs_volume_release_fileserver(vnode, server, ret));
 
diff --git a/fs/afs/write.c b/fs/afs/write.c
index f865c3f05bea..c83c1a0e851f 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -86,19 +86,30 @@ void afs_put_writeback(struct afs_writeback *wb)
 static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
 			 loff_t pos, struct page *page)
 {
+	struct afs_read *req;
 	loff_t i_size;
 	int ret;
-	int len;
 
 	_enter(",,%llu", (unsigned long long)pos);
 
+	req = kzalloc(sizeof(struct afs_read) + sizeof(struct page *),
+		      GFP_KERNEL);
+	if (!req)
+		return -ENOMEM;
+
+	atomic_set(&req->usage, 1);
+	req->pos = pos;
+	req->nr_pages = 1;
+	req->pages[0] = page;
+
 	i_size = i_size_read(&vnode->vfs_inode);
 	if (pos + PAGE_SIZE > i_size)
-		len = i_size - pos;
+		req->len = i_size - pos;
 	else
-		len = PAGE_SIZE;
+		req->len = PAGE_SIZE;
 
-	ret = afs_vnode_fetch_data(vnode, key, pos, len, page);
+	ret = afs_vnode_fetch_data(vnode, key, req);
+	afs_put_read(req);
 	if (ret < 0) {
 		if (ret == -ENOENT) {
 			_debug("got NOENT from server"

^ permalink raw reply related

* [PATCH net-next 0/2] afs: Implement bulk read
From: David Howells @ 2017-01-06 17:08 UTC (permalink / raw)
  To: netdev; +Cc: dhowells, linux-afs, linux-kernel


This pair of patches implements bulk data reading from an AFS server.

The patches can be found here also:

	http://git.kernel.org/cgit/linux/kernel/git/dhowells/linux-fs.git/log/?h=rxrpc-rewrite

Tagged thusly:

	git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git
	rxrpc-rewrite-20170106

David
---
David Howells (2):
      afs: Make afs_fs_fetch_data() take a list of pages
      afs: Make afs_readpages() fetch data in bulk


 fs/afs/file.c     |  168 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/afs/fsclient.c |  117 ++++++++++++++++++++++++-------------
 fs/afs/internal.h |   21 ++++++-
 fs/afs/vnode.c    |    6 +-
 fs/afs/volume.c   |    1 
 fs/afs/write.c    |   19 +++++-
 6 files changed, 275 insertions(+), 57 deletions(-)

^ permalink raw reply

* Re: [PATCH v3] net: stmmac: fix maxmtu assignment to be within valid range
From: Andy Shevchenko @ 2017-01-06 17:06 UTC (permalink / raw)
  To: Kweh, Hock Leong
  Cc: David S. Miller, Joao Pinto, Giuseppe CAVALLARO,
	seraphin.bonnaffe, Jarod Wilson, Alexandre TORGUE,
	Joachim Eastwood, Niklas Cassel, Johan Hovold, Pavel Machek,
	lars.persson, netdev, LKML
In-Reply-To: <1483750143-11966-1-git-send-email-hock.leong.kweh@intel.com>

On Sat, Jan 7, 2017 at 2:49 AM, Kweh, Hock Leong
<hock.leong.kweh@intel.com> wrote:
> From: "Kweh, Hock Leong" <hock.leong.kweh@intel.com>
>
> There is no checking valid value of maxmtu when getting it from device tree.
> This resolution added the checking condition to ensure the assignment is
> made within a valid range.

> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> @@ -3345,8 +3345,14 @@ int stmmac_dvr_probe(struct device *device,
>                 ndev->max_mtu = JUMBO_LEN;
>         else
>                 ndev->max_mtu = SKB_MAX_HEAD(NET_SKB_PAD + NET_IP_ALIGN);
> -       if (priv->plat->maxmtu < ndev->max_mtu)

> +

The lines are logically grouped here, so there is no need to split them.
Please remove this extra blank line.

> +       if ((priv->plat->maxmtu < ndev->max_mtu) &&
> +           (priv->plat->maxmtu >= ndev->min_mtu))

>                 ndev->max_mtu = priv->plat->maxmtu;

> +       else if (priv->plat->maxmtu < ndev->min_mtu)

And what if it is > ndev->max_mtu?

> +               netdev_warn(priv->dev,
> +                           "%s: warning: maxmtu having invalid value (%d)\n",
> +                           __func__, priv->plat->maxmtu);


> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
> @@ -204,6 +204,10 @@ static int stmmac_pci_probe(struct pci_dev *pdev,
>
>         pci_set_master(pdev);
>
> +       /* Set the maxmtu to a default of JUMBO_LEN in case the
> +        * parameter is not defined for the device.
> +        */
> +       plat->maxmtu = JUMBO_LEN;

Please, use *_default_data() hooks for that.

At some point it might make sense to extract
static int common_default_data() {...}
and call it at the beginning of the rest of the *_default_data() hooks.

-- 
With Best Regards,
Andy Shevchenko

^ permalink raw reply

* Re: [PATCH v3] net: stmmac: fix maxmtu assignment to be within valid range
From: Joao Pinto @ 2017-01-06 16:57 UTC (permalink / raw)
  To: Kweh, Hock Leong, David S. Miller, Joao Pinto, Giuseppe CAVALLARO,
	seraphin.bonnaffe, Jarod Wilson, Andy Shevchenko
  Cc: Alexandre TORGUE, Joachim Eastwood, Niklas Cassel, Johan Hovold,
	pavel, lars.persson, netdev, LKML
In-Reply-To: <1483750143-11966-1-git-send-email-hock.leong.kweh@intel.com>


Hi Wilson,

Às 12:49 AM de 1/7/2017, Kweh, Hock Leong escreveu:
> From: "Kweh, Hock Leong" <hock.leong.kweh@intel.com>
> 
> There is no checking valid value of maxmtu when getting it from device tree.
> This resolution added the checking condition to ensure the assignment is
> made within a valid range.
> 
> Signed-off-by: Kweh, Hock Leong <hock.leong.kweh@intel.com>
> ---
> changelog v3:
> * print the warning message only if maxmtu < min_mtu
> * add maxmtu = JUMBO_LEN at stmmac_pci.c to follow stmmac_platform.c
> 
> changelog v2:
> * correction of "devicetree" to "device tree" reported by Andy
> * print warning message while maxmtu is not in valid range
> 
>  drivers/net/ethernet/stmicro/stmmac/stmmac_main.c |    8 +++++++-
>  drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c  |    4 ++++
>  2 files changed, 11 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> index 92ac006..ce74ae6 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> @@ -3345,8 +3345,14 @@ int stmmac_dvr_probe(struct device *device,
>  		ndev->max_mtu = JUMBO_LEN;
>  	else
>  		ndev->max_mtu = SKB_MAX_HEAD(NET_SKB_PAD + NET_IP_ALIGN);
> -	if (priv->plat->maxmtu < ndev->max_mtu)
> +
> +	if ((priv->plat->maxmtu < ndev->max_mtu) &&
> +	    (priv->plat->maxmtu >= ndev->min_mtu))
>  		ndev->max_mtu = priv->plat->maxmtu;
> +	else if (priv->plat->maxmtu < ndev->min_mtu)
> +		netdev_warn(priv->dev,
> +			    "%s: warning: maxmtu having invalid value (%d)\n",
> +			    __func__, priv->plat->maxmtu);
>  
>  	if (flow_ctrl)
>  		priv->flow_ctrl = FLOW_AUTO;	/* RX/TX pause on */
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
> index a283177..e539afe 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
> @@ -204,6 +204,10 @@ static int stmmac_pci_probe(struct pci_dev *pdev,
>  
>  	pci_set_master(pdev);
>  
> +	/* Set the maxmtu to a default of JUMBO_LEN in case the
> +	 * parameter is not defined for the device.
> +	 */
> +	plat->maxmtu = JUMBO_LEN;

I suggest putting this configuration in one of the default config functions.

Thanks.

>  	if (info) {
>  		info->pdev = pdev;
>  		if (info->setup) {
> 

^ permalink raw reply

* EMAIL ALERT
From: System Administrator @ 2017-01-06 14:15 UTC (permalink / raw)
  To: Recipients

Recently, we have detect some unusual activity on your account and as a
result, all email users are urged to update their email account within 24 hours of receiving this e-mail, Please CLICK  to confirm that your email account is up to date with the institution requirement.

---
This email has been checked for viruses by Avast antivirus software.
https://www.avast.com/antivirus

^ permalink raw reply

* [PATCH v3] net: stmmac: fix maxmtu assignment to be within valid range
From: Kweh, Hock Leong @ 2017-01-07  0:49 UTC (permalink / raw)
  To: David S. Miller, Joao Pinto, Giuseppe CAVALLARO,
	seraphin.bonnaffe, Jarod Wilson, Andy Shevchenko
  Cc: Alexandre TORGUE, Joachim Eastwood, Niklas Cassel, Johan Hovold,
	pavel, Kweh, Hock Leong, lars.persson, netdev, LKML

From: "Kweh, Hock Leong" <hock.leong.kweh@intel.com>

The maxmtu value is not validated when it is read from the device tree.
This patch adds a check to ensure that the assignment is only made when
the value is within the valid range.

Signed-off-by: Kweh, Hock Leong <hock.leong.kweh@intel.com>
---
changelog v3:
* print the warning message only if maxmtu < min_mtu
* add maxmtu = JUMBO_LEN at stmmac_pci.c to follow stmmac_platform.c

changelog v2:
* correction of "devicetree" to "device tree" reported by Andy
* print warning message while maxmtu is not in valid range

 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c |    8 +++++++-
 drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c  |    4 ++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 92ac006..ce74ae6 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -3345,8 +3345,14 @@ int stmmac_dvr_probe(struct device *device,
 		ndev->max_mtu = JUMBO_LEN;
 	else
 		ndev->max_mtu = SKB_MAX_HEAD(NET_SKB_PAD + NET_IP_ALIGN);
-	if (priv->plat->maxmtu < ndev->max_mtu)
+
+	if ((priv->plat->maxmtu < ndev->max_mtu) &&
+	    (priv->plat->maxmtu >= ndev->min_mtu))
 		ndev->max_mtu = priv->plat->maxmtu;
+	else if (priv->plat->maxmtu < ndev->min_mtu)
+		netdev_warn(priv->dev,
+			    "%s: warning: maxmtu having invalid value (%d)\n",
+			    __func__, priv->plat->maxmtu);
 
 	if (flow_ctrl)
 		priv->flow_ctrl = FLOW_AUTO;	/* RX/TX pause on */
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
index a283177..e539afe 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c
@@ -204,6 +204,10 @@ static int stmmac_pci_probe(struct pci_dev *pdev,
 
 	pci_set_master(pdev);
 
+	/* Set the maxmtu to a default of JUMBO_LEN in case the
+	 * parameter is not defined for the device.
+	 */
+	plat->maxmtu = JUMBO_LEN;
 	if (info) {
 		info->pdev = pdev;
 		if (info->setup) {
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH net] dccp: fix option range in dccp_parse_options()
From: Eric Dumazet @ 2017-01-06 16:43 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, Andrey Konovalov, Gerrit Renker

From: Eric Dumazet <edumazet@google.com>

dccp_parse_options() improperly parses 12 or 16 bytes in excess,
because it forgets to subtract DCCP header len.

This causes various issues, since these 12/16 bytes are part of the
payload and might not even be present in skb->head, as
dccp_invalid_packet() only pulled everything but the payload.

KASAN complains since we might access uninitialized data.

Strangely enough, net/netfilter/xt_dccp.c got this right.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Andrey Konovalov <andreyknvl@google.com>
Cc: Gerrit Renker <gerrit@erg.abdn.ac.uk>
---
 net/dccp/options.c |    7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/net/dccp/options.c b/net/dccp/options.c
index 74d29c56c36709fd4e31f0e63a1f8b1aa38a32cd..41bd4bc4026f97b155e12a3a37095653836ce7fc 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -54,10 +54,9 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
 	struct dccp_sock *dp = dccp_sk(sk);
 	const struct dccp_hdr *dh = dccp_hdr(skb);
 	const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type;
-	unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
-	unsigned char *opt_ptr = options;
-	const unsigned char *opt_end = (unsigned char *)dh +
-					(dh->dccph_doff * 4);
+	unsigned char *opt_ptr = (unsigned char *)dh + __dccp_hdr_len(dh);
+	unsigned int optlen = dh->dccph_doff * 4 - __dccp_hdr_len(dh);
+	const unsigned char *opt_end = opt_ptr + optlen;
 	struct dccp_options_received *opt_recv = &dp->dccps_options_received;
 	unsigned char opt, len;
 	unsigned char *uninitialized_var(value);

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox