Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH RFC net-next 04/11] udp: paged allocation with gso
From: Willem de Bruijn @ 2018-04-17 20:00 UTC (permalink / raw)
  To: netdev; +Cc: Willem de Bruijn
In-Reply-To: <20180417200059.30154-1-willemdebruijn.kernel@gmail.com>

From: Willem de Bruijn <willemb@google.com>

When sending large datagrams that are later segmented, store data in
page frags to avoid copying from linear in skb_segment.

This logic will also be used by zerocopy.

Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 net/ipv4/ip_output.c  | 15 +++++++++++----
 net/ipv6/ip6_output.c | 19 ++++++++++++++-----
 2 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 7abfb24ec5e5..9ccd6c28e420 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -878,11 +878,13 @@ static int __ip_append_data(struct sock *sk,
 	struct rtable *rt = (struct rtable *)cork->dst;
 	unsigned int wmem_alloc_delta = 0;
 	u32 tskey = 0;
+	bool paged;
 
 	skb = skb_peek_tail(queue);
 
 	exthdrlen = !skb ? rt->dst.header_len : 0;
 	mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
+	paged = !!cork->gso_size;
 
 	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
 	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
@@ -934,6 +936,7 @@ static int __ip_append_data(struct sock *sk,
 			unsigned int fraglen;
 			unsigned int fraggap;
 			unsigned int alloclen;
+			unsigned int pagedlen = 0;
 			struct sk_buff *skb_prev;
 alloc_new_skb:
 			skb_prev = skb;
@@ -954,8 +957,12 @@ static int __ip_append_data(struct sock *sk,
 			if ((flags & MSG_MORE) &&
 			    !(rt->dst.dev->features&NETIF_F_SG))
 				alloclen = mtu;
-			else
+			else if (!paged)
 				alloclen = fraglen;
+			else {
+				alloclen = min_t(int, fraglen, MAX_HEADER);
+				pagedlen = fraglen - alloclen;
+			}
 
 			alloclen += exthdrlen;
 
@@ -999,7 +1006,7 @@ static int __ip_append_data(struct sock *sk,
 			/*
 			 *	Find where to start putting bytes.
 			 */
-			data = skb_put(skb, fraglen + exthdrlen);
+			data = skb_put(skb, fraglen + exthdrlen - pagedlen);
 			skb_set_network_header(skb, exthdrlen);
 			skb->transport_header = (skb->network_header +
 						 fragheaderlen);
@@ -1015,7 +1022,7 @@ static int __ip_append_data(struct sock *sk,
 				pskb_trim_unique(skb_prev, maxfraglen);
 			}
 
-			copy = datalen - transhdrlen - fraggap;
+			copy = datalen - transhdrlen - fraggap - pagedlen;
 			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
 				err = -EFAULT;
 				kfree_skb(skb);
@@ -1023,7 +1030,7 @@ static int __ip_append_data(struct sock *sk,
 			}
 
 			offset += copy;
-			length -= datalen - fraggap;
+			length -= copy + transhdrlen;
 			transhdrlen = 0;
 			exthdrlen = 0;
 			csummode = CHECKSUM_NONE;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 3ce947c1d173..9fbcec4fb946 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1270,6 +1270,7 @@ static int __ip6_append_data(struct sock *sk,
 	int csummode = CHECKSUM_NONE;
 	unsigned int maxnonfragsize, headersize;
 	unsigned int wmem_alloc_delta = 0;
+	bool paged;
 
 	skb = skb_peek_tail(queue);
 	if (!skb) {
@@ -1277,6 +1278,7 @@ static int __ip6_append_data(struct sock *sk,
 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
 	}
 
+	paged = !!cork->gso_size;
 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
 	orig_mtu = mtu;
 
@@ -1368,6 +1370,7 @@ static int __ip6_append_data(struct sock *sk,
 			unsigned int fraglen;
 			unsigned int fraggap;
 			unsigned int alloclen;
+			unsigned int pagedlen = 0;
 alloc_new_skb:
 			/* There's no room in the current skb */
 			if (skb)
@@ -1390,11 +1393,17 @@ static int __ip6_append_data(struct sock *sk,
 
 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
+			fraglen = datalen + fragheaderlen;
+
 			if ((flags & MSG_MORE) &&
 			    !(rt->dst.dev->features&NETIF_F_SG))
 				alloclen = mtu;
-			else
-				alloclen = datalen + fragheaderlen;
+			else if (!paged)
+				alloclen = fraglen;
+			else {
+				alloclen = min_t(int, fraglen, MAX_HEADER);
+				pagedlen = fraglen - alloclen;
+			}
 
 			alloclen += dst_exthdrlen;
 
@@ -1416,7 +1425,7 @@ static int __ip6_append_data(struct sock *sk,
 			 */
 			alloclen += sizeof(struct frag_hdr);
 
-			copy = datalen - transhdrlen - fraggap;
+			copy = datalen - transhdrlen - fraggap - pagedlen;
 			if (copy < 0) {
 				err = -EINVAL;
 				goto error;
@@ -1455,7 +1464,7 @@ static int __ip6_append_data(struct sock *sk,
 			/*
 			 *	Find where to start putting bytes
 			 */
-			data = skb_put(skb, fraglen);
+			data = skb_put(skb, fraglen - pagedlen);
 			skb_set_network_header(skb, exthdrlen);
 			data += fragheaderlen;
 			skb->transport_header = (skb->network_header +
@@ -1478,7 +1487,7 @@ static int __ip6_append_data(struct sock *sk,
 			}
 
 			offset += copy;
-			length -= datalen - fraggap;
+			length -= copy + transhdrlen;
 			transhdrlen = 0;
 			exthdrlen = 0;
 			dst_exthdrlen = 0;
-- 
2.17.0.484.g0c8726318c-goog

^ permalink raw reply related

* [PATCH RFC net-next 00/11] udp gso
From: Willem de Bruijn @ 2018-04-17 20:00 UTC (permalink / raw)
  To: netdev; +Cc: Willem de Bruijn

From: Willem de Bruijn <willemb@google.com>

Segmentation offload reduces cycles/byte for large packets by
amortizing the cost of protocol stack traversal.

This patchset implements GSO for UDP. A process can concatenate and
submit multiple datagrams to the same destination in one send call
by setting socket option SOL_UDP/UDP_SEGMENT with the segment size,
or passing an analogous cmsg at send time.

The stack will send the entire large (up to network layer max size)
datagram through the protocol layer. At the GSO layer, it is broken
up into individual segments. All receive the same network layer header
and UDP src and dst port. All but the last segment have the same UDP
header, but the last may differ in length and checksum.

This initial patchset is an RFC. A few open items remain:

* MSG_MORE
  The feature requires UDP checksum offload, as without it the
  checksum + copy operation at send() time is likely cheaper than
  checksumming each segment in the GSO layer.

  UDP checksum offload is disabled with MSG_MORE. As a result, GSO
  only works in the lockless fast path.

  The patchset can be simplified if explicitly excluding MSG_MORE.
  For one, patch 1 can be dropped by passing ipcm to udp_send_skb
  instead of inet_cork.

* MSG_ZEROCOPY
  UDP zerocopy has been sent for review before. Completion
  notification cost exceeds the savings from copy avoidance for
  datagrams of regular MSS (< 1500B).

  UDP GSO enables building larger packets, at which point
  zerocopy becomes effective. Results with the current benchmark
  are not as great as from GSO itself, though that may say more
  about the benchmark. Either way, I do not intend to submit
  this separate feature as part of a final UDP GSO patchset.

* GSO_BY_FRAGS
  An alternative implementation that would allow non-uniform
  segment length is to use GSO_BY_FRAGS like SCTP. This would
  likely require MSG_MORE to build the list using multiple
  send calls (or one sendmmsg). The two approaches are not
  mutually-exclusive, so that could be a follow-up.

Initial results show a significant reduction in UDP cycles/byte.
See the main patch for more details and benchmark results.

        udp
          876 MB/s 14873 msg/s 624666 calls/s
            11,205,777,429      cycles

        udp gso
         2139 MB/s 36282 msg/s 36282 calls/s
            11,204,374,561      cycles

The patch set is broken down as follows:
- patch 1 is a prerequisite: code rearrangement, noop otherwise
- patch 2 is the core feature
- patch 3,4,6 are refinements
- patch 5 adds the cmsg interface
- patch 7 adds udp zerocopy
- patch 8..11 are tests

This idea was presented previously at netconf 2017-2
http://vger.kernel.org/netconf2017_files/rx_hardening_and_udp_gso.pdf

Known limitations:
  - The feature requires pacing and possibly a lower threshold on
    segment size to limit the number of segments that may be passed
    to the NIC at once.

  - Even when only accepting datagrams with CHECKSUM_PARTIAL, the
    segmentation layer must drop or fall back to software checksumming
    if the device cannot checksum the packet.

    This can happen if a device advertises checksum offload in
    general, but removes it for this skb in .ndo_features_check.

Willem de Bruijn (11):
  udp: expose inet cork to udp
  udp: add gso
  udp: better wmem accounting on gso
  udp: paged allocation with gso
  udp: add gso segment cmsg
  udp: add gso support to virtual devices
  udp: zerocopy
  selftests: udp gso
  selftests: udp gso with connected sockets
  selftests: udp gso with corking
  selftests: udp gso benchmark

 include/linux/netdev_features.h               |   3 +
 include/linux/skbuff.h                        |  10 +
 include/linux/udp.h                           |   1 +
 include/net/inet_sock.h                       |   1 +
 include/net/ip.h                              |   3 +-
 include/net/ipv6.h                            |   2 +
 include/net/udp.h                             |   5 +
 include/uapi/linux/udp.h                      |   1 +
 net/core/skbuff.c                             |  14 +-
 net/core/sock.c                               |   5 +-
 net/ipv4/af_inet.c                            |   2 +-
 net/ipv4/ip_output.c                          |  63 +-
 net/ipv4/udp.c                                |  78 ++-
 net/ipv4/udp_offload.c                        |  63 ++
 net/ipv6/ip6_offload.c                        |   5 +-
 net/ipv6/ip6_output.c                         |  66 +-
 net/ipv6/udp.c                                |  29 +-
 net/ipv6/udp_offload.c                        |  14 +
 tools/testing/selftests/net/.gitignore        |   3 +
 tools/testing/selftests/net/Makefile          |   3 +-
 tools/testing/selftests/net/udpgso.c          | 621 ++++++++++++++++++
 tools/testing/selftests/net/udpgso.sh         |  31 +
 tools/testing/selftests/net/udpgso_bench.sh   |  74 +++
 tools/testing/selftests/net/udpgso_bench_rx.c | 265 ++++++++
 tools/testing/selftests/net/udpgso_bench_tx.c | 379 +++++++++++
 25 files changed, 1689 insertions(+), 52 deletions(-)
 create mode 100644 tools/testing/selftests/net/udpgso.c
 create mode 100755 tools/testing/selftests/net/udpgso.sh
 create mode 100755 tools/testing/selftests/net/udpgso_bench.sh
 create mode 100644 tools/testing/selftests/net/udpgso_bench_rx.c
 create mode 100644 tools/testing/selftests/net/udpgso_bench_tx.c

-- 
2.17.0.484.g0c8726318c-goog

^ permalink raw reply

* [PATCH RFC net-next 03/11] udp: better wmem accounting on gso
From: Willem de Bruijn @ 2018-04-17 20:00 UTC (permalink / raw)
  To: netdev; +Cc: Willem de Bruijn
In-Reply-To: <20180417200059.30154-1-willemdebruijn.kernel@gmail.com>

From: Willem de Bruijn <willemb@google.com>

skb_segment by default transfers allocated wmem from the gso skb
to the tail of the segment list. This underreports real truesize
of the list, especially if the tail might be dropped.

Similar to tcp_gso_segment, update wmem_alloc with the aggregate
list truesize and make each segment responsible for its own
share by setting skb->destructor.

Clear gso_skb->destructor prior to calling skb_segment to skip
the default assignment to tail.

Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 net/ipv4/udp_offload.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 3378f8dec9c7..a69117e25e78 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -192,7 +192,9 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 				  unsigned int mss, __sum16 check)
 {
 	struct udphdr *uh = udp_hdr(gso_skb);
-	struct sk_buff *segs;
+	struct sock *sk = gso_skb->sk;
+	struct sk_buff *segs, *seg;
+	unsigned int sum_truesize = 0;
 	unsigned int hdrlen;
 
 	if (gso_skb->len <= sizeof(*uh) + mss)
@@ -203,9 +205,23 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
 	skb_pull(gso_skb, sizeof(*uh));
 	hdrlen = gso_skb->data - skb_mac_header(gso_skb);
 
+	/* clear destructor to avoid skb_segment assigning it to tail */
+	WARN_ON_ONCE(gso_skb->destructor != sock_wfree);
+	gso_skb->destructor = NULL;
+
 	segs = skb_segment(gso_skb, features);
-	if (unlikely(IS_ERR_OR_NULL(segs)))
+	if (unlikely(IS_ERR_OR_NULL(segs))) {
+		gso_skb->destructor = sock_wfree;
 		return segs;
+	}
+
+	for (seg = segs; seg; seg = seg->next) {
+		seg->destructor = sock_wfree;
+		seg->sk = sk;
+		sum_truesize += seg->truesize;
+	}
+
+	refcount_add(sum_truesize - gso_skb->truesize, &sk->sk_wmem_alloc);
 
 	/* If last packet is not full, fix up its header */
 	if (segs->prev->len != hdrlen + mss) {
-- 
2.17.0.484.g0c8726318c-goog

^ permalink raw reply related

* [PATCH RFC net-next 02/11] udp: add gso
From: Willem de Bruijn @ 2018-04-17 20:00 UTC (permalink / raw)
  To: netdev; +Cc: Willem de Bruijn
In-Reply-To: <20180417200059.30154-1-willemdebruijn.kernel@gmail.com>

From: Willem de Bruijn <willemb@google.com>

Implement generic segmentation offload for udp datagrams. Callers can
concatenate and send at once the payload of multiple datagrams with
the same destination.

To set segment size, the caller sets socket option UDP_SEGMENT to the
length of each discrete payload. This value must be smaller than or
equal to the relevant MTU.

A follow-up patch adds cmsg UDP_SEGMENT to specify segment size on a
per send call basis.

Total byte length may then exceed MTU. If not an exact multiple of
segment size, the last segment will be shorter.

UDP GSO is not UFO. UFO fragments a single large datagram. GSO splits
a large payload into a number of discrete UDP datagrams.

The implementation adds a GSO type SKB_GSO_UDP_L4 to differentiate it
from UFO (SKB_GSO_UDP). It adds a gso_size field to the udp socket,
ip(v6) cmsg cookie and inet_cork structure to be able to set the value
at setsockopt or cmsg time and to work with both lockless and corked
paths. A lockless-only cmsg-only patch would be significantly shorter.

The feature requires udp checksum offload to avoid checksumming in the
GSO layer. This is disabled with MSG_MORE as of commit d749c9cbffd6
("ipv4: no CHECKSUM_PARTIAL on MSG_MORE corked sockets") and commit
682b1a9d3f96 ("ipv6: no CHECKSUM_PARTIAL on MSG_MORE corked sockets"),
so only the lockless fast path works. IPPROTO_UDPLITE is excluded as
well, as that protocol has no gso handler registered.

Initial benchmark numbers show UDP GSO about as expensive as TCP GSO.

    tcp tso
     3197 MB/s 54232 msg/s 54232 calls/s
         6,457,754,262      cycles

    tcp gso
     1765 MB/s 29939 msg/s 29939 calls/s
        11,203,021,806      cycles

    tcp without tso/gso *
      739 MB/s 12548 msg/s 12548 calls/s
        11,205,483,630      cycles

    udp
      876 MB/s 14873 msg/s 624666 calls/s
        11,205,777,429      cycles

    udp gso
     2139 MB/s 36282 msg/s 36282 calls/s
        11,204,374,561      cycles

   [*] after reverting commit 0a6b2a1dc2a2
       ("tcp: switch to GSO being always on")

Measured total system cycles ('-a') for one core while pinning both
the network receive path and benchmark process to that core:

  perf stat -a -C 12 -e cycles \
    ./udpgso_bench_tx -C 12 -4 -D "$DST" -l 4

Note the reduction in calls/s with GSO. Bytes per syscall increases
from 1470 to 61818.

Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 include/linux/skbuff.h   |  9 ++++++++
 include/linux/udp.h      |  1 +
 include/net/inet_sock.h  |  1 +
 include/net/ip.h         |  1 +
 include/net/ipv6.h       |  1 +
 include/net/udp.h        |  4 ++++
 include/uapi/linux/udp.h |  1 +
 net/core/skbuff.c        |  2 ++
 net/ipv4/af_inet.c       |  2 +-
 net/ipv4/ip_output.c     |  7 ++++--
 net/ipv4/udp.c           | 31 +++++++++++++++++++++++---
 net/ipv4/udp_offload.c   | 47 ++++++++++++++++++++++++++++++++++++++++
 net/ipv6/ip6_offload.c   |  5 +++--
 net/ipv6/ip6_output.c    |  4 +++-
 net/ipv6/udp.c           | 21 +++++++++++++++---
 net/ipv6/udp_offload.c   | 14 ++++++++++++
 16 files changed, 139 insertions(+), 12 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9065477ed255..6850643508c1 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -573,6 +573,8 @@ enum {
 	SKB_GSO_ESP = 1 << 15,
 
 	SKB_GSO_UDP = 1 << 16,
+
+	SKB_GSO_UDP_L4 = 1 << 17,
 };
 
 #if BITS_PER_LONG > 32
@@ -4047,6 +4049,13 @@ static inline bool skb_is_gso_sctp(const struct sk_buff *skb)
 	return skb_shinfo(skb)->gso_type & SKB_GSO_SCTP;
 }
 
+static inline bool skb_is_ufo(const struct sk_buff *skb)
+{
+	const unsigned int gso_type = skb_shinfo(skb)->gso_type;
+
+	return (gso_type & (SKB_GSO_UDP | SKB_GSO_UDP_L4)) == SKB_GSO_UDP;
+}
+
 static inline void skb_gso_reset(struct sk_buff *skb)
 {
 	skb_shinfo(skb)->gso_size = 0;
diff --git a/include/linux/udp.h b/include/linux/udp.h
index eaea63bc79bb..f7184052ed32 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -55,6 +55,7 @@ struct udp_sock {
 	 * when the socket is uncorked.
 	 */
 	__u16		 len;		/* total length of pending frames */
+	__u16		 gso_size;
 	/*
 	 * Fields specific to UDP-Lite.
 	 */
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 0a671c32d6b9..83d5b3c2ac42 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -147,6 +147,7 @@ struct inet_cork {
 	__u8			ttl;
 	__s16			tos;
 	char			priority;
+	__u16			gso_size;
 };
 
 struct inet_cork_full {
diff --git a/include/net/ip.h b/include/net/ip.h
index 5a0d2b660506..1190d10fb9b2 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -76,6 +76,7 @@ struct ipcm_cookie {
 	__u8			ttl;
 	__s16			tos;
 	char			priority;
+	__u16			gso_size;
 };
 
 #define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb))
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 0dd722cab037..0a872a7c33c8 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -298,6 +298,7 @@ struct ipcm6_cookie {
 	__s16 tclass;
 	__s8  dontfrag;
 	struct ipv6_txoptions *opt;
+	__u16 gso_size;
 };
 
 static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np)
diff --git a/include/net/udp.h b/include/net/udp.h
index 0676b272f6ac..741d888d0fdb 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -174,6 +174,10 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
 				 struct udphdr *uh, udp_lookup_t lookup);
 int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup);
 
+struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
+				  netdev_features_t features,
+				  unsigned int mss, __sum16 check);
+
 static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb)
 {
 	struct udphdr *uh;
diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h
index efb7b5991c2f..09d00f8c442b 100644
--- a/include/uapi/linux/udp.h
+++ b/include/uapi/linux/udp.h
@@ -32,6 +32,7 @@ struct udphdr {
 #define UDP_ENCAP	100	/* Set the socket to accept encapsulated packets */
 #define UDP_NO_CHECK6_TX 101	/* Disable sending checksum for UDP6X */
 #define UDP_NO_CHECK6_RX 102	/* Disable accpeting checksum for UDP6 */
+#define UDP_SEGMENT	103	/* Set GSO segmentation size */
 
 /* UDP encapsulation types */
 #define UDP_ENCAP_ESPINUDP_NON_IKE	1 /* draft-ietf-ipsec-nat-t-ike-00/01 */
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 345b51837ca8..3eed21f64e0b 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -4926,6 +4926,8 @@ static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
 		thlen = tcp_hdrlen(skb);
 	} else if (unlikely(skb_is_gso_sctp(skb))) {
 		thlen = sizeof(struct sctphdr);
+	} else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
+		thlen = sizeof(struct udphdr);
 	}
 	/* UFO sets gso_size to the size of the fragmentation
 	 * payload, i.e. the size of the L4 (UDP) header is already
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 3ebf599cebae..d6c3cd02ca2a 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1327,7 +1327,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
 	segs = ERR_PTR(-EPROTONOSUPPORT);
 
 	if (!skb->encapsulation || encap) {
-		udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
+		udpfrag = skb_is_ufo(skb);
 		fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID);
 
 		/* fixed ID is invalid if DF bit is not set */
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 83b14ea16654..7abfb24ec5e5 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -882,7 +882,8 @@ static int __ip_append_data(struct sock *sk,
 	skb = skb_peek_tail(queue);
 
 	exthdrlen = !skb ? rt->dst.header_len : 0;
-	mtu = cork->fragsize;
+	mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
+
 	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
 	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
 		tskey = sk->sk_tskey++;
@@ -1133,6 +1134,8 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
 	*rtp = NULL;
 	cork->fragsize = ip_sk_use_pmtu(sk) ?
 			 dst_mtu(&rt->dst) : rt->dst.dev->mtu;
+
+	cork->gso_size = sk->sk_type == SOCK_DGRAM ? ipc->gso_size : 0;
 	cork->dst = &rt->dst;
 	cork->length = 0;
 	cork->ttl = ipc->ttl;
@@ -1212,7 +1215,7 @@ ssize_t	ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
 		return -EOPNOTSUPP;
 
 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
-	mtu = cork->fragsize;
+	mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
 
 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 6b9d8017b319..731772a69043 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -757,7 +757,8 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb,
 }
 EXPORT_SYMBOL(udp_set_csum);
 
-static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
+static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
+			struct inet_cork *cork)
 {
 	struct sock *sk = skb->sk;
 	struct inet_sock *inet = inet_sk(sk);
@@ -777,6 +778,19 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
 	uh->len = htons(len);
 	uh->check = 0;
 
+	if (cork->gso_size) {
+		const int hlen = skb_network_header_len(skb) +
+				 sizeof(struct udphdr);
+
+		if (hlen + cork->gso_size > cork->fragsize)
+			return -EINVAL;
+		if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite)
+			return -EIO;
+
+		skb_shinfo(skb)->gso_size = cork->gso_size;
+		skb_shinfo(skb)->gso_type = SKB_GSO_UDP | SKB_GSO_UDP_L4;
+	}
+
 	if (is_udplite)  				 /*     UDP-Lite      */
 		csum = udplite_csum(skb);
 
@@ -828,7 +842,7 @@ int udp_push_pending_frames(struct sock *sk)
 	if (!skb)
 		goto out;
 
-	err = udp_send_skb(skb, fl4);
+	err = udp_send_skb(skb, fl4, &inet->cork.base);
 
 out:
 	up->len = 0;
@@ -922,6 +936,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	ipc.sockc.tsflags = sk->sk_tsflags;
 	ipc.addr = inet->inet_saddr;
 	ipc.oif = sk->sk_bound_dev_if;
+	ipc.gso_size = up->gso_size;
 
 	if (msg->msg_controllen) {
 		err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6);
@@ -1037,7 +1052,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 				  &cork, msg->msg_flags);
 		err = PTR_ERR(skb);
 		if (!IS_ERR_OR_NULL(skb))
-			err = udp_send_skb(skb, fl4);
+			err = udp_send_skb(skb, fl4, &cork);
 		goto out;
 	}
 
@@ -2367,6 +2382,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 		up->no_check6_rx = valbool;
 		break;
 
+	case UDP_SEGMENT:
+		if (val < 0 || val > USHRT_MAX)
+			return -EINVAL;
+		up->gso_size = val;
+		break;
+
 	/*
 	 * 	UDP-Lite's partial checksum coverage (RFC 3828).
 	 */
@@ -2457,6 +2478,10 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
 		val = up->no_check6_rx;
 		break;
 
+	case UDP_SEGMENT:
+		val = up->gso_size;
+		break;
+
 	/* The following two cannot be changed on UDP sockets, the return is
 	 * always 0 (which corresponds to the full checksum coverage of UDP). */
 	case UDPLITE_SEND_CSCOV:
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index ea6e6e7df0ee..3378f8dec9c7 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -187,6 +187,50 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
 }
 EXPORT_SYMBOL(skb_udp_tunnel_segment);
 
+struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
+				  netdev_features_t features,
+				  unsigned int mss, __sum16 check)
+{
+	struct udphdr *uh = udp_hdr(gso_skb);
+	struct sk_buff *segs;
+	unsigned int hdrlen;
+
+	if (gso_skb->len <= sizeof(*uh) + mss)
+		return ERR_PTR(-EINVAL);
+
+	uh->len = htons(sizeof(*uh) + mss);
+	uh->check = check;
+	skb_pull(gso_skb, sizeof(*uh));
+	hdrlen = gso_skb->data - skb_mac_header(gso_skb);
+
+	segs = skb_segment(gso_skb, features);
+	if (unlikely(IS_ERR_OR_NULL(segs)))
+		return segs;
+
+	/* If last packet is not full, fix up its header */
+	if (segs->prev->len != hdrlen + mss) {
+		unsigned int mss_last = segs->prev->len - hdrlen;
+
+		uh = udp_hdr(segs->prev);
+		uh->len = htons(sizeof(*uh) + mss_last);
+		csum_replace2(&uh->check, htons(mss), htons(mss_last));
+	}
+
+	return segs;
+}
+
+static struct sk_buff *__udp4_gso_segment(struct sk_buff *gso_skb,
+					  netdev_features_t features)
+{
+	const struct iphdr *iph = ip_hdr(gso_skb);
+	unsigned int mss = skb_shinfo(gso_skb)->gso_size;
+
+	return __udp_gso_segment(gso_skb, features, mss,
+				 udp_v4_check(sizeof(struct udphdr) + mss,
+					      iph->saddr, iph->daddr, 0));
+}
+EXPORT_SYMBOL_GPL(__udp4_gso_segment);
+
 static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
 					 netdev_features_t features)
 {
@@ -209,6 +253,9 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
 	if (!pskb_may_pull(skb, sizeof(struct udphdr)))
 		goto out;
 
+	if (!skb_is_ufo(skb))
+		return __udp4_gso_segment(skb, features);
+
 	mss = skb_shinfo(skb)->gso_size;
 	if (unlikely(skb->len <= mss))
 		goto out;
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 4a87f9428ca5..e59f67c0bf89 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -88,9 +88,10 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 
 	if (skb->encapsulation &&
 	    skb_shinfo(skb)->gso_type & (SKB_GSO_IPXIP4 | SKB_GSO_IPXIP6))
-		udpfrag = proto == IPPROTO_UDP && encap;
+		udpfrag = proto == IPPROTO_UDP && encap && skb_is_ufo(skb);
 	else
-		udpfrag = proto == IPPROTO_UDP && !skb->encapsulation;
+		udpfrag = proto == IPPROTO_UDP && !skb->encapsulation &&
+			  skb_is_ufo(skb);
 
 	ops = rcu_dereference(inet6_offloads[proto]);
 	if (likely(ops && ops->callbacks.gso_segment)) {
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 6751f4c375b9..3ce947c1d173 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1234,6 +1234,8 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
 	if (mtu < IPV6_MIN_MTU)
 		return -EINVAL;
 	cork->base.fragsize = mtu;
+	cork->base.gso_size = sk->sk_type == SOCK_DGRAM ? ipc6->gso_size : 0;
+
 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
 		cork->base.flags |= IPCORK_ALLFRAG;
 	cork->base.length = 0;
@@ -1275,7 +1277,7 @@ static int __ip6_append_data(struct sock *sk,
 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
 	}
 
-	mtu = cork->fragsize;
+	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
 	orig_mtu = mtu;
 
 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 824797f8d1ab..16aee23b1e5f 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1023,7 +1023,8 @@ static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb,
  *	Sending
  */
 
-static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6)
+static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
+			   struct inet_cork *cork)
 {
 	struct sock *sk = skb->sk;
 	struct udphdr *uh;
@@ -1042,6 +1043,19 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6)
 	uh->len = htons(len);
 	uh->check = 0;
 
+	if (cork->gso_size) {
+		const int hlen = skb_network_header_len(skb) +
+				 sizeof(struct udphdr);
+
+		if (hlen + cork->gso_size > cork->fragsize)
+			return -EINVAL;
+		if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite)
+			return -EIO;
+
+		skb_shinfo(skb)->gso_size = cork->gso_size;
+		skb_shinfo(skb)->gso_type = SKB_GSO_UDP | SKB_GSO_UDP_L4;
+	}
+
 	if (is_udplite)
 		csum = udplite_csum(skb);
 	else if (udp_sk(sk)->no_check6_tx) {   /* UDP csum disabled */
@@ -1093,7 +1107,7 @@ static int udp_v6_push_pending_frames(struct sock *sk)
 	if (!skb)
 		goto out;
 
-	err = udp_v6_send_skb(skb, &fl6);
+	err = udp_v6_send_skb(skb, &fl6, &inet_sk(sk)->cork.base);
 
 out:
 	up->len = 0;
@@ -1127,6 +1141,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	ipc6.hlimit = -1;
 	ipc6.tclass = -1;
 	ipc6.dontfrag = -1;
+	ipc6.gso_size = up->gso_size;
 	sockc.tsflags = sk->sk_tsflags;
 
 	/* destination address check */
@@ -1333,7 +1348,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 				   msg->msg_flags, &cork, &sockc);
 		err = PTR_ERR(skb);
 		if (!IS_ERR_OR_NULL(skb))
-			err = udp_v6_send_skb(skb, &fl6);
+			err = udp_v6_send_skb(skb, &fl6, &cork.base);
 		goto out;
 	}
 
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 2a04dc9c781b..7410120fc114 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -17,6 +17,17 @@
 #include <net/ip6_checksum.h>
 #include "ip6_offload.h"
 
+static struct sk_buff *__udp6_gso_segment(struct sk_buff *gso_skb,
+					  netdev_features_t features)
+{
+	const struct ipv6hdr *ip6h = ipv6_hdr(gso_skb);
+	unsigned int mss = skb_shinfo(gso_skb)->gso_size;
+
+	return __udp_gso_segment(gso_skb, features, mss,
+				 udp_v6_check(sizeof(struct udphdr) + mss,
+					      &ip6h->saddr, &ip6h->daddr, 0));
+}
+
 static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 					 netdev_features_t features)
 {
@@ -48,6 +59,9 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 		if (!pskb_may_pull(skb, sizeof(struct udphdr)))
 			goto out;
 
+		if (!skb_is_ufo(skb))
+			return __udp6_gso_segment(skb, features);
+
 		/* Do software UFO. Complete and fill in the UDP checksum as HW cannot
 		 * do checksum of UDP packets sent as multiple IP fragments.
 		 */
-- 
2.17.0.484.g0c8726318c-goog

^ permalink raw reply related

* [PATCH RFC net-next 01/11] udp: expose inet cork to udp
From: Willem de Bruijn @ 2018-04-17 20:00 UTC (permalink / raw)
  To: netdev; +Cc: Willem de Bruijn
In-Reply-To: <20180417200059.30154-1-willemdebruijn.kernel@gmail.com>

From: Willem de Bruijn <willemb@google.com>

UDP segmentation offload needs access to inet_cork in the udp layer.
Pass the struct to ip(6)_make_skb instead of allocating it on the
stack in that function itself.

This patch is a noop otherwise.

Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 include/net/ip.h      |  2 +-
 include/net/ipv6.h    |  1 +
 net/ipv4/ip_output.c  | 17 ++++++++---------
 net/ipv4/udp.c        |  4 +++-
 net/ipv6/ip6_output.c | 20 ++++++++++----------
 net/ipv6/udp.c        |  3 ++-
 6 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index ecffd843e7b8..5a0d2b660506 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -171,7 +171,7 @@ struct sk_buff *ip_make_skb(struct sock *sk, struct flowi4 *fl4,
 					int len, int odd, struct sk_buff *skb),
 			    void *from, int length, int transhdrlen,
 			    struct ipcm_cookie *ipc, struct rtable **rtp,
-			    unsigned int flags);
+			    struct inet_cork *cork, unsigned int flags);
 
 static inline struct sk_buff *ip_finish_skb(struct sock *sk, struct flowi4 *fl4)
 {
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 68b167d98879..0dd722cab037 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -950,6 +950,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
 			     void *from, int length, int transhdrlen,
 			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
 			     struct rt6_info *rt, unsigned int flags,
+			     struct inet_cork_full *cork,
 			     const struct sockcm_cookie *sockc);
 
 static inline struct sk_buff *ip6_finish_skb(struct sock *sk)
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 4c11b810a447..83b14ea16654 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1468,9 +1468,8 @@ struct sk_buff *ip_make_skb(struct sock *sk,
 					int len, int odd, struct sk_buff *skb),
 			    void *from, int length, int transhdrlen,
 			    struct ipcm_cookie *ipc, struct rtable **rtp,
-			    unsigned int flags)
+			    struct inet_cork *cork, unsigned int flags)
 {
-	struct inet_cork cork;
 	struct sk_buff_head queue;
 	int err;
 
@@ -1479,22 +1478,22 @@ struct sk_buff *ip_make_skb(struct sock *sk,
 
 	__skb_queue_head_init(&queue);
 
-	cork.flags = 0;
-	cork.addr = 0;
-	cork.opt = NULL;
-	err = ip_setup_cork(sk, &cork, ipc, rtp);
+	cork->flags = 0;
+	cork->addr = 0;
+	cork->opt = NULL;
+	err = ip_setup_cork(sk, cork, ipc, rtp);
 	if (err)
 		return ERR_PTR(err);
 
-	err = __ip_append_data(sk, fl4, &queue, &cork,
+	err = __ip_append_data(sk, fl4, &queue, cork,
 			       &current->task_frag, getfrag,
 			       from, length, transhdrlen, flags);
 	if (err) {
-		__ip_flush_pending_frames(sk, &queue, &cork);
+		__ip_flush_pending_frames(sk, &queue, cork);
 		return ERR_PTR(err);
 	}
 
-	return __ip_make_skb(sk, fl4, &queue, &cork);
+	return __ip_make_skb(sk, fl4, &queue, cork);
 }
 
 /*
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 24b5c59b1c53..6b9d8017b319 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1030,9 +1030,11 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 	/* Lockless fast path for the non-corking case. */
 	if (!corkreq) {
+		struct inet_cork cork;
+
 		skb = ip_make_skb(sk, fl4, getfrag, msg, ulen,
 				  sizeof(struct udphdr), &ipc, &rt,
-				  msg->msg_flags);
+				  &cork, msg->msg_flags);
 		err = PTR_ERR(skb);
 		if (!IS_ERR_OR_NULL(skb))
 			err = udp_send_skb(skb, fl4);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index a39b04f9fa6e..6751f4c375b9 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1749,9 +1749,9 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
 			     void *from, int length, int transhdrlen,
 			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
 			     struct rt6_info *rt, unsigned int flags,
+			     struct inet_cork_full *cork,
 			     const struct sockcm_cookie *sockc)
 {
-	struct inet_cork_full cork;
 	struct inet6_cork v6_cork;
 	struct sk_buff_head queue;
 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
@@ -1762,27 +1762,27 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
 
 	__skb_queue_head_init(&queue);
 
-	cork.base.flags = 0;
-	cork.base.addr = 0;
-	cork.base.opt = NULL;
-	cork.base.dst = NULL;
+	cork->base.flags = 0;
+	cork->base.addr = 0;
+	cork->base.opt = NULL;
+	cork->base.dst = NULL;
 	v6_cork.opt = NULL;
-	err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
+	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
 	if (err) {
-		ip6_cork_release(&cork, &v6_cork);
+		ip6_cork_release(cork, &v6_cork);
 		return ERR_PTR(err);
 	}
 	if (ipc6->dontfrag < 0)
 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
 
-	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
+	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
 				&current->task_frag, getfrag, from,
 				length + exthdrlen, transhdrlen + exthdrlen,
 				flags, ipc6, sockc);
 	if (err) {
-		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
+		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
 		return ERR_PTR(err);
 	}
 
-	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
+	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
 }
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 4ec76a87aeb8..824797f8d1ab 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1324,12 +1324,13 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 	/* Lockless fast path for the non-corking case */
 	if (!corkreq) {
+		struct inet_cork_full cork;
 		struct sk_buff *skb;
 
 		skb = ip6_make_skb(sk, getfrag, msg, ulen,
 				   sizeof(struct udphdr), &ipc6,
 				   &fl6, (struct rt6_info *)dst,
-				   msg->msg_flags, &sockc);
+				   msg->msg_flags, &cork, &sockc);
 		err = PTR_ERR(skb);
 		if (!IS_ERR_OR_NULL(skb))
 			err = udp_v6_send_skb(skb, &fl6);
-- 
2.17.0.484.g0c8726318c-goog

^ permalink raw reply related

* [net  1/1] tipc: fix infinite loop when dumping link monitor summary
From: Jon Maloy @ 2018-04-17 19:58 UTC (permalink / raw)
  To: davem, netdev
  Cc: mohan.krishna.ghanta.krishnamurthy, tung.q.nguyen, hoang.h.le,
	jon.maloy, canh.d.luu, ying.xue, tipc-discussion

From: Tung Nguyen <tung.q.nguyen@dektech.com.au>

When configuring the number of used bearers to MAX_BEARER and issuing
command "tipc link monitor summary", the command enters an infinite loop
in user space.

This issue happens because function tipc_nl_node_dump_monitor() returns
the wrong 'prev_bearer' value when all potential monitors have been
scanned.

The correct behavior is to always try to scan all monitors until either
the netlink message is full, in which case we return the bearer identity
of the affected monitor, or we continue through the whole bearer array
until we can return MAX_BEARERS. This solution also caters for the case
where there may be gaps in the bearer array.

Signed-off-by: Tung Nguyen <tung.q.nguyen@dektech.com.au>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
---
 net/tipc/monitor.c |  2 +-
 net/tipc/node.c    | 11 ++++-------
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c
index 32dc33a..5453e56 100644
--- a/net/tipc/monitor.c
+++ b/net/tipc/monitor.c
@@ -777,7 +777,7 @@ int __tipc_nl_add_monitor(struct net *net, struct tipc_nl_msg *msg,
 
 	ret = tipc_bearer_get_name(net, bearer_name, bearer_id);
 	if (ret || !mon)
-		return -EINVAL;
+		return 0;
 
 	hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
 			  NLM_F_MULTI, TIPC_NL_MON_GET);
diff --git a/net/tipc/node.c b/net/tipc/node.c
index c77dd2f..6f98b56 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -2232,8 +2232,8 @@ int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb)
 	struct net *net = sock_net(skb->sk);
 	u32 prev_bearer = cb->args[0];
 	struct tipc_nl_msg msg;
+	int bearer_id;
 	int err;
-	int i;
 
 	if (prev_bearer == MAX_BEARERS)
 		return 0;
@@ -2243,16 +2243,13 @@ int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb)
 	msg.seq = cb->nlh->nlmsg_seq;
 
 	rtnl_lock();
-	for (i = prev_bearer; i < MAX_BEARERS; i++) {
-		prev_bearer = i;
+	for (bearer_id = prev_bearer; bearer_id < MAX_BEARERS; bearer_id++) {
 		err = __tipc_nl_add_monitor(net, &msg, prev_bearer);
 		if (err)
-			goto out;
+			break;
 	}
-
-out:
 	rtnl_unlock();
-	cb->args[0] = prev_bearer;
+	cb->args[0] = bearer_id;
 
 	return skb->len;
 }
-- 
2.1.4

^ permalink raw reply related

* [net  1/1] tipc: fix use-after-free in tipc_nametbl_stop
From: Jon Maloy @ 2018-04-17 19:25 UTC (permalink / raw)
  To: davem, netdev
  Cc: mohan.krishna.ghanta.krishnamurthy, tung.q.nguyen, hoang.h.le,
	jon.maloy, canh.d.luu, ying.xue, tipc-discussion

When we delete a service item in tipc_nametbl_stop() we loop over
all service ranges in the service's RB tree, and for each service
range we loop over its pertaining publications while calling
tipc_service_remove_publ() for each of them.

However, tipc_service_remove_publ() has the side effect that it also
removes the comprising service range item when there are no publications
left. This leads to a "use-after-free" access when the inner loop
continues to the next iteration, since the range item holding the list
we are looping no longer exists.

We fix this by moving the delete of the service range item outside
the said function. Instead, we now let the two functions calling it
test if the list is empty and perform the removal when that is the
case.

Reported-by: syzbot+d64b64afc55660106556@syzkaller.appspotmail.com
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
---
 net/tipc/name_table.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 4068eaa..dd1c4fa 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -241,7 +241,8 @@ static struct publication *tipc_service_insert_publ(struct net *net,
 static struct publication *tipc_service_remove_publ(struct net *net,
 						    struct tipc_service *sc,
 						    u32 lower, u32 upper,
-						    u32 node, u32 key)
+						    u32 node, u32 key,
+						    struct service_range **rng)
 {
 	struct tipc_subscription *sub, *tmp;
 	struct service_range *sr;
@@ -275,19 +276,15 @@ static struct publication *tipc_service_remove_publ(struct net *net,
 
 	list_del(&p->all_publ);
 	list_del(&p->local_publ);
-
-	/* Remove service range item if this was its last publication */
-	if (list_empty(&sr->all_publ)) {
+	if (list_empty(&sr->all_publ))
 		last = true;
-		rb_erase(&sr->tree_node, &sc->ranges);
-		kfree(sr);
-	}
 
 	/* Notify any waiting subscriptions */
 	list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) {
 		tipc_sub_report_overlap(sub, p->lower, p->upper, TIPC_WITHDRAWN,
 					p->port, p->node, p->scope, last);
 	}
+	*rng = sr;
 	return p;
 }
 
@@ -379,13 +376,20 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
 					     u32 node, u32 key)
 {
 	struct tipc_service *sc = tipc_service_find(net, type);
+	struct service_range *sr = NULL;
 	struct publication *p = NULL;
 
 	if (!sc)
 		return NULL;
 
 	spin_lock_bh(&sc->lock);
-	p = tipc_service_remove_publ(net, sc, lower, upper, node, key);
+	p = tipc_service_remove_publ(net, sc, lower, upper, node, key, &sr);
+
+	/* Remove service range item if this was its last publication */
+	if (sr && list_empty(&sr->all_publ)) {
+		rb_erase(&sr->tree_node, &sc->ranges);
+		kfree(sr);
+	}
 
 	/* Delete service item if this no more publications and subscriptions */
 	if (RB_EMPTY_ROOT(&sc->ranges) && list_empty(&sc->subscriptions)) {
@@ -747,16 +751,17 @@ int tipc_nametbl_init(struct net *net)
 static void tipc_service_delete(struct net *net, struct tipc_service *sc)
 {
 	struct service_range *sr, *tmpr;
-	struct publication *p, *tmpb;
+	struct publication *p, *tmp;
 
 	spin_lock_bh(&sc->lock);
 	rbtree_postorder_for_each_entry_safe(sr, tmpr, &sc->ranges, tree_node) {
-		list_for_each_entry_safe(p, tmpb,
-					 &sr->all_publ, all_publ) {
+		list_for_each_entry_safe(p, tmp, &sr->all_publ, all_publ) {
 			tipc_service_remove_publ(net, sc, p->lower, p->upper,
-						 p->node, p->key);
+						 p->node, p->key, &sr);
 			kfree_rcu(p, rcu);
 		}
+		rb_erase(&sr->tree_node, &sc->ranges);
+		kfree(sr);
 	}
 	hlist_del_init_rcu(&sc->service_list);
 	spin_unlock_bh(&sc->lock);
-- 
2.1.4

^ permalink raw reply related

* Re: [PATCH 08/10] net: ax88796: Make reset more robust on AX88796B
From: Michael Schmitz @ 2018-04-17 19:25 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: Andrew Lunn, Michael Karcher, netdev, Linux/m68k,
	John Paul Adrian Glaubitz, Michael Karcher
In-Reply-To: <4e18a2db-b2e5-2000-b1c3-4c6ddcc0d3ed@gmail.com>

Thanks Florian,

I'll keep the Asix PHY driver separate from ax88796 for now. Mainly to
simplify testing. Let's see whether it can be used by any other MAC -
can still fold it into ax88796 later.

Cheers,

  Michael


On Wed, Apr 18, 2018 at 6:08 AM, Florian Fainelli <f.fainelli@gmail.com> wrote:
> On 04/17/2018 06:01 AM, Andrew Lunn wrote:
>> On Tue, Apr 17, 2018 at 07:18:10AM +0200, Michael Karcher wrote:
>>> [Andrew, sorry for the dup. I did hit reply-to-author instead of
>>> reply-to-all first.]
>>>
>>> Andrew Lunn schrieb:
>>>>>> This should really be fixed in the PHY driver, not the MAC.
>>>>>
>>>>> OK - do you want this separate, or as part of this series? Might have
>>>>> a few side effects on more commonly used hardware, perhaps?
>>>>
>>>> Hi Michael
>>>>
>>>> What PHY driver is used?
>>> The ax88796b comes with its own integrated (buggy) PHY needing this
>>> workaround. This PHY has its own ID which is not known by Linux, so it is
>>> using the genphy driver as fallback.
>>>
>>>> In the driver you can implement a .soft_reset
>>>> function which first does the dummy write, and then uses
>>>> genphy_soft_reset() to do the actual reset.
>>> We could do that - but I don't see the point in creating a PHY driver
>>> that is only ever used by this MAC driver, just to add a single line to
>>> the genphy driver. If the same PHY might be used with a different MAC,
>>> you definitely would have a point there, though.
>>
>>
>> Hi Michael
>>
>> We try to keep the core code clean, and put all workarounds for buggy
>> hardware in drivers specific to them. It just helps keep the core code
>> maintainable.
>>
>> I would prefer a driver specific to this PHY with the workaround. But
>> lets see what Florian says.
>
> If you are already using the generic PHY driver, coming up with a custom
> one that only overrides the soft_reset and/or config_init callback is
> really not that much work, and as Andrew says, it helps make things
> clearer and properly isolated. As far as where to place that driver, you
> can either create a new file under drivers/net/phy/* or you can even
> register a phy_driver instance from within ax88796 if that makes it any
> clearer.
>
> FWIW, there are plenty of examples where there is a PHY driver used by a
> single MAC, and that is perfectly fine, because the abstraction is still
> preserved.
> --
> Florian

^ permalink raw reply

* Re: [PATCH net-next 2/4] net/smc: handle sockopt TCP_NODELAY
From: David Miller @ 2018-04-17 19:23 UTC (permalink / raw)
  To: ubraun; +Cc: netdev, linux-s390, schwidefsky, heiko.carstens, raspl, ubraun
In-Reply-To: <20180417151815.77191-3-ubraun@linux.ibm.com>

From: Ursula Braun <ubraun@linux.ibm.com>
Date: Tue, 17 Apr 2018 17:18:13 +0200

> From: Ursula Braun <ubraun@linux.vnet.ibm.com>
> 
> TCP sockopts must not interfere with the CLC handshake on the
> CLC socket. Therefore, we defer some of them till the CLC
> handshake has completed, like resetting TCP_NODELAY.
> 
> While touching setsockopt, the TCP_FASTOPEN sockopts are
> ignored, since SMC-connection setup is based on the TCP
> three-way-handshake.
> 
> Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>

Especially with fast-open, if you don't support the socket option
you should signal an error.

Also, the deferral mechanism means you'll never properly propagate
any errors back to the user.  The kernel_setsockopt() return value
isn't validated at all.

Color me not impressed at all with this change.

These kinds of things have to have clear and proper semantics, plus
full error processing with proper propagation back to the user.

^ permalink raw reply

* Re: [PATCH RESEND net v2] KEYS: DNS: limit the length of option strings
From: David Miller @ 2018-04-17 19:18 UTC (permalink / raw)
  To: ebiggers3; +Cc: netdev, keyrings, mark.rutland, ebiggers
In-Reply-To: <20180417190706.217384-1-ebiggers3@gmail.com>

From: Eric Biggers <ebiggers3@gmail.com>
Date: Tue, 17 Apr 2018 12:07:06 -0700

> From: Eric Biggers <ebiggers@google.com>
> 
> Adding a dns_resolver key whose payload contains a very long option name
> resulted in that string being printed in full.  This hit the WARN_ONCE()
> in set_precision() during the printk(), because printk() only supports a
> precision of up to 32767 bytes:
> 
>     precision 1000000 too large
>     WARNING: CPU: 0 PID: 752 at lib/vsprintf.c:2189 vsnprintf+0x4bc/0x5b0
> 
> Fix it by limiting option strings (combined name + value) to a much more
> reasonable 128 bytes.  The exact limit is arbitrary, but currently the
> only recognized option is formatted as "dnserror=%lu" which fits well
> within this limit.
> 
> Also ratelimit the printks.
> 
> Reproducer:
> 
>     perl -e 'print "#", "A" x 1000000, "\x00"' | keyctl padd dns_resolver desc @s
> 
> This bug was found using syzkaller.
> 
> Reported-by: Mark Rutland <mark.rutland@arm.com>
> Fixes: 4a2d789267e0 ("DNS: If the DNS server returns an error, allow that to be cached [ver #2]")
> Signed-off-by: Eric Biggers <ebiggers@google.com>

Applied and queued up for -stable, thanks.

^ permalink raw reply

* Re: [PATCHv2 net-next] vxlan: add ttl inherit support
From: David Miller @ 2018-04-17 19:16 UTC (permalink / raw)
  To: liuhangbin; +Cc: netdev, jbenc, lucien.xin, sbrivio
In-Reply-To: <1523969574-3279-1-git-send-email-liuhangbin@gmail.com>

From: Hangbin Liu <liuhangbin@gmail.com>
Date: Tue, 17 Apr 2018 20:52:54 +0800

> Like tos inherit, ttl inherit should also mean inheriting the inner protocol's
> ttl value, which is actually not implemented in vxlan yet.
> 
> But we could not treat ttl == 0 as "use the inner TTL", because that would be
> used also when the "ttl" option is not specified and that would be a behavior
> change, and breaking real use cases.
> 
> So add a different attribute IFLA_VXLAN_TTL_INHERIT when "ttl inherit" is
> specified.
> 
> ---
> v2: As suggested by Stefano, clean up function ip_tunnel_get_ttl().
> 
> Suggested-by: Jiri Benc <jbenc@redhat.com>
> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>

I already applied V1 of your patch.

Furthermore, this commit message would cause your signoffs and other tags
to be removed due to the "---" delimiter.

I generally encourage people to leave the change history text _in_ the
commit message anyways.  It is useful information for the future.

^ permalink raw reply

* Re: [PATCH net] sfc: check RSS is active for filter insert
From: David Miller @ 2018-04-17 19:13 UTC (permalink / raw)
  To: bkenward; +Cc: netdev, linux-net-drivers
In-Reply-To: <3b70d5bb-fef4-466b-3b40-fb76b0a55659@solarflare.com>

From: Bert Kenward <bkenward@solarflare.com>
Date: Tue, 17 Apr 2018 13:32:39 +0100

> For some firmware variants - specifically 'capture packed stream' - RSS
> filters are not valid. We must check if RSS is actually active rather
> than merely enabled.
> 
> Fixes: 42356d9a137b ("sfc: support RSS spreading of ethtool ntuple filters")
> Signed-off-by: Bert Kenward <bkenward@solarflare.com>

Applied, thanks.

^ permalink raw reply

* [PATCH RESEND net v2] KEYS: DNS: limit the length of option strings
From: Eric Biggers @ 2018-04-17 19:07 UTC (permalink / raw)
  To: netdev, David S . Miller; +Cc: keyrings, Mark Rutland, Eric Biggers

From: Eric Biggers <ebiggers@google.com>

Adding a dns_resolver key whose payload contains a very long option name
resulted in that string being printed in full.  This hit the WARN_ONCE()
in set_precision() during the printk(), because printk() only supports a
precision of up to 32767 bytes:

    precision 1000000 too large
    WARNING: CPU: 0 PID: 752 at lib/vsprintf.c:2189 vsnprintf+0x4bc/0x5b0

Fix it by limiting option strings (combined name + value) to a much more
reasonable 128 bytes.  The exact limit is arbitrary, but currently the
only recognized option is formatted as "dnserror=%lu" which fits well
within this limit.

Also ratelimit the printks.

Reproducer:

    perl -e 'print "#", "A" x 1000000, "\x00"' | keyctl padd dns_resolver desc @s

This bug was found using syzkaller.

Reported-by: Mark Rutland <mark.rutland@arm.com>
Fixes: 4a2d789267e0 ("DNS: If the DNS server returns an error, allow that to be cached [ver #2]")
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 net/dns_resolver/dns_key.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index 8396705deffc..40c851693f77 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -91,9 +91,9 @@ dns_resolver_preparse(struct key_preparsed_payload *prep)

 			next_opt = memchr(opt, '#', end - opt) ?: end;
 			opt_len = next_opt - opt;
-			if (!opt_len) {
-				printk(KERN_WARNING
-				       "Empty option to dns_resolver key\n");
+			if (opt_len <= 0 || opt_len > 128) {
+				pr_warn_ratelimited("Invalid option length (%d) for dns_resolver key\n",
+						    opt_len);
 				return -EINVAL;
 			}

@@ -127,10 +127,8 @@ dns_resolver_preparse(struct key_preparsed_payload *prep)
 			}

 		bad_option_value:
-			printk(KERN_WARNING
-			       "Option '%*.*s' to dns_resolver key:"
-			       " bad/missing value\n",
-			       opt_nlen, opt_nlen, opt);
+			pr_warn_ratelimited("Option '%*.*s' to dns_resolver key: bad/missing value\n",
+					    opt_nlen, opt_nlen, opt);
 			return -EINVAL;
 		} while (opt = next_opt + 1, opt < end);
 	}
-- 
2.17.0.484.g0c8726318c-goog

^ permalink raw reply related

* Re: [PATCH net-next 1/5] virtio: Add support for SCTP checksum offloading
From: Vlad Yasevich @ 2018-04-17 19:06 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Vladislav Yasevich, netdev, linux-sctp, virtualization, jasowang,
	nhorman
In-Reply-To: <20180416200743-mutt-send-email-mst@kernel.org>

On 04/16/2018 01:09 PM, Michael S. Tsirkin wrote:
> On Mon, Apr 16, 2018 at 09:45:48AM -0400, Vlad Yasevich wrote:
>> On 04/11/2018 06:49 PM, Michael S. Tsirkin wrote:
>>> On Mon, Apr 02, 2018 at 09:40:02AM -0400, Vladislav Yasevich wrote:
>>>> To support SCTP checksum offloading, we need to add a new feature
>>>> to virtio_net, so we can negotiate support between the hypervisor
>>>> and the guest.
>>>>
>>>> The signalling to the guest that an alternate checksum needs to
>>>> be used is done via a new flag in the virtio_net_hdr.  If the
>>>> flag is set, the host will know to perform an alternate checksum
>>>> calculation, which right now is only CRC32c.
>>>>
>>>> Signed-off-by: Vladislav Yasevich <vyasevic@redhat.com>
>>>> ---
>>>>  drivers/net/virtio_net.c        | 11 ++++++++---
>>>>  include/linux/virtio_net.h      |  6 ++++++
>>>>  include/uapi/linux/virtio_net.h |  2 ++
>>>>  3 files changed, 16 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>>>> index 7b187ec..b601294 100644
>>>> --- a/drivers/net/virtio_net.c
>>>> +++ b/drivers/net/virtio_net.c
>>>> @@ -2724,9 +2724,14 @@ static int virtnet_probe(struct virtio_device *vdev)
>>>>  	/* Do we support "hardware" checksums? */
>>>>  	if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
>>>>  		/* This opens up the world of extra features. */
>>>> -		dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
>>>> +		netdev_features_t sctp = 0;
>>>> +
>>>> +		if (virtio_has_feature(vdev, VIRTIO_NET_F_SCTP_CSUM))
>>>> +			sctp |= NETIF_F_SCTP_CRC;
>>>> +
>>>> +		dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG | sctp;
>>>>  		if (csum)
>>>> -			dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;
>>>> +			dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG | sctp;
>>>>  
>>>>  		if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
>>>>  			dev->hw_features |= NETIF_F_TSO
>>>> @@ -2952,7 +2957,7 @@ static struct virtio_device_id id_table[] = {
>>>>  };
>>>>  
>>>>  #define VIRTNET_FEATURES \
>>>> -	VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
>>>> +	VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM,  VIRTIO_NET_F_SCTP_CSUM, \
>>>>  	VIRTIO_NET_F_MAC, \
>>>>  	VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
>>>>  	VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \
>>>> diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
>>>> index f144216..2e7a64a 100644
>>>> --- a/include/linux/virtio_net.h
>>>> +++ b/include/linux/virtio_net.h
>>>> @@ -39,6 +39,9 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
>>>>  
>>>>  		if (!skb_partial_csum_set(skb, start, off))
>>>>  			return -EINVAL;
>>>> +
>>>> +		if (hdr->flags & VIRTIO_NET_HDR_F_CSUM_NOT_INET)
>>>> +			skb->csum_not_inet = 1;
>>>>  	}
>>>>  
>>>>  	if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
>>>> @@ -96,6 +99,9 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb,
>>>>  		hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID;
>>>>  	} /* else everything is zero */
>>>>  
>>>> +	if (skb->csum_not_inet)
>>>> +		hdr->flags &= VIRTIO_NET_HDR_F_CSUM_NOT_INET;
>>>> +
>>>>  	return 0;
>>>>  }
>>>>  
>>>> diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
>>>> index 5de6ed3..3f279c8 100644
>>>> --- a/include/uapi/linux/virtio_net.h
>>>> +++ b/include/uapi/linux/virtio_net.h
>>>> @@ -36,6 +36,7 @@
>>>>  #define VIRTIO_NET_F_GUEST_CSUM	1	/* Guest handles pkts w/ partial csum */
>>>>  #define VIRTIO_NET_F_CTRL_GUEST_OFFLOADS 2 /* Dynamic offload configuration. */
>>>>  #define VIRTIO_NET_F_MTU	3	/* Initial MTU advice */
>>>> +#define VIRTIO_NET_F_SCTP_CSUM  4	/* SCTP checksum offload support */
>>>>  #define VIRTIO_NET_F_MAC	5	/* Host has given MAC address. */
>>>>  #define VIRTIO_NET_F_GUEST_TSO4	7	/* Guest can handle TSOv4 in. */
>>>>  #define VIRTIO_NET_F_GUEST_TSO6	8	/* Guest can handle TSOv6 in. */
>>>
>>> Is this a guest or a host checksum? We should differentiate between the
>>> two.
>>
>> I suppose this is HOST checksum, since it behaves like VIRTIO_NET_F_CSUM only for
>> SCTP.  I couldn't find the use for the GUEST side flag, since there technically
>> aren't any validations and there are no additional mappings from a VIRTIO flag to a
>> NETIF flag.
>>
>> If the feature is negotiated, the guest ends up generating partially check-summed
>> packets, and the host turns on appropriate flags on its side.   The host will
>> also make sure the checksum is fixed up if the guest doesn't support it.
>> So 1 flag is currently all that is needed.
>>
>> -vlad
> 
> I see code handling VIRTIO_NET_HDR_F_CSUM_NOT_INET on RX side.  Host
> needs to know whether it's ok/worth it to set this flag, too.

I think I understand. I have to re-consider outside of the context of
Linux behavior.

-vlad

> 
>>>
>>>
>>>> @@ -101,6 +102,7 @@ struct virtio_net_config {
>>>>  struct virtio_net_hdr_v1 {
>>>>  #define VIRTIO_NET_HDR_F_NEEDS_CSUM	1	/* Use csum_start, csum_offset */
>>>>  #define VIRTIO_NET_HDR_F_DATA_VALID	2	/* Csum is valid */
>>>> +#define VIRTIO_NET_HDR_F_CSUM_NOT_INET  4       /* Checksum is not inet */
>>>>  	__u8 flags;
>>>>  #define VIRTIO_NET_HDR_GSO_NONE		0	/* Not a GSO frame */
>>>>  #define VIRTIO_NET_HDR_GSO_TCPV4	1	/* GSO frame, IPv4 TCP (TSO) */
>>>> -- 
>>>> 2.9.5

^ permalink raw reply

* Re: [PATCH RESEND net-next] ipv6: provide Kconfig switch to disable accept_ra by default
From: David Miller @ 2018-04-17 19:05 UTC (permalink / raw)
  To: mschiffer; +Cc: netdev, kuznet, yoshfuji, linux-kernel
In-Reply-To: <b39b1b4304524c43dfd83280aea4cfb4e8ac97f5.1523959377.git.mschiffer@universe-factory.net>

From: Matthias Schiffer <mschiffer@universe-factory.net>
Date: Tue, 17 Apr 2018 12:04:50 +0200

> Many distributions and users prefer to handle router advertisements in
> userspace; one example is OpenWrt, which includes a combined RA and DHCPv6
> client. For such configurations, accept_ra should not be enabled by
> default.
> 
> As setting net.ipv6.conf.default.accept_ra via sysctl.conf or similar
> facilities may be too late to catch all interfaces and common sysctl.conf
> tools do not allow setting an option for all existing interfaces, this
> patch provides a Kconfig option to control the default value of
> default.accept_ra.
> 
> Using default.accept_ra is preferable to all.accept_ra for our usecase,
> as disabling all.accept_ra would preclude users from explicitly enabling
> accept_ra on individual interfaces.
> 
> Signed-off-by: Matthias Schiffer <mschiffer@universe-factory.net>

This kind of Kconfig option makes an irreversible behavior choice at
compile time, which satisfies one set of users whilst completely
precluding another set.

So no matter what setting a distribution chooses, one set of users is
basically out of luck.

For that reason I consider this a poor approach.

I hate module options, but at least with that the user can choose
regardless of what the distribution maker decided to do with their
kernel.

This is essentially how we handle "disable_ipv6".

I'm not applying this, because it is as much of a step backwards as
it is a step forward, sorry.

^ permalink raw reply

* Re: [PATCH RESEND net-next v2] KEYS: DNS: limit the length of option strings
From: David Miller @ 2018-04-17 18:52 UTC (permalink / raw)
  To: ebiggers3; +Cc: netdev, keyrings, mark.rutland, ebiggers
In-Reply-To: <20180417183736.GC9237@gmail.com>

From: Eric Biggers <ebiggers3@gmail.com>
Date: Tue, 17 Apr 2018 11:37:36 -0700

> On Tue, Apr 17, 2018 at 02:24:37PM -0400, David Miller wrote:
>> From: Eric Biggers <ebiggers3@gmail.com>
>> Date: Tue, 17 Apr 2018 11:23:40 -0700
>> 
>> > Can you queue this up for stable too?  syzbot has been hitting this on older
>> > kernel versions.
>> 
>> If you want a patch bound for stable, it must show up in Linus's tree
>> first which means you should target 'net' rather than 'net-next'.
> 
> Okay, can you move the patch there, or do I need to resend, or is it too late
> already?  It's a clean cherry-pick.  Sorry, I'm not too familiar with the quirks
> of net and netdev -- most other maintainers do things differently.

It's already in net-next, so submit another copy based on net.

^ permalink raw reply

* Re: [PATCH 04/10] net: ax88796: Add block_input/output hooks to ax_plat_data
From: kbuild test robot @ 2018-04-17 18:46 UTC (permalink / raw)
  To: Michael Schmitz
  Cc: kbuild-all, netdev, linux-m68k, Michael.Karcher, Michael Schmitz,
	Michael Karcher
In-Reply-To: <1523916285-6057-5-git-send-email-schmitzmic@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 2005 bytes --]

Hi Michael,

I love your patch! Perhaps something to improve:

[auto build test WARNING on v4.16]
[cannot apply to net-next/master net/master v4.17-rc1 next-20180417]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Michael-Schmitz/New-network-driver-for-Amiga-X-Surf-100-m68k/20180417-141150
config: arm-samsung (attached as .config)
compiler: arm-linux-gnueabi-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=arm 

All warnings (new ones prefixed by >>):

   In file included from arch/arm/mach-s3c24xx/mach-anubis.c:42:0:
>> include/net/ax88796.h:35:11: warning: 'struct sk_buff' declared inside parameter list will not be visible outside of this definition or declaration
       struct sk_buff *skb, int ring_offset);
              ^~~~~~~

vim +35 include/net/ax88796.h

    20	
    21	struct ax_plat_data {
    22		unsigned int	 flags;
    23		unsigned char	 wordlength;	/* 1 or 2 */
    24		unsigned char	 dcr_val;	/* default value for DCR */
    25		unsigned char	 rcr_val;	/* default value for RCR */
    26		unsigned char	 gpoc_val;	/* default value for GPOC */
    27		u32		*reg_offsets;	/* register offsets */
    28		u8		*mac_addr;	/* MAC addr (only used when
    29						   AXFLG_MAC_FROMPLATFORM is used */
    30	
    31		/* uses default ax88796 buffer if set to NULL */
    32		void (*block_output)(struct net_device *dev, int count,
    33				const unsigned char *buf, int star_page);
    34		void (*block_input)(struct net_device *dev, int count,
  > 35				struct sk_buff *skb, int ring_offset);
    36	};
    37	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 22607 bytes --]

^ permalink raw reply

* Re: [PATCH RESEND net-next v2] KEYS: DNS: limit the length of option strings
From: Eric Biggers @ 2018-04-17 18:37 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, keyrings, mark.rutland, ebiggers
In-Reply-To: <20180417.142437.1105065334888296683.davem@davemloft.net>

On Tue, Apr 17, 2018 at 02:24:37PM -0400, David Miller wrote:
> From: Eric Biggers <ebiggers3@gmail.com>
> Date: Tue, 17 Apr 2018 11:23:40 -0700
> 
> > Can you queue this up for stable too?  syzbot has been hitting this on older
> > kernel versions.
> 
> If you want a patch bound for stable, it must show up in Linus's tree
> first which means you should target 'net' rather than 'net-next'.

Okay, can you move the patch there, or do I need to resend, or is it too late
already?  It's a clean cherry-pick.  Sorry, I'm not too familiar with the quirks
of net and netdev -- most other maintainers do things differently.

Thanks,

Eric

^ permalink raw reply

* Re: [PATCH net-next 1/3] net: phy: Add binding for vendor specific C45 MDIO address space
From: Florian Fainelli @ 2018-04-17 18:29 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: vicentiu.galanopulo, robh, netdev, linux-kernel, mark.rutland,
	davem, marcel, devicetree, alexandru.marginean, madalin.bucur
In-Reply-To: <20180417182741.GC6688@lunn.ch>

On 04/17/2018 11:27 AM, Andrew Lunn wrote:
> On Tue, Apr 17, 2018 at 11:18:20AM -0700, Florian Fainelli wrote:
>> On 04/17/2018 02:02 AM, Vicentiu Galanopulo wrote:
>>> The extra property enables the discovery on the MDIO bus
>>> of the PHYs which have a vendor specific address space
>>> for accessing the C45 MDIO registers.
>>>
>>> Signed-off-by: Vicentiu Galanopulo <vicentiu.galanopulo@nxp.com>
>>> ---
>>>  Documentation/devicetree/bindings/net/phy.txt | 6 ++++++
>>>  1 file changed, 6 insertions(+)
>>>
>>> diff --git a/Documentation/devicetree/bindings/net/phy.txt b/Documentation/devicetree/bindings/net/phy.txt
>>> index d2169a5..82692e2 100644
>>> --- a/Documentation/devicetree/bindings/net/phy.txt
>>> +++ b/Documentation/devicetree/bindings/net/phy.txt
>>> @@ -61,6 +61,11 @@ Optional Properties:
>>>  - reset-deassert-us: Delay after the reset was deasserted in microseconds.
>>>    If this property is missing the delay will be skipped.
>>>  
>>> +- dev-addr: If set, it indicates the device address of the PHY to be used
>>> +  when accessing the C45 PHY registers over MDIO. It is used for vendor specific
>>> +  register space addresses that do not conform to standard address for the MDIO
>>> +  registers (e.g. MMD30)
>>
>> Rob made that comment earlier, and I have to ask again now, why don't we
>> have the Clause 45 PHY binding be modified such that you have a reg
>> property that has #address-size = 2? This should be entirely backwards
>> compatible, but it would allow you to specify that device address in a
>> more traditional way.
> 
> Hi Florian
> 
> I think we might get into trouble when we have both c22 and c45 on the
> same bus. Two different reg formats. I would have to try it and see to
> be sure.

Hum indeed, we would no longer be able to mix and match on the same MDIO
bus, unless we give C22 PHYs a "fake" second cell. Disregard that idea
then, and let's stick with 'dev-addr'.
-- 
Florian

^ permalink raw reply

* Re: [PATCH net-next 1/3] net: phy: Add binding for vendor specific C45 MDIO address space
From: Andrew Lunn @ 2018-04-17 18:27 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: vicentiu.galanopulo, robh, netdev, linux-kernel, mark.rutland,
	davem, marcel, devicetree, alexandru.marginean, madalin.bucur
In-Reply-To: <bc00cc86-2293-0fd2-4e98-c516f59f97ce@gmail.com>

On Tue, Apr 17, 2018 at 11:18:20AM -0700, Florian Fainelli wrote:
> On 04/17/2018 02:02 AM, Vicentiu Galanopulo wrote:
> > The extra property enables the discovery on the MDIO bus
> > of the PHYs which have a vendor specific address space
> > for accessing the C45 MDIO registers.
> > 
> > Signed-off-by: Vicentiu Galanopulo <vicentiu.galanopulo@nxp.com>
> > ---
> >  Documentation/devicetree/bindings/net/phy.txt | 6 ++++++
> >  1 file changed, 6 insertions(+)
> > 
> > diff --git a/Documentation/devicetree/bindings/net/phy.txt b/Documentation/devicetree/bindings/net/phy.txt
> > index d2169a5..82692e2 100644
> > --- a/Documentation/devicetree/bindings/net/phy.txt
> > +++ b/Documentation/devicetree/bindings/net/phy.txt
> > @@ -61,6 +61,11 @@ Optional Properties:
> >  - reset-deassert-us: Delay after the reset was deasserted in microseconds.
> >    If this property is missing the delay will be skipped.
> >  
> > +- dev-addr: If set, it indicates the device address of the PHY to be used
> > +  when accessing the C45 PHY registers over MDIO. It is used for vendor specific
> > > +  register space addresses that do not conform to standard address for the MDIO
> > +  registers (e.g. MMD30)
> 
> Rob made that comment earlier, and I have to ask again now, why don't we
> have the Clause 45 PHY binding be modified such that you have a reg
> property that has #address-size = 2? This should be entirely backwards
> compatible, but it would allow you to specify that device address in a
> more traditional way.

Hi Florian

I think we might get into trouble when we have both c22 and c45 on the
same bus. Two different reg formats. I would have to try it and see to
be sure.

     Andrew

^ permalink raw reply

* Re: [PATCH net-next 3/3] net: phy: Enable C45 PHYs with vendor specific address space
From: Florian Fainelli @ 2018-04-17 18:27 UTC (permalink / raw)
  To: vicentiu.galanopulo, andrew, robh, netdev, linux-kernel,
	mark.rutland, davem, marcel, devicetree
  Cc: alexandru.marginean, madalin.bucur
In-Reply-To: <20180417090233.21548-4-vicentiu.galanopulo@nxp.com>

On 04/17/2018 02:02 AM, Vicentiu Galanopulo wrote:
> A search of the dev-addr property is done in of_mdiobus_register.
> If the property is found in the PHY node, of_mdiobus_register_vend_spec_phy()
> is called. This is a wrapper function for of_mdiobus_register_phy()
> which finds the device in package based on dev-addr, and fills
> devices_addrs, which is a new field added to phy_c45_device_ids.
> This new field will store the dev-addr property on the same index
> where the device in package has been found.
> 
> The of_mdiobus_register_phy() now contains an extra parameter,
> which is struct phy_c45_device_ids *c45_ids.
> If c45_ids is not NULL, get_vend_spec_addr_phy_device() is called
> and c45_ids are propagated all the way to get_phy_c45_ids().
> 
> Having dev-addr stored in devices_addrs, in get_phy_c45_ids(),
> when probing the identifiers, dev-addr can be extracted from
> devices_addrs and probed if devices_addrs[current_identifier] is not 0.

I must clearly be missing something, but why are you introducing all
these conditionals instead of updating the existing code to be able to
operate against an arbitrary dev-addr value, and then just making sure
the first thing you do is fetch that property from Device Tree? There is
no way someone is going to be testing with your specific use case in the
future (except yourselves) so unless you make supporting an arbitrary
"dev-addr" value become part of how the code works, this is going to be
breaking badly.

And please, can you keep me copied for next submissions?

> 
> Signed-off-by: Vicentiu Galanopulo <vicentiu.galanopulo@nxp.com>
> ---
>  drivers/net/phy/phy_device.c |  49 +++++++++++++++++--
>  drivers/of/of_mdio.c         | 113 +++++++++++++++++++++++++++++++++++++++++--
>  include/linux/phy.h          |  14 ++++++
>  3 files changed, 169 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
> index ac23322..5c79fd8 100644
> --- a/drivers/net/phy/phy_device.c
> +++ b/drivers/net/phy/phy_device.c
> @@ -457,7 +457,7 @@ static int get_phy_c45_devs_in_pkg(struct mii_bus *bus, int addr, int dev_addr,
>  static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
>  			   struct phy_c45_device_ids *c45_ids) {
>  	int phy_reg;
> -	int i, reg_addr;
> +	int i, reg_addr, dev_addr;
>  	const int num_ids = ARRAY_SIZE(c45_ids->device_ids);
>  	u32 *devs = &c45_ids->devices_in_package;
>  
> @@ -493,13 +493,23 @@ static int get_phy_c45_ids(struct mii_bus *bus, int addr, u32 *phy_id,
>  		if (!(c45_ids->devices_in_package & (1 << i)))
>  			continue;
>  
> -		reg_addr = MII_ADDR_C45 | i << 16 | MII_PHYSID1;
> +		/* if c45_ids->devices_addrs for the current id is not 0,
> +		 * then dev-addr was defined in the PHY device tree node,
> +		 * and the PHY has been seen as a valid device, and added
> +		 * in the package. In this case we can use the
> +		 * dev-addr(c45_ids->devices_addrs[i]) to do the MDIO
> +		 * reading of the PHY ID.
> +		 */
> +		dev_addr = !!c45_ids->devices_addrs[i] ?
> +					c45_ids->devices_addrs[i] : i;
> +
> +		reg_addr = MII_ADDR_C45 | dev_addr << 16 | MII_PHYSID1;
>  		phy_reg = mdiobus_read(bus, addr, reg_addr);
>  		if (phy_reg < 0)
>  			return -EIO;
>  		c45_ids->device_ids[i] = (phy_reg & 0xffff) << 16;
>  
> -		reg_addr = MII_ADDR_C45 | i << 16 | MII_PHYSID2;
> +		reg_addr = MII_ADDR_C45 | dev_addr << 16 | MII_PHYSID2;
>  		phy_reg = mdiobus_read(bus, addr, reg_addr);
>  		if (phy_reg < 0)
>  			return -EIO;
> @@ -551,6 +561,39 @@ static int get_phy_id(struct mii_bus *bus, int addr, u32 *phy_id,
>  }
>  
>  /**
> + * get_vend_spec_addr_phy_device - reads the specified PHY device
> + *				   and returns its @phy_device struct
> + * @bus: the target MII bus
> + * @addr: PHY address on the MII bus
> + * @is_c45: If true the PHY uses the 802.3 clause 45 protocol
> + * @c45_ids: Query the c45_ids to see if a PHY with a vendor specific
> + *           register address space was defined in the PHY device tree
> + *           node by adding the "dev-addr" property to the node.
> + *           Store the c45 ID information about the rest of the PHYs
> + *           found on the MDIO bus during probing.
> + *
> + * Description: Reads the ID registers of the PHY at @addr on the
> + *   @bus, then allocates and returns the phy_device to represent it.
> + */
> +struct phy_device *get_vend_spec_addr_phy_device(struct mii_bus *bus,
> +						 int addr, bool is_c45,
> +						 struct phy_c45_device_ids *c45_ids)
> +{
> +	u32 phy_id = 0;
> +	int r;
> +
> +	r = get_phy_id(bus, addr, &phy_id, is_c45, c45_ids);
> +	if (r)
> +		return ERR_PTR(r);
> +
> +	/* If the phy_id is mostly Fs, there is no device there */
> +	if ((phy_id & 0x1fffffff) == 0x1fffffff)
> +		return ERR_PTR(-ENODEV);
> +
> +	return phy_device_create(bus, addr, phy_id, is_c45, c45_ids);
> +}
> +
> +/**
>   * get_phy_device - reads the specified PHY device and returns its @phy_device
>   *		    struct
>   * @bus: the target MII bus
> diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c
> index 8c0c927..52e8bfb 100644
> --- a/drivers/of/of_mdio.c
> +++ b/drivers/of/of_mdio.c
> @@ -45,7 +45,8 @@ static int of_get_phy_id(struct device_node *device, u32 *phy_id)
>  }
>  
>  static int of_mdiobus_register_phy(struct mii_bus *mdio,
> -				    struct device_node *child, u32 addr)
> +				   struct device_node *child, u32 addr,
> +				   struct phy_c45_device_ids *c45_ids)
>  {
>  	struct phy_device *phy;
>  	bool is_c45;
> @@ -58,7 +59,12 @@ static int of_mdiobus_register_phy(struct mii_bus *mdio,
>  	if (!is_c45 && !of_get_phy_id(child, &phy_id))
>  		phy = phy_device_create(mdio, addr, phy_id, 0, NULL);
>  	else
> -		phy = get_phy_device(mdio, addr, is_c45);
> +		if (c45_ids)
> +			phy = get_vend_spec_addr_phy_device(mdio,
> +							    addr, is_c45,
> +							    c45_ids);
> +		else
> +			phy = get_phy_device(mdio, addr, is_c45);
>  	if (IS_ERR(phy))
>  		return PTR_ERR(phy);
>  
> @@ -190,6 +196,72 @@ static bool of_mdiobus_child_is_phy(struct device_node *child)
>  	return false;
>  }
>  
> +static void of_fill_c45_devices_addrs(u32 dev_addr,
> +				      struct phy_c45_device_ids *c45_ids)
> +{
> +	int i;
> +	const int num_ids = ARRAY_SIZE(c45_ids->device_ids);
> +
> +	/* Search through all Device Identifiers
> +	 * and set dev_addr in c45_ids->devices_addrs,
> +	 * if the device bit is set in
> +	 * c45_ids->devices_in_package
> +	 */
> +	for (i = 1; i < num_ids; i++) {
> +		if (!(c45_ids->devices_in_package & (1 << i)))
> +			continue;
> +
> +		c45_ids->devices_addrs[i] = dev_addr;
> +	}
> +}
> +
> +static int of_find_devaddr_in_pkg(struct mii_bus *bus, u32 addr, u32 dev_addr,
> +				  struct phy_c45_device_ids *c45_ids)
> +{
> +	u32 *devs = &c45_ids->devices_in_package;
> +	int phy_reg, reg_addr;
> +
> +	reg_addr = MII_ADDR_C45 | dev_addr << 16 | MDIO_DEVS2;
> +	phy_reg = mdiobus_read(bus, addr, reg_addr);
> +	if (phy_reg < 0)
> +		return -EIO;
> +
> +	*devs = (phy_reg & 0xffff) << 16;
> +
> +	reg_addr = MII_ADDR_C45 | dev_addr << 16 | MDIO_DEVS1;
> +	phy_reg = mdiobus_read(bus, addr, reg_addr);
> +	if (phy_reg < 0)
> +		return -EIO;
> +
> +	*devs |= (phy_reg & 0xffff);
> +
> +	return 0;
> +}
> +
> +/*
> + * Finds the device in package and populates the c45_ids
> + * if any device is found at dev_addr address. After this
> + * the PHY is registered
> + */
> +static int of_mdiobus_register_vend_spec_phy(struct mii_bus *mdio,
> +					     struct device_node *child,
> +					     u32 addr, u32 dev_addr)
> +{
> +	struct phy_c45_device_ids c45_ids = {0};
> +	int dev_err = 0;
> +
> +	if (!dev_addr)
> +		goto register_phy;
> +
> +	dev_err = of_find_devaddr_in_pkg(mdio, addr, dev_addr, &c45_ids);
> +
> +	if (!dev_err)
> +		of_fill_c45_devices_addrs(dev_addr, &c45_ids);
> +
> +register_phy:
> +	return of_mdiobus_register_phy(mdio, child, addr, &c45_ids);
> +}
> +
>  /**
>   * of_mdiobus_register - Register mii_bus and create PHYs from the device tree
>   * @mdio: pointer to mii_bus structure
> @@ -202,7 +274,10 @@ int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np)
>  {
>  	struct device_node *child;
>  	bool scanphys = false;
> +	bool dev_addr_found = true;
>  	int addr, rc;
> +	int dev_addr = 0;
> +	int ret;
>  
>  	/* Do not continue if the node is disabled */
>  	if (!of_device_is_available(np))
> @@ -226,6 +301,14 @@ int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np)
>  
>  	/* Loop over the child nodes and register a phy_device for each phy */
>  	for_each_available_child_of_node(np, child) {
> +		/* Check if dev-addr is set in the PHY node */
> +		ret = of_property_read_u32(child, "dev-addr", &dev_addr);
> +
> +		if (ret < 0) {
> +			/* either not set or invalid */
> +			dev_addr_found = false;
> +		}
> +
>  		addr = of_mdio_parse_addr(&mdio->dev, child);
>  		if (addr < 0) {
>  			scanphys = true;
> @@ -233,7 +316,14 @@ int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np)
>  		}
>  
>  		if (of_mdiobus_child_is_phy(child))
> -			rc = of_mdiobus_register_phy(mdio, child, addr);
> +			if (dev_addr_found)
> +				rc = of_mdiobus_register_vend_spec_phy(mdio,
> +								       child,
> +								       addr,
> +								       dev_addr);
> +			else
> +				rc = of_mdiobus_register_phy(mdio, child,
> +							     addr, NULL);
>  		else
>  			rc = of_mdiobus_register_device(mdio, child, addr);
>  
> @@ -248,8 +338,16 @@ int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np)
>  	if (!scanphys)
>  		return 0;
>  
> +	/* reset device found variable */
> +	dev_addr_found = true;
> +
>  	/* auto scan for PHYs with empty reg property */
>  	for_each_available_child_of_node(np, child) {
> +		/* Check if dev-addr is set in the PHY node,
> +		 * for PHYs which don't have reg property set
> +		 */
> +		ret = of_property_read_u32(child, "dev-addr", &dev_addr);
> +
>  		/* Skip PHYs with reg property set */
>  		if (of_find_property(child, "reg", NULL))
>  			continue;
> @@ -264,7 +362,14 @@ int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np)
>  				 child->name, addr);
>  
>  			if (of_mdiobus_child_is_phy(child)) {
> -				rc = of_mdiobus_register_phy(mdio, child, addr);
> +				if (dev_addr_found)
> +					rc = of_mdiobus_register_vend_spec_phy(mdio,
> +									       child,
> +									       addr,
> +									       dev_addr);
> +				else
> +					rc = of_mdiobus_register_phy(mdio, child,
> +								     addr, NULL);
>  				if (rc && rc != -ENODEV)
>  					goto unregister;
>  			}
> diff --git a/include/linux/phy.h b/include/linux/phy.h
> index 26aa320..889d85e 100644
> --- a/include/linux/phy.h
> +++ b/include/linux/phy.h
> @@ -357,10 +357,13 @@ enum phy_state {
>   * struct phy_c45_device_ids - 802.3-c45 Device Identifiers
>   * @devices_in_package: Bit vector of devices present.
>   * @device_ids: The device identifer for each present device.
> + * @devices_addrs: The devices addresses from the device tree
> + *		   for each present device.
>   */
>  struct phy_c45_device_ids {
>  	u32 devices_in_package;
>  	u32 device_ids[32];
> +	u32 devices_addrs[32];
>  };
>  
>  /* phy_device: An instance of a PHY
> @@ -904,6 +907,9 @@ struct phy_device *phy_device_create(struct mii_bus *bus, int addr, int phy_id,
>  				     struct phy_c45_device_ids *c45_ids);
>  #if IS_ENABLED(CONFIG_PHYLIB)
>  struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45);
> +struct phy_device *get_vend_spec_addr_phy_device(struct mii_bus *bus, int addr,
> +						 bool is_c45,
> +						 struct phy_c45_device_ids *c45_ids);
>  int phy_device_register(struct phy_device *phy);
>  void phy_device_free(struct phy_device *phydev);
>  #else
> @@ -913,6 +919,14 @@ struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45)
>  	return NULL;
>  }
>  
> +static inline
> +struct phy_device *get_vend_spec_addr_phy_device(struct mii_bus *bus, int addr,
> +						 bool is_c45,
> +						 struct phy_c45_device_ids *c45_ids)
> +{
> +	return NULL;
> +}
> +
>  static inline int phy_device_register(struct phy_device *phy)
>  {
>  	return 0;
> 


-- 
Florian

^ permalink raw reply

* Re: [PATCH RESEND net-next v2] KEYS: DNS: limit the length of option strings
From: David Miller @ 2018-04-17 18:24 UTC (permalink / raw)
  To: ebiggers3; +Cc: netdev, keyrings, mark.rutland, ebiggers
In-Reply-To: <20180417182340.GB9237@gmail.com>

From: Eric Biggers <ebiggers3@gmail.com>
Date: Tue, 17 Apr 2018 11:23:40 -0700

> Can you queue this up for stable too?  syzbot has been hitting this on older
> kernel versions.

If you want a patch bound for stable, it must show up in Linus's tree
first which means you should target 'net' rather than 'net-next'.

^ permalink raw reply

* Re: [PATCH RESEND net-next v2] KEYS: DNS: limit the length of option strings
From: Eric Biggers @ 2018-04-17 18:23 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, keyrings, mark.rutland, ebiggers
In-Reply-To: <20180417.134316.1413649044013070735.davem@davemloft.net>

On Tue, Apr 17, 2018 at 01:43:16PM -0400, David Miller wrote:
> From: Eric Biggers <ebiggers3@gmail.com>
> Date: Mon, 16 Apr 2018 14:29:22 -0700
> 
> > From: Eric Biggers <ebiggers@google.com>
> > 
> > Adding a dns_resolver key whose payload contains a very long option name
> > resulted in that string being printed in full.  This hit the WARN_ONCE()
> > in set_precision() during the printk(), because printk() only supports a
> > precision of up to 32767 bytes:
> > 
> >     precision 1000000 too large
> >     WARNING: CPU: 0 PID: 752 at lib/vsprintf.c:2189 vsnprintf+0x4bc/0x5b0
> > 
> > Fix it by limiting option strings (combined name + value) to a much more
> > reasonable 128 bytes.  The exact limit is arbitrary, but currently the
> > only recognized option is formatted as "dnserror=%lu" which fits well
> > within this limit.
> > 
> > Also ratelimit the printks.
> > 
> > Reproducer:
> > 
> >     perl -e 'print "#", "A" x 1000000, "\x00"' | keyctl padd dns_resolver desc @s
> > 
> > This bug was found using syzkaller.
> > 
> > Reported-by: Mark Rutland <mark.rutland@arm.com>
> > Fixes: 4a2d789267e0 ("DNS: If the DNS server returns an error, allow that to be cached [ver #2]")
> > Signed-off-by: Eric Biggers <ebiggers@google.com>
> 
> Applied, thanks.

Can you queue this up for stable too?  syzbot has been hitting this on older
kernel versions.

Eric

^ permalink raw reply

* Re: [PATCH bpf-next v3 00/10] BTF: BPF Type Format
From: Martin KaFai Lau @ 2018-04-17 18:19 UTC (permalink / raw)
  To: Arnaldo Carvalho de Melo
  Cc: netdev, Alexei Starovoitov, Daniel Borkmann, kernel-team
In-Reply-To: <20180416202200.GA3553@kernel.org>

On Mon, Apr 16, 2018 at 05:22:00PM -0300, Arnaldo Carvalho de Melo wrote:
> Em Mon, Apr 16, 2018 at 12:33:17PM -0700, Martin KaFai Lau escreveu:
> > This patch introduces BPF Type Format (BTF).
> > 
> > BTF (BPF Type Format) is the meta data format which describes
> > the data types of BPF program/map.  Hence, it basically focuses
> > on the C programming language which the modern BPF is primarily
> > using.  The first use case is to provide a generic pretty print
> > capability for a BPF map.
> > 
> > A modified pahole (Cc: Arnaldo) that can convert dwarf to BTF is here:
> > https://github.com/iamkafai/pahole/tree/btf
> 
> Thanks for CCing me, no changes since when you posted the pahole
> patches, I gave it a quick look, seems sane, will try to merge and push
> a new pahole version out so that distros can pick it, at least fedora
> will 8-)
Thanks!  Right, the last commit should be on April 3rd.

> 
> - Arnaldo
>  
> > Please see individual patch for details.
> > 
> > v3:
> > - Rebase to bpf-next
> > - Fix sparse warning (by adding static)
> > - Add BTF header logging: btf_verifier_log_hdr()
> > - Fix the alignment test on btf->type_off
> > - Add tests for the BTF header
> > - Lower the max BTF size to 16MB.  It should be enough
> >   for some time.  We could raise it later if it would
> >   be needed.
> > 
> > v2:
> > - Use kvfree where needed in patch 1 and 2
> > - Also consider BTF_INT_OFFSET() in the btf_int_check_meta()
> >   in patch 1
> > - Fix an incorrect goto target in map_create() during
> >   the btf-error-path in patch 7
> > - re-org some local vars to keep the rev xmas tree in btf.c
> > 
> > Martin KaFai Lau (10):
> >   bpf: btf: Introduce BPF Type Format (BTF)
> >   bpf: btf: Validate type reference
> >   bpf: btf: Check members of struct/union
> >   bpf: btf: Add pretty print capability for data with BTF type info
> >   bpf: btf: Add BPF_BTF_LOAD command
> >   bpf: btf: Add BPF_OBJ_GET_INFO_BY_FD support to BTF fd
> >   bpf: btf: Add pretty print support to the basic arraymap
> >   bpf: btf: Sync bpf.h and btf.h to tools/
> >   bpf: btf: Add BTF support to libbpf
> >   bpf: btf: Add BTF tests
> > 
> >  include/linux/bpf.h                          |   20 +-
> >  include/linux/btf.h                          |   48 +
> >  include/uapi/linux/bpf.h                     |   12 +
> >  include/uapi/linux/btf.h                     |  132 ++
> >  kernel/bpf/Makefile                          |    1 +
> >  kernel/bpf/arraymap.c                        |   50 +
> >  kernel/bpf/btf.c                             | 2093 ++++++++++++++++++++++++++
> >  kernel/bpf/inode.c                           |  146 +-
> >  kernel/bpf/syscall.c                         |   51 +-
> >  tools/include/uapi/linux/bpf.h               |   13 +
> >  tools/include/uapi/linux/btf.h               |  132 ++
> >  tools/lib/bpf/Build                          |    2 +-
> >  tools/lib/bpf/bpf.c                          |   92 +-
> >  tools/lib/bpf/bpf.h                          |   16 +
> >  tools/lib/bpf/btf.c                          |  377 +++++
> >  tools/lib/bpf/btf.h                          |   22 +
> >  tools/lib/bpf/libbpf.c                       |  148 +-
> >  tools/lib/bpf/libbpf.h                       |    3 +
> >  tools/testing/selftests/bpf/Makefile         |   26 +-
> >  tools/testing/selftests/bpf/test_btf.c       | 1669 ++++++++++++++++++++
> >  tools/testing/selftests/bpf/test_btf_haskv.c |   48 +
> >  tools/testing/selftests/bpf/test_btf_nokv.c  |   43 +
> >  22 files changed, 5103 insertions(+), 41 deletions(-)
> >  create mode 100644 include/linux/btf.h
> >  create mode 100644 include/uapi/linux/btf.h
> >  create mode 100644 kernel/bpf/btf.c
> >  create mode 100644 tools/include/uapi/linux/btf.h
> >  create mode 100644 tools/lib/bpf/btf.c
> >  create mode 100644 tools/lib/bpf/btf.h
> >  create mode 100644 tools/testing/selftests/bpf/test_btf.c
> >  create mode 100644 tools/testing/selftests/bpf/test_btf_haskv.c
> >  create mode 100644 tools/testing/selftests/bpf/test_btf_nokv.c
> > 
> > -- 
> > 2.9.5

^ permalink raw reply

* Re: [PATCH net-next 2/3] net: phy: Change the array size to 32 for device_ids
From: Florian Fainelli @ 2018-04-17 18:18 UTC (permalink / raw)
  To: vicentiu.galanopulo, andrew, robh, netdev, linux-kernel,
	mark.rutland, davem, marcel, devicetree
  Cc: alexandru.marginean, madalin.bucur
In-Reply-To: <20180417090233.21548-3-vicentiu.galanopulo@nxp.com>

On 04/17/2018 02:02 AM, Vicentiu Galanopulo wrote:
> In the context of enabling the discovery of the PHYs
> which have the C45 MDIO address space in a non-standard
> address:  num_ids in get_phy_c45_ids, has the
> value 8 (ARRAY_SIZE(c45_ids->device_ids)), but the
> u32 *devs can store 32 devices in the bitfield.
> 
> If a device is stored in *devs, in bits 32 to 9
> (bit counting in lookup loop starts from 1), it will
> not be found.
> 
> Signed-off-by: Vicentiu Galanopulo <vicentiu.galanopulo@nxp.com>

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
-- 
Florian

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox