* [PATCH net-next 5/5] net/tls: dedup the record cleanup
From: Jakub Kicinski @ 2019-09-03 4:31 UTC (permalink / raw)
To: davem
Cc: netdev, oss-drivers, davejwatson, borisp, aviadye, john.fastabend,
daniel, Jakub Kicinski, John Hurley, Dirk van der Merwe
In-Reply-To: <20190903043106.27570-1-jakub.kicinski@netronome.com>
If retransmit record hint fall into the cleanup window we will
free it by just walking the list. No need to duplicate the code.
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
---
net/tls/tls_device.c | 6 +-----
1 file changed, 1 insertion(+), 5 deletions(-)
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 9e1bec1a0a28..41c106e45f01 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -159,12 +159,8 @@ static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq)
spin_lock_irqsave(&ctx->lock, flags);
info = ctx->retransmit_hint;
- if (info && !before(acked_seq, info->end_seq)) {
+ if (info && !before(acked_seq, info->end_seq))
ctx->retransmit_hint = NULL;
- list_del(&info->list);
- destroy_record(info);
- deleted_records++;
- }
list_for_each_entry_safe(info, temp, &ctx->records_list, list) {
if (before(acked_seq, info->end_seq))
--
2.21.0
^ permalink raw reply related
* [PATCH net-next 4/5] net/tls: clean up the number of #ifdefs for CONFIG_TLS_DEVICE
From: Jakub Kicinski @ 2019-09-03 4:31 UTC (permalink / raw)
To: davem
Cc: netdev, oss-drivers, davejwatson, borisp, aviadye, john.fastabend,
daniel, Jakub Kicinski, John Hurley, Dirk van der Merwe
In-Reply-To: <20190903043106.27570-1-jakub.kicinski@netronome.com>
TLS code has a number of #ifdefs which make the code a little
harder to follow. Recent fixes removed the ifdef around the
TLS_HW define, so we can switch to the often used pattern
of defining tls_device functions as empty static inlines
in the header when CONFIG_TLS_DEVICE=n.
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
---
include/net/tls.h | 38 ++++++++++++++++++++++++++++++++------
net/tls/tls_main.c | 19 +------------------
net/tls/tls_sw.c | 6 ++----
3 files changed, 35 insertions(+), 28 deletions(-)
diff --git a/include/net/tls.h b/include/net/tls.h
index 6dab6683e42f..c664e6dba0d1 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -366,13 +366,9 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
struct pipe_inode_info *pipe,
size_t len, unsigned int flags);
-int tls_set_device_offload(struct sock *sk, struct tls_context *ctx);
int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
int tls_device_sendpage(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
-void tls_device_free_resources_tx(struct sock *sk);
-void tls_device_init(void);
-void tls_device_cleanup(void);
int tls_tx_records(struct sock *sk, int flags);
struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context,
@@ -649,7 +645,6 @@ int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg,
unsigned char *record_type);
void tls_register_device(struct tls_device *device);
void tls_unregister_device(struct tls_device *device);
-int tls_device_decrypted(struct sock *sk, struct sk_buff *skb);
int decrypt_skb(struct sock *sk, struct sk_buff *skb,
struct scatterlist *sgout);
struct sk_buff *tls_encrypt_skb(struct sk_buff *skb);
@@ -662,9 +657,40 @@ int tls_sw_fallback_init(struct sock *sk,
struct tls_offload_context_tx *offload_ctx,
struct tls_crypto_info *crypto_info);
+#ifdef CONFIG_TLS_DEVICE
+void tls_device_init(void);
+void tls_device_cleanup(void);
+int tls_set_device_offload(struct sock *sk, struct tls_context *ctx);
+void tls_device_free_resources_tx(struct sock *sk);
int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx);
-
void tls_device_offload_cleanup_rx(struct sock *sk);
void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq);
+int tls_device_decrypted(struct sock *sk, struct sk_buff *skb);
+#else
+static inline void tls_device_init(void) {}
+static inline void tls_device_cleanup(void) {}
+static inline int
+tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void tls_device_free_resources_tx(struct sock *sk) {}
+
+static inline int
+tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void tls_device_offload_cleanup_rx(struct sock *sk) {}
+static inline void
+tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) {}
+
+static inline int tls_device_decrypted(struct sock *sk, struct sk_buff *skb)
+{
+ return 0;
+}
+#endif
#endif /* _TLS_OFFLOAD_H */
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 2df1ae8b77fa..ac88877dcade 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -286,19 +286,14 @@ static void tls_sk_proto_cleanup(struct sock *sk,
kfree(ctx->tx.rec_seq);
kfree(ctx->tx.iv);
tls_sw_release_resources_tx(sk);
-#ifdef CONFIG_TLS_DEVICE
} else if (ctx->tx_conf == TLS_HW) {
tls_device_free_resources_tx(sk);
-#endif
}
if (ctx->rx_conf == TLS_SW)
tls_sw_release_resources_rx(sk);
-
-#ifdef CONFIG_TLS_DEVICE
- if (ctx->rx_conf == TLS_HW)
+ else if (ctx->rx_conf == TLS_HW)
tls_device_offload_cleanup_rx(sk);
-#endif
}
static void tls_sk_proto_close(struct sock *sk, long timeout)
@@ -537,26 +532,18 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval,
}
if (tx) {
-#ifdef CONFIG_TLS_DEVICE
rc = tls_set_device_offload(sk, ctx);
conf = TLS_HW;
if (rc) {
-#else
- {
-#endif
rc = tls_set_sw_offload(sk, ctx, 1);
if (rc)
goto err_crypto_info;
conf = TLS_SW;
}
} else {
-#ifdef CONFIG_TLS_DEVICE
rc = tls_set_device_offload_rx(sk, ctx);
conf = TLS_HW;
if (rc) {
-#else
- {
-#endif
rc = tls_set_sw_offload(sk, ctx, 0);
if (rc)
goto err_crypto_info;
@@ -920,9 +907,7 @@ static int __init tls_register(void)
tls_sw_proto_ops = inet_stream_ops;
tls_sw_proto_ops.splice_read = tls_sw_splice_read;
-#ifdef CONFIG_TLS_DEVICE
tls_device_init();
-#endif
tcp_register_ulp(&tcp_tls_ulp_ops);
return 0;
@@ -931,9 +916,7 @@ static int __init tls_register(void)
static void __exit tls_unregister(void)
{
tcp_unregister_ulp(&tcp_tls_ulp_ops);
-#ifdef CONFIG_TLS_DEVICE
tls_device_cleanup();
-#endif
}
module_init(tls_register);
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 91d21b048a9b..c2b5e0d2ba1a 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1489,13 +1489,12 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
int pad, err = 0;
if (!ctx->decrypted) {
-#ifdef CONFIG_TLS_DEVICE
if (tls_ctx->rx_conf == TLS_HW) {
err = tls_device_decrypted(sk, skb);
if (err < 0)
return err;
}
-#endif
+
/* Still not decrypted after tls_device */
if (!ctx->decrypted) {
err = decrypt_internal(sk, skb, dest, NULL, chunk, zc,
@@ -2014,10 +2013,9 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
ret = -EINVAL;
goto read_failure;
}
-#ifdef CONFIG_TLS_DEVICE
+
tls_device_rx_resync_new_rec(strp->sk, data_len + TLS_HEADER_SIZE,
TCP_SKB_CB(skb)->seq + rxm->offset);
-#endif
return data_len + TLS_HEADER_SIZE;
read_failure:
--
2.21.0
^ permalink raw reply related
* [PATCH net-next 3/5] net/tls: narrow down the critical area of device_offload_lock
From: Jakub Kicinski @ 2019-09-03 4:31 UTC (permalink / raw)
To: davem
Cc: netdev, oss-drivers, davejwatson, borisp, aviadye, john.fastabend,
daniel, Jakub Kicinski, John Hurley, Dirk van der Merwe
In-Reply-To: <20190903043106.27570-1-jakub.kicinski@netronome.com>
On setsockopt path we need to hold device_offload_lock from
the moment we check netdev is up until the context is fully
ready to be added to the tls_device_list.
No need to hold it around the get_netdev_for_sock().
Change the code and remove the confusing comment.
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
---
net/tls/tls_device.c | 46 +++++++++++++++++++++-----------------------
1 file changed, 22 insertions(+), 24 deletions(-)
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 2cd7318a1338..9e1bec1a0a28 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -935,17 +935,11 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
if (skb)
TCP_SKB_CB(skb)->eor = 1;
- /* We support starting offload on multiple sockets
- * concurrently, so we only need a read lock here.
- * This lock must precede get_netdev_for_sock to prevent races between
- * NETDEV_DOWN and setsockopt.
- */
- down_read(&device_offload_lock);
netdev = get_netdev_for_sock(sk);
if (!netdev) {
pr_err_ratelimited("%s: netdev not found\n", __func__);
rc = -EINVAL;
- goto release_lock;
+ goto disable_cad;
}
if (!(netdev->features & NETIF_F_HW_TLS_TX)) {
@@ -956,10 +950,15 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
/* Avoid offloading if the device is down
* We don't want to offload new flows after
* the NETDEV_DOWN event
+ *
+ * device_offload_lock is taken in tls_devices's NETDEV_DOWN
+ * handler thus protecting from the device going down before
+ * ctx was added to tls_device_list.
*/
+ down_read(&device_offload_lock);
if (!(netdev->flags & IFF_UP)) {
rc = -EINVAL;
- goto release_netdev;
+ goto release_lock;
}
ctx->priv_ctx_tx = offload_ctx;
@@ -967,9 +966,10 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
&ctx->crypto_send.info,
tcp_sk(sk)->write_seq);
if (rc)
- goto release_netdev;
+ goto release_lock;
tls_device_attach(ctx, sk, netdev);
+ up_read(&device_offload_lock);
/* following this assignment tls_is_sk_tx_device_offloaded
* will return true and the context might be accessed
@@ -977,14 +977,14 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
*/
smp_store_release(&sk->sk_validate_xmit_skb, tls_validate_xmit_skb);
dev_put(netdev);
- up_read(&device_offload_lock);
return 0;
-release_netdev:
- dev_put(netdev);
release_lock:
up_read(&device_offload_lock);
+release_netdev:
+ dev_put(netdev);
+disable_cad:
clean_acked_data_disable(inet_csk(sk));
crypto_free_aead(offload_ctx->aead_send);
free_rec_seq:
@@ -1008,17 +1008,10 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
if (ctx->crypto_recv.info.version != TLS_1_2_VERSION)
return -EOPNOTSUPP;
- /* We support starting offload on multiple sockets
- * concurrently, so we only need a read lock here.
- * This lock must precede get_netdev_for_sock to prevent races between
- * NETDEV_DOWN and setsockopt.
- */
- down_read(&device_offload_lock);
netdev = get_netdev_for_sock(sk);
if (!netdev) {
pr_err_ratelimited("%s: netdev not found\n", __func__);
- rc = -EINVAL;
- goto release_lock;
+ return -EINVAL;
}
if (!(netdev->features & NETIF_F_HW_TLS_RX)) {
@@ -1029,16 +1022,21 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
/* Avoid offloading if the device is down
* We don't want to offload new flows after
* the NETDEV_DOWN event
+ *
+ * device_offload_lock is taken in tls_devices's NETDEV_DOWN
+ * handler thus protecting from the device going down before
+ * ctx was added to tls_device_list.
*/
+ down_read(&device_offload_lock);
if (!(netdev->flags & IFF_UP)) {
rc = -EINVAL;
- goto release_netdev;
+ goto release_lock;
}
context = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_RX, GFP_KERNEL);
if (!context) {
rc = -ENOMEM;
- goto release_netdev;
+ goto release_lock;
}
context->resync_nh_reset = 1;
@@ -1066,10 +1064,10 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
down_read(&device_offload_lock);
release_ctx:
ctx->priv_ctx_rx = NULL;
-release_netdev:
- dev_put(netdev);
release_lock:
up_read(&device_offload_lock);
+release_netdev:
+ dev_put(netdev);
return rc;
}
--
2.21.0
^ permalink raw reply related
* [PATCH net-next 2/5] net/tls: don't jump to return
From: Jakub Kicinski @ 2019-09-03 4:31 UTC (permalink / raw)
To: davem
Cc: netdev, oss-drivers, davejwatson, borisp, aviadye, john.fastabend,
daniel, Jakub Kicinski, John Hurley, Dirk van der Merwe
In-Reply-To: <20190903043106.27570-1-jakub.kicinski@netronome.com>
Reusing parts of error path for normal exit will make
next commit harder to read, untangle the two.
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
---
net/tls/tls_device.c | 26 +++++++++++++-------------
1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index e188139f0464..2cd7318a1338 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -838,22 +838,18 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
struct net_device *netdev;
char *iv, *rec_seq;
struct sk_buff *skb;
- int rc = -EINVAL;
__be64 rcd_sn;
+ int rc;
if (!ctx)
- goto out;
+ return -EINVAL;
- if (ctx->priv_ctx_tx) {
- rc = -EEXIST;
- goto out;
- }
+ if (ctx->priv_ctx_tx)
+ return -EEXIST;
start_marker_record = kmalloc(sizeof(*start_marker_record), GFP_KERNEL);
- if (!start_marker_record) {
- rc = -ENOMEM;
- goto out;
- }
+ if (!start_marker_record)
+ return -ENOMEM;
offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_TX, GFP_KERNEL);
if (!offload_ctx) {
@@ -982,7 +978,8 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
smp_store_release(&sk->sk_validate_xmit_skb, tls_validate_xmit_skb);
dev_put(netdev);
up_read(&device_offload_lock);
- goto out;
+
+ return 0;
release_netdev:
dev_put(netdev);
@@ -999,7 +996,6 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
ctx->priv_ctx_tx = NULL;
free_marker_record:
kfree(start_marker_record);
-out:
return rc;
}
@@ -1058,7 +1054,11 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
goto free_sw_resources;
tls_device_attach(ctx, sk, netdev);
- goto release_netdev;
+ up_read(&device_offload_lock);
+
+ dev_put(netdev);
+
+ return 0;
free_sw_resources:
up_read(&device_offload_lock);
--
2.21.0
^ permalink raw reply related
* [PATCH net-next 1/5] net/tls: use the full sk_proto pointer
From: Jakub Kicinski @ 2019-09-03 4:31 UTC (permalink / raw)
To: davem
Cc: netdev, oss-drivers, davejwatson, borisp, aviadye, john.fastabend,
daniel, Jakub Kicinski, John Hurley, Dirk van der Merwe
In-Reply-To: <20190903043106.27570-1-jakub.kicinski@netronome.com>
Since we already have the pointer to the full original sk_proto
stored use that instead of storing all individual callback
pointers as well.
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com>
---
drivers/crypto/chelsio/chtls/chtls_main.c | 6 +++--
include/net/tls.h | 10 ---------
net/tls/tls_main.c | 27 +++++++++--------------
3 files changed, 14 insertions(+), 29 deletions(-)
diff --git a/drivers/crypto/chelsio/chtls/chtls_main.c b/drivers/crypto/chelsio/chtls/chtls_main.c
index 635bb4b447fb..e6df5b95ed47 100644
--- a/drivers/crypto/chelsio/chtls/chtls_main.c
+++ b/drivers/crypto/chelsio/chtls/chtls_main.c
@@ -474,7 +474,8 @@ static int chtls_getsockopt(struct sock *sk, int level, int optname,
struct tls_context *ctx = tls_get_ctx(sk);
if (level != SOL_TLS)
- return ctx->getsockopt(sk, level, optname, optval, optlen);
+ return ctx->sk_proto->getsockopt(sk, level,
+ optname, optval, optlen);
return do_chtls_getsockopt(sk, optval, optlen);
}
@@ -541,7 +542,8 @@ static int chtls_setsockopt(struct sock *sk, int level, int optname,
struct tls_context *ctx = tls_get_ctx(sk);
if (level != SOL_TLS)
- return ctx->setsockopt(sk, level, optname, optval, optlen);
+ return ctx->sk_proto->setsockopt(sk, level,
+ optname, optval, optlen);
return do_chtls_setsockopt(sk, optname, optval, optlen);
}
diff --git a/include/net/tls.h b/include/net/tls.h
index ec3c3ed2c6c3..6dab6683e42f 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -275,16 +275,6 @@ struct tls_context {
struct proto *sk_proto;
void (*sk_destruct)(struct sock *sk);
- void (*sk_proto_close)(struct sock *sk, long timeout);
-
- int (*setsockopt)(struct sock *sk, int level,
- int optname, char __user *optval,
- unsigned int optlen);
- int (*getsockopt)(struct sock *sk, int level,
- int optname, char __user *optval,
- int __user *optlen);
- int (*hash)(struct sock *sk);
- void (*unhash)(struct sock *sk);
union tls_crypto_context crypto_send;
union tls_crypto_context crypto_recv;
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 277f7c209fed..2df1ae8b77fa 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -331,7 +331,7 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
tls_sw_strparser_done(ctx);
if (ctx->rx_conf == TLS_SW)
tls_sw_free_ctx_rx(ctx);
- ctx->sk_proto_close(sk, timeout);
+ ctx->sk_proto->close(sk, timeout);
if (free_ctx)
tls_ctx_free(sk, ctx);
@@ -451,7 +451,8 @@ static int tls_getsockopt(struct sock *sk, int level, int optname,
struct tls_context *ctx = tls_get_ctx(sk);
if (level != SOL_TLS)
- return ctx->getsockopt(sk, level, optname, optval, optlen);
+ return ctx->sk_proto->getsockopt(sk, level,
+ optname, optval, optlen);
return do_tls_getsockopt(sk, optname, optval, optlen);
}
@@ -609,7 +610,8 @@ static int tls_setsockopt(struct sock *sk, int level, int optname,
struct tls_context *ctx = tls_get_ctx(sk);
if (level != SOL_TLS)
- return ctx->setsockopt(sk, level, optname, optval, optlen);
+ return ctx->sk_proto->setsockopt(sk, level, optname, optval,
+ optlen);
return do_tls_setsockopt(sk, optname, optval, optlen);
}
@@ -624,10 +626,7 @@ static struct tls_context *create_ctx(struct sock *sk)
return NULL;
rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
- ctx->setsockopt = sk->sk_prot->setsockopt;
- ctx->getsockopt = sk->sk_prot->getsockopt;
- ctx->sk_proto_close = sk->sk_prot->close;
- ctx->unhash = sk->sk_prot->unhash;
+ ctx->sk_proto = sk->sk_prot;
return ctx;
}
@@ -683,9 +682,6 @@ static int tls_hw_prot(struct sock *sk)
spin_unlock_bh(&device_spinlock);
tls_build_proto(sk);
- ctx->hash = sk->sk_prot->hash;
- ctx->unhash = sk->sk_prot->unhash;
- ctx->sk_proto_close = sk->sk_prot->close;
ctx->sk_destruct = sk->sk_destruct;
sk->sk_destruct = tls_hw_sk_destruct;
ctx->rx_conf = TLS_HW_RECORD;
@@ -717,7 +713,7 @@ static void tls_hw_unhash(struct sock *sk)
}
}
spin_unlock_bh(&device_spinlock);
- ctx->unhash(sk);
+ ctx->sk_proto->unhash(sk);
}
static int tls_hw_hash(struct sock *sk)
@@ -726,7 +722,7 @@ static int tls_hw_hash(struct sock *sk)
struct tls_device *dev;
int err;
- err = ctx->hash(sk);
+ err = ctx->sk_proto->hash(sk);
spin_lock_bh(&device_spinlock);
list_for_each_entry(dev, &device_list, dev_list) {
if (dev->hash) {
@@ -816,7 +812,6 @@ static int tls_init(struct sock *sk)
ctx->tx_conf = TLS_BASE;
ctx->rx_conf = TLS_BASE;
- ctx->sk_proto = sk->sk_prot;
update_sk_prot(sk, ctx);
out:
write_unlock_bh(&sk->sk_callback_lock);
@@ -828,12 +823,10 @@ static void tls_update(struct sock *sk, struct proto *p)
struct tls_context *ctx;
ctx = tls_get_ctx(sk);
- if (likely(ctx)) {
- ctx->sk_proto_close = p->close;
+ if (likely(ctx))
ctx->sk_proto = p;
- } else {
+ else
sk->sk_prot = p;
- }
}
static int tls_get_info(const struct sock *sk, struct sk_buff *skb)
--
2.21.0
^ permalink raw reply related
* [PATCH net-next 0/5] net/tls: minor cleanups
From: Jakub Kicinski @ 2019-09-03 4:31 UTC (permalink / raw)
To: davem
Cc: netdev, oss-drivers, davejwatson, borisp, aviadye, john.fastabend,
daniel, Jakub Kicinski
Hi!
This set is a grab bag of TLS cleanups accumulated in my tree
in an attempt to avoid merge problems with net. Nothing stands
out. First patch dedups context information. Next control path
locking is very slightly optimized. Fourth patch cleans up
ugly #ifdefs.
Jakub Kicinski (5):
net/tls: use the full sk_proto pointer
net/tls: don't jump to return
net/tls: narrow down the critical area of device_offload_lock
net/tls: clean up the number of #ifdefs for CONFIG_TLS_DEVICE
net/tls: dedup the record cleanup
drivers/crypto/chelsio/chtls/chtls_main.c | 6 +-
include/net/tls.h | 48 +++++++++-----
net/tls/tls_device.c | 78 +++++++++++------------
net/tls/tls_main.c | 46 ++++---------
net/tls/tls_sw.c | 6 +-
5 files changed, 85 insertions(+), 99 deletions(-)
--
2.21.0
^ permalink raw reply
* RE: [PATCH v2 5/6] mdev: Update sysfs documentation
From: Parav Pandit @ 2019-09-03 3:53 UTC (permalink / raw)
To: Cornelia Huck
Cc: alex.williamson@redhat.com, Jiri Pirko, kwankhede@nvidia.com,
davem@davemloft.net, kvm@vger.kernel.org,
linux-kernel@vger.kernel.org, netdev@vger.kernel.org
In-Reply-To: <20190902163658.51fc48d2.cohuck@redhat.com>
> -----Original Message-----
> From: Cornelia Huck <cohuck@redhat.com>
> Sent: Monday, September 2, 2019 8:07 PM
> To: Parav Pandit <parav@mellanox.com>
> Cc: alex.williamson@redhat.com; Jiri Pirko <jiri@mellanox.com>;
> kwankhede@nvidia.com; davem@davemloft.net; kvm@vger.kernel.org;
> linux-kernel@vger.kernel.org; netdev@vger.kernel.org
> Subject: Re: [PATCH v2 5/6] mdev: Update sysfs documentation
>
> On Fri, 30 Aug 2019 13:10:17 +0000
> Parav Pandit <parav@mellanox.com> wrote:
>
> > > -----Original Message-----
> > > From: Cornelia Huck <cohuck@redhat.com>
> > > Sent: Friday, August 30, 2019 6:19 PM
> > > To: Parav Pandit <parav@mellanox.com>
> > > Cc: alex.williamson@redhat.com; Jiri Pirko <jiri@mellanox.com>;
> > > kwankhede@nvidia.com; davem@davemloft.net; kvm@vger.kernel.org;
> > > linux- kernel@vger.kernel.org; netdev@vger.kernel.org
> > > Subject: Re: [PATCH v2 5/6] mdev: Update sysfs documentation
> > >
> > > On Thu, 29 Aug 2019 06:19:03 -0500
> > > Parav Pandit <parav@mellanox.com> wrote:
> > >
> > > > Updated documentation for optional read only sysfs attribute.
> > >
> > > I'd probably merge this into the patch introducing the attribute.
> > >
> > Ok. I will spin v3.
> >
> > > >
> > > > Signed-off-by: Parav Pandit <parav@mellanox.com>
> > > > ---
> > > > Documentation/driver-api/vfio-mediated-device.rst | 5 +++++
> > > > 1 file changed, 5 insertions(+)
> > > >
> > > > diff --git a/Documentation/driver-api/vfio-mediated-device.rst
> > > > b/Documentation/driver-api/vfio-mediated-device.rst
> > > > index 25eb7d5b834b..0ab03d3f5629 100644
> > > > --- a/Documentation/driver-api/vfio-mediated-device.rst
> > > > +++ b/Documentation/driver-api/vfio-mediated-device.rst
> > > > @@ -270,6 +270,7 @@ Directories and Files Under the sysfs for Each
> > > > mdev
> > > Device
> > > > |--- remove
> > > > |--- mdev_type {link to its type}
> > > > |--- vendor-specific-attributes [optional]
> > > > + |--- alias [optional]
> > >
> > > "optional" implies "not always present" to me, not "might return a
> > > read error if not available". Don't know if there's a better way to
> > > tag this? Or make it really optional? :)
> >
> > May be write it as,
> >
> > alias [ optional when requested by parent ]
>
> I'm not sure what 'optional when requested' is supposed to mean...
> maybe something like 'content optional' or so?
>
> >
> > >
> > > >
> > > > * remove (write only)
> > > >
> > > > @@ -281,6 +282,10 @@ Example::
> > > >
> > > > # echo 1 > /sys/bus/mdev/devices/$mdev_UUID/remove
> > > >
> > > > +* alias (read only)
> > > > +Whenever a parent requested to generate an alias, each mdev is
> > > > +assigned a unique alias by the mdev core. This file shows the
> > > > +alias of the
> > > mdev device.
> > >
> > > It's not really the parent, but the vendor driver requesting this,
> > > right? Also,
> > At mdev level, it only knows parent->ops structure, whether parent is
> registered by vendor driver or something else.
>
> Who else is supposed to create the mdev device?
If you nitpick the language what is the vendor id for sample mttty driver?
Mtty is not a 'vendor driver' per say.
>
> >
> > > "each mdev" is a bit ambiguous,
> > It is in context of the parent. Sentence is not starting with "each mdev".
> > But may be more verbosely written as,
> >
> > Whenever a parent requested to generate an alias, Each mdev device of
> > such parent is assigned unique alias by the mdev core. This file shows the
> alias of the mdev device.
>
> I'd really leave the parent out of this: this seems more like an
> implementation detail. It's more that alias may either contain an alias, or
> return a read error if no alias has been generated. Who requested the alias
> to be generated is probably not really of interest to the userspace reader.
>
The documentation is for user and developer both.
It is not the right claim that 'only user care' for this.
Otherwise all the .ko diagrams and API description etc doesn't make any sense to the user.
For user it doesn't matter whether alias length is provided by 'vendor driver' or 'registered parent'.
This note on who should specify the alias length is mainly for the developers.
> >
> > > created via that driver. Lastly, if we stick with the "returns an
> > > error if not implemented" approach, that should also be mentioned
> here.
> > Ok. Will spin v3 to describe it.
> >
> > >
> > > > +
> > > > Mediated device Hot plug
> > > > ------------------------
> > > >
> >
^ permalink raw reply
* RE: [PATCH v2 1/6] mdev: Introduce sha1 based mdev alias
From: Parav Pandit @ 2019-09-03 3:47 UTC (permalink / raw)
To: Cornelia Huck
Cc: alex.williamson@redhat.com, Jiri Pirko, kwankhede@nvidia.com,
davem@davemloft.net, kvm@vger.kernel.org,
linux-kernel@vger.kernel.org, netdev@vger.kernel.org
In-Reply-To: <20190902164604.1d04614f.cohuck@redhat.com>
> -----Original Message-----
> From: Cornelia Huck <cohuck@redhat.com>
> Sent: Monday, September 2, 2019 8:16 PM
> To: Parav Pandit <parav@mellanox.com>
> Cc: alex.williamson@redhat.com; Jiri Pirko <jiri@mellanox.com>;
> kwankhede@nvidia.com; davem@davemloft.net; kvm@vger.kernel.org;
> linux-kernel@vger.kernel.org; netdev@vger.kernel.org
> Subject: Re: [PATCH v2 1/6] mdev: Introduce sha1 based mdev alias
>
> On Fri, 30 Aug 2019 15:45:13 +0000
> Parav Pandit <parav@mellanox.com> wrote:
>
> > > > > > > This detour via the local variable looks weird to me. Can
> > > > > > > you either create the alias directly in the mdev (would need
> > > > > > > to happen later in the function, but I'm not sure why you
> > > > > > > generate the alias before checking for duplicates anyway), or do
> an explicit copy?
> > > > > > Alias duplicate check is done after generating it, because
> > > > > > duplicate alias are
> > > > > not allowed.
> > > > > > The probability of collision is rare.
> > > > > > So it is speculatively generated without hold the lock,
> > > > > > because there is no
> > > > > need to hold the lock.
> > > > > > It is compared along with guid while mutex lock is held in single
> loop.
> > > > > > And if it is duplicate, there is no need to allocate mdev.
> > > > > >
> > > > > > It will be sub optimal to run through the mdev list 2nd time
> > > > > > after mdev
> > > > > creation and after generating alias for duplicate check.
> > > > >
> > > > > Ok, but what about copying it? I find this "set local variable
> > > > > to NULL after ownership is transferred" pattern a bit unintuitive.
> > > > > Copying it to the mdev (and then unconditionally freeing it)
> > > > > looks more
> > > obvious to me.
> > > > Its not unconditionally freed.
> > >
> > > That's not what I have been saying :(
> > >
> > Ah I see. You want to allocate alias memory twice; once inside mdev device
> and another one in _create() function.
> > _create() one you want to free unconditionally.
> >
> > Well, passing pointer is fine.
>
> It's not that it doesn't work, but it feels fragile due to its non-obviousness.
And its well commented as Alex asked.
>
> > mdev_register_device() has similar little tricky pattern that makes parent =
> NULL on __find_parent_device() finds duplicate one.
>
> I don't think that the two are comparable.
>
They are very similar.
Why parent should be marked null otherwise.
> >
> > Ownership transfer is more straight forward code.
>
> I have to disagree here.
>
Ok. It is better than allocating memory twice. So I prefer to stick to this method.
> >
> > It is similar to device_initialize(), device init sequence code, where once
> device_initialize is done, freeing the device memory will be left to the
> put_device(), we don't call kfree() on mdev device.
>
> This does not really look similar to me: devices are refcounted structures,
> while strings aren't; you transfer a local pointer to a refcounted structure
> and then discard the local reference.
^ permalink raw reply
* RE: [PATCH net-next] r8152: modify rtl8152_set_speed function
From: Hayes Wang @ 2019-09-03 3:16 UTC (permalink / raw)
To: Heiner Kallweit, netdev@vger.kernel.org
Cc: nic_swsd, linux-kernel@vger.kernel.org
In-Reply-To: <280e6a3d-c6c3-ef32-a65d-19566190a1d3@gmail.com>
Heiner Kallweit [mailto:hkallweit1@gmail.com]
> Sent: Tuesday, September 03, 2019 2:37 AM
[...]
> Seeing all this code it might be a good idea to switch this driver
> to phylib, similar to what I did with r8169 some time ago.
It is too complex to be completed for me at the moment.
If this patch is unacceptable, I would submit other
patches first. Thanks.
Best Regards,
Hayes
^ permalink raw reply
* Re: how to search for the best route from userspace in netlink?
From: David Ahern @ 2019-09-03 3:13 UTC (permalink / raw)
To: Dave Taht, Linux Kernel Network Developers
In-Reply-To: <CAA93jw73AJMwLL+6cNLB2R6oqA2DyMYc1ZUsrFPndESs0ZONng@mail.gmail.com>
On 9/2/19 4:07 PM, Dave Taht wrote:
> Windows has the "RtmGetMostSpecificDestination" call:
> https://docs.microsoft.com/en-us/windows/win32/rras/search-for-the-best-route
>
> In particular, I wanted to search for the best route, AND pick up the
> PMTU from that (if it existed)
> for older UDP applications like dnssec[1] and newer ones like QUIC[2].
RTM_GETROUTE with data for the route lookup. See iproute2 code as an
example.
^ permalink raw reply
* Re: [PATCH v3] tun: fix use-after-free when register netdev failed
From: Jason Wang @ 2019-09-03 3:03 UTC (permalink / raw)
To: Yang Yingliang
Cc: David Miller, netdev, eric dumazet, xiyou wangcong, weiyongjun1
In-Reply-To: <5D6DC5BF.5020009@huawei.com>
On 2019/9/3 上午9:45, Yang Yingliang wrote:
>
>
> On 2019/9/2 13:32, Jason Wang wrote:
>>
>> On 2019/8/23 下午5:36, Yang Yingliang wrote:
>>>
>>>
>>> On 2019/8/23 11:05, Jason Wang wrote:
>>>> ----- Original Message -----
>>>>>
>>>>> On 2019/8/22 14:07, Yang Yingliang wrote:
>>>>>>
>>>>>> On 2019/8/22 10:13, Jason Wang wrote:
>>>>>>> On 2019/8/20 上午10:28, Jason Wang wrote:
>>>>>>>> On 2019/8/20 上午9:25, David Miller wrote:
>>>>>>>>> From: Yang Yingliang <yangyingliang@huawei.com>
>>>>>>>>> Date: Mon, 19 Aug 2019 21:31:19 +0800
>>>>>>>>>
>>>>>>>>>> Call tun_attach() after register_netdevice() to make sure
>>>>>>>>>> tfile->tun
>>>>>>>>>> is not published until the netdevice is registered. So the
>>>>>>>>>> read/write
>>>>>>>>>> thread can not use the tun pointer that may freed by
>>>>>>>>>> free_netdev().
>>>>>>>>>> (The tun and dev pointer are allocated by alloc_netdev_mqs(),
>>>>>>>>>> they
>>>>>>>>>> can
>>>>>>>>>> be freed by netdev_freemem().)
>>>>>>>>> register_netdevice() must always be the last operation in the
>>>>>>>>> order of
>>>>>>>>> network device setup.
>>>>>>>>>
>>>>>>>>> At the point register_netdevice() is called, the device is
>>>>>>>>> visible
>>>>>>>>> globally
>>>>>>>>> and therefore all of it's software state must be fully
>>>>>>>>> initialized and
>>>>>>>>> ready for us.
>>>>>>>>>
>>>>>>>>> You're going to have to find another solution to these problems.
>>>>>>>>
>>>>>>>> The device is loosely coupled with sockets/queues. Each side is
>>>>>>>> allowed to be go away without caring the other side. So in this
>>>>>>>> case, there's a small window that network stack think the
>>>>>>>> device has
>>>>>>>> one queue but actually not, the code can then safely drop them.
>>>>>>>> Maybe it's ok here with some comments?
>>>>>>>>
>>>>>>>> Or if not, we can try to hold the device before tun_attach and
>>>>>>>> drop
>>>>>>>> it after register_netdevice().
>>>>>>>
>>>>>>> Hi Yang:
>>>>>>>
>>>>>>> I think maybe we can try to hold refcnt instead of playing real num
>>>>>>> queues here. Do you want to post a V4?
>>>>>> I think the refcnt can prevent freeing the memory in this case.
>>>>>> When register_netdevice() failed, free_netdev() will be called
>>>>>> directly,
>>>>>> dev->pcpu_refcnt and dev are freed without checking refcnt of dev.
>>>>> How about using patch-v1 that using a flag to check whether the
>>>>> device
>>>>> registered successfully.
>>>>>
>>>> As I said, it lacks sufficient locks or barriers. To be clear, I meant
>>>> something like (compile-test only):
>>>>
>>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>>> index db16d7a13e00..e52678f9f049 100644
>>>> --- a/drivers/net/tun.c
>>>> +++ b/drivers/net/tun.c
>>>> @@ -2828,6 +2828,7 @@ static int tun_set_iff(struct net *net,
>>>> struct file *file, struct ifreq *ifr)
>>>> (ifr->ifr_flags & TUN_FEATURES);
>>>> INIT_LIST_HEAD(&tun->disabled);
>>>> + dev_hold(dev);
>>>> err = tun_attach(tun, file, false, ifr->ifr_flags
>>>> & IFF_NAPI,
>>>> ifr->ifr_flags & IFF_NAPI_FRAGS);
>>>> if (err < 0)
>>>> @@ -2836,6 +2837,7 @@ static int tun_set_iff(struct net *net,
>>>> struct file *file, struct ifreq *ifr)
>>>> err = register_netdevice(tun->dev);
>>>> if (err < 0)
>>>> goto err_detach;
>>>> + dev_put(dev);
>>>> }
>>>> netif_carrier_on(tun->dev);
>>>> @@ -2852,11 +2854,13 @@ static int tun_set_iff(struct net *net,
>>>> struct file *file, struct ifreq *ifr)
>>>> return 0;
>>>> err_detach:
>>>> + dev_put(dev);
>>>> tun_detach_all(dev);
>>>> /* register_netdevice() already called tun_free_netdev() */
>>>> goto err_free_dev;
>>>> err_free_flow:
>>>> + dev_put(dev);
>>>> tun_flow_uninit(tun);
>>>> security_tun_dev_free_security(tun->security);
>>>> err_free_stat:
>>>>
>>>> What's your thought?
>>>
>>> The dev pointer are freed without checking the refcount in
>>> free_netdev() called by err_free_dev
>>>
>>> path, so I don't understand how the refcount protects this pointer.
>>>
>>
>> The refcount are guaranteed to be zero there, isn't it?
> No, it's not.
>
> err_free_dev:
> free_netdev(dev);
>
> void free_netdev(struct net_device *dev)
> {
> ...
> /* pcpu_refcnt can be freed without checking refcount */
> free_percpu(dev->pcpu_refcnt);
> dev->pcpu_refcnt = NULL;
>
> /* Compatibility with error handling in drivers */
> if (dev->reg_state == NETREG_UNINITIALIZED) {
> /* dev can be freed without checking refcount */
> netdev_freemem(dev);
> return;
> }
> ...
> }
Right, but what I meant is in my patch, when code reaches free_netdev()
the refcnt is zero. What did I miss?
Thanks
>
>>
>> Thanks
>>
>>
>>> Thanks,
>>> Yang
>>>
>>>>
>>>> Thanks
>>>>
>>>> .
>>>>
>>>
>>>
>>
>> .
>>
>
>
^ permalink raw reply
* Re: [RFC v3] vhost: introduce mdev based hardware vhost backend
From: Jason Wang @ 2019-09-03 2:51 UTC (permalink / raw)
To: Tiwei Bie
Cc: mst, alex.williamson, maxime.coquelin, linux-kernel, kvm,
virtualization, netdev, dan.daly, cunming.liang, zhihong.wang,
lingshan.zhu
In-Reply-To: <20190903015602.GA11404@___>
On 2019/9/3 上午9:56, Tiwei Bie wrote:
> On Mon, Sep 02, 2019 at 12:15:05PM +0800, Jason Wang wrote:
>> On 2019/8/28 下午1:37, Tiwei Bie wrote:
>>> Details about this can be found here:
>>>
>>> https://lwn.net/Articles/750770/
>>>
>>> What's new in this version
>>> ==========================
>>>
>>> There are three choices based on the discussion [1] in RFC v2:
>>>
>>>> #1. We expose a VFIO device, so we can reuse the VFIO container/group
>>>> based DMA API and potentially reuse a lot of VFIO code in QEMU.
>>>>
>>>> But in this case, we have two choices for the VFIO device interface
>>>> (i.e. the interface on top of VFIO device fd):
>>>>
>>>> A) we may invent a new vhost protocol (as demonstrated by the code
>>>> in this RFC) on VFIO device fd to make it work in VFIO's way,
>>>> i.e. regions and irqs.
>>>>
>>>> B) Or as you proposed, instead of inventing a new vhost protocol,
>>>> we can reuse most existing vhost ioctls on the VFIO device fd
>>>> directly. There should be no conflicts between the VFIO ioctls
>>>> (type is 0x3B) and VHOST ioctls (type is 0xAF) currently.
>>>>
>>>> #2. Instead of exposing a VFIO device, we may expose a VHOST device.
>>>> And we will introduce a new mdev driver vhost-mdev to do this.
>>>> It would be natural to reuse the existing kernel vhost interface
>>>> (ioctls) on it as much as possible. But we will need to invent
>>>> some APIs for DMA programming (reusing VHOST_SET_MEM_TABLE is a
>>>> choice, but it's too heavy and doesn't support vIOMMU by itself).
>>> This version is more like a quick PoC to try Jason's proposal on
>>> reusing vhost ioctls. And the second way (#1/B) in above three
>>> choices was chosen in this version to demonstrate the idea quickly.
>>>
>>> Now the userspace API looks like this:
>>>
>>> - VFIO's container/group based IOMMU API is used to do the
>>> DMA programming.
>>>
>>> - Vhost's existing ioctls are used to setup the device.
>>>
>>> And the device will report device_api as "vfio-vhost".
>>>
>>> Note that, there are dirty hacks in this version. If we decide to
>>> go this way, some refactoring in vhost.c/vhost.h may be needed.
>>>
>>> PS. The direct mapping of the notify registers isn't implemented
>>> in this version.
>>>
>>> [1] https://lkml.org/lkml/2019/7/9/101
>>
>> Thanks for the patch, see comments inline.
>>
>>
>>> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
>>> ---
>>> drivers/vhost/Kconfig | 9 +
>>> drivers/vhost/Makefile | 3 +
>>> drivers/vhost/mdev.c | 382 +++++++++++++++++++++++++++++++++++++
>>> include/linux/vhost_mdev.h | 58 ++++++
>>> include/uapi/linux/vfio.h | 2 +
>>> include/uapi/linux/vhost.h | 8 +
>>> 6 files changed, 462 insertions(+)
>>> create mode 100644 drivers/vhost/mdev.c
>>> create mode 100644 include/linux/vhost_mdev.h
> [...]
>>> +
>>> + break;
>>> + }
>>> + case VFIO_DEVICE_GET_REGION_INFO:
>>> + case VFIO_DEVICE_GET_IRQ_INFO:
>>> + case VFIO_DEVICE_SET_IRQS:
>>> + case VFIO_DEVICE_RESET:
>>> + ret = -EINVAL;
>>> + break;
>>> +
>>> + case VHOST_MDEV_SET_STATE:
>>> + ret = vhost_set_state(vdpa, argp);
>>> + break;
>>
>> So this is used to start or stop the device. This means if userspace want to
>> drive a network device, the API is not 100% compatible. Any blocker for
>> this? E.g for SET_BACKEND, we can pass a fd and then identify the type of
>> backend.
> This is a legacy from the previous RFC code. I didn't try to
> get rid of it while getting this POC to work. I can try to make
> the vhost ioctls fully compatible with the existing userspace
> if possible.
That would be fine.
>
>> Another question is, how can user know the type of a device?
> Maybe we can introduce an attribute in $UUID/ to tell the type.
Yes, something like this or using mdev types and identify through
something similar to get_socket() etc.
>
>>
>>> + case VHOST_GET_FEATURES:
>>> + ret = vhost_get_features(vdpa, argp);
>>> + break;
>>> + case VHOST_SET_FEATURES:
>>> + ret = vhost_set_features(vdpa, argp);
>>> + break;
>>> + case VHOST_GET_VRING_BASE:
>>> + ret = vhost_get_vring_base(vdpa, argp);
>>> + break;
>>> + default:
>>> + ret = vhost_dev_ioctl(&vdpa->dev, cmd, argp);
>>> + if (ret == -ENOIOCTLCMD)
>>> + ret = vhost_vring_ioctl(&vdpa->dev, cmd, argp);
>>> + }
>>> +
>>> + return ret;
>>> +}
> [...]
>>> +struct mdev_device;
>>> +struct vhost_mdev;
>>> +
>>> +typedef int (*vhost_mdev_start_device_t)(struct vhost_mdev *vdpa);
>>> +typedef int (*vhost_mdev_stop_device_t)(struct vhost_mdev *vdpa);
>>> +typedef int (*vhost_mdev_set_features_t)(struct vhost_mdev *vdpa);
>>> +typedef void (*vhost_mdev_notify_device_t)(struct vhost_mdev *vdpa, int queue_id);
>>> +typedef u64 (*vhost_mdev_get_notify_addr_t)(struct vhost_mdev *vdpa, int queue_id);
>>> +typedef u16 (*vhost_mdev_get_vring_base_t)(struct vhost_mdev *vdpa, int queue_id);
>>> +typedef void (*vhost_mdev_features_changed_t)(struct vhost_mdev *vdpa);
>>> +
>>> +struct vhost_mdev_device_ops {
>>> + vhost_mdev_start_device_t start;
>>> + vhost_mdev_stop_device_t stop;
>>> + vhost_mdev_notify_device_t notify;
>>> + vhost_mdev_get_notify_addr_t get_notify_addr;
>>> + vhost_mdev_get_vring_base_t get_vring_base;
>>> + vhost_mdev_features_changed_t features_changed;
>>> +};
>>
>> Consider we want to implement a network device, who is going to implement
>> the device configuration space? I believe it's not good to invent another
>> set of API for doing this. So I believe we want something like
>> read_config/write_config here.
>>
>> Then I came up an idea:
>>
>> 1) introduce a new mdev bus transport, and a new mdev driver virtio_mdev
>> 2) vDPA (either software or hardware) can register as a device of virtio
>> mdev device
>> 3) then we can use kernel virtio driver to drive vDPA device and utilize
>> kernel networking/storage stack
>> 4) for userspace driver like vhost-mdev, it could be built of top of mdev
>> transport
>>
>> Having a full new transport for virtio, the advantages are obvious:
>>
>> 1) A generic solution for both kernel and userspace driver and support
>> configuration space access
>> 2) For kernel driver, exist kernel networking/storage stack could be reused,
>> and so did fast path implementation (e.g XDP, io_uring etc).
>> 2) For userspace driver, the function of virtio transport is a superset of
>> vhost, any API could be built on top easily (e.g vhost ioctl).
>>
>> What's your thought?
> This sounds interesting to me! ;)
>
> But I'm not quite sure whether it's the best choice to abstract
> vhost accelerators as virtio device in vDPA. Virtio device is
> the frontend device. There are some backend features missing in
> virtio currently. E.g. there is no way to tell the virtio device
> to do dirty page logging.
This could be extended via new mdev transport. E.g set an memory region
for logging dirty pages.
> Besides, e.g. the control vq in network
> case seems not a quite good interface for a backend device.
Yes, it was not implemented in current vhost-net. Having a new transport
will solve this issue.
> In
> this case, the userspace virtio-mdev driver in QEMU will do the
> DMA mapping to allow guest driver to be able to use GPA/IOVA to
> access the Rx/Tx queues of the virtio-mdev device directly, but
> I'm wondering will this userspace virtio-mdev driver in QEMU use
> similar IOVA to access the software based control vq of the same
> virtio-mdev device at the same time?
Let me clarify.
- The first thing is to introduce a new mdev based transport, this could
something similar to virtio MMIO but the command was set through mdev bus.
- Next step is to implement a mdev based transport for kernel driver,
this allow kernel virtio driver work with mdev device that implements
mdev transport. Then kernel networking or storage stack could be reused.
It work as, register a new mdev driver and a new mdev virtio transport,
then what it does is basically translate virtio_config_ops to mdev bus
command.
- The third step is to implement virtio mdev device (mdev transport).
Then kernel virtio driver can drive those device without any modification.
- The last part is to implement another mdev driver (e.g vhost-mdev),
this is for userspace driver. It did translation between userspace mdev
API (ioctl, etc) to mdev bus command.
For the question of IOVA, in mdev transport, we will have a command like:
#define VIRTIO_MDEV_QUEUE_DESC_LOW 0x080
#define VIRTIO_MDEV_QUEUE_DESC_HIGH 0x084
There's no need for the device know about what kind of address it is
(kernel or userspace, GPA or IOVA). It's the responsibility of parent
(mdev device) to do proper mapping. And mdev device implementation can
choose to emulate control vq, offloaded it fully to hardware or even
claim it was not supported.
Thanks
>
> Thanks,
> Tiwei
>
>> Thanks
>>
>>
>>> +
>>> +struct vhost_mdev *vhost_mdev_alloc(struct mdev_device *mdev,
>>> + void *private, int nvqs);
>>> +void vhost_mdev_free(struct vhost_mdev *vdpa);
>>> +
>>> +ssize_t vhost_mdev_read(struct mdev_device *mdev, char __user *buf,
>>> + size_t count, loff_t *ppos);
>>> +ssize_t vhost_mdev_write(struct mdev_device *mdev, const char __user *buf,
>>> + size_t count, loff_t *ppos);
>>> +long vhost_mdev_ioctl(struct mdev_device *mdev, unsigned int cmd,
>>> + unsigned long arg);
>>> +int vhost_mdev_mmap(struct mdev_device *mdev, struct vm_area_struct *vma);
>>> +int vhost_mdev_open(struct mdev_device *mdev);
>>> +void vhost_mdev_close(struct mdev_device *mdev);
>>> +
>>> +int vhost_mdev_set_device_ops(struct vhost_mdev *vdpa,
>>> + const struct vhost_mdev_device_ops *ops);
>>> +int vhost_mdev_set_features(struct vhost_mdev *vdpa, u64 features);
>>> +struct eventfd_ctx *vhost_mdev_get_call_ctx(struct vhost_mdev *vdpa,
>>> + int queue_id);
>>> +int vhost_mdev_get_acked_features(struct vhost_mdev *vdpa, u64 *features);
>>> +int vhost_mdev_get_vring_num(struct vhost_mdev *vdpa, int queue_id, u16 *num);
>>> +int vhost_mdev_get_vring_base(struct vhost_mdev *vdpa, int queue_id, u16 *base);
>>> +int vhost_mdev_get_vring_addr(struct vhost_mdev *vdpa, int queue_id,
>>> + struct vhost_vring_addr *addr);
>>> +int vhost_mdev_get_log_base(struct vhost_mdev *vdpa, int queue_id,
>>> + void **log_base, u64 *log_size);
>>> +struct mdev_device *vhost_mdev_get_mdev(struct vhost_mdev *vdpa);
>>> +void *vhost_mdev_get_private(struct vhost_mdev *vdpa);
>>> +
>>> +#endif /* _VHOST_MDEV_H */
>>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>>> index 8f10748dac79..0300d6831cc5 100644
>>> --- a/include/uapi/linux/vfio.h
>>> +++ b/include/uapi/linux/vfio.h
>>> @@ -201,6 +201,7 @@ struct vfio_device_info {
>>> #define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */
>>> #define VFIO_DEVICE_FLAGS_CCW (1 << 4) /* vfio-ccw device */
>>> #define VFIO_DEVICE_FLAGS_AP (1 << 5) /* vfio-ap device */
>>> +#define VFIO_DEVICE_FLAGS_VHOST (1 << 6) /* vfio-vhost device */
>>> __u32 num_regions; /* Max region index + 1 */
>>> __u32 num_irqs; /* Max IRQ index + 1 */
>>> };
>>> @@ -217,6 +218,7 @@ struct vfio_device_info {
>>> #define VFIO_DEVICE_API_AMBA_STRING "vfio-amba"
>>> #define VFIO_DEVICE_API_CCW_STRING "vfio-ccw"
>>> #define VFIO_DEVICE_API_AP_STRING "vfio-ap"
>>> +#define VFIO_DEVICE_API_VHOST_STRING "vfio-vhost"
>>> /**
>>> * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8,
>>> diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
>>> index 40d028eed645..5afbc2f08fa3 100644
>>> --- a/include/uapi/linux/vhost.h
>>> +++ b/include/uapi/linux/vhost.h
>>> @@ -116,4 +116,12 @@
>>> #define VHOST_VSOCK_SET_GUEST_CID _IOW(VHOST_VIRTIO, 0x60, __u64)
>>> #define VHOST_VSOCK_SET_RUNNING _IOW(VHOST_VIRTIO, 0x61, int)
>>> +/* VHOST_MDEV specific defines */
>>> +
>>> +#define VHOST_MDEV_SET_STATE _IOW(VHOST_VIRTIO, 0x70, __u64)
>>> +
>>> +#define VHOST_MDEV_S_STOPPED 0
>>> +#define VHOST_MDEV_S_RUNNING 1
>>> +#define VHOST_MDEV_S_MAX 2
>>> +
>>> #endif
^ permalink raw reply
* Re: [PATCH v4] rtw88: pci: Move a mass of jobs in hw IRQ to soft IRQ
From: Jian-Hong Pan @ 2019-09-03 2:45 UTC (permalink / raw)
To: Kalle Valo
Cc: Tony Chuang, David S . Miller, linux-wireless@vger.kernel.org,
netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
linux@endlessm.com
In-Reply-To: <875zmarivz.fsf@kamboji.qca.qualcomm.com>
Kalle Valo <kvalo@codeaurora.org> 於 2019年9月2日 週一 下午8:18寫道:
>
> Tony Chuang <yhchuang@realtek.com> writes:
>
> >> From: Jian-Hong Pan
> >> Subject: [PATCH v4] rtw88: pci: Move a mass of jobs in hw IRQ to soft IRQ
> >>
> >> There is a mass of jobs between spin lock and unlock in the hardware
> >> IRQ which will occupy much time originally. To make system work more
> >> efficiently, this patch moves the jobs to the soft IRQ (bottom half) to
> >> reduce the time in hardware IRQ.
> >>
> >> Signed-off-by: Jian-Hong Pan <jian-hong@endlessm.com>
> >
> > Now it works fine with MSI interrupt enabled.
> >
> > But this patch is conflicting with MSI interrupt patch.
> > Is there a better way we can make Kalle apply them more smoothly?
> > I can rebase them and submit both if you're OK.
The rebase work is appreciated.
Thank you,
Jian-Hong Pan
> Yeah, submitting all the MSI patches in the same patchset is the easiest
> approach. That way they apply cleanly to wireless-drivers-next.
>
> --
> https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches
^ permalink raw reply
* Re: [PATCHv2 1/1] net: rds: add service level support in rds-info
From: Gustavo A. R. Silva @ 2019-09-03 1:58 UTC (permalink / raw)
To: Zhu Yanjun, santosh.shilimkar, davem, netdev, linux-rdma,
rds-devel, gerd.rausch
In-Reply-To: <1566608656-30836-1-git-send-email-yanjun.zhu@oracle.com>
Hi,
On 8/23/19 8:04 PM, Zhu Yanjun wrote:
[..]
> diff --git a/net/rds/ib.c b/net/rds/ib.c
> index ec05d91..45acab2 100644
> --- a/net/rds/ib.c
> +++ b/net/rds/ib.c
> @@ -291,7 +291,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
> void *buffer)
> {
> struct rds_info_rdma_connection *iinfo = buffer;
> - struct rds_ib_connection *ic;
> + struct rds_ib_connection *ic = conn->c_transport_data;
>
> /* We will only ever look at IB transports */
> if (conn->c_trans != &rds_ib_transport)
> @@ -301,15 +301,16 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
>
> iinfo->src_addr = conn->c_laddr.s6_addr32[3];
> iinfo->dst_addr = conn->c_faddr.s6_addr32[3];
> - iinfo->tos = conn->c_tos;
> + if (ic) {
Is this null-check actually necessary? (see related comments below...)
> + iinfo->tos = conn->c_tos;
> + iinfo->sl = ic->i_sl;
> + }
>
> memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
> memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
> if (rds_conn_state(conn) == RDS_CONN_UP) {
> struct rds_ib_device *rds_ibdev;
>
> - ic = conn->c_transport_data;
> -
> rdma_read_gids(ic->i_cm_id, (union ib_gid *)&iinfo->src_gid,
Notice that *ic* is dereferenced here without null-checking it. More
comments below...
> (union ib_gid *)&iinfo->dst_gid);
>
> @@ -329,7 +330,7 @@ static int rds6_ib_conn_info_visitor(struct rds_connection *conn,
> void *buffer)
> {
> struct rds6_info_rdma_connection *iinfo6 = buffer;
> - struct rds_ib_connection *ic;
> + struct rds_ib_connection *ic = conn->c_transport_data;
>
> /* We will only ever look at IB transports */
> if (conn->c_trans != &rds_ib_transport)
> @@ -337,6 +338,10 @@ static int rds6_ib_conn_info_visitor(struct rds_connection *conn,
>
> iinfo6->src_addr = conn->c_laddr;
> iinfo6->dst_addr = conn->c_faddr;
> + if (ic) {
> + iinfo6->tos = conn->c_tos;
> + iinfo6->sl = ic->i_sl;
> + }
>
> memset(&iinfo6->src_gid, 0, sizeof(iinfo6->src_gid));
> memset(&iinfo6->dst_gid, 0, sizeof(iinfo6->dst_gid));
> @@ -344,7 +349,6 @@ static int rds6_ib_conn_info_visitor(struct rds_connection *conn,
> if (rds_conn_state(conn) == RDS_CONN_UP) {
> struct rds_ib_device *rds_ibdev;
>
> - ic = conn->c_transport_data;
> rdma_read_gids(ic->i_cm_id, (union ib_gid *)&iinfo6->src_gid,
Again, *ic* is being dereferenced here without a previous null-check.
> (union ib_gid *)&iinfo6->dst_gid);
> rds_ibdev = ic->rds_ibdev;
--
Gustavo
^ permalink raw reply
* Re: [PATCH] net-ipv6: fix excessive RTF_ADDRCONF flag on ::1/128 local route (and others)
From: David Ahern @ 2019-09-03 2:18 UTC (permalink / raw)
To: Lorenzo Colitti, Maciej Żenczykowski
Cc: Maciej Żenczykowski, David S . Miller, Linux NetDev
In-Reply-To: <CAKD1Yr2tcRiiLwGdTB3TwpxoAH0+R=dgfCDh6TpZ2fHTE2rC9w@mail.gmail.com>
On 9/1/19 8:12 PM, Lorenzo Colitti wrote:
> Not sure if this patch is the right fix, though, because it breaks
> things in the opposite direction: even routes created by an IPv6
> address added by receiving an RA will no longer have RTF_ADDRCONF.
> Perhaps add something like this as well?
>
> struct fib6_info *addrconf_f6i_alloc(struct net *net, struct inet6_dev *idev,
> - const struct in6_addr *addr, bool anycast,
> - const struct in6_addr *addr, u8 flags,
> gfp_t gfp_flags);
>
> flags would be RTF_ANYCAST iff the code previously called with true,
> and RTF_ADDRCONF if called by a function that is adding an IPv6
> address coming from an RA.
addrconf_f6i_alloc is used for addresses added by userspace
(ipv6_add_addr) and anycast. ie., from what I can see it is not used for RAs
^ permalink raw reply
* Re: [PATCH v2] net-ipv6: fix excessive RTF_ADDRCONF flag on ::1/128 local route (and others)
From: David Ahern @ 2019-09-03 2:14 UTC (permalink / raw)
To: Maciej Żenczykowski, Maciej Żenczykowski,
David S . Miller
Cc: netdev, Lorenzo Colitti
In-Reply-To: <20190902162336.240405-1-zenczykowski@gmail.com>
On 9/2/19 10:23 AM, Maciej Żenczykowski wrote:
> From: Maciej Żenczykowski <maze@google.com>
>
> There is a subtle change in behaviour introduced by:
> commit c7a1ce397adacaf5d4bb2eab0a738b5f80dc3e43
> 'ipv6: Change addrconf_f6i_alloc to use ip6_route_info_create'
>
> Before that patch /proc/net/ipv6_route includes:
> 00000000000000000000000000000001 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000003 00000000 80200001 lo
>
> Afterwards /proc/net/ipv6_route includes:
> 00000000000000000000000000000001 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000002 00000000 80240001 lo
>
> ie. the above commit causes the ::1/128 local (automatic) route to be flagged with RTF_ADDRCONF (0x040000).
>
> AFAICT, this is incorrect since these routes are *not* coming from RA's.
>
> As such, this patch restores the old behaviour.
>
> Fixes: c7a1ce397adacaf5d4bb2eab0a738b5f80dc3e43
> Cc: David Ahern <dsahern@gmail.com>
> Cc: Lorenzo Colitti <lorenzo@google.com>
> Signed-off-by: Maciej Żenczykowski <maze@google.com>
> ---
> net/ipv6/route.c | 8 ++++++--
> 1 file changed, 6 insertions(+), 2 deletions(-)
>
Looks correct to me. Thanks for the patch.
Reviewed-by: David Ahern <dsahern@gmail.com>
^ permalink raw reply
* Re: [RFC v3] vhost: introduce mdev based hardware vhost backend
From: Tiwei Bie @ 2019-09-03 1:56 UTC (permalink / raw)
To: Jason Wang
Cc: mst, alex.williamson, maxime.coquelin, linux-kernel, kvm,
virtualization, netdev, dan.daly, cunming.liang, zhihong.wang,
lingshan.zhu
In-Reply-To: <b91820c4-2fe2-55ee-5089-5f7c94322521@redhat.com>
On Mon, Sep 02, 2019 at 12:15:05PM +0800, Jason Wang wrote:
> On 2019/8/28 下午1:37, Tiwei Bie wrote:
> > Details about this can be found here:
> >
> > https://lwn.net/Articles/750770/
> >
> > What's new in this version
> > ==========================
> >
> > There are three choices based on the discussion [1] in RFC v2:
> >
> > > #1. We expose a VFIO device, so we can reuse the VFIO container/group
> > > based DMA API and potentially reuse a lot of VFIO code in QEMU.
> > >
> > > But in this case, we have two choices for the VFIO device interface
> > > (i.e. the interface on top of VFIO device fd):
> > >
> > > A) we may invent a new vhost protocol (as demonstrated by the code
> > > in this RFC) on VFIO device fd to make it work in VFIO's way,
> > > i.e. regions and irqs.
> > >
> > > B) Or as you proposed, instead of inventing a new vhost protocol,
> > > we can reuse most existing vhost ioctls on the VFIO device fd
> > > directly. There should be no conflicts between the VFIO ioctls
> > > (type is 0x3B) and VHOST ioctls (type is 0xAF) currently.
> > >
> > > #2. Instead of exposing a VFIO device, we may expose a VHOST device.
> > > And we will introduce a new mdev driver vhost-mdev to do this.
> > > It would be natural to reuse the existing kernel vhost interface
> > > (ioctls) on it as much as possible. But we will need to invent
> > > some APIs for DMA programming (reusing VHOST_SET_MEM_TABLE is a
> > > choice, but it's too heavy and doesn't support vIOMMU by itself).
> > This version is more like a quick PoC to try Jason's proposal on
> > reusing vhost ioctls. And the second way (#1/B) in above three
> > choices was chosen in this version to demonstrate the idea quickly.
> >
> > Now the userspace API looks like this:
> >
> > - VFIO's container/group based IOMMU API is used to do the
> > DMA programming.
> >
> > - Vhost's existing ioctls are used to setup the device.
> >
> > And the device will report device_api as "vfio-vhost".
> >
> > Note that, there are dirty hacks in this version. If we decide to
> > go this way, some refactoring in vhost.c/vhost.h may be needed.
> >
> > PS. The direct mapping of the notify registers isn't implemented
> > in this version.
> >
> > [1] https://lkml.org/lkml/2019/7/9/101
>
>
> Thanks for the patch, see comments inline.
>
>
> >
> > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> > ---
> > drivers/vhost/Kconfig | 9 +
> > drivers/vhost/Makefile | 3 +
> > drivers/vhost/mdev.c | 382 +++++++++++++++++++++++++++++++++++++
> > include/linux/vhost_mdev.h | 58 ++++++
> > include/uapi/linux/vfio.h | 2 +
> > include/uapi/linux/vhost.h | 8 +
> > 6 files changed, 462 insertions(+)
> > create mode 100644 drivers/vhost/mdev.c
> > create mode 100644 include/linux/vhost_mdev.h
[...]
> > +
> > + break;
> > + }
> > + case VFIO_DEVICE_GET_REGION_INFO:
> > + case VFIO_DEVICE_GET_IRQ_INFO:
> > + case VFIO_DEVICE_SET_IRQS:
> > + case VFIO_DEVICE_RESET:
> > + ret = -EINVAL;
> > + break;
> > +
> > + case VHOST_MDEV_SET_STATE:
> > + ret = vhost_set_state(vdpa, argp);
> > + break;
>
>
> So this is used to start or stop the device. This means if userspace want to
> drive a network device, the API is not 100% compatible. Any blocker for
> this? E.g for SET_BACKEND, we can pass a fd and then identify the type of
> backend.
This is a legacy from the previous RFC code. I didn't try to
get rid of it while getting this POC to work. I can try to make
the vhost ioctls fully compatible with the existing userspace
if possible.
>
> Another question is, how can user know the type of a device?
Maybe we can introduce an attribute in $UUID/ to tell the type.
>
>
> > + case VHOST_GET_FEATURES:
> > + ret = vhost_get_features(vdpa, argp);
> > + break;
> > + case VHOST_SET_FEATURES:
> > + ret = vhost_set_features(vdpa, argp);
> > + break;
> > + case VHOST_GET_VRING_BASE:
> > + ret = vhost_get_vring_base(vdpa, argp);
> > + break;
> > + default:
> > + ret = vhost_dev_ioctl(&vdpa->dev, cmd, argp);
> > + if (ret == -ENOIOCTLCMD)
> > + ret = vhost_vring_ioctl(&vdpa->dev, cmd, argp);
> > + }
> > +
> > + return ret;
> > +}
[...]
> > +struct mdev_device;
> > +struct vhost_mdev;
> > +
> > +typedef int (*vhost_mdev_start_device_t)(struct vhost_mdev *vdpa);
> > +typedef int (*vhost_mdev_stop_device_t)(struct vhost_mdev *vdpa);
> > +typedef int (*vhost_mdev_set_features_t)(struct vhost_mdev *vdpa);
> > +typedef void (*vhost_mdev_notify_device_t)(struct vhost_mdev *vdpa, int queue_id);
> > +typedef u64 (*vhost_mdev_get_notify_addr_t)(struct vhost_mdev *vdpa, int queue_id);
> > +typedef u16 (*vhost_mdev_get_vring_base_t)(struct vhost_mdev *vdpa, int queue_id);
> > +typedef void (*vhost_mdev_features_changed_t)(struct vhost_mdev *vdpa);
> > +
> > +struct vhost_mdev_device_ops {
> > + vhost_mdev_start_device_t start;
> > + vhost_mdev_stop_device_t stop;
> > + vhost_mdev_notify_device_t notify;
> > + vhost_mdev_get_notify_addr_t get_notify_addr;
> > + vhost_mdev_get_vring_base_t get_vring_base;
> > + vhost_mdev_features_changed_t features_changed;
> > +};
>
>
> Consider we want to implement a network device, who is going to implement
> the device configuration space? I believe it's not good to invent another
> set of API for doing this. So I believe we want something like
> read_config/write_config here.
>
> Then I came up an idea:
>
> 1) introduce a new mdev bus transport, and a new mdev driver virtio_mdev
> 2) vDPA (either software or hardware) can register as a device of virtio
> mdev device
> 3) then we can use kernel virtio driver to drive vDPA device and utilize
> kernel networking/storage stack
> 4) for userspace driver like vhost-mdev, it could be built of top of mdev
> transport
>
> Having a full new transport for virtio, the advantages are obvious:
>
> 1) A generic solution for both kernel and userspace driver and support
> configuration space access
> 2) For kernel driver, exist kernel networking/storage stack could be reused,
> and so did fast path implementation (e.g XDP, io_uring etc).
> 2) For userspace driver, the function of virtio transport is a superset of
> vhost, any API could be built on top easily (e.g vhost ioctl).
>
> What's your thought?
This sounds interesting to me! ;)
But I'm not quite sure whether it's the best choice to abstract
vhost accelerators as virtio device in vDPA. Virtio device is
the frontend device. There are some backend features missing in
virtio currently. E.g. there is no way to tell the virtio device
to do dirty page logging. Besides, e.g. the control vq in network
case seems not a quite good interface for a backend device. In
this case, the userspace virtio-mdev driver in QEMU will do the
DMA mapping to allow guest driver to be able to use GPA/IOVA to
access the Rx/Tx queues of the virtio-mdev device directly, but
I'm wondering will this userspace virtio-mdev driver in QEMU use
similar IOVA to access the software based control vq of the same
virtio-mdev device at the same time?
Thanks,
Tiwei
>
> Thanks
>
>
> > +
> > +struct vhost_mdev *vhost_mdev_alloc(struct mdev_device *mdev,
> > + void *private, int nvqs);
> > +void vhost_mdev_free(struct vhost_mdev *vdpa);
> > +
> > +ssize_t vhost_mdev_read(struct mdev_device *mdev, char __user *buf,
> > + size_t count, loff_t *ppos);
> > +ssize_t vhost_mdev_write(struct mdev_device *mdev, const char __user *buf,
> > + size_t count, loff_t *ppos);
> > +long vhost_mdev_ioctl(struct mdev_device *mdev, unsigned int cmd,
> > + unsigned long arg);
> > +int vhost_mdev_mmap(struct mdev_device *mdev, struct vm_area_struct *vma);
> > +int vhost_mdev_open(struct mdev_device *mdev);
> > +void vhost_mdev_close(struct mdev_device *mdev);
> > +
> > +int vhost_mdev_set_device_ops(struct vhost_mdev *vdpa,
> > + const struct vhost_mdev_device_ops *ops);
> > +int vhost_mdev_set_features(struct vhost_mdev *vdpa, u64 features);
> > +struct eventfd_ctx *vhost_mdev_get_call_ctx(struct vhost_mdev *vdpa,
> > + int queue_id);
> > +int vhost_mdev_get_acked_features(struct vhost_mdev *vdpa, u64 *features);
> > +int vhost_mdev_get_vring_num(struct vhost_mdev *vdpa, int queue_id, u16 *num);
> > +int vhost_mdev_get_vring_base(struct vhost_mdev *vdpa, int queue_id, u16 *base);
> > +int vhost_mdev_get_vring_addr(struct vhost_mdev *vdpa, int queue_id,
> > + struct vhost_vring_addr *addr);
> > +int vhost_mdev_get_log_base(struct vhost_mdev *vdpa, int queue_id,
> > + void **log_base, u64 *log_size);
> > +struct mdev_device *vhost_mdev_get_mdev(struct vhost_mdev *vdpa);
> > +void *vhost_mdev_get_private(struct vhost_mdev *vdpa);
> > +
> > +#endif /* _VHOST_MDEV_H */
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index 8f10748dac79..0300d6831cc5 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -201,6 +201,7 @@ struct vfio_device_info {
> > #define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */
> > #define VFIO_DEVICE_FLAGS_CCW (1 << 4) /* vfio-ccw device */
> > #define VFIO_DEVICE_FLAGS_AP (1 << 5) /* vfio-ap device */
> > +#define VFIO_DEVICE_FLAGS_VHOST (1 << 6) /* vfio-vhost device */
> > __u32 num_regions; /* Max region index + 1 */
> > __u32 num_irqs; /* Max IRQ index + 1 */
> > };
> > @@ -217,6 +218,7 @@ struct vfio_device_info {
> > #define VFIO_DEVICE_API_AMBA_STRING "vfio-amba"
> > #define VFIO_DEVICE_API_CCW_STRING "vfio-ccw"
> > #define VFIO_DEVICE_API_AP_STRING "vfio-ap"
> > +#define VFIO_DEVICE_API_VHOST_STRING "vfio-vhost"
> > /**
> > * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8,
> > diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
> > index 40d028eed645..5afbc2f08fa3 100644
> > --- a/include/uapi/linux/vhost.h
> > +++ b/include/uapi/linux/vhost.h
> > @@ -116,4 +116,12 @@
> > #define VHOST_VSOCK_SET_GUEST_CID _IOW(VHOST_VIRTIO, 0x60, __u64)
> > #define VHOST_VSOCK_SET_RUNNING _IOW(VHOST_VIRTIO, 0x61, int)
> > +/* VHOST_MDEV specific defines */
> > +
> > +#define VHOST_MDEV_SET_STATE _IOW(VHOST_VIRTIO, 0x70, __u64)
> > +
> > +#define VHOST_MDEV_S_STOPPED 0
> > +#define VHOST_MDEV_S_RUNNING 1
> > +#define VHOST_MDEV_S_MAX 2
> > +
> > #endif
^ permalink raw reply
* Re: [PATCH v3] tun: fix use-after-free when register netdev failed
From: Yang Yingliang @ 2019-09-03 1:45 UTC (permalink / raw)
To: Jason Wang
Cc: David Miller, netdev, eric dumazet, xiyou wangcong, weiyongjun1
In-Reply-To: <1be732b2-6eda-4ea6-772d-780694557910@redhat.com>
On 2019/9/2 13:32, Jason Wang wrote:
>
> On 2019/8/23 下午5:36, Yang Yingliang wrote:
>>
>>
>> On 2019/8/23 11:05, Jason Wang wrote:
>>> ----- Original Message -----
>>>>
>>>> On 2019/8/22 14:07, Yang Yingliang wrote:
>>>>>
>>>>> On 2019/8/22 10:13, Jason Wang wrote:
>>>>>> On 2019/8/20 上午10:28, Jason Wang wrote:
>>>>>>> On 2019/8/20 上午9:25, David Miller wrote:
>>>>>>>> From: Yang Yingliang <yangyingliang@huawei.com>
>>>>>>>> Date: Mon, 19 Aug 2019 21:31:19 +0800
>>>>>>>>
>>>>>>>>> Call tun_attach() after register_netdevice() to make sure
>>>>>>>>> tfile->tun
>>>>>>>>> is not published until the netdevice is registered. So the
>>>>>>>>> read/write
>>>>>>>>> thread can not use the tun pointer that may freed by
>>>>>>>>> free_netdev().
>>>>>>>>> (The tun and dev pointer are allocated by alloc_netdev_mqs(),
>>>>>>>>> they
>>>>>>>>> can
>>>>>>>>> be freed by netdev_freemem().)
>>>>>>>> register_netdevice() must always be the last operation in the
>>>>>>>> order of
>>>>>>>> network device setup.
>>>>>>>>
>>>>>>>> At the point register_netdevice() is called, the device is visible
>>>>>>>> globally
>>>>>>>> and therefore all of it's software state must be fully
>>>>>>>> initialized and
>>>>>>>> ready for us.
>>>>>>>>
>>>>>>>> You're going to have to find another solution to these problems.
>>>>>>>
>>>>>>> The device is loosely coupled with sockets/queues. Each side is
>>>>>>> allowed to be go away without caring the other side. So in this
>>>>>>> case, there's a small window that network stack think the device
>>>>>>> has
>>>>>>> one queue but actually not, the code can then safely drop them.
>>>>>>> Maybe it's ok here with some comments?
>>>>>>>
>>>>>>> Or if not, we can try to hold the device before tun_attach and drop
>>>>>>> it after register_netdevice().
>>>>>>
>>>>>> Hi Yang:
>>>>>>
>>>>>> I think maybe we can try to hold refcnt instead of playing real num
>>>>>> queues here. Do you want to post a V4?
>>>>> I think the refcnt can prevent freeing the memory in this case.
>>>>> When register_netdevice() failed, free_netdev() will be called
>>>>> directly,
>>>>> dev->pcpu_refcnt and dev are freed without checking refcnt of dev.
>>>> How about using patch-v1 that using a flag to check whether the device
>>>> registered successfully.
>>>>
>>> As I said, it lacks sufficient locks or barriers. To be clear, I meant
>>> something like (compile-test only):
>>>
>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>> index db16d7a13e00..e52678f9f049 100644
>>> --- a/drivers/net/tun.c
>>> +++ b/drivers/net/tun.c
>>> @@ -2828,6 +2828,7 @@ static int tun_set_iff(struct net *net, struct
>>> file *file, struct ifreq *ifr)
>>> (ifr->ifr_flags & TUN_FEATURES);
>>> INIT_LIST_HEAD(&tun->disabled);
>>> + dev_hold(dev);
>>> err = tun_attach(tun, file, false, ifr->ifr_flags &
>>> IFF_NAPI,
>>> ifr->ifr_flags & IFF_NAPI_FRAGS);
>>> if (err < 0)
>>> @@ -2836,6 +2837,7 @@ static int tun_set_iff(struct net *net, struct
>>> file *file, struct ifreq *ifr)
>>> err = register_netdevice(tun->dev);
>>> if (err < 0)
>>> goto err_detach;
>>> + dev_put(dev);
>>> }
>>> netif_carrier_on(tun->dev);
>>> @@ -2852,11 +2854,13 @@ static int tun_set_iff(struct net *net,
>>> struct file *file, struct ifreq *ifr)
>>> return 0;
>>> err_detach:
>>> + dev_put(dev);
>>> tun_detach_all(dev);
>>> /* register_netdevice() already called tun_free_netdev() */
>>> goto err_free_dev;
>>> err_free_flow:
>>> + dev_put(dev);
>>> tun_flow_uninit(tun);
>>> security_tun_dev_free_security(tun->security);
>>> err_free_stat:
>>>
>>> What's your thought?
>>
>> The dev pointer are freed without checking the refcount in
>> free_netdev() called by err_free_dev
>>
>> path, so I don't understand how the refcount protects this pointer.
>>
>
> The refcount are guaranteed to be zero there, isn't it?
No, it's not.
err_free_dev:
free_netdev(dev);
void free_netdev(struct net_device *dev)
{
...
/* pcpu_refcnt can be freed without checking refcount */
free_percpu(dev->pcpu_refcnt);
dev->pcpu_refcnt = NULL;
/* Compatibility with error handling in drivers */
if (dev->reg_state == NETREG_UNINITIALIZED) {
/* dev can be freed without checking refcount */
netdev_freemem(dev);
return;
}
...
}
>
> Thanks
>
>
>> Thanks,
>> Yang
>>
>>>
>>> Thanks
>>>
>>> .
>>>
>>
>>
>
> .
>
^ permalink raw reply
* Re: [PATCH net] sctp: use transport pf_retrans in sctp_do_8_2_transport_strike
From: Marcelo Ricardo Leitner @ 2019-09-03 1:31 UTC (permalink / raw)
To: Xin Long; +Cc: network dev, linux-sctp, davem, Neil Horman
In-Reply-To: <41769d6033d27d629798e060671a3b21f22e2a21.1567437861.git.lucien.xin@gmail.com>
On Mon, Sep 02, 2019 at 11:24:21PM +0800, Xin Long wrote:
> Transport should use its own pf_retrans to do the error_count
> check, instead of asoc's. Otherwise, it's meaningless to make
> pf_retrans per transport.
>
> Fixes: 5aa93bcf66f4 ("sctp: Implement quick failover draft from tsvwg")
> Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> ---
> net/sctp/sm_sideeffect.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
> index 1cf5bb5..e52b212 100644
> --- a/net/sctp/sm_sideeffect.c
> +++ b/net/sctp/sm_sideeffect.c
> @@ -547,7 +547,7 @@ static void sctp_do_8_2_transport_strike(struct sctp_cmd_seq *commands,
> if (net->sctp.pf_enable &&
> (transport->state == SCTP_ACTIVE) &&
> (transport->error_count < transport->pathmaxrxt) &&
> - (transport->error_count > asoc->pf_retrans)) {
> + (transport->error_count > transport->pf_retrans)) {
>
> sctp_assoc_control_transport(asoc, transport,
> SCTP_TRANSPORT_PF,
> --
> 2.1.0
>
^ permalink raw reply
* [PATCH] net: sched: taprio: Fix potential integer overflow in taprio_set_picos_per_byte
From: Gustavo A. R. Silva @ 2019-09-03 1:08 UTC (permalink / raw)
To: Jamal Hadi Salim, Cong Wang, Jiri Pirko, David S. Miller,
Vladimir Oltean
Cc: netdev, linux-kernel, Gustavo A. R. Silva
Add suffix LL to constant 1000 in order to avoid a potential integer
overflow and give the compiler complete information about the proper
arithmetic to use. Notice that this constant is being used in a context
that expects an expression of type s64, but it's currently evaluated
using 32-bit arithmetic.
Addresses-Coverity-ID: 1453459 ("Unintentional integer overflow")
Fixes: f04b514c0ce2 ("taprio: Set default link speed to 10 Mbps in taprio_set_picos_per_byte")
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
net/sched/sch_taprio.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index 8d8bc2ec5cd6..956f837436ea 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -966,7 +966,7 @@ static void taprio_set_picos_per_byte(struct net_device *dev,
skip:
picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8,
- speed * 1000 * 1000);
+ speed * 1000LL * 1000);
atomic64_set(&q->picos_per_byte, picos_per_byte);
netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n",
--
2.23.0
^ permalink raw reply related
* Re: [PATCH] net: sched: taprio: Fix potential integer overflow in taprio_set_picos_per_byte
From: Vladimir Oltean @ 2019-09-03 1:20 UTC (permalink / raw)
To: Gustavo A. R. Silva
Cc: Jamal Hadi Salim, Cong Wang, Jiri Pirko, David S. Miller, netdev,
lkml, Vinicius Costa Gomes
In-Reply-To: <20190903010817.GA13595@embeddedor>
On Tue, 3 Sep 2019 at 04:08, Gustavo A. R. Silva <gustavo@embeddedor.com> wrote:
>
> Add suffix LL to constant 1000 in order to avoid a potential integer
> overflow and give the compiler complete information about the proper
> arithmetic to use. Notice that this constant is being used in a context
> that expects an expression of type s64, but it's currently evaluated
> using 32-bit arithmetic.
>
> Addresses-Coverity-ID: 1453459 ("Unintentional integer overflow")
> Fixes: f04b514c0ce2 ("taprio: Set default link speed to 10 Mbps in taprio_set_picos_per_byte")
> Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
> ---
Acked-by: Vladimir Oltean <olteanv@gmail.com>
> net/sched/sch_taprio.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
> index 8d8bc2ec5cd6..956f837436ea 100644
> --- a/net/sched/sch_taprio.c
> +++ b/net/sched/sch_taprio.c
> @@ -966,7 +966,7 @@ static void taprio_set_picos_per_byte(struct net_device *dev,
>
> skip:
> picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8,
> - speed * 1000 * 1000);
> + speed * 1000LL * 1000);
>
> atomic64_set(&q->picos_per_byte, picos_per_byte);
> netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n",
> --
> 2.23.0
>
^ permalink raw reply
* Re: [PATCH] net: hisilicon: Variable "reg_value" in function mdio_sc_cfg_reg_write() could be uninitialized
From: kbuild test robot @ 2019-09-03 1:17 UTC (permalink / raw)
To: Yizhuo
Cc: kbuild-all, csong, zhiyunq, Yizhuo, Yisen Zhuang, Salil Mehta,
David S. Miller, netdev, linux-kernel
In-Reply-To: <20190902231510.21374-1-yzhai003@ucr.edu>
[-- Attachment #1: Type: text/plain, Size: 6635 bytes --]
Hi Yizhuo,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on linus/master]
[cannot apply to v5.3-rc7 next-20190902]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/Yizhuo/net-hisilicon-Variable-reg_value-in-function-mdio_sc_cfg_reg_write-could-be-uninitialized/20190903-071544
config: sh-allmodconfig (attached as .config)
compiler: sh4-linux-gcc (GCC) 7.4.0
reproduce:
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
GCC_VERSION=7.4.0 make.cross ARCH=sh
If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>
All errors (new ones prefixed by >>):
In file included from include/linux/acpi.h:15:0,
from drivers/net//ethernet/hisilicon/hns_mdio.c:6:
drivers/net//ethernet/hisilicon/hns_mdio.c: In function 'mdio_sc_cfg_reg_write':
>> drivers/net//ethernet/hisilicon/hns_mdio.c:158:20: error: 'struct hns_mdio_device' has no member named 'regmap'
dev_err(mdio_dev->regmap->dev, "Fail to read from the register\n");
^
include/linux/device.h:1499:11: note: in definition of macro 'dev_err'
_dev_err(dev, dev_fmt(fmt), ##__VA_ARGS__)
^~~
vim +158 drivers/net//ethernet/hisilicon/hns_mdio.c
> 6 #include <linux/acpi.h>
7 #include <linux/errno.h>
8 #include <linux/etherdevice.h>
9 #include <linux/init.h>
10 #include <linux/kernel.h>
11 #include <linux/mfd/syscon.h>
12 #include <linux/module.h>
13 #include <linux/mutex.h>
14 #include <linux/netdevice.h>
15 #include <linux/of_address.h>
16 #include <linux/of.h>
17 #include <linux/of_mdio.h>
18 #include <linux/of_platform.h>
19 #include <linux/phy.h>
20 #include <linux/platform_device.h>
21 #include <linux/regmap.h>
22
23 #define MDIO_DRV_NAME "Hi-HNS_MDIO"
24 #define MDIO_BUS_NAME "Hisilicon MII Bus"
25
26 #define MDIO_TIMEOUT 1000000
27
28 struct hns_mdio_sc_reg {
29 u16 mdio_clk_en;
30 u16 mdio_clk_dis;
31 u16 mdio_reset_req;
32 u16 mdio_reset_dreq;
33 u16 mdio_clk_st;
34 u16 mdio_reset_st;
35 };
36
37 struct hns_mdio_device {
38 u8 __iomem *vbase; /* mdio reg base address */
39 struct regmap *subctrl_vbase;
40 struct hns_mdio_sc_reg sc_reg;
41 };
42
43 /* mdio reg */
44 #define MDIO_COMMAND_REG 0x0
45 #define MDIO_ADDR_REG 0x4
46 #define MDIO_WDATA_REG 0x8
47 #define MDIO_RDATA_REG 0xc
48 #define MDIO_STA_REG 0x10
49
50 /* cfg phy bit map */
51 #define MDIO_CMD_DEVAD_M 0x1f
52 #define MDIO_CMD_DEVAD_S 0
53 #define MDIO_CMD_PRTAD_M 0x1f
54 #define MDIO_CMD_PRTAD_S 5
55 #define MDIO_CMD_OP_S 10
56 #define MDIO_CMD_ST_S 12
57 #define MDIO_CMD_START_B 14
58
59 #define MDIO_ADDR_DATA_M 0xffff
60 #define MDIO_ADDR_DATA_S 0
61
62 #define MDIO_WDATA_DATA_M 0xffff
63 #define MDIO_WDATA_DATA_S 0
64
65 #define MDIO_RDATA_DATA_M 0xffff
66 #define MDIO_RDATA_DATA_S 0
67
68 #define MDIO_STATE_STA_B 0
69
70 enum mdio_st_clause {
71 MDIO_ST_CLAUSE_45 = 0,
72 MDIO_ST_CLAUSE_22
73 };
74
75 enum mdio_c22_op_seq {
76 MDIO_C22_WRITE = 1,
77 MDIO_C22_READ = 2
78 };
79
80 enum mdio_c45_op_seq {
81 MDIO_C45_WRITE_ADDR = 0,
82 MDIO_C45_WRITE_DATA,
83 MDIO_C45_READ_INCREMENT,
84 MDIO_C45_READ
85 };
86
87 /* peri subctrl reg */
88 #define MDIO_SC_CLK_EN 0x338
89 #define MDIO_SC_CLK_DIS 0x33C
90 #define MDIO_SC_RESET_REQ 0xA38
91 #define MDIO_SC_RESET_DREQ 0xA3C
92 #define MDIO_SC_CLK_ST 0x531C
93 #define MDIO_SC_RESET_ST 0x5A1C
94
95 static void mdio_write_reg(u8 __iomem *base, u32 reg, u32 value)
96 {
97 writel_relaxed(value, base + reg);
98 }
99
100 #define MDIO_WRITE_REG(a, reg, value) \
101 mdio_write_reg((a)->vbase, (reg), (value))
102
103 static u32 mdio_read_reg(u8 __iomem *base, u32 reg)
104 {
105 return readl_relaxed(base + reg);
106 }
107
108 #define mdio_set_field(origin, mask, shift, val) \
109 do { \
110 (origin) &= (~((mask) << (shift))); \
111 (origin) |= (((val) & (mask)) << (shift)); \
112 } while (0)
113
114 #define mdio_get_field(origin, mask, shift) (((origin) >> (shift)) & (mask))
115
116 static void mdio_set_reg_field(u8 __iomem *base, u32 reg, u32 mask, u32 shift,
117 u32 val)
118 {
119 u32 origin = mdio_read_reg(base, reg);
120
121 mdio_set_field(origin, mask, shift, val);
122 mdio_write_reg(base, reg, origin);
123 }
124
125 #define MDIO_SET_REG_FIELD(dev, reg, mask, shift, val) \
126 mdio_set_reg_field((dev)->vbase, (reg), (mask), (shift), (val))
127
128 static u32 mdio_get_reg_field(u8 __iomem *base, u32 reg, u32 mask, u32 shift)
129 {
130 u32 origin;
131
132 origin = mdio_read_reg(base, reg);
133 return mdio_get_field(origin, mask, shift);
134 }
135
136 #define MDIO_GET_REG_FIELD(dev, reg, mask, shift) \
137 mdio_get_reg_field((dev)->vbase, (reg), (mask), (shift))
138
139 #define MDIO_GET_REG_BIT(dev, reg, bit) \
140 mdio_get_reg_field((dev)->vbase, (reg), 0x1ull, (bit))
141
142 #define MDIO_CHECK_SET_ST 1
143 #define MDIO_CHECK_CLR_ST 0
144
145 static int mdio_sc_cfg_reg_write(struct hns_mdio_device *mdio_dev,
146 u32 cfg_reg, u32 set_val,
147 u32 st_reg, u32 st_msk, u8 check_st)
148 {
149 u32 time_cnt;
150 u32 reg_value;
151 int ret;
152
153 regmap_write(mdio_dev->subctrl_vbase, cfg_reg, set_val);
154
155 for (time_cnt = MDIO_TIMEOUT; time_cnt; time_cnt--) {
156 ret = regmap_read(mdio_dev->subctrl_vbase, st_reg, ®_value);
157 if (ret) {
> 158 dev_err(mdio_dev->regmap->dev, "Fail to read from the register\n");
159 return ret;
160 }
161
162 reg_value &= st_msk;
163 if ((!!check_st) == (!!reg_value))
164 break;
165 }
166
167 if ((!!check_st) != (!!reg_value))
168 return -EBUSY;
169
170 return 0;
171 }
172
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 51785 bytes --]
^ permalink raw reply
* Re: WARNING in __mark_chain_precision (2)
From: Alexei Starovoitov @ 2019-09-03 0:40 UTC (permalink / raw)
To: Daniel Borkmann
Cc: bpf, David S. Miller, Jakub Kicinski, LKML, Network Development
In-Reply-To: <000000000000b7a14105913fcca8@google.com>
On Thu, Aug 29, 2019 at 4:28 AM syzbot
<syzbot+c8d66267fd2b5955287e@syzkaller.appspotmail.com> wrote:
>
> Hello,
>
> syzbot found the following crash on:
>
> HEAD commit: 47ee6e86 selftests/bpf: remove wrong nhoff in flow dissect..
> git tree: bpf-next
> console output: https://syzkaller.appspot.com/x/log.txt?x=16227fa6600000
> kernel config: https://syzkaller.appspot.com/x/.config?x=d4cf1ffb87d590d7
> dashboard link: https://syzkaller.appspot.com/bug?extid=c8d66267fd2b5955287e
> compiler: gcc (GCC) 9.0.0 20181231 (experimental)
> syz repro: https://syzkaller.appspot.com/x/repro.syz?x=10d26ebc600000
> C reproducer: https://syzkaller.appspot.com/x/repro.c?x=127805ca600000
>
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+c8d66267fd2b5955287e@syzkaller.appspotmail.com
>
> ------------[ cut here ]------------
> verifier backtracking bug
> WARNING: CPU: 0 PID: 8795 at kernel/bpf/verifier.c:1782
> __mark_chain_precision+0x197a/0x1ea0 kernel/bpf/verifier.c:1782
fyi
I found some time to debug it.
The following program will be incorrectly rejected
due to precision tracking bug.
(b7) r2 = 0
(bf) r3 = r10
(16) if w3 == 0xf6fffdfd goto pc+0
(7a) *(u64 *)(r3 -16) = -8
(79) r4 = *(u64 *)(r10 -16)
(b7) r6 = -1
(2d) if r4 > r6 goto pc+5
Still thinking how to fix it cleanly.
^ permalink raw reply
* Re: [PATCH] Fix a double free bug in rsi_91x_deinit
From: Guenter Roeck @ 2019-09-03 0:35 UTC (permalink / raw)
To: Greg KH
Cc: Kalle Valo, Hui Peng, security, Mathias Payer, David S. Miller,
linux-wireless, netdev, linux-kernel
In-Reply-To: <20190902200635.GA29465@kroah.com>
On 9/2/19 1:06 PM, Greg KH wrote:
> On Mon, Sep 02, 2019 at 12:32:37PM -0700, Guenter Roeck wrote:
>> On 9/2/19 11:47 AM, Greg KH wrote:
>>> On Sun, Sep 01, 2019 at 07:08:29AM -0700, Guenter Roeck wrote:
>>>> On 9/1/19 1:03 AM, Kalle Valo wrote:
>>>>> Guenter Roeck <linux@roeck-us.net> writes:
>>>>>
>>>>>> On Mon, Aug 19, 2019 at 06:02:29PM -0400, Hui Peng wrote:
>>>>>>> `dev` (struct rsi_91x_usbdev *) field of adapter
>>>>>>> (struct rsi_91x_usbdev *) is allocated and initialized in
>>>>>>> `rsi_init_usb_interface`. If any error is detected in information
>>>>>>> read from the device side, `rsi_init_usb_interface` will be
>>>>>>> freed. However, in the higher level error handling code in
>>>>>>> `rsi_probe`, if error is detected, `rsi_91x_deinit` is called
>>>>>>> again, in which `dev` will be freed again, resulting double free.
>>>>>>>
>>>>>>> This patch fixes the double free by removing the free operation on
>>>>>>> `dev` in `rsi_init_usb_interface`, because `rsi_91x_deinit` is also
>>>>>>> used in `rsi_disconnect`, in that code path, the `dev` field is not
>>>>>>> (and thus needs to be) freed.
>>>>>>>
>>>>>>> This bug was found in v4.19, but is also present in the latest version
>>>>>>> of kernel.
>>>>>>>
>>>>>>> Reported-by: Hui Peng <benquike@gmail.com>
>>>>>>> Reported-by: Mathias Payer <mathias.payer@nebelwelt.net>
>>>>>>> Signed-off-by: Hui Peng <benquike@gmail.com>
>>>>>>
>>>>>> FWIW:
>>>>>>
>>>>>> Reviewed-by: Guenter Roeck <linux@roeck-us.net>
>>>>>>
>>>>>> This patch is listed as fix for CVE-2019-15504, which has a CVSS 2.0 score
>>>>>> of 10.0 (high) and CVSS 3.0 score of 9.8 (critical).
>>>>>
>>>>> A double free in error path is considered as a critical CVE issue? I'm
>>>>> very curious, why is that?
>>>>>
>>>>
>>>> You'd have to ask the people assigning CVSS scores. However, if the memory
>>>> was reallocated, that reallocated memory (which is still in use) is freed.
>>>> Then all kinds of bad things can happen.
>>>
>>> Yes, but moving from "bad things _can_ happen" to "bad things happen" in
>>> an instance like this will be a tough task. It also requires physical
>>> access to the machine.
>>>
>>
>> Is this correct even with usbip enabled ?
>
> Who has usbip enabled anywhere? :)
>
It is enabled in Ubuntu, and it looks like it is enabled in Fedora as well.
It is disabled in Chrome OS. I didn't check other distributions.
> I don't know if usbip can trigger this type of thing, maybe someone
> needs to test that...
>
I seemed to recall someone mentioning that it is possible to use usbip
for remote attacks. This is why I mentioned it. I don't recall details,
though, and I don't know if it is really possible and to what extent.
Guenter
^ permalink raw reply
* Re: [PATCH 0/4 net-next] flow_offload: update mangle action representation
From: Pablo Neira Ayuso @ 2019-09-03 0:05 UTC (permalink / raw)
To: Jakub Kicinski; +Cc: netfilter-devel, davem, netdev, vishal, saeedm, jiri
In-Reply-To: <20190901134754.1bcd72d4@cakuba.netronome.com>
On Sun, Sep 01, 2019 at 01:47:54PM -0700, Jakub Kicinski wrote:
> On Sat, 31 Aug 2019 16:22:17 +0200, Pablo Neira Ayuso wrote:
[...]
> > > Please see the definitions of:
> > >
> > > struct nfp_fl_set_eth
> > > struct nfp_fl_set_ip4_addrs
> > > struct nfp_fl_set_ip4_ttl_tos
> > > struct nfp_fl_set_ipv6_tc_hl_fl
> > > struct nfp_fl_set_ipv6_addr
> > > struct nfp_fl_set_tport
> > >
> > > These are the programming primitives for header rewrites in the NFP.
> > > Since each of those contains more than just one field, we'll have to
> > > keep all the field coalescing logic in the driver, even if you coalesce
> > > while fields (i.e. IPv6 addresses).
> >
> > nfp has been updated in this patch series to deal with the new mangle
> > representation.
>
> It has been updated to handle the trivial coalescing.
>
> > > Perhaps it's not a serious blocker for the series, but it'd be nice if
> > > rewrite action grouping was handled in the core. Since you're already
> > > poking at that code..
> >
> > Rewrite action grouping is already handled from the core front-end in
> > this patch series.
>
> If you did what I'm asking the functions nfp_fl_check_mangle_start()
> and nfp_fl_check_mangle_end() would no longer exist. They were not
> really needed before you "common flow API" changes.
Thanks for the pointer. This driver-level coalescing routine you are
refering to is specific to optimize your layout. I agree the core
could be updated to do more coalescing, but this would need a way to
express what coalescing the driver would like to see in place. I would
wait to see more drivers that can benefit from that. I can only make
incremental steps, it's already hard to navigate over all this code.
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox