* [PATCH net-next] net/tls: Move protocol constants from cipher context to tls context
From: Vakul Garg @ 2019-02-14 7:11 UTC (permalink / raw)
To: netdev@vger.kernel.org
Cc: borisp@mellanox.com, aviadye@mellanox.com, davejwatson@fb.com,
davem@davemloft.net, doronrk@fb.com, Vakul Garg
Each tls context maintains two cipher contexts (one each for tx and rx
directions). For each tls session, the constants such as protocol
version, ciphersuite, iv size, associated data size etc are same for
both the directions and need to be stored only once per tls context.
Hence these are moved from 'struct cipher_context' to 'struct
tls_prot_info' and stored only once in 'struct tls_context'.
Signed-off-by: Vakul Garg <vakul.garg@nxp.com>
---
include/net/tls.h | 46 +++++++++-----
net/tls/tls_device.c | 24 ++++---
net/tls/tls_main.c | 17 ++++-
net/tls/tls_sw.c | 172 +++++++++++++++++++++++++++------------------------
4 files changed, 149 insertions(+), 110 deletions(-)
diff --git a/include/net/tls.h b/include/net/tls.h
index a93a8ed8f716..a8b37226a287 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -199,15 +199,8 @@ enum {
};
struct cipher_context {
- u16 prepend_size;
- u16 tag_size;
- u16 overhead_size;
- u16 iv_size;
char *iv;
- u16 rec_seq_size;
char *rec_seq;
- u16 aad_size;
- u16 tail_size;
};
union tls_crypto_context {
@@ -218,7 +211,21 @@ union tls_crypto_context {
};
};
+struct tls_prot_info {
+ u16 version;
+ u16 cipher_type;
+ u16 prepend_size;
+ u16 tag_size;
+ u16 overhead_size;
+ u16 iv_size;
+ u16 rec_seq_size;
+ u16 aad_size;
+ u16 tail_size;
+};
+
struct tls_context {
+ struct tls_prot_info prot_info;
+
union tls_crypto_context crypto_send;
union tls_crypto_context crypto_recv;
@@ -401,16 +408,26 @@ static inline bool tls_bigint_increment(unsigned char *seq, int len)
return (i == -1);
}
+static inline struct tls_context *tls_get_ctx(const struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ return icsk->icsk_ulp_data;
+}
+
static inline void tls_advance_record_sn(struct sock *sk,
struct cipher_context *ctx,
int version)
{
- if (tls_bigint_increment(ctx->rec_seq, ctx->rec_seq_size))
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
+
+ if (tls_bigint_increment(ctx->rec_seq, prot->rec_seq_size))
tls_err_abort(sk, EBADMSG);
if (version != TLS_1_3_VERSION) {
tls_bigint_increment(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
- ctx->iv_size);
+ prot->iv_size);
}
}
@@ -420,9 +437,10 @@ static inline void tls_fill_prepend(struct tls_context *ctx,
unsigned char record_type,
int version)
{
- size_t pkt_len, iv_size = ctx->tx.iv_size;
+ struct tls_prot_info *prot = &ctx->prot_info;
+ size_t pkt_len, iv_size = prot->iv_size;
- pkt_len = plaintext_len + ctx->tx.tag_size;
+ pkt_len = plaintext_len + prot->tag_size;
if (version != TLS_1_3_VERSION) {
pkt_len += iv_size;
@@ -475,12 +493,6 @@ static inline void xor_iv_with_seq(int version, char *iv, char *seq)
}
}
-static inline struct tls_context *tls_get_ctx(const struct sock *sk)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
-
- return icsk->icsk_ulp_data;
-}
static inline struct tls_sw_context_rx *tls_sw_ctx_rx(
const struct tls_context *tls_ctx)
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 7ee9008b2187..a5c17c47d08a 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -247,6 +247,7 @@ static int tls_push_record(struct sock *sk,
int flags,
unsigned char record_type)
{
+ struct tls_prot_info *prot = &ctx->prot_info;
struct tcp_sock *tp = tcp_sk(sk);
struct page_frag dummy_tag_frag;
skb_frag_t *frag;
@@ -256,7 +257,7 @@ static int tls_push_record(struct sock *sk,
frag = &record->frags[0];
tls_fill_prepend(ctx,
skb_frag_address(frag),
- record->len - ctx->tx.prepend_size,
+ record->len - prot->prepend_size,
record_type,
ctx->crypto_send.info.version);
@@ -264,7 +265,7 @@ static int tls_push_record(struct sock *sk,
dummy_tag_frag.page = skb_frag_page(frag);
dummy_tag_frag.offset = 0;
- tls_append_frag(record, &dummy_tag_frag, ctx->tx.tag_size);
+ tls_append_frag(record, &dummy_tag_frag, prot->tag_size);
record->end_seq = tp->write_seq + record->len;
spin_lock_irq(&offload_ctx->lock);
list_add_tail(&record->list, &offload_ctx->records_list);
@@ -347,6 +348,7 @@ static int tls_push_data(struct sock *sk,
unsigned char record_type)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);
int tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST;
int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE);
@@ -376,10 +378,10 @@ static int tls_push_data(struct sock *sk,
* we need to leave room for an authentication tag.
*/
max_open_record_len = TLS_MAX_PAYLOAD_SIZE +
- tls_ctx->tx.prepend_size;
+ prot->prepend_size;
do {
rc = tls_do_allocation(sk, ctx, pfrag,
- tls_ctx->tx.prepend_size);
+ prot->prepend_size);
if (rc) {
rc = sk_stream_wait_memory(sk, &timeo);
if (!rc)
@@ -397,7 +399,7 @@ static int tls_push_data(struct sock *sk,
size = orig_size;
destroy_record(record);
ctx->open_record = NULL;
- } else if (record->len > tls_ctx->tx.prepend_size) {
+ } else if (record->len > prot->prepend_size) {
goto last_record;
}
@@ -658,6 +660,8 @@ int tls_device_decrypted(struct sock *sk, struct sk_buff *skb)
int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
{
u16 nonce_size, tag_size, iv_size, rec_seq_size;
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
struct tls_record_info *start_marker_record;
struct tls_offload_context_tx *offload_ctx;
struct tls_crypto_info *crypto_info;
@@ -703,10 +707,10 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
goto free_offload_ctx;
}
- ctx->tx.prepend_size = TLS_HEADER_SIZE + nonce_size;
- ctx->tx.tag_size = tag_size;
- ctx->tx.overhead_size = ctx->tx.prepend_size + ctx->tx.tag_size;
- ctx->tx.iv_size = iv_size;
+ prot->prepend_size = TLS_HEADER_SIZE + nonce_size;
+ prot->tag_size = tag_size;
+ prot->overhead_size = prot->prepend_size + prot->tag_size;
+ prot->iv_size = iv_size;
ctx->tx.iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
GFP_KERNEL);
if (!ctx->tx.iv) {
@@ -716,7 +720,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
memcpy(ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size);
- ctx->tx.rec_seq_size = rec_seq_size;
+ prot->rec_seq_size = rec_seq_size;
ctx->tx.rec_seq = kmemdup(rec_seq, rec_seq_size, GFP_KERNEL);
if (!ctx->tx.rec_seq) {
rc = -ENOMEM;
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index d1c2fd9a3f63..caff15b2f9b2 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -435,6 +435,7 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval,
unsigned int optlen, int tx)
{
struct tls_crypto_info *crypto_info;
+ struct tls_crypto_info *alt_crypto_info;
struct tls_context *ctx = tls_get_ctx(sk);
size_t optsize;
int rc = 0;
@@ -445,10 +446,13 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval,
goto out;
}
- if (tx)
+ if (tx) {
crypto_info = &ctx->crypto_send.info;
- else
+ alt_crypto_info = &ctx->crypto_recv.info;
+ } else {
crypto_info = &ctx->crypto_recv.info;
+ alt_crypto_info = &ctx->crypto_send.info;
+ }
/* Currently we don't support set crypto info more than one time */
if (TLS_CRYPTO_INFO_READY(crypto_info)) {
@@ -469,6 +473,15 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval,
goto err_crypto_info;
}
+ /* Ensure that TLS version and ciphers are same in both directions */
+ if (TLS_CRYPTO_INFO_READY(alt_crypto_info)) {
+ if (alt_crypto_info->version != crypto_info->version ||
+ alt_crypto_info->cipher_type != crypto_info->cipher_type) {
+ rc = -EINVAL;
+ goto err_crypto_info;
+ }
+ }
+
switch (crypto_info->cipher_type) {
case TLS_CIPHER_AES_GCM_128:
case TLS_CIPHER_AES_GCM_256: {
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index ae4784734547..71be8acfbc9b 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -127,7 +127,7 @@ static int padding_length(struct tls_sw_context_rx *ctx,
int sub = 0;
/* Determine zero-padding length */
- if (tls_ctx->crypto_recv.info.version == TLS_1_3_VERSION) {
+ if (tls_ctx->prot_info.version == TLS_1_3_VERSION) {
char content_type = 0;
int err;
int back = 17;
@@ -155,6 +155,7 @@ static void tls_decrypt_done(struct crypto_async_request *req, int err)
struct scatterlist *sgin = aead_req->src;
struct tls_sw_context_rx *ctx;
struct tls_context *tls_ctx;
+ struct tls_prot_info *prot;
struct scatterlist *sg;
struct sk_buff *skb;
unsigned int pages;
@@ -163,6 +164,7 @@ static void tls_decrypt_done(struct crypto_async_request *req, int err)
skb = (struct sk_buff *)req->data;
tls_ctx = tls_get_ctx(skb->sk);
ctx = tls_sw_ctx_rx(tls_ctx);
+ prot = &tls_ctx->prot_info;
/* Propagate if there was an err */
if (err) {
@@ -171,8 +173,8 @@ static void tls_decrypt_done(struct crypto_async_request *req, int err)
} else {
struct strp_msg *rxm = strp_msg(skb);
rxm->full_len -= padding_length(ctx, tls_ctx, skb);
- rxm->offset += tls_ctx->rx.prepend_size;
- rxm->full_len -= tls_ctx->rx.overhead_size;
+ rxm->offset += prot->prepend_size;
+ rxm->full_len -= prot->overhead_size;
}
/* After using skb->sk to propagate sk through crypto async callback
@@ -209,13 +211,14 @@ static int tls_do_decryption(struct sock *sk,
bool async)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
int ret;
aead_request_set_tfm(aead_req, ctx->aead_recv);
- aead_request_set_ad(aead_req, tls_ctx->rx.aad_size);
+ aead_request_set_ad(aead_req, prot->aad_size);
aead_request_set_crypt(aead_req, sgin, sgout,
- data_len + tls_ctx->rx.tag_size,
+ data_len + prot->tag_size,
(u8 *)iv_recv);
if (async) {
@@ -253,12 +256,13 @@ static int tls_do_decryption(struct sock *sk,
static void tls_trim_both_msgs(struct sock *sk, int target_size)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
struct tls_rec *rec = ctx->open_rec;
sk_msg_trim(sk, &rec->msg_plaintext, target_size);
if (target_size > 0)
- target_size += tls_ctx->tx.overhead_size;
+ target_size += prot->overhead_size;
sk_msg_trim(sk, &rec->msg_encrypted, target_size);
}
@@ -275,6 +279,7 @@ static int tls_alloc_encrypted_msg(struct sock *sk, int len)
static int tls_clone_plaintext_msg(struct sock *sk, int required)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
struct tls_rec *rec = ctx->open_rec;
struct sk_msg *msg_pl = &rec->msg_plaintext;
@@ -290,7 +295,7 @@ static int tls_clone_plaintext_msg(struct sock *sk, int required)
/* Skip initial bytes in msg_en's data to be able to use
* same offset of both plain and encrypted data.
*/
- skip = tls_ctx->tx.prepend_size + msg_pl->sg.size;
+ skip = prot->prepend_size + msg_pl->sg.size;
return sk_msg_clone(sk, msg_pl, msg_en, skip, len);
}
@@ -298,6 +303,7 @@ static int tls_clone_plaintext_msg(struct sock *sk, int required)
static struct tls_rec *tls_get_rec(struct sock *sk)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
struct sk_msg *msg_pl, *msg_en;
struct tls_rec *rec;
@@ -316,13 +322,11 @@ static struct tls_rec *tls_get_rec(struct sock *sk)
sk_msg_init(msg_en);
sg_init_table(rec->sg_aead_in, 2);
- sg_set_buf(&rec->sg_aead_in[0], rec->aad_space,
- tls_ctx->tx.aad_size);
+ sg_set_buf(&rec->sg_aead_in[0], rec->aad_space, prot->aad_size);
sg_unmark_end(&rec->sg_aead_in[1]);
sg_init_table(rec->sg_aead_out, 2);
- sg_set_buf(&rec->sg_aead_out[0], rec->aad_space,
- tls_ctx->tx.aad_size);
+ sg_set_buf(&rec->sg_aead_out[0], rec->aad_space, prot->aad_size);
sg_unmark_end(&rec->sg_aead_out[1]);
return rec;
@@ -411,6 +415,7 @@ static void tls_encrypt_done(struct crypto_async_request *req, int err)
struct aead_request *aead_req = (struct aead_request *)req;
struct sock *sk = req->data;
struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
struct scatterlist *sge;
struct sk_msg *msg_en;
@@ -422,8 +427,8 @@ static void tls_encrypt_done(struct crypto_async_request *req, int err)
msg_en = &rec->msg_encrypted;
sge = sk_msg_elem(msg_en, msg_en->sg.curr);
- sge->offset -= tls_ctx->tx.prepend_size;
- sge->length += tls_ctx->tx.prepend_size;
+ sge->offset -= prot->prepend_size;
+ sge->length += prot->prepend_size;
/* Check if error is previously set on socket */
if (err || sk->sk_err) {
@@ -470,22 +475,23 @@ static int tls_do_encryption(struct sock *sk,
struct aead_request *aead_req,
size_t data_len, u32 start)
{
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
struct tls_rec *rec = ctx->open_rec;
struct sk_msg *msg_en = &rec->msg_encrypted;
struct scatterlist *sge = sk_msg_elem(msg_en, start);
int rc;
memcpy(rec->iv_data, tls_ctx->tx.iv, sizeof(rec->iv_data));
- xor_iv_with_seq(tls_ctx->crypto_send.info.version, rec->iv_data,
+ xor_iv_with_seq(prot->version, rec->iv_data,
tls_ctx->tx.rec_seq);
- sge->offset += tls_ctx->tx.prepend_size;
- sge->length -= tls_ctx->tx.prepend_size;
+ sge->offset += prot->prepend_size;
+ sge->length -= prot->prepend_size;
msg_en->sg.curr = start;
aead_request_set_tfm(aead_req, ctx->aead_send);
- aead_request_set_ad(aead_req, tls_ctx->tx.aad_size);
+ aead_request_set_ad(aead_req, prot->aad_size);
aead_request_set_crypt(aead_req, rec->sg_aead_in,
rec->sg_aead_out,
data_len, rec->iv_data);
@@ -500,8 +506,8 @@ static int tls_do_encryption(struct sock *sk,
rc = crypto_aead_encrypt(aead_req);
if (!rc || rc != -EINPROGRESS) {
atomic_dec(&ctx->encrypt_pending);
- sge->offset -= tls_ctx->tx.prepend_size;
- sge->length += tls_ctx->tx.prepend_size;
+ sge->offset -= prot->prepend_size;
+ sge->length += prot->prepend_size;
}
if (!rc) {
@@ -513,8 +519,7 @@ static int tls_do_encryption(struct sock *sk,
/* Unhook the record from context if encryption is not failure */
ctx->open_rec = NULL;
- tls_advance_record_sn(sk, &tls_ctx->tx,
- tls_ctx->crypto_send.info.version);
+ tls_advance_record_sn(sk, &tls_ctx->tx, prot->version);
return rc;
}
@@ -640,6 +645,7 @@ static int tls_push_record(struct sock *sk, int flags,
unsigned char record_type)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
struct tls_rec *rec = ctx->open_rec, *tmp = NULL;
u32 i, split_point, uninitialized_var(orig_end);
@@ -658,12 +664,12 @@ static int tls_push_record(struct sock *sk, int flags,
split = split_point && split_point < msg_pl->sg.size;
if (split) {
rc = tls_split_open_record(sk, rec, &tmp, msg_pl, msg_en,
- split_point, tls_ctx->tx.overhead_size,
+ split_point, prot->overhead_size,
&orig_end);
if (rc < 0)
return rc;
sk_msg_trim(sk, msg_en, msg_pl->sg.size +
- tls_ctx->tx.overhead_size);
+ prot->overhead_size);
}
rec->tx_flags = flags;
@@ -673,7 +679,7 @@ static int tls_push_record(struct sock *sk, int flags,
sk_msg_iter_var_prev(i);
rec->content_type = record_type;
- if (tls_ctx->crypto_send.info.version == TLS_1_3_VERSION) {
+ if (prot->version == TLS_1_3_VERSION) {
/* Add content type to end of message. No padding added */
sg_set_buf(&rec->sg_content_type, &rec->content_type, 1);
sg_mark_end(&rec->sg_content_type);
@@ -694,22 +700,20 @@ static int tls_push_record(struct sock *sk, int flags,
i = msg_en->sg.start;
sg_chain(rec->sg_aead_out, 2, &msg_en->sg.data[i]);
- tls_make_aad(rec->aad_space, msg_pl->sg.size + tls_ctx->tx.tail_size,
- tls_ctx->tx.rec_seq, tls_ctx->tx.rec_seq_size,
- record_type,
- tls_ctx->crypto_send.info.version);
+ tls_make_aad(rec->aad_space, msg_pl->sg.size + prot->tail_size,
+ tls_ctx->tx.rec_seq, prot->rec_seq_size,
+ record_type, prot->version);
tls_fill_prepend(tls_ctx,
page_address(sg_page(&msg_en->sg.data[i])) +
msg_en->sg.data[i].offset,
- msg_pl->sg.size + tls_ctx->tx.tail_size,
- record_type,
- tls_ctx->crypto_send.info.version);
+ msg_pl->sg.size + prot->tail_size,
+ record_type, prot->version);
tls_ctx->pending_open_record_frags = false;
rc = tls_do_encryption(sk, tls_ctx, ctx, req,
- msg_pl->sg.size + tls_ctx->tx.tail_size, i);
+ msg_pl->sg.size + prot->tail_size, i);
if (rc < 0) {
if (rc != -EINPROGRESS) {
tls_err_abort(sk, EBADMSG);
@@ -723,8 +727,7 @@ static int tls_push_record(struct sock *sk, int flags,
} else if (split) {
msg_pl = &tmp->msg_plaintext;
msg_en = &tmp->msg_encrypted;
- sk_msg_trim(sk, msg_en, msg_pl->sg.size +
- tls_ctx->tx.overhead_size);
+ sk_msg_trim(sk, msg_en, msg_pl->sg.size + prot->overhead_size);
tls_ctx->pending_open_record_frags = true;
ctx->open_rec = tmp;
}
@@ -859,6 +862,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
bool async_capable = ctx->async_capable;
unsigned char record_type = TLS_RECORD_TYPE_DATA;
@@ -925,7 +929,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
}
required_size = msg_pl->sg.size + try_to_copy +
- tls_ctx->tx.overhead_size;
+ prot->overhead_size;
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
@@ -994,8 +998,8 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
*/
try_to_copy -= required_size - msg_pl->sg.size;
full_record = true;
- sk_msg_trim(sk, msg_en, msg_pl->sg.size +
- tls_ctx->tx.overhead_size);
+ sk_msg_trim(sk, msg_en,
+ msg_pl->sg.size + prot->overhead_size);
}
if (try_to_copy) {
@@ -1081,6 +1085,7 @@ static int tls_sw_do_sendpage(struct sock *sk, struct page *page,
long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
unsigned char record_type = TLS_RECORD_TYPE_DATA;
struct sk_msg *msg_pl;
struct tls_rec *rec;
@@ -1130,8 +1135,7 @@ static int tls_sw_do_sendpage(struct sock *sk, struct page *page,
full_record = true;
}
- required_size = msg_pl->sg.size + copy +
- tls_ctx->tx.overhead_size;
+ required_size = msg_pl->sg.size + copy + prot->overhead_size;
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
@@ -1330,6 +1334,7 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
struct strp_msg *rxm = strp_msg(skb);
int n_sgin, n_sgout, nsg, mem_size, aead_size, err, pages = 0;
struct aead_request *aead_req;
@@ -1337,16 +1342,16 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
u8 *aad, *iv, *mem = NULL;
struct scatterlist *sgin = NULL;
struct scatterlist *sgout = NULL;
- const int data_len = rxm->full_len - tls_ctx->rx.overhead_size +
- tls_ctx->rx.tail_size;
+ const int data_len = rxm->full_len - prot->overhead_size +
+ prot->tail_size;
if (*zc && (out_iov || out_sg)) {
if (out_iov)
n_sgout = iov_iter_npages(out_iov, INT_MAX) + 1;
else
n_sgout = sg_nents(out_sg);
- n_sgin = skb_nsg(skb, rxm->offset + tls_ctx->rx.prepend_size,
- rxm->full_len - tls_ctx->rx.prepend_size);
+ n_sgin = skb_nsg(skb, rxm->offset + prot->prepend_size,
+ rxm->full_len - prot->prepend_size);
} else {
n_sgout = 0;
*zc = false;
@@ -1363,7 +1368,7 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
aead_size = sizeof(*aead_req) + crypto_aead_reqsize(ctx->aead_recv);
mem_size = aead_size + (nsg * sizeof(struct scatterlist));
- mem_size = mem_size + tls_ctx->rx.aad_size;
+ mem_size = mem_size + prot->aad_size;
mem_size = mem_size + crypto_aead_ivsize(ctx->aead_recv);
/* Allocate a single block of memory which contains
@@ -1379,37 +1384,35 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
sgin = (struct scatterlist *)(mem + aead_size);
sgout = sgin + n_sgin;
aad = (u8 *)(sgout + n_sgout);
- iv = aad + tls_ctx->rx.aad_size;
+ iv = aad + prot->aad_size;
/* Prepare IV */
err = skb_copy_bits(skb, rxm->offset + TLS_HEADER_SIZE,
iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
- tls_ctx->rx.iv_size);
+ prot->iv_size);
if (err < 0) {
kfree(mem);
return err;
}
- if (tls_ctx->crypto_recv.info.version == TLS_1_3_VERSION)
+ if (prot->version == TLS_1_3_VERSION)
memcpy(iv, tls_ctx->rx.iv, crypto_aead_ivsize(ctx->aead_recv));
else
memcpy(iv, tls_ctx->rx.iv, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
- xor_iv_with_seq(tls_ctx->crypto_recv.info.version, iv,
- tls_ctx->rx.rec_seq);
+ xor_iv_with_seq(prot->version, iv, tls_ctx->rx.rec_seq);
/* Prepare AAD */
- tls_make_aad(aad, rxm->full_len - tls_ctx->rx.overhead_size +
- tls_ctx->rx.tail_size,
- tls_ctx->rx.rec_seq, tls_ctx->rx.rec_seq_size,
- ctx->control,
- tls_ctx->crypto_recv.info.version);
+ tls_make_aad(aad, rxm->full_len - prot->overhead_size +
+ prot->tail_size,
+ tls_ctx->rx.rec_seq, prot->rec_seq_size,
+ ctx->control, prot->version);
/* Prepare sgin */
sg_init_table(sgin, n_sgin);
- sg_set_buf(&sgin[0], aad, tls_ctx->rx.aad_size);
+ sg_set_buf(&sgin[0], aad, prot->aad_size);
err = skb_to_sgvec(skb, &sgin[1],
- rxm->offset + tls_ctx->rx.prepend_size,
- rxm->full_len - tls_ctx->rx.prepend_size);
+ rxm->offset + prot->prepend_size,
+ rxm->full_len - prot->prepend_size);
if (err < 0) {
kfree(mem);
return err;
@@ -1418,7 +1421,7 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
if (n_sgout) {
if (out_iov) {
sg_init_table(sgout, n_sgout);
- sg_set_buf(&sgout[0], aad, tls_ctx->rx.aad_size);
+ sg_set_buf(&sgout[0], aad, prot->aad_size);
*chunk = 0;
err = tls_setup_from_iter(sk, out_iov, data_len,
@@ -1459,7 +1462,8 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
- int version = tls_ctx->crypto_recv.info.version;
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
+ int version = prot->version;
struct strp_msg *rxm = strp_msg(skb);
int err = 0;
@@ -1480,8 +1484,8 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
rxm->full_len -= padding_length(ctx, tls_ctx, skb);
- rxm->offset += tls_ctx->rx.prepend_size;
- rxm->full_len -= tls_ctx->rx.overhead_size;
+ rxm->offset += prot->prepend_size;
+ rxm->full_len -= prot->overhead_size;
tls_advance_record_sn(sk, &tls_ctx->rx, version);
ctx->decrypted = true;
ctx->saved_data_ready(sk);
@@ -1605,6 +1609,7 @@ int tls_sw_recvmsg(struct sock *sk,
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
struct sk_psock *psock;
unsigned char control = 0;
ssize_t decrypted = 0;
@@ -1667,11 +1672,11 @@ int tls_sw_recvmsg(struct sock *sk,
rxm = strp_msg(skb);
- to_decrypt = rxm->full_len - tls_ctx->rx.overhead_size;
+ to_decrypt = rxm->full_len - prot->overhead_size;
if (to_decrypt <= len && !is_kvec && !is_peek &&
ctx->control == TLS_RECORD_TYPE_DATA &&
- tls_ctx->crypto_recv.info.version != TLS_1_3_VERSION)
+ prot->version != TLS_1_3_VERSION)
zc = true;
/* Do not use async mode if record is non-data */
@@ -1875,6 +1880,7 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
{
struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
char header[TLS_HEADER_SIZE + MAX_IV_SIZE];
struct strp_msg *rxm = strp_msg(skb);
size_t cipher_overhead;
@@ -1882,17 +1888,17 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
int ret;
/* Verify that we have a full TLS header, or wait for more data */
- if (rxm->offset + tls_ctx->rx.prepend_size > skb->len)
+ if (rxm->offset + prot->prepend_size > skb->len)
return 0;
/* Sanity-check size of on-stack buffer. */
- if (WARN_ON(tls_ctx->rx.prepend_size > sizeof(header))) {
+ if (WARN_ON(prot->prepend_size > sizeof(header))) {
ret = -EINVAL;
goto read_failure;
}
/* Linearize header to local buffer */
- ret = skb_copy_bits(skb, rxm->offset, header, tls_ctx->rx.prepend_size);
+ ret = skb_copy_bits(skb, rxm->offset, header, prot->prepend_size);
if (ret < 0)
goto read_failure;
@@ -1901,12 +1907,12 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
data_len = ((header[4] & 0xFF) | (header[3] << 8));
- cipher_overhead = tls_ctx->rx.tag_size;
- if (tls_ctx->crypto_recv.info.version != TLS_1_3_VERSION)
- cipher_overhead += tls_ctx->rx.iv_size;
+ cipher_overhead = prot->tag_size;
+ if (prot->version != TLS_1_3_VERSION)
+ cipher_overhead += prot->iv_size;
if (data_len > TLS_MAX_PAYLOAD_SIZE + cipher_overhead +
- tls_ctx->rx.tail_size) {
+ prot->tail_size) {
ret = -EMSGSIZE;
goto read_failure;
}
@@ -2066,6 +2072,8 @@ static void tx_work_handler(struct work_struct *work)
int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
struct tls_crypto_info *crypto_info;
struct tls12_crypto_info_aes_gcm_128 *gcm_128_info;
struct tls12_crypto_info_aes_gcm_256 *gcm_256_info;
@@ -2171,18 +2179,20 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
if (crypto_info->version == TLS_1_3_VERSION) {
nonce_size = 0;
- cctx->aad_size = TLS_HEADER_SIZE;
- cctx->tail_size = 1;
+ prot->aad_size = TLS_HEADER_SIZE;
+ prot->tail_size = 1;
} else {
- cctx->aad_size = TLS_AAD_SPACE_SIZE;
- cctx->tail_size = 0;
+ prot->aad_size = TLS_AAD_SPACE_SIZE;
+ prot->tail_size = 0;
}
- cctx->prepend_size = TLS_HEADER_SIZE + nonce_size;
- cctx->tag_size = tag_size;
- cctx->overhead_size = cctx->prepend_size + cctx->tag_size +
- cctx->tail_size;
- cctx->iv_size = iv_size;
+ prot->version = crypto_info->version;
+ prot->cipher_type = crypto_info->cipher_type;
+ prot->prepend_size = TLS_HEADER_SIZE + nonce_size;
+ prot->tag_size = tag_size;
+ prot->overhead_size = prot->prepend_size +
+ prot->tag_size + prot->tail_size;
+ prot->iv_size = iv_size;
cctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
GFP_KERNEL);
if (!cctx->iv) {
@@ -2192,7 +2202,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
/* Note: 128 & 256 bit salt are the same size */
memcpy(cctx->iv, salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
memcpy(cctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size);
- cctx->rec_seq_size = rec_seq_size;
+ prot->rec_seq_size = rec_seq_size;
cctx->rec_seq = kmemdup(rec_seq, rec_seq_size, GFP_KERNEL);
if (!cctx->rec_seq) {
rc = -ENOMEM;
@@ -2215,7 +2225,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
if (rc)
goto free_aead;
- rc = crypto_aead_setauthsize(*aead, cctx->tag_size);
+ rc = crypto_aead_setauthsize(*aead, prot->tag_size);
if (rc)
goto free_aead;
--
2.13.6
^ permalink raw reply related
* Re: stmmac / meson8b-dwmac
From: Simon Huelck @ 2019-02-14 7:21 UTC (permalink / raw)
To: Jose Abreu, Martin Blumenstingl
Cc: Emiliano Ingrassia, Gpeppe.cavallaro, alexandre.torgue,
linux-amlogic, netdev
In-Reply-To: <330a523a-c1d6-4a99-3287-459096af0330@synopsys.com>
Hi,
i used iperf2 on my odroid c2 , since this can do duplex tests ! iperf2 -c 10.10.11.1
-i1 -d
Do you really need the exact numbers ?
NON-duplex:
4.14.29 reached 930 MBits without the -d ( duplex option ), 5.0rc5 only
reaches 600MBits there
DUPLEX:
With the -d option things get worse on 4.14.29 , 600MBits in both
directions. on 5.0rc5 this is the 500 to 600 in one direction, the other
only 150MBits.
iperf3 is not duplex capable, i didnt test the reverse option, but
duplex tests show that one direction drops in performance when the other
is busy.
Do you really need the exact output ? I need to reflash my box then
..... if needed i would do so.
regards,
Simon
Am 11.02.2019 um 14:44 schrieb Jose Abreu:
> Hello,
>
> On 2/9/2019 1:09 AM, Martin Blumenstingl wrote:
>> (it's interesting that the sending direction has 445 retries)
> I saw this before and I think it was related with COE. Can you
> please disable all offloading and try again?
>
> Anyway, maybe Simon should bissect ? I think since 4.14 there are
> not that many commits in stmmac / meson8b-dwmac that can cause
> this behavior.
>
> Thanks,
> Jose Miguel Abreu
^ permalink raw reply
* Re: [PATCH net-next 1/2] net: phy: marvell: add new m88e1510 LED configuration
From: shenjian (K) @ 2019-02-14 7:24 UTC (permalink / raw)
To: Florian Fainelli, andrew, hkallweit1, davem
Cc: netdev, linux-kernel, linuxarm
In-Reply-To: <da37103d-ad41-7e27-bd6e-3a402bbaf096@gmail.com>
在 2019/2/14 12:06, Florian Fainelli 写道:
>
>
> On 2/13/2019 8:31 PM, Jian Shen wrote:
>> The default m88e1510 LED configuration is 0x1177, used LED[0]
>> for 1000M link, LED[1] for 100M link, and LED[2] for active.
>> But for our boards, we want to use 0x1040, which use LED[0] for
>> link, and LED[1] for active.
>>
>> This patch adds a new m88e1510 LED configuration for it.
>
> There appears to be a precedent with the DNS323 flag that was defined
> for the same purpose, but this unfortunately does not scale we cannot
> have every new platform come up with its own LED configuration without
> having a more structured approach to representing the LED configuration.
>
> Maybe we can encode the desired LED behavior in a more generic way and
> utilize the 32 flag bits available to denote a selection, e.g.:
>
> MARVELL_PHY_FLAG_LED0_100M BIT(3)
> MARVELL_PHY_FLAG_LED0_1000M BIT(4)
>
> etc.
>
> or maybe even better would be to expose the LEDs using the standard LEDs
> class subsystem and allow configuring different triggers. We have some
> amount of support for PHY LEDs already in tree, but AFAIR what we do not
> have support for is a "hardware blinking" trigger which those LEDs are.
>
Thanks Florian.
I agree that we should use a generic way.
Maybe I understood incorrectly, for "utilize the 32 flag bits available to denote a selection",
do you mean to define a serial flags for each usage of each led ?
For each led can be configured to several usages, include link/10M link/100M link/1000M link/active/...,
it needs a lot bits for these combination, e.g:
#define MARVELL_PHY_FLAG_LED0_LINK BIT(2)
#define MARVELL_PHY_FLAG_LED0_LINK_100M BIT(3)
#define MARVELL_PHY_FLAG_LED0_LINK_1000M BIT(4)
#define MARVELL_PHY_FLAG_LED0_ACTIVE BIT(5)
#define MARVELL_PHY_FLAG_LED1_LINK BIT(6)
#define MARVELL_PHY_FLAG_LED1_LINK_100M BIT(7)
#define MARVELL_PHY_FLAG_LED1_LINK_1000M BIT(8)
#define MARVELL_PHY_FLAG_LED1_ACTIVE BIT(9)
...
Jian Shen
>>
>> Signed-off-by: Jian Shen <shenjian15@huawei.com>
>> ---
>> drivers/net/phy/marvell.c | 22 +++++++++++++++++++++-
>> include/linux/marvell_phy.h | 1 +
>> 2 files changed, 22 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
>> index 3ccba37..c195286 100644
>> --- a/drivers/net/phy/marvell.c
>> +++ b/drivers/net/phy/marvell.c
>> @@ -128,6 +128,10 @@
>> #define MII_PHY_LED_CTRL 16
>> #define MII_88E1121_PHY_LED_DEF 0x0030
>> #define MII_88E1510_PHY_LED_DEF 0x1177
>> +#define MII_88E1510_PHY_HNS3_LED_DEF 0x1040
>> +
>> +#define MII_88E1510_PHY_LED_POLARITY_CTRL 0x11
>> +#define MII_88E1510_PHY_HNS3_LED_POLARITY 0x4415
>>
>> #define MII_M1011_PHY_STATUS 0x11
>> #define MII_M1011_PHY_STATUS_1000 0x8000
>> @@ -619,12 +623,19 @@ static void marvell_config_led(struct phy_device *phydev)
>> def_config = MII_88E1121_PHY_LED_DEF;
>> break;
>> /* Default PHY LED config:
>> + * For hns3:
>> + * LED[0] .. Link
>> + * LED[1] .. Activity
>> + * For others:
>> * LED[0] .. 1000Mbps Link
>> * LED[1] .. 100Mbps Link
>> * LED[2] .. Blink, Activity
>> */
>> case MARVELL_PHY_FAMILY_ID(MARVELL_PHY_ID_88E1510):
>> - def_config = MII_88E1510_PHY_LED_DEF;
>> + if (phydev->dev_flags & MARVELL_PHY_M1510_HNS3_LEDS)
>> + def_config = MII_88E1510_PHY_HNS3_LED_DEF;
>> + else
>> + def_config = MII_88E1510_PHY_LED_DEF;
>> break;
>> default:
>> return;
>> @@ -634,6 +645,15 @@ static void marvell_config_led(struct phy_device *phydev)
>> def_config);
>> if (err < 0)
>> phydev_warn(phydev, "Fail to config marvell phy LED.\n");
>> +
>> + if (phydev->dev_flags & MARVELL_PHY_M1510_HNS3_LEDS) {
>> + err = phy_write_paged(phydev, MII_MARVELL_LED_PAGE,
>> + MII_88E1510_PHY_LED_POLARITY_CTRL,
>> + MII_88E1510_PHY_HNS3_LED_POLARITY);
>> + if (err < 0)
>> + phydev_warn(phydev,
>> + "Fail to config marvell phy LED polarity.\n");
>> + }
>> }
>>
>> static int marvell_config_init(struct phy_device *phydev)
>> diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h
>> index 1eb6f24..99e0bbb 100644
>> --- a/include/linux/marvell_phy.h
>> +++ b/include/linux/marvell_phy.h
>> @@ -32,5 +32,6 @@
>> /* struct phy_device dev_flags definitions */
>> #define MARVELL_PHY_M1145_FLAGS_RESISTANCE 0x00000001
>> #define MARVELL_PHY_M1118_DNS323_LEDS 0x00000002
>> +#define MARVELL_PHY_M1510_HNS3_LEDS 0x00000004
>>
>> #endif /* _MARVELL_PHY_H */
>>
>
^ permalink raw reply
* Re: [PATCH net-next 2/2] net: hns3: add fixup handle for hns3 driver
From: shenjian (K) @ 2019-02-14 7:25 UTC (permalink / raw)
To: Florian Fainelli, andrew, hkallweit1, davem
Cc: netdev, linux-kernel, linuxarm
In-Reply-To: <a9086c2f-9d15-b3d0-9809-44385d6281ff@gmail.com>
在 2019/2/14 12:08, Florian Fainelli 写道:
>
>
> On 2/13/2019 8:31 PM, Jian Shen wrote:
>> The default led configuration of marvell 88E1510 is not fit
>> for hns3 driver, this patch fixes it.
>>
>> Signed-off-by: Jian Shen <shenjian15@huawei.com>
>> ---
>> .../net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c | 19 +++++++++++++++++++
>> 1 file changed, 19 insertions(+)
>>
>> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c
>> index 84f2878..4c8346e 100644
>> --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c
>> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c
>> @@ -2,6 +2,7 @@
>> // Copyright (c) 2016-2017 Hisilicon Limited.
>>
>> #include <linux/etherdevice.h>
>> +#include <linux/marvell_phy.h>
>> #include <linux/kernel.h>
>>
>> #include "hclge_cmd.h"
>> @@ -125,6 +126,13 @@ static int hclge_mdio_read(struct mii_bus *bus, int phyid, int regnum)
>> return le16_to_cpu(mdio_cmd->data_rd);
>> }
>>
>> +static int hclge_phy_marvell_fixup(struct phy_device *phydev)
>> +{
>> + phydev->dev_flags |= MARVELL_PHY_M1510_HNS3_LEDS;
>> +
>> + return 0;
>> +}
>> +
>> int hclge_mac_mdio_config(struct hclge_dev *hdev)
>> {
>> struct hclge_mac *mac = &hdev->hw.mac;
>> @@ -168,6 +176,15 @@ int hclge_mac_mdio_config(struct hclge_dev *hdev)
>> mac->phydev = phydev;
>> mac->mdio_bus = mdio_bus;
>>
>> + /* register the PHY board fixup (for Marvell 88E1510) */
>> + ret = phy_register_fixup_for_uid(MARVELL_PHY_ID_88E1510,
>> + MARVELL_PHY_ID_MASK,
>> + hclge_phy_marvell_fixup);
>> + /* we can live without it, so just issue a warning */
>> + if (ret)
>> + dev_warn(&hdev->pdev->dev,
>> + "Cannot register PHY board fixup\n");
>
> You don't need to register a fixup for passing your flags, you can do
> that at the time you attach to the PHY:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/linux/phy.h#n945
>
>
Thanks, will fix it in next version.
>> +
>> return 0;
>> }
>>
>> @@ -240,6 +257,8 @@ void hclge_mac_disconnect_phy(struct hnae3_handle *handle)
>> if (!phydev)
>> return;
>>
>> + phy_unregister_fixup_for_uid(MARVELL_PHY_ID_88E1510,
>> + MARVELL_PHY_ID_MASK);
>> phy_disconnect(phydev);
>> }
>>
>>
>
^ permalink raw reply
* [Bug reporting] kernel panic during handle the dst unreach icmp msg.
From: 배석진 @ 2019-02-14 7:46 UTC (permalink / raw)
To: netdev@vger.kernel.org; +Cc: 배석진, 박종언
In-Reply-To: <CGME20190214074641epcms1p1db1c5589f96718a440a166328eec9ebd@epcms1p1>
Dear all,
https://www.mail-archive.com/netdev@vger.kernel.org/msg256527.html
as we concerned before at above mail thread,
we faced a problem cased by not removed socket.
(from now, 'the socket' means the socket alloced at 0xFFFFFFC0051E5E00)
#1. the socket is state in TIME_WAIT1. maybe it's process closed the socket.
below is memory dump information with Trace32.
(struct sock *)0xFFFFFFC0051E5E00 = 0xFFFFFFC0051E5E00 = end+0x3FF9E4CE00 -> (
__sk_common = (
...
skc_rcv_saddr = 0x0200A8C0, ==> 192.168.0.2
...
skc_state = 4, ==> TIME_WAIT1
...
skc_flags = 0x4301, ==> SOCK_DEAD(0x01) set
#2. user changed WIFI AP to another one, so previous netdevice deleted and destroied it's sockets.
[60392.948657][4: netd] 02-13 00:39:32.095 5249 5323 I NetdDestroyed 30 sockets on 192.168.0.2 in 2.7 ms
[60392.948705][4: netd] 02-13 00:39:32.095 5249 5323 D Netdnotify() code: 614, msg: Address removed 192.168.0.2/24 wlan0 128 0
--> the socket will be exist for a while.
because of 'sock_diag_destory() -> tcp_abort()' can not call tcp_done() for the socket.
but clearing the socket's sk_write_queue by calling tcp_write_queue_purge(sk).
#3. icmp msg(dst unreach) came for sent packet by the socket.
to retransmit them, lookup sk and fint it. (because the socket still exist)
but it's sk_write_queue was already cleared so has no skb to send.
and make the kernel bug.
<4>[60392.948306] I[1: ksoftirqd/1: 19] ------------[ cut here ]------------
<0>[60392.948334] I[1: ksoftirqd/1: 19] kernel BUG at net/ipv4/tcp_ipv4.c:519!
<2>[60392.948344] I[1: ksoftirqd/1: 19] sec_debug_set_extra_info_fault = BUG / 0xffffff80090351d0
<0>[60392.948386] I[1: ksoftirqd/1: 19] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
...
<4>[60392.950676] I[1: ksoftirqd/1: 19] PC is at tcp_v4_err+0x4b0/0x4bc
<4>[60392.950684] I[1: ksoftirqd/1: 19] LR is at tcp_v4_err+0x3ac/0x4bc
370 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
371 {
...
516 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
517
518 skb = tcp_write_queue_head(sk);
519 BUG_ON(!skb);
520
521 tcp_mstamp_refresh(tp);
we know that the line 519 removed on latest state. instead this will be shown to kernel panic.
how about below change? do not retransmit packets when socket was already closed.
best regards,
From: soukjin bae <soukjin.bae@samsung.com>
Date: Wen, 14 Jan 2019 14:26:35 +0900
Subject: net: Don't retransmit packets when socket was already closed
Signed-off-by: soukjin bae <soukjin.bae@samsung.com>
Signed-off-by: jongeon park <jongeon.park@samsung.com>
---
net/ipv4/tcp_ipv4 | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/net/ipv4/tcp_ipv4 b/net/ipv4/tcp_ipv4
index fe4daf6..654bd19 100755
--- a/net/ipv4/tcp_ipv4
+++ b/net/ipv4/tcp_ipv4
@@ -442,6 +465,10 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
err = EPROTO;
break;
case ICMP_DEST_UNREACH:
+ /* Don't retransmit packets when socket was already closed */
+ if (sock_flag(sk, SOCK_DEAD))
+ goto out;
+
if (code > NR_ICMP_UNREACH)
goto out;
^ permalink raw reply
* [PATCH net-next 10/12] net: sched: flower: protect flower classifier state with spinlock
From: Vlad Buslov @ 2019-02-14 7:47 UTC (permalink / raw)
To: netdev; +Cc: jhs, xiyou.wangcong, jiri, davem, Vlad Buslov
In-Reply-To: <20190214074712.17846-1-vladbu@mellanox.com>
struct tcf_proto was extended with spinlock to be used by classifiers
instead of global rtnl lock. Use it to protect shared flower classifier
data structures (handle_idr, mask hashtable and list) and fields of
individual filters that can be accessed concurrently. This patch set uses
tcf_proto->lock as per instance lock that protects all filters on
tcf_proto.
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
net/sched/cls_flower.c | 36 ++++++++++++++++++++++++++++++------
1 file changed, 30 insertions(+), 6 deletions(-)
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index bfef7d6c597d..556f7a1c694a 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -384,7 +384,9 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
cls_flower.cookie = (unsigned long) f;
tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false);
+ spin_lock(&tp->lock);
tcf_block_offload_dec(block, &f->flags);
+ spin_unlock(&tp->lock);
}
static int fl_hw_replace_filter(struct tcf_proto *tp,
@@ -422,7 +424,9 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
return err;
} else if (err > 0) {
f->in_hw_count = err;
+ spin_lock(&tp->lock);
tcf_block_offload_inc(block, &f->flags);
+ spin_unlock(&tp->lock);
}
if (skip_sw && !(f->flags & TCA_CLS_FLAGS_IN_HW))
@@ -510,18 +514,22 @@ static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
(*last) = false;
+ spin_lock(&tp->lock);
if (!f->deleted) {
f->deleted = true;
rhashtable_remove_fast(&f->mask->ht, &f->ht_node,
f->mask->filter_ht_params);
idr_remove(&head->handle_idr, f->handle);
list_del_rcu(&f->list);
+ spin_unlock(&tp->lock);
+
(*last) = fl_mask_put(head, f->mask, async);
if (!tc_skip_hw(f->flags))
fl_hw_destroy_filter(tp, f, extack);
tcf_unbind_filter(tp, &f->res);
__fl_put(f);
} else {
+ spin_unlock(&tp->lock);
err = -ENOENT;
}
@@ -1497,6 +1505,8 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
if (!tc_in_hw(fnew->flags))
fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+ spin_lock(&tp->lock);
+
/* tp was deleted concurrently. EAGAIN will cause caller to lookup proto
* again or create new one, if necessary.
*/
@@ -1527,6 +1537,8 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
list_replace_rcu(&fold->list, &fnew->list);
fold->deleted = true;
+ spin_unlock(&tp->lock);
+
fl_mask_put(head, fold->mask, true);
if (!tc_skip_hw(fold->flags))
fl_hw_destroy_filter(tp, fold, NULL);
@@ -1571,6 +1583,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
goto errout_idr;
list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
+ spin_unlock(&tp->lock);
}
*arg = fnew;
@@ -1583,6 +1596,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
if (!fold)
idr_remove(&head->handle_idr, fnew->handle);
errout_hw:
+ spin_unlock(&tp->lock);
if (!tc_skip_hw(fnew->flags))
fl_hw_destroy_filter(tp, fnew, NULL);
errout_mask:
@@ -1681,8 +1695,10 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
continue;
}
+ spin_lock(&tp->lock);
tc_cls_offload_cnt_update(block, &f->in_hw_count,
&f->flags, add);
+ spin_unlock(&tp->lock);
}
}
@@ -2216,6 +2232,7 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
struct cls_fl_filter *f = fh;
struct nlattr *nest;
struct fl_flow_key *key, *mask;
+ bool skip_hw;
if (!f)
return skb->len;
@@ -2226,21 +2243,26 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
if (!nest)
goto nla_put_failure;
+ spin_lock(&tp->lock);
+
if (f->res.classid &&
nla_put_u32(skb, TCA_FLOWER_CLASSID, f->res.classid))
- goto nla_put_failure;
+ goto nla_put_failure_locked;
key = &f->key;
mask = &f->mask->key;
+ skip_hw = tc_skip_hw(f->flags);
if (fl_dump_key(skb, net, key, mask))
- goto nla_put_failure;
-
- if (!tc_skip_hw(f->flags))
- fl_hw_update_stats(tp, f);
+ goto nla_put_failure_locked;
if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags))
- goto nla_put_failure;
+ goto nla_put_failure_locked;
+
+ spin_unlock(&tp->lock);
+
+ if (!skip_hw)
+ fl_hw_update_stats(tp, f);
if (nla_put_u32(skb, TCA_FLOWER_IN_HW_COUNT, f->in_hw_count))
goto nla_put_failure;
@@ -2255,6 +2277,8 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
return skb->len;
+nla_put_failure_locked:
+ spin_unlock(&tp->lock);
nla_put_failure:
nla_nest_cancel(skb, nest);
return -1;
--
2.13.6
^ permalink raw reply related
* [PATCH net-next 02/12] net: sched: flower: refactor fl_change
From: Vlad Buslov @ 2019-02-14 7:47 UTC (permalink / raw)
To: netdev; +Cc: jhs, xiyou.wangcong, jiri, davem, Vlad Buslov
In-Reply-To: <20190214074712.17846-1-vladbu@mellanox.com>
As a preparation for using classifier spinlock instead of relying on
external rtnl lock, rearrange code in fl_change. The goal is to group the
code which changes classifier state in single block in order to allow
following commits in this set to protect it from parallel modification with
tp->lock. Data structures that require tp->lock protection are mask
hashtable and filters list, and classifier handle_idr.
fl_hw_replace_filter() is a sleeping function and cannot be called while
holding a spinlock. In order to execute all sequence of changes to shared
classifier data structures atomically, call fl_hw_replace_filter() before
modifying them.
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
net/sched/cls_flower.c | 85 ++++++++++++++++++++++++++------------------------
1 file changed, 44 insertions(+), 41 deletions(-)
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 88d7af78ba7e..91596a6271f8 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -1354,90 +1354,93 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
if (err < 0)
goto errout;
- if (!handle) {
- handle = 1;
- err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
- INT_MAX, GFP_KERNEL);
- } else if (!fold) {
- /* user specifies a handle and it doesn't exist */
- err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
- handle, GFP_KERNEL);
- }
- if (err)
- goto errout;
- fnew->handle = handle;
-
if (tb[TCA_FLOWER_FLAGS]) {
fnew->flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]);
if (!tc_flags_valid(fnew->flags)) {
err = -EINVAL;
- goto errout_idr;
+ goto errout;
}
}
err = fl_set_parms(net, tp, fnew, mask, base, tb, tca[TCA_RATE], ovr,
tp->chain->tmplt_priv, extack);
if (err)
- goto errout_idr;
+ goto errout;
err = fl_check_assign_mask(head, fnew, fold, mask);
if (err)
- goto errout_idr;
-
- if (!fold && __fl_lookup(fnew->mask, &fnew->mkey)) {
- err = -EEXIST;
- goto errout_mask;
- }
-
- err = rhashtable_insert_fast(&fnew->mask->ht, &fnew->ht_node,
- fnew->mask->filter_ht_params);
- if (err)
- goto errout_mask;
+ goto errout;
if (!tc_skip_hw(fnew->flags)) {
err = fl_hw_replace_filter(tp, fnew, extack);
if (err)
- goto errout_mask_ht;
+ goto errout_mask;
}
if (!tc_in_hw(fnew->flags))
fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
if (fold) {
+ fnew->handle = handle;
+
+ err = rhashtable_insert_fast(&fnew->mask->ht, &fnew->ht_node,
+ fnew->mask->filter_ht_params);
+ if (err)
+ goto errout_hw;
+
rhashtable_remove_fast(&fold->mask->ht,
&fold->ht_node,
fold->mask->filter_ht_params);
- if (!tc_skip_hw(fold->flags))
- fl_hw_destroy_filter(tp, fold, NULL);
- }
-
- *arg = fnew;
-
- if (fold) {
idr_replace(&head->handle_idr, fnew, fnew->handle);
list_replace_rcu(&fold->list, &fnew->list);
+
+ if (!tc_skip_hw(fold->flags))
+ fl_hw_destroy_filter(tp, fold, NULL);
tcf_unbind_filter(tp, &fold->res);
tcf_exts_get_net(&fold->exts);
tcf_queue_work(&fold->rwork, fl_destroy_filter_work);
} else {
+ if (__fl_lookup(fnew->mask, &fnew->mkey)) {
+ err = -EEXIST;
+ goto errout_hw;
+ }
+
+ if (handle) {
+ /* user specifies a handle and it doesn't exist */
+ err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
+ handle, GFP_ATOMIC);
+ } else {
+ handle = 1;
+ err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
+ INT_MAX, GFP_ATOMIC);
+ }
+ if (err)
+ goto errout_hw;
+ fnew->handle = handle;
+
+ err = rhashtable_insert_fast(&fnew->mask->ht, &fnew->ht_node,
+ fnew->mask->filter_ht_params);
+ if (err)
+ goto errout_idr;
+
list_add_tail_rcu(&fnew->list, &fnew->mask->filters);
}
+ *arg = fnew;
+
kfree(tb);
kfree(mask);
return 0;
-errout_mask_ht:
- rhashtable_remove_fast(&fnew->mask->ht, &fnew->ht_node,
- fnew->mask->filter_ht_params);
-
-errout_mask:
- fl_mask_put(head, fnew->mask, false);
-
errout_idr:
if (!fold)
idr_remove(&head->handle_idr, fnew->handle);
+errout_hw:
+ if (!tc_skip_hw(fnew->flags))
+ fl_hw_destroy_filter(tp, fnew, NULL);
+errout_mask:
+ fl_mask_put(head, fnew->mask, false);
errout:
tcf_exts_destroy(&fnew->exts);
kfree(fnew);
--
2.13.6
^ permalink raw reply related
* [PATCH net-next 04/12] net: sched: flower: track filter deletion with flag
From: Vlad Buslov @ 2019-02-14 7:47 UTC (permalink / raw)
To: netdev; +Cc: jhs, xiyou.wangcong, jiri, davem, Vlad Buslov
In-Reply-To: <20190214074712.17846-1-vladbu@mellanox.com>
In order to prevent double deletion of filter by concurrent tasks when rtnl
lock is not used for synchronization, add 'deleted' filter field. Check
value of this field when modifying filters and return error if concurrent
deletion is detected.
Refactor __fl_delete() to accept pointer to 'last' boolean as argument,
and return error code as function return value instead. This is necessary
to signal concurrent filter delete to caller.
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
net/sched/cls_flower.c | 55 ++++++++++++++++++++++++++++++++++----------------
1 file changed, 38 insertions(+), 17 deletions(-)
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index b216ed26f344..fa5465f890e1 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -110,6 +110,7 @@ struct cls_fl_filter {
* synchronization. Use atomic reference counter to be concurrency-safe.
*/
refcount_t refcnt;
+ bool deleted;
};
static const struct rhashtable_params mask_ht_params = {
@@ -454,6 +455,8 @@ static void __fl_put(struct cls_fl_filter *f)
if (!refcount_dec_and_test(&f->refcnt))
return;
+ WARN_ON(!f->deleted);
+
if (tcf_exts_get_net(&f->exts))
tcf_queue_work(&f->rwork, fl_destroy_filter_work);
else
@@ -490,22 +493,31 @@ static struct cls_fl_filter *fl_get_next_filter(struct tcf_proto *tp,
return f;
}
-static bool __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
- struct netlink_ext_ack *extack)
+static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
+ bool *last, struct netlink_ext_ack *extack)
{
struct cls_fl_head *head = fl_head_dereference(tp);
bool async = tcf_exts_get_net(&f->exts);
- bool last;
-
- idr_remove(&head->handle_idr, f->handle);
- list_del_rcu(&f->list);
- last = fl_mask_put(head, f->mask, async);
- if (!tc_skip_hw(f->flags))
- fl_hw_destroy_filter(tp, f, extack);
- tcf_unbind_filter(tp, &f->res);
- __fl_put(f);
+ int err = 0;
+
+ (*last) = false;
+
+ if (!f->deleted) {
+ f->deleted = true;
+ rhashtable_remove_fast(&f->mask->ht, &f->ht_node,
+ f->mask->filter_ht_params);
+ idr_remove(&head->handle_idr, f->handle);
+ list_del_rcu(&f->list);
+ (*last) = fl_mask_put(head, f->mask, async);
+ if (!tc_skip_hw(f->flags))
+ fl_hw_destroy_filter(tp, f, extack);
+ tcf_unbind_filter(tp, &f->res);
+ __fl_put(f);
+ } else {
+ err = -ENOENT;
+ }
- return last;
+ return err;
}
static void fl_destroy_sleepable(struct work_struct *work)
@@ -525,10 +537,12 @@ static void fl_destroy(struct tcf_proto *tp, bool rtnl_held,
struct cls_fl_head *head = fl_head_dereference(tp);
struct fl_flow_mask *mask, *next_mask;
struct cls_fl_filter *f, *next;
+ bool last;
list_for_each_entry_safe(mask, next_mask, &head->masks, list) {
list_for_each_entry_safe(f, next, &mask->filters, list) {
- if (__fl_delete(tp, f, extack))
+ __fl_delete(tp, f, &last, extack);
+ if (last)
break;
}
}
@@ -1439,6 +1453,12 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
refcount_inc(&fnew->refcnt);
if (fold) {
+ /* Fold filter was deleted concurrently. Retry lookup. */
+ if (fold->deleted) {
+ err = -EAGAIN;
+ goto errout_hw;
+ }
+
fnew->handle = handle;
err = rhashtable_insert_fast(&fnew->mask->ht, &fnew->ht_node,
@@ -1451,6 +1471,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
fold->mask->filter_ht_params);
idr_replace(&head->handle_idr, fnew, fnew->handle);
list_replace_rcu(&fold->list, &fnew->list);
+ fold->deleted = true;
if (!tc_skip_hw(fold->flags))
fl_hw_destroy_filter(tp, fold, NULL);
@@ -1520,14 +1541,14 @@ static int fl_delete(struct tcf_proto *tp, void *arg, bool *last,
{
struct cls_fl_head *head = fl_head_dereference(tp);
struct cls_fl_filter *f = arg;
+ bool last_on_mask;
+ int err = 0;
- rhashtable_remove_fast(&f->mask->ht, &f->ht_node,
- f->mask->filter_ht_params);
- __fl_delete(tp, f, extack);
+ err = __fl_delete(tp, f, &last_on_mask, extack);
*last = list_empty(&head->masks);
__fl_put(f);
- return 0;
+ return err;
}
static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg,
--
2.13.6
^ permalink raw reply related
* [PATCH net-next 07/12] net: sched: flower: protect masks list with spinlock
From: Vlad Buslov @ 2019-02-14 7:47 UTC (permalink / raw)
To: netdev; +Cc: jhs, xiyou.wangcong, jiri, davem, Vlad Buslov
In-Reply-To: <20190214074712.17846-1-vladbu@mellanox.com>
Protect modifications of flower masks list with spinlock to remove
dependency on rtnl lock and allow concurrent access.
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
net/sched/cls_flower.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 2b032303f8d5..fc6371a9b0f9 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -88,6 +88,7 @@ struct fl_flow_tmplt {
struct cls_fl_head {
struct rhashtable ht;
+ spinlock_t masks_lock; /* Protect masks list */
struct list_head masks;
struct rcu_work rwork;
struct idr handle_idr;
@@ -312,6 +313,7 @@ static int fl_init(struct tcf_proto *tp)
if (!head)
return -ENOBUFS;
+ spin_lock_init(&head->masks_lock);
INIT_LIST_HEAD_RCU(&head->masks);
rcu_assign_pointer(tp->root, head);
idr_init(&head->handle_idr);
@@ -341,7 +343,11 @@ static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask,
return false;
rhashtable_remove_fast(&head->ht, &mask->ht_node, mask_ht_params);
+
+ spin_lock(&head->masks_lock);
list_del_rcu(&mask->list);
+ spin_unlock(&head->masks_lock);
+
if (async)
tcf_queue_work(&mask->rwork, fl_mask_free_work);
else
@@ -1309,7 +1315,9 @@ static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head,
/* Wait until any potential concurrent users of mask are finished */
synchronize_rcu();
+ spin_lock(&head->masks_lock);
list_add_tail_rcu(&newmask->list, &head->masks);
+ spin_unlock(&head->masks_lock);
return newmask;
--
2.13.6
^ permalink raw reply related
* [PATCH net-next 09/12] net: sched: flower: handle concurrent tcf proto deletion
From: Vlad Buslov @ 2019-02-14 7:47 UTC (permalink / raw)
To: netdev; +Cc: jhs, xiyou.wangcong, jiri, davem, Vlad Buslov
In-Reply-To: <20190214074712.17846-1-vladbu@mellanox.com>
Without rtnl lock protection tcf proto can be deleted concurrently. Check
tcf proto 'deleting' flag after taking tcf spinlock to verify that no
concurrent deletion is in progress. Return EAGAIN error if concurrent
deletion detected, which will cause caller to retry and possibly create new
instance of tcf proto.
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
net/sched/cls_flower.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 114cb7876133..bfef7d6c597d 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -1497,6 +1497,14 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
if (!tc_in_hw(fnew->flags))
fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+ /* tp was deleted concurrently. EAGAIN will cause caller to lookup proto
+ * again or create new one, if necessary.
+ */
+ if (tp->deleting) {
+ err = -EAGAIN;
+ goto errout_hw;
+ }
+
refcount_inc(&fnew->refcnt);
if (fold) {
/* Fold filter was deleted concurrently. Retry lookup. */
--
2.13.6
^ permalink raw reply related
* [PATCH net-next 01/12] net: sched: flower: don't check for rtnl on head dereference
From: Vlad Buslov @ 2019-02-14 7:47 UTC (permalink / raw)
To: netdev; +Cc: jhs, xiyou.wangcong, jiri, davem, Vlad Buslov
In-Reply-To: <20190214074712.17846-1-vladbu@mellanox.com>
Flower classifier only changes root pointer during init and destroy. Cls
API implements reference counting for tcf_proto, so there is no danger of
concurrent access to tp when it is being destroyed, even without protection
provided by rtnl lock.
Implement new function fl_head_dereference() to dereference tp->root
without checking for rtnl lock. Use it in all flower function that obtain
head pointer instead of rtnl_dereference().
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
net/sched/cls_flower.c | 24 +++++++++++++++++-------
1 file changed, 17 insertions(+), 7 deletions(-)
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 32fa3e20adc5..88d7af78ba7e 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -433,10 +433,20 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
cls_flower.stats.lastused);
}
+static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp)
+{
+ /* Flower classifier only changes root pointer during init and destroy.
+ * Cls API implements reference counting for tcf_proto, so there is no
+ * danger of concurrent access to tp when it is being destroyed, even
+ * without protection provided by rtnl lock.
+ */
+ return rcu_dereference_protected(tp->root, 1);
+}
+
static bool __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
struct netlink_ext_ack *extack)
{
- struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct cls_fl_head *head = fl_head_dereference(tp);
bool async = tcf_exts_get_net(&f->exts);
bool last;
@@ -468,7 +478,7 @@ static void fl_destroy_sleepable(struct work_struct *work)
static void fl_destroy(struct tcf_proto *tp, bool rtnl_held,
struct netlink_ext_ack *extack)
{
- struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct cls_fl_head *head = fl_head_dereference(tp);
struct fl_flow_mask *mask, *next_mask;
struct cls_fl_filter *f, *next;
@@ -486,7 +496,7 @@ static void fl_destroy(struct tcf_proto *tp, bool rtnl_held,
static void *fl_get(struct tcf_proto *tp, u32 handle)
{
- struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct cls_fl_head *head = fl_head_dereference(tp);
return idr_find(&head->handle_idr, handle);
}
@@ -1304,7 +1314,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
void **arg, bool ovr, bool rtnl_held,
struct netlink_ext_ack *extack)
{
- struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct cls_fl_head *head = fl_head_dereference(tp);
struct cls_fl_filter *fold = *arg;
struct cls_fl_filter *fnew;
struct fl_flow_mask *mask;
@@ -1441,7 +1451,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
static int fl_delete(struct tcf_proto *tp, void *arg, bool *last,
bool rtnl_held, struct netlink_ext_ack *extack)
{
- struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct cls_fl_head *head = fl_head_dereference(tp);
struct cls_fl_filter *f = arg;
rhashtable_remove_fast(&f->mask->ht, &f->ht_node,
@@ -1454,7 +1464,7 @@ static int fl_delete(struct tcf_proto *tp, void *arg, bool *last,
static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg,
bool rtnl_held)
{
- struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct cls_fl_head *head = fl_head_dereference(tp);
struct cls_fl_filter *f;
arg->count = arg->skip;
@@ -1473,7 +1483,7 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg,
static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
void *cb_priv, struct netlink_ext_ack *extack)
{
- struct cls_fl_head *head = rtnl_dereference(tp->root);
+ struct cls_fl_head *head = fl_head_dereference(tp);
struct tc_cls_flower_offload cls_flower = {};
struct tcf_block *block = tp->chain->block;
struct fl_flow_mask *mask;
--
2.13.6
^ permalink raw reply related
* [PATCH net-next 06/12] net: sched: flower: handle concurrent mask insertion
From: Vlad Buslov @ 2019-02-14 7:47 UTC (permalink / raw)
To: netdev; +Cc: jhs, xiyou.wangcong, jiri, davem, Vlad Buslov
In-Reply-To: <20190214074712.17846-1-vladbu@mellanox.com>
Without rtnl lock protection masks with same key can be inserted
concurrently. Insert temporary mask with reference count zero to masks
hashtable. This will cause any concurrent modifications to retry.
Wait for rcu grace period to complete after removing temporary mask from
masks hashtable to accommodate concurrent readers.
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Suggested-by: Jiri Pirko <jiri@mellanox.com>
---
net/sched/cls_flower.c | 41 ++++++++++++++++++++++++++++++++++-------
1 file changed, 34 insertions(+), 7 deletions(-)
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index b41b72e894a6..2b032303f8d5 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -1301,11 +1301,14 @@ static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head,
INIT_LIST_HEAD_RCU(&newmask->filters);
refcount_set(&newmask->refcnt, 1);
- err = rhashtable_insert_fast(&head->ht, &newmask->ht_node,
- mask_ht_params);
+ err = rhashtable_replace_fast(&head->ht, &mask->ht_node,
+ &newmask->ht_node, mask_ht_params);
if (err)
goto errout_destroy;
+ /* Wait until any potential concurrent users of mask are finished */
+ synchronize_rcu();
+
list_add_tail_rcu(&newmask->list, &head->masks);
return newmask;
@@ -1327,19 +1330,36 @@ static int fl_check_assign_mask(struct cls_fl_head *head,
int ret = 0;
rcu_read_lock();
- fnew->mask = rhashtable_lookup_fast(&head->ht, mask, mask_ht_params);
+
+ /* Insert mask as temporary node to prevent concurrent creation of mask
+ * with same key. Any concurrent lookups with same key will return
+ * EAGAIN because mask's refcnt is zero. It is safe to insert
+ * stack-allocated 'mask' to masks hash table because we call
+ * synchronize_rcu() before returning from this function (either in case
+ * of error or after replacing it with heap-allocated mask in
+ * fl_create_new_mask()).
+ */
+ fnew->mask = rhashtable_lookup_get_insert_fast(&head->ht,
+ &mask->ht_node,
+ mask_ht_params);
if (!fnew->mask) {
rcu_read_unlock();
- if (fold)
- return -EINVAL;
+ if (fold) {
+ ret = -EINVAL;
+ goto errout_cleanup;
+ }
newmask = fl_create_new_mask(head, mask);
- if (IS_ERR(newmask))
- return PTR_ERR(newmask);
+ if (IS_ERR(newmask)) {
+ ret = PTR_ERR(newmask);
+ goto errout_cleanup;
+ }
fnew->mask = newmask;
return 0;
+ } else if (IS_ERR(fnew->mask)) {
+ ret = PTR_ERR(fnew->mask);
} else if (fold && fold->mask != fnew->mask) {
ret = -EINVAL;
} else if (!refcount_inc_not_zero(&fnew->mask->refcnt)) {
@@ -1348,6 +1368,13 @@ static int fl_check_assign_mask(struct cls_fl_head *head,
}
rcu_read_unlock();
return ret;
+
+errout_cleanup:
+ rhashtable_remove_fast(&head->ht, &mask->ht_node,
+ mask_ht_params);
+ /* Wait until any potential concurrent users of mask are finished */
+ synchronize_rcu();
+ return ret;
}
static int fl_set_parms(struct net *net, struct tcf_proto *tp,
--
2.13.6
^ permalink raw reply related
* [PATCH net-next 11/12] net: sched: flower: track rtnl lock state
From: Vlad Buslov @ 2019-02-14 7:47 UTC (permalink / raw)
To: netdev; +Cc: jhs, xiyou.wangcong, jiri, davem, Vlad Buslov
In-Reply-To: <20190214074712.17846-1-vladbu@mellanox.com>
Use 'rtnl_held' flag to track if caller holds rtnl lock. Propagate the flag
to internal functions that need to know rtnl lock state. Take rtnl lock
before calling tcf APIs that require it (hw offload, bind filter, etc.).
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
net/sched/cls_flower.c | 68 +++++++++++++++++++++++++++++++++++---------------
1 file changed, 48 insertions(+), 20 deletions(-)
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 556f7a1c694a..8b53959ca716 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -374,11 +374,14 @@ static void fl_destroy_filter_work(struct work_struct *work)
}
static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
- struct netlink_ext_ack *extack)
+ bool rtnl_held, struct netlink_ext_ack *extack)
{
struct tc_cls_flower_offload cls_flower = {};
struct tcf_block *block = tp->chain->block;
+ if (!rtnl_held)
+ rtnl_lock();
+
tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, extack);
cls_flower.command = TC_CLSFLOWER_DESTROY;
cls_flower.cookie = (unsigned long) f;
@@ -387,16 +390,22 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f,
spin_lock(&tp->lock);
tcf_block_offload_dec(block, &f->flags);
spin_unlock(&tp->lock);
+
+ if (!rtnl_held)
+ rtnl_unlock();
}
static int fl_hw_replace_filter(struct tcf_proto *tp,
- struct cls_fl_filter *f,
+ struct cls_fl_filter *f, bool rtnl_held,
struct netlink_ext_ack *extack)
{
struct tc_cls_flower_offload cls_flower = {};
struct tcf_block *block = tp->chain->block;
bool skip_sw = tc_skip_sw(f->flags);
- int err;
+ int err = 0;
+
+ if (!rtnl_held)
+ rtnl_lock();
cls_flower.rule = flow_rule_alloc(tcf_exts_num_actions(&f->exts));
if (!cls_flower.rule)
@@ -420,26 +429,37 @@ static int fl_hw_replace_filter(struct tcf_proto *tp,
kfree(cls_flower.rule);
if (err < 0) {
- fl_hw_destroy_filter(tp, f, NULL);
- return err;
+ fl_hw_destroy_filter(tp, f, true, NULL);
+ goto errout;
} else if (err > 0) {
f->in_hw_count = err;
+ err = 0;
spin_lock(&tp->lock);
tcf_block_offload_inc(block, &f->flags);
spin_unlock(&tp->lock);
}
- if (skip_sw && !(f->flags & TCA_CLS_FLAGS_IN_HW))
- return -EINVAL;
+ if (skip_sw && !(f->flags & TCA_CLS_FLAGS_IN_HW)) {
+ err = -EINVAL;
+ goto errout;
+ }
- return 0;
+errout:
+ if (!rtnl_held)
+ rtnl_unlock();
+
+ return err;
}
-static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
+static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f,
+ bool rtnl_held)
{
struct tc_cls_flower_offload cls_flower = {};
struct tcf_block *block = tp->chain->block;
+ if (!rtnl_held)
+ rtnl_lock();
+
tc_cls_common_offload_init(&cls_flower.common, tp, f->flags, NULL);
cls_flower.command = TC_CLSFLOWER_STATS;
cls_flower.cookie = (unsigned long) f;
@@ -450,6 +470,9 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f)
tcf_exts_stats_update(&f->exts, cls_flower.stats.bytes,
cls_flower.stats.pkts,
cls_flower.stats.lastused);
+
+ if (!rtnl_held)
+ rtnl_unlock();
}
static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp)
@@ -506,7 +529,8 @@ static struct cls_fl_filter *fl_get_next_filter(struct tcf_proto *tp,
}
static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
- bool *last, struct netlink_ext_ack *extack)
+ bool *last, bool rtnl_held,
+ struct netlink_ext_ack *extack)
{
struct cls_fl_head *head = fl_head_dereference(tp);
bool async = tcf_exts_get_net(&f->exts);
@@ -525,7 +549,7 @@ static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
(*last) = fl_mask_put(head, f->mask, async);
if (!tc_skip_hw(f->flags))
- fl_hw_destroy_filter(tp, f, extack);
+ fl_hw_destroy_filter(tp, f, rtnl_held, extack);
tcf_unbind_filter(tp, &f->res);
__fl_put(f);
} else {
@@ -557,7 +581,7 @@ static void fl_destroy(struct tcf_proto *tp, bool rtnl_held,
list_for_each_entry_safe(mask, next_mask, &head->masks, list) {
list_for_each_entry_safe(f, next, &mask->filters, list) {
- __fl_delete(tp, f, &last, extack);
+ __fl_delete(tp, f, &last, rtnl_held, extack);
if (last)
break;
}
@@ -1397,19 +1421,23 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp,
struct cls_fl_filter *f, struct fl_flow_mask *mask,
unsigned long base, struct nlattr **tb,
struct nlattr *est, bool ovr,
- struct fl_flow_tmplt *tmplt,
+ struct fl_flow_tmplt *tmplt, bool rtnl_held,
struct netlink_ext_ack *extack)
{
int err;
- err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, true,
+ err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, rtnl_held,
extack);
if (err < 0)
return err;
if (tb[TCA_FLOWER_CLASSID]) {
f->res.classid = nla_get_u32(tb[TCA_FLOWER_CLASSID]);
+ if (!rtnl_held)
+ rtnl_lock();
tcf_bind_filter(tp, &f->res, base);
+ if (!rtnl_held)
+ rtnl_unlock();
}
err = fl_set_key(net, tb, &f->key, &mask->key, extack);
@@ -1488,7 +1516,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
}
err = fl_set_parms(net, tp, fnew, mask, base, tb, tca[TCA_RATE], ovr,
- tp->chain->tmplt_priv, extack);
+ tp->chain->tmplt_priv, rtnl_held, extack);
if (err)
goto errout;
@@ -1497,7 +1525,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
goto errout;
if (!tc_skip_hw(fnew->flags)) {
- err = fl_hw_replace_filter(tp, fnew, extack);
+ err = fl_hw_replace_filter(tp, fnew, rtnl_held, extack);
if (err)
goto errout_mask;
}
@@ -1541,7 +1569,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
fl_mask_put(head, fold->mask, true);
if (!tc_skip_hw(fold->flags))
- fl_hw_destroy_filter(tp, fold, NULL);
+ fl_hw_destroy_filter(tp, fold, rtnl_held, NULL);
tcf_unbind_filter(tp, &fold->res);
tcf_exts_get_net(&fold->exts);
/* Caller holds reference to fold, so refcnt is always > 0
@@ -1598,7 +1626,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
errout_hw:
spin_unlock(&tp->lock);
if (!tc_skip_hw(fnew->flags))
- fl_hw_destroy_filter(tp, fnew, NULL);
+ fl_hw_destroy_filter(tp, fnew, rtnl_held, NULL);
errout_mask:
fl_mask_put(head, fnew->mask, true);
errout:
@@ -1622,7 +1650,7 @@ static int fl_delete(struct tcf_proto *tp, void *arg, bool *last,
bool last_on_mask;
int err = 0;
- err = __fl_delete(tp, f, &last_on_mask, extack);
+ err = __fl_delete(tp, f, &last_on_mask, rtnl_held, extack);
*last = list_empty(&head->masks);
__fl_put(f);
@@ -2262,7 +2290,7 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
spin_unlock(&tp->lock);
if (!skip_hw)
- fl_hw_update_stats(tp, f);
+ fl_hw_update_stats(tp, f, rtnl_held);
if (nla_put_u32(skb, TCA_FLOWER_IN_HW_COUNT, f->in_hw_count))
goto nla_put_failure;
--
2.13.6
^ permalink raw reply related
* [PATCH net-next 00/12] Refactor flower classifier to remove dependency on rtnl lock
From: Vlad Buslov @ 2019-02-14 7:47 UTC (permalink / raw)
To: netdev; +Cc: jhs, xiyou.wangcong, jiri, davem, Vlad Buslov
Currently, all netlink protocol handlers for updating rules, actions and
qdiscs are protected with single global rtnl lock which removes any
possibility for parallelism. This patch set is a third step to remove
rtnl lock dependency from TC rules update path.
Recently, new rtnl registration flag RTNL_FLAG_DOIT_UNLOCKED was added.
TC rule update handlers (RTM_NEWTFILTER, RTM_DELTFILTER, etc.) are
already registered with this flag and only take rtnl lock when qdisc or
classifier requires it. Classifiers can indicate that their ops
callbacks don't require caller to hold rtnl lock by setting the
TCF_PROTO_OPS_DOIT_UNLOCKED flag. The goal of this change is to refactor
flower classifier to support unlocked execution and register it with
unlocked flag.
This patch set implements following changes to make flower classifier
concurrency-safe:
- Implement reference counting for individual filters. Change fl_get to
take reference to filter. Implement tp->ops->put callback that was
introduced in cls API patch set to release reference to flower filter.
- Use tp->lock spinlock to protect internal classifier data structures
from concurrent modification.
- Handle concurrent tcf proto deletion by returning EAGAIN, which will
cause cls API to retry and create new proto instance or return error
to the user (depending on message type).
- Handle concurrent insertion of filter with same priority and handle by
returning EAGAIN, which will cause cls API to lookup filter again and
process it accordingly to netlink message flags.
- Extend flower mask with reference counting and protect masks list with
masks_lock spinlock.
- Prevent concurrent mask insertion by inserting temporary value to
masks hash table. This is necessary because mask initialization is a
sleeping operation and cannot be done while holding tp->lock.
Tcf hw offloads API is not changed by this patch set and still requires
caller to hold rtnl lock. Refactored flower classifier tracks rtnl lock
state by means of 'rtnl_held' flag provided by cls API and obtains the
lock before calling hw offloads.
With these changes flower classifier is safely registered with
TCF_PROTO_OPS_DOIT_UNLOCKED flag in last patch.
Github: [https://github.com/vbuslov/linux/tree/unlocked_flower_cong_1]
Vlad Buslov (12):
net: sched: flower: don't check for rtnl on head dereference
net: sched: flower: refactor fl_change
net: sched: flower: introduce reference counting for filters
net: sched: flower: track filter deletion with flag
net: sched: flower: add reference counter to flower mask
net: sched: flower: handle concurrent mask insertion
net: sched: flower: protect masks list with spinlock
net: sched: flower: handle concurrent filter insertion in fl_change
net: sched: flower: handle concurrent tcf proto deletion
net: sched: flower: protect flower classifier state with spinlock
net: sched: flower: track rtnl lock state
net: sched: flower: set unlocked flag for flower proto ops
net/sched/cls_flower.c | 424 +++++++++++++++++++++++++++++++++++++------------
1 file changed, 321 insertions(+), 103 deletions(-)
--
2.13.6
^ permalink raw reply
* [PATCH net-next 05/12] net: sched: flower: add reference counter to flower mask
From: Vlad Buslov @ 2019-02-14 7:47 UTC (permalink / raw)
To: netdev; +Cc: jhs, xiyou.wangcong, jiri, davem, Vlad Buslov
In-Reply-To: <20190214074712.17846-1-vladbu@mellanox.com>
Extend fl_flow_mask structure with reference counter to allow parallel
modification without relying on rtnl lock. Use rcu read lock to safely
lookup mask and increment reference counter in order to accommodate
concurrent deletes.
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
net/sched/cls_flower.c | 22 +++++++++++++++++-----
1 file changed, 17 insertions(+), 5 deletions(-)
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index fa5465f890e1..b41b72e894a6 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -76,6 +76,7 @@ struct fl_flow_mask {
struct list_head filters;
struct rcu_work rwork;
struct list_head list;
+ refcount_t refcnt;
};
struct fl_flow_tmplt {
@@ -320,6 +321,7 @@ static int fl_init(struct tcf_proto *tp)
static void fl_mask_free(struct fl_flow_mask *mask)
{
+ WARN_ON(!list_empty(&mask->filters));
rhashtable_destroy(&mask->ht);
kfree(mask);
}
@@ -335,7 +337,7 @@ static void fl_mask_free_work(struct work_struct *work)
static bool fl_mask_put(struct cls_fl_head *head, struct fl_flow_mask *mask,
bool async)
{
- if (!list_empty(&mask->filters))
+ if (!refcount_dec_and_test(&mask->refcnt))
return false;
rhashtable_remove_fast(&head->ht, &mask->ht_node, mask_ht_params);
@@ -1298,6 +1300,7 @@ static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head,
INIT_LIST_HEAD_RCU(&newmask->filters);
+ refcount_set(&newmask->refcnt, 1);
err = rhashtable_insert_fast(&head->ht, &newmask->ht_node,
mask_ht_params);
if (err)
@@ -1321,9 +1324,13 @@ static int fl_check_assign_mask(struct cls_fl_head *head,
struct fl_flow_mask *mask)
{
struct fl_flow_mask *newmask;
+ int ret = 0;
+ rcu_read_lock();
fnew->mask = rhashtable_lookup_fast(&head->ht, mask, mask_ht_params);
if (!fnew->mask) {
+ rcu_read_unlock();
+
if (fold)
return -EINVAL;
@@ -1332,11 +1339,15 @@ static int fl_check_assign_mask(struct cls_fl_head *head,
return PTR_ERR(newmask);
fnew->mask = newmask;
+ return 0;
} else if (fold && fold->mask != fnew->mask) {
- return -EINVAL;
+ ret = -EINVAL;
+ } else if (!refcount_inc_not_zero(&fnew->mask->refcnt)) {
+ /* Mask was deleted concurrently, try again */
+ ret = -EAGAIN;
}
-
- return 0;
+ rcu_read_unlock();
+ return ret;
}
static int fl_set_parms(struct net *net, struct tcf_proto *tp,
@@ -1473,6 +1484,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
list_replace_rcu(&fold->list, &fnew->list);
fold->deleted = true;
+ fl_mask_put(head, fold->mask, true);
if (!tc_skip_hw(fold->flags))
fl_hw_destroy_filter(tp, fold, NULL);
tcf_unbind_filter(tp, &fold->res);
@@ -1522,7 +1534,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
if (!tc_skip_hw(fnew->flags))
fl_hw_destroy_filter(tp, fnew, NULL);
errout_mask:
- fl_mask_put(head, fnew->mask, false);
+ fl_mask_put(head, fnew->mask, true);
errout:
tcf_exts_destroy(&fnew->exts);
kfree(fnew);
--
2.13.6
^ permalink raw reply related
* [PATCH net-next 03/12] net: sched: flower: introduce reference counting for filters
From: Vlad Buslov @ 2019-02-14 7:47 UTC (permalink / raw)
To: netdev; +Cc: jhs, xiyou.wangcong, jiri, davem, Vlad Buslov
In-Reply-To: <20190214074712.17846-1-vladbu@mellanox.com>
Extend flower filters with reference counting in order to remove dependency
on rtnl lock in flower ops and allow to modify filters concurrently.
Reference to flower filter can be taken/released concurrently as soon as it
is marked as 'unlocked' by last patch in this series. Use atomic reference
counter type to make concurrent modifications safe.
Always take reference to flower filter while working with it:
- Modify fl_get() to take reference to filter.
- Implement tp->put() callback as fl_put() function to allow cls API to
release reference taken by fl_get().
- Modify fl_change() to assume that caller holds reference to fold and take
reference to fnew.
- Take reference to filter while using it in fl_walk().
Implement helper functions to get/put filter reference counter.
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
net/sched/cls_flower.c | 95 ++++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 81 insertions(+), 14 deletions(-)
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 91596a6271f8..b216ed26f344 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -14,6 +14,7 @@
#include <linux/module.h>
#include <linux/rhashtable.h>
#include <linux/workqueue.h>
+#include <linux/refcount.h>
#include <linux/if_ether.h>
#include <linux/in6.h>
@@ -104,6 +105,11 @@ struct cls_fl_filter {
u32 in_hw_count;
struct rcu_work rwork;
struct net_device *hw_dev;
+ /* Flower classifier is unlocked, which means that its reference counter
+ * can be changed concurrently without any kind of external
+ * synchronization. Use atomic reference counter to be concurrency-safe.
+ */
+ refcount_t refcnt;
};
static const struct rhashtable_params mask_ht_params = {
@@ -443,6 +449,47 @@ static struct cls_fl_head *fl_head_dereference(struct tcf_proto *tp)
return rcu_dereference_protected(tp->root, 1);
}
+static void __fl_put(struct cls_fl_filter *f)
+{
+ if (!refcount_dec_and_test(&f->refcnt))
+ return;
+
+ if (tcf_exts_get_net(&f->exts))
+ tcf_queue_work(&f->rwork, fl_destroy_filter_work);
+ else
+ __fl_destroy_filter(f);
+}
+
+static struct cls_fl_filter *__fl_get(struct cls_fl_head *head, u32 handle)
+{
+ struct cls_fl_filter *f;
+
+ rcu_read_lock();
+ f = idr_find(&head->handle_idr, handle);
+ if (f && !refcount_inc_not_zero(&f->refcnt))
+ f = NULL;
+ rcu_read_unlock();
+
+ return f;
+}
+
+static struct cls_fl_filter *fl_get_next_filter(struct tcf_proto *tp,
+ unsigned long *handle)
+{
+ struct cls_fl_head *head = fl_head_dereference(tp);
+ struct cls_fl_filter *f;
+
+ rcu_read_lock();
+ /* don't return filters that are being deleted */
+ while ((f = idr_get_next_ul(&head->handle_idr,
+ handle)) != NULL &&
+ !refcount_inc_not_zero(&f->refcnt))
+ ++(*handle);
+ rcu_read_unlock();
+
+ return f;
+}
+
static bool __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
struct netlink_ext_ack *extack)
{
@@ -456,10 +503,7 @@ static bool __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f,
if (!tc_skip_hw(f->flags))
fl_hw_destroy_filter(tp, f, extack);
tcf_unbind_filter(tp, &f->res);
- if (async)
- tcf_queue_work(&f->rwork, fl_destroy_filter_work);
- else
- __fl_destroy_filter(f);
+ __fl_put(f);
return last;
}
@@ -494,11 +538,18 @@ static void fl_destroy(struct tcf_proto *tp, bool rtnl_held,
tcf_queue_work(&head->rwork, fl_destroy_sleepable);
}
+static void fl_put(struct tcf_proto *tp, void *arg)
+{
+ struct cls_fl_filter *f = arg;
+
+ __fl_put(f);
+}
+
static void *fl_get(struct tcf_proto *tp, u32 handle)
{
struct cls_fl_head *head = fl_head_dereference(tp);
- return idr_find(&head->handle_idr, handle);
+ return __fl_get(head, handle);
}
static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
@@ -1321,12 +1372,16 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
struct nlattr **tb;
int err;
- if (!tca[TCA_OPTIONS])
- return -EINVAL;
+ if (!tca[TCA_OPTIONS]) {
+ err = -EINVAL;
+ goto errout_fold;
+ }
mask = kzalloc(sizeof(struct fl_flow_mask), GFP_KERNEL);
- if (!mask)
- return -ENOBUFS;
+ if (!mask) {
+ err = -ENOBUFS;
+ goto errout_fold;
+ }
tb = kcalloc(TCA_FLOWER_MAX + 1, sizeof(struct nlattr *), GFP_KERNEL);
if (!tb) {
@@ -1349,6 +1404,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
err = -ENOBUFS;
goto errout_tb;
}
+ refcount_set(&fnew->refcnt, 1);
err = tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0);
if (err < 0)
@@ -1381,6 +1437,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
if (!tc_in_hw(fnew->flags))
fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW;
+ refcount_inc(&fnew->refcnt);
if (fold) {
fnew->handle = handle;
@@ -1399,7 +1456,11 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
fl_hw_destroy_filter(tp, fold, NULL);
tcf_unbind_filter(tp, &fold->res);
tcf_exts_get_net(&fold->exts);
- tcf_queue_work(&fold->rwork, fl_destroy_filter_work);
+ /* Caller holds reference to fold, so refcnt is always > 0
+ * after this.
+ */
+ refcount_dec(&fold->refcnt);
+ __fl_put(fold);
} else {
if (__fl_lookup(fnew->mask, &fnew->mkey)) {
err = -EEXIST;
@@ -1448,6 +1509,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
kfree(tb);
errout_mask_alloc:
kfree(mask);
+errout_fold:
+ if (fold)
+ __fl_put(fold);
return err;
}
@@ -1461,24 +1525,26 @@ static int fl_delete(struct tcf_proto *tp, void *arg, bool *last,
f->mask->filter_ht_params);
__fl_delete(tp, f, extack);
*last = list_empty(&head->masks);
+ __fl_put(f);
+
return 0;
}
static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg,
bool rtnl_held)
{
- struct cls_fl_head *head = fl_head_dereference(tp);
struct cls_fl_filter *f;
arg->count = arg->skip;
- while ((f = idr_get_next_ul(&head->handle_idr,
- &arg->cookie)) != NULL) {
+ while ((f = fl_get_next_filter(tp, &arg->cookie)) != NULL) {
if (arg->fn(tp, f, arg) < 0) {
+ __fl_put(f);
arg->stop = 1;
break;
}
- arg->cookie = f->handle + 1;
+ __fl_put(f);
+ arg->cookie++;
arg->count++;
}
}
@@ -2148,6 +2214,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = {
.init = fl_init,
.destroy = fl_destroy,
.get = fl_get,
+ .put = fl_put,
.change = fl_change,
.delete = fl_delete,
.walk = fl_walk,
--
2.13.6
^ permalink raw reply related
* [PATCH net-next 12/12] net: sched: flower: set unlocked flag for flower proto ops
From: Vlad Buslov @ 2019-02-14 7:47 UTC (permalink / raw)
To: netdev; +Cc: jhs, xiyou.wangcong, jiri, davem, Vlad Buslov
In-Reply-To: <20190214074712.17846-1-vladbu@mellanox.com>
Set TCF_PROTO_OPS_DOIT_UNLOCKED for flower classifier to indicate that its
ops callbacks don't require caller to hold rtnl lock.
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
net/sched/cls_flower.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 8b53959ca716..360cac828cad 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -2362,6 +2362,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = {
.tmplt_destroy = fl_tmplt_destroy,
.tmplt_dump = fl_tmplt_dump,
.owner = THIS_MODULE,
+ .flags = TCF_PROTO_OPS_DOIT_UNLOCKED,
};
static int __init cls_fl_init(void)
--
2.13.6
^ permalink raw reply related
* [PATCH net-next 08/12] net: sched: flower: handle concurrent filter insertion in fl_change
From: Vlad Buslov @ 2019-02-14 7:47 UTC (permalink / raw)
To: netdev; +Cc: jhs, xiyou.wangcong, jiri, davem, Vlad Buslov
In-Reply-To: <20190214074712.17846-1-vladbu@mellanox.com>
Check if user specified a handle and another filter with the same handle
was inserted concurrently. Return EAGAIN to retry filter processing (in
case it is an overwrite request).
Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
net/sched/cls_flower.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index fc6371a9b0f9..114cb7876133 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -1539,6 +1539,15 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
/* user specifies a handle and it doesn't exist */
err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
handle, GFP_ATOMIC);
+
+ /* Filter with specified handle was concurrently
+ * inserted after initial check in cls_api. This is not
+ * necessarily an error if NLM_F_EXCL is not set in
+ * message flags. Returning EAGAIN will cause cls_api to
+ * try to update concurrently inserted rule.
+ */
+ if (err == -ENOSPC)
+ err = -EAGAIN;
} else {
handle = 1;
err = idr_alloc_u32(&head->handle_idr, fnew, &handle,
--
2.13.6
^ permalink raw reply related
* Re: [PATCH bpf-next v4 0/2] libbpf: adding AF_XDP support
From: Magnus Karlsson @ 2019-02-14 8:25 UTC (permalink / raw)
To: Jonathan Lemon
Cc: Magnus Karlsson, Björn Töpel, ast, Daniel Borkmann,
Network Development, Jakub Kicinski, Björn Töpel,
Zhang, Qi Z, Jesper Dangaard Brouer, xiaolong.ye
In-Reply-To: <3213D100-3861-4963-9490-EE445B731E63@gmail.com>
On Wed, Feb 13, 2019 at 9:49 PM Jonathan Lemon <jonathan.lemon@gmail.com> wrote:
>
> On 13 Feb 2019, at 3:32, Magnus Karlsson wrote:
>
> > On Mon, Feb 11, 2019 at 9:44 PM Jonathan Lemon
> > <jonathan.lemon@gmail.com> wrote:
> >>
> >> On 8 Feb 2019, at 5:05, Magnus Karlsson wrote:
> >>
> >>> This patch proposes to add AF_XDP support to libbpf. The main reason
> >>> for this is to facilitate writing applications that use AF_XDP by
> >>> offering higher-level APIs that hide many of the details of the
> >>> AF_XDP
> >>> uapi. This is in the same vein as libbpf facilitates XDP adoption by
> >>> offering easy-to-use higher level interfaces of XDP
> >>> functionality. Hopefully this will facilitate adoption of AF_XDP,
> >>> make
> >>> applications using it simpler and smaller, and finally also make it
> >>> possible for applications to benefit from optimizations in the
> >>> AF_XDP
> >>> user space access code. Previously, people just copied and pasted
> >>> the
> >>> code from the sample application into their application, which is
> >>> not
> >>> desirable.
> >>
> >> I like the idea of encapsulating the boilerplate logic in a library.
> >>
> >> I do think there is an important missing piece though - there should
> >> be
> >> some code which queries the netdev for how many queues are attached,
> >> and
> >> create the appropriate number of umem/AF_XDP sockets.
> >>
> >> I ran into this issue when testing the current AF_XDP code - on my
> >> test
> >> boxes, the mlx5 card has 55 channels (aka queues), so when the test
> >> program
> >> binds only to channel 0, nothing works as expected, since not all
> >> traffic
> >> is being intercepted. While obvious in hindsight, this took a while
> >> to
> >> track down.
> >
> > Yes, agreed. You are not the first one to stumble upon this problem
> > :-). Let me think a little bit on how to solve this in a good way. We
> > need this to be simple and intuitive, as you say.
>
> Has any investigation been done on using some variant of MPSC
> implementation
> as an intermediate form for AF_XDP? E.g.: something like LCRQ or the
> bulkQ
> in bpf devmap/cpumap. I'm aware that this would be slightly slower, as
> it
> would introduce a lock in the path, but I'd think that having DEVMAP,
> CPUMAP
> and XSKMAP all behave the same way would add more flexibility.
Not as far as I know. But adding the option of having a MPSC or even
MPMC queues has been on the todo list for a while, however, the
current focus of Björn and myself is to upstream the performance
improvements from the Linux Plumbers paper, improve ease-of-use, and
help Jesper et al. with the per-queue XDP program implementation
(which will increase both performance and ease-of-use). If anyone has
some spare cycles out there, please go ahead and give MPSC or MPMC
queues a try :-).
/Magnus
> Ideally, if the configuration matches the underlying hardware, then the
> implementation would reduce to the current setup (and allow ZC
> implementations),
> but a non-matching configuration would still work - as opposed to the
> current
> situation.
> --
> Jonathan
^ permalink raw reply
* Re: [PATCH 2/2] net: Replace dev_kfree_skb_any by dev_consume_skb_any
From: Sergei Shtylyov @ 2019-02-14 9:13 UTC (permalink / raw)
To: Huang Zijiang, linux-net-drivers
Cc: ecree, bkenward, davem, netdev, linux-kernel, wang.yi59
In-Reply-To: <1550126533-28462-1-git-send-email-huang.zijiang@zte.com.cn>
Hello!
On 14.02.2019 9:42, Huang Zijiang wrote:
> The skb should be freed by dev_consume_skb_any() efx_tx_tso_fallback()
^ in?
> when skb is still used. The skb is be replaced by segments, so the
^^ will?
> original skb should be consumed(not drop).
>
> Signed-off-by: Huang Zijiang <huang.zijiang@zte.com.cn>
[...]
MBR, Sergei
^ permalink raw reply
* Re: [RFC PATCH] net act_vlan: use correct len in skb_pull
From: Toshiaki Makita @ 2019-02-14 9:16 UTC (permalink / raw)
To: Zahari Doychev, netdev, bridge, nikolay, roopa, jhs, jiri,
xiyou.wangcong
Cc: johannes
In-Reply-To: <20190213195102.1917-1-zahari.doychev@linux.com>
On 2019/02/14 4:51, Zahari Doychev wrote:
> The bridge and VLAN code expects that skb->data points to the start of the
> VLAN header instead of the next (network) header. Currently after
> tcf_vlan_act() on ingress filter skb->data points to the next network
> header. In this case the Linux bridge does not forward correctly double
> tagged VLAN packets added using tc vlan action as the outer vlan tag from
> the skb is inserted at the wrong offset after the vlan tag in the payload.
> Making skb->data to point to the VLAN header in tcf_vlan_act() by using
> ETH_HLEN in skb_pull_rcsum() fixes the problem.
>
> The following commands were used for testing:
>
> ip link add name br0 type bridge vlan_filtering 1
> ip link set dev br0 up
>
> ip link set dev net0 up
> ip link set dev net0 master br0
>
> ip link set dev net1 up
> ip link set dev net1 master br0
>
> bridge vlan add dev net0 vid 100 master
> bridge vlan add dev br0 vid 100 self
> bridge vlan add dev net1 vid 100 master
>
> tc qdisc add dev net0 handle ffff: clsact
> tc qdisc add dev net1 handle ffff: clsact
>
> tc filter add dev net0 ingress pref 1 protocol all flower \
> action vlan push id 10 pipe action vlan push id 100
>
> tc filter add dev net0 egress pref 1 protocol 802.1q flower \
> vlan_id 100 vlan_ethtype 802.1q cvlan_id 10 \
> action vlan pop pipe action vlan pop
>
> Signed-off-by: Zahari Doychev <zahari.doychev@linux.com>
> ---
> net/sched/act_vlan.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
> index 93fdaf707313..308d7d89f925 100644
> --- a/net/sched/act_vlan.c
> +++ b/net/sched/act_vlan.c
> @@ -86,7 +86,7 @@ static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a,
>
> out:
> if (skb_at_tc_ingress(skb))
> - skb_pull_rcsum(skb, skb->mac_len);
> + skb_pull_rcsum(skb, ETH_HLEN);
As I said before, it would be safer to remember mac_len and use it later.
__u16 mac_len = skb->mac_len;
...
err = skb_vlan_push(...)
...
if (skb_at_tc_ingress(skb))
skb_pull_rcsum(skb, mac_len);
--
Toshiaki Makita
^ permalink raw reply
* Re: [PATCH 2/2] net: Replace dev_kfree_skb_any by dev_consume_skb_any
From: Bert Kenward @ 2019-02-14 9:34 UTC (permalink / raw)
To: Huang Zijiang, linux-net-drivers
Cc: ecree, davem, netdev, linux-kernel, wang.yi59
In-Reply-To: <1550126533-28462-1-git-send-email-huang.zijiang@zte.com.cn>
On 14/02/2019 06:42, Huang Zijiang wrote:
> The skb should be freed by dev_consume_skb_any() efx_tx_tso_fallback()
> when skb is still used. The skb is be replaced by segments, so the
> original skb should be consumed(not drop).
>
> Signed-off-by: Huang Zijiang <huang.zijiang@zte.com.cn>
Sergei's commit message fixups look good, but apart from that:
Acked-by: Bert Kenward <bkenward@solarflare.com>
> ---
> drivers/net/ethernet/sfc/tx.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
> index c3ad564..ed551f0 100644
> --- a/drivers/net/ethernet/sfc/tx.c
> +++ b/drivers/net/ethernet/sfc/tx.c
> @@ -471,7 +471,7 @@ static int efx_tx_tso_fallback(struct efx_tx_queue *tx_queue,
> if (IS_ERR(segments))
> return PTR_ERR(segments);
>
> - dev_kfree_skb_any(skb);
> + dev_consume_skb_any(skb);
> skb = segments;
>
> while (skb) {
>
^ permalink raw reply
* Re: [PATCH 2/4] mac80211: Free mpath object when rhashtable insertion fails
From: Herbert Xu @ 2019-02-14 9:41 UTC (permalink / raw)
To: Johannes Berg; +Cc: David Miller, linux-wireless, netdev, j, tgraf
In-Reply-To: <36adaef5c982ba10444d6f837b0d5c55ac953bdf.camel@sipsolutions.net>
On Wed, Feb 13, 2019 at 04:04:29PM +0100, Johannes Berg wrote:
> On Wed, 2019-02-13 at 22:39 +0800, Herbert Xu wrote:
> > + if (ret != -EEXIST)
> > return ERR_PTR(ret);
>
> Surely that should still be "if (ret && ret != -EEXIST)" otherwise you
> return for the success case too? or "else if"?
>
> I'd probably say we should combine all those ifs into something like this?
>
>
> if (ret == 0) {
> sdata->u.mesh.mesh_paths_generation++;
> return new_mpath;
> } else if (ret == -EEXIST) {
> kfree(new_mpath);
> return mpath;
> } else {
> kfree(new_mpath);
> return NULL;
> }
>
>
> Yes, that's a small change in behaviour as to when the generation
> counter is updated, but I'm pretty sure it really only needs updating
> when we inserted something, not if we found an old mpath.
You are right. Let me fix this and repost.
Thanks!
--
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply
* RE: [PATCH] can: flexcan: fix timeout when set small bitrate
From: Joakim Zhang @ 2019-02-14 9:56 UTC (permalink / raw)
To: mkl@pengutronix.de, linux-can@vger.kernel.org
Cc: wg@grandegger.com, netdev@vger.kernel.org,
linux-kernel@vger.kernel.org, dl-linux-imx
In-Reply-To: <20190131093509.12613-1-qiangqing.zhang@nxp.com>
Kindly Ping...
Reviewed-by: Dong Aisheng <aisheng.dong@nxp.com>
Best Regards,
Joakim Zhang
> -----Original Message-----
> From: Joakim Zhang
> Sent: 2019年1月31日 17:37
> To: mkl@pengutronix.de; linux-can@vger.kernel.org
> Cc: wg@grandegger.com; netdev@vger.kernel.org;
> linux-kernel@vger.kernel.org; dl-linux-imx <linux-imx@nxp.com>; Joakim
> Zhang <qiangqing.zhang@nxp.com>
> Subject: [PATCH] can: flexcan: fix timeout when set small bitrate
>
> Current we can meet timeout issue when setting a small bitrate like 10000 as
> follows on i.MX6UL EVK board (ipg clock = 66MHZ, per clock = 30MHZ):
> root@imx6ul7d:~# ip link set can0 up type can bitrate 10000 A link change
> request failed with some changes committed already.
> Interface can0 may have been left with an inconsistent configuration, please
> check.
> RTNETLINK answers: Connection timed out
>
> It is caused by calling of flexcan_chip_unfreeze() timeout.
>
> Originally the code is using usleep_range(10, 20) for unfreeze operation, but
> the patch (8badd65 can: flexcan: avoid calling usleep_range from interrupt
> context) changed it into udelay(10) which is only a half delay of before,
> there're also some other delay changes.
>
> After double to FLEXCAN_TIMEOUT_US to 100 can fix the issue.
>
> Signed-off-by: Joakim Zhang <qiangqing.zhang@nxp.com>
> ---
> drivers/net/can/flexcan.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c index
> 2bca867bcfaa..1f2b4db7da88 100644
> --- a/drivers/net/can/flexcan.c
> +++ b/drivers/net/can/flexcan.c
> @@ -166,7 +166,7 @@
> #define FLEXCAN_MB_CNT_LENGTH(x) (((x) & 0xf) << 16)
> #define FLEXCAN_MB_CNT_TIMESTAMP(x) ((x) & 0xffff)
>
> -#define FLEXCAN_TIMEOUT_US (50)
> +#define FLEXCAN_TIMEOUT_US (100)
>
> /* FLEXCAN hardware feature flags
> *
> --
> 2.17.1
^ permalink raw reply
* RE: [PATCH V4] can: flexcan: implement can Runtime PM
From: Joakim Zhang @ 2019-02-14 9:57 UTC (permalink / raw)
To: mkl@pengutronix.de, linux-can@vger.kernel.org
Cc: wg@grandegger.com, netdev@vger.kernel.org,
linux-kernel@vger.kernel.org, dl-linux-imx, Aisheng Dong
In-Reply-To: <DB7PR04MB4618BFE57BD34A8B7A708E0FE6830@DB7PR04MB4618.eurprd04.prod.outlook.com>
Kindly Ping...
Best Regards,
Joakim Zhang
> -----Original Message-----
> From: Joakim Zhang
> Sent: 2019年1月17日 14:23
> To: mkl@pengutronix.de; linux-can@vger.kernel.org
> Cc: wg@grandegger.com; netdev@vger.kernel.org;
> linux-kernel@vger.kernel.org; dl-linux-imx <linux-imx@nxp.com>; Aisheng
> Dong <aisheng.dong@nxp.com>
> Subject: RE: [PATCH V4] can: flexcan: implement can Runtime PM
>
>
> Kindly Ping...
>
> Best Regards,
> Joakim Zhang
>
> > -----Original Message-----
> > From: Joakim Zhang
> > Sent: 2018年11月30日 16:53
> > To: mkl@pengutronix.de; linux-can@vger.kernel.org
> > Cc: wg@grandegger.com; netdev@vger.kernel.org;
> > linux-kernel@vger.kernel.org; dl-linux-imx <linux-imx@nxp.com>;
> > Aisheng DONG <aisheng.dong@nxp.com>; Joakim Zhang
> > <qiangqing.zhang@nxp.com>
> > Subject: [PATCH V4] can: flexcan: implement can Runtime PM
> >
> > From: Aisheng Dong <aisheng.dong@nxp.com>
> >
> > Flexcan will be disabled during suspend if no wakeup function required
> > and enabled after resume accordingly. During this period, we could
> > explicitly disable clocks.
> > Since PM is optional, the clock is enabled at probe to guarante the
> > clock is running when PM is not enabled in the kernel.
> >
> > Implement Runtime PM which will:
> > 1) Without CONFIG_PM, clock is running whether Flexcan up or down.
> > 2) With CONFIG_PM, clock enabled while Flexcan up and disabled when
> > Flexcan down.
> > 3) Disable clock when do system suspend and enable clock while system
> > resume.
> > 4) Make Power Domain framework be able to shutdown the corresponding
> > power domain of this device.
> >
> > Signed-off-by: Aisheng Dong <aisheng.dong@nxp.com>
> > Signed-off-by: Joakim Zhang <qiangqing.zhang@nxp.com>
> > ---
> > ChangeLog:
> > V1->V2:
> > *rebased on patch "can: flexcan: add self wakeup support".
> > V2->V3:
> > *fix device fails to probe without CONFIG_PM.
> > V3->V4:
> > *runtime pm enable should ahead of registering device.
> > *disable device even if keeping the clocks on.
> > ---
> > drivers/net/can/flexcan.c | 111
> > +++++++++++++++++++++++++-------------
> > 1 file changed, 73 insertions(+), 38 deletions(-)
> >
> > diff --git a/drivers/net/can/flexcan.c b/drivers/net/can/flexcan.c
> > index
> > 0f36eafe3ac1..cad42f20cfe5 100644
> > --- a/drivers/net/can/flexcan.c
> > +++ b/drivers/net/can/flexcan.c
> > @@ -24,6 +24,7 @@
> > #include <linux/of.h>
> > #include <linux/of_device.h>
> > #include <linux/platform_device.h>
> > +#include <linux/pm_runtime.h>
> > #include <linux/regulator/consumer.h> #include <linux/regmap.h>
> >
> > @@ -277,6 +278,7 @@ struct flexcan_priv {
> > u32 reg_imask1_default;
> > u32 reg_imask2_default;
> >
> > + struct device *dev;
> > struct clk *clk_ipg;
> > struct clk *clk_per;
> > const struct flexcan_devtype_data *devtype_data; @@ -444,6 +446,27
> > @@ static inline void flexcan_error_irq_disable(const struct flexcan_priv
> *priv)
> > priv->write(reg_ctrl, ®s->ctrl);
> > }
> >
> > +static int flexcan_clks_enable(const struct flexcan_priv *priv) {
> > + int err;
> > +
> > + err = clk_prepare_enable(priv->clk_ipg);
> > + if (err)
> > + return err;
> > +
> > + err = clk_prepare_enable(priv->clk_per);
> > + if (err)
> > + clk_disable_unprepare(priv->clk_ipg);
> > +
> > + return err;
> > +}
> > +
> > +static void flexcan_clks_disable(const struct flexcan_priv *priv) {
> > + clk_disable_unprepare(priv->clk_ipg);
> > + clk_disable_unprepare(priv->clk_per);
> > +}
> > +
> > static inline int flexcan_transceiver_enable(const struct flexcan_priv *priv)
> {
> > if (!priv->reg_xceiver)
> > @@ -570,19 +593,13 @@ static int flexcan_get_berr_counter(const struct
> > net_device *dev,
> > const struct flexcan_priv *priv = netdev_priv(dev);
> > int err;
> >
> > - err = clk_prepare_enable(priv->clk_ipg);
> > - if (err)
> > + err = pm_runtime_get_sync(priv->dev);
> > + if (err < 0)
> > return err;
> >
> > - err = clk_prepare_enable(priv->clk_per);
> > - if (err)
> > - goto out_disable_ipg;
> > -
> > err = __flexcan_get_berr_counter(dev, bec);
> >
> > - clk_disable_unprepare(priv->clk_per);
> > - out_disable_ipg:
> > - clk_disable_unprepare(priv->clk_ipg);
> > + pm_runtime_put(priv->dev);
> >
> > return err;
> > }
> > @@ -1215,17 +1232,13 @@ static int flexcan_open(struct net_device *dev)
> > struct flexcan_priv *priv = netdev_priv(dev);
> > int err;
> >
> > - err = clk_prepare_enable(priv->clk_ipg);
> > - if (err)
> > + err = pm_runtime_get_sync(priv->dev);
> > + if (err < 0)
> > return err;
> >
> > - err = clk_prepare_enable(priv->clk_per);
> > - if (err)
> > - goto out_disable_ipg;
> > -
> > err = open_candev(dev);
> > if (err)
> > - goto out_disable_per;
> > + goto out_disable_clks;
> >
> > err = request_irq(dev->irq, flexcan_irq, IRQF_SHARED, dev->name, dev);
> > if (err)
> > @@ -1288,10 +1301,8 @@ static int flexcan_open(struct net_device *dev)
> > free_irq(dev->irq, dev);
> > out_close:
> > close_candev(dev);
> > - out_disable_per:
> > - clk_disable_unprepare(priv->clk_per);
> > - out_disable_ipg:
> > - clk_disable_unprepare(priv->clk_ipg);
> > + out_disable_clks:
> > + pm_runtime_put(priv->dev);
> >
> > return err;
> > }
> > @@ -1306,10 +1317,9 @@ static int flexcan_close(struct net_device
> > *dev)
> >
> > can_rx_offload_del(&priv->offload);
> > free_irq(dev->irq, dev);
> > - clk_disable_unprepare(priv->clk_per);
> > - clk_disable_unprepare(priv->clk_ipg);
> >
> > close_candev(dev);
> > + pm_runtime_put(priv->dev);
> >
> > can_led_event(dev, CAN_LED_EVENT_STOP);
> >
> > @@ -1349,18 +1359,14 @@ static int register_flexcandev(struct
> > net_device
> > *dev)
> > struct flexcan_regs __iomem *regs = priv->regs;
> > u32 reg, err;
> >
> > - err = clk_prepare_enable(priv->clk_ipg);
> > + err = flexcan_clks_enable(priv);
> > if (err)
> > return err;
> >
> > - err = clk_prepare_enable(priv->clk_per);
> > - if (err)
> > - goto out_disable_ipg;
> > -
> > /* select "bus clock", chip must be disabled */
> > err = flexcan_chip_disable(priv);
> > if (err)
> > - goto out_disable_per;
> > + goto out_disable_clks;
> > reg = priv->read(®s->ctrl);
> > reg |= FLEXCAN_CTRL_CLK_SRC;
> > priv->write(reg, ®s->ctrl);
> > @@ -1389,14 +1395,13 @@ static int register_flexcandev(struct
> > net_device
> > *dev)
> >
> > err = register_candev(dev);
> >
> > - /* disable core and turn off clocks */
> > - out_chip_disable:
> > flexcan_chip_disable(priv);
> > - out_disable_per:
> > - clk_disable_unprepare(priv->clk_per);
> > - out_disable_ipg:
> > - clk_disable_unprepare(priv->clk_ipg);
> > + return 0;
> >
> > + out_chip_disable:
> > + flexcan_chip_disable(priv);
> > + out_disable_clks:
> > + flexcan_clks_disable(priv);
> > return err;
> > }
> >
> > @@ -1556,6 +1561,7 @@ static int flexcan_probe(struct platform_device
> > *pdev)
> > priv->write = flexcan_write_le;
> > }
> >
> > + priv->dev = &pdev->dev;
> > priv->can.clock.freq = clock_freq;
> > priv->can.bittiming_const = &flexcan_bittiming_const;
> > priv->can.do_set_mode = flexcan_set_mode; @@ -1569,6 +1575,10 @@
> > static int flexcan_probe(struct platform_device *pdev)
> > priv->devtype_data = devtype_data;
> > priv->reg_xceiver = reg_xceiver;
> >
> > + pm_runtime_get_noresume(&pdev->dev);
> > + pm_runtime_set_active(&pdev->dev);
> > + pm_runtime_enable(&pdev->dev);
> > +
> > err = register_flexcandev(dev);
> > if (err) {
> > dev_err(&pdev->dev, "registering netdev failed\n"); @@ -1586,6
> > +1596,7 @@ static int flexcan_probe(struct platform_device *pdev)
> > dev_info(&pdev->dev, "device registered (reg_base=%p, irq=%d)\n",
> > priv->regs, dev->irq);
> >
> > + pm_runtime_put(&pdev->dev);
> > return 0;
> >
> > failed_register:
> > @@ -1598,6 +1609,7 @@ static int flexcan_remove(struct platform_device
> > *pdev)
> > struct net_device *dev = platform_get_drvdata(pdev);
> >
> > unregister_flexcandev(dev);
> > + pm_runtime_disable(&pdev->dev);
> > free_candev(dev);
> >
> > return 0;
> > @@ -1607,7 +1619,7 @@ static int __maybe_unused
> flexcan_suspend(struct
> > device *device) {
> > struct net_device *dev = dev_get_drvdata(device);
> > struct flexcan_priv *priv = netdev_priv(dev);
> > - int err;
> > + int err = 0;
> >
> > if (netif_running(dev)) {
> > /* if wakeup is enabled, enter stop mode @@ -1620,20 +1632,22
> @@
> > static int __maybe_unused flexcan_suspend(struct device *device)
> > err = flexcan_chip_disable(priv);
> > if (err)
> > return err;
> > +
> > + err = pm_runtime_force_suspend(device);
> > }
> > netif_stop_queue(dev);
> > netif_device_detach(dev);
> > }
> > priv->can.state = CAN_STATE_SLEEPING;
> >
> > - return 0;
> > + return err;
> > }
> >
> > static int __maybe_unused flexcan_resume(struct device *device) {
> > struct net_device *dev = dev_get_drvdata(device);
> > struct flexcan_priv *priv = netdev_priv(dev);
> > - int err;
> > + int err = 0;
> >
> > priv->can.state = CAN_STATE_ERROR_ACTIVE;
> > if (netif_running(dev)) {
> > @@ -1642,14 +1656,34 @@ static int __maybe_unused
> > flexcan_resume(struct device *device)
> > if (device_may_wakeup(device)) {
> > disable_irq_wake(dev->irq);
> > } else {
> > - err = flexcan_chip_enable(priv);
> > + err = pm_runtime_force_resume(device);
> > if (err)
> > return err;
> > +
> > + err = flexcan_chip_enable(priv);
> > }
> > }
> > + return err;
> > +}
> > +
> > +static int __maybe_unused flexcan_runtime_suspend(struct device
> > +*device) {
> > + struct net_device *dev = dev_get_drvdata(device);
> > + struct flexcan_priv *priv = netdev_priv(dev);
> > +
> > + flexcan_clks_disable(priv);
> > +
> > return 0;
> > }
> >
> > +static int __maybe_unused flexcan_runtime_resume(struct device
> > +*device) {
> > + struct net_device *dev = dev_get_drvdata(device);
> > + struct flexcan_priv *priv = netdev_priv(dev);
> > +
> > + return flexcan_clks_enable(priv);
> > +}
> > +
> > static int __maybe_unused flexcan_noirq_suspend(struct device *device)
> {
> > struct net_device *dev = dev_get_drvdata(device); @@ -1676,6 +1710,7
> > @@ static int __maybe_unused flexcan_noirq_resume(struct device
> > *device)
> >
> > static const struct dev_pm_ops flexcan_pm_ops = {
> > SET_SYSTEM_SLEEP_PM_OPS(flexcan_suspend, flexcan_resume)
> > + SET_RUNTIME_PM_OPS(flexcan_runtime_suspend,
> > flexcan_runtime_resume,
> > +NULL)
> > SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(flexcan_noirq_suspend,
> > flexcan_noirq_resume) };
> >
> > --
> > 2.17.1
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox