* [RFC PATCH] ipv6: make ipv6_renew_options() interrupt/kernel safe
From: Paul Moore @ 2018-07-02 3:01 UTC (permalink / raw)
To: netdev; +Cc: Al Viro, selinux, linux-security-module
From: Paul Moore <paul@paul-moore.com>
At present the ipv6_renew_options_kern() function ends up calling into
access_ok() which is problematic if done from inside an interrupt as
access_ok() calls WARN_ON_IN_IRQ() on some (all?) architectures
(x86-64 is affected). Example warning/backtrace is shown below:
WARNING: CPU: 1 PID: 3144 at lib/usercopy.c:11 _copy_from_user+0x85/0x90
...
Call Trace:
<IRQ>
ipv6_renew_option+0xb2/0xf0
ipv6_renew_options+0x26a/0x340
ipv6_renew_options_kern+0x2c/0x40
calipso_req_setattr+0x72/0xe0
netlbl_req_setattr+0x126/0x1b0
selinux_netlbl_inet_conn_request+0x80/0x100
selinux_inet_conn_request+0x6d/0xb0
security_inet_conn_request+0x32/0x50
tcp_conn_request+0x35f/0xe00
? __lock_acquire+0x250/0x16c0
? selinux_socket_sock_rcv_skb+0x1ae/0x210
? tcp_rcv_state_process+0x289/0x106b
tcp_rcv_state_process+0x289/0x106b
? tcp_v6_do_rcv+0x1a7/0x3c0
tcp_v6_do_rcv+0x1a7/0x3c0
tcp_v6_rcv+0xc82/0xcf0
ip6_input_finish+0x10d/0x690
ip6_input+0x45/0x1e0
? ip6_rcv_finish+0x1d0/0x1d0
ipv6_rcv+0x32b/0x880
? ip6_make_skb+0x1e0/0x1e0
__netif_receive_skb_core+0x6f2/0xdf0
? process_backlog+0x85/0x250
? process_backlog+0x85/0x250
? process_backlog+0xec/0x250
process_backlog+0xec/0x250
net_rx_action+0x153/0x480
__do_softirq+0xd9/0x4f7
do_softirq_own_stack+0x2a/0x40
</IRQ>
...
While not present in the backtrace, ipv6_renew_option() ends up calling
access_ok() via the following chain:
access_ok()
_copy_from_user()
copy_from_user()
ipv6_renew_option()
The fix presented in this patch is to perform the userspace copy
earlier in the call chain such that it is only called when the option
data is actually coming from userspace; that place is
do_ipv6_setsockopt(). Not only does this solve the problem seen in
the backtrace above, it also allows us to simplify the code quite a
bit by removing ipv6_renew_options_kern() completely. We also take
this opportunity to cleanup ipv6_renew_options()/ipv6_renew_option()
a small amount as well.
This patch is heavily based on a rough patch by Al Viro. I've taken
his original patch, converted a kmemdup() call in do_ipv6_setsockopt()
to a memdup_user() call, made better use of the e_inval jump target in
the same function, and cleaned up the use of ipv6_renew_option() by
ipv6_renew_options().
CC: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
include/net/ipv6.h | 9 ----
net/ipv6/calipso.c | 9 +---
net/ipv6/exthdrs.c | 108 ++++++++++++----------------------------------
net/ipv6/ipv6_sockglue.c | 27 ++++++++----
4 files changed, 50 insertions(+), 103 deletions(-)
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 16475c269749..d02881e4ad1f 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -355,14 +355,7 @@ struct ipv6_txoptions *ipv6_dup_options(struct sock *sk,
struct ipv6_txoptions *ipv6_renew_options(struct sock *sk,
struct ipv6_txoptions *opt,
int newtype,
- struct ipv6_opt_hdr __user *newopt,
- int newoptlen);
-struct ipv6_txoptions *
-ipv6_renew_options_kern(struct sock *sk,
- struct ipv6_txoptions *opt,
- int newtype,
- struct ipv6_opt_hdr *newopt,
- int newoptlen);
+ struct ipv6_opt_hdr *newopt);
struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
struct ipv6_txoptions *opt);
diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c
index 1323b9679cf7..1c0bb9fb76e6 100644
--- a/net/ipv6/calipso.c
+++ b/net/ipv6/calipso.c
@@ -799,8 +799,7 @@ static int calipso_opt_update(struct sock *sk, struct ipv6_opt_hdr *hop)
{
struct ipv6_txoptions *old = txopt_get(inet6_sk(sk)), *txopts;
- txopts = ipv6_renew_options_kern(sk, old, IPV6_HOPOPTS,
- hop, hop ? ipv6_optlen(hop) : 0);
+ txopts = ipv6_renew_options(sk, old, IPV6_HOPOPTS, hop);
txopt_put(old);
if (IS_ERR(txopts))
return PTR_ERR(txopts);
@@ -1222,8 +1221,7 @@ static int calipso_req_setattr(struct request_sock *req,
if (IS_ERR(new))
return PTR_ERR(new);
- txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS,
- new, new ? ipv6_optlen(new) : 0);
+ txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new);
kfree(new);
@@ -1260,8 +1258,7 @@ static void calipso_req_delattr(struct request_sock *req)
if (calipso_opt_del(req_inet->ipv6_opt->hopopt, &new))
return; /* Nothing to do */
- txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS,
- new, new ? ipv6_optlen(new) : 0);
+ txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new);
if (!IS_ERR(txopts)) {
txopts = xchg(&req_inet->ipv6_opt, txopts);
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 5bc2bf3733ab..1e1d9bc2fd3d 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -1015,29 +1015,21 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt)
}
EXPORT_SYMBOL_GPL(ipv6_dup_options);
-static int ipv6_renew_option(void *ohdr,
- struct ipv6_opt_hdr __user *newopt, int newoptlen,
- int inherit,
- struct ipv6_opt_hdr **hdr,
- char **p)
+static void ipv6_renew_option(int renewtype,
+ struct ipv6_opt_hdr **dest,
+ struct ipv6_opt_hdr *old,
+ struct ipv6_opt_hdr *new,
+ int newtype, char **p)
{
- if (inherit) {
- if (ohdr) {
- memcpy(*p, ohdr, ipv6_optlen((struct ipv6_opt_hdr *)ohdr));
- *hdr = (struct ipv6_opt_hdr *)*p;
- *p += CMSG_ALIGN(ipv6_optlen(*hdr));
- }
- } else {
- if (newopt) {
- if (copy_from_user(*p, newopt, newoptlen))
- return -EFAULT;
- *hdr = (struct ipv6_opt_hdr *)*p;
- if (ipv6_optlen(*hdr) > newoptlen)
- return -EINVAL;
- *p += CMSG_ALIGN(newoptlen);
- }
- }
- return 0;
+ struct ipv6_opt_hdr *src;
+
+ src = (renewtype == newtype ? new : old); /* replaced slot takes @new */
+ if (!src)
+ return; /* nothing to copy into this slot */
+
+ memcpy(*p, src, ipv6_optlen(src));
+ *dest = (struct ipv6_opt_hdr *)*p;
+ *p += CMSG_ALIGN(ipv6_optlen(*dest)); /* advance the caller's cursor */
}
/**
@@ -1063,13 +1055,11 @@ static int ipv6_renew_option(void *ohdr,
*/
struct ipv6_txoptions *
ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
- int newtype,
- struct ipv6_opt_hdr __user *newopt, int newoptlen)
+ int newtype, struct ipv6_opt_hdr *newopt)
{
int tot_len = 0;
char *p;
struct ipv6_txoptions *opt2;
- int err;
if (opt) {
if (newtype != IPV6_HOPOPTS && opt->hopopt)
@@ -1082,8 +1072,8 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst1opt));
}
- if (newopt && newoptlen)
- tot_len += CMSG_ALIGN(newoptlen);
+ if (newopt)
+ tot_len += CMSG_ALIGN(ipv6_optlen(newopt));
if (!tot_len)
return NULL;
@@ -1098,29 +1088,16 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
opt2->tot_len = tot_len;
p = (char *)(opt2 + 1);
- err = ipv6_renew_option(opt ? opt->hopopt : NULL, newopt, newoptlen,
- newtype != IPV6_HOPOPTS,
- &opt2->hopopt, &p);
- if (err)
- goto out;
-
- err = ipv6_renew_option(opt ? opt->dst0opt : NULL, newopt, newoptlen,
- newtype != IPV6_RTHDRDSTOPTS,
- &opt2->dst0opt, &p);
- if (err)
- goto out;
-
- err = ipv6_renew_option(opt ? opt->srcrt : NULL, newopt, newoptlen,
- newtype != IPV6_RTHDR,
- (struct ipv6_opt_hdr **)&opt2->srcrt, &p);
- if (err)
- goto out;
-
- err = ipv6_renew_option(opt ? opt->dst1opt : NULL, newopt, newoptlen,
- newtype != IPV6_DSTOPTS,
- &opt2->dst1opt, &p);
- if (err)
- goto out;
+ ipv6_renew_option(IPV6_HOPOPTS, &opt2->hopopt, /* @opt may be NULL */
+ opt ? opt->hopopt : NULL, newopt, newtype, &p);
+ ipv6_renew_option(IPV6_RTHDRDSTOPTS, &opt2->dst0opt,
+ opt ? opt->dst0opt : NULL, newopt, newtype, &p);
+ ipv6_renew_option(IPV6_RTHDR,
+ (struct ipv6_opt_hdr **)&opt2->srcrt,
+ opt ? (struct ipv6_opt_hdr *)opt->srcrt : NULL,
+ newopt, newtype, &p);
+ ipv6_renew_option(IPV6_DSTOPTS, &opt2->dst1opt,
+ opt ? opt->dst1opt : NULL, newopt, newtype, &p);
opt2->opt_nflen = (opt2->hopopt ? ipv6_optlen(opt2->hopopt) : 0) +
(opt2->dst0opt ? ipv6_optlen(opt2->dst0opt) : 0) +
@@ -1128,37 +1105,6 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
opt2->opt_flen = (opt2->dst1opt ? ipv6_optlen(opt2->dst1opt) : 0);
return opt2;
-out:
- sock_kfree_s(sk, opt2, opt2->tot_len);
- return ERR_PTR(err);
-}
-
-/**
- * ipv6_renew_options_kern - replace a specific ext hdr with a new one.
- *
- * @sk: sock from which to allocate memory
- * @opt: original options
- * @newtype: option type to replace in @opt
- * @newopt: new option of type @newtype to replace (kernel-mem)
- * @newoptlen: length of @newopt
- *
- * See ipv6_renew_options(). The difference is that @newopt is
- * kernel memory, rather than user memory.
- */
-struct ipv6_txoptions *
-ipv6_renew_options_kern(struct sock *sk, struct ipv6_txoptions *opt,
- int newtype, struct ipv6_opt_hdr *newopt,
- int newoptlen)
-{
- struct ipv6_txoptions *ret_val;
- const mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- ret_val = ipv6_renew_options(sk, opt, newtype,
- (struct ipv6_opt_hdr __user *)newopt,
- newoptlen);
- set_fs(old_fs);
- return ret_val;
}
struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 4d780c7f0130..c95c3486d904 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -398,6 +398,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
case IPV6_DSTOPTS:
{
struct ipv6_txoptions *opt;
+ struct ipv6_opt_hdr *new = NULL;
+
+ /* hop-by-hop / destination options are privileged option */
+ retv = -EPERM;
+ if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW))
+ break;
/* remove any sticky options header with a zero option
* length, per RFC3542.
@@ -409,17 +415,22 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
else if (optlen < sizeof(struct ipv6_opt_hdr) ||
optlen & 0x7 || optlen > 8 * 255)
goto e_inval;
-
- /* hop-by-hop / destination options are privileged option */
- retv = -EPERM;
- if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW))
- break;
+ else {
+ new = memdup_user(optval, optlen);
+ if (IS_ERR(new)) {
+ retv = PTR_ERR(new);
+ break;
+ }
+ if (unlikely(ipv6_optlen(new) > optlen)) {
+ kfree(new);
+ goto e_inval;
+ }
+ }
opt = rcu_dereference_protected(np->opt,
lockdep_sock_is_held(sk));
- opt = ipv6_renew_options(sk, opt, optname,
- (struct ipv6_opt_hdr __user *)optval,
- optlen);
+ opt = ipv6_renew_options(sk, opt, optname, new);
+ kfree(new);
if (IS_ERR(opt)) {
retv = PTR_ERR(opt);
break;
^ permalink raw reply related
* [PATCH net 0/4] qed*: Fix series.
From: Sudarsana Reddy Kalluru @ 2018-07-02 3:03 UTC (permalink / raw)
To: davem; +Cc: netdev, Michal.Kalderon, Sudarsana Reddy Kalluru
From: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
The patch series addresses a few issues in the qed* drivers.
Please consider applying it to the 'net' branch.
Sudarsana Reddy Kalluru (4):
qed: Limit msix vectors in kdump kernel to the minimum required count.
qed: Fix setting of incorrect eswitch mode.
qed: Fix use of incorrect size in memcpy call.
qede: Advertise software timestamp caps when PHC is not available.
drivers/net/ethernet/qlogic/qed/qed_dcbx.c | 8 ++++----
drivers/net/ethernet/qlogic/qed/qed_dev.c | 2 +-
drivers/net/ethernet/qlogic/qed/qed_main.c | 8 ++++++++
drivers/net/ethernet/qlogic/qed/qed_sriov.c | 19 +++++++++++++++++--
drivers/net/ethernet/qlogic/qede/qede_ptp.c | 10 ++++++++--
5 files changed, 38 insertions(+), 9 deletions(-)
--
1.8.3.1
^ permalink raw reply
* [PATCH net 1/4] qed: Limit msix vectors in kdump kernel to the minimum required count.
From: Sudarsana Reddy Kalluru @ 2018-07-02 3:03 UTC (permalink / raw)
To: davem; +Cc: netdev, Michal.Kalderon
In-Reply-To: <20180702030308.16944-1-sudarsana.kalluru@cavium.com>
Memory size is limited in the kdump kernel environment. Allocation of more
msix-vectors (or queues) consumes a few tens of MBs of memory, which might
lead to kdump kernel failure.
This patch adds changes to limit the number of MSI-X vectors in the kdump
kernel to the minimum required value (i.e., 2 per engine).
Fixes: fe56b9e6a ("qed: Add module with basic common support")
Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Michal Kalderon <Michal.Kalderon@cavium.com>
---
drivers/net/ethernet/qlogic/qed/qed_main.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 5c10fd7..0cbc74d 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -789,6 +789,14 @@ static int qed_slowpath_setup_int(struct qed_dev *cdev,
/* We want a minimum of one slowpath and one fastpath vector per hwfn */
cdev->int_params.in.min_msix_cnt = cdev->num_hwfns * 2;
+ if (is_kdump_kernel()) {
+ DP_INFO(cdev,
+ "Kdump kernel: Limit the max number of requested MSI-X vectors to %hd\n",
+ cdev->int_params.in.min_msix_cnt);
+ cdev->int_params.in.num_vectors =
+ cdev->int_params.in.min_msix_cnt;
+ }
+
rc = qed_set_int_mode(cdev, false);
if (rc) {
DP_ERR(cdev, "qed_slowpath_setup_int ERR\n");
--
1.8.3.1
^ permalink raw reply related
* [PATCH net 3/4] qed: Fix use of incorrect size in memcpy call.
From: Sudarsana Reddy Kalluru @ 2018-07-02 3:03 UTC (permalink / raw)
To: davem; +Cc: netdev, Michal.Kalderon
In-Reply-To: <20180702030308.16944-1-sudarsana.kalluru@cavium.com>
Use the correct size value while copying chassis/port id values.
Fixes: 6ad8c632e ("qed: Add support for query/config dcbx.")
Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Michal Kalderon <Michal.Kalderon@cavium.com>
---
drivers/net/ethernet/qlogic/qed/qed_dcbx.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
index f0b0138..e0680ce9 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c
@@ -709,9 +709,9 @@ static int qed_dcbx_process_mib_info(struct qed_hwfn *p_hwfn)
p_local = &p_hwfn->p_dcbx_info->lldp_local[LLDP_NEAREST_BRIDGE];
memcpy(params->lldp_local.local_chassis_id, p_local->local_chassis_id,
- ARRAY_SIZE(p_local->local_chassis_id));
+ sizeof(p_local->local_chassis_id));
memcpy(params->lldp_local.local_port_id, p_local->local_port_id,
- ARRAY_SIZE(p_local->local_port_id));
+ sizeof(p_local->local_port_id));
}
static void
@@ -723,9 +723,9 @@ static int qed_dcbx_process_mib_info(struct qed_hwfn *p_hwfn)
p_remote = &p_hwfn->p_dcbx_info->lldp_remote[LLDP_NEAREST_BRIDGE];
memcpy(params->lldp_remote.peer_chassis_id, p_remote->peer_chassis_id,
- ARRAY_SIZE(p_remote->peer_chassis_id));
+ sizeof(p_remote->peer_chassis_id));
memcpy(params->lldp_remote.peer_port_id, p_remote->peer_port_id,
- ARRAY_SIZE(p_remote->peer_port_id));
+ sizeof(p_remote->peer_port_id));
}
static int
--
1.8.3.1
^ permalink raw reply related
* [PATCH net 2/4] qed: Fix setting of incorrect eswitch mode.
From: Sudarsana Reddy Kalluru @ 2018-07-02 3:03 UTC (permalink / raw)
To: davem; +Cc: netdev, Michal.Kalderon
In-Reply-To: <20180702030308.16944-1-sudarsana.kalluru@cavium.com>
By default, the driver incorrectly sets the eswitch mode to VEB (virtual
Ethernet bridging).
The VEB eswitch mode needs to be set only when sriov is enabled; the mode
should be set to NONE by default. The patch incorporates this change.
Fixes: 0fefbfbaa ("qed*: Management firmware - notifications and defaults")
Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Michal Kalderon <Michal.Kalderon@cavium.com>
---
drivers/net/ethernet/qlogic/qed/qed_dev.c | 2 +-
drivers/net/ethernet/qlogic/qed/qed_sriov.c | 19 +++++++++++++++++--
2 files changed, 18 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 329781c..e5249b4 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -1804,7 +1804,7 @@ int qed_hw_init(struct qed_dev *cdev, struct qed_hw_init_params *p_params)
DP_INFO(p_hwfn, "Failed to update driver state\n");
rc = qed_mcp_ov_update_eswitch(p_hwfn, p_hwfn->p_main_ptt,
- QED_OV_ESWITCH_VEB);
+ QED_OV_ESWITCH_NONE);
if (rc)
DP_INFO(p_hwfn, "Failed to update eswitch mode\n");
}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index f01bf52..fd59cf4 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -4513,6 +4513,8 @@ static void qed_sriov_enable_qid_config(struct qed_hwfn *hwfn,
static int qed_sriov_enable(struct qed_dev *cdev, int num)
{
struct qed_iov_vf_init_params params;
+ struct qed_hwfn *hwfn;
+ struct qed_ptt *ptt;
int i, j, rc;
if (num >= RESC_NUM(&cdev->hwfns[0], QED_VPORT)) {
@@ -4525,8 +4527,8 @@ static int qed_sriov_enable(struct qed_dev *cdev, int num)
/* Initialize HW for VF access */
for_each_hwfn(cdev, j) {
- struct qed_hwfn *hwfn = &cdev->hwfns[j];
- struct qed_ptt *ptt = qed_ptt_acquire(hwfn);
+ hwfn = &cdev->hwfns[j];
+ ptt = qed_ptt_acquire(hwfn);
/* Make sure not to use more than 16 queues per VF */
params.num_queues = min_t(int,
@@ -4562,6 +4564,19 @@ static int qed_sriov_enable(struct qed_dev *cdev, int num)
goto err;
}
+ hwfn = QED_LEADING_HWFN(cdev);
+ ptt = qed_ptt_acquire(hwfn);
+ if (!ptt) {
+ DP_ERR(hwfn, "Failed to acquire ptt\n");
+ rc = -EBUSY;
+ goto err;
+ }
+
+ rc = qed_mcp_ov_update_eswitch(hwfn, ptt, QED_OV_ESWITCH_VEB);
+ if (rc)
+ DP_INFO(cdev, "Failed to update eswitch mode\n");
+ qed_ptt_release(hwfn, ptt);
+
return num;
err:
--
1.8.3.1
^ permalink raw reply related
* [PATCH net 4/4] qede: Advertise software timestamp caps when PHC is not available.
From: Sudarsana Reddy Kalluru @ 2018-07-02 3:03 UTC (permalink / raw)
To: davem; +Cc: netdev, Michal.Kalderon
In-Reply-To: <20180702030308.16944-1-sudarsana.kalluru@cavium.com>
When ptp clock is not available for a PF (e.g., higher PFs in NPAR mode),
get-tsinfo() callback should return the software timestamp capabilities
instead of returning the error.
Fixes: 4c55215c ("qede: Add driver support for PTP")
Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Michal Kalderon <Michal.Kalderon@cavium.com>
---
drivers/net/ethernet/qlogic/qede/qede_ptp.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ptp.c b/drivers/net/ethernet/qlogic/qede/qede_ptp.c
index 02adb513..013ff56 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ptp.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ptp.c
@@ -337,8 +337,14 @@ int qede_ptp_get_ts_info(struct qede_dev *edev, struct ethtool_ts_info *info)
{
struct qede_ptp *ptp = edev->ptp;
- if (!ptp)
- return -EIO;
+ if (!ptp) {
+ info->so_timestamping = SOF_TIMESTAMPING_TX_SOFTWARE |
+ SOF_TIMESTAMPING_RX_SOFTWARE |
+ SOF_TIMESTAMPING_SOFTWARE;
+ info->phc_index = -1;
+
+ return 0;
+ }
info->so_timestamping = SOF_TIMESTAMPING_TX_SOFTWARE |
SOF_TIMESTAMPING_RX_SOFTWARE |
--
1.8.3.1
^ permalink raw reply related
* Re: [PATCH vhost] vhost_net: Fix too many vring kick on busypoll
From: Jason Wang @ 2018-07-02 3:05 UTC (permalink / raw)
To: Toshiaki Makita, Michael S. Tsirkin; +Cc: netdev, kvm, virtualization
In-Reply-To: <c6ef8c74-7efa-707b-d62d-471636dd0a7f@lab.ntt.co.jp>
On 2018年07月02日 10:52, Toshiaki Makita wrote:
> On 2018/07/02 11:41, Jason Wang wrote:
>> On 2018年06月30日 00:38, Michael S. Tsirkin wrote:
>>> On Fri, Jun 29, 2018 at 05:09:50PM +0900, Toshiaki Makita wrote:
>>>> Under heavy load vhost busypoll may run without suppressing
>>>> notification. For example tx zerocopy callback can push tx work while
>>>> handle_tx() is running, then busyloop exits due to vhost_has_work()
>>>> condition and enables notification but immediately reenters handle_tx()
>>>> because the pushed work was tx. In this case handle_tx() tries to
>>>> disable notification again, but when using event_idx it by design
>>>> cannot. Then busyloop will run without suppressing notification.
>>>> Another example is the case where handle_tx() tries to enable
>>>> notification but avail idx is advanced so disables it again. This case
>>>> also lead to the same situation with event_idx.
>>>>
>>>> The problem is that once we enter this situation busyloop does not work
>>>> under heavy load for considerable amount of time, because notification
>>>> is likely to happen during busyloop and handle_tx() immediately enables
>>>> notification after notification happens. Specifically busyloop detects
>>>> notification by vhost_has_work() and then handle_tx() calls
>>>> vhost_enable_notify().
>>> I'd like to understand the problem a bit better.
>>> Why does this happen?
>>> Doesn't this only happen if ring is empty?
>>>
>> My understanding is:
>>
>> vhost_zerocopy_callback() try to poll vhost virtqueue. This will cause
>> the busy loop in vhost_net_tx_get_vq_desc() to exit because of
>> vhost_has_work() return true. Then handle_tx() tends to enable
>> notification. Then guest may kick us even if handle_tx() call
>> vhost_disable_notify() which in fact did nothing for event index.
> Yes.
>
>> Maybe we can try to call vhost_zerocopy_signal_used() if we found
>> there's pending used from zerocopy instead.
> Note that even when zerocopy is disabled the problem happens as I wrote.
> When vhost_enable_notify() detects avail_idx advanced it tries to
> disable notification again but it fails.
>
Yes, and the main reason is need_resched() and rx work. (polling RX will
be addressed by Tonghao's patch I think).
Thanks
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* [PATCH net-next 1/1] net sched actions: add extack messages in pedit action
From: Roman Mashak @ 2018-07-02 4:02 UTC (permalink / raw)
To: davem; +Cc: netdev, kernel, jhs, xiyou.wangcong, jiri, Roman Mashak
Signed-off-by: Roman Mashak <mrv@mojatatu.com>
---
net/sched/act_pedit.c | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index ab151346d3d4..55bc96b610e8 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -144,8 +144,10 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
int ret = 0, err;
int ksize;
- if (!nla)
+ if (!nla) {
+ NL_SET_ERR_MSG_MOD(extack, "Pedit requires attributes to be passed");
return -EINVAL;
+ }
err = nla_parse_nested(tb, TCA_PEDIT_MAX, nla, pedit_policy, NULL);
if (err < 0)
@@ -154,21 +156,27 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
pattr = tb[TCA_PEDIT_PARMS];
if (!pattr)
pattr = tb[TCA_PEDIT_PARMS_EX];
- if (!pattr)
+ if (!pattr) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing required TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute");
return -EINVAL;
+ }
parm = nla_data(pattr);
ksize = parm->nkeys * sizeof(struct tc_pedit_key);
- if (nla_len(pattr) < sizeof(*parm) + ksize)
+ if (nla_len(pattr) < sizeof(*parm) + ksize) {
+ NL_SET_ERR_MSG_ATTR(extack, pattr, "Length of TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute is invalid");
return -EINVAL;
+ }
keys_ex = tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys);
if (IS_ERR(keys_ex))
return PTR_ERR(keys_ex);
if (!tcf_idr_check(tn, parm->index, a, bind)) {
- if (!parm->nkeys)
+ if (!parm->nkeys) {
+ NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed");
return -EINVAL;
+ }
ret = tcf_idr_create(tn, parm->index, est, a,
&act_pedit_ops, bind, false);
if (ret)
--
2.7.4
^ permalink raw reply related
* Re: [PATCH net-next v3 3/4] net: vhost: factor out busy polling logic to vhost_net_busy_poll()
From: Tonghao Zhang @ 2018-07-02 4:05 UTC (permalink / raw)
To: jasowang
Cc: Linux Kernel Network Developers, virtualization, Tonghao Zhang,
mst
In-Reply-To: <fa6a368a-7a01-2467-ba63-322eef4f544a@redhat.com>
On Mon, Jul 2, 2018 at 10:29 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
>
> On 2018年06月30日 14:33, xiangxia.m.yue@gmail.com wrote:
> > From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> >
> > Factor out generic busy polling logic and will be
> > used for tx path in the next patch. And with the patch,
> > qemu can set differently the busyloop_timeout for rx queue.
> >
> > Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
> > ---
> > drivers/vhost/net.c | 92 ++++++++++++++++++++++++++++++-----------------------
> > 1 file changed, 53 insertions(+), 39 deletions(-)
> >
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index 62bb8e8..458f81d 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -429,6 +429,50 @@ static int vhost_net_enable_vq(struct vhost_net *n,
> > return vhost_poll_start(poll, sock->file);
> > }
> >
> > +static int sk_has_rx_data(struct sock *sk)
> > +{
> > + struct socket *sock = sk->sk_socket;
> > +
> > + if (sock->ops->peek_len)
> > + return sock->ops->peek_len(sock);
> > +
> > + return skb_queue_empty(&sk->sk_receive_queue);
> > +}
> > +
> > +static void vhost_net_busy_poll(struct vhost_net *net,
> > + struct vhost_virtqueue *rvq,
> > + struct vhost_virtqueue *tvq,
> > + bool rx)
> > +{
> > + unsigned long uninitialized_var(endtime);
> > + struct socket *sock = rvq->private_data;
> > + struct vhost_virtqueue *vq = rx ? tvq : rvq;
> > + unsigned long busyloop_timeout = rx ? rvq->busyloop_timeout :
> > + tvq->busyloop_timeout;
>
> As simple as vq->busyloop_timeout?
maybe we should allow user set busyloop_timeout for rx or tx
differently. this code should be moved under mutex.
> > +
> > + mutex_lock_nested(&vq->mutex, rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);
>
> We need move sock = rvq->private_data under the protection of vq mutex
> if rx is false.
yes, thanks for your review.
> > + vhost_disable_notify(&net->dev, vq);
> > +
> > + preempt_disable();
> > + endtime = busy_clock() + busyloop_timeout;
> > + while (vhost_can_busy_poll(tvq->dev, endtime) &&
> > + !(sock && sk_has_rx_data(sock->sk)) &&
> > + vhost_vq_avail_empty(tvq->dev, tvq))
> > + cpu_relax();
> > + preempt_enable();
> > +
> > + if ((rx && !vhost_vq_avail_empty(&net->dev, vq)) ||
> > + (!rx && (sock && sk_has_rx_data(sock->sk)))) {
> > + vhost_poll_queue(&vq->poll);
> > + } else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> > + vhost_disable_notify(&net->dev, vq);
> > + vhost_poll_queue(&vq->poll);
> > + }
> > +
> > + mutex_unlock(&vq->mutex);
> > +}
> > +
> > +
> > static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
> > struct vhost_virtqueue *vq,
> > struct iovec iov[], unsigned int iov_size,
> > @@ -621,16 +665,6 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
> > return len;
> > }
> >
> > -static int sk_has_rx_data(struct sock *sk)
> > -{
> > - struct socket *sock = sk->sk_socket;
> > -
> > - if (sock->ops->peek_len)
> > - return sock->ops->peek_len(sock);
> > -
> > - return skb_queue_empty(&sk->sk_receive_queue);
> > -}
> > -
> > static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
> > {
> > struct vhost_virtqueue *vq = &nvq->vq;
> > @@ -645,39 +679,19 @@ static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
> >
> > static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
> > {
> > - struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
> > - struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
> > - struct vhost_virtqueue *vq = &nvq->vq;
> > - unsigned long uninitialized_var(endtime);
> > - int len = peek_head_len(rvq, sk);
> > + struct vhost_net_virtqueue *nvq_rx = &net->vqs[VHOST_NET_VQ_RX];
> > + struct vhost_net_virtqueue *nvq_tx = &net->vqs[VHOST_NET_VQ_TX];
>
> It looks to me rnvq and tnvq is slightly better.
yes. patch 4 will also update.
> Other looks good to me.
>
> Thanks
>
> >
> > - if (!len && vq->busyloop_timeout) {
> > - /* Flush batched heads first */
> > - vhost_rx_signal_used(rvq);
> > - /* Both tx vq and rx socket were polled here */
> > - mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
> > - vhost_disable_notify(&net->dev, vq);
> > + int len = peek_head_len(nvq_rx, sk);
> >
> > - preempt_disable();
> > - endtime = busy_clock() + vq->busyloop_timeout;
> > -
> > - while (vhost_can_busy_poll(&net->dev, endtime) &&
> > - !sk_has_rx_data(sk) &&
> > - vhost_vq_avail_empty(&net->dev, vq))
> > - cpu_relax();
> > -
> > - preempt_enable();
> > -
> > - if (!vhost_vq_avail_empty(&net->dev, vq))
> > - vhost_poll_queue(&vq->poll);
> > - else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> > - vhost_disable_notify(&net->dev, vq);
> > - vhost_poll_queue(&vq->poll);
> > - }
> > + if (!len && nvq_rx->vq.busyloop_timeout) {
> > + /* Flush batched heads first */
> > + vhost_rx_signal_used(nvq_rx);
> >
> > - mutex_unlock(&vq->mutex);
> > + /* Both tx vq and rx socket were polled here */
> > + vhost_net_busy_poll(net, &nvq_rx->vq, &nvq_tx->vq, true);
> >
> > - len = peek_head_len(rvq, sk);
> > + len = peek_head_len(nvq_rx, sk);
> > }
> >
> > return len;
>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH vhost] vhost_net: Fix too many vring kick on busypoll
From: Toshiaki Makita @ 2018-07-02 4:37 UTC (permalink / raw)
To: Jason Wang, Michael S. Tsirkin; +Cc: netdev, Tonghao Zhang, kvm, virtualization
In-Reply-To: <4dc58c49-f804-b720-6e50-5867dc32d7ec@redhat.com>
On 2018/07/02 11:54, Jason Wang wrote:
> On 2018年07月02日 10:45, Toshiaki Makita wrote:
>> Hi Jason,
>>
>> On 2018/06/29 18:30, Jason Wang wrote:
>>> On 2018年06月29日 16:09, Toshiaki Makita wrote:
>> ...
>>>> To fix this, poll the work instead of enabling notification when
>>>> busypoll is interrupted by something. IMHO signal_pending() and
>>>> vhost_has_work() are kind of interruptions rather than signals to
>>>> completely cancel the busypoll, so let's run busypoll after the
>>>> necessary work is done. In order to avoid too long busyloop due to
>>>> interruption, save the endtime in vq field and use it when reentering
>>>> the work function.
>>> I think we don't care long busyloop unless e.g tx can starve rx?
>> I just want to keep it user-controllable. Unless memorizing it busypoll
>> can run unexpectedly long.
>
> I think the total amount of time for busy polling is bounded. If I was
> wrong, it should be a bug somewhere.
Consider this kind of scenario:
0. Set 100us busypoll for example.
1. handle_tx() runs busypoll.
2. Something like zerocopy queues tx_work within 100us.
3. busypoll exits and call handle_tx() again.
4. Repeat 1-3.
In this case handle_tx() does not process packets but busypoll
essentially runs beyond 100us without endtime memorized. This may be
just a theoretical problem, but I was worried that more code to poll tx
queue can be added in the future and it becomes realistic.
>>>> Performance numbers:
>>>>
>>>> - Bulk transfer from guest to external physical server.
>>>> [Guest]->vhost_net->tap--(XDP_REDIRECT)-->i40e --(wire)-->
>>>> [Server]
>>> Just to confirm in this case since zerocopy is enabled, we are in fact
>>> use the generic XDP datapath?
>> For some reason zerocopy was not applied for most packets, so in most
>> cases driver XDP was used. I was going to dig into it but not yet.
>
> Right, just to confirm this. This is expected.
>
> In tuntap, we do native XDP only for small and non zerocopy packets. See
> tun_can_build_skb(). The reason is XDP may adjust packet header which is
> not supported by zerocopy. We can only use XDP generic for zerocopy in
> this case.
I think I understand when driver XDP can be used. What I'm not sure and
was going to narrow down is why zerocopy is mostly not applied.
--
Toshiaki Makita
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [net-next 01/12] net/mlx5e: Add UDP GSO support
From: Boris Pismenny @ 2018-07-02 5:29 UTC (permalink / raw)
To: Willem de Bruijn, Alexander Duyck
Cc: David Miller, Network Development, Saeed Mahameed, ogerlitz,
yossiku
In-Reply-To: <CAF=yD-+XSKomatb19y48ZTNrJbmASFRHt1khy0TxdjPbgiZXkg@mail.gmail.com>
On 7/2/2018 4:45 AM, Willem de Bruijn wrote:
>>> I've noticed that we could get cleaner code in our driver if we remove
>>> these two lines from net/ipv4/udp_offload.c:
>>> if (skb_is_gso(segs))
>>> mss *= skb_shinfo(segs)->gso_segs;
>>>
>>> I think that this is correct in case of GSO_PARTIAL segmentation for the
>>> following reasons:
>>> 1. After this change the UDP payload field is consistent with the IP
>>> header payload length field. Currently, IPv4 length is 1500 and UDP
>>> total length is the full unsegmented length.
>
> How does this simplify the driver? Does it currently have to
> change the udph->length field to the mss on the wire, because the
> device only splits + replicates the headers + computes the csum?
Yes, this is the code I have at the moment.
The device's limitation is more subtle than this. It could adjust the
length, but then the checksum would be wrong.
^ permalink raw reply
* [PATCH net-next] net: phy: realtek: add support for RTL8211C
From: Heiner Kallweit @ 2018-07-02 6:08 UTC (permalink / raw)
To: David Miller, Andrew Lunn, Florian Fainelli,
Realtek linux nic maintainers
Cc: netdev@vger.kernel.org
RTL8211C has an issue when operating in Gigabit slave mode, therefore
genphy driver can't be used. See also this U-boot change.
https://lists.denx.de/pipermail/u-boot/2016-March/249712.html
Add a PHY driver for this chip with the quirk to force Gigabit master
mode. As a note: This will make it impossible to connect two network
ports directly if both are driven by an RTL8211C.
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
drivers/net/phy/realtek.c | 17 +++++++++++++++++
1 file changed, 17 insertions(+)
diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c
index 082fb40c..eb6cb2cc 100644
--- a/drivers/net/phy/realtek.c
+++ b/drivers/net/phy/realtek.c
@@ -128,6 +128,15 @@ static int rtl8211f_config_intr(struct phy_device *phydev)
return phy_write_paged(phydev, 0xa42, RTL821x_INER, val);
}
+static int rtl8211c_config_init(struct phy_device *phydev)
+{
+ /* RTL8211C has an issue when operating in Gigabit slave mode */
+ phy_set_bits(phydev, MII_CTRL1000,
+ CTL1000_ENABLE_MASTER | CTL1000_AS_MASTER);
+
+ return genphy_config_init(phydev);
+}
+
static int rtl8211f_config_init(struct phy_device *phydev)
{
int ret;
@@ -190,6 +199,14 @@ static struct phy_driver realtek_drvs[] = {
.write_mmd = &genphy_write_mmd_unsupported,
.suspend = rtl8211b_suspend,
.resume = rtl8211b_resume,
+ }, {
+ .phy_id = 0x001cc913,
+ .name = "RTL8211C Gigabit Ethernet",
+ .phy_id_mask = 0x001fffff,
+ .features = PHY_GBIT_FEATURES,
+ .config_init = rtl8211c_config_init,
+ .read_mmd = &genphy_read_mmd_unsupported,
+ .write_mmd = &genphy_write_mmd_unsupported,
}, {
.phy_id = 0x001cc914,
.name = "RTL8211DN Gigabit Ethernet",
--
2.18.0
^ permalink raw reply related
* [PATCH] net: phy: marvell: change default m88e1510 LED configuration
From: Wang Dongsheng @ 2018-07-02 6:15 UTC (permalink / raw)
To: andrew
Cc: clemens.gruber, davem, kstewart, pombredanne, tglx, gregkh,
netdev, Wang Dongsheng
The m88e1121 LED default configuration does not apply to m88e151x.
So add a function to replace the m88e1121 LED configuration.
Signed-off-by: Wang Dongsheng <dongsheng.wang@hxt-semitech.com>
---
drivers/net/phy/marvell.c | 54 ++++++++++++++++++++++++-------------
include/linux/marvell_phy.h | 2 ++
2 files changed, 38 insertions(+), 18 deletions(-)
diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index b8f57e9b9379..1cd439bdf608 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -130,8 +130,9 @@
#define MII_88E1318S_PHY_WOL_CTRL_CLEAR_WOL_STATUS BIT(12)
#define MII_88E1318S_PHY_WOL_CTRL_MAGIC_PACKET_MATCH_ENABLE BIT(14)
-#define MII_88E1121_PHY_LED_CTRL 16
+#define MII_PHY_LED_CTRL 16
#define MII_88E1121_PHY_LED_DEF 0x0030
+#define MII_88E1510_PHY_LED_DEF 0x1177
#define MII_M1011_PHY_STATUS 0x11
#define MII_M1011_PHY_STATUS_1000 0x8000
@@ -632,8 +633,40 @@ static int m88e1510_config_aneg(struct phy_device *phydev)
return err;
}
+static void marvell_config_led(struct phy_device *phydev)
+{
+ u16 def_config;
+ int err;
+
+ switch (MARVELL_PHY_FAMILY_ID(phydev->phy_id)) {
+ /* Default PHY LED config: LED[0] .. Link, LED[1] .. Activity */
+ case MARVELL_PHY_FAMILY_ID(MARVELL_PHY_ID_88E1121R):
+ case MARVELL_PHY_FAMILY_ID(MARVELL_PHY_ID_88E1318S):
+ def_config = MII_88E1121_PHY_LED_DEF;
+ break;
+ /* Default PHY LED config:
+ * LED[0] .. 1000Mbps Link
+ * LED[1] .. 100Mbps Link
+ * LED[2] .. Blink, Activity
+ */
+ case MARVELL_PHY_FAMILY_ID(MARVELL_PHY_ID_88E1510):
+ def_config = MII_88E1510_PHY_LED_DEF;
+ break;
+ default:
+ return;
+ }
+
+ err = phy_write_paged(phydev, MII_MARVELL_LED_PAGE, MII_PHY_LED_CTRL,
+ def_config);
+ if (err < 0)
+ pr_warn("Fail to config marvell phy LED.\n");
+}
+
static int marvell_config_init(struct phy_device *phydev)
{
+ /* Set default LED */
+ marvell_config_led(phydev);
+
/* Set registers from marvell,reg-init DT property */
return marvell_of_reg_init(phydev);
}
@@ -813,21 +846,6 @@ static int m88e1111_config_init(struct phy_device *phydev)
return genphy_soft_reset(phydev);
}
-static int m88e1121_config_init(struct phy_device *phydev)
-{
- int err;
-
- /* Default PHY LED config: LED[0] .. Link, LED[1] .. Activity */
- err = phy_write_paged(phydev, MII_MARVELL_LED_PAGE,
- MII_88E1121_PHY_LED_CTRL,
- MII_88E1121_PHY_LED_DEF);
- if (err < 0)
- return err;
-
- /* Set marvell,reg-init configuration from device tree */
- return marvell_config_init(phydev);
-}
-
static int m88e1318_config_init(struct phy_device *phydev)
{
if (phy_interrupt_is_valid(phydev)) {
@@ -841,7 +859,7 @@ static int m88e1318_config_init(struct phy_device *phydev)
return err;
}
- return m88e1121_config_init(phydev);
+ return marvell_config_init(phydev);
}
static int m88e1510_config_init(struct phy_device *phydev)
@@ -2087,7 +2105,7 @@ static struct phy_driver marvell_drivers[] = {
.features = PHY_GBIT_FEATURES,
.flags = PHY_HAS_INTERRUPT,
.probe = &m88e1121_probe,
- .config_init = &m88e1121_config_init,
+ .config_init = &marvell_config_init,
.config_aneg = &m88e1121_config_aneg,
.read_status = &marvell_read_status,
.ack_interrupt = &marvell_ack_interrupt,
diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h
index 4f5f8c21e283..1eb6f244588d 100644
--- a/include/linux/marvell_phy.h
+++ b/include/linux/marvell_phy.h
@@ -27,6 +27,8 @@
*/
#define MARVELL_PHY_ID_88E6390 0x01410f90
+#define MARVELL_PHY_FAMILY_ID(id) ((id) >> 4)
+
/* struct phy_device dev_flags definitions */
#define MARVELL_PHY_M1145_FLAGS_RESISTANCE 0x00000001
#define MARVELL_PHY_M1118_DNS323_LEDS 0x00000002
--
2.18.0
^ permalink raw reply related
* Re: [PATCH vhost] vhost_net: Fix too many vring kick on busypoll
From: Jason Wang @ 2018-07-02 6:17 UTC (permalink / raw)
To: Toshiaki Makita, Michael S. Tsirkin
Cc: kvm, virtualization, netdev, Tonghao Zhang
In-Reply-To: <23eaea23-2288-e00a-88df-f13eeb890e89@lab.ntt.co.jp>
On 2018年07月02日 12:37, Toshiaki Makita wrote:
> On 2018/07/02 11:54, Jason Wang wrote:
>> On 2018年07月02日 10:45, Toshiaki Makita wrote:
>>> Hi Jason,
>>>
>>> On 2018/06/29 18:30, Jason Wang wrote:
>>>> On 2018年06月29日 16:09, Toshiaki Makita wrote:
>>> ...
>>>>> To fix this, poll the work instead of enabling notification when
>>>>> busypoll is interrupted by something. IMHO signal_pending() and
>>>>> vhost_has_work() are kind of interruptions rather than signals to
>>>>> completely cancel the busypoll, so let's run busypoll after the
>>>>> necessary work is done. In order to avoid too long busyloop due to
>>>>> interruption, save the endtime in vq field and use it when reentering
>>>>> the work function.
>>>> I think we don't care long busyloop unless e.g tx can starve rx?
>>> I just want to keep it user-controllable. Unless memorizing it busypoll
>>> can run unexpectedly long.
>> I think the total amount of time for busy polling is bounded. If I was
>> wrong, it should be a bug somewhere.
> Consider this kind of scenario:
> 0. Set 100us busypoll for example.
> 1. handle_tx() runs busypoll.
> 2. Something like zerocopy queues tx_work within 100us.
> 3. busypoll exits and call handle_tx() again.
> 4. Repeat 1-3.
>
> In this case handle_tx() does not process packets but busypoll
> essentially runs beyond 100us without endtime memorized. This may be
> just a theoretical problem, but I was worried that more code to poll tx
> queue can be added in the future and it becomes realistic.
Yes, but consider zerocopy tends to batch 16 used packets and we will
finally finish all processing of packets. The above won't be endless, so
it was probably tolerable.
>
>>>>> Performance numbers:
>>>>>
>>>>> - Bulk transfer from guest to external physical server.
>>>>> [Guest]->vhost_net->tap--(XDP_REDIRECT)-->i40e --(wire)-->
>>>>> [Server]
>>>> Just to confirm in this case since zerocopy is enabled, we are in fact
>>>> use the generic XDP datapath?
>>> For some reason zerocopy was not applied for most packets, so in most
>>> cases driver XDP was used. I was going to dig into it but not yet.
>> Right, just to confirm this. This is expected.
>>
>> In tuntap, we do native XDP only for small and non zerocopy packets. See
>> tun_can_build_skb(). The reason is XDP may adjust packet header which is
>> not supported by zerocopy. We can only use XDP generic for zerocopy in
>> this case.
> I think I understand when driver XDP can be used. What I'm not sure and
> was going to narrow down is why zerocopy is mostly not applied.
>
I see, any touch to the zerocopy packet (clone, header expansion or
segmentation) that leads to a userspace copy will increase the error counter
in vhost_net. Then vhost_net_tx_select_zcopy() may choose not to use
zerocopy. So it was probably something in your setup or a bug somewhere.
Thanks
^ permalink raw reply
* [PATCHv2 net-next 0/2] route: add support and selftests for directed broadcast forwarding
From: Xin Long @ 2018-07-02 6:30 UTC (permalink / raw)
To: network dev; +Cc: davem, David Ahern, Davide Caratti, idosch
Patch 1/2 is the feature and 2/2 is the selftest. Check the changelog
on each of them to know the details.
v1->v2:
- fix a typo in changelog.
- fix an uapi break that Davide noticed.
- flush route cache when bc_forwarding is changed.
- add the selftest for this patch as Ido's suggestion.
Xin Long (2):
route: add support for directed broadcast forwarding
selftests: add a selftest for directed broadcast forwarding
include/linux/inetdevice.h | 1 +
include/uapi/linux/ip.h | 1 +
include/uapi/linux/netconf.h | 1 +
net/ipv4/devinet.c | 11 ++
net/ipv4/route.c | 6 +-
.../selftests/net/forwarding/router_broadcast.sh | 142 +++++++++++++++++++++
6 files changed, 161 insertions(+), 1 deletion(-)
create mode 100755 tools/testing/selftests/net/forwarding/router_broadcast.sh
--
2.1.0
^ permalink raw reply
* [PATCHv2 net-next 1/2] route: add support for directed broadcast forwarding
From: Xin Long @ 2018-07-02 6:30 UTC (permalink / raw)
To: network dev; +Cc: davem, David Ahern, Davide Caratti, idosch
In-Reply-To: <cover.1530512974.git.lucien.xin@gmail.com>
This patch implements the feature described in rfc1812#section-5.3.5.2
and rfc2644. It allows the router to forward directed broadcast when
sysctl bc_forwarding is enabled.
Note that this feature could be done by iptables -j TEE, but it would
cause some problems:
- target TEE's gateway param has to be set to a specific address,
which is inflexible, especially when the router wants to forward all
directed broadcasts.
- this duplicates the directed broadcasts so this may cause side
effects to applications.
Besides, to stay consistent with other OS routers such as BSD, it's also
necessary to implement it in the route rx path.
Note that route cache needs to be flushed when bc_forwarding is
changed.
Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
include/linux/inetdevice.h | 1 +
include/uapi/linux/ip.h | 1 +
include/uapi/linux/netconf.h | 1 +
net/ipv4/devinet.c | 11 +++++++++++
net/ipv4/route.c | 6 +++++-
5 files changed, 19 insertions(+), 1 deletion(-)
diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index 27650f1..c759d1c 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -93,6 +93,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
#define IN_DEV_FORWARD(in_dev) IN_DEV_CONF_GET((in_dev), FORWARDING)
#define IN_DEV_MFORWARD(in_dev) IN_DEV_ANDCONF((in_dev), MC_FORWARDING)
+#define IN_DEV_BFORWARD(in_dev) IN_DEV_ANDCONF((in_dev), BC_FORWARDING)
#define IN_DEV_RPFILTER(in_dev) IN_DEV_MAXCONF((in_dev), RP_FILTER)
#define IN_DEV_SRC_VMARK(in_dev) IN_DEV_ORCONF((in_dev), SRC_VMARK)
#define IN_DEV_SOURCE_ROUTE(in_dev) IN_DEV_ANDCONF((in_dev), \
diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h
index b24a742..e42d13b 100644
--- a/include/uapi/linux/ip.h
+++ b/include/uapi/linux/ip.h
@@ -168,6 +168,7 @@ enum
IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
IPV4_DEVCONF_DROP_GRATUITOUS_ARP,
+ IPV4_DEVCONF_BC_FORWARDING,
__IPV4_DEVCONF_MAX
};
diff --git a/include/uapi/linux/netconf.h b/include/uapi/linux/netconf.h
index c84fcdf..fac4edd 100644
--- a/include/uapi/linux/netconf.h
+++ b/include/uapi/linux/netconf.h
@@ -18,6 +18,7 @@ enum {
NETCONFA_PROXY_NEIGH,
NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
NETCONFA_INPUT,
+ NETCONFA_BC_FORWARDING,
__NETCONFA_MAX
};
#define NETCONFA_MAX (__NETCONFA_MAX - 1)
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index d7585ab..80cb464 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1827,6 +1827,8 @@ static int inet_netconf_msgsize_devconf(int type)
size += nla_total_size(4);
if (all || type == NETCONFA_MC_FORWARDING)
size += nla_total_size(4);
+ if (all || type == NETCONFA_BC_FORWARDING)
+ size += nla_total_size(4);
if (all || type == NETCONFA_PROXY_NEIGH)
size += nla_total_size(4);
if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
@@ -1873,6 +1875,10 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
nla_put_s32(skb, NETCONFA_MC_FORWARDING,
IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
goto nla_put_failure;
+ if ((all || type == NETCONFA_BC_FORWARDING) &&
+ nla_put_s32(skb, NETCONFA_BC_FORWARDING,
+ IPV4_DEVCONF(*devconf, BC_FORWARDING)) < 0)
+ goto nla_put_failure;
if ((all || type == NETCONFA_PROXY_NEIGH) &&
nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
@@ -2143,6 +2149,10 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write,
if ((new_value == 0) && (old_value != 0))
rt_cache_flush(net);
+ if (i == IPV4_DEVCONF_BC_FORWARDING - 1 &&
+ new_value != old_value)
+ rt_cache_flush(net);
+
if (i == IPV4_DEVCONF_RP_FILTER - 1 &&
new_value != old_value) {
ifindex = devinet_conf_ifindex(net, cnf);
@@ -2259,6 +2269,7 @@ static struct devinet_sysctl_table {
DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
devinet_sysctl_forward),
DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
+ DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"),
DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"),
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 1df6e97..b678466 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1996,8 +1996,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
goto no_route;
}
- if (res->type == RTN_BROADCAST)
+ if (res->type == RTN_BROADCAST) {
+ if (IN_DEV_BFORWARD(in_dev))
+ goto make_route;
goto brd_input;
+ }
if (res->type == RTN_LOCAL) {
err = fib_validate_source(skb, saddr, daddr, tos,
@@ -2014,6 +2017,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (res->type != RTN_UNICAST)
goto martian_destination;
+make_route:
err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
out: return err;
--
2.1.0
^ permalink raw reply related
* [PATCHv2 net-next 2/2] selftests: add a selftest for directed broadcast forwarding
From: Xin Long @ 2018-07-02 6:30 UTC (permalink / raw)
To: network dev; +Cc: davem, David Ahern, Davide Caratti, idosch
In-Reply-To: <cover.1530512974.git.lucien.xin@gmail.com>
As Ido suggested, this patch adds a selftest for directed
broadcast forwarding with vrf. Note that it puts h2 into
the main route space, so that ping_test can get the echo_reply.
Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
.../selftests/net/forwarding/router_broadcast.sh | 142 +++++++++++++++++++++
1 file changed, 142 insertions(+)
create mode 100755 tools/testing/selftests/net/forwarding/router_broadcast.sh
diff --git a/tools/testing/selftests/net/forwarding/router_broadcast.sh b/tools/testing/selftests/net/forwarding/router_broadcast.sh
new file mode 100755
index 0000000..6917768
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/router_broadcast.sh
@@ -0,0 +1,142 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="ping_ipv4"
+NUM_NETIFS=4
+source lib.sh
+
+h1_create()
+{
+ vrf_create "vrf-h1"
+ ip link set dev $h1 master vrf-h1
+
+ ip link set dev vrf-h1 up
+ ip link set dev $h1 up
+
+ ip address add 192.0.2.2/24 dev $h1
+ ip route add 198.51.100.0/24 vrf vrf-h1 nexthop via 192.0.2.1
+}
+
+h1_destroy()
+{
+ ip route del 198.51.100.0/24 vrf vrf-h1
+ ip address del 192.0.2.2/24 dev $h1
+
+ ip link set dev $h1 down
+ vrf_destroy "vrf-h1"
+}
+
+h2_create()
+{
+ ip link set dev $h2 up
+
+ ip address add 198.51.100.2/24 dev $h2
+ ip route add 192.0.2.0/24 dev $h2 via 198.51.100.1
+}
+
+h2_destroy()
+{
+ ip route del 192.0.2.0/24 dev $h2 via 198.51.100.1
+ ip address del 198.51.100.2/24 dev $h2
+
+ ip link set dev $h2 down
+}
+
+router_create()
+{
+ vrf_create "vrf-r1"
+ ip link set dev $rp1 master vrf-r1
+ ip link set dev $rp2 master vrf-r1
+
+ ip link set dev vrf-r1 up
+ ip link set dev $rp1 up
+ ip link set dev $rp2 up
+
+ ip address add 192.0.2.1/24 dev $rp1
+ ip address add 198.51.100.1/24 dev $rp2
+}
+
+router_destroy()
+{
+ ip address del 198.51.100.1/24 dev $rp2
+ ip address del 192.0.2.1/24 dev $rp1
+
+ ip link set dev $rp2 down
+ ip link set dev $rp1 down
+ vrf_destroy "vrf-r1"
+}
+
+bc_forwarding_disable()
+{
+ sysctl_set net.ipv4.conf.all.bc_forwarding 0
+ sysctl_set net.ipv4.conf.$rp1.bc_forwarding 0
+}
+
+bc_forwarding_enable()
+{
+ sysctl_set net.ipv4.conf.all.bc_forwarding 1
+ sysctl_set net.ipv4.conf.$rp1.bc_forwarding 1
+}
+
+bc_forwarding_restore()
+{
+ sysctl_restore net.ipv4.conf.$rp1.bc_forwarding
+ sysctl_restore net.ipv4.conf.all.bc_forwarding
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ rp1=${NETIFS[p2]}
+
+ rp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+
+ router_create
+
+ forwarding_enable
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ forwarding_restore
+
+ router_destroy
+
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+ping_ipv4()
+{
+ sysctl_set net.ipv4.icmp_echo_ignore_broadcasts 0
+ bc_forwarding_disable
+ ping_test $h1 198.51.100.255
+
+ iptables -A INPUT -i vrf-r1 -p icmp -j DROP
+ bc_forwarding_restore
+ bc_forwarding_enable
+ ping_test $h1 198.51.100.255
+
+ bc_forwarding_restore
+ iptables -D INPUT -i vrf-r1 -p icmp -j DROP
+ sysctl_restore net.ipv4.icmp_echo_ignore_broadcasts
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
--
2.1.0
^ permalink raw reply related
* [PATCH net] sctp: fix the issue that pathmtu may be set lower than MINSEGMENT
From: Xin Long @ 2018-07-02 6:51 UTC (permalink / raw)
To: network dev, linux-sctp
Cc: davem, Marcelo Ricardo Leitner, Neil Horman, syzkaller
After commit b6c5734db070 ("sctp: fix the handling of ICMP Frag Needed
for too small MTUs"), sctp_transport_update_pmtu would refetch pathmtu
from the dst and set it to transport's pathmtu without any check.
The new pathmtu may be lower than MINSEGMENT if the dst is obsolete and
updated by .get_dst() in sctp_transport_update_pmtu.
Syzbot reported a warning in sctp_mtu_payload caused by this.
This fix uses the refetched pathmtu only when it's greater than the
frag_needed pmtu.
Fixes: b6c5734db070 ("sctp: fix the handling of ICMP Frag Needed for too small MTUs")
Reported-by: syzbot+f0d9d7cba052f9344b03@syzkaller.appspotmail.com
Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
net/sctp/transport.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 445b7ef..ddfb687 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -282,7 +282,10 @@ bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu)
if (dst) {
/* Re-fetch, as under layers may have a higher minimum size */
- pmtu = SCTP_TRUNC4(dst_mtu(dst));
+ u32 mtu = SCTP_TRUNC4(dst_mtu(dst));
+
+ if (pmtu < mtu)
+ pmtu = mtu;
change = t->pathmtu != pmtu;
}
t->pathmtu = pmtu;
--
2.1.0
^ permalink raw reply related
* Re: [PATCH net-next 05/10] net: hns3: using modulo for cyclic counters in hclge_cmd_send
From: lipeng (Y) @ 2018-07-02 7:06 UTC (permalink / raw)
To: David Miller; +Cc: netdev, linux-kernel, linuxarm, yisen.zhuang, salil.mehta
In-Reply-To: <20180630.210342.804224843120744818.davem@davemloft.net>
On 2018/6/30 20:03, David Miller wrote:
> From: Peng Li <lipeng321@huawei.com>
> Date: Fri, 29 Jun 2018 19:23:00 +0800
>
>> @@ -228,8 +228,7 @@ int hclge_cmd_send(struct hclge_hw *hw, struct hclge_desc *desc, int num)
>> desc_to_use = &hw->cmq.csq.desc[hw->cmq.csq.next_to_use];
>> *desc_to_use = desc[handle];
>> (hw->cmq.csq.next_to_use)++;
>> - if (hw->cmq.csq.next_to_use == hw->cmq.csq.desc_num)
>> - hw->cmq.csq.next_to_use = 0;
>> + hw->cmq.csq.next_to_use %= hw->cmq.csq.desc_num;
>> handle++;
> I would advise against this.
>
> The "%" modulus operation takes many cpu cycles, and the current code
> is thus much faster.
>
> .
Agree with you.
Thanks for your review. We concentrated on the code style and ignored the
performance in this patch, which is not good.
I will remove this patch from the patchset.
>
^ permalink raw reply
* Re: [PATCH vhost] vhost_net: Fix too many vring kick on busypoll
From: Toshiaki Makita @ 2018-07-02 7:11 UTC (permalink / raw)
To: Jason Wang, Michael S. Tsirkin; +Cc: netdev, Tonghao Zhang, kvm, virtualization
In-Reply-To: <b0670083-d938-8b96-56d2-65d41d6b7c8c@redhat.com>
On 2018/07/02 15:17, Jason Wang wrote:
> On 2018年07月02日 12:37, Toshiaki Makita wrote:
>> On 2018/07/02 11:54, Jason Wang wrote:
>>> On 2018年07月02日 10:45, Toshiaki Makita wrote:
>>>> Hi Jason,
>>>>
>>>> On 2018/06/29 18:30, Jason Wang wrote:
>>>>> On 2018年06月29日 16:09, Toshiaki Makita wrote:
>>>> ...
>>>>>> To fix this, poll the work instead of enabling notification when
>>>>>> busypoll is interrupted by something. IMHO signal_pending() and
>>>>>> vhost_has_work() are kind of interruptions rather than signals to
>>>>>> completely cancel the busypoll, so let's run busypoll after the
>>>>>> necessary work is done. In order to avoid too long busyloop due to
>>>>>> interruption, save the endtime in vq field and use it when reentering
>>>>>> the work function.
>>>>> I think we don't care long busyloop unless e.g tx can starve rx?
>>>> I just want to keep it user-controllable. Unless memorizing it busypoll
>>>> can run unexpectedly long.
>>> I think the total amount of time for busy polling is bounded. If I was
>>> wrong, it should be a bug somewhere.
>> Consider this kind of scenario:
>> 0. Set 100us busypoll for example.
>> 1. handle_tx() runs busypoll.
>> 2. Something like zerocopy queues tx_work within 100us.
>> 3. busypoll exits and call handle_tx() again.
>> 4. Repeat 1-3.
>>
>> In this case handle_tx() does not process packets but busypoll
>> essentially runs beyond 100us without endtime memorized. This may be
>> just a theoretical problem, but I was worried that more code to poll tx
>> queue can be added in the future and it becomes realistic.
>
> Yes, but consider zerocopy tends to batch 16 used packets and we will
> finally finish all processing of packets. The above won't be endless, so
> it was probably tolerable.
Right. So endtime memorization is more like a future-proof thing.
Would you like to keep it or change something?
>>>>>> Performance numbers:
>>>>>>
>>>>>> - Bulk transfer from guest to external physical server.
>>>>>> [Guest]->vhost_net->tap--(XDP_REDIRECT)-->i40e --(wire)-->
>>>>>> [Server]
>>>>> Just to confirm in this case since zerocopy is enabled, we are in fact
>>>>> use the generic XDP datapath?
>>>> For some reason zerocopy was not applied for most packets, so in most
>>>> cases driver XDP was used. I was going to dig into it but not yet.
>>> Right, just to confirm this. This is expected.
>>>
>>> In tuntap, we do native XDP only for small and non zerocopy packets. See
>>> tun_can_build_skb(). The reason is XDP may adjust packet header which is
>>> not supported by zerocopy. We can only use XDP generic for zerocopy in
>>> this case.
>> I think I understand when driver XDP can be used. What I'm not sure and
>> was going to narrow down is why zerocopy is mostly not applied.
>>
>
> I see, any touch to the zerocopy packet (clone, header expansion or
> segmentation) that leads to a userspace copy will increase the error counter
> in vhost_net. Then vhost_net_tx_select_zcopy() may choose not to use
> zerocopy. So it was probably something in your setup or a bug somewhere.
Thanks for the hint!
--
Toshiaki Makita
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* [PATCH V2 net-next 4/9] net: hns3: simplify hclge_cmd_csq_clean
From: Peng Li @ 2018-07-02 7:50 UTC (permalink / raw)
To: davem; +Cc: netdev, linux-kernel, linuxarm, yisen.zhuang, salil.mehta,
lipeng321
In-Reply-To: <1530517826-69226-1-git-send-email-lipeng321@huawei.com>
From: Huazhong Tan <tanhuazhong@huawei.com>
csq is used as a ring buffer, so the value of each desc will be replaced
on its next use. This patch removes the unnecessary memset, and just
updates the next_to_clean.
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
---
.../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c | 21 +++++----------------
1 file changed, 5 insertions(+), 16 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
index 054a913..0839e84 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
@@ -151,31 +151,20 @@ static int hclge_cmd_csq_clean(struct hclge_hw *hw)
{
struct hclge_dev *hdev = container_of(hw, struct hclge_dev, hw);
struct hclge_cmq_ring *csq = &hw->cmq.csq;
- u16 ntc = csq->next_to_clean;
- struct hclge_desc *desc;
- int clean = 0;
u32 head;
+ int clean;
- desc = &csq->desc[ntc];
head = hclge_read_dev(hw, HCLGE_NIC_CSQ_HEAD_REG);
rmb(); /* Make sure head is ready before touch any data */
if (!is_valid_csq_clean_head(csq, head)) {
- dev_warn(&hdev->pdev->dev, "wrong head (%d, %d-%d)\n", head,
- csq->next_to_use, csq->next_to_clean);
+ dev_warn(&hdev->pdev->dev, "wrong cmd head (%d, %d-%d)\n", head,
+ csq->next_to_use, csq->next_to_clean);
return 0;
}
- while (head != ntc) {
- memset(desc, 0, sizeof(*desc));
- ntc++;
- if (ntc == csq->desc_num)
- ntc = 0;
- desc = &csq->desc[ntc];
- clean++;
- }
- csq->next_to_clean = ntc;
-
+ clean = (head - csq->next_to_clean + csq->desc_num) % csq->desc_num;
+ csq->next_to_clean = head;
return clean;
}
--
1.9.1
^ permalink raw reply related
* [PATCH V2 net-next 6/9] net: hns3: remove some unused members of some structures
From: Peng Li @ 2018-07-02 7:50 UTC (permalink / raw)
To: davem; +Cc: netdev, linux-kernel, linuxarm, yisen.zhuang, salil.mehta,
lipeng321
In-Reply-To: <1530517826-69226-1-git-send-email-lipeng321@huawei.com>
From: Huazhong Tan <tanhuazhong@huawei.com>
Some members in struct hns3_enet_tqp_vector, struct hnae3_client
and struct hnae3_ae_algo are unused.
This patch removes them.
Signed-off-by: Huazhong Tan <tanhuazhong@huawei.com>
Signed-off-by: Peng Li <lipeng321@huawei.com>
---
drivers/net/ethernet/hisilicon/hns3/hnae3.h | 2 --
drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 1 -
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 1 -
drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 1 -
4 files changed, 5 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
index 8acb1d1..422c56b 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -167,7 +167,6 @@ struct hnae3_client_ops {
#define HNAE3_CLIENT_NAME_LENGTH 16
struct hnae3_client {
char name[HNAE3_CLIENT_NAME_LENGTH];
- u16 version;
unsigned long state;
enum hnae3_client_type type;
const struct hnae3_client_ops *ops;
@@ -436,7 +435,6 @@ struct hnae3_dcb_ops {
struct hnae3_ae_algo {
const struct hnae3_ae_ops *ops;
struct list_head node;
- char name[HNAE3_CLASS_NAME_SIZE];
const struct pci_device_id *pdev_id_table;
};
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
index 3b083d5a..8d6096c 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
@@ -499,7 +499,6 @@ struct hns3_enet_tqp_vector {
u16 num_tqps; /* total number of tqps in TQP vector */
- cpumask_t affinity_mask;
char name[HNAE3_INT_NAME_LEN];
/* when 0 should adjust interrupt coalesce parameter */
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index b7f6960..2a0cd70 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -6287,7 +6287,6 @@ static void hclge_get_port_type(struct hnae3_handle *handle,
static struct hnae3_ae_algo ae_algo = {
.ops = &hclge_ops,
- .name = HCLGE_NAME,
.pdev_id_table = ae_algo_pci_tbl,
};
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
index 1eb61c1..1638193 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
@@ -1959,7 +1959,6 @@ void hclgevf_update_speed_duplex(struct hclgevf_dev *hdev, u32 speed,
static struct hnae3_ae_algo ae_algovf = {
.ops = &hclgevf_ops,
- .name = HCLGEVF_NAME,
.pdev_id_table = ae_algovf_pci_tbl,
};
--
1.9.1
^ permalink raw reply related
* Re: [PATCH v4 09/18] ARM: davinci: da850-evm: remove dead MTD code
From: Bartosz Golaszewski @ 2018-07-02 7:28 UTC (permalink / raw)
To: David Lechner
Cc: Bartosz Golaszewski, Sekhar Nori, Kevin Hilman, Russell King,
Grygorii Strashko, David S . Miller, Srinivas Kandagatla,
Lukas Wunner, Rob Herring, Florian Fainelli, Dan Carpenter,
Ivan Khoronzhuk, Greg Kroah-Hartman, Andrew Lunn, Jonathan Corbet,
arm-soc, LKML, Linux-OMAP, netdev
In-Reply-To: <46acbc64-6d22-d361-941c-f67aa3029242@lechnology.com>
2018-06-29 19:09 GMT+02:00 David Lechner <david@lechnology.com>:
> On 06/29/2018 04:40 AM, Bartosz Golaszewski wrote:
>>
>> From: Bartosz Golaszewski <bgolaszewski@baylibre.com>
>>
>> We no longer need to register the MTD notifier to read the MAC address
>> as it's now being done in the emac address.
>
>
> I think you mean "it's now being done in the emac _driver_"
Haha yes I do. :)
Thanks,
Bart
^ permalink raw reply
* [PATCH] atm: zatm: remove redundant pointer zatm_dev
From: Colin King @ 2018-07-02 7:37 UTC (permalink / raw)
To: Chas Williams, linux-atm-general, netdev; +Cc: kernel-janitors, linux-kernel
From: Colin Ian King <colin.king@canonical.com>
Pointer zatm_dev is being assigned but is never used hence it is redundant
and can be removed.
Cleans up clang warning:
warning: variable 'zatm_dev' set but not used [-Wunused-but-set-variable]
Signed-off-by: Colin Ian King <colin.king@canonical.com>
---
drivers/atm/zatm.c | 2 --
1 file changed, 2 deletions(-)
diff --git a/drivers/atm/zatm.c b/drivers/atm/zatm.c
index 2c288d1f42bb..e89146ddede6 100644
--- a/drivers/atm/zatm.c
+++ b/drivers/atm/zatm.c
@@ -1385,14 +1385,12 @@ static void zatm_close(struct atm_vcc *vcc)
static int zatm_open(struct atm_vcc *vcc)
{
- struct zatm_dev *zatm_dev;
struct zatm_vcc *zatm_vcc;
short vpi = vcc->vpi;
int vci = vcc->vci;
int error;
DPRINTK(">zatm_open\n");
- zatm_dev = ZATM_DEV(vcc->dev);
if (!test_bit(ATM_VF_PARTIAL,&vcc->flags))
vcc->dev_data = NULL;
if (vci != ATM_VPI_UNSPEC && vpi != ATM_VCI_UNSPEC)
--
2.17.1
^ permalink raw reply related
* Re: [PATCH v4 08/18] net: davinci_emac: potentially get the MAC address from MTD
From: Bartosz Golaszewski @ 2018-07-02 7:41 UTC (permalink / raw)
To: David Lechner
Cc: Bartosz Golaszewski, Sekhar Nori, Kevin Hilman, Russell King,
Grygorii Strashko, David S . Miller, Srinivas Kandagatla,
Lukas Wunner, Rob Herring, Florian Fainelli, Dan Carpenter,
Ivan Khoronzhuk, Greg Kroah-Hartman, Andrew Lunn, Jonathan Corbet,
arm-soc, LKML, Linux-OMAP, netdev
In-Reply-To: <ee49a8de-0812-a619-25be-4c6d8a35cc81@lechnology.com>
2018-06-29 22:35 GMT+02:00 David Lechner <david@lechnology.com>:
> On 06/29/2018 03:09 PM, David Lechner wrote:
>>
>> On 06/29/2018 04:40 AM, Bartosz Golaszewski wrote:
>>>
>>> From: Bartosz Golaszewski <bgolaszewski@baylibre.com>
>>>
>>> On da850-evm board we can read the MAC address from MTD. It's currently
>>> done in the relevant board file, but we want to get rid of all the MAC
>>> reading callbacks from the board file (SPI and NAND). Move the reading
>>> of the MAC address from SPI to the emac driver's probe function.
>>>
>>> Signed-off-by: Bartosz Golaszewski <bgolaszewski@baylibre.com>
>>> ---
>>> drivers/net/ethernet/ti/davinci_emac.c | 20 ++++++++++++++++++--
>>> 1 file changed, 18 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/net/ethernet/ti/davinci_emac.c
>>> b/drivers/net/ethernet/ti/davinci_emac.c
>>> index a1a6445b5a7e..48e6a7755811 100644
>>> --- a/drivers/net/ethernet/ti/davinci_emac.c
>>> +++ b/drivers/net/ethernet/ti/davinci_emac.c
>>> @@ -67,7 +67,7 @@
>>> #include <linux/of_irq.h>
>>> #include <linux/of_net.h>
>>> #include <linux/mfd/syscon.h>
>>> -
>>> +#include <linux/mtd/mtd.h>
>>> #include <asm/irq.h>
>>> #include <asm/page.h>
>>> @@ -1783,7 +1783,10 @@ static int davinci_emac_probe(struct
>>> platform_device *pdev)
>>> struct cpdma_params dma_params;
>>> struct clk *emac_clk;
>>> unsigned long emac_bus_frequency;
>>> -
>>> +#ifdef CONFIG_MTD
>>> + size_t mac_addr_len;
>>> + struct mtd_info *mtd;
>>> +#endif /* CONFIG_MTD */
>>> /* obtain emac clock from kernel */
>>> emac_clk = devm_clk_get(&pdev->dev, NULL);
>>> @@ -1815,6 +1818,19 @@ static int davinci_emac_probe(struct
>>> platform_device *pdev)
>>> goto err_free_netdev;
>>> }
>>> +#ifdef CONFIG_MTD
>>
>>
>> What about the case when MTD is compiled as a module?
>>
>>> + mtd = get_mtd_device_nm("MAC-Address");
>>
>>
>> What about the case when PTR_ERR(mtd) == -EPROBE_DEFER?
>
>
> To answer my own question: because get_mtd_device_nm() doesn't
> ever return -EPROBE_DEFER.
>
> I'm trying to make this work on LCDK, but the emac driver probes
> before any mtd device is registered, so, I just get -ENODEV even
> though I've added a partition to the device tree labeled
> "MAC-Address". You can see in the kernel messages that MTD is
> not probed until later.
>
I tested it on da850-evm - all MTD- and SPI-related modules need to be
built-in for it to work, so:
CONFIG_MTD=y
CONFIG_MTD_M25P80=y
CONFIG_SPI_DAVINCI=y
Best regards,
Bartosz Golaszewski
>
>>
>>> + if (!IS_ERR(mtd)) {
>>> + rc = mtd_read(mtd, 0, ETH_ALEN,
>>> + &mac_addr_len, priv->mac_addr);
>>> + if (rc == 0)
>>> + dev_info(&pdev->dev,
>>> + "Read MAC addr from SPI Flash: %pM\n",
>>> + priv->mac_addr);
>>> + put_mtd_device(mtd);
>>> + }
>>> +#endif /* CONFIG_MTD */
>>> +
>>> /* MAC addr and PHY mask , RMII enable info from platform_data */
>>> memcpy(priv->mac_addr, pdata->mac_addr, ETH_ALEN);
>>> priv->phy_id = pdata->phy_id;
>>>
>>
>
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox