Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH 4/5] ceph: use timespec64 for r_mtime
From: Yan, Zheng @ 2018-06-21 12:41 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Zheng Yan, Sage Weil, Ilya Dryomov, elder, y2038 Mailman List,
	ceph-devel, axboe, David Miller, martin.petersen, Jason Dillaman,
	daniel.m.jordan, Jan Kara, linux-block, Linux Kernel Mailing List,
	Networking
In-Reply-To: <20180620155101.57685-4-arnd@arndb.de>

On Wed, Jun 20, 2018 at 11:55 PM Arnd Bergmann <arnd@arndb.de> wrote:
>
> The request mtime field is used all over ceph, and is currently
> represented as a 'timespec' structure in Linux. This changes it to
> timespec64 to allow times beyond 2038, modifying all users at the
> same time.
>
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>
> ---
>  drivers/block/rbd.c             |  2 +-
>  fs/ceph/addr.c                  | 12 ++++++------
>  fs/ceph/file.c                  | 11 +++++------
>  include/linux/ceph/osd_client.h |  6 +++---
>  net/ceph/osd_client.c           |  8 ++++----
>  5 files changed, 19 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
> index fa0729c1e776..356936333cd9 100644
> --- a/drivers/block/rbd.c
> +++ b/drivers/block/rbd.c
> @@ -1452,7 +1452,7 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
>         struct ceph_osd_request *osd_req = obj_request->osd_req;
>
>         osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
> -       ktime_get_real_ts(&osd_req->r_mtime);
> +       ktime_get_real_ts64(&osd_req->r_mtime);
>         osd_req->r_data_offset = obj_request->ex.oe_off;
>  }
>
> diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> index 292b3d72d725..d44d51e69e76 100644
> --- a/fs/ceph/addr.c
> +++ b/fs/ceph/addr.c
> @@ -574,7 +574,7 @@ static u64 get_writepages_data_length(struct inode *inode,
>   */
>  static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
>  {
> -       struct timespec ts;
> +       struct timespec64 ts;
>         struct inode *inode;
>         struct ceph_inode_info *ci;
>         struct ceph_fs_client *fsc;
> @@ -625,7 +625,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
>                 set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
>
>         set_page_writeback(page);
> -       ts = timespec64_to_timespec(inode->i_mtime);
> +       ts = inode->i_mtime;
>         err = ceph_osdc_writepages(&fsc->client->osdc, ceph_vino(inode),
>                                    &ci->i_layout, snapc, page_off, len,
>                                    ceph_wbc.truncate_seq,
> @@ -1134,7 +1134,7 @@ static int ceph_writepages_start(struct address_space *mapping,
>                         pages = NULL;
>                 }
>
> -               req->r_mtime = timespec64_to_timespec(inode->i_mtime);
> +               req->r_mtime = inode->i_mtime;
>                 rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
>                 BUG_ON(rc);
>                 req = NULL;
> @@ -1734,7 +1734,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
>                 goto out;
>         }
>
> -       req->r_mtime = timespec64_to_timespec(inode->i_mtime);
> +       req->r_mtime = inode->i_mtime;
>         err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
>         if (!err)
>                 err = ceph_osdc_wait_request(&fsc->client->osdc, req);
> @@ -1776,7 +1776,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
>                         goto out_put;
>         }
>
> -       req->r_mtime = timespec64_to_timespec(inode->i_mtime);
> +       req->r_mtime = inode->i_mtime;
>         err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
>         if (!err)
>                 err = ceph_osdc_wait_request(&fsc->client->osdc, req);
> @@ -1937,7 +1937,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
>                                      0, false, true);
>         err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
>
> -       wr_req->r_mtime = timespec64_to_timespec(ci->vfs_inode.i_mtime);
> +       wr_req->r_mtime = ci->vfs_inode.i_mtime;
>         err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
>
>         if (!err)
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index ad0bed99b1d5..1795a8dc9a1e 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -721,7 +721,7 @@ struct ceph_aio_request {
>         struct list_head osd_reqs;
>         unsigned num_reqs;
>         atomic_t pending_reqs;
> -       struct timespec mtime;
> +       struct timespec64 mtime;
>         struct ceph_cap_flush *prealloc_cf;
>  };
>
> @@ -923,7 +923,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>         int num_pages = 0;
>         int flags;
>         int ret;
> -       struct timespec mtime = timespec64_to_timespec(current_time(inode));
> +       struct timespec64 mtime = current_time(inode);
>         size_t count = iov_iter_count(iter);
>         loff_t pos = iocb->ki_pos;
>         bool write = iov_iter_rw(iter) == WRITE;
> @@ -1013,7 +1013,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
>                         truncate_inode_pages_range(inode->i_mapping, pos,
>                                         (pos+len) | (PAGE_SIZE - 1));
>
> -                       req->r_mtime = mtime;
> +                       req->r_mtime = current_time(inode);
this change is not needed


>                 }
>
>                 osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len);
> @@ -1131,7 +1131,6 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
>         int flags;
>         int ret;
>         bool check_caps = false;
> -       struct timespec mtime = timespec64_to_timespec(current_time(inode));
>         size_t count = iov_iter_count(from);
>
>         if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
> @@ -1201,7 +1200,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
>                 osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
>                                                 false, true);
>
> -               req->r_mtime = mtime;
> +               req->r_mtime = current_time(inode);

here too

>                 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
>                 if (!ret)
>                         ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
> @@ -1663,7 +1662,7 @@ static int ceph_zero_partial_object(struct inode *inode,
>                 goto out;
>         }
>
> -       req->r_mtime = timespec64_to_timespec(inode->i_mtime);
> +       req->r_mtime = inode->i_mtime;
>         ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
>         if (!ret) {
>                 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> index 0d6ee04b4c41..2e6611c1e9a0 100644
> --- a/include/linux/ceph/osd_client.h
> +++ b/include/linux/ceph/osd_client.h
> @@ -199,7 +199,7 @@ struct ceph_osd_request {
>         /* set by submitter */
>         u64 r_snapid;                         /* for reads, CEPH_NOSNAP o/w */
>         struct ceph_snap_context *r_snapc;    /* for writes */
> -       struct timespec r_mtime;              /* ditto */
> +       struct timespec64 r_mtime;            /* ditto */
>         u64 r_data_offset;                    /* ditto */
>         bool r_linger;                        /* don't resend on failure */
>
> @@ -253,7 +253,7 @@ struct ceph_osd_linger_request {
>         struct ceph_osd_request_target t;
>         u32 map_dne_bound;
>
> -       struct timespec mtime;
> +       struct timespec64 mtime;
>
>         struct kref kref;
>         struct mutex lock;
> @@ -508,7 +508,7 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
>                                 struct ceph_snap_context *sc,
>                                 u64 off, u64 len,
>                                 u32 truncate_seq, u64 truncate_size,
> -                               struct timespec *mtime,
> +                               struct timespec64 *mtime,
>                                 struct page **pages, int nr_pages);
>
>  /* watch/notify */
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index a00c74f1154e..a87a021ca9d0 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -1978,7 +1978,7 @@ static void encode_request_partial(struct ceph_osd_request *req,
>         p += sizeof(struct ceph_blkin_trace_info);
>
>         ceph_encode_32(&p, 0); /* client_inc, always 0 */
> -       ceph_encode_timespec(p, &req->r_mtime);
> +       ceph_encode_timespec64(p, &req->r_mtime);
>         p += sizeof(struct ceph_timespec);
>
>         encode_oloc(&p, end, &req->r_t.target_oloc);
> @@ -4512,7 +4512,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc,
>         ceph_oid_copy(&lreq->t.base_oid, oid);
>         ceph_oloc_copy(&lreq->t.base_oloc, oloc);
>         lreq->t.flags = CEPH_OSD_FLAG_WRITE;
> -       ktime_get_real_ts(&lreq->mtime);
> +       ktime_get_real_ts64(&lreq->mtime);
>
>         lreq->reg_req = alloc_linger_request(lreq);
>         if (!lreq->reg_req) {
> @@ -4570,7 +4570,7 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
>         ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
>         ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
>         req->r_flags = CEPH_OSD_FLAG_WRITE;
> -       ktime_get_real_ts(&req->r_mtime);
> +       ktime_get_real_ts64(&req->r_mtime);
>         osd_req_op_watch_init(req, 0, lreq->linger_id,
>                               CEPH_OSD_WATCH_OP_UNWATCH);
>
> @@ -5136,7 +5136,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
>                          struct ceph_snap_context *snapc,
>                          u64 off, u64 len,
>                          u32 truncate_seq, u64 truncate_size,
> -                        struct timespec *mtime,
> +                        struct timespec64 *mtime,
>                          struct page **pages, int num_pages)
>  {
>         struct ceph_osd_request *req;
> --
> 2.9.0
>
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [RFC v2, net-next, PATCH 4/4] net/cpsw_switchdev: add switchdev mode of operation on cpsw driver
From: Ilias Apalodimas @ 2018-06-21 12:45 UTC (permalink / raw)
  To: Ivan Vecera
  Cc: Florian Fainelli, Andrew Lunn, netdev, grygorii.strashko,
	ivan.khoronzhuk, nsekhar, jiri, francois.ozog, yogeshs, spatton,
	Jose.Abreu
In-Reply-To: <033dd1bc-0401-a058-5283-4938ac39f701@redhat.com>

On Thu, Jun 21, 2018 at 02:19:55PM +0200, Ivan Vecera wrote:
> On 20.6.2018 20:03, Ilias Apalodimas wrote:
> > Hi Florian,
> > 
> > On Wed, Jun 20, 2018 at 10:58:26AM -0700, Florian Fainelli wrote:
> >> On 06/20/2018 10:51 AM, Ilias Apalodimas wrote:
> >>> Hello Ivan,
> >>> On Wed, Jun 20, 2018 at 02:56:48PM +0200, Ivan Vecera wrote:
> >>>> On 18.6.2018 22:19, Ilias Apalodimas wrote:
> >>>>> Jiri proposed using devlink, which makes sense, but i am not sure it's
> >>>>> applicable on this patchset. This will change the driver completely and will
> >>>>> totally break backwards compatibility.
> >>>>
> >>>> Another good reason for a new driver.
> >>>>
> >>>> I.
> > >>> This is actually conflicting, at least to my understanding. Jiri proposed
> > >>> devlink as an alternative method to enable a new mode instead of
> > >>> adding it as a .config option. A new driver wouldn't have a need for that, right?
> >>
> >> Correct, with a new driver would likely behave correctly upon being
> >> probed such that you could have your switch ports act as normal network
> >> devices from which you could run IP-config and do NFS boot.
> > The current driver also does NFS properly and the 2 ethernet ports act as normal
> > network interfaces. The NFS section in the cover letter
> > is to cover the cases where users running on NFS need to change the running
> > switch configuration (starting from adding the 2 interfaces to a bridge).
> > Since iproute2 is located on the NFS filesystem, the moment
> > network connectivity is lost you lose the ability to perform further
> > configuration, and in certain configuration scenarios this renders the device
> > unusable.
> 
> Yes, with a new driver you can drop the NFS-boot hack you mentioned in the cover
> letter. All configuration is done during driver probe and thus prior to the NFS
> mount. The only thing you lose with a new driver is backward compatibility, but
> the question is: do you really need it?
OK, I think there's a bit of confusion here. I'll try to clarify it.
There is no NFS hack for the current driver, nor was there ever one. Whether you
use the .config/DTS/devlink/module param method for configuration, this is
strictly for configuration. The driver (via .config currently) correctly
registers and initializes everything it needs to work. NFS boots fine without
using anything from that script.
The script is not there to boot up the device. It is there to help with any
testing that has to be done *via* NFS where the user has to reconfigure the
switch for their test cases.
Since you need to add the 2 interfaces to a bridge and start the switch
configuration, the moment you do that you lose your network access, and with it
all the commands you need for configuration. This is why you need to chroot to
do that. I don't see how writing a new driver will change that.

The driver is currently widely used and that's the reason we tried to avoid
rewriting it. The current driver uses a DTS option to distinguish between two
existing modes. This patch just adds a third one. So to my understanding we
have the following options:
1. The driver already uses DTS to configure the hardware mode. Although this is
technically wrong, we can add a third mode on DTS called 'switchdev;', get rid
of the .config option and keep the configuration method common (although not
optimal).
2. Keep the .config option which overrides the 2 existing modes.
3. Introduce a devlink option. If this is applied for all 3 modes, it will break
backwards compatibility, so it's not an option. If it's introduced for
configuring 'switchdev' mode only, we fall into the same pitfall as option 2):
we have something that overrides our current config, slightly better though
since it's not a compile time option.
4. Rewrite the driver.

If it was a brand new driver, I'd definitely pick 4. Since it's a pre-existing
driver, though, I can't rule out the rest of the options.

Regards
Ilias

^ permalink raw reply

* [PATCH 0/2] xen-netfront: Fix issues with commit f599c64fdf7d
From: Ross Lagerwall @ 2018-06-21 13:00 UTC (permalink / raw)
  To: netdev
  Cc: Ross Lagerwall, Boris Ostrovsky, Juergen Gross, David S. Miller,
	xen-devel, linux-kernel

Fix a couple of issues with commit f599c64fdf7d ("xen-netfront: Fix race
between device setup and open").

Ross Lagerwall (2):
  xen-netfront: Fix mismatched rtnl_unlock
  xen-netfront: Update features after registering netdev

 drivers/net/xen-netfront.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

-- 
2.9.5

^ permalink raw reply

* [PATCH 1/2] xen-netfront: Fix mismatched rtnl_unlock
From: Ross Lagerwall @ 2018-06-21 13:00 UTC (permalink / raw)
  To: netdev
  Cc: Ross Lagerwall, Boris Ostrovsky, Juergen Gross, David S. Miller,
	xen-devel, linux-kernel
In-Reply-To: <20180621130021.27029-1-ross.lagerwall@citrix.com>

Fixes: f599c64fdf7d ("xen-netfront: Fix race between device setup and open")
Reported-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>
---
 drivers/net/xen-netfront.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 922ce0a..ee4cb6c 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -1810,7 +1810,7 @@ static int talk_to_netback(struct xenbus_device *dev,
 	err = xen_net_read_mac(dev, info->netdev->dev_addr);
 	if (err) {
 		xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
-		goto out;
+		goto out_unlocked;
 	}
 
 	rtnl_lock();
@@ -1925,6 +1925,7 @@ static int talk_to_netback(struct xenbus_device *dev,
 	xennet_destroy_queues(info);
  out:
 	rtnl_unlock();
+out_unlocked:
 	device_unregister(&dev->dev);
 	return err;
 }
-- 
2.9.5

^ permalink raw reply related

* [PATCH 2/2] xen-netfront: Update features after registering netdev
From: Ross Lagerwall @ 2018-06-21 13:00 UTC (permalink / raw)
  To: netdev
  Cc: Ross Lagerwall, Boris Ostrovsky, Juergen Gross, David S. Miller,
	xen-devel, linux-kernel, Liam Shepherd
In-Reply-To: <20180621130021.27029-1-ross.lagerwall@citrix.com>

Update the features after calling register_netdev(), otherwise the
device features are not set up correctly and it is not possible to change
the MTU of the device. After this change, the features reported by
ethtool match the device's features before the commit which introduced
the issue and it is possible to change the device's MTU.

Fixes: f599c64fdf7d ("xen-netfront: Fix race between device setup and open")
Reported-by: Liam Shepherd <liam@dancer.es>
Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>
---
 drivers/net/xen-netfront.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index ee4cb6c..a57daec 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -1951,10 +1951,6 @@ static int xennet_connect(struct net_device *dev)
 	/* talk_to_netback() sets the correct number of queues */
 	num_queues = dev->real_num_tx_queues;
 
-	rtnl_lock();
-	netdev_update_features(dev);
-	rtnl_unlock();
-
 	if (dev->reg_state == NETREG_UNINITIALIZED) {
 		err = register_netdev(dev);
 		if (err) {
@@ -1964,6 +1960,10 @@ static int xennet_connect(struct net_device *dev)
 		}
 	}
 
+	rtnl_lock();
+	netdev_update_features(dev);
+	rtnl_unlock();
+
 	/*
 	 * All public and private state should now be sane.  Get
 	 * ready to start sending and receiving packets and give the driver
-- 
2.9.5

^ permalink raw reply related

* Re: BUG: jumbo frames broken after commit xen-netfront: Fix race between device setup and open
From: Ross Lagerwall @ 2018-06-21 13:02 UTC (permalink / raw)
  To: Juergen Gross
  Cc: Javier Martinez Canillas, Andrew Jeddeloh, Laura Abbott,
	dustymabe, Greg Kroah-Hartman, David S. Miller, stable, netdev,
	xen-devel, Liam Shepherd
In-Reply-To: <df12769e-429f-7afb-9cdd-20955f8d6549@suse.com>

On 06/21/2018 10:05 AM, Juergen Gross wrote:
> On 14/06/18 17:43, Javier Martinez Canillas wrote:
>> Hi Andrew,
>>
>> On Wed, Jun 6, 2018 at 6:29 PM, Andrew Jeddeloh
>> <andrew.jeddeloh@redhat.com> wrote:
>>> Hi all,
>>>
>>> The patch "xen-netfront: Fix race between device setup and open" seems
>>> to have introduced a regression preventing setting MTU's larger than
>>> 1500. We experienced this downstream with Container Linux and
>>> confirmed with Fedora 28 as well.
>>>
>>> It's commit f599c64fdf7d9c108e8717fb04bc41c680120da4 in the linux-stable tree.
>>> https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/commit/?id=f599c64fdf7d9c108e8717fb04bc41c680120da4
>>>
>>> Downstream bugs:
>>> https://github.com/coreos/bugs/issues/2443
>>> https://bugzilla.redhat.com/show_bug.cgi?id=1584216
>>>
>>> We've confirmed that reverting that commit fixes the bug. It can be
>>> reliably reproduced on AWS with t2.micro instances (and
>>> presumably other systems using the same driver). Both using
>>> systemd-networkd to set the MTU and manual ip link commands cause the
>>> link to respond with "Invalid argument" when trying to set the MTU >
>>> 1500.
>>>
>>> I'm not sure why that commit introduced the regression.
>>>
>>> Please let me know if there's any more information that would be helpful.
>>>
>>> - Andrew
>>
>> I'm adding some relevant people to the CC list to bring more attention
>> on this regression.
>>
>> The get_maintainer.pl script is very useful to get some hints on who
>> should be copied, i.e:
>>
>> $ ./scripts/get_maintainer.pl -f drivers/net/xen-netfront.c
> 
> Ross, have you made any progress here? If not I'm thinking of reverting
> your patch as I think the current problem is more severe than the one
> your patch did address.
> 
> 

I've sent a patch today which I believe fixes the issue. Sorry for the 
slow response.

Regards,
-- 
Ross Lagerwall

^ permalink raw reply

* Re: [PATCH 1/2] xen-netfront: Fix mismatched rtnl_unlock
From: Juergen Gross @ 2018-06-21 13:05 UTC (permalink / raw)
  To: Ross Lagerwall, netdev
  Cc: Boris Ostrovsky, David S. Miller, xen-devel, linux-kernel
In-Reply-To: <20180621130021.27029-2-ross.lagerwall@citrix.com>

On 21/06/18 15:00, Ross Lagerwall wrote:
> Fixes: f599c64fdf7d ("xen-netfront: Fix race between device setup and open")
> Reported-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
> Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>

Reviewed-by: Juergen Gross <jgross@suse.com>


Juergen

^ permalink raw reply

* Re: [PATCH 2/2] xen-netfront: Update features after registering netdev
From: Juergen Gross @ 2018-06-21 13:12 UTC (permalink / raw)
  To: Ross Lagerwall, netdev
  Cc: Boris Ostrovsky, David S. Miller, xen-devel, linux-kernel,
	Liam Shepherd
In-Reply-To: <20180621130021.27029-3-ross.lagerwall@citrix.com>

On 21/06/18 15:00, Ross Lagerwall wrote:
> Update the features after calling register_netdev(), otherwise the
> device features are not set up correctly and it is not possible to change
> the MTU of the device. After this change, the features reported by
> ethtool match the device's features before the commit which introduced
> the issue and it is possible to change the device's MTU.
> 
> Fixes: f599c64fdf7d ("xen-netfront: Fix race between device setup and open")
> Reported-by: Liam Shepherd <liam@dancer.es>
> Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>

Reviewed-by: Juergen Gross <jgross@suse.com>


Juergen

^ permalink raw reply

* [PATCH v0 01/12] mlxsw: spectrum: Move QSFP EEPROM definitions to common location
From: Vadim Pasternak @ 2018-06-21 15:27 UTC (permalink / raw)
  To: davem; +Cc: netdev, jiri, Vadim Pasternak
In-Reply-To: <1529594883-20619-1-git-send-email-vadimp@mellanox.com>

Move QSFP EEPROM definitions to a common location from the spectrum
driver in order to make them available for other mlxsw modules. They
are common to all kinds of chips and relate to the SFF
specifications 8024, 8436, 8472 and 8636, rather than to the chip type.

Signed-off-by: Vadim Pasternak <vadimp@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/reg.h      | 32 ++++++++++++-
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 62 +++++++++-----------------
 2 files changed, 52 insertions(+), 42 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h
index 1877d9f..6a41c48 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/reg.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h
@@ -6757,13 +6757,41 @@ MLXSW_ITEM32(reg, mcia, device_address, 0x04, 0, 16);
  */
 MLXSW_ITEM32(reg, mcia, size, 0x08, 0, 16);
 
-#define MLXSW_SP_REG_MCIA_EEPROM_SIZE 48
+#define MLXSW_REG_MCIA_EEPROM_PAGE_LENGTH	256
+#define MLXSW_REG_MCIA_EEPROM_SIZE		48
+#define MLXSW_REG_MCIA_I2C_ADDR_LOW		0x50
+#define MLXSW_REG_MCIA_I2C_ADDR_HIGH		0x51
+#define MLXSW_REG_MCIA_PAGE0_LO_OFF		0xa0
+#define MLXSW_REG_MCIA_TH_SIZE			8
+#define MLXSW_REG_MCIA_TH_PAGE_NUM		3
+#define MLXSW_REG_MCIA_PAGE0_LO			0
+#define MLXSW_REG_MCIA_TH_PAGE_OFF		0x80
+
+enum mlxsw_reg_mcia_eeprom_module_info_rev_id {
+	MLXSW_REG_MCIA_EEPROM_MODULE_INFO_REV_ID_UNSPC	= 0x00,
+	MLXSW_REG_MCIA_EEPROM_MODULE_INFO_REV_ID_8436	= 0x01,
+	MLXSW_REG_MCIA_EEPROM_MODULE_INFO_REV_ID_8636	= 0x03,
+};
+
+enum mlxsw_reg_mcia_eeprom_module_info_id {
+	MLXSW_REG_MCIA_EEPROM_MODULE_INFO_ID_SFP	= 0x03,
+	MLXSW_REG_MCIA_EEPROM_MODULE_INFO_ID_QSFP	= 0x0C,
+	MLXSW_REG_MCIA_EEPROM_MODULE_INFO_ID_QSFP_PLUS	= 0x0D,
+	MLXSW_REG_MCIA_EEPROM_MODULE_INFO_ID_QSFP28	= 0x11,
+	MLXSW_REG_MCIA_EEPROM_MODULE_INFO_ID_QSFP_DD	= 0x18,
+};
+
+enum mlxsw_reg_mcia_eeprom_module_info {
+	MLXSW_REG_MCIA_EEPROM_MODULE_INFO_ID,
+	MLXSW_REG_MCIA_EEPROM_MODULE_INFO_REV_ID,
+	MLXSW_REG_MCIA_EEPROM_MODULE_INFO_SIZE,
+};
 
 /* reg_mcia_eeprom
  * Bytes to read/write.
  * Access: RW
  */
-MLXSW_ITEM_BUF(reg, mcia, eeprom, 0x10, MLXSW_SP_REG_MCIA_EEPROM_SIZE);
+MLXSW_ITEM_BUF(reg, mcia, eeprom, 0x10, MLXSW_REG_MCIA_EEPROM_SIZE);
 
 static inline void mlxsw_reg_mcia_pack(char *payload, u8 module, u8 lock,
 				       u8 page_number, u16 device_addr,
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 968b88a..1b0d1bc 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -2481,23 +2481,23 @@ static int mlxsw_sp_query_module_eeprom(struct mlxsw_sp_port *mlxsw_sp_port,
 					unsigned int *p_read_size)
 {
 	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
-	char eeprom_tmp[MLXSW_SP_REG_MCIA_EEPROM_SIZE];
+	char eeprom_tmp[MLXSW_REG_MCIA_EEPROM_SIZE];
 	char mcia_pl[MLXSW_REG_MCIA_LEN];
 	u16 i2c_addr;
 	int status;
 	int err;
 
-	size = min_t(u16, size, MLXSW_SP_REG_MCIA_EEPROM_SIZE);
+	size = min_t(u16, size, MLXSW_REG_MCIA_EEPROM_SIZE);
 
-	if (offset < MLXSW_SP_EEPROM_PAGE_LENGTH &&
-	    offset + size > MLXSW_SP_EEPROM_PAGE_LENGTH)
+	if (offset < MLXSW_REG_MCIA_EEPROM_PAGE_LENGTH &&
+	    offset + size > MLXSW_REG_MCIA_EEPROM_PAGE_LENGTH)
 		/* Cross pages read, read until offset 256 in low page */
-		size = MLXSW_SP_EEPROM_PAGE_LENGTH - offset;
+		size = MLXSW_REG_MCIA_EEPROM_PAGE_LENGTH - offset;
 
-	i2c_addr = MLXSW_SP_I2C_ADDR_LOW;
-	if (offset >= MLXSW_SP_EEPROM_PAGE_LENGTH) {
-		i2c_addr = MLXSW_SP_I2C_ADDR_HIGH;
-		offset -= MLXSW_SP_EEPROM_PAGE_LENGTH;
+	i2c_addr = MLXSW_REG_MCIA_I2C_ADDR_LOW;
+	if (offset >= MLXSW_REG_MCIA_EEPROM_PAGE_LENGTH) {
+		i2c_addr = MLXSW_REG_MCIA_I2C_ADDR_HIGH;
+		offset -= MLXSW_REG_MCIA_EEPROM_PAGE_LENGTH;
 	}
 
 	mlxsw_reg_mcia_pack(mcia_pl, mlxsw_sp_port->mapping.module,
@@ -2518,55 +2518,37 @@ static int mlxsw_sp_query_module_eeprom(struct mlxsw_sp_port *mlxsw_sp_port,
 	return 0;
 }
 
-enum mlxsw_sp_eeprom_module_info_rev_id {
-	MLXSW_SP_EEPROM_MODULE_INFO_REV_ID_UNSPC      = 0x00,
-	MLXSW_SP_EEPROM_MODULE_INFO_REV_ID_8436       = 0x01,
-	MLXSW_SP_EEPROM_MODULE_INFO_REV_ID_8636       = 0x03,
-};
-
-enum mlxsw_sp_eeprom_module_info_id {
-	MLXSW_SP_EEPROM_MODULE_INFO_ID_SFP              = 0x03,
-	MLXSW_SP_EEPROM_MODULE_INFO_ID_QSFP             = 0x0C,
-	MLXSW_SP_EEPROM_MODULE_INFO_ID_QSFP_PLUS        = 0x0D,
-	MLXSW_SP_EEPROM_MODULE_INFO_ID_QSFP28           = 0x11,
-};
-
-enum mlxsw_sp_eeprom_module_info {
-	MLXSW_SP_EEPROM_MODULE_INFO_ID,
-	MLXSW_SP_EEPROM_MODULE_INFO_REV_ID,
-	MLXSW_SP_EEPROM_MODULE_INFO_SIZE,
-};
-
 static int mlxsw_sp_get_module_info(struct net_device *netdev,
 				    struct ethtool_modinfo *modinfo)
 {
 	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(netdev);
-	u8 module_info[MLXSW_SP_EEPROM_MODULE_INFO_SIZE];
+	u8 module_info[MLXSW_REG_MCIA_EEPROM_MODULE_INFO_SIZE];
 	u8 module_rev_id, module_id;
 	unsigned int read_size;
 	int err;
 
 	err = mlxsw_sp_query_module_eeprom(mlxsw_sp_port, 0,
-					   MLXSW_SP_EEPROM_MODULE_INFO_SIZE,
-					   module_info, &read_size);
+				MLXSW_REG_MCIA_EEPROM_MODULE_INFO_SIZE,
+				module_info, &read_size);
 	if (err)
 		return err;
 
-	if (read_size < MLXSW_SP_EEPROM_MODULE_INFO_SIZE)
+	if (read_size < MLXSW_REG_MCIA_EEPROM_MODULE_INFO_SIZE)
 		return -EIO;
 
-	module_rev_id = module_info[MLXSW_SP_EEPROM_MODULE_INFO_REV_ID];
-	module_id = module_info[MLXSW_SP_EEPROM_MODULE_INFO_ID];
+	module_rev_id = module_info[MLXSW_REG_MCIA_EEPROM_MODULE_INFO_REV_ID];
+	module_id = module_info[MLXSW_REG_MCIA_EEPROM_MODULE_INFO_ID];
 
 	switch (module_id) {
-	case MLXSW_SP_EEPROM_MODULE_INFO_ID_QSFP:
+	case MLXSW_REG_MCIA_EEPROM_MODULE_INFO_ID_QSFP:
 		modinfo->type       = ETH_MODULE_SFF_8436;
 		modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN;
 		break;
-	case MLXSW_SP_EEPROM_MODULE_INFO_ID_QSFP_PLUS:
-	case MLXSW_SP_EEPROM_MODULE_INFO_ID_QSFP28:
-		if (module_id  == MLXSW_SP_EEPROM_MODULE_INFO_ID_QSFP28 ||
-		    module_rev_id >= MLXSW_SP_EEPROM_MODULE_INFO_REV_ID_8636) {
+	case MLXSW_REG_MCIA_EEPROM_MODULE_INFO_ID_QSFP_PLUS:
+	case MLXSW_REG_MCIA_EEPROM_MODULE_INFO_ID_QSFP28:
+		if (module_id == MLXSW_REG_MCIA_EEPROM_MODULE_INFO_ID_QSFP28 ||
+		    module_rev_id >=
+		    MLXSW_REG_MCIA_EEPROM_MODULE_INFO_REV_ID_8636) {
 			modinfo->type       = ETH_MODULE_SFF_8636;
 			modinfo->eeprom_len = ETH_MODULE_SFF_8636_LEN;
 		} else {
@@ -2574,7 +2556,7 @@ static int mlxsw_sp_get_module_info(struct net_device *netdev,
 			modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN;
 		}
 		break;
-	case MLXSW_SP_EEPROM_MODULE_INFO_ID_SFP:
+	case MLXSW_REG_MCIA_EEPROM_MODULE_INFO_ID_SFP:
 		modinfo->type       = ETH_MODULE_SFF_8472;
 		modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN;
 		break;
-- 
2.1.4

^ permalink raw reply related

* [PATCH v0 02/12] mlxsw: reg: Add MTBR register
From: Vadim Pasternak @ 2018-06-21 15:27 UTC (permalink / raw)
  To: davem; +Cc: netdev, jiri, Vadim Pasternak
In-Reply-To: <1529594883-20619-1-git-send-email-vadimp@mellanox.com>

Add MTBR (Management Temperature Bulk Register), which is used for port
temperature reading in a bulk mode.

Signed-off-by: Vadim Pasternak <vadimp@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/reg.h | 69 +++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h
index 6a41c48..cfe6bde 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/reg.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h
@@ -6703,6 +6703,74 @@ static inline void mlxsw_reg_mtmp_unpack(char *payload, unsigned int *p_temp,
 		mlxsw_reg_mtmp_sensor_name_memcpy_from(payload, sensor_name);
 }
 
+/* MTBR - Management Temperature Bulk Register
+ * -------------------------------------------
+ * This register is used for bulk temperature reading.
+ */
+#define MLXSW_REG_MTBR_ID		0x900F
+#define MLXSW_REG_MTBR_LEN		0xCC
+#define MLXSW_REG_MTBR_REC_MAX_COUNT	47
+
+MLXSW_REG_DEFINE(mtbr, MLXSW_REG_MTBR_ID, MLXSW_REG_MTBR_LEN);
+
+/* reg_mtbr_base_sensor_index
+ * Base sensors index to access (0 - ASIC sensor, 1-63 - ambient sensors,
+ * 64-127 are mapped to the SFP+/QSFP modules sequentially).
+ * Access: Index
+ */
+MLXSW_ITEM32(reg, mtbr, base_sensor_index, 0x00, 0, 7);
+
+/* reg_mtbr_num_rec
+ * Request: Number of records to read
+ * Response: Number of records read
+ * See above description for more details.
+ * Ranges 0..64
+ * Access: RW
+ */
+MLXSW_ITEM32(reg, mtbr, num_rec, 0x04, 0, 8);
+
+/* reg_mtbr_temp
+ * Temperature reading from the sensor. Reading is in 0.125 Celsius
+ * degrees units.
+ * Access: RO
+ */
+MLXSW_ITEM32_INDEXED(reg, mtbr, temp, 0x10, 0, 16, 0x04, 0x00, false);
+
+/* reg_mtbr_max_temp
+ * The highest measured temperature from the sensor.
+ * When the bit mte is cleared, the field max_temperature is reserved.
+ * Access: RO
+ */
+MLXSW_ITEM32_INDEXED(reg, mtbr, max_temp, 0x10, 16, 16, 0x04, 0x00, false);
+
+static inline void mlxsw_reg_mtbr_pack(char *payload, u8 base_sensor_index,
+				       u8 num_rec)
+{
+	MLXSW_REG_ZERO(mtbr, payload);
+	mlxsw_reg_mtbr_base_sensor_index_set(payload, base_sensor_index);
+	mlxsw_reg_mtbr_num_rec_set(payload, num_rec);
+}
+
+/* Error codes from temperatute reading */
+enum mlxsw_reg_mtbr_temp_status {
+	MLXSW_REG_MTBR_NO_CONN		= 0x8000,
+	MLXSW_REG_MTBR_NO_TEMP_SENS	= 0x8001,
+	MLXSW_REG_MTBR_INDEX_NA		= 0x8002,
+	MLXSW_REG_MTBR_BAD_SENS_INFO	= 0x8003,
+};
+
+/* Base index for reading ports temperature */
+#define MLXSW_REG_MTBR_BASE_PORT_INDEX		64
+
+static inline void mlxsw_reg_mtbr_temp_unpack(char *payload, int rec_index,
+					      u16 *p_temp, u16 *p_max_temp)
+{
+	if (p_temp)
+		*p_temp = mlxsw_reg_mtbr_temp_get(payload, rec_index);
+	if (p_max_temp)
+		*p_max_temp = mlxsw_reg_mtbr_max_temp_get(payload, rec_index);
+}
+
 /* MCIA - Management Cable Info Access
  * -----------------------------------
  * MCIA register is used to access the SFP+ and QSFP connector's EPROM.
@@ -7945,6 +8013,7 @@ static const struct mlxsw_reg_info *mlxsw_reg_infos[] = {
 	MLXSW_REG(mfsc),
 	MLXSW_REG(mfsm),
 	MLXSW_REG(mfsl),
+	MLXSW_REG(mtbr),
 	MLXSW_REG(mtcap),
 	MLXSW_REG(mtmp),
 	MLXSW_REG(mcia),
-- 
2.1.4

^ permalink raw reply related

* [PATCH v0 00/12] mlxsw thermal monitoring amendments
From: Vadim Pasternak @ 2018-06-21 15:27 UTC (permalink / raw)
  To: davem; +Cc: netdev, jiri, Vadim Pasternak

This patchset extends mlxsw hwmon and thermal modules with ports
temperature reading and adds new hwmon attributes for FAN and
temperature.

Port temperatures are the most critical component in system thermal control
and should be considered by the thermal algorithm.

New hwmon attributes, such as FAN faults, port temperature fault will
improve system monitoring abilities.

Vadim Pasternak (12):
  mlxsw: spectrum: Move QSFP EEPROM defenitons to common location
  mlxsw: reg: Add MTBR register
  mlxsw: core: Add core environment module for port temperature reading
  mlxsw: core: Extend hwmon interface with FAN fault attribute
  mlxsw: core: Extend hwmon interface with port temperature attributes
  mlxsw: core: Add bus frequency capability flag for the bus type
  mlxsw: core: Set different thermal polling time based on bus type
  mlxsw: core: Modify thermal zone definition
  mlxsw: core: Extend thermal zone operations with get_trend method
  mlxsw: core: Extend cooling device with cooling levels
  mlxsw: core: Rename cooling device
  mlxsw: core: Add ports temperature measurement to thermal algorithm

 drivers/net/ethernet/mellanox/mlxsw/Makefile       |   2 +-
 drivers/net/ethernet/mellanox/mlxsw/core.h         |   1 +
 drivers/net/ethernet/mellanox/mlxsw/core_env.c     | 316 +++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlxsw/core_env.h     |  63 ++++
 drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c   | 164 ++++++++++-
 drivers/net/ethernet/mellanox/mlxsw/core_thermal.c | 231 +++++++++++++--
 drivers/net/ethernet/mellanox/mlxsw/i2c.c          |   1 +
 drivers/net/ethernet/mellanox/mlxsw/reg.h          | 101 ++++++-
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c     |  62 ++--
 9 files changed, 865 insertions(+), 76 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlxsw/core_env.c
 create mode 100644 drivers/net/ethernet/mellanox/mlxsw/core_env.h

-- 
2.1.4

^ permalink raw reply

* [PATCH v0 03/12] mlxsw: core: Add core environment module for port temperature reading
From: Vadim Pasternak @ 2018-06-21 15:27 UTC (permalink / raw)
  To: davem; +Cc: netdev, jiri, Vadim Pasternak
In-Reply-To: <1529594883-20619-1-git-send-email-vadimp@mellanox.com>

Add new core_env module to allow port temperature reading. This
information has most critical impact on system's thermal monitoring and
is to be used by core_hwmon and core_thermal modules.

New internal API reads the temperature from all the modules, which are
equipped with the thermal sensor and exposes temperature according to
the worst measure. All individual temperature values are normalized to
pre-defined range.

Signed-off-by: Vadim Pasternak <vadimp@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/Makefile   |   2 +-
 drivers/net/ethernet/mellanox/mlxsw/core_env.c | 316 +++++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlxsw/core_env.h |  63 +++++
 3 files changed, 380 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlxsw/core_env.c
 create mode 100644 drivers/net/ethernet/mellanox/mlxsw/core_env.h

diff --git a/drivers/net/ethernet/mellanox/mlxsw/Makefile b/drivers/net/ethernet/mellanox/mlxsw/Makefile
index 0cadcab..9f1dc0b 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/Makefile
+++ b/drivers/net/ethernet/mellanox/mlxsw/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_MLXSW_CORE)	+= mlxsw_core.o
 mlxsw_core-objs			:= core.o core_acl_flex_keys.o \
-				   core_acl_flex_actions.o
+				   core_acl_flex_actions.o core_env.o
 mlxsw_core-$(CONFIG_MLXSW_CORE_HWMON) += core_hwmon.o
 mlxsw_core-$(CONFIG_MLXSW_CORE_THERMAL) += core_thermal.o
 obj-$(CONFIG_MLXSW_PCI)		+= mlxsw_pci.o
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_env.c b/drivers/net/ethernet/mellanox/mlxsw/core_env.c
new file mode 100644
index 0000000..fb6394d
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_env.c
@@ -0,0 +1,316 @@
+/*
+ * drivers/net/ethernet/mellanox/mlxsw/core_env.c
+ * Copyright (c) 2018 Mellanox Technologies. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/err.h>
+
+#include "core.h"
+#include "core_env.h"
+#include "item.h"
+
+union mlxsw_env_port_thresh {
+	u8 buf[MLXSW_REG_MCIA_TH_SIZE];
+	struct mlxsw_env_port_temp_th {
+		u16 temp_alarm_hi;
+		u16 temp_alarm_lo;
+		u16 temp_warn_hi;
+		u16 temp_warn_low;
+	} t;
+};
+
+static int mlxsw_env_bulk_get(struct mlxsw_core *core,
+			      int *ports_temp_cache, int port_count,
+			      bool *untrusted_sensor)
+{
+	char mtbr_pl[MLXSW_REG_MTBR_LEN];
+	int i, j, count, off;
+	u16 temp;
+	int err;
+
+	/* No untrusted sensor is assumed until the bulk read reports one. */
+	if (untrusted_sensor)
+		*untrusted_sensor = false;
+	count = 0;
+	while (count < port_count) {
+		off = min_t(int, MLXSW_REG_MTBR_REC_MAX_COUNT,
+			    port_count - count);
+		mlxsw_reg_mtbr_pack(mtbr_pl, MLXSW_REG_MTBR_BASE_PORT_INDEX +
+				    count, off);
+		err = mlxsw_reg_query(core, MLXSW_REG(mtbr), mtbr_pl);
+		if (err)
+			return err;
+
+		for (i = 0, j = count; i < off; i++, j++) {
+			mlxsw_reg_mtbr_temp_unpack(mtbr_pl, i, &temp, NULL);
+
+			/* Update status and temperature cache. */
+			switch (temp) {
+			case MLXSW_REG_MTBR_NO_CONN:
+			case MLXSW_REG_MTBR_NO_TEMP_SENS:
+			case MLXSW_REG_MTBR_INDEX_NA:
+				ports_temp_cache[j] = 0;
+				break;
+			case MLXSW_REG_MTBR_BAD_SENS_INFO:
+				/* Untrusted cable is connected. It means that
+				 * reading temperature from its sensor is
+				 * unreliable and thermal control should
+				 * consider increasing system's FAN speed
+				 * according to the system requirements.
+				 * The presence of untrusted cable is exposed
+				 * to hwmon through temp1_fault attribute.
+				 */
+				ports_temp_cache[j] = 0;
+				if (untrusted_sensor)
+					*untrusted_sensor = true;
+				break;
+			default:
+				ports_temp_cache[j] =
+					MLXSW_REG_MTMP_TEMP_TO_MC(temp);
+				break;
+			}
+		}
+		count += off;
+	}
+
+	return 0;
+}
+
+static void mlxsw_env_scale_temp(int hot, int crit, int tdelta, u8 mask,
+				 int *temp)
+{
+	int twindow;
+
+	/* Scale the port thresholds window (crit - hot) to the base window:
+	 * do nothing if the windows are equal, shrink the delta if the port
+	 * window is wider, expand it if the port window is narrower. An empty
+	 * or negative port window is left unscaled to avoid dividing by zero.
+	 */
+	twindow = crit - hot;
+	if (twindow > MLXSW_ENV_TEMP_WINDOW)
+		tdelta /= DIV_ROUND_CLOSEST(twindow, MLXSW_ENV_TEMP_WINDOW);
+	else if (twindow > 0 && twindow < MLXSW_ENV_TEMP_WINDOW)
+		tdelta *= DIV_ROUND_CLOSEST(MLXSW_ENV_TEMP_WINDOW, twindow);
+
+	switch (mask) {
+	case MLXSW_ENV_CRIT_MASK:
+		*temp = clamp_val(MLXSW_ENV_TEMP_HOT + tdelta,
+				  MLXSW_ENV_TEMP_HOT, MLXSW_ENV_TEMP_CRIT);
+		break;
+	case MLXSW_ENV_HOT_MASK:
+		*temp = clamp_val(MLXSW_ENV_TEMP_NORM + tdelta,
+				  MLXSW_ENV_TEMP_NORM, MLXSW_ENV_TEMP_HOT);
+		break;
+	default:
+		/* Don't set temperature below nominal value. */
+		tdelta %= MLXSW_ENV_TEMP_NORM;
+		*temp = clamp_val(MLXSW_ENV_TEMP_NORM - tdelta, *temp,
+				  MLXSW_ENV_TEMP_NORM);
+		break;
+	}
+}
+
+static void mlxsw_env_process_temp(int temp,
+				   struct mlxsw_env_temp_thresh *port,
+				   struct mlxsw_env_temp_thresh *delta,
+				   struct mlxsw_env_temp_multi *multi)
+{
+	int tdelta;
+
+	/* Compare each port temperature sensor value with the warning and
+	 * alarm threshold values for this port. Find the worst delta over all
+	 * the sensors, which is defined as follows:
+	 * - if value is below the warning threshold - the closest value to the
+	 *   warning threshold;
+	 * - if value is between the warning and alarm thresholds - the closest
+	 *   value to the alarm threshold;
+	 * - if value is above the alarm threshold - the value with the biggest
+	 *   delta.
+	 * The temperature value should be set according to the worst delta
+	 * with the following priority:
+	 * - if any sensor is above the alarm threshold - from the alarm;
+	 * - if any sensor is above the warning threshold - from the hot;
+	 * - from norm in any other case.
+	 */
+	if (!multi->mask && temp < port->hot) {
+		tdelta = port->hot - temp;
+		mlxsw_env_scale_temp(port->hot, port->crit, tdelta, 0, &temp);
+		if (tdelta < delta->normal) {
+			multi->thresh.normal = temp;
+			delta->normal = tdelta;
+		}
+	} else if (temp >= port->crit) {
+		tdelta = temp - port->crit;
+		mlxsw_env_scale_temp(port->hot, port->crit, tdelta,
+				     MLXSW_ENV_CRIT_MASK, &temp);
+		if (tdelta > delta->crit) {
+			multi->thresh.crit = temp;
+			delta->crit = tdelta;
+		}
+		multi->mask |= MLXSW_ENV_CRIT_MASK;
+	} else if (!(multi->mask & MLXSW_ENV_CRIT_MASK)) {
+		tdelta = temp - port->hot;
+		mlxsw_env_scale_temp(port->hot, port->crit, tdelta,
+				     MLXSW_ENV_HOT_MASK, &temp);
+		if (tdelta > delta->hot) {
+			multi->thresh.hot = temp;
+			delta->hot = tdelta;
+		}
+		multi->mask |= MLXSW_ENV_HOT_MASK;
+	}
+}
+
+static void
+mlxsw_env_finalize_temp(struct mlxsw_env_temp_thresh *delta,
+			struct mlxsw_env_temp_multi *multi, int *temp)
+{
+	/* Select the resulting temperature by the worst state recorded in
+	 * multi->mask:
+	 * - MLXSW_ENV_CRIT_MASK is set - report multi->thresh.crit;
+	 * - otherwise, MLXSW_ENV_HOT_MASK is set - report
+	 *   multi->thresh.hot;
+	 * - neither bit is set - report multi->thresh.normal.
+	 * The delta argument is not consulted here; only the mask and the
+	 * per-state values already stored in multi are read.
+	 */
+	if (multi->mask & MLXSW_ENV_CRIT_MASK)
+		*temp = multi->thresh.crit;
+	else if (multi->mask & MLXSW_ENV_HOT_MASK)
+		*temp = multi->thresh.hot;
+	else
+		*temp = multi->thresh.normal;
+}
+
+static int mlxsw_env_validate_cable_ident(struct mlxsw_core *core, int id,
+					  bool *qsfp)
+{
+	char eeprom_tmp[MLXSW_REG_MCIA_EEPROM_SIZE];
+	char mcia_pl[MLXSW_REG_MCIA_LEN];
+	u8 ident;
+	int err;
+
+	mlxsw_reg_mcia_pack(mcia_pl, id, 0, MLXSW_REG_MCIA_PAGE0_LO_OFF, 0, 1,
+			    MLXSW_REG_MCIA_I2C_ADDR_LOW);
+	err = mlxsw_reg_query(core, MLXSW_REG(mcia), mcia_pl);
+	if (err)
+		return err;
+	mlxsw_reg_mcia_eeprom_memcpy_from(mcia_pl, eeprom_tmp);
+	ident = eeprom_tmp[0];
+	switch (ident) {
+	case MLXSW_REG_MCIA_EEPROM_MODULE_INFO_ID_SFP:
+		*qsfp = false;
+		break;
+	case MLXSW_REG_MCIA_EEPROM_MODULE_INFO_ID_QSFP:
+	case MLXSW_REG_MCIA_EEPROM_MODULE_INFO_ID_QSFP_PLUS:
+	case MLXSW_REG_MCIA_EEPROM_MODULE_INFO_ID_QSFP28:
+	case MLXSW_REG_MCIA_EEPROM_MODULE_INFO_ID_QSFP_DD:
+		*qsfp = true;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+int mlxsw_env_collect_port_temp(struct mlxsw_core *core, int *ports_temp_cache,
+				int port_count,
+				struct mlxsw_env_temp_multi *multi,
+				struct mlxsw_env_temp_thresh *delta,
+				bool *untrusted_sensor, int *temp)
+{
+	char eeprom_tmp[MLXSW_REG_MCIA_EEPROM_SIZE];
+	union mlxsw_env_port_thresh thresh;
+	char mcia_pl[MLXSW_REG_MCIA_LEN];
+	struct mlxsw_env_temp_thresh curr;
+	int port_temp, i;
+	bool qsfp;
+	int err;
+
+	memset(&curr, 0, sizeof(struct mlxsw_env_temp_thresh));
+	/* Read ports temperature. */
+	err = mlxsw_env_bulk_get(core, ports_temp_cache, port_count,
+				 untrusted_sensor);
+	if (err)
+		return err;
+
+	for (i = 0; i < port_count; i++) {
+		/* Skip port with no temperature sensor */
+		if (!ports_temp_cache[i])
+			continue;
+
+		/* Read Free Side Device Temperature Thresholds from page 03h
+		 * (MSB at lower byte address).
+		 * Bytes:
+		 * 128-129 - Temp High Alarm
+		 * 130-131 - Temp Low Alarm
+		 * 132-133 - Temp High Warning
+		 * 134-135 - Temp Low Warning
+		 */
+
+		/* Validate module identifier value. */
+		err = mlxsw_env_validate_cable_ident(core, i, &qsfp);
+		if (err)
+			return err;
+
+		if (qsfp)
+			mlxsw_reg_mcia_pack(mcia_pl, i, 0,
+					    MLXSW_REG_MCIA_TH_PAGE_NUM,
+					    MLXSW_REG_MCIA_TH_PAGE_OFF,
+					    MLXSW_REG_MCIA_TH_SIZE,
+					    MLXSW_REG_MCIA_I2C_ADDR_LOW);
+		else
+			mlxsw_reg_mcia_pack(mcia_pl, i, 0,
+					    MLXSW_REG_MCIA_PAGE0_LO, 0,
+					    MLXSW_REG_MCIA_TH_SIZE,
+					    MLXSW_REG_MCIA_I2C_ADDR_HIGH);
+
+		err = mlxsw_reg_query(core, MLXSW_REG(mcia), mcia_pl);
+		if (err)
+			return err;
+
+		mlxsw_reg_mcia_eeprom_memcpy_from(mcia_pl, eeprom_tmp);
+		memcpy(thresh.buf, eeprom_tmp, MLXSW_REG_MCIA_TH_SIZE);
+		/* Skip sensor missing the high alarm or high warning value. */
+		if (!thresh.t.temp_alarm_hi || !thresh.t.temp_warn_hi)
+			continue;
+
+		port_temp = ports_temp_cache[i];
+		curr.hot = thresh.t.temp_warn_hi * 1000;
+		curr.crit = thresh.t.temp_alarm_hi * 1000;
+		mlxsw_env_process_temp(port_temp, &curr, delta, multi);
+	}
+
+	mlxsw_env_finalize_temp(delta, multi, temp);
+
+	return 0;
+}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_env.h b/drivers/net/ethernet/mellanox/mlxsw/core_env.h
new file mode 100644
index 0000000..a239d5b
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_env.h
@@ -0,0 +1,63 @@
+/*
+ * drivers/net/ethernet/mellanox/mlxsw/core_env.h
+ * Copyright (c) 2018 Mellanox Technologies. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _MLXSW_CORE_ENV_H
+#define _MLXSW_CORE_ENV_H
+
+#define MLXSW_ENV_TEMP_UNREACHABLE	150000	/* 150C */
+#define MLXSW_ENV_HOT_MASK		BIT(0)
+#define MLXSW_ENV_CRIT_MASK		BIT(1)
+#define MLXSW_ENV_TEMP_NORM		75000	/* 75C */
+#define MLXSW_ENV_TEMP_HIGH		85000	/* 85C */
+#define MLXSW_ENV_TEMP_HOT		105000	/* 105C */
+#define MLXSW_ENV_TEMP_CRIT		110000	/* 110C */
+#define MLXSW_ENV_TEMP_WINDOW		(MLXSW_ENV_TEMP_HOT - \
+					 MLXSW_ENV_TEMP_NORM)
+
+struct mlxsw_env_temp_thresh {
+	int normal;
+	int hot;
+	int crit;
+};
+
+struct mlxsw_env_temp_multi {
+	struct mlxsw_env_temp_thresh thresh;
+	u8 mask;
+};
+
+int mlxsw_env_collect_port_temp(struct mlxsw_core *core, int *ports_temp_cache,
+				int port_count,
+				struct mlxsw_env_temp_multi *multi,
+				struct mlxsw_env_temp_thresh *delta,
+				bool *untrusted_sensor, int *temp);
+#endif
-- 
2.1.4

^ permalink raw reply related

* [PATCH v0 08/12] mlxsw: core: Modify thermal zone definition
From: Vadim Pasternak @ 2018-06-21 15:28 UTC (permalink / raw)
  To: davem; +Cc: netdev, jiri, Vadim Pasternak
In-Reply-To: <1529594883-20619-1-git-send-email-vadimp@mellanox.com>

Thermal zone trip points setting is modified for better alignment with
modified thermal algorithm.
The hysteresis thresholds for thermal trips are added in order to avoid
throttling around a thermal trip point. If the hysteresis temperature is
not considered, the PWM can flip up and down at the thermal trip point
boundary.

Signed-off-by: Vadim Pasternak <vadimp@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/core_thermal.c | 63 ++++++++++++++--------
 1 file changed, 41 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
index 152591d8..91c4946 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
@@ -39,16 +39,18 @@
 #include <linux/err.h>
 
 #include "core.h"
+#include "core_env.h"
 
-#define MLXSW_THERMAL_POLL_INT	1000	/* ms */
+#define MLXSW_THERMAL_POLL_INT		1000	/* ms */
 #define MLXSW_THERMAL_SLOW_POLL_INT	20000	/* ms */
-#define MLXSW_THERMAL_MAX_TEMP	110000	/* 110C */
-#define MLXSW_THERMAL_MAX_STATE	10
-#define MLXSW_THERMAL_MAX_DUTY	255
+#define MLXSW_THERMAL_HYSTERESIS_TEMP	5000	/* 5C */
+#define MLXSW_THERMAL_MAX_STATE		10
+#define MLXSW_THERMAL_MAX_DUTY		255
 
 struct mlxsw_thermal_trip {
 	int	type;
 	int	temp;
+	int	hyst;
 	int	min_state;
 	int	max_state;
 };
@@ -56,32 +58,29 @@ struct mlxsw_thermal_trip {
 static const struct mlxsw_thermal_trip default_thermal_trips[] = {
 	{	/* In range - 0-40% PWM */
 		.type		= THERMAL_TRIP_ACTIVE,
-		.temp		= 75000,
+		.temp		= MLXSW_ENV_TEMP_NORM,
+		.hyst		= MLXSW_THERMAL_HYSTERESIS_TEMP,
 		.min_state	= 0,
 		.max_state	= (4 * MLXSW_THERMAL_MAX_STATE) / 10,
 	},
-	{	/* High - 40-100% PWM */
-		.type		= THERMAL_TRIP_ACTIVE,
-		.temp		= 80000,
-		.min_state	= (4 * MLXSW_THERMAL_MAX_STATE) / 10,
-		.max_state	= MLXSW_THERMAL_MAX_STATE,
-	},
 	{
-		/* Very high - 100% PWM */
+		/* In range - 40-100% PWM */
 		.type		= THERMAL_TRIP_ACTIVE,
-		.temp		= 85000,
-		.min_state	= MLXSW_THERMAL_MAX_STATE,
+		.temp		= MLXSW_ENV_TEMP_HIGH,
+		.hyst		= MLXSW_THERMAL_HYSTERESIS_TEMP,
+		.min_state	= (4 * MLXSW_THERMAL_MAX_STATE) / 10,
 		.max_state	= MLXSW_THERMAL_MAX_STATE,
 	},
 	{	/* Warning */
 		.type		= THERMAL_TRIP_HOT,
-		.temp		= 105000,
+		.temp		= MLXSW_ENV_TEMP_HOT,
+		.hyst		= MLXSW_THERMAL_HYSTERESIS_TEMP,
 		.min_state	= MLXSW_THERMAL_MAX_STATE,
 		.max_state	= MLXSW_THERMAL_MAX_STATE,
 	},
 	{	/* Critical - soft poweroff */
 		.type		= THERMAL_TRIP_CRITICAL,
-		.temp		= MLXSW_THERMAL_MAX_TEMP,
+		.temp		= MLXSW_ENV_TEMP_CRIT,
 		.min_state	= MLXSW_THERMAL_MAX_STATE,
 		.max_state	= MLXSW_THERMAL_MAX_STATE,
 	}
@@ -257,22 +256,42 @@ static int mlxsw_thermal_set_trip_temp(struct thermal_zone_device *tzdev,
 	struct mlxsw_thermal *thermal = tzdev->devdata;
 
 	if (trip < 0 || trip >= MLXSW_THERMAL_NUM_TRIPS ||
-	    temp > MLXSW_THERMAL_MAX_TEMP)
+	    temp > MLXSW_ENV_TEMP_CRIT)
 		return -EINVAL;
 
 	thermal->trips[trip].temp = temp;
 	return 0;
 }
 
+static int mlxsw_thermal_get_trip_hyst(struct thermal_zone_device *tzdev,
+				       int trip, int *p_hyst)
+{
+	struct mlxsw_thermal *thermal = tzdev->devdata;
+
+	*p_hyst = thermal->trips[trip].hyst;
+	return 0;
+}
+
+static int mlxsw_thermal_set_trip_hyst(struct thermal_zone_device *tzdev,
+				       int trip, int hyst)
+{
+	struct mlxsw_thermal *thermal = tzdev->devdata;
+
+	thermal->trips[trip].hyst = hyst;
+	return 0;
+}
+
 static struct thermal_zone_device_ops mlxsw_thermal_ops = {
-	.bind = mlxsw_thermal_bind,
-	.unbind = mlxsw_thermal_unbind,
-	.get_mode = mlxsw_thermal_get_mode,
-	.set_mode = mlxsw_thermal_set_mode,
-	.get_temp = mlxsw_thermal_get_temp,
+	.bind		= mlxsw_thermal_bind,
+	.unbind		= mlxsw_thermal_unbind,
+	.get_mode	= mlxsw_thermal_get_mode,
+	.set_mode	= mlxsw_thermal_set_mode,
+	.get_temp	= mlxsw_thermal_get_temp,
 	.get_trip_type	= mlxsw_thermal_get_trip_type,
 	.get_trip_temp	= mlxsw_thermal_get_trip_temp,
 	.set_trip_temp	= mlxsw_thermal_set_trip_temp,
+	.get_trip_hyst	= mlxsw_thermal_get_trip_hyst,
+	.set_trip_hyst	= mlxsw_thermal_set_trip_hyst,
 };
 
 static int mlxsw_thermal_get_max_state(struct thermal_cooling_device *cdev,
-- 
2.1.4

^ permalink raw reply related

* [PATCH v0 04/12] mlxsw: core: Extend hwmon interface with FAN fault attribute
From: Vadim Pasternak @ 2018-06-21 15:27 UTC (permalink / raw)
  To: davem; +Cc: netdev, jiri, Vadim Pasternak
In-Reply-To: <1529594883-20619-1-git-send-email-vadimp@mellanox.com>

Add new FAN hwmon attribute for exposing FAN faults (fault is set in
case FAN tachometer is below allowed minimum).

Signed-off-by: Vadim Pasternak <vadimp@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c | 62 +++++++++++++++++++++++-
 1 file changed, 60 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c b/drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c
index 84185f8..dfd7adc 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c
@@ -44,6 +44,7 @@
 #define MLXSW_HWMON_TEMP_SENSOR_MAX_COUNT 127
 #define MLXSW_HWMON_ATTR_COUNT (MLXSW_HWMON_TEMP_SENSOR_MAX_COUNT * 4 + \
 				MLXSW_MFCR_TACHOS_MAX + MLXSW_MFCR_PWMS_MAX)
+#define MLXSW_HWMON_SPEED_MAX 50000	/* RPM */
 
 struct mlxsw_hwmon_attr {
 	struct device_attribute dev_attr;
@@ -61,6 +62,7 @@ struct mlxsw_hwmon {
 	struct attribute *attrs[MLXSW_HWMON_ATTR_COUNT + 1];
 	struct mlxsw_hwmon_attr hwmon_attrs[MLXSW_HWMON_ATTR_COUNT];
 	unsigned int attrs_count;
+	u16 tach_min;
 };
 
 static ssize_t mlxsw_hwmon_temp_show(struct device *dev,
@@ -152,6 +154,28 @@ static ssize_t mlxsw_hwmon_fan_rpm_show(struct device *dev,
 	return sprintf(buf, "%u\n", mlxsw_reg_mfsm_rpm_get(mfsm_pl));
 }
 
+static ssize_t mlxsw_hwmon_fan_fault_show(struct device *dev,
+					  struct device_attribute *attr,
+					  char *buf)
+{
+	struct mlxsw_hwmon_attr *mlwsw_hwmon_attr =
+			container_of(attr, struct mlxsw_hwmon_attr, dev_attr);
+	struct mlxsw_hwmon *mlxsw_hwmon = mlwsw_hwmon_attr->hwmon;
+	char mfsm_pl[MLXSW_REG_MFSM_LEN];
+	u16 tach;
+	int err;
+
+	mlxsw_reg_mfsm_pack(mfsm_pl, mlwsw_hwmon_attr->type_index);
+	err = mlxsw_reg_query(mlxsw_hwmon->core, MLXSW_REG(mfsm), mfsm_pl);
+	if (err) {
+		dev_err(mlxsw_hwmon->bus_info->dev, "Failed to query fan\n");
+		return err;
+	}
+	tach = mlxsw_reg_mfsm_rpm_get(mfsm_pl);
+
+	return sprintf(buf, "%u\n", (tach < mlxsw_hwmon->tach_min) ? 1 : 0);
+}
+
 static ssize_t mlxsw_hwmon_pwm_show(struct device *dev,
 				    struct device_attribute *attr,
 				    char *buf)
@@ -203,6 +227,7 @@ enum mlxsw_hwmon_attr_type {
 	MLXSW_HWMON_ATTR_TYPE_TEMP_MAX,
 	MLXSW_HWMON_ATTR_TYPE_TEMP_RST,
 	MLXSW_HWMON_ATTR_TYPE_FAN_RPM,
+	MLXSW_HWMON_ATTR_TYPE_FAN_FAULT,
 	MLXSW_HWMON_ATTR_TYPE_PWM,
 };
 
@@ -240,6 +265,12 @@ static void mlxsw_hwmon_attr_add(struct mlxsw_hwmon *mlxsw_hwmon,
 		snprintf(mlxsw_hwmon_attr->name, sizeof(mlxsw_hwmon_attr->name),
 			 "fan%u_input", num + 1);
 		break;
+	case MLXSW_HWMON_ATTR_TYPE_FAN_FAULT:
+		mlxsw_hwmon_attr->dev_attr.show = mlxsw_hwmon_fan_fault_show;
+		mlxsw_hwmon_attr->dev_attr.attr.mode = 0444;
+		snprintf(mlxsw_hwmon_attr->name, sizeof(mlxsw_hwmon_attr->name),
+			 "fan%u_fault", num + 1);
+		break;
 	case MLXSW_HWMON_ATTR_TYPE_PWM:
 		mlxsw_hwmon_attr->dev_attr.show = mlxsw_hwmon_pwm_show;
 		mlxsw_hwmon_attr->dev_attr.store = mlxsw_hwmon_pwm_store;
@@ -297,9 +328,9 @@ static int mlxsw_hwmon_fans_init(struct mlxsw_hwmon *mlxsw_hwmon)
 {
 	char mfcr_pl[MLXSW_REG_MFCR_LEN] = {0};
 	enum mlxsw_reg_mfcr_pwm_frequency freq;
+	u16 tacho_active, tach_min;
 	unsigned int type_index;
 	unsigned int num;
-	u16 tacho_active;
 	u8 pwm_active;
 	int err;
 
@@ -310,11 +341,38 @@ static int mlxsw_hwmon_fans_init(struct mlxsw_hwmon *mlxsw_hwmon)
 	}
 	mlxsw_reg_mfcr_unpack(mfcr_pl, &freq, &tacho_active, &pwm_active);
 	num = 0;
+	/* Set tachometer to maximum value as the initial seed. */
+	mlxsw_hwmon->tach_min = MLXSW_HWMON_SPEED_MAX;
 	for (type_index = 0; type_index < MLXSW_MFCR_TACHOS_MAX; type_index++) {
-		if (tacho_active & BIT(type_index))
+		if (tacho_active & BIT(type_index)) {
+			char mfsl_pl[MLXSW_REG_MFSL_LEN] = {0};
+
 			mlxsw_hwmon_attr_add(mlxsw_hwmon,
 					     MLXSW_HWMON_ATTR_TYPE_FAN_RPM,
+					     type_index, num);
+			mlxsw_hwmon_attr_add(mlxsw_hwmon,
+					     MLXSW_HWMON_ATTR_TYPE_FAN_FAULT,
 					     type_index, num++);
+			/* Get tachometer minimum value. */
+			mlxsw_reg_mfsl_pack(mfsl_pl, type_index, 0, 0);
+			err = mlxsw_reg_query(mlxsw_hwmon->core,
+					      MLXSW_REG(mfsl), mfsl_pl);
+			if (err) {
+				dev_err(mlxsw_hwmon->bus_info->dev, "Failed to query tachometer %d\n",
+					type_index);
+				return err;
+			}
+
+			tach_min = mlxsw_reg_mfsl_tach_min_get(mfsl_pl);
+			/* Store absolute minimal value of all tachometers for
+			 * alarm indication, because forward FANs could be
+			 * replaced with reversed and vice versa and in such
+			 * case the minimum values could be flipped.
+			 */
+			mlxsw_hwmon->tach_min = min_t(u16,
+						      mlxsw_hwmon->tach_min,
+						      tach_min);
+		}
 	}
 	num = 0;
 	for (type_index = 0; type_index < MLXSW_MFCR_PWMS_MAX; type_index++) {
-- 
2.1.4

^ permalink raw reply related

* [PATCH v0 06/12] mlxsw: core: Add bus frequency capability flag for the bus type
From: Vadim Pasternak @ 2018-06-21 15:28 UTC (permalink / raw)
  To: davem; +Cc: netdev, jiri, Vadim Pasternak
In-Reply-To: <1529594883-20619-1-git-send-email-vadimp@mellanox.com>

Add low frequency bus capability in order to allow core functionality
separation based on bus type. The driver can run over PCIe, which is
considered a high frequency bus, or over I2C, which is considered a low
frequency bus. In the latter case time settings, for example the thermal
polling interval, should be increased.

Signed-off-by: Vadim Pasternak <vadimp@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/core.h | 1 +
 drivers/net/ethernet/mellanox/mlxsw/i2c.c  | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
index 552cfa2..95e6190 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -371,6 +371,7 @@ struct mlxsw_bus_info {
 	struct mlxsw_fw_rev fw_rev;
 	u8 vsd[MLXSW_CMD_BOARDINFO_VSD_LEN];
 	u8 psid[MLXSW_CMD_BOARDINFO_PSID_LEN];
+	bool low_frequency;
 };
 
 struct mlxsw_hwmon;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/i2c.c b/drivers/net/ethernet/mellanox/mlxsw/i2c.c
index 25f9915..384b337 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/i2c.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/i2c.c
@@ -536,6 +536,7 @@ static int mlxsw_i2c_probe(struct i2c_client *client,
 	mlxsw_i2c->bus_info.device_kind = id->name;
 	mlxsw_i2c->bus_info.device_name = client->name;
 	mlxsw_i2c->bus_info.dev = &client->dev;
+	mlxsw_i2c->bus_info.low_frequency = true;
 	mlxsw_i2c->dev = &client->dev;
 
 	err = mlxsw_core_bus_device_register(&mlxsw_i2c->bus_info,
-- 
2.1.4

^ permalink raw reply related

* [PATCH v0 07/12] mlxsw: core: Set different thermal polling time based on bus type
From: Vadim Pasternak @ 2018-06-21 15:28 UTC (permalink / raw)
  To: davem; +Cc: netdev, jiri, Vadim Pasternak
In-Reply-To: <1529594883-20619-1-git-send-email-vadimp@mellanox.com>

Use a different thermal polling interval based on bus type.
For the I2C bus the interval is set to 20 seconds, while for PCIe a 1 second
polling interval is used.

Signed-off-by: Vadim Pasternak <vadimp@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/core_thermal.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
index d866c98..152591d8 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
@@ -41,6 +41,7 @@
 #include "core.h"
 
 #define MLXSW_THERMAL_POLL_INT	1000	/* ms */
+#define MLXSW_THERMAL_SLOW_POLL_INT	20000	/* ms */
 #define MLXSW_THERMAL_MAX_TEMP	110000	/* 110C */
 #define MLXSW_THERMAL_MAX_STATE	10
 #define MLXSW_THERMAL_MAX_DUTY	255
@@ -95,6 +96,7 @@ struct mlxsw_thermal {
 	struct mlxsw_core *core;
 	const struct mlxsw_bus_info *bus_info;
 	struct thermal_zone_device *tzdev;
+	int polling_delay;
 	struct thermal_cooling_device *cdevs[MLXSW_MFCR_PWMS_MAX];
 	struct mlxsw_thermal_trip trips[MLXSW_THERMAL_NUM_TRIPS];
 	enum thermal_device_mode mode;
@@ -190,7 +192,7 @@ static int mlxsw_thermal_set_mode(struct thermal_zone_device *tzdev,
 	mutex_lock(&tzdev->lock);
 
 	if (mode == THERMAL_DEVICE_ENABLED)
-		tzdev->polling_delay = MLXSW_THERMAL_POLL_INT;
+		tzdev->polling_delay = thermal->polling_delay;
 	else
 		tzdev->polling_delay = 0;
 
@@ -397,13 +399,18 @@ int mlxsw_thermal_init(struct mlxsw_core *core,
 		}
 	}
 
+	if (bus_info->low_frequency)
+		thermal->polling_delay = MLXSW_THERMAL_SLOW_POLL_INT;
+	else
+		thermal->polling_delay = MLXSW_THERMAL_POLL_INT;
+
 	thermal->tzdev = thermal_zone_device_register("mlxsw",
 						      MLXSW_THERMAL_NUM_TRIPS,
 						      MLXSW_THERMAL_TRIP_MASK,
 						      thermal,
 						      &mlxsw_thermal_ops,
 						      NULL, 0,
-						      MLXSW_THERMAL_POLL_INT);
+						      thermal->polling_delay);
 	if (IS_ERR(thermal->tzdev)) {
 		err = PTR_ERR(thermal->tzdev);
 		dev_err(dev, "Failed to register thermal zone\n");
-- 
2.1.4

^ permalink raw reply related

* [PATCH v0 05/12] mlxsw: core: Extend hwmon interface with port temperature attributes
From: Vadim Pasternak @ 2018-06-21 15:27 UTC (permalink / raw)
  To: davem; +Cc: netdev, jiri, Vadim Pasternak
In-Reply-To: <1529594883-20619-1-git-send-email-vadimp@mellanox.com>

Add new attributes to the hwmon object for exposing the accumulative ports
temperature input and the accumulative port temperature fault (the fault is
set if any of the sensors is untrusted).

Signed-off-by: Vadim Pasternak <vadimp@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c | 102 +++++++++++++++++++++++
 1 file changed, 102 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c b/drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c
index dfd7adc..ac28e6c 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_hwmon.c
@@ -40,6 +40,7 @@
 #include <linux/err.h>
 
 #include "core.h"
+#include "core_env.h"
 
 #define MLXSW_HWMON_TEMP_SENSOR_MAX_COUNT 127
 #define MLXSW_HWMON_ATTR_COUNT (MLXSW_HWMON_TEMP_SENSOR_MAX_COUNT * 4 + \
@@ -63,6 +64,9 @@ struct mlxsw_hwmon {
 	struct mlxsw_hwmon_attr hwmon_attrs[MLXSW_HWMON_ATTR_COUNT];
 	unsigned int attrs_count;
 	u16 tach_min;
+	int *ports_temp_cache;
+	int count;
+	bool untrusted_sensor;
 };
 
 static ssize_t mlxsw_hwmon_temp_show(struct device *dev,
@@ -222,6 +226,47 @@ static ssize_t mlxsw_hwmon_pwm_store(struct device *dev,
 	return len;
 }
 
+static ssize_t mlxsw_hwmon_port_temp_show(struct device *dev,
+					  struct device_attribute *attr,
+					  char *buf)
+{
+	struct mlxsw_hwmon_attr *mlwsw_hwmon_attr =
+			container_of(attr, struct mlxsw_hwmon_attr, dev_attr);
+	struct mlxsw_hwmon *mlxsw_hwmon = mlwsw_hwmon_attr->hwmon;
+	struct mlxsw_env_temp_multi multi;
+	struct mlxsw_env_temp_thresh delta;
+	int temp;
+	int err;
+
+	memset(&multi, 0, sizeof(struct mlxsw_env_temp_multi));
+	memset(&delta, 0, sizeof(struct mlxsw_env_temp_thresh));
+	/* Set initial value for normal temperature to unreachable value. */
+	delta.normal = MLXSW_ENV_TEMP_UNREACHABLE;
+	/* Collect ports temperature */
+	err = mlxsw_env_collect_port_temp(mlxsw_hwmon->core,
+					  mlxsw_hwmon->ports_temp_cache,
+					  mlxsw_hwmon->count, &multi, &delta,
+					  &mlxsw_hwmon->untrusted_sensor,
+					  &temp);
+	if (err) {
+		dev_err(mlxsw_hwmon->bus_info->dev, "Failed to query port temp\n");
+		return err;
+	}
+
+	return sprintf(buf, "%u\n", temp);
+}
+
+static ssize_t mlxsw_hwmon_port_temp_fault_show(struct device *dev,
+						struct device_attribute *attr,
+						char *buf)
+{
+	struct mlxsw_hwmon_attr *mlwsw_hwmon_attr =
+			container_of(attr, struct mlxsw_hwmon_attr, dev_attr);
+	struct mlxsw_hwmon *mlxsw_hwmon = mlwsw_hwmon_attr->hwmon;
+
+	return sprintf(buf, "%u\n", mlxsw_hwmon->untrusted_sensor ? 1 : 0);
+}
+
 enum mlxsw_hwmon_attr_type {
 	MLXSW_HWMON_ATTR_TYPE_TEMP,
 	MLXSW_HWMON_ATTR_TYPE_TEMP_MAX,
@@ -229,6 +274,8 @@ enum mlxsw_hwmon_attr_type {
 	MLXSW_HWMON_ATTR_TYPE_FAN_RPM,
 	MLXSW_HWMON_ATTR_TYPE_FAN_FAULT,
 	MLXSW_HWMON_ATTR_TYPE_PWM,
+	MLXSW_HWMON_ATTR_TYPE_TEMP_PORT,
+	MLXSW_HWMON_ATTR_TYPE_TEMP_PORT_FAULT,
 };
 
 static void mlxsw_hwmon_attr_add(struct mlxsw_hwmon *mlxsw_hwmon,
@@ -278,6 +325,19 @@ static void mlxsw_hwmon_attr_add(struct mlxsw_hwmon *mlxsw_hwmon,
 		snprintf(mlxsw_hwmon_attr->name, sizeof(mlxsw_hwmon_attr->name),
 			 "pwm%u", num + 1);
 		break;
+	case MLXSW_HWMON_ATTR_TYPE_TEMP_PORT:
+		mlxsw_hwmon_attr->dev_attr.show = mlxsw_hwmon_port_temp_show;
+		mlxsw_hwmon_attr->dev_attr.attr.mode = 0444;
+		snprintf(mlxsw_hwmon_attr->name, sizeof(mlxsw_hwmon_attr->name),
+			 "temp%u_input", num + 1);
+		break;
+	case MLXSW_HWMON_ATTR_TYPE_TEMP_PORT_FAULT:
+		mlxsw_hwmon_attr->dev_attr.show =
+					mlxsw_hwmon_port_temp_fault_show;
+		mlxsw_hwmon_attr->dev_attr.attr.mode = 0444;
+		snprintf(mlxsw_hwmon_attr->name, sizeof(mlxsw_hwmon_attr->name),
+			 "temp%u_fault", num + 1);
+		break;
 	default:
 		WARN_ON(1);
 	}
@@ -384,6 +444,43 @@ static int mlxsw_hwmon_fans_init(struct mlxsw_hwmon *mlxsw_hwmon)
 	return 0;
 }
 
+static int mlxsw_hwmon_port_init(struct mlxsw_hwmon *mlxsw_hwmon)
+{
+	unsigned int max_ports = mlxsw_core_max_ports(mlxsw_hwmon->core);
+	struct device *dev = mlxsw_hwmon->bus_info->dev;
+	char mtcap_pl[MLXSW_REG_MTCAP_LEN] = {0};
+	u8 sensor_count;
+	int err;
+
+	mlxsw_hwmon->ports_temp_cache = devm_kmalloc_array(dev, max_ports,
+							   sizeof(int),
+							   GFP_KERNEL);
+	if (!mlxsw_hwmon->ports_temp_cache)
+		return -ENOMEM;
+	mlxsw_hwmon->count = max_ports;
+
+	err = mlxsw_reg_query(mlxsw_hwmon->core, MLXSW_REG(mtcap), mtcap_pl);
+	if (err) {
+		dev_err(mlxsw_hwmon->bus_info->dev, "Failed to get number of temp sensors\n");
+		return err;
+	}
+	/* Add extra attributes for port temperature - one attribute for the
+	 * cumulative temperature measurement and one attribute for the
+	 * cumulative temperature fault status. Sensor index will be assigned
+	 * to sensor_count value, while all indexes before sensor_count are
+	 * already utilized by the sensors connected through mtmp register by
+	 * mlxsw_hwmon_temp_init().
+	 */
+	sensor_count = mlxsw_reg_mtcap_sensor_count_get(mtcap_pl);
+	mlxsw_hwmon_attr_add(mlxsw_hwmon, MLXSW_HWMON_ATTR_TYPE_TEMP_PORT,
+			     sensor_count, sensor_count);
+	mlxsw_hwmon_attr_add(mlxsw_hwmon,
+			     MLXSW_HWMON_ATTR_TYPE_TEMP_PORT_FAULT,
+			     sensor_count, sensor_count);
+
+	return 0;
+}
+
 int mlxsw_hwmon_init(struct mlxsw_core *mlxsw_core,
 		     const struct mlxsw_bus_info *mlxsw_bus_info,
 		     struct mlxsw_hwmon **p_hwmon)
@@ -407,6 +504,10 @@ int mlxsw_hwmon_init(struct mlxsw_core *mlxsw_core,
 	if (err)
 		goto err_fans_init;
 
+	err = mlxsw_hwmon_port_init(mlxsw_hwmon);
+	if (err)
+		goto err_temp_port_init;
+
 	mlxsw_hwmon->groups[0] = &mlxsw_hwmon->group;
 	mlxsw_hwmon->group.attrs = mlxsw_hwmon->attrs;
 
@@ -424,6 +525,7 @@ int mlxsw_hwmon_init(struct mlxsw_core *mlxsw_core,
 	return 0;
 
 err_hwmon_register:
+err_temp_port_init:
 err_fans_init:
 err_temp_init:
 	return err;
-- 
2.1.4

^ permalink raw reply related

* [PATCH v0 09/12] mlxsw: core: Extend thermal zone operations with get_trend method
From: Vadim Pasternak @ 2018-06-21 15:28 UTC (permalink / raw)
  To: davem; +Cc: netdev, jiri, Vadim Pasternak
In-Reply-To: <1529594883-20619-1-git-send-email-vadimp@mellanox.com>

The thermal get_trend method is added in order to notify the user in case
of a fast temperature drop. This could happen when one or a few very hot
port cables are removed. In such a situation the temperature trend could
go down once and then stay in a stable state, while the PWM state will be
decreased only once and could stay in a non-optimal high state.
The notification allows the user to take appropriate action if
necessary.

Signed-off-by: Vadim Pasternak <vadimp@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/core_thermal.c | 27 ++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
index 91c4946..1587820 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
@@ -281,6 +281,32 @@ static int mlxsw_thermal_set_trip_hyst(struct thermal_zone_device *tzdev,
 	return 0;
 }
 
+static int mlxsw_thermal_get_trend(struct thermal_zone_device *tzdev,
+				   int trip, enum thermal_trend *trend)
+{
+	int delta;
+
+	if (trip < 0 || trip >= MLXSW_THERMAL_NUM_TRIPS)
+		return -EINVAL;
+
+	delta = tzdev->last_temperature - tzdev->temperature;
+	if (delta > MLXSW_ENV_TEMP_WINDOW) {
+		/* Notify user about fast temperature decreasing by sending
+		 * hwmon uevent. Decreasing could happen in case one or few
+		 * very hot port cables have been removed. In this situation
+		 * temperature trend could go down once, and then could stay
+		 * in a stable state, while PWM state will be decreased only
+		 * once. As a side effect PWM could be not at optimal speed.
+		 * Notification will allow user to handle such case, if user
+		 * supposes to optimize PWM state.
+		 */
+		kobject_uevent(&tzdev->device.kobj, KOBJ_CHANGE);
+	}
+
+	/* Return non-zero value to pass control to get_tz_trend() routine. */
+	return 1;
+}
+
 static struct thermal_zone_device_ops mlxsw_thermal_ops = {
 	.bind		= mlxsw_thermal_bind,
 	.unbind		= mlxsw_thermal_unbind,
@@ -292,6 +318,7 @@ static struct thermal_zone_device_ops mlxsw_thermal_ops = {
 	.set_trip_temp	= mlxsw_thermal_set_trip_temp,
 	.get_trip_hyst	= mlxsw_thermal_get_trip_hyst,
 	.set_trip_hyst	= mlxsw_thermal_set_trip_hyst,
+	.get_trend	= mlxsw_thermal_get_trend,
 };
 
 static int mlxsw_thermal_get_max_state(struct thermal_cooling_device *cdev,
-- 
2.1.4

^ permalink raw reply related

* [PATCH v0 10/12] mlxsw: core: Extend cooling device with cooling levels
From: Vadim Pasternak @ 2018-06-21 15:28 UTC (permalink / raw)
  To: davem; +Cc: netdev, jiri, Vadim Pasternak

Extend cooling device with cooling levels vector to allow more
flexibility of PWM setting.
Thermal zone algorithm operates with the numerical states for PWM
setting. Each state is the index, defined in range from 0 to 10 and
it's mapped to the relevant duty cycle value, which is written to PWM
controller. With the current definition FAN speed is set to 0% for
state 0, 10% for state 1, and so on up to 100% for the maximum state
10.
Some systems have a limitation on the minimum PWM speed. For such
systems, setting the PWM speed to 0% will just disable the ability to
increase the speed anymore, and such a device will stall at zero speed.
Cooling levels allow configuring the state vector according to the
particular system requirements. For example, if the PWM speed is not
allowed to be below 30%, cooling levels could be configured as 30%,
30%, 30%, 30%, 40%, 50% and so on.

Signed-off-by: Vadim Pasternak <vadimp@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/core_thermal.c | 59 +++++++++++++++++++++-
 1 file changed, 58 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
index 1587820..53e4ef9 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
@@ -46,6 +46,15 @@
 #define MLXSW_THERMAL_HYSTERESIS_TEMP	5000	/* 5C */
 #define MLXSW_THERMAL_MAX_STATE		10
 #define MLXSW_THERMAL_MAX_DUTY		255
+/* Minimum and maximum FAN allowed speed in percent: from 20% to 100%. Values
+ * MLXSW_THERMAL_MAX_STATE + x, where x is between 2 and 10 are used for
+ * setting FAN speed dynamic minimum. For example, if value is set to 14 (40%)
+ * cooling levels vector will be set to 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10 to
+ * introduce PWM speed in percent: 40, 40, 40, 40, 40, 50, 60, 70, 80, 90, 100.
+ */
+#define MLXSW_THERMAL_SPEED_MIN		(MLXSW_THERMAL_MAX_STATE + 2)
+#define MLXSW_THERMAL_SPEED_MAX		(MLXSW_THERMAL_MAX_STATE * 2)
+#define MLXSW_THERMAL_SPEED_MIN_LEVEL	2	/* 20 percent */
 
 struct mlxsw_thermal_trip {
 	int	type;
@@ -97,6 +106,7 @@ struct mlxsw_thermal {
 	struct thermal_zone_device *tzdev;
 	int polling_delay;
 	struct thermal_cooling_device *cdevs[MLXSW_MFCR_PWMS_MAX];
+	u8 cooling_levels[MLXSW_THERMAL_MAX_STATE + 1];
 	struct mlxsw_thermal_trip trips[MLXSW_THERMAL_NUM_TRIPS];
 	enum thermal_device_mode mode;
 };
@@ -361,12 +371,52 @@ static int mlxsw_thermal_set_cur_state(struct thermal_cooling_device *cdev,
 	struct mlxsw_thermal *thermal = cdev->devdata;
 	struct device *dev = thermal->bus_info->dev;
 	char mfsc_pl[MLXSW_REG_MFSC_LEN];
-	int err, idx;
+	unsigned long cur_state;
+	int idx, i;
+	u8 duty;
+	int err;
 
 	idx = mlxsw_get_cooling_device_idx(thermal, cdev);
 	if (idx < 0)
 		return idx;
 
+	/* Verify if this request is for changing allowed FAN dynamical
+	 * minimum. If it is - update cooling levels accordingly and update
+	 * state, if current state is below the newly requested minimum state.
+	 * For example, if current state is 5, and minimal state is to be
+	 * changed from 4 to 6, thermal->cooling_levels[0 to 5] will be changed
+	 * all from 4 to 6. And state 5 (thermal->cooling_levels[4]) should be
+	 * overwritten.
+	 */
+	if (state >= MLXSW_THERMAL_SPEED_MIN &&
+	    state <= MLXSW_THERMAL_SPEED_MAX) {
+		state -= MLXSW_THERMAL_MAX_STATE;
+		for (i = 0; i < state; i++)
+			thermal->cooling_levels[i] = state;
+		for (i = state; i <= MLXSW_THERMAL_MAX_STATE; i++)
+			thermal->cooling_levels[i] = i;
+
+		mlxsw_reg_mfsc_pack(mfsc_pl, idx, 0);
+		err = mlxsw_reg_query(thermal->core, MLXSW_REG(mfsc), mfsc_pl);
+		if (err) {
+			dev_err(dev, "Failed to query PWM duty\n");
+			return err;
+		}
+
+		duty = mlxsw_reg_mfsc_pwm_duty_cycle_get(mfsc_pl);
+		cur_state = mlxsw_duty_to_state(duty);
+
+		if (state < cur_state)
+			return 0;
+
+		state = cur_state;
+	}
+
+	if (state > MLXSW_THERMAL_MAX_STATE)
+		return -EINVAL;
+
+	/* Normalize the state to the valid speed range. */
+	state = thermal->cooling_levels[state];
 	mlxsw_reg_mfsc_pack(mfsc_pl, idx, mlxsw_state_to_duty(state));
 	err = mlxsw_reg_write(thermal->core, MLXSW_REG(mfsc), mfsc_pl);
 	if (err) {
@@ -445,6 +495,13 @@ int mlxsw_thermal_init(struct mlxsw_core *core,
 		}
 	}
 
+	/* Init cooling levels per PWM state. */
+	for (i = 0; i < MLXSW_THERMAL_SPEED_MIN_LEVEL; i++)
+		thermal->cooling_levels[i] = MLXSW_THERMAL_SPEED_MIN_LEVEL;
+	for (i = MLXSW_THERMAL_SPEED_MIN_LEVEL;
+	     i <= MLXSW_THERMAL_MAX_STATE; i++)
+		thermal->cooling_levels[i] = i;
+
 	if (bus_info->low_frequency)
 		thermal->polling_delay = MLXSW_THERMAL_SLOW_POLL_INT;
 	else
-- 
2.1.4

^ permalink raw reply related

* [PATCH v0 12/12] mlxsw: core: Add ports temperature measurement to thermal algorithm
From: Vadim Pasternak @ 2018-06-21 15:28 UTC (permalink / raw)
  To: davem; +Cc: netdev, jiri, Vadim Pasternak
In-Reply-To: <1529594915-20741-1-git-send-email-vadimp@mellanox.com>

Ports temperature has most significant impact on system thermal state
and should be considered by the thermal algorithm. The thermal zone
temperature is extended for reading ports temperatures along with a
chip temperature. The temperature value, provided to the core thermal
algorithm will be accumulated value of a chip and ports temperature
sensing, normalized according to the basic constant thresholds.

Signed-off-by: Vadim Pasternak <vadimp@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/core_thermal.c | 66 ++++++++++++++++++++--
 1 file changed, 62 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
index 65962ed..23d6197 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
@@ -109,6 +109,8 @@ struct mlxsw_thermal {
 	u8 cooling_levels[MLXSW_THERMAL_MAX_STATE + 1];
 	struct mlxsw_thermal_trip trips[MLXSW_THERMAL_NUM_TRIPS];
 	enum thermal_device_mode mode;
+	int count;
+	int *ports_temp_cache;
 };
 
 static inline u8 mlxsw_state_to_duty(int state)
@@ -213,10 +215,11 @@ static int mlxsw_thermal_set_mode(struct thermal_zone_device *tzdev,
 	return 0;
 }
 
-static int mlxsw_thermal_get_temp(struct thermal_zone_device *tzdev,
-				  int *p_temp)
+static int mlxsw_thermal_init_temp(struct mlxsw_thermal *thermal,
+				   struct mlxsw_env_temp_thresh *delta,
+				   struct mlxsw_env_temp_multi *multi,
+				   int *p_temp, bool *p_crit)
 {
-	struct mlxsw_thermal *thermal = tzdev->devdata;
 	struct device *dev = thermal->bus_info->dev;
 	char mtmp_pl[MLXSW_REG_MTMP_LEN];
 	unsigned int temp;
@@ -231,10 +234,58 @@ static int mlxsw_thermal_get_temp(struct thermal_zone_device *tzdev,
 	}
 	mlxsw_reg_mtmp_unpack(mtmp_pl, &temp, NULL, NULL);
 
-	*p_temp = (int) temp;
+	if (temp >= MLXSW_ENV_TEMP_CRIT) {
+		*p_crit = true;
+	} else if (temp < MLXSW_ENV_TEMP_NORM) {
+		multi->thresh.normal = temp;
+		delta->normal = MLXSW_ENV_TEMP_NORM - temp;
+	} else if (temp >= MLXSW_ENV_TEMP_HOT) {
+		multi->thresh.crit = temp;
+		delta->crit = temp - MLXSW_ENV_TEMP_HOT;
+		multi->mask |= MLXSW_ENV_CRIT_MASK;
+	} else {
+		multi->thresh.hot = temp;
+		delta->hot = temp - MLXSW_ENV_TEMP_NORM;
+		multi->mask |= MLXSW_ENV_HOT_MASK;
+	}
+	*p_temp = temp;
+
 	return 0;
 }
 
+static int mlxsw_thermal_get_temp(struct thermal_zone_device *tzdev,
+				  int *p_temp)
+{
+	struct mlxsw_thermal *thermal = tzdev->devdata;
+	struct device *dev = thermal->bus_info->dev;
+	struct mlxsw_env_temp_multi multi;
+	struct mlxsw_env_temp_thresh delta;
+	bool crit = false;
+	int err;
+
+	memset(&multi, 0, sizeof(struct mlxsw_env_temp_multi));
+	memset(&delta, 0, sizeof(struct mlxsw_env_temp_thresh));
+	/* Read ASIC temperature */
+	err = mlxsw_thermal_init_temp(thermal, &delta, &multi,
+				      p_temp, &crit);
+	if (err) {
+		dev_err(dev, "Failed to query ASIC temp sensor\n");
+		return err;
+	}
+
+	/* No need to proceed with ports temperature reading, since a critical
+	 * ASIC temperature should result in system shutdown.
+	 */
+	if (crit)
+		return 0;
+
+	/* Collect ports temperature */
+	return mlxsw_env_collect_port_temp(thermal->core,
+					   thermal->ports_temp_cache,
+					   thermal->count, &multi, &delta,
+					   NULL, p_temp);
+}
+
 static int mlxsw_thermal_get_trip_type(struct thermal_zone_device *tzdev,
 				       int trip,
 				       enum thermal_trip_type *p_type)
@@ -436,6 +487,7 @@ int mlxsw_thermal_init(struct mlxsw_core *core,
 		       const struct mlxsw_bus_info *bus_info,
 		       struct mlxsw_thermal **p_thermal)
 {
+	unsigned int max_ports = mlxsw_core_max_ports(core);
 	char mfcr_pl[MLXSW_REG_MFCR_LEN] = { 0 };
 	enum mlxsw_reg_mfcr_pwm_frequency freq;
 	struct device *dev = bus_info->dev;
@@ -452,6 +504,12 @@ int mlxsw_thermal_init(struct mlxsw_core *core,
 	thermal->core = core;
 	thermal->bus_info = bus_info;
 	memcpy(thermal->trips, default_thermal_trips, sizeof(thermal->trips));
+	thermal->ports_temp_cache = devm_kmalloc_array(dev, max_ports,
+						       sizeof(int),
+						       GFP_KERNEL);
+	if (!thermal->ports_temp_cache)
+		return -ENOMEM;
+	thermal->count = max_ports;
 
 	err = mlxsw_reg_query(thermal->core, MLXSW_REG(mfcr), mfcr_pl);
 	if (err) {
-- 
2.1.4

^ permalink raw reply related

* [PATCH v0 11/12] mlxsw: core: Rename cooling device
From: Vadim Pasternak @ 2018-06-21 15:28 UTC (permalink / raw)
  To: davem; +Cc: netdev, jiri, Vadim Pasternak
In-Reply-To: <1529594915-20741-1-git-send-email-vadimp@mellanox.com>

The name "Fan" is too generic, and such a name can be misleading when it
is interpreted by the user.
For example, the name "Fan" could also be used by ACPI.

Signed-off-by: Vadim Pasternak <vadimp@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/core_thermal.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
index 53e4ef9..65962ed 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
@@ -484,7 +484,8 @@ int mlxsw_thermal_init(struct mlxsw_core *core,
 		if (pwm_active & BIT(i)) {
 			struct thermal_cooling_device *cdev;
 
-			cdev = thermal_cooling_device_register("Fan", thermal,
+			cdev = thermal_cooling_device_register("mlxsw_fan",
+							thermal,
 							&mlxsw_cooling_ops);
 			if (IS_ERR(cdev)) {
 				err = PTR_ERR(cdev);
-- 
2.1.4

^ permalink raw reply related

* Re: [GIT] Networking
From: Matteo Croce @ 2018-06-21 13:40 UTC (permalink / raw)
  To: mingo
  Cc: David S . Miller, alexei.starovoitov, sfr, torvalds, akpm, netdev,
	linux-kernel, tglx
In-Reply-To: <20180621084510.GA22870@gmail.com>

On Thu, Jun 21, 2018 at 8:46 AM Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * David Miller <davem@davemloft.net> wrote:
>
> > 1) Fix crash on bpf_prog_load() errors, from Daniel Borkmann.
>
> > Daniel Borkmann (4):
> >       Merge branch 'bpf-misc-fixes'
> >       bpf: fix panic in prog load calls cleanup
> >       bpf: reject any prog that failed read-only lock
> >       bpf, xdp, i40e: fix i40e_build_skb skb reserve and truesize
>
> JFYI, I'm still seeing this BPF build error upstream, on a 32-bit allyesconfig I'm
> getting:
>
>   LD      vmlinux.o
>   ld: i386:x86-64 architecture of input file `net/bpfilter/bpfilter_umh.o' is incompatible with i386 output
>   Makefile:1010: recipe for target 'vmlinux' failed
>   make: *** [vmlinux] Error 1
>
> A similar looking build bug was reported by sfr three weeks ago:
>
> > Subject: linux-next: build failure after merge of the net-next tree
> >
> > ...
> >
> > x86_64-linux-ld: unknown architecture of input file `net/bpfilter/bpfilter_umh.o'
> > is incompatible with i386:x86-64 output
> >
> > Caused by commit
> >
> >  d2ba09c17a06 ("net: add skeleton of bpfilter kernel module")
> >
> > In my builds, the host is PowerPC 64 LE ...
> >
> > I have reverted that commit along with
> >
> >  61a552eb487f ("bpfilter: fix build dependency")
> >  13405468f49d ("bpfilter: don't pass O_CREAT when opening console for debug")
> >
> > for today.
>
> Is there a fix I could try?
>
> Thanks,
>
>         Ingo

Hi Ingo,

are you compiling a 32 bit kernel on an x86_64 host? I tried to
compile an i386 bit kernel on an i386 host and I have no issue,
running objdump by hand produces correct output:

$ uname -a
Linux debian32 4.16.0-2-686-pae #1 SMP Debian 4.16.16-1 (2018-06-19)
i686 GNU/Linux
$ objdump -f net/bpfilter/bpfilter_umh |awk -F' |,' '/file
format/{print "-O",$NF} /^architecture:/{print "-B",$2}'
-O elf32-i386
-B i386

then I tried to compile an i386 kernel on an x86_64 host and I get the
same error:

$ make -j8 ARCH=i386
...
  LD      vmlinux.o
ld: i386:x86-64 architecture of input file
`net/bpfilter/bpfilter_umh.o' is incompatible with i386 output
make: *** [Makefile:1015: vmlinux] Error 1

the problem seems to be that bpfilter_umh is compiled with host flags,
and so it's a 64 bit binary in my case:

gcc  -static -o net/bpfilter/bpfilter_umh net/bpfilter/main.o
objcopy -I binary `LC_ALL=C objdump -f net/bpfilter/bpfilter_umh |awk
-F' |,' '/file format/{print "-O",$NF} /^architecture:/{print
"-B",$2}'` --rename-section .data=.init.rodata
net/bpfilter/bpfilter_umh net/bpfilter/bpfilter_umh.o
ld -m elf_i386 -r -o vmlinux.o --whole-archive built-in.a
--no-whole-archive --start-group lib/lib.a arch/x86/lib/lib.a
--end-group
ld: i386:x86-64 architecture of input file
`net/bpfilter/bpfilter_umh.o' is incompatible with i386 output

Any idea how to fix it without building it twice, for host and target?
-- 
Matteo Croce
per aspera ad upstream

^ permalink raw reply

* Re: [GIT] Networking
From: Stephen Rothwell @ 2018-06-21 13:46 UTC (permalink / raw)
  To: Matteo Croce
  Cc: mingo, David S . Miller, alexei.starovoitov, torvalds, akpm,
	netdev, linux-kernel, tglx
In-Reply-To: <CAGnkfhxGAYZNhJp7eyg+_j3LY31w7muFqerhQp7jGqQ02iFxkg@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 3525 bytes --]

Hi Matteo,

On Thu, 21 Jun 2018 13:40:43 +0000 Matteo Croce <mcroce@redhat.com> wrote:
>
> On Thu, Jun 21, 2018 at 8:46 AM Ingo Molnar <mingo@kernel.org> wrote:
> >
> > * David Miller <davem@davemloft.net> wrote:
> >  
> > > 1) Fix crash on bpf_prog_load() errors, from Daniel Borkmann.  
> >  
> > > Daniel Borkmann (4):
> > >       Merge branch 'bpf-misc-fixes'
> > >       bpf: fix panic in prog load calls cleanup
> > >       bpf: reject any prog that failed read-only lock
> > >       bpf, xdp, i40e: fix i40e_build_skb skb reserve and truesize  
> >
> > JFYI, I'm still seeing this BPF build error upstream, on a 32-bit allyesconfig I'm
> > getting:
> >
> >   LD      vmlinux.o
> >   ld: i386:x86-64 architecture of input file `net/bpfilter/bpfilter_umh.o' is incompatible with i386 output
> >   Makefile:1010: recipe for target 'vmlinux' failed
> >   make: *** [vmlinux] Error 1
> >
> > A similar looking build bug was reported by sfr three weeks ago:
> >  
> > > Subject: linux-next: build failure after merge of the net-next tree
> > >
> > > ...
> > >
> > > x86_64-linux-ld: unknown architecture of input file `net/bpfilter/bpfilter_umh.o'
> > > is incompatible with i386:x86-64 output
> > >
> > > Caused by commit
> > >
> > >  d2ba09c17a06 ("net: add skeleton of bpfilter kernel module")
> > >
> > > In my builds, the host is PowerPC 64 LE ...
> > >
> > > I have reverted that commit along with
> > >
> > >  61a552eb487f ("bpfilter: fix build dependency")
> > >  13405468f49d ("bpfilter: don't pass O_CREAT when opening console for debug")
> > >
> > > for today.  
> >
> > Is there a fix I could try?
> 
> are you compiling a 32 bit kernel on an x86_64 host? I tried to
> compile an i386 bit kernel on an i386 host and I have no issue,
> running objdump by hand produces correct output:
> 
> $ uname -a
> Linux debian32 4.16.0-2-686-pae #1 SMP Debian 4.16.16-1 (2018-06-19)
> i686 GNU/Linux
> $ objdump -f net/bpfilter/bpfilter_umh |awk -F' |,' '/file
> format/{print "-O",$NF} /^architecture:/{print "-B",$2}'
> -O elf32-i386
> -B i386
> 
> then I tried to compile an i386 kernel on an x86_64 host and I get the
> same error:
> 
> $ make -j8 ARCH=i386
> ...
>   LD      vmlinux.o
> ld: i386:x86-64 architecture of input file
> `net/bpfilter/bpfilter_umh.o' is incompatible with i386 output
> make: *** [Makefile:1015: vmlinux] Error 1
> 
> the problem seems to be that bpfilter_umh is compiled with host flags,
> and so it's a 64 bit binary in my case:
> 
> gcc  -static -o net/bpfilter/bpfilter_umh net/bpfilter/main.o
> objcopy -I binary `LC_ALL=C objdump -f net/bpfilter/bpfilter_umh |awk
> -F' |,' '/file format/{print "-O",$NF} /^architecture:/{print
> "-B",$2}'` --rename-section .data=.init.rodata
> net/bpfilter/bpfilter_umh net/bpfilter/bpfilter_umh.o
> ld -m elf_i386 -r -o vmlinux.o --whole-archive built-in.a
> --no-whole-archive --start-group lib/lib.a arch/x86/lib/lib.a
> --end-group
> ld: i386:x86-64 architecture of input file
> `net/bpfilter/bpfilter_umh.o' is incompatible with i386 output
> 
> Any idea how to fix it without building it twice, for host and target?

This presumably has the same root cause that means I can't build a big
endian PowerPC version on a little endian host ...

Either I have to have CONFIG_BPFILTER turned off (or maybe just
CONFIG_BPFILTER_UMH) or build with a compiler that cannot link user
mode programs (which effectively does the same).
-- 
Cheers,
Stephen Rothwell

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* [PATCH net V2 1/1] net/smc: coordinate wait queues for nonblocking connect
From: Ursula Braun @ 2018-06-21 14:23 UTC (permalink / raw)
  To: davem
  Cc: netdev, linux-s390, schwidefsky, heiko.carstens, raspl, ubraun,
	xiyou.wangcong, hch

The recent poll change may lead to stalls for non-blocking connecting
SMC sockets, since sock_poll_wait is no longer performed on the
internal CLC socket, but on the outer SMC socket.  kernel_connect() on
the internal CLC socket returns with -EINPROGRESS, but the wake up
logic does not work in all cases. If the internal CLC socket is still
in state TCP_SYN_SENT when polled, sock_poll_wait() from sock_poll()
does not sleep. It is supposed to sleep till the state of the internal
CLC socket switches to TCP_ESTABLISHED.

This patch temporarily propagates the wait queue from the internal
CLC sock to the SMC sock, till the non-blocking connect() is
finished.

In addition locking is reduced due to the removed poll waits.

Fixes: c0129a061442 ("smc: convert to ->poll_mask")
Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
---
 net/smc/af_smc.c | 15 +++++++++++----
 net/smc/smc.h    |  1 +
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index da7f02edcd37..d76331aae6e1 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -23,6 +23,7 @@
 #include <linux/workqueue.h>
 #include <linux/in.h>
 #include <linux/sched/signal.h>
+#include <linux/rcupdate.h>
 
 #include <net/sock.h>
 #include <net/tcp.h>
@@ -605,6 +606,13 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
 
 	smc_copy_sock_settings_to_clc(smc);
 	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
+	if (flags & O_NONBLOCK) {
+		rcu_read_lock();
+		smc->smcwq = rcu_dereference(sk->sk_wq);
+		rcu_assign_pointer(sock->sk->sk_wq,
+				   rcu_dereference(smc->clcsock->sk->sk_wq));
+		rcu_read_unlock();
+	}
 	rc = kernel_connect(smc->clcsock, addr, alen, flags);
 	if (rc)
 		goto out;
@@ -1285,12 +1293,9 @@ static __poll_t smc_poll_mask(struct socket *sock, __poll_t events)
 
 	smc = smc_sk(sock->sk);
 	sock_hold(sk);
-	lock_sock(sk);
 	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
 		/* delegate to CLC child sock */
-		release_sock(sk);
 		mask = smc->clcsock->ops->poll_mask(smc->clcsock, events);
-		lock_sock(sk);
 		sk->sk_err = smc->clcsock->sk->sk_err;
 		if (sk->sk_err) {
 			mask |= EPOLLERR;
@@ -1299,7 +1304,10 @@ static __poll_t smc_poll_mask(struct socket *sock, __poll_t events)
 			if (sk->sk_state == SMC_INIT &&
 			    mask & EPOLLOUT &&
 			    smc->clcsock->sk->sk_state != TCP_CLOSE) {
+				lock_sock(sk);
+				rcu_assign_pointer(sock->sk->sk_wq, smc->smcwq);
 				rc = __smc_connect(smc);
+				release_sock(sk);
 				if (rc < 0)
 					mask |= EPOLLERR;
 				/* success cases including fallback */
@@ -1334,7 +1342,6 @@ static __poll_t smc_poll_mask(struct socket *sock, __poll_t events)
 			mask |= EPOLLPRI;
 
 	}
-	release_sock(sk);
 	sock_put(sk);
 
 	return mask;
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 51ae1f10d81a..89d6d7ef973f 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -190,6 +190,7 @@ struct smc_connection {
 struct smc_sock {				/* smc sock container */
 	struct sock		sk;
 	struct socket		*clcsock;	/* internal tcp socket */
+	struct socket_wq	*smcwq;		/* original smcsock wq */
 	struct smc_connection	conn;		/* smc connection */
 	struct smc_sock		*listen_smc;	/* listen parent */
 	struct work_struct	tcp_listen_work;/* handle tcp socket accepts */
-- 
2.16.4

^ permalink raw reply related

* Re: [PATCH bpf-next 3/3] bpf: btf: json print map dump with btf info
From: Okash Khawaja @ 2018-06-21 14:26 UTC (permalink / raw)
  To: Quentin Monnet
  Cc: Daniel Borkmann, Martin KaFai Lau, Alexei Starovoitov,
	Yonghong Song, Jakub Kicinski, David S. Miller, netdev,
	kernel-team, linux-kernel
In-Reply-To: <86ae5059-54c8-d078-4f6b-b212285dbfec@netronome.com>

Hi Quentin,

On Thu, Jun 21, 2018 at 11:24:59AM +0100, Quentin Monnet wrote:
> Hi Okash,
> 
> Thanks for the patch! Please find some nitpicks inline below.
Thanks for your feedback. All of it makes sense, so I'll send v2 with
those changes. A couple of responses are inlined below.

> 
> 2018-06-20 13:30 UTC-0700 ~ Okash Khawaja <osk@fb.com>
> > This patch modifies `bpftool map dump [-j|-p] id <map-id>` to json-
> > print and pretty-json-print map dump. It calls btf_dumper introduced in
> > previous patch to accomplish this.
> > 
> > The patch only prints debug info when the -j or -p flag is supplied, and
> > then only if the map has associated BTF data loaded. Otherwise the usual
> > debug-less output is printed.
> > 
> > Signed-off-by: Okash Khawaja <osk@fb.com>
> > Acked-by: Martin KaFai Lau <kafai@fb.com>
> > 
> > ---
> >  tools/bpf/bpftool/map.c |   94 ++++++++++++++++++++++++++++++++++++++++++++++--
> >  1 file changed, 91 insertions(+), 3 deletions(-)
> > 
> > --- a/tools/bpf/bpftool/map.c
> > +++ b/tools/bpf/bpftool/map.c
> > @@ -43,9 +43,13 @@
> >  #include <unistd.h>
> >  #include <sys/types.h>
> >  #include <sys/stat.h>
> > +#include <linux/err.h>
> >  
> >  #include <bpf.h>
> >  
> > +#include "json_writer.h"
> > +#include "btf.h"
> > +#include "btf_dumper.h"
> >  #include "main.h"
> >  
> >  static const char * const map_type_name[] = {
> > @@ -508,6 +512,83 @@ static int do_show(int argc, char **argv
> >  	return errno == ENOENT ? 0 : -1;
> >  }
> >  
> > +
> > +static int do_dump_btf(struct btf *btf, struct bpf_map_info *map_info,
> > +		void *key, void *value)
> 
> Nit: Please align the second line on the opening parenthesis.
> 
> > +{
> > +	int ret;
> > +
> > +	jsonw_start_object(json_wtr);
> > +	jsonw_name(json_wtr, "key");
> > +
> > +	ret = btf_dumper_type(btf, json_wtr, map_info->btf_key_type_id, key);
> > +	if (ret)
> > +		goto out;
> > +
> > +	jsonw_end_object(json_wtr);
> > +
> > +	jsonw_start_object(json_wtr);
> > +	jsonw_name(json_wtr, "value");
> > +
> > +	ret = btf_dumper_type(btf, json_wtr, map_info->btf_value_type_id,
> > +			value);
> 
> Same comment.
> 
> > +
> > +out:
> > +	/* end of root object */
> > +	jsonw_end_object(json_wtr);
> 
> This is not the root JSON object, which is not produced in that
> function, so I find the comment misleading.
> 
> I also find it confusing that it closes the first JSON object of this
> function if there is an error, but the second if "btf_dumper_type()"
> succeeds. What about the following: closing the first object in all
> cases, before evaluating the value of "ret", and if "ret" is non-zero
> returning immediately; and completely removing the "goto" from this
> function?
The code will be more intuitive that way, so I'll re-organise it accordingly.

> 
> > +
> > +	return ret;
> > +}
> > +
> > +static struct btf *get_btf(struct bpf_map_info *map_info)
> > +{
> > +	int btf_fd = bpf_btf_get_fd_by_id(map_info->btf_id);
> > +	struct bpf_btf_info btf_info = { 0 };
> > +	__u32 len = sizeof(btf_info);
> > +	uint32_t last_size;
> > +	int err;
> > +	struct btf *btf = NULL;
> > +	void *ptr = NULL, *temp_ptr;
> 
> Nit: please sort declarations in reverse-Christmas-tree order.
> 
> > +
> > +	if (btf_fd < 0)
> > +		return NULL;
> > +
> > +	btf_info.btf_size = 4096;
> > +	do {
> > +		last_size = btf_info.btf_size;
> > +		temp_ptr = realloc(ptr, last_size);
> > +		if (!temp_ptr) {
> > +			p_err("unable allocate memory for debug info.");
> 
> "unable *to* allocate"?
> (Also most other error messages do not end with a period, but here this
> is just me being fussy.)
I think it makes sense to be consistent. I'll remove the full stop.

> 
> > +			goto exit_free;
> > +		}
> > +
> > +		ptr = temp_ptr;
> > +		bzero(ptr, last_size);
> > +		btf_info.btf = ptr_to_u64(ptr);
> > +		err = bpf_obj_get_info_by_fd(btf_fd, &btf_info, &len);
> > +	} while (!err && btf_info.btf_size > last_size && last_size == 4096);
> 
> If I understand correctly, the first time you try to retrieve up to 4096
> bytes, and if the btf_info is larger than this, you try a second time
> with the size returned in btf_info.btf_size instead. I don't find it
> intuitive (but maybe this is just me?). Do you think you could add a
> comment above this block?
Yes, that is what this code is doing. I'll add comments explaining it.

> 
> > +
> > +	if (err || btf_info.btf_size > last_size) {
> > +		p_info("can't get btf info. debug info won't be displayed. error: %s",
> > +				err ? strerror(errno) : "exceeds size retry");
> 
> Nit: Please align the second line on the opening parenthesis.
> 
> > +		goto exit_free;
> > +	}
> > +
> > +	btf = btf__new((uint8_t *) btf_info.btf,
> 
> Nit: No space between the cast and the name of the variable.
> 
> > +			btf_info.btf_size, NULL);
> 
> Same remark on parenthesis here...
> 
> > +	if (IS_ERR(btf)) {
> > +		printf("error when initialising btf: %s\n",
> > +				strerror(PTR_ERR(btf)));
> 
> ... and here.
> 
> > +		btf = NULL;
> > +	}
> > +
> > +exit_free:
> > +	close(btf_fd);
> > +	free(ptr);
> > +
> > +	return btf;
> > +}
> > +
> >  static int do_dump(int argc, char **argv)
> >  {
> >  	void *key, *value, *prev_key;
> > @@ -516,6 +597,7 @@ static int do_dump(int argc, char **argv
> >  	__u32 len = sizeof(info);
> >  	int err;
> >  	int fd;
> > +	struct btf *btf = NULL;
> 
> Reverse-Christmas-tree order, please.
> 
> >  
> >  	if (argc != 2)
> >  		usage();
> > @@ -538,6 +620,8 @@ static int do_dump(int argc, char **argv
> >  		goto exit_free;
> >  	}
> >  
> > +	btf = get_btf(&info);
> > +
> >  	prev_key = NULL;
> >  	if (json_output)
> >  		jsonw_start_array(json_wtr);
> > @@ -550,9 +634,12 @@ static int do_dump(int argc, char **argv
> >  		}
> >  
> >  		if (!bpf_map_lookup_elem(fd, key, value)) {
> > -			if (json_output)
> > -				print_entry_json(&info, key, value);
> > -			else
> > +			if (json_output) {
> > +				if (btf)
> > +					do_dump_btf(btf, &info, key, value);
> > +				else
> > +					print_entry_json(&info, key, value);
> > +			} else
> >  				print_entry_plain(&info, key, value);
> 
> Please add braces around "print_entry_plain()" (to harmonise with the
> "if" of the same block).
> 
> >  		} else {
> >  			if (json_output) {
> > @@ -584,6 +671,7 @@ exit_free:
> >  	free(key);
> >  	free(value);
> >  	close(fd);
> > +	btf__free(btf);
> >  
> >  	return err;
> >  }
> > 
> 
> Thanks,
> Quentin

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox