* [PATCH] [RFC] IB/hfi1: Fix port ordering issue in a multiport device
@ 2017-01-10 23:57 Tadeusz Struk
2017-01-11 7:12 ` Leon Romanovsky
2017-01-11 18:10 ` Jason Gunthorpe
0 siblings, 2 replies; 13+ messages in thread
From: Tadeusz Struk @ 2017-01-10 23:57 UTC (permalink / raw)
To: dledford
Cc: linux-rdma, linux-pci, dennis.dalessandro, ira.weiny,
tadeusz.struk
Some hardware have multiple HFIs within the same ASIC. In some devices the
numbers labeled on the faceplate of the device don't match the PCI bus
order, and the result is that the devices (ports) are probed in the opposite
order of their port numbers. The result is IB device unit numbers are in
reverse order from the faceplate numbering. This leads to confusion, and
errors.
We can fix this by enforcing the correct port order at module load time.
To reorder the ports to match the numbering labels on the back of the
device we need to delay registering devices with the ib_core until we
receive a probe for the affected devices, reorder them and start
initialization in the correct order later. We use a timer with 1 second
timeout. On hfi1 probes devices are ordered and stored in a list.
Each hfi1 probe updates the timer. When the timer timeouts all devices are
initialized and registered with ib_core in the right order.
When there will more than one second time gap between hfi1 probes,
the timer will be update in each call so there is no danger that a device
will be "missed".
A new module param called port_reorder is introduced to cover users, who
already prewired their clusters in the 'invalid' order. The default
behavior does not change and results in devices ordered in the PCI bus
order. Port_reorder=1 is used to apply special reordering to these devices.
Signed-off-by: Tadeusz Struk <tadeusz.struk@intel.com>
---
drivers/infiniband/hw/hfi1/chip.c | 3 -
drivers/infiniband/hw/hfi1/chip.h | 4 +
drivers/infiniband/hw/hfi1/init.c | 170 +++++++++++++++++++++++++++++++++++++
3 files changed, 173 insertions(+), 4 deletions(-)
diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index ef72bc2..d9aadac 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -129,9 +129,6 @@ struct flag_table {
#define NUM_MAP_ENTRIES 256
#define NUM_MAP_REGS 32
-/* Bit offset into the GUID which carries HFI id information */
-#define GUID_HFI_INDEX_SHIFT 39
-
/* extract the emulation revision */
#define emulator_rev(dd) ((dd)->irev >> 8)
/* parallel and serial emulation versions are 3 and 4 respectively */
diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h
index 043fd21..aa322b0 100644
--- a/drivers/infiniband/hw/hfi1/chip.h
+++ b/drivers/infiniband/hw/hfi1/chip.h
@@ -575,6 +575,10 @@ enum {
#define LOOPBACK_LCB 2
#define LOOPBACK_CABLE 3 /* external cable */
+/* Bit offset into the GUID which carries HFI id information */
+#define GUID_HFI_INDEX_SHIFT 39
+#define HFI_ASIC_GUID(guid) ((guid) & ~(1ULL << GUID_HFI_INDEX_SHIFT))
+
/* read and write hardware registers */
u64 read_csr(const struct hfi1_devdata *dd, u32 offset);
void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value);
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index e3b5bc9..93436bd 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -116,12 +116,29 @@ unsigned int user_credit_return_threshold = 33; /* default is 33% */
module_param(user_credit_return_threshold, uint, S_IRUGO);
MODULE_PARM_DESC(user_credit_return_threshold, "Credit return threshold for user send contexts, return when unreturned credits passes this many blocks (in percent of allocated blocks, 0 is off)");
+static bool port_reorder;
+module_param(port_reorder, bool, S_IRUGO);
+MODULE_PARM_DESC(port_reorder, "Device port reorder: 1 - order HFIs on the same ASIC in increasing port order, or 0 - order exactly as the kernel enumerates (default)");
+
static inline u64 encode_rcv_header_entry_size(u16);
static struct idr hfi1_unit_table;
u32 hfi1_cpulist_count;
unsigned long *hfi1_cpulist;
+struct hfi1_init_dev {
+ struct list_head list;
+ struct pci_dev *pdev;
+ const struct pci_device_id *ent;
+ u64 guid;
+};
+
+static struct timer_list init_timer;
+static LIST_HEAD(init_list);
+static DEFINE_MUTEX(init_mutex); /* protects init_list */
+static struct workqueue_struct *hfi1_init_wq;
+static void hfi1_deferred_init(struct work_struct *work);
+
/*
* Common code for creating the receive context array.
*/
@@ -1165,6 +1182,7 @@ void hfi1_disable_after_error(struct hfi1_devdata *dd)
static void remove_one(struct pci_dev *);
static int init_one(struct pci_dev *, const struct pci_device_id *);
+static void hfi1_probe_done(unsigned long data);
#define DRIVER_LOAD_MSG "Intel " DRIVER_NAME " loaded: "
#define PFX DRIVER_NAME ": "
@@ -1209,6 +1227,16 @@ static int __init hfi1_mod_init(void)
if (ret)
goto bail;
+ if (port_reorder) {
+ hfi1_init_wq = alloc_workqueue("hfi1_init_wq", WQ_MEM_RECLAIM,
+ 0);
+ if (!hfi1_init_wq) {
+ pr_err("Failed to create deffered dev init wq\n");
+ ret = -ENOMEM;
+ goto bail;
+ }
+ }
+
/* validate max MTU before any devices start */
if (!valid_opa_max_mtu(hfi1_max_mtu)) {
pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n",
@@ -1264,6 +1292,9 @@ static int __init hfi1_mod_init(void)
ret = hfi1_wss_init();
if (ret < 0)
goto bail_wss;
+
+ setup_timer(&init_timer, hfi1_probe_done, 0);
+
ret = pci_register_driver(&hfi1_pci_driver);
if (ret < 0) {
pr_err("Unable to register driver: error %d\n", -ret);
@@ -1288,6 +1319,8 @@ module_init(hfi1_mod_init);
*/
static void __exit hfi1_mod_cleanup(void)
{
+ struct hfi1_init_dev *pos, *tmp;
+
pci_unregister_driver(&hfi1_pci_driver);
node_affinity_destroy();
hfi1_wss_exit();
@@ -1298,6 +1331,18 @@ static void __exit hfi1_mod_cleanup(void)
idr_destroy(&hfi1_unit_table);
dispose_firmware(); /* asymmetric with obtain_firmware() */
dev_cleanup();
+
+ mutex_lock(&init_mutex);
+ list_for_each_entry_safe(pos, tmp, &init_list, list) {
+ list_del(&pos->list);
+ kfree(pos);
+ }
+ mutex_unlock(&init_mutex);
+ del_timer(&init_timer);
+ if (hfi1_init_wq) {
+ destroy_workqueue(hfi1_init_wq);
+ hfi1_init_wq = NULL;
+ }
}
module_exit(hfi1_mod_cleanup);
@@ -1415,7 +1460,7 @@ static int init_validate_rcvhdrcnt(struct device *dev, uint thecnt)
return 0;
}
-static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
+static int do_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
{
int ret = 0, j, pidx, initfail;
struct hfi1_devdata *dd;
@@ -1545,6 +1590,129 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
return ret;
}
+static void hfi1_deferred_init(struct work_struct *work)
+{
+ struct hfi1_init_dev *pos, *tmp;
+ int ret;
+
+ mutex_lock(&init_mutex);
+ list_for_each_entry_safe(pos, tmp, &init_list, list) {
+ ret = do_init_one(pos->pdev, pos->ent);
+ if (ret)
+ hfi1_early_err(&pos->pdev->dev, "%s: dev init failed, err %d\n",
+ __func__, ret);
+
+ list_del(&pos->list);
+ kfree(pos);
+ }
+ mutex_unlock(&init_mutex);
+ kfree(work);
+}
+
+static void hfi1_probe_done(unsigned long data)
+{
+ struct work_struct *work;
+
+ if (!hfi1_init_wq)
+ return;
+
+ /* Got probe for all devices. Now invoke hfi1_deferred_init */
+ work = kzalloc(sizeof(*work), GFP_ATOMIC);
+ if (!work)
+ return;
+
+ INIT_WORK(work, hfi1_deferred_init);
+ queue_work(hfi1_init_wq, work);
+}
+
+static int get_guid(struct pci_dev *pdev, u64 *guid)
+{
+ void __iomem *bar;
+
+ bar = ioremap_nocache(pci_resource_start(pdev, 0),
+ DC_DC8051_CFG_LOCAL_GUID + 8);
+ if (!bar) {
+ hfi1_early_err(&pdev->dev, "%s: unable to ioremap bar\n",
+ __func__);
+ return -ENOMEM;
+ }
+ /* make sure the DC is not in reset so the GUID is readable */
+ writeq(0ull, bar + CCE_DC_CTRL);
+ (void)readq(bar + CCE_DC_CTRL); /* force the write */
+
+ /* read device guid */
+ *guid = readq(bar + DC_DC8051_CFG_LOCAL_GUID);
+ iounmap(bar);
+ return 0;
+}
+
+static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+ struct hfi1_init_dev *probed_pdev;
+ struct hfi1_init_dev *pos;
+ u64 guid;
+ int ret;
+
+ if (!port_reorder)
+ return do_init_one(pdev, ent);
+
+ probed_pdev = kzalloc(sizeof(*probed_pdev), GFP_KERNEL);
+ if (!probed_pdev)
+ return -ENOMEM;
+
+ ret = pci_enable_device(pdev);
+ if (ret) {
+ hfi1_early_err(&pdev->dev, "%s: cannot enable device, err %d\n",
+ __func__, ret);
+ goto fail;
+ }
+
+ ret = get_guid(pdev, &guid);
+ if (ret)
+ goto fail_get_guid;
+
+ probed_pdev->pdev = pdev;
+ probed_pdev->ent = ent;
+ probed_pdev->guid = guid;
+
+ mutex_lock(&init_mutex);
+ if (list_empty(&init_list)) {
+ list_add_tail(&probed_pdev->list, &init_list);
+ } else {
+ list_for_each_entry(pos, &init_list, list)
+ if (HFI_ASIC_GUID(pos->guid) == HFI_ASIC_GUID(guid))
+ break;
+
+ /* Match found, pos and entry are valid */
+ if (pos->guid == probed_pdev->guid) {
+ /* Something wrong - same hardware id */
+ hfi1_early_err(&pdev->dev, "%s: duplicate guid 0x%llx\n",
+ __func__, guid);
+ mutex_unlock(&init_mutex);
+ goto fail_get_guid;
+ }
+
+ /*
+ * Order around the first of the pair that was enumerated.
+ * Insert before or after the one already in the list.
+ */
+ if ((pos->guid >> GUID_HFI_INDEX_SHIFT) & 1)
+ list_add_tail(&probed_pdev->list, &pos->list);
+ else
+ list_add(&probed_pdev->list, &pos->list);
+ }
+ mutex_unlock(&init_mutex);
+ mod_timer(&init_timer, jiffies + msecs_to_jiffies(1000));
+ pci_disable_device(pdev);
+ return 0;
+
+fail_get_guid:
+ pci_disable_device(pdev);
+fail:
+ kfree(probed_pdev);
+ return ret;
+}
+
static void wait_for_clients(struct hfi1_devdata *dd)
{
/*
^ permalink raw reply related [flat|nested] 13+ messages in thread* Re: [PATCH] [RFC] IB/hfi1: Fix port ordering issue in a multiport device
2017-01-10 23:57 [PATCH] [RFC] IB/hfi1: Fix port ordering issue in a multiport device Tadeusz Struk
@ 2017-01-11 7:12 ` Leon Romanovsky
2017-01-11 17:20 ` Tadeusz Struk
2017-01-11 18:10 ` Jason Gunthorpe
1 sibling, 1 reply; 13+ messages in thread
From: Leon Romanovsky @ 2017-01-11 7:12 UTC (permalink / raw)
To: Tadeusz Struk
Cc: dledford, linux-rdma, linux-pci, dennis.dalessandro, ira.weiny
[-- Attachment #1: Type: text/plain, Size: 2084 bytes --]
On Tue, Jan 10, 2017 at 03:57:52PM -0800, Tadeusz Struk wrote:
> Some hardware have multiple HFIs within the same ASIC. In some devices the
> numbers labeled on the faceplate of the device don't match the PCI bus
> order, and the result is that the devices (ports) are probed in the opposite
> order of their port numbers. The result is IB device unit numbers are in
> reverse order from the faceplate numbering. This leads to confusion, and
> errors.
Can't you recognize such device at the initialization phase?
This is definitely specific HW implementation bug/issue/limitation.
> We can fix this by enforcing the correct port order at module load time.
> To reorder the ports to match the numbering labels on the back of the
> device we need to delay registering devices with the ib_core until we
> receive a probe for the affected devices, reorder them and start
> initialization in the correct order later. We use a timer with 1 second
> timeout. On hfi1 probes devices are ordered and stored in a list.
> Each hfi1 probe updates the timer. When the timer timeouts all devices are
> initialized and registered with ib_core in the right order.
> When there will more than one second time gap between hfi1 probes,
> the timer will be update in each call so there is no danger that a device
> will be "missed".
>
> A new module param called port_reorder is introduced to cover users, who
> already prewired their clusters in the 'invalid' order. The default
> behavior does not change and results in devices ordered in the PCI bus
> order. Port_reorder=1 is used to apply special reordering to these devices.
I always had an impression that module parameters are rarely beneficial
in upstream kernel for driver modules and adding them for new driver
code should be explicitly prohibited in CodingStyle guide.
You are adding new module parameter which will be with us forever to fix
special HW bug/implementation in some legacy installations. It will be
insane to add such thing to upstream kernel, errata and out-of-tree
implementations are best places for such things.
Thanks
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] [RFC] IB/hfi1: Fix port ordering issue in a multiport device
2017-01-11 7:12 ` Leon Romanovsky
@ 2017-01-11 17:20 ` Tadeusz Struk
2017-01-11 17:59 ` Leon Romanovsky
0 siblings, 1 reply; 13+ messages in thread
From: Tadeusz Struk @ 2017-01-11 17:20 UTC (permalink / raw)
To: Leon Romanovsky
Cc: dledford, linux-rdma, linux-pci, dennis.dalessandro, ira.weiny
Hi Leon,
On 01/10/2017 11:12 PM, Leon Romanovsky wrote:
> Can't you recognize such device at the initialization phase?
Yes, this is exactly what it does. It checks the device GUID
and reorders the ports during insmod.
> This is definitely specific HW implementation bug/issue/limitation.
Correct. As the commit message says this is to fix a HW issue.
>
> I always had an impression that module parameters are rarely beneficial
> in upstream kernel for driver modules and adding them for new driver
> code should be explicitly prohibited in CodingStyle guide.
>
> You are adding new module parameter which will be with us forever to fix
> special HW bug/implementation in some legacy installations. It will be
> insane to add such thing to upstream kernel, errata and out-of-tree
> implementations are best places for such things.
Agree, but the reality is that there are 5505 $(git grep module_param drivers/ | wc -l)
module parameters in the kernel already, and even mlx4 and mlx5 drivers use them.
I really considered every other possible solution, but module parameter is the
only possible way to do such things during insmod.
Thanks,
--
Tadeusz
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] [RFC] IB/hfi1: Fix port ordering issue in a multiport device
2017-01-11 17:20 ` Tadeusz Struk
@ 2017-01-11 17:59 ` Leon Romanovsky
0 siblings, 0 replies; 13+ messages in thread
From: Leon Romanovsky @ 2017-01-11 17:59 UTC (permalink / raw)
To: Tadeusz Struk
Cc: dledford, linux-rdma, linux-pci, dennis.dalessandro, ira.weiny
[-- Attachment #1: Type: text/plain, Size: 1720 bytes --]
On Wed, Jan 11, 2017 at 09:20:54AM -0800, Tadeusz Struk wrote:
> Hi Leon,
> On 01/10/2017 11:12 PM, Leon Romanovsky wrote:
> > Can't you recognize such device at the initialization phase?
>
> Yes, this is exactly what it does. It checks the device GUID
> and reorders the ports during insmod.
Why don't your firmware report the need to revert the ports?
>
> > This is definitely specific HW implementation bug/issue/limitation.
>
> Correct. As the commit message says this is to fix a HW issue.
>
> >
> > I always had an impression that module parameters are rarely beneficial
> > in upstream kernel for driver modules and adding them for new driver
> > code should be explicitly prohibited in CodingStyle guide.
> >
> > You are adding new module parameter which will be with us forever to fix
> > special HW bug/implementation in some legacy installations. It will be
> > insane to add such thing to upstream kernel, errata and out-of-tree
> > implementations are best places for such things.
>
> Agree, but the reality is that there are 5505 $(git grep module_param drivers/ | wc -l)
> module parameters in the kernel already, and even mlx4 and mlx5 drivers use them.
Regarding mlx4/mlx5, I'm not proud of that code and if it was dependent
on me, I would remove it without thinking twice.
Previous copy-paste can not be an excuse to add another module parameter.
> I really considered every other possible solution, but module parameter is the
> only possible way to do such things during insmod.
This is another problem with the module parameters - assumption that
everyone is running modules, but this is simply incorrect, most of my
smoke tests are performed with monolithic kernel.
> Thanks,
> --
> Tadeusz
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH] [RFC] IB/hfi1: Fix port ordering issue in a multiport device
2017-01-10 23:57 [PATCH] [RFC] IB/hfi1: Fix port ordering issue in a multiport device Tadeusz Struk
2017-01-11 7:12 ` Leon Romanovsky
@ 2017-01-11 18:10 ` Jason Gunthorpe
[not found] ` <20170111181042.GC22783-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
1 sibling, 1 reply; 13+ messages in thread
From: Jason Gunthorpe @ 2017-01-11 18:10 UTC (permalink / raw)
To: Tadeusz Struk
Cc: dledford, linux-rdma, linux-pci, dennis.dalessandro, ira.weiny
On Tue, Jan 10, 2017 at 03:57:52PM -0800, Tadeusz Struk wrote:
> We can fix this by enforcing the correct port order at module load time.
> To reorder the ports to match the numbering labels on the back of the
> device we need to delay registering devices with the ib_core until
> we
Sorry, no way - this is horrifying.
If you need stable names for RDMA devices then you need to add proper
infrastructure to the kernel to rename RDMA devices from user space
via udev. ala netdev.
or change the ib_core to allow your driver to specify the full name
and manage things in your driver.
No way on this insane block probe approach.
Jason
^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2017-01-22 8:16 UTC | newest]
Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-01-10 23:57 [PATCH] [RFC] IB/hfi1: Fix port ordering issue in a multiport device Tadeusz Struk
2017-01-11 7:12 ` Leon Romanovsky
2017-01-11 17:20 ` Tadeusz Struk
2017-01-11 17:59 ` Leon Romanovsky
2017-01-11 18:10 ` Jason Gunthorpe
[not found] ` <20170111181042.GC22783-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2017-01-18 21:01 ` Doug Ledford
[not found] ` <1484773260.2406.58.camel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2017-01-18 21:08 ` Jason Gunthorpe
2017-01-18 22:03 ` Tadeusz Struk
[not found] ` <934eb6c4-fbba-28d0-d6bb-f036cc9ced5c-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
2017-01-19 0:17 ` Doug Ledford
2017-01-19 16:51 ` Tadeusz Struk
2017-01-19 0:16 ` Doug Ledford
[not found] ` <1484784989.2406.67.camel-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2017-01-19 17:58 ` Jason Gunthorpe
2017-01-22 8:16 ` Leon Romanovsky
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox