All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC/RFT 13/14] arm64: dts: meson-g12b: add cpus OPP tables
From: Neil Armstrong @ 2019-06-20 15:00 UTC (permalink / raw)
  To: jbrunet, khilman
  Cc: Neil Armstrong, martin.blumenstingl, linux-kernel, linux-amlogic,
	linux-clk, linux-arm-kernel
In-Reply-To: <20190620150013.13462-1-narmstrong@baylibre.com>

Add the OPP table taken from the HardKernel Odroid-N2 DTS.

The Amlogic G12B SoC seems to available in 2 types :
- low-speed: Cortex-A73 Cluster up to 1,704GHz
- high-speed: Cortex-A73 Cluster up to 2.208GHz

The Cortex-A73 Cluster can be clocked up to 1,896GHz for both types.

The Vendor Amlogic A311D OPP table are slighly different, with lower
voltages than the HardKernel S922X tables but seems to be high-speed type.

This adds the conservative OPP table with the S922X higher voltages
and the maximum low-speed OPP frequency.

The values were tested to be stable on an HardKernel Odroid-N2 board
running the arm64 cpuburn at [1] and cycling between all the possible
cpufreq translations for both clusters and checking the final frequency
using the clock-measurer, script at [2].

[1] https://github.com/ssvb/cpuburn-arm/blob/master/cpuburn-a53.S
[2] https://gist.github.com/superna9999/d4de964dbc0f84b7d527e1df2ddea25f

Signed-off-by: Neil Armstrong <narmstrong@baylibre.com>
---
 arch/arm64/boot/dts/amlogic/meson-g12b.dtsi | 115 ++++++++++++++++++++
 1 file changed, 115 insertions(+)

diff --git a/arch/arm64/boot/dts/amlogic/meson-g12b.dtsi b/arch/arm64/boot/dts/amlogic/meson-g12b.dtsi
index ae63cd610892..334742de11a7 100644
--- a/arch/arm64/boot/dts/amlogic/meson-g12b.dtsi
+++ b/arch/arm64/boot/dts/amlogic/meson-g12b.dtsi
@@ -95,6 +95,121 @@
 			compatible = "cache";
 		};
 	};
+
+	cpu_opp_table_0: opp-table-0 {
+		compatible = "operating-points-v2";
+		opp-shared;
+
+		opp-100000000 {
+			opp-hz = /bits/ 64 <100000000>;
+			opp-microvolt = <731000>;
+		};
+
+		opp-250000000 {
+			opp-hz = /bits/ 64 <250000000>;
+			opp-microvolt = <731000>;
+		};
+
+		opp-500000000 {
+			opp-hz = /bits/ 64 <500000000>;
+			opp-microvolt = <731000>;
+		};
+
+		opp-666666666 {
+			opp-hz = /bits/ 64 <666666666>;
+			opp-microvolt = <731000>;
+		};
+
+		opp-1000000000 {
+			opp-hz = /bits/ 64 <1000000000>;
+			opp-microvolt = <731000>;
+		};
+
+		opp-1200000000 {
+			opp-hz = /bits/ 64 <1200000000>;
+			opp-microvolt = <731000>;
+		};
+
+		opp-1398000000 {
+			opp-hz = /bits/ 64 <1398000000>;
+			opp-microvolt = <761000>;
+		};
+
+		opp-1512000000 {
+			opp-hz = /bits/ 64 <1512000000>;
+			opp-microvolt = <791000>;
+		};
+
+		opp-1608000000 {
+			opp-hz = /bits/ 64 <1608000000>;
+			opp-microvolt = <831000>;
+		};
+
+		opp-1704000000 {
+			opp-hz = /bits/ 64 <1704000000>;
+			opp-microvolt = <861000>;
+		};
+		
+		opp-1896000000 {
+			opp-hz = /bits/ 64 <1896000000>;
+			opp-microvolt = <981000>;
+		};
+	};
+
+	cpub_opp_table_1: opp-table-1 {
+		compatible = "operating-points-v2";
+		opp-shared;
+
+		opp-100000000 {
+			opp-hz = /bits/ 64 <100000000>;
+			opp-microvolt = <751000>;
+		};
+
+		opp-250000000 {
+			opp-hz = /bits/ 64 <250000000>;
+			opp-microvolt = <751000>;
+		};
+
+		opp-500000000 {
+			opp-hz = /bits/ 64 <500000000>;
+			opp-microvolt = <751000>;
+		};
+
+		opp-666666666 {
+			opp-hz = /bits/ 64 <666666666>;
+			opp-microvolt = <751000>;
+		};
+
+		opp-1000000000 {
+			opp-hz = /bits/ 64 <1000000000>;
+			opp-microvolt = <751000>;
+		};
+
+		opp-1200000000 {
+			opp-hz = /bits/ 64 <1200000000>;
+			opp-microvolt = <771000>;
+		};
+
+		opp-1398000000 {
+			opp-hz = /bits/ 64 <1398000000>;
+			opp-microvolt = <791000>;
+		};
+
+		opp-1512000000 {
+			opp-hz = /bits/ 64 <1512000000>;
+			opp-microvolt = <821000>;
+		};
+
+		opp-1608000000 {
+			opp-hz = /bits/ 64 <1608000000>;
+			opp-microvolt = <861000>;
+		};
+
+		opp-1704000000 {
+			opp-hz = /bits/ 64 <1704000000>;
+			opp-microvolt = <891000>;
+		};
+	};
 };
 
 &clkc {
-- 
2.21.0


_______________________________________________
linux-amlogic mailing list
linux-amlogic@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-amlogic

^ permalink raw reply related

* [RFC/RFT 12/14] arm64: dts: meson-g12a: enable DVFS on G12A boards
From: Neil Armstrong @ 2019-06-20 15:00 UTC (permalink / raw)
  To: jbrunet, khilman
  Cc: Neil Armstrong, martin.blumenstingl, linux-kernel, linux-amlogic,
	linux-clk, linux-arm-kernel
In-Reply-To: <20190620150013.13462-1-narmstrong@baylibre.com>

Enable DVFS for the U200, SEI520 and X96-Max Amlogic G12A based board
by setting the clock, OPP and supply for each CPU cores.

The CPU cluster power supply can achieve 0.73V to 1.01V using a PWM
output clocked at 800KHz with an inverse duty-cycle.

DVFS has been tested by running the arm64 cpuburn at [1] and cycling
between all the possible cpufreq translations and checking the final
frequency using the clock-measurer, script at [2].

[1] https://github.com/ssvb/cpuburn-arm/blob/master/cpuburn-a53.S
[2] https://gist.github.com/superna9999/d4de964dbc0f84b7d527e1df2ddea25f

Signed-off-by: Neil Armstrong <narmstrong@baylibre.com>
---
 .../boot/dts/amlogic/meson-g12a-sei510.dts    | 55 +++++++++++++++++++
 .../boot/dts/amlogic/meson-g12a-u200.dts      | 55 +++++++++++++++++++
 .../boot/dts/amlogic/meson-g12a-x96-max.dts   | 52 ++++++++++++++++++
 3 files changed, 162 insertions(+)

diff --git a/arch/arm64/boot/dts/amlogic/meson-g12a-sei510.dts b/arch/arm64/boot/dts/amlogic/meson-g12a-sei510.dts
index bd26b35102ff..be45437c51e1 100644
--- a/arch/arm64/boot/dts/amlogic/meson-g12a-sei510.dts
+++ b/arch/arm64/boot/dts/amlogic/meson-g12a-sei510.dts
@@ -129,6 +129,25 @@
 		enable-active-high;
 	};
 
+	vddcpu: regulator-vddcpu {
+		/*
+		 * SY8120B1ABC DC/DC Regulator.
+		 */
+		compatible = "pwm-regulator";
+
+		regulator-name = "VDDCPU";
+		regulator-min-microvolt = <721000>;
+		regulator-max-microvolt = <1022000>;
+
+		vin-supply = <&dc_in>;
+
+		pwms = <&pwm_AO_cd 1 1250 0>;
+		pwm-dutycycle-range = <100 0>;
+
+		regulator-boot-on;
+		regulator-always-on;
+	};
+
 	vddio_ao1v8: regulator-vddio_ao1v8 {
 		compatible = "regulator-fixed";
 		regulator-name = "VDDIO_AO1V8";
@@ -297,6 +316,34 @@
 	status = "okay";
 };
 
+&cpu0 {
+	cpu-supply = <&vddcpu>;
+	operating-points-v2 = <&cpu_opp_table>;
+	clocks = <&clkc CLKID_CPU_CLK>;
+	clock-latency = <50000>;
+};
+
+&cpu1 {
+	cpu-supply = <&vddcpu>;
+	operating-points-v2 = <&cpu_opp_table>;
+	clocks = <&clkc CLKID_CPU_CLK>;
+	clock-latency = <50000>;
+};
+
+&cpu2 {
+	cpu-supply = <&vddcpu>;
+	operating-points-v2 = <&cpu_opp_table>;
+	clocks = <&clkc CLKID_CPU_CLK>;
+	clock-latency = <50000>;
+};
+
+&cpu3 {
+	cpu-supply = <&vddcpu>;
+	operating-points-v2 = <&cpu_opp_table>;
+	clocks = <&clkc CLKID_CPU_CLK>;
+	clock-latency = <50000>;
+};
+
 &cvbs_vdac_port {
 	cvbs_vdac_out: endpoint {
 		remote-endpoint = <&cvbs_connector_in>;
@@ -339,6 +386,14 @@
 	pinctrl-names = "default";
 };
 
+&pwm_AO_cd {
+	pinctrl-0 = <&pwm_ao_d_e_pins>;
+	pinctrl-names = "default";
+	clocks = <&xtal>;
+	clock-names = "clkin1";
+	status = "okay";
+};
+
 &pwm_ef {
 	status = "okay";
 	pinctrl-0 = <&pwm_e_pins>;
diff --git a/arch/arm64/boot/dts/amlogic/meson-g12a-u200.dts b/arch/arm64/boot/dts/amlogic/meson-g12a-u200.dts
index 332a4b27174b..b6bd96ee394a 100644
--- a/arch/arm64/boot/dts/amlogic/meson-g12a-u200.dts
+++ b/arch/arm64/boot/dts/amlogic/meson-g12a-u200.dts
@@ -136,6 +136,25 @@
 		regulator-always-on;
 	};
 
+	vddcpu: regulator-vddcpu {
+		/*
+		 * MP8756GD Regulator.
+		 */
+		compatible = "pwm-regulator";
+
+		regulator-name = "VDDCPU";
+		regulator-min-microvolt = <721000>;
+		regulator-max-microvolt = <1022000>;
+
+		vin-supply = <&main_12v>;
+
+		pwms = <&pwm_AO_cd 1 1250 0>;
+		pwm-dutycycle-range = <100 0>;
+
+		regulator-boot-on;
+		regulator-always-on;
+	};
+
 	sound {
 		compatible = "amlogic,axg-sound-card";
 		model = "G12A-U200";
@@ -305,6 +324,34 @@
 	status = "okay";
 };
 
+&cpu0 {
+	cpu-supply = <&vddcpu>;
+	operating-points-v2 = <&cpu_opp_table>;
+	clocks = <&clkc CLKID_CPU_CLK>;
+	clock-latency = <50000>;
+};
+
+&cpu1 {
+	cpu-supply = <&vddcpu>;
+	operating-points-v2 = <&cpu_opp_table>;
+	clocks = <&clkc CLKID_CPU_CLK>;
+	clock-latency = <50000>;
+};
+
+&cpu2 {
+	cpu-supply = <&vddcpu>;
+	operating-points-v2 = <&cpu_opp_table>;
+	clocks = <&clkc CLKID_CPU_CLK>;
+	clock-latency = <50000>;
+};
+
+&cpu3 {
+	cpu-supply = <&vddcpu>;
+	operating-points-v2 = <&cpu_opp_table>;
+	clocks = <&clkc CLKID_CPU_CLK>;
+	clock-latency = <50000>;
+};
+
 &cvbs_vdac_port {
 	cvbs_vdac_out: endpoint {
 		remote-endpoint = <&cvbs_connector_in>;
@@ -369,6 +416,14 @@
 	pinctrl-names = "default";
 };
 
+&pwm_AO_cd {
+	pinctrl-0 = <&pwm_ao_d_e_pins>;
+	pinctrl-names = "default";
+	clocks = <&xtal>;
+	clock-names = "clkin1";
+	status = "okay";
+};
+
 /* SD card */
 &sd_emmc_b {
 	status = "okay";
diff --git a/arch/arm64/boot/dts/amlogic/meson-g12a-x96-max.dts b/arch/arm64/boot/dts/amlogic/meson-g12a-x96-max.dts
index d37868d21114..099ccf67547e 100644
--- a/arch/arm64/boot/dts/amlogic/meson-g12a-x96-max.dts
+++ b/arch/arm64/boot/dts/amlogic/meson-g12a-x96-max.dts
@@ -132,6 +132,22 @@
 		regulator-always-on;
 	};
 
+	vddcpu: regulator-vddcpu {
+		compatible = "pwm-regulator";
+
+		regulator-name = "VDDCPU";
+		regulator-min-microvolt = <721000>;
+		regulator-max-microvolt = <1022000>;
+
+		vin-supply = <&dc_in>;
+
+		pwms = <&pwm_AO_cd 1 1250 0>;
+		pwm-dutycycle-range = <100 0>;
+
+		regulator-boot-on;
+		regulator-always-on;
+	};
+
 	sound {
 		compatible = "amlogic,axg-sound-card";
 		model = "G12A-X96-MAX";
@@ -242,6 +258,34 @@
 	status = "okay";
 };
 
+&cpu0 {
+	cpu-supply = <&vddcpu>;
+	operating-points-v2 = <&cpu_opp_table>;
+	clocks = <&clkc CLKID_CPU_CLK>;
+	clock-latency = <50000>;
+};
+
+&cpu1 {
+	cpu-supply = <&vddcpu>;
+	operating-points-v2 = <&cpu_opp_table>;
+	clocks = <&clkc CLKID_CPU_CLK>;
+	clock-latency = <50000>;
+};
+
+&cpu2 {
+	cpu-supply = <&vddcpu>;
+	operating-points-v2 = <&cpu_opp_table>;
+	clocks = <&clkc CLKID_CPU_CLK>;
+	clock-latency = <50000>;
+};
+
+&cpu3 {
+	cpu-supply = <&vddcpu>;
+	operating-points-v2 = <&cpu_opp_table>;
+	clocks = <&clkc CLKID_CPU_CLK>;
+	clock-latency = <50000>;
+};
+
 &cvbs_vdac_port {
 	cvbs_vdac_out: endpoint {
 		remote-endpoint = <&cvbs_connector_in>;
@@ -279,6 +323,14 @@
 	pinctrl-names = "default";
 };
 
+&pwm_AO_cd {
+	pinctrl-0 = <&pwm_ao_d_e_pins>;
+	pinctrl-names = "default";
+	clocks = <&xtal>;
+	clock-names = "clkin1";
+	status = "okay";
+};
+
 &ext_mdio {
 	external_phy: ethernet-phy@0 {
 		/* Realtek RTL8211F (0x001cc916) */
-- 
2.21.0


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related

* Re: [PATCH 1/3] drm/stm: drv: fix suspend/resume
From: Benjamin Gaignard @ 2019-06-20 15:05 UTC (permalink / raw)
  To: Philippe CORNU
  Cc: Yannick FERTRE, Benjamin GAIGNARD, Vincent ABRIOU, David Airlie,
	Daniel Vetter, Maxime Coquelin, Alexandre TORGUE,
	dri-devel@lists.freedesktop.org,
	linux-stm32@st-md-mailman.stormreply.com,
	linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <7e6a87b6-e442-20cb-0d4e-68eb40c56042@st.com>

Le mar. 18 juin 2019 à 11:57, Philippe CORNU <philippe.cornu@st.com> a écrit :
>
> Hi Yannick,
>
> Thank you for your patch.
>
> Acked-by: Philippe Cornu <philippe.cornu@st.com>

I have corrected Fixes sha1 (should be 12 digits)
Applied on drm-misc-next.

Benjamin

>
> Philippe :-)
>
> On 6/17/19 9:18 AM, Yannick Fertré wrote:
> > Without this fix, the system can not go in "suspend" mode
> > due to an error in drv_suspend function.
> >
> > Fixes: 35ab6cf ("drm/stm: support runtime power management")
> >
> > Signed-off-by: Yannick Fertré <yannick.fertre@st.com>
> > ---
> >   drivers/gpu/drm/stm/drv.c | 15 ++++++++-------
> >   1 file changed, 8 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/stm/drv.c b/drivers/gpu/drm/stm/drv.c
> > index 5659572..9dee4e4 100644
> > --- a/drivers/gpu/drm/stm/drv.c
> > +++ b/drivers/gpu/drm/stm/drv.c
> > @@ -136,8 +136,7 @@ static __maybe_unused int drv_suspend(struct device *dev)
> >       struct ltdc_device *ldev = ddev->dev_private;
> >       struct drm_atomic_state *state;
> >
> > -     if (WARN_ON(!ldev->suspend_state))
> > -             return -ENOENT;
> > +     WARN_ON(ldev->suspend_state);
> >
> >       state = drm_atomic_helper_suspend(ddev);
> >       if (IS_ERR(state))
> > @@ -155,15 +154,17 @@ static __maybe_unused int drv_resume(struct device *dev)
> >       struct ltdc_device *ldev = ddev->dev_private;
> >       int ret;
> >
> > +     if (WARN_ON(!ldev->suspend_state))
> > +             return -ENOENT;
> > +
> >       pm_runtime_force_resume(dev);
> >       ret = drm_atomic_helper_resume(ddev, ldev->suspend_state);
> > -     if (ret) {
> > +     if (ret)
> >               pm_runtime_force_suspend(dev);
> > -             ldev->suspend_state = NULL;
> > -             return ret;
> > -     }
> >
> > -     return 0;
> > +     ldev->suspend_state = NULL;
> > +
> > +     return ret;
> >   }
> >
> >   static __maybe_unused int drv_runtime_suspend(struct device *dev)
> >
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply

* [RFC/RFT 11/14] arm64: dts: meson-g12a: add cpus OPP table
From: Neil Armstrong @ 2019-06-20 15:00 UTC (permalink / raw)
  To: jbrunet, khilman
  Cc: Neil Armstrong, martin.blumenstingl, linux-kernel, linux-amlogic,
	linux-clk, linux-arm-kernel
In-Reply-To: <20190620150013.13462-1-narmstrong@baylibre.com>

Add the OPP table taken from the vendor u200 and u211 DTS.

The Amlogic G12A SoC seems to available in 3 types :
- low-speed: up to 1,8GHz
- mid-speed: up to 1,908GHz
- high-speed: up to 2.1GHz

And the S905X2 opp voltages are slightly higher than the S905D2
OPP voltages for the low-speed table.

This adds the conservative OPP table with the S905X2 higher voltages
and the maximum low-speed OPP frequency.

The values were tested to be stable on an Amlogic U200 Reference Board,
SeiRobotics SEI510 and X96 Max Set-Top-Boxes running the arm64 cpuburn
at [1] and cycling between all the possible cpufreq translations and
checking the final frequency using the clock-measurer, script at [2].

[1] https://github.com/ssvb/cpuburn-arm/blob/master/cpuburn-a53.S
[2] https://gist.github.com/superna9999/d4de964dbc0f84b7d527e1df2ddea25f

Signed-off-by: Neil Armstrong <narmstrong@baylibre.com>
---
 arch/arm64/boot/dts/amlogic/meson-g12a.dtsi | 62 ++++++++++++++++++++-
 1 file changed, 59 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/boot/dts/amlogic/meson-g12a.dtsi b/arch/arm64/boot/dts/amlogic/meson-g12a.dtsi
index ac15967bb7fa..ba9aab39fd95 100644
--- a/arch/arm64/boot/dts/amlogic/meson-g12a.dtsi
+++ b/arch/arm64/boot/dts/amlogic/meson-g12a.dtsi
@@ -48,8 +48,64 @@
 			compatible = "cache";
 		};
 	};
-};
 
-&sd_emmc_a {
-	amlogic,dram-access-quirk;
+	cpu_opp_table: opp-table {
+		compatible = "operating-points-v2";
+		opp-shared;
+
+		opp-100000000 {
+			opp-hz = /bits/ 64 <100000000>;
+			opp-microvolt = <731000>;
+		};
+
+		opp-250000000 {
+			opp-hz = /bits/ 64 <250000000>;
+			opp-microvolt = <731000>;
+		};
+
+		opp-500000000 {
+			opp-hz = /bits/ 64 <500000000>;
+			opp-microvolt = <731000>;
+		};
+
+		opp-667000000 {
+			opp-hz = /bits/ 64 <666666666>;
+			opp-microvolt = <731000>;
+		};
+
+		opp-1000000000 {
+			opp-hz = /bits/ 64 <1000000000>;
+			opp-microvolt = <731000>;
+		};
+
+		opp-1200000000 {
+			opp-hz = /bits/ 64 <1200000000>;
+			opp-microvolt = <731000>;
+		};
+
+		opp-1398000000 {
+			opp-hz = /bits/ 64 <1398000000>;
+			opp-microvolt = <761000>;
+		};
+
+		opp-1512000000 {
+			opp-hz = /bits/ 64 <1512000000>;
+			opp-microvolt = <791000>;
+		};
+
+		opp-1608000000 {
+			opp-hz = /bits/ 64 <1608000000>;
+			opp-microvolt = <831000>;
+		};
+
+		opp-1704000000 {
+			opp-hz = /bits/ 64 <1704000000>;
+			opp-microvolt = <861000>;
+		};
+
+		opp-1800000000 {
+			opp-hz = /bits/ 64 <1800000000>;
+			opp-microvolt = <981000>;
+		};
+	};
 };
-- 
2.21.0


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related

* [RFC/RFT 11/14] arm64: dts: meson-g12a: add cpus OPP table
From: Neil Armstrong @ 2019-06-20 15:00 UTC (permalink / raw)
  To: jbrunet, khilman
  Cc: Neil Armstrong, martin.blumenstingl, linux-kernel, linux-amlogic,
	linux-clk, linux-arm-kernel
In-Reply-To: <20190620150013.13462-1-narmstrong@baylibre.com>

Add the OPP table taken from the vendor u200 and u211 DTS.

The Amlogic G12A SoC seems to available in 3 types :
- low-speed: up to 1,8GHz
- mid-speed: up to 1,908GHz
- high-speed: up to 2.1GHz

And the S905X2 opp voltages are slightly higher than the S905D2
OPP voltages for the low-speed table.

This adds the conservative OPP table with the S905X2 higher voltages
and the maximum low-speed OPP frequency.

The values were tested to be stable on an Amlogic U200 Reference Board,
SeiRobotics SEI510 and X96 Max Set-Top-Boxes running the arm64 cpuburn
at [1] and cycling between all the possible cpufreq translations and
checking the final frequency using the clock-measurer, script at [2].

[1] https://github.com/ssvb/cpuburn-arm/blob/master/cpuburn-a53.S
[2] https://gist.github.com/superna9999/d4de964dbc0f84b7d527e1df2ddea25f

Signed-off-by: Neil Armstrong <narmstrong@baylibre.com>
---
 arch/arm64/boot/dts/amlogic/meson-g12a.dtsi | 62 ++++++++++++++++++++-
 1 file changed, 59 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/boot/dts/amlogic/meson-g12a.dtsi b/arch/arm64/boot/dts/amlogic/meson-g12a.dtsi
index ac15967bb7fa..ba9aab39fd95 100644
--- a/arch/arm64/boot/dts/amlogic/meson-g12a.dtsi
+++ b/arch/arm64/boot/dts/amlogic/meson-g12a.dtsi
@@ -48,8 +48,64 @@
 			compatible = "cache";
 		};
 	};
-};
 
-&sd_emmc_a {
-	amlogic,dram-access-quirk;
+	cpu_opp_table: opp-table {
+		compatible = "operating-points-v2";
+		opp-shared;
+
+		opp-100000000 {
+			opp-hz = /bits/ 64 <100000000>;
+			opp-microvolt = <731000>;
+		};
+
+		opp-250000000 {
+			opp-hz = /bits/ 64 <250000000>;
+			opp-microvolt = <731000>;
+		};
+
+		opp-500000000 {
+			opp-hz = /bits/ 64 <500000000>;
+			opp-microvolt = <731000>;
+		};
+
+		opp-667000000 {
+			opp-hz = /bits/ 64 <666666666>;
+			opp-microvolt = <731000>;
+		};
+
+		opp-1000000000 {
+			opp-hz = /bits/ 64 <1000000000>;
+			opp-microvolt = <731000>;
+		};
+
+		opp-1200000000 {
+			opp-hz = /bits/ 64 <1200000000>;
+			opp-microvolt = <731000>;
+		};
+
+		opp-1398000000 {
+			opp-hz = /bits/ 64 <1398000000>;
+			opp-microvolt = <761000>;
+		};
+
+		opp-1512000000 {
+			opp-hz = /bits/ 64 <1512000000>;
+			opp-microvolt = <791000>;
+		};
+
+		opp-1608000000 {
+			opp-hz = /bits/ 64 <1608000000>;
+			opp-microvolt = <831000>;
+		};
+
+		opp-1704000000 {
+			opp-hz = /bits/ 64 <1704000000>;
+			opp-microvolt = <861000>;
+		};
+
+		opp-1800000000 {
+			opp-hz = /bits/ 64 <1800000000>;
+			opp-microvolt = <981000>;
+		};
+	};
 };
-- 
2.21.0


_______________________________________________
linux-amlogic mailing list
linux-amlogic@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-amlogic

^ permalink raw reply related

* [RFC/RFT 10/14] arm64: dts: meson-g12-common: add pwm_a on GPIOE_2 pinmux
From: Neil Armstrong @ 2019-06-20 15:00 UTC (permalink / raw)
  To: jbrunet, khilman
  Cc: Neil Armstrong, martin.blumenstingl, linux-kernel, linux-amlogic,
	linux-clk, linux-arm-kernel
In-Reply-To: <20190620150013.13462-1-narmstrong@baylibre.com>

Add the ao_pinctrl subnode for the pwm_a function on GPIOE_2.

Signed-off-by: Neil Armstrong <narmstrong@baylibre.com>
---
 arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi b/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi
index 2baa04303762..76484801478d 100644
--- a/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi
+++ b/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi
@@ -1984,6 +1984,14 @@
 						};
 					};
 
+					pwm_a_e_pins: pwm-a-e {
+						mux {
+							groups = "pwm_a_e";
+							function = "pwm_a_e";
+							bias-disable;
+						};
+					};
+
 					pwm_ao_a_pins: pwm-ao-a {
 						mux {
 							groups = "pwm_ao_a";
-- 
2.21.0


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related

* [RFC/RFT 10/14] arm64: dts: meson-g12-common: add pwm_a on GPIOE_2 pinmux
From: Neil Armstrong @ 2019-06-20 15:00 UTC (permalink / raw)
  To: jbrunet, khilman
  Cc: Neil Armstrong, martin.blumenstingl, linux-kernel, linux-amlogic,
	linux-clk, linux-arm-kernel
In-Reply-To: <20190620150013.13462-1-narmstrong@baylibre.com>

Add the ao_pinctrl subnode for the pwm_a function on GPIOE_2.

Signed-off-by: Neil Armstrong <narmstrong@baylibre.com>
---
 arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi b/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi
index 2baa04303762..76484801478d 100644
--- a/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi
+++ b/arch/arm64/boot/dts/amlogic/meson-g12-common.dtsi
@@ -1984,6 +1984,14 @@
 						};
 					};
 
+					pwm_a_e_pins: pwm-a-e {
+						mux {
+							groups = "pwm_a_e";
+							function = "pwm_a_e";
+							bias-disable;
+						};
+					};
+
 					pwm_ao_a_pins: pwm-ao-a {
 						mux {
 							groups = "pwm_ao_a";
-- 
2.21.0


_______________________________________________
linux-amlogic mailing list
linux-amlogic@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-amlogic

^ permalink raw reply related

* [RFC/RFT 08/14] clk: meson: g12a: expose CPUB clock ID for G12B
From: Neil Armstrong @ 2019-06-20 15:00 UTC (permalink / raw)
  To: jbrunet, khilman
  Cc: Neil Armstrong, martin.blumenstingl, linux-kernel, linux-amlogic,
	linux-clk, linux-arm-kernel
In-Reply-To: <20190620150013.13462-1-narmstrong@baylibre.com>

Expose the CPUB clock id to add DVFS to the second CPU cluster of
the Amlogic G12B SoC.

Signed-off-by: Neil Armstrong <narmstrong@baylibre.com>
---
 include/dt-bindings/clock/g12a-clkc.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/dt-bindings/clock/g12a-clkc.h b/include/dt-bindings/clock/g12a-clkc.h
index b6b127e45634..8ccc29ac7a72 100644
--- a/include/dt-bindings/clock/g12a-clkc.h
+++ b/include/dt-bindings/clock/g12a-clkc.h
@@ -137,5 +137,6 @@
 #define CLKID_VDEC_HEVC				207
 #define CLKID_VDEC_HEVCF			210
 #define CLKID_TS				212
+#define CLKID_CPUB_CLK				224
 
 #endif /* __G12A_CLKC_H */
-- 
2.21.0


_______________________________________________
linux-amlogic mailing list
linux-amlogic@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-amlogic

^ permalink raw reply related

* [RFC/RFT 08/14] clk: meson: g12a: expose CPUB clock ID for G12B
From: Neil Armstrong @ 2019-06-20 15:00 UTC (permalink / raw)
  To: jbrunet, khilman
  Cc: Neil Armstrong, martin.blumenstingl, linux-kernel, linux-amlogic,
	linux-clk, linux-arm-kernel
In-Reply-To: <20190620150013.13462-1-narmstrong@baylibre.com>

Expose the CPUB clock id to add DVFS to the second CPU cluster of
the Amlogic G12B SoC.

Signed-off-by: Neil Armstrong <narmstrong@baylibre.com>
---
 include/dt-bindings/clock/g12a-clkc.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/dt-bindings/clock/g12a-clkc.h b/include/dt-bindings/clock/g12a-clkc.h
index b6b127e45634..8ccc29ac7a72 100644
--- a/include/dt-bindings/clock/g12a-clkc.h
+++ b/include/dt-bindings/clock/g12a-clkc.h
@@ -137,5 +137,6 @@
 #define CLKID_VDEC_HEVC				207
 #define CLKID_VDEC_HEVCF			210
 #define CLKID_TS				212
+#define CLKID_CPUB_CLK				224
 
 #endif /* __G12A_CLKC_H */
-- 
2.21.0


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related

* [RFC/RFT 07/14] clk: meson: g12a: add notifiers to handle cpu clock change
From: Neil Armstrong @ 2019-06-20 15:00 UTC (permalink / raw)
  To: jbrunet, khilman
  Cc: Neil Armstrong, martin.blumenstingl, linux-kernel, linux-amlogic,
	linux-clk, linux-arm-kernel
In-Reply-To: <20190620150013.13462-1-narmstrong@baylibre.com>

In order to implement clock switching for the CLKID_CPU_CLK and
CLKID_CPUB_CLK, notifiers are added on specific points of the
clock tree :

cpu_clk / cpub_clk
|   \- cpu_clk_dyn
|      |  \- cpu_clk_premux0
|      |        |- cpu_clk_postmux0
|      |        |    |- cpu_clk_dyn0_div
|      |        |    \- xtal/fclk_div2/fclk_div3
|      |        \- xtal/fclk_div2/fclk_div3
|      \- cpu_clk_premux1
|            |- cpu_clk_postmux1
|            |    |- cpu_clk_dyn1_div
|            |    \- xtal/fclk_div2/fclk_div3
|            \- xtal/fclk_div2/fclk_div3
\ sys_pll / sys1_pll

This for each cluster, a single one for G12A, two for G12B.

Each cpu_clk_premux1 tree is marked as read-only and CLK_SET_RATE_NO_REPARENT,
to be used as "parking" clock in a safe clock frequency.

A notifier is added on each cpu_clk_premux0 to detech when CCF want to
change the frequency of the cpu_clk_dyn tree.
In this notifier, the cpu_clk_premux1 tree is configured to use the xtal
clock and then the cpu_clk_dyn is switch to cpu_clk_premux1 while CCF
updates the cpu_clk_premux0 tree.

A notifier is added on each sys_pll/sys1_pll to detect when CCF wants to
change the PLL clock source of the cpu_clk.
In this notifier, the cpu_clk is switched to cpu_clk_dyn while CCF
updates the sys_pll/sys1_pll frequency.

A third small notifier is added on each cpu_clk / cpub_clk and cpu_clk_dyn,
add a small delay at PRE_RATE_CHANGE/POST_RATE_CHANGE to let the other
notofiers change propagate before changing the cpu_clk_premux0 and sys_pll
clock trees.

This notifier set permits switching the cpu_clk / cpub_clk without any
glitches and using a safe parking clock while switching between sub-GHz
clocks using the cpu_clk_dyn tree.

This setup has been tested and validated on the Amlogic G12A and G12B
SoCs running the arm64 cpuburn at [1] and cycling between all the possible
cpufreq translations of each cluster and checking the final frequency using
the clock-measurer, script at [2].

[1] https://github.com/ssvb/cpuburn-arm/blob/master/cpuburn-a53.S
[2] https://gist.github.com/superna9999/d4de964dbc0f84b7d527e1df2ddea25f

Signed-off-by: Neil Armstrong <narmstrong@baylibre.com>
---
 drivers/clk/meson/g12a.c | 500 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 468 insertions(+), 32 deletions(-)

diff --git a/drivers/clk/meson/g12a.c b/drivers/clk/meson/g12a.c
index 3c75ef5e4d24..2f3ecbbc98d9 100644
--- a/drivers/clk/meson/g12a.c
+++ b/drivers/clk/meson/g12a.c
@@ -14,6 +14,7 @@
 #include <linux/init.h>
 #include <linux/of_device.h>
 #include <linux/platform_device.h>
+#include <linux/clk.h>
 
 #include "clk-input.h"
 #include "clk-mpll.h"
@@ -85,16 +86,9 @@ static struct clk_regmap g12a_fixed_pll = {
 	},
 };
 
-/*
- * Internal sys pll emulation configuration parameters
- */
-static const struct reg_sequence g12a_sys_init_regs[] = {
-	{ .reg = HHI_SYS_PLL_CNTL1,	.def = 0x00000000 },
-	{ .reg = HHI_SYS_PLL_CNTL2,	.def = 0x00000000 },
-	{ .reg = HHI_SYS_PLL_CNTL3,	.def = 0x48681c00 },
-	{ .reg = HHI_SYS_PLL_CNTL4,	.def = 0x88770290 },
-	{ .reg = HHI_SYS_PLL_CNTL5,	.def = 0x39272000 },
-	{ .reg = HHI_SYS_PLL_CNTL6,	.def = 0x56540000 },
+static const struct pll_mult_range g12a_sys_pll_mult_range = {
+	.min = 128,
+	.max = 250,
 };
 
 static struct clk_regmap g12a_sys_pll_dco = {
@@ -124,14 +118,15 @@ static struct clk_regmap g12a_sys_pll_dco = {
 			.shift   = 29,
 			.width   = 1,
 		},
-		.init_regs = g12a_sys_init_regs,
-		.init_count = ARRAY_SIZE(g12a_sys_init_regs),
+		.range = &g12a_sys_pll_mult_range,
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "sys_pll_dco",
-		.ops = &meson_clk_pll_ro_ops,
+		.ops = &meson_clk_pll_ops,
 		.parent_names = (const char *[]){ IN_PREFIX "xtal" },
 		.num_parents = 1,
+		/* This clock feeds the CPU, avoid disabling it */
+		.flags = CLK_IS_CRITICAL,
 	},
 };
 
@@ -144,9 +139,10 @@ static struct clk_regmap g12a_sys_pll = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "sys_pll",
-		.ops = &clk_regmap_divider_ro_ops,
+		.ops = &clk_regmap_divider_ops,
 		.parent_names = (const char *[]){ "sys_pll_dco" },
 		.num_parents = 1,
+		.flags = CLK_SET_RATE_PARENT,
 	},
 };
 
@@ -177,12 +173,15 @@ static struct clk_regmap g12b_sys1_pll_dco = {
 			.shift   = 29,
 			.width   = 1,
 		},
+		.range = &g12a_sys_pll_mult_range,
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "sys1_pll_dco",
-		.ops = &meson_clk_pll_ro_ops,
+		.ops = &meson_clk_pll_ops,
 		.parent_names = (const char *[]){ IN_PREFIX "xtal" },
 		.num_parents = 1,
+		/* This clock feeds the CPU, avoid disabling it */
+		.flags = CLK_IS_CRITICAL,
 	},
 };
 
@@ -195,9 +194,10 @@ static struct clk_regmap g12b_sys1_pll = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "sys1_pll",
-		.ops = &clk_regmap_divider_ro_ops,
+		.ops = &clk_regmap_divider_ops,
 		.parent_names = (const char *[]){ "sys1_pll_dco" },
 		.num_parents = 1,
+		.flags = CLK_SET_RATE_PARENT,
 	},
 };
 
@@ -266,7 +266,7 @@ static struct clk_regmap g12a_cpu_clk_premux0 = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpu_clk_dyn0_sel",
-		.ops = &clk_regmap_mux_ro_ops,
+		.ops = &clk_regmap_mux_ops,
 		.parent_names = (const char *[]){ IN_PREFIX "xtal",
 						  "fclk_div2",
 						  "fclk_div3" },
@@ -275,6 +275,37 @@ static struct clk_regmap g12a_cpu_clk_premux0 = {
 };
 
 /* Datasheet names this field as "mux0_divn_tcnt" */
+#define SYS_CPU_DYN_ENABLE	BIT(26)
+
+/* This divider uses bit 26 to take change in account */
+static int g12a_cpu_clk_mux0_div_set_rate(struct clk_hw *hw, unsigned long rate,
+					  unsigned long parent_rate)
+{
+	struct clk_regmap *clk = to_clk_regmap(hw);
+	struct clk_regmap_div_data *div = clk_get_regmap_div_data(clk);
+	unsigned int val;
+	int ret;
+
+	ret = divider_get_val(rate, parent_rate, div->table, div->width,
+			      div->flags);
+	if (ret < 0)
+		return ret;
+
+	val = (unsigned int)ret << div->shift;
+
+	regmap_update_bits(clk->map, HHI_SYS_CPU_CLK_CNTL0,
+			   SYS_CPU_DYN_ENABLE, SYS_CPU_DYN_ENABLE);
+
+	return regmap_update_bits(clk->map, div->offset,
+				  clk_div_mask(div->width) << div->shift | SYS_CPU_DYN_ENABLE, val);
+};
+
+const struct clk_ops g12a_cpu_clk_mux0_div_ops = {
+	.recalc_rate = clk_regmap_div_recalc_rate,
+	.round_rate = clk_regmap_div_round_rate,
+	.set_rate = g12a_cpu_clk_mux0_div_set_rate,
+};
+
 static struct clk_regmap g12a_cpu_clk_mux0_div = {
 	.data = &(struct clk_regmap_div_data){
 		.offset = HHI_SYS_CPU_CLK_CNTL0,
@@ -283,9 +314,10 @@ static struct clk_regmap g12a_cpu_clk_mux0_div = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpu_clk_dyn0_div",
-		.ops = &clk_regmap_divider_ro_ops,
+		.ops = &g12a_cpu_clk_mux0_div_ops,
 		.parent_names = (const char *[]){ "cpu_clk_dyn0_sel" },
 		.num_parents = 1,
+		.flags = CLK_SET_RATE_PARENT,
 	},
 };
 
@@ -298,10 +330,11 @@ static struct clk_regmap g12a_cpu_clk_postmux0 = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpu_clk_dyn0",
-		.ops = &clk_regmap_mux_ro_ops,
+		.ops = &clk_regmap_mux_ops,
 		.parent_names = (const char *[]){ "cpu_clk_dyn0_sel",
 						  "cpu_clk_dyn0_div" },
 		.num_parents = 2,
+		.flags = CLK_SET_RATE_PARENT,
 	},
 };
 
@@ -314,11 +347,13 @@ static struct clk_regmap g12a_cpu_clk_premux1 = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpu_clk_dyn1_sel",
-		.ops = &clk_regmap_mux_ro_ops,
+		.ops = &clk_regmap_mux_ops,
 		.parent_names = (const char *[]){ IN_PREFIX "xtal",
 						  "fclk_div2",
 						  "fclk_div3" },
 		.num_parents = 3,
+		/* This sub-tree is used a parking clock */
+		.flags = CLK_SET_RATE_NO_REPARENT,
 	},
 };
 
@@ -346,10 +381,12 @@ static struct clk_regmap g12a_cpu_clk_postmux1 = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpu_clk_dyn1",
-		.ops = &clk_regmap_mux_ro_ops,
+		.ops = &clk_regmap_mux_ops,
 		.parent_names = (const char *[]){ "cpu_clk_dyn1_sel",
 						  "cpu_clk_dyn1_div" },
 		.num_parents = 2,
+		/* This sub-tree is used a parking clock */
+		.flags = CLK_SET_RATE_NO_REPARENT,
 	},
 };
 
@@ -362,10 +399,11 @@ static struct clk_regmap g12a_cpu_clk_dyn = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpu_clk_dyn",
-		.ops = &clk_regmap_mux_ro_ops,
+		.ops = &clk_regmap_mux_ops,
 		.parent_names = (const char *[]){ "cpu_clk_dyn0",
 						  "cpu_clk_dyn1" },
 		.num_parents = 2,
+		.flags = CLK_SET_RATE_PARENT,
 	},
 };
 
@@ -378,10 +416,11 @@ static struct clk_regmap g12a_cpu_clk = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpu_clk",
-		.ops = &clk_regmap_mux_ro_ops,
+		.ops = &clk_regmap_mux_ops,
 		.parent_names = (const char *[]){ "cpu_clk_dyn",
 						  "sys_pll" },
 		.num_parents = 2,
+		.flags = CLK_SET_RATE_PARENT,
 	},
 };
 
@@ -394,10 +433,11 @@ static struct clk_regmap g12b_cpu_clk = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpu_clk",
-		.ops = &clk_regmap_mux_ro_ops,
+		.ops = &clk_regmap_mux_ops,
 		.parent_names = (const char *[]){ "cpu_clk_dyn",
 						  "sys1_pll" },
 		.num_parents = 2,
+		.flags = CLK_SET_RATE_PARENT,
 	},
 };
 
@@ -410,7 +450,7 @@ static struct clk_regmap g12b_cpub_clk_premux0 = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpub_clk_dyn0_sel",
-		.ops = &clk_regmap_mux_ro_ops,
+		.ops = &clk_regmap_mux_ops,
 		.parent_names = (const char *[]){ IN_PREFIX "xtal",
 						  "fclk_div2",
 						  "fclk_div3" },
@@ -418,6 +458,35 @@ static struct clk_regmap g12b_cpub_clk_premux0 = {
 	},
 };
 
+/* This divider uses bit 26 to take change in account */
+static int g12b_cpub_clk_mux0_div_set_rate(struct clk_hw *hw, unsigned long rate,
+					  unsigned long parent_rate)
+{
+	struct clk_regmap *clk = to_clk_regmap(hw);
+	struct clk_regmap_div_data *div = clk_get_regmap_div_data(clk);
+	unsigned int val;
+	int ret;
+
+	ret = divider_get_val(rate, parent_rate, div->table, div->width,
+			      div->flags);
+	if (ret < 0)
+		return ret;
+
+	val = (unsigned int)ret << div->shift;
+
+	regmap_update_bits(clk->map, HHI_SYS_CPUB_CLK_CNTL,
+			   SYS_CPU_DYN_ENABLE, SYS_CPU_DYN_ENABLE);
+
+	return regmap_update_bits(clk->map, div->offset,
+				  clk_div_mask(div->width) << div->shift | SYS_CPU_DYN_ENABLE, val);
+};
+
+const struct clk_ops g12b_cpub_clk_mux0_div_ops = {
+	.recalc_rate = clk_regmap_div_recalc_rate,
+	.round_rate = clk_regmap_div_round_rate,
+	.set_rate = g12b_cpub_clk_mux0_div_set_rate,
+};
+
 /* Datasheet names this field as "mux0_divn_tcnt" */
 static struct clk_regmap g12b_cpub_clk_mux0_div = {
 	.data = &(struct clk_regmap_div_data){
@@ -427,9 +496,10 @@ static struct clk_regmap g12b_cpub_clk_mux0_div = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpub_clk_dyn0_div",
-		.ops = &clk_regmap_divider_ro_ops,
+		.ops = &g12b_cpub_clk_mux0_div_ops,
 		.parent_names = (const char *[]){ "cpub_clk_dyn0_sel" },
 		.num_parents = 1,
+		.flags = CLK_SET_RATE_PARENT,
 	},
 };
 
@@ -442,10 +512,11 @@ static struct clk_regmap g12b_cpub_clk_postmux0 = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpub_clk_dyn0",
-		.ops = &clk_regmap_mux_ro_ops,
+		.ops = &clk_regmap_mux_ops,
 		.parent_names = (const char *[]){ "cpub_clk_dyn0_sel",
 						  "cpub_clk_dyn0_div" },
 		.num_parents = 2,
+		.flags = CLK_SET_RATE_PARENT,
 	},
 };
 
@@ -458,11 +529,13 @@ static struct clk_regmap g12b_cpub_clk_premux1 = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpub_clk_dyn1_sel",
-		.ops = &clk_regmap_mux_ro_ops,
+		.ops = &clk_regmap_mux_ops,
 		.parent_names = (const char *[]){ IN_PREFIX "xtal",
 						  "fclk_div2",
 						  "fclk_div3" },
 		.num_parents = 3,
+		/* This sub-tree is used a parking clock */
+		.flags = CLK_SET_RATE_NO_REPARENT,
 	},
 };
 
@@ -490,10 +563,12 @@ static struct clk_regmap g12b_cpub_clk_postmux1 = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpub_clk_dyn1",
-		.ops = &clk_regmap_mux_ro_ops,
+		.ops = &clk_regmap_mux_ops,
 		.parent_names = (const char *[]){ "cpub_clk_dyn1_sel",
 						  "cpub_clk_dyn1_div" },
 		.num_parents = 2,
+		/* This sub-tree is used a parking clock */
+		.flags = CLK_SET_RATE_NO_REPARENT,
 	},
 };
 
@@ -506,10 +581,11 @@ static struct clk_regmap g12b_cpub_clk_dyn = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpub_clk_dyn",
-		.ops = &clk_regmap_mux_ro_ops,
+		.ops = &clk_regmap_mux_ops,
 		.parent_names = (const char *[]){ "cpub_clk_dyn0",
 						  "cpub_clk_dyn1" },
 		.num_parents = 2,
+		.flags = CLK_SET_RATE_PARENT,
 	},
 };
 
@@ -522,13 +598,228 @@ static struct clk_regmap g12b_cpub_clk = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpub_clk",
-		.ops = &clk_regmap_mux_ro_ops,
+		.ops = &clk_regmap_mux_ops,
 		.parent_names = (const char *[]){ "cpub_clk_dyn",
 						  "sys_pll" },
 		.num_parents = 2,
+		.flags = CLK_SET_RATE_PARENT,
 	},
 };
 
+static int g12a_cpu_clk_mux_notifier_cb(struct notifier_block *nb,
+					unsigned long event, void *data)
+{
+	switch (event) {
+	case POST_RATE_CHANGE:
+	case PRE_RATE_CHANGE:
+		/* Wait for clock propagation before/after changing the mux */
+		udelay(100);
+		return NOTIFY_OK;
+
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+struct notifier_block g12a_cpu_clk_mux_nb = {
+	.notifier_call = g12a_cpu_clk_mux_notifier_cb,
+};
+
+struct g12a_cpu_clk_postmux_nb_data {
+	struct notifier_block nb;
+	struct clk_hw *xtal;
+	struct clk_hw *cpu_clk_dyn;
+	struct clk_hw *cpu_clk_postmux0;
+	struct clk_hw *cpu_clk_postmux1;
+	struct clk_hw *cpu_clk_premux1;
+};
+
+static int g12a_cpu_clk_postmux_notifier_cb(struct notifier_block *nb,
+					 unsigned long event, void *data)
+{
+	struct g12a_cpu_clk_postmux_nb_data *nb_data =
+		container_of(nb, struct g12a_cpu_clk_postmux_nb_data, nb);
+
+	switch (event) {
+	case PRE_RATE_CHANGE:
+		/*
+		 * This notifier means cpu_clk_postmux0 clock will be changed
+		 * to feed cpu_clk, this the current path :
+		 * cpu_clk
+		 *    \- cpu_clk_dyn
+		 *          \- cpu_clk_postmux0
+		 *                \- cpu_clk_muxX_div
+		 *                      \- cpu_clk_premux0
+		 *				\- fclk_div3 or fclk_div2
+		 *		OR
+		 *                \- cpu_clk_premux0
+		 *			\- fclk_div3 or fclk_div2
+		 */
+
+		/* Setup cpu_clk_premux1 to xtal */
+		clk_hw_set_parent(nb_data->cpu_clk_premux1,
+				  nb_data->xtal);
+
+		/* Setup cpu_clk_postmux1 to bypass divider */
+		clk_hw_set_parent(nb_data->cpu_clk_postmux1,
+				  nb_data->cpu_clk_premux1);
+
+		/* Switch to parking clk on cpu_clk_postmux1 */
+		clk_hw_set_parent(nb_data->cpu_clk_dyn,
+				  nb_data->cpu_clk_postmux1);
+
+		/*
+		 * Now, cpu_clk is 24MHz in the current path :
+		 * cpu_clk
+		 *    \- cpu_clk_dyn
+		 *          \- cpu_clk_postmux1
+		 *                \- cpu_clk_premux1
+		 *                      \- xtal
+		 */
+
+		udelay(100);
+
+		return NOTIFY_OK;
+
+	case POST_RATE_CHANGE:
+		/*
+		 * The cpu_clk_postmux0 has ben updated, now switch back
+		 * cpu_clk_dyn to cpu_clk_postmux0 and take the changes
+		 * in account.
+		 */
+
+		/* Configure cpu_clk_dyn back to cpu_clk_postmux0 */
+		clk_hw_set_parent(nb_data->cpu_clk_dyn,
+				  nb_data->cpu_clk_postmux0);
+
+		/*
+		 * new path :
+		 * cpu_clk
+		 *    \- cpu_clk_dyn
+		 *          \- cpu_clk_postmux0
+		 *                \- cpu_clk_muxX_div
+		 *                      \- cpu_clk_premux0
+		 *				\- fclk_div3 or fclk_div2
+		 *		OR
+		 *                \- cpu_clk_premux0
+		 *			\- fclk_div3 or fclk_div2
+		 */
+
+		udelay(100);
+
+		return NOTIFY_OK;
+
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static struct g12a_cpu_clk_postmux_nb_data g12a_cpu_clk_postmux0_nb_data = {
+	.cpu_clk_dyn = &g12a_cpu_clk_dyn.hw,
+	.cpu_clk_postmux0 = &g12a_cpu_clk_postmux0.hw,
+	.cpu_clk_postmux1 = &g12a_cpu_clk_postmux1.hw,
+	.cpu_clk_premux1 = &g12a_cpu_clk_premux1.hw,
+	.nb.notifier_call = g12a_cpu_clk_postmux_notifier_cb,
+};
+
+static struct g12a_cpu_clk_postmux_nb_data g12b_cpub_clk_postmux0_nb_data = {
+	.cpu_clk_dyn = &g12b_cpub_clk_dyn.hw,
+	.cpu_clk_postmux0 = &g12b_cpub_clk_postmux0.hw,
+	.cpu_clk_postmux1 = &g12b_cpub_clk_postmux1.hw,
+	.cpu_clk_premux1 = &g12b_cpub_clk_premux1.hw,
+	.nb.notifier_call = g12a_cpu_clk_postmux_notifier_cb,
+};
+
+struct g12a_sys_pll_nb_data {
+	struct notifier_block nb;
+	struct clk_hw *sys_pll;
+	struct clk_hw *cpu_clk;
+	struct clk_hw *cpu_clk_dyn;
+};
+
+static int g12a_sys_pll_notifier_cb(struct notifier_block *nb,
+				    unsigned long event, void *data)
+{
+	struct g12a_sys_pll_nb_data *nb_data =
+		container_of(nb, struct g12a_sys_pll_nb_data, nb);
+
+	switch (event) {
+	case PRE_RATE_CHANGE:
+		/*
+		 * This notifier means sys_pll clock will be changed
+		 * to feed cpu_clk, this the current path :
+		 * cpu_clk
+		 *    \- sys_pll
+		 *          \- sys_pll_dco
+		 */
+
+		/* Configure cpu_clk to use cpu_clk_dyn */
+		clk_hw_set_parent(nb_data->cpu_clk,
+				  nb_data->cpu_clk_dyn);
+
+		/*
+		 * Now, cpu_clk uses the dyn path
+		 * cpu_clk
+		 *    \- cpu_clk_dyn
+		 *          \- cpu_clk_dynX
+		 *                \- cpu_clk_dynX_sel
+		 *		     \- cpu_clk_dynX_div
+		 *                      \- xtal/fclk_div2/fclk_div3
+		 *                   \- xtal/fclk_div2/fclk_div3
+		 */
+
+		udelay(100);
+
+		return NOTIFY_OK;
+
+	case POST_RATE_CHANGE:
+		/*
+		 * The sys_pll has ben updated, now switch back cpu_clk to
+		 * sys_pll
+		 */
+
+		/* Configure cpu_clk to use sys_pll */
+		clk_hw_set_parent(nb_data->cpu_clk,
+				  nb_data->sys_pll);
+
+		udelay(100);
+
+		/* new path :
+		 * cpu_clk
+		 *    \- sys_pll
+		 *          \- sys_pll_dco
+		 */
+
+		return NOTIFY_OK;
+
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static struct g12a_sys_pll_nb_data g12a_sys_pll_nb_data = {
+	.sys_pll = &g12a_sys_pll.hw,
+	.cpu_clk = &g12a_cpu_clk.hw,
+	.cpu_clk_dyn = &g12a_cpu_clk_dyn.hw,
+	.nb.notifier_call = g12a_sys_pll_notifier_cb,
+};
+
+/* G12B first CPU cluster uses sys1_pll */
+static struct g12a_sys_pll_nb_data g12b_cpu_clk_sys1_pll_nb_data = {
+	.sys_pll = &g12b_sys1_pll.hw,
+	.cpu_clk = &g12b_cpu_clk.hw,
+	.cpu_clk_dyn = &g12a_cpu_clk_dyn.hw,
+	.nb.notifier_call = g12a_sys_pll_notifier_cb,
+};
+
+/* G12B second CPU cluster uses sys_pll */
+static struct g12a_sys_pll_nb_data g12b_cpub_clk_sys_pll_nb_data = {
+	.sys_pll = &g12a_sys_pll.hw,
+	.cpu_clk = &g12b_cpub_clk.hw,
+	.cpu_clk_dyn = &g12b_cpub_clk_dyn.hw,
+	.nb.notifier_call = g12a_sys_pll_notifier_cb,
+};
+
 static struct clk_regmap g12a_cpu_clk_div16_en = {
 	.data = &(struct clk_regmap_gate_data){
 		.offset = HHI_SYS_CPU_CLK_CNTL1,
@@ -3792,18 +4083,163 @@ static const struct reg_sequence g12a_init_regs[] = {
 	{ .reg = HHI_MPLL_CNTL0,	.def = 0x00000543 },
 };
 
+static int meson_g12a_dvfs_setup_common(struct platform_device *pdev,
+					struct clk_hw **hws)
+{
+	const char *notifier_clk_name;
+	struct clk *notifier_clk;
+	struct clk_hw *xtal;
+	int ret;
+
+	xtal = clk_hw_get_parent_by_index(hws[CLKID_CPU_CLK_DYN1_SEL], 0);
+
+	/* Setup clock notifier for cpu_clk_postmux0 */
+	g12a_cpu_clk_postmux0_nb_data.xtal = xtal;
+	notifier_clk_name = clk_hw_get_name(&g12a_cpu_clk_postmux0.hw);
+	notifier_clk = __clk_lookup(notifier_clk_name);
+	ret = clk_notifier_register(notifier_clk,
+				    &g12a_cpu_clk_postmux0_nb_data.nb);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to register the cpu_clk_postmux0 notifier\n");
+		return ret;
+	}
+
+	/* Setup clock notifier for cpu_clk_dyn mux */
+	notifier_clk_name = clk_hw_get_name(&g12a_cpu_clk_dyn.hw);
+	notifier_clk = __clk_lookup(notifier_clk_name);
+	ret = clk_notifier_register(notifier_clk, &g12a_cpu_clk_mux_nb);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to register the cpu_clk_dyn notifier\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static int meson_g12b_dvfs_setup(struct platform_device *pdev)
+{
+	struct clk_hw **hws = g12b_hw_onecell_data.hws;
+	const char *notifier_clk_name;
+	struct clk *notifier_clk;
+	struct clk_hw *xtal;
+	int ret;
+
+	ret = meson_g12a_dvfs_setup_common(pdev, hws);
+	if (ret)
+		return ret;
+
+	xtal = clk_hw_get_parent_by_index(hws[CLKID_CPU_CLK_DYN1_SEL], 0);
+
+	/* Setup clock notifier for cpu_clk mux */
+	notifier_clk_name = clk_hw_get_name(&g12b_cpu_clk.hw);
+	notifier_clk = __clk_lookup(notifier_clk_name);
+	ret = clk_notifier_register(notifier_clk, &g12a_cpu_clk_mux_nb);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to register the cpu_clk notifier\n");
+		return ret;
+	}
+
+	/* Setup clock notifier for sys1_pll */
+	notifier_clk_name = clk_hw_get_name(&g12b_sys1_pll.hw);
+	notifier_clk = __clk_lookup(notifier_clk_name);
+	ret = clk_notifier_register(notifier_clk,
+				    &g12b_cpu_clk_sys1_pll_nb_data.nb);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to register the sys1_pll notifier\n");
+		return ret;
+	}
+
+	/* Add notifiers for the second CPU cluster */
+
+	/* Setup clock notifier for cpub_clk_postmux0 */
+	g12b_cpub_clk_postmux0_nb_data.xtal = xtal;
+	notifier_clk_name = clk_hw_get_name(&g12b_cpub_clk_postmux0.hw);
+	notifier_clk = __clk_lookup(notifier_clk_name);
+	ret = clk_notifier_register(notifier_clk,
+				    &g12b_cpub_clk_postmux0_nb_data.nb);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to register the cpub_clk_postmux0 notifier\n");
+		return ret;
+	}
+
+	/* Setup clock notifier for cpub_clk_dyn mux */
+	notifier_clk_name = clk_hw_get_name(&g12b_cpub_clk_dyn.hw);
+	notifier_clk = __clk_lookup(notifier_clk_name);
+	ret = clk_notifier_register(notifier_clk, &g12a_cpu_clk_mux_nb);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to register the cpub_clk_dyn notifier\n");
+		return ret;
+	}
+
+	/* Setup clock notifier for cpub_clk mux */
+	notifier_clk_name = clk_hw_get_name(&g12b_cpub_clk.hw);
+	notifier_clk = __clk_lookup(notifier_clk_name);
+	ret = clk_notifier_register(notifier_clk, &g12a_cpu_clk_mux_nb);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to register the cpub_clk notifier\n");
+		return ret;
+	}
+
+	/* Setup clock notifier for sys_pll */
+	notifier_clk_name = clk_hw_get_name(&g12a_sys_pll.hw);
+	notifier_clk = __clk_lookup(notifier_clk_name);
+	ret = clk_notifier_register(notifier_clk,
+				    &g12b_cpub_clk_sys_pll_nb_data.nb);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to register the sys_pll notifier\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+
+static int meson_g12a_dvfs_setup(struct platform_device *pdev)
+{
+	struct clk_hw **hws = g12a_hw_onecell_data.hws;
+	const char *notifier_clk_name;
+	struct clk *notifier_clk;
+	int ret;
+
+	ret = meson_g12a_dvfs_setup_common(pdev, hws);
+	if (ret)
+		return ret;
+
+	/* Setup clock notifier for cpu_clk mux */
+	notifier_clk_name = clk_hw_get_name(&g12a_cpu_clk.hw);
+	notifier_clk = __clk_lookup(notifier_clk_name);
+	ret = clk_notifier_register(notifier_clk, &g12a_cpu_clk_mux_nb);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to register the cpu_clk notifier\n");
+		return ret;
+	}
+
+	/* Setup clock notifier for sys_pll */
+	notifier_clk_name = clk_hw_get_name(&g12a_sys_pll.hw);
+	notifier_clk = __clk_lookup(notifier_clk_name);
+	ret = clk_notifier_register(notifier_clk, &g12a_sys_pll_nb_data.nb);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to register the sys_pll notifier\n");
+		return ret;
+	}
+
+	return 0;
+}
+
 static const struct meson_eeclkc_data g12a_clkc_data = {
 	.regmap_clks = g12a_clk_regmaps,
 	.regmap_clk_num = ARRAY_SIZE(g12a_clk_regmaps),
 	.hw_onecell_data = &g12a_hw_onecell_data,
 	.init_regs = g12a_init_regs,
 	.init_count = ARRAY_SIZE(g12a_init_regs),
+	.setup = meson_g12a_dvfs_setup,
 };
 
 static const struct meson_eeclkc_data g12b_clkc_data = {
 	.regmap_clks = g12a_clk_regmaps,
 	.regmap_clk_num = ARRAY_SIZE(g12a_clk_regmaps),
-	.hw_onecell_data = &g12b_hw_onecell_data
+	.hw_onecell_data = &g12b_hw_onecell_data,
+	.setup = meson_g12b_dvfs_setup,
 };
 
 static const struct of_device_id clkc_match_table[] = {
-- 
2.21.0


_______________________________________________
linux-amlogic mailing list
linux-amlogic@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-amlogic

^ permalink raw reply related

* [PATCH v4 25/25] MAINTAINERS: Add maintainer for IBNBD/IBTRS modules
From: Jack Wang @ 2019-06-20 15:03 UTC (permalink / raw)
  To: linux-block, linux-rdma
  Cc: axboe, hch, sagi, bvanassche, jgg, dledford, danil.kipnis,
	rpenyaev, Roman Pen, Jack Wang
In-Reply-To: <20190620150337.7847-1-jinpuwang@gmail.com>

From: Roman Pen <roman.penyaev@profitbricks.com>

Signed-off-by: Danil Kipnis <danil.kipnis@cloud.ionos.com>
Signed-off-by: Jack Wang <jinpu.wang@cloud.ionos.com>
---
 MAINTAINERS | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index a6954776a37e..0b7fd93f738d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7590,6 +7590,20 @@ IBM ServeRAID RAID DRIVER
 S:	Orphan
 F:	drivers/scsi/ips.*
 
+IBNBD BLOCK DRIVERS
+M:	IBNBD/IBTRS Storage Team <ibnbd@cloud.ionos.com>
+L:	linux-block@vger.kernel.org
+S:	Maintained
+T:	git git://github.com/profitbricks/ibnbd.git
+F:	drivers/block/ibnbd/
+
+IBTRS TRANSPORT DRIVERS
+M:	IBNBD/IBTRS Storage Team <ibnbd@cloud.ionos.com>
+L:	linux-rdma@vger.kernel.org
+S:	Maintained
+T:	git git://github.com/profitbricks/ibnbd.git
+F:	drivers/infiniband/ulp/ibtrs/
+
 ICH LPC AND GPIO DRIVER
 M:	Peter Tyser <ptyser@xes-inc.com>
 S:	Maintained
-- 
2.17.1


^ permalink raw reply related

* [PATCH v4 24/25] ibnbd: a bit of documentation
From: Jack Wang @ 2019-06-20 15:03 UTC (permalink / raw)
  To: linux-block, linux-rdma
  Cc: axboe, hch, sagi, bvanassche, jgg, dledford, danil.kipnis,
	rpenyaev, Roman Pen, Jack Wang
In-Reply-To: <20190620150337.7847-1-jinpuwang@gmail.com>

From: Roman Pen <roman.penyaev@profitbricks.com>

README with description of major sysfs entries.

Signed-off-by: Danil Kipnis <danil.kipnis@cloud.ionos.com>
Signed-off-by: Jack Wang <jinpu.wang@cloud.ionos.com>
---
 drivers/block/ibnbd/README | 315 +++++++++++++++++++++++++++++++++++++
 1 file changed, 315 insertions(+)
 create mode 100644 drivers/block/ibnbd/README

diff --git a/drivers/block/ibnbd/README b/drivers/block/ibnbd/README
new file mode 100644
index 000000000000..ce9293c290b9
--- /dev/null
+++ b/drivers/block/ibnbd/README
@@ -0,0 +1,315 @@
+***************************************
+Infiniband Network Block Device (IBNBD)
+***************************************
+
+Introduction
+------------
+
+IBNBD (InfiniBand Network Block Device) is a pair of kernel modules
+(client and server) that allow for remote access of a block device on
+the server over IBTRS protocol using the RDMA (InfiniBand, RoCE, iWarp)
+transport. After being mapped, the remote block devices can be accessed
+on the client side as local block devices.
+
+I/O is transferred between client and server by the IBTRS transport
+modules. The administration of IBNBD and IBTRS modules is done via
+sysfs entries.
+
+Requirements
+------------
+
+  IBTRS kernel modules
+
+Quick Start
+-----------
+
+Server side:
+  # modprobe ibnbd_server
+
+Client side:
+  # modprobe ibnbd_client
+  # echo "sessname=blya path=ip:10.50.100.66 device_path=/dev/ram0" > \
+            /sys/devices/virtual/ibnbd-client/ctl/map_device
+
+  Where "sessname=" is a session name, a string to identify the session
+  on client and on server sides; "path=" is a destination IP address or
+  a pair of a source and a destination IPs, separated by comma.  Multiple
+  "path=" options can be specified in order to use multipath  (see IBTRS
+  description for details); "device_path=" is the block device to be
+  mapped from the server side. After the session to the server machine is
+  established, the mapped device will appear on the client side under
+  /dev/ibnbd<N>.
+
+
+======================
+Client Sysfs Interface
+======================
+
+All sysfs files that are not read-only provide the usage information on read:
+
+Example:
+  # cat /sys/devices/virtual/ibnbd-client/ctl/map_device
+
+  > Usage: echo "sessname=<name of the ibtrs session> path=<[srcaddr,]dstaddr>
+  > [path=<[srcaddr,]dstaddr>] device_path=<full path on remote side>
+  > [access_mode=<ro|rw|migration>]
+  > [io_mode=<fileio|blockio>]" > map_device
+  >
+  > addr ::= [ ip:<ipv4> | ip:<ipv6> | gid:<gid> ]
+
+Entries under /sys/devices/virtual/ibnbd-client/ctl/
+=======================================
+
+map_device (RW)
+---------------
+
+Expected format is the following:
+
+    sessname=<name of the ibtrs session>
+    path=<[srcaddr,]dstaddr> [path=<[srcaddr,]dstaddr> ...]
+    device_path=<full path on remote side>
+    [access_mode=<ro|rw|migration>]
+    [io_mode=<fileio|blockio>]
+
+Where:
+
+sessname: accepts a string not bigger than 256 chars, which identifies
+          a given session on the client and on the server.
+          I.e. "clt_hostname-srv_hostname" could be a natural choice.
+
+path:     describes a connection between the client and the server by
+      specifying destination and, when required, the source address.
+      The addresses are to be provided in the following format:
+
+            ip:<IPv6>
+            ip:<IPv4>
+            gid:<GID>
+
+          for example:
+
+          path=ip:10.0.0.66
+                         The single addr is treated as the destination.
+                         The connection will be established to this
+                         server from any client IP address.
+
+          path=ip:10.0.0.66,ip:10.0.1.66
+                         First addr is the source address and the second
+                         is the destination.
+
+          If multiple "path=" options are specified multiple connection
+          will be established and data will be sent according to
+          the selected multipath policy (see IBTRS mp_policy sysfs entry
+          description).
+
+device_path: Path to the block device on the server side. Path is specified
+         relative to the directory on server side configured in the
+         'dev_search_path' module parameter of the ibnbd_server.
+         The ibnbd_server prepends the <device_path> received from client
+         with <dev_search_path> and tries to open the
+         <dev_search_path>/<device_path> block device.  On success,
+         a /dev/ibnbd<N> device file, a /sys/block/ibnbd_client/ibnbd<N>/
+         directory and an entry in /sys/devices/virtual/ibnbd-client/ctl/devices
+         will be created.
+
+         If 'dev_search_path' contains '%SESSNAME%', then each session can
+         have different devices namespace, e.g. server was configured with
+         the following parameter "dev_search_path=/run/ibnbd-devs/%SESSNAME%",
+         client has this string "sessname=blya device_path=sda", then server
+         will try to open: /run/ibnbd-devs/blya/sda.
+
+access_mode: the access_mode parameter specifies if the device is to be
+             mapped as "ro" read-only or "rw" read-write. The server allows
+             a device to be exported in rw mode only once. The "migration"
+             access mode has to be specified if a second mapping in read-write
+             mode is desired.
+
+             By default "rw" is used.
+
+io_mode:  the io_mode parameter specifies if the device on the server
+          will be opened as block device "blockio" or as file "fileio".
+          When the device is opened as file, the VFS page cache is used
+          for read I/O operations, write I/O operations bypass the page
+          cache and go directly to disk (except meta updates, like file
+          access time).
+
+          By default "blockio" mode is used.
+
+Exit Codes:
+
+If the device is already mapped it will fail with EEXIST. If the input
+has an invalid format it will return EINVAL. If the device path cannot
+be found on the server, it will fail with ENOENT.
+
+Finding device file after mapping
+---------------------------------
+
+After mapping, the device file can be found by:
+ o  The symlink /sys/devices/virtual/ibnbd-client/ctl/devices/<device_id>
+    points to /sys/block/<dev-name>. The last part of the symlink destination
+    is the same as the device name.  By extracting the last part of the
+    path the path to the device /dev/<dev-name> can be build.
+
+ o /dev/block/$(cat /sys/devices/virtual/ibnbd-client/ctl/devices/<device_id>/dev)
+
+How to find the <device_id> of the device is described on the next
+section.
+
+Entries under /sys/devices/virtual/ibnbd-client/ctl/devices/
+============================================================
+
+For each device mapped on the client a new symbolic link is created as
+/sys/devices/virtual/ibnbd-client/ctl/devices/<device_id>, which points
+to the block device created by ibnbd (/sys/block/ibnbd<N>/).
+The <device_id> of each device is created as follows:
+
+- If the 'device_path' provided during mapping contains slashes ("/"),
+  they are replaced by exclamation mark ("!") and used as as the
+  <device_id>. Otherwise, the <device_id> will be the same as the
+  "device_path" provided.
+
+Entries under /sys/block/ibnbd<N>/ibnbd_client/
+===============================================
+
+unmap_device (RW)
+-----------------
+
+To unmap a volume, "normal" or "force" has to be written to:
+  /sys/block/ibnbd<N>/ibnbd_client/unmap_device
+
+When "normal" is used, the operation will fail with EBUSY if any process
+is using the device.  When "force" is used, the device is also unmapped
+when device is in use.  All I/Os that are in progress will fail.
+
+Example:
+
+   # echo "normal" > /sys/block/ibnbd0/ibnbd/unmap_device
+
+state (RO)
+----------
+
+The file contains the current state of the block device. The state file
+returns "open" when the device is successfully mapped from the server
+and accepting I/O requests. When the connection to the server gets
+disconnected in case of an error (e.g. link failure), the state file
+returns "closed" and all I/O requests submitted to it will fail with -EIO.
+
+session (RO)
+------------
+
+IBNBD uses IBTRS session to transport the data between client and
+server.  The entry "session" contains the name of the session, that
+was used to establish the IBTRS session.  It's the same name that
+was passed as server parameter to the map_device entry.
+
+mapping_path (RO)
+-----------------
+
+Contains the path that was passed as "device_path" to the map_device
+operation.
+
+io_mode (RO)
+------------
+
+Contains the way device is accessed: blockio or fileio
+
+access_mode (RO)
+----------------
+
+Contains the device access mode: ro, rw or migration.
+
+======================
+Server Sysfs Interface
+======================
+
+Entries under /sys/devices/virtual/ibnbd-server/ctl/
+====================================================
+
+When a client maps a device, a directory entry with the name of the
+block device is created under /sys/devices/virtual/ibnbd-server/ctl/devices/.
+
+Entries under /sys/devices/virtual/ibnbd-server/ctl/devices/<device_name>/
+==========================================================================
+
+block_dev (link)
+---------------
+
+Is a symlink to the sysfs entry of the exported device.
+
+Example:
+
+  block_dev -> ../../../../devices/virtual/block/ram0
+
+io_mode (RO)
+-----------
+
+Contains the way device is accessed: blockio or fileio
+
+Entries under /sys/devices/virtual/ibnbd-server/ctl/devices/<device_name>/sessions/
+===================================================================================
+
+For each client a particular device is exported to, following directory will be
+created:
+
+/sys/devices/virtual/ibnbd-server/ctl/devices/<device_name>/sessions/<session-name>/
+
+When the device is unmapped by that client, the directory will be removed.
+
+Entries under /sys/devices/virtual/ibnbd-server/ctl/devices/<device_name>/sessions/<session-name>
+=================================================================================================
+
+read_only (RO)
+--------------
+
+Contains '1' if device is mapped read-only, otherwise '0'.
+
+mapping_path (RO)
+-----------------
+
+Contains the relative device path provided by the user during mapping.
+
+access_mode (RO)
+-----------------
+
+Contains the device access mode: ro, rw or migration.
+
+
+==============================
+IBNBD-Server Module Parameters
+==============================
+
+dev_search_path
+---------------
+
+When a device is mapped from the client, the server generates the path
+to the block device on the server side by concatenating dev_search_path
+and the "device_path" that was specified in the map_device operation.
+
+The default dev_search_path is: "/".
+
+dev_search_path option can also contain %SESSNAME% in order to provide
+different deviec namespaces for different sessions.  See "device_path"
+option for details.
+
+==============================
+Protocol (ibnbd/ibnbd-proto.h)
+==============================
+
+1. Before mapping first device from a given server, client sends an
+IBNBD_MSG_SESS_INFO to the server. Server responds with
+IBNBD_MSG_SESS_INFO_RSP. Currently the messages only contain the protocol
+version for backward compatibility.
+
+2. Client requests to open a device by sending IBNBD_MSG_OPEN message. This
+contains the path to the device, access mode (read-only or writable), and
+io_mode which specifies if the device should be opened as block device or
+using file io. Server responds to the message with IBNBD_MSG_OPEN_RSP. This
+contains a 32 bit device id to be used for  IOs and device "geometry" related
+information: side, max_hw_sectors, etc.
+
+3. Client attaches IBNBD_MSG_IO to each IO message send to a device. This
+message contains device id, provided by server in his ibnbd_msg_open_rsp,
+sector to be accessed, read-write flags and bi_size.
+
+4. Client closes a device by sending IBNBD_MSG_CLOSE which contains only the
+device id provided by the server.
+
-- 
2.17.1


^ permalink raw reply related

* [PATCH v4 23/25] ibnbd: include client and server modules into kernel compilation
From: Jack Wang @ 2019-06-20 15:03 UTC (permalink / raw)
  To: linux-block, linux-rdma
  Cc: axboe, hch, sagi, bvanassche, jgg, dledford, danil.kipnis,
	rpenyaev, Roman Pen, Jack Wang
In-Reply-To: <20190620150337.7847-1-jinpuwang@gmail.com>

From: Roman Pen <roman.penyaev@profitbricks.com>

Add IBNBD Makefile, Kconfig and also corresponding lines into upper
block layer files.

Signed-off-by: Danil Kipnis <danil.kipnis@cloud.ionos.com>
Signed-off-by: Jack Wang <jinpu.wang@cloud.ionos.com>
---
 drivers/block/Kconfig        |  2 ++
 drivers/block/Makefile       |  1 +
 drivers/block/ibnbd/Kconfig  | 24 ++++++++++++++++++++++++
 drivers/block/ibnbd/Makefile | 13 +++++++++++++
 4 files changed, 40 insertions(+)
 create mode 100644 drivers/block/ibnbd/Kconfig
 create mode 100644 drivers/block/ibnbd/Makefile

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 20bb4bfa4be6..9904c030d488 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -468,4 +468,6 @@ config BLK_DEV_RSXX
 	  To compile this driver as a module, choose M here: the
 	  module will be called rsxx.
 
+source "drivers/block/ibnbd/Kconfig"
+
 endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index a53cc1e3a2d3..bde0b015e07a 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX)	+= mtip32xx/
 
 obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/
 obj-$(CONFIG_ZRAM) += zram/
+obj-$(CONFIG_BLK_DEV_IBNBD)	+= ibnbd/
 
 obj-$(CONFIG_BLK_DEV_NULL_BLK)	+= null_blk.o
 null_blk-objs	:= null_blk_main.o
diff --git a/drivers/block/ibnbd/Kconfig b/drivers/block/ibnbd/Kconfig
new file mode 100644
index 000000000000..936a91c8392e
--- /dev/null
+++ b/drivers/block/ibnbd/Kconfig
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+config BLK_DEV_IBNBD
+	bool
+
+config BLK_DEV_IBNBD_CLIENT
+	tristate "Network block device driver on top of IBTRS transport"
+	depends on INFINIBAND_IBTRS_CLIENT
+	select BLK_DEV_IBNBD
+	help
+	  IBNBD client allows for mapping of a remote block devices over
+	  IBTRS protocol from a target system where IBNBD server is running.
+
+	  If unsure, say N.
+
+config BLK_DEV_IBNBD_SERVER
+	tristate "Network block device over RDMA Infiniband server support"
+	depends on INFINIBAND_IBTRS_SERVER
+	select BLK_DEV_IBNBD
+	help
+	  IBNBD server allows for exporting local block devices to a remote client
+	  over IBTRS protocol.
+
+	  If unsure, say N.
diff --git a/drivers/block/ibnbd/Makefile b/drivers/block/ibnbd/Makefile
new file mode 100644
index 000000000000..4eb817a16ed2
--- /dev/null
+++ b/drivers/block/ibnbd/Makefile
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+ccflags-y := -Idrivers/infiniband/ulp/ibtrs
+
+ibnbd-client-y := ibnbd-clt.o \
+		  ibnbd-clt-sysfs.o
+
+ibnbd-server-y := ibnbd-srv.o \
+		  ibnbd-srv-dev.o \
+		  ibnbd-srv-sysfs.o
+
+obj-$(CONFIG_BLK_DEV_IBNBD_CLIENT) += ibnbd-client.o
+obj-$(CONFIG_BLK_DEV_IBNBD_SERVER) += ibnbd-server.o
-- 
2.17.1


^ permalink raw reply related

* [PATCH v4 22/25] ibnbd: server: sysfs interface functions
From: Jack Wang @ 2019-06-20 15:03 UTC (permalink / raw)
  To: linux-block, linux-rdma
  Cc: axboe, hch, sagi, bvanassche, jgg, dledford, danil.kipnis,
	rpenyaev, Roman Pen, Jack Wang
In-Reply-To: <20190620150337.7847-1-jinpuwang@gmail.com>

From: Roman Pen <roman.penyaev@profitbricks.com>

This is the sysfs interface to IBNBD mapped devices on server side:

  /sys/devices/virtual/ibnbd-server/ctl/devices/<device_name>/
    |- block_dev
    |  *** link pointing to the corresponding block device sysfs entry
    |
    |- sessions/<session-name>/
    |  *** sessions directory
       |
       |- read_only
       |  *** is devices mapped as read only
       |
       |- mapping_path
          *** relative device path provided by the client during mapping

Signed-off-by: Danil Kipnis <danil.kipnis@cloud.ionos.com>
Signed-off-by: Jack Wang <jinpu.wang@cloud.ionos.com>
---
 drivers/block/ibnbd/ibnbd-srv-sysfs.c | 270 ++++++++++++++++++++++++++
 1 file changed, 270 insertions(+)
 create mode 100644 drivers/block/ibnbd/ibnbd-srv-sysfs.c

diff --git a/drivers/block/ibnbd/ibnbd-srv-sysfs.c b/drivers/block/ibnbd/ibnbd-srv-sysfs.c
new file mode 100644
index 000000000000..2b40514950ed
--- /dev/null
+++ b/drivers/block/ibnbd/ibnbd-srv-sysfs.c
@@ -0,0 +1,270 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler <mail@fholler.de>
+ *          Jack Wang <jinpu.wang@profitbricks.com>
+ *          Kleber Souza <kleber.souza@profitbricks.com>
+ *          Danil Kipnis <danil.kipnis@profitbricks.com>
+ *          Roman Penyaev <roman.penyaev@profitbricks.com>
+ *          Milind Dumbare <Milind.dumbare@gmail.com>
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis <danil.kipnis@profitbricks.com>
+ *          Roman Penyaev <roman.penyaev@profitbricks.com>
+ *
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Authors: Roman Penyaev <roman.penyaev@profitbricks.com>
+ *          Jack Wang <jinpu.wang@cloud.ionos.com>
+ *          Danil Kipnis <danil.kipnis@cloud.ionos.com>
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include <uapi/linux/limits.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/stat.h>
+#include <linux/genhd.h>
+#include <linux/list.h>
+#include <linux/moduleparam.h>
+#include <linux/device.h>
+
+#include "ibnbd-srv.h"
+
+static struct device *ibnbd_dev;
+static struct class *ibnbd_dev_class;
+static struct kobject *ibnbd_devs_kobj;
+
+static ssize_t ibnbd_srv_dev_mode_show(struct kobject *kobj,
+				       struct kobj_attribute *attr,
+				       char *page)
+{
+	struct ibnbd_srv_dev *srv_dev;
+
+	srv_dev = container_of(kobj, struct ibnbd_srv_dev, dev_kobj);
+
+	return scnprintf(page, PAGE_SIZE, "%s\n",
+			 ibnbd_io_mode_str(srv_dev->mode));
+}
+
+static struct kobj_attribute ibnbd_srv_dev_mode_attr =
+	__ATTR(io_mode, 0444, ibnbd_srv_dev_mode_show, NULL);
+
+static struct attribute *ibnbd_srv_default_dev_attrs[] = {
+	&ibnbd_srv_dev_mode_attr.attr,
+	NULL,
+};
+
+static struct attribute_group ibnbd_srv_default_dev_attr_group = {
+	.attrs = ibnbd_srv_default_dev_attrs,
+};
+
+static struct kobj_type ktype = {
+	.sysfs_ops	= &kobj_sysfs_ops,
+};
+
+int ibnbd_srv_create_dev_sysfs(struct ibnbd_srv_dev *dev,
+			       struct block_device *bdev,
+			       const char *dir_name)
+{
+	struct kobject *bdev_kobj;
+	int ret;
+
+	ret = kobject_init_and_add(&dev->dev_kobj, &ktype,
+				   ibnbd_devs_kobj, dir_name);
+	if (ret)
+		return ret;
+
+	ret = kobject_init_and_add(&dev->dev_sessions_kobj,
+				   &ktype,
+				   &dev->dev_kobj, "sessions");
+	if (ret)
+		goto err;
+
+	ret = sysfs_create_group(&dev->dev_kobj,
+				 &ibnbd_srv_default_dev_attr_group);
+	if (ret)
+		goto err2;
+
+	bdev_kobj = &disk_to_dev(bdev->bd_disk)->kobj;
+	ret = sysfs_create_link(&dev->dev_kobj, bdev_kobj, "block_dev");
+	if (ret)
+		goto err3;
+
+	return 0;
+
+err3:
+	sysfs_remove_group(&dev->dev_kobj,
+			   &ibnbd_srv_default_dev_attr_group);
+err2:
+	kobject_del(&dev->dev_sessions_kobj);
+	kobject_put(&dev->dev_sessions_kobj);
+err:
+	kobject_del(&dev->dev_kobj);
+	kobject_put(&dev->dev_kobj);
+	return ret;
+}
+
+void ibnbd_srv_destroy_dev_sysfs(struct ibnbd_srv_dev *dev)
+{
+	sysfs_remove_link(&dev->dev_kobj, "block_dev");
+	sysfs_remove_group(&dev->dev_kobj, &ibnbd_srv_default_dev_attr_group);
+	kobject_del(&dev->dev_sessions_kobj);
+	kobject_put(&dev->dev_sessions_kobj);
+	kobject_del(&dev->dev_kobj);
+	kobject_put(&dev->dev_kobj);
+}
+
+static ssize_t ibnbd_srv_dev_session_ro_show(struct kobject *kobj,
+					     struct kobj_attribute *attr,
+					     char *page)
+{
+	struct ibnbd_srv_sess_dev *sess_dev;
+
+	sess_dev = container_of(kobj, struct ibnbd_srv_sess_dev, kobj);
+
+	return scnprintf(page, PAGE_SIZE, "%s\n",
+			 (sess_dev->open_flags & FMODE_WRITE) ? "0" : "1");
+}
+
+static struct kobj_attribute ibnbd_srv_dev_session_ro_attr =
+	__ATTR(read_only, 0444,
+	       ibnbd_srv_dev_session_ro_show,
+	       NULL);
+
+static ssize_t
+ibnbd_srv_dev_session_access_mode_show(struct kobject *kobj,
+				       struct kobj_attribute *attr,
+				       char *page)
+{
+	struct ibnbd_srv_sess_dev *sess_dev;
+
+	sess_dev = container_of(kobj, struct ibnbd_srv_sess_dev, kobj);
+
+	return scnprintf(page, PAGE_SIZE, "%s\n",
+			 ibnbd_access_mode_str(sess_dev->access_mode));
+}
+
+static struct kobj_attribute ibnbd_srv_dev_session_access_mode_attr =
+	__ATTR(access_mode, 0444,
+	       ibnbd_srv_dev_session_access_mode_show,
+	       NULL);
+
+static ssize_t
+ibnbd_srv_dev_session_mapping_path_show(struct kobject *kobj,
+					struct kobj_attribute *attr, char *page)
+{
+	struct ibnbd_srv_sess_dev *sess_dev;
+
+	sess_dev = container_of(kobj, struct ibnbd_srv_sess_dev, kobj);
+
+	return scnprintf(page, PAGE_SIZE, "%s\n", sess_dev->pathname);
+}
+
+static struct kobj_attribute ibnbd_srv_dev_session_mapping_path_attr =
+	__ATTR(mapping_path, 0444,
+	       ibnbd_srv_dev_session_mapping_path_show,
+	       NULL);
+
+static struct attribute *ibnbd_srv_default_dev_sessions_attrs[] = {
+	&ibnbd_srv_dev_session_access_mode_attr.attr,
+	&ibnbd_srv_dev_session_ro_attr.attr,
+	&ibnbd_srv_dev_session_mapping_path_attr.attr,
+	NULL,
+};
+
+static struct attribute_group ibnbd_srv_default_dev_session_attr_group = {
+	.attrs = ibnbd_srv_default_dev_sessions_attrs,
+};
+
+void ibnbd_srv_destroy_dev_session_sysfs(struct ibnbd_srv_sess_dev *sess_dev)
+{
+	DECLARE_COMPLETION_ONSTACK(sysfs_compl);
+
+	sysfs_remove_group(&sess_dev->kobj,
+			   &ibnbd_srv_default_dev_session_attr_group);
+
+	sess_dev->sysfs_release_compl = &sysfs_compl;
+	kobject_del(&sess_dev->kobj);
+	kobject_put(&sess_dev->kobj);
+	wait_for_completion(&sysfs_compl);
+}
+
+static void ibnbd_srv_sess_dev_release(struct kobject *kobj)
+{
+	struct ibnbd_srv_sess_dev *sess_dev;
+
+	sess_dev = container_of(kobj, struct ibnbd_srv_sess_dev, kobj);
+	if (sess_dev->sysfs_release_compl)
+		complete_all(sess_dev->sysfs_release_compl);
+}
+
+static struct kobj_type ibnbd_srv_sess_dev_ktype = {
+	.sysfs_ops	= &kobj_sysfs_ops,
+	.release	= ibnbd_srv_sess_dev_release,
+};
+
+int ibnbd_srv_create_dev_session_sysfs(struct ibnbd_srv_sess_dev *sess_dev)
+{
+	int ret;
+
+	ret = kobject_init_and_add(&sess_dev->kobj, &ibnbd_srv_sess_dev_ktype,
+				   &sess_dev->dev->dev_sessions_kobj, "%s",
+				   sess_dev->sess->sessname);
+	if (ret)
+		return ret;
+
+	ret = sysfs_create_group(&sess_dev->kobj,
+				 &ibnbd_srv_default_dev_session_attr_group);
+	if (ret)
+		goto err;
+
+	return 0;
+
+err:
+	kobject_del(&sess_dev->kobj);
+	kobject_put(&sess_dev->kobj);
+
+	return ret;
+}
+
+int ibnbd_srv_create_sysfs_files(void)
+{
+	int err;
+
+	ibnbd_dev_class = class_create(THIS_MODULE, "ibnbd-server");
+	if (unlikely(IS_ERR(ibnbd_dev_class)))
+		return PTR_ERR(ibnbd_dev_class);
+
+	ibnbd_dev = device_create(ibnbd_dev_class, NULL,
+				  MKDEV(0, 0), NULL, "ctl");
+	if (unlikely(IS_ERR(ibnbd_dev))) {
+		err = PTR_ERR(ibnbd_dev);
+		goto cls_destroy;
+	}
+	ibnbd_devs_kobj = kobject_create_and_add("devices", &ibnbd_dev->kobj);
+	if (unlikely(!ibnbd_devs_kobj)) {
+		err = -ENOMEM;
+		goto dev_destroy;
+	}
+
+	return 0;
+
+dev_destroy:
+	device_destroy(ibnbd_dev_class, MKDEV(0, 0));
+cls_destroy:
+	class_destroy(ibnbd_dev_class);
+
+	return err;
+}
+
+void ibnbd_srv_destroy_sysfs_files(void)
+{
+	kobject_del(ibnbd_devs_kobj);
+	kobject_put(ibnbd_devs_kobj);
+	device_destroy(ibnbd_dev_class, MKDEV(0, 0));
+	class_destroy(ibnbd_dev_class);
+}
-- 
2.17.1


^ permalink raw reply related

* [PATCH v4 21/25] ibnbd: server: functionality for IO submission to file or block dev
From: Jack Wang @ 2019-06-20 15:03 UTC (permalink / raw)
  To: linux-block, linux-rdma
  Cc: axboe, hch, sagi, bvanassche, jgg, dledford, danil.kipnis,
	rpenyaev, Roman Pen, Jack Wang
In-Reply-To: <20190620150337.7847-1-jinpuwang@gmail.com>

From: Roman Pen <roman.penyaev@profitbricks.com>

This provides helper functions for IO submission to file or block dev.

Signed-off-by: Danil Kipnis <danil.kipnis@cloud.ionos.com>
Signed-off-by: Jack Wang <jinpu.wang@cloud.ionos.com>
---
 drivers/block/ibnbd/ibnbd-srv-dev.c | 408 ++++++++++++++++++++++++++++
 drivers/block/ibnbd/ibnbd-srv-dev.h | 143 ++++++++++
 2 files changed, 551 insertions(+)
 create mode 100644 drivers/block/ibnbd/ibnbd-srv-dev.c
 create mode 100644 drivers/block/ibnbd/ibnbd-srv-dev.h

diff --git a/drivers/block/ibnbd/ibnbd-srv-dev.c b/drivers/block/ibnbd/ibnbd-srv-dev.c
new file mode 100644
index 000000000000..5c1a518638b2
--- /dev/null
+++ b/drivers/block/ibnbd/ibnbd-srv-dev.c
@@ -0,0 +1,408 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler <mail@fholler.de>
+ *          Jack Wang <jinpu.wang@profitbricks.com>
+ *          Kleber Souza <kleber.souza@profitbricks.com>
+ *          Danil Kipnis <danil.kipnis@profitbricks.com>
+ *          Roman Penyaev <roman.penyaev@profitbricks.com>
+ *          Milind Dumbare <Milind.dumbare@gmail.com>
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis <danil.kipnis@profitbricks.com>
+ *          Roman Penyaev <roman.penyaev@profitbricks.com>
+ *
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Authors: Roman Penyaev <roman.penyaev@profitbricks.com>
+ *          Jack Wang <jinpu.wang@cloud.ionos.com>
+ *          Danil Kipnis <danil.kipnis@cloud.ionos.com>
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include "ibnbd-srv-dev.h"
+#include "ibnbd-log.h"
+
+#define IBNBD_DEV_MAX_FILEIO_ACTIVE_WORKERS 0
+
+struct ibnbd_dev_file_io_work {
+	struct ibnbd_dev	*dev;
+	void			*priv;
+
+	sector_t		sector;
+	void			*data;
+	size_t			len;
+	size_t			bi_size;
+	enum ibnbd_io_flags	flags;
+
+	struct work_struct	work;
+};
+
+struct ibnbd_dev_blk_io {
+	struct ibnbd_dev *dev;
+	void		 *priv;
+};
+
+static struct workqueue_struct *fileio_wq;
+
+int ibnbd_dev_init(void)
+{
+	fileio_wq = alloc_workqueue("%s", WQ_UNBOUND,
+				    IBNBD_DEV_MAX_FILEIO_ACTIVE_WORKERS,
+				    "ibnbd_server_fileio_wq");
+	if (!fileio_wq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void ibnbd_dev_destroy(void)
+{
+	destroy_workqueue(fileio_wq);
+}
+
+static inline struct block_device *ibnbd_dev_open_bdev(const char *path,
+						       fmode_t flags)
+{
+	return blkdev_get_by_path(path, flags, THIS_MODULE);
+}
+
+static int ibnbd_dev_blk_open(struct ibnbd_dev *dev, const char *path,
+			      fmode_t flags)
+{
+	dev->bdev = ibnbd_dev_open_bdev(path, flags);
+	return PTR_ERR_OR_ZERO(dev->bdev);
+}
+
+static int ibnbd_dev_vfs_open(struct ibnbd_dev *dev, const char *path,
+			      fmode_t flags)
+{
+	int oflags = O_DSYNC; /* enable write-through */
+
+	if (flags & FMODE_WRITE)
+		oflags |= O_RDWR;
+	else if (flags & FMODE_READ)
+		oflags |= O_RDONLY;
+	else
+		return -EINVAL;
+
+	dev->file = filp_open(path, oflags, 0);
+	return PTR_ERR_OR_ZERO(dev->file);
+}
+
+struct ibnbd_dev *ibnbd_dev_open(const char *path, fmode_t flags,
+				 enum ibnbd_io_mode mode, struct bio_set *bs,
+				 ibnbd_dev_io_fn io_cb)
+{
+	struct ibnbd_dev *dev;
+	int ret;
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		return ERR_PTR(-ENOMEM);
+
+	if (mode == IBNBD_BLOCKIO) {
+		dev->blk_open_flags = flags;
+		ret = ibnbd_dev_blk_open(dev, path, dev->blk_open_flags);
+		if (ret)
+			goto err;
+	} else if (mode == IBNBD_FILEIO) {
+		dev->blk_open_flags = FMODE_READ;
+		ret = ibnbd_dev_blk_open(dev, path, dev->blk_open_flags);
+		if (ret)
+			goto err;
+
+		ret = ibnbd_dev_vfs_open(dev, path, flags);
+		if (ret)
+			goto blk_put;
+	} else {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	dev->blk_open_flags	= flags;
+	dev->mode		= mode;
+	dev->io_cb		= io_cb;
+	bdevname(dev->bdev, dev->name);
+	dev->ibd_bio_set	= bs;
+
+	return dev;
+
+blk_put:
+	blkdev_put(dev->bdev, dev->blk_open_flags);
+err:
+	kfree(dev);
+	return ERR_PTR(ret);
+}
+
+void ibnbd_dev_close(struct ibnbd_dev *dev)
+{
+	flush_workqueue(fileio_wq);
+	blkdev_put(dev->bdev, dev->blk_open_flags);
+	if (dev->mode == IBNBD_FILEIO)
+		filp_close(dev->file, dev->file);
+	kfree(dev);
+}
+
+static void ibnbd_dev_bi_end_io(struct bio *bio)
+{
+	struct ibnbd_dev_blk_io *io = bio->bi_private;
+
+	io->dev->io_cb(io->priv, blk_status_to_errno(bio->bi_status));
+	bio_put(bio);
+	kfree(io);
+}
+
+static void bio_map_kern_endio(struct bio *bio)
+{
+	bio_put(bio);
+}
+
+/**
+ *	ibnbd_bio_map_kern	-	map kernel address into bio
+ *	@q: the struct request_queue for the bio
+ *	@data: pointer to buffer to map
+ *	@bs: bio_set to use.
+ *	@len: length in bytes
+ *	@gfp_mask: allocation flags for bio allocation
+ *
+ *	Map the kernel address into a bio suitable for io to a block
+ *	device. Returns an error pointer in case of error.
+ */
+static struct bio *ibnbd_bio_map_kern(struct request_queue *q, void *data,
+				      struct bio_set *bs,
+				      unsigned int len, gfp_t gfp_mask)
+{
+	unsigned long kaddr = (unsigned long)data;
+	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	unsigned long start = kaddr >> PAGE_SHIFT;
+	const int nr_pages = end - start;
+	int offset, i;
+	struct bio *bio;
+
+	bio = bio_alloc_bioset(gfp_mask, nr_pages, bs);
+	if (!bio)
+		return ERR_PTR(-ENOMEM);
+
+	offset = offset_in_page(kaddr);
+	for (i = 0; i < nr_pages; i++) {
+		unsigned int bytes = PAGE_SIZE - offset;
+
+		if (len <= 0)
+			break;
+
+		if (bytes > len)
+			bytes = len;
+
+		if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
+				    offset) < bytes) {
+			/* we don't support partial mappings */
+			bio_put(bio);
+			return ERR_PTR(-EINVAL);
+		}
+
+		data += bytes;
+		len -= bytes;
+		offset = 0;
+	}
+
+	bio->bi_end_io = bio_map_kern_endio;
+	return bio;
+}
+
+static int ibnbd_dev_blk_submit_io(struct ibnbd_dev *dev, sector_t sector,
+				   void *data, size_t len, u32 bi_size,
+				   enum ibnbd_io_flags flags, short prio,
+				   void *priv)
+{
+	struct request_queue *q = bdev_get_queue(dev->bdev);
+	struct ibnbd_dev_blk_io *io;
+	struct bio *bio;
+
+	/* check if the buffer is suitable for bdev */
+	if (unlikely(WARN_ON(!blk_rq_aligned(q, (unsigned long)data, len))))
+		return -EINVAL;
+
+	/* Generate bio with pages pointing to the rdma buffer */
+	bio = ibnbd_bio_map_kern(q, data, dev->ibd_bio_set, len, GFP_KERNEL);
+	if (unlikely(IS_ERR(bio)))
+		return PTR_ERR(bio);
+
+	io = kmalloc(sizeof(*io), GFP_KERNEL);
+	if (unlikely(!io)) {
+		bio_put(bio);
+		return -ENOMEM;
+	}
+
+	io->dev		= dev;
+	io->priv	= priv;
+
+	bio->bi_end_io		= ibnbd_dev_bi_end_io;
+	bio->bi_private		= io;
+	bio->bi_opf		= ibnbd_to_bio_flags(flags);
+	bio->bi_iter.bi_sector	= sector;
+	bio->bi_iter.bi_size	= bi_size;
+	bio_set_prio(bio, prio);
+	bio_set_dev(bio, dev->bdev);
+
+	submit_bio(bio);
+
+	return 0;
+}
+
+static int ibnbd_dev_file_handle_flush(struct ibnbd_dev_file_io_work *w,
+				       loff_t start)
+{
+	int ret;
+	loff_t end;
+	int len = w->bi_size;
+
+	if (len)
+		end = start + len - 1;
+	else
+		end = LLONG_MAX;
+
+	ret = vfs_fsync_range(w->dev->file, start, end, 1);
+	if (unlikely(ret))
+		pr_info_ratelimited("I/O FLUSH failed on %s, vfs_sync err: %d\n",
+				    w->dev->name, ret);
+	return ret;
+}
+
+static int ibnbd_dev_file_handle_fua(struct ibnbd_dev_file_io_work *w,
+				     loff_t start)
+{
+	int ret;
+	loff_t end;
+	int len = w->bi_size;
+
+	if (len)
+		end = start + len - 1;
+	else
+		end = LLONG_MAX;
+
+	ret = vfs_fsync_range(w->dev->file, start, end, 1);
+	if (unlikely(ret))
+		pr_info_ratelimited("I/O FUA failed on %s, vfs_sync err: %d\n",
+				    w->dev->name, ret);
+	return ret;
+}
+
+static int ibnbd_dev_file_handle_write_same(struct ibnbd_dev_file_io_work *w)
+{
+	int i;
+
+	if (unlikely(WARN_ON(w->bi_size % w->len)))
+		return -EINVAL;
+
+	for (i = 1; i < w->bi_size / w->len; i++)
+		memcpy(w->data + i * w->len, w->data, w->len);
+
+	return 0;
+}
+
+static void ibnbd_dev_file_submit_io_worker(struct work_struct *w)
+{
+	struct ibnbd_dev_file_io_work *dev_work;
+	struct file *f;
+	int ret, len;
+	loff_t off;
+
+	dev_work = container_of(w, struct ibnbd_dev_file_io_work, work);
+	off = dev_work->sector * ibnbd_dev_get_logical_bsize(dev_work->dev);
+	f = dev_work->dev->file;
+	len = dev_work->bi_size;
+
+	if (ibnbd_op(dev_work->flags) == IBNBD_OP_FLUSH) {
+		ret = ibnbd_dev_file_handle_flush(dev_work, off);
+		if (unlikely(ret))
+			goto out;
+	}
+
+	if (ibnbd_op(dev_work->flags) == IBNBD_OP_WRITE_SAME) {
+		ret = ibnbd_dev_file_handle_write_same(dev_work);
+		if (unlikely(ret))
+			goto out;
+	}
+
+	/* TODO Implement support for DIRECT */
+	if (dev_work->bi_size) {
+		loff_t off_tmp = off;
+
+		if (ibnbd_op(dev_work->flags) == IBNBD_OP_WRITE)
+			ret = kernel_write(f, dev_work->data, dev_work->bi_size,
+					   &off_tmp);
+		else
+			ret = kernel_read(f, dev_work->data, dev_work->bi_size,
+					  &off_tmp);
+
+		if (unlikely(ret < 0)) {
+			goto out;
+		} else if (unlikely(ret != dev_work->bi_size)) {
+			/* TODO implement support for partial completions */
+			ret = -EIO;
+			goto out;
+		} else {
+			ret = 0;
+		}
+	}
+
+	if (dev_work->flags & IBNBD_F_FUA)
+		ret = ibnbd_dev_file_handle_fua(dev_work, off);
+out:
+	dev_work->dev->io_cb(dev_work->priv, ret);
+	kfree(dev_work);
+}
+
+static int ibnbd_dev_file_submit_io(struct ibnbd_dev *dev, sector_t sector,
+				    void *data, size_t len, size_t bi_size,
+				    enum ibnbd_io_flags flags, void *priv)
+{
+	struct ibnbd_dev_file_io_work *w;
+
+	if (!ibnbd_flags_supported(flags)) {
+		pr_info_ratelimited("Unsupported I/O flags: 0x%x on device "
+				    "%s\n", flags, dev->name);
+		return -ENOTSUPP;
+	}
+
+	w = kmalloc(sizeof(*w), GFP_KERNEL);
+	if (!w)
+		return -ENOMEM;
+
+	w->dev		= dev;
+	w->priv		= priv;
+	w->sector	= sector;
+	w->data		= data;
+	w->len		= len;
+	w->bi_size	= bi_size;
+	w->flags	= flags;
+	INIT_WORK(&w->work, ibnbd_dev_file_submit_io_worker);
+
+	if (unlikely(!queue_work(fileio_wq, &w->work))) {
+		kfree(w);
+		return -EEXIST;
+	}
+
+	return 0;
+}
+
+int ibnbd_dev_submit_io(struct ibnbd_dev *dev, sector_t sector, void *data,
+			size_t len, u32 bi_size, enum ibnbd_io_flags flags,
+			short prio, void *priv)
+{
+	if (dev->mode == IBNBD_FILEIO)
+		return ibnbd_dev_file_submit_io(dev, sector, data, len, bi_size,
+						flags, priv);
+	else if (dev->mode == IBNBD_BLOCKIO)
+		return ibnbd_dev_blk_submit_io(dev, sector, data, len, bi_size,
+					       flags, prio, priv);
+
+	pr_warn("Submitting I/O to %s failed, dev->mode contains invalid "
+		"value: '%d', memory corrupted?", dev->name, dev->mode);
+
+	return -EINVAL;
+}
diff --git a/drivers/block/ibnbd/ibnbd-srv-dev.h b/drivers/block/ibnbd/ibnbd-srv-dev.h
new file mode 100644
index 000000000000..131746e38a9d
--- /dev/null
+++ b/drivers/block/ibnbd/ibnbd-srv-dev.h
@@ -0,0 +1,143 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler <mail@fholler.de>
+ *          Jack Wang <jinpu.wang@profitbricks.com>
+ *          Kleber Souza <kleber.souza@profitbricks.com>
+ *          Danil Kipnis <danil.kipnis@profitbricks.com>
+ *          Roman Penyaev <roman.penyaev@profitbricks.com>
+ *          Milind Dumbare <Milind.dumbare@gmail.com>
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis <danil.kipnis@profitbricks.com>
+ *          Roman Penyaev <roman.penyaev@profitbricks.com>
+ *
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Authors: Roman Penyaev <roman.penyaev@profitbricks.com>
+ *          Jack Wang <jinpu.wang@cloud.ionos.com>
+ *          Danil Kipnis <danil.kipnis@cloud.ionos.com>
+ */
+
+#ifndef IBNBD_SRV_DEV_H
+#define IBNBD_SRV_DEV_H
+
+#include <linux/fs.h>
+#include "ibnbd-proto.h"
+
+typedef void ibnbd_dev_io_fn(void *priv, int error);
+
+struct ibnbd_dev {
+	struct block_device	*bdev;
+	struct bio_set		*ibd_bio_set;
+	struct file		*file;
+	fmode_t			blk_open_flags;
+	enum ibnbd_io_mode	mode;
+	char			name[BDEVNAME_SIZE];
+	ibnbd_dev_io_fn		*io_cb;
+};
+
+/** ibnbd_dev_init() - Initialize ibnbd_dev
+ *
+ * This functions initialized the ibnbd-dev component.
+ * It has to be called 1x time before ibnbd_dev_open() is used
+ */
+int ibnbd_dev_init(void);
+
+/** ibnbd_dev_destroy() - Destroy ibnbd_dev
+ *
+ * This functions destroys the ibnbd-dev component.
+ * It has to be called after the last device was closed.
+ */
+void ibnbd_dev_destroy(void);
+
+/**
+ * ibnbd_dev_open() - Open a device
+ * @flags:	open flags
+ * @mode:	open via VFS or block layer
+ * @bs:		bio_set to use during block io,
+ * @io_cb:	is called when I/O finished
+ */
+struct ibnbd_dev *ibnbd_dev_open(const char *path, fmode_t flags,
+				 enum ibnbd_io_mode mode, struct bio_set *bs,
+				 ibnbd_dev_io_fn io_cb);
+
+/**
+ * ibnbd_dev_close() - Close a device
+ */
+void ibnbd_dev_close(struct ibnbd_dev *dev);
+
+static inline int ibnbd_dev_get_logical_bsize(const struct ibnbd_dev *dev)
+{
+	return bdev_logical_block_size(dev->bdev);
+}
+
+static inline int ibnbd_dev_get_phys_bsize(const struct ibnbd_dev *dev)
+{
+	return bdev_physical_block_size(dev->bdev);
+}
+
+static inline int ibnbd_dev_get_max_segs(const struct ibnbd_dev *dev)
+{
+	return queue_max_segments(bdev_get_queue(dev->bdev));
+}
+
+static inline int ibnbd_dev_get_max_hw_sects(const struct ibnbd_dev *dev)
+{
+	return queue_max_hw_sectors(bdev_get_queue(dev->bdev));
+}
+
+static inline int
+ibnbd_dev_get_max_write_same_sects(const struct ibnbd_dev *dev)
+{
+	return bdev_write_same(dev->bdev);
+}
+
+static inline int ibnbd_dev_get_secure_discard(const struct ibnbd_dev *dev)
+{
+	if (dev->mode == IBNBD_BLOCKIO)
+		return blk_queue_secure_erase(bdev_get_queue(dev->bdev));
+	return 0;
+}
+
+static inline int ibnbd_dev_get_max_discard_sects(const struct ibnbd_dev *dev)
+{
+	if (!blk_queue_discard(bdev_get_queue(dev->bdev)))
+		return 0;
+
+	if (dev->mode == IBNBD_BLOCKIO)
+		return blk_queue_get_max_sectors(bdev_get_queue(dev->bdev),
+						 REQ_OP_DISCARD);
+	return 0;
+}
+
+static inline int ibnbd_dev_get_discard_granularity(const struct ibnbd_dev *dev)
+{
+	if (dev->mode == IBNBD_BLOCKIO)
+		return bdev_get_queue(dev->bdev)->limits.discard_granularity;
+	return 0;
+}
+
+static inline int ibnbd_dev_get_discard_alignment(const struct ibnbd_dev *dev)
+{
+	if (dev->mode == IBNBD_BLOCKIO)
+		return bdev_get_queue(dev->bdev)->limits.discard_alignment;
+	return 0;
+}
+
+/**
+ * ibnbd_dev_submit_io() - Submit an I/O to the disk
+ * @dev:	device to that the I/O is submitted
+ * @sector:	address to read/write data to
+ * @data:	I/O data to write or buffer to read I/O date into
+ * @len:	length of @data
+ * @bi_size:	Amount of data that will be read/written
+ * @prio:       IO priority
+ * @priv:	private data passed to @io_fn
+ */
+int ibnbd_dev_submit_io(struct ibnbd_dev *dev, sector_t sector, void *data,
+			size_t len, u32 bi_size, enum ibnbd_io_flags flags,
+			short prio, void *priv);
+
+#endif /* IBNBD_SRV_DEV_H */
-- 
2.17.1


^ permalink raw reply related

* [PATCH v4 20/25] ibnbd: server: main functionality
From: Jack Wang @ 2019-06-20 15:03 UTC (permalink / raw)
  To: linux-block, linux-rdma
  Cc: axboe, hch, sagi, bvanassche, jgg, dledford, danil.kipnis,
	rpenyaev, Roman Pen, Jack Wang
In-Reply-To: <20190620150337.7847-1-jinpuwang@gmail.com>

From: Roman Pen <roman.penyaev@profitbricks.com>

This is main functionality of ibnbd-server module, which handles IBTRS
events and IBNBD protocol requests, like map (open) or unmap (close)
device.  Also server side is responsible for processing incoming IBTRS
IO requests and forward them to local mapped devices.

Signed-off-by: Danil Kipnis <danil.kipnis@cloud.ionos.com>
Signed-off-by: Jack Wang <jinpu.wang@cloud.ionos.com>
---
 drivers/block/ibnbd/ibnbd-srv.c | 945 ++++++++++++++++++++++++++++++++
 1 file changed, 945 insertions(+)
 create mode 100644 drivers/block/ibnbd/ibnbd-srv.c

diff --git a/drivers/block/ibnbd/ibnbd-srv.c b/drivers/block/ibnbd/ibnbd-srv.c
new file mode 100644
index 000000000000..07b4ae49fda2
--- /dev/null
+++ b/drivers/block/ibnbd/ibnbd-srv.c
@@ -0,0 +1,945 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler <mail@fholler.de>
+ *          Jack Wang <jinpu.wang@profitbricks.com>
+ *          Kleber Souza <kleber.souza@profitbricks.com>
+ *          Danil Kipnis <danil.kipnis@profitbricks.com>
+ *          Roman Penyaev <roman.penyaev@profitbricks.com>
+ *          Milind Dumbare <Milind.dumbare@gmail.com>
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis <danil.kipnis@profitbricks.com>
+ *          Roman Penyaev <roman.penyaev@profitbricks.com>
+ *
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Authors: Roman Penyaev <roman.penyaev@profitbricks.com>
+ *          Jack Wang <jinpu.wang@cloud.ionos.com>
+ *          Danil Kipnis <danil.kipnis@cloud.ionos.com>
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include <linux/module.h>
+#include <linux/blkdev.h>
+
+#include "ibnbd-srv.h"
+#include "ibnbd-srv-dev.h"
+
+MODULE_AUTHOR("ibnbd@profitbricks.com");
+MODULE_VERSION(IBNBD_VER_STRING);
+MODULE_DESCRIPTION("InfiniBand Network Block Device Server");
+MODULE_LICENSE("GPL");
+
+#define DEFAULT_DEV_SEARCH_PATH "/"
+
+static char dev_search_path[PATH_MAX] = DEFAULT_DEV_SEARCH_PATH;
+
+static int dev_search_path_set(const char *val, const struct kernel_param *kp)
+{
+	char *dup;
+
+	if (strlen(val) >= sizeof(dev_search_path))
+		return -EINVAL;
+
+	dup = kstrdup(val, GFP_KERNEL);
+
+	if (dup[strlen(dup) - 1] == '\n')
+		dup[strlen(dup) - 1] = '\0';
+
+	strlcpy(dev_search_path, dup, sizeof(dev_search_path));
+
+	kfree(dup);
+	pr_info("dev_search_path changed to '%s'\n", dev_search_path);
+
+	return 0;
+}
+
+static struct kparam_string dev_search_path_kparam_str = {
+	.maxlen	= sizeof(dev_search_path),
+	.string	= dev_search_path
+};
+
+static const struct kernel_param_ops dev_search_path_ops = {
+	.set	= dev_search_path_set,
+	.get	= param_get_string,
+};
+
+module_param_cb(dev_search_path, &dev_search_path_ops,
+		&dev_search_path_kparam_str, 0444);
+MODULE_PARM_DESC(dev_search_path, "Sets the dev_search_path."
+		 " When a device is mapped this path is prepended to the"
+		 " device path from the map device operation.  If %SESSNAME%"
+		 " is specified in a path, then device will be searched in a"
+		 " session namespace."
+		 " (default: " DEFAULT_DEV_SEARCH_PATH ")");
+
+static int def_io_mode = IBNBD_BLOCKIO;
+
+static int def_io_mode_set(const char *val, const struct kernel_param *kp)
+{
+	int io_mode, rc;
+
+	rc = kstrtoint(val, 0, &io_mode);
+	if (unlikely(rc))
+		return rc;
+
+	switch (io_mode) {
+	case IBNBD_FILEIO:
+	case IBNBD_BLOCKIO:
+		def_io_mode = io_mode;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+static const struct kernel_param_ops def_io_mode_ops = {
+	.set	= def_io_mode_set,
+	.get	= param_get_int,
+};
+module_param_cb(def_io_mode, &def_io_mode_ops, &def_io_mode, 0444);
+MODULE_PARM_DESC(def_io_mode, "By default, export devices in"
+		 " blockio(" __stringify(_IBNBD_BLOCKIO) ") or"
+		 " fileio(" __stringify(_IBNBD_FILEIO) ") mode."
+		 " (default: " __stringify(_IBNBD_BLOCKIO) " (blockio))");
+
+static DEFINE_MUTEX(sess_lock);
+static DEFINE_SPINLOCK(dev_lock);
+
+static LIST_HEAD(sess_list);
+static LIST_HEAD(dev_list);
+
+struct ibnbd_io_private {
+	struct ibtrs_srv_op		*id;
+	struct ibnbd_srv_sess_dev	*sess_dev;
+};
+
+static void ibnbd_sess_dev_release(struct kref *kref)
+{
+	struct ibnbd_srv_sess_dev *sess_dev;
+
+	sess_dev = container_of(kref, struct ibnbd_srv_sess_dev, kref);
+	complete(sess_dev->destroy_comp);
+}
+
+static inline void ibnbd_put_sess_dev(struct ibnbd_srv_sess_dev *sess_dev)
+{
+	kref_put(&sess_dev->kref, ibnbd_sess_dev_release);
+}
+
+static void ibnbd_endio(void *priv, int error)
+{
+	struct ibnbd_io_private *ibnbd_priv = priv;
+	struct ibnbd_srv_sess_dev *sess_dev = ibnbd_priv->sess_dev;
+
+	ibnbd_put_sess_dev(sess_dev);
+
+	ibtrs_srv_resp_rdma(ibnbd_priv->id, error);
+
+	kfree(priv);
+}
+
+static struct ibnbd_srv_sess_dev *
+ibnbd_get_sess_dev(int dev_id, struct ibnbd_srv_session *srv_sess)
+{
+	struct ibnbd_srv_sess_dev *sess_dev;
+	int ret = 0;
+
+	read_lock(&srv_sess->index_lock);
+	sess_dev = idr_find(&srv_sess->index_idr, dev_id);
+	if (likely(sess_dev))
+		ret = kref_get_unless_zero(&sess_dev->kref);
+	read_unlock(&srv_sess->index_lock);
+
+	if (unlikely(!sess_dev || !ret))
+		return ERR_PTR(-ENXIO);
+
+	return sess_dev;
+}
+
+static int process_rdma(struct ibtrs_srv *sess,
+			struct ibnbd_srv_session *srv_sess,
+			struct ibtrs_srv_op *id, void *data, u32 datalen,
+			const void *usr, size_t usrlen)
+{
+	const struct ibnbd_msg_io *msg = usr;
+	struct ibnbd_io_private *priv;
+	struct ibnbd_srv_sess_dev *sess_dev;
+	u32 dev_id;
+	int err;
+
+	priv = kmalloc(sizeof(*priv), GFP_KERNEL);
+	if (unlikely(!priv))
+		return -ENOMEM;
+
+	dev_id = le32_to_cpu(msg->device_id);
+
+	sess_dev = ibnbd_get_sess_dev(dev_id, srv_sess);
+	if (unlikely(IS_ERR(sess_dev))) {
+		pr_err_ratelimited("Got I/O request on session %s for "
+				   "unknown device id %d\n",
+				   srv_sess->sessname, dev_id);
+		err = -ENOTCONN;
+		goto err;
+	}
+
+	priv->sess_dev = sess_dev;
+	priv->id = id;
+
+	err = ibnbd_dev_submit_io(sess_dev->ibnbd_dev, le64_to_cpu(msg->sector),
+				  data, datalen, le32_to_cpu(msg->bi_size),
+				  le32_to_cpu(msg->rw),
+				  srv_sess->ver < IBNBD_PROTO_VER_MAJOR ||
+				  usrlen < sizeof(*msg) ?
+				  0 : le16_to_cpu(msg->prio), priv);
+	if (unlikely(err)) {
+		ibnbd_err(sess_dev,
+			  "Submitting I/O to device failed, err: %d\n", err);
+		goto sess_dev_put;
+	}
+
+	return 0;
+
+sess_dev_put:
+	ibnbd_put_sess_dev(sess_dev);
+err:
+	kfree(priv);
+	return err;
+}
+
+static void destroy_device(struct ibnbd_srv_dev *dev)
+{
+	WARN(!list_empty(&dev->sess_dev_list),
+	     "Device %s is being destroyed but still in use!\n",
+	     dev->id);
+
+	spin_lock(&dev_lock);
+	list_del(&dev->list);
+	spin_unlock(&dev_lock);
+
+	if (dev->dev_kobj.state_in_sysfs)
+		/*
+		 * Destroy kobj only if it was really created.
+		 * The following call should be sync, because
+		 *  we free the memory afterwards.
+		 */
+		ibnbd_srv_destroy_dev_sysfs(dev);
+
+	kfree(dev);
+}
+
+static void destroy_device_cb(struct kref *kref)
+{
+	struct ibnbd_srv_dev *dev;
+
+	dev = container_of(kref, struct ibnbd_srv_dev, kref);
+
+	destroy_device(dev);
+}
+
+static void ibnbd_put_srv_dev(struct ibnbd_srv_dev *dev)
+{
+	kref_put(&dev->kref, destroy_device_cb);
+}
+
+static void ibnbd_destroy_sess_dev(struct ibnbd_srv_sess_dev *sess_dev)
+{
+	DECLARE_COMPLETION_ONSTACK(dc);
+
+	write_lock(&sess_dev->sess->index_lock);
+	idr_remove(&sess_dev->sess->index_idr, sess_dev->device_id);
+	write_unlock(&sess_dev->sess->index_lock);
+
+	sess_dev->destroy_comp = &dc;
+	ibnbd_put_sess_dev(sess_dev);
+	wait_for_completion(&dc);
+
+	ibnbd_dev_close(sess_dev->ibnbd_dev);
+	list_del(&sess_dev->sess_list);
+	mutex_lock(&sess_dev->dev->lock);
+	list_del(&sess_dev->dev_list);
+	if (sess_dev->open_flags & FMODE_WRITE)
+		sess_dev->dev->open_write_cnt--;
+	mutex_unlock(&sess_dev->dev->lock);
+
+	ibnbd_put_srv_dev(sess_dev->dev);
+
+	ibnbd_info(sess_dev, "Device closed\n");
+	kfree(sess_dev);
+}
+
+static void destroy_sess(struct ibnbd_srv_session *srv_sess)
+{
+	struct ibnbd_srv_sess_dev *sess_dev, *tmp;
+
+	if (list_empty(&srv_sess->sess_dev_list))
+		goto out;
+
+	mutex_lock(&srv_sess->lock);
+	list_for_each_entry_safe(sess_dev, tmp, &srv_sess->sess_dev_list,
+				 sess_list) {
+		ibnbd_srv_destroy_dev_session_sysfs(sess_dev);
+		ibnbd_destroy_sess_dev(sess_dev);
+	}
+	mutex_unlock(&srv_sess->lock);
+
+out:
+	idr_destroy(&srv_sess->index_idr);
+	bioset_exit(&srv_sess->sess_bio_set);
+
+	pr_info("IBTRS Session %s disconnected\n", srv_sess->sessname);
+
+	mutex_lock(&sess_lock);
+	list_del(&srv_sess->list);
+	mutex_unlock(&sess_lock);
+
+	kfree(srv_sess);
+}
+
+static int create_sess(struct ibtrs_srv *ibtrs)
+{
+	struct ibnbd_srv_session *srv_sess;
+	char sessname[NAME_MAX];
+	int err;
+
+	err = ibtrs_srv_get_sess_name(ibtrs, sessname, sizeof(sessname));
+	if (unlikely(err)) {
+		pr_err("ibtrs_srv_get_sess_name(%s): %d\n", sessname, err);
+
+		return err;
+	}
+	srv_sess = kzalloc(sizeof(*srv_sess), GFP_KERNEL);
+	if (!srv_sess)
+		return -ENOMEM;
+	srv_sess->queue_depth = ibtrs_srv_get_queue_depth(ibtrs);
+
+	err = bioset_init(&srv_sess->sess_bio_set, srv_sess->queue_depth, 0,
+			  BIOSET_NEED_BVECS);
+	if (err) {
+		pr_err("Allocating srv_session for session %s failed\n",
+		       sessname);
+		kfree(srv_sess);
+		return err;
+	}
+
+	idr_init(&srv_sess->index_idr);
+	rwlock_init(&srv_sess->index_lock);
+	INIT_LIST_HEAD(&srv_sess->sess_dev_list);
+	mutex_init(&srv_sess->lock);
+	mutex_lock(&sess_lock);
+	list_add(&srv_sess->list, &sess_list);
+	mutex_unlock(&sess_lock);
+
+	srv_sess->ibtrs = ibtrs;
+	srv_sess->queue_depth = ibtrs_srv_get_queue_depth(ibtrs);
+	strlcpy(srv_sess->sessname, sessname, sizeof(srv_sess->sessname));
+
+	ibtrs_srv_set_sess_priv(ibtrs, srv_sess);
+
+	return 0;
+}
+
+static int ibnbd_srv_link_ev(struct ibtrs_srv *ibtrs,
+			     enum ibtrs_srv_link_ev ev, void *priv)
+{
+	struct ibnbd_srv_session *srv_sess = priv;
+
+	switch (ev) {
+	case IBTRS_SRV_LINK_EV_CONNECTED:
+		return create_sess(ibtrs);
+
+	case IBTRS_SRV_LINK_EV_DISCONNECTED:
+		if (WARN_ON(!srv_sess))
+			return -EINVAL;
+
+		destroy_sess(srv_sess);
+		return 0;
+
+	default:
+		pr_warn("Received unknown IBTRS session event %d from session"
+			" %s\n", ev, srv_sess->sessname);
+		return -EINVAL;
+	}
+}
+
+static int process_msg_close(struct ibtrs_srv *ibtrs,
+			     struct ibnbd_srv_session *srv_sess,
+			     void *data, size_t datalen, const void *usr,
+			     size_t usrlen)
+{
+	const struct ibnbd_msg_close *close_msg = usr;
+	struct ibnbd_srv_sess_dev *sess_dev;
+
+	sess_dev = ibnbd_get_sess_dev(le32_to_cpu(close_msg->device_id),
+				      srv_sess);
+	if (unlikely(IS_ERR(sess_dev)))
+		return 0;
+
+	ibnbd_srv_destroy_dev_session_sysfs(sess_dev);
+	ibnbd_put_sess_dev(sess_dev);
+	mutex_lock(&srv_sess->lock);
+	ibnbd_destroy_sess_dev(sess_dev);
+	mutex_unlock(&srv_sess->lock);
+	return 0;
+}
+
+static int process_msg_open(struct ibtrs_srv *ibtrs,
+			    struct ibnbd_srv_session *srv_sess,
+			    const void *msg, size_t len,
+			    void *data, size_t datalen);
+
+static int process_msg_sess_info(struct ibtrs_srv *ibtrs,
+				 struct ibnbd_srv_session *srv_sess,
+				 const void *msg, size_t len,
+				 void *data, size_t datalen);
+
+static int ibnbd_srv_rdma_ev(struct ibtrs_srv *ibtrs, void *priv,
+			     struct ibtrs_srv_op *id, int dir,
+			     void *data, size_t datalen, const void *usr,
+			     size_t usrlen)
+{
+	struct ibnbd_srv_session *srv_sess = priv;
+	const struct ibnbd_msg_hdr *hdr = usr;
+	int ret = 0;
+	u16 type;
+
+	if (unlikely(WARN_ON(!srv_sess)))
+		return -ENODEV;
+
+	type = le16_to_cpu(hdr->type);
+
+	switch (type) {
+	case IBNBD_MSG_IO:
+		return process_rdma(ibtrs, srv_sess, id, data, datalen, usr,
+				    usrlen);
+	case IBNBD_MSG_CLOSE:
+		ret = process_msg_close(ibtrs, srv_sess, data, datalen,
+					usr, usrlen);
+		break;
+	case IBNBD_MSG_OPEN:
+		ret = process_msg_open(ibtrs, srv_sess, usr, usrlen,
+				       data, datalen);
+		break;
+	case IBNBD_MSG_SESS_INFO:
+		ret = process_msg_sess_info(ibtrs, srv_sess, usr, usrlen,
+					    data, datalen);
+		break;
+	default:
+		pr_warn("Received unexpected message type %d with dir %d from"
+			" session %s\n", type, dir, srv_sess->sessname);
+		return -EINVAL;
+	}
+
+	ibtrs_srv_resp_rdma(id, ret);
+	return 0;
+}
+
+static struct ibnbd_srv_sess_dev
+*ibnbd_sess_dev_alloc(struct ibnbd_srv_session *srv_sess)
+{
+	struct ibnbd_srv_sess_dev *sess_dev;
+	int error;
+
+	sess_dev = kzalloc(sizeof(*sess_dev), GFP_KERNEL);
+	if (!sess_dev)
+		return ERR_PTR(-ENOMEM);
+
+	idr_preload(GFP_KERNEL);
+	write_lock(&srv_sess->index_lock);
+
+	error = idr_alloc(&srv_sess->index_idr, sess_dev, 0, -1, GFP_NOWAIT);
+	if (error < 0) {
+		pr_warn("Allocating idr failed, err: %d\n", error);
+		goto out_unlock;
+	}
+
+	sess_dev->device_id = error;
+	error = 0;
+
+out_unlock:
+	write_unlock(&srv_sess->index_lock);
+	idr_preload_end();
+	if (error) {
+		kfree(sess_dev);
+		return ERR_PTR(error);
+	}
+
+	return sess_dev;
+}
+
+static struct ibnbd_srv_dev *ibnbd_srv_init_srv_dev(const char *id,
+						    enum ibnbd_io_mode mode)
+{
+	struct ibnbd_srv_dev *dev;
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		return ERR_PTR(-ENOMEM);
+
+	strlcpy(dev->id, id, sizeof(dev->id));
+	dev->mode = mode;
+	kref_init(&dev->kref);
+	INIT_LIST_HEAD(&dev->sess_dev_list);
+	mutex_init(&dev->lock);
+
+	return dev;
+}
+
+static struct ibnbd_srv_dev *
+ibnbd_srv_find_or_add_srv_dev(struct ibnbd_srv_dev *new_dev)
+{
+	struct ibnbd_srv_dev *dev;
+
+	spin_lock(&dev_lock);
+	list_for_each_entry(dev, &dev_list, list) {
+		if (!strncmp(dev->id, new_dev->id, sizeof(dev->id))) {
+			if (!kref_get_unless_zero(&dev->kref))
+				/*
+				 * We lost the race, device is almost dead.
+				 *  Continue traversing to find a valid one.
+				 */
+				continue;
+			spin_unlock(&dev_lock);
+			return dev;
+		}
+	}
+	list_add(&new_dev->list, &dev_list);
+	spin_unlock(&dev_lock);
+
+	return new_dev;
+}
+
+static int ibnbd_srv_check_update_open_perm(struct ibnbd_srv_dev *srv_dev,
+					    struct ibnbd_srv_session *srv_sess,
+					    enum ibnbd_io_mode io_mode,
+					    enum ibnbd_access_mode access_mode)
+{
+	int ret = -EPERM;
+
+	mutex_lock(&srv_dev->lock);
+
+	if (srv_dev->mode != io_mode) {
+		pr_err("Mapping device '%s' for session %s in %s mode forbidden,"
+		       " device is already mapped from other client(s) in"
+		       " %s mode\n", srv_dev->id, srv_sess->sessname,
+		       ibnbd_io_mode_str(io_mode),
+		       ibnbd_io_mode_str(srv_dev->mode));
+		goto out;
+	}
+
+	switch (access_mode) {
+	case IBNBD_ACCESS_RO:
+		ret = 0;
+		break;
+	case IBNBD_ACCESS_RW:
+		if (srv_dev->open_write_cnt == 0)  {
+			srv_dev->open_write_cnt++;
+			ret = 0;
+		} else {
+			pr_err("Mapping device '%s' for session %s with"
+			       " RW permissions failed. Device already opened"
+			       " as 'RW' by %d client(s) in %s mode.\n",
+			       srv_dev->id, srv_sess->sessname,
+			       srv_dev->open_write_cnt,
+			       ibnbd_io_mode_str(srv_dev->mode));
+		}
+		break;
+	case IBNBD_ACCESS_MIGRATION:
+		if (srv_dev->open_write_cnt < 2) {
+			srv_dev->open_write_cnt++;
+			ret = 0;
+		} else {
+			pr_err("Mapping device '%s' for session %s with"
+			       " migration permissions failed. Device already"
+			       " opened as 'RW' by %d client(s) in %s mode.\n",
+			       srv_dev->id, srv_sess->sessname,
+			       srv_dev->open_write_cnt,
+			       ibnbd_io_mode_str(srv_dev->mode));
+		}
+		break;
+	default:
+		pr_err("Received mapping request for device '%s' on session %s"
+		       " with invalid access mode: %d\n", srv_dev->id,
+		       srv_sess->sessname, access_mode);
+		ret = -EINVAL;
+	}
+
+out:
+	mutex_unlock(&srv_dev->lock);
+
+	return ret;
+}
+
+static struct ibnbd_srv_dev *
+ibnbd_srv_get_or_create_srv_dev(struct ibnbd_dev *ibnbd_dev,
+				struct ibnbd_srv_session *srv_sess,
+				enum ibnbd_io_mode io_mode,
+				enum ibnbd_access_mode access_mode)
+{
+	int ret;
+	struct ibnbd_srv_dev *new_dev, *dev;
+
+	new_dev = ibnbd_srv_init_srv_dev(ibnbd_dev->name, io_mode);
+	if (IS_ERR(new_dev))
+		return new_dev;
+
+	dev = ibnbd_srv_find_or_add_srv_dev(new_dev);
+	if (dev != new_dev)
+		kfree(new_dev);
+
+	ret = ibnbd_srv_check_update_open_perm(dev, srv_sess, io_mode,
+					       access_mode);
+	if (ret) {
+		ibnbd_put_srv_dev(dev);
+		return ERR_PTR(ret);
+	}
+
+	return dev;
+}
+
+static void ibnbd_srv_fill_msg_open_rsp(struct ibnbd_msg_open_rsp *rsp,
+					struct ibnbd_srv_sess_dev *sess_dev)
+{
+	struct ibnbd_dev *ibnbd_dev = sess_dev->ibnbd_dev;
+
+	rsp->hdr.type = cpu_to_le16(IBNBD_MSG_OPEN_RSP);
+	rsp->device_id =
+		cpu_to_le32(sess_dev->device_id);
+	rsp->nsectors =
+		cpu_to_le64(get_capacity(ibnbd_dev->bdev->bd_disk));
+	rsp->logical_block_size	=
+		cpu_to_le16(ibnbd_dev_get_logical_bsize(ibnbd_dev));
+	rsp->physical_block_size =
+		cpu_to_le16(ibnbd_dev_get_phys_bsize(ibnbd_dev));
+	rsp->max_segments =
+		cpu_to_le16(ibnbd_dev_get_max_segs(ibnbd_dev));
+	rsp->max_hw_sectors =
+		cpu_to_le32(ibnbd_dev_get_max_hw_sects(ibnbd_dev));
+	rsp->max_write_same_sectors =
+		cpu_to_le32(ibnbd_dev_get_max_write_same_sects(ibnbd_dev));
+	rsp->max_discard_sectors =
+		cpu_to_le32(ibnbd_dev_get_max_discard_sects(ibnbd_dev));
+	rsp->discard_granularity =
+		cpu_to_le32(ibnbd_dev_get_discard_granularity(ibnbd_dev));
+	rsp->discard_alignment =
+		cpu_to_le32(ibnbd_dev_get_discard_alignment(ibnbd_dev));
+	rsp->secure_discard =
+		cpu_to_le16(ibnbd_dev_get_secure_discard(ibnbd_dev));
+	rsp->rotational =
+		!blk_queue_nonrot(bdev_get_queue(ibnbd_dev->bdev));
+	rsp->io_mode =
+		ibnbd_dev->mode;
+}
+
+static struct ibnbd_srv_sess_dev *
+ibnbd_srv_create_set_sess_dev(struct ibnbd_srv_session *srv_sess,
+			      const struct ibnbd_msg_open *open_msg,
+			      struct ibnbd_dev *ibnbd_dev, fmode_t open_flags,
+			      struct ibnbd_srv_dev *srv_dev)
+{
+	struct ibnbd_srv_sess_dev *sdev = ibnbd_sess_dev_alloc(srv_sess);
+
+	if (IS_ERR(sdev))
+		return sdev;
+
+	kref_init(&sdev->kref);
+
+	strlcpy(sdev->pathname, open_msg->dev_name, sizeof(sdev->pathname));
+
+	sdev->ibnbd_dev		= ibnbd_dev;
+	sdev->sess		= srv_sess;
+	sdev->dev		= srv_dev;
+	sdev->open_flags	= open_flags;
+	sdev->access_mode	= open_msg->access_mode;
+
+	return sdev;
+}
+
+static char *ibnbd_srv_get_full_path(struct ibnbd_srv_session *srv_sess,
+				     const char *dev_name)
+{
+	char *full_path;
+	char *a, *b;
+
+	full_path = kmalloc(PATH_MAX, GFP_KERNEL);
+	if (!full_path)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * Replace %SESSNAME% with a real session name in order to
+	 * create device namespace.
+	 */
+	a = strnstr(dev_search_path, "%SESSNAME%", sizeof(dev_search_path));
+	if (a) {
+		int len = a - dev_search_path;
+
+		len = snprintf(full_path, PATH_MAX, "%.*s/%s/%s", len,
+			       dev_search_path, srv_sess->sessname, dev_name);
+		if (len >= PATH_MAX) {
+			pr_err("Tooooo looong path: %s, %s, %s\n",
+			       dev_search_path, srv_sess->sessname, dev_name);
+			kfree(full_path);
+			return ERR_PTR(-EINVAL);
+		}
+	} else {
+		snprintf(full_path, PATH_MAX, "%s/%s",
+			 dev_search_path, dev_name);
+	}
+
+	/* eliminitate duplicated slashes */
+	a = strchr(full_path, '/');
+	b = a;
+	while (*b != '\0') {
+		if (*b == '/' && *a == '/') {
+			b++;
+		} else {
+			a++;
+			*a = *b;
+			b++;
+		}
+	}
+	a++;
+	*a = '\0';
+
+	return full_path;
+}
+
+static int process_msg_sess_info(struct ibtrs_srv *ibtrs,
+				 struct ibnbd_srv_session *srv_sess,
+				 const void *msg, size_t len,
+				 void *data, size_t datalen)
+{
+	const struct ibnbd_msg_sess_info *sess_info_msg = msg;
+	struct ibnbd_msg_sess_info_rsp *rsp = data;
+
+	srv_sess->ver = min_t(u8, sess_info_msg->ver, IBNBD_PROTO_VER_MAJOR);
+	pr_debug("Session %s using protocol version %d (client version: %d,"
+		 " server version: %d)\n", srv_sess->sessname,
+		 srv_sess->ver, sess_info_msg->ver, IBNBD_PROTO_VER_MAJOR);
+
+	rsp->hdr.type = cpu_to_le16(IBNBD_MSG_SESS_INFO_RSP);
+	rsp->ver = srv_sess->ver;
+
+	return 0;
+}
+
+/**
+ * find_srv_sess_dev() - a dev is already opened by this name
+ *
+ * Return struct ibnbd_srv_sess_dev if srv_sess already opened the dev_name
+ * NULL if the session didn't open the device yet.
+ */
+static struct ibnbd_srv_sess_dev *
+find_srv_sess_dev(struct ibnbd_srv_session *srv_sess, const char *dev_name)
+{
+	struct ibnbd_srv_sess_dev *sess_dev;
+
+	if (list_empty(&srv_sess->sess_dev_list))
+		return NULL;
+
+	list_for_each_entry(sess_dev, &srv_sess->sess_dev_list, sess_list)
+		if (!strcmp(sess_dev->pathname, dev_name))
+			return sess_dev;
+
+	return NULL;
+}
+
+static int process_msg_open(struct ibtrs_srv *ibtrs,
+			    struct ibnbd_srv_session *srv_sess,
+			    const void *msg, size_t len,
+			    void *data, size_t datalen)
+{
+	int ret;
+	struct ibnbd_srv_dev *srv_dev;
+	struct ibnbd_srv_sess_dev *srv_sess_dev;
+	const struct ibnbd_msg_open *open_msg = msg;
+	fmode_t open_flags;
+	char *full_path;
+	struct ibnbd_dev *ibnbd_dev;
+	enum ibnbd_io_mode io_mode;
+	struct ibnbd_msg_open_rsp *rsp = data;
+
+	pr_debug("Open message received: session='%s' path='%s' access_mode=%d"
+		 " io_mode=%d\n", srv_sess->sessname, open_msg->dev_name,
+		 open_msg->access_mode, open_msg->io_mode);
+	open_flags = FMODE_READ;
+	if (open_msg->access_mode != IBNBD_ACCESS_RO)
+		open_flags |= FMODE_WRITE;
+
+	mutex_lock(&srv_sess->lock);
+
+	srv_sess_dev = find_srv_sess_dev(srv_sess, open_msg->dev_name);
+	if (srv_sess_dev)
+		goto fill_response;
+
+	if ((strlen(dev_search_path) + strlen(open_msg->dev_name))
+	    >= PATH_MAX) {
+		pr_err("Opening device for session %s failed, device path too"
+		       " long. '%s/%s' is longer than PATH_MAX (%d)\n",
+		       srv_sess->sessname, dev_search_path, open_msg->dev_name,
+		       PATH_MAX);
+		ret = -EINVAL;
+		goto reject;
+	}
+	full_path = ibnbd_srv_get_full_path(srv_sess, open_msg->dev_name);
+	if (IS_ERR(full_path)) {
+		ret = PTR_ERR(full_path);
+		pr_err("Opening device '%s' for client %s failed,"
+		       " failed to get device full path, err: %d\n",
+		       open_msg->dev_name, srv_sess->sessname, ret);
+		goto reject;
+	}
+
+	if (open_msg->io_mode == IBNBD_BLOCKIO)
+		io_mode = IBNBD_BLOCKIO;
+	else if (open_msg->io_mode == IBNBD_FILEIO)
+		io_mode = IBNBD_FILEIO;
+	else
+		io_mode = def_io_mode;
+
+	ibnbd_dev = ibnbd_dev_open(full_path, open_flags, io_mode,
+				   &srv_sess->sess_bio_set, ibnbd_endio);
+	if (IS_ERR(ibnbd_dev)) {
+		pr_err("Opening device '%s' on session %s failed,"
+		       " failed to open the block device, err: %ld\n",
+		       full_path, srv_sess->sessname, PTR_ERR(ibnbd_dev));
+		ret = PTR_ERR(ibnbd_dev);
+		goto free_path;
+	}
+
+	srv_dev = ibnbd_srv_get_or_create_srv_dev(ibnbd_dev, srv_sess, io_mode,
+						  open_msg->access_mode);
+	if (IS_ERR(srv_dev)) {
+		pr_err("Opening device '%s' on session %s failed,"
+		       " creating srv_dev failed, err: %ld\n",
+		       full_path, srv_sess->sessname, PTR_ERR(srv_dev));
+		ret = PTR_ERR(srv_dev);
+		goto ibnbd_dev_close;
+	}
+
+	srv_sess_dev = ibnbd_srv_create_set_sess_dev(srv_sess, open_msg,
+						     ibnbd_dev, open_flags,
+						     srv_dev);
+	if (IS_ERR(srv_sess_dev)) {
+		pr_err("Opening device '%s' on session %s failed,"
+		       " creating sess_dev failed, err: %ld\n",
+		       full_path, srv_sess->sessname, PTR_ERR(srv_sess_dev));
+		ret = PTR_ERR(srv_sess_dev);
+		goto srv_dev_put;
+	}
+
+	/* Create the srv_dev sysfs files if they haven't been created yet. The
+	 * reason to delay the creation is not to create the sysfs files before
+	 * we are sure the device can be opened.
+	 */
+	mutex_lock(&srv_dev->lock);
+	if (!srv_dev->dev_kobj.state_in_sysfs) {
+		ret = ibnbd_srv_create_dev_sysfs(srv_dev, ibnbd_dev->bdev,
+						 ibnbd_dev->name);
+		if (ret) {
+			mutex_unlock(&srv_dev->lock);
+			ibnbd_err(srv_sess_dev, "Opening device failed, failed to"
+				  " create device sysfs files, err: %d\n",
+				  ret);
+			goto free_srv_sess_dev;
+		}
+	}
+
+	ret = ibnbd_srv_create_dev_session_sysfs(srv_sess_dev);
+	if (ret) {
+		mutex_unlock(&srv_dev->lock);
+		ibnbd_err(srv_sess_dev, "Opening device failed, failed to create"
+			  " dev client sysfs files, err: %d\n", ret);
+		goto free_srv_sess_dev;
+	}
+
+	list_add(&srv_sess_dev->dev_list, &srv_dev->sess_dev_list);
+	mutex_unlock(&srv_dev->lock);
+
+	list_add(&srv_sess_dev->sess_list, &srv_sess->sess_dev_list);
+
+	ibnbd_info(srv_sess_dev, "Opened device '%s' in %s mode\n",
+		   srv_dev->id, ibnbd_io_mode_str(io_mode));
+
+	kfree(full_path);
+
+fill_response:
+	ibnbd_srv_fill_msg_open_rsp(rsp, srv_sess_dev);
+	mutex_unlock(&srv_sess->lock);
+	return 0;
+
+free_srv_sess_dev:
+	write_lock(&srv_sess->index_lock);
+	idr_remove(&srv_sess->index_idr, srv_sess_dev->device_id);
+	write_unlock(&srv_sess->index_lock);
+	kfree(srv_sess_dev);
+srv_dev_put:
+	if (open_msg->access_mode != IBNBD_ACCESS_RO) {
+		mutex_lock(&srv_dev->lock);
+		srv_dev->open_write_cnt--;
+		mutex_unlock(&srv_dev->lock);
+	}
+	ibnbd_put_srv_dev(srv_dev);
+ibnbd_dev_close:
+	ibnbd_dev_close(ibnbd_dev);
+free_path:
+	kfree(full_path);
+reject:
+	mutex_unlock(&srv_sess->lock);
+	return ret;
+}
+
+static struct ibtrs_srv_ctx *ibtrs_ctx;
+
+static int __init ibnbd_srv_init_module(void)
+{
+	int err;
+
+	pr_info("Loading module %s, version %s, proto %s\n",
+		KBUILD_MODNAME, IBNBD_VER_STRING, IBNBD_PROTO_VER_STRING);
+
+	ibtrs_ctx = ibtrs_srv_open(ibnbd_srv_rdma_ev, ibnbd_srv_link_ev,
+				   IBTRS_PORT);
+	if (unlikely(IS_ERR(ibtrs_ctx))) {
+		err = PTR_ERR(ibtrs_ctx);
+		pr_err("ibtrs_srv_open(), err: %d\n", err);
+		goto out;
+	}
+	err = ibnbd_dev_init();
+	if (err) {
+		pr_err("ibnbd_dev_init(), err: %d\n", err);
+		goto srv_close;
+	}
+
+	err = ibnbd_srv_create_sysfs_files();
+	if (err) {
+		pr_err("ibnbd_srv_create_sysfs_files(), err: %d\n", err);
+		goto dev_destroy;
+	}
+
+	return 0;
+
+dev_destroy:
+	ibnbd_dev_destroy();
+srv_close:
+	ibtrs_srv_close(ibtrs_ctx);
+out:
+
+	return err;
+}
+
+static void __exit ibnbd_srv_cleanup_module(void)
+{
+	ibtrs_srv_close(ibtrs_ctx);
+	WARN_ON(!list_empty(&sess_list));
+	ibnbd_srv_destroy_sysfs_files();
+	ibnbd_dev_destroy();
+	pr_info("Module unloaded\n");
+}
+
+module_init(ibnbd_srv_init_module);
+module_exit(ibnbd_srv_cleanup_module);
-- 
2.17.1


^ permalink raw reply related

* [PATCH v4 17/25] ibnbd: client: main functionality
From: Jack Wang @ 2019-06-20 15:03 UTC (permalink / raw)
  To: linux-block, linux-rdma
  Cc: axboe, hch, sagi, bvanassche, jgg, dledford, danil.kipnis,
	rpenyaev, Roman Pen, Jack Wang
In-Reply-To: <20190620150337.7847-1-jinpuwang@gmail.com>

From: Roman Pen <roman.penyaev@profitbricks.com>

This is main functionality of ibnbd-client module, which provides
interface to map remote device as local block device /dev/ibnbd<N>
and feeds IBTRS with IO requests.

Signed-off-by: Danil Kipnis <danil.kipnis@cloud.ionos.com>
Signed-off-by: Jack Wang <jinpu.wang@cloud.ionos.com>
---
 drivers/block/ibnbd/ibnbd-clt.c | 1832 +++++++++++++++++++++++++++++++
 1 file changed, 1832 insertions(+)
 create mode 100644 drivers/block/ibnbd/ibnbd-clt.c

diff --git a/drivers/block/ibnbd/ibnbd-clt.c b/drivers/block/ibnbd/ibnbd-clt.c
new file mode 100644
index 000000000000..b5675fac0a60
--- /dev/null
+++ b/drivers/block/ibnbd/ibnbd-clt.c
@@ -0,0 +1,1832 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler <mail@fholler.de>
+ *          Jack Wang <jinpu.wang@profitbricks.com>
+ *          Kleber Souza <kleber.souza@profitbricks.com>
+ *          Danil Kipnis <danil.kipnis@profitbricks.com>
+ *          Roman Penyaev <roman.penyaev@profitbricks.com>
+ *          Milind Dumbare <Milind.dumbare@gmail.com>
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis <danil.kipnis@profitbricks.com>
+ *          Roman Penyaev <roman.penyaev@profitbricks.com>
+ *          Swapnil Ingle <swapnil.ingle@profitbricks.com>
+ *
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Authors: Roman Penyaev <roman.penyaev@profitbricks.com>
+ *          Jack Wang <jinpu.wang@cloud.ionos.com>
+ *          Danil Kipnis <danil.kipnis@cloud.ionos.com>
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/hdreg.h>
+#include <linux/scatterlist.h>
+#include <linux/idr.h>
+
+#include "ibnbd-clt.h"
+
+MODULE_AUTHOR("ibnbd@profitbricks.com");
+MODULE_DESCRIPTION("InfiniBand Network Block Device Client");
+MODULE_VERSION(IBNBD_VER_STRING);
+MODULE_LICENSE("GPL");
+
+/*
+ * This is for closing devices when unloading the module:
+ * we might be closing a lot (>256) of devices in parallel
+ * and it is better not to use the system_wq.
+ */
+static struct workqueue_struct *unload_wq;
+static int ibnbd_client_major;
+static DEFINE_IDA(index_ida);
+static DEFINE_MUTEX(ida_lock);
+static DEFINE_MUTEX(sess_lock);
+static LIST_HEAD(sess_list);
+
+static bool softirq_enable;
+module_param(softirq_enable, bool, 0444);
+MODULE_PARM_DESC(softirq_enable, "finish request in softirq_fn."
+		 " (default: 0)");
+/*
+ * Maximum number of partitions an instance can have.
+ * 6 bits = 64 minors = 63 partitions (one minor is used for the device itself)
+ */
+#define IBNBD_PART_BITS		6
+#define KERNEL_SECTOR_SIZE      512
+
+static inline bool ibnbd_clt_get_sess(struct ibnbd_clt_session *sess)
+{
+	return refcount_inc_not_zero(&sess->refcount);
+}
+
+static void free_sess(struct ibnbd_clt_session *sess);
+
+static void ibnbd_clt_put_sess(struct ibnbd_clt_session *sess)
+{
+	might_sleep();
+
+	if (refcount_dec_and_test(&sess->refcount))
+		free_sess(sess);
+}
+
+static inline bool ibnbd_clt_dev_is_mapped(struct ibnbd_clt_dev *dev)
+{
+	return dev->dev_state == DEV_STATE_MAPPED;
+}
+
+static void ibnbd_clt_put_dev(struct ibnbd_clt_dev *dev)
+{
+	might_sleep();
+
+	if (refcount_dec_and_test(&dev->refcount)) {
+		mutex_lock(&ida_lock);
+		ida_simple_remove(&index_ida, dev->clt_device_id);
+		mutex_unlock(&ida_lock);
+		kfree(dev->hw_queues);
+		ibnbd_clt_put_sess(dev->sess);
+		kfree(dev);
+	}
+}
+
+static inline bool ibnbd_clt_get_dev(struct ibnbd_clt_dev *dev)
+{
+	return refcount_inc_not_zero(&dev->refcount);
+}
+
+static int ibnbd_clt_set_dev_attr(struct ibnbd_clt_dev *dev,
+				  const struct ibnbd_msg_open_rsp *rsp)
+{
+	struct ibnbd_clt_session *sess = dev->sess;
+
+	if (unlikely(!rsp->logical_block_size))
+		return -EINVAL;
+
+	dev->device_id		    = le32_to_cpu(rsp->device_id);
+	dev->nsectors		    = le64_to_cpu(rsp->nsectors);
+	dev->logical_block_size	    = le16_to_cpu(rsp->logical_block_size);
+	dev->physical_block_size    = le16_to_cpu(rsp->physical_block_size);
+	dev->max_write_same_sectors = le32_to_cpu(rsp->max_write_same_sectors);
+	dev->max_discard_sectors    = le32_to_cpu(rsp->max_discard_sectors);
+	dev->discard_granularity    = le32_to_cpu(rsp->discard_granularity);
+	dev->discard_alignment	    = le32_to_cpu(rsp->discard_alignment);
+	dev->secure_discard	    = le16_to_cpu(rsp->secure_discard);
+	dev->rotational		    = rsp->rotational;
+	dev->remote_io_mode	    = rsp->io_mode;
+
+	dev->max_hw_sectors = sess->max_io_size / dev->logical_block_size;
+	dev->max_segments = BMAX_SEGMENTS;
+
+	if (dev->remote_io_mode == IBNBD_BLOCKIO) {
+		dev->max_hw_sectors = min_t(u32, dev->max_hw_sectors,
+					    le32_to_cpu(rsp->max_hw_sectors));
+		dev->max_segments = min_t(u16, dev->max_segments,
+					  le16_to_cpu(rsp->max_segments));
+	}
+
+	return 0;
+}
+
+static int ibnbd_clt_revalidate_disk(struct ibnbd_clt_dev *dev,
+				     size_t new_nsectors)
+{
+	int err = 0;
+
+	ibnbd_info(dev, "Device size changed from %zu to %zu sectors\n",
+		   dev->nsectors, new_nsectors);
+	dev->nsectors = new_nsectors;
+	set_capacity(dev->gd,
+		     dev->nsectors * (dev->logical_block_size /
+				      KERNEL_SECTOR_SIZE));
+	err = revalidate_disk(dev->gd);
+	if (err)
+		ibnbd_err(dev, "Failed to change device size from"
+			  " %zu to %zu, err: %d\n", dev->nsectors,
+			  new_nsectors, err);
+	return err;
+}
+
+static int process_msg_open_rsp(struct ibnbd_clt_dev *dev,
+				struct ibnbd_msg_open_rsp *rsp)
+{
+	int err = 0;
+
+	mutex_lock(&dev->lock);
+	if (dev->dev_state == DEV_STATE_UNMAPPED) {
+		ibnbd_info(dev, "Ignoring Open-Response message from server for "
+			   " unmapped device\n");
+		err = -ENOENT;
+		goto out;
+	}
+	if (dev->dev_state == DEV_STATE_MAPPED_DISCONNECTED) {
+		u64 nsectors = le64_to_cpu(rsp->nsectors);
+
+		/*
+		 * If the device was remapped and the size changed in the
+		 * meantime we need to revalidate it
+		 */
+		if (dev->nsectors != nsectors)
+			ibnbd_clt_revalidate_disk(dev, nsectors);
+		ibnbd_info(dev, "Device online, device remapped successfully\n");
+	}
+	err = ibnbd_clt_set_dev_attr(dev, rsp);
+	if (unlikely(err))
+		goto out;
+	dev->dev_state = DEV_STATE_MAPPED;
+
+out:
+	mutex_unlock(&dev->lock);
+
+	return err;
+}
+
+int ibnbd_clt_resize_disk(struct ibnbd_clt_dev *dev, size_t newsize)
+{
+	int ret = 0;
+
+	mutex_lock(&dev->lock);
+	if (dev->dev_state != DEV_STATE_MAPPED) {
+		pr_err("Failed to set new size of the device, "
+		       "device is not opened\n");
+		ret = -ENOENT;
+		goto out;
+	}
+	ret = ibnbd_clt_revalidate_disk(dev, newsize);
+
+out:
+	mutex_unlock(&dev->lock);
+
+	return ret;
+}
+
+static inline void ibnbd_clt_dev_requeue(struct ibnbd_queue *q)
+{
+	if (WARN_ON(!q->hctx))
+		return;
+
+	/* We can come here from interrupt, thus async=true */
+	blk_mq_run_hw_queue(q->hctx, true);
+}
+
+enum {
+	IBNBD_DELAY_10ms   = 10,
+	IBNBD_DELAY_IFBUSY = -1,
+};
+
+/**
+ * ibnbd_get_cpu_qlist() - finds a list with HW queues to be requeued
+ *
+ * Description:
+ *     Each CPU has a list of HW queues, which needs to be requeed.  If a list
+ *     is not empty - it is marked with a bit.  This function finds first
+ *     set bit in a bitmap and returns corresponding CPU list.
+ */
+static struct ibnbd_cpu_qlist *
+ibnbd_get_cpu_qlist(struct ibnbd_clt_session *sess, int cpu)
+{
+	int bit;
+
+	/* First half */
+	bit = find_next_bit(sess->cpu_queues_bm, nr_cpu_ids, cpu);
+	if (bit < nr_cpu_ids) {
+		return per_cpu_ptr(sess->cpu_queues, bit);
+	} else if (cpu != 0) {
+		/* Second half */
+		bit = find_next_bit(sess->cpu_queues_bm, cpu, 0);
+		if (bit < cpu)
+			return per_cpu_ptr(sess->cpu_queues, bit);
+	}
+
+	return NULL;
+}
+
+static inline int nxt_cpu(int cpu)
+{
+	return (cpu + 1) % nr_cpu_ids;
+}
+
+/**
+ * ibnbd_requeue_if_needed() - requeue if CPU queue is marked as non empty
+ *
+ * Description:
+ *     Each CPU has it's own list of HW queues, which should be requeued.
+ *     Function finds such list with HW queues, takes a list lock, picks up
+ *     the first HW queue out of the list and requeues it.
+ *
+ * Return:
+ *     True if the queue was requeued, false otherwise.
+ *
+ * Context:
+ *     Does not matter.
+ */
+static inline bool ibnbd_requeue_if_needed(struct ibnbd_clt_session *sess)
+{
+	struct ibnbd_queue *q = NULL;
+	struct ibnbd_cpu_qlist *cpu_q;
+	unsigned long flags;
+	int *cpup;
+
+	/*
+	 * To keep fairness and not to let other queues starve we always
+	 * try to wake up someone else in round-robin manner.  That of course
+	 * increases latency but queues always have a chance to be executed.
+	 */
+	cpup = get_cpu_ptr(sess->cpu_rr);
+	for (cpu_q = ibnbd_get_cpu_qlist(sess, nxt_cpu(*cpup)); cpu_q;
+	     cpu_q = ibnbd_get_cpu_qlist(sess, nxt_cpu(cpu_q->cpu))) {
+		if (!spin_trylock_irqsave(&cpu_q->requeue_lock, flags))
+			continue;
+		if (likely(test_bit(cpu_q->cpu, sess->cpu_queues_bm))) {
+			q = list_first_entry_or_null(&cpu_q->requeue_list,
+						     typeof(*q), requeue_list);
+			if (WARN_ON(!q))
+				goto clear_bit;
+			list_del_init(&q->requeue_list);
+			clear_bit_unlock(0, &q->in_list);
+
+			if (list_empty(&cpu_q->requeue_list)) {
+				/* Clear bit if nothing is left */
+clear_bit:
+				clear_bit(cpu_q->cpu, sess->cpu_queues_bm);
+			}
+		}
+		spin_unlock_irqrestore(&cpu_q->requeue_lock, flags);
+
+		if (q)
+			break;
+	}
+
+	/**
+	 * Saves the CPU that is going to be requeued on the per-cpu var. Just
+	 * incrementing it doesn't work because ibnbd_get_cpu_qlist() will
+	 * always return the first CPU with something on the queue list when the
+	 * value stored on the var is greater than the last CPU with something
+	 * on the list.
+	 */
+	if (cpu_q)
+		*cpup = cpu_q->cpu;
+	put_cpu_var(sess->cpu_rr);
+
+	if (q)
+		ibnbd_clt_dev_requeue(q);
+
+	return !!q;
+}
+
+/**
+ * ibnbd_requeue_all_if_idle() - requeue all queues left in the list if
+ *     session is idling (there are no requests in-flight).
+ *
+ * Description:
+ *     This function tries to rerun all stopped queues if there are no
+ *     requests in-flight anymore.  This function tries to solve an obvious
+ *     problem, when number of tags < than number of queues (hctx), which
+ *     are stopped and put to sleep.  If last tag, which has been just put,
+ *     does not wake up all left queues (hctxs), IO requests hang forever.
+ *
+ *     That can happen when all number of tags, say N, have been exhausted
+ *     from one CPU, and we have many block devices per session, say M.
+ *     Each block device has it's own queue (hctx) for each CPU, so eventually
+ *     we can put that number of queues (hctxs) to sleep: M x nr_cpu_ids.
+ *     If number of tags N < M x nr_cpu_ids finally we will get an IO hang.
+ *
+ *     To avoid this hang last caller of ibnbd_put_tag() (last caller is the
+ *     one who observes sess->busy == 0) must wake up all remaining queues.
+ *
+ * Context:
+ *     Does not matter.
+ */
+static inline void ibnbd_requeue_all_if_idle(struct ibnbd_clt_session *sess)
+{
+	bool requeued;
+
+	do {
+		requeued = ibnbd_requeue_if_needed(sess);
+	} while (atomic_read(&sess->busy) == 0 && requeued);
+}
+
+static struct ibtrs_tag *ibnbd_get_tag(struct ibnbd_clt_session *sess,
+				       enum ibtrs_clt_con_type con_type,
+				       int wait)
+{
+	struct ibtrs_tag *tag;
+
+	tag = ibtrs_clt_get_tag(sess->ibtrs, con_type,
+				wait ? IBTRS_TAG_WAIT : IBTRS_TAG_NOWAIT);
+	if (likely(tag))
+		/* We have a subtle rare case here, when all tags can be
+		 * consumed before busy counter increased.  This is safe,
+		 * because loser will get NULL as a tag, observe 0 busy
+		 * counter and immediately restart the queue himself.
+		 */
+		atomic_inc(&sess->busy);
+
+	return tag;
+}
+
+static void ibnbd_put_tag(struct ibnbd_clt_session *sess, struct ibtrs_tag *tag)
+{
+	ibtrs_clt_put_tag(sess->ibtrs, tag);
+	atomic_dec(&sess->busy);
+	/* Paired with ibnbd_clt_dev_add_to_requeue().  Decrement first
+	 * and then check queue bits.
+	 */
+	smp_mb__after_atomic();
+	ibnbd_requeue_all_if_idle(sess);
+}
+
+static struct ibnbd_iu *ibnbd_get_iu(struct ibnbd_clt_session *sess,
+				     enum ibtrs_clt_con_type con_type,
+				     int wait)
+{
+	struct ibnbd_iu *iu;
+	struct ibtrs_tag *tag;
+
+	tag = ibnbd_get_tag(sess, con_type,
+			    wait ? IBTRS_TAG_WAIT : IBTRS_TAG_NOWAIT);
+	if (unlikely(!tag))
+		return NULL;
+	iu = ibtrs_tag_to_pdu(tag);
+	iu->tag = tag; /* yes, ibtrs_tag_from_pdu() can be nice here,
+			* but also we have to think about MQ mode
+			*/
+	/*
+	 * 1st reference is dropped after finishing sending a "user" message,
+	 * 2nd reference is dropped after confirmation with the response is
+	 * returned.
+	 * 1st and 2nd can happen in any order, so the ibnbd_iu should be
+	 * released (ibtrs_tag returned to ibbtrs) only leased after both
+	 * are finished.
+	 */
+	atomic_set(&iu->refcount, 2);
+	init_waitqueue_head(&iu->comp.wait);
+	iu->comp.errno = INT_MAX;
+
+	return iu;
+}
+
+static void ibnbd_put_iu(struct ibnbd_clt_session *sess, struct ibnbd_iu *iu)
+{
+	if (atomic_dec_and_test(&iu->refcount))
+		ibnbd_put_tag(sess, iu->tag);
+}
+
+static void ibnbd_softirq_done_fn(struct request *rq)
+{
+	struct ibnbd_clt_dev *dev	= rq->rq_disk->private_data;
+	struct ibnbd_clt_session *sess	= dev->sess;
+	struct ibnbd_iu *iu;
+
+	iu = blk_mq_rq_to_pdu(rq);
+	ibnbd_put_tag(sess, iu->tag);
+	blk_mq_end_request(rq, iu->status);
+}
+
+static void msg_io_conf(void *priv, int errno)
+{
+	struct ibnbd_iu *iu = (struct ibnbd_iu *)priv;
+	struct ibnbd_clt_dev *dev = iu->dev;
+	struct request *rq = iu->rq;
+
+	iu->status = errno ? BLK_STS_IOERR : BLK_STS_OK;
+
+	if (softirq_enable) {
+		blk_mq_complete_request(rq);
+	} else {
+		ibnbd_put_tag(dev->sess, iu->tag);
+		blk_mq_end_request(rq, iu->status);
+	}
+
+	if (errno)
+		ibnbd_info_rl(dev, "%s I/O failed with err: %d\n",
+			      rq_data_dir(rq) == READ ? "read" : "write",
+			      errno);
+}
+
+static void wake_up_iu_comp(struct ibnbd_iu *iu, int errno)
+{
+	iu->comp.errno = errno;
+	wake_up(&iu->comp.wait);
+}
+
+static void msg_conf(void *priv, int errno)
+{
+	struct ibnbd_iu *iu = (struct ibnbd_iu *)priv;
+
+	iu->errno = errno;
+	schedule_work(&iu->work);
+}
+
+enum {
+	NO_WAIT = 0,
+	WAIT    = 1
+};
+
+static int send_usr_msg(struct ibtrs_clt *ibtrs, int dir,
+			struct ibnbd_iu *iu, struct kvec *vec, size_t nr,
+			size_t len, struct scatterlist *sg, unsigned int sg_len,
+			void (*conf)(struct work_struct *work),
+			int *errno, bool wait)
+{
+	int err;
+
+	INIT_WORK(&iu->work, conf);
+	err = ibtrs_clt_request(dir, msg_conf, ibtrs, iu->tag,
+				iu, vec, nr, len, sg, sg_len);
+	if (!err && wait) {
+		wait_event(iu->comp.wait, iu->comp.errno != INT_MAX);
+		*errno = iu->comp.errno;
+	} else {
+		*errno = 0;
+	}
+
+	return err;
+}
+
+static void msg_close_conf(struct work_struct *work)
+{
+	struct ibnbd_iu *iu = container_of(work, struct ibnbd_iu, work);
+	struct ibnbd_clt_dev *dev = iu->dev;
+
+	wake_up_iu_comp(iu, iu->errno);
+	ibnbd_put_iu(dev->sess, iu);
+	ibnbd_clt_put_dev(dev);
+}
+
+static int send_msg_close(struct ibnbd_clt_dev *dev, u32 device_id, bool wait)
+{
+	struct ibnbd_clt_session *sess = dev->sess;
+	struct ibnbd_msg_close msg;
+	struct ibnbd_iu *iu;
+	struct kvec vec = {
+		.iov_base = &msg,
+		.iov_len  = sizeof(msg)
+	};
+	int err, errno;
+
+	iu = ibnbd_get_iu(sess, IBTRS_USR_CON, IBTRS_TAG_WAIT);
+	if (unlikely(!iu))
+		return -ENOMEM;
+
+	iu->buf = NULL;
+	iu->dev = dev;
+
+	sg_mark_end(&iu->sglist[0]);
+
+	msg.hdr.type	= cpu_to_le16(IBNBD_MSG_CLOSE);
+	msg.device_id	= cpu_to_le32(device_id);
+
+	WARN_ON(!ibnbd_clt_get_dev(dev));
+	err = send_usr_msg(sess->ibtrs, WRITE, iu, &vec, 1, 0, NULL, 0,
+			   msg_close_conf, &errno, wait);
+	if (unlikely(err)) {
+		ibnbd_clt_put_dev(dev);
+		ibnbd_put_iu(sess, iu);
+	} else {
+		err = errno;
+	}
+
+	ibnbd_put_iu(sess, iu);
+	return err;
+}
+
+static void msg_open_conf(struct work_struct *work)
+{
+	struct ibnbd_iu *iu = container_of(work, struct ibnbd_iu, work);
+	struct ibnbd_msg_open_rsp *rsp = iu->buf;
+	struct ibnbd_clt_dev *dev = iu->dev;
+	int errno = iu->errno;
+
+	if (errno) {
+		ibnbd_err(dev, "Opening failed, server responded: %d\n", errno);
+	} else {
+		errno = process_msg_open_rsp(dev, rsp);
+		if (unlikely(errno)) {
+			u32 device_id = le32_to_cpu(rsp->device_id);
+			/*
+			 * If server thinks its fine, but we fail to process
+			 * then be nice and send a close to server.
+			 */
+			(void)send_msg_close(dev, device_id, NO_WAIT);
+		}
+	}
+	kfree(rsp);
+	wake_up_iu_comp(iu, errno);
+	ibnbd_put_iu(dev->sess, iu);
+	ibnbd_clt_put_dev(dev);
+}
+
+static void msg_sess_info_conf(struct work_struct *work)
+{
+	struct ibnbd_iu *iu = container_of(work, struct ibnbd_iu, work);
+	struct ibnbd_msg_sess_info_rsp *rsp = iu->buf;
+	struct ibnbd_clt_session *sess = iu->sess;
+
+	if (likely(!iu->errno))
+		sess->ver = min_t(u8, rsp->ver, IBNBD_PROTO_VER_MAJOR);
+
+	kfree(rsp);
+	wake_up_iu_comp(iu, iu->errno);
+	ibnbd_put_iu(sess, iu);
+	ibnbd_clt_put_sess(sess);
+}
+
+static int send_msg_open(struct ibnbd_clt_dev *dev, bool wait)
+{
+	struct ibnbd_clt_session *sess = dev->sess;
+	struct ibnbd_msg_open_rsp *rsp;
+	struct ibnbd_msg_open msg;
+	struct ibnbd_iu *iu;
+	struct kvec vec = {
+		.iov_base = &msg,
+		.iov_len  = sizeof(msg)
+	};
+	int err, errno;
+
+	rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+	if (unlikely(!rsp))
+		return -ENOMEM;
+
+	iu = ibnbd_get_iu(sess, IBTRS_USR_CON, IBTRS_TAG_WAIT);
+	if (unlikely(!iu)) {
+		kfree(rsp);
+		return -ENOMEM;
+	}
+
+	iu->buf = rsp;
+	iu->dev = dev;
+
+	sg_init_one(iu->sglist, rsp, sizeof(*rsp));
+
+	msg.hdr.type	= cpu_to_le16(IBNBD_MSG_OPEN);
+	msg.access_mode	= dev->access_mode;
+	msg.io_mode	= dev->io_mode;
+	strlcpy(msg.dev_name, dev->pathname, sizeof(msg.dev_name));
+
+	WARN_ON(!ibnbd_clt_get_dev(dev));
+	err = send_usr_msg(sess->ibtrs, READ, iu,
+			   &vec, 1, sizeof(*rsp), iu->sglist, 1,
+			   msg_open_conf, &errno, wait);
+	if (unlikely(err)) {
+		ibnbd_clt_put_dev(dev);
+		ibnbd_put_iu(sess, iu);
+		kfree(rsp);
+	} else {
+		err = errno;
+	}
+
+	ibnbd_put_iu(sess, iu);
+	return err;
+}
+
+static int send_msg_sess_info(struct ibnbd_clt_session *sess, bool wait)
+{
+	struct ibnbd_msg_sess_info_rsp *rsp;
+	struct ibnbd_msg_sess_info msg;
+	struct ibnbd_iu *iu;
+	struct kvec vec = {
+		.iov_base = &msg,
+		.iov_len  = sizeof(msg)
+	};
+	int err, errno;
+
+	rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+	if (unlikely(!rsp))
+		return -ENOMEM;
+
+	iu = ibnbd_get_iu(sess, IBTRS_USR_CON, IBTRS_TAG_WAIT);
+	if (unlikely(!iu)) {
+		kfree(rsp);
+		return -ENOMEM;
+	}
+
+	iu->buf = rsp;
+	iu->sess = sess;
+
+	sg_init_one(iu->sglist, rsp, sizeof(*rsp));
+
+	msg.hdr.type = cpu_to_le16(IBNBD_MSG_SESS_INFO);
+	msg.ver      = IBNBD_PROTO_VER_MAJOR;
+
+	if (unlikely(!ibnbd_clt_get_sess(sess))) {
+		/*
+		 * That can happen only in one case, when IBTRS has restablished
+		 * the connection and link_ev() is called, but session is almost
+		 * dead, last reference on session is put and caller is waiting
+		 * for IBTRS to close everything.
+		 */
+		err = -ENODEV;
+		goto put_iu;
+	}
+	err = send_usr_msg(sess->ibtrs, READ, iu,
+			   &vec, 1, sizeof(*rsp), iu->sglist, 1,
+			   msg_sess_info_conf, &errno, wait);
+	if (unlikely(err)) {
+		ibnbd_clt_put_sess(sess);
+put_iu:
+		ibnbd_put_iu(sess, iu);
+		kfree(rsp);
+	} else {
+		err = errno;
+	}
+
+	ibnbd_put_iu(sess, iu);
+	return err;
+}
+
+static void set_dev_states_to_disconnected(struct ibnbd_clt_session *sess)
+{
+	struct ibnbd_clt_dev *dev;
+
+	mutex_lock(&sess->lock);
+	list_for_each_entry(dev, &sess->devs_list, list) {
+		ibnbd_err(dev, "Device disconnected.\n");
+
+		mutex_lock(&dev->lock);
+		if (dev->dev_state == DEV_STATE_MAPPED)
+			dev->dev_state = DEV_STATE_MAPPED_DISCONNECTED;
+		mutex_unlock(&dev->lock);
+	}
+	mutex_unlock(&sess->lock);
+}
+
+static void remap_devs(struct ibnbd_clt_session *sess)
+{
+	struct ibnbd_clt_dev *dev;
+	struct ibtrs_attrs attrs;
+	int err;
+
+	/*
+	 * Careful here: we are called from IBTRS link event directly,
+	 * thus we can't send any IBTRS request and wait for response
+	 * or IBTRS will not be able to complete request with failure
+	 * if something goes wrong (failing of outstanding requests
+	 * happens exactly from the context where we are blocking now).
+	 *
+	 * So to avoid deadlocks each usr message sent from here must
+	 * be asynchronous.
+	 */
+
+	err = send_msg_sess_info(sess, NO_WAIT);
+	if (unlikely(err)) {
+		pr_err("send_msg_sess_info(\"%s\"): %d\n", sess->sessname, err);
+		return;
+	}
+
+	ibtrs_clt_query(sess->ibtrs, &attrs);
+	mutex_lock(&sess->lock);
+	sess->max_io_size = attrs.max_io_size;
+
+	list_for_each_entry(dev, &sess->devs_list, list) {
+		bool skip;
+
+		mutex_lock(&dev->lock);
+		skip = (dev->dev_state == DEV_STATE_INIT);
+		mutex_unlock(&dev->lock);
+		if (skip)
+			/*
+			 * When device is establishing connection for the first
+			 * time - do not remap, it will be closed soon.
+			 */
+			continue;
+
+		ibnbd_info(dev, "session reconnected, remapping device\n");
+		err = send_msg_open(dev, NO_WAIT);
+		if (unlikely(err)) {
+			ibnbd_err(dev, "send_msg_open(): %d\n", err);
+			break;
+		}
+	}
+	mutex_unlock(&sess->lock);
+}
+
+static void ibnbd_clt_link_ev(void *priv, enum ibtrs_clt_link_ev ev)
+{
+	struct ibnbd_clt_session *sess = priv;
+
+	switch (ev) {
+	case IBTRS_CLT_LINK_EV_DISCONNECTED:
+		set_dev_states_to_disconnected(sess);
+		break;
+	case IBTRS_CLT_LINK_EV_RECONNECTED:
+		remap_devs(sess);
+		break;
+	default:
+		pr_err("Unknown session event received (%d), session: %s\n",
+		       ev, sess->sessname);
+	}
+}
+
+static void ibnbd_init_cpu_qlists(struct ibnbd_cpu_qlist __percpu *cpu_queues)
+{
+	unsigned int cpu;
+	struct ibnbd_cpu_qlist *cpu_q;
+
+	for_each_possible_cpu(cpu) {
+		cpu_q = per_cpu_ptr(cpu_queues, cpu);
+
+		cpu_q->cpu = cpu;
+		INIT_LIST_HEAD(&cpu_q->requeue_list);
+		spin_lock_init(&cpu_q->requeue_lock);
+	}
+}
+
+static struct blk_mq_ops ibnbd_mq_ops;
+static int setup_mq_tags(struct ibnbd_clt_session *sess)
+{
+	struct blk_mq_tag_set *tags = &sess->tag_set;
+
+	memset(tags, 0, sizeof(*tags));
+	tags->ops		= &ibnbd_mq_ops;
+	tags->queue_depth	= sess->queue_depth;
+	tags->numa_node		= NUMA_NO_NODE;
+	tags->flags		= BLK_MQ_F_SHOULD_MERGE |
+				  BLK_MQ_F_TAG_SHARED;
+	tags->cmd_size		= sizeof(struct ibnbd_iu);
+	tags->nr_hw_queues	= num_online_cpus();
+
+	return blk_mq_alloc_tag_set(tags);
+}
+
+static void destroy_mq_tags(struct ibnbd_clt_session *sess)
+{
+	if (sess->tag_set.tags)
+		blk_mq_free_tag_set(&sess->tag_set);
+}
+
+static inline void wake_up_ibtrs_waiters(struct ibnbd_clt_session *sess)
+{
+	/* paired with rmb() in wait_for_ibtrs_connection() */
+	smp_wmb();
+	sess->ibtrs_ready = true;
+	wake_up_all(&sess->ibtrs_waitq);
+}
+
+static void close_ibtrs(struct ibnbd_clt_session *sess)
+{
+	might_sleep();
+
+	if (!IS_ERR_OR_NULL(sess->ibtrs)) {
+		ibtrs_clt_close(sess->ibtrs);
+		sess->ibtrs = NULL;
+		wake_up_ibtrs_waiters(sess);
+	}
+}
+
+static void free_sess(struct ibnbd_clt_session *sess)
+{
+	WARN_ON(!list_empty(&sess->devs_list));
+
+	might_sleep();
+
+	close_ibtrs(sess);
+	destroy_mq_tags(sess);
+	if (!list_empty(&sess->list)) {
+		mutex_lock(&sess_lock);
+		list_del(&sess->list);
+		mutex_unlock(&sess_lock);
+	}
+	free_percpu(sess->cpu_queues);
+	free_percpu(sess->cpu_rr);
+	kfree(sess);
+}
+
+static struct ibnbd_clt_session *alloc_sess(const char *sessname)
+{
+	struct ibnbd_clt_session *sess;
+	int err, cpu;
+
+	sess = kzalloc_node(sizeof(*sess), GFP_KERNEL, NUMA_NO_NODE);
+	if (unlikely(!sess)) {
+		pr_err("Failed to create session %s,"
+		       " allocating session struct failed\n", sessname);
+		return ERR_PTR(-ENOMEM);
+	}
+	strlcpy(sess->sessname, sessname, sizeof(sess->sessname));
+	atomic_set(&sess->busy, 0);
+	mutex_init(&sess->lock);
+	INIT_LIST_HEAD(&sess->devs_list);
+	INIT_LIST_HEAD(&sess->list);
+	bitmap_zero(sess->cpu_queues_bm, NR_CPUS);
+	init_waitqueue_head(&sess->ibtrs_waitq);
+	refcount_set(&sess->refcount, 1);
+
+	sess->cpu_queues = alloc_percpu(struct ibnbd_cpu_qlist);
+	if (unlikely(!sess->cpu_queues)) {
+		pr_err("Failed to create session to %s,"
+		       " alloc of percpu var (cpu_queues) failed\n", sessname);
+		err = -ENOMEM;
+		goto err;
+	}
+	ibnbd_init_cpu_qlists(sess->cpu_queues);
+
+	/**
+	 * That is simple percpu variable which stores cpu indeces, which are
+	 * incremented on each access.  We need that for the sake of fairness
+	 * to wake up queues in a round-robin manner.
+	 */
+	sess->cpu_rr = alloc_percpu(int);
+	if (unlikely(!sess->cpu_rr)) {
+		pr_err("Failed to create session %s,"
+		       " alloc of percpu var (cpu_rr) failed\n", sessname);
+		err = -ENOMEM;
+		goto err;
+	}
+	for_each_possible_cpu(cpu)
+		*per_cpu_ptr(sess->cpu_rr, cpu) = cpu;
+
+	return sess;
+
+err:
+	free_sess(sess);
+
+	return ERR_PTR(err);
+}
+
+static int wait_for_ibtrs_connection(struct ibnbd_clt_session *sess)
+{
+	wait_event(sess->ibtrs_waitq, sess->ibtrs_ready);
+	/* paired with wmb() in wake_up_ibtrs_waiters() */
+	smp_rmb();
+	if (unlikely(IS_ERR_OR_NULL(sess->ibtrs)))
+		return -ECONNRESET;
+
+	return 0;
+}
+
+static void wait_for_ibtrs_disconnection(struct ibnbd_clt_session *sess)
+__releases(&sess_lock)
+__acquires(&sess_lock)
+{
+	DEFINE_WAIT_FUNC(wait, autoremove_wake_function);
+
+	prepare_to_wait(&sess->ibtrs_waitq, &wait, TASK_UNINTERRUPTIBLE);
+	if (IS_ERR_OR_NULL(sess->ibtrs)) {
+		finish_wait(&sess->ibtrs_waitq, &wait);
+		return;
+	}
+	mutex_unlock(&sess_lock);
+	/* After unlock session can be freed, so careful */
+	schedule();
+	mutex_lock(&sess_lock);
+}
+
+static struct ibnbd_clt_session *__find_and_get_sess(const char *sessname)
+__releases(&sess_lock)
+__acquires(&sess_lock)
+{
+	struct ibnbd_clt_session *sess;
+	int err;
+
+again:
+	list_for_each_entry(sess, &sess_list, list) {
+		if (strcmp(sessname, sess->sessname))
+			continue;
+
+		if (unlikely(sess->ibtrs_ready && IS_ERR_OR_NULL(sess->ibtrs)))
+			/*
+			 * No IBTRS connection, session is dying.
+			 */
+			continue;
+
+		if (likely(ibnbd_clt_get_sess(sess))) {
+			/*
+			 * Alive session is found, wait for IBTRS connection.
+			 */
+			mutex_unlock(&sess_lock);
+			err = wait_for_ibtrs_connection(sess);
+			if (unlikely(err))
+				ibnbd_clt_put_sess(sess);
+			mutex_lock(&sess_lock);
+
+			if (unlikely(err))
+				/* Session is dying, repeat the loop */
+				goto again;
+
+			return sess;
+		}
+		/*
+		 * Ref is 0, session is dying, wait for IBTRS disconnect
+		 * in order to avoid session names clashes.
+		 */
+		wait_for_ibtrs_disconnection(sess);
+		/*
+		 * IBTRS is disconnected and soon session will be freed,
+		 * so repeat a loop.
+		 */
+		goto again;
+	}
+
+	return NULL;
+}
+
+static struct ibnbd_clt_session *find_and_get_sess(const char *sessname)
+{
+	struct ibnbd_clt_session *sess;
+
+	mutex_lock(&sess_lock);
+	sess = __find_and_get_sess(sessname);
+	mutex_unlock(&sess_lock);
+
+	return sess;
+}
+
+static struct ibnbd_clt_session *
+find_and_get_or_insert_sess(struct ibnbd_clt_session *sess)
+{
+	struct ibnbd_clt_session *found;
+
+	mutex_lock(&sess_lock);
+	found = __find_and_get_sess(sess->sessname);
+	if (!found)
+		list_add(&sess->list, &sess_list);
+	mutex_unlock(&sess_lock);
+
+	return found;
+}
+
+static struct ibnbd_clt_session *
+find_and_get_or_create_sess(const char *sessname,
+			    const struct ibtrs_addr *paths,
+			    size_t path_cnt)
+{
+	struct ibnbd_clt_session *sess, *found;
+	struct ibtrs_attrs attrs;
+	int err;
+
+	sess = find_and_get_sess(sessname);
+	if (sess)
+		return sess;
+
+	sess = alloc_sess(sessname);
+	if (unlikely(IS_ERR(sess)))
+		return sess;
+
+	found = find_and_get_or_insert_sess(sess);
+	if (unlikely(found)) {
+		free_sess(sess);
+
+		return found;
+	}
+	/*
+	 * Nothing was found, establish ibtrs connection and proceed further.
+	 */
+	sess->ibtrs = ibtrs_clt_open(sess, ibnbd_clt_link_ev, sessname,
+				     paths, path_cnt, IBTRS_PORT,
+				     sizeof(struct ibnbd_iu),
+				     RECONNECT_DELAY, BMAX_SEGMENTS,
+				     MAX_RECONNECTS);
+	if (unlikely(IS_ERR(sess->ibtrs))) {
+		err = PTR_ERR(sess->ibtrs);
+		goto wake_up_and_put;
+	}
+	ibtrs_clt_query(sess->ibtrs, &attrs);
+	sess->max_io_size = attrs.max_io_size;
+	sess->queue_depth = attrs.queue_depth;
+
+	err = setup_mq_tags(sess);
+	if (unlikely(err))
+		goto close_ibtrs;
+
+	err = send_msg_sess_info(sess, WAIT);
+	if (unlikely(err))
+		goto close_ibtrs;
+
+	wake_up_ibtrs_waiters(sess);
+
+	return sess;
+
+close_ibtrs:
+	close_ibtrs(sess);
+put_sess:
+	ibnbd_clt_put_sess(sess);
+
+	return ERR_PTR(err);
+
+wake_up_and_put:
+	wake_up_ibtrs_waiters(sess);
+	goto put_sess;
+}
+
+static int ibnbd_client_open(struct block_device *block_device, fmode_t mode)
+{
+	struct ibnbd_clt_dev *dev = block_device->bd_disk->private_data;
+
+	if (dev->read_only && (mode & FMODE_WRITE))
+		return -EPERM;
+
+	if (dev->dev_state == DEV_STATE_UNMAPPED ||
+	    !ibnbd_clt_get_dev(dev))
+		return -EIO;
+
+	return 0;
+}
+
+static void ibnbd_client_release(struct gendisk *gen, fmode_t mode)
+{
+	struct ibnbd_clt_dev *dev = gen->private_data;
+
+	ibnbd_clt_put_dev(dev);
+}
+
+static int ibnbd_client_getgeo(struct block_device *block_device,
+			       struct hd_geometry *geo)
+{
+	u64 size;
+	struct ibnbd_clt_dev *dev;
+
+	dev = block_device->bd_disk->private_data;
+	size = dev->size * (dev->logical_block_size / KERNEL_SECTOR_SIZE);
+	geo->cylinders	= (size & ~0x3f) >> 6;	/* size/64 */
+	geo->heads	= 4;
+	geo->sectors	= 16;
+	geo->start	= 0;
+
+	return 0;
+}
+
+static const struct block_device_operations ibnbd_client_ops = {
+	.owner		= THIS_MODULE,
+	.open		= ibnbd_client_open,
+	.release	= ibnbd_client_release,
+	.getgeo		= ibnbd_client_getgeo
+};
+
+static size_t ibnbd_clt_get_sg_size(struct scatterlist *sglist, u32 len)
+{
+	struct scatterlist *sg;
+	size_t tsize = 0;
+	int i;
+
+	for_each_sg(sglist, sg, len, i)
+		tsize += sg->length;
+	return tsize;
+}
+
+/*
+ * Get iorio of current task
+ */
+static short ibnbd_current_ioprio(void)
+{
+	struct task_struct *tsp = current;
+	unsigned short prio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
+
+	if (likely(tsp->io_context))
+		prio = tsp->io_context->ioprio;
+	return prio;
+}
+
+static short ibnbd_ioprio_best(unsigned short prio1, unsigned short prio2)
+{
+	if (!ioprio_valid(prio1)) {
+		if (!ioprio_valid(prio2))
+			return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
+		else
+			return prio2;
+	}
+	if (!ioprio_valid(prio2))
+		return prio1;
+
+	return min(prio1, prio2);
+}
+
+static int ibnbd_client_xfer_request(struct ibnbd_clt_dev *dev,
+				     struct request *rq,
+				     struct ibnbd_iu *iu)
+{
+	struct ibtrs_clt *ibtrs = dev->sess->ibtrs;
+	struct ibtrs_tag *tag = iu->tag;
+	struct ibnbd_msg_io msg;
+	unsigned int sg_cnt = 0;
+	struct kvec vec;
+	size_t size;
+	int err;
+
+	iu->rq		= rq;
+	iu->dev		= dev;
+	msg.sector	= cpu_to_le64(blk_rq_pos(rq));
+	msg.bi_size	= cpu_to_le32(blk_rq_bytes(rq));
+	msg.rw		= cpu_to_le32(rq_to_ibnbd_flags(rq));
+	msg.prio	= cpu_to_le16(ibnbd_ioprio_best(
+						req_get_ioprio(rq),
+						ibnbd_current_ioprio()));
+
+	/*
+	 * We only support discards with single segment for now.
+	 * See queue limits.
+	 */
+	if (req_op(rq) != REQ_OP_DISCARD)
+		sg_cnt = blk_rq_map_sg(dev->queue, rq, iu->sglist);
+
+	if (sg_cnt == 0)
+		/* Do not forget to mark the end */
+		sg_mark_end(&iu->sglist[0]);
+
+	msg.hdr.type	= cpu_to_le16(IBNBD_MSG_IO);
+	msg.device_id	= cpu_to_le32(dev->device_id);
+
+	vec = (struct kvec) {
+		.iov_base = &msg,
+		.iov_len  = dev->sess->ver < IBNBD_PROTO_VER_MAJOR ?
+				sizeof(struct ibnbd_msg_io_old) : sizeof(msg)
+	};
+	size = ibnbd_clt_get_sg_size(iu->sglist, sg_cnt);
+	err = ibtrs_clt_request(rq_data_dir(rq), msg_io_conf, ibtrs, tag,
+				iu, &vec, 1, size, iu->sglist, sg_cnt);
+	if (unlikely(err)) {
+		ibnbd_err_rl(dev, "IBTRS failed to transfer IO, err: %d\n",
+			     err);
+		return err;
+	}
+
+	return 0;
+}
+
+/**
+ * ibnbd_clt_dev_add_to_requeue() - add device to requeue if session is busy
+ *
+ * Description:
+ *     If session is busy, that means someone will requeue us when resources
+ *     are freed.  If session is not doing anything - device is not added to
+ *     the list and @false is returned.
+ */
+static inline bool ibnbd_clt_dev_add_to_requeue(struct ibnbd_clt_dev *dev,
+						struct ibnbd_queue *q)
+{
+	struct ibnbd_clt_session *sess = dev->sess;
+	struct ibnbd_cpu_qlist *cpu_q;
+	unsigned long flags;
+	bool added = true;
+	bool need_set;
+
+	cpu_q = get_cpu_ptr(sess->cpu_queues);
+	spin_lock_irqsave(&cpu_q->requeue_lock, flags);
+
+	if (likely(!test_and_set_bit_lock(0, &q->in_list))) {
+		if (WARN_ON(!list_empty(&q->requeue_list)))
+			goto unlock;
+
+		need_set = !test_bit(cpu_q->cpu, sess->cpu_queues_bm);
+		if (need_set) {
+			set_bit(cpu_q->cpu, sess->cpu_queues_bm);
+			/* Paired with ibnbd_put_tag().	 Set a bit first
+			 * and then observe the busy counter.
+			 */
+			smp_mb__before_atomic();
+		}
+		if (likely(atomic_read(&sess->busy))) {
+			list_add_tail(&q->requeue_list, &cpu_q->requeue_list);
+		} else {
+			/* Very unlikely, but possible: busy counter was
+			 * observed as zero.  Drop all bits and return
+			 * false to restart the queue by ourselves.
+			 */
+			if (need_set)
+				clear_bit(cpu_q->cpu, sess->cpu_queues_bm);
+			clear_bit_unlock(0, &q->in_list);
+			added = false;
+		}
+	}
+unlock:
+	spin_unlock_irqrestore(&cpu_q->requeue_lock, flags);
+	put_cpu_ptr(sess->cpu_queues);
+
+	return added;
+}
+
+static void ibnbd_clt_dev_kick_mq_queue(struct ibnbd_clt_dev *dev,
+					struct blk_mq_hw_ctx *hctx,
+					int delay)
+{
+	struct ibnbd_queue *q = hctx->driver_data;
+
+	if (delay != IBNBD_DELAY_IFBUSY)
+		blk_mq_delay_run_hw_queue(hctx, delay);
+	else if (unlikely(!ibnbd_clt_dev_add_to_requeue(dev, q)))
+		/*
+		 * If session is not busy we have to restart
+		 * the queue ourselves.
+		 */
+		blk_mq_delay_run_hw_queue(hctx, IBNBD_DELAY_10ms);
+}
+
+static blk_status_t ibnbd_queue_rq(struct blk_mq_hw_ctx *hctx,
+				   const struct blk_mq_queue_data *bd)
+{
+	struct request *rq = bd->rq;
+	struct ibnbd_clt_dev *dev = rq->rq_disk->private_data;
+	struct ibnbd_iu *iu = blk_mq_rq_to_pdu(rq);
+	int err;
+
+	if (unlikely(!ibnbd_clt_dev_is_mapped(dev)))
+		return BLK_STS_IOERR;
+
+	iu->tag = ibnbd_get_tag(dev->sess, IBTRS_IO_CON, IBTRS_TAG_NOWAIT);
+	if (unlikely(!iu->tag)) {
+		ibnbd_clt_dev_kick_mq_queue(dev, hctx, IBNBD_DELAY_IFBUSY);
+		return BLK_STS_RESOURCE;
+	}
+
+	blk_mq_start_request(rq);
+	err = ibnbd_client_xfer_request(dev, rq, iu);
+	if (likely(err == 0))
+		return BLK_STS_OK;
+	if (unlikely(err == -EAGAIN || err == -ENOMEM)) {
+		ibnbd_clt_dev_kick_mq_queue(dev, hctx, IBNBD_DELAY_10ms);
+		ibnbd_put_tag(dev->sess, iu->tag);
+		return BLK_STS_RESOURCE;
+	}
+
+	ibnbd_put_tag(dev->sess, iu->tag);
+	return BLK_STS_IOERR;
+}
+
+static int ibnbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
+			      unsigned int hctx_idx, unsigned int numa_node)
+{
+	struct ibnbd_iu *iu = blk_mq_rq_to_pdu(rq);
+
+	sg_init_table(iu->sglist, BMAX_SEGMENTS);
+	return 0;
+}
+
+static inline void ibnbd_init_hw_queue(struct ibnbd_clt_dev *dev,
+				       struct ibnbd_queue *q,
+				       struct blk_mq_hw_ctx *hctx)
+{
+	INIT_LIST_HEAD(&q->requeue_list);
+	q->dev  = dev;
+	q->hctx = hctx;
+}
+
+static void ibnbd_init_mq_hw_queues(struct ibnbd_clt_dev *dev)
+{
+	int i;
+	struct blk_mq_hw_ctx *hctx;
+	struct ibnbd_queue *q;
+
+	queue_for_each_hw_ctx(dev->queue, hctx, i) {
+		q = &dev->hw_queues[i];
+		ibnbd_init_hw_queue(dev, q, hctx);
+		hctx->driver_data = q;
+	}
+}
+
+static struct blk_mq_ops ibnbd_mq_ops = {
+	.queue_rq	= ibnbd_queue_rq,
+	.init_request	= ibnbd_init_request,
+	.complete	= ibnbd_softirq_done_fn,
+};
+
+static int index_to_minor(int index)
+{
+	return index << IBNBD_PART_BITS;
+}
+
+static int minor_to_index(int minor)
+{
+	return minor >> IBNBD_PART_BITS;
+}
+
+static int setup_mq_dev(struct ibnbd_clt_dev *dev)
+{
+	dev->queue = blk_mq_init_queue(&dev->sess->tag_set);
+	if (IS_ERR(dev->queue)) {
+		ibnbd_err(dev,
+			  "Initializing multiqueue queue failed, err: %ld\n",
+			  PTR_ERR(dev->queue));
+		return PTR_ERR(dev->queue);
+	}
+	ibnbd_init_mq_hw_queues(dev);
+	return 0;
+}
+
+static void setup_request_queue(struct ibnbd_clt_dev *dev)
+{
+	blk_queue_logical_block_size(dev->queue, dev->logical_block_size);
+	blk_queue_physical_block_size(dev->queue, dev->physical_block_size);
+	blk_queue_max_hw_sectors(dev->queue, dev->max_hw_sectors);
+	blk_queue_max_write_same_sectors(dev->queue,
+					 dev->max_write_same_sectors);
+
+	/*
+	 * we don't support discards to "discontiguous" segments
+	 * in on request
+	 */
+	blk_queue_max_discard_segments(dev->queue, 1);
+
+	blk_queue_max_discard_sectors(dev->queue, dev->max_discard_sectors);
+	dev->queue->limits.discard_granularity	= dev->discard_granularity;
+	dev->queue->limits.discard_alignment	= dev->discard_alignment;
+	if (dev->max_discard_sectors)
+		blk_queue_flag_set(QUEUE_FLAG_DISCARD, dev->queue);
+	if (dev->secure_discard)
+		blk_queue_flag_set(QUEUE_FLAG_SECERASE, dev->queue);
+
+	blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue);
+	blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue);
+	blk_queue_max_segments(dev->queue, dev->max_segments);
+	blk_queue_io_opt(dev->queue, dev->sess->max_io_size);
+	blk_queue_virt_boundary(dev->queue, 4095);
+	blk_queue_write_cache(dev->queue, true, true);
+	dev->queue->queuedata = dev;
+}
+
+static void ibnbd_clt_setup_gen_disk(struct ibnbd_clt_dev *dev, int idx)
+{
+	dev->gd->major		= ibnbd_client_major;
+	dev->gd->first_minor	= index_to_minor(idx);
+	dev->gd->fops		= &ibnbd_client_ops;
+	dev->gd->queue		= dev->queue;
+	dev->gd->private_data	= dev;
+	snprintf(dev->gd->disk_name, sizeof(dev->gd->disk_name), "ibnbd%d",
+		 idx);
+	pr_debug("disk_name=%s, capacity=%zu\n",
+		 dev->gd->disk_name,
+		 dev->nsectors * (dev->logical_block_size / KERNEL_SECTOR_SIZE)
+		 );
+
+	set_capacity(dev->gd, dev->nsectors * (dev->logical_block_size /
+					       KERNEL_SECTOR_SIZE));
+
+	if (dev->access_mode == IBNBD_ACCESS_RO) {
+		dev->read_only = true;
+		set_disk_ro(dev->gd, true);
+	} else {
+		dev->read_only = false;
+	}
+
+	if (!dev->rotational)
+		blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue);
+}
+
+static void ibnbd_clt_add_gen_disk(struct ibnbd_clt_dev *dev)
+{
+	add_disk(dev->gd);
+}
+
+static int ibnbd_client_setup_device(struct ibnbd_clt_session *sess,
+				     struct ibnbd_clt_dev *dev, int idx)
+{
+	int err;
+
+	dev->size = dev->nsectors * dev->logical_block_size;
+
+	err = setup_mq_dev(dev);
+	if (err)
+		return err;
+
+	setup_request_queue(dev);
+
+	dev->gd = alloc_disk_node(1 << IBNBD_PART_BITS,	NUMA_NO_NODE);
+	if (!dev->gd) {
+		ibnbd_err(dev, "Failed to allocate disk node\n");
+		blk_cleanup_queue(dev->queue);
+		return -ENOMEM;
+	}
+
+	ibnbd_clt_setup_gen_disk(dev, idx);
+
+	return 0;
+}
+
+static struct ibnbd_clt_dev *init_dev(struct ibnbd_clt_session *sess,
+				      enum ibnbd_access_mode access_mode,
+				      enum ibnbd_io_mode io_mode,
+				      const char *pathname)
+{
+	struct ibnbd_clt_dev *dev;
+	int ret;
+
+	dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, NUMA_NO_NODE);
+	if (!dev)
+		return ERR_PTR(-ENOMEM);
+
+	dev->hw_queues = kcalloc(nr_cpu_ids, sizeof(*dev->hw_queues),
+				 GFP_KERNEL);
+	if (unlikely(!dev->hw_queues)) {
+		pr_err("Failed to initialize device '%s' from session"
+		       " %s, allocating hw_queues failed.", pathname,
+		       sess->sessname);
+		ret = -ENOMEM;
+		goto out_alloc;
+	}
+
+	mutex_lock(&ida_lock);
+	ret = ida_simple_get(&index_ida, 0, minor_to_index(1 << MINORBITS),
+			     GFP_KERNEL);
+	mutex_unlock(&ida_lock);
+	if (ret < 0) {
+		pr_err("Failed to initialize device '%s' from session %s,"
+		       " allocating idr failed, err: %d\n", pathname,
+		       sess->sessname, ret);
+		goto out_queues;
+	}
+	dev->clt_device_id	= ret;
+	dev->sess		= sess;
+	dev->access_mode	= access_mode;
+	dev->io_mode		= io_mode;
+	strlcpy(dev->pathname, pathname, sizeof(dev->pathname));
+	mutex_init(&dev->lock);
+	refcount_set(&dev->refcount, 1);
+	dev->dev_state = DEV_STATE_INIT;
+
+	/*
+	 * Here we called from sysfs entry, thus clt-sysfs is
+	 * responsible that session will not disappear.
+	 */
+	WARN_ON(!ibnbd_clt_get_sess(sess));
+
+	return dev;
+
+out_queues:
+	kfree(dev->hw_queues);
+out_alloc:
+	kfree(dev);
+	return ERR_PTR(ret);
+}
+
+static bool __exists_dev(const char *pathname)
+{
+	struct ibnbd_clt_session *sess;
+	struct ibnbd_clt_dev *dev;
+	bool found = false;
+
+	list_for_each_entry(sess, &sess_list, list) {
+		mutex_lock(&sess->lock);
+		list_for_each_entry(dev, &sess->devs_list, list) {
+			if (!strncmp(dev->pathname, pathname,
+				     sizeof(dev->pathname))) {
+				found = true;
+				break;
+			}
+		}
+		mutex_unlock(&sess->lock);
+		if (found)
+			break;
+	}
+
+	return found;
+}
+
+static bool exists_devpath(const char *pathname)
+{
+	bool found;
+
+	mutex_lock(&sess_lock);
+	found = __exists_dev(pathname);
+	mutex_unlock(&sess_lock);
+
+	return found;
+}
+
+static bool insert_dev_if_not_exists_devpath(const char *pathname,
+					     struct ibnbd_clt_session *sess,
+					     struct ibnbd_clt_dev *dev)
+{
+	bool found;
+
+	mutex_lock(&sess_lock);
+	found = __exists_dev(pathname);
+	if (!found) {
+		mutex_lock(&sess->lock);
+		list_add_tail(&dev->list, &sess->devs_list);
+		mutex_unlock(&sess->lock);
+	}
+	mutex_unlock(&sess_lock);
+
+	return found;
+}
+
+static void delete_dev(struct ibnbd_clt_dev *dev)
+{
+	struct ibnbd_clt_session *sess = dev->sess;
+
+	mutex_lock(&sess->lock);
+	list_del(&dev->list);
+	mutex_unlock(&sess->lock);
+}
+
+struct ibnbd_clt_dev *ibnbd_clt_map_device(const char *sessname,
+					   struct ibtrs_addr *paths,
+					   size_t path_cnt,
+					   const char *pathname,
+					   enum ibnbd_access_mode access_mode,
+					   enum ibnbd_io_mode io_mode)
+{
+	struct ibnbd_clt_session *sess;
+	struct ibnbd_clt_dev *dev;
+	int ret;
+
+	if (unlikely(exists_devpath(pathname)))
+		return ERR_PTR(-EEXIST);
+
+	sess = find_and_get_or_create_sess(sessname, paths, path_cnt);
+	if (unlikely(IS_ERR(sess)))
+		return ERR_CAST(sess);
+
+	dev = init_dev(sess, access_mode, io_mode, pathname);
+	if (unlikely(IS_ERR(dev))) {
+		pr_err("map_device: failed to map device '%s' from session %s,"
+		       " can't initialize device, err: %ld\n", pathname,
+		       sess->sessname, PTR_ERR(dev));
+		ret = PTR_ERR(dev);
+		goto put_sess;
+	}
+	if (unlikely(insert_dev_if_not_exists_devpath(pathname, sess, dev))) {
+		ret = -EEXIST;
+		goto put_dev;
+	}
+	ret = send_msg_open(dev, WAIT);
+	if (unlikely(ret)) {
+		ibnbd_err(dev, "map_device: failed, can't open remote device,"
+			  " err: %d\n", ret);
+		goto del_dev;
+	}
+	mutex_lock(&dev->lock);
+	pr_debug("Opened remote device: session=%s, path='%s'\n",
+		 sess->sessname, pathname);
+	ret = ibnbd_client_setup_device(sess, dev, dev->clt_device_id);
+	if (ret) {
+		ibnbd_err(dev, "map_device: Failed to configure device, err: %d\n",
+			  ret);
+		mutex_unlock(&dev->lock);
+		goto del_dev;
+	}
+
+	ibnbd_info(dev, "map_device: Device mapped as %s (nsectors: %zu,"
+		   " logical_block_size: %d, physical_block_size: %d,"
+		   " max_write_same_sectors: %d, max_discard_sectors: %d,"
+		   " discard_granularity: %d, discard_alignment: %d, "
+		   "secure_discard: %d, max_segments: %d, max_hw_sectors: %d, "
+		   "rotational: %d)\n",
+		   dev->gd->disk_name, dev->nsectors, dev->logical_block_size,
+		   dev->physical_block_size, dev->max_write_same_sectors,
+		   dev->max_discard_sectors, dev->discard_granularity,
+		   dev->discard_alignment, dev->secure_discard,
+		   dev->max_segments, dev->max_hw_sectors, dev->rotational);
+
+	mutex_unlock(&dev->lock);
+
+	ibnbd_clt_add_gen_disk(dev);
+	ibnbd_clt_put_sess(sess);
+
+	return dev;
+
+del_dev:
+	delete_dev(dev);
+put_dev:
+	ibnbd_clt_put_dev(dev);
+put_sess:
+	ibnbd_clt_put_sess(sess);
+
+	return ERR_PTR(ret);
+}
+
+static void destroy_gen_disk(struct ibnbd_clt_dev *dev)
+{
+	del_gendisk(dev->gd);
+	/*
+	 * Before marking queue as dying (blk_cleanup_queue() does that)
+	 * we have to be sure that everything in-flight has gone.
+	 * Blink with freeze/unfreeze.
+	 */
+	blk_mq_freeze_queue(dev->queue);
+	blk_mq_unfreeze_queue(dev->queue);
+	blk_cleanup_queue(dev->queue);
+	put_disk(dev->gd);
+}
+
+static void destroy_sysfs(struct ibnbd_clt_dev *dev,
+			  const struct attribute *sysfs_self)
+{
+	ibnbd_clt_remove_dev_symlink(dev);
+	if (dev->kobj.state_initialized) {
+		if (sysfs_self)
+			/* To avoid deadlock firstly commit suicide */
+			sysfs_remove_file_self(&dev->kobj, sysfs_self);
+		kobject_del(&dev->kobj);
+		kobject_put(&dev->kobj);
+	}
+}
+
+int ibnbd_clt_unmap_device(struct ibnbd_clt_dev *dev, bool force,
+			   const struct attribute *sysfs_self)
+{
+	struct ibnbd_clt_session *sess = dev->sess;
+	int refcount, ret = 0;
+	bool was_mapped;
+
+	mutex_lock(&dev->lock);
+	if (dev->dev_state == DEV_STATE_UNMAPPED) {
+		ibnbd_info(dev, "Device is already being unmapped\n");
+		ret = -EALREADY;
+		goto err;
+	}
+	refcount = refcount_read(&dev->refcount);
+	if (!force && refcount > 1) {
+		ibnbd_err(dev, "Closing device failed, device is in use,"
+			  " (%d device users)\n", refcount - 1);
+		ret = -EBUSY;
+		goto err;
+	}
+	was_mapped = (dev->dev_state == DEV_STATE_MAPPED);
+	dev->dev_state = DEV_STATE_UNMAPPED;
+	mutex_unlock(&dev->lock);
+
+	delete_dev(dev);
+	destroy_sysfs(dev, sysfs_self);
+	destroy_gen_disk(dev);
+	if (was_mapped && sess->ibtrs)
+		send_msg_close(dev, dev->device_id, WAIT);
+
+	ibnbd_info(dev, "Device is unmapped\n");
+
+	/* Likely last reference put */
+	ibnbd_clt_put_dev(dev);
+
+	/*
+	 * Here device and session can be vanished!
+	 */
+
+	return 0;
+err:
+	mutex_unlock(&dev->lock);
+
+	return ret;
+}
+
+int ibnbd_clt_remap_device(struct ibnbd_clt_dev *dev)
+{
+	int err;
+
+	mutex_lock(&dev->lock);
+	if (likely(dev->dev_state == DEV_STATE_MAPPED_DISCONNECTED))
+		err = 0;
+	else if (dev->dev_state == DEV_STATE_UNMAPPED)
+		err = -ENODEV;
+	else if (dev->dev_state == DEV_STATE_MAPPED)
+		err = -EALREADY;
+	else
+		err = -EBUSY;
+	mutex_unlock(&dev->lock);
+	if (likely(!err)) {
+		ibnbd_info(dev, "Remapping device.\n");
+		err = send_msg_open(dev, WAIT);
+		if (unlikely(err))
+			ibnbd_err(dev, "remap_device: %d\n", err);
+	}
+
+	return err;
+}
+
+static void unmap_device_work(struct work_struct *work)
+{
+	struct ibnbd_clt_dev *dev;
+
+	dev = container_of(work, typeof(*dev), unmap_on_rmmod_work);
+	ibnbd_clt_unmap_device(dev, true, NULL);
+}
+
+static void ibnbd_destroy_sessions(void)
+{
+	struct ibnbd_clt_session *sess, *sn;
+	struct ibnbd_clt_dev *dev, *tn;
+
+	/* Firstly forbid access through sysfs interface */
+	ibnbd_clt_destroy_default_group();
+	ibnbd_clt_destroy_sysfs_files();
+
+	/*
+	 * Here at this point there is no any concurrent access to sessions
+	 * list and devices list:
+	 *   1. New session or device can'be be created - session sysfs files
+	 *      are removed.
+	 *   2. Device or session can't be removed - module reference is taken
+	 *      into account in unmap device sysfs callback.
+	 *   3. No IO requests inflight - each file open of block_dev increases
+	 *      module reference in get_disk().
+	 *
+	 * But still there can be user requests inflights, which are sent by
+	 * asynchronous send_msg_*() functions, thus before unmapping devices
+	 * IBTRS session must be explicitly closed.
+	 */
+
+	list_for_each_entry_safe(sess, sn, &sess_list, list) {
+		WARN_ON(!ibnbd_clt_get_sess(sess));
+		close_ibtrs(sess);
+		list_for_each_entry_safe(dev, tn, &sess->devs_list, list) {
+			/*
+			 * Here unmap happens in parallel for only one reason:
+			 * blk_cleanup_queue() takes around half a second, so
+			 * on huge amount of devices the whole module unload
+			 * procedure takes minutes.
+			 */
+			INIT_WORK(&dev->unmap_on_rmmod_work, unmap_device_work);
+			queue_work(unload_wq, &dev->unmap_on_rmmod_work);
+		}
+		ibnbd_clt_put_sess(sess);
+	}
+	/* Wait for all scheduled unmap works */
+	flush_workqueue(unload_wq);
+	WARN_ON(!list_empty(&sess_list));
+}
+
+static int __init ibnbd_client_init(void)
+{
+	int err;
+
+	pr_info("Loading module %s, version %s, proto %s: "
+		"(softirq_enable: %d)\n", KBUILD_MODNAME,
+		IBNBD_VER_STRING, IBNBD_PROTO_VER_STRING,
+		softirq_enable);
+
+	ibnbd_client_major = register_blkdev(ibnbd_client_major, "ibnbd");
+	if (ibnbd_client_major <= 0) {
+		pr_err("Failed to load module,"
+		       " block device registration failed\n");
+		err = -EBUSY;
+		goto out;
+	}
+
+	err = ibnbd_clt_create_sysfs_files();
+	if (err) {
+		pr_err("Failed to load module,"
+		       " creating sysfs device files failed, err: %d\n",
+		       err);
+		goto out_unregister_blk;
+	}
+
+	unload_wq = alloc_workqueue("ibnbd_unload_wq", WQ_MEM_RECLAIM, 0);
+	if (!unload_wq) {
+		pr_err("Failed to load module, alloc ibnbd_unload_wq failed\n");
+		goto out_destroy_sysfs_files;
+	}
+
+	return 0;
+
+out_destroy_sysfs_files:
+	ibnbd_clt_destroy_sysfs_files();
+out_unregister_blk:
+	unregister_blkdev(ibnbd_client_major, "ibnbd");
+out:
+	return err;
+}
+
+static void __exit ibnbd_client_exit(void)
+{
+	pr_info("Unloading module\n");
+	ibnbd_destroy_sessions();
+	unregister_blkdev(ibnbd_client_major, "ibnbd");
+	ida_destroy(&index_ida);
+	destroy_workqueue(unload_wq);
+	pr_info("Module unloaded\n");
+}
+
+module_init(ibnbd_client_init);
+module_exit(ibnbd_client_exit);
-- 
2.17.1


^ permalink raw reply related

* [PATCH v4 19/25] ibnbd: server: private header with server structs and functions
From: Jack Wang @ 2019-06-20 15:03 UTC (permalink / raw)
  To: linux-block, linux-rdma
  Cc: axboe, hch, sagi, bvanassche, jgg, dledford, danil.kipnis,
	rpenyaev, Roman Pen, Jack Wang
In-Reply-To: <20190620150337.7847-1-jinpuwang@gmail.com>

From: Roman Pen <roman.penyaev@profitbricks.com>

This header describes main structs and functions used by ibnbd-server
module, namely structs for managing sessions from different clients
and mapped (opened) devices.

Signed-off-by: Danil Kipnis <danil.kipnis@cloud.ionos.com>
Signed-off-by: Jack Wang <jinpu.wang@cloud.ionos.com>
---
 drivers/block/ibnbd/ibnbd-srv.h | 94 +++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 drivers/block/ibnbd/ibnbd-srv.h

diff --git a/drivers/block/ibnbd/ibnbd-srv.h b/drivers/block/ibnbd/ibnbd-srv.h
new file mode 100644
index 000000000000..6e46c3e97bf4
--- /dev/null
+++ b/drivers/block/ibnbd/ibnbd-srv.h
@@ -0,0 +1,94 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler <mail@fholler.de>
+ *          Jack Wang <jinpu.wang@profitbricks.com>
+ *          Kleber Souza <kleber.souza@profitbricks.com>
+ *          Danil Kipnis <danil.kipnis@profitbricks.com>
+ *          Roman Penyaev <roman.penyaev@profitbricks.com>
+ *          Milind Dumbare <Milind.dumbare@gmail.com>
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis <danil.kipnis@profitbricks.com>
+ *          Roman Penyaev <roman.penyaev@profitbricks.com>
+ *
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Authors: Roman Penyaev <roman.penyaev@profitbricks.com>
+ *          Jack Wang <jinpu.wang@cloud.ionos.com>
+ *          Danil Kipnis <danil.kipnis@cloud.ionos.com>
+ */
+
+#ifndef IBNBD_SRV_H
+#define IBNBD_SRV_H
+
+#include <linux/types.h>
+#include <linux/idr.h>
+#include <linux/kref.h>
+
+#include "ibtrs.h"
+#include "ibnbd-proto.h"
+#include "ibnbd-log.h"
+
+struct ibnbd_srv_session {
+	/* Entry inside global sess_list */
+	struct list_head        list;
+	struct ibtrs_srv	*ibtrs;
+	char			sessname[NAME_MAX];
+	int			queue_depth;
+	struct bio_set		sess_bio_set;
+
+	rwlock_t                index_lock ____cacheline_aligned;
+	struct idr              index_idr;
+	/* List of struct ibnbd_srv_sess_dev */
+	struct list_head        sess_dev_list;
+	struct mutex		lock;
+	u8			ver;
+};
+
+struct ibnbd_srv_dev {
+	/* Entry inside global dev_list */
+	struct list_head                list;
+	struct kobject                  dev_kobj;
+	struct kobject                  dev_sessions_kobj;
+	struct kref                     kref;
+	char				id[NAME_MAX];
+	/* List of ibnbd_srv_sess_dev structs */
+	struct list_head		sess_dev_list;
+	struct mutex			lock;
+	int				open_write_cnt;
+	enum ibnbd_io_mode		mode;
+};
+
+/* Structure which binds N devices and N sessions */
+struct ibnbd_srv_sess_dev {
+	/* Entry inside ibnbd_srv_dev struct */
+	struct list_head		dev_list;
+	/* Entry inside ibnbd_srv_session struct */
+	struct list_head		sess_list;
+	struct ibnbd_dev		*ibnbd_dev;
+	struct ibnbd_srv_session        *sess;
+	struct ibnbd_srv_dev		*dev;
+	struct kobject                  kobj;
+	struct completion		*sysfs_release_compl;
+	u32                             device_id;
+	fmode_t                         open_flags;
+	struct kref			kref;
+	struct completion               *destroy_comp;
+	char				pathname[NAME_MAX];
+	enum ibnbd_access_mode		access_mode;
+};
+
+/* ibnbd-srv-sysfs.c */
+
+int ibnbd_srv_create_dev_sysfs(struct ibnbd_srv_dev *dev,
+			       struct block_device *bdev,
+			       const char *dir_name);
+void ibnbd_srv_destroy_dev_sysfs(struct ibnbd_srv_dev *dev);
+int ibnbd_srv_create_dev_session_sysfs(struct ibnbd_srv_sess_dev *sess_dev);
+void ibnbd_srv_destroy_dev_session_sysfs(struct ibnbd_srv_sess_dev *sess_dev);
+int ibnbd_srv_create_sysfs_files(void);
+void ibnbd_srv_destroy_sysfs_files(void);
+
+#endif /* IBNBD_SRV_H */
-- 
2.17.1


^ permalink raw reply related

* [PATCH v4 18/25] ibnbd: client: sysfs interface functions
From: Jack Wang @ 2019-06-20 15:03 UTC (permalink / raw)
  To: linux-block, linux-rdma
  Cc: axboe, hch, sagi, bvanassche, jgg, dledford, danil.kipnis,
	rpenyaev, Roman Pen, Jack Wang
In-Reply-To: <20190620150337.7847-1-jinpuwang@gmail.com>

From: Roman Pen <roman.penyaev@profitbricks.com>

This is the sysfs interface to IBNBD block devices on client side:

  /sys/devices/virtual/ibnbd-client/ctl/
    |- map_device
    |  *** maps remote device
    |
    |- devices/
       *** all mapped devices

  /sys/block/ibnbd<N>/ibnbd_client/
    |- unmap_device
    |  *** unmaps device
    |
    |- state
    |  *** device state
    |
    |- session
    |  *** session name
    |
    |- mapping_path
       *** path of the dev that was mapped on server

Signed-off-by: Danil Kipnis <danil.kipnis@cloud.ionos.com>
Signed-off-by: Jack Wang <jinpu.wang@cloud.ionos.com>
---
 drivers/block/ibnbd/ibnbd-clt-sysfs.c | 691 ++++++++++++++++++++++++++
 1 file changed, 691 insertions(+)
 create mode 100644 drivers/block/ibnbd/ibnbd-clt-sysfs.c

diff --git a/drivers/block/ibnbd/ibnbd-clt-sysfs.c b/drivers/block/ibnbd/ibnbd-clt-sysfs.c
new file mode 100644
index 000000000000..4fd365e68e0f
--- /dev/null
+++ b/drivers/block/ibnbd/ibnbd-clt-sysfs.c
@@ -0,0 +1,691 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler <mail@fholler.de>
+ *          Jack Wang <jinpu.wang@profitbricks.com>
+ *          Kleber Souza <kleber.souza@profitbricks.com>
+ *          Danil Kipnis <danil.kipnis@profitbricks.com>
+ *          Roman Penyaev <roman.penyaev@profitbricks.com>
+ *          Milind Dumbare <Milind.dumbare@gmail.com>
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis <danil.kipnis@profitbricks.com>
+ *          Roman Penyaev <roman.penyaev@profitbricks.com>
+ *          Swapnil Ingle <swapnil.ingle@profitbricks.com>
+ *
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Authors: Roman Penyaev <roman.penyaev@profitbricks.com>
+ *          Jack Wang <jinpu.wang@cloud.ionos.com>
+ *          Danil Kipnis <danil.kipnis@cloud.ionos.com>
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/parser.h>
+#include <linux/module.h>
+#include <linux/in6.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/device.h>
+#include <rdma/ib.h>
+#include <rdma/rdma_cm.h>
+
+#include "ibnbd-clt.h"
+
+static struct device *ibnbd_dev;
+static struct class *ibnbd_dev_class;
+static struct kobject *ibnbd_devs_kobj;
+
+enum {
+	IBNBD_OPT_ERR		= 0,
+	IBNBD_OPT_PATH		= 1 << 0,
+	IBNBD_OPT_DEV_PATH	= 1 << 1,
+	IBNBD_OPT_ACCESS_MODE	= 1 << 3,
+	IBNBD_OPT_IO_MODE	= 1 << 5,
+	IBNBD_OPT_SESSNAME	= 1 << 6,
+};
+
+static unsigned int ibnbd_opt_mandatory[] = {
+	IBNBD_OPT_PATH,
+	IBNBD_OPT_DEV_PATH,
+	IBNBD_OPT_SESSNAME,
+};
+
+static const match_table_t ibnbd_opt_tokens = {
+	{	IBNBD_OPT_PATH,		"path=%s"		},
+	{	IBNBD_OPT_DEV_PATH,	"device_path=%s"	},
+	{	IBNBD_OPT_ACCESS_MODE,	"access_mode=%s"	},
+	{	IBNBD_OPT_IO_MODE,	"io_mode=%s"		},
+	{	IBNBD_OPT_SESSNAME,	"sessname=%s"		},
+	{	IBNBD_OPT_ERR,		NULL			},
+};
+
+/* remove new line from string */
+static void strip(char *s)
+{
+	char *p = s;
+
+	while (*s != '\0') {
+		if (*s != '\n')
+			*p++ = *s++;
+		else
+			++s;
+	}
+	*p = '\0';
+}
+
+static int ibnbd_clt_parse_map_options(const char *buf,
+				       char *sessname,
+				       struct ibtrs_addr *paths,
+				       size_t *path_cnt,
+				       size_t max_path_cnt,
+				       char *pathname,
+				       enum ibnbd_access_mode *access_mode,
+				       enum ibnbd_io_mode *io_mode)
+{
+	char *options, *sep_opt;
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int opt_mask = 0;
+	int token;
+	int ret = -EINVAL;
+	int i;
+	int p_cnt = 0;
+
+	options = kstrdup(buf, GFP_KERNEL);
+	if (!options)
+		return -ENOMEM;
+
+	sep_opt = strstrip(options);
+	strip(sep_opt);
+	while ((p = strsep(&sep_opt, " ")) != NULL) {
+		if (!*p)
+			continue;
+
+		token = match_token(p, ibnbd_opt_tokens, args);
+		opt_mask |= token;
+
+		switch (token) {
+		case IBNBD_OPT_SESSNAME:
+			p = match_strdup(args);
+			if (!p) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			if (strlen(p) > NAME_MAX) {
+				pr_err("map_device: sessname too long\n");
+				ret = -EINVAL;
+				kfree(p);
+				goto out;
+			}
+			strlcpy(sessname, p, NAME_MAX);
+			kfree(p);
+			break;
+
+		case IBNBD_OPT_PATH:
+			if (p_cnt >= max_path_cnt) {
+				pr_err("map_device: too many (> %zu) paths "
+				       "provided\n", max_path_cnt);
+				ret = -ENOMEM;
+				goto out;
+			}
+			p = match_strdup(args);
+			if (!p) {
+				ret = -ENOMEM;
+				goto out;
+			}
+
+			ret = ibtrs_addr_to_sockaddr(p, strlen(p), IBTRS_PORT,
+						     &paths[p_cnt]);
+			if (ret) {
+				pr_err("Can't parse path %s: %d\n", p, ret);
+				kfree(p);
+				goto out;
+			}
+
+			p_cnt++;
+
+			kfree(p);
+			break;
+
+		case IBNBD_OPT_DEV_PATH:
+			p = match_strdup(args);
+			if (!p) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			if (strlen(p) > NAME_MAX) {
+				pr_err("map_device: Device path too long\n");
+				ret = -EINVAL;
+				kfree(p);
+				goto out;
+			}
+			strlcpy(pathname, p, NAME_MAX);
+			kfree(p);
+			break;
+
+		case IBNBD_OPT_ACCESS_MODE:
+			p = match_strdup(args);
+			if (!p) {
+				ret = -ENOMEM;
+				goto out;
+			}
+
+			if (!strcmp(p, "ro")) {
+				*access_mode = IBNBD_ACCESS_RO;
+			} else if (!strcmp(p, "rw")) {
+				*access_mode = IBNBD_ACCESS_RW;
+			} else if (!strcmp(p, "migration")) {
+				*access_mode = IBNBD_ACCESS_MIGRATION;
+			} else {
+				pr_err("map_device: Invalid access_mode:"
+				       " '%s'\n", p);
+				ret = -EINVAL;
+				kfree(p);
+				goto out;
+			}
+
+			kfree(p);
+			break;
+
+		case IBNBD_OPT_IO_MODE:
+			p = match_strdup(args);
+			if (!p) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			if (!strcmp(p, "blockio")) {
+				*io_mode = IBNBD_BLOCKIO;
+			} else if (!strcmp(p, "fileio")) {
+				*io_mode = IBNBD_FILEIO;
+			} else {
+				pr_err("map_device: Invalid io_mode: '%s'.\n",
+				       p);
+				ret = -EINVAL;
+				kfree(p);
+				goto out;
+			}
+			kfree(p);
+			break;
+
+		default:
+			pr_err("map_device: Unknown parameter or missing value"
+			       " '%s'\n", p);
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	for (i = 0; i < ARRAY_SIZE(ibnbd_opt_mandatory); i++) {
+		if ((opt_mask & ibnbd_opt_mandatory[i])) {
+			ret = 0;
+		} else {
+			pr_err("map_device: Parameters missing\n");
+			ret = -EINVAL;
+			break;
+		}
+	}
+
+out:
+	*path_cnt = p_cnt;
+	kfree(options);
+	return ret;
+}
+
+static ssize_t ibnbd_clt_state_show(struct kobject *kobj,
+				    struct kobj_attribute *attr, char *page)
+{
+	struct ibnbd_clt_dev *dev;
+
+	dev = container_of(kobj, struct ibnbd_clt_dev, kobj);
+
+	switch (dev->dev_state) {
+	case (DEV_STATE_INIT):
+		return scnprintf(page, PAGE_SIZE, "init\n");
+	case (DEV_STATE_MAPPED):
+		/* TODO fix cli tool before changing to proper state */
+		return scnprintf(page, PAGE_SIZE, "open\n");
+	case (DEV_STATE_MAPPED_DISCONNECTED):
+		/* TODO fix cli tool before changing to proper state */
+		return scnprintf(page, PAGE_SIZE, "closed\n");
+	case (DEV_STATE_UNMAPPED):
+		return scnprintf(page, PAGE_SIZE, "unmapped\n");
+	default:
+		return scnprintf(page, PAGE_SIZE, "unknown\n");
+	}
+}
+
+static struct kobj_attribute ibnbd_clt_state_attr =
+	__ATTR(state, 0444, ibnbd_clt_state_show, NULL);
+
+static ssize_t ibnbd_clt_mapping_path_show(struct kobject *kobj,
+					   struct kobj_attribute *attr,
+					   char *page)
+{
+	struct ibnbd_clt_dev *dev;
+
+	dev = container_of(kobj, struct ibnbd_clt_dev, kobj);
+
+	return scnprintf(page, PAGE_SIZE, "%s\n", dev->pathname);
+}
+
+static struct kobj_attribute ibnbd_clt_mapping_path_attr =
+	__ATTR(mapping_path, 0444, ibnbd_clt_mapping_path_show, NULL);
+
+static ssize_t ibnbd_clt_io_mode_show(struct kobject *kobj,
+				      struct kobj_attribute *attr, char *page)
+{
+	struct ibnbd_clt_dev *dev;
+
+	dev = container_of(kobj, struct ibnbd_clt_dev, kobj);
+
+	return scnprintf(page, PAGE_SIZE, "%s\n",
+			 ibnbd_io_mode_str(dev->remote_io_mode));
+}
+
+static struct kobj_attribute ibnbd_clt_io_mode =
+	__ATTR(io_mode, 0444, ibnbd_clt_io_mode_show, NULL);
+
+static ssize_t ibnbd_clt_access_mode_show(struct kobject *kobj,
+					  struct kobj_attribute *attr,
+					  char *page)
+{
+	struct ibnbd_clt_dev *dev;
+
+	dev = container_of(kobj, struct ibnbd_clt_dev, kobj);
+
+	return scnprintf(page, PAGE_SIZE, "%s\n",
+			 ibnbd_access_mode_str(dev->access_mode));
+}
+
+static struct kobj_attribute ibnbd_clt_access_mode =
+	__ATTR(access_mode, 0444, ibnbd_clt_access_mode_show, NULL);
+
+static ssize_t ibnbd_clt_unmap_dev_show(struct kobject *kobj,
+					struct kobj_attribute *attr, char *page)
+{
+	return scnprintf(page, PAGE_SIZE, "Usage: echo <normal|force> > %s\n",
+			 attr->attr.name);
+}
+
+static ssize_t ibnbd_clt_unmap_dev_store(struct kobject *kobj,
+					 struct kobj_attribute *attr,
+					 const char *buf, size_t count)
+{
+	struct ibnbd_clt_dev *dev;
+	char *opt, *options;
+	bool force;
+	int err;
+
+	opt = kstrdup(buf, GFP_KERNEL);
+	if (!opt)
+		return -ENOMEM;
+
+	options = strstrip(opt);
+	strip(options);
+
+	dev = container_of(kobj, struct ibnbd_clt_dev, kobj);
+
+	if (sysfs_streq(options, "normal")) {
+		force = false;
+	} else if (sysfs_streq(options, "force")) {
+		force = true;
+	} else {
+		ibnbd_err(dev, "unmap_device: Invalid value: %s\n", options);
+		err = -EINVAL;
+		goto out;
+	}
+
+	ibnbd_info(dev, "Unmapping device, option: %s.\n",
+		   force ? "force" : "normal");
+
+	/*
+	 * We take explicit module reference only for one reason: do not
+	 * race with lockless ibnbd_destroy_sessions().
+	 */
+	if (!try_module_get(THIS_MODULE)) {
+		err = -ENODEV;
+		goto out;
+	}
+	err = ibnbd_clt_unmap_device(dev, force, &attr->attr);
+	if (unlikely(err)) {
+		if (unlikely(err != -EALREADY))
+			ibnbd_err(dev, "unmap_device: %d\n",  err);
+		goto module_put;
+	}
+
+	/*
+	 * Here device can be vanished!
+	 */
+
+	err = count;
+
+module_put:
+	module_put(THIS_MODULE);
+out:
+	kfree(opt);
+
+	return err;
+}
+
+static struct kobj_attribute ibnbd_clt_unmap_device_attr =
+	__ATTR(unmap_device, 0644, ibnbd_clt_unmap_dev_show,
+	       ibnbd_clt_unmap_dev_store);
+
+static ssize_t ibnbd_clt_resize_dev_show(struct kobject *kobj,
+					 struct kobj_attribute *attr,
+					 char *page)
+{
+	return scnprintf(page, PAGE_SIZE,
+			 "Usage: echo <new size in sectors> > %s\n",
+			 attr->attr.name);
+}
+
+static ssize_t ibnbd_clt_resize_dev_store(struct kobject *kobj,
+					  struct kobj_attribute *attr,
+					  const char *buf, size_t count)
+{
+	int ret;
+	unsigned long sectors;
+	struct ibnbd_clt_dev *dev;
+
+	dev = container_of(kobj, struct ibnbd_clt_dev, kobj);
+
+	ret = kstrtoul(buf, 0, &sectors);
+	if (ret)
+		return ret;
+
+	ret = ibnbd_clt_resize_disk(dev, (size_t)sectors);
+	if (ret)
+		return ret;
+
+	return count;
+}
+
+static struct kobj_attribute ibnbd_clt_resize_dev_attr =
+	__ATTR(resize, 0644, ibnbd_clt_resize_dev_show,
+	       ibnbd_clt_resize_dev_store);
+
+static ssize_t ibnbd_clt_remap_dev_show(struct kobject *kobj,
+					struct kobj_attribute *attr, char *page)
+{
+	return scnprintf(page, PAGE_SIZE, "Usage: echo <1> > %s\n",
+			 attr->attr.name);
+}
+
+static ssize_t ibnbd_clt_remap_dev_store(struct kobject *kobj,
+					 struct kobj_attribute *attr,
+					 const char *buf, size_t count)
+{
+	struct ibnbd_clt_dev *dev;
+	char *opt, *options;
+	int err;
+
+	opt = kstrdup(buf, GFP_KERNEL);
+	if (!opt)
+		return -ENOMEM;
+
+	options = strstrip(opt);
+	strip(options);
+
+	dev = container_of(kobj, struct ibnbd_clt_dev, kobj);
+	if (!sysfs_streq(options, "1")) {
+		ibnbd_err(dev, "remap_device: Invalid value: %s\n", options);
+		err = -EINVAL;
+		goto out;
+	}
+	err = ibnbd_clt_remap_device(dev);
+	if (likely(!err))
+		err = count;
+
+out:
+	kfree(opt);
+
+	return err;
+}
+
+static struct kobj_attribute ibnbd_clt_remap_device_attr =
+	__ATTR(remap_device, 0644, ibnbd_clt_remap_dev_show,
+	       ibnbd_clt_remap_dev_store);
+
+static ssize_t ibnbd_clt_session_show(struct kobject *kobj,
+				      struct kobj_attribute *attr,
+				      char *page)
+{
+	struct ibnbd_clt_dev *dev;
+
+	dev = container_of(kobj, struct ibnbd_clt_dev, kobj);
+
+	return scnprintf(page, PAGE_SIZE, "%s\n", dev->sess->sessname);
+}
+
+static struct kobj_attribute ibnbd_clt_session_attr =
+	__ATTR(session, 0444, ibnbd_clt_session_show, NULL);
+
+static struct attribute *ibnbd_dev_attrs[] = {
+	&ibnbd_clt_unmap_device_attr.attr,
+	&ibnbd_clt_resize_dev_attr.attr,
+	&ibnbd_clt_remap_device_attr.attr,
+	&ibnbd_clt_mapping_path_attr.attr,
+	&ibnbd_clt_state_attr.attr,
+	&ibnbd_clt_session_attr.attr,
+	&ibnbd_clt_io_mode.attr,
+	&ibnbd_clt_access_mode.attr,
+	NULL,
+};
+
+void ibnbd_clt_remove_dev_symlink(struct ibnbd_clt_dev *dev)
+{
+	/*
+	 * The module_is_live() check is crucial and helps to avoid annoying
+	 * sysfs warning raised in sysfs_remove_link(), when the whole sysfs
+	 * path was just removed, see ibnbd_close_sessions().
+	 */
+	if (strlen(dev->blk_symlink_name) && module_is_live(THIS_MODULE))
+		sysfs_remove_link(ibnbd_devs_kobj, dev->blk_symlink_name);
+}
+
+static struct kobj_type ibnbd_dev_ktype = {
+	.sysfs_ops      = &kobj_sysfs_ops,
+	.default_attrs  = ibnbd_dev_attrs,
+};
+
+static int ibnbd_clt_add_dev_kobj(struct ibnbd_clt_dev *dev)
+{
+	int ret;
+	struct kobject *gd_kobj = &disk_to_dev(dev->gd)->kobj;
+
+	ret = kobject_init_and_add(&dev->kobj, &ibnbd_dev_ktype, gd_kobj, "%s",
+				   "ibnbd");
+	if (ret)
+		ibnbd_err(dev, "Failed to create device sysfs dir, err: %d\n",
+			  ret);
+
+	return ret;
+}
+
+static ssize_t ibnbd_clt_map_device_show(struct kobject *kobj,
+					 struct kobj_attribute *attr,
+					 char *page)
+{
+	return scnprintf(page, PAGE_SIZE, "Usage: echo \""
+			 "sessname=<name of the ibtrs session>"
+			 " path=<[srcaddr,]dstaddr>"
+			 " [path=<[srcaddr,]dstaddr>]"
+			 " device_path=<full path on remote side>"
+			 " [access_mode=<ro|rw|migration>]"
+			 " [io_mode=<fileio|blockio>]\" > %s\n\n"
+			 "addr ::= [ ip:<ipv4> | ip:<ipv6> | gid:<gid> ]\n",
+			 attr->attr.name);
+}
+
+static int ibnbd_clt_get_path_name(struct ibnbd_clt_dev *dev, char *buf,
+				   size_t len)
+{
+	int ret;
+	char pathname[NAME_MAX], *s;
+
+	strlcpy(pathname, dev->pathname, sizeof(pathname));
+	while ((s = strchr(pathname, '/')))
+		s[0] = '!';
+
+	ret = snprintf(buf, len, "%s", pathname);
+	if (ret >= len)
+		return -ENAMETOOLONG;
+
+	return 0;
+}
+
+static int ibnbd_clt_add_dev_symlink(struct ibnbd_clt_dev *dev)
+{
+	struct kobject *gd_kobj = &disk_to_dev(dev->gd)->kobj;
+	int ret;
+
+	ret = ibnbd_clt_get_path_name(dev, dev->blk_symlink_name,
+				      sizeof(dev->blk_symlink_name));
+	if (ret) {
+		ibnbd_err(dev, "Failed to get /sys/block symlink path, err: %d\n",
+			  ret);
+		goto out_err;
+	}
+
+	ret = sysfs_create_link(ibnbd_devs_kobj, gd_kobj,
+				dev->blk_symlink_name);
+	if (ret) {
+		ibnbd_err(dev, "Creating /sys/block symlink failed, err: %d\n",
+			  ret);
+		goto out_err;
+	}
+
+	return 0;
+
+out_err:
+	dev->blk_symlink_name[0] = '\0';
+	return ret;
+}
+
+static ssize_t ibnbd_clt_map_device_store(struct kobject *kobj,
+					  struct kobj_attribute *attr,
+					  const char *buf, size_t count)
+{
+	struct ibnbd_clt_dev *dev;
+	int ret;
+	char pathname[NAME_MAX];
+	char sessname[NAME_MAX];
+	enum ibnbd_access_mode access_mode = IBNBD_ACCESS_RW;
+	enum ibnbd_io_mode io_mode = IBNBD_AUTOIO;
+
+	struct sockaddr_storage *addrs;
+	struct ibtrs_addr paths[6];
+	size_t path_cnt;
+
+	addrs = kcalloc(ARRAY_SIZE(paths) * 2, sizeof(*addrs), GFP_KERNEL);
+	if (!addrs)
+		return -ENOMEM;
+
+	for (path_cnt = 0; path_cnt < ARRAY_SIZE(paths); path_cnt++) {
+		paths[path_cnt].src = &addrs[path_cnt * 2];
+		paths[path_cnt].dst = &addrs[path_cnt * 2 + 1];
+	}
+
+	ret = ibnbd_clt_parse_map_options(buf, sessname, paths,
+					  &path_cnt, ARRAY_SIZE(paths),
+					  pathname, &access_mode, &io_mode);
+	if (ret)
+		goto out;
+
+	pr_info("Mapping device %s on session %s, (access_mode: %s, "
+		"io_mode: %s)\n", pathname, sessname,
+		ibnbd_access_mode_str(access_mode), ibnbd_io_mode_str(io_mode));
+
+	dev = ibnbd_clt_map_device(sessname, paths, path_cnt, pathname,
+				   access_mode, io_mode);
+	if (unlikely(IS_ERR(dev))) {
+		ret = PTR_ERR(dev);
+		goto out;
+	}
+
+	ret = ibnbd_clt_add_dev_kobj(dev);
+	if (unlikely(ret))
+		goto unmap_dev;
+
+	ret = ibnbd_clt_add_dev_symlink(dev);
+	if (ret)
+		goto unmap_dev;
+
+	kfree(addrs);
+	return count;
+
+unmap_dev:
+	ibnbd_clt_unmap_device(dev, true, NULL);
+out:
+	kfree(addrs);
+	return ret;
+}
+
+static struct kobj_attribute ibnbd_clt_map_device_attr =
+	__ATTR(map_device, 0644,
+	       ibnbd_clt_map_device_show, ibnbd_clt_map_device_store);
+
+static struct attribute *default_attrs[] = {
+	&ibnbd_clt_map_device_attr.attr,
+	NULL,
+};
+
+static struct attribute_group default_attr_group = {
+	.attrs = default_attrs,
+};
+
+int ibnbd_clt_create_sysfs_files(void)
+{
+	int err;
+
+	ibnbd_dev_class = class_create(THIS_MODULE, "ibnbd-client");
+	if (unlikely(IS_ERR(ibnbd_dev_class)))
+		return PTR_ERR(ibnbd_dev_class);
+
+	ibnbd_dev = device_create(ibnbd_dev_class, NULL,
+				  MKDEV(0, 0), NULL, "ctl");
+	if (unlikely(IS_ERR(ibnbd_dev))) {
+		err = PTR_ERR(ibnbd_dev);
+		goto cls_destroy;
+	}
+	ibnbd_devs_kobj = kobject_create_and_add("devices", &ibnbd_dev->kobj);
+	if (unlikely(!ibnbd_devs_kobj)) {
+		err = -ENOMEM;
+		goto dev_destroy;
+	}
+	err = sysfs_create_group(&ibnbd_dev->kobj, &default_attr_group);
+	if (unlikely(err))
+		goto put_devs_kobj;
+
+	return 0;
+
+put_devs_kobj:
+	kobject_del(ibnbd_devs_kobj);
+	kobject_put(ibnbd_devs_kobj);
+dev_destroy:
+	device_destroy(ibnbd_dev_class, MKDEV(0, 0));
+cls_destroy:
+	class_destroy(ibnbd_dev_class);
+
+	return err;
+}
+
+void ibnbd_clt_destroy_default_group(void)
+{
+	sysfs_remove_group(&ibnbd_dev->kobj, &default_attr_group);
+}
+
+void ibnbd_clt_destroy_sysfs_files(void)
+{
+	kobject_del(ibnbd_devs_kobj);
+	kobject_put(ibnbd_devs_kobj);
+	device_destroy(ibnbd_dev_class, MKDEV(0, 0));
+	class_destroy(ibnbd_dev_class);
+}
-- 
2.17.1


^ permalink raw reply related

* [RFC/RFT 07/14] clk: meson: g12a: add notifiers to handle cpu clock change
From: Neil Armstrong @ 2019-06-20 15:00 UTC (permalink / raw)
  To: jbrunet, khilman
  Cc: Neil Armstrong, martin.blumenstingl, linux-kernel, linux-amlogic,
	linux-clk, linux-arm-kernel
In-Reply-To: <20190620150013.13462-1-narmstrong@baylibre.com>

In order to implement clock switching for the CLKID_CPU_CLK and
CLKID_CPUB_CLK, notifiers are added on specific points of the
clock tree :

cpu_clk / cpub_clk
|   \- cpu_clk_dyn
|      |  \- cpu_clk_premux0
|      |        |- cpu_clk_postmux0
|      |        |    |- cpu_clk_dyn0_div
|      |        |    \- xtal/fclk_div2/fclk_div3
|      |        \- xtal/fclk_div2/fclk_div3
|      \- cpu_clk_premux1
|            |- cpu_clk_postmux1
|            |    |- cpu_clk_dyn1_div
|            |    \- xtal/fclk_div2/fclk_div3
|            \- xtal/fclk_div2/fclk_div3
\ sys_pll / sys1_pll

This for each cluster, a single one for G12A, two for G12B.

Each cpu_clk_premux1 tree is marked as read-only and CLK_SET_RATE_NO_REPARENT,
to be used as "parking" clock in a safe clock frequency.

A notifier is added on each cpu_clk_premux0 to detech when CCF want to
change the frequency of the cpu_clk_dyn tree.
In this notifier, the cpu_clk_premux1 tree is configured to use the xtal
clock and then the cpu_clk_dyn is switch to cpu_clk_premux1 while CCF
updates the cpu_clk_premux0 tree.

A notifier is added on each sys_pll/sys1_pll to detect when CCF wants to
change the PLL clock source of the cpu_clk.
In this notifier, the cpu_clk is switched to cpu_clk_dyn while CCF
updates the sys_pll/sys1_pll frequency.

A third small notifier is added on each cpu_clk / cpub_clk and cpu_clk_dyn,
add a small delay at PRE_RATE_CHANGE/POST_RATE_CHANGE to let the other
notofiers change propagate before changing the cpu_clk_premux0 and sys_pll
clock trees.

This notifier set permits switching the cpu_clk / cpub_clk without any
glitches and using a safe parking clock while switching between sub-GHz
clocks using the cpu_clk_dyn tree.

This setup has been tested and validated on the Amlogic G12A and G12B
SoCs running the arm64 cpuburn at [1] and cycling between all the possible
cpufreq translations of each cluster and checking the final frequency using
the clock-measurer, script at [2].

[1] https://github.com/ssvb/cpuburn-arm/blob/master/cpuburn-a53.S
[2] https://gist.github.com/superna9999/d4de964dbc0f84b7d527e1df2ddea25f

Signed-off-by: Neil Armstrong <narmstrong@baylibre.com>
---
 drivers/clk/meson/g12a.c | 500 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 468 insertions(+), 32 deletions(-)

diff --git a/drivers/clk/meson/g12a.c b/drivers/clk/meson/g12a.c
index 3c75ef5e4d24..2f3ecbbc98d9 100644
--- a/drivers/clk/meson/g12a.c
+++ b/drivers/clk/meson/g12a.c
@@ -14,6 +14,7 @@
 #include <linux/init.h>
 #include <linux/of_device.h>
 #include <linux/platform_device.h>
+#include <linux/clk.h>
 
 #include "clk-input.h"
 #include "clk-mpll.h"
@@ -85,16 +86,9 @@ static struct clk_regmap g12a_fixed_pll = {
 	},
 };
 
-/*
- * Internal sys pll emulation configuration parameters
- */
-static const struct reg_sequence g12a_sys_init_regs[] = {
-	{ .reg = HHI_SYS_PLL_CNTL1,	.def = 0x00000000 },
-	{ .reg = HHI_SYS_PLL_CNTL2,	.def = 0x00000000 },
-	{ .reg = HHI_SYS_PLL_CNTL3,	.def = 0x48681c00 },
-	{ .reg = HHI_SYS_PLL_CNTL4,	.def = 0x88770290 },
-	{ .reg = HHI_SYS_PLL_CNTL5,	.def = 0x39272000 },
-	{ .reg = HHI_SYS_PLL_CNTL6,	.def = 0x56540000 },
+static const struct pll_mult_range g12a_sys_pll_mult_range = {
+	.min = 128,
+	.max = 250,
 };
 
 static struct clk_regmap g12a_sys_pll_dco = {
@@ -124,14 +118,15 @@ static struct clk_regmap g12a_sys_pll_dco = {
 			.shift   = 29,
 			.width   = 1,
 		},
-		.init_regs = g12a_sys_init_regs,
-		.init_count = ARRAY_SIZE(g12a_sys_init_regs),
+		.range = &g12a_sys_pll_mult_range,
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "sys_pll_dco",
-		.ops = &meson_clk_pll_ro_ops,
+		.ops = &meson_clk_pll_ops,
 		.parent_names = (const char *[]){ IN_PREFIX "xtal" },
 		.num_parents = 1,
+		/* This clock feeds the CPU, avoid disabling it */
+		.flags = CLK_IS_CRITICAL,
 	},
 };
 
@@ -144,9 +139,10 @@ static struct clk_regmap g12a_sys_pll = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "sys_pll",
-		.ops = &clk_regmap_divider_ro_ops,
+		.ops = &clk_regmap_divider_ops,
 		.parent_names = (const char *[]){ "sys_pll_dco" },
 		.num_parents = 1,
+		.flags = CLK_SET_RATE_PARENT,
 	},
 };
 
@@ -177,12 +173,15 @@ static struct clk_regmap g12b_sys1_pll_dco = {
 			.shift   = 29,
 			.width   = 1,
 		},
+		.range = &g12a_sys_pll_mult_range,
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "sys1_pll_dco",
-		.ops = &meson_clk_pll_ro_ops,
+		.ops = &meson_clk_pll_ops,
 		.parent_names = (const char *[]){ IN_PREFIX "xtal" },
 		.num_parents = 1,
+		/* This clock feeds the CPU, avoid disabling it */
+		.flags = CLK_IS_CRITICAL,
 	},
 };
 
@@ -195,9 +194,10 @@ static struct clk_regmap g12b_sys1_pll = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "sys1_pll",
-		.ops = &clk_regmap_divider_ro_ops,
+		.ops = &clk_regmap_divider_ops,
 		.parent_names = (const char *[]){ "sys1_pll_dco" },
 		.num_parents = 1,
+		.flags = CLK_SET_RATE_PARENT,
 	},
 };
 
@@ -266,7 +266,7 @@ static struct clk_regmap g12a_cpu_clk_premux0 = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpu_clk_dyn0_sel",
-		.ops = &clk_regmap_mux_ro_ops,
+		.ops = &clk_regmap_mux_ops,
 		.parent_names = (const char *[]){ IN_PREFIX "xtal",
 						  "fclk_div2",
 						  "fclk_div3" },
@@ -275,6 +275,37 @@ static struct clk_regmap g12a_cpu_clk_premux0 = {
 };
 
 /* Datasheet names this field as "mux0_divn_tcnt" */
+#define SYS_CPU_DYN_ENABLE	BIT(26)
+
+/* This divider uses bit 26 to take change in account */
+static int g12a_cpu_clk_mux0_div_set_rate(struct clk_hw *hw, unsigned long rate,
+					  unsigned long parent_rate)
+{
+	struct clk_regmap *clk = to_clk_regmap(hw);
+	struct clk_regmap_div_data *div = clk_get_regmap_div_data(clk);
+	unsigned int val;
+	int ret;
+
+	ret = divider_get_val(rate, parent_rate, div->table, div->width,
+			      div->flags);
+	if (ret < 0)
+		return ret;
+
+	val = (unsigned int)ret << div->shift;
+
+	regmap_update_bits(clk->map, HHI_SYS_CPU_CLK_CNTL0,
+			   SYS_CPU_DYN_ENABLE, SYS_CPU_DYN_ENABLE);
+
+	return regmap_update_bits(clk->map, div->offset,
+				  clk_div_mask(div->width) << div->shift | SYS_CPU_DYN_ENABLE, val);
+};
+
+const struct clk_ops g12a_cpu_clk_mux0_div_ops = {
+	.recalc_rate = clk_regmap_div_recalc_rate,
+	.round_rate = clk_regmap_div_round_rate,
+	.set_rate = g12a_cpu_clk_mux0_div_set_rate,
+};
+
 static struct clk_regmap g12a_cpu_clk_mux0_div = {
 	.data = &(struct clk_regmap_div_data){
 		.offset = HHI_SYS_CPU_CLK_CNTL0,
@@ -283,9 +314,10 @@ static struct clk_regmap g12a_cpu_clk_mux0_div = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpu_clk_dyn0_div",
-		.ops = &clk_regmap_divider_ro_ops,
+		.ops = &g12a_cpu_clk_mux0_div_ops,
 		.parent_names = (const char *[]){ "cpu_clk_dyn0_sel" },
 		.num_parents = 1,
+		.flags = CLK_SET_RATE_PARENT,
 	},
 };
 
@@ -298,10 +330,11 @@ static struct clk_regmap g12a_cpu_clk_postmux0 = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpu_clk_dyn0",
-		.ops = &clk_regmap_mux_ro_ops,
+		.ops = &clk_regmap_mux_ops,
 		.parent_names = (const char *[]){ "cpu_clk_dyn0_sel",
 						  "cpu_clk_dyn0_div" },
 		.num_parents = 2,
+		.flags = CLK_SET_RATE_PARENT,
 	},
 };
 
@@ -314,11 +347,13 @@ static struct clk_regmap g12a_cpu_clk_premux1 = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpu_clk_dyn1_sel",
-		.ops = &clk_regmap_mux_ro_ops,
+		.ops = &clk_regmap_mux_ops,
 		.parent_names = (const char *[]){ IN_PREFIX "xtal",
 						  "fclk_div2",
 						  "fclk_div3" },
 		.num_parents = 3,
+		/* This sub-tree is used a parking clock */
+		.flags = CLK_SET_RATE_NO_REPARENT,
 	},
 };
 
@@ -346,10 +381,12 @@ static struct clk_regmap g12a_cpu_clk_postmux1 = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpu_clk_dyn1",
-		.ops = &clk_regmap_mux_ro_ops,
+		.ops = &clk_regmap_mux_ops,
 		.parent_names = (const char *[]){ "cpu_clk_dyn1_sel",
 						  "cpu_clk_dyn1_div" },
 		.num_parents = 2,
+		/* This sub-tree is used a parking clock */
+		.flags = CLK_SET_RATE_NO_REPARENT,
 	},
 };
 
@@ -362,10 +399,11 @@ static struct clk_regmap g12a_cpu_clk_dyn = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpu_clk_dyn",
-		.ops = &clk_regmap_mux_ro_ops,
+		.ops = &clk_regmap_mux_ops,
 		.parent_names = (const char *[]){ "cpu_clk_dyn0",
 						  "cpu_clk_dyn1" },
 		.num_parents = 2,
+		.flags = CLK_SET_RATE_PARENT,
 	},
 };
 
@@ -378,10 +416,11 @@ static struct clk_regmap g12a_cpu_clk = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpu_clk",
-		.ops = &clk_regmap_mux_ro_ops,
+		.ops = &clk_regmap_mux_ops,
 		.parent_names = (const char *[]){ "cpu_clk_dyn",
 						  "sys_pll" },
 		.num_parents = 2,
+		.flags = CLK_SET_RATE_PARENT,
 	},
 };
 
@@ -394,10 +433,11 @@ static struct clk_regmap g12b_cpu_clk = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpu_clk",
-		.ops = &clk_regmap_mux_ro_ops,
+		.ops = &clk_regmap_mux_ops,
 		.parent_names = (const char *[]){ "cpu_clk_dyn",
 						  "sys1_pll" },
 		.num_parents = 2,
+		.flags = CLK_SET_RATE_PARENT,
 	},
 };
 
@@ -410,7 +450,7 @@ static struct clk_regmap g12b_cpub_clk_premux0 = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpub_clk_dyn0_sel",
-		.ops = &clk_regmap_mux_ro_ops,
+		.ops = &clk_regmap_mux_ops,
 		.parent_names = (const char *[]){ IN_PREFIX "xtal",
 						  "fclk_div2",
 						  "fclk_div3" },
@@ -418,6 +458,35 @@ static struct clk_regmap g12b_cpub_clk_premux0 = {
 	},
 };
 
+/* This divider uses bit 26 to take change in account */
+static int g12b_cpub_clk_mux0_div_set_rate(struct clk_hw *hw, unsigned long rate,
+					  unsigned long parent_rate)
+{
+	struct clk_regmap *clk = to_clk_regmap(hw);
+	struct clk_regmap_div_data *div = clk_get_regmap_div_data(clk);
+	unsigned int val;
+	int ret;
+
+	ret = divider_get_val(rate, parent_rate, div->table, div->width,
+			      div->flags);
+	if (ret < 0)
+		return ret;
+
+	val = (unsigned int)ret << div->shift;
+
+	regmap_update_bits(clk->map, HHI_SYS_CPUB_CLK_CNTL,
+			   SYS_CPU_DYN_ENABLE, SYS_CPU_DYN_ENABLE);
+
+	return regmap_update_bits(clk->map, div->offset,
+				  clk_div_mask(div->width) << div->shift | SYS_CPU_DYN_ENABLE, val);
+};
+
+const struct clk_ops g12b_cpub_clk_mux0_div_ops = {
+	.recalc_rate = clk_regmap_div_recalc_rate,
+	.round_rate = clk_regmap_div_round_rate,
+	.set_rate = g12b_cpub_clk_mux0_div_set_rate,
+};
+
 /* Datasheet names this field as "mux0_divn_tcnt" */
 static struct clk_regmap g12b_cpub_clk_mux0_div = {
 	.data = &(struct clk_regmap_div_data){
@@ -427,9 +496,10 @@ static struct clk_regmap g12b_cpub_clk_mux0_div = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpub_clk_dyn0_div",
-		.ops = &clk_regmap_divider_ro_ops,
+		.ops = &g12b_cpub_clk_mux0_div_ops,
 		.parent_names = (const char *[]){ "cpub_clk_dyn0_sel" },
 		.num_parents = 1,
+		.flags = CLK_SET_RATE_PARENT,
 	},
 };
 
@@ -442,10 +512,11 @@ static struct clk_regmap g12b_cpub_clk_postmux0 = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpub_clk_dyn0",
-		.ops = &clk_regmap_mux_ro_ops,
+		.ops = &clk_regmap_mux_ops,
 		.parent_names = (const char *[]){ "cpub_clk_dyn0_sel",
 						  "cpub_clk_dyn0_div" },
 		.num_parents = 2,
+		.flags = CLK_SET_RATE_PARENT,
 	},
 };
 
@@ -458,11 +529,13 @@ static struct clk_regmap g12b_cpub_clk_premux1 = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpub_clk_dyn1_sel",
-		.ops = &clk_regmap_mux_ro_ops,
+		.ops = &clk_regmap_mux_ops,
 		.parent_names = (const char *[]){ IN_PREFIX "xtal",
 						  "fclk_div2",
 						  "fclk_div3" },
 		.num_parents = 3,
+		/* This sub-tree is used a parking clock */
+		.flags = CLK_SET_RATE_NO_REPARENT,
 	},
 };
 
@@ -490,10 +563,12 @@ static struct clk_regmap g12b_cpub_clk_postmux1 = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpub_clk_dyn1",
-		.ops = &clk_regmap_mux_ro_ops,
+		.ops = &clk_regmap_mux_ops,
 		.parent_names = (const char *[]){ "cpub_clk_dyn1_sel",
 						  "cpub_clk_dyn1_div" },
 		.num_parents = 2,
+		/* This sub-tree is used a parking clock */
+		.flags = CLK_SET_RATE_NO_REPARENT,
 	},
 };
 
@@ -506,10 +581,11 @@ static struct clk_regmap g12b_cpub_clk_dyn = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpub_clk_dyn",
-		.ops = &clk_regmap_mux_ro_ops,
+		.ops = &clk_regmap_mux_ops,
 		.parent_names = (const char *[]){ "cpub_clk_dyn0",
 						  "cpub_clk_dyn1" },
 		.num_parents = 2,
+		.flags = CLK_SET_RATE_PARENT,
 	},
 };
 
@@ -522,13 +598,228 @@ static struct clk_regmap g12b_cpub_clk = {
 	},
 	.hw.init = &(struct clk_init_data){
 		.name = "cpub_clk",
-		.ops = &clk_regmap_mux_ro_ops,
+		.ops = &clk_regmap_mux_ops,
 		.parent_names = (const char *[]){ "cpub_clk_dyn",
 						  "sys_pll" },
 		.num_parents = 2,
+		.flags = CLK_SET_RATE_PARENT,
 	},
 };
 
+static int g12a_cpu_clk_mux_notifier_cb(struct notifier_block *nb,
+					unsigned long event, void *data)
+{
+	switch (event) {
+	case POST_RATE_CHANGE:
+	case PRE_RATE_CHANGE:
+		/* Wait for clock propagation before/after changing the mux */
+		udelay(100);
+		return NOTIFY_OK;
+
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+struct notifier_block g12a_cpu_clk_mux_nb = {
+	.notifier_call = g12a_cpu_clk_mux_notifier_cb,
+};
+
+struct g12a_cpu_clk_postmux_nb_data {
+	struct notifier_block nb;
+	struct clk_hw *xtal;
+	struct clk_hw *cpu_clk_dyn;
+	struct clk_hw *cpu_clk_postmux0;
+	struct clk_hw *cpu_clk_postmux1;
+	struct clk_hw *cpu_clk_premux1;
+};
+
+static int g12a_cpu_clk_postmux_notifier_cb(struct notifier_block *nb,
+					 unsigned long event, void *data)
+{
+	struct g12a_cpu_clk_postmux_nb_data *nb_data =
+		container_of(nb, struct g12a_cpu_clk_postmux_nb_data, nb);
+
+	switch (event) {
+	case PRE_RATE_CHANGE:
+		/*
+		 * This notifier means cpu_clk_postmux0 clock will be changed
+		 * to feed cpu_clk, this the current path :
+		 * cpu_clk
+		 *    \- cpu_clk_dyn
+		 *          \- cpu_clk_postmux0
+		 *                \- cpu_clk_muxX_div
+		 *                      \- cpu_clk_premux0
+		 *				\- fclk_div3 or fclk_div2
+		 *		OR
+		 *                \- cpu_clk_premux0
+		 *			\- fclk_div3 or fclk_div2
+		 */
+
+		/* Setup cpu_clk_premux1 to xtal */
+		clk_hw_set_parent(nb_data->cpu_clk_premux1,
+				  nb_data->xtal);
+
+		/* Setup cpu_clk_postmux1 to bypass divider */
+		clk_hw_set_parent(nb_data->cpu_clk_postmux1,
+				  nb_data->cpu_clk_premux1);
+
+		/* Switch to parking clk on cpu_clk_postmux1 */
+		clk_hw_set_parent(nb_data->cpu_clk_dyn,
+				  nb_data->cpu_clk_postmux1);
+
+		/*
+		 * Now, cpu_clk is 24MHz in the current path :
+		 * cpu_clk
+		 *    \- cpu_clk_dyn
+		 *          \- cpu_clk_postmux1
+		 *                \- cpu_clk_premux1
+		 *                      \- xtal
+		 */
+
+		udelay(100);
+
+		return NOTIFY_OK;
+
+	case POST_RATE_CHANGE:
+		/*
+		 * The cpu_clk_postmux0 has ben updated, now switch back
+		 * cpu_clk_dyn to cpu_clk_postmux0 and take the changes
+		 * in account.
+		 */
+
+		/* Configure cpu_clk_dyn back to cpu_clk_postmux0 */
+		clk_hw_set_parent(nb_data->cpu_clk_dyn,
+				  nb_data->cpu_clk_postmux0);
+
+		/*
+		 * new path :
+		 * cpu_clk
+		 *    \- cpu_clk_dyn
+		 *          \- cpu_clk_postmux0
+		 *                \- cpu_clk_muxX_div
+		 *                      \- cpu_clk_premux0
+		 *				\- fclk_div3 or fclk_div2
+		 *		OR
+		 *                \- cpu_clk_premux0
+		 *			\- fclk_div3 or fclk_div2
+		 */
+
+		udelay(100);
+
+		return NOTIFY_OK;
+
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static struct g12a_cpu_clk_postmux_nb_data g12a_cpu_clk_postmux0_nb_data = {
+	.cpu_clk_dyn = &g12a_cpu_clk_dyn.hw,
+	.cpu_clk_postmux0 = &g12a_cpu_clk_postmux0.hw,
+	.cpu_clk_postmux1 = &g12a_cpu_clk_postmux1.hw,
+	.cpu_clk_premux1 = &g12a_cpu_clk_premux1.hw,
+	.nb.notifier_call = g12a_cpu_clk_postmux_notifier_cb,
+};
+
+static struct g12a_cpu_clk_postmux_nb_data g12b_cpub_clk_postmux0_nb_data = {
+	.cpu_clk_dyn = &g12b_cpub_clk_dyn.hw,
+	.cpu_clk_postmux0 = &g12b_cpub_clk_postmux0.hw,
+	.cpu_clk_postmux1 = &g12b_cpub_clk_postmux1.hw,
+	.cpu_clk_premux1 = &g12b_cpub_clk_premux1.hw,
+	.nb.notifier_call = g12a_cpu_clk_postmux_notifier_cb,
+};
+
+struct g12a_sys_pll_nb_data {
+	struct notifier_block nb;
+	struct clk_hw *sys_pll;
+	struct clk_hw *cpu_clk;
+	struct clk_hw *cpu_clk_dyn;
+};
+
+static int g12a_sys_pll_notifier_cb(struct notifier_block *nb,
+				    unsigned long event, void *data)
+{
+	struct g12a_sys_pll_nb_data *nb_data =
+		container_of(nb, struct g12a_sys_pll_nb_data, nb);
+
+	switch (event) {
+	case PRE_RATE_CHANGE:
+		/*
+		 * This notifier means sys_pll clock will be changed
+		 * to feed cpu_clk, this the current path :
+		 * cpu_clk
+		 *    \- sys_pll
+		 *          \- sys_pll_dco
+		 */
+
+		/* Configure cpu_clk to use cpu_clk_dyn */
+		clk_hw_set_parent(nb_data->cpu_clk,
+				  nb_data->cpu_clk_dyn);
+
+		/*
+		 * Now, cpu_clk uses the dyn path
+		 * cpu_clk
+		 *    \- cpu_clk_dyn
+		 *          \- cpu_clk_dynX
+		 *                \- cpu_clk_dynX_sel
+		 *		     \- cpu_clk_dynX_div
+		 *                      \- xtal/fclk_div2/fclk_div3
+		 *                   \- xtal/fclk_div2/fclk_div3
+		 */
+
+		udelay(100);
+
+		return NOTIFY_OK;
+
+	case POST_RATE_CHANGE:
+		/*
+		 * The sys_pll has ben updated, now switch back cpu_clk to
+		 * sys_pll
+		 */
+
+		/* Configure cpu_clk to use sys_pll */
+		clk_hw_set_parent(nb_data->cpu_clk,
+				  nb_data->sys_pll);
+
+		udelay(100);
+
+		/* new path :
+		 * cpu_clk
+		 *    \- sys_pll
+		 *          \- sys_pll_dco
+		 */
+
+		return NOTIFY_OK;
+
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static struct g12a_sys_pll_nb_data g12a_sys_pll_nb_data = {
+	.sys_pll = &g12a_sys_pll.hw,
+	.cpu_clk = &g12a_cpu_clk.hw,
+	.cpu_clk_dyn = &g12a_cpu_clk_dyn.hw,
+	.nb.notifier_call = g12a_sys_pll_notifier_cb,
+};
+
+/* G12B first CPU cluster uses sys1_pll */
+static struct g12a_sys_pll_nb_data g12b_cpu_clk_sys1_pll_nb_data = {
+	.sys_pll = &g12b_sys1_pll.hw,
+	.cpu_clk = &g12b_cpu_clk.hw,
+	.cpu_clk_dyn = &g12a_cpu_clk_dyn.hw,
+	.nb.notifier_call = g12a_sys_pll_notifier_cb,
+};
+
+/* G12B second CPU cluster uses sys_pll */
+static struct g12a_sys_pll_nb_data g12b_cpub_clk_sys_pll_nb_data = {
+	.sys_pll = &g12a_sys_pll.hw,
+	.cpu_clk = &g12b_cpub_clk.hw,
+	.cpu_clk_dyn = &g12b_cpub_clk_dyn.hw,
+	.nb.notifier_call = g12a_sys_pll_notifier_cb,
+};
+
 static struct clk_regmap g12a_cpu_clk_div16_en = {
 	.data = &(struct clk_regmap_gate_data){
 		.offset = HHI_SYS_CPU_CLK_CNTL1,
@@ -3792,18 +4083,163 @@ static const struct reg_sequence g12a_init_regs[] = {
 	{ .reg = HHI_MPLL_CNTL0,	.def = 0x00000543 },
 };
 
+static int meson_g12a_dvfs_setup_common(struct platform_device *pdev,
+					struct clk_hw **hws)
+{
+	const char *notifier_clk_name;
+	struct clk *notifier_clk;
+	struct clk_hw *xtal;
+	int ret;
+
+	xtal = clk_hw_get_parent_by_index(hws[CLKID_CPU_CLK_DYN1_SEL], 0);
+
+	/* Setup clock notifier for cpu_clk_postmux0 */
+	g12a_cpu_clk_postmux0_nb_data.xtal = xtal;
+	notifier_clk_name = clk_hw_get_name(&g12a_cpu_clk_postmux0.hw);
+	notifier_clk = __clk_lookup(notifier_clk_name);
+	ret = clk_notifier_register(notifier_clk,
+				    &g12a_cpu_clk_postmux0_nb_data.nb);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to register the cpu_clk_postmux0 notifier\n");
+		return ret;
+	}
+
+	/* Setup clock notifier for cpu_clk_dyn mux */
+	notifier_clk_name = clk_hw_get_name(&g12a_cpu_clk_dyn.hw);
+	notifier_clk = __clk_lookup(notifier_clk_name);
+	ret = clk_notifier_register(notifier_clk, &g12a_cpu_clk_mux_nb);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to register the cpu_clk_dyn notifier\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static int meson_g12b_dvfs_setup(struct platform_device *pdev)
+{
+	struct clk_hw **hws = g12b_hw_onecell_data.hws;
+	const char *notifier_clk_name;
+	struct clk *notifier_clk;
+	struct clk_hw *xtal;
+	int ret;
+
+	ret = meson_g12a_dvfs_setup_common(pdev, hws);
+	if (ret)
+		return ret;
+
+	xtal = clk_hw_get_parent_by_index(hws[CLKID_CPU_CLK_DYN1_SEL], 0);
+
+	/* Setup clock notifier for cpu_clk mux */
+	notifier_clk_name = clk_hw_get_name(&g12b_cpu_clk.hw);
+	notifier_clk = __clk_lookup(notifier_clk_name);
+	ret = clk_notifier_register(notifier_clk, &g12a_cpu_clk_mux_nb);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to register the cpu_clk notifier\n");
+		return ret;
+	}
+
+	/* Setup clock notifier for sys1_pll */
+	notifier_clk_name = clk_hw_get_name(&g12b_sys1_pll.hw);
+	notifier_clk = __clk_lookup(notifier_clk_name);
+	ret = clk_notifier_register(notifier_clk,
+				    &g12b_cpu_clk_sys1_pll_nb_data.nb);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to register the sys1_pll notifier\n");
+		return ret;
+	}
+
+	/* Add notifiers for the second CPU cluster */
+
+	/* Setup clock notifier for cpub_clk_postmux0 */
+	g12b_cpub_clk_postmux0_nb_data.xtal = xtal;
+	notifier_clk_name = clk_hw_get_name(&g12b_cpub_clk_postmux0.hw);
+	notifier_clk = __clk_lookup(notifier_clk_name);
+	ret = clk_notifier_register(notifier_clk,
+				    &g12b_cpub_clk_postmux0_nb_data.nb);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to register the cpub_clk_postmux0 notifier\n");
+		return ret;
+	}
+
+	/* Setup clock notifier for cpub_clk_dyn mux */
+	notifier_clk_name = clk_hw_get_name(&g12b_cpub_clk_dyn.hw);
+	notifier_clk = __clk_lookup(notifier_clk_name);
+	ret = clk_notifier_register(notifier_clk, &g12a_cpu_clk_mux_nb);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to register the cpub_clk_dyn notifier\n");
+		return ret;
+	}
+
+	/* Setup clock notifier for cpub_clk mux */
+	notifier_clk_name = clk_hw_get_name(&g12b_cpub_clk.hw);
+	notifier_clk = __clk_lookup(notifier_clk_name);
+	ret = clk_notifier_register(notifier_clk, &g12a_cpu_clk_mux_nb);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to register the cpub_clk notifier\n");
+		return ret;
+	}
+
+	/* Setup clock notifier for sys_pll */
+	notifier_clk_name = clk_hw_get_name(&g12a_sys_pll.hw);
+	notifier_clk = __clk_lookup(notifier_clk_name);
+	ret = clk_notifier_register(notifier_clk,
+				    &g12b_cpub_clk_sys_pll_nb_data.nb);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to register the sys_pll notifier\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+
+static int meson_g12a_dvfs_setup(struct platform_device *pdev)
+{
+	struct clk_hw **hws = g12a_hw_onecell_data.hws;
+	const char *notifier_clk_name;
+	struct clk *notifier_clk;
+	int ret;
+
+	ret = meson_g12a_dvfs_setup_common(pdev, hws);
+	if (ret)
+		return ret;
+
+	/* Setup clock notifier for cpu_clk mux */
+	notifier_clk_name = clk_hw_get_name(&g12a_cpu_clk.hw);
+	notifier_clk = __clk_lookup(notifier_clk_name);
+	ret = clk_notifier_register(notifier_clk, &g12a_cpu_clk_mux_nb);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to register the cpu_clk notifier\n");
+		return ret;
+	}
+
+	/* Setup clock notifier for sys_pll */
+	notifier_clk_name = clk_hw_get_name(&g12a_sys_pll.hw);
+	notifier_clk = __clk_lookup(notifier_clk_name);
+	ret = clk_notifier_register(notifier_clk, &g12a_sys_pll_nb_data.nb);
+	if (ret) {
+		dev_err(&pdev->dev, "failed to register the sys_pll notifier\n");
+		return ret;
+	}
+
+	return 0;
+}
+
 static const struct meson_eeclkc_data g12a_clkc_data = {
 	.regmap_clks = g12a_clk_regmaps,
 	.regmap_clk_num = ARRAY_SIZE(g12a_clk_regmaps),
 	.hw_onecell_data = &g12a_hw_onecell_data,
 	.init_regs = g12a_init_regs,
 	.init_count = ARRAY_SIZE(g12a_init_regs),
+	.setup = meson_g12a_dvfs_setup,
 };
 
 static const struct meson_eeclkc_data g12b_clkc_data = {
 	.regmap_clks = g12a_clk_regmaps,
 	.regmap_clk_num = ARRAY_SIZE(g12a_clk_regmaps),
-	.hw_onecell_data = &g12b_hw_onecell_data
+	.hw_onecell_data = &g12b_hw_onecell_data,
+	.setup = meson_g12b_dvfs_setup,
 };
 
 static const struct of_device_id clkc_match_table[] = {
-- 
2.21.0


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related

* [PATCH v4 16/25] ibnbd: client: private header with client structs and functions
From: Jack Wang @ 2019-06-20 15:03 UTC (permalink / raw)
  To: linux-block, linux-rdma
  Cc: axboe, hch, sagi, bvanassche, jgg, dledford, danil.kipnis,
	rpenyaev, Roman Pen, Jack Wang
In-Reply-To: <20190620150337.7847-1-jinpuwang@gmail.com>

From: Roman Pen <roman.penyaev@profitbricks.com>

This header describes main structs and functions used by ibnbd-client
module, mainly for managing IBNBD sessions and mapped block devices,
creating and destroying sysfs entries.

Signed-off-by: Danil Kipnis <danil.kipnis@cloud.ionos.com>
Signed-off-by: Jack Wang <jinpu.wang@cloud.ionos.com>
---
 drivers/block/ibnbd/ibnbd-clt.h | 166 ++++++++++++++++++++++++++++++++
 1 file changed, 166 insertions(+)
 create mode 100644 drivers/block/ibnbd/ibnbd-clt.h

diff --git a/drivers/block/ibnbd/ibnbd-clt.h b/drivers/block/ibnbd/ibnbd-clt.h
new file mode 100644
index 000000000000..005becfb110f
--- /dev/null
+++ b/drivers/block/ibnbd/ibnbd-clt.h
@@ -0,0 +1,166 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler <mail@fholler.de>
+ *          Jack Wang <jinpu.wang@profitbricks.com>
+ *          Kleber Souza <kleber.souza@profitbricks.com>
+ *          Danil Kipnis <danil.kipnis@profitbricks.com>
+ *          Roman Penyaev <roman.penyaev@profitbricks.com>
+ *          Milind Dumbare <Milind.dumbare@gmail.com>
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis <danil.kipnis@profitbricks.com>
+ *          Roman Penyaev <roman.penyaev@profitbricks.com>
+ *          Swapnil Ingle <swapnil.ingle@profitbricks.com>
+ *
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Authors: Roman Penyaev <roman.penyaev@profitbricks.com>
+ *          Jack Wang <jinpu.wang@cloud.ionos.com>
+ *          Danil Kipnis <danil.kipnis@cloud.ionos.com>
+ */
+
+#ifndef IBNBD_CLT_H
+#define IBNBD_CLT_H
+
+#include <linux/wait.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/blk-mq.h>
+#include <linux/refcount.h>
+
+#include "ibtrs.h"
+#include "ibnbd-proto.h"
+#include "ibnbd-log.h"
+
+#define BMAX_SEGMENTS 29
+#define RECONNECT_DELAY 30
+#define MAX_RECONNECTS -1
+
+enum ibnbd_clt_dev_state {
+	DEV_STATE_INIT,
+	DEV_STATE_MAPPED,
+	DEV_STATE_MAPPED_DISCONNECTED,
+	DEV_STATE_UNMAPPED,
+};
+
+struct ibnbd_iu_comp {
+	wait_queue_head_t wait;
+	int errno;
+};
+
+struct ibnbd_iu {
+	union {
+		struct request *rq; /* for block io */
+		void *buf; /* for user messages */
+	};
+	struct ibtrs_tag	*tag;
+	union {
+		/* use to send msg associated with a dev */
+		struct ibnbd_clt_dev *dev;
+		/* use to send msg associated with a sess */
+		struct ibnbd_clt_session *sess;
+	};
+	blk_status_t		status;
+	struct scatterlist	sglist[BMAX_SEGMENTS];
+	struct work_struct	work;
+	int			errno;
+	struct ibnbd_iu_comp	comp;
+	atomic_t		refcount;
+};
+
+struct ibnbd_cpu_qlist {
+	struct list_head	requeue_list;
+	spinlock_t		requeue_lock;
+	unsigned int		cpu;
+};
+
+struct ibnbd_clt_session {
+	struct list_head        list;
+	struct ibtrs_clt        *ibtrs;
+	wait_queue_head_t       ibtrs_waitq;
+	bool                    ibtrs_ready;
+	struct ibnbd_cpu_qlist	__percpu
+				*cpu_queues;
+	DECLARE_BITMAP(cpu_queues_bm, NR_CPUS);
+	int	__percpu	*cpu_rr; /* per-cpu var for CPU round-robin */
+	atomic_t		busy;
+	int			queue_depth;
+	u32			max_io_size;
+	struct blk_mq_tag_set	tag_set;
+	struct mutex		lock; /* protects state and devs_list */
+	struct list_head        devs_list; /* list of struct ibnbd_clt_dev */
+	refcount_t		refcount;
+	char			sessname[NAME_MAX];
+	u8			ver; /* protocol version */
+};
+
+/**
+ * Submission queues.
+ */
+struct ibnbd_queue {
+	struct list_head	requeue_list;
+	unsigned long		in_list;
+	struct ibnbd_clt_dev	*dev;
+	struct blk_mq_hw_ctx	*hctx;
+};
+
+struct ibnbd_clt_dev {
+	struct ibnbd_clt_session	*sess;
+	struct request_queue	*queue;
+	struct ibnbd_queue	*hw_queues;
+	u32			device_id;
+	/* local Idr index - used to track minor number allocations. */
+	u32			clt_device_id;
+	struct mutex		lock;
+	enum ibnbd_clt_dev_state	dev_state;
+	enum ibnbd_io_mode	io_mode; /* user requested */
+	enum ibnbd_io_mode	remote_io_mode; /* server really used */
+	char			pathname[NAME_MAX];
+	enum ibnbd_access_mode	access_mode;
+	bool			read_only;
+	bool			rotational;
+	u32			max_hw_sectors;
+	u32			max_write_same_sectors;
+	u32			max_discard_sectors;
+	u32			discard_granularity;
+	u32			discard_alignment;
+	u16			secure_discard;
+	u16			physical_block_size;
+	u16			logical_block_size;
+	u16			max_segments;
+	size_t			nsectors;
+	u64			size;		/* device size in bytes */
+	struct list_head        list;
+	struct gendisk		*gd;
+	struct kobject		kobj;
+	char			blk_symlink_name[NAME_MAX];
+	refcount_t		refcount;
+	struct work_struct	unmap_on_rmmod_work;
+};
+
+/* ibnbd-clt.c */
+
+struct ibnbd_clt_dev *ibnbd_clt_map_device(const char *sessname,
+					   struct ibtrs_addr *paths,
+					   size_t path_cnt,
+					   const char *pathname,
+					   enum ibnbd_access_mode access_mode,
+					   enum ibnbd_io_mode io_mode);
+int ibnbd_clt_unmap_device(struct ibnbd_clt_dev *dev, bool force,
+			   const struct attribute *sysfs_self);
+
+int ibnbd_clt_remap_device(struct ibnbd_clt_dev *dev);
+int ibnbd_clt_resize_disk(struct ibnbd_clt_dev *dev, size_t newsize);
+
+/* ibnbd-clt-sysfs.c */
+
+int ibnbd_clt_create_sysfs_files(void);
+
+void ibnbd_clt_destroy_sysfs_files(void);
+void ibnbd_clt_destroy_default_group(void);
+
+void ibnbd_clt_remove_dev_symlink(struct ibnbd_clt_dev *dev);
+
+#endif /* IBNBD_CLT_H */
-- 
2.17.1


^ permalink raw reply related

* [Qemu-devel] [PATCH v4 06/13] vfio: Add migration state change notifier
From: Kirti Wankhede @ 2019-06-20 14:37 UTC (permalink / raw)
  To: alex.williamson, cjia
  Cc: Kirti Wankhede, Zhengxiao.zx, kevin.tian, yi.l.liu, yan.y.zhao,
	eskultet, ziye.yang, qemu-devel, cohuck, shuangtai.tst, dgilbert,
	zhi.a.wang, mlevitsk, pasic, aik, yulei.zhang, eauger, felipe,
	jonathan.davies, changpeng.liu, Ken.Xue
In-Reply-To: <1561041461-22326-1-git-send-email-kwankhede@nvidia.com>

Added migration state change notifier to get notification on migration state
change. These states are translated to VFIO device state and conveyed to vendor
driver.

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Reviewed-by: Neo Jia <cjia@nvidia.com>
---
 hw/vfio/migration.c           | 49 +++++++++++++++++++++++++++++++++++++++++++
 include/hw/vfio/vfio-common.h |  1 +
 2 files changed, 50 insertions(+)

diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c
index 15af218c23d1..7f9858e6c995 100644
--- a/hw/vfio/migration.c
+++ b/hw/vfio/migration.c
@@ -112,6 +112,48 @@ static void vfio_vmstate_change(void *opaque, int running, RunState state)
     vbasedev->vm_running = running;
 }
 
+static void vfio_migration_state_notifier(Notifier *notifier, void *data)
+{
+    MigrationState *s = data;
+    VFIODevice *vbasedev = container_of(notifier, VFIODevice, migration_state);
+    int ret;
+
+    switch (s->state) {
+    case MIGRATION_STATUS_ACTIVE:
+        if (vbasedev->device_state & VFIO_DEVICE_STATE_RUNNING) {
+            if (vbasedev->vm_running) {
+                ret = vfio_migration_set_state(vbasedev,
+                          VFIO_DEVICE_STATE_RUNNING | VFIO_DEVICE_STATE_SAVING);
+                if (ret) {
+                    error_report("Failed to set state RUNNING and SAVING");
+                }
+            } else {
+                ret = vfio_migration_set_state(vbasedev,
+                                               VFIO_DEVICE_STATE_SAVING);
+                if (ret) {
+                    error_report("Failed to set state STOP and SAVING");
+                }
+            }
+        } else {
+            ret = vfio_migration_set_state(vbasedev,
+                                           VFIO_DEVICE_STATE_RESUMING);
+            if (ret) {
+                error_report("Failed to set state RESUMING");
+            }
+        }
+        return;
+
+    case MIGRATION_STATUS_CANCELLING:
+    case MIGRATION_STATUS_CANCELLED:
+    case MIGRATION_STATUS_FAILED:
+        ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_RUNNING);
+        if (ret) {
+            error_report("Failed to set state RUNNING");
+        }
+        return;
+    }
+}
+
 static int vfio_migration_init(VFIODevice *vbasedev,
                                struct vfio_region_info *info)
 {
@@ -131,6 +173,9 @@ static int vfio_migration_init(VFIODevice *vbasedev,
     vbasedev->vm_state = qemu_add_vm_change_state_handler(vfio_vmstate_change,
                                                           vbasedev);
 
+    vbasedev->migration_state.notify = vfio_migration_state_notifier;
+    add_migration_state_change_notifier(&vbasedev->migration_state);
+
     return 0;
 }
 
@@ -167,6 +212,10 @@ void vfio_migration_finalize(VFIODevice *vbasedev)
         return;
     }
 
+    if (vbasedev->migration_state.notify) {
+        remove_migration_state_change_notifier(&vbasedev->migration_state);
+    }
+
     if (vbasedev->vm_state) {
         qemu_del_vm_change_state_handler(vbasedev->vm_state);
     }
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index f2392e97fa57..1d26e6be8d48 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -133,6 +133,7 @@ typedef struct VFIODevice {
     uint32_t device_state;
     VMChangeStateEntry *vm_state;
     int vm_running;
+    Notifier migration_state;
 } VFIODevice;
 
 struct VFIODeviceOps {
-- 
2.7.0



^ permalink raw reply related

* [PATCH v4 15/25] ibnbd: private headers with IBNBD protocol structs and helpers
From: Jack Wang @ 2019-06-20 15:03 UTC (permalink / raw)
  To: linux-block, linux-rdma
  Cc: axboe, hch, sagi, bvanassche, jgg, dledford, danil.kipnis,
	rpenyaev, Roman Pen, Jack Wang
In-Reply-To: <20190620150337.7847-1-jinpuwang@gmail.com>

From: Roman Pen <roman.penyaev@profitbricks.com>

These are common private headers with IBNBD protocol structures,
logging, sysfs and other helper functions, which are used on
both client and server sides.

Signed-off-by: Danil Kipnis <danil.kipnis@cloud.ionos.com>
Signed-off-by: Jack Wang <jinpu.wang@cloud.ionos.com>
---
 drivers/block/ibnbd/ibnbd-log.h   |  59 +++++
 drivers/block/ibnbd/ibnbd-proto.h | 378 ++++++++++++++++++++++++++++++
 2 files changed, 437 insertions(+)
 create mode 100644 drivers/block/ibnbd/ibnbd-log.h
 create mode 100644 drivers/block/ibnbd/ibnbd-proto.h

diff --git a/drivers/block/ibnbd/ibnbd-log.h b/drivers/block/ibnbd/ibnbd-log.h
new file mode 100644
index 000000000000..7a7ac3908564
--- /dev/null
+++ b/drivers/block/ibnbd/ibnbd-log.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler <mail@fholler.de>
+ *          Jack Wang <jinpu.wang@profitbricks.com>
+ *          Kleber Souza <kleber.souza@profitbricks.com>
+ *          Danil Kipnis <danil.kipnis@profitbricks.com>
+ *          Roman Penyaev <roman.penyaev@profitbricks.com>
+ *          Milind Dumbare <Milind.dumbare@gmail.com>
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis <danil.kipnis@profitbricks.com>
+ *          Roman Penyaev <roman.penyaev@profitbricks.com>
+ *
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Authors: Roman Penyaev <roman.penyaev@profitbricks.com>
+ *          Jack Wang <jinpu.wang@cloud.ionos.com>
+ *          Danil Kipnis <danil.kipnis@cloud.ionos.com>
+ */
+
+#ifndef IBNBD_LOG_H
+#define IBNBD_LOG_H
+
+#include "ibnbd-clt.h"
+#include "ibnbd-srv.h"
+
+void unknown_type(void);
+
+#define ibnbd_log(fn, dev, fmt, ...) ({				\
+	__builtin_choose_expr(						\
+		__builtin_types_compatible_p(				\
+			typeof(dev), struct ibnbd_clt_dev *),		\
+		fn("<%s@%s> " fmt, (dev)->pathname,			\
+		(dev)->sess->sessname,					\
+		   ##__VA_ARGS__),					\
+		__builtin_choose_expr(					\
+			__builtin_types_compatible_p(typeof(dev),	\
+					struct ibnbd_srv_sess_dev *),	\
+			fn("<%s@%s>: " fmt, (dev)->pathname,		\
+			   (dev)->sess->sessname, ##__VA_ARGS__),	\
+			unknown_type()));				\
+})
+
+#define ibnbd_err(dev, fmt, ...)	\
+	ibnbd_log(pr_err, dev, fmt, ##__VA_ARGS__)
+#define ibnbd_err_rl(dev, fmt, ...)	\
+	ibnbd_log(pr_err_ratelimited, dev, fmt, ##__VA_ARGS__)
+#define ibnbd_wrn(dev, fmt, ...)	\
+	ibnbd_log(pr_warn, dev, fmt, ##__VA_ARGS__)
+#define ibnbd_wrn_rl(dev, fmt, ...) \
+	ibnbd_log(pr_warn_ratelimited, dev, fmt, ##__VA_ARGS__)
+#define ibnbd_info(dev, fmt, ...) \
+	ibnbd_log(pr_info, dev, fmt, ##__VA_ARGS__)
+#define ibnbd_info_rl(dev, fmt, ...) \
+	ibnbd_log(pr_info_ratelimited, dev, fmt, ##__VA_ARGS__)
+
+#endif /* IBNBD_LOG_H */
diff --git a/drivers/block/ibnbd/ibnbd-proto.h b/drivers/block/ibnbd/ibnbd-proto.h
new file mode 100644
index 000000000000..e5a0a539447b
--- /dev/null
+++ b/drivers/block/ibnbd/ibnbd-proto.h
@@ -0,0 +1,378 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * InfiniBand Network Block Driver
+ *
+ * Copyright (c) 2014 - 2017 ProfitBricks GmbH. All rights reserved.
+ * Authors: Fabian Holler <mail@fholler.de>
+ *          Jack Wang <jinpu.wang@profitbricks.com>
+ *          Kleber Souza <kleber.souza@profitbricks.com>
+ *          Danil Kipnis <danil.kipnis@profitbricks.com>
+ *          Roman Penyaev <roman.penyaev@profitbricks.com>
+ *          Milind Dumbare <Milind.dumbare@gmail.com>
+ *
+ * Copyright (c) 2017 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Authors: Danil Kipnis <danil.kipnis@profitbricks.com>
+ *          Roman Penyaev <roman.penyaev@profitbricks.com>
+ *
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Authors: Roman Penyaev <roman.penyaev@profitbricks.com>
+ *          Jack Wang <jinpu.wang@cloud.ionos.com>
+ *          Danil Kipnis <danil.kipnis@cloud.ionos.com>
+ */
+
+#ifndef IBNBD_PROTO_H
+#define IBNBD_PROTO_H
+
+#include <linux/types.h>
+#include <linux/blkdev.h>
+#include <linux/limits.h>
+#include <linux/inet.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <rdma/ib.h>
+
+#define IBNBD_PROTO_VER_MAJOR 2
+#define IBNBD_PROTO_VER_MINOR 0
+
+#define IBNBD_PROTO_VER_STRING __stringify(IBNBD_PROTO_VER_MAJOR) "." \
+			       __stringify(IBNBD_PROTO_VER_MINOR)
+
+#ifndef IBNBD_VER_STRING
+#define IBNBD_VER_STRING __stringify(IBNBD_PROTO_VER_MAJOR) "." \
+			 __stringify(IBNBD_PROTO_VER_MINOR)
+#endif
+
+/* TODO: should be configurable */
+#define IBTRS_PORT 1234
+
+/**
+ * enum ibnbd_msg_types - IBNBD message types
+ * @IBNBD_MSG_SESS_INFO:	initial session info from client to server
+ * @IBNBD_MSG_SESS_INFO_RSP:	initial session info from server to client
+ * @IBNBD_MSG_OPEN:		open (map) device request
+ * @IBNBD_MSG_OPEN_RSP:		response to an @IBNBD_MSG_OPEN
+ * @IBNBD_MSG_IO:		block IO request operation
+ * @IBNBD_MSG_CLOSE:		close (unmap) device request
+ */
+enum ibnbd_msg_type {
+	IBNBD_MSG_SESS_INFO,
+	IBNBD_MSG_SESS_INFO_RSP,
+	IBNBD_MSG_OPEN,
+	IBNBD_MSG_OPEN_RSP,
+	IBNBD_MSG_IO,
+	IBNBD_MSG_CLOSE,
+};
+
+/**
+ * struct ibnbd_msg_hdr - header of IBNBD messages
+ * @type:	Message type, valid values see: enum ibnbd_msg_types
+ */
+struct ibnbd_msg_hdr {
+	__le16		type;
+	__le16		__padding;
+};
+
+enum ibnbd_access_mode {
+	IBNBD_ACCESS_RO,
+	IBNBD_ACCESS_RW,
+	IBNBD_ACCESS_MIGRATION,
+};
+
+#define _IBNBD_FILEIO  0
+#define _IBNBD_BLOCKIO 1
+#define _IBNBD_AUTOIO  2
+
+enum ibnbd_io_mode {
+	IBNBD_FILEIO = _IBNBD_FILEIO,
+	IBNBD_BLOCKIO = _IBNBD_BLOCKIO,
+	IBNBD_AUTOIO = _IBNBD_AUTOIO,
+};
+
+/**
+ * struct ibnbd_msg_sess_info - initial session info from client to server
+ * @hdr:		message header
+ * @ver:		IBNBD protocol version
+ */
+struct ibnbd_msg_sess_info {
+	struct ibnbd_msg_hdr hdr;
+	u8		ver;
+	u8		reserved[31];
+};
+
+/**
+ * struct ibnbd_msg_sess_info_rsp - initial session info from server to client
+ * @hdr:		message header
+ * @ver:		IBNBD protocol version
+ */
+struct ibnbd_msg_sess_info_rsp {
+	struct ibnbd_msg_hdr hdr;
+	u8		ver;
+	u8		reserved[31];
+};
+
+/**
+ * struct ibnbd_msg_open - request to open a remote device.
+ * @hdr:		message header
+ * @access_mode:	the mode to open remote device, valid values see:
+ *			enum ibnbd_access_mode
+ * @io_mode:		Open volume on server as block device or as file
+ * @device_name:	device path on remote side
+ */
+struct ibnbd_msg_open {
+	struct ibnbd_msg_hdr hdr;
+	u8		access_mode;
+	u8		io_mode;
+	s8		dev_name[NAME_MAX];
+	u8		__padding[3];
+};
+
+/**
+ * struct ibnbd_msg_close - request to close a remote device.
+ * @hdr:	message header
+ * @device_id:	device_id on server side to identify the device
+ */
+struct ibnbd_msg_close {
+	struct ibnbd_msg_hdr hdr;
+	__le32		device_id;
+};
+
+/**
+ * struct ibnbd_msg_open_rsp - response message to IBNBD_MSG_OPEN
+ * @hdr:		message header
+ * @nsectors:		number of sectors
+ * @device_id:		device_id on server side to identify the device
+ * @queue_flags:	queue_flags of the device on server side
+ * @max_hw_sectors:	max hardware sectors in the usual 512b unit
+ * @max_write_same_sectors: max sectors for WRITE SAME in the 512b unit
+ * @max_discard_sectors: max. sectors that can be discarded at once
+ * @discard_granularity: size of the internal discard allocation unit
+ * @discard_alignment: offset from internal allocation assignment
+ * @physical_block_size: physical block size device supports
+ * @logical_block_size: logical block size device supports
+ * @max_segments:	max segments hardware support in one transfer
+ * @secure_discard:	supports secure discard
+ * @rotation:		is a rotational disc?
+ * @io_mode:		io_mode device is opened.
+ */
+struct ibnbd_msg_open_rsp {
+	struct ibnbd_msg_hdr	hdr;
+	__le32			device_id;
+	__le64			nsectors;
+	__le32			max_hw_sectors;
+	__le32			max_write_same_sectors;
+	__le32			max_discard_sectors;
+	__le32			discard_granularity;
+	__le32			discard_alignment;
+	__le16			physical_block_size;
+	__le16			logical_block_size;
+	__le16			max_segments;
+	__le16			secure_discard;
+	u8			rotational;
+	u8			io_mode;
+	u8			__padding[10];
+};
+
+/**
+ * struct ibnbd_msg_io_old - message for I/O read/write for 
+ * ver < IBNBD_PROTO_VER_MAJOR
+ * This structure is there only to know the size of the "old" message format
+ * @hdr:	message header
+ * @device_id:	device_id on server side to find the right device
+ * @sector:	bi_sector attribute from struct bio
+ * @rw:		bitmask, valid values are defined in enum ibnbd_io_flags
+ * @bi_size:    number of bytes for I/O read/write
+ * @prio:       priority
+ */
+struct ibnbd_msg_io_old {
+	struct ibnbd_msg_hdr hdr;
+	__le32		device_id;
+	__le64		sector;
+	__le32		rw;
+	__le32		bi_size;
+};
+
+/**
+ * struct ibnbd_msg_io - message for I/O read/write
+ * @hdr:	message header
+ * @device_id:	device_id on server side to find the right device
+ * @sector:	bi_sector attribute from struct bio
+ * @rw:		bitmask, valid values are defined in enum ibnbd_io_flags
+ * @bi_size:    number of bytes for I/O read/write
+ * @prio:       priority
+ */
+struct ibnbd_msg_io {
+	struct ibnbd_msg_hdr hdr;
+	__le32		device_id;
+	__le64		sector;
+	__le32		rw;
+	__le32		bi_size;
+	__le16		prio;
+};
+
+#define IBNBD_OP_BITS  8
+#define IBNBD_OP_MASK  ((1 << IBNBD_OP_BITS) - 1)
+
+/**
+ * enum ibnbd_io_flags - IBNBD request types from rq_flag_bits
+ * @IBNBD_OP_READ:	     read sectors from the device
+ * @IBNBD_OP_WRITE:	     write sectors to the device
+ * @IBNBD_OP_FLUSH:	     flush the volatile write cache
+ * @IBNBD_OP_DISCARD:        discard sectors
+ * @IBNBD_OP_SECURE_ERASE:   securely erase sectors
+ * @IBNBD_OP_WRITE_SAME:     write the same sectors many times
+
+ * @IBNBD_F_SYNC:	     request is sync (sync write or read)
+ * @IBNBD_F_FUA:             forced unit access
+ */
+enum ibnbd_io_flags {
+
+	/* Operations */
+
+	IBNBD_OP_READ		= 0,
+	IBNBD_OP_WRITE		= 1,
+	IBNBD_OP_FLUSH		= 2,
+	IBNBD_OP_DISCARD	= 3,
+	IBNBD_OP_SECURE_ERASE	= 4,
+	IBNBD_OP_WRITE_SAME	= 5,
+
+	IBNBD_OP_LAST,
+
+	/* Flags */
+
+	IBNBD_F_SYNC  = 1<<(IBNBD_OP_BITS + 0),
+	IBNBD_F_FUA   = 1<<(IBNBD_OP_BITS + 1),
+
+	IBNBD_F_ALL   = (IBNBD_F_SYNC | IBNBD_F_FUA)
+
+};
+
+static inline u32 ibnbd_op(u32 flags)
+{
+	return (flags & IBNBD_OP_MASK);
+}
+
+static inline u32 ibnbd_flags(u32 flags)
+{
+	return (flags & ~IBNBD_OP_MASK);
+}
+
+static inline bool ibnbd_flags_supported(u32 flags)
+{
+	u32 op;
+
+	op = ibnbd_op(flags);
+	flags = ibnbd_flags(flags);
+
+	if (op >= IBNBD_OP_LAST)
+		return false;
+	if (flags & ~IBNBD_F_ALL)
+		return false;
+
+	return true;
+}
+
+static inline u32 ibnbd_to_bio_flags(u32 ibnbd_flags)
+{
+	u32 bio_flags;
+
+	switch (ibnbd_op(ibnbd_flags)) {
+	case IBNBD_OP_READ:
+		bio_flags = REQ_OP_READ;
+		break;
+	case IBNBD_OP_WRITE:
+		bio_flags = REQ_OP_WRITE;
+		break;
+	case IBNBD_OP_FLUSH:
+		bio_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
+		break;
+	case IBNBD_OP_DISCARD:
+		bio_flags = REQ_OP_DISCARD;
+		break;
+	case IBNBD_OP_SECURE_ERASE:
+		bio_flags = REQ_OP_SECURE_ERASE;
+		break;
+	case IBNBD_OP_WRITE_SAME:
+		bio_flags = REQ_OP_WRITE_SAME;
+		break;
+	default:
+		WARN(1, "Unknown IBNBD type: %d (flags %d)\n",
+		     ibnbd_op(ibnbd_flags), ibnbd_flags);
+		bio_flags = 0;
+	}
+
+	if (ibnbd_flags & IBNBD_F_SYNC)
+		bio_flags |= REQ_SYNC;
+
+	if (ibnbd_flags & IBNBD_F_FUA)
+		bio_flags |= REQ_FUA;
+
+	return bio_flags;
+}
+
+static inline u32 rq_to_ibnbd_flags(struct request *rq)
+{
+	u32 ibnbd_flags;
+
+	switch (req_op(rq)) {
+	case REQ_OP_READ:
+		ibnbd_flags = IBNBD_OP_READ;
+		break;
+	case REQ_OP_WRITE:
+		ibnbd_flags = IBNBD_OP_WRITE;
+		break;
+	case REQ_OP_DISCARD:
+		ibnbd_flags = IBNBD_OP_DISCARD;
+		break;
+	case REQ_OP_SECURE_ERASE:
+		ibnbd_flags = IBNBD_OP_SECURE_ERASE;
+		break;
+	case REQ_OP_WRITE_SAME:
+		ibnbd_flags = IBNBD_OP_WRITE_SAME;
+		break;
+	case REQ_OP_FLUSH:
+		ibnbd_flags = IBNBD_OP_FLUSH;
+		break;
+	default:
+		WARN(1, "Unknown request type %d (flags %llu)\n",
+		     req_op(rq), (unsigned long long)rq->cmd_flags);
+		ibnbd_flags = 0;
+	}
+
+	if (op_is_sync(rq->cmd_flags))
+		ibnbd_flags |= IBNBD_F_SYNC;
+
+	if (op_is_flush(rq->cmd_flags))
+		ibnbd_flags |= IBNBD_F_FUA;
+
+	return ibnbd_flags;
+}
+
+static inline const char *ibnbd_io_mode_str(enum ibnbd_io_mode mode)
+{
+	switch (mode) {
+	case IBNBD_FILEIO:
+		return "fileio";
+	case IBNBD_BLOCKIO:
+		return "blockio";
+	case IBNBD_AUTOIO:
+		return "autoio";
+	default:
+		return "unknown";
+	}
+}
+
+static inline const char *ibnbd_access_mode_str(enum ibnbd_access_mode mode)
+{
+	switch (mode) {
+	case IBNBD_ACCESS_RO:
+		return "ro";
+	case IBNBD_ACCESS_RW:
+		return "rw";
+	case IBNBD_ACCESS_MIGRATION:
+		return "migration";
+	default:
+		return "unknown";
+	}
+}
+
+#endif /* IBNBD_PROTO_H */
-- 
2.17.1


^ permalink raw reply related

* [PATCH v4 14/25] ibtrs: a bit of documentation
From: Jack Wang @ 2019-06-20 15:03 UTC (permalink / raw)
  To: linux-block, linux-rdma
  Cc: axboe, hch, sagi, bvanassche, jgg, dledford, danil.kipnis,
	rpenyaev, Roman Pen, Jack Wang
In-Reply-To: <20190620150337.7847-1-jinpuwang@gmail.com>

From: Roman Pen <roman.penyaev@profitbricks.com>

README with description of major sysfs entries.

Signed-off-by: Danil Kipnis <danil.kipnis@cloud.ionos.com>
Signed-off-by: Jack Wang <jinpu.wang@cloud.ionos.com>
---
 drivers/infiniband/ulp/ibtrs/README | 385 ++++++++++++++++++++++++++++
 1 file changed, 385 insertions(+)
 create mode 100644 drivers/infiniband/ulp/ibtrs/README

diff --git a/drivers/infiniband/ulp/ibtrs/README b/drivers/infiniband/ulp/ibtrs/README
new file mode 100644
index 000000000000..86d5cf836097
--- /dev/null
+++ b/drivers/infiniband/ulp/ibtrs/README
@@ -0,0 +1,385 @@
+****************************
+InfiniBand Transport (IBTRS)
+****************************
+
+IBTRS (InfiniBand Transport) is a reliable high speed transport library
+which provides support to establish optimal number of connections
+between client and server machines using RDMA (InfiniBand, RoCE, iWarp)
+transport. It is optimized to transfer (read/write) IO blocks.
+
+In its core interface it follows the BIO semantics of providing the
+possibility to either write data from an sg list to the remote side
+or to request ("read") data transfer from the remote side into a given
+sg list.
+
+IBTRS provides I/O fail-over and load-balancing capabilities by using
+multipath I/O (see "add_path" and "mp_policy" configuration entries).
+
+IBTRS is used by the IBNBD (Infiniband Network Block Device) modules.
+
+======================
+Client Sysfs Interface
+======================
+
+This chapter describes only the most important files of sysfs interface
+on client side.
+
+Entries under /sys/devices/virtual/ibtrs-client/
+================================================
+
+When a user of IBTRS API creates a new session, a directory entry with
+the name of that session is created.
+
+Entries under /sys/devices/virtual/ibtrs-client/<session-name>/
+===============================================================
+
+add_path (RW)
+-------------
+
+Adds a new path (connection) to an existing session. Expected format is the
+following:
+
+  <[source addr,]destination addr>
+
+  *addr ::= [ ip:<ipv4|ipv6> | gid:<gid> ]
+
+max_reconnect_attempts (RW)
+---------------------------
+
+Maximum number reconnect attempts the client should make before giving up
+after connection breaks unexpectedly.
+
+mp_policy (RW)
+--------------
+
+Multipath policy specifies which path should be selected on each IO:
+
+   round-robin (0):
+       select path in per CPU round-robin manner.
+
+   min-inflight (1):
+       select path with minimum inflights.
+
+Entries under /sys/devices/virtual/ibtrs-client/<session-name>/paths/
+=====================================================================
+
+
+Each path belonging to a given session is listed here by its source and
+destination address. When a new path is added to a session by writing to
+the "add_path" entry, a directory <src@dst> is created.
+
+Entries under /sys/devices/virtual/ibtrs-client/<session-name>/paths/<src@dst>/
+===============================================================================
+
+state (R)
+---------
+
+Contains "connected" if the session is connected to the peer and fully
+functional.  Otherwise the file contains "disconnected"
+
+reconnect (RW)
+--------------
+
+Write "1" to the file in order to reconnect the path.
+Operation is blocking and returns 0 if reconnect was successful.
+
+disconnect (RW)
+---------------
+
+Write "1" to the file in order to disconnect the path.
+Operation blocks until IBTRS path is disconnected.
+
+remove_path (RW)
+----------------
+
+Write "1" to the file in order to disconnected and remove the path
+from the session.  Operation blocks until the path is disconnected
+and removed from the session.
+
+hca_name (R)
+------------
+
+Contains the the name of HCA the connection established on.
+
+hca_port (R)
+------------
+
+Contains the port number of active port traffic is going through.
+
+src_addr (R)
+------------
+
+Contains the source address of the path
+
+dst_addr (R)
+------------
+
+Contains the destination address of the path
+
+
+Entries under /sys/devices/virtual/ibtrs-client/<session-name>/paths/<src@dst>/stats/
+=====================================================================================
+
+Write "0" to any file in that directory to reset corresponding statistics.
+
+reset_all (RW)
+--------------
+
+Read will return usage help, write 0 will clear all the statistics.
+
+sg_entries (RW)
+---------------
+
+Data to be transferred via RDMA is passed to IBTRS as scatter-gather
+list. A scatter-gather list can contain multiple entries.
+Scatter-gather list with less entries require less processing power
+and can therefore transferred faster. The file sg_entries outputs a
+per-CPU distribution table for the number of entries in the
+scatter-gather lists, that were passed to the IBTRS API function
+ibtrs_clt_request (READ or WRITE).
+
+cpu_migration (RW)
+------------------
+
+IBTRS expects that each HCA IRQ is pinned to a separate CPU. If it's
+not the case, the processing of an I/O response could be processed on a
+different CPU than where it was originally submitted.  This file shows
+how many interrupts where generated on a non expected CPU.
+"from:" is the CPU on which the IRQ was expected, but not generated.
+"to:" is the CPU on which the IRQ was generated, but not expected.
+
+reconnects (RW)
+---------------
+
+Contains 2 unsigned int values, the first one records number of successful
+reconnects in the path lifetime, the second one records number of failed
+reconnects in the path lifetime.
+
+rdma_lat (RW)
+-------------
+
+Latency distribution of IBTRS requests.
+The format is:
+   1 ms: <CNT-LAT-READ> <CNT-LAT-WRITE>
+   2 ms: <CNT-LAT-READ> <CNT-LAT-WRITE>
+   4 ms: <CNT-LAT-READ> <CNT-LAT-WRITE>
+   8 ms: <CNT-LAT-READ> <CNT-LAT-WRITE>
+  16 ms: <CNT-LAT-READ> <CNT-LAT-WRITE>
+  ...
+  65536 ms: <CNT-LAT-READ> <CNT-LAT-WRITE>
+  >= 65536 ms: <CNT-LAT-READ> <CNT-LAT-WRITE>
+  maximum ms: <CNT-LAT-READ> <CNT-LAT-WRITE>
+
+wc_completion (RW)
+------------------
+
+Contains 2 unsigned int values, the first one records max number of work
+requests processed in work_completion in session lifetime, the second
+one records average number of work requests processed in work_completion
+in session lifetime.
+
+rdma (RW)
+---------
+
+Contains statistics regarding rdma operations and inflight operations.
+The output consists of 6 values:
+
+<read-count> <read-total-size> <write-count> <write-total-size> \
+<inflights> <failovered>
+
+======================
+Server Sysfs Interface
+======================
+
+Entries under /sys/devices/virtual/ibtrs-server/
+================================================
+
+When a user of IBTRS API creates a new session on a client side, a
+directory entry with the name of that session is created in here.
+
+Entries under /sys/devices/virtual/ibtrs-server/<session-name>/paths/
+=====================================================================
+
+When new path is created by writing to "add_path" entry on client side,
+a directory entry named as <source address>@<destination address> is created
+on server.
+
+Entries under /sys/devices/virtual/ibtrs-server/<session-name>/paths/<src@dst>/
+===============================================================================
+
+disconnect (RW)
+---------------
+
+When "1" is written to the file, the IBTRS session is being disconnected.
+Operations is non-blocking and returns control immediately to the caller.
+
+hca_name (R)
+------------
+
+Contains the the name of HCA the connection established on.
+
+hca_port (R)
+------------
+
+Contains the port number of active port traffic is going through.
+
+src_addr (R)
+------------
+
+Contains the source address of the path
+
+dst_addr (R)
+------------
+
+Contains the destination address of the path
+
+Entries under /sys/devices/virtual/ibtrs-server/<session-name>/paths/<src@dst>/stats/
+=====================================================================================
+
+When "0" is written to a file in this directory, the corresponding counters
+will be reset.
+
+reset_all (RW)
+--------------
+
+Read will return usage help, write 0 will clear all the counters about
+stats.
+
+rdma (RW)
+---------
+
+Contains statistics regarding rdma operations and inflight operations.
+The output consists of 5 values:
+
+<read-count> <read-total-size> <write-count> <write-total-size> <inflights>
+
+wc_completion (RW)
+------------------
+
+Contains 3 values, the first one is int, records max number of work
+requests processed in work_completion in session lifetime, the second
+one long int records total number of work requests processed in
+work_completion in session lifetime and the 3rd one long int records
+total number of calls to the cq completion handler. Division of 2nd
+number through 3rd gives the average number of completions processed
+in completion handler.
+
+==================
+Transport protocol
+==================
+
+Overview
+--------
+An established connection between a client and a server is called ibtrs
+session. A session is associated with a set of memory chunks reserved on the
+server side for a given client for rdma transfer. A session
+consists of multiple paths, each representing a separate physical link
+between client and server. Those are used for load balancing and failover.
+Each path consists of as many connections (QPs) as there are cpus on
+the client.
+
+When processing an incoming rdma write or read request ibtrs client uses memory
+chunks reserved for him on the server side. Their number, size and addresses
+need to be exchanged between client and server during the connection
+establishment phase. Apart from the memory related information client needs to
+inform the server about the session name and identify each path and connection
+individually.
+
+On an established session client sends to server write or read messages.
+Server uses immediate field to tell the client which request is being
+acknowledged and for errno. Client uses immediate field to tell the server
+which of the memory chunks has been accessed and at which offset the message
+can be found.
+
+Connection establishment
+------------------------
+
+1. Client starts establishing connections belonging to a path of a session one
+by one via attaching IBTRS_MSG_CON_REQ messages to the rdma_connect requests.
+Those include uuid of the session and uuid of the path to be
+established. They are used by the server to find a persisting session/path or
+to create a new one when necessary. The message also contains the protocol
+version and magic for compatibility, total number of connections per session
+(as many as cpus on the client), the id of the current connection and
+the reconnect counter, which is used to resolve the situations where
+client is trying to reconnect a path, while server is still destroying the old
+one.
+
+2. Server accepts the connection requests one by one and attaches
+IBTRS_MSG_CONN_RSP messages to the rdma_accept. Apart from magic and
+protocol version, the messages include error code, queue depth supported by
+the server (number of memory chunks which are going to be allocated for that
+session) and the maximum size of one io.
+
+3. After all connections of a path are established client sends to server the
+IBTRS_MSG_INFO_REQ message, containing the name of the session. This message
+requests the address information from the server.
+
+4. Server replies to the session info request message with IBTRS_MSG_INFO_RSP,
+which contains the addresses and keys of the RDMA buffers allocated for that
+session.
+
+5. Session becomes connected after all paths to be established are connected
+(i.e. steps 1-4 finished for all paths requested for a session)
+
+6. Server and client exchange periodically heartbeat messages (empty rdma
+messages with an immediate field) which are used to detect a crash on remote
+side or network outage in an absence of IO.
+
+7. On any RDMA related error or in the case of a heartbeat timeout, the
+corresponding path is disconnected, all the inflight IO are failed over to a
+healthy path, if any, and the reconnect mechanism is triggered.
+
+CLT                                     SRV
+*for each connection belonging to a path and for each path:
+IBTRS_MSG_CON_REQ  ------------------->
+                   <------------------- IBTRS_MSG_CON_RSP
+...
+*after all connections are established:
+IBTRS_MSG_INFO_REQ ------------------->
+                   <------------------- IBTRS_MSG_INFO_RSP
+*heartbeat is started from both sides:
+                   -------------------> [IBTRS_HB_MSG_IMM]
+[IBTRS_HB_MSG_ACK] <-------------------
+[IBTRS_HB_MSG_IMM] <-------------------
+                   -------------------> [IBTRS_HB_MSG_ACK]
+
+IO path
+-------
+
+* Write *
+
+1. When processing a write request client selects one of the memory chunks
+on the server side and rdma writes there the user data, user header and the
+IBTRS_MSG_RDMA_WRITE message. Apart from the type (write), the message only
+contains size of the user header. The client tells the server which chunk has
+been accessed and at what offset the IBTRS_MSG_RDMA_WRITE can be found by
+using the IMM field.
+
+2. When confirming a write request server sends an "empty" rdma message with
+an immediate field. The 32 bit field is used to specify the outstanding
+inflight IO and for the error code.
+
+CLT                                                          SRV
+usr_data + usr_hdr + ibtrs_msg_rdma_write -----------------> [IBTRS_IO_REQ_IMM]
+[IBTRS_IO_RSP_IMM]                        <----------------- (id + errno)
+
+* Read *
+
+1. When processing a read request client selects one of the memory chunks
+on the server side and rdma writes there the user header and the
+IBTRS_MSG_RDMA_READ message. This message contains the type (read), size of
+the user header, flags (specifying if memory invalidation is necessary) and the
+list of addresses along with keys for the data to be read into.
+
+2. When confirming a read request server transfers the requested data first,
+attaches an invalidation message if requested and finally an "empty" rdma
+message with an immediate field. The 32 bit field is used to specify the
+outstanding inflight IO and the error code.
+
+CLT                                           SRV
+usr_hdr + ibtrs_msg_rdma_read --------------> [IBTRS_IO_REQ_IMM]
+[IBTRS_IO_RSP_IMM]            <-------------- usr_data + (id + errno)
+or in case client requested invalidation:
+[IBTRS_IO_RSP_IMM_W_INV]      <-------------- usr_data + (INV) + (id + errno)
+
-- 
2.17.1


^ permalink raw reply related

* [RFC/RFT 06/14] soc: amlogic: meson-clk-measure: add G12B second cluster cpu clk
From: Neil Armstrong @ 2019-06-20 15:00 UTC (permalink / raw)
  To: jbrunet, khilman
  Cc: Neil Armstrong, martin.blumenstingl, linux-kernel, linux-amlogic,
	linux-clk, linux-arm-kernel
In-Reply-To: <20190620150013.13462-1-narmstrong@baylibre.com>

Add the G12B second CPU cluster CPU and SYS_PLL measure IDs.

These IDs returns 0Hz on G12A.

Signed-off-by: Neil Armstrong <narmstrong@baylibre.com>
---
 drivers/soc/amlogic/meson-clk-measure.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/soc/amlogic/meson-clk-measure.c b/drivers/soc/amlogic/meson-clk-measure.c
index c470e24f1dfa..f09b404b39d3 100644
--- a/drivers/soc/amlogic/meson-clk-measure.c
+++ b/drivers/soc/amlogic/meson-clk-measure.c
@@ -324,6 +324,8 @@ static struct meson_msr_id clk_msr_g12a[CLK_MSR_MAX] = {
 	CLK_MSR_ID(84, "co_tx"),
 	CLK_MSR_ID(89, "hdmi_todig"),
 	CLK_MSR_ID(90, "hdmitx_sys"),
+	CLK_MSR_ID(91, "sys_cpub_div16"),
+	CLK_MSR_ID(92, "sys_pll_cpub_div16"),
 	CLK_MSR_ID(94, "eth_phy_rx"),
 	CLK_MSR_ID(95, "eth_phy_pll"),
 	CLK_MSR_ID(96, "vpu_b"),
-- 
2.21.0


_______________________________________________
linux-amlogic mailing list
linux-amlogic@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-amlogic

^ permalink raw reply related


This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.