All of lore.kernel.org
 help / color / mirror / Atom feed
* Re: Re: [PATCH 3/3] arm64: allwinner: dts: h6: add Wi-Fi support for Pine H64 model A/B
From: Chen-Yu Tsai @ 2018-07-24  2:37 UTC (permalink / raw)
  To: Icenowy Zheng
  Cc: Maxime Ripard, linux-arm-kernel, devicetree, linux-kernel,
	linux-sunxi
In-Reply-To: <AC678B03-19B7-4110-B333-72188CE71296-h8G6r0blFSE@public.gmane.org>

On Tue, Jul 24, 2018 at 10:28 AM, Icenowy Zheng <icenowy-h8G6r0blFSE@public.gmane.org> wrote:
>
>
> 于 2018年7月24日 GMT+08:00 上午10:26:02, Chen-Yu Tsai <wens-jdAy2FN1RRM@public.gmane.org> 写到:
>>On Tue, Jul 24, 2018 at 10:23 AM, Icenowy Zheng <icenowy-h8G6r0blFSE@public.gmane.org>
>>wrote:
>>>
>>>
>>> 于 2018年7月24日 GMT+08:00 上午10:21:59, Chen-Yu Tsai <wens-jdAy2FN1RRM@public.gmane.org> 写到:
>>>>On Tue, Jul 24, 2018 at 9:15 AM, Icenowy Zheng <icenowy-h8G6r0blFSE@public.gmane.org>
>>wrote:
>>>>> The Pine H64 model A has a Wi-Fi module connector and the model B
>>has
>>>>an
>>>>> on-board RTL8723BS Wi-Fi module.
>>>>>
>>>>> Add support for them. For model A, as it's not defaultly present,
>>>>keep
>>>>> it disabled now.
>>>>
>>>>Nope. Pine64 actually has two WiFi/BT modules. And they require
>>>>different
>>>>device tree snippets for both the WiFi and BT side. This is better
>>>>resolved
>>>>with device tree overlays.
>>>>
>>>>I have both, though I've yet found time to work on them.
>>>
>>> I have also both.
>>>
>>> The skeleton here can get the Wi-Fi of both to work.
>>
>>Cool. Then I can put away my RTL module for now. :)
>
> P.S. SDIO is auto detectable, and for BCM chips, the OOB interrupt
> is only a bonus function and it can fall back to standard in-band
> interrupt (which doesn't need special binding, and is currently
> used by mainline r8723bs driver.)

Correct. With BT you'll have serdev device nodes with different
compatibles. Then you'll have to resort to overlays, and you'd probably
end up adding WiFi OOB interrupt bits as well.

So the question remaining is: should we enable the MMC part, along
with power sequencing and regulator supplies, by default? Thinking
more about it, I'm actually OK with it. The board connectors are
clearly marked as being for a WiFi+BT module. The whole space on
the board is surrounded by a box in silkscreen. Sorry for the
initial nack.

Maxime, any thoughts?

>>
>>ChenYu
>>
>>>
>>>>
>>>>ChenYu
>>>>
>>>>> Signed-off-by: Icenowy Zheng <icenowy-h8G6r0blFSE@public.gmane.org>
>>>>> ---
>>>>>  .../allwinner/sun50i-h6-pine-h64-model-b.dts  |  8 +++++
>>>>>  .../boot/dts/allwinner/sun50i-h6-pine-h64.dts | 29
>>>>+++++++++++++++++++
>>>>>  2 files changed, 37 insertions(+)
>>>>>
>>>>> diff --git
>>>>a/arch/arm64/boot/dts/allwinner/sun50i-h6-pine-h64-model-b.dts
>>>>b/arch/arm64/boot/dts/allwinner/sun50i-h6-pine-h64-model-b.dts
>>>>> index d0fcc25efb00..d0f775613c9b 100644
>>>>> --- a/arch/arm64/boot/dts/allwinner/sun50i-h6-pine-h64-model-b.dts
>>>>> +++ b/arch/arm64/boot/dts/allwinner/sun50i-h6-pine-h64-model-b.dts
>>>>> @@ -18,3 +18,11 @@
>>>>>                 };
>>>>>         };
>>>>>  };
>>>>> +
>>>>> +&mmc1 {
>>>>> +       status = "okay";
>>>>> +};
>>>>> +
>>>>> +&wifi_pwrseq {
>>>>> +       status = "okay";
>>>>> +};
>>>>> diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h6-pine-h64.dts
>>>>b/arch/arm64/boot/dts/allwinner/sun50i-h6-pine-h64.dts
>>>>> index a85867f8b684..75db6d4139bf 100644
>>>>> --- a/arch/arm64/boot/dts/allwinner/sun50i-h6-pine-h64.dts
>>>>> +++ b/arch/arm64/boot/dts/allwinner/sun50i-h6-pine-h64.dts
>>>>> @@ -40,6 +40,12 @@
>>>>>                         gpios = <&r_pio 0 7 GPIO_ACTIVE_HIGH>; /*
>>PL7
>>>>*/
>>>>>                 };
>>>>>         };
>>>>> +
>>>>> +       wifi_pwrseq: wifi_pwrseq {
>>>>> +               compatible = "mmc-pwrseq-simple";
>>>>> +               reset-gpios = <&r_pio 1 3 GPIO_ACTIVE_LOW>; /* PL2
>>*/
>>>>> +               status = "disabled";
>>>>> +       };
>>>>>  };
>>>>>
>>>>>  &mmc0 {
>>>>> @@ -50,6 +56,17 @@
>>>>>         status = "okay";
>>>>>  };
>>>>>
>>>>> +&mmc1 {
>>>>> +       pinctrl-names = "default";
>>>>> +       pinctrl-0 = <&mmc1_pins>;
>>>>> +       vmmc-supply = <&reg_cldo2>;
>>>>> +       vqmmc-supply = <&reg_bldo2>;
>>>>> +       mmc-pwrseq = <&wifi_pwrseq>;
>>>>> +       bus-width = <4>;
>>>>> +       non-removable;
>>>>> +       status = "disabled";
>>>>> +};
>>>>> +
>>>>>  &mmc2 {
>>>>>         pinctrl-names = "default";
>>>>>         pinctrl-0 = <&mmc2_pins>;
>>>>> @@ -128,12 +145,24 @@
>>>>>                         };
>>>>>
>>>>>                         reg_cldo2: cldo2 {
>>>>> +                               /*
>>>>> +                                * This regulator is connected with
>>>>CLDO3.
>>>>> +                                * Before the kernel can support
>>>>synchronized
>>>>> +                                * enable of coupled regulators,
>>keep
>>>>them
>>>>> +                                * both always on as a ugly hack.
>>>>> +                                */
>>>>> +                               regulator-always-on;
>>>>>                                 regulator-min-microvolt =
>><3300000>;
>>>>>                                 regulator-max-microvolt =
>><3300000>;
>>>>>                                 regulator-name = "vcc-wifi-1";
>>>>>                         };
>>>>>
>>>>>                         reg_cldo3: cldo3 {
>>>>> +                               /*
>>>>> +                                * This regulator is connected with
>>>>CLDO2.
>>>>> +                                * See the comments for CLDO2.
>>>>> +                                */
>>>>> +                               regulator-always-on;
>>>>>                                 regulator-min-microvolt =
>><3300000>;
>>>>>                                 regulator-max-microvolt =
>><3300000>;
>>>>>                                 regulator-name = "vcc-wifi-2";
>>>>> --
>>>>> 2.18.0
>>>>>
>>>
>>> --
>>> You received this message because you are subscribed to the Google
>>Groups "linux-sunxi" group.
>>> To unsubscribe from this group and stop receiving emails from it,
>>send an email to linux-sunxi+unsubscribe-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org
>>> For more options, visit https://groups.google.com/d/optout.
>
> --
> You received this message because you are subscribed to the Google Groups "linux-sunxi" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to linux-sunxi+unsubscribe-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org
> For more options, visit https://groups.google.com/d/optout.

-- 
You received this message because you are subscribed to the Google Groups "linux-sunxi" group.
To unsubscribe from this group and stop receiving emails from it, send an email to linux-sunxi+unsubscribe-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org
For more options, visit https://groups.google.com/d/optout.

^ permalink raw reply

* [linux-sunxi] Re: [PATCH 3/3] arm64: allwinner: dts: h6: add Wi-Fi support for Pine H64 model A/B
From: Chen-Yu Tsai @ 2018-07-24  2:37 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <AC678B03-19B7-4110-B333-72188CE71296@aosc.io>

On Tue, Jul 24, 2018 at 10:28 AM, Icenowy Zheng <icenowy@aosc.io> wrote:
>
>
> 于 2018年7月24日 GMT+08:00 上午10:26:02, Chen-Yu Tsai <wens@csie.org> 写到:
>>On Tue, Jul 24, 2018 at 10:23 AM, Icenowy Zheng <icenowy@aosc.io>
>>wrote:
>>>
>>>
>>> 于 2018年7月24日 GMT+08:00 上午10:21:59, Chen-Yu Tsai <wens@csie.org> 写到:
>>>>On Tue, Jul 24, 2018 at 9:15 AM, Icenowy Zheng <icenowy@aosc.io>
>>wrote:
>>>>> The Pine H64 model A has a Wi-Fi module connector and the model B
>>has
>>>>an
>>>>> on-board RTL8723BS Wi-Fi module.
>>>>>
>>>>> Add support for them. For model A, as it's not defaultly present,
>>>>keep
>>>>> it disabled now.
>>>>
>>>>Nope. Pine64 actually has two WiFi/BT modules. And they require
>>>>different
>>>>device tree snippets for both the WiFi and BT side. This is better
>>>>resolved
>>>>with device tree overlays.
>>>>
>>>>I have both, though I've yet found time to work on them.
>>>
>>> I have also both.
>>>
>>> The skeleton here can get the Wi-Fi of both to work.
>>
>>Cool. Then I can put away my RTL module for now. :)
>
> P.S. SDIO is auto detectable, and for BCM chips, the OOB interrupt
> is only a bonus function and it can fall back to standard in-band
> interrupt (which doesn't need special binding, and is currently
> used by mainline r8723bs driver.)

Correct. With BT you'll have serdev device nodes with different
compatibles. Then you'll have to resort to overlays, and you'd probably
end up adding WiFi OOB interrupt bits as well.

So the question remaining is: should we enable the MMC part, along
with power sequencing and regulator supplies, by default? Thinking
more about it, I'm actually OK with it. The board connectors are
clearly marked as being for a WiFi+BT module. The whole space on
the board is surrounded by a box in silkscreen. Sorry for the
initial nack.

Maxime, any thoughts?

>>
>>ChenYu
>>
>>>
>>>>
>>>>ChenYu
>>>>
>>>>> Signed-off-by: Icenowy Zheng <icenowy@aosc.io>
>>>>> ---
>>>>>  .../allwinner/sun50i-h6-pine-h64-model-b.dts  |  8 +++++
>>>>>  .../boot/dts/allwinner/sun50i-h6-pine-h64.dts | 29
>>>>+++++++++++++++++++
>>>>>  2 files changed, 37 insertions(+)
>>>>>
>>>>> diff --git
>>>>a/arch/arm64/boot/dts/allwinner/sun50i-h6-pine-h64-model-b.dts
>>>>b/arch/arm64/boot/dts/allwinner/sun50i-h6-pine-h64-model-b.dts
>>>>> index d0fcc25efb00..d0f775613c9b 100644
>>>>> --- a/arch/arm64/boot/dts/allwinner/sun50i-h6-pine-h64-model-b.dts
>>>>> +++ b/arch/arm64/boot/dts/allwinner/sun50i-h6-pine-h64-model-b.dts
>>>>> @@ -18,3 +18,11 @@
>>>>>                 };
>>>>>         };
>>>>>  };
>>>>> +
>>>>> +&mmc1 {
>>>>> +       status = "okay";
>>>>> +};
>>>>> +
>>>>> +&wifi_pwrseq {
>>>>> +       status = "okay";
>>>>> +};
>>>>> diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h6-pine-h64.dts
>>>>b/arch/arm64/boot/dts/allwinner/sun50i-h6-pine-h64.dts
>>>>> index a85867f8b684..75db6d4139bf 100644
>>>>> --- a/arch/arm64/boot/dts/allwinner/sun50i-h6-pine-h64.dts
>>>>> +++ b/arch/arm64/boot/dts/allwinner/sun50i-h6-pine-h64.dts
>>>>> @@ -40,6 +40,12 @@
>>>>>                         gpios = <&r_pio 0 7 GPIO_ACTIVE_HIGH>; /*
>>PL7
>>>>*/
>>>>>                 };
>>>>>         };
>>>>> +
>>>>> +       wifi_pwrseq: wifi_pwrseq {
>>>>> +               compatible = "mmc-pwrseq-simple";
>>>>> +               reset-gpios = <&r_pio 1 3 GPIO_ACTIVE_LOW>; /* PL2
>>*/
>>>>> +               status = "disabled";
>>>>> +       };
>>>>>  };
>>>>>
>>>>>  &mmc0 {
>>>>> @@ -50,6 +56,17 @@
>>>>>         status = "okay";
>>>>>  };
>>>>>
>>>>> +&mmc1 {
>>>>> +       pinctrl-names = "default";
>>>>> +       pinctrl-0 = <&mmc1_pins>;
>>>>> +       vmmc-supply = <&reg_cldo2>;
>>>>> +       vqmmc-supply = <&reg_bldo2>;
>>>>> +       mmc-pwrseq = <&wifi_pwrseq>;
>>>>> +       bus-width = <4>;
>>>>> +       non-removable;
>>>>> +       status = "disabled";
>>>>> +};
>>>>> +
>>>>>  &mmc2 {
>>>>>         pinctrl-names = "default";
>>>>>         pinctrl-0 = <&mmc2_pins>;
>>>>> @@ -128,12 +145,24 @@
>>>>>                         };
>>>>>
>>>>>                         reg_cldo2: cldo2 {
>>>>> +                               /*
>>>>> +                                * This regulator is connected with
>>>>CLDO3.
>>>>> +                                * Before the kernel can support
>>>>synchronized
>>>>> +                                * enable of coupled regulators,
>>keep
>>>>them
>>>>> +                                * both always on as a ugly hack.
>>>>> +                                */
>>>>> +                               regulator-always-on;
>>>>>                                 regulator-min-microvolt =
>><3300000>;
>>>>>                                 regulator-max-microvolt =
>><3300000>;
>>>>>                                 regulator-name = "vcc-wifi-1";
>>>>>                         };
>>>>>
>>>>>                         reg_cldo3: cldo3 {
>>>>> +                               /*
>>>>> +                                * This regulator is connected with
>>>>CLDO2.
>>>>> +                                * See the comments for CLDO2.
>>>>> +                                */
>>>>> +                               regulator-always-on;
>>>>>                                 regulator-min-microvolt =
>><3300000>;
>>>>>                                 regulator-max-microvolt =
>><3300000>;
>>>>>                                 regulator-name = "vcc-wifi-2";
>>>>> --
>>>>> 2.18.0
>>>>>
>>>
>>> --
>>> You received this message because you are subscribed to the Google
>>Groups "linux-sunxi" group.
>>> To unsubscribe from this group and stop receiving emails from it,
>>send an email to linux-sunxi+unsubscribe at googlegroups.com.
>>> For more options, visit https://groups.google.com/d/optout.
>
> --
> You received this message because you are subscribed to the Google Groups "linux-sunxi" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to linux-sunxi+unsubscribe at googlegroups.com.
> For more options, visit https://groups.google.com/d/optout.

^ permalink raw reply

* Re: [PATCH v3] sshd: add sshd.service
From: Zheng, Ruoqin @ 2018-07-24  2:37 UTC (permalink / raw)
  To: Burton, Ross; +Cc: OE-core
In-Reply-To: <CAJTo0LZTO7OxZokj2pymX7PaLqBRq5KiyyBij3Ncq8j59YrE7w@mail.gmail.com>

Hi Ross:
        I want to add this because in Ubuntu and Fedora, sshd.socket and sshd.service can coexist.
        So maybe we can provide both of them, and users can choose whichever they want.

--------------------------------------------------
Zheng Ruoqin
Nanjing Fujitsu Nanda Software Tech. Co., Ltd.(FNST)
ADDR.: No.6 Wenzhu Road, Software Avenue,
       Nanjing, 210012, China
MAIL : zhengrq.fnst@cn.fujitsu.com

-----Original Message-----
From: Burton, Ross [mailto:ross.burton@intel.com] 
Sent: Monday, July 23, 2018 6:18 PM
To: Zheng, Ruoqin/郑 若钦 <zhengrq.fnst@cn.fujitsu.com>
Cc: OE-core <openembedded-core@lists.openembedded.org>
Subject: Re: [OE-core] [PATCH v3] sshd: add sshd.service

Still no explanation why you'd want to do this and not use socket activation.

Ross

On 23 July 2018 at 10:02, Zheng Ruoqin <zhengrq.fnst@cn.fujitsu.com> wrote:
> Add sshd.service for user to start the sshd daemon.
>
> Signed-off-by: Zheng Ruoqin <zhengrq.fnst@cn.fujitsu.com>
> ---
>  meta/recipes-connectivity/openssh/openssh/sshd.service | 16 ++++++++++++++++
>  meta/recipes-connectivity/openssh/openssh_7.7p1.bb     |  6 ++++++
>  2 files changed, 22 insertions(+)
>  create mode 100644 
> meta/recipes-connectivity/openssh/openssh/sshd.service
>
> diff --git a/meta/recipes-connectivity/openssh/openssh/sshd.service 
> b/meta/recipes-connectivity/openssh/openssh/sshd.service
> new file mode 100644
> index 0000000..2d2717d
> --- /dev/null
> +++ b/meta/recipes-connectivity/openssh/openssh/sshd.service
> @@ -0,0 +1,16 @@
> +[Unit]
> +Description=OpenSSH server daemon
> +Wants=sshdgenkeys.service
> +After=sshdgenkeys.service
> +
> +[Service]
> +Environment="SSHD_OPTS="
> +EnvironmentFile=-/etc/default/ssh
> +ExecStart=-@SBINDIR@/sshd -i $SSHD_OPTS
> +ExecReload=@BASE_BINDIR@/kill -HUP $MAINPID
> +KillMode=process
> +Restart=on-failure
> +RestartSec=42s
> +
> +[Install]
> +WantedBy=multi-user.target
> diff --git a/meta/recipes-connectivity/openssh/openssh_7.7p1.bb 
> b/meta/recipes-connectivity/openssh/openssh_7.7p1.bb
> index b3da5f6..b4f4c6d 100644
> --- a/meta/recipes-connectivity/openssh/openssh_7.7p1.bb
> +++ b/meta/recipes-connectivity/openssh/openssh_7.7p1.bb
> @@ -17,6 +17,7 @@ SRC_URI = "http://ftp.openbsd.org/pub/OpenBSD/OpenSSH/portable/openssh-${PV}.tar
>             file://ssh_config \
>             file://init \
>             ${@bb.utils.contains('DISTRO_FEATURES', 'pam', 
> '${PAM_SRC_URI}', '', d)} \
> +           ${@bb.utils.contains('SSHD_DAEMON', 'service', 
> + '${SSHD_DAEMON_SRC_URI}', '', d)} \
>             file://sshd.socket \
>             file://sshd@.service \
>             file://sshdgenkeys.service \
> @@ -30,6 +31,8 @@ SRC_URI = "http://ftp.openbsd.org/pub/OpenBSD/OpenSSH/portable/openssh-${PV}.tar
>
>  PAM_SRC_URI = "file://sshd"
>
> +SSHD_DAEMON_SRC_URI = "file://sshd.service"
> +
>  SRC_URI[md5sum] = "68ba883aff6958297432e5877e9a0fe2"
>  SRC_URI[sha256sum] = "d73be7e684e99efcd024be15a30bffcbe41b012b2f7b3c9084aed621775e6b8f"
>
> @@ -111,6 +114,9 @@ do_install_append () {
>         echo "HostKey /var/run/ssh/ssh_host_ed25519_key" >> 
> ${D}${sysconfdir}/ssh/sshd_config_readonly
>
>         install -d ${D}${systemd_unitdir}/system
> +       if [ "${@bb.utils.filter('SSHD_DAEMON', 'service', d)}" ]; then
> +               install -c -m 0644 ${WORKDIR}/sshd.service ${D}${systemd_unitdir}/system
> +        fi
>         install -c -m 0644 ${WORKDIR}/sshd.socket ${D}${systemd_unitdir}/system
>         install -c -m 0644 ${WORKDIR}/sshd@.service ${D}${systemd_unitdir}/system
>         install -c -m 0644 ${WORKDIR}/sshdgenkeys.service 
> ${D}${systemd_unitdir}/system
> --
> 2.7.4
>
>
>
> --
> _______________________________________________
> Openembedded-core mailing list
> Openembedded-core@lists.openembedded.org
> http://lists.openembedded.org/mailman/listinfo/openembedded-core





^ permalink raw reply

* Re: [RFC 1/2] x86/entry/64: Use the TSS sp2 slot for rsp_scratch
From: Andy Lutomirski @ 2018-07-24  2:36 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Andy Lutomirski, X86 ML, LKML, Borislav Petkov, Linus Torvalds
In-Reply-To: <854ac759-efec-3e35-59a9-8da35b2b5156@linux.intel.com>

On Mon, Jul 23, 2018 at 5:38 AM, Dave Hansen
<dave.hansen@linux.intel.com> wrote:
> On 07/22/2018 10:45 AM, Andy Lutomirski wrote:
>> +     /*
>> +      * sp2 is scratch space used by the SYSCALL64 handler.  Linux does
>> +      * not use ring 2, so sp2 is not otherwise needed.
>> +      */
>>       u64                     sp2;
>
> Could we call out the actual thing that we use this slot for, and the
> symbol name so folks can find the corresponding code that does this?
> While I know the spot in entry_64 you're talking about, it might not be
> patently obvious to everyone, and it's also a bit more challenging to
> grep for than normal C code.
>
> Maybe:
>
>         /*
>          * Since Linux does not use ring 2, the 'sp2' slot is unused.
>          * entry_SYSCALL_64 uses this as scratch space to stash the user
>          * %RSP value.
>          */

I'll improve this for v2.

^ permalink raw reply

* [PATCH] net/ixgbe: remove hardcoded CRC STRIP config from ixgbe
From: Wei Zhao @ 2018-07-24  2:36 UTC (permalink / raw)
  To: dev; +Cc: ferruh.yigit, stable, Wei Zhao, Wenzhuo Lu

There is a CRC-related ifdef for ixgbe:
CONFIG_RTE_LIBRTE_IXGBE_PF_DISABLE_STRIP_CRC=n
It is used in the VF driver's ixgbevf_dev_configure() function.
The VF cannot change the CRC strip behavior, and based on what the PF
configured it needs to respond properly to the user's
ixgbevf_dev_configure() request. Right now what the PF sets is
defined by the above config option, but this method is too static.

Signed-off-by: Wei Zhao <wei.zhao1@intel.com>
Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 app/test-pmd/parameters.c        |  6 ++++++
 app/test-pmd/testpmd.c           |  2 ++
 app/test-pmd/testpmd.h           |  1 +
 drivers/net/ixgbe/ixgbe_ethdev.c | 20 ++++++++++----------
 lib/librte_ethdev/rte_ethdev.h   |  1 +
 5 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/app/test-pmd/parameters.c b/app/test-pmd/parameters.c
index 962fad7..b981b0f 100644
--- a/app/test-pmd/parameters.c
+++ b/app/test-pmd/parameters.c
@@ -188,6 +188,7 @@ usage(char* progname)
 	printf("  --tx-offloads=0xXXXXXXXX: hexadecimal bitmask of TX queue offloads\n");
 	printf("  --hot-plug: enable hot plug for device.\n");
 	printf("  --vxlan-gpe-port=N: UPD port of tunnel VXLAN-GPE\n");
+	printf("  --pf-crc-keep: disable pf CRC strip function for device\n");
 	printf("  --mlockall: lock all memory\n");
 	printf("  --no-mlockall: do not lock all memory\n");
 }
@@ -623,6 +624,7 @@ launch_args_parse(int argc, char** argv)
 		{ "tx-offloads",		1, 0, 0 },
 		{ "hot-plug",			0, 0, 0 },
 		{ "vxlan-gpe-port",		1, 0, 0 },
+		{ "pf-crc-keep",		0, 0, 0 },
 		{ "mlockall",			0, 0, 0 },
 		{ "no-mlockall",		0, 0, 0 },
 		{ 0, 0, 0, 0 },
@@ -1131,6 +1133,9 @@ launch_args_parse(int argc, char** argv)
 					rte_exit(EXIT_FAILURE,
 						 "vxlan-gpe-port must be >= 0\n");
 			}
+			if (!strcmp(lgopts[opt_idx].name, "pf-crc-keep")) {
+				rx_offloads_disable |= DEV_RX_OFFLOAD_CRC_STRIP;
+			}
 			if (!strcmp(lgopts[opt_idx].name, "print-event"))
 				if (parse_event_printing_config(optarg, 1)) {
 					rte_exit(EXIT_FAILURE,
@@ -1163,4 +1168,5 @@ launch_args_parse(int argc, char** argv)
 	/* Set offload configuration from command line parameters. */
 	rx_mode.offloads = rx_offloads;
 	tx_mode.offloads = tx_offloads;
+	rx_mode.offloads_disable = rx_offloads_disable;
 }
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index d3ce92f..c94328a 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -287,6 +287,8 @@ uint8_t rmv_interrupt = 1; /* enabled by default */
 
 uint8_t hot_plug = 0; /**< hotplug disabled by default. */
 
+uint64_t rx_offloads_disable = 0;  /**< rx offload enabled by default. */
+
 /*
  * Display or mask ether events
  * Default to all events except VF_MBOX
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index 4fc30a8..d9734d3 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -313,6 +313,7 @@ extern uint32_t event_print_mask;
 /**< set by "--print-event xxxx" and "--mask-event xxxx parameters */
 extern uint8_t hot_plug; /**< enable by "--hot-plug" parameter */
 extern int do_mlockall; /**< set by "--mlockall" or "--no-mlockall" parameter */
+extern uint64_t rx_offloads_disable;
 
 #ifdef RTE_LIBRTE_IXGBE_BYPASS
 extern uint32_t bypass_timeout; /**< Store the NIC bypass watchdog timeout */
diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
index 26b1927..25c1187 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.c
+++ b/drivers/net/ixgbe/ixgbe_ethdev.c
@@ -5007,17 +5007,17 @@ ixgbevf_dev_configure(struct rte_eth_dev *dev)
 	 * VF has no ability to enable/disable HW CRC
 	 * Keep the persistent behavior the same as Host PF
 	 */
-#ifndef RTE_LIBRTE_IXGBE_PF_DISABLE_STRIP_CRC
-	if (rte_eth_dev_must_keep_crc(conf->rxmode.offloads)) {
-		PMD_INIT_LOG(NOTICE, "VF can't disable HW CRC Strip");
-		conf->rxmode.offloads |= DEV_RX_OFFLOAD_CRC_STRIP;
-	}
-#else
-	if (!rte_eth_dev_must_keep_crc(conf->rxmode.offloads)) {
-		PMD_INIT_LOG(NOTICE, "VF can't enable HW CRC Strip");
-		conf->rxmode.offloads &= ~DEV_RX_OFFLOAD_CRC_STRIP;
+	if (conf->rxmode.offloads_disable & DEV_RX_OFFLOAD_CRC_STRIP) {
+		if (rte_eth_dev_must_keep_crc(conf->rxmode.offloads)) {
+			PMD_INIT_LOG(NOTICE, "VF can't disable HW CRC Strip");
+			conf->rxmode.offloads |= DEV_RX_OFFLOAD_CRC_STRIP;
+		}
+	} else {
+		if (!rte_eth_dev_must_keep_crc(conf->rxmode.offloads)) {
+			PMD_INIT_LOG(NOTICE, "VF can't enable HW CRC Strip");
+			conf->rxmode.offloads &= ~DEV_RX_OFFLOAD_CRC_STRIP;
+		}
 	}
-#endif
 
 	/*
 	 * Initialize to TRUE. If any of Rx queues doesn't meet the bulk
diff --git a/lib/librte_ethdev/rte_ethdev.h b/lib/librte_ethdev/rte_ethdev.h
index f5f593b..35a2dd1 100644
--- a/lib/librte_ethdev/rte_ethdev.h
+++ b/lib/librte_ethdev/rte_ethdev.h
@@ -334,6 +334,7 @@ struct rte_eth_rxmode {
 	 * structure are allowed to be set.
 	 */
 	uint64_t offloads;
+	uint64_t offloads_disable;
 };
 
 /**
-- 
2.7.5

^ permalink raw reply related

* [RFC PATCH 25/25] staging: erofs: introduce cached decompression
From: Gao Xiang @ 2018-07-24  2:36 UTC (permalink / raw)

In-Reply-To: <1532399805-65674-1-git-send-email-gaoxiang25@huawei.com>

This patch adds an optional choice which can be
enabled by users in order to cache both incomplete
ends of compressed clusters as a complement to
the in-place decompression in order to boost random
read, but it costs more memory than the in-place
decompression only.

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 drivers/staging/erofs/Kconfig     |  38 ++++++
 drivers/staging/erofs/internal.h  |  25 ++++
 drivers/staging/erofs/super.c     |  73 ++++++++++
 drivers/staging/erofs/unzip_vle.c | 275 ++++++++++++++++++++++++++++++++++++++
 drivers/staging/erofs/utils.c     |  17 ++-
 5 files changed, 427 insertions(+), 1 deletion(-)

diff --git a/drivers/staging/erofs/Kconfig b/drivers/staging/erofs/Kconfig
index b55ce1c..663b755 100644
--- a/drivers/staging/erofs/Kconfig
+++ b/drivers/staging/erofs/Kconfig
@@ -101,3 +101,41 @@ config EROFS_FS_CLUSTER_PAGE_LIMIT
 	  than 2. Otherwise, the image cannot be mounted
 	  correctly on this kernel.
 
+choice
+	prompt "EROFS VLE Data Decompression mode"
+	depends on EROFS_FS_ZIP
+	default EROFS_FS_ZIP_CACHE_BIPOLAR
+	help
+	  EROFS supports three options for VLE decompression.
+	  "In-place Decompression Only" consumes the minimum memory
+	  with lowest random read.
+
+	  "Bipolar Cached Decompression" consumes the maximum memory
+	  with highest random read.
+
+	  If unsure, select "Bipolar Cached Decompression"
+
+config EROFS_FS_ZIP_NO_CACHE
+	bool "In-place Decompression Only"
+	help
+	  Read compressed data into page cache and do in-place
+	  decompression directly.
+
+config EROFS_FS_ZIP_CACHE_UNIPOLAR
+	bool "Unipolar Cached Decompression"
+	help
+	  For each request, it caches the last compressed page
+	  for further reading.
+	  It still decompresses in place for the rest compressed pages.
+
+config EROFS_FS_ZIP_CACHE_BIPOLAR
+	bool "Bipolar Cached Decompression"
+	help
+	  For each request, it caches the both end compressed pages
+	  for further reading.
+	  It still decompresses in place for the rest compressed pages.
+
+	  Recommended for performance priority.
+
+endchoice
+
diff --git a/drivers/staging/erofs/internal.h b/drivers/staging/erofs/internal.h
index 676cb1e..4d76f83 100644
--- a/drivers/staging/erofs/internal.h
+++ b/drivers/staging/erofs/internal.h
@@ -60,6 +60,18 @@ struct erofs_fault_info {
 };
 #endif
 
+#ifdef CONFIG_EROFS_FS_ZIP_CACHE_BIPOLAR
+#define EROFS_FS_ZIP_CACHE_LVL	(2)
+#elif defined(EROFS_FS_ZIP_CACHE_UNIPOLAR)
+#define EROFS_FS_ZIP_CACHE_LVL	(1)
+#else
+#define EROFS_FS_ZIP_CACHE_LVL	(0)
+#endif
+
+#if (!defined(EROFS_FS_HAS_MANAGED_CACHE) && (EROFS_FS_ZIP_CACHE_LVL > 0))
+#define EROFS_FS_HAS_MANAGED_CACHE
+#endif
+
 /* EROFS_SUPER_MAGIC_V1 to represent the whole file system */
 #define EROFS_SUPER_MAGIC   EROFS_SUPER_MAGIC_V1
 
@@ -89,6 +101,11 @@ struct erofs_sb_info {
 		spinlock_t lock;
 #endif
 	} workstn;
+
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	struct inode *managed_cache;
+#endif
+
 #endif
 
 	u32 build_time_nsec;
@@ -252,6 +269,14 @@ static inline void erofs_workstation_cleanup_all(struct super_block *sb)
 	erofs_shrink_workstation(EROFS_SB(sb), ~0UL, true);
 }
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+#define EROFS_UNALLOCATED_CACHED_PAGE	((void *)0x5F0EF00D)
+
+extern int try_to_free_cached_page(struct address_space *, struct page *);
+extern int try_to_free_all_cached_pages(struct erofs_sb_info *,
+	struct erofs_workgroup *);
+#endif
+
 #endif
 
 /* we strictly follow PAGE_SIZE and no buffer head yet */
diff --git a/drivers/staging/erofs/super.c b/drivers/staging/erofs/super.c
index a0db717..1c04d74 100644
--- a/drivers/staging/erofs/super.c
+++ b/drivers/staging/erofs/super.c
@@ -264,6 +264,63 @@ static int parse_options(struct super_block *sb, char *options)
 	return 0;
 }
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+
+static const struct address_space_operations managed_cache_aops;
+
+static int managed_cache_releasepage(struct page *page, gfp_t gfp_mask)
+{
+	int ret = 1;	/* 0 - busy */
+	struct address_space *const mapping = page->mapping;
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(mapping->a_ops != &managed_cache_aops);
+
+	if (PagePrivate(page))
+		ret = try_to_free_cached_page(mapping, page);
+
+	return ret;
+}
+
+static void managed_cache_invalidatepage(struct page *page,
+	unsigned int offset, unsigned int length)
+{
+	const unsigned int stop = length + offset;
+
+	BUG_ON(!PageLocked(page));
+
+	/* Check for overflow */
+	BUG_ON(stop > PAGE_SIZE || stop < length);
+
+	if (offset == 0 && stop == PAGE_SIZE)
+		while(!managed_cache_releasepage(page, GFP_NOFS))
+			cond_resched();
+}
+
+static const struct address_space_operations managed_cache_aops = {
+	.releasepage = managed_cache_releasepage,
+	.invalidatepage = managed_cache_invalidatepage,
+};
+
+struct inode *erofs_init_managed_cache(struct super_block *sb)
+{
+	struct inode *inode = new_inode(sb);
+
+	if (unlikely(inode == NULL))
+		return ERR_PTR(-ENOMEM);
+
+	set_nlink(inode, 1);
+	inode->i_size = OFFSET_MAX;
+
+	inode->i_mapping->a_ops = &managed_cache_aops;
+	mapping_set_gfp_mask(inode->i_mapping,
+	                     GFP_NOFS | __GFP_HIGHMEM |
+	                     __GFP_MOVABLE |  __GFP_NOFAIL);
+	return inode;
+}
+
+#endif
+
 static int erofs_read_super(struct super_block *sb,
 	const char *dev_name, void *data, int silent)
 {
@@ -318,6 +375,14 @@ static int erofs_read_super(struct super_block *sb,
 #endif
 #endif
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	sbi->managed_cache = erofs_init_managed_cache(sb);
+	if (IS_ERR(sbi->managed_cache)) {
+		err = PTR_ERR(sbi->managed_cache);
+		goto err_init_managed_cache;
+	}
+#endif
+
 	/* get the root inode */
 	inode = erofs_iget(sb, ROOT_NID(sbi), true);
 	if (IS_ERR(inode)) {
@@ -372,6 +437,10 @@ static int erofs_read_super(struct super_block *sb,
 	if (sb->s_root == NULL)
 		iput(inode);
 err_iget:
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	iput(sbi->managed_cache);
+err_init_managed_cache:
+#endif
 err_parseopt:
 err_sbread:
 	sb->s_fs_info = NULL;
@@ -397,6 +466,10 @@ static void erofs_put_super(struct super_block *sb)
 	infoln("unmounted for %s", sbi->dev_name);
 	__putname(sbi->dev_name);
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	iput(sbi->managed_cache);
+#endif
+
 	mutex_lock(&sbi->umount_mutex);
 
 #ifdef CONFIG_EROFS_FS_ZIP
diff --git a/drivers/staging/erofs/unzip_vle.c b/drivers/staging/erofs/unzip_vle.c
index a739dc6..cd71be9a 100644
--- a/drivers/staging/erofs/unzip_vle.c
+++ b/drivers/staging/erofs/unzip_vle.c
@@ -95,6 +95,111 @@ struct z_erofs_vle_work_builder {
 #define VLE_WORK_BUILDER_INIT()	\
 	{ .work = NULL, .role = Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED }
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+
+static bool grab_managed_cache_pages(struct address_space *mapping,
+				     erofs_blk_t start,
+				     struct page **compressed_pages,
+				     int clusterblks,
+				     bool reserve_allocation)
+{
+	bool noio = true;
+	unsigned int i;
+
+	/* TODO: optimize by introducing find_get_pages_range */
+	for (i = 0; i < clusterblks; ++i) {
+		struct page *page, *found;
+
+		if (READ_ONCE(compressed_pages[i]) != NULL)
+			continue;
+
+		page = found = find_get_page(mapping, start + i);
+		if (found == NULL) {
+			noio = false;
+			if (!reserve_allocation)
+				continue;
+			page = EROFS_UNALLOCATED_CACHED_PAGE;
+		}
+
+		if (NULL == cmpxchg(compressed_pages + i, NULL, page))
+                        continue;
+
+		if (found != NULL)
+			put_page(found);
+	}
+	return noio;
+}
+
+/* called by erofs_shrinker to get rid of all compressed_pages */
+int try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
+				 struct erofs_workgroup *egrp)
+{
+	struct z_erofs_vle_workgroup *const grp =
+		container_of(egrp, struct z_erofs_vle_workgroup, obj);
+	struct address_space *const mapping = sbi->managed_cache->i_mapping;
+	const int clusterpages = erofs_clusterpages(sbi);
+	int i;
+
+	/*
+	 * refcount of workgroup is now freezed as 1,
+	 * therefore no need to worry about available decompression users.
+	 */
+	for (i = 0; i < clusterpages; ++i) {
+		struct page *page = grp->compressed_pages[i];
+
+		if (page == NULL || page->mapping != mapping)
+			continue;
+
+		/* block other users from reclaiming or migrating the page */
+		if (!trylock_page(page))
+			return -EBUSY;
+
+		/* barrier is implied in the following 'unlock_page' */
+		WRITE_ONCE(grp->compressed_pages[i], NULL);
+
+		set_page_private(page, 0);
+		ClearPagePrivate(page);
+
+		unlock_page(page);
+		put_page(page);
+	}
+	return 0;
+}
+
+int try_to_free_cached_page(struct address_space *mapping, struct page *page)
+{
+	struct erofs_sb_info *const sbi = EROFS_SB(mapping->host->i_sb);
+	const unsigned clusterpages = erofs_clusterpages(sbi);
+
+	struct z_erofs_vle_workgroup *grp;
+	int ret = 0;	/* 0 - busy */
+
+	/* prevent the workgroup from being freed */
+	rcu_read_lock();
+	grp = (void *)page_private(page);
+
+	if (erofs_workgroup_try_to_freeze(&grp->obj, 1)) {
+		unsigned i;
+
+		for (i = 0; i < clusterpages; ++i) {
+			if (grp->compressed_pages[i] == page) {
+				WRITE_ONCE(grp->compressed_pages[i], NULL);
+				ret = 1;
+				break;
+			}
+		}
+		erofs_workgroup_unfreeze(&grp->obj, 1);
+	}
+	rcu_read_unlock();
+
+	if (ret) {
+		ClearPagePrivate(page);
+		put_page(page);
+	}
+	return ret;
+}
+#endif
+
 /* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
 static inline bool try_to_reuse_as_compressed_page(
 	struct z_erofs_vle_work_builder *b,
@@ -463,6 +568,9 @@ struct z_erofs_vle_frontend {
 	z_erofs_vle_owned_workgrp_t owned_head;
 
 	bool initial;
+#if (EROFS_FS_ZIP_CACHE_LVL >= 2)
+	erofs_off_t cachedzone_la;
+#endif
 };
 
 #define VLE_FRONTEND_INIT(__i) { \
@@ -489,6 +597,12 @@ static int z_erofs_do_read_page(struct z_erofs_vle_frontend *fe,
 	bool tight = builder_is_followed(builder);
 	struct z_erofs_vle_work *work = builder->work;
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	struct address_space *const mngda = sbi->managed_cache->i_mapping;
+	struct z_erofs_vle_workgroup *grp;
+	bool noio_outoforder;
+#endif
+
 	enum z_erofs_page_type page_type;
 	unsigned cur, end, spiltted, index;
 	int err;
@@ -529,6 +643,21 @@ static int z_erofs_do_read_page(struct z_erofs_vle_frontend *fe,
 	if (unlikely(err))
 		goto err_out;
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	grp = fe->builder.grp;
+
+	/* let's do out-of-order decompression for noio */
+	noio_outoforder = grab_managed_cache_pages(
+		mngda, erofs_blknr(map->m_pa),
+		grp->compressed_pages, erofs_blknr(map->m_plen),
+		/* compressed page caching policy */
+		fe->initial | (EROFS_FS_ZIP_CACHE_LVL >= 2 ?
+			map->m_la < fe->cachedzone_la : 0));
+
+	if (noio_outoforder && builder_is_followed(builder))
+		builder->role = Z_EROFS_VLE_WORK_PRIMARY;
+#endif
+
 	tight &= builder_is_followed(builder);
 	work = builder->work;
 hitted:
@@ -616,15 +745,39 @@ static inline void z_erofs_vle_read_endio(struct bio *bio)
 #endif
 	unsigned i;
 	struct bio_vec *bvec;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	struct address_space *mngda = NULL;
+#endif
 
 	bio_for_each_segment_all(bvec, bio, i) {
 		struct page *page = bvec->bv_page;
+		bool cachemngd = false;
 
 		DBG_BUGON(PageUptodate(page));
 		BUG_ON(page->mapping == NULL);
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		if (unlikely(mngda == NULL && !z_erofs_is_stagingpage(page))) {
+			struct inode *const inode = page->mapping->host;
+			struct super_block *const sb = inode->i_sb;
+
+			mngda = EROFS_SB(sb)->managed_cache->i_mapping;
+		}
+
+		/*
+		 * If mngda has not been obtained yet, it is still NULL;
+		 * however, page->mapping is never NULL if working properly.
+		 */
+		cachemngd = (page->mapping == mngda);
+#endif
+
 		if (unlikely(err))
 			SetPageError(page);
+		else if (cachemngd)
+			SetPageUptodate(page);
+
+		if (cachemngd)
+			unlock_page(page);
 	}
 
 	z_erofs_vle_unzip_kickoff(bio->bi_private, -1);
@@ -639,6 +792,9 @@ static int z_erofs_vle_unzip(struct super_block *sb,
 	struct list_head *page_pool)
 {
 	struct erofs_sb_info *const sbi = EROFS_SB(sb);
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	struct address_space *const mngda = sbi->managed_cache->i_mapping;
+#endif
 	const unsigned clusterpages = erofs_clusterpages(sbi);
 
 	struct z_erofs_pagevec_ctor ctor;
@@ -736,6 +892,13 @@ static int z_erofs_vle_unzip(struct super_block *sb,
 
 		if (z_erofs_is_stagingpage(page))
 			continue;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		else if (page->mapping == mngda) {
+			BUG_ON(PageLocked(page));
+			BUG_ON(!PageUptodate(page));
+			continue;
+		}
+#endif
 
 		/* only non-head page could be reused as a compressed page */
 		pagenr = z_erofs_onlinepage_index(page);
@@ -813,6 +976,10 @@ static int z_erofs_vle_unzip(struct super_block *sb,
 	for (i = 0; i < clusterpages; ++i) {
 		page = compressed_pages[i];
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		if (page->mapping == mngda)
+			continue;
+#endif
 		/* recycle all individual staging pages */
 		(void)z_erofs_gather_if_stagingpage(page_pool, page);
 
@@ -907,7 +1074,32 @@ static void z_erofs_vle_unzip_wq(struct work_struct *work)
 	return io;
 }
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+/* true - unlocked (noio), false - locked (need submit io) */
+static inline bool recover_managed_page(
+	struct z_erofs_vle_workgroup *grp,
+	struct page *page)
+{
+	wait_on_page_locked(page);
+	if (PagePrivate(page) && PageUptodate(page))
+		return true;
+
+	lock_page(page);
+	if (unlikely(!PagePrivate(page))) {
+		set_page_private(page, (unsigned long)grp);
+		SetPagePrivate(page);
+	}
+	if (unlikely(PageUptodate(page))) {
+		unlock_page(page);
+		return true;
+	}
+	return false;
+}
+
+#define __FSIO_1 1
+#else
 #define __FSIO_1 0
+#endif
 
 static bool z_erofs_vle_submit_all(struct super_block *sb,
 				   z_erofs_vle_owned_workgrp_t owned_head,
@@ -918,6 +1110,10 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 	struct erofs_sb_info *const sbi = EROFS_SB(sb);
 	const unsigned clusterpages = erofs_clusterpages(sbi);
 	const gfp_t gfp = GFP_NOFS;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	struct address_space *const mngda = sbi->managed_cache->i_mapping;
+	struct z_erofs_vle_workgroup *lstgrp_noio = NULL, *lstgrp_io = NULL;
+#endif
 	struct z_erofs_vle_unzip_io *ios[1 + __FSIO_1];
 	struct bio *bio;
 	tagptr1_t bi_private;
@@ -933,6 +1129,10 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 	 * force_fg == 1, (io, fg_io[0]) no io, (io, fg_io[1]) need submit io
          * force_fg == 0, (io, fg_io[0]) no io; (io[1], bg_io) need submit io
 	 */
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	ios[0] = prepare_io_handler(sb, fg_io + 0, false);
+#endif
+
 	if (force_fg) {
 		ios[__FSIO_1] = prepare_io_handler(sb, fg_io + __FSIO_1, false);
 		bi_private = tagptr_fold(tagptr1_t, ios[__FSIO_1], 0);
@@ -953,6 +1153,10 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 		struct page **compressed_pages, *oldpage, *page;
 		pgoff_t first_index;
 		unsigned i = 0;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		unsigned noio = 0;
+		bool cachemngd;
+#endif
 		int err;
 
 		/* no possible 'owned_head' equals the following */
@@ -973,15 +1177,40 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 		/* fulfill all compressed pages */
 		oldpage = page = READ_ONCE(compressed_pages[i]);
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		cachemngd = false;
+
+		if (page == EROFS_UNALLOCATED_CACHED_PAGE) {
+			cachemngd = true;
+			goto do_allocpage;
+		} else if (page != NULL) {
+			if (page->mapping != mngda)
+				BUG_ON(PageUptodate(page));
+			else if (recover_managed_page(grp, page)) {
+				/* page is uptodate, skip io submission */
+				force_submit = true;
+				++noio;
+				goto skippage;
+			}
+		} else {
+do_allocpage:
+#else
 		if (page != NULL)
 			BUG_ON(PageUptodate(page));
 		else {
+#endif
 			page = __stagingpage_alloc(pagepool, gfp);
 
 			if (oldpage != cmpxchg(compressed_pages + i,
 				oldpage, page)) {
 				list_add(&page->lru, pagepool);
 				goto repeat;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+			} else if (cachemngd && !add_to_page_cache_lru(page,
+				mngda, first_index + i, gfp)) {
+				set_page_private(page, (unsigned long)grp);
+				SetPagePrivate(page);
+#endif
 			}
 		}
 
@@ -1005,14 +1234,51 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
 
 		force_submit = false;
 		last_index = first_index + i;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+skippage:
+#endif
 		if (++i < clusterpages)
 			goto repeat;
+
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		if (noio < clusterpages)
+			lstgrp_io = grp;
+		else {
+			z_erofs_vle_owned_workgrp_t iogrp_next =
+				owned_head == Z_EROFS_VLE_WORKGRP_TAIL ?
+				Z_EROFS_VLE_WORKGRP_TAIL_CLOSED :
+				owned_head;
+
+			if (lstgrp_io == NULL)
+				ios[1]->head = iogrp_next;
+			else
+				WRITE_ONCE(lstgrp_io->next, iogrp_next);
+
+			if (lstgrp_noio == NULL)
+				ios[0]->head = grp;
+			else
+				WRITE_ONCE(lstgrp_noio->next, grp);
+
+			lstgrp_noio = grp;
+		}
+#endif
 	} while (owned_head != Z_EROFS_VLE_WORKGRP_TAIL);
 
 	if (bio != NULL)
 		__submit_bio(bio, REQ_OP_READ, 0);
 
+#ifndef EROFS_FS_HAS_MANAGED_CACHE
 	BUG_ON(!nr_bios);
+#else
+	if (lstgrp_noio != NULL)
+		WRITE_ONCE(lstgrp_noio->next, Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
+
+	if (!force_fg && !nr_bios) {
+		kvfree(container_of(ios[1],
+			struct z_erofs_vle_unzip_io_sb, io));
+		return true;
+	}
+#endif
 
 	z_erofs_vle_unzip_kickoff(tagptr_cast_ptr(bi_private), nr_bios);
 	return true;
@@ -1028,6 +1294,9 @@ static void z_erofs_submit_and_unzip(struct z_erofs_vle_frontend *f,
 	if (!z_erofs_vle_submit_all(sb, f->owned_head, pagepool, io, force_fg))
 		return;
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+	z_erofs_vle_unzip_all(sb, &io[0], pagepool);
+#endif
 	if (!force_fg)
 		return;
 
@@ -1047,6 +1316,9 @@ static int z_erofs_vle_normalaccess_readpage(struct file *file,
 	int err;
 	LIST_HEAD(pagepool);
 
+#if (EROFS_FS_ZIP_CACHE_LVL >= 2)
+	f.cachedzone_la = page->index << PAGE_SHIFT;
+#endif
 	err = z_erofs_do_read_page(&f, page, &pagepool);
 	(void)z_erofs_vle_work_iter_end(&f.builder);
 
@@ -1077,6 +1349,9 @@ static inline int __z_erofs_vle_normalaccess_readpages(
 	struct page *head = NULL;
 	LIST_HEAD(pagepool);
 
+#if (EROFS_FS_ZIP_CACHE_LVL >= 2)
+	f.cachedzone_la = lru_to_page(pages)->index << PAGE_SHIFT;
+#endif
 	for (; nr_pages; --nr_pages) {
 		struct page *page = lru_to_page(pages);
 
diff --git a/drivers/staging/erofs/utils.c b/drivers/staging/erofs/utils.c
index df4aadd..f6c263f 100644
--- a/drivers/staging/erofs/utils.c
+++ b/drivers/staging/erofs/utils.c
@@ -143,13 +143,28 @@ unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
 		if (cleanup)
 			BUG_ON(cnt != 1);
 
+#ifndef EROFS_FS_HAS_MANAGED_CACHE
 		else if (cnt > 1)
+#else
+		if (!erofs_workgroup_try_to_freeze(grp, 1))
+#endif
 			continue;
 
 		if (radix_tree_delete(&sbi->workstn.tree,
-			grp->index) != grp)
+			grp->index) != grp) {
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+skip:
+			erofs_workgroup_unfreeze(grp, 1);
+#endif
 			continue;
+		}
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+		if (try_to_free_all_cached_pages(sbi, grp))
+			goto skip;
+
+		erofs_workgroup_unfreeze(grp, 1);
+#endif
 		/* (rarely) grabbed again when freeing */
 		erofs_workgroup_put(grp);
 
-- 
1.9.1

^ permalink raw reply related

* [RFC PATCH 24/25] staging: erofs: introduce VLE decompression support
From: Gao Xiang @ 2018-07-24  2:36 UTC (permalink / raw)

In-Reply-To: <1532399805-65674-1-git-send-email-gaoxiang25@huawei.com>

This patch introduces the basic in-place VLE decompression
implementation for the erofs file system.

Compared with fixed-sized input compression, it implements
what we call 'variable-length extent compression', which
specifies the same output size for each compression block.
This makes full use of IO bandwidth (almost all data read
from the block device can be used directly for decompression),
improves real random read performance (rather than relying on
data caching, which costs more memory) and keeps relatively
lower compression ratios (it saves more storage space than
fixed-sized input compression configured with the same input
block size), as illustrated below:

        |---  variable-length extent ---|------ VLE ------|---  VLE ---|
         /> clusterofs                  /> clusterofs     /> clusterofs /> clusterofs
   ++---|-------++-----------++---------|-++-----------++-|---------++-|
...||   |       ||           ||         | ||           || |         || | ... original data
   ++---|-------++-----------++---------|-++-----------++-|---------++-|
   ++->cluster<-++->cluster<-++->cluster<-++->cluster<-++->cluster<-++
        size         size         size         size         size
         \                             /                 /            /
          \                      /              /            /
           \               /            /            /
            ++-----------++-----------++-----------++
        ... ||           ||           ||           || ... compressed clusters
            ++-----------++-----------++-----------++
            ++->cluster<-++->cluster<-++->cluster<-++
                 size         size         size

The main point of 'in-place' refers to the decompression mode:
instead of allocating independent compressed pages and data
structures, it reuses the already allocated file cache pages
as much as possible to store the compressed data and the
corresponding pagevec in a time-sharing manner by default,
which is useful in low-memory scenarios.

Finally, other filesystems with (de)compression support use a
relatively large compression block size: they read and decompress
>= 128KB at once, and thereby obtain better-looking random read
numbers. (In fact, they collect small random reads into large
sequential reads and cache all decompressed data in memory, which
is unacceptable especially for embedded devices with limited
memory, and it is not real random read.) In contrast, we select
a universal small-sized 4KB compressed cluster, which is the
smallest page size for most architectures. All compressed
clusters can be read and decompressed independently, which
ensures random read performance for all use cases.

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 drivers/staging/erofs/inode.c     |    5 +
 drivers/staging/erofs/internal.h  |    6 +
 drivers/staging/erofs/staging.h   |   46 ++
 drivers/staging/erofs/super.c     |   25 +
 drivers/staging/erofs/unzip_vle.c | 1128 ++++++++++++++++++++++++++++++++++++-
 drivers/staging/erofs/unzip_vle.h |  204 +++++++
 drivers/staging/erofs/utils.c     |   61 +-
 7 files changed, 1473 insertions(+), 2 deletions(-)

diff --git a/drivers/staging/erofs/inode.c b/drivers/staging/erofs/inode.c
index 001ddb9..7d07056 100644
--- a/drivers/staging/erofs/inode.c
+++ b/drivers/staging/erofs/inode.c
@@ -214,7 +214,12 @@ int fill_inode(struct inode *inode, int isdir)
 		}
 
 		if (is_inode_layout_compression(inode)) {
+#ifdef CONFIG_EROFS_FS_ZIP
+			inode->i_mapping->a_ops =
+				&z_erofs_vle_normalaccess_aops;
+#else
 			err = -ENOTSUPP;
+#endif
 			goto out_unlock;
 		}
 
diff --git a/drivers/staging/erofs/internal.h b/drivers/staging/erofs/internal.h
index 9c25ffa..676cb1e 100644
--- a/drivers/staging/erofs/internal.h
+++ b/drivers/staging/erofs/internal.h
@@ -274,6 +274,9 @@ static inline void erofs_workstation_cleanup_all(struct super_block *sb)
 #ifdef CONFIG_EROFS_FS_ZIP
 /* hard limit of pages per compressed cluster */
 #define Z_EROFS_CLUSTER_MAX_PAGES       (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
+
+/* page count of a compressed cluster */
+#define erofs_clusterpages(sbi)         ((1 << (sbi)->clusterbits) / PAGE_SIZE)
 #endif
 
 typedef u64 erofs_off_t;
@@ -355,6 +358,9 @@ static inline bool is_inode_layout_inline(struct inode *inode)
 extern const struct file_operations erofs_dir_fops;
 
 extern const struct address_space_operations erofs_raw_access_aops;
+#ifdef CONFIG_EROFS_FS_ZIP
+extern const struct address_space_operations z_erofs_vle_normalaccess_aops;
+#endif
 
 /*
  * Logical to physical block mapping, used by erofs_map_blocks()
diff --git a/drivers/staging/erofs/staging.h b/drivers/staging/erofs/staging.h
index a9bfd8c..47c9708d 100644
--- a/drivers/staging/erofs/staging.h
+++ b/drivers/staging/erofs/staging.h
@@ -85,3 +85,49 @@ static inline bool sb_rdonly(const struct super_block *sb) {
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
 #endif
 
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 12, 0))
+
+static inline void *kvmalloc(size_t size, gfp_t flags)
+{
+	void *buffer = NULL;
+
+	if (size == 0)
+		return NULL;
+
+	/* do not attempt kmalloc if we need more than 16 pages at once */
+	if (size <= (16 * PAGE_SIZE))
+		buffer = kmalloc(size, flags);
+	if (!buffer) {
+		if (flags & __GFP_ZERO)
+			buffer = vzalloc(size);
+		else
+			buffer = vmalloc(size);
+	}
+	return buffer;
+}
+
+static inline void *kvzalloc(size_t size, gfp_t flags)
+{
+	return kvmalloc(size, flags | __GFP_ZERO);
+}
+
+static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags)
+{
+	if (size != 0 && n > SIZE_MAX / size)
+		return NULL;
+
+	return kvmalloc(n * size, flags);
+}
+
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0))
+static inline void kvfree(const void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		vfree(addr);
+	else
+		kfree(addr);
+}
+#endif
+
diff --git a/drivers/staging/erofs/super.c b/drivers/staging/erofs/super.c
index 1d4bcaa..a0db717 100644
--- a/drivers/staging/erofs/super.c
+++ b/drivers/staging/erofs/super.c
@@ -119,6 +119,13 @@ static int superblock_read(struct super_block *sb)
 	sbi->xattr_blkaddr = le32_to_cpu(layout->xattr_blkaddr);
 #endif
 	sbi->islotbits = ffs(sizeof(struct erofs_inode_v1)) - 1;
+#ifdef CONFIG_EROFS_FS_ZIP
+	sbi->clusterbits = 12;
+
+	if (1 << (sbi->clusterbits - 12) > Z_EROFS_CLUSTER_MAX_PAGES)
+		errln("clusterbits %u is not supported on this kernel",
+			sbi->clusterbits);
+#endif
 
 	sbi->root_nid = le16_to_cpu(layout->root_nid);
 	sbi->inos = le64_to_cpu(layout->inos);
@@ -452,6 +459,11 @@ static void erofs_kill_sb(struct super_block *sb)
 };
 MODULE_ALIAS_FS("erofs");
 
+#ifdef CONFIG_EROFS_FS_ZIP
+extern int z_erofs_init_zip_subsystem(void);
+extern void z_erofs_exit_zip_subsystem(void);
+#endif
+
 int __init erofs_module_init(void)
 {
 	int err;
@@ -467,6 +479,12 @@ int __init erofs_module_init(void)
 	if (err)
 		goto shrinker_err;
 
+#ifdef CONFIG_EROFS_FS_ZIP
+	err = z_erofs_init_zip_subsystem();
+	if (err)
+		goto zip_err;
+#endif
+
 	err = register_filesystem(&erofs_fs_type);
 	if (err)
 		goto fs_err;
@@ -475,6 +493,10 @@ int __init erofs_module_init(void)
 	return 0;
 
 fs_err:
+#ifdef CONFIG_EROFS_FS_ZIP
+	z_erofs_exit_zip_subsystem();
+zip_err:
+#endif
 	unregister_shrinker(&erofs_shrinker_info);
 shrinker_err:
 	erofs_exit_inode_cache();
@@ -485,6 +507,9 @@ int __init erofs_module_init(void)
 void __exit erofs_module_exit(void)
 {
 	unregister_filesystem(&erofs_fs_type);
+#ifdef CONFIG_EROFS_FS_ZIP
+	z_erofs_exit_zip_subsystem();
+#endif
 	unregister_shrinker(&erofs_shrinker_info);
 	erofs_exit_inode_cache();
 	infoln("successfully finalize erofs");
diff --git a/drivers/staging/erofs/unzip_vle.c b/drivers/staging/erofs/unzip_vle.c
index e6752cf..a739dc6 100644
--- a/drivers/staging/erofs/unzip_vle.c
+++ b/drivers/staging/erofs/unzip_vle.c
@@ -10,7 +10,1133 @@
  * License.  See the file COPYING in the main directory of the Linux
  * distribution for more details.
  */
-#include "internal.h"
+#include "unzip_vle.h"
+#include <linux/prefetch.h>
+
+static struct workqueue_struct *z_erofs_workqueue __read_mostly;
+static struct kmem_cache *z_erofs_workgroup_cachep __read_mostly;
+
+void z_erofs_exit_zip_subsystem(void)
+{
+	BUG_ON(z_erofs_workqueue == NULL);
+	BUG_ON(z_erofs_workgroup_cachep == NULL);
+
+	destroy_workqueue(z_erofs_workqueue);
+	kmem_cache_destroy(z_erofs_workgroup_cachep);
+}
+
+static inline int init_unzip_workqueue(void)
+{
+	const unsigned onlinecpus = num_online_cpus();
+
+	/*
+	 * we don't need too many threads, limiting threads
+	 * could improve scheduling performance.
+	 */
+	z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
+		WQ_UNBOUND | WQ_CPU_INTENSIVE | WQ_HIGHPRI |
+		WQ_NON_REENTRANT, onlinecpus + onlinecpus / 4);
+
+	return z_erofs_workqueue != NULL ? 0 : -ENOMEM;
+}
+
+int z_erofs_init_zip_subsystem(void)
+{
+	z_erofs_workgroup_cachep =
+		kmem_cache_create("erofs_compress",
+		Z_EROFS_WORKGROUP_SIZE, 0,
+		SLAB_RECLAIM_ACCOUNT, NULL);
+
+	if (z_erofs_workgroup_cachep != NULL) {
+		if (!init_unzip_workqueue())
+			return 0;
+
+		kmem_cache_destroy(z_erofs_workgroup_cachep);
+	}
+	return -ENOMEM;
+}
+
+enum z_erofs_vle_work_role {
+	Z_EROFS_VLE_WORK_SECONDARY,
+	Z_EROFS_VLE_WORK_PRIMARY,
+	/*
+	 * The current work has at least been linked with the following
+	 * processed chained works, which means if the processing page
+	 * is the tail partial page of the work, the current work can
+	 * safely use the whole page, as illustrated below:
+	 * +--------------+-------------------------------------------+
+	 * |  tail page   |      head page (of the previous work)     |
+	 * +--------------+-------------------------------------------+
+	 *   /\  which belongs to the current work
+	 * [  (*) this page can be used for the current work itself.  ]
+	 */
+	Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED,
+	Z_EROFS_VLE_WORK_MAX
+};
+
+struct z_erofs_vle_work_builder {
+	enum z_erofs_vle_work_role role;
+	/*
+	 * 'hosted = false' means that the current workgroup doesn't belong to
+	 * the owned chained workgroups. In other words, it is not our
+	 * responsibility to submit this workgroup.
+	 */
+	bool hosted;
+
+	struct z_erofs_vle_workgroup *grp;
+	struct z_erofs_vle_work *work;
+	struct z_erofs_pagevec_ctor vector;
+
+	/* pages used for reading the compressed data */
+	struct page **compressed_pages;
+	unsigned compressed_deficit;
+};
+
+#define VLE_WORK_BUILDER_INIT()	\
+	{ .work = NULL, .role = Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED }
+
+/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
+static inline bool try_to_reuse_as_compressed_page(
+	struct z_erofs_vle_work_builder *b,
+	struct page *page)
+{
+	while (b->compressed_deficit) {
+		--b->compressed_deficit;
+		if (NULL == cmpxchg(b->compressed_pages++, NULL, page))
+			return true;
+	}
+
+	return false;
+}
+
+/* callers must be with work->lock held */
+static int z_erofs_vle_work_add_page(
+	struct z_erofs_vle_work_builder *builder,
+	struct page *page,
+	enum z_erofs_page_type type)
+{
+	int ret;
+	bool occupied;
+
+	/* give priority for the compressed data storage */
+	if (builder->role >= Z_EROFS_VLE_WORK_PRIMARY &&
+		type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
+		try_to_reuse_as_compressed_page(builder, page))
+		return 0;
+
+	ret = z_erofs_pagevec_ctor_enqueue(&builder->vector,
+		page, type, &occupied);
+	builder->work->vcnt += (unsigned)ret;
+
+	return ret ? 0 : -EAGAIN;
+}
+
+static inline bool try_to_claim_workgroup(
+	struct z_erofs_vle_workgroup *grp,
+	z_erofs_vle_owned_workgrp_t *owned_head,
+	bool *hosted)
+{
+	DBG_BUGON(*hosted == true);
+
+	/* let's claim the following types of workgroup */
+retry:
+	if (grp->next == Z_EROFS_VLE_WORKGRP_NIL) {
+		/* type 1, nil workgroup */
+		if (Z_EROFS_VLE_WORKGRP_NIL != cmpxchg(&grp->next,
+			Z_EROFS_VLE_WORKGRP_NIL, *owned_head))
+			goto retry;
+
+		*owned_head = grp;
+		*hosted = true;
+	} else if (grp->next == Z_EROFS_VLE_WORKGRP_TAIL) {
+		/*
+		 * type 2, link to the end of an existing open chain;
+		 * be careful that its submission itself is governed
+		 * by the original owned chain.
+		 */
+		if (Z_EROFS_VLE_WORKGRP_TAIL != cmpxchg(&grp->next,
+			Z_EROFS_VLE_WORKGRP_TAIL, *owned_head))
+			goto retry;
+
+		*owned_head = Z_EROFS_VLE_WORKGRP_TAIL;
+	} else
+		return false;	/* :( better luck next time */
+
+	return true;	/* lucky, I am the followee :) */
+}
+
+static struct z_erofs_vle_work *
+z_erofs_vle_work_lookup(struct super_block *sb,
+			pgoff_t idx, unsigned pageofs,
+			struct z_erofs_vle_workgroup **grp_ret,
+			enum z_erofs_vle_work_role *role,
+			z_erofs_vle_owned_workgrp_t *owned_head,
+			bool *hosted)
+{
+	bool tag, primary;
+	struct erofs_workgroup *egrp;
+	struct z_erofs_vle_workgroup *grp;
+	struct z_erofs_vle_work *work;
+
+	egrp = erofs_find_workgroup(sb, idx, &tag);
+	if (egrp == NULL) {
+		*grp_ret = NULL;
+		return NULL;
+	}
+
+	*grp_ret = grp = container_of(egrp,
+		struct z_erofs_vle_workgroup, obj);
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	work = z_erofs_vle_grab_work(grp, pageofs);
+	primary = true;
+#else
+	BUG();
+#endif
+
+	DBG_BUGON(work->pageofs != pageofs);
+
+	/*
+	 * lock must be taken first to avoid grp->next == NIL between
+	 * claiming workgroup and adding pages:
+	 *                        grp->next != NIL
+	 *   grp->next = NIL
+	 *   mutex_unlock_all
+	 *                        mutex_lock(&work->lock)
+	 *                        add all pages to pagevec
+	 *
+	 * [correct locking case 1]:
+	 *   mutex_lock(grp->work[a])
+	 *   ...
+	 *   mutex_lock(grp->work[b])     mutex_lock(grp->work[c])
+	 *   ...                          *role = SECONDARY
+	 *                                add all pages to pagevec
+	 *                                ...
+	 *                                mutex_unlock(grp->work[c])
+	 *   mutex_lock(grp->work[c])
+	 *   ...
+	 *   grp->next = NIL
+	 *   mutex_unlock_all
+	 *
+	 * [correct locking case 2]:
+	 *   mutex_lock(grp->work[b])
+	 *   ...
+	 *   mutex_lock(grp->work[a])
+	 *   ...
+	 *   mutex_lock(grp->work[c])
+	 *   ...
+	 *   grp->next = NIL
+	 *   mutex_unlock_all
+	 *                                mutex_lock(grp->work[a])
+	 *                                *role = PRIMARY_OWNER
+	 *                                add all pages to pagevec
+	 *                                ...
+	 */
+	mutex_lock(&work->lock);
+
+	*hosted = false;
+	if (!primary)
+		*role = Z_EROFS_VLE_WORK_SECONDARY;
+	/* claim the workgroup if possible */
+	else if (try_to_claim_workgroup(grp, owned_head, hosted))
+		*role = Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED;
+	else
+		*role = Z_EROFS_VLE_WORK_PRIMARY;
+
+	return work;
+}
+
+static struct z_erofs_vle_work *
+z_erofs_vle_work_register(struct super_block *sb,
+			  struct z_erofs_vle_workgroup **grp_ret,
+			  struct erofs_map_blocks *map,
+			  pgoff_t index, unsigned pageofs,
+			  enum z_erofs_vle_work_role *role,
+			  z_erofs_vle_owned_workgrp_t *owned_head,
+			  bool *hosted)
+{
+	bool newgrp = false;
+	struct z_erofs_vle_workgroup *grp = *grp_ret;
+	struct z_erofs_vle_work *work;
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	BUG_ON(grp != NULL);
+#else
+	if (grp != NULL)
+		goto skip;
+#endif
+	/* no available workgroup, let's allocate one */
+	grp = kmem_cache_zalloc(z_erofs_workgroup_cachep, GFP_NOFS);
+	if (unlikely(grp == NULL))
+		return ERR_PTR(-ENOMEM);
+
+	grp->obj.index = index;
+	grp->llen = map->m_llen;
+
+	z_erofs_vle_set_workgrp_fmt(grp,
+		(map->m_flags & EROFS_MAP_ZIPPED) ?
+			Z_EROFS_VLE_WORKGRP_FMT_LZ4 :
+			Z_EROFS_VLE_WORKGRP_FMT_PLAIN);
+	atomic_set(&grp->obj.refcount, 1);
+
+	/* new workgrps have been claimed as type 1 */
+	WRITE_ONCE(grp->next, *owned_head);
+	/* primary and followed work for all new workgrps */
+	*role = Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED;
+	/* it should be submitted by ourselves */
+	*hosted = true;
+
+	newgrp = true;
+#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
+skip:
+	/* currently unimplemented */
+	BUG();
+#else
+	work = z_erofs_vle_grab_primary_work(grp);
+#endif
+	work->pageofs = pageofs;
+
+	mutex_init(&work->lock);
+
+	if (newgrp) {
+		int err = erofs_register_workgroup(sb, &grp->obj, 0);
+
+		if (err) {
+			kmem_cache_free(z_erofs_workgroup_cachep, grp);
+			return ERR_PTR(-EAGAIN);
+		}
+	}
+
+	*owned_head = *grp_ret = grp;
+
+	mutex_lock(&work->lock);
+	return work;
+}
+
+static inline void __update_workgrp_llen(struct z_erofs_vle_workgroup *grp,
+					 unsigned int llen)
+{
+	while(1) {
+		unsigned int orig_llen = grp->llen;
+
+		if (orig_llen >= llen || orig_llen ==
+			cmpxchg(&grp->llen, orig_llen, llen))
+			break;
+	}
+}
+
+#define builder_is_followed(builder) \
+	((builder)->role >= Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED)
+
+static int z_erofs_vle_work_iter_begin(struct z_erofs_vle_work_builder *builder,
+				       struct super_block *sb,
+				       struct erofs_map_blocks *map,
+				       z_erofs_vle_owned_workgrp_t *owned_head)
+{
+	const unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb));
+	const erofs_blk_t index = erofs_blknr(map->m_pa);
+	const unsigned pageofs = map->m_la & ~PAGE_MASK;
+	struct z_erofs_vle_workgroup *grp;
+	struct z_erofs_vle_work *work;
+
+	DBG_BUGON(builder->work != NULL);
+
+	/* must be Z_EROFS_WORK_TAIL or the next chained work */
+	DBG_BUGON(*owned_head == Z_EROFS_VLE_WORKGRP_NIL);
+	DBG_BUGON(*owned_head == Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
+
+	DBG_BUGON(erofs_blkoff(map->m_pa));
+
+repeat:
+	work = z_erofs_vle_work_lookup(sb, index,
+		pageofs, &grp, &builder->role, owned_head, &builder->hosted);
+	if (work != NULL) {
+		__update_workgrp_llen(grp, map->m_llen);
+		goto got_it;
+	}
+
+	work = z_erofs_vle_work_register(sb, &grp, map, index, pageofs,
+		&builder->role, owned_head, &builder->hosted);
+
+	if (unlikely(work == ERR_PTR(-EAGAIN)))
+		goto repeat;
+
+	if (unlikely(IS_ERR(work)))
+		return PTR_ERR(work);
+got_it:
+	z_erofs_pagevec_ctor_init(&builder->vector,
+		Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, work->vcnt);
+
+	if (builder->role >= Z_EROFS_VLE_WORK_PRIMARY) {
+		/* enable possibly in-place decompression */
+		builder->compressed_pages = grp->compressed_pages;
+		builder->compressed_deficit = clusterpages;
+	} else {
+		builder->compressed_pages = NULL;
+		builder->compressed_deficit = 0;
+	}
+
+	builder->grp = grp;
+	builder->work = work;
+	return 0;
+}
+
+/*
+ * keep in mind that unreferenced workgroups are freed only
+ * after an RCU grace period, so rcu_read_lock() can
+ * prevent a workgroup from being freed.
+ */
+static void z_erofs_rcu_callback(struct rcu_head *head)
+{
+	struct z_erofs_vle_work *work =	container_of(head,
+		struct z_erofs_vle_work, rcu);
+	struct z_erofs_vle_workgroup *grp =
+		z_erofs_vle_work_workgroup(work, true);
+
+	kmem_cache_free(z_erofs_workgroup_cachep, grp);
+}
+
+void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
+{
+	struct z_erofs_vle_workgroup *const vgrp = container_of(grp,
+		struct z_erofs_vle_workgroup, obj);
+	struct z_erofs_vle_work *const work = &vgrp->work;
+
+	call_rcu(&work->rcu, z_erofs_rcu_callback);
+}
+
+void __z_erofs_vle_work_release(struct z_erofs_vle_workgroup *grp,
+	struct z_erofs_vle_work *work __maybe_unused)
+{
+	erofs_workgroup_put(&grp->obj);
+}
+
+void z_erofs_vle_work_release(struct z_erofs_vle_work *work)
+{
+	struct z_erofs_vle_workgroup *grp =
+		z_erofs_vle_work_workgroup(work, true);
+
+	__z_erofs_vle_work_release(grp, work);
+}
+
+static inline bool
+z_erofs_vle_work_iter_end(struct z_erofs_vle_work_builder *builder)
+{
+	struct z_erofs_vle_work *work = builder->work;
+
+	if (work == NULL)
+		return false;
+
+	z_erofs_pagevec_ctor_exit(&builder->vector, false);
+	mutex_unlock(&work->lock);
+
+	/*
+	 * if all pending pages are added, don't hold work reference
+	 * any longer if the current work isn't hosted by ourselves.
+	 */
+	if (!builder->hosted)
+		__z_erofs_vle_work_release(builder->grp, work);
+
+	builder->work = NULL;
+	builder->grp = NULL;
+	return true;
+}
+
+static inline struct page *__stagingpage_alloc(struct list_head *pagepool,
+					       gfp_t gfp)
+{
+	struct page *page = erofs_allocpage(pagepool, gfp);
+
+	if (unlikely(page == NULL))
+		return NULL;
+
+	page->mapping = Z_EROFS_MAPPING_STAGING;
+	return page;
+}
+
+struct z_erofs_vle_frontend {
+	struct inode *const inode;
+
+	struct z_erofs_vle_work_builder builder;
+	struct erofs_map_blocks_iter m_iter;
+
+	z_erofs_vle_owned_workgrp_t owned_head;
+
+	bool initial;
+};
+
+#define VLE_FRONTEND_INIT(__i) { \
+	.inode = __i, \
+	.m_iter = { \
+		{ .m_llen = 0, .m_plen = 0 }, \
+		.mpage = NULL \
+	}, \
+	.builder = VLE_WORK_BUILDER_INIT(), \
+	.owned_head = Z_EROFS_VLE_WORKGRP_TAIL, \
+	.initial = true, }
+
+static int z_erofs_do_read_page(struct z_erofs_vle_frontend *fe,
+				struct page *page,
+				struct list_head *page_pool)
+{
+	struct super_block *const sb = fe->inode->i_sb;
+	struct erofs_sb_info *const sbi __maybe_unused = EROFS_SB(sb);
+	struct erofs_map_blocks_iter *const m = &fe->m_iter;
+	struct erofs_map_blocks *const map = &m->map;
+	struct z_erofs_vle_work_builder *const builder = &fe->builder;
+	const loff_t offset = page_offset(page);
+
+	bool tight = builder_is_followed(builder);
+	struct z_erofs_vle_work *work = builder->work;
+
+	enum z_erofs_page_type page_type;
+	unsigned cur, end, spiltted, index;
+	int err;
+
+	/* register locked file pages as online pages in pack */
+	z_erofs_onlinepage_init(page);
+
+	spiltted = 0;
+	end = PAGE_SIZE;
+repeat:
+	cur = end - 1;
+
+	/* lucky, within the range of the current map_blocks */
+	if (offset + cur >= map->m_la &&
+            offset + cur < map->m_la + map->m_llen)
+		goto hitted;
+
+	/* go ahead the next map_blocks */
+	debugln("%s: [out-of-range] pos %llu", __func__, offset + cur);
+
+	if (!z_erofs_vle_work_iter_end(builder))
+		fe->initial = false;
+
+	map->m_la = offset + cur;
+	map->m_llen = 0;
+	err = erofs_map_blocks_iter(fe->inode, map, &m->mpage, 0);
+	if (unlikely(err))
+		goto err_out;
+
+	/* deal with hole (FIXME! broken now) */
+	if (unlikely(!(map->m_flags & EROFS_MAP_MAPPED)))
+		goto hitted;
+
+	DBG_BUGON(map->m_plen != 1 << sbi->clusterbits);
+	BUG_ON(erofs_blkoff(map->m_pa));
+
+	err = z_erofs_vle_work_iter_begin(builder, sb, map, &fe->owned_head);
+	if (unlikely(err))
+		goto err_out;
+
+	tight &= builder_is_followed(builder);
+	work = builder->work;
+hitted:
+	cur = end - min_t(unsigned, offset + end - map->m_la, end);
+	if (unlikely(!(map->m_flags & EROFS_MAP_MAPPED))) {
+		zero_user_segment(page, cur, end);
+		goto next_part;
+	}
+
+	/* let's derive page type */
+	page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD :
+		(!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+			(tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
+				Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
+
+retry:
+	err = z_erofs_vle_work_add_page(builder, page, page_type);
+	/* should allocate an additional staging page for pagevec */
+	if (err == -EAGAIN) {
+		struct page *const newpage =
+			__stagingpage_alloc(page_pool, GFP_NOFS);
+
+		err = z_erofs_vle_work_add_page(builder,
+			newpage, Z_EROFS_PAGE_TYPE_EXCLUSIVE);
+		if (!err)
+			goto retry;
+	}
+
+	if (unlikely(err))
+		goto err_out;
+
+	index = page->index - map->m_la / PAGE_SIZE;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_fixup(page, index, true);
+	++spiltted;
+
+	/* also update nr_pages and increase queued_pages */
+	work->nr_pages = max_t(pgoff_t, work->nr_pages, index + 1);
+next_part:
+	/* can be used for verification */
+	map->m_llen = offset + cur - map->m_la;
+
+	if ((end = cur) > 0)
+		goto repeat;
+
+	/* FIXME! avoid the last redundant fixup & endio */
+	z_erofs_onlinepage_endio(page);
+
+	debugln("%s, finish page: %pK spiltted: %u map->m_llen %llu",
+		__func__, page, spiltted, map->m_llen);
+	return 0;
+
+err_out:
+	/* TODO: the missing error handling cases */
+	return err;
+}
+
+static void z_erofs_vle_unzip_kickoff(void *ptr, int bios)
+{
+	tagptr1_t t = tagptr_init(tagptr1_t, ptr);
+	struct z_erofs_vle_unzip_io *io = tagptr_unfold_ptr(t);
+	bool background = tagptr_unfold_tags(t);
+
+	if (atomic_add_return(bios, &io->pending_bios))
+		return;
+
+	if (background)
+		queue_work(z_erofs_workqueue, &io->u.work);
+	else
+		wake_up(&io->u.wait);
+}
+
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0))
+static inline void z_erofs_vle_read_endio(struct bio *bio, int err)
+#else
+static inline void z_erofs_vle_read_endio(struct bio *bio)
+#endif
+{
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 13, 0))
+	const int err = bio->bi_status;
+#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0))
+	const int err = bio->bi_error;
+#endif
+	unsigned i;
+	struct bio_vec *bvec;
+
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+
+		DBG_BUGON(PageUptodate(page));
+		BUG_ON(page->mapping == NULL);
+
+		if (unlikely(err))
+			SetPageError(page);
+	}
+
+	z_erofs_vle_unzip_kickoff(bio->bi_private, -1);
+	bio_put(bio);
+}
+
+static struct page *z_pagemap_global[Z_EROFS_VLE_VMAP_GLOBAL_PAGES];
+static DEFINE_MUTEX(z_pagemap_global_lock);
+
+static int z_erofs_vle_unzip(struct super_block *sb,
+	struct z_erofs_vle_workgroup *grp,
+	struct list_head *page_pool)
+{
+	struct erofs_sb_info *const sbi = EROFS_SB(sb);
+	const unsigned clusterpages = erofs_clusterpages(sbi);
+
+	struct z_erofs_pagevec_ctor ctor;
+	unsigned nr_pages;
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	unsigned sparsemem_pages = 0;
+#endif
+	struct page *pages_onstack[Z_EROFS_VLE_VMAP_ONSTACK_PAGES];
+	struct page **pages, **compressed_pages, *page;
+	unsigned i, llen;
+
+	enum z_erofs_page_type page_type;
+	bool overlapped;
+	struct z_erofs_vle_work *work;
+	void *vout;
+	int err;
+
+	might_sleep();
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	work = z_erofs_vle_grab_primary_work(grp);
+#else
+	BUG();
+#endif
+	BUG_ON(!READ_ONCE(work->nr_pages));
+
+	mutex_lock(&work->lock);
+	nr_pages = work->nr_pages;
+
+	if (likely(nr_pages <= Z_EROFS_VLE_VMAP_ONSTACK_PAGES))
+		pages = pages_onstack;
+	else if (nr_pages <= Z_EROFS_VLE_VMAP_GLOBAL_PAGES &&
+		mutex_trylock(&z_pagemap_global_lock))
+		pages = z_pagemap_global;
+	else {
+repeat:
+		pages = kvmalloc_array(nr_pages,
+			sizeof(struct page *), GFP_KERNEL);
+
+		/* fallback to global pagemap for the lowmem scenario */
+		if (unlikely(pages == NULL)) {
+			if (nr_pages > Z_EROFS_VLE_VMAP_GLOBAL_PAGES)
+				goto repeat;
+			else {
+				mutex_lock(&z_pagemap_global_lock);
+				pages = z_pagemap_global;
+			}
+		}
+	}
+
+	for (i = 0; i < nr_pages; ++i)
+		pages[i] = NULL;
+
+	z_erofs_pagevec_ctor_init(&ctor,
+		Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, 0);
+
+	for (i = 0; i < work->vcnt; ++i) {
+		unsigned pagenr;
+
+		page = z_erofs_pagevec_ctor_dequeue(&ctor, &page_type);
+
+		/* all pages in pagevec ought to be valid */
+		DBG_BUGON(page == NULL);
+		DBG_BUGON(page->mapping == NULL);
+
+		if (z_erofs_gather_if_stagingpage(page_pool, page))
+			continue;
+
+		if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD)
+			pagenr = 0;
+		else
+			pagenr = z_erofs_onlinepage_index(page);
+
+		BUG_ON(pagenr >= nr_pages);
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+		BUG_ON(pages[pagenr] != NULL);
+		++sparsemem_pages;
+#endif
+		pages[pagenr] = page;
+	}
+
+	z_erofs_pagevec_ctor_exit(&ctor, true);
+
+	overlapped = false;
+	compressed_pages = grp->compressed_pages;
+
+	for(i = 0; i < clusterpages; ++i) {
+		unsigned pagenr;
+
+		page = compressed_pages[i];
+
+		/* all compressed pages ought to be valid */
+		DBG_BUGON(page == NULL);
+		DBG_BUGON(page->mapping == NULL);
+
+		if (z_erofs_is_stagingpage(page))
+			continue;
+
+		/* only non-head page could be reused as a compressed page */
+		pagenr = z_erofs_onlinepage_index(page);
+
+		BUG_ON(pagenr >= nr_pages);
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+		BUG_ON(pages[pagenr] != NULL);
+		++sparsemem_pages;
+#endif
+		pages[pagenr] = page;
+
+		overlapped = true;
+	}
+
+	llen = (nr_pages << PAGE_SHIFT) - work->pageofs;
+
+	if (z_erofs_vle_workgrp_fmt(grp) == Z_EROFS_VLE_WORKGRP_FMT_PLAIN) {
+		/* FIXME! this should be fixed in the future */
+		BUG_ON(grp->llen != llen);
+
+		err = z_erofs_vle_plain_copy(compressed_pages, clusterpages,
+			pages, nr_pages, work->pageofs);
+		goto out;
+	}
+
+	if (llen > grp->llen)
+		llen = grp->llen;
+
+	err = z_erofs_vle_unzip_fast_percpu(compressed_pages,
+		clusterpages, pages, llen, work->pageofs,
+		z_erofs_onlinepage_endio);
+	if (err != -ENOTSUPP)
+		goto out_percpu;
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+	if (sparsemem_pages >= nr_pages) {
+		BUG_ON(sparsemem_pages > nr_pages);
+		goto skip_allocpage;
+	}
+#endif
+
+	for (i = 0; i < nr_pages; ++i) {
+		if (pages[i] != NULL)
+			continue;
+
+		pages[i] = __stagingpage_alloc(page_pool, GFP_NOFS);
+	}
+
+#ifndef CONFIG_EROFS_FS_ZIP_MULTIREF
+skip_allocpage:
+#endif
+	vout = erofs_vmap(pages, nr_pages);
+
+	err = z_erofs_vle_unzip_vmap(compressed_pages,
+		clusterpages, vout, llen, work->pageofs, overlapped);
+
+	erofs_vunmap(vout, nr_pages);
+
+out:
+	for (i = 0; i < nr_pages; ++i) {
+		page = pages[i];
+		DBG_BUGON(page->mapping == NULL);
+
+		/* recycle all individual staging pages */
+		if (z_erofs_gather_if_stagingpage(page_pool, page))
+			continue;
+
+		if (unlikely(err < 0))
+			SetPageError(page);
+
+		z_erofs_onlinepage_endio(page);
+	}
+
+out_percpu:
+	for (i = 0; i < clusterpages; ++i) {
+		page = compressed_pages[i];
+
+		/* recycle all individual staging pages */
+		(void)z_erofs_gather_if_stagingpage(page_pool, page);
+
+		WRITE_ONCE(compressed_pages[i], NULL);
+	}
+
+	if (pages == z_pagemap_global)
+		mutex_unlock(&z_pagemap_global_lock);
+	else if (unlikely(pages != pages_onstack))
+		kvfree(pages);
+
+	work->nr_pages = 0;
+	work->vcnt = 0;
+
+	/* all work locks MUST be taken before the following line */
+
+	WRITE_ONCE(grp->next, Z_EROFS_VLE_WORKGRP_NIL);
+
+	/* all work locks SHOULD be released right now */
+	mutex_unlock(&work->lock);
+
+	z_erofs_vle_work_release(work);
+	return err;
+}
+
+static void z_erofs_vle_unzip_all(struct super_block *sb,
+				  struct z_erofs_vle_unzip_io *io,
+				  struct list_head *page_pool)
+{
+	z_erofs_vle_owned_workgrp_t owned = io->head;
+
+	while (owned != Z_EROFS_VLE_WORKGRP_TAIL_CLOSED) {
+		struct z_erofs_vle_workgroup *grp;
+
+		/* 'owned' can never equal Z_EROFS_VLE_WORKGRP_TAIL here */
+		DBG_BUGON(owned == Z_EROFS_VLE_WORKGRP_TAIL);
+
+		/* 'owned' can never equal Z_EROFS_VLE_WORKGRP_NIL (NULL) here */
+		DBG_BUGON(owned == Z_EROFS_VLE_WORKGRP_NIL);
+
+		grp = owned;
+		owned = READ_ONCE(grp->next);
+
+		z_erofs_vle_unzip(sb, grp, page_pool);
+	};
+}
+
+static void z_erofs_vle_unzip_wq(struct work_struct *work)
+{
+	struct z_erofs_vle_unzip_io_sb *iosb = container_of(work,
+		struct z_erofs_vle_unzip_io_sb, io.u.work);
+	LIST_HEAD(page_pool);
+
+	BUG_ON(iosb->io.head == Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
+	z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &page_pool);
+
+	put_pages_list(&page_pool);
+	kvfree(iosb);
+}
+
+static inline struct z_erofs_vle_unzip_io *
+prepare_io_handler(struct super_block *sb,
+		   struct z_erofs_vle_unzip_io *io,
+		   bool background)
+{
+	struct z_erofs_vle_unzip_io_sb *iosb;
+
+	if (!background) {
+		/* waitqueue available for foreground io */
+		BUG_ON(io == NULL);
+
+		init_waitqueue_head(&io->u.wait);
+		atomic_set(&io->pending_bios, 0);
+		goto out;
+	}
+
+	if (io != NULL)
+		BUG();
+	else {
+		/* allocate extra io descriptor for background io */
+		iosb = kvzalloc(sizeof(struct z_erofs_vle_unzip_io_sb),
+			GFP_KERNEL | __GFP_NOFAIL);
+		BUG_ON(iosb == NULL);
+
+		io = &iosb->io;
+	}
+
+	iosb->sb = sb;
+	INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq);
+out:
+	io->head = Z_EROFS_VLE_WORKGRP_TAIL_CLOSED;
+	return io;
+}
+
+#define __FSIO_1 0
+
+static bool z_erofs_vle_submit_all(struct super_block *sb,
+				   z_erofs_vle_owned_workgrp_t owned_head,
+				   struct list_head *pagepool,
+				   struct z_erofs_vle_unzip_io *fg_io,
+				   bool force_fg)
+{
+	struct erofs_sb_info *const sbi = EROFS_SB(sb);
+	const unsigned clusterpages = erofs_clusterpages(sbi);
+	const gfp_t gfp = GFP_NOFS;
+	struct z_erofs_vle_unzip_io *ios[1 + __FSIO_1];
+	struct bio *bio;
+	tagptr1_t bi_private;
+	/* since bio will be NULL, no need to initialize last_index */
+	pgoff_t uninitialized_var(last_index);
+	bool force_submit = false;
+	unsigned nr_bios;
+
+	if (unlikely(owned_head == Z_EROFS_VLE_WORKGRP_TAIL))
+		return false;
+
+	/*
+	 * force_fg == 1, (io, fg_io[0]) no io, (io, fg_io[1]) need submit io
+         * force_fg == 0, (io, fg_io[0]) no io; (io[1], bg_io) need submit io
+	 */
+	if (force_fg) {
+		ios[__FSIO_1] = prepare_io_handler(sb, fg_io + __FSIO_1, false);
+		bi_private = tagptr_fold(tagptr1_t, ios[__FSIO_1], 0);
+	} else {
+		ios[__FSIO_1] = prepare_io_handler(sb, NULL, true);
+		bi_private = tagptr_fold(tagptr1_t, ios[__FSIO_1], 1);
+	}
+
+	nr_bios = 0;
+	force_submit = false;
+	bio = NULL;
+
+	/* by default, all need io submission */
+	ios[__FSIO_1]->head = owned_head;
+
+	do {
+		struct z_erofs_vle_workgroup *grp;
+		struct page **compressed_pages, *oldpage, *page;
+		pgoff_t first_index;
+		unsigned i = 0;
+		int err;
+
+		/* no possible 'owned_head' equals the following */
+		DBG_BUGON(owned_head == Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
+		DBG_BUGON(owned_head == Z_EROFS_VLE_WORKGRP_NIL);
+
+		grp = owned_head;
+
+		/* close the main owned chain at first */
+		owned_head = cmpxchg(&grp->next, Z_EROFS_VLE_WORKGRP_TAIL,
+			Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
+
+		first_index = grp->obj.index;
+		compressed_pages = grp->compressed_pages;
+
+		force_submit |= (first_index != last_index + 1);
+repeat:
+		/* fulfill all compressed pages */
+		oldpage = page = READ_ONCE(compressed_pages[i]);
+
+		if (page != NULL)
+			BUG_ON(PageUptodate(page));
+		else {
+			page = __stagingpage_alloc(pagepool, gfp);
+
+			if (oldpage != cmpxchg(compressed_pages + i,
+				oldpage, page)) {
+				list_add(&page->lru, pagepool);
+				goto repeat;
+			}
+		}
+
+		if (bio != NULL && force_submit) {
+submit_bio_retry:
+			__submit_bio(bio, REQ_OP_READ, 0);
+			bio = NULL;
+		}
+
+		if (bio == NULL) {
+			bio = prepare_bio(sb, first_index + i,
+				BIO_MAX_PAGES, z_erofs_vle_read_endio);
+			bio->bi_private = tagptr_cast_ptr(bi_private);
+
+			++nr_bios;
+		}
+
+		err = bio_add_page(bio, page, PAGE_SIZE, 0);
+		if (err < PAGE_SIZE)
+			goto submit_bio_retry;
+
+		force_submit = false;
+		last_index = first_index + i;
+		if (++i < clusterpages)
+			goto repeat;
+	} while (owned_head != Z_EROFS_VLE_WORKGRP_TAIL);
+
+	if (bio != NULL)
+		__submit_bio(bio, REQ_OP_READ, 0);
+
+	BUG_ON(!nr_bios);
+
+	z_erofs_vle_unzip_kickoff(tagptr_cast_ptr(bi_private), nr_bios);
+	return true;
+}
+
+static void z_erofs_submit_and_unzip(struct z_erofs_vle_frontend *f,
+				     struct list_head *pagepool,
+				     bool force_fg)
+{
+	struct super_block *sb = f->inode->i_sb;
+	struct z_erofs_vle_unzip_io io[1 + __FSIO_1];
+
+	if (!z_erofs_vle_submit_all(sb, f->owned_head, pagepool, io, force_fg))
+		return;
+
+	if (!force_fg)
+		return;
+
+	/* wait until all bios are completed */
+	wait_event(io[__FSIO_1].u.wait,
+		!atomic_read(&io[__FSIO_1].pending_bios));
+
+	/* let's do synchronous decompression */
+	z_erofs_vle_unzip_all(sb, &io[__FSIO_1], pagepool);
+}
+
+static int z_erofs_vle_normalaccess_readpage(struct file *file,
+                                             struct page *page)
+{
+	struct inode *const inode = page->mapping->host;
+	struct z_erofs_vle_frontend f = VLE_FRONTEND_INIT(inode);
+	int err;
+	LIST_HEAD(pagepool);
+
+	err = z_erofs_do_read_page(&f, page, &pagepool);
+	(void)z_erofs_vle_work_iter_end(&f.builder);
+
+	if (err) {
+		errln("%s, failed to read, err [%d]", __func__, err);
+		goto out;
+	}
+
+	z_erofs_submit_and_unzip(&f, &pagepool, true);
+out:
+	if (f.m_iter.mpage != NULL)
+		put_page(f.m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static inline int __z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages, bool sync)
+{
+	struct inode *const inode = mapping->host;
+
+	struct z_erofs_vle_frontend f = VLE_FRONTEND_INIT(inode);
+	gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
+	struct page *head = NULL;
+	LIST_HEAD(pagepool);
+
+	for (; nr_pages; --nr_pages) {
+		struct page *page = lru_to_page(pages);
+
+		prefetchw(&page->flags);
+		list_del(&page->lru);
+
+		if (add_to_page_cache_lru(page, mapping, page->index, gfp)) {
+			list_add(&page->lru, &pagepool);
+			continue;
+		}
+
+		BUG_ON(PagePrivate(page));
+		set_page_private(page, (unsigned long)head);
+		head = page;
+	}
+
+	while (head != NULL) {
+		struct page *page = head;
+		int err;
+
+		/* traversal in reverse order */
+		head = (void *)page_private(page);
+
+		err = z_erofs_do_read_page(&f, page, &pagepool);
+		if (err) {
+			struct erofs_vnode *vi = EROFS_V(inode);
+
+			errln("%s, readahead error at page %lu of nid %llu",
+				__func__, page->index, vi->nid);
+		}
+
+		put_page(page);
+	}
+
+	(void)z_erofs_vle_work_iter_end(&f.builder);
+
+	z_erofs_submit_and_unzip(&f, &pagepool, sync);
+
+	if (f.m_iter.mpage != NULL)
+		put_page(f.m_iter.mpage);
+
+	/* clean up the remaining free pages */
+	put_pages_list(&pagepool);
+	return 0;
+}
+
+static int z_erofs_vle_normalaccess_readpages(
+	struct file *filp,
+	struct address_space *mapping,
+	struct list_head *pages, unsigned nr_pages)
+{
+	return __z_erofs_vle_normalaccess_readpages(filp,
+		mapping, pages, nr_pages,
+		nr_pages < 4 /* sync */);
+}
+
+const struct address_space_operations z_erofs_vle_normalaccess_aops = {
+	.readpage = z_erofs_vle_normalaccess_readpage,
+	.readpages = z_erofs_vle_normalaccess_readpages,
+};
 
 #define __vle_cluster_advise(x, bit, bits) \
 	((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1))
diff --git a/drivers/staging/erofs/unzip_vle.h b/drivers/staging/erofs/unzip_vle.h
index b34f5bc..2388a09 100644
--- a/drivers/staging/erofs/unzip_vle.h
+++ b/drivers/staging/erofs/unzip_vle.h
@@ -14,9 +14,213 @@
 #define __EROFS_FS_UNZIP_VLE_H
 
 #include "internal.h"
+#include "unzip_pagevec.h"
+
+/*
+ *  - 0x5A110C8D ('sallocated', Z_EROFS_MAPPING_STAGING) -
+ * marks temporarily allocated pages (via erofs_allocpage), separating them
+ * from NULL mapping (eg. truncated pages); must fit in a 32-bit pointer
+ */
+#define Z_EROFS_MAPPING_STAGING		((void *)0x5A110C8D)
+
+#define z_erofs_is_stagingpage(page)	\
+	((page)->mapping == Z_EROFS_MAPPING_STAGING)
+
+static inline bool z_erofs_gather_if_stagingpage(struct list_head *page_pool,
+						 struct page *page)
+{
+	if (z_erofs_is_stagingpage(page)) {
+		list_add(&page->lru, page_pool);
+		return true;
+	}
+	return false;
+}
+
+/*
+ * Structure fields follow one of the following exclusion rules.
+ *
+ * I: Modifiable by initialization/destruction paths and read-only
+ *    for everyone else.
+ *
+ */
 
 #define Z_EROFS_VLE_INLINE_PAGEVECS     3
 
+struct z_erofs_vle_work {
+	/* struct z_erofs_vle_work *left, *right; */
+
+#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
+	struct list_head list;
+
+	atomic_t refcount;
+#endif
+	struct mutex lock;
+
+	/* I: decompression offset in page */
+	unsigned short pageofs;
+	unsigned short nr_pages;
+
+	/* L: queued pages in pagevec[] */
+	unsigned vcnt;
+
+	union {
+		/* L: pagevec */
+		erofs_vtptr_t pagevec[Z_EROFS_VLE_INLINE_PAGEVECS];
+		struct rcu_head rcu;
+	};
+};
+
+#define Z_EROFS_VLE_WORKGRP_FMT_PLAIN        0
+#define Z_EROFS_VLE_WORKGRP_FMT_LZ4          1
+#define Z_EROFS_VLE_WORKGRP_FMT_MASK         1
+
+typedef struct z_erofs_vle_workgroup *z_erofs_vle_owned_workgrp_t;
+
+struct z_erofs_vle_workgroup {
+	struct erofs_workgroup obj;
+	struct z_erofs_vle_work work;
+
+	/* next owned workgroup */
+	z_erofs_vle_owned_workgrp_t next;
+
+	/* compressed pages (including multi-usage pages) */
+	struct page *compressed_pages[Z_EROFS_CLUSTER_MAX_PAGES];
+	unsigned int llen, flags;
+};
+
+/* let's avoid the valid 32-bit kernel addresses */
+
+/* the chained workgroup hasn't submitted io (still open) */
+#define Z_EROFS_VLE_WORKGRP_TAIL        ((void *)0x5F0ECAFE)
+/* the chained workgroup has already submitted io */
+#define Z_EROFS_VLE_WORKGRP_TAIL_CLOSED ((void *)0x5F0EDEAD)
+
+#define Z_EROFS_VLE_WORKGRP_NIL         (NULL)
+
+#define z_erofs_vle_workgrp_fmt(grp)	\
+	((grp)->flags & Z_EROFS_VLE_WORKGRP_FMT_MASK)
+
+static inline void z_erofs_vle_set_workgrp_fmt(
+	struct z_erofs_vle_workgroup *grp,
+	unsigned int fmt)
+{
+	grp->flags = fmt | (grp->flags & ~Z_EROFS_VLE_WORKGRP_FMT_MASK);
+}
+
+#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
+#error multiref decompression is unimplemented yet
+#else
+
+#define z_erofs_vle_grab_primary_work(grp)	(&(grp)->work)
+#define z_erofs_vle_grab_work(grp, pageofs)	(&(grp)->work)
+#define z_erofs_vle_work_workgroup(wrk, primary)	\
+	((primary) ? container_of(wrk,	\
+		struct z_erofs_vle_workgroup, work) : \
+		({ BUG(); (void *)NULL; }))
+
+#endif
+
+#define Z_EROFS_WORKGROUP_SIZE       sizeof(struct z_erofs_vle_workgroup)
+
+struct z_erofs_vle_unzip_io {
+	atomic_t pending_bios;
+	z_erofs_vle_owned_workgrp_t head;
+
+	union {
+		wait_queue_head_t wait;
+		struct work_struct work;
+	} u;
+};
+
+struct z_erofs_vle_unzip_io_sb {
+	struct z_erofs_vle_unzip_io io;
+	struct super_block *sb;
+};
+
+#define Z_EROFS_ONLINEPAGE_COUNT_BITS   2
+#define Z_EROFS_ONLINEPAGE_COUNT_MASK   ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
+#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT  (Z_EROFS_ONLINEPAGE_COUNT_BITS)
+
+/*
+ * waiters (aka. ongoing_packs): # to unlock the page
+ * sub-index: 0 - for partial page, >= 1 full page sub-index
+ */
+typedef atomic_t z_erofs_onlinepage_t;
+
+/* type punning */
+union z_erofs_onlinepage_converter {
+	z_erofs_onlinepage_t *o;
+	unsigned long *v;
+};
+
+static inline unsigned z_erofs_onlinepage_index(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+}
+
+static inline void z_erofs_onlinepage_init(struct page *page)
+{
+	union {
+		z_erofs_onlinepage_t o;
+		unsigned long v;
+	/* keep from being unlocked in advance */
+	} u = { .o = ATOMIC_INIT(1) };
+
+	set_page_private(page, u.v);
+	smp_wmb();
+	SetPagePrivate(page);
+}
+
+static inline void z_erofs_onlinepage_fixup(struct page *page,
+	uintptr_t index, bool down)
+{
+	unsigned long *p, o, v, id;
+repeat:
+	p = &page_private(page);
+	o = READ_ONCE(*p);
+
+	id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+	if (id) {
+		if (!index)
+			return;
+
+		BUG_ON(id != index);
+	}
+
+	v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
+		((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned)down);
+	if (cmpxchg(p, o, v) != o)
+		goto repeat;
+}
+
+static inline void z_erofs_onlinepage_endio(struct page *page)
+{
+	union z_erofs_onlinepage_converter u;
+	unsigned v;
+
+	BUG_ON(!PagePrivate(page));
+	u.v = &page_private(page);
+
+	v = atomic_dec_return(u.o);
+	if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) {
+		ClearPagePrivate(page);
+		if (!PageError(page))
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+
+	debugln("%s, page %p value %x", __func__, page, atomic_read(u.o));
+}
+
+#define Z_EROFS_VLE_VMAP_ONSTACK_PAGES	\
+	min(THREAD_SIZE / 8 / sizeof(struct page *), 96UL)
+#define Z_EROFS_VLE_VMAP_GLOBAL_PAGES	2048
+
 /* unzip_vle_lz4.c */
 extern int z_erofs_vle_plain_copy(struct page **compressed_pages,
 	unsigned clusterpages, struct page **pages,
diff --git a/drivers/staging/erofs/utils.c b/drivers/staging/erofs/utils.c
index 035cbd7..df4aadd 100644
--- a/drivers/staging/erofs/utils.c
+++ b/drivers/staging/erofs/utils.c
@@ -12,6 +12,7 @@
  */
 
 #include "internal.h"
+#include <linux/pagevec.h>
 
 struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 {
@@ -98,11 +99,69 @@ int erofs_register_workgroup(struct super_block *sb,
 	return err;
 }
 
+extern void erofs_workgroup_free_rcu(struct erofs_workgroup *grp);
+
+int erofs_workgroup_put(struct erofs_workgroup *grp)
+{
+	int count = atomic_dec_return(&grp->refcount);
+
+	if (count == 1)
+		atomic_long_inc(&erofs_global_shrink_cnt);
+	else if (!count) {
+		atomic_long_dec(&erofs_global_shrink_cnt);
+		erofs_workgroup_free_rcu(grp);
+	}
+	return count;
+}
+
 unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
 				       unsigned long nr_shrink,
 				       bool cleanup)
 {
-	return 0;
+	pgoff_t first_index = 0;
+	void *batch[PAGEVEC_SIZE];
+	unsigned freed = 0;
+
+	int i, found;
+repeat:
+	erofs_workstn_lock(sbi);
+
+	found = radix_tree_gang_lookup(&sbi->workstn.tree,
+		batch, first_index, PAGEVEC_SIZE);
+
+	for (i = 0; i < found; ++i) {
+		int cnt;
+		struct erofs_workgroup *grp = (void *)
+			((unsigned long)batch[i] &
+				~RADIX_TREE_EXCEPTIONAL_ENTRY);
+
+		first_index = grp->index + 1;
+
+		cnt = atomic_read(&grp->refcount);
+		BUG_ON(cnt <= 0);
+
+		if (cleanup)
+			BUG_ON(cnt != 1);
+
+		else if (cnt > 1)
+			continue;
+
+		if (radix_tree_delete(&sbi->workstn.tree,
+			grp->index) != grp)
+			continue;
+
+		/* (rarely) grabbed again when freeing */
+		erofs_workgroup_put(grp);
+
+		++freed;
+		if (unlikely(!--nr_shrink))
+			break;
+	}
+	erofs_workstn_unlock(sbi);
+
+	if (i && nr_shrink)
+		goto repeat;
+	return freed;
 }
 
 #endif
-- 
1.9.1

^ permalink raw reply related

* [RFC PATCH 23/25] staging: erofs: introduce workstation for decompression
From: Gao Xiang @ 2018-07-24  2:36 UTC (permalink / raw)

In-Reply-To: <1532399805-65674-1-git-send-email-gaoxiang25@huawei.com>

This patch introduces another concept used by the unzip
subsystem called 'workstation'. It can be seen as a sparse
array that stores pointers to the data structures
related to the corresponding physical blocks.

All lookup cases are protected by RCU read lock. Besides,
reference count and spin_lock are also introduced to
manage its lifetime and serialize all update operations.

'workstation' is currently implemented on top of the in-kernel
radix tree for backward compatibility.
As the Linux kernel evolves, it could be migrated
to an XArray implementation in the future.

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 drivers/staging/erofs/internal.h | 103 +++++++++++++++++++++++++++++++++++++++
 drivers/staging/erofs/super.c    |  12 +++++
 drivers/staging/erofs/utils.c    |  81 ++++++++++++++++++++++++++++--
 3 files changed, 193 insertions(+), 3 deletions(-)

diff --git a/drivers/staging/erofs/internal.h b/drivers/staging/erofs/internal.h
index 12a5e4d3..9c25ffa 100644
--- a/drivers/staging/erofs/internal.h
+++ b/drivers/staging/erofs/internal.h
@@ -81,6 +81,14 @@ struct erofs_sb_info {
 #ifdef CONFIG_EROFS_FS_ZIP
 	/* cluster size in bit shift */
 	unsigned char clusterbits;
+
+	/* the dedicated workstation for compression */
+	struct {
+		struct radix_tree_root tree;
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+		spinlock_t lock;
+#endif
+	} workstn;
 #endif
 
 	u32 build_time_nsec;
@@ -151,6 +159,101 @@ static inline void *erofs_kmalloc(struct erofs_sb_info *sbi,
 #define set_opt(sbi, option)	((sbi)->mount_opt |= EROFS_MOUNT_##option)
 #define test_opt(sbi, option)	((sbi)->mount_opt & EROFS_MOUNT_##option)
 
+#ifdef CONFIG_EROFS_FS_ZIP
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+#define erofs_workstn_lock(sbi)         spin_lock(&(sbi)->workstn.lock)
+#define erofs_workstn_unlock(sbi)       spin_unlock(&(sbi)->workstn.lock)
+#else
+#define erofs_workstn_lock(sbi)         xa_lock(&(sbi)->workstn.tree)
+#define erofs_workstn_unlock(sbi)       xa_unlock(&(sbi)->workstn.tree)
+#endif
+
+/* basic unit of the workstation of a super_block */
+struct erofs_workgroup {
+	/* the workgroup index in the workstation */
+	pgoff_t index;
+
+	/* overall workgroup reference count */
+	atomic_t refcount;
+};
+
+#define EROFS_LOCKED_MAGIC     (INT_MIN | 0xE0F510CCL)
+
+static inline bool erofs_workgroup_try_to_freeze(
+	struct erofs_workgroup *grp, int v)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+	if (v != atomic_cmpxchg(&grp->refcount,
+		v, EROFS_LOCKED_MAGIC))
+		return false;
+	preempt_disable();
+#else
+	preempt_disable();
+	if (atomic_read(&grp->refcount) != v) {
+		preempt_enable();
+		return false;
+	}
+#endif
+	return true;
+}
+
+static inline void erofs_workgroup_unfreeze(
+	struct erofs_workgroup *grp, int v)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+	atomic_set(&grp->refcount, v);
+#endif
+	preempt_enable();
+}
+
+/* Take a reference on grp: 0 with the old count in *ocnt, -1 if count <= 0. */
+static inline int erofs_workgroup_get(struct erofs_workgroup *grp, int *ocnt)
+{
+	const int locked = (int)EROFS_LOCKED_MAGIC;
+	int o;
+
+repeat:
+	o = atomic_read(&grp->refcount);
+
+	/* spin while the reclaim path holds it temporarily locked */
+	if (unlikely(o == locked)) {
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+		do
+			cpu_relax();
+		while (atomic_read(&grp->refcount) == locked);
+#endif
+		goto repeat;
+	}
+
+	if (unlikely(o <= 0))
+		return -1;	/* no live reference left to take */
+
+	/* lost a race with another updater: re-read the count and retry */
+	if (unlikely(atomic_cmpxchg(&grp->refcount, o, o + 1) != o))
+		goto repeat;
+	*ocnt = o;
+	return 0;
+}
+
+#define __erofs_workgroup_get(grp)	atomic_inc(&(grp)->refcount)
+
+extern int erofs_workgroup_put(struct erofs_workgroup *grp);
+
+extern struct erofs_workgroup *erofs_find_workgroup(
+	struct super_block *sb, pgoff_t index, bool *tag);
+
+extern int erofs_register_workgroup(struct super_block *sb,
+	struct erofs_workgroup *grp, bool tag);
+
+extern unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
+	unsigned long nr_shrink, bool cleanup);
+
+static inline void erofs_workstation_cleanup_all(struct super_block *sb)
+{
+	erofs_shrink_workstation(EROFS_SB(sb), ~0UL, true);
+}
+
+#endif
+
 /* we strictly follow PAGE_SIZE and no buffer head yet */
 #define LOG_BLOCK_SIZE		PAGE_SHIFT
 
diff --git a/drivers/staging/erofs/super.c b/drivers/staging/erofs/super.c
index 2e2a1f5..1d4bcaa 100644
--- a/drivers/staging/erofs/super.c
+++ b/drivers/staging/erofs/super.c
@@ -304,6 +304,13 @@ static int erofs_read_super(struct super_block *sb,
 	if (!silent)
 		infoln("root inode @ nid %llu", ROOT_NID(sbi));
 
+#ifdef CONFIG_EROFS_FS_ZIP
+	INIT_RADIX_TREE(&sbi->workstn.tree, GFP_ATOMIC);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 17, 0))
+	spin_lock_init(&sbi->workstn.lock);
+#endif
+#endif
+
 	/* get the root inode */
 	inode = erofs_iget(sb, ROOT_NID(sbi), true);
 	if (IS_ERR(inode)) {
@@ -384,6 +391,11 @@ static void erofs_put_super(struct super_block *sb)
 	__putname(sbi->dev_name);
 
 	mutex_lock(&sbi->umount_mutex);
+
+#ifdef CONFIG_EROFS_FS_ZIP
+	erofs_workstation_cleanup_all(sb);
+#endif
+
 	erofs_unregister_super(sb);
 	mutex_unlock(&sbi->umount_mutex);
 
diff --git a/drivers/staging/erofs/utils.c b/drivers/staging/erofs/utils.c
index c1d83ce..035cbd7 100644
--- a/drivers/staging/erofs/utils.c
+++ b/drivers/staging/erofs/utils.c
@@ -29,6 +29,83 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 	return page;
 }
 
+/* global shrink count (for all mounted EROFS instances) */
+static atomic_long_t erofs_global_shrink_cnt;
+
+#ifdef CONFIG_EROFS_FS_ZIP
+
+/* radix_tree and the future XArray both don't use tagptr_t yet */
+struct erofs_workgroup *erofs_find_workgroup(
+	struct super_block *sb, pgoff_t index, bool *tag)
+{
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+	struct erofs_workgroup *grp;
+	int oldcount;
+
+repeat:
+	rcu_read_lock();
+	grp = radix_tree_lookup(&sbi->workstn.tree, index);
+	if (grp != NULL) {
+		*tag = radix_tree_exceptional_entry(grp);
+		grp = (void *)((unsigned long)grp &
+			~RADIX_TREE_EXCEPTIONAL_ENTRY);
+
+		if (erofs_workgroup_get(grp, &oldcount)) {
+			/* prefer to relax rcu read side */
+			rcu_read_unlock();
+			goto repeat;
+		}
+
+		/* undo the shrink count added by erofs_workgroup_put */
+		if (unlikely(oldcount == 1))
+			atomic_long_dec(&erofs_global_shrink_cnt);
+		BUG_ON(index != grp->index);
+	}
+	rcu_read_unlock();
+	return grp;
+}
+
+int erofs_register_workgroup(struct super_block *sb,
+			     struct erofs_workgroup *grp,
+			     bool tag)
+{
+	struct erofs_sb_info *sbi;
+	int err;
+
+	/* grp->refcount should not < 1 */
+	BUG_ON(!atomic_read(&grp->refcount));
+
+	err = radix_tree_preload(GFP_NOFS);
+	if (err)
+		return err;
+
+	sbi = EROFS_SB(sb);
+	erofs_workstn_lock(sbi);
+
+	if (tag)
+		grp = (void *)((unsigned long)grp |
+			1UL << RADIX_TREE_EXCEPTIONAL_SHIFT);
+
+	err = radix_tree_insert(&sbi->workstn.tree,
+		grp->index, grp);
+
+	if (!err) {
+		__erofs_workgroup_get(grp);
+	}
+
+	erofs_workstn_unlock(sbi);
+	radix_tree_preload_end();
+	return err;
+}
+
+unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
+				       unsigned long nr_shrink,
+				       bool cleanup)
+{
+	return 0;
+}
+
+#endif
 
 /* protected by 'erofs_sb_list_lock' */
 static unsigned int shrinker_run_no;
@@ -37,9 +114,6 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 static DEFINE_SPINLOCK(erofs_sb_list_lock);
 static LIST_HEAD(erofs_sb_list);
 
-/* global shrink count (for all mounted EROFS instances) */
-static atomic_long_t erofs_global_shrink_cnt;
-
 void erofs_register_super(struct super_block *sb)
 {
 	struct erofs_sb_info *sbi = EROFS_SB(sb);
@@ -112,6 +186,7 @@ unsigned long erofs_shrink_scan(struct shrinker *shrink,
 		list_move_tail(&sbi->list, &erofs_sb_list);
 		mutex_unlock(&sbi->umount_mutex);
 
+		freed += erofs_shrink_workstation(sbi, nr, false);
 		if (freed >= nr)
 			break;
 	}
-- 
1.9.1

^ permalink raw reply related

* [RFC PATCH 22/25] staging: erofs: introduce erofs shrinker
From: Gao Xiang @ 2018-07-24  2:36 UTC (permalink / raw)

In-Reply-To: <1532399805-65674-1-git-send-email-gaoxiang25@huawei.com>

This patch adds a dedicated shrinker that aims to free unneeded
memory consumed by a number of erofs in-memory data structures.

Like F2FS and UBIFS, it also adds:
  - sbi->umount_mutex to avoid races between the shrinker and put_super
  - sbi->shrinker_run_no to avoid revisiting recently scanned objects

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 drivers/staging/erofs/internal.h |  7 ++++
 drivers/staging/erofs/super.c    | 15 +++++++
 drivers/staging/erofs/utils.c    | 85 +++++++++++++++++++++++++++++++++++++---
 3 files changed, 101 insertions(+), 6 deletions(-)

diff --git a/drivers/staging/erofs/internal.h b/drivers/staging/erofs/internal.h
index b7eead6..12a5e4d3 100644
--- a/drivers/staging/erofs/internal.h
+++ b/drivers/staging/erofs/internal.h
@@ -68,6 +68,7 @@ struct erofs_fault_info {
 struct erofs_sb_info {
 	/* list for all registered superblocks, mainly for shrinker */
 	struct list_head list;
+	struct mutex umount_mutex;
 
 	u32 blocks;
 	u32 meta_blkaddr;
@@ -95,6 +96,7 @@ struct erofs_sb_info {
 	char *dev_name;
 
 	unsigned int mount_opt;
+	unsigned int shrinker_run_no;
 
 #ifdef CONFIG_EROFS_FAULT_INJECTION
 	struct erofs_fault_info fault_info;	/* For fault injection */
@@ -433,5 +435,10 @@ static inline void erofs_vunmap(const void *mem, unsigned int count)
 extern void erofs_register_super(struct super_block *sb);
 extern void erofs_unregister_super(struct super_block *sb);
 
+extern unsigned long erofs_shrink_count(struct shrinker *shrink,
+	struct shrink_control *sc);
+extern unsigned long erofs_shrink_scan(struct shrinker *shrink,
+	struct shrink_control *sc);
+
 #endif
 
diff --git a/drivers/staging/erofs/super.c b/drivers/staging/erofs/super.c
index 52d2453..2e2a1f5 100644
--- a/drivers/staging/erofs/super.c
+++ b/drivers/staging/erofs/super.c
@@ -383,7 +383,9 @@ static void erofs_put_super(struct super_block *sb)
 	infoln("unmounted for %s", sbi->dev_name);
 	__putname(sbi->dev_name);
 
+	mutex_lock(&sbi->umount_mutex);
 	erofs_unregister_super(sb);
+	mutex_unlock(&sbi->umount_mutex);
 
 	kfree(sbi);
 	sb->s_fs_info = NULL;
@@ -423,6 +425,12 @@ static void erofs_kill_sb(struct super_block *sb)
 	kill_block_super(sb);
 }
 
+static struct shrinker erofs_shrinker_info = {
+	.scan_objects = erofs_shrink_scan,
+	.count_objects = erofs_shrink_count,
+	.seeks = DEFAULT_SEEKS,
+};
+
 static struct file_system_type erofs_fs_type = {
 	.owner          = THIS_MODULE,
 	.name           = "erofs",
@@ -443,6 +451,10 @@ int __init erofs_module_init(void)
 	if (err)
 		goto icache_err;
 
+	err = register_shrinker(&erofs_shrinker_info);
+	if (err)
+		goto shrinker_err;
+
 	err = register_filesystem(&erofs_fs_type);
 	if (err)
 		goto fs_err;
@@ -451,6 +463,8 @@ int __init erofs_module_init(void)
 	return 0;
 
 fs_err:
+	unregister_shrinker(&erofs_shrinker_info);
+shrinker_err:
 	erofs_exit_inode_cache();
 icache_err:
 	return err;
@@ -459,6 +473,7 @@ int __init erofs_module_init(void)
 void __exit erofs_module_exit(void)
 {
 	unregister_filesystem(&erofs_fs_type);
+	unregister_shrinker(&erofs_shrinker_info);
 	erofs_exit_inode_cache();
 	infoln("successfully finalize erofs");
 }
diff --git a/drivers/staging/erofs/utils.c b/drivers/staging/erofs/utils.c
index 6748def..c1d83ce 100644
--- a/drivers/staging/erofs/utils.c
+++ b/drivers/staging/erofs/utils.c
@@ -29,20 +29,93 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 	return page;
 }
 
-static DEFINE_MUTEX(erofs_sb_list_lock);
+
+/* protected by 'erofs_sb_list_lock' */
+static unsigned int shrinker_run_no;
+
+/* protects the mounted 'erofs_sb_list' */
+static DEFINE_SPINLOCK(erofs_sb_list_lock);
 static LIST_HEAD(erofs_sb_list);
 
+/* global shrink count (for all mounted EROFS instances) */
+static atomic_long_t erofs_global_shrink_cnt;
+
 void erofs_register_super(struct super_block *sb)
 {
-	mutex_lock(&erofs_sb_list_lock);
-	list_add(&EROFS_SB(sb)->list, &erofs_sb_list);
-	mutex_unlock(&erofs_sb_list_lock);
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+	mutex_init(&sbi->umount_mutex);
+
+	spin_lock(&erofs_sb_list_lock);
+	list_add(&sbi->list, &erofs_sb_list);
+	spin_unlock(&erofs_sb_list_lock);
 }
 
 void erofs_unregister_super(struct super_block *sb)
 {
-	mutex_lock(&erofs_sb_list_lock);
+	spin_lock(&erofs_sb_list_lock);
 	list_del(&EROFS_SB(sb)->list);
-	mutex_unlock(&erofs_sb_list_lock);
+	spin_unlock(&erofs_sb_list_lock);
+}
+
+unsigned long erofs_shrink_count(struct shrinker *shrink,
+				 struct shrink_control *sc)
+{
+	return atomic_long_read(&erofs_global_shrink_cnt);
+}
+
+unsigned long erofs_shrink_scan(struct shrinker *shrink,
+				struct shrink_control *sc)
+{
+	struct erofs_sb_info *sbi;
+	struct list_head *p;
+
+	unsigned long nr = sc->nr_to_scan;
+	unsigned int run_no;
+	unsigned long freed = 0;
+
+	spin_lock(&erofs_sb_list_lock);
+	do
+		run_no = ++shrinker_run_no;
+	while (run_no == 0);
+
+	/* Iterate over all mounted superblocks and try to shrink them */
+	p = erofs_sb_list.next;
+	while (p != &erofs_sb_list) {
+		sbi = list_entry(p, struct erofs_sb_info, list);
+
+		/*
+		 * We move the ones we do to the end of the list, so we stop
+		 * when we see one we have already done.
+		 */
+		if (sbi->shrinker_run_no == run_no)
+			break;
+
+		if (!mutex_trylock(&sbi->umount_mutex)) {
+			p = p->next;
+			continue;
+		}
+
+		spin_unlock(&erofs_sb_list_lock);
+		sbi->shrinker_run_no = run_no;
+
+		/* add scan handlers here */
+
+		spin_lock(&erofs_sb_list_lock);
+		/* Get the next list element before we move this one */
+		p = p->next;
+
+		/*
+		 * Move this one to the end of the list to provide some
+		 * fairness.
+		 */
+		list_move_tail(&sbi->list, &erofs_sb_list);
+		mutex_unlock(&sbi->umount_mutex);
+
+		if (freed >= nr)
+			break;
+	}
+	spin_unlock(&erofs_sb_list_lock);
+	return freed;
 }
 
-- 
1.9.1

^ permalink raw reply related

* [RFC PATCH 21/25] staging: erofs: introduce superblock registration
From: Gao Xiang @ 2018-07-24  2:36 UTC (permalink / raw)

In-Reply-To: <1532399805-65674-1-git-send-email-gaoxiang25@huawei.com>

In order to introduce a shrinker solution for erofs,
let's manage all mounted erofs instances first.

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 drivers/staging/erofs/internal.h |  6 ++++++
 drivers/staging/erofs/super.c    |  4 ++++
 drivers/staging/erofs/utils.c    | 17 +++++++++++++++++
 3 files changed, 27 insertions(+)

diff --git a/drivers/staging/erofs/internal.h b/drivers/staging/erofs/internal.h
index dd8f225..b7eead6 100644
--- a/drivers/staging/erofs/internal.h
+++ b/drivers/staging/erofs/internal.h
@@ -66,6 +66,9 @@ struct erofs_fault_info {
 typedef u64 erofs_nid_t;
 
 struct erofs_sb_info {
+	/* list for all registered superblocks, mainly for shrinker */
+	struct list_head list;
+
 	u32 blocks;
 	u32 meta_blkaddr;
 #ifdef CONFIG_EROFS_FS_XATTR
@@ -427,5 +430,8 @@ static inline void erofs_vunmap(const void *mem, unsigned int count)
 /* utils.c */
 extern struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
 
+extern void erofs_register_super(struct super_block *sb);
+extern void erofs_unregister_super(struct super_block *sb);
+
 #endif
 
diff --git a/drivers/staging/erofs/super.c b/drivers/staging/erofs/super.c
index 9561bca..52d2453 100644
--- a/drivers/staging/erofs/super.c
+++ b/drivers/staging/erofs/super.c
@@ -334,6 +334,8 @@ static int erofs_read_super(struct super_block *sb,
 	snprintf(sbi->dev_name, PATH_MAX, "%s", dev_name);
 	sbi->dev_name[PATH_MAX - 1] = '\0';
 
+	erofs_register_super(sb);
+
 	/*
 	 * We already have a positive dentry, which was instantiated
 	 * by d_make_root. Just need to d_rehash it.
@@ -381,6 +383,8 @@ static void erofs_put_super(struct super_block *sb)
 	infoln("unmounted for %s", sbi->dev_name);
 	__putname(sbi->dev_name);
 
+	erofs_unregister_super(sb);
+
 	kfree(sbi);
 	sb->s_fs_info = NULL;
 }
diff --git a/drivers/staging/erofs/utils.c b/drivers/staging/erofs/utils.c
index 3dec4f8..6748def 100644
--- a/drivers/staging/erofs/utils.c
+++ b/drivers/staging/erofs/utils.c
@@ -29,3 +29,20 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 	return page;
 }
 
+static DEFINE_MUTEX(erofs_sb_list_lock);
+static LIST_HEAD(erofs_sb_list);
+
+void erofs_register_super(struct super_block *sb)
+{
+	mutex_lock(&erofs_sb_list_lock);
+	list_add(&EROFS_SB(sb)->list, &erofs_sb_list);
+	mutex_unlock(&erofs_sb_list_lock);
+}
+
+void erofs_unregister_super(struct super_block *sb)
+{
+	mutex_lock(&erofs_sb_list_lock);
+	list_del(&EROFS_SB(sb)->list);
+	mutex_unlock(&erofs_sb_list_lock);
+}
+
-- 
1.9.1

^ permalink raw reply related

* [RFC PATCH 20/25] staging: erofs: add a generic z_erofs VLE decompressor
From: Gao Xiang @ 2018-07-24  2:36 UTC (permalink / raw)

In-Reply-To: <1532399805-65674-1-git-send-email-gaoxiang25@huawei.com>

Currently, this patch only implements an LZ4
decompressor because of its development priority.

In the future, erofs will support more compression
algorithms and formats than LZ4, so a generic
decompressor interface will be needed.

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 drivers/staging/erofs/Kconfig         |  14 +++
 drivers/staging/erofs/Makefile        |   2 +-
 drivers/staging/erofs/internal.h      |   5 +
 drivers/staging/erofs/unzip_vle.h     |  35 ++++++
 drivers/staging/erofs/unzip_vle_lz4.c | 209 ++++++++++++++++++++++++++++++++++
 5 files changed, 264 insertions(+), 1 deletion(-)
 create mode 100644 drivers/staging/erofs/unzip_vle.h
 create mode 100644 drivers/staging/erofs/unzip_vle_lz4.c

diff --git a/drivers/staging/erofs/Kconfig b/drivers/staging/erofs/Kconfig
index 63bec70..b55ce1c 100644
--- a/drivers/staging/erofs/Kconfig
+++ b/drivers/staging/erofs/Kconfig
@@ -87,3 +87,17 @@ config EROFS_FS_ZIP
 
 	  If you don't want to use compression feature, say N.
 
+config EROFS_FS_CLUSTER_PAGE_LIMIT
+	int "EROFS Cluster Pages Hard Limit"
+	depends on EROFS_FS_ZIP
+	range 1 256
+	default "1"
+	help
+	  Indicates VLE compressed pages hard limit of a
+	  compressed cluster.
+
+	  For example, if files of a image are compressed
+	  into 8k-unit, the hard limit should not be less
+	  than 2. Otherwise, the image cannot be mounted
+	  correctly on this kernel.
+
diff --git a/drivers/staging/erofs/Makefile b/drivers/staging/erofs/Makefile
index e409637..9a766eb 100644
--- a/drivers/staging/erofs/Makefile
+++ b/drivers/staging/erofs/Makefile
@@ -9,5 +9,5 @@ obj-$(CONFIG_EROFS_FS) += erofs.o
 ccflags-y += -I$(src)/include
 erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
-erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o unzip_lz4.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o unzip_lz4.o unzip_vle_lz4.o
 
diff --git a/drivers/staging/erofs/internal.h b/drivers/staging/erofs/internal.h
index 88ebd39..dd8f225 100644
--- a/drivers/staging/erofs/internal.h
+++ b/drivers/staging/erofs/internal.h
@@ -163,6 +163,11 @@ static inline void *erofs_kmalloc(struct erofs_sb_info *sbi,
 
 #define ROOT_NID(sb)		((sb)->root_nid)
 
+#ifdef CONFIG_EROFS_FS_ZIP
+/* hard limit of pages per compressed cluster */
+#define Z_EROFS_CLUSTER_MAX_PAGES       (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
+#endif
+
 typedef u64 erofs_off_t;
 
 /* data type for filesystem-wide blocks number */
diff --git a/drivers/staging/erofs/unzip_vle.h b/drivers/staging/erofs/unzip_vle.h
new file mode 100644
index 0000000..b34f5bc
--- /dev/null
+++ b/drivers/staging/erofs/unzip_vle.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * linux/drivers/staging/erofs/unzip_vle.h
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#ifndef __EROFS_FS_UNZIP_VLE_H
+#define __EROFS_FS_UNZIP_VLE_H
+
+#include "internal.h"
+
+#define Z_EROFS_VLE_INLINE_PAGEVECS     3
+
+/* unzip_vle_lz4.c */
+extern int z_erofs_vle_plain_copy(struct page **compressed_pages,
+	unsigned clusterpages, struct page **pages,
+	unsigned nr_pages, unsigned short pageofs);
+
+extern int z_erofs_vle_unzip_fast_percpu(struct page **compressed_pages,
+	unsigned clusterpages, struct page **pages,
+	unsigned outlen, unsigned short pageofs,
+	void (*endio)(struct page *));
+
+extern int z_erofs_vle_unzip_vmap(struct page **compressed_pages,
+	unsigned clusterpages, void *vaddr, unsigned llen,
+	unsigned short pageofs, bool overlapped);
+
+#endif
+
diff --git a/drivers/staging/erofs/unzip_vle_lz4.c b/drivers/staging/erofs/unzip_vle_lz4.c
new file mode 100644
index 0000000..0ed158b
--- /dev/null
+++ b/drivers/staging/erofs/unzip_vle_lz4.c
@@ -0,0 +1,209 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/drivers/staging/erofs/unzip_vle_lz4.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#include "unzip_vle.h"
+
+#if Z_EROFS_CLUSTER_MAX_PAGES > Z_EROFS_VLE_INLINE_PAGEVECS
+#define EROFS_PERCPU_NR_PAGES   Z_EROFS_CLUSTER_MAX_PAGES
+#else
+#define EROFS_PERCPU_NR_PAGES   Z_EROFS_VLE_INLINE_PAGEVECS
+#endif
+
+static struct {
+	char data[PAGE_SIZE * EROFS_PERCPU_NR_PAGES];
+} erofs_pcpubuf[NR_CPUS];
+
+int z_erofs_vle_plain_copy(struct page **compressed_pages,
+			   unsigned clusterpages,
+			   struct page **pages,
+			   unsigned nr_pages,
+			   unsigned short pageofs)
+{
+	unsigned i, j;
+	void *src = NULL;
+	const unsigned righthalf = PAGE_SIZE - pageofs;
+	char *percpu_data;
+	bool mirrored[Z_EROFS_CLUSTER_MAX_PAGES] = { 0 };
+
+	preempt_disable();
+	percpu_data = erofs_pcpubuf[smp_processor_id()].data;
+
+	j = 0;
+	for(i = 0; i < nr_pages; j = i++) {
+		struct page *page = pages[i];
+		void *dst;
+
+		if (page == NULL) {
+			if (src != NULL) {
+				if (!mirrored[j])
+					kunmap_atomic(src);
+				src = NULL;
+			}
+			continue;
+		}
+
+		dst = kmap_atomic(page);
+
+		for(; j < clusterpages; ++j) {
+			if (compressed_pages[j] != page)
+				continue;
+
+			BUG_ON(mirrored[j]);
+			memcpy(percpu_data + j * PAGE_SIZE, dst, PAGE_SIZE);
+			mirrored[j] = true;
+			break;
+		}
+
+		if (i) {
+			if (src == NULL)
+				src = mirrored[i-1] ?
+					percpu_data + (i-1) * PAGE_SIZE :
+					kmap_atomic(compressed_pages[i-1]);
+
+			memcpy(dst, src + righthalf, pageofs);
+
+			if (!mirrored[i-1])
+				kunmap_atomic(src);
+
+			if (unlikely(i >= clusterpages)) {
+				kunmap_atomic(dst);
+				break;
+			}
+		}
+
+		if (!righthalf)
+			src = NULL;
+		else {
+			src = mirrored[i] ? percpu_data + i * PAGE_SIZE :
+				kmap_atomic(compressed_pages[i]);
+
+			memcpy(dst + pageofs, src, righthalf);
+		}
+
+		kunmap_atomic(dst);
+	}
+
+	if (src != NULL && !mirrored[j])
+		kunmap_atomic(src);
+
+	preempt_enable();
+	return 0;
+}
+
+extern int z_erofs_unzip_lz4(void *in, void *out, size_t inlen, size_t outlen);
+
+int z_erofs_vle_unzip_fast_percpu(struct page **compressed_pages,
+				  unsigned clusterpages,
+				  struct page **pages,
+				  unsigned outlen,
+				  unsigned short pageofs,
+				  void (*endio)(struct page *))
+{
+	void *vin, *vout;
+	unsigned nr_pages, i, j;
+	int ret;
+
+	if (outlen + pageofs > EROFS_PERCPU_NR_PAGES * PAGE_SIZE)
+		return -ENOTSUPP;
+
+	nr_pages = DIV_ROUND_UP(outlen + pageofs, PAGE_SIZE);
+
+	if (clusterpages == 1)
+		vin = kmap_atomic(compressed_pages[0]);
+	else
+		vin = erofs_vmap(compressed_pages, clusterpages);
+
+	preempt_disable();
+	vout = erofs_pcpubuf[smp_processor_id()].data;
+
+	ret = z_erofs_unzip_lz4(vin, vout + pageofs,
+		clusterpages * PAGE_SIZE, outlen);
+
+	if (ret >= 0) {
+		outlen = ret;
+		ret = 0;
+	}
+
+	for(i = 0; i < nr_pages; ++i) {
+		j = min((unsigned)PAGE_SIZE - pageofs, outlen);
+
+		if (pages[i] != NULL) {
+			if (ret < 0)
+				SetPageError(pages[i]);
+			else if (clusterpages == 1 && pages[i] == compressed_pages[0])
+				memcpy(vin + pageofs, vout + pageofs, j);
+			else {
+				void *dst = kmap_atomic(pages[i]);
+
+				memcpy(dst + pageofs, vout + pageofs, j);
+				kunmap_atomic(dst);
+			}
+			endio(pages[i]);
+		}
+		vout += PAGE_SIZE;
+		outlen -= j;
+		pageofs = 0;
+	}
+	preempt_enable();
+
+	if (clusterpages == 1)
+		kunmap_atomic(vin);
+	else
+		erofs_vunmap(vin, clusterpages);
+
+	return ret;
+}
+
+int z_erofs_vle_unzip_vmap(struct page **compressed_pages,
+			   unsigned clusterpages,
+			   void *vout,
+			   unsigned llen,
+			   unsigned short pageofs,
+			   bool overlapped)
+{
+	void *vin;
+	unsigned i;
+	int ret;
+
+	if (overlapped) {
+		preempt_disable();
+		vin = erofs_pcpubuf[smp_processor_id()].data;
+
+		for(i = 0; i < clusterpages; ++i) {
+			void *t = kmap_atomic(compressed_pages[i]);
+
+			memcpy(vin + PAGE_SIZE *i, t, PAGE_SIZE);
+			kunmap_atomic(t);
+		}
+	} else if (clusterpages == 1)
+		vin = kmap_atomic(compressed_pages[0]);
+	else {
+		vin = erofs_vmap(compressed_pages, clusterpages);
+	}
+
+	ret = z_erofs_unzip_lz4(vin, vout + pageofs,
+		clusterpages * PAGE_SIZE, llen);
+	if (ret > 0)
+		ret = 0;
+
+	if (!overlapped) {
+		if (clusterpages == 1)
+			kunmap_atomic(vin);
+		else {
+			erofs_vunmap(vin, clusterpages);
+		}
+	} else
+		preempt_enable();
+
+	return ret;
+}
+
-- 
1.9.1

^ permalink raw reply related

* [RFC PATCH 19/25] staging: erofs: introduce a customized LZ4 decompression
From: Gao Xiang @ 2018-07-24  2:36 UTC (permalink / raw)

In-Reply-To: <1532399805-65674-1-git-send-email-gaoxiang25@huawei.com>

We have to reduce the memory cost as much as possible,
so we don't want to decompress more data beyond
the output buffer size. However, "LZ4_decompress_safe_partial"
doesn't guarantee stopping at an arbitrary end position;
it stops only after its current LZ4 "sequence" is completed.

Link: https://groups.google.com/forum/#!topic/lz4c/_3kkz5N6n00

Therefore, I hacked the LZ4 decompression logic by hand.
It is probably NOT the fastest approach, and I hope for
a better implementation.

Signed-off-by: Miao Xie <miaoxie at huawei.com>
Signed-off-by: Chao Yu <yuchao0 at huawei.com>
Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 drivers/staging/erofs/Makefile    |   2 +-
 drivers/staging/erofs/lz4defs.h   | 227 ++++++++++++++++++++++++++++++++++
 drivers/staging/erofs/unzip_lz4.c | 251 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 479 insertions(+), 1 deletion(-)
 create mode 100644 drivers/staging/erofs/lz4defs.h
 create mode 100644 drivers/staging/erofs/unzip_lz4.c

diff --git a/drivers/staging/erofs/Makefile b/drivers/staging/erofs/Makefile
index 490fa6c..e409637 100644
--- a/drivers/staging/erofs/Makefile
+++ b/drivers/staging/erofs/Makefile
@@ -9,5 +9,5 @@ obj-$(CONFIG_EROFS_FS) += erofs.o
 ccflags-y += -I$(src)/include
 erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
-erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o unzip_lz4.o
 
diff --git a/drivers/staging/erofs/lz4defs.h b/drivers/staging/erofs/lz4defs.h
new file mode 100644
index 0000000..00a0b58
--- /dev/null
+++ b/drivers/staging/erofs/lz4defs.h
@@ -0,0 +1,227 @@
+#ifndef __LZ4DEFS_H__
+#define __LZ4DEFS_H__
+
+/*
+ * lz4defs.h -- common and architecture specific defines for the kernel usage
+
+ * LZ4 - Fast LZ compression algorithm
+ * Copyright (C) 2011-2016, Yann Collet.
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *	* Redistributions of source code must retain the above copyright
+ *	  notice, this list of conditions and the following disclaimer.
+ *	* Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * You can contact the author at :
+ *	- LZ4 homepage : http://www.lz4.org
+ *	- LZ4 source repository : https://github.com/lz4/lz4
+ *
+ *	Changed for kernel usage by:
+ *	Sven Schmidt <4sschmid at informatik.uni-hamburg.de>
+ */
+
+#include <asm/unaligned.h>
+#include <linux/string.h>	 /* memset, memcpy */
+
+#define FORCE_INLINE __always_inline
+
+/*-************************************
+ *	Basic Types
+ **************************************/
+#include <linux/types.h>
+
+typedef	uint8_t BYTE;
+typedef uint16_t U16;
+typedef uint32_t U32;
+typedef	int32_t S32;
+typedef uint64_t U64;
+typedef uintptr_t uptrval;
+
+/*-************************************
+ *	Architecture specifics
+ **************************************/
+#if defined(CONFIG_64BIT)
+#define LZ4_ARCH64 1
+#else
+#define LZ4_ARCH64 0
+#endif
+
+#if defined(__LITTLE_ENDIAN)
+#define LZ4_LITTLE_ENDIAN 1
+#else
+#define LZ4_LITTLE_ENDIAN 0
+#endif
+
+/*-************************************
+ *	Constants
+ **************************************/
+#define MINMATCH 4
+
+#define WILDCOPYLENGTH 8
+#define LASTLITERALS 5
+#define MFLIMIT (WILDCOPYLENGTH + MINMATCH)
+
+/* Increase this value ==> compression run slower on incompressible data */
+#define LZ4_SKIPTRIGGER 6
+
+#define HASH_UNIT sizeof(size_t)
+
+#define KB (1 << 10)
+#define MB (1 << 20)
+#define GB (1U << 30)
+
+#define MAXD_LOG 16
+#define MAX_DISTANCE ((1 << MAXD_LOG) - 1)
+#define STEPSIZE sizeof(size_t)
+
+#define ML_BITS	4
+#define ML_MASK	((1U << ML_BITS) - 1)
+#define RUN_BITS (8 - ML_BITS)
+#define RUN_MASK ((1U << RUN_BITS) - 1)
+
+/*-************************************
+ *	Reading and writing into memory
+ **************************************/
+static FORCE_INLINE U16 LZ4_read16(const void *ptr)
+{
+	return get_unaligned((const U16 *)ptr);
+}
+
+static FORCE_INLINE U32 LZ4_read32(const void *ptr)
+{
+	return get_unaligned((const U32 *)ptr);
+}
+
+static FORCE_INLINE size_t LZ4_read_ARCH(const void *ptr)
+{
+	return get_unaligned((const size_t *)ptr);
+}
+
+static FORCE_INLINE void LZ4_write16(void *memPtr, U16 value)
+{
+	put_unaligned(value, (U16 *)memPtr);
+}
+
+static FORCE_INLINE void LZ4_write32(void *memPtr, U32 value)
+{
+	put_unaligned(value, (U32 *)memPtr);
+}
+
+static FORCE_INLINE U16 LZ4_readLE16(const void *memPtr)
+{
+	return get_unaligned_le16(memPtr);
+}
+
+static FORCE_INLINE void LZ4_writeLE16(void *memPtr, U16 value)
+{
+	return put_unaligned_le16(value, memPtr);
+}
+
+static FORCE_INLINE void LZ4_copy8(void *dst, const void *src)
+{
+#if LZ4_ARCH64
+	U64 a = get_unaligned((const U64 *)src);
+
+	put_unaligned(a, (U64 *)dst);
+#else
+	U32 a = get_unaligned((const U32 *)src);
+	U32 b = get_unaligned((const U32 *)src + 1);
+
+	put_unaligned(a, (U32 *)dst);
+	put_unaligned(b, (U32 *)dst + 1);
+#endif
+}
+
+/*
+ * customized variant of memcpy,
+ * which can overwrite up to 7 bytes beyond dstEnd
+ */
+static FORCE_INLINE void LZ4_wildCopy(void *dstPtr,
+	const void *srcPtr, void *dstEnd)
+{
+	BYTE *d = (BYTE *)dstPtr;
+	const BYTE *s = (const BYTE *)srcPtr;
+	BYTE *const e = (BYTE *)dstEnd;
+
+	do {
+		LZ4_copy8(d, s);
+		d += 8;
+		s += 8;
+	} while (d < e);
+}
+
+static FORCE_INLINE unsigned int LZ4_NbCommonBytes(register size_t val)
+{
+#if LZ4_LITTLE_ENDIAN
+	return __ffs(val) >> 3;
+#else
+	return (BITS_PER_LONG - 1 - __fls(val)) >> 3;
+#endif
+}
+
+static FORCE_INLINE unsigned int LZ4_count(
+	const BYTE *pIn,
+	const BYTE *pMatch,
+	const BYTE *pInLimit)
+{
+	const BYTE *const pStart = pIn;
+
+	while (likely(pIn < pInLimit - (STEPSIZE - 1))) {
+		size_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn);
+
+		if (!diff) {
+			pIn += STEPSIZE;
+			pMatch += STEPSIZE;
+			continue;
+		}
+
+		pIn += LZ4_NbCommonBytes(diff);
+
+		return (unsigned int)(pIn - pStart);
+	}
+
+#if LZ4_ARCH64
+	if ((pIn < (pInLimit - 3))
+		&& (LZ4_read32(pMatch) == LZ4_read32(pIn))) {
+		pIn += 4;
+		pMatch += 4;
+	}
+#endif
+
+	if ((pIn < (pInLimit - 1))
+		&& (LZ4_read16(pMatch) == LZ4_read16(pIn))) {
+		pIn += 2;
+		pMatch += 2;
+	}
+
+	if ((pIn < pInLimit) && (*pMatch == *pIn))
+		pIn++;
+
+	return (unsigned int)(pIn - pStart);
+}
+
+typedef enum { noLimit = 0, limitedOutput = 1 } limitedOutput_directive;
+typedef enum { byPtr, byU32, byU16 } tableType_t;
+
+typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive;
+typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive;
+
+typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive;
+typedef enum { full = 0, partial = 1 } earlyEnd_directive;
+
+#endif
diff --git a/drivers/staging/erofs/unzip_lz4.c b/drivers/staging/erofs/unzip_lz4.c
new file mode 100644
index 0000000..b01a8d9
--- /dev/null
+++ b/drivers/staging/erofs/unzip_lz4.c
@@ -0,0 +1,251 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
+/*
+ * linux/drivers/staging/erofs/unzip_lz4.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * Original code taken from 'linux/lib/lz4/lz4_decompress.c'
+ */
+
+/*
+ * LZ4 - Fast LZ compression algorithm
+ * Copyright (C) 2011 - 2016, Yann Collet.
+ * BSD 2 - Clause License (http://www.opensource.org/licenses/bsd - license.php)
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *	* Redistributions of source code must retain the above copyright
+ *	  notice, this list of conditions and the following disclaimer.
+ *	* Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * You can contact the author at :
+ *	- LZ4 homepage : http://www.lz4.org
+ *	- LZ4 source repository : https://github.com/lz4/lz4
+ *
+ *	Changed for kernel usage by:
+ *	Sven Schmidt <4sschmid at informatik.uni-hamburg.de>
+ */
+#include "internal.h"
+#include <asm/unaligned.h>
+#include "lz4defs.h"
+
+/*
+ * no public solution to solve our requirement yet.
+ * see: <required buffer size for LZ4_decompress_safe_partial>
+ *      https://groups.google.com/forum/#!topic/lz4c/_3kkz5N6n00
+ */
+static FORCE_INLINE int customized_lz4_decompress_safe_partial(
+	const void * const source,
+	void * const dest,
+	int inputSize,
+	int outputSize)
+{
+	/* Local Variables */
+	const BYTE *ip = (const BYTE *) source;
+	const BYTE * const iend = ip + inputSize;
+
+	BYTE *op = (BYTE *) dest;
+	BYTE * const oend = op + outputSize;
+	BYTE *cpy;
+
+	static const unsigned int dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 };
+	static const int dec64table[] = { 0, 0, 0, -1, 0, 1, 2, 3 };
+
+	/* Empty output buffer */
+	if (unlikely(outputSize == 0))
+		return ((inputSize == 1) && (*ip == 0)) ? 0 : -1;
+
+	/* Main Loop : decode sequences */
+	while (1) {
+		size_t length;
+		const BYTE *match;
+		size_t offset;
+
+		/* get literal length */
+		unsigned int const token = *ip++;
+
+		length = token>>ML_BITS;
+
+		if (length == RUN_MASK) {
+			unsigned int s;
+
+			do {
+				s = *ip++;
+				length += s;
+			} while ((ip < iend - RUN_MASK) & (s == 255));
+
+			if (unlikely((size_t)(op + length) < (size_t)(op))) {
+				/* overflow detection */
+				goto _output_error;
+			}
+			if (unlikely((size_t)(ip + length) < (size_t)(ip))) {
+				/* overflow detection */
+				goto _output_error;
+			}
+		}
+
+		/* copy literals */
+		cpy = op + length;
+		if ((cpy > oend - WILDCOPYLENGTH) ||
+			(ip + length > iend - (2 + 1 + LASTLITERALS))) {
+			if (cpy > oend) {
+				memcpy(op, ip, length = oend - op);
+				op += length;
+				break;
+			}
+
+			if (unlikely(ip + length > iend)) {
+				/*
+				 * Error :
+				 * read attempt beyond
+				 * end of input buffer
+				 */
+				goto _output_error;
+			}
+
+			memcpy(op, ip, length);
+			ip += length;
+			op += length;
+
+			if (ip > iend - 2)
+				break;
+			/* Necessarily EOF, due to parsing restrictions */
+			/* break; */
+		} else {
+			LZ4_wildCopy(op, ip, cpy);
+			ip += length;
+			op = cpy;
+		}
+
+		/* get offset */
+		offset = LZ4_readLE16(ip);
+		ip += 2;
+		match = op - offset;
+
+		if (unlikely(match < (const BYTE *)dest)) {
+			/* Error : offset outside buffers */
+			goto _output_error;
+		}
+
+		/* get matchlength */
+		length = token & ML_MASK;
+		if (length == ML_MASK) {
+			unsigned int s;
+
+			do {
+				s = *ip++;
+
+				if (ip > iend - LASTLITERALS)
+					goto _output_error;
+
+				length += s;
+			} while (s == 255);
+
+			if (unlikely((size_t)(op + length) < (size_t)op)) {
+				/* overflow detection */
+				goto _output_error;
+			}
+		}
+
+		length += MINMATCH;
+
+		/* copy match within block */
+		cpy = op + length;
+
+		if (unlikely(cpy >= oend - WILDCOPYLENGTH)) {
+			if (cpy >= oend) {
+				while (op < oend)
+					*op++ = *match++;
+				break;
+			}
+			goto __match;
+		}
+
+		/* costs ~1%; silence an msan warning when offset == 0 */
+		LZ4_write32(op, (U32)offset);
+
+		if (unlikely(offset < 8)) {
+			const int dec64 = dec64table[offset];
+
+			op[0] = match[0];
+			op[1] = match[1];
+			op[2] = match[2];
+			op[3] = match[3];
+			match += dec32table[offset];
+			memcpy(op + 4, match, 4);
+			match -= dec64;
+		} else {
+			LZ4_copy8(op, match);
+			match += 8;
+		}
+
+		op += 8;
+
+		if (unlikely(cpy > oend - 12)) {
+			BYTE * const oCopyLimit = oend - (WILDCOPYLENGTH - 1);
+
+			if (op < oCopyLimit) {
+				LZ4_wildCopy(op, match, oCopyLimit);
+				match += oCopyLimit - op;
+				op = oCopyLimit;
+			}
+__match:
+			while (op < cpy)
+				*op++ = *match++;
+		} else {
+			LZ4_copy8(op, match);
+
+			if (length > 16)
+				LZ4_wildCopy(op + 8, match + 8, cpy);
+		}
+
+		op = cpy; /* correction */
+	}
+	DBG_BUGON((void *)ip - source > inputSize);
+	DBG_BUGON((void *)op - dest > outputSize);
+
+	/* Nb of output bytes decoded */
+	return (int) ((void *)op - dest);
+
+	/* Overflow error detected */
+_output_error:
+	return -ERANGE;
+}
+
+int z_erofs_unzip_lz4(void *in, void *out, size_t inlen, size_t outlen)
+{
+	int ret = customized_lz4_decompress_safe_partial(in,
+		out, inlen, outlen);
+
+	if (ret >= 0)
+		return ret;
+
+	/*
+	 * LZ4_decompress_safe will return an error code
+	 * (< 0) if decompression failed
+	 */
+	errln("%s, failed to decompress, in[%p, %lu] outlen[%p, %lu]",
+	      __func__, in, inlen, out, outlen);
+	WARN_ON(1);
+	print_hex_dump(KERN_DEBUG, "raw data [in]: ", DUMP_PREFIX_OFFSET,
+		16, 1, in, inlen, true);
+	print_hex_dump(KERN_DEBUG, "raw data [out]: ", DUMP_PREFIX_OFFSET,
+		16, 1, out, outlen, true);
+	return -EIO;
+}
+
-- 
1.9.1

^ permalink raw reply related

* [RFC PATCH 18/25] staging: erofs: globalize prepare_bio and __submit_bio
From: Gao Xiang @ 2018-07-24  2:36 UTC (permalink / raw)

In-Reply-To: <1532399805-65674-1-git-send-email-gaoxiang25@huawei.com>

The unzip subsystem also uses these functions,
so let's export them to internal.h.

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 drivers/staging/erofs/data.c     | 41 +++++++++-------------------------------
 drivers/staging/erofs/internal.h | 41 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 32 deletions(-)

diff --git a/drivers/staging/erofs/data.c b/drivers/staging/erofs/data.c
index c25847d..adec1aa 100644
--- a/drivers/staging/erofs/data.c
+++ b/drivers/staging/erofs/data.c
@@ -46,33 +46,6 @@ static inline void read_endio(struct bio *bio)
 	bio_put(bio);
 }
 
-static void __submit_bio(struct bio *bio, unsigned op, unsigned op_flags)
-{
-	bio_set_op_attrs(bio, op, op_flags);
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0))
-	submit_bio(0, bio);
-#else
-	submit_bio(bio);
-#endif
-}
-
-static struct bio *prepare_bio(struct super_block *sb,
-	erofs_blk_t blkaddr, unsigned nr_pages)
-{
-	struct bio *bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, nr_pages);
-
-	BUG_ON(bio == NULL);
-
-	bio->bi_end_io = read_endio;
-	bio_set_dev(bio, sb->s_bdev);
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0))
-	bio->bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
-#else
-	bio->bi_iter.bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
-#endif
-	return bio;
-}
-
 /* prio -- true is used for dir */
 struct page *erofs_get_meta_page(struct super_block *sb,
 	erofs_blk_t blkaddr, bool prio)
@@ -95,7 +68,7 @@ struct page *erofs_get_meta_page(struct super_block *sb,
 		struct bio *bio;
 		int err;
 
-		bio = prepare_bio(sb, blkaddr, 1);
+		bio = prepare_bio(sb, blkaddr, 1, read_endio);
 		err = bio_add_page(bio, page, PAGE_SIZE, 0);
 		BUG_ON(err != PAGE_SIZE);
 
@@ -251,6 +224,8 @@ static inline struct bio *erofs_read_raw_page(
 		struct erofs_map_blocks map = {
 			.m_la = blknr_to_addr(current_block),
 		};
+		erofs_blk_t blknr;
+		unsigned blkoff;
 
 		err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
 		if (unlikely(err))
@@ -268,6 +243,9 @@ static inline struct bio *erofs_read_raw_page(
 		/* for RAW access mode, m_plen must be equal to m_llen */
 		BUG_ON(map.m_plen != map.m_llen);
 
+		blknr = erofs_blknr(map.m_pa);
+		blkoff = erofs_blkoff(map.m_pa);
+
 		/* deal with inline page */
 		if (map.m_flags & EROFS_MAP_META) {
 			void *vsrc, *vto;
@@ -275,8 +253,7 @@ static inline struct bio *erofs_read_raw_page(
 
 			BUG_ON(map.m_plen > PAGE_SIZE);
 
-			ipage = erofs_get_meta_page(inode->i_sb,
-				erofs_blknr(map.m_pa), 0);
+			ipage = erofs_get_meta_page(inode->i_sb, blknr, 0);
 
 			if (IS_ERR(ipage)) {
 				err = PTR_ERR(ipage);
@@ -285,7 +262,7 @@ static inline struct bio *erofs_read_raw_page(
 
 			vsrc = kmap_atomic(ipage);
 			vto = kmap_atomic(page);
-			memcpy(vto, vsrc + erofs_blkoff(map.m_pa), map.m_plen);
+			memcpy(vto, vsrc + blkoff, map.m_plen);
 			memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen);
 			kunmap_atomic(vto);
 			kunmap_atomic(vsrc);
@@ -309,7 +286,7 @@ static inline struct bio *erofs_read_raw_page(
 		if (nblocks > BIO_MAX_PAGES)
 			nblocks = BIO_MAX_PAGES;
 
-		bio = prepare_bio(inode->i_sb, erofs_blknr(map.m_pa), nblocks);
+		bio = prepare_bio(inode->i_sb, blknr, nblocks, read_endio);
 	}
 
 	err = bio_add_page(bio, page, PAGE_SIZE, 0);
diff --git a/drivers/staging/erofs/internal.h b/drivers/staging/erofs/internal.h
index 10d5952..88ebd39 100644
--- a/drivers/staging/erofs/internal.h
+++ b/drivers/staging/erofs/internal.h
@@ -290,6 +290,47 @@ struct erofs_map_blocks {
 #define EROFS_GET_BLOCKS_RAW    0x0001
 
 /* data.c */
+static inline struct bio *prepare_bio(
+	struct super_block *sb,
+	erofs_blk_t blkaddr, unsigned nr_pages,
+	bio_end_io_t endio)
+{
+	gfp_t gfp = GFP_NOIO;
+	struct bio *bio = bio_alloc(gfp, nr_pages);
+
+	if (unlikely(bio == NULL) &&
+		(current->flags & PF_MEMALLOC)) {
+		do {
+			nr_pages /= 2;
+			if (unlikely(!nr_pages)) {
+				bio = bio_alloc(gfp | __GFP_NOFAIL, 1);
+				BUG_ON(bio == NULL);
+				break;
+			}
+			bio = bio_alloc(gfp, nr_pages);
+		} while (bio == NULL);
+	}
+
+	bio->bi_end_io = endio;
+	bio_set_dev(bio, sb->s_bdev);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 0))
+	bio->bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
+#else
+	bio->bi_iter.bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK;
+#endif
+	return bio;
+}
+
+static inline void __submit_bio(struct bio *bio, unsigned op, unsigned op_flags)
+{
+	bio_set_op_attrs(bio, op, op_flags);
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0))
+	submit_bio(0, bio);
+#else
+	submit_bio(bio);
+#endif
+}
+
 extern struct page *erofs_get_meta_page(struct super_block *sb,
 	erofs_blk_t blkaddr, bool prio);
 extern int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int);
-- 
1.9.1

^ permalink raw reply related

* [RFC PATCH 17/25] staging: erofs: add erofs_allocpage
From: Gao Xiang @ 2018-07-24  2:36 UTC (permalink / raw)

In-Reply-To: <1532399805-65674-1-git-send-email-gaoxiang25@huawei.com>

This patch introduces a temporary _on-stack_ page
pool to reuse freed pages directly as much as
possible for better performance and to release all
pages at once; it also slightly reduces the possibility
of potential memory allocation failure.

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 drivers/staging/erofs/Makefile   |  2 +-
 drivers/staging/erofs/internal.h |  3 +++
 drivers/staging/erofs/staging.h  |  4 ++++
 drivers/staging/erofs/utils.c    | 31 +++++++++++++++++++++++++++++++
 4 files changed, 39 insertions(+), 1 deletion(-)
 create mode 100644 drivers/staging/erofs/utils.c

diff --git a/drivers/staging/erofs/Makefile b/drivers/staging/erofs/Makefile
index 8558c76..490fa6c 100644
--- a/drivers/staging/erofs/Makefile
+++ b/drivers/staging/erofs/Makefile
@@ -7,7 +7,7 @@ ccflags-y += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
 obj-$(CONFIG_EROFS_FS) += erofs.o
 # staging requirement: to be self-contained in its own directory
 ccflags-y += -I$(src)/include
-erofs-objs := super.o inode.o data.o namei.o dir.o
+erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
 erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o
 
diff --git a/drivers/staging/erofs/internal.h b/drivers/staging/erofs/internal.h
index 3be6178..10d5952 100644
--- a/drivers/staging/erofs/internal.h
+++ b/drivers/staging/erofs/internal.h
@@ -378,5 +378,8 @@ static inline void erofs_vunmap(const void *mem, unsigned int count)
 #endif
 }
 
+/* utils.c */
+extern struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
+
 #endif
 
diff --git a/drivers/staging/erofs/staging.h b/drivers/staging/erofs/staging.h
index 7712a7b..a9bfd8c 100644
--- a/drivers/staging/erofs/staging.h
+++ b/drivers/staging/erofs/staging.h
@@ -81,3 +81,7 @@ static inline bool sb_rdonly(const struct super_block *sb) {
 
 #endif
 
+#ifndef lru_to_page
+#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
+#endif
+
diff --git a/drivers/staging/erofs/utils.c b/drivers/staging/erofs/utils.c
new file mode 100644
index 0000000..3dec4f8
--- /dev/null
+++ b/drivers/staging/erofs/utils.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/drivers/staging/erofs/utils.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include "internal.h"
+
+struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
+{
+	struct page *page;
+
+	if (!list_empty(pool)) {
+		page = lru_to_page(pool);
+		list_del(&page->lru);
+	} else {
+		page = alloc_pages(gfp | __GFP_NOFAIL, 0);
+
+		BUG_ON(page == NULL);
+		BUG_ON(page->mapping != NULL);
+	}
+	return page;
+}
+
-- 
1.9.1

^ permalink raw reply related

* [RFC PATCH 16/25] staging: erofs: add erofs_map_blocks_iter
From: Gao Xiang @ 2018-07-24  2:36 UTC (permalink / raw)

In-Reply-To: <1532399805-65674-1-git-send-email-gaoxiang25@huawei.com>

This patch introduces an iterable L2P mapping
operation 'erofs_map_blocks_iter'.
Compared with 'erofs_map_blocks', it avoids
a number of redundant 'release and regrab'
processes if they request the same meta page.

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 drivers/staging/erofs/Kconfig     |  10 ++
 drivers/staging/erofs/Makefile    |   1 +
 drivers/staging/erofs/data.c      |  36 +++++-
 drivers/staging/erofs/internal.h  |  12 ++
 drivers/staging/erofs/unzip_vle.c | 236 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 293 insertions(+), 2 deletions(-)
 create mode 100644 drivers/staging/erofs/unzip_vle.c

diff --git a/drivers/staging/erofs/Kconfig b/drivers/staging/erofs/Kconfig
index edda055..63bec70 100644
--- a/drivers/staging/erofs/Kconfig
+++ b/drivers/staging/erofs/Kconfig
@@ -77,3 +77,13 @@ config EROFS_FAULT_INJECTION
 	help
 	  Test EROFS to inject faults such as ENOMEM, EIO, and so on.
 	  If unsure, say N.
+
+config EROFS_FS_ZIP
+	bool "EROFS Data Compression Support"
+	depends on EROFS_FS
+	help
+	  Currently we support VLE Compression only.
+	  Play at your own risk.
+
+	  If you don't want to use compression feature, say N.
+
diff --git a/drivers/staging/erofs/Makefile b/drivers/staging/erofs/Makefile
index 977b7e0..8558c76 100644
--- a/drivers/staging/erofs/Makefile
+++ b/drivers/staging/erofs/Makefile
@@ -9,4 +9,5 @@ obj-$(CONFIG_EROFS_FS) += erofs.o
 ccflags-y += -I$(src)/include
 erofs-objs := super.o inode.o data.o namei.o dir.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
+erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o
 
diff --git a/drivers/staging/erofs/data.c b/drivers/staging/erofs/data.c
index 554ba49..c25847d 100644
--- a/drivers/staging/erofs/data.c
+++ b/drivers/staging/erofs/data.c
@@ -172,12 +172,44 @@ static int erofs_map_blocks_flatmode(struct inode *inode,
 	return 0;
 }
 
+#ifdef CONFIG_EROFS_FS_ZIP
+extern int z_erofs_map_blocks_iter(struct inode *,
+	struct erofs_map_blocks *, struct page **, int);
+#endif
+
+int erofs_map_blocks_iter(struct inode *inode,
+	struct erofs_map_blocks *map,
+	struct page **mpage_ret, int flags)
+{
+	/* by default, reading raw data never use erofs_map_blocks_iter */
+	if (unlikely(!is_inode_layout_compression(inode))) {
+		if (*mpage_ret != NULL)
+			put_page(*mpage_ret);
+		*mpage_ret = NULL;
+
+		return erofs_map_blocks(inode, map, flags);
+	}
+
+#ifdef CONFIG_EROFS_FS_ZIP
+	return z_erofs_map_blocks_iter(inode, map, mpage_ret, flags);
+#else
+	/* data compression is not available */
+	return -ENOTSUPP;
+#endif
+}
+
 int erofs_map_blocks(struct inode *inode,
 	struct erofs_map_blocks *map, int flags)
 {
-	if (unlikely(is_inode_layout_compression(inode)))
-		return -ENOTSUPP;
+	if (unlikely(is_inode_layout_compression(inode))) {
+		struct page *mpage = NULL;
+		int err;
 
+		err = erofs_map_blocks_iter(inode, map, &mpage, flags);
+		if (mpage != NULL)
+			put_page(mpage);
+		return err;
+	}
 	return erofs_map_blocks_flatmode(inode, map, flags);
 }
 
diff --git a/drivers/staging/erofs/internal.h b/drivers/staging/erofs/internal.h
index e1f6864..3be6178 100644
--- a/drivers/staging/erofs/internal.h
+++ b/drivers/staging/erofs/internal.h
@@ -74,6 +74,10 @@ struct erofs_sb_info {
 
 	/* inode slot unit size in bit shift */
 	unsigned char islotbits;
+#ifdef CONFIG_EROFS_FS_ZIP
+	/* cluster size in bit shift */
+	unsigned char clusterbits;
+#endif
 
 	u32 build_time_nsec;
 	u64 build_time;
@@ -289,6 +293,14 @@ struct erofs_map_blocks {
 extern struct page *erofs_get_meta_page(struct super_block *sb,
 	erofs_blk_t blkaddr, bool prio);
 extern int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int);
+extern int erofs_map_blocks_iter(struct inode *, struct erofs_map_blocks *,
+	struct page **, int);
+
+struct erofs_map_blocks_iter {
+	struct erofs_map_blocks map;
+	struct page *mpage;
+};
+
 
 static inline struct page *erofs_get_inline_page(struct inode *inode,
 	erofs_blk_t blkaddr)
diff --git a/drivers/staging/erofs/unzip_vle.c b/drivers/staging/erofs/unzip_vle.c
new file mode 100644
index 0000000..e6752cf
--- /dev/null
+++ b/drivers/staging/erofs/unzip_vle.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/drivers/staging/erofs/unzip_vle.c
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#include "internal.h"
+
+#define __vle_cluster_advise(x, bit, bits) \
+	((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1))
+
+#define __vle_cluster_type(advise) __vle_cluster_advise(advise, \
+	Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT, Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS)
+
+enum {
+	Z_EROFS_VLE_CLUSTER_TYPE_PLAIN,
+	Z_EROFS_VLE_CLUSTER_TYPE_HEAD,
+	Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD,
+	Z_EROFS_VLE_CLUSTER_TYPE_RESERVED,
+	Z_EROFS_VLE_CLUSTER_TYPE_MAX
+};
+
+#define vle_cluster_type(di)	\
+	__vle_cluster_type((di)->di_advise)
+
+static inline unsigned
+vle_compressed_index_clusterofs(unsigned clustersize,
+	struct z_erofs_vle_decompressed_index *di)
+{
+	debugln("%s, vle=%pK, advise=%x (type %u), clusterofs=%x blkaddr=%x",
+		__func__, di, le16_to_cpu(di->di_advise), vle_cluster_type(di),
+		le16_to_cpu(di->di_clusterofs), le32_to_cpu(di->di_u.blkaddr));
+
+	switch(vle_cluster_type(di)) {
+	case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		break;
+	case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
+	case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+		return le16_to_cpu(di->di_clusterofs);	/* on-disk LE field */
+	default:
+		BUG_ON(1);
+	}
+	return clustersize;
+}
+
+static inline erofs_blk_t
+vle_extent_blkaddr(struct inode *inode, pgoff_t index)
+{
+	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+	struct erofs_vnode *vi = EROFS_V(inode);
+
+	unsigned ofs = Z_EROFS_VLE_EXTENT_ALIGN(vi->inode_isize +
+		vi->xattr_isize) + sizeof(struct erofs_extent_header) +
+		index * sizeof(struct z_erofs_vle_decompressed_index);
+
+	return erofs_blknr(iloc(sbi, vi->nid) + ofs);
+}
+
+static inline unsigned int
+vle_extent_blkoff(struct inode *inode, pgoff_t index)
+{
+	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+	struct erofs_vnode *vi = EROFS_V(inode);
+
+	unsigned ofs = Z_EROFS_VLE_EXTENT_ALIGN(vi->inode_isize +
+		vi->xattr_isize) + sizeof(struct erofs_extent_header) +
+		index * sizeof(struct z_erofs_vle_decompressed_index);
+
+	return erofs_blkoff(iloc(sbi, vi->nid) + ofs);
+}
+
+/*
+ * Variable-sized Logical Extent (Fixed Physical Cluster) Compression Mode
+ * ---
+ * VLE compression mode attempts to compress a number of logical data into
+ * a physical cluster with a fixed size.
+ * VLE compression mode uses "struct z_erofs_vle_decompressed_index".
+ */
+static erofs_off_t vle_get_logical_extent_head(
+	struct inode *inode,
+	struct page **page_iter,
+	void **kaddr_iter,
+	unsigned lcn,	/* logical cluster number */
+	erofs_blk_t *pcn,
+	unsigned *flags)
+{
+	/* for extent meta */
+	struct page *page = *page_iter;
+	erofs_blk_t blkaddr = vle_extent_blkaddr(inode, lcn);
+	struct z_erofs_vle_decompressed_index *di;
+	unsigned long long ofs;
+	unsigned clustersize = 1 << EROFS_SB(inode->i_sb)->clusterbits;
+
+	if (page->index != blkaddr) {
+		kunmap_atomic(*kaddr_iter);
+		unlock_page(page);
+		put_page(page);
+
+		*page_iter = page = erofs_get_meta_page(inode->i_sb,
+			blkaddr, false);
+		*kaddr_iter = kmap_atomic(page);
+	}
+
+	di = *kaddr_iter + vle_extent_blkoff(inode, lcn);
+	switch(vle_cluster_type(di)) {
+	case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		BUG_ON(!di->di_u.delta[0]);
+		BUG_ON(lcn < di->di_u.delta[0]);
+
+		ofs = vle_get_logical_extent_head(inode,
+			page_iter, kaddr_iter,
+			lcn - di->di_u.delta[0], pcn, flags);
+		break;
+	case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
+		*flags ^= EROFS_MAP_ZIPPED;
+	case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+		ofs = (unsigned long long)lcn * clustersize +
+			(le16_to_cpu(di->di_clusterofs) & (clustersize - 1));
+		*pcn = le32_to_cpu(di->di_u.blkaddr);
+		break;
+	default:
+		BUG_ON(1);
+	}
+	return ofs;
+}
+
+int z_erofs_map_blocks_iter(struct inode *inode,
+	struct erofs_map_blocks *map,
+	struct page **mpage_ret, int flags)
+{
+	/* logical extent (start, end) offset */
+	unsigned long long ofs, end;
+	struct z_erofs_vle_decompressed_index *di;
+	erofs_blk_t e_blkaddr, pcn;
+	unsigned lcn, logical_cluster_ofs;
+	struct page *mpage = *mpage_ret;
+	void *kaddr;
+	bool initial;
+	unsigned clustersize = 1 << EROFS_SB(inode->i_sb)->clusterbits;
+
+	/* if both m_(l,p)len are 0, regularize l_lblk, l_lofs, etc... */
+	initial = !map->m_llen;
+
+	/* when trying to read beyond EOF, leave it unmapped */
+	if (unlikely(map->m_la >= inode->i_size)) {
+		BUG_ON(!initial);
+		map->m_llen = map->m_la + 1 - inode->i_size;
+		map->m_la = inode->i_size - 1;
+		map->m_flags = 0;
+		goto out;
+	}
+
+	debugln("%s, m_la %llu m_llen %llu --- start", __func__,
+		map->m_la, map->m_llen);
+
+	ofs = map->m_la + map->m_llen;
+
+	lcn = ofs / clustersize;
+	e_blkaddr = vle_extent_blkaddr(inode, lcn);
+
+	if (mpage == NULL || mpage->index != e_blkaddr) {
+		if (mpage != NULL)
+			put_page(mpage);
+
+		mpage = erofs_get_meta_page(inode->i_sb, e_blkaddr, false);
+		*mpage_ret = mpage;
+	} else {
+		lock_page(mpage);
+		DBG_BUGON(!PageUptodate(mpage));
+	}
+
+	kaddr = kmap_atomic(mpage);
+	di = kaddr + vle_extent_blkoff(inode, lcn);
+
+	debugln("%s, lcn %u e_blkaddr %u e_blkoff %u", __func__, lcn,
+		e_blkaddr, vle_extent_blkoff(inode, lcn));
+
+	logical_cluster_ofs = vle_compressed_index_clusterofs(clustersize, di);
+	if (!initial) {
+		/* [walking mode] 'map' has been already initialized */
+		map->m_llen += logical_cluster_ofs;
+		goto unmap_out;
+	}
+
+	/* by default, compressed */
+	map->m_flags |= EROFS_MAP_ZIPPED;
+
+	end = (u64)(lcn + 1) * clustersize;
+
+	switch(vle_cluster_type(di)) {
+	case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
+		if (ofs % clustersize >= logical_cluster_ofs)
+			map->m_flags ^= EROFS_MAP_ZIPPED;
+	case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+		if (ofs % clustersize == logical_cluster_ofs) {
+			pcn = le32_to_cpu(di->di_u.blkaddr);
+			goto exact_hitted;
+		}
+
+		if (ofs % clustersize > logical_cluster_ofs) {
+			ofs = (u64)lcn * clustersize | logical_cluster_ofs;
+			pcn = le32_to_cpu(di->di_u.blkaddr);
+			break;
+		}
+
+		BUG_ON(!lcn);	/* logical cluster number >= 1 */
+		end = ((u64)lcn-- * clustersize) | logical_cluster_ofs;
+	case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+		/* get the corresponding first chunk */
+		ofs = vle_get_logical_extent_head(inode, mpage_ret,
+			&kaddr, lcn, &pcn, &map->m_flags);
+		mpage = *mpage_ret;
+	}
+
+	map->m_la = ofs;
+exact_hitted:
+	map->m_llen = end - ofs;
+	map->m_plen = clustersize;
+	map->m_pa = blknr_to_addr(pcn);
+	map->m_flags |= EROFS_MAP_MAPPED;
+unmap_out:
+	kunmap_atomic(kaddr);
+	unlock_page(mpage);
+out:
+	debugln("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o",
+		__func__, map->m_la, map->m_pa,
+		map->m_llen, map->m_plen, map->m_flags);
+	return 0;
+}
+
-- 
1.9.1

^ permalink raw reply related

* [RFC PATCH 15/25] staging: erofs: introduce pagevec for unzip subsystem
From: Gao Xiang @ 2018-07-24  2:36 UTC (permalink / raw)

In-Reply-To: <1532399805-65674-1-git-send-email-gaoxiang25@huawei.com>

For each compressed cluster, there is a straight-forward
way of allocating a fixed or variable-sized (for VLE) array
to record the corresponding file pages for its decompression
if we decide to decompress these pages asynchronously (eg.
read-ahead case), however it could take much extra on-heap
memory compared with traditional uncompressed filesystems.

This patch introduces a pagevec solution which reuses some
of the allocated file pages, in a time-sharing manner, to store
parts of the array itself in order to minimize the extra
memory overhead; thus only a small, constant-sized array
used for bootstrapping the whole array will be needed.

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 drivers/staging/erofs/unzip_pagevec.h | 172 ++++++++++++++++++++++++++++++++++
 1 file changed, 172 insertions(+)
 create mode 100644 drivers/staging/erofs/unzip_pagevec.h

diff --git a/drivers/staging/erofs/unzip_pagevec.h b/drivers/staging/erofs/unzip_pagevec.h
new file mode 100644
index 0000000..6710316
--- /dev/null
+++ b/drivers/staging/erofs/unzip_pagevec.h
@@ -0,0 +1,172 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * linux/drivers/staging/erofs/unzip_pagevec.h
+ *
+ * Copyright (C) 2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#ifndef __EROFS_UNZIP_PAGEVEC_H
+#define __EROFS_UNZIP_PAGEVEC_H
+
+#include <linux/tagptr.h>
+
+/* page type in pagevec for unzip subsystem */
+enum z_erofs_page_type {
+	/* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */
+	Z_EROFS_PAGE_TYPE_EXCLUSIVE,
+
+	Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED,
+
+	Z_EROFS_VLE_PAGE_TYPE_HEAD,
+	Z_EROFS_VLE_PAGE_TYPE_MAX
+};
+
+extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0")
+	__bad_page_type_exclusive(void);
+
+/* pagevec tagged pointer */
+typedef tagptr2_t	erofs_vtptr_t;
+
+/* pagevec collector */
+struct z_erofs_pagevec_ctor {
+	struct page *curr, *next;
+	erofs_vtptr_t *pages;
+
+	unsigned int nr, index;
+};
+
+static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor,
+				             bool atomic)
+{
+	if (ctor->curr == NULL)
+		return;
+
+	if (atomic)
+		kunmap_atomic(ctor->pages);
+	else
+		kunmap(ctor->curr);
+}
+
+static inline struct page *
+z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor,
+			       unsigned nr)
+{
+	unsigned index;
+
+	/* keep away from occupied pages */
+	if (ctor->next != NULL)
+		return ctor->next;
+
+	for(index = 0; index < nr; ++index) {
+		const erofs_vtptr_t t = ctor->pages[index];
+		const unsigned tags = tagptr_unfold_tags(t);
+
+		if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE)
+			return tagptr_unfold_ptr(t);
+	}
+
+	if (unlikely(nr >= ctor->nr))
+		BUG();
+
+	return NULL;
+}
+
+static inline void
+z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor,
+			      bool atomic)
+{
+	struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr);
+
+	z_erofs_pagevec_ctor_exit(ctor, atomic);
+
+	ctor->curr = next;
+	ctor->next = NULL;
+	ctor->pages = atomic ?
+		kmap_atomic(ctor->curr) : kmap(ctor->curr);
+
+	ctor->nr = PAGE_SIZE / sizeof(struct page *);
+	ctor->index = 0;
+}
+
+static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor,
+					     unsigned nr,
+					     erofs_vtptr_t *pages, unsigned i)
+{
+	ctor->nr = nr;
+	ctor->curr = ctor->next = NULL;
+	ctor->pages = pages;
+
+	if (i >= nr) {
+		i -= nr;
+		z_erofs_pagevec_ctor_pagedown(ctor, false);
+		while (i > ctor->nr) {
+			i -= ctor->nr;
+			z_erofs_pagevec_ctor_pagedown(ctor, false);
+		}
+	}
+
+	ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i);
+	ctor->index = i;
+}
+
+static inline bool
+z_erofs_pagevec_ctor_enqueue(struct z_erofs_pagevec_ctor *ctor,
+			     struct page *page,
+			     enum z_erofs_page_type type,
+			     bool *occupied)
+{
+	*occupied = false;
+	if (unlikely(ctor->next == NULL && type))
+		if (ctor->index + 1 == ctor->nr)
+			return false;
+
+	if (unlikely(ctor->index >= ctor->nr))
+		z_erofs_pagevec_ctor_pagedown(ctor, false);
+
+	/* exclusive page type must be 0 */
+	if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL)
+		__bad_page_type_exclusive();
+
+	/* note that ctor->next can never be equal to 1 or 2 */
+	if (type == (uintptr_t)ctor->next) {
+		ctor->next = page;
+		*occupied = true;
+	}
+
+	ctor->pages[ctor->index++] =
+		tagptr_fold(erofs_vtptr_t, page, type);
+	return true;
+}
+
+static inline struct page *
+z_erofs_pagevec_ctor_dequeue(struct z_erofs_pagevec_ctor *ctor,
+			     enum z_erofs_page_type *type)
+{
+	erofs_vtptr_t t;
+
+	if (unlikely(ctor->index >= ctor->nr)) {
+		BUG_ON(ctor->next == NULL);
+		z_erofs_pagevec_ctor_pagedown(ctor, true);
+	}
+
+	t = ctor->pages[ctor->index];
+
+	*type = tagptr_unfold_tags(t);
+
+	/* note that ctor->next can never be equal to 1 or 2 */
+	if (*type == (uintptr_t)ctor->next)
+		ctor->next = tagptr_unfold_ptr(t);
+
+	ctor->pages[ctor->index++] =
+		tagptr_fold(erofs_vtptr_t, NULL, 0);
+
+	return tagptr_unfold_ptr(t);
+}
+
+#endif
+
-- 
1.9.1

^ permalink raw reply related

* [RFC PATCH 14/25] staging: erofs: <linux/tagptr.h>: introduce tagged pointer
From: Gao Xiang @ 2018-07-24  2:36 UTC (permalink / raw)

In-Reply-To: <1532399805-65674-1-git-send-email-gaoxiang25@huawei.com>

Currently the kernel has scattered tagged pointer usages hacked
by hand in plain code, without a unified and portable set of functions
to highlight the tagged pointer itself and wrap this hand-written code
in order to clean up the meaningless magic masks spread all over.

Therefore, this patch introduces simple generic methods to fold
tags into a pointer integer. It currently supports the last n bits
of the pointer for tags, which can be selected by users.

In addition, it will also be used for the upcoming EROFS filesystem,
which heavily uses tagged pointer approach for high performance
and reducing extra memory allocation.

Link: https://en.wikipedia.org/wiki/Tagged_pointer

Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 drivers/staging/erofs/include/linux/tagptr.h | 110 +++++++++++++++++++++++++++
 1 file changed, 110 insertions(+)
 create mode 100644 drivers/staging/erofs/include/linux/tagptr.h

diff --git a/drivers/staging/erofs/include/linux/tagptr.h b/drivers/staging/erofs/include/linux/tagptr.h
new file mode 100644
index 0000000..b5c6016
--- /dev/null
+++ b/drivers/staging/erofs/include/linux/tagptr.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Tagged pointer implementation
+ *
+ * Copyright (C) 2018 Gao Xiang <gaoxiang25 at huawei.com>
+ */
+#ifndef _LINUX_TAGPTR_H
+#define _LINUX_TAGPTR_H
+
+#include <linux/types.h>
+#include <linux/build_bug.h>
+
+/*
+ * the name of tagged pointer types are tagptr{1, 2, 3...}_t
+ * avoid directly using the internal structs __tagptr{1, 2, 3...}
+ */
+#define __MAKE_TAGPTR(n) \
+typedef struct __tagptr##n {	\
+	uintptr_t v;	\
+} tagptr##n##_t;
+
+__MAKE_TAGPTR(1)
+__MAKE_TAGPTR(2)
+__MAKE_TAGPTR(3)
+__MAKE_TAGPTR(4)
+
+#undef __MAKE_TAGPTR
+
+extern void __compiletime_error("bad tagptr tags")
+	__bad_tagptr_tags(void);
+
+extern void __compiletime_error("bad tagptr type")
+	__bad_tagptr_type(void);
+
+/* fix the broken usage of "#define tagptr2_t tagptr3_t" by users */
+#define __tagptr_mask_1(ptr, n)	\
+	__builtin_types_compatible_p(typeof(ptr), struct __tagptr##n) ? \
+		(1UL << (n)) - 1 :
+
+#define __tagptr_mask(ptr)	(\
+	__tagptr_mask_1(ptr, 1) ( \
+	__tagptr_mask_1(ptr, 2) ( \
+	__tagptr_mask_1(ptr, 3) ( \
+	__tagptr_mask_1(ptr, 4) ( \
+	__bad_tagptr_type(), 0)))))
+
+/* generate a tagged pointer from a raw value */
+#define tagptr_init(type, val) \
+	((typeof(type)){ .v = (uintptr_t)(val) })
+
+/*
+ * directly cast a tagged pointer to the native pointer type, which
+ * could be used for backward compatibility of existing code.
+ */
+#define tagptr_cast_ptr(tptr) ((void *)(tptr).v)
+
+/* encode tagged pointers */
+#define tagptr_fold(type, ptr, _tags) ({ \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(type))) \
+		__bad_tagptr_tags(); \
+tagptr_init(type, (uintptr_t)(ptr) | tags); })
+
+/* decode tagged pointers */
+#define tagptr_unfold_ptr(tptr) \
+	((void *)((tptr).v & ~__tagptr_mask(tptr)))
+
+#define tagptr_unfold_tags(tptr) \
+	((tptr).v & __tagptr_mask(tptr))
+
+/* operations for the tagged pointer */
+#define tagptr_eq(_tptr1, _tptr2) ({ \
+	typeof(_tptr1) tptr1 = (_tptr1); \
+	typeof(_tptr2) tptr2 = (_tptr2); \
+	(void) (&tptr1 == &tptr2); \
+(tptr1).v == (tptr2).v; })
+
+/* lock-free CAS operation */
+#define tagptr_cmpxchg(_ptptr, _o, _n) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	typeof(_o) o = (_o); \
+	typeof(_n) n = (_n); \
+	(void) (&o == &n); \
+	(void) (&o == ptptr); \
+tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); })
+
+/* wrap WRITE_ONCE if atomic update is needed */
+#define tagptr_replace_tags(_ptptr, tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	*ptptr = tagptr_fold(*ptptr, tagptr_unfold_ptr(*ptptr), tags); \
+*ptptr; })
+
+#define tagptr_set_tags(_ptptr, _tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
+		__bad_tagptr_tags(); \
+	ptptr->v |= tags; \
+*ptptr; })
+
+#define tagptr_clear_tags(_ptptr, _tags) ({ \
+	typeof(_ptptr) ptptr = (_ptptr); \
+	const typeof(_tags) tags = (_tags); \
+	if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
+		__bad_tagptr_tags(); \
+	ptptr->v &= ~(uintptr_t)tags; /* widen first: keep high ptr bits */ \
+*ptptr; })
+
+#endif
+
-- 
1.9.1

^ permalink raw reply related

* [RFC PATCH 13/25] staging: erofs: support tracepoint
From: Gao Xiang @ 2018-07-24  2:36 UTC (permalink / raw)

In-Reply-To: <1532399805-65674-1-git-send-email-gaoxiang25@huawei.com>

From: Chao Yu <yuchao0@huawei.com>

Add basic tracepoints for ->readpage{,s}, ->lookup,
->destroy_inode, fill_inode and map_blocks.

Reviewed-by: Gao Xiang <gaoxiang25 at huawei.com>
Signed-off-by: Chao Yu <yuchao0 at huawei.com>
---
 drivers/staging/erofs/data.c                       |  13 +-
 drivers/staging/erofs/include/trace/events/erofs.h | 240 +++++++++++++++++++++
 drivers/staging/erofs/inode.c                      |   4 +
 drivers/staging/erofs/namei.c                      |   4 +
 drivers/staging/erofs/super.c                      |   3 +
 5 files changed, 261 insertions(+), 3 deletions(-)
 create mode 100644 drivers/staging/erofs/include/trace/events/erofs.h

diff --git a/drivers/staging/erofs/data.c b/drivers/staging/erofs/data.c
index 29703b9..554ba49 100644
--- a/drivers/staging/erofs/data.c
+++ b/drivers/staging/erofs/data.c
@@ -13,6 +13,8 @@
 #include "internal.h"
 #include <linux/prefetch.h>
 
+#include <trace/events/erofs.h>
+
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0))
 static inline void read_endio(struct bio *bio, int err)
 #else
@@ -128,6 +130,7 @@ static int erofs_map_blocks_flatmode(struct inode *inode,
 	u64 offset = map->m_la;
 	struct erofs_vnode *vi = EROFS_V(inode);
 
+	trace_erofs_map_blocks_flatmode_enter(inode, map, flags);
 	BUG_ON(is_inode_layout_compression(inode));
 
 	nblocks = DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
@@ -165,8 +168,7 @@ static int erofs_map_blocks_flatmode(struct inode *inode,
 
 out:
 	map->m_llen = map->m_plen;
-	debugln("%s, m_la 0x%llx m_pa %llx m_len %llu",
-		__func__, map->m_la, map->m_pa, map->m_plen);
+	trace_erofs_map_blocks_flatmode_exit(inode, map, flags, 0);
 	return 0;
 }
 
@@ -320,6 +322,8 @@ static int erofs_raw_access_readpage(struct file *file, struct page *page)
 	erofs_off_t last_block;
 	struct bio *bio;
 
+	trace_erofs_readpage(page, true);
+
 	bio = erofs_read_raw_page(NULL, page->mapping,
 		page, &last_block, 1, false);
 
@@ -337,9 +341,12 @@ static int erofs_raw_access_readpages(struct file *filp,
 	erofs_off_t last_block;
 	struct bio *bio = NULL;
 	gfp_t gfp = readahead_gfp_mask(mapping);
+	struct page *page = list_last_entry(pages, struct page, lru);
+
+	trace_erofs_readpages(mapping->host, page, nr_pages, true);
 
 	for (; nr_pages; --nr_pages) {
-		struct page *page = list_entry(pages->prev, struct page, lru);
+		page = list_entry(pages->prev, struct page, lru);
 
 		prefetchw(&page->flags);
 		list_del(&page->lru);
diff --git a/drivers/staging/erofs/include/trace/events/erofs.h b/drivers/staging/erofs/include/trace/events/erofs.h
new file mode 100644
index 0000000..5aead93
--- /dev/null
+++ b/drivers/staging/erofs/include/trace/events/erofs.h
@@ -0,0 +1,240 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM erofs
+
+#if !defined(_TRACE_EROFS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_EROFS_H
+
+#include <linux/tracepoint.h>
+
+#define show_dev(dev)		MAJOR(dev), MINOR(dev)
+#define show_dev_nid(entry)	show_dev(entry->dev), entry->nid
+
+#define show_file_type(type)						\
+	__print_symbolic(type,						\
+		{ 0,		"FILE" },				\
+		{ 1,		"DIR" })
+
+#define show_map_flags(flags) __print_flags(flags, "|",	\
+	{ EROFS_GET_BLOCKS_RAW,	"RAW" })
+
+#define show_mflags(flags) __print_flags(flags, "",	\
+	{ EROFS_MAP_MAPPED,	"M" },			\
+	{ EROFS_MAP_META,	"I" },			\
+	{ EROFS_MAP_ZIPPED,	"Z" })
+
+TRACE_EVENT(erofs_lookup,
+	/* records the parent dir (dev, nid), the looked-up name and the flags */
+	TP_PROTO(struct inode *dir, struct dentry *dentry, unsigned int flags),
+
+	TP_ARGS(dir, dentry, flags),
+	/* copy the name itself: a saved d_name pointer may dangle at print time */
+	TP_STRUCT__entry(
+		__field(dev_t,		dev	)
+		__field(erofs_nid_t,	nid	)
+		__string(name,		dentry->d_name.name	)
+		__field(unsigned int,	flags	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	= dir->i_sb->s_dev;
+		__entry->nid	= EROFS_V(dir)->nid;
+		__assign_str(name, dentry->d_name.name);
+		__entry->flags	= flags;
+	),
+
+	TP_printk("dev = (%d,%d), pnid = %llu, name:%s, flags:%x",
+		show_dev_nid(__entry),
+		__get_str(name),
+		__entry->flags)
+);
+
+TRACE_EVENT(erofs_fill_inode,
+	TP_PROTO(struct inode *inode, int isdir),
+	TP_ARGS(inode, isdir),
+
+	TP_STRUCT__entry(
+		__field(dev_t,		dev	)
+		__field(erofs_nid_t,	nid	)
+		__field(erofs_blk_t,	blkaddr )
+		__field(unsigned int,	ofs	)
+		__field(int,		isdir	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= inode->i_sb->s_dev;
+		__entry->nid		= EROFS_V(inode)->nid;
+		__entry->blkaddr	= erofs_blknr(iloc(EROFS_I_SB(inode), __entry->nid));
+		__entry->ofs		= erofs_blkoff(iloc(EROFS_I_SB(inode), __entry->nid));
+		__entry->isdir		= isdir;
+	),
+
+	TP_printk("dev = (%d,%d), nid = %llu, blkaddr %u ofs %u, isdir %d",
+		  show_dev_nid(__entry),
+		  __entry->blkaddr, __entry->ofs,
+		  __entry->isdir)
+);
+
+TRACE_EVENT(erofs_readpage,
+
+	TP_PROTO(struct page *page, bool raw),
+
+	TP_ARGS(page, raw),
+
+	TP_STRUCT__entry(
+		__field(dev_t,		dev	)
+		__field(erofs_nid_t,    nid     )
+		__field(int,		dir	)
+		__field(pgoff_t,	index	)
+		__field(int,		uptodate)
+		__field(bool,		raw	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	= page->mapping->host->i_sb->s_dev;
+		__entry->nid	= EROFS_V(page->mapping->host)->nid;
+		__entry->dir	= S_ISDIR(page->mapping->host->i_mode);
+		__entry->index	= page->index;
+		__entry->uptodate = PageUptodate(page);
+		__entry->raw = raw;
+	),
+
+	TP_printk("dev = (%d,%d), nid = %llu, %s, index = %lu, uptodate = %d "
+		"raw = %d",
+		show_dev_nid(__entry),
+		show_file_type(__entry->dir),
+		(unsigned long)__entry->index,
+		__entry->uptodate,
+		__entry->raw)
+);
+
+TRACE_EVENT(erofs_readpages,
+
+	TP_PROTO(struct inode *inode, struct page *page, unsigned int nrpage,
+		bool raw),
+
+	TP_ARGS(inode, page, nrpage, raw),
+
+	TP_STRUCT__entry(
+		__field(dev_t,		dev	)
+		__field(erofs_nid_t,	nid	)
+		__field(pgoff_t,	start	)
+		__field(unsigned int,	nrpage	)
+		__field(bool,		raw	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	= inode->i_sb->s_dev;
+		__entry->nid	= EROFS_V(inode)->nid;
+		__entry->start	= page->index;
+		__entry->nrpage	= nrpage;
+		__entry->raw	= raw;
+	),
+
+	TP_printk("dev = (%d,%d), nid = %llu, start = %lu nrpage = %u raw = %d",
+		show_dev_nid(__entry),
+		(unsigned long)__entry->start,
+		__entry->nrpage,
+		__entry->raw)
+);
+
+DECLARE_EVENT_CLASS(erofs__map_blocks_enter,
+	TP_PROTO(struct inode *inode, struct erofs_map_blocks *map,
+		 unsigned int flags),
+
+	TP_ARGS(inode, map, flags),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,		dev		)
+		__field(	erofs_nid_t,	nid		)
+		__field(	erofs_off_t,	la		)
+		__field(	u64,		llen		)
+		__field(	unsigned int,	flags		)
+	),
+
+	TP_fast_assign(
+		__entry->dev    = inode->i_sb->s_dev;
+		__entry->nid    = EROFS_V(inode)->nid;
+		__entry->la	= map->m_la;
+		__entry->llen	= map->m_llen;
+		__entry->flags	= flags;
+	),
+
+	TP_printk("dev = (%d,%d), nid = %llu, la %llu llen %llu flags %s",
+		  show_dev_nid(__entry),
+		  __entry->la, __entry->llen, show_map_flags(__entry->flags))
+);
+
+DEFINE_EVENT(erofs__map_blocks_enter, erofs_map_blocks_flatmode_enter,
+	TP_PROTO(struct inode *inode, struct erofs_map_blocks *map,
+		 unsigned flags),
+
+	TP_ARGS(inode, map, flags)
+);
+
+DECLARE_EVENT_CLASS(erofs__map_blocks_exit,
+	TP_PROTO(struct inode *inode, struct erofs_map_blocks *map,
+		 unsigned int flags, int ret),
+
+	TP_ARGS(inode, map, flags, ret),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,		dev		)
+		__field(	erofs_nid_t,	nid		)
+		__field(        unsigned int,   flags           )
+		__field(	erofs_off_t,	la		)
+		__field(	erofs_off_t,	pa		)
+		__field(	u64,		llen		)
+		__field(	u64,		plen		)
+		__field(        unsigned int,	mflags		)
+		__field(	int,		ret		)
+	),
+
+	TP_fast_assign(
+		__entry->dev    = inode->i_sb->s_dev;
+		__entry->nid    = EROFS_V(inode)->nid;
+		__entry->flags	= flags;
+		__entry->la	= map->m_la;
+		__entry->pa	= map->m_pa;
+		__entry->llen	= map->m_llen;
+		__entry->plen	= map->m_plen;
+		__entry->mflags	= map->m_flags;
+		__entry->ret	= ret;
+	),
+
+	TP_printk("dev = (%d,%d), nid = %llu, flags %s "
+		  "la %llu pa %llu llen %llu plen %llu mflags %s ret %d",
+		  show_dev_nid(__entry), show_map_flags(__entry->flags),
+		  __entry->la, __entry->pa, __entry->llen, __entry->plen,
+		  show_mflags(__entry->mflags), __entry->ret)
+);
+
+DEFINE_EVENT(erofs__map_blocks_exit, erofs_map_blocks_flatmode_exit,
+	TP_PROTO(struct inode *inode, struct erofs_map_blocks *map,
+		 unsigned flags, int ret),
+
+	TP_ARGS(inode, map, flags, ret)
+);
+
+TRACE_EVENT(erofs_destroy_inode,
+	TP_PROTO(struct inode *inode),
+
+	TP_ARGS(inode),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,		dev		)
+		__field(	erofs_nid_t,	nid		)
+	),
+
+	TP_fast_assign(
+		__entry->dev	= inode->i_sb->s_dev;
+		__entry->nid	= EROFS_V(inode)->nid;
+	),
+
+	TP_printk("dev = (%d,%d), nid = %llu", show_dev_nid(__entry))
+);
+
+#endif /* _TRACE_EROFS_H */
+
+ /* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/drivers/staging/erofs/inode.c b/drivers/staging/erofs/inode.c
index a26ca5a..001ddb9 100644
--- a/drivers/staging/erofs/inode.c
+++ b/drivers/staging/erofs/inode.c
@@ -12,6 +12,8 @@
  */
 #include "xattr.h"
 
+#include <trace/events/erofs.h>
+
 /* no locking */
 static int read_inode(struct inode *inode, void *data)
 {
@@ -156,6 +158,8 @@ int fill_inode(struct inode *inode, int isdir)
 	erofs_blk_t blkaddr;
 	unsigned ofs;
 
+	trace_erofs_fill_inode(inode, isdir);
+
 	blkaddr = erofs_blknr(iloc(sbi, vi->nid));
 	ofs = erofs_blkoff(iloc(sbi, vi->nid));
 
diff --git a/drivers/staging/erofs/namei.c b/drivers/staging/erofs/namei.c
index caaa60b..376bfec 100644
--- a/drivers/staging/erofs/namei.c
+++ b/drivers/staging/erofs/namei.c
@@ -13,6 +13,8 @@
 #include "internal.h"
 #include "xattr.h"
 
+#include <trace/events/erofs.h>
+
 /* based on the value of qn->len is accurate */
 static inline int dirnamecmp(struct qstr *qn,
 	struct qstr *qd, unsigned *matched)
@@ -209,6 +211,8 @@ struct dentry *erofs_lookup(struct inode *dir,
 	/* dentry must be unhashed in lookup, no need to worry about */
 	DBG_BUGON(!d_unhashed(dentry));
 
+	trace_erofs_lookup(dir, dentry, flags);
+
 	/* file name exceeds fs limit */
 	if (unlikely(dentry->d_name.len > EROFS_NAME_LEN))
 		return ERR_PTR(-ENAMETOOLONG);
diff --git a/drivers/staging/erofs/super.c b/drivers/staging/erofs/super.c
index aee83dd..9561bca 100644
--- a/drivers/staging/erofs/super.c
+++ b/drivers/staging/erofs/super.c
@@ -17,6 +17,9 @@
 #include <linux/seq_file.h>
 #include "internal.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/erofs.h>
+
 static struct kmem_cache *erofs_inode_cachep __read_mostly;
 
 static void init_once(void *ptr)
-- 
1.9.1

^ permalink raw reply related

* [RFC PATCH 12/25] staging: erofs: introduce error injection infrastructure
From: Gao Xiang @ 2018-07-24  2:36 UTC (permalink / raw)

In-Reply-To: <1532399805-65674-1-git-send-email-gaoxiang25@huawei.com>

From: Chao Yu <yuchao0@huawei.com>

This patch introduces an error injection infrastructure. With it, we can
inject errors into any common kernel-exported function that erofs uses,
forcing erofs into its error paths, so that tests can cover rarely taken
paths more easily and find bugs.

Reviewed-by: Gao Xiang <gaoxiang25 at huawei.com>
Signed-off-by: Chao Yu <yuchao0 at huawei.com>
---
 drivers/staging/erofs/Kconfig    |  6 +++++
 drivers/staging/erofs/inode.c    |  3 ++-
 drivers/staging/erofs/internal.h | 57 ++++++++++++++++++++++++++++++++++++++++
 drivers/staging/erofs/super.c    | 38 +++++++++++++++++++++++++++
 4 files changed, 103 insertions(+), 1 deletion(-)

diff --git a/drivers/staging/erofs/Kconfig b/drivers/staging/erofs/Kconfig
index 1a5ec1b..edda055 100644
--- a/drivers/staging/erofs/Kconfig
+++ b/drivers/staging/erofs/Kconfig
@@ -71,3 +71,9 @@ config EROFS_FS_USE_VM_MAP_RAM
 
 	  If you don't know what these are, say N.
 
+config EROFS_FAULT_INJECTION
+	bool "EROFS fault injection facility"
+	depends on EROFS_FS
+	help
+	  Inject faults such as ENOMEM, EIO, and so on to test EROFS error paths.
+	  If unsure, say N.
diff --git a/drivers/staging/erofs/inode.c b/drivers/staging/erofs/inode.c
index 0ec0cf5..a26ca5a 100644
--- a/drivers/staging/erofs/inode.c
+++ b/drivers/staging/erofs/inode.c
@@ -113,6 +113,7 @@ static int read_inode(struct inode *inode, void *data)
 int fill_inline_data(struct inode *inode, void *data, unsigned m_pofs)
 {
 	struct erofs_vnode *vi = EROFS_V(inode);
+	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
 	int mode = vi->data_mapping_mode;
 
 	DBG_BUGON(mode >= EROFS_INODE_LAYOUT_MAX);
@@ -123,7 +124,7 @@ int fill_inline_data(struct inode *inode, void *data, unsigned m_pofs)
 
 	/* fast symlink (following ext4) */
 	if (S_ISLNK(inode->i_mode) && inode->i_size < PAGE_SIZE) {
-		char *lnk = kmalloc(inode->i_size + 1, GFP_KERNEL);
+		char *lnk = erofs_kmalloc(sbi, inode->i_size + 1, GFP_KERNEL);
 
 		if (unlikely(lnk == NULL))
 			return -ENOMEM;
diff --git a/drivers/staging/erofs/internal.h b/drivers/staging/erofs/internal.h
index 9b263e2..e1f6864 100644
--- a/drivers/staging/erofs/internal.h
+++ b/drivers/staging/erofs/internal.h
@@ -44,6 +44,22 @@
 #define DBG_BUGON(...)          ((void)0)
 #endif
 
+#ifdef CONFIG_EROFS_FAULT_INJECTION
+enum {
+	FAULT_KMALLOC,
+	FAULT_MAX,
+};
+
+extern char *erofs_fault_name[FAULT_MAX];
+#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type)))
+
+struct erofs_fault_info {
+	atomic_t inject_ops;
+	unsigned int inject_rate;
+	unsigned int inject_type;
+};
+#endif
+
 /* EROFS_SUPER_MAGIC_V1 to represent the whole file system */
 #define EROFS_SUPER_MAGIC   EROFS_SUPER_MAGIC_V1
 
@@ -72,14 +88,55 @@ struct erofs_sb_info {
 	char *dev_name;
 
 	unsigned int mount_opt;
+
+#ifdef CONFIG_EROFS_FAULT_INJECTION
+	struct erofs_fault_info fault_info;	/* For fault injection */
+#endif
 };
 
+#ifdef CONFIG_EROFS_FAULT_INJECTION
+#define erofs_show_injection_info(type)					\
+	infoln("inject %s in %s of %pS", erofs_fault_name[type],        \
+		__func__, __builtin_return_address(0))
+
+/* Return true once every inject_rate calls for an enabled fault @type. */
+static inline bool time_to_inject(struct erofs_sb_info *sbi, int type)
+{
+	struct erofs_fault_info *ffi = &sbi->fault_info;
+
+	/* injection is off when the rate is zero or @type is masked out */
+	if (!ffi->inject_rate || !IS_FAULT_SET(ffi, type))
+		return false;
+
+	/* test the count this call produced rather than re-reading it */
+	if (atomic_inc_return(&ffi->inject_ops) >= ffi->inject_rate) {
+		atomic_set(&ffi->inject_ops, 0);
+		return true;
+	}
+
+	return false;
+}
+#endif
+
+static inline void *erofs_kmalloc(struct erofs_sb_info *sbi,
+					size_t size, gfp_t flags)
+{
+#ifdef CONFIG_EROFS_FAULT_INJECTION
+	if (time_to_inject(sbi, FAULT_KMALLOC)) {
+		erofs_show_injection_info(FAULT_KMALLOC);
+		return NULL;
+	}
+#endif
+	return kmalloc(size, flags);
+}
+
 #define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info)
 #define EROFS_I_SB(inode) ((struct erofs_sb_info *)(inode)->i_sb->s_fs_info)
 
 /* Mount flags set via mount options or defaults */
 #define EROFS_MOUNT_XATTR_USER		0x00000010
 #define EROFS_MOUNT_POSIX_ACL		0x00000020
+#define EROFS_MOUNT_FAULT_INJECTION	0x00000040
 
 #define clear_opt(sbi, option)	((sbi)->mount_opt &= ~EROFS_MOUNT_##option)
 #define set_opt(sbi, option)	((sbi)->mount_opt |= EROFS_MOUNT_##option)
diff --git a/drivers/staging/erofs/super.c b/drivers/staging/erofs/super.c
index 1ea517b..aee83dd 100644
--- a/drivers/staging/erofs/super.c
+++ b/drivers/staging/erofs/super.c
@@ -137,6 +137,26 @@ static int superblock_read(struct super_block *sb)
 	return ret;
 }
 
+#ifdef CONFIG_EROFS_FAULT_INJECTION
+char *erofs_fault_name[FAULT_MAX] = {
+	[FAULT_KMALLOC]		= "kmalloc",
+};
+
+static void erofs_build_fault_attr(struct erofs_sb_info *sbi,
+						unsigned int rate)
+{
+	struct erofs_fault_info *ffi = &sbi->fault_info;
+
+	if (rate) {
+		atomic_set(&ffi->inject_ops, 0);
+		ffi->inject_rate = rate;
+		ffi->inject_type = (1 << FAULT_MAX) - 1;
+	} else {
+		memset(ffi, 0, sizeof(struct erofs_fault_info));
+	}
+}
+#endif
+
 static void default_options(struct erofs_sb_info *sbi)
 {
 #ifdef CONFIG_EROFS_FS_XATTR
@@ -153,6 +173,7 @@ enum {
 	Opt_nouser_xattr,
 	Opt_acl,
 	Opt_noacl,
+	Opt_fault_injection,
 	Opt_err
 };
 
@@ -161,6 +182,7 @@ enum {
 	{Opt_nouser_xattr, "nouser_xattr"},
 	{Opt_acl, "acl"},
 	{Opt_noacl, "noacl"},
+	{Opt_fault_injection, "fault_injection=%u"},
 	{Opt_err, NULL}
 };
 
@@ -168,6 +190,7 @@ static int parse_options(struct super_block *sb, char *options)
 {
 	substring_t args[MAX_OPT_ARGS];
 	char *p;
+	int arg = 0;
 
 	if (!options)
 		return 0;
@@ -212,6 +235,16 @@ static int parse_options(struct super_block *sb, char *options)
 			infoln("noacl options not supported");
 			break;
 #endif
+		case Opt_fault_injection:
+			if (args->from && match_int(args, &arg))
+				return -EINVAL;
+#ifdef CONFIG_EROFS_FAULT_INJECTION
+			erofs_build_fault_attr(EROFS_SB(sb), arg);
+			set_opt(EROFS_SB(sb), FAULT_INJECTION);
+#else
+			infoln("FAULT_INJECTION was not selected");
+#endif
+			break;
 		default:
 			errln("Unrecognized mount option \"%s\" "
 					"or missing value", p);
@@ -461,6 +494,11 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
 	else
 		seq_puts(seq, ",noacl");
 #endif
+#ifdef CONFIG_EROFS_FAULT_INJECTION
+	if (test_opt(sbi, FAULT_INJECTION))
+		seq_printf(seq, ",fault_injection=%u",
+				sbi->fault_info.inject_rate);
+#endif
 	return 0;
 }
 
-- 
1.9.1

^ permalink raw reply related

* [RFC PATCH 11/25] staging: erofs: support special inode
From: Gao Xiang @ 2018-07-24  2:36 UTC (permalink / raw)

In-Reply-To: <1532399805-65674-1-git-send-email-gaoxiang25@huawei.com>

From: Chao Yu <yuchao0@huawei.com>

This patch adds support for special inodes, such as block device,
character device, socket and pipe (FIFO) inodes.

Reviewed-by: Gao Xiang <gaoxiang25 at huawei.com>
Signed-off-by: Chao Yu <yuchao0 at huawei.com>
---
 drivers/staging/erofs/inode.c    | 37 +++++++++++++++++++++++++++++++++++--
 drivers/staging/erofs/internal.h |  1 +
 2 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/drivers/staging/erofs/inode.c b/drivers/staging/erofs/inode.c
index 125dad3a..0ec0cf5 100644
--- a/drivers/staging/erofs/inode.c
+++ b/drivers/staging/erofs/inode.c
@@ -34,8 +34,18 @@ static int read_inode(struct inode *inode, void *data)
 		vi->inode_isize = sizeof(struct erofs_inode_v2);
 		vi->xattr_isize = ondisk_xattr_ibody_size(v2->i_xattr_icount);
 
-		vi->raw_blkaddr = le32_to_cpu(v2->i_u.raw_blkaddr);
 		inode->i_mode = le16_to_cpu(v2->i_mode);
+		if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+						S_ISLNK(inode->i_mode)) {
+			vi->raw_blkaddr = le32_to_cpu(v2->i_u.raw_blkaddr);
+		} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+			inode->i_rdev =
+				new_decode_dev(le32_to_cpu(v2->i_u.rdev));
+		} else if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
+			inode->i_rdev = 0;
+		} else {
+			return -EIO;
+		}
 
 		i_uid_write(inode, le32_to_cpu(v2->i_uid));
 		i_gid_write(inode, le32_to_cpu(v2->i_gid));
@@ -54,8 +64,18 @@ static int read_inode(struct inode *inode, void *data)
 		vi->inode_isize = sizeof(struct erofs_inode_v1);
 		vi->xattr_isize = ondisk_xattr_ibody_size(v1->i_xattr_icount);
 
-		vi->raw_blkaddr = le32_to_cpu(v1->i_u.raw_blkaddr);
 		inode->i_mode = le16_to_cpu(v1->i_mode);
+		if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+						S_ISLNK(inode->i_mode)) {
+			vi->raw_blkaddr = le32_to_cpu(v1->i_u.raw_blkaddr);
+		} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+			inode->i_rdev =
+				new_decode_dev(le32_to_cpu(v1->i_u.rdev));
+		} else if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
+			inode->i_rdev = 0;
+		} else {
+			return -EIO;
+		}
 
 		i_uid_write(inode, le16_to_cpu(v1->i_uid));
 		i_gid_write(inode, le16_to_cpu(v1->i_gid));
@@ -177,6 +197,12 @@ int fill_inode(struct inode *inode, int isdir)
 				&page_symlink_inode_operations;
 #endif
 			inode_nohighmem(inode);
+		} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+			S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
+#ifdef CONFIG_EROFS_FS_XATTR
+			inode->i_op = &erofs_special_inode_operations;
+#endif
+			init_special_inode(inode, inode->i_mode, inode->i_rdev);
 		} else {
 			err = -EIO;
 			goto out_unlock;
@@ -248,6 +274,13 @@ struct inode *erofs_iget(struct super_block *sb,
 	.listxattr = erofs_listxattr,
 #endif
 };
+
+const struct inode_operations erofs_special_inode_operations = {
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 9, 0))
+		.listxattr = erofs_listxattr,
+#endif
+};
+
 #endif
 
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 2, 0))
diff --git a/drivers/staging/erofs/internal.h b/drivers/staging/erofs/internal.h
index 9490060..9b263e2 100644
--- a/drivers/staging/erofs/internal.h
+++ b/drivers/staging/erofs/internal.h
@@ -261,6 +261,7 @@ int erofs_namei(struct inode *dir, struct qstr *name,
 #ifdef CONFIG_EROFS_FS_XATTR
 extern const struct inode_operations erofs_symlink_xattr_iops;
 extern const struct inode_operations erofs_fast_symlink_xattr_iops;
+extern const struct inode_operations erofs_special_inode_operations;
 #endif
 
 static inline void set_inode_fast_symlink(struct inode *inode)
-- 
1.9.1

^ permalink raw reply related

* [RFC PATCH 10/25] staging: erofs: introduce xattr & acl support
From: Gao Xiang @ 2018-07-24  2:36 UTC (permalink / raw)

In-Reply-To: <1532399805-65674-1-git-send-email-gaoxiang25@huawei.com>

This implements xattr and acl functionalities.

Inline and shared xattrs are introduced for flexibility.
Specifically, if the same xattr occurs many times across
a large number of inodes, or the value of an xattr is so large
that it isn't suitable to be inlined, a shared xattr
kept in the xattr meta area will be used instead.

Signed-off-by: Miao Xie <miaoxie at huawei.com>
Signed-off-by: Chao Yu <yuchao0 at huawei.com>
Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 drivers/staging/erofs/Kconfig    |  37 +++
 drivers/staging/erofs/Makefile   |   1 +
 drivers/staging/erofs/inode.c    |  62 +++-
 drivers/staging/erofs/internal.h |  26 ++
 drivers/staging/erofs/namei.c    |   7 +
 drivers/staging/erofs/super.c    |  67 ++++
 drivers/staging/erofs/xattr.c    | 678 +++++++++++++++++++++++++++++++++++++++
 drivers/staging/erofs/xattr.h    |  93 ++++++
 8 files changed, 970 insertions(+), 1 deletion(-)
 create mode 100644 drivers/staging/erofs/xattr.c
 create mode 100644 drivers/staging/erofs/xattr.h

diff --git a/drivers/staging/erofs/Kconfig b/drivers/staging/erofs/Kconfig
index 077430f..1a5ec1b 100644
--- a/drivers/staging/erofs/Kconfig
+++ b/drivers/staging/erofs/Kconfig
@@ -26,6 +26,43 @@ config EROFS_FS_DEBUG
 
 	  For daily use, say N.
 
+config EROFS_FS_XATTR
+	bool "EROFS extended attributes"
+	depends on EROFS_FS
+	default y
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  If unsure, say N.
+
+config EROFS_FS_POSIX_ACL
+	bool "EROFS Access Control Lists"
+	depends on EROFS_FS_XATTR
+	select FS_POSIX_ACL
+	default y
+	help
+	  Posix Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N.
+
+config EROFS_FS_SECURITY
+	bool "EROFS Security Labels"
+	depends on EROFS_FS_XATTR
+	help
+	  Security labels provide an access control facility to support Linux
+	  Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO
+	  Linux. This option enables an extended attribute handler for file
+	  security labels in the erofs filesystem, so that it requires enabling
+	  the extended attribute support in advance.
+
+	  If you are not using a security module, say N.
+
 config EROFS_FS_USE_VM_MAP_RAM
 	bool "EROFS VM_MAP_RAM Support"
 	depends on EROFS_FS
diff --git a/drivers/staging/erofs/Makefile b/drivers/staging/erofs/Makefile
index 31e909e..977b7e0 100644
--- a/drivers/staging/erofs/Makefile
+++ b/drivers/staging/erofs/Makefile
@@ -8,4 +8,5 @@ obj-$(CONFIG_EROFS_FS) += erofs.o
 # staging requirement: to be self-contained in its own directory
 ccflags-y += -I$(src)/include
 erofs-objs := super.o inode.o data.o namei.o dir.o
+erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
 
diff --git a/drivers/staging/erofs/inode.c b/drivers/staging/erofs/inode.c
index b9a9d2c..125dad3a 100644
--- a/drivers/staging/erofs/inode.c
+++ b/drivers/staging/erofs/inode.c
@@ -10,7 +10,7 @@
  * License.  See the file COPYING in the main directory of the Linux
  * distribution for more details.
  */
-#include "internal.h"
+#include "xattr.h"
 
 /* no locking */
 static int read_inode(struct inode *inode, void *data)
@@ -156,15 +156,26 @@ int fill_inode(struct inode *inode, int isdir)
 	if (!err) {
 		/* setup the new inode */
 		if (S_ISREG(inode->i_mode)) {
+#ifdef CONFIG_EROFS_FS_XATTR
+			if (vi->xattr_isize)
+				inode->i_op = &erofs_generic_xattr_iops;
+#endif
 			inode->i_fop = &generic_ro_fops;
 		} else if (S_ISDIR(inode->i_mode)) {
 			inode->i_op =
+#ifdef CONFIG_EROFS_FS_XATTR
+				vi->xattr_isize ? &erofs_dir_xattr_iops :
+#endif
 				&erofs_dir_iops;
 			inode->i_fop = &erofs_dir_fops;
 		} else if (S_ISLNK(inode->i_mode)) {
 			/* by default, page_get_link is used for symlink */
 			inode->i_op =
+#ifdef CONFIG_EROFS_FS_XATTR
+				&erofs_symlink_xattr_iops,
+#else
 				&page_symlink_inode_operations;
+#endif
 			inode_nohighmem(inode);
 		} else {
 			err = -EIO;
@@ -212,6 +223,33 @@ struct inode *erofs_iget(struct super_block *sb,
 	return inode;
 }
 
+#ifdef CONFIG_EROFS_FS_XATTR
+const struct inode_operations erofs_generic_xattr_iops = {
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 9, 0))
+	.getxattr = generic_getxattr,
+	.listxattr = erofs_listxattr,
+#endif
+};
+#endif
+
+#ifdef CONFIG_EROFS_FS_XATTR
+const struct inode_operations erofs_symlink_xattr_iops = {
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0))
+	.readlink = generic_readlink,
+#endif
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0))
+	.follow_link = page_follow_link_light,
+	.put_link = page_put_link,
+#else
+	.get_link = page_get_link,
+#endif
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 9, 0))
+	.getxattr = generic_getxattr,
+	.listxattr = erofs_listxattr,
+#endif
+};
+#endif
+
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 2, 0))
 #include <linux/namei.h>
 
@@ -229,3 +267,25 @@ static void *erofs_follow_fast_link(struct dentry *dentry, struct nameidata *nd)
 };
 #endif
 
+#ifdef CONFIG_EROFS_FS_XATTR
+const struct inode_operations erofs_fast_symlink_xattr_iops = {
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0))
+	.readlink = generic_readlink,
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 2, 0))
+	.follow_link = erofs_follow_fast_link,
+#else
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0))
+	.follow_link = simple_follow_link,
+#else
+	.get_link = simple_get_link,
+#endif
+#endif
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 9, 0))
+	.getxattr = generic_getxattr,
+	.listxattr = erofs_listxattr,
+#endif
+};
+#endif
+
diff --git a/drivers/staging/erofs/internal.h b/drivers/staging/erofs/internal.h
index 5056177..9490060 100644
--- a/drivers/staging/erofs/internal.h
+++ b/drivers/staging/erofs/internal.h
@@ -52,6 +52,9 @@
 struct erofs_sb_info {
 	u32 blocks;
 	u32 meta_blkaddr;
+#ifdef CONFIG_EROFS_FS_XATTR
+	u32 xattr_blkaddr;
+#endif
 
 	/* inode slot unit size in bit shift */
 	unsigned char islotbits;
@@ -74,6 +77,10 @@ struct erofs_sb_info {
 #define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info)
 #define EROFS_I_SB(inode) ((struct erofs_sb_info *)(inode)->i_sb->s_fs_info)
 
+/* Mount flags set via mount options or defaults */
+#define EROFS_MOUNT_XATTR_USER		0x00000010
+#define EROFS_MOUNT_POSIX_ACL		0x00000020
+
 #define clear_opt(sbi, option)	((sbi)->mount_opt &= ~EROFS_MOUNT_##option)
 #define set_opt(sbi, option)	((sbi)->mount_opt |= EROFS_MOUNT_##option)
 #define test_opt(sbi, option)	((sbi)->mount_opt & EROFS_MOUNT_##option)
@@ -242,17 +249,36 @@ int erofs_namei(struct inode *dir, struct qstr *name,
 	erofs_nid_t *nid, unsigned *d_type);
 
 /* xattr.c */
+#ifdef CONFIG_EROFS_FS_XATTR
 extern const struct xattr_handler *erofs_xattr_handlers[];
+#endif
 
 /* symlink */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 2, 0))
+extern const struct inode_operations simple_symlink_inode_operations;
+#endif
+
+#ifdef CONFIG_EROFS_FS_XATTR
+extern const struct inode_operations erofs_symlink_xattr_iops;
+extern const struct inode_operations erofs_fast_symlink_xattr_iops;
+#endif
+
 static inline void set_inode_fast_symlink(struct inode *inode)
 {
+#ifdef CONFIG_EROFS_FS_XATTR
+	inode->i_op = &erofs_fast_symlink_xattr_iops;
+#else
 	inode->i_op = &simple_symlink_inode_operations;
+#endif
 }
 
 static inline bool is_inode_fast_symlink(struct inode *inode)
 {
+#ifdef CONFIG_EROFS_FS_XATTR
+	return inode->i_op == &erofs_fast_symlink_xattr_iops;
+#else
 	return inode->i_op == &simple_symlink_inode_operations;
+#endif
 }
 
 static inline void *erofs_vmap(struct page **pages, unsigned int count)
diff --git a/drivers/staging/erofs/namei.c b/drivers/staging/erofs/namei.c
index 27b6712..caaa60b 100644
--- a/drivers/staging/erofs/namei.c
+++ b/drivers/staging/erofs/namei.c
@@ -11,6 +11,7 @@
  * distribution for more details.
  */
 #include "internal.h"
+#include "xattr.h"
 
 /* based on the value of qn->len is accurate */
 static inline int dirnamecmp(struct qstr *qn,
@@ -239,5 +240,11 @@ struct dentry *erofs_lookup(struct inode *dir,
 
 const struct inode_operations erofs_dir_xattr_iops = {
 	.lookup = erofs_lookup,
+#ifdef CONFIG_EROFS_FS_XATTR
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 9, 0))
+	.getxattr = generic_getxattr,
+	.listxattr = erofs_listxattr,
+#endif
+#endif
 };
 
diff --git a/drivers/staging/erofs/super.c b/drivers/staging/erofs/super.c
index 31bfef0..1ea517b 100644
--- a/drivers/staging/erofs/super.c
+++ b/drivers/staging/erofs/super.c
@@ -14,6 +14,7 @@
 #include <linux/buffer_head.h>
 #include <linux/statfs.h>
 #include <linux/parser.h>
+#include <linux/seq_file.h>
 #include "internal.h"
 
 static struct kmem_cache *erofs_inode_cachep __read_mostly;
@@ -111,6 +112,9 @@ static int superblock_read(struct super_block *sb)
 
 	sbi->blocks = le32_to_cpu(layout->blocks);
 	sbi->meta_blkaddr = le32_to_cpu(layout->meta_blkaddr);
+#ifdef CONFIG_EROFS_FS_XATTR
+	sbi->xattr_blkaddr = le32_to_cpu(layout->xattr_blkaddr);
+#endif
 	sbi->islotbits = ffs(sizeof(struct erofs_inode_v1)) - 1;
 
 	sbi->root_nid = le16_to_cpu(layout->root_nid);
@@ -135,13 +139,28 @@ static int superblock_read(struct super_block *sb)
 
 static void default_options(struct erofs_sb_info *sbi)
 {
+#ifdef CONFIG_EROFS_FS_XATTR
+	set_opt(sbi, XATTR_USER);
+#endif
+
+#ifdef CONFIG_EROFS_FS_POSIX_ACL
+	set_opt(sbi, POSIX_ACL);
+#endif
 }
 
 enum {
+	Opt_user_xattr,
+	Opt_nouser_xattr,
+	Opt_acl,
+	Opt_noacl,
 	Opt_err
 };
 
 static match_table_t erofs_tokens = {
+	{Opt_user_xattr, "user_xattr"},
+	{Opt_nouser_xattr, "nouser_xattr"},
+	{Opt_acl, "acl"},
+	{Opt_noacl, "noacl"},
 	{Opt_err, NULL}
 };
 
@@ -163,6 +182,36 @@ static int parse_options(struct super_block *sb, char *options)
 		token = match_token(p, erofs_tokens, args);
 
 		switch (token) {
+#ifdef CONFIG_EROFS_FS_XATTR
+		case Opt_user_xattr:
+			set_opt(EROFS_SB(sb), XATTR_USER);
+			break;
+		case Opt_nouser_xattr:
+			clear_opt(EROFS_SB(sb), XATTR_USER);
+			break;
+#else
+		case Opt_user_xattr:
+			infoln("user_xattr options not supported");
+			break;
+		case Opt_nouser_xattr:
+			infoln("nouser_xattr options not supported");
+			break;
+#endif
+#ifdef CONFIG_EROFS_FS_POSIX_ACL
+		case Opt_acl:
+			set_opt(EROFS_SB(sb), POSIX_ACL);
+			break;
+		case Opt_noacl:
+			clear_opt(EROFS_SB(sb), POSIX_ACL);
+			break;
+#else
+		case Opt_acl:
+			infoln("acl options not supported");
+			break;
+		case Opt_noacl:
+			infoln("noacl options not supported");
+			break;
+#endif
 		default:
 			errln("Unrecognized mount option \"%s\" "
 					"or missing value", p);
@@ -205,6 +254,10 @@ static int erofs_read_super(struct super_block *sb,
 
 	sb->s_op = &erofs_sops;
 
+#ifdef CONFIG_EROFS_FS_XATTR
+	sb->s_xattr = erofs_xattr_handlers;
+#endif
+
 	/* set erofs default mount options */
 	default_options(sbi);
 
@@ -394,6 +447,20 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
 
 static int erofs_show_options(struct seq_file *seq, struct dentry *root)
 {
+	struct erofs_sb_info *sbi __maybe_unused = EROFS_SB(root->d_sb);
+
+#ifdef CONFIG_EROFS_FS_XATTR
+	if (test_opt(sbi, XATTR_USER))
+		seq_puts(seq, ",user_xattr");
+	else
+		seq_puts(seq, ",nouser_xattr");
+#endif
+#ifdef CONFIG_EROFS_FS_POSIX_ACL
+	if (test_opt(sbi, POSIX_ACL))
+		seq_puts(seq, ",acl");
+	else
+		seq_puts(seq, ",noacl");
+#endif
 	return 0;
 }
 
diff --git a/drivers/staging/erofs/xattr.c b/drivers/staging/erofs/xattr.c
new file mode 100644
index 0000000..6818265
--- /dev/null
+++ b/drivers/staging/erofs/xattr.c
@@ -0,0 +1,678 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/drivers/staging/erofs/xattr.c
+ *
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#include <linux/security.h>
+#include "xattr.h"
+
+struct xattr_iter {
+	struct super_block *sb;
+	struct page *page;
+	void *kaddr;
+
+	erofs_blk_t blkaddr;
+	unsigned ofs;
+};
+
+static inline void xattr_iter_end(struct xattr_iter *it, bool atomic)
+{
+	/* only init_inode_xattrs use non-atomic once */
+	if (unlikely(!atomic))
+		kunmap(it->page);
+	else
+		kunmap_atomic(it->kaddr);
+	unlock_page(it->page);
+	put_page(it->page);
+}
+
+static void init_inode_xattrs(struct inode *inode)
+{
+	struct xattr_iter it;
+	unsigned i;
+	struct erofs_xattr_ibody_header *ih;
+	struct erofs_sb_info *sbi;
+	struct erofs_vnode *vi;
+	bool atomic_map;
+
+	if (likely(inode_has_inited_xattr(inode)))
+		return;
+
+	vi = EROFS_V(inode);
+	BUG_ON(!vi->xattr_isize);
+
+	sbi = EROFS_I_SB(inode);
+	it.blkaddr = erofs_blknr(iloc(sbi, vi->nid) + vi->inode_isize);
+	it.ofs = erofs_blkoff(iloc(sbi, vi->nid) + vi->inode_isize);
+
+	it.page = erofs_get_inline_page(inode, it.blkaddr);
+	BUG_ON(IS_ERR(it.page));
+
+	/* read in shared xattr array (non-atomic, see kmalloc below) */
+	it.kaddr = kmap(it.page);
+	atomic_map = false;
+
+	ih = (struct erofs_xattr_ibody_header *)(it.kaddr + it.ofs);
+
+	vi->xattr_shared_count = ih->h_shared_count;
+	vi->xattr_shared_xattrs = (unsigned *)kmalloc_array(
+		vi->xattr_shared_count, sizeof(unsigned),
+		GFP_KERNEL | __GFP_NOFAIL);
+
+	/* let's skip ibody header */
+	it.ofs += sizeof(struct erofs_xattr_ibody_header);
+
+	for (i = 0; i < vi->xattr_shared_count; ++i) {
+		if (unlikely(it.ofs >= EROFS_BLKSIZ)) {
+			/* cannot be unaligned */
+			BUG_ON(it.ofs != EROFS_BLKSIZ);
+			xattr_iter_end(&it, atomic_map);
+
+			it.page = erofs_get_meta_page(inode->i_sb,
+				++it.blkaddr, S_ISDIR(inode->i_mode));
+			BUG_ON(IS_ERR(it.page));
+
+			it.kaddr = kmap_atomic(it.page);
+			atomic_map = true;
+			it.ofs = 0;
+		}
+		vi->xattr_shared_xattrs[i] =
+			le32_to_cpu(*(__le32 *)(it.kaddr + it.ofs));
+		it.ofs += sizeof(__le32);
+	}
+	xattr_iter_end(&it, atomic_map);
+
+	inode_set_inited_xattr(inode);
+}
+
+struct xattr_iter_handlers {
+	int (*entry)(struct xattr_iter *, struct erofs_xattr_entry *);
+	int (*name)(struct xattr_iter *, unsigned, char *, unsigned);
+	int (*alloc_buffer)(struct xattr_iter *, unsigned);
+	void (*value)(struct xattr_iter *, unsigned, char *, unsigned);
+};
+
+static void xattr_iter_fixup(struct xattr_iter *it)
+{
+	if (unlikely(it->ofs >= EROFS_BLKSIZ)) {
+		xattr_iter_end(it, true);
+
+		it->blkaddr += erofs_blknr(it->ofs);
+		it->page = erofs_get_meta_page(it->sb, it->blkaddr, false);
+		BUG_ON(IS_ERR(it->page));
+
+		it->kaddr = kmap_atomic(it->page);
+		it->ofs = erofs_blkoff(it->ofs);
+	}
+}
+
+static int inline_xattr_iter_begin(struct xattr_iter *it,
+	struct inode *inode)
+{
+	unsigned xattr_header_sz, inline_xattr_ofs;
+	struct erofs_vnode *vi = EROFS_V(inode);
+	struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
+
+	xattr_header_sz = inlinexattr_header_size(inode);
+	if (unlikely(xattr_header_sz >= vi->xattr_isize)) {
+		BUG_ON(xattr_header_sz > vi->xattr_isize);
+		return -ENOATTR;
+	}
+
+	inline_xattr_ofs = vi->inode_isize + xattr_header_sz;
+
+	it->blkaddr = erofs_blknr(iloc(sbi, vi->nid) + inline_xattr_ofs);
+	it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs);
+
+	it->page = erofs_get_inline_page(inode, it->blkaddr);
+	BUG_ON(IS_ERR(it->page));
+	it->kaddr = kmap_atomic(it->page);
+
+	return vi->xattr_isize - xattr_header_sz;
+}
+
+static int xattr_foreach(struct xattr_iter *it,
+	struct xattr_iter_handlers *op, unsigned *tlimit)
+{
+	struct erofs_xattr_entry entry;
+	unsigned value_sz, processed, slice;
+	int err;
+
+	/* 0. fixup blkaddr, ofs, ipage */
+	xattr_iter_fixup(it);
+
+	/*
+	 * 1. read xattr entry to the memory,
+	 *    since we do EROFS_XATTR_ALIGN
+	 *    therefore entry should be in the page
+	 */
+	entry = *(struct erofs_xattr_entry *)(it->kaddr + it->ofs);
+	if (tlimit != NULL) {
+		unsigned entry_sz = EROFS_XATTR_ENTRY_SIZE(&entry);
+
+		BUG_ON(*tlimit < entry_sz);
+		*tlimit -= entry_sz;
+	}
+
+	it->ofs += sizeof(struct erofs_xattr_entry);
+	value_sz = le16_to_cpu(entry.e_value_size);
+
+	/* handle entry */
+	err = op->entry(it, &entry);
+	if (err) {
+		it->ofs += entry.e_name_len + value_sz;
+		goto out;
+	}
+
+	/* 2. handle xattr name (ofs will finally be at the end of name) */
+	processed = 0;
+
+	while (processed < entry.e_name_len) {
+		if (it->ofs >= EROFS_BLKSIZ) {
+			BUG_ON(it->ofs > EROFS_BLKSIZ);
+
+			xattr_iter_fixup(it);
+			it->ofs = 0;
+		}
+
+		slice = min_t(unsigned, PAGE_SIZE - it->ofs,
+			entry.e_name_len - processed);
+
+		/* handle name */
+		err = op->name(it, processed, it->kaddr + it->ofs, slice);
+		if (err) {
+			it->ofs += entry.e_name_len - processed + value_sz;
+			goto out;
+		}
+
+		it->ofs += slice;
+		processed += slice;
+	}
+
+	/* 3. handle xattr value */
+	processed = 0;
+
+	if (op->alloc_buffer != NULL) {
+		err = op->alloc_buffer(it, value_sz);
+		if (err) {
+			it->ofs += value_sz;
+			goto out;
+		}
+	}
+
+	while (processed < value_sz) {
+		if (it->ofs >= EROFS_BLKSIZ) {
+			BUG_ON(it->ofs > EROFS_BLKSIZ);
+			xattr_iter_fixup(it);
+			it->ofs = 0;
+		}
+
+		slice = min_t(unsigned, PAGE_SIZE - it->ofs,
+			value_sz - processed);
+		op->value(it, processed, it->kaddr + it->ofs, slice);
+		it->ofs += slice;
+		processed += slice;
+	}
+
+out:
+	/* we assume that ofs is aligned with 4 bytes */
+	it->ofs = EROFS_XATTR_ALIGN(it->ofs);
+	return err;
+}
+
+struct getxattr_iter {
+	struct xattr_iter it;
+
+	char *buffer;
+	int buffer_size, index;
+	struct qstr name;
+};
+
+static int xattr_entrymatch(struct xattr_iter *_it,
+	struct erofs_xattr_entry *entry)
+{
+	struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
+
+	return (it->index != entry->e_name_index ||
+		it->name.len != entry->e_name_len) ? -ENOATTR : 0;
+}
+
+static int xattr_namematch(struct xattr_iter *_it,
+	unsigned processed, char *buf, unsigned len)
+{
+	struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
+
+	return memcmp(buf, it->name.name + processed, len) ? -ENOATTR : 0;
+}
+
+static int xattr_checkbuffer(struct xattr_iter *_it,
+	unsigned value_sz)
+{
+	struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
+	int err = it->buffer_size < value_sz ? -ERANGE : 0;
+
+	it->buffer_size = value_sz;
+	return it->buffer == NULL ? 1 : err;
+}
+
+static void xattr_copyvalue(struct xattr_iter *_it,
+	unsigned processed, char *buf, unsigned len)
+{
+	struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
+
+	memcpy(it->buffer + processed, buf, len);
+}
+
+struct xattr_iter_handlers find_xattr_handlers = {
+	.entry = xattr_entrymatch,
+	.name = xattr_namematch,
+	.alloc_buffer = xattr_checkbuffer,
+	.value = xattr_copyvalue
+};
+
+/* Look @it->name up among @inode's inline xattrs; value size or -errno. */
+static int inline_getxattr(struct inode *inode, struct getxattr_iter *it)
+{
+	int ret;
+	unsigned remaining;
+
+	ret = inline_xattr_iter_begin(&it->it, inode);
+	if (ret < 0)
+		return ret;
+	remaining = ret;
+	while (remaining) {
+		/* only a mismatch keeps scanning; surface -ERANGE etc. */
+		ret = xattr_foreach(&it->it, &find_xattr_handlers, &remaining);
+		if (ret != -ENOATTR)
+			break;
+	}
+	xattr_iter_end(&it->it, true);
+	return ret < 0 ? ret : it->buffer_size;
+}
+
+/* Look @it->name up among @inode's shared xattrs; value size or -errno. */
+static int shared_getxattr(struct inode *inode, struct getxattr_iter *it)
+{
+	struct erofs_vnode *vi = EROFS_V(inode);
+	struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
+	unsigned i;
+	int ret = -ENOATTR;
+
+	for (i = 0; i < vi->xattr_shared_count; ++i) {
+		erofs_blk_t blkaddr =
+			xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]);
+
+		it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]);
+		/* remap only when this entry lives in a different block */
+		if (!i || blkaddr != it->it.blkaddr) {
+			if (i)
+				xattr_iter_end(&it->it, true);
+
+			it->it.page = erofs_get_meta_page(inode->i_sb,
+				blkaddr, false);
+			BUG_ON(IS_ERR(it->it.page));
+			it->it.kaddr = kmap_atomic(it->it.page);
+			it->it.blkaddr = blkaddr;
+		}
+		/* only a mismatch keeps scanning; surface -ERANGE etc. */
+		ret = xattr_foreach(&it->it, &find_xattr_handlers, NULL);
+		if (ret != -ENOATTR)
+			break;
+	}
+	if (vi->xattr_shared_count)
+		xattr_iter_end(&it->it, true);
+	return ret < 0 ? ret : it->buffer_size;
+}
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0))
+static int erofs_xattr_get_prefix(struct erofs_sb_info *sbi,
+	int type, const char **prefix)
+{
+	switch (type) {
+	case EROFS_XATTR_INDEX_USER:
+		if (!test_opt(sbi, XATTR_USER))
+			return -EOPNOTSUPP;
+		*prefix = XATTR_USER_PREFIX;
+		return XATTR_USER_PREFIX_LEN;
+
+	case EROFS_XATTR_INDEX_TRUSTED:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		*prefix = XATTR_TRUSTED_PREFIX;
+		return XATTR_TRUSTED_PREFIX_LEN;
+
+	case EROFS_XATTR_INDEX_SECURITY:
+		*prefix = XATTR_SECURITY_PREFIX;
+		return XATTR_SECURITY_PREFIX_LEN;
+	}
+	return -EINVAL;
+}
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0))
+static size_t erofs_xattr_generic_list(struct dentry *dentry, char *list,
+	size_t list_size, const char *name, size_t name_len, int type)
+#else
+static size_t erofs_xattr_generic_list(const struct xattr_handler *handler,
+	struct dentry *dentry, char *list, size_t list_size,
+	const char *name, size_t name_len)
+#endif
+{
+	struct erofs_sb_info *sbi = EROFS_SB(dentry->d_sb);
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0))
+	int type = handler->flags;
+#endif
+	int total_len, prefix_len;
+	const char *prefix;
+
+	prefix_len = erofs_xattr_get_prefix(sbi, type, &prefix);
+	if (prefix_len < 0)
+		return prefix_len;
+
+	total_len = prefix_len + name_len + 1;
+	if (list && total_len <= list_size) {
+		memcpy(list, prefix, prefix_len);
+		memcpy(list + prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+	return total_len;
+}
+
+#else
+static bool erofs_xattr_user_list(struct dentry *dentry)
+{
+	return test_opt(EROFS_SB(dentry->d_sb), XATTR_USER);
+}
+
+static bool erofs_xattr_trusted_list(struct dentry *dentry)
+{
+	return capable(CAP_SYS_ADMIN);
+}
+#endif
+
+int erofs_getxattr(struct inode *inode, int index,
+	const char *name,
+	void *buffer, size_t buffer_size)
+{
+	int ret;
+	struct getxattr_iter it;
+
+	if (unlikely(name == NULL))
+		return -EINVAL;
+
+	init_inode_xattrs(inode);
+
+	it.index = index;
+
+	it.name.len = strlen(name);
+	if (it.name.len > EROFS_NAME_LEN)
+		return -ERANGE;
+	it.name.name = name;
+
+	it.buffer = buffer;
+	it.buffer_size = buffer_size;
+
+	it.it.sb = inode->i_sb;
+	ret = inline_getxattr(inode, &it);
+	if (ret == -ENOATTR)
+		ret = shared_getxattr(inode, &it);
+	return ret;
+}
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0))
+static int erofs_xattr_generic_get(struct dentry *dentry,
+	const char *name,
+	void *buffer, size_t size, int type)
+{
+	struct inode *inode = d_inode(dentry);
+#else
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 7, 0))
+static int erofs_xattr_generic_get(const struct xattr_handler *handler,
+	struct dentry *dentry, const char *name, void *buffer,
+	size_t size)
+{
+	struct inode *inode = d_inode(dentry);
+#else
+static int erofs_xattr_generic_get(const struct xattr_handler *handler,
+		struct dentry *unused, struct inode *inode,
+		const char *name, void *buffer, size_t size)
+{
+#endif
+#endif
+	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0))
+	switch (type) {
+#else
+	switch (handler->flags) {
+#endif
+
+	case EROFS_XATTR_INDEX_USER:
+		if (!test_opt(sbi, XATTR_USER))
+			return -EOPNOTSUPP;
+		break;
+	case EROFS_XATTR_INDEX_TRUSTED:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		break;
+	case EROFS_XATTR_INDEX_SECURITY:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0))
+	if (name[0] == '\0')
+		return -EINVAL;
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0))
+	return erofs_getxattr(inode, type, name, buffer, size);
+#else
+	return erofs_getxattr(inode, handler->flags, name, buffer, size);
+#endif
+}
+
+const struct xattr_handler erofs_xattr_user_handler = {
+	.prefix	= XATTR_USER_PREFIX,
+	.flags	= EROFS_XATTR_INDEX_USER,
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0))
+	.list	= erofs_xattr_generic_list,
+#else
+	.list	= erofs_xattr_user_list,
+#endif
+
+	.get	= erofs_xattr_generic_get,
+};
+
+const struct xattr_handler erofs_xattr_trusted_handler = {
+	.prefix	= XATTR_TRUSTED_PREFIX,
+	.flags	= EROFS_XATTR_INDEX_TRUSTED,
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0))
+	.list	= erofs_xattr_generic_list,
+#else
+	.list	= erofs_xattr_trusted_list,
+#endif
+	.get	= erofs_xattr_generic_get,
+};
+
+#ifdef CONFIG_EROFS_FS_SECURITY
+const struct xattr_handler __maybe_unused erofs_xattr_security_handler = {
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.flags	= EROFS_XATTR_INDEX_SECURITY,
+	.get	= erofs_xattr_generic_get,
+};
+#endif
+
+#ifdef CONFIG_EROFS_FS_XATTR
+const struct xattr_handler *erofs_xattr_handlers[] = {
+	&erofs_xattr_user_handler,
+#ifdef CONFIG_EROFS_FS_POSIX_ACL
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
+#endif
+	&erofs_xattr_trusted_handler,
+#ifdef CONFIG_EROFS_FS_SECURITY
+	&erofs_xattr_security_handler,
+#endif
+	NULL,
+};
+#endif
+
+struct listxattr_iter {
+	struct xattr_iter it;
+
+	struct dentry *dentry;
+	char *buffer;
+	int buffer_size, buffer_ofs;
+};
+
+/* ->entry hook for listxattr: emit (or just account for) the name prefix */
+static int xattr_entrylist(struct xattr_iter *_it,
+	struct erofs_xattr_entry *entry)
+{
+	struct listxattr_iter *it =
+		container_of(_it, struct listxattr_iter, it);
+	unsigned prefix_len;
+	const char *prefix;
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0))
+	struct erofs_sb_info *sbi = EROFS_SB(it->dentry->d_sb);
+	int ret = erofs_xattr_get_prefix(sbi, entry->e_name_index, &prefix);
+
+	if (ret < 0)
+		return 1;
+	prefix_len = ret;
+#else
+	const struct xattr_handler *h =
+		erofs_xattr_handler(entry->e_name_index);
+
+	if (h == NULL || (h->list != NULL && !h->list(it->dentry)))
+		return 1;
+	/* exactly one of ->prefix / ->name is expected to be set */
+	prefix = h->prefix != NULL ? h->prefix : h->name;
+	prefix_len = strlen(prefix);
+#endif
+
+	if (it->buffer == NULL) {
+		it->buffer_ofs += prefix_len + entry->e_name_len + 1;
+		return 1;
+	}
+
+	if (it->buffer_ofs + prefix_len
+		+ entry->e_name_len + 1 > it->buffer_size)
+		return -ERANGE;
+
+	memcpy(it->buffer + it->buffer_ofs, prefix, prefix_len);
+	it->buffer_ofs += prefix_len;
+	return 0;
+}
+
+static int xattr_namelist(struct xattr_iter *_it,
+	unsigned processed, char *buf, unsigned len)
+{
+	struct listxattr_iter *it =
+		container_of(_it, struct listxattr_iter, it);
+
+	memcpy(it->buffer + it->buffer_ofs, buf, len);
+	it->buffer_ofs += len;
+	return 0;
+}
+
+static int xattr_skipvalue(struct xattr_iter *_it,
+	unsigned value_sz)
+{
+	struct listxattr_iter *it =
+		container_of(_it, struct listxattr_iter, it);
+
+	it->buffer[it->buffer_ofs++] = '\0';
+	return 1;
+}
+
+struct xattr_iter_handlers list_xattr_handlers = {
+	.entry = xattr_entrylist,
+	.name = xattr_namelist,
+	.alloc_buffer = xattr_skipvalue,
+	.value = NULL
+};
+
+static int inline_listxattr(struct listxattr_iter *it)
+{
+	int ret;
+	unsigned remaining;
+
+	ret = inline_xattr_iter_begin(&it->it, d_inode(it->dentry));
+	if (ret < 0)
+		return ret;
+
+	remaining = ret;
+	while (remaining) {
+		if ((ret = xattr_foreach(&it->it,
+			&list_xattr_handlers, &remaining)) < 0)
+			break;
+	}
+	xattr_iter_end(&it->it, true);
+	return ret < 0 ? ret : it->buffer_ofs;
+}
+
+static int shared_listxattr(struct listxattr_iter *it)
+{
+	struct inode *inode = d_inode(it->dentry);
+	struct erofs_vnode *vi = EROFS_V(inode);
+	struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+	unsigned i;
+	int ret = 0;
+
+	for (i = 0; i < vi->xattr_shared_count; ++i) {
+		erofs_blk_t blkaddr =
+			xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]);
+
+		it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]);
+		if (!i || blkaddr != it->it.blkaddr) {
+			if (i)
+				xattr_iter_end(&it->it, true);
+
+			it->it.page = erofs_get_meta_page(inode->i_sb,
+				blkaddr, false);
+			BUG_ON(IS_ERR(it->it.page));
+			it->it.kaddr = kmap_atomic(it->it.page);
+			it->it.blkaddr = blkaddr;
+		}
+
+		if ((ret = xattr_foreach(&it->it,
+			&list_xattr_handlers, NULL)) < 0)
+			break;
+	}
+	if (vi->xattr_shared_count)
+		xattr_iter_end(&it->it, true);
+
+	return ret < 0 ? ret : it->buffer_ofs;
+}
+
+ssize_t erofs_listxattr(struct dentry *dentry,
+	char *buffer, size_t buffer_size)
+{
+	int ret;
+	struct listxattr_iter it;
+
+	init_inode_xattrs(d_inode(dentry));
+
+	it.dentry = dentry;
+	it.buffer = buffer;
+	it.buffer_size = buffer_size;
+	it.buffer_ofs = 0;
+
+	it.it.sb = dentry->d_sb;
+
+	ret = inline_listxattr(&it);
+	if (ret < 0 && ret != -ENOATTR)
+		return ret;
+	return shared_listxattr(&it);
+}
+
diff --git a/drivers/staging/erofs/xattr.h b/drivers/staging/erofs/xattr.h
new file mode 100644
index 0000000..0c73792
--- /dev/null
+++ b/drivers/staging/erofs/xattr.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * linux/drivers/staging/erofs/xattr.h
+ *
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#ifndef __EROFS_XATTR_H
+#define __EROFS_XATTR_H
+
+#include "internal.h"
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+
+/* Attribute not found */
+#define ENOATTR         ENODATA
+
+static inline unsigned inlinexattr_header_size(struct inode *inode)
+{
+	return sizeof(struct erofs_xattr_ibody_header)
+		+ sizeof(u32) * EROFS_V(inode)->xattr_shared_count;
+}
+
+static inline erofs_blk_t
+xattrblock_addr(struct erofs_sb_info *sbi, unsigned xattr_id)
+{
+#ifdef CONFIG_EROFS_FS_XATTR
+	return sbi->xattr_blkaddr +
+		xattr_id * sizeof(__u32) / EROFS_BLKSIZ;
+#else
+	return 0;
+#endif
+}
+
+static inline unsigned
+xattrblock_offset(struct erofs_sb_info *sbi, unsigned xattr_id)
+{
+	return (xattr_id * sizeof(__u32)) % EROFS_BLKSIZ;
+}
+
+extern const struct xattr_handler erofs_xattr_user_handler;
+extern const struct xattr_handler erofs_xattr_trusted_handler;
+#ifdef CONFIG_EROFS_FS_SECURITY
+extern const struct xattr_handler erofs_xattr_security_handler;
+#endif
+
+static inline const struct xattr_handler *erofs_xattr_handler(unsigned index)
+{
+static const struct xattr_handler *xattr_handler_map[] = {
+	[EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler,
+#ifdef CONFIG_EROFS_FS_POSIX_ACL
+	[EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler,
+	[EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] =
+		&posix_acl_default_xattr_handler,
+#endif
+	[EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler,
+#ifdef CONFIG_EROFS_FS_SECURITY
+	[EROFS_XATTR_INDEX_SECURITY] = &erofs_xattr_security_handler,
+#endif
+};
+	return index && index < ARRAY_SIZE(xattr_handler_map) ?
+		xattr_handler_map[index] : NULL;
+}
+
+#ifdef CONFIG_EROFS_FS_XATTR
+
+extern const struct inode_operations erofs_generic_xattr_iops;
+extern const struct inode_operations erofs_dir_xattr_iops;
+
+int erofs_getxattr(struct inode *, int, const char *, void *, size_t);
+ssize_t erofs_listxattr(struct dentry *, char *, size_t);
+#else
+/* !CONFIG_EROFS_FS_XATTR stubs: fail with an errno userspace understands */
+static int __maybe_unused erofs_getxattr(struct inode *inode, int index,
+	const char *name, void *buffer, size_t buffer_size)
+{
+	return -EOPNOTSUPP;
+}
+
+static ssize_t __maybe_unused erofs_listxattr(struct dentry *dentry,
+	char *buffer, size_t buffer_size)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
+#endif
+
-- 
1.9.1

^ permalink raw reply related

* [RFC PATCH 09/25] staging: erofs: update Kconfig and Makefile
From: Gao Xiang @ 2018-07-24  2:36 UTC (permalink / raw)

In-Reply-To: <1532399805-65674-1-git-send-email-gaoxiang25@huawei.com>

This commit adds Makefile and Kconfig for erofs, and
updates Makefile and Kconfig files in the fs directory.

Signed-off-by: Miao Xie <miaoxie at huawei.com>
Signed-off-by: Chao Yu <yuchao0 at huawei.com>
Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 drivers/staging/Kconfig        |  2 ++
 drivers/staging/Makefile       |  1 +
 drivers/staging/erofs/Kconfig  | 36 ++++++++++++++++++++++++++++++++++++
 drivers/staging/erofs/Makefile | 11 +++++++++++
 4 files changed, 50 insertions(+)
 create mode 100644 drivers/staging/erofs/Kconfig
 create mode 100644 drivers/staging/erofs/Makefile

diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
index 75a4804..de9c95a 100644
--- a/drivers/staging/Kconfig
+++ b/drivers/staging/Kconfig
@@ -126,4 +126,6 @@ source "drivers/staging/mt7621-eth/Kconfig"
 
 source "drivers/staging/mt7621-dts/Kconfig"
 
+source "drivers/staging/erofs/Kconfig"
+
 endif # STAGING
diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile
index e84959a..a33ac34 100644
--- a/drivers/staging/Makefile
+++ b/drivers/staging/Makefile
@@ -54,3 +54,4 @@ obj-$(CONFIG_SOC_MT7621)	+= mt7621-dma/
 obj-$(CONFIG_SOC_MT7621)	+= mt7621-mmc/
 obj-$(CONFIG_SOC_MT7621)	+= mt7621-eth/
 obj-$(CONFIG_SOC_MT7621)	+= mt7621-dts/
+obj-$(CONFIG_EROFS_FS)		+= erofs/
diff --git a/drivers/staging/erofs/Kconfig b/drivers/staging/erofs/Kconfig
new file mode 100644
index 0000000..077430f
--- /dev/null
+++ b/drivers/staging/erofs/Kconfig
@@ -0,0 +1,36 @@
+# SPDX-License-Identifier: GPL-2.0
+
+config EROFS_FS
+	tristate "EROFS filesystem support"
+	depends on BLOCK
+	help
+	  EROFS (Enhanced Read-Only File System) is a lightweight
+	  read-only file system with a modern design (e.g. page-sized
+	  blocks, inline xattrs/data, etc.) for scenarios which need
+	  high-performance read-only access, e.g. firmware in
+	  mobile phones or live CDs.
+
+	  It also provides VLE compression support, focusing on
+	  random read improvements, keeping relatively lower
+	  compression ratios, which is useful for high-performance
+	  devices with limited memory and ROM space.
+
+	  If unsure, say N.
+
+config EROFS_FS_DEBUG
+	bool "EROFS debugging feature"
+	depends on EROFS_FS
+	help
+	  Print EROFS debugging messages and enable more BUG_ONs
+	  which check the filesystem consistency aggressively.
+
+	  For daily use, say N.
+
+config EROFS_FS_USE_VM_MAP_RAM
+	bool "EROFS VM_MAP_RAM Support"
+	depends on EROFS_FS
+	help
+	  Use vm_map_ram/vm_unmap_ram instead of vmap/vunmap.
+
+	  If you don't know what these are, say N.
+
diff --git a/drivers/staging/erofs/Makefile b/drivers/staging/erofs/Makefile
new file mode 100644
index 0000000..31e909e
--- /dev/null
+++ b/drivers/staging/erofs/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0
+
+EROFS_VERSION = "1.0pre1"
+
+ccflags-y += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\"
+
+obj-$(CONFIG_EROFS_FS) += erofs.o
+# staging requirement: to be self-contained in its own directory
+ccflags-y += -I$(src)/include
+erofs-objs := super.o inode.o data.o namei.o dir.o
+
-- 
1.9.1

^ permalink raw reply related

* [RFC PATCH 08/25] staging: erofs: definitions for kernel compatibility
From: Gao Xiang @ 2018-07-24  2:36 UTC (permalink / raw)

In-Reply-To: <1532399805-65674-1-git-send-email-gaoxiang25@huawei.com>

The erofs file system is designed to build on kernels from 3.1x up to
the latest one; staging.h is introduced to provide this compatibility.

This patch _should_ be dropped in the near future of course.

Signed-off-by: Miao Xie <miaoxie at huawei.com>
Signed-off-by: Chao Yu <yuchao0 at huawei.com>
Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 drivers/staging/erofs/staging.h | 83 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)
 create mode 100644 drivers/staging/erofs/staging.h

diff --git a/drivers/staging/erofs/staging.h b/drivers/staging/erofs/staging.h
new file mode 100644
index 0000000..7712a7b
--- /dev/null
+++ b/drivers/staging/erofs/staging.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/* should be avoided in the future */
+#include <linux/version.h>
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 14, 31))
+__SETPAGEFLAG(Referenced, referenced)
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 0))
+#define d_inode(d) ((d)->d_inode)
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 1, 0))
+#define d_really_is_negative(d) (d_inode(d) == NULL)
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0))
+/* Restricts the given gfp_mask to what the mapping allows. */
+static inline gfp_t mapping_gfp_constraint(
+	struct address_space *mapping,
+	gfp_t gfp_mask)
+{
+	return mapping_gfp_mask(mapping) & gfp_mask;
+}
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 116))
+static inline void inode_nohighmem(struct inode *inode)
+{
+	mapping_set_gfp_mask(inode->i_mapping, GFP_USER);
+}
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0))
+
+/* bio stuffs */
+#define REQ_OP_READ    READ
+#define REQ_OP_WRITE   WRITE
+#define bio_op(bio)    ((bio)->bi_rw & 1)
+
+static inline void bio_set_op_attrs(struct bio *bio,
+	unsigned op, unsigned op_flags) {
+	bio->bi_rw = op | op_flags;
+}
+
+static inline gfp_t readahead_gfp_mask(struct address_space *x)
+{
+	return mapping_gfp_mask(x) |  __GFP_COLD |
+	                              __GFP_NORETRY | __GFP_NOWARN;
+}
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 13))
+#define READ_ONCE(x)		ACCESS_ONCE(x)
+#define WRITE_ONCE(x, val)	(ACCESS_ONCE(x) = (val))
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 18, 40))
+static inline int lockref_put_return(struct lockref *lockref)
+{
+	return -1;
+}
+#endif
+
+#ifndef WQ_NON_REENTRANT
+#define WQ_NON_REENTRANT 0
+#endif
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 6, 0))
+#define page_cache_get(page)            get_page(page)
+#define page_cache_release(page)        put_page(page)
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(4, 14, 0))
+static inline bool sb_rdonly(const struct super_block *sb) {
+	return sb->s_flags & MS_RDONLY;
+}
+
+#define bio_set_dev(bio, bdev)	((bio)->bi_bdev = (bdev))
+
+#endif
+
-- 
1.9.1

^ permalink raw reply related

* [RFC PATCH 07/25] staging: erofs: add namei functions
From: Gao Xiang @ 2018-07-24  2:36 UTC (permalink / raw)

In-Reply-To: <1532399805-65674-1-git-send-email-gaoxiang25@huawei.com>

This commit adds functions that translate names into inodes.

Signed-off-by: Miao Xie <miaoxie at huawei.com>
Signed-off-by: Chao Yu <yuchao0 at huawei.com>
Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 drivers/staging/erofs/namei.c | 243 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 243 insertions(+)
 create mode 100644 drivers/staging/erofs/namei.c

diff --git a/drivers/staging/erofs/namei.c b/drivers/staging/erofs/namei.c
new file mode 100644
index 0000000..27b6712
--- /dev/null
+++ b/drivers/staging/erofs/namei.c
@@ -0,0 +1,243 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/drivers/staging/erofs/namei.c
+ *
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#include "internal.h"
+
+/* relies on the value of qn->len being accurate */
+static inline int dirnamecmp(struct qstr *qn,
+	struct qstr *qd, unsigned *matched)
+{
+	unsigned i = *matched, len = min(qn->len, qd->len);
+loop:
+	if (unlikely(i >= len)) {
+		*matched = i;
+		if (qn->len < qd->len) {
+			/*
+			 * actually (qn->len == qd->len)
+			 * when qd->name[i] == '\0'
+			 */
+			return qd->name[i] == '\0' ? 0 : -1;
+		}
+		return (qn->len > qd->len);
+	}
+
+	if (qn->name[i] != qd->name[i]) {
+		*matched = i;
+		return qn->name[i] > qd->name[i] ? 1 : -1;
+	}
+
+	++i;
+	goto loop;
+}
+
+static struct erofs_dirent *find_target_dirent(
+	struct qstr *name,
+	u8 *data, int maxsize)
+{
+	unsigned ndirents, head, back;
+	unsigned startprfx, endprfx;
+	struct erofs_dirent *const de = (struct erofs_dirent *)data;
+
+	/* make sure that maxsize is valid */
+	BUG_ON(maxsize < sizeof(struct erofs_dirent));
+
+	ndirents = le16_to_cpu(de->nameoff) / sizeof(*de);
+
+	/* corrupted dir (may be unnecessary...) */
+	BUG_ON(!ndirents);
+
+	head = 0;
+	back = ndirents - 1;
+	startprfx = endprfx = 0;
+
+	while (head <= back) {
+		unsigned mid = head + (back - head) / 2;
+		unsigned nameoff = le16_to_cpu(de[mid].nameoff);
+		unsigned matched = min(startprfx, endprfx);
+
+		struct qstr dname = QSTR_INIT(data + nameoff,
+			unlikely(mid >= ndirents - 1) ?
+				maxsize - nameoff :
+				le16_to_cpu(de[mid + 1].nameoff) - nameoff);
+
+		/* string comparison without already matched prefix */
+		int ret = dirnamecmp(name, &dname, &matched);
+
+		if (unlikely(!ret))
+			return de + mid;
+		else if (ret > 0) {
+			head = mid + 1;
+			startprfx = matched;
+		} else if (unlikely(mid < 1))	/* fix "mid" overflow */
+			break;
+		else {
+			back = mid - 1;
+			endprfx = matched;
+		}
+	}
+
+	return ERR_PTR(-ENOENT);
+}
+
+static struct page *find_target_block_classic(
+	struct inode *dir,
+	struct qstr *name, int *_diff)
+{
+	unsigned startprfx, endprfx;
+	unsigned head, back;
+	struct address_space *const mapping = dir->i_mapping;
+	struct page *candidate = ERR_PTR(-ENOENT);
+
+	startprfx = endprfx = 0;
+	head = 0;
+	back = inode_datablocks(dir) - 1;
+
+	while (head <= back) {
+		unsigned mid = head + (back - head) / 2;
+		struct page *page = read_mapping_page(mapping, mid, NULL);
+
+		if (IS_ERR(page)) {
+exact_out:
+			if (!IS_ERR(candidate)) /* valid candidate */
+				put_page(candidate);
+			return page;
+		} else {
+			int diff;
+			unsigned ndirents, matched;
+			struct qstr dname;
+			struct erofs_dirent *de = kmap_atomic(page);
+			unsigned nameoff = le16_to_cpu(de->nameoff);
+
+			ndirents = nameoff / sizeof(*de);
+
+			/* corrupted dir (should have one entry at least) */
+			BUG_ON(!ndirents || nameoff > PAGE_SIZE);
+
+			matched = min(startprfx, endprfx);
+
+			dname.name = (u8 *)de + nameoff;
+			dname.len = ndirents == 1 ?
+				/* since the rest of the last page is 0 */
+				EROFS_BLKSIZ - nameoff
+				: le16_to_cpu(de[1].nameoff) - nameoff;
+
+			/* string comparison without already matched prefix */
+			diff = dirnamecmp(name, &dname, &matched);
+			kunmap_atomic(de);
+
+			if (unlikely(!diff)) {
+				*_diff = 0;
+				goto exact_out;
+			} else if (diff > 0) {
+				head = mid + 1;
+				startprfx = matched;
+
+				if (likely(!IS_ERR(candidate)))
+					put_page(candidate);
+				candidate = page;
+			} else {
+				put_page(page);
+
+				if (unlikely(mid < 1))	/* fix "mid" overflow */
+					break;
+
+				back = mid - 1;
+				endprfx = matched;
+			}
+		}
+	}
+	*_diff = 1;
+	return candidate;
+}
+
+int erofs_namei(struct inode *dir,
+	struct qstr *name,
+	erofs_nid_t *nid, unsigned *d_type)
+{
+	int diff;
+	struct page *page;
+	u8 *data;
+	struct erofs_dirent *de;
+
+	if (unlikely(!dir->i_size))
+		return -ENOENT;
+
+	diff = 1;
+	page = find_target_block_classic(dir, name, &diff);
+
+	if (unlikely(IS_ERR(page)))
+		return PTR_ERR(page);
+
+	data = kmap_atomic(page);
+	/* the target page has been mapped */
+	de = likely(diff) ?
+		/* since the rest of the last page is 0 */
+		find_target_dirent(name, data, EROFS_BLKSIZ) :
+		(struct erofs_dirent *)data;
+
+	if (likely(!IS_ERR(de))) {
+		*nid = le64_to_cpu(de->nid);
+		*d_type = de->file_type;
+	}
+
+	kunmap_atomic(data);
+	put_page(page);
+
+	return IS_ERR(de) ? PTR_ERR(de) : 0;
+}
+
+/* NOTE: i_mutex is already held by vfs */
+struct dentry *erofs_lookup(struct inode *dir,
+	struct dentry *dentry, unsigned int flags)
+{
+	int err;
+	erofs_nid_t nid;
+	unsigned d_type;
+	struct inode *inode;
+
+	DBG_BUGON(!d_really_is_negative(dentry));
+	/* dentry must be unhashed in lookup, no need to worry about */
+	DBG_BUGON(!d_unhashed(dentry));
+
+	/* file name exceeds fs limit */
+	if (unlikely(dentry->d_name.len > EROFS_NAME_LEN))
+		return ERR_PTR(-ENAMETOOLONG);
+
+	/* false uninitialized warnings on gcc 4.8.x */
+	err = erofs_namei(dir, &dentry->d_name, &nid, &d_type);
+
+	if (err == -ENOENT) {
+		/* negative dentry */
+		inode = NULL;
+		goto negative_out;
+	} else if (unlikely(err))
+		return ERR_PTR(err);
+
+	debugln("%s, %s (nid %llu) found, d_type %u", __func__,
+		dentry->d_name.name, nid, d_type);
+
+	inode = erofs_iget(dir->i_sb, nid, d_type == EROFS_FT_DIR);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+negative_out:
+	return d_splice_alias(inode, dentry);
+}
+
+const struct inode_operations erofs_dir_iops = {
+	.lookup = erofs_lookup,
+};
+
+const struct inode_operations erofs_dir_xattr_iops = {
+	.lookup = erofs_lookup,
+};
+
-- 
1.9.1

^ permalink raw reply related

* [RFC PATCH 06/25] staging: erofs: add directory operations
From: Gao Xiang @ 2018-07-24  2:36 UTC (permalink / raw)

In-Reply-To: <1532399805-65674-1-git-send-email-gaoxiang25@huawei.com>

This adds functions for directories, mainly readdir.

Signed-off-by: Miao Xie <miaoxie at huawei.com>
Signed-off-by: Chao Yu <yuchao0 at huawei.com>
Signed-off-by: Gao Xiang <gaoxiang25 at huawei.com>
---
 drivers/staging/erofs/dir.c | 145 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 145 insertions(+)
 create mode 100644 drivers/staging/erofs/dir.c

diff --git a/drivers/staging/erofs/dir.c b/drivers/staging/erofs/dir.c
new file mode 100644
index 0000000..d910424
--- /dev/null
+++ b/drivers/staging/erofs/dir.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/drivers/staging/erofs/dir.c
+ *
+ * Copyright (C) 2017-2018 HUAWEI, Inc.
+ *             http://www.huawei.com/
+ * Created by Gao Xiang <gaoxiang25 at huawei.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+#include "internal.h"
+
+/*
+ * Translation table from the on-disk file type (EROFS_FT_*, used as the
+ * array index) to the DT_* value that is passed to dir_emit().
+ * Callers must range-check the index against EROFS_FT_MAX first.
+ */
+const unsigned char erofs_filetype_table[EROFS_FT_MAX] = {
+	[EROFS_FT_UNKNOWN]	= DT_UNKNOWN,
+	[EROFS_FT_REG_FILE]	= DT_REG,
+	[EROFS_FT_DIR]		= DT_DIR,
+	[EROFS_FT_CHRDEV]	= DT_CHR,
+	[EROFS_FT_BLKDEV]	= DT_BLK,
+	[EROFS_FT_FIFO]		= DT_FIFO,
+	[EROFS_FT_SOCK]		= DT_SOCK,
+	[EROFS_FT_SYMLINK]	= DT_LNK,
+};
+
+/*
+ * erofs_fill_dentries - emit the dirents of one directory block
+ * @ctx:        readdir context handed to dir_emit()
+ * @dentry_blk: start of the mapped directory block
+ * @ofs:        in: byte offset of the first dirent to emit;
+ *              out: offset of the dirent which was not consumed, or
+ *              @maxsize once the whole block has been emitted
+ * @nameoff:    nameoff of the first dirent, i.e. the end of the dirent array
+ * @maxsize:    number of valid bytes in this block
+ *
+ * Returns 0 if the block was fully emitted, 1 if dir_emit() asked to stop,
+ * or -EIO if a dirent describes a name which does not fit in the block.
+ */
+static int erofs_fill_dentries(struct dir_context *ctx,
+	void *dentry_blk, unsigned *ofs,
+	unsigned nameoff, unsigned maxsize)
+{
+	struct erofs_dirent *de = dentry_blk + *ofs;
+	const struct erofs_dirent *end = dentry_blk + nameoff;
+
+	while (de < end) {
+		const char *de_name;
+		int de_namelen;
+		unsigned char d_type;
+#ifdef CONFIG_EROFS_FS_DEBUG
+		unsigned dbg_namelen;
+		unsigned char dbg_namebuf[EROFS_NAME_LEN];
+#endif
+
+		/* a known on-disk file type is the common case */
+		if (likely(de->file_type < EROFS_FT_MAX))
+			d_type = erofs_filetype_table[de->file_type];
+		else
+			d_type = DT_UNKNOWN;
+
+		nameoff = le16_to_cpu(de->nameoff);
+
+		/*
+		 * The name has to start inside the block, otherwise
+		 * "maxsize - nameoff" below underflows and strnlen()
+		 * would run far beyond the mapped page.
+		 */
+		if (unlikely(nameoff > maxsize)) {
+			errln("%s, invalid de nameoff %u (maxsize %u)",
+				__func__, nameoff, maxsize);
+			return -EIO;
+		}
+
+		de_name = (char *)dentry_blk + nameoff;
+
+		de_namelen = unlikely(de + 1 >= end) ?
+			/* last directory entry */
+			strnlen(de_name, maxsize - nameoff) :
+			le16_to_cpu(de[1].nameoff) - nameoff;
+
+		/*
+		 * A corrupted directory was found: fail the read with -EIO
+		 * instead of bringing the whole kernel down because of a
+		 * bad on-disk image.
+		 */
+		if (unlikely(de_namelen < 0 ||
+			de_namelen > EROFS_NAME_LEN ||
+			nameoff + de_namelen > maxsize)) {
+			errln("%s, invalid de namelen %d at nameoff %u",
+				__func__, de_namelen, nameoff);
+			return -EIO;
+		}
+
+#ifdef CONFIG_EROFS_FS_DEBUG
+		dbg_namelen = min(EROFS_NAME_LEN - 1, de_namelen);
+		memcpy(dbg_namebuf, de_name, dbg_namelen);
+		dbg_namebuf[dbg_namelen] = '\0';
+
+		debugln("%s, found de_name %s de_len %d d_type %d", __func__,
+			dbg_namebuf, de_namelen, d_type);
+#endif
+
+		if (!dir_emit(ctx, de_name, de_namelen,
+					le64_to_cpu(de->nid), d_type))
+			/* stopped for some reason, *ofs points at this entry */
+			return 1;
+		++de;
+		*ofs += sizeof(struct erofs_dirent);
+	}
+	/* everything in this block has been emitted */
+	*ofs = maxsize;
+	return 0;
+}
+
+/*
+ * erofs_readdir - iterate over the entries of a directory
+ * @f:   the opened directory
+ * @ctx: readdir context; ctx->pos is the byte offset into the directory
+ *
+ * Walks the directory one block (page) at a time, starting at ctx->pos,
+ * and feeds every dirent to erofs_fill_dentries().
+ *
+ * Returns 0 on success or when the emitter asked to stop, or a negative
+ * errno if a page cannot be read or a directory block is invalid.
+ */
+static int erofs_readdir(struct file *f, struct dir_context *ctx)
+{
+	struct inode *dir = file_inode(f);
+	struct address_space *mapping = dir->i_mapping;
+	const size_t dirsize = i_size_read(dir);
+	unsigned i = ctx->pos / EROFS_BLKSIZ;
+	unsigned ofs = ctx->pos % EROFS_BLKSIZ;
+	int err = 0;
+	bool initial = true;
+
+	while (ctx->pos < dirsize) {
+		struct page *dentry_page;
+		struct erofs_dirent *de;
+		unsigned nameoff, maxsize;
+
+		dentry_page = read_mapping_page(mapping, i, NULL);
+		if (IS_ERR(dentry_page)) {
+			/*
+			 * Neither ctx->pos nor i has advanced at this
+			 * point, so retrying here would spin forever on
+			 * a persistent failure: report the error instead.
+			 */
+			err = PTR_ERR(dentry_page);
+			break;
+		}
+
+		lock_page(dentry_page);
+		de = (struct erofs_dirent *)kmap(dentry_page);
+
+		/* nameoff of the first dirent marks the end of the dirents */
+		nameoff = le16_to_cpu(de->nameoff);
+
+		if (unlikely(nameoff < sizeof(struct erofs_dirent) ||
+			nameoff >= PAGE_SIZE)) {
+			errln("%s, invalid de[0].nameoff %u",
+				__func__, nameoff);
+
+			err = -EIO;
+			goto skip_this;
+		}
+
+		/* valid bytes in this block: rest of the dir, capped to a page */
+		maxsize = min_t(unsigned, dirsize - ctx->pos + ofs, PAGE_SIZE);
+
+		/* search dirents at the arbitrary position */
+		if (unlikely(initial)) {
+			initial = false;
+
+			/* ctx->pos may point into the middle of a dirent */
+			ofs = roundup(ofs, sizeof(struct erofs_dirent));
+			if (unlikely(ofs >= nameoff))
+				goto skip_this;
+		}
+
+		err = erofs_fill_dentries(ctx, de, &ofs, nameoff, maxsize);
+skip_this:
+		kunmap(dentry_page);
+
+		unlock_page(dentry_page);
+		put_page(dentry_page);
+
+		/* remember how far we got, even when bailing out below */
+		ctx->pos = blknr_to_addr(i) + ofs;
+
+		if (unlikely(err))
+			break;
+		++i;
+		ofs = 0;
+	}
+	/* a positive err only means the emitter stopped us early */
+	return err < 0 ? err : 0;
+}
+
+/*
+ * file operations for directories: iteration is done by erofs_readdir(),
+ * llseek and read are wired to the generic helpers.
+ */
+const struct file_operations erofs_dir_fops = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.iterate	= erofs_readdir,
+};
+
-- 
1.9.1

^ permalink raw reply related


This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.