Linux-ARM-Kernel Archive on lore.kernel.org

Linux-ARM-Kernel Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH RESEND 2/2] gpio: axp209: add pinctrl support
From: Chen-Yu Tsai @ 2016-11-24 16:08 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161123141151.25315-3-quentin.schulz@free-electrons.com>

On Wed, Nov 23, 2016 at 10:11 PM, Quentin Schulz
<quentin.schulz@free-electrons.com> wrote:
> The GPIOs present in the AXP209 PMIC have multiple functions. They
> typically allow a pin to be used as GPIO input or output and can also be
> used as ADC or regulator for example.[1]
>
> This adds the possibility to use all functions of the GPIOs present in
> the AXP209 PMIC thanks to pinctrl subsystem.
>
> [1] see registers 90H, 92H and 93H at
>     http://dl.linux-sunxi.org/AXP/AXP209_Datasheet_v1.0en.pdf
>
> Signed-off-by: Quentin Schulz <quentin.schulz@free-electrons.com>
> ---
>  .../devicetree/bindings/gpio/gpio-axp209.txt       |  28 +-
>  drivers/gpio/gpio-axp209.c                         | 551 ++++++++++++++++++---
>  2 files changed, 503 insertions(+), 76 deletions(-)
>
> diff --git a/Documentation/devicetree/bindings/gpio/gpio-axp209.txt b/Documentation/devicetree/bindings/gpio/gpio-axp209.txt
> index a661130..a5bfe87 100644
> --- a/Documentation/devicetree/bindings/gpio/gpio-axp209.txt
> +++ b/Documentation/devicetree/bindings/gpio/gpio-axp209.txt
> @@ -1,4 +1,4 @@
> -AXP209 GPIO controller
> +AXP209 GPIO & pinctrl controller
>
>  This driver follows the usual GPIO bindings found in
>  Documentation/devicetree/bindings/gpio/gpio.txt
> @@ -28,3 +28,29 @@ axp209: pmic at 34 {
>                 #gpio-cells = <2>;
>         };
>  };
> +
> +The GPIOs can be muxed to other functions and therefore, must be a subnode of
> +axp_gpio.
> +
> +Example:
> +
> +&axp_gpio {
> +       gpio0_adc: gpio0_adc {
> +               pin = "GPIO0";
> +               function = "adc";
> +       };
> +};
> +
> +&example_node {
> +       pinctrl-names = "default";
> +       pinctrl-0 = <&gpio0_adc>;
> +};
> +
> +GPIOs and their functions
> +-------------------------
> +
> +GPIO   |       Functions
> +------------------------
> +GPIO0  |       gpio_in, gpio_out, ldo, adc
> +GPIO1  |       gpio_in, gpio_out, ldo, adc
> +GPIO2  |       gpio_in, gpio_out
> diff --git a/drivers/gpio/gpio-axp209.c b/drivers/gpio/gpio-axp209.c
> index 4a346b7..0a64cfc 100644
> --- a/drivers/gpio/gpio-axp209.c
> +++ b/drivers/gpio/gpio-axp209.c
> @@ -1,7 +1,8 @@
>  /*
> - * AXP20x GPIO driver
> + * AXP20x Pin control driver
>   *
>   * Copyright (C) 2016 Maxime Ripard <maxime.ripard@free-electrons.com>
> + * Copyright (C) 2016 Quentin Schulz <quentin.schulz@free-electrons.com>
>   *
>   * This program is free software; you can redistribute it and/or modify it
>   * under  the terms of the GNU General  Public License as published by the
> @@ -21,52 +22,103 @@
>  #include <linux/platform_device.h>
>  #include <linux/regmap.h>
>  #include <linux/slab.h>
> +#include <linux/pinctrl/pinctrl.h>
> +#include <linux/pinctrl/pinmux.h>
> +#include <linux/pinctrl/pinconf-generic.h>
>
>  #define AXP20X_GPIO_FUNCTIONS          0x7
>  #define AXP20X_GPIO_FUNCTION_OUT_LOW   0
>  #define AXP20X_GPIO_FUNCTION_OUT_HIGH  1
>  #define AXP20X_GPIO_FUNCTION_INPUT     2
>
> -struct axp20x_gpio {
> -       struct gpio_chip        chip;
> -       struct regmap           *regmap;
> -};
> +#define AXP20X_PINCTRL_PIN(_pin_num, _pin, _regs)              \
> +       {                                                       \
> +               .number = _pin_num,                             \
> +               .name = _pin,                                   \
> +               .drv_data = _regs,                              \
> +       }
>
> -static int axp20x_gpio_get_reg(unsigned offset)
> -{
> -       switch (offset) {
> -       case 0:
> -               return AXP20X_GPIO0_CTRL;
> -       case 1:
> -               return AXP20X_GPIO1_CTRL;
> -       case 2:
> -               return AXP20X_GPIO2_CTRL;
> +#define AXP20X_PIN(_pin, ...)                                  \
> +       {                                                       \
> +               .pin = _pin,                                    \
> +               .functions = (struct axp20x_desc_function[]) {  \
> +                             __VA_ARGS__, { } },               \
>         }
>
> -       return -EINVAL;
> -}
> +#define AXP20X_FUNCTION(_val, _name)                           \
> +       {                                                       \
> +               .name = _name,                                  \
> +               .muxval = _val,                                 \
> +       }
>
> -static int axp20x_gpio_input(struct gpio_chip *chip, unsigned offset)
> -{
> -       struct axp20x_gpio *gpio = gpiochip_get_data(chip);
> -       int reg;
> +struct axp20x_desc_function {
> +       const char      *name;
> +       u8              muxval;
> +};
>
> -       reg = axp20x_gpio_get_reg(offset);
> -       if (reg < 0)
> -               return reg;
> +struct axp20x_desc_pin {
> +       struct pinctrl_pin_desc         pin;
> +       struct axp20x_desc_function     *functions;
> +};
>
> -       return regmap_update_bits(gpio->regmap, reg,
> -                                 AXP20X_GPIO_FUNCTIONS,
> -                                 AXP20X_GPIO_FUNCTION_INPUT);
> -}
> +struct axp20x_pinctrl_desc {
> +       const struct axp20x_desc_pin    *pins;
> +       int                             npins;
> +       unsigned int                    pin_base;

You do not need pin_base.

> +};
> +
> +struct axp20x_pinctrl_function {
> +       const char      *name;
> +       const char      **groups;
> +       unsigned int    ngroups;
> +};
> +
> +struct axp20x_pinctrl_group {
> +       const char      *name;
> +       unsigned long   config;
> +       unsigned int    pin;
> +};
> +
> +struct axp20x_pctl {
> +       struct pinctrl_dev                      *pctl_dev;
> +       struct device                           *dev;
> +       struct gpio_chip                        chip;
> +       struct regmap                           *regmap;
> +       const struct axp20x_pinctrl_desc        *desc;
> +       struct axp20x_pinctrl_group             *groups;
> +       unsigned int                            ngroups;
> +       struct axp20x_pinctrl_function          *functions;
> +       unsigned int                            nfunctions;
> +};
> +
> +static const struct axp20x_desc_pin axp209_pins[] = {
> +       AXP20X_PIN(AXP20X_PINCTRL_PIN(0, "GPIO0", (void *)AXP20X_GPIO0_CTRL),
> +                  AXP20X_FUNCTION(0x0, "gpio_out"),
> +                  AXP20X_FUNCTION(0x2, "gpio_in"),
> +                  AXP20X_FUNCTION(0x3, "ldo"),
> +                  AXP20X_FUNCTION(0x4, "adc")),
> +       AXP20X_PIN(AXP20X_PINCTRL_PIN(1, "GPIO1", (void *)AXP20X_GPIO1_CTRL),
> +                  AXP20X_FUNCTION(0x0, "gpio_out"),
> +                  AXP20X_FUNCTION(0x2, "gpio_in"),
> +                  AXP20X_FUNCTION(0x3, "ldo"),
> +                  AXP20X_FUNCTION(0x4, "adc")),
> +       AXP20X_PIN(AXP20X_PINCTRL_PIN(2, "GPIO2", (void *)AXP20X_GPIO2_CTRL),
> +                  AXP20X_FUNCTION(0x0, "gpio_out"),
> +                  AXP20X_FUNCTION(0x2, "gpio_in")),
> +};
> +
> +static const struct axp20x_pinctrl_desc axp20x_pinctrl_data = {
> +       .pins   = axp209_pins,
> +       .npins  = ARRAY_SIZE(axp209_pins),
> +};
>
>  static int axp20x_gpio_get(struct gpio_chip *chip, unsigned offset)
>  {
> -       struct axp20x_gpio *gpio = gpiochip_get_data(chip);
> +       struct axp20x_pctl *pctl = gpiochip_get_data(chip);
>         unsigned int val;
>         int ret;
>
> -       ret = regmap_read(gpio->regmap, AXP20X_GPIO20_SS, &val);
> +       ret = regmap_read(pctl->regmap, AXP20X_GPIO20_SS, &val);
>         if (ret)
>                 return ret;
>
> @@ -75,15 +127,12 @@ static int axp20x_gpio_get(struct gpio_chip *chip, unsigned offset)
>
>  static int axp20x_gpio_get_direction(struct gpio_chip *chip, unsigned offset)
>  {
> -       struct axp20x_gpio *gpio = gpiochip_get_data(chip);
> +       struct axp20x_pctl *pctl = gpiochip_get_data(chip);
> +       int pin_reg = (int)pctl->desc->pins[offset].pin.drv_data;
>         unsigned int val;
> -       int reg, ret;
> -
> -       reg = axp20x_gpio_get_reg(offset);
> -       if (reg < 0)
> -               return reg;
> +       int ret;
>
> -       ret = regmap_read(gpio->regmap, reg, &val);
> +       ret = regmap_read(pctl->regmap, pin_reg, &val);
>         if (ret)
>                 return ret;
>
> @@ -102,33 +151,335 @@ static int axp20x_gpio_get_direction(struct gpio_chip *chip, unsigned offset)
>         return val & 2;
>  }
>
> -static int axp20x_gpio_output(struct gpio_chip *chip, unsigned offset,
> +static void axp20x_gpio_set(struct gpio_chip *chip, unsigned int offset,
> +                           int value)
> +{
> +       struct axp20x_pctl *pctl = gpiochip_get_data(chip);
> +       int pin_reg = (int)pctl->desc->pins[offset].pin.drv_data;
> +
> +       regmap_update_bits(pctl->regmap, pin_reg,
> +                          AXP20X_GPIO_FUNCTIONS,
> +                          value ? AXP20X_GPIO_FUNCTION_OUT_HIGH
> +                                : AXP20X_GPIO_FUNCTION_OUT_LOW);
> +}
> +
> +static int axp20x_gpio_input(struct gpio_chip *chip, unsigned int offset)
> +{
> +       return pinctrl_gpio_direction_input(chip->base + offset);
> +}
> +
> +static int axp20x_gpio_output(struct gpio_chip *chip, unsigned int offset,
>                               int value)
>  {
> -       struct axp20x_gpio *gpio = gpiochip_get_data(chip);
> -       int reg;
> +       chip->set(chip, offset, value);
>
> -       reg = axp20x_gpio_get_reg(offset);
> -       if (reg < 0)
> -               return reg;
> +       return 0;
> +}
>
> -       return regmap_update_bits(gpio->regmap, reg,
> -                                 AXP20X_GPIO_FUNCTIONS,
> -                                 value ? AXP20X_GPIO_FUNCTION_OUT_HIGH
> -                                 : AXP20X_GPIO_FUNCTION_OUT_LOW);
> +static int axp20x_pmx_set(struct pinctrl_dev *pctldev, unsigned int offset,
> +                         u8 config)
> +{
> +       struct axp20x_pctl *pctl = pinctrl_dev_get_drvdata(pctldev);
> +       int pin_reg = (int)pctl->desc->pins[offset].pin.drv_data;
> +
> +       return regmap_update_bits(pctl->regmap, pin_reg, AXP20X_GPIO_FUNCTIONS,
> +                                 config);
>  }
>
> -static void axp20x_gpio_set(struct gpio_chip *chip, unsigned offset,
> -                           int value)
> +static int axp20x_pmx_func_cnt(struct pinctrl_dev *pctldev)
> +{
> +       struct axp20x_pctl *pctl = pinctrl_dev_get_drvdata(pctldev);
> +
> +       return pctl->nfunctions;
> +}
> +
> +static const char *axp20x_pmx_func_name(struct pinctrl_dev *pctldev,
> +                                       unsigned int selector)
> +{
> +       struct axp20x_pctl *pctl = pinctrl_dev_get_drvdata(pctldev);
> +
> +       return pctl->functions[selector].name;
> +}
> +
> +static int axp20x_pmx_func_groups(struct pinctrl_dev *pctldev,
> +                                 unsigned int selector,
> +                                 const char * const **groups,
> +                                 unsigned int *num_groups)
> +{
> +       struct axp20x_pctl *pctl = pinctrl_dev_get_drvdata(pctldev);
> +
> +       *groups = pctl->functions[selector].groups;
> +       *num_groups = pctl->functions[selector].ngroups;
> +
> +       return 0;
> +}
> +
> +static struct axp20x_desc_function *
> +axp20x_pinctrl_desc_find_func_by_name(struct axp20x_pctl *pctl,
> +                                     const char *group, const char *func)
> +{
> +       const struct axp20x_desc_pin *pin;
> +       struct axp20x_desc_function *desc_func;
> +       int i;
> +
> +       for (i = 0; i < pctl->desc->npins; i++) {
> +               pin = &pctl->desc->pins[i];
> +
> +               if (!strcmp(pin->pin.name, group)) {
> +                       desc_func = pin->functions;
> +
> +                       while (desc_func->name) {
> +                               if (!strcmp(desc_func->name, func))
> +                                       return desc_func;
> +                               desc_func++;
> +                       }
> +
> +                       /*
> +                        * Pins are uniquely named. Groups are named after one
> +                        * pin name. If one pin matches group name but its
> +                        * function cannot be found, no other pin will match
> +                        * group name.
> +                        */
> +                       return NULL;
> +               }
> +       }
> +
> +       return NULL;
> +}
> +
> +static int axp20x_pmx_set_mux(struct pinctrl_dev *pctldev,
> +                             unsigned int function, unsigned int group)
> +{
> +       struct axp20x_pctl *pctl = pinctrl_dev_get_drvdata(pctldev);
> +       struct axp20x_pinctrl_group *g = pctl->groups + group;
> +       struct axp20x_pinctrl_function *func = pctl->functions + function;
> +       struct axp20x_desc_function *desc_func =
> +               axp20x_pinctrl_desc_find_func_by_name(pctl, g->name,
> +                                                     func->name);
> +       if (!desc_func)
> +               return -EINVAL;
> +
> +       return axp20x_pmx_set(pctldev, g->pin, desc_func->muxval);
> +}
> +
> +static struct axp20x_desc_function *
> +axp20x_pctl_desc_find_func_by_pin(struct axp20x_pctl *pctl, unsigned int offset,
> +                                 const char *func)
> +{
> +       const struct axp20x_desc_pin *pin;
> +       struct axp20x_desc_function *desc_func;
> +       int i;
> +
> +       for (i = 0; i < pctl->desc->npins; i++) {
> +               pin = &pctl->desc->pins[i];
> +
> +               if (pin->pin.number == offset) {
> +                       desc_func = pin->functions;
> +
> +                       while (desc_func->name) {
> +                               if (!strcmp(desc_func->name, func))
> +                                       return desc_func;
> +
> +                               desc_func++;
> +                       }
> +               }
> +       }
> +
> +       return NULL;
> +}
> +
> +static int axp20x_pmx_gpio_set_direction(struct pinctrl_dev *pctldev,
> +                                        struct pinctrl_gpio_range *range,
> +                                        unsigned int offset, bool input)
> +{
> +       struct axp20x_pctl *pctl = pinctrl_dev_get_drvdata(pctldev);
> +       struct axp20x_desc_function *desc_func;
> +       const char *func;
> +
> +       if (input)
> +               func = "gpio_in";
> +       else
> +               func = "gpio_out";
> +
> +       desc_func = axp20x_pctl_desc_find_func_by_pin(pctl, offset, func);
> +       if (!desc_func)
> +               return -EINVAL;
> +
> +       return axp20x_pmx_set(pctldev, offset, desc_func->muxval);
> +}
> +
> +static const struct pinmux_ops axp20x_pmx_ops = {
> +       .get_functions_count    = axp20x_pmx_func_cnt,
> +       .get_function_name      = axp20x_pmx_func_name,
> +       .get_function_groups    = axp20x_pmx_func_groups,
> +       .set_mux                = axp20x_pmx_set_mux,
> +       .gpio_set_direction     = axp20x_pmx_gpio_set_direction,
> +       .strict                 = true,
> +};
> +
> +static int axp20x_groups_cnt(struct pinctrl_dev *pctldev)
> +{
> +       struct axp20x_pctl *pctl = pinctrl_dev_get_drvdata(pctldev);
> +
> +       return pctl->ngroups;
> +}
> +
> +static int axp20x_group_pins(struct pinctrl_dev *pctldev, unsigned int selector,
> +                            const unsigned int **pins, unsigned int *num_pins)
> +{
> +       struct axp20x_pctl *pctl = pinctrl_dev_get_drvdata(pctldev);
> +       struct axp20x_pinctrl_group *g = pctl->groups + selector;
> +
> +       *pins = (unsigned int *)&g->pin;
> +       *num_pins = 1;
> +
> +       return 0;
> +}
> +
> +static const char *axp20x_group_name(struct pinctrl_dev *pctldev,
> +                                    unsigned int selector)
> +{
> +       struct axp20x_pctl *pctl = pinctrl_dev_get_drvdata(pctldev);
> +
> +       return pctl->groups[selector].name;
> +}
> +
> +static const struct pinctrl_ops axp20x_pctrl_ops = {
> +       .dt_node_to_map         = pinconf_generic_dt_node_to_map_group,
> +       .dt_free_map            = pinconf_generic_dt_free_map,
> +       .get_groups_count       = axp20x_groups_cnt,
> +       .get_group_name         = axp20x_group_name,
> +       .get_group_pins         = axp20x_group_pins,
> +};
> +
> +static struct axp20x_pinctrl_function *
> +axp20x_pinctrl_function_by_name(struct axp20x_pctl *pctl, const char *name)
> +{
> +       struct axp20x_pinctrl_function *func = pctl->functions;
> +
> +       while (func->name) {
> +               if (!strcmp(func->name, name))
> +                       return func;
> +               func++;
> +       }
> +
> +       return NULL;
> +}
> +
> +static int axp20x_pinctrl_add_function(struct axp20x_pctl *pctl,
> +                                      const char *name)
>  {
> -       axp20x_gpio_output(chip, offset, value);
> +       struct axp20x_pinctrl_function *func = pctl->functions;
> +
> +       while (func->name) {
> +               if (!strcmp(func->name, name)) {
> +                       func->ngroups++;
> +                       return -EEXIST;
> +               }
> +
> +               func++;
> +       }
> +
> +       func->name = name;
> +       func->ngroups = 1;
> +
> +       pctl->nfunctions++;
> +
> +       return 0;
>  }
>
> -static int axp20x_gpio_probe(struct platform_device *pdev)
> +static int axp20x_attach_group_function(struct platform_device *pdev,
> +                                       const struct axp20x_desc_pin *pin)
> +{
> +       struct axp20x_pctl *pctl = platform_get_drvdata(pdev);
> +       struct axp20x_desc_function *desc_func = pin->functions;
> +       struct axp20x_pinctrl_function *func;
> +       const char **func_grp;
> +
> +       while (desc_func->name) {
> +               func = axp20x_pinctrl_function_by_name(pctl, desc_func->name);
> +               if (!func)
> +                       return -EINVAL;
> +
> +               if (!func->groups) {
> +                       func->groups = devm_kzalloc(&pdev->dev,
> +                                                   func->ngroups * sizeof(const char *),
> +                                                   GFP_KERNEL);
> +                       if (!func->groups)
> +                               return -ENOMEM;
> +               }
> +
> +               func_grp = func->groups;
> +               while (*func_grp)
> +                       func_grp++;
> +
> +               *func_grp = pin->pin.name;
> +               desc_func++;
> +       }
> +
> +       return 0;
> +}
> +
> +static int axp20x_build_state(struct platform_device *pdev)
> +{
> +       struct axp20x_pctl *pctl = platform_get_drvdata(pdev);
> +       unsigned int npins = pctl->desc->npins;
> +       const struct axp20x_desc_pin *pin;
> +       struct axp20x_desc_function *func;
> +       int i, ret;
> +
> +       pctl->ngroups = npins;
> +       pctl->groups = devm_kzalloc(&pdev->dev,
> +                                   pctl->ngroups * sizeof(*pctl->groups),
> +                                   GFP_KERNEL);
> +       if (!pctl->groups)
> +               return -ENOMEM;
> +
> +       for (i = 0; i < npins; i++) {
> +               pctl->groups[i].name = pctl->desc->pins[i].pin.name;
> +               pctl->groups[i].pin = pctl->desc->pins[i].pin.number;
> +       }
> +
> +       /* We assume 4 functions per pin should be enough as a default max */
> +       pctl->functions = devm_kzalloc(&pdev->dev,
> +                                      npins * 4 * sizeof(*pctl->functions),
> +                                      GFP_KERNEL);
> +       if (!pctl->functions)
> +               return -ENOMEM;
> +
> +       /* Create a list of uniquely named functions */
> +       for (i = 0; i < npins; i++) {
> +               pin = &pctl->desc->pins[i];
> +               func = pin->functions;
> +
> +               while (func->name) {
> +                       axp20x_pinctrl_add_function(pctl, func->name);
> +                       func++;
> +               }
> +       }
> +
> +       pctl->functions = krealloc(pctl->functions,
> +                                  pctl->nfunctions * sizeof(*pctl->functions),
> +                                  GFP_KERNEL);
> +
> +       for (i = 0; i < npins; i++) {
> +               pin = &pctl->desc->pins[i];
> +               ret = axp20x_attach_group_function(pdev, pin);
> +               if (ret)
> +                       return ret;
> +       }
> +
> +       return 0;
> +}
> +
> +static int axp20x_pctl_probe(struct platform_device *pdev)
>  {
>         struct axp20x_dev *axp20x = dev_get_drvdata(pdev->dev.parent);
> -       struct axp20x_gpio *gpio;
> -       int ret;
> +       const struct axp20x_desc_pin *pin;
> +       struct axp20x_pctl *pctl;
> +       struct pinctrl_desc *pctrl_desc;
> +       struct pinctrl_pin_desc *pins;
> +       int ret, i;
>
>         if (!of_device_is_available(pdev->dev.of_node))
>                 return -ENODEV;
> @@ -138,51 +489,101 @@ static int axp20x_gpio_probe(struct platform_device *pdev)
>                 return -EINVAL;
>         }
>
> -       gpio = devm_kzalloc(&pdev->dev, sizeof(*gpio), GFP_KERNEL);
> -       if (!gpio)
> +       pctl = devm_kzalloc(&pdev->dev, sizeof(*pctl), GFP_KERNEL);
> +       if (!pctl)
> +               return -ENOMEM;
> +
> +       pctl->chip.base                 = -1;
> +       pctl->chip.can_sleep            = true;
> +       pctl->chip.request              = gpiochip_generic_request;
> +       pctl->chip.free                 = gpiochip_generic_free;
> +       pctl->chip.parent               = &pdev->dev;
> +       pctl->chip.label                = dev_name(&pdev->dev);
> +       pctl->chip.owner                = THIS_MODULE;
> +       pctl->chip.get                  = axp20x_gpio_get;
> +       pctl->chip.get_direction        = axp20x_gpio_get_direction;
> +       pctl->chip.set                  = axp20x_gpio_set;
> +       pctl->chip.direction_input      = axp20x_gpio_input;
> +       pctl->chip.direction_output     = axp20x_gpio_output;
> +       pctl->chip.ngpio                = 3;
> +       pctl->chip.can_sleep            = true;
> +
> +       pctl->regmap = axp20x->regmap;
> +
> +       pctl->desc = &axp20x_pinctrl_data;
> +       pctl->dev = &pdev->dev;
> +
> +       platform_set_drvdata(pdev, pctl);
> +
> +       ret = axp20x_build_state(pdev);
> +       if (ret)
> +               return ret;
> +
> +       pins = devm_kzalloc(&pdev->dev, pctl->desc->npins * sizeof(*pins),
> +                           GFP_KERNEL);
> +       if (!pins)
>                 return -ENOMEM;
>
> -       gpio->chip.base                 = -1;
> -       gpio->chip.can_sleep            = true;
> -       gpio->chip.parent               = &pdev->dev;
> -       gpio->chip.label                = dev_name(&pdev->dev);
> -       gpio->chip.owner                = THIS_MODULE;
> -       gpio->chip.get                  = axp20x_gpio_get;
> -       gpio->chip.get_direction        = axp20x_gpio_get_direction;
> -       gpio->chip.set                  = axp20x_gpio_set;
> -       gpio->chip.direction_input      = axp20x_gpio_input;
> -       gpio->chip.direction_output     = axp20x_gpio_output;
> -       gpio->chip.ngpio                = 3;
> -
> -       gpio->regmap = axp20x->regmap;
> -
> -       ret = devm_gpiochip_add_data(&pdev->dev, &gpio->chip, gpio);
> +       for (i = 0; i < pctl->desc->npins; i++)
> +               pins[i] = pctl->desc->pins[i].pin;
> +
> +       pctrl_desc = devm_kzalloc(&pdev->dev, sizeof(*pctrl_desc), GFP_KERNEL);
> +       if (!pctrl_desc)
> +               return -ENOMEM;
> +
> +       pctrl_desc->name = dev_name(&pdev->dev);
> +       pctrl_desc->owner = THIS_MODULE;
> +       pctrl_desc->pins = pins;
> +       pctrl_desc->npins = pctl->desc->npins;
> +       pctrl_desc->pctlops = &axp20x_pctrl_ops;
> +       pctrl_desc->pmxops = &axp20x_pmx_ops;
> +
> +       pctl->pctl_dev = devm_pinctrl_register(&pdev->dev, pctrl_desc, pctl);
> +       if (IS_ERR(pctl->pctl_dev)) {
> +               dev_err(&pdev->dev, "couldn't register pinctrl driver\n");
> +               return PTR_ERR(pctl->pctl_dev);
> +       }
> +
> +       ret = devm_gpiochip_add_data(&pdev->dev, &pctl->chip, pctl);
>         if (ret) {
>                 dev_err(&pdev->dev, "Failed to register GPIO chip\n");
>                 return ret;
>         }
>
> +       for (i = 0; i < pctl->desc->npins; i++) {
> +               pin = pctl->desc->pins + i;
> +
> +               ret = gpiochip_add_pin_range(&pctl->chip, dev_name(&pdev->dev),
> +                                            pin->pin.number, pin->pin.number,
> +                                            1);

The pins, unlike in sunxi, are sequential and contiguous. There's no need for
the loop. Just add them in one go.

> +               if (ret) {
> +                       dev_err(&pdev->dev, "failed to add pin range\n");
> +                       return ret;
> +               }
> +       }
> +
>         dev_info(&pdev->dev, "AXP209 GPIO driver loaded\n");
>
>         return 0;
>  }
>
> -static const struct of_device_id axp20x_gpio_match[] = {
> +static const struct of_device_id axp20x_pctl_match[] = {
>         { .compatible = "x-powers,axp209-gpio" },
>         { }
>  };
> -MODULE_DEVICE_TABLE(of, axp20x_gpio_match);
> +MODULE_DEVICE_TABLE(of, axp20x_pctl_match);
>
> -static struct platform_driver axp20x_gpio_driver = {
> -       .probe          = axp20x_gpio_probe,
> +static struct platform_driver axp20x_pctl_driver = {
> +       .probe          = axp20x_pctl_probe,
>         .driver = {
>                 .name           = "axp20x-gpio",
> -               .of_match_table = axp20x_gpio_match,
> +               .of_match_table = axp20x_pctl_match,
>         },
>  };
>
> -module_platform_driver(axp20x_gpio_driver);
> +module_platform_driver(axp20x_pctl_driver);
>
>  MODULE_AUTHOR("Maxime Ripard <maxime.ripard@free-electrons.com>");
> +MODULE_AUTHOR("Quentin Schulz <quentin.schulz@free-electrons.com>");
>  MODULE_DESCRIPTION("AXP20x PMIC GPIO driver");
>  MODULE_LICENSE("GPL");
> --
> 2.9.3
>

Apart from the minor comments above, and Thomas' earlier comments,
this patch looks good to me.

ChenYu

^ permalink raw reply

* [kvm-unit-tests PATCH v7 00/11] QEMU MTTCG Test cases
From: Alex Bennée @ 2016-11-24 16:10 UTC (permalink / raw)
  To: linux-arm-kernel

Hi,

Looking at my records it seems as though it has been a while since I
last posted these tests. As I'm hoping to get the final bits of MTTCG
merged upstream on the next QEMU development cycle I've been re-basing
these and getting them cleaned up for merging.

Some of the patches might be worth taking now if the maintainers are
happy to do so (run_test tweaks, libcflat updates?). The others could
do with more serious review. I've CC'd some of the ARM guys to look
over the tlbflush/barrier tests so they can cast their expert eyes
over them ;-)

There are two additions to the series.

The tcg-test is a general torture test aimed at QEMU's TCG execution
model. It stresses the cpu execution loop through the use of
cross-page and computed jumps. It can also add IRQs and self-modifying
code to the mix.

The tlbflush-data test is a new one; the old tlbflush test is renamed
tlbflush-code to better indicate the code path it exercises. The
code test tests the translation invalidation pathways in QEMU, while the
data test exercises the SoftMMU's TLBs and explicitly checks that tlbflush
completion semantics are correct.

The tlbflush-data test passes most of the time on real hardware but
definitely showed the problem with deferred TLB flushes running under
MTTCG QEMU. I've looked at some of the failure cases on real hardware
and it did look like a timestamp appeared on a page that shouldn't
have been accessible at the time - I don't know if this is a real
silicon bug or my misreading of the semantics so I'd appreciate
a comment from the experts.

The code needs to be applied on top of Drew's latest ARM GIC patches
or you can grab my tree from:

  https://github.com/stsquad/kvm-unit-tests/tree/mttcg/current-tests-v7

Cheers,

Alex.

Alex Bennée (11):
  run_tests: allow forcing of acceleration mode
  run_tests: allow disabling of timeouts
  run_tests: allow passing of options to QEMU
  libcflat: add PRI(dux)32 format types
  lib: add isaac prng library from CCAN
  arm/Makefile.common: force -fno-pic
  arm/tlbflush-code: Add TLB flush during code execution test
  arm/tlbflush-data: Add TLB flush during data writes test
  arm/locking-tests: add comprehensive locking test
  arm/barrier-litmus-tests: add simple mp and sal litmus tests
  arm/tcg-test: some basic TCG exercising tests

 Makefile                  |   2 +
 arm/Makefile.arm          |   2 +
 arm/Makefile.arm64        |   2 +
 arm/Makefile.common       |  11 ++
 arm/barrier-litmus-test.c | 437 ++++++++++++++++++++++++++++++++++++++++++++++
 arm/locking-test.c        | 302 ++++++++++++++++++++++++++++++++
 arm/tcg-test-asm.S        | 170 ++++++++++++++++++
 arm/tcg-test-asm64.S      | 169 ++++++++++++++++++
 arm/tcg-test.c            | 337 +++++++++++++++++++++++++++++++++++
 arm/tlbflush-code.c       | 212 ++++++++++++++++++++++
 arm/tlbflush-data.c       | 401 ++++++++++++++++++++++++++++++++++++++++++
 arm/unittests.cfg         | 190 ++++++++++++++++++++
 lib/arm/asm/barrier.h     |  63 ++++++-
 lib/arm64/asm/barrier.h   |  50 ++++++
 lib/libcflat.h            |   5 +
 lib/prng.c                | 162 +++++++++++++++++
 lib/prng.h                |  82 +++++++++
 run_tests.sh              |  18 +-
 scripts/functions.bash    |  13 +-
 scripts/runtime.bash      |   8 +
 20 files changed, 2626 insertions(+), 10 deletions(-)
 create mode 100644 arm/barrier-litmus-test.c
 create mode 100644 arm/locking-test.c
 create mode 100644 arm/tcg-test-asm.S
 create mode 100644 arm/tcg-test-asm64.S
 create mode 100644 arm/tcg-test.c
 create mode 100644 arm/tlbflush-code.c
 create mode 100644 arm/tlbflush-data.c
 create mode 100644 lib/prng.c
 create mode 100644 lib/prng.h

-- 
2.10.1

^ permalink raw reply

* [kvm-unit-tests PATCH v7 01/11] run_tests: allow forcing of acceleration mode
From: Alex Bennée @ 2016-11-24 16:10 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161124161033.11456-1-alex.bennee@linaro.org>

While tests can be pegged to tcg it is useful to override this from time
to time, especially when testing correctness on real systems.
---
 run_tests.sh         | 8 ++++++--
 scripts/runtime.bash | 4 ++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/run_tests.sh b/run_tests.sh
index 254129d..b88c36f 100755
--- a/run_tests.sh
+++ b/run_tests.sh
@@ -13,9 +13,10 @@ function usage()
 {
 cat <<EOF
 
-Usage: $0 [-g group] [-h] [-v]
+Usage: $0 [-g group] [-a accel] [-h] [-v]
 
     -g: Only execute tests in the given group
+    -a: Force acceleration mode (tcg/kvm)
     -h: Output this help text
     -v: Enables verbose mode
 
@@ -28,11 +29,14 @@ EOF
 RUNTIME_arch_run="./$TEST_DIR/run"
 source scripts/runtime.bash
 
-while getopts "g:hv" opt; do
+while getopts "g:a:hv" opt; do
     case $opt in
         g)
             only_group=$OPTARG
             ;;
+        a)
+            force_accel=$OPTARG
+            ;;
         h)
             usage
             exit
diff --git a/scripts/runtime.bash b/scripts/runtime.bash
index 11a40a9..578cf32 100644
--- a/scripts/runtime.bash
+++ b/scripts/runtime.bash
@@ -75,6 +75,10 @@ function run()
         return;
     fi
 
+    if [ -n "$force_accel" ]; then
+        accel=$force_accel
+    fi
+
     if [ -n "$arch" ] && [ "$arch" != "$ARCH" ]; then
         echo "`SKIP` $1 ($arch only)"
         return 2
-- 
2.10.1

^ permalink raw reply related

* [kvm-unit-tests PATCH v7 02/11] run_tests: allow disabling of timeouts
From: Alex Bennée @ 2016-11-24 16:10 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161124161033.11456-1-alex.bennee@linaro.org>

Certainly during development of the tests and MTTCG there are times when
the timeout just gets in the way.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
---
 run_tests.sh         | 8 ++++++--
 scripts/runtime.bash | 4 ++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/run_tests.sh b/run_tests.sh
index b88c36f..4f2e5cb 100755
--- a/run_tests.sh
+++ b/run_tests.sh
@@ -13,10 +13,11 @@ function usage()
 {
 cat <<EOF
 
-Usage: $0 [-g group] [-a accel] [-h] [-v]
+Usage: $0 [-g group] [-a accel] [-t] [-h] [-v]
 
     -g: Only execute tests in the given group
     -a: Force acceleration mode (tcg/kvm)
+    -t: disable timeouts
     -h: Output this help text
     -v: Enables verbose mode
 
@@ -29,7 +30,7 @@ EOF
 RUNTIME_arch_run="./$TEST_DIR/run"
 source scripts/runtime.bash
 
-while getopts "g:a:hv" opt; do
+while getopts "g:a:thv" opt; do
     case $opt in
         g)
             only_group=$OPTARG
@@ -37,6 +38,9 @@ while getopts "g:a:hv" opt; do
         a)
             force_accel=$OPTARG
             ;;
+        t)
+            no_timeout="yes"
+            ;;
         h)
             usage
             exit
diff --git a/scripts/runtime.bash b/scripts/runtime.bash
index 578cf32..968ff6d 100644
--- a/scripts/runtime.bash
+++ b/scripts/runtime.bash
@@ -79,6 +79,10 @@ function run()
         accel=$force_accel
     fi
 
+    if [ "$no_timeout" = "yes" ]; then
+        timeout=""
+    fi
+
     if [ -n "$arch" ] && [ "$arch" != "$ARCH" ]; then
         echo "`SKIP` $1 ($arch only)"
         return 2
-- 
2.10.1

^ permalink raw reply related

* [kvm-unit-tests PATCH v7 03/11] run_tests: allow passing of options to QEMU
From: Alex Bennée @ 2016-11-24 16:10 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161124161033.11456-1-alex.bennee@linaro.org>

This introduces the option -o for passing options directly to QEMU,
which is useful. In my case I'm using it to toggle MTTCG on and off:

  ./run_tests.sh -t -o "-tcg mttcg=on"

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
---
 run_tests.sh           | 10 +++++++---
 scripts/functions.bash | 13 +++++++------
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/run_tests.sh b/run_tests.sh
index 4f2e5cb..05cc7fb 100755
--- a/run_tests.sh
+++ b/run_tests.sh
@@ -13,10 +13,11 @@ function usage()
 {
 cat <<EOF
 
-Usage: $0 [-g group] [-a accel] [-t] [-h] [-v]
+Usage: $0 [-g group] [-a accel] [-o qemu_opts] [-t] [-h] [-v]
 
     -g: Only execute tests in the given group
     -a: Force acceleration mode (tcg/kvm)
+    -o: additional options for QEMU command line
     -t: disable timeouts
     -h: Output this help text
     -v: Enables verbose mode
@@ -30,7 +31,7 @@ EOF
 RUNTIME_arch_run="./$TEST_DIR/run"
 source scripts/runtime.bash
 
-while getopts "g:a:thv" opt; do
+while getopts "g:a:o:thv" opt; do
     case $opt in
         g)
             only_group=$OPTARG
@@ -38,6 +39,9 @@ while getopts "g:a:thv" opt; do
         a)
             force_accel=$OPTARG
             ;;
+        o)
+            extra_opts=$OPTARG
+            ;;
         t)
             no_timeout="yes"
             ;;
@@ -67,4 +71,4 @@ RUNTIME_log_stdout () {
 config=$TEST_DIR/unittests.cfg
 rm -f test.log
 printf "BUILD_HEAD=$(cat build-head)\n\n" > test.log
-for_each_unittest $config run
+for_each_unittest $config run "$extra_opts"
diff --git a/scripts/functions.bash b/scripts/functions.bash
index ee9143c..d38a69e 100644
--- a/scripts/functions.bash
+++ b/scripts/functions.bash
@@ -2,11 +2,12 @@
 function for_each_unittest()
 {
 	local unittests="$1"
-	local cmd="$2"
-	local testname
+        local cmd="$2"
+        local extra_opts=$3
+        local testname
 	local smp
 	local kernel
-	local opts
+        local opts=$extra_opts
 	local groups
 	local arch
 	local check
@@ -21,7 +22,7 @@ function for_each_unittest()
 			testname=${BASH_REMATCH[1]}
 			smp=1
 			kernel=""
-			opts=""
+                        opts=$extra_opts
 			groups=""
 			arch=""
 			check=""
@@ -32,7 +33,7 @@ function for_each_unittest()
 		elif [[ $line =~ ^smp\ *=\ *(.*)$ ]]; then
 			smp=${BASH_REMATCH[1]}
 		elif [[ $line =~ ^extra_params\ *=\ *(.*)$ ]]; then
-			opts=${BASH_REMATCH[1]}
+                        opts="$opts ${BASH_REMATCH[1]}"
 		elif [[ $line =~ ^groups\ *=\ *(.*)$ ]]; then
 			groups=${BASH_REMATCH[1]}
 		elif [[ $line =~ ^arch\ *=\ *(.*)$ ]]; then
@@ -45,6 +46,6 @@ function for_each_unittest()
 			timeout=${BASH_REMATCH[1]}
 		fi
 	done
-	"$cmd" "$testname" "$groups" "$smp" "$kernel" "$opts" "$arch" "$check" "$accel" "$timeout"
+        "$cmd" "$testname" "$groups" "$smp" "$kernel" "$opts" "$arch" "$check" "$accel" "$timeout"
 	exec {fd}<&-
 }
-- 
2.10.1

^ permalink raw reply related

* [kvm-unit-tests PATCH v7 04/11] libcflat: add PRI(dux)32 format types
From: Alex Bennée @ 2016-11-24 16:10 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161124161033.11456-1-alex.bennee@linaro.org>

So we can have portable formatting of uint32_t types.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
---
 lib/libcflat.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/lib/libcflat.h b/lib/libcflat.h
index bdcc561..6dab5be 100644
--- a/lib/libcflat.h
+++ b/lib/libcflat.h
@@ -55,12 +55,17 @@ typedef _Bool		bool;
 #define true  1
 
 #if __SIZEOF_LONG__ == 8
+#  define __PRI32_PREFIX
 #  define __PRI64_PREFIX	"l"
 #  define __PRIPTR_PREFIX	"l"
 #else
+#  define __PRI32_PREFIX        "l"
 #  define __PRI64_PREFIX	"ll"
 #  define __PRIPTR_PREFIX
 #endif
+#define PRId32  __PRI32_PREFIX	"d"
+#define PRIu32  __PRI32_PREFIX	"u"
+#define PRIx32  __PRI32_PREFIX	"x"
 #define PRId64  __PRI64_PREFIX	"d"
 #define PRIu64  __PRI64_PREFIX	"u"
 #define PRIx64  __PRI64_PREFIX	"x"
-- 
2.10.1

^ permalink raw reply related

* [kvm-unit-tests PATCH v7 05/11] lib: add isaac prng library from CCAN
From: Alex Bennée @ 2016-11-24 16:10 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161124161033.11456-1-alex.bennee@linaro.org>

It's often useful to introduce some sort of random variation when
testing several racing CPU conditions. Instead of each test implementing
some half-arsed PRNG, bring in a decent one which has good statistical
randomness. Obviously it is deterministic for a given seed value which
is likely the behaviour you want.

I've pulled in the ISAAC library from CCAN:

    http://ccodearchive.net/info/isaac.html

I shaved off the float related stuff which is less useful for unit
testing and re-indented to fit the style. The original license was
CC0 (Public Domain) which is compatible with the LGPL v2 of
kvm-unit-tests.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
CC: Timothy B. Terriberry <tterribe@xiph.org>
Acked-by: Andrew Jones <drjones@redhat.com>
---
 arm/Makefile.common |   1 +
 lib/prng.c          | 162 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/prng.h          |  82 ++++++++++++++++++++++++++
 3 files changed, 245 insertions(+)
 create mode 100644 lib/prng.c
 create mode 100644 lib/prng.h

diff --git a/arm/Makefile.common b/arm/Makefile.common
index 6c0898f..52f7440 100644
--- a/arm/Makefile.common
+++ b/arm/Makefile.common
@@ -40,6 +40,7 @@ cflatobjs += lib/pci-testdev.o
 cflatobjs += lib/virtio.o
 cflatobjs += lib/virtio-mmio.o
 cflatobjs += lib/chr-testdev.o
+cflatobjs += lib/prng.o
 cflatobjs += lib/arm/io.o
 cflatobjs += lib/arm/setup.o
 cflatobjs += lib/arm/mmu.o
diff --git a/lib/prng.c b/lib/prng.c
new file mode 100644
index 0000000..ebd6df7
--- /dev/null
+++ b/lib/prng.c
@@ -0,0 +1,162 @@
+/*
+ * Pseudo Random Number Generator
+ *
+ * Lifted from ccan modules ilog/isaac under CC0
+ *   - http://ccodearchive.net/info/isaac.html
+ *   - http://ccodearchive.net/info/ilog.html
+ *
+ * And lightly hacked to compile under the KVM unit test environment.
+ * This provides a handy RNG for torture tests that want to vary
+ * delays and the like.
+ *
+ */
+
+/*Written by Timothy B. Terriberry (tterribe at xiph.org) 1999-2009.
+  CC0 (Public domain) - see LICENSE file for details
+  Based on the public domain implementation by Robert J. Jenkins Jr.*/
+
+#include "libcflat.h"
+
+#include <string.h>
+#include "prng.h"
+
+#define ISAAC_MASK        (0xFFFFFFFFU)
+
+/* Extract ISAAC_SZ_LOG bits (starting at bit 2). */
+static inline uint32_t lower_bits(uint32_t x)
+{
+	return (x & ((ISAAC_SZ-1) << 2)) >> 2;
+}
+
+/* Extract next ISAAC_SZ_LOG bits (starting at bit ISAAC_SZ_LOG+2). */
+static inline uint32_t upper_bits(uint32_t y)
+{
+	return (y >> (ISAAC_SZ_LOG+2)) & (ISAAC_SZ-1);
+}
+
+static void isaac_update(isaac_ctx *_ctx){
+	uint32_t *m;
+	uint32_t *r;
+	uint32_t  a;
+	uint32_t  b;
+	uint32_t  x;
+	uint32_t  y;
+	int       i;
+	m=_ctx->m;
+	r=_ctx->r;
+	a=_ctx->a;
+	b=_ctx->b+(++_ctx->c);
+	for(i=0;i<ISAAC_SZ/2;i++){
+		x=m[i];
+		a=(a^a<<13)+m[i+ISAAC_SZ/2];
+		m[i]=y=m[lower_bits(x)]+a+b;
+		r[i]=b=m[upper_bits(y)]+x;
+		x=m[++i];
+		a=(a^a>>6)+m[i+ISAAC_SZ/2];
+		m[i]=y=m[lower_bits(x)]+a+b;
+		r[i]=b=m[upper_bits(y)]+x;
+		x=m[++i];
+		a=(a^a<<2)+m[i+ISAAC_SZ/2];
+		m[i]=y=m[lower_bits(x)]+a+b;
+		r[i]=b=m[upper_bits(y)]+x;
+		x=m[++i];
+		a=(a^a>>16)+m[i+ISAAC_SZ/2];
+		m[i]=y=m[lower_bits(x)]+a+b;
+		r[i]=b=m[upper_bits(y)]+x;
+	}
+	for(i=ISAAC_SZ/2;i<ISAAC_SZ;i++){
+		x=m[i];
+		a=(a^a<<13)+m[i-ISAAC_SZ/2];
+		m[i]=y=m[lower_bits(x)]+a+b;
+		r[i]=b=m[upper_bits(y)]+x;
+		x=m[++i];
+		a=(a^a>>6)+m[i-ISAAC_SZ/2];
+		m[i]=y=m[lower_bits(x)]+a+b;
+		r[i]=b=m[upper_bits(y)]+x;
+		x=m[++i];
+		a=(a^a<<2)+m[i-ISAAC_SZ/2];
+		m[i]=y=m[lower_bits(x)]+a+b;
+		r[i]=b=m[upper_bits(y)]+x;
+		x=m[++i];
+		a=(a^a>>16)+m[i-ISAAC_SZ/2];
+		m[i]=y=m[lower_bits(x)]+a+b;
+		r[i]=b=m[upper_bits(y)]+x;
+	}
+	_ctx->b=b;
+	_ctx->a=a;
+	_ctx->n=ISAAC_SZ;
+}
+
+static void isaac_mix(uint32_t _x[8]){
+	static const unsigned char SHIFT[8]={11,2,8,16,10,4,8,9};
+	int i;
+	for(i=0;i<8;i++){
+		_x[i]^=_x[(i+1)&7]<<SHIFT[i];
+		_x[(i+3)&7]+=_x[i];
+		_x[(i+1)&7]+=_x[(i+2)&7];
+		i++;
+		_x[i]^=_x[(i+1)&7]>>SHIFT[i];
+		_x[(i+3)&7]+=_x[i];
+		_x[(i+1)&7]+=_x[(i+2)&7];
+	}
+}
+
+
+void isaac_init(isaac_ctx *_ctx,const unsigned char *_seed,int _nseed){
+	_ctx->a=_ctx->b=_ctx->c=0;
+	memset(_ctx->r,0,sizeof(_ctx->r));
+	isaac_reseed(_ctx,_seed,_nseed);
+}
+
+void isaac_reseed(isaac_ctx *_ctx,const unsigned char *_seed,int _nseed){
+	uint32_t *m;
+	uint32_t *r;
+	uint32_t  x[8];
+	int       i;
+	int       j;
+	m=_ctx->m;
+	r=_ctx->r;
+	if(_nseed>ISAAC_SEED_SZ_MAX)_nseed=ISAAC_SEED_SZ_MAX;
+	for(i=0;i<_nseed>>2;i++){
+		r[i]^=(uint32_t)_seed[i<<2|3]<<24|(uint32_t)_seed[i<<2|2]<<16|
+			(uint32_t)_seed[i<<2|1]<<8|_seed[i<<2];
+	}
+	_nseed-=i<<2;
+	if(_nseed>0){
+		uint32_t ri;
+		ri=_seed[i<<2];
+		for(j=1;j<_nseed;j++)ri|=(uint32_t)_seed[i<<2|j]<<(j<<3);
+		r[i++]^=ri;
+	}
+	x[0]=x[1]=x[2]=x[3]=x[4]=x[5]=x[6]=x[7]=0x9E3779B9U;
+	for(i=0;i<4;i++)isaac_mix(x);
+	for(i=0;i<ISAAC_SZ;i+=8){
+		for(j=0;j<8;j++)x[j]+=r[i+j];
+		isaac_mix(x);
+		memcpy(m+i,x,sizeof(x));
+	}
+	for(i=0;i<ISAAC_SZ;i+=8){
+		for(j=0;j<8;j++)x[j]+=m[i+j];
+		isaac_mix(x);
+		memcpy(m+i,x,sizeof(x));
+	}
+	isaac_update(_ctx);
+}
+
+uint32_t isaac_next_uint32(isaac_ctx *_ctx){
+	if(!_ctx->n)isaac_update(_ctx);
+	return _ctx->r[--_ctx->n];
+}
+
+uint32_t isaac_next_uint(isaac_ctx *_ctx,uint32_t _n){
+	uint32_t r;
+	uint32_t v;
+	uint32_t d;
+	do{
+		r=isaac_next_uint32(_ctx);
+		v=r%_n;
+		d=r-v;
+	}
+	while(((d+_n-1)&ISAAC_MASK)<d);
+	return v;
+}
diff --git a/lib/prng.h b/lib/prng.h
new file mode 100644
index 0000000..bf5776d
--- /dev/null
+++ b/lib/prng.h
@@ -0,0 +1,82 @@
+/*
+ * PRNG Header
+ */
+#ifndef __PRNG_H__
+#define __PRNG_H__
+
+# include <stdint.h>
+
+
+
+typedef struct isaac_ctx isaac_ctx;
+
+
+
+/*This value may be lowered to reduce memory usage on embedded platforms, at
+  the cost of reducing security and increasing bias.
+  Quoting Bob Jenkins: "The current best guess is that bias is detectable after
+  2**37 values for [ISAAC_SZ_LOG]=3, 2**45 for 4, 2**53 for 5, 2**61 for 6,
+  2**69 for 7, and 2**77 values for [ISAAC_SZ_LOG]=8."*/
+#define ISAAC_SZ_LOG      (8)
+#define ISAAC_SZ          (1<<ISAAC_SZ_LOG)
+#define ISAAC_SEED_SZ_MAX (ISAAC_SZ<<2)
+
+
+
+/*ISAAC is the most advanced of a series of pseudo-random number generators
+  designed by Robert J. Jenkins Jr. in 1996.
+  http://www.burtleburtle.net/bob/rand/isaac.html
+  To quote:
+  No efficient method is known for deducing their internal states.
+  ISAAC requires an amortized 18.75 instructions to produce a 32-bit value.
+  There are no cycles in ISAAC shorter than 2**40 values.
+  The expected cycle length is 2**8295 values.*/
+struct isaac_ctx{
+	unsigned n;
+	uint32_t r[ISAAC_SZ];
+	uint32_t m[ISAAC_SZ];
+	uint32_t a;
+	uint32_t b;
+	uint32_t c;
+};
+
+
+/**
+ * isaac_init - Initialize an instance of the ISAAC random number generator.
+ * @_ctx:   The instance to initialize.
+ * @_seed:  The specified seed bytes.
+ *          This may be NULL if _nseed is less than or equal to zero.
+ * @_nseed: The number of bytes to use for the seed.
+ *          If this is greater than ISAAC_SEED_SZ_MAX, the extra bytes are
+ *           ignored.
+ */
+void isaac_init(isaac_ctx *_ctx,const unsigned char *_seed,int _nseed);
+
+/**
+ * isaac_reseed - Mix a new batch of entropy into the current state.
+ * To reset ISAAC to a known state, call isaac_init() again instead.
+ * @_ctx:   The instance to reseed.
+ * @_seed:  The specified seed bytes.
+ *          This may be NULL if _nseed is zero.
+ * @_nseed: The number of bytes to use for the seed.
+ *          If this is greater than ISAAC_SEED_SZ_MAX, the extra bytes are
+ *           ignored.
+ */
+void isaac_reseed(isaac_ctx *_ctx,const unsigned char *_seed,int _nseed);
+/**
+ * isaac_next_uint32 - Return the next random 32-bit value.
+ * @_ctx: The ISAAC instance to generate the value with.
+ */
+uint32_t isaac_next_uint32(isaac_ctx *_ctx);
+/**
+ * isaac_next_uint - Uniform random integer less than the given value.
+ * @_ctx: The ISAAC instance to generate the value with.
+ * @_n:   The upper bound on the range of numbers returned (not inclusive).
+ *        This must be greater than zero and less than 2**32.
+ *        To return integers in the full range 0...2**32-1, use
+ *         isaac_next_uint32() instead.
+ * Return: An integer uniformly distributed between 0 and _n-1 (inclusive).
+ */
+uint32_t isaac_next_uint(isaac_ctx *_ctx,uint32_t _n);
+
+#endif
-- 
2.10.1

^ permalink raw reply related

* [kvm-unit-tests PATCH v7 06/11] arm/Makefile.common: force -fno-pic
From: Alex Bennée @ 2016-11-24 16:10 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161124161033.11456-1-alex.bennee@linaro.org>

As distro compilers move towards defaults for build hardening for things
like ASLR we need to force -fno-pic. Failure to do so can lead to weird
relocation problems when we build our "flat" binaries.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
---
 arm/Makefile.common | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arm/Makefile.common b/arm/Makefile.common
index 52f7440..cca0d9c 100644
--- a/arm/Makefile.common
+++ b/arm/Makefile.common
@@ -21,6 +21,7 @@ phys_base = $(LOADADDR)
 
 CFLAGS += -std=gnu99
 CFLAGS += -ffreestanding
+CFLAGS += -fno-pic
 CFLAGS += -Wextra
 CFLAGS += -O2
 CFLAGS += -I lib -I lib/libfdt
-- 
2.10.1

^ permalink raw reply related

* [kvm-unit-tests PATCH v7 07/11] arm/tlbflush-code: Add TLB flush during code execution test
From: Alex Bennée @ 2016-11-24 16:10 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161124161033.11456-1-alex.bennee@linaro.org>

This adds a fairly brain dead torture test for TLB flushes intended for
stressing the MTTCG QEMU build. It takes the usual -smp option for
multiple CPUs.

By default CPU0 will do a TLBIALL flush after each cycle. You can
pass options via -append to control additional aspects of the test:

  - "page" flush each page in turn (one per function)
  - "self" do the flush after each computation cycle
  - "verbose" report progress on each computation cycle

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
CC: Mark Rutland <mark.rutland@arm.com>

---
v2
  - rename to tlbflush-test
  - made makefile changes cleaner
  - added self/other flush mode
  - create specific prefix
  - whitespace fixes
v3
  - using new SMP framework for test running
v4
  - merge in the unittests.cfg
v5
  - max out at -smp 4
  - printf fmtfix
v7
  - rename to tlbflush-code
  - int -> bool flags
---
 arm/Makefile.common |   2 +
 arm/tlbflush-code.c | 212 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 arm/unittests.cfg   |  24 ++++++
 3 files changed, 238 insertions(+)
 create mode 100644 arm/tlbflush-code.c

diff --git a/arm/Makefile.common b/arm/Makefile.common
index cca0d9c..de99a6e 100644
--- a/arm/Makefile.common
+++ b/arm/Makefile.common
@@ -13,6 +13,7 @@ tests-common  = $(TEST_DIR)/selftest.flat
 tests-common += $(TEST_DIR)/spinlock-test.flat
 tests-common += $(TEST_DIR)/pci-test.flat
 tests-common += $(TEST_DIR)/gic.flat
+tests-common += $(TEST_DIR)/tlbflush-code.flat
 
 all: test_cases
 
@@ -81,3 +82,4 @@ generated_files = $(asm-offsets)
 test_cases: $(generated_files) $(tests-common) $(tests)
 
 $(TEST_DIR)/selftest.o $(cstart.o): $(asm-offsets)
+$(TEST_DIR)/tlbflush-code.elf: $(cstart.o) $(TEST_DIR)/tlbflush-code.o
diff --git a/arm/tlbflush-code.c b/arm/tlbflush-code.c
new file mode 100644
index 0000000..cb5cdc2
--- /dev/null
+++ b/arm/tlbflush-code.c
@@ -0,0 +1,212 @@
+/*
+ * TLB Flush Race Tests
+ *
+ * These tests are designed to test for incorrect TLB flush semantics
+ * under emulation. The initial CPU will set all the others working on a
+ * computation task and will then trigger TLB flushes across the
+ * system. It doesn't actually need to re-map anything but the flushes
+ * themselves will trigger QEMU's TCG self-modifying code detection
+ * which will invalidate any generated code causing re-translation.
+ * Eventually the code buffer will fill and a general tb_flush() will
+ * be triggered.
+ *
+ * Copyright (C) 2016, Linaro, Alex Bennée <alex.bennee@linaro.org>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2.
+ */
+
+#include <libcflat.h>
+#include <asm/smp.h>
+#include <asm/cpumask.h>
+#include <asm/barrier.h>
+#include <asm/mmu.h>
+
+#define SEQ_LENGTH 10
+#define SEQ_HASH 0x7cd707fe
+
+static cpumask_t smp_test_complete;
+static int flush_count = 1000000;
+static bool flush_self;
+static bool flush_page;
+static bool flush_verbose;
+
+/*
+ * Work functions
+ *
+ * These work functions need to be:
+ *
+ *  - page aligned, so we can flush one function at a time
+ *  - have branches, so QEMU TCG generates multiple basic blocks
+ *  - call across pages, so we exercise the TCG basic block slow path
+ */
+
+/* Adler32 */
+__attribute__((aligned(PAGE_SIZE))) uint32_t hash_array(const void *buf,
+							size_t buflen)
+{
+	const uint8_t *data = (uint8_t *) buf;
+	uint32_t s1 = 1;
+	uint32_t s2 = 0;
+
+	for (size_t n = 0; n < buflen; n++) {
+		s1 = (s1 + data[n]) % 65521;
+		s2 = (s2 + s1) % 65521;
+	}
+	return (s2 << 16) | s1;
+}
+
+__attribute__((aligned(PAGE_SIZE))) void create_fib_sequence(int length,
+							unsigned int *array)
+{
+	int i;
+
+	/* first two values */
+	array[0] = 0;
+	array[1] = 1;
+	for (i=2; i<length; i++) {
+		array[i] = array[i-2] + array[i-1];
+	}
+}
+
+__attribute__((aligned(PAGE_SIZE))) unsigned long long factorial(unsigned int n)
+{
+	unsigned int i;
+	unsigned long long fac = 1;
+	for (i=1; i<=n; i++)
+	{
+		fac = fac * i;
+	}
+	return fac;
+}
+
+__attribute__((aligned(PAGE_SIZE))) void factorial_array
+(unsigned int n, unsigned int *input, unsigned long long *output)
+{
+	unsigned int i;
+	for (i=0; i<n; i++) {
+		output[i] = factorial(input[i]);
+	}
+}
+
+__attribute__((aligned(PAGE_SIZE))) unsigned int do_computation(void)
+{
+	unsigned int fib_array[SEQ_LENGTH];
+	unsigned long long facfib_array[SEQ_LENGTH];
+	uint32_t fib_hash, facfib_hash;
+
+	create_fib_sequence(SEQ_LENGTH, &fib_array[0]);
+	fib_hash = hash_array(&fib_array[0], sizeof(fib_array));
+	factorial_array(SEQ_LENGTH, &fib_array[0], &facfib_array[0]);
+	facfib_hash = hash_array(&facfib_array[0], sizeof(facfib_array));
+
+	return (fib_hash ^ facfib_hash);
+}
+
+/* This provides a table of the work functions so we can flush each
+ * page individually
+ */
+static void * pages[] = {&hash_array, &create_fib_sequence, &factorial,
+			 &factorial_array, &do_computation};
+
+static void do_flush(int i)
+{
+	if (flush_page) {
+		flush_tlb_page((unsigned long)pages[i % ARRAY_SIZE(pages)]);
+	} else {
+		flush_tlb_all();
+	}
+}
+
+
+static void just_compute(void)
+{
+	int i, errors = 0;
+	int cpu = smp_processor_id();
+
+	uint32_t result;
+
+	printf("CPU%d online\n", cpu);
+
+	for (i=0; i < flush_count; i++) {
+		result = do_computation();
+
+		if (result != SEQ_HASH) {
+			errors++;
+			printf("CPU%d: seq%d 0x%"PRIx32"!=0x%x\n",
+				cpu, i, result, SEQ_HASH);
+		}
+
+		if (flush_verbose && (i % 1000) == 0) {
+			printf("CPU%d: seq%d\n", cpu, i);
+		}
+
+		if (flush_self) {
+			do_flush(i);
+		}
+	}
+
+	report("CPU%d: Done - Errors: %d\n", errors == 0, cpu, errors);
+
+	cpumask_set_cpu(cpu, &smp_test_complete);
+	if (cpu != 0)
+		halt();
+}
+
+static void just_flush(void)
+{
+	int cpu = smp_processor_id();
+	int i = 0;
+
+	/* set our CPU as done, keep flushing until everyone else
+	   finished */
+	cpumask_set_cpu(cpu, &smp_test_complete);
+
+	while (!cpumask_full(&smp_test_complete)) {
+		do_flush(i++);
+	}
+
+	report("CPU%d: Done - Triggered %d flushes\n", true, cpu, i);
+}
+
+int main(int argc, char **argv)
+{
+	int cpu, i;
+	char prefix[100];
+
+	for (i=0; i<argc; i++) {
+		char *arg = argv[i];
+
+		if (strcmp(arg, "page") == 0) {
+			flush_page = true;
+                }
+
+                if (strcmp(arg, "self") == 0) {
+			flush_self = true;
+                }
+
+		if (strcmp(arg, "verbose") == 0) {
+			flush_verbose = true;
+                }
+	}
+
+	snprintf(prefix, sizeof(prefix), "tlbflush_%s_%s",
+		flush_page?"page":"all",
+		flush_self?"self":"other");
+	report_prefix_push(prefix);
+
+	for_each_present_cpu(cpu) {
+		if (cpu == 0)
+			continue;
+		smp_boot_secondary(cpu, just_compute);
+	}
+
+	if (flush_self)
+		just_compute();
+	else
+		just_flush();
+
+	while (!cpumask_full(&smp_test_complete))
+		cpu_relax();
+
+	return report_summary();
+}
diff --git a/arm/unittests.cfg b/arm/unittests.cfg
index c7392c7..beaae84 100644
--- a/arm/unittests.cfg
+++ b/arm/unittests.cfg
@@ -72,3 +72,27 @@ file = gic.flat
 smp = $MAX_SMP
 extra_params = -machine gic-version=3 -append 'ipi'
 groups = gic
+
+# TLB Torture Tests
+[tlbflush-code::all_other]
+file = tlbflush-code.flat
+smp = $(($MAX_SMP>4?4:$MAX_SMP))
+groups = tlbflush
+
+[tlbflush-code::page_other]
+file = tlbflush-code.flat
+smp = $(($MAX_SMP>4?4:$MAX_SMP))
+extra_params = -append 'page'
+groups = tlbflush
+
+[tlbflush-code::all_self]
+file = tlbflush-code.flat
+smp = $(($MAX_SMP>4?4:$MAX_SMP))
+extra_params = -append 'self'
+groups = tlbflush
+
+[tlbflush-code::page_self]
+file = tlbflush-code.flat
+smp = $(($MAX_SMP>4?4:$MAX_SMP))
+extra_params = -append 'page self'
+groups = tlbflush
-- 
2.10.1

^ permalink raw reply related

* [kvm-unit-tests PATCH v7 08/11] arm/tlbflush-data: Add TLB flush during data writes test
From: Alex Bennée @ 2016-11-24 16:10 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161124161033.11456-1-alex.bennee@linaro.org>

This test is the cousin of the tlbflush-code test. Instead of flushing
running code it re-maps virtual addresses while a buffer is being filled
up. It then audits the results checking for writes that have ended up in
the wrong place.

While tlbflush-code exercises QEMU's translation invalidation logic this
test stresses the SoftMMU cputlb code and ensures it is semantically
correct.

The test optionally takes two parameters for debugging:

   cycles           - change the default number of test iterations
   page             - flush pages individually instead of all

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
CC: Mark Rutland <mark.rutland@arm.com>
---
 arm/Makefile.common |   2 +
 arm/tlbflush-data.c | 401 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 arm/unittests.cfg   |  12 ++
 3 files changed, 415 insertions(+)
 create mode 100644 arm/tlbflush-data.c

diff --git a/arm/Makefile.common b/arm/Makefile.common
index de99a6e..528166d 100644
--- a/arm/Makefile.common
+++ b/arm/Makefile.common
@@ -14,6 +14,7 @@ tests-common += $(TEST_DIR)/spinlock-test.flat
 tests-common += $(TEST_DIR)/pci-test.flat
 tests-common += $(TEST_DIR)/gic.flat
 tests-common += $(TEST_DIR)/tlbflush-code.flat
+tests-common += $(TEST_DIR)/tlbflush-data.flat
 
 all: test_cases
 
@@ -83,3 +84,4 @@ test_cases: $(generated_files) $(tests-common) $(tests)
 
 $(TEST_DIR)/selftest.o $(cstart.o): $(asm-offsets)
 $(TEST_DIR)/tlbflush-code.elf: $(cstart.o) $(TEST_DIR)/tlbflush-code.o
+$(TEST_DIR)/tlbflush-data.elf: $(cstart.o) $(TEST_DIR)/tlbflush-data.o
diff --git a/arm/tlbflush-data.c b/arm/tlbflush-data.c
new file mode 100644
index 0000000..7920179
--- /dev/null
+++ b/arm/tlbflush-data.c
@@ -0,0 +1,401 @@
+/*
+ * TLB Flush Race Tests
+ *
+ * These tests are designed to test for incorrect TLB flush semantics
+ * under emulation. The initial CPU will set all the others working on
+ * writing to a set of pages. It will then re-map one of the pages
+ * back and forth while recording the timestamps of when each page was
+ * active. The test fails if a write was detected on a page after the
+ * tlbflush switching to a new page should have completed.
+ *
+ * Copyright (C) 2016, Linaro, Alex Bennée <alex.bennee@linaro.org>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2.
+ */
+
+#include <libcflat.h>
+#include <asm/smp.h>
+#include <asm/cpumask.h>
+#include <asm/barrier.h>
+#include <asm/mmu.h>
+
+#define NR_TIMESTAMPS 		((PAGE_SIZE/sizeof(u64)) << 2)
+#define NR_AUDIT_RECORDS	16384
+#define NR_DYNAMIC_PAGES 	3
+#define MAX_CPUS 		8
+
+#define MIN(a, b)		((a) < (b) ? (a) : (b))
+
+typedef struct {
+	u64    		timestamps[NR_TIMESTAMPS];
+} write_buffer;
+
+typedef struct {
+	write_buffer 	*newbuf;
+	u64		time_before_flush;
+	u64		time_after_flush;
+} audit_rec_t;
+
+typedef struct {
+	audit_rec_t 	records[NR_AUDIT_RECORDS];
+} audit_buffer;
+
+typedef struct {
+	write_buffer 	*stable_pages;
+	write_buffer    *dynamic_pages[NR_DYNAMIC_PAGES];
+	audit_buffer 	*audit;
+	unsigned int 	flush_count;
+} test_data_t;
+
+static test_data_t test_data[MAX_CPUS];
+
+static cpumask_t ready;
+static cpumask_t complete;
+
+static bool test_complete;
+static bool flush_verbose;
+static bool flush_by_page;
+static int test_cycles=3;
+static int secondary_cpus;
+
+static write_buffer * alloc_test_pages(void)
+{
+	write_buffer *pg;
+	pg = calloc(NR_TIMESTAMPS, sizeof(u64));
+	return pg;
+}
+
+static void setup_pages_for_cpu(int cpu)
+{
+	unsigned int i;
+
+	test_data[cpu].stable_pages = alloc_test_pages();
+
+	for (i=0; i<NR_DYNAMIC_PAGES; i++) {
+		test_data[cpu].dynamic_pages[i] = alloc_test_pages();
+	}
+
+	test_data[cpu].audit = calloc(NR_AUDIT_RECORDS, sizeof(audit_rec_t));
+}
+
+static audit_rec_t * get_audit_record(audit_buffer *buf, unsigned int record)
+{
+	return &buf->records[record];
+}
+
+/* Sync on a given cpumask */
+static void wait_on(int cpu, cpumask_t *mask)
+{
+	cpumask_set_cpu(cpu, mask);
+	while (!cpumask_full(mask))
+		cpu_relax();
+}
+
+static uint64_t sync_start(void)
+{
+	const uint64_t gate_mask = ~0x7ff;
+	uint64_t gate, now;
+	gate = get_cntvct() & gate_mask;
+	do {
+		now = get_cntvct();
+	} while ((now & gate_mask) == gate);
+
+	return now;
+}
+
+static void do_page_writes(void)
+{
+	unsigned int i, runs = 0;
+	int cpu = smp_processor_id();
+	write_buffer *stable_pages = test_data[cpu].stable_pages;
+	write_buffer *moving_page = test_data[cpu].dynamic_pages[0];
+
+	printf("CPU%d: ready %p/%p @ 0x%08" PRIx64"\n",
+		cpu, stable_pages, moving_page, get_cntvct());
+
+	while (!test_complete) {
+		u64 run_start, run_end;
+
+		smp_mb();
+		wait_on(cpu, &ready);
+		run_start = sync_start();
+
+		for (i = 0; i < NR_TIMESTAMPS; i++) {
+			u64 ts = get_cntvct();
+			moving_page->timestamps[i] = ts;
+			stable_pages->timestamps[i] = ts;
+		}
+
+		run_end = get_cntvct();
+		printf("CPU%d: run %d 0x%" PRIx64 "->0x%" PRIx64 " (%" PRId64 " cycles)\n",
+			cpu, runs++, run_start, run_end, run_end - run_start);
+
+		/* wait on completion - gets clear my main thread*/
+		wait_on(cpu, &complete);
+	}
+}
+
+
+/*
+ * This is the core of the test. Timestamps are taken either side of
+ * the updating of the page table and the flush instruction. By
+ * keeping track of when the page mapping is changed we can detect any
+ * writes that shouldn't have made it to the other pages.
+ *
+ * This isn't the recommended way to update the page table. ARM
+ * recommends break-before-make so accesses that are in flight can
+ * trigger faults that can be handled cleanly.
+ */
+
+/* This mimics  __flush_tlb_range from the kernel, doing a series of
+ * flush operations and then the dsb() to complete. */
+static void flush_pages(unsigned long start, unsigned long end)
+{
+	unsigned long addr;
+	start = start >> 12;
+	end = end >> 12;
+
+	dsb(ishst);
+	for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT -12)) {
+#if defined(__aarch64__)
+		asm("tlbi	vaae1is, %0" :: "r" (addr));
+#else
+		asm volatile("mcr p15, 0, %0, c8, c7, 3" :: "r" (addr));
+#endif
+	}
+	dsb(ish);
+}
+
+static void remap_one_page(test_data_t *data)
+{
+	u64 ts_before, ts_after;
+	int pg = (data->flush_count % (NR_DYNAMIC_PAGES + 1));
+	write_buffer *dynamic_pages_vaddr = data->dynamic_pages[0];
+	write_buffer *newbuf_paddr = data->dynamic_pages[pg];
+	write_buffer *end_page_paddr = newbuf_paddr+1;
+
+	ts_before = get_cntvct();
+	/* update the page table */
+	mmu_set_range_ptes(mmu_idmap,
+			(unsigned long) dynamic_pages_vaddr,
+			(unsigned long) newbuf_paddr,
+			(unsigned long) end_page_paddr,
+			__pgprot(PTE_WBWA));
+	/* until the flush + isb() writes may still go to old address */
+	if (flush_by_page) {
+		flush_pages((unsigned long)dynamic_pages_vaddr, (unsigned long)(dynamic_pages_vaddr+1));
+	} else {
+		flush_tlb_all();
+	}
+	ts_after = get_cntvct();
+
+	if (data->flush_count < NR_AUDIT_RECORDS) {
+		audit_rec_t *rec = get_audit_record(data->audit, data->flush_count);
+		rec->newbuf = newbuf_paddr;
+		rec->time_before_flush = ts_before;
+		rec->time_after_flush = ts_after;
+	}
+	data->flush_count++;
+}
+
+static int check_pages(int cpu, char *msg,
+		write_buffer *base_page, write_buffer *test_page,
+		audit_buffer *audit, unsigned int flushes)
+{
+	write_buffer *prev_page = base_page;
+	unsigned int empty = 0, write = 0, late = 0, weird = 0;
+	unsigned int ts_index = 0, audit_index;
+	u64 ts;
+
+	/* For each audit record */
+	for (audit_index = 0; audit_index < MIN(flushes, NR_AUDIT_RECORDS); audit_index++) {
+		audit_rec_t *rec = get_audit_record(audit, audit_index);
+
+		do {
+			/* Work through timestamps until we overtake
+			 * this audit record */
+			ts = test_page->timestamps[ts_index];
+
+			if (ts == 0) {
+				empty++;
+			} else if (ts < rec->time_before_flush) {
+				if (test_page == prev_page) {
+					write++;
+				} else {
+					late++;
+				}
+			} else if (ts >= rec->time_before_flush
+				&& ts <= rec->time_after_flush) {
+				if (test_page == prev_page
+					|| test_page == rec->newbuf) {
+					write++;
+				} else {
+					weird++;
+				}
+			} else if (ts > rec->time_after_flush) {
+				if (test_page == rec->newbuf) {
+					write++;
+				}
+				/* It's possible the ts is way ahead
+				 * of the current record so we can't
+				 * call a non-match weird...
+				 *
+				 * Time to skip to next audit record
+				 */
+				break;
+			}
+
+			ts = test_page->timestamps[ts_index++];
+		} while (ts <= rec->time_after_flush && ts_index < NR_TIMESTAMPS);
+
+
+		/* Next record */
+		prev_page = rec->newbuf;
+	} /* for each audit record */
+
+	if (flush_verbose) {
+		printf("CPU%d: %s %p => %p %u/%u/%u/%u (0/OK/L/?) = %u total\n",
+			cpu, msg, test_page, base_page,
+			empty, write, late, weird, empty+write+late+weird);
+	}
+
+	return weird;
+}
+
+static int audit_cpu_pages(int cpu, test_data_t *data)
+{
+	unsigned int pg, writes=0, ts_index = 0;
+	write_buffer *test_page;
+	int errors = 0;
+
+	/* first the stable page */
+	test_page = data->stable_pages;
+	do {
+		if (test_page->timestamps[ts_index++]) {
+			writes++;
+		}
+	} while (ts_index < NR_TIMESTAMPS);
+
+	if (writes != ts_index) {
+		errors += 1;
+	}
+
+	if (flush_verbose) {
+		printf("CPU%d: stable page %p %u writes\n",
+			cpu, test_page, writes);
+	}
+
+
+	/* Restore the mapping for dynamic page */
+	test_page = data->dynamic_pages[0];
+
+	mmu_set_range_ptes(mmu_idmap,
+			(unsigned long) test_page,
+			(unsigned long) test_page,
+			(unsigned long) &test_page[1],
+			__pgprot(PTE_WBWA));
+	flush_tlb_all();
+
+	for (pg=0; pg<NR_DYNAMIC_PAGES; pg++) {
+		errors += check_pages(cpu, "dynamic page", test_page,
+				data->dynamic_pages[pg],
+				data->audit, data->flush_count);
+	}
+
+	/* reset for next run */
+	memset(data->stable_pages, 0, sizeof(write_buffer));
+	for (pg=0; pg<NR_DYNAMIC_PAGES; pg++) {
+		memset(data->dynamic_pages[pg], 0, sizeof(write_buffer));
+	}
+	memset(data->audit, 0, sizeof(audit_buffer));
+	data->flush_count = 0;
+	smp_mb();
+
+	report("CPU%d: checked, errors: %d", errors == 0, cpu, errors);
+	return errors;
+}
+
+static void do_page_flushes(void)
+{
+	int i, cpu;
+
+	printf("CPU0: ready @ 0x%08" PRIx64"\n", get_cntvct());
+
+	for (i=0; i<test_cycles; i++) {
+		unsigned int flushes=0;
+		u64 run_start, run_end;
+		int cpus_finished;
+
+		cpumask_clear(&complete);
+		wait_on(0, &ready);
+		run_start = sync_start();
+
+		do {
+			for_each_present_cpu(cpu) {
+				if (cpu == 0)
+					continue;
+
+				/* do remap & flush */
+				remap_one_page(&test_data[cpu]);
+				flushes++;
+			}
+
+			cpus_finished = cpumask_weight(&complete);
+		} while (cpus_finished < secondary_cpus);
+
+		run_end = get_cntvct();
+
+		printf("CPU0: run %d 0x%" PRIx64 "->0x%" PRIx64 " (%" PRId64 " cycles, %u flushes)\n",
+			i, run_start, run_end, run_end - run_start, flushes);
+
+		/* Reset our ready mask for next cycle */
+		cpumask_clear_cpu(0, &ready);
+		smp_mb();
+		wait_on(0, &complete);
+
+		/* Check for discrepancies */
+		for_each_present_cpu(cpu) {
+			if (cpu == 0)
+				continue;
+			audit_cpu_pages(cpu, &test_data[cpu]);
+		}
+	}
+
+	test_complete = true;
+	smp_mb();
+	cpumask_set_cpu(0, &ready);
+	cpumask_set_cpu(0, &complete);
+}
+
+int main(int argc, char **argv)
+{
+	int cpu, i;
+
+	for (i=0; i<argc; i++) {
+		char *arg = argv[i];
+		if (strcmp(arg, "verbose") == 0) {
+			flush_verbose = true;
+		}
+		if (strcmp(arg, "page") == 0) {
+			flush_by_page = true;
+		}
+		if (strstr(arg, "cycles=") != NULL) {
+			char *p = strstr(arg, "=");
+			test_cycles = atol(p+1);
+		}
+	}
+
+	for_each_present_cpu(cpu) {
+		if (cpu == 0)
+			continue;
+
+		setup_pages_for_cpu(cpu);
+		smp_boot_secondary(cpu, do_page_writes);
+		secondary_cpus++;
+	}
+
+	/* CPU 0 does the flushes and checks the results */
+	do_page_flushes();
+
+	return report_summary();
+}
diff --git a/arm/unittests.cfg b/arm/unittests.cfg
index beaae84..7dc7799 100644
--- a/arm/unittests.cfg
+++ b/arm/unittests.cfg
@@ -96,3 +96,15 @@ file = tlbflush-code.flat
 smp = $(($MAX_SMP>4?4:$MAX_SMP))
 extra_params = -append 'page self'
 groups = tlbflush
+
+[tlbflush-data::all]
+file = tlbflush-data.flat
+smp = $(($MAX_SMP>4?4:$MAX_SMP))
+groups = tlbflush
+
+[tlbflush-data::page]
+file = tlbflush-data.flat
+smp = $(($MAX_SMP>4?4:$MAX_SMP))
+extra_params = -append "page"
+groups = tlbflush
+
-- 
2.10.1

^ permalink raw reply related

* [kvm-unit-tests PATCH v7 09/11] arm/locking-tests: add comprehensive locking test
From: Alex Bennée @ 2016-11-24 16:10 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161124161033.11456-1-alex.bennee@linaro.org>

This test has been written mainly to stress multi-threaded TCG behaviour
but will demonstrate failure by default on real hardware. The test takes
the following parameters:

  - "lock" use GCC's locking semantics
  - "atomic" use GCC's __atomic primitives
  - "wfelock" use WaitForEvent sleep
  - "excl" use load/store exclusive semantics

Also two more options allow the test to be tweaked

  - "noshuffle" disables the memory shuffling
  - "count=%ld" set your own per-CPU increment count

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

---
v2
  - Don't use thumb style strexeq stuff
  - Add atomic and wfelock tests
  - Add count/noshuffle test controls
  - Move barrier tests to separate test file
v4
  - fix up unittests.cfg to use correct test name
  - move into "locking" group, remove barrier tests
  - use a table to add tests, mark which are expected to work
  - correctly report XFAIL
v5
  - max out at -smp 4 in unittests.cfg
v7
  - make test control flags bools
  - default the count to 100000 (so it doesn't timeout)
---
 arm/Makefile.common |   2 +
 arm/locking-test.c  | 302 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 arm/unittests.cfg   |  34 ++++++
 3 files changed, 338 insertions(+)
 create mode 100644 arm/locking-test.c

diff --git a/arm/Makefile.common b/arm/Makefile.common
index 528166d..eb4cfdf 100644
--- a/arm/Makefile.common
+++ b/arm/Makefile.common
@@ -15,6 +15,7 @@ tests-common += $(TEST_DIR)/pci-test.flat
 tests-common += $(TEST_DIR)/gic.flat
 tests-common += $(TEST_DIR)/tlbflush-code.flat
 tests-common += $(TEST_DIR)/tlbflush-data.flat
+tests-common += $(TEST_DIR)/locking-test.flat
 
 all: test_cases
 
@@ -85,3 +86,4 @@ test_cases: $(generated_files) $(tests-common) $(tests)
 $(TEST_DIR)/selftest.o $(cstart.o): $(asm-offsets)
 $(TEST_DIR)/tlbflush-code.elf: $(cstart.o) $(TEST_DIR)/tlbflush-code.o
 $(TEST_DIR)/tlbflush-data.elf: $(cstart.o) $(TEST_DIR)/tlbflush-data.o
+$(TEST_DIR)/locking-test.elf: $(cstart.o) $(TEST_DIR)/locking-test.o
diff --git a/arm/locking-test.c b/arm/locking-test.c
new file mode 100644
index 0000000..f10c61b
--- /dev/null
+++ b/arm/locking-test.c
@@ -0,0 +1,302 @@
+#include <libcflat.h>
+#include <asm/smp.h>
+#include <asm/cpumask.h>
+#include <asm/barrier.h>
+#include <asm/mmu.h>
+
+#include <prng.h>
+
+#define MAX_CPUS 8
+
+/* Test definition structure
+ *
+ * A simple structure that describes the test name, expected pass and
+ * increment function.
+ */
+
+/* Function pointers for test */
+typedef void (*inc_fn)(int cpu);
+
+typedef struct {
+	const char *test_name;
+	bool  should_pass;
+	inc_fn main_fn;
+} test_descr_t;
+
+/* How many increments to do */
+static int increment_count = 1000000;
+static bool do_shuffle = true;
+
+/* Shared value all the tests attempt to safely increment using
+ * various forms of atomic locking and exclusive behaviour.
+ */
+static unsigned int shared_value;
+
+/* PAGE_SIZE * uint32_t means we span several pages */
+__attribute__((aligned(PAGE_SIZE))) static uint32_t memory_array[PAGE_SIZE];
+
+/* We use the alignment of the following to ensure accesses to locking
+ * and synchronisation primitives don't interfere with the page of the
+ * shared value
+ */
+__attribute__((aligned(PAGE_SIZE))) static unsigned int per_cpu_value[MAX_CPUS];
+__attribute__((aligned(PAGE_SIZE))) static cpumask_t smp_test_complete;
+__attribute__((aligned(PAGE_SIZE))) struct isaac_ctx prng_context[MAX_CPUS];
+
+/* Some of the approaches use a global lock to prevent contention. */
+static int global_lock;
+
+/* In any SMP setting this *should* fail due to cores stepping on
+ * each other updating the shared variable
+ */
+static void increment_shared(int cpu)
+{
+	(void)cpu;
+
+	shared_value++;
+}
+
+/* GCC __sync primitives are deprecated in favour of __atomic */
+static void increment_shared_with_lock(int cpu)
+{
+	(void)cpu;
+
+	while (__sync_lock_test_and_set(&global_lock, 1));
+	shared_value++;
+	__sync_lock_release(&global_lock);
+}
+
+/* In practice even __ATOMIC_RELAXED uses ARM's ldxr/stxr exclusive
+ * semantics */
+static void increment_shared_with_atomic(int cpu)
+{
+	(void)cpu;
+
+	__atomic_add_fetch(&shared_value, 1, __ATOMIC_SEQ_CST);
+}
+
+
+/*
+ * Load/store exclusive with WFE (wait-for-event)
+ *
+ * See ARMv8 ARM examples:
+ *   Use of Wait For Event (WFE) and Send Event (SEV) with locks
+ */
+
+static void increment_shared_with_wfelock(int cpu)
+{
+	(void)cpu;
+
+#if defined(__aarch64__)
+	asm volatile(
+	"	mov     w1, #1\n"
+	"       sevl\n"
+	"       prfm PSTL1KEEP, [%[lock]]\n"
+	"1:     wfe\n"
+	"	ldaxr	w0, [%[lock]]\n"
+	"	cbnz    w0, 1b\n"
+	"	stxr    w0, w1, [%[lock]]\n"
+	"	cbnz	w0, 1b\n"
+	/* lock held */
+	"	ldr	w0, [%[sptr]]\n"
+	"	add	w0, w0, #0x1\n"
+	"	str	w0, [%[sptr]]\n"
+	/* now release */
+	"	stlr	wzr, [%[lock]]\n"
+	: /* out */
+	: [lock] "r" (&global_lock), [sptr] "r" (&shared_value) /* in */
+	: "w0", "w1", "cc");
+#else
+	asm volatile(
+	"	mov     r1, #1\n"
+	"1:	ldrex	r0, [%[lock]]\n"
+	"	cmp     r0, #0\n"
+	"	wfene\n"
+	"	strexeq r0, r1, [%[lock]]\n"
+	"	cmpeq	r0, #0\n"
+	"	bne	1b\n"
+	"	dmb\n"
+	/* lock held */
+	"	ldr	r0, [%[sptr]]\n"
+	"	add	r0, r0, #0x1\n"
+	"	str	r0, [%[sptr]]\n"
+	/* now release */
+	"	mov	r0, #0\n"
+	"	dmb\n"
+	"	str	r0, [%[lock]]\n"
+	"	dsb\n"
+	"	sev\n"
+	: /* out */
+	: [lock] "r" (&global_lock), [sptr] "r" (&shared_value) /* in */
+	: "r0", "r1", "cc");
+#endif
+}
+
+
+/*
+ * Hand-written version of the load/store exclusive
+ */
+static void increment_shared_with_excl(int cpu)
+{
+	(void)cpu;
+
+#if defined(__aarch64__)
+        asm volatile(
+	"1:	ldxr	w0, [%[sptr]]\n"
+	"	add     w0, w0, #0x1\n"
+	"	stxr	w1, w0, [%[sptr]]\n"
+	"	cbnz	w1, 1b\n"
+	: /* out */
+	: [sptr] "r" (&shared_value) /* in */
+	: "w0", "w1", "cc");
+#else
+	asm volatile(
+	"1:	ldrex	r0, [%[sptr]]\n"
+	"	add     r0, r0, #0x1\n"
+	"	strex	r1, r0, [%[sptr]]\n"
+	"	cmp	r1, #0\n"
+	"	bne	1b\n"
+	: /* out */
+	: [sptr] "r" (&shared_value) /* in */
+	: "r0", "r1", "cc");
+#endif
+}
+
+/* Test array */
+static test_descr_t tests[] = {
+	{ "none", false, increment_shared },
+	{ "lock", true, increment_shared_with_lock },
+	{ "atomic", true, increment_shared_with_atomic },
+	{ "wfelock", true, increment_shared_with_wfelock },
+	{ "excl", true, increment_shared_with_excl }
+};
+
+/* The idea of this is just to generate some random load/store
+ * activity which may or may not race with an un-barriered increment
+ * of the shared counter
+ */
+static void shuffle_memory(int cpu)
+{
+	int i;
+	uint32_t lspat = isaac_next_uint32(&prng_context[cpu]);
+	uint32_t seq = isaac_next_uint32(&prng_context[cpu]);
+	int count = seq & 0x1f;
+	uint32_t val=0;
+
+	seq >>= 5;
+
+	for (i=0; i<count; i++) {
+		int index = seq & ~PAGE_MASK;
+		if (lspat & 1) {
+			val ^= memory_array[index];
+		} else {
+			memory_array[index] = val;
+		}
+		seq >>= PAGE_SHIFT;
+		seq ^= lspat;
+		lspat >>= 1;
+	}
+
+}
+
+static inc_fn increment_function;
+
+static void do_increment(void)
+{
+	int i;
+	int cpu = smp_processor_id();
+
+	printf("CPU%d: online and ++ing\n", cpu);
+
+	for (i=0; i < increment_count; i++) {
+		per_cpu_value[cpu]++;
+		increment_function(cpu);
+
+		if (do_shuffle)
+			shuffle_memory(cpu);
+	}
+
+	printf("CPU%d: Done, %d incs\n", cpu, per_cpu_value[cpu]);
+
+	cpumask_set_cpu(cpu, &smp_test_complete);
+	if (cpu != 0)
+		halt();
+}
+
+static void setup_and_run_test(test_descr_t *test)
+{
+	unsigned int i, sum = 0;
+	int cpu, cpu_cnt = 0;
+
+	increment_function = test->main_fn;
+
+	/* fill our random page */
+        for (i=0; i<PAGE_SIZE; i++) {
+		memory_array[i] = isaac_next_uint32(&prng_context[0]);
+	}
+
+	for_each_present_cpu(cpu) {
+		uint32_t seed2 = isaac_next_uint32(&prng_context[0]);
+		cpu_cnt++;
+		if (cpu == 0)
+			continue;
+
+		isaac_init(&prng_context[cpu], (unsigned char *) &seed2, sizeof(seed2));
+		smp_boot_secondary(cpu, do_increment);
+	}
+
+	do_increment();
+
+	while (!cpumask_full(&smp_test_complete))
+		cpu_relax();
+
+	/* All CPUs done, do we add up */
+	for_each_present_cpu(cpu) {
+		sum += per_cpu_value[cpu];
+	}
+
+	if (test->should_pass) {
+		report("total incs %d", sum == shared_value, shared_value);
+	} else {
+		report_xfail("total incs %d", true, sum == shared_value, shared_value);
+	}
+}
+
+int main(int argc, char **argv)
+{
+	static const unsigned char seed[] = "myseed";
+	test_descr_t *test = &tests[0];
+	int i;
+	unsigned int j;
+
+	isaac_init(&prng_context[0], &seed[0], sizeof(seed));
+
+	for (i=0; i<argc; i++) {
+		char *arg = argv[i];
+
+		/* Check for test name */
+		for (j = 0; j < ARRAY_SIZE(tests); j++) {
+			if (strcmp(arg, tests[j].test_name) == 0)
+				test = &tests[j];
+		}
+
+		/* Test modifiers */
+		if (strcmp(arg, "noshuffle") == 0) {
+			do_shuffle = false;
+			report_prefix_push("noshuffle");
+		} else if (strstr(arg, "count=") != NULL) {
+			char *p = strstr(arg, "=");
+			increment_count = atol(p+1);
+		} else {
+			isaac_reseed(&prng_context[0], (unsigned char *) arg, strlen(arg));
+		}
+	}
+
+	if (test) {
+		setup_and_run_test(test);
+	} else {
+		report("Unknown test", false);
+	}
+
+	return report_summary();
+}
diff --git a/arm/unittests.cfg b/arm/unittests.cfg
index 7dc7799..abbfe79 100644
--- a/arm/unittests.cfg
+++ b/arm/unittests.cfg
@@ -108,3 +108,37 @@ smp = $(($MAX_SMP>4?4:$MAX_SMP))
 extra_params = -append "page"
 groups = tlbflush
 
+# Locking tests
+[locking::none]
+file = locking-test.flat
+smp = $(($MAX_SMP>4?4:$MAX_SMP))
+groups = locking
+accel = tcg
+
+[locking::lock]
+file = locking-test.flat
+smp = $(($MAX_SMP>4?4:$MAX_SMP))
+extra_params = -append 'lock'
+groups = locking
+accel = tcg
+
+[locking::atomic]
+file = locking-test.flat
+smp = $(($MAX_SMP>4?4:$MAX_SMP))
+extra_params = -append 'atomic'
+groups = locking
+accel = tcg
+
+[locking::wfelock]
+file = locking-test.flat
+smp = $(($MAX_SMP>4?4:$MAX_SMP))
+extra_params = -append 'wfelock'
+groups = locking
+accel = tcg
+
+[locking::excl]
+file = locking-test.flat
+smp = $(($MAX_SMP>4?4:$MAX_SMP))
+extra_params = -append 'excl'
+groups = locking
+accel = tcg
-- 
2.10.1

^ permalink raw reply related

* [kvm-unit-tests PATCH v7 10/11] arm/barrier-litmus-tests: add simple mp and sal litmus tests
From: Alex Bennée @ 2016-11-24 16:10 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161124161033.11456-1-alex.bennee@linaro.org>

This adds a framework for adding simple barrier litmus tests against
ARM. The litmus tests aren't as comprehensive as the academic exercises
which will attempt to do all sorts of things to keep racing CPUs synced
up. These tests do honour the "sync" parameter to do a poor man's
equivalent.

The two litmus tests are:
  - message passing
  - store-after-load

They both have cases that should fail (although they won't on single-threaded
TCG setups). If barriers aren't working properly the store-after-load
test will fail even on an x86 backend as x86 allows re-ordering of
non-aliased stores.

I've imported a few more of the barrier primitives from the Linux source
tree so we consistently use macros.

The arm64 barrier primitives trip up on -Wstrict-aliasing so this is
disabled in the Makefile.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
CC: Will Deacon <will.deacon@arm.com>

---
v7
  - merge in store-after-load
  - clean-up sync-up code
  - use new counter api
  - fix xfail for sal test
v6
  - add a unittest.cfg
  - -fno-strict-aliasing
---
 Makefile                  |   2 +
 arm/Makefile.common       |   2 +
 arm/barrier-litmus-test.c | 437 ++++++++++++++++++++++++++++++++++++++++++++++
 arm/unittests.cfg         |  36 ++++
 lib/arm/asm/barrier.h     |  63 ++++++-
 lib/arm64/asm/barrier.h   |  50 ++++++
 6 files changed, 589 insertions(+), 1 deletion(-)
 create mode 100644 arm/barrier-litmus-test.c

diff --git a/Makefile b/Makefile
index 5201472..53594a1 100644
--- a/Makefile
+++ b/Makefile
@@ -51,10 +51,12 @@ fomit_frame_pointer := $(call cc-option, $(frame-pointer-flag), "")
 fnostack_protector := $(call cc-option, -fno-stack-protector, "")
 fnostack_protector_all := $(call cc-option, -fno-stack-protector-all, "")
 wno_frame_address := $(call cc-option, -Wno-frame-address, "")
+fno_strict_aliasing := $(call cc-option, -fno-strict-aliasing, "")
 CFLAGS += $(fomit_frame_pointer)
 CFLAGS += $(fno_stack_protector)
 CFLAGS += $(fno_stack_protector_all)
 CFLAGS += $(wno_frame_address)
+CFLAGS += $(fno_strict_aliasing)
 
 CXXFLAGS += $(CFLAGS)
 
diff --git a/arm/Makefile.common b/arm/Makefile.common
index eb4cfdf..a508128 100644
--- a/arm/Makefile.common
+++ b/arm/Makefile.common
@@ -16,6 +16,7 @@ tests-common += $(TEST_DIR)/gic.flat
 tests-common += $(TEST_DIR)/tlbflush-code.flat
 tests-common += $(TEST_DIR)/tlbflush-data.flat
 tests-common += $(TEST_DIR)/locking-test.flat
+tests-common += $(TEST_DIR)/barrier-litmus-test.flat
 
 all: test_cases
 
@@ -87,3 +88,4 @@ $(TEST_DIR)/selftest.o $(cstart.o): $(asm-offsets)
 $(TEST_DIR)/tlbflush-code.elf: $(cstart.o) $(TEST_DIR)/tlbflush-code.o
 $(TEST_DIR)/tlbflush-data.elf: $(cstart.o) $(TEST_DIR)/tlbflush-data.o
 $(TEST_DIR)/locking-test.elf: $(cstart.o) $(TEST_DIR)/locking-test.o
+$(TEST_DIR)/barrier-litmus-test.elf: $(cstart.o) $(TEST_DIR)/barrier-litmus-test.o
diff --git a/arm/barrier-litmus-test.c b/arm/barrier-litmus-test.c
new file mode 100644
index 0000000..2557a88
--- /dev/null
+++ b/arm/barrier-litmus-test.c
@@ -0,0 +1,437 @@
+/*
+ * ARM Barrier Litmus Tests
+ *
+ * This test provides a framework for testing barrier conditions on
+ * the processor. It's simpler than the more involved barrier testing
+ * frameworks as we are looking for simple failures of QEMU's TCG not
+ * weird edge cases the silicon gets wrong.
+ */
+
+#include <libcflat.h>
+#include <asm/smp.h>
+#include <asm/cpumask.h>
+#include <asm/barrier.h>
+#include <asm/mmu.h>
+
+#define MAX_CPUS 8
+
+/* Array size and access controls */
+static int array_size = 100000;
+static int wait_if_ahead = 0;
+
+static cpumask_t cpu_mask;
+
+/*
+ * These test_array_* structures are a contiguous array modified by two or more
+ * competing CPUs. The padding is to ensure the variables do not share
+ * cache lines.
+ *
+ * All structures start zeroed.
+ */
+
+typedef struct test_array
+{
+	volatile unsigned int x;
+	uint8_t dummy[64];
+	volatile unsigned int y;
+	uint8_t dummy2[64];
+	volatile unsigned int r[MAX_CPUS];
+} test_array;
+
+volatile test_array *array;
+
+/* Test definition structure
+ *
+ * The first function will always run on the primary CPU, it is
+ * usually the one that will detect any weirdness and trigger the
+ * failure of the test.
+ */
+
+typedef void (*test_fn)(void);
+
+typedef struct {
+	const char *test_name;
+	bool  should_pass;
+	test_fn main_fn;
+	test_fn secondary_fns[MAX_CPUS-1];
+} test_descr_t;
+
+/* Litmus tests */
+
+static unsigned long sync_start(void)
+{
+	const unsigned long gate_mask = ~0x3ffff;
+	unsigned long gate, now;
+	gate = get_cntvct() & gate_mask;
+	do {
+		now =get_cntvct();
+	} while ((now & gate_mask)==gate);
+
+	return now;
+}
+
+/* Simple Message Passing
+ *
+ * x is the message data
+ * y is the flag to indicate the data is ready
+ *
+ * Reading x == 0 when y == 1 is a failure.
+ */
+
+void message_passing_write(void)
+{
+	int i;
+
+	sync_start();
+	for (i=0; i< array_size; i++) {
+		volatile test_array *entry = &array[i];
+		entry->x = 1;
+		entry->y = 1;
+	}
+
+	halt();
+}
+
+void message_passing_read(void)
+{
+	int i;
+	int errors = 0, ready = 0;
+
+	sync_start();
+	for (i=0; i< array_size; i++) {
+		volatile test_array *entry = &array[i];
+		unsigned int x,y;
+		y = entry->y;
+		x = entry->x;
+
+		if (y && !x)
+			errors++;
+		ready += y;
+	}
+
+	report_xfail("mp: %d errors, %d ready", true, errors == 0, errors, ready);
+}
+
+/* Simple Message Passing with barriers */
+void message_passing_write_barrier(void)
+{
+	int i;
+	sync_start();
+	for (i=0; i< array_size; i++) {
+		volatile test_array *entry = &array[i];
+		entry->x = 1;
+		smp_wmb();
+		entry->y = 1;
+	}
+
+	halt();
+}
+
+void message_passing_read_barrier(void)
+{
+	int i;
+	int errors = 0, ready = 0, not_ready = 0;
+
+	sync_start();
+	for (i=0; i< array_size; i++) {
+		volatile test_array *entry = &array[i];
+		unsigned int x, y;
+		y = entry->y;
+		smp_rmb();
+		x = entry->x;
+
+		if (y && !x)
+			errors++;
+
+		if (y) {
+			ready++;
+		} else {
+			not_ready++;
+
+			if (not_ready > 2) {
+				entry = &array[i+1];
+				do {
+					not_ready = 0;
+				} while (wait_if_ahead && !entry->y);
+			}
+		}
+	}
+
+	report("mp barrier: %d errors, %d ready", errors == 0, errors, ready);
+}
+
+/* Simple Message Passing with Acquire/Release */
+void message_passing_write_release(void)
+{
+	int i;
+	for (i=0; i< array_size; i++) {
+		volatile test_array *entry = &array[i];
+		entry->x = 1;
+		smp_store_release(&entry->y, 1);
+	}
+
+	halt();
+}
+
+void message_passing_read_acquire(void)
+{
+	int i;
+	int errors = 0, ready = 0, not_ready = 0;
+
+	for (i=0; i< array_size; i++) {
+		volatile test_array *entry = &array[i];
+		unsigned int x, y;
+		y = smp_load_acquire(&entry->y);
+		x = entry->x;
+
+		if (y && !x)
+			errors++;
+
+		if (y) {
+			ready++;
+		} else {
+			not_ready++;
+
+			if (not_ready > 2) {
+				entry = &array[i+1];
+				do {
+					not_ready = 0;
+				} while (wait_if_ahead && !entry->y);
+			}
+		}
+	}
+
+	report("mp acqrel: %d errors, %d ready", errors == 0, errors, ready);
+}
+
+/*
+ * Store after load
+ *
+ * T1: write 1 to x, load r from y
+ * T2: write 1 to y, load r from x
+ *
+ * Without a memory fence both r[0] == 0 and r[1] == 0 can be observed.
+ * With a memory fence both == 0 should be impossible.
+ */
+
+static void check_store_and_load_results(char *name, int thread, bool xfail,
+					unsigned long start, unsigned long end)
+{
+	int i;
+	int neither = 0;
+	int only_first = 0;
+	int only_second = 0;
+	int both = 0;
+
+	for (i=0; i< array_size; i++) {
+		volatile test_array *entry = &array[i];
+		if (entry->r[0] == 0 &&
+		    entry->r[1] == 0) {
+			neither++;
+		} else if (entry->r[0] &&
+			entry->r[1]) {
+			both++;
+		} else if (entry->r[0]) {
+			only_first++;
+		} else {
+			only_second++;
+		}
+	}
+
+	printf("T%d: %08lx->%08lx neither=%d only_t1=%d only_t2=%d both=%d\n", thread,
+		start, end, neither, only_first, only_second, both);
+
+	if (thread == 1) {
+		if (xfail) {
+			report_xfail("%s: errors=%d", true, neither==0,
+				name, neither);
+		} else {
+			report("%s: errors=%d", neither==0, name, neither);
+		}
+
+	}
+}
+
+/*
+ * This attempts to synchronise the start of both threads to roughly
+ * the same time. On real hardware there is a little latency as the
+ * secondary vCPUs are powered up; however, this effect is much more
+ * exaggerated on a TCG host.
+ *
+ * Busy waits until we pass a future point in time, returns final
+ * start time.
+ */
+
+void store_and_load_1(void)
+{
+	int i;
+	unsigned long start, end;
+
+	start = sync_start();
+	for (i=0; i<array_size; i++) {
+		volatile test_array *entry = &array[i];
+		unsigned int r;
+		entry->x = 1;
+		r = entry->y;
+		entry->r[0] = r;
+	}
+	end = get_cntvct();
+
+	smp_mb();
+
+	while (!cpumask_test_cpu(1, &cpu_mask))
+		cpu_relax();
+
+	check_store_and_load_results("sal", 1, true, start, end);
+}
+
+void store_and_load_2(void)
+{
+	int i;
+	unsigned long start, end;
+
+	start = sync_start();
+	for (i=0; i<array_size; i++) {
+		volatile test_array *entry = &array[i];
+		unsigned int r;
+		entry->y = 1;
+		r = entry->x;
+		entry->r[1] = r;
+	}
+	end = get_cntvct();
+
+	check_store_and_load_results("sal", 2, true, start, end);
+
+	cpumask_set_cpu(1, &cpu_mask);
+
+	halt();
+}
+
+void store_and_load_barrier_1(void)
+{
+	int i;
+	unsigned long start, end;
+
+	start = sync_start();
+	for (i=0; i< array_size; i++) {
+		volatile test_array *entry = &array[i];
+		unsigned int r;
+		entry->x = 1;
+		smp_mb();
+		r = entry->y;
+		entry->r[0] = r;
+	}
+	end = get_cntvct();
+
+	smp_mb();
+
+	while (!cpumask_test_cpu(1, &cpu_mask))
+		cpu_relax();
+
+	check_store_and_load_results("sal_barrier", 1, false, start, end);
+}
+
+void store_and_load_barrier_2(void)
+{
+	int i;
+	unsigned long start, end;
+
+	start = sync_start();
+	for (i=0; i< array_size; i++) {
+		volatile test_array *entry = &array[i];
+		unsigned int r;
+		entry->y = 1;
+		smp_mb();
+		r = entry->x;
+		entry->r[1] = r;
+	}
+	end = get_cntvct();
+
+	check_store_and_load_results("sal_barrier", 2, false, start, end);
+
+	cpumask_set_cpu(1, &cpu_mask);
+
+	halt();
+}
+
+
+/* Test array */
+static test_descr_t tests[] = {
+
+	{ "mp",         false,
+	  message_passing_read,
+	  { message_passing_write }
+	},
+
+	{ "mp_barrier", true,
+	  message_passing_read_barrier,
+	  { message_passing_write_barrier }
+	},
+
+	{ "mp_acqrel", true,
+	  message_passing_read_acquire,
+	  { message_passing_write_release }
+	},
+
+	{ "sal",       false,
+	  store_and_load_1,
+	  { store_and_load_2 }
+	},
+
+	{ "sal_barrier", true,
+	  store_and_load_barrier_1,
+	  { store_and_load_barrier_2 }
+	},
+};
+
+
+void setup_and_run_litmus(test_descr_t *test)
+{
+	array = calloc(array_size, sizeof(test_array));
+
+	if (array) {
+		int i = 0;
+		printf("Allocated test array @ %p\n", array);
+
+		while (test->secondary_fns[i]) {
+			smp_boot_secondary(i+1, test->secondary_fns[i]);
+			i++;
+		}
+
+		test->main_fn();
+	} else {
+		report("%s: failed to allocate memory",false, test->test_name);
+	}
+}
+
+int main(int argc, char **argv)
+{
+	int i;
+	unsigned int j;
+	test_descr_t *test = NULL;
+
+	for (i=0; i<argc; i++) {
+		char *arg = argv[i];
+
+		for (j = 0; j < ARRAY_SIZE(tests); j++) {
+			if (strcmp(arg, tests[j].test_name) == 0)
+				test = &tests[j];
+		}
+
+		/* Test modifiers */
+		if (strstr(arg, "count=") != NULL) {
+			char *p = strstr(arg, "=");
+			array_size = atol(p+1);
+		} else if (strcmp (arg, "wait") == 0) {
+			wait_if_ahead = 1;
+		}
+	}
+
+	if (test) {
+		setup_and_run_litmus(test);
+	} else {
+		report("Unknown test", false);
+	}
+
+	return report_summary();
+}
diff --git a/arm/unittests.cfg b/arm/unittests.cfg
index abbfe79..355dcfb 100644
--- a/arm/unittests.cfg
+++ b/arm/unittests.cfg
@@ -142,3 +142,39 @@ smp = $(($MAX_SMP>4?4:$MAX_SMP))
 extra_params = -append 'excl'
 groups = locking
 accel = tcg
+
+[barrier-litmus::mp]
+file = barrier-litmus-test.flat
+smp = 2
+extra_params = -append 'mp'
+groups = barrier
+accel = tcg
+
+[barrier-litmus::mp-barrier]
+file = barrier-litmus-test.flat
+smp = 2
+extra_params = -append 'mp_barrier'
+groups = barrier
+accel = tcg
+
+[barrier-litmus::mp-acqrel]
+file = barrier-litmus-test.flat
+smp = 2
+extra_params = -append 'mp_acqrel'
+groups = barrier
+accel = tcg
+
+[barrier-litmus::sal]
+file = barrier-litmus-test.flat
+smp = 2
+extra_params = -append 'sal'
+groups = barrier
+accel = tcg
+
+[barrier-litmus::sal-barrier]
+file = barrier-litmus-test.flat
+smp = 2
+extra_params = -append 'sal_barrier'
+groups = barrier
+accel = tcg
+
diff --git a/lib/arm/asm/barrier.h b/lib/arm/asm/barrier.h
index 394a4a2..e3b7a2e 100644
--- a/lib/arm/asm/barrier.h
+++ b/lib/arm/asm/barrier.h
@@ -1,9 +1,11 @@
 #ifndef _ASMARM_BARRIER_H_
 #define _ASMARM_BARRIER_H_
 /*
- * Adapted form arch/arm/include/asm/barrier.h
+ * Adapted from arch/arm/include/asm/barrier.h
  */
 
+#include <stdint.h>
+
 #define sev()		asm volatile("sev" : : : "memory")
 #define wfe()		asm volatile("wfe" : : : "memory")
 #define wfi()		asm volatile("wfi" : : : "memory")
@@ -20,4 +22,63 @@
 #define smp_rmb()	smp_mb()
 #define smp_wmb()	dmb(ishst)
 
+extern void abort(void);
+
+static inline void __write_once_size(volatile void *p, void *res, int size)
+{
+	switch (size) {
+	case 1: *(volatile uint8_t *)p = *(uint8_t *)res; break;
+	case 2: *(volatile uint16_t *)p = *(uint16_t *)res; break;
+	case 4: *(volatile uint32_t *)p = *(uint32_t *)res; break;
+	case 8: *(volatile uint64_t *)p = *(uint64_t *)res; break;
+	default:
+		/* unhandled case */
+		abort();
+	}
+}
+
+#define WRITE_ONCE(x, val) \
+({							\
+	union { typeof(x) __val; char __c[1]; } __u =	\
+		{ .__val = (typeof(x)) (val) }; \
+	__write_once_size(&(x), __u.__c, sizeof(x));	\
+	__u.__val;					\
+})
+
+#define smp_store_release(p, v)						\
+do {									\
+	smp_mb();							\
+	WRITE_ONCE(*p, v);						\
+} while (0)
+
+
+static inline
+void __read_once_size(const volatile void *p, void *res, int size)
+{
+	switch (size) {
+	case 1: *(uint8_t *)res = *(volatile uint8_t *)p; break;
+	case 2: *(uint16_t *)res = *(volatile uint16_t *)p; break;
+	case 4: *(uint32_t *)res = *(volatile uint32_t *)p; break;
+	case 8: *(uint64_t *)res = *(volatile uint64_t *)p; break;
+	default:
+		/* unhandled case */
+		abort();
+	}
+}
+
+#define READ_ONCE(x)							\
+({									\
+	union { typeof(x) __val; char __c[1]; } __u;			\
+	__read_once_size(&(x), __u.__c, sizeof(x));			\
+	__u.__val;							\
+})
+
+
+#define smp_load_acquire(p)						\
+({									\
+	typeof(*p) ___p1 = READ_ONCE(*p);				\
+	smp_mb();							\
+	___p1;								\
+})
+
 #endif /* _ASMARM_BARRIER_H_ */
diff --git a/lib/arm64/asm/barrier.h b/lib/arm64/asm/barrier.h
index dbdac9d..aafabdc 100644
--- a/lib/arm64/asm/barrier.h
+++ b/lib/arm64/asm/barrier.h
@@ -19,4 +19,54 @@
 #define smp_rmb()	dmb(ishld)
 #define smp_wmb()	dmb(ishst)
 
+#define smp_store_release(p, v)						\
+do {									\
+	switch (sizeof(*p)) {						\
+	case 1:								\
+		asm volatile ("stlrb %w1, %0"				\
+				: "=Q" (*p) : "r" (v) : "memory");	\
+		break;							\
+	case 2:								\
+		asm volatile ("stlrh %w1, %0"				\
+				: "=Q" (*p) : "r" (v) : "memory");	\
+		break;							\
+	case 4:								\
+		asm volatile ("stlr %w1, %0"				\
+				: "=Q" (*p) : "r" (v) : "memory");	\
+		break;							\
+	case 8:								\
+		asm volatile ("stlr %1, %0"				\
+				: "=Q" (*p) : "r" (v) : "memory");	\
+		break;							\
+	}								\
+} while (0)
+
+#define smp_load_acquire(p)						\
+({									\
+	union { typeof(*p) __val; char __c[1]; } __u;			\
+	switch (sizeof(*p)) {						\
+	case 1:								\
+		asm volatile ("ldarb %w0, %1"				\
+			: "=r" (*(u8 *)__u.__c)				\
+			: "Q" (*p) : "memory");				\
+		break;							\
+	case 2:								\
+		asm volatile ("ldarh %w0, %1"				\
+			: "=r" (*(u16 *)__u.__c)			\
+			: "Q" (*p) : "memory");				\
+		break;							\
+	case 4:								\
+		asm volatile ("ldar %w0, %1"				\
+			: "=r" (*(u32 *)__u.__c)			\
+			: "Q" (*p) : "memory");				\
+		break;							\
+	case 8:								\
+		asm volatile ("ldar %0, %1"				\
+			: "=r" (*(u64 *)__u.__c)			\
+			: "Q" (*p) : "memory");				\
+		break;							\
+	}								\
+	__u.__val;							\
+})
+
 #endif /* _ASMARM64_BARRIER_H_ */
-- 
2.10.1

^ permalink raw reply related

* [kvm-unit-tests PATCH v7 11/11] arm/tcg-test: some basic TCG exercising tests
From: Alex Bennée @ 2016-11-24 16:10 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161124161033.11456-1-alex.bennee@linaro.org>

These tests are not really aimed at KVM at all but exist to stretch
QEMU's TCG code generator. In particular these exercise the ability of
the TCG to:

  * Chain TranslationBlocks together (tight)
  * Handle heavy usage of the tb_jump_cache (paged)
  * Pathological case of computed local jumps (computed)

In addition the tests can be varied by adding IPI IRQs or SMC sequences
into the mix to stress the tcg_exit and invalidation mechanisms.

To explicitly stress the tb_flush() mechanism you can use the mod/rounds
parameters to force more frequent tb invalidation. This can be combined
with setting -tb-size 1 in QEMU to limit the code generation buffer size.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

---
v5
  - added armv8 version of the tcg tests
  - max out at -smp 4 in unittests.cfg
  - add up IRQs sent and delivered for PASS/FAIL
  - take into account error count
  - add "rounds=" parameter
  - tweak smc to tb-size=1
  - printf fmt fix
v7
  - merged in IRQ numerology
  - updated to latest IRQ API
---
 arm/Makefile.arm     |   2 +
 arm/Makefile.arm64   |   2 +
 arm/Makefile.common  |   1 +
 arm/tcg-test-asm.S   | 170 ++++++++++++++++++++++++++
 arm/tcg-test-asm64.S | 169 ++++++++++++++++++++++++++
 arm/tcg-test.c       | 337 +++++++++++++++++++++++++++++++++++++++++++++++++++
 arm/unittests.cfg    |  84 +++++++++++++
 7 files changed, 765 insertions(+)
 create mode 100644 arm/tcg-test-asm.S
 create mode 100644 arm/tcg-test-asm64.S
 create mode 100644 arm/tcg-test.c

diff --git a/arm/Makefile.arm b/arm/Makefile.arm
index 92f3757..7058bd2 100644
--- a/arm/Makefile.arm
+++ b/arm/Makefile.arm
@@ -24,4 +24,6 @@ tests =
 
 include $(TEST_DIR)/Makefile.common
 
+$(TEST_DIR)/tcg-test.elf: $(cstart.o) $(TEST_DIR)/tcg-test.o $(TEST_DIR)/tcg-test-asm.o
+
 arch_clean: arm_clean
diff --git a/arm/Makefile.arm64 b/arm/Makefile.arm64
index 0b0761c..678fca4 100644
--- a/arm/Makefile.arm64
+++ b/arm/Makefile.arm64
@@ -16,5 +16,7 @@ tests =
 
 include $(TEST_DIR)/Makefile.common
 
+$(TEST_DIR)/tcg-test.elf: $(cstart.o) $(TEST_DIR)/tcg-test.o $(TEST_DIR)/tcg-test-asm64.o
+
 arch_clean: arm_clean
 	$(RM) lib/arm64/.*.d
diff --git a/arm/Makefile.common b/arm/Makefile.common
index a508128..9af758f 100644
--- a/arm/Makefile.common
+++ b/arm/Makefile.common
@@ -17,6 +17,7 @@ tests-common += $(TEST_DIR)/tlbflush-code.flat
 tests-common += $(TEST_DIR)/tlbflush-data.flat
 tests-common += $(TEST_DIR)/locking-test.flat
 tests-common += $(TEST_DIR)/barrier-litmus-test.flat
+tests-common += $(TEST_DIR)/tcg-test.flat
 
 all: test_cases
 
diff --git a/arm/tcg-test-asm.S b/arm/tcg-test-asm.S
new file mode 100644
index 0000000..6e823b7
--- /dev/null
+++ b/arm/tcg-test-asm.S
@@ -0,0 +1,170 @@
+/*
+ * TCG Test assembler functions for armv7 tests.
+ *
+ * Copyright (C) 2016, Linaro Ltd, Alex Bennée <alex.bennee@linaro.org>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2.
+ *
+ * These helper functions are written in pure asm to control the size
+ * of the basic blocks and ensure they fit neatly into page
+ * aligned chunks. The pattern of branches they follow is determined by
+ * the 32 bit seed they are passed. It should be the same for each set.
+ *
+ * Calling convention
+ *  - r0, iterations
+ *  - r1, jump pattern
+ *  - r2-r3, scratch
+ *
+ * Returns r0
+ */
+
+.arm
+
+.section .text
+
+/* Tight - all blocks should quickly be patched and should run
+ * very fast unless irqs or smc gets in the way
+ */
+
+.global tight_start
+tight_start:
+        subs    r0, r0, #1
+        beq     tight_end
+
+        ror     r1, r1, #1
+        tst     r1, #1
+        beq     tightA
+        b       tight_start
+
+tightA:
+        subs    r0, r0, #1
+        beq     tight_end
+
+        ror     r1, r1, #1
+        tst     r1, #1
+        beq     tightB
+        b       tight_start
+
+tightB:
+        subs    r0, r0, #1
+        beq     tight_end
+
+        ror     r1, r1, #1
+        tst     r1, #1
+        beq     tight_start
+        b       tightA
+
+.global tight_end
+tight_end:
+        mov     pc, lr
+
+/*
+ * Computed jumps cannot be hardwired into the basic blocks so each one
+ * will cause an exit for the main execution loop to look up the next block.
+ *
+ * There is some caching which should ameliorate the cost a little.
+ */
+
+        /* .align 13 == 1 << 13 == 8192 byte alignment (two 4096 byte pages) */
+        .align 13
+        .global computed_start
+computed_start:
+        subs    r0, r0, #1
+        beq     computed_end
+
+        /* Jump table */
+        ror     r1, r1, #1
+        and     r2, r1, #1
+        adr     r3, computed_jump_table
+        ldr     r2, [r3, r2, lsl #2]
+        mov     pc, r2
+
+        b       computed_err
+
+computed_jump_table:
+        .word   computed_start
+        .word   computedA
+
+computedA:
+        subs    r0, r0, #1
+        beq     computed_end
+
+        /* Jump into code */
+        ror     r1, r1, #1
+        and     r2, r1, #1
+        adr     r3, 1f
+        add	r3, r2, lsl #2
+        mov     pc, r3
+1:      b       computed_start
+        b       computedB
+
+        b       computed_err
+
+
+computedB:
+        subs    r0, r0, #1
+        beq     computed_end
+        ror     r1, r1, #1
+
+        /* Conditional register load */
+        adr     r3, computedA
+        tst     r1, #1
+        adreq   r3, computed_start
+        mov     pc, r3
+
+        b       computed_err
+
+computed_err:
+        mov     r0, #1
+        .global computed_end
+computed_end:
+        mov     pc, lr
+
+
+/*
+ * Page hopping
+ *
+ * Each block is in a different page, hence the blocks never get joined
+ */
+        /* .align 13 == 1 << 13 == 8192 byte alignment (two 4096 byte pages) */
+        .align 13
+        .global paged_start
+paged_start:
+        subs    r0, r0, #1
+        beq     paged_end
+
+        ror     r1, r1, #1
+        tst     r1, #1
+        beq     pagedA
+        b       paged_start
+
+        /* .align 13 == 1 << 13 == 8192 byte alignment (two 4096 byte pages) */
+        .align 13
+pagedA:
+        subs    r0, r0, #1
+        beq     paged_end
+
+        ror     r1, r1, #1
+        tst     r1, #1
+        beq     pagedB
+        b       paged_start
+
+        /* .align 13 == 1 << 13 == 8192 byte alignment (two 4096 byte pages) */
+        .align 13
+pagedB:
+        subs    r0, r0, #1
+        beq     paged_end
+
+        ror     r1, r1, #1
+        tst     r1, #1
+        beq     paged_start
+        b       pagedA
+
+        /* .align 13 == 1 << 13 == 8192 byte alignment (two 4096 byte pages) */
+        .align 13
+.global paged_end
+paged_end:
+        mov     pc, lr
+
+.global test_code_end
+test_code_end:
diff --git a/arm/tcg-test-asm64.S b/arm/tcg-test-asm64.S
new file mode 100644
index 0000000..22bcfb4
--- /dev/null
+++ b/arm/tcg-test-asm64.S
@@ -0,0 +1,169 @@
+/*
+ * TCG Test assembler functions for armv8 tests.
+ *
+ * Copyright (C) 2016, Linaro Ltd, Alex Bennée <alex.bennee@linaro.org>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2.
+ *
+ * These helper functions are written in pure asm to control the size
+ * of the basic blocks and ensure they fit neatly into page
+ * aligned chunks. The pattern of branches they follow is determined by
+ * the 32 bit seed they are passed. It should be the same for each set.
+ *
+ * Calling convention
+ *  - x0, iterations
+ *  - x1, jump pattern
+ *  - x2-x3, scratch
+ *
+ * Returns x0
+ */
+
+.section .text
+
+/* Tight - all blocks should quickly be patched and should run
+ * very fast unless irqs or smc gets in the way
+ */
+
+.global tight_start
+tight_start:
+        subs    x0, x0, #1
+        beq     tight_end
+
+        ror     x1, x1, #1
+        tst     x1, #1
+        beq     tightA
+        b       tight_start
+
+tightA:
+        subs    x0, x0, #1
+        beq     tight_end
+
+        ror     x1, x1, #1
+        tst     x1, #1
+        beq     tightB
+        b       tight_start
+
+tightB:
+        subs    x0, x0, #1
+        beq     tight_end
+
+        ror     x1, x1, #1
+        tst     x1, #1
+        beq     tight_start
+        b       tightA
+
+.global tight_end
+tight_end:
+        ret
+
+/*
+ * Computed jumps cannot be hardwired into the basic blocks so each one
+ * will cause an exit for the main execution loop to look up the next block.
+ *
+ * There is some caching which should ameliorate the cost a little.
+ */
+
+        /* .align 13 == 1 << 13 == 8192 byte alignment (two 4096 byte pages) */
+        .align 13
+        .global computed_start
+computed_start:
+        subs    x0, x0, #1
+        beq     computed_end
+
+        /* Jump table */
+        ror     x1, x1, #1
+        and     x2, x1, #1
+        adr     x3, computed_jump_table
+        ldr     x2, [x3, x2, lsl #3]
+        br      x2
+
+        b       computed_err
+
+computed_jump_table:
+        .quad   computed_start
+        .quad   computedA
+
+computedA:
+        subs    x0, x0, #1
+        beq     computed_end
+
+        /* Jump into code */
+        ror     x1, x1, #1
+        and     x2, x1, #1
+        adr     x3, 1f
+        add	x3, x3, x2, lsl #2
+        br      x3
+1:      b       computed_start
+        b       computedB
+
+        b       computed_err
+
+
+computedB:
+        subs    x0, x0, #1
+        beq     computed_end
+        ror     x1, x1, #1
+
+        /* Conditional register load */
+        adr     x2, computedA
+        adr     x3, computed_start
+        tst     x1, #1
+        csel    x2, x3, x2, eq
+        br      x2
+
+        b       computed_err
+
+computed_err:
+        mov     x0, #1
+        .global computed_end
+computed_end:
+        ret
+
+
+/*
+ * Page hopping
+ *
+ * Each block is in a different page, hence the blocks never get joined
+ */
+        /* .align 13 == 1 << 13 == 8192 byte alignment (two 4096 byte pages) */
+        .align 13
+        .global paged_start
+paged_start:
+        subs    x0, x0, #1
+        beq     paged_end
+
+        ror     x1, x1, #1
+        tst     x1, #1
+        beq     pagedA
+        b       paged_start
+
+        /* .align 13 == 1 << 13 == 8192 byte alignment (two 4096 byte pages) */
+        .align 13
+pagedA:
+        subs    x0, x0, #1
+        beq     paged_end
+
+        ror     x1, x1, #1
+        tst     x1, #1
+        beq     pagedB
+        b       paged_start
+
+        /* .align 13 == 1 << 13 == 8192 byte alignment (two 4096 byte pages) */
+        .align 13
+pagedB:
+        subs    x0, x0, #1
+        beq     paged_end
+
+        ror     x1, x1, #1
+        tst     x1, #1
+        beq     paged_start
+        b       pagedA
+
+        /* .align 13 == 1 << 13 == 8192 byte alignment (two 4096 byte pages) */
+        .align 13
+.global paged_end
+paged_end:
+        ret
+
+.global test_code_end
+test_code_end:
diff --git a/arm/tcg-test.c b/arm/tcg-test.c
new file mode 100644
index 0000000..341dca3
--- /dev/null
+++ b/arm/tcg-test.c
@@ -0,0 +1,337 @@
+/*
+ * ARM TCG Tests
+ *
+ * These tests are explicitly aimed at stretching the QEMU TCG engine.
+ */
+
+#include <libcflat.h>
+#include <asm/processor.h>
+#include <asm/smp.h>
+#include <asm/cpumask.h>
+#include <asm/barrier.h>
+#include <asm/mmu.h>
+#include <asm/gic.h>
+
+#include <prng.h>
+
+#define MAX_CPUS 8
+
+/* These entry points are in the assembly code */
+extern int tight_start(uint32_t count, uint32_t pattern);
+extern int computed_start(uint32_t count, uint32_t pattern);
+extern int paged_start(uint32_t count, uint32_t pattern);
+extern uint32_t tight_end;
+extern uint32_t computed_end;
+extern uint32_t paged_end;
+extern unsigned long test_code_end;
+
+typedef int (*test_fn)(uint32_t count, uint32_t pattern);
+
+typedef struct {
+	const char *test_name;
+	bool       should_pass;
+	test_fn    start_fn;
+	uint32_t   *code_end;
+} test_descr_t;
+
+/* Test array */
+static test_descr_t tests[] = {
+       /*
+	* Tight chain.
+	*
+	* These are a bunch of basic blocks that have fixed branches in
+	* a page aligned space. The branches taken are decided by a
+	* pseudo-random bitmap for each CPU.
+	*
+	* Once the basic blocks have been chained together by the TCG they
+	* should run until they reach their block count. This will be the
+	* most efficient mode in which generated code is run. The only other
+	* exits will be caused by interrupts or TB invalidation.
+	*/
+	{ "tight", true, tight_start, &tight_end },
+	/*
+	 * Computed jumps.
+	 *
+	 * A bunch of basic blocks which just do computed jumps so the basic
+	 * block is never chained but they are all within a page (maybe not
+	 * required). This will exercise the cache lookup but not the new
+	 * generation.
+	 */
+	{ "computed", true, computed_start, &computed_end },
+        /*
+	 * Page ping pong.
+	 *
+	 * The blocks are separated by PAGE_SIZE so they can never
+	 * be chained together.
+	 *
+	 */
+	{ "paged", true, paged_start, &paged_end}
+};
+
+static test_descr_t *test = NULL;
+
+static int iterations = 1000000;
+static int rounds = 1000;
+static int mod_freq = 5;
+static uint32_t pattern[MAX_CPUS];
+
+/* control flags */
+static int smc = 0;
+static int irq = 0;
+static int check_irq = 0;
+
+/* IRQ accounting */
+#define MAX_IRQ_IDS 16
+static int irqv;
+static unsigned long irq_sent_ts[MAX_CPUS][MAX_CPUS][MAX_IRQ_IDS];
+
+static int irq_recv[MAX_CPUS];
+static int irq_sent[MAX_CPUS];
+static int irq_overlap[MAX_CPUS];  /* if ts > now, i.e a race */
+static int irq_slow[MAX_CPUS];  /* if delay > threshold */
+static unsigned long irq_latency[MAX_CPUS]; /* cumulative time */
+
+static int errors[MAX_CPUS];
+
+static cpumask_t smp_test_complete;
+
+static cpumask_t ready;
+
+static void wait_on_ready(void)
+{
+	cpumask_set_cpu(smp_processor_id(), &ready);
+	while (!cpumask_full(&ready))
+		cpu_relax();
+}
+
+/* This triggers TCG's SMC detection by writing values to the executing
+ * code pages. We are not actually modifying the instructions and the
+ * underlying code will remain unchanged. However this should trigger
+ * invalidation of the Translation Blocks
+ */
+
+void trigger_smc_detection(uint32_t *start, uint32_t *end)
+{
+	volatile uint32_t *ptr = start;
+	while (ptr < end) {
+		uint32_t inst = *ptr;
+		*ptr++ = inst;
+	}
+}
+
+/* Handler for receiving IRQs */
+
+static void irq_handler(struct pt_regs *regs __unused)
+{
+	unsigned long then, now = get_cntvct();
+	int cpu = smp_processor_id();
+	u32 irqstat = gic_read_iar();
+	u32 irqnr = gic_iar_irqnr(irqstat);
+
+	if (irqnr != GICC_INT_SPURIOUS) {
+		unsigned int src_cpu = (irqstat >> 10) & 0x7; ;
+		gic_write_eoir(irqstat);
+		irq_recv[cpu]++;
+
+		then = irq_sent_ts[src_cpu][cpu][irqnr];
+
+		if (then > now) {
+			irq_overlap[cpu]++;
+		} else {
+			unsigned long latency = (now - then);
+			if (latency > 30000) {
+				irq_slow[cpu]++;
+			} else {
+				irq_latency[cpu] += latency;
+			}
+		}
+	}
+}
+
+/* Send an IPI to every other present CPU so each leaves its chained
+ * blocks and re-enters the main run-loop; returns the number targeted.
+ */
+int send_cross_cpu_irqs(int this_cpu, int irq)
+{
+	int cpu, sent = 0;
+	cpumask_t mask;
+
+	cpumask_copy(&mask, &cpu_present_mask);
+	cpumask_clear_cpu(this_cpu, &mask);
+
+	for_each_present_cpu(cpu) {
+		if (cpu != this_cpu) {
+			irq_sent_ts[this_cpu][cpu][irq] = get_cntvct();
+			sent++;
+		}
+	}
+
+	gic_ipi_send_mask(irq, &mask);
+
+	return sent;
+}
+
+void do_test(void)
+{
+	int cpu = smp_processor_id();
+	int i, irq_id = 0;
+
+	printf("CPU%d: online and setting up with pattern 0x%"PRIx32"\n", cpu, pattern[cpu]);
+
+	if (irq) {
+		gic_enable_defaults();
+#ifdef __arm__
+		install_exception_handler(EXCPTN_IRQ, irq_handler);
+#else
+		install_irq_handler(EL1H_IRQ, irq_handler);
+#endif
+		local_irq_enable();
+
+		wait_on_ready();
+	}
+
+	for (i=0; i<rounds; i++)
+	{
+		/* Enter the blocks */
+		errors[cpu] += test->start_fn(iterations, pattern[cpu]);
+
+		if ((i + cpu) % mod_freq == 0)
+		{
+			if (smc) {
+				trigger_smc_detection((uint32_t *) test->start_fn,
+						test->code_end);
+			}
+			if (irq) {
+				irq_sent[cpu] += send_cross_cpu_irqs(cpu, irq_id);
+				irq_id++;
+				irq_id = irq_id % 15;
+			}
+		}
+	}
+
+	smp_wmb();
+
+	cpumask_set_cpu(cpu, &smp_test_complete);
+	if (cpu != 0)
+		halt();
+}
+
+/* Print cpu's IRQ counters and the mean latency of its timed IRQs. */
+void report_irq_stats(int cpu)
+{
+	int recv = irq_recv[cpu];
+	int race = irq_overlap[cpu];
+	int slow = irq_slow[cpu];
+	int timed = recv - (race + slow); /* may be 0: guard the division */
+	unsigned long avg_latency = timed > 0 ? irq_latency[cpu] / timed : 0;
+
+	printf("CPU%d: %d irqs (%d races, %d slow,  %ld ticks avg latency)\n",
+		cpu, recv, race, slow, avg_latency);
+}
+
+void setup_and_run_tcg_test(void)
+{
+	static const unsigned char seed[] = "tcg-test";
+	struct isaac_ctx prng_context;
+	int cpu;
+	int total_err = 0, total_sent = 0, total_recv = 0;
+
+	isaac_init(&prng_context, &seed[0], sizeof(seed));
+
+	/* boot other CPUs */
+	for_each_present_cpu(cpu) {
+		pattern[cpu] = isaac_next_uint32(&prng_context);
+
+		if (cpu == 0)
+			continue;
+
+		smp_boot_secondary(cpu, do_test);
+	}
+
+	do_test();
+
+	while (!cpumask_full(&smp_test_complete))
+		cpu_relax();
+
+	smp_mb();
+
+	/* Now total up errors and irqs */
+	for_each_present_cpu(cpu) {
+		total_err += errors[cpu];
+		total_sent += irq_sent[cpu];
+		total_recv += irq_recv[cpu];
+
+		if (check_irq) {
+			report_irq_stats(cpu);
+		}
+	}
+
+	if (check_irq) {
+		if (total_sent != total_recv) {
+			report("%d IRQs sent, %d received\n", false, total_sent, total_recv);
+		} else {
+			report("%d errors, IRQs OK", total_err == 0, total_err);
+		}
+	} else {
+		report("%d errors, IRQs not checked", total_err == 0, total_err);
+	}
+}
+
+int main(int argc, char **argv)
+{
+	int i;
+	unsigned int j;
+
+	for (i=0; i<argc; i++) {
+		char *arg = argv[i];
+
+		for (j = 0; j < ARRAY_SIZE(tests); j++) {
+			if (strcmp(arg, tests[j].test_name) == 0) {
+				test = &tests[j];
+			}
+		}
+
+		/* Test modifiers */
+		if (strstr(arg, "mod=") != NULL) {
+			char *p = strstr(arg, "=");
+			mod_freq = atol(p+1);
+		}
+
+		if (strstr(arg, "rounds=") != NULL) {
+			char *p = strstr(arg, "=");
+			rounds = atol(p+1);
+		}
+
+		if (strcmp(arg, "smc") == 0) {
+			unsigned long test_start = (unsigned long) &tight_start;
+			unsigned long test_end = (unsigned long) &test_code_end;
+
+			smc = 1;
+			mmu_set_range_ptes(mmu_idmap, test_start, test_start, test_end,
+					__pgprot(PTE_WBWA));
+
+			report_prefix_push("smc");
+		}
+
+		if (strcmp(arg, "irq") == 0) {
+			irq = 1;
+			if (!gic_init())
+				report_abort("No supported gic present!");
+			irqv = gic_version();
+			report_prefix_push("irq");
+		}
+
+		if (strcmp(arg, "check_irq") == 0) {
+			check_irq = 1;
+		}
+	}
+
+	if (test) {
+		smp_mb();
+		setup_and_run_tcg_test();
+	} else {
+		report("Unknown test", false);
+	}
+
+	return report_summary();
+}
diff --git a/arm/unittests.cfg b/arm/unittests.cfg
index 355dcfb..38934f2 100644
--- a/arm/unittests.cfg
+++ b/arm/unittests.cfg
@@ -178,3 +178,87 @@ extra_params = -append 'sal_barrier'
 groups = barrier
 accel = tcg
 
+# TCG Tests
+[tcg::tight]
+file = tcg-test.flat
+smp = $(($MAX_SMP>4?4:$MAX_SMP))
+extra_params = -append 'tight'
+groups = tcg
+accel = tcg
+
+[tcg::tight-smc]
+file = tcg-test.flat
+smp = $(($MAX_SMP>4?4:$MAX_SMP))
+extra_params = -append 'tight smc' -tb-size 1
+groups = tcg
+accel = tcg
+
+[tcg::tight-irq]
+file = tcg-test.flat
+smp = $(($MAX_SMP>4?4:$MAX_SMP))
+extra_params = -append 'tight irq'
+groups = tcg
+accel = tcg
+
+[tcg::tight-smc-irq]
+file = tcg-test.flat
+smp = $(($MAX_SMP>4?4:$MAX_SMP))
+extra_params = -append 'tight smc irq'
+groups = tcg
+accel = tcg
+
+[tcg::computed]
+file = tcg-test.flat
+smp = $(($MAX_SMP>4?4:$MAX_SMP))
+extra_params = -append 'computed'
+groups = tcg
+accel = tcg
+
+[tcg::computed-smc]
+file = tcg-test.flat
+smp = $(($MAX_SMP>4?4:$MAX_SMP))
+extra_params = -append 'computed smc'
+groups = tcg
+accel = tcg
+
+[tcg::computed-irq]
+file = tcg-test.flat
+smp = $(($MAX_SMP>4?4:$MAX_SMP))
+extra_params = -append 'computed irq'
+groups = tcg
+accel = tcg
+
+[tcg::computed-smc-irq]
+file = tcg-test.flat
+smp = $(($MAX_SMP>4?4:$MAX_SMP))
+extra_params = -append 'computed smc irq'
+groups = tcg
+accel = tcg
+
+[tcg::paged]
+file = tcg-test.flat
+smp = $(($MAX_SMP>4?4:$MAX_SMP))
+extra_params = -append 'paged'
+groups = tcg
+accel = tcg
+
+[tcg::paged-smc]
+file = tcg-test.flat
+smp = $(($MAX_SMP>4?4:$MAX_SMP))
+extra_params = -append 'paged smc'
+groups = tcg
+accel = tcg
+
+[tcg::paged-irq]
+file = tcg-test.flat
+smp = $(($MAX_SMP>4?4:$MAX_SMP))
+extra_params = -append 'paged irq'
+groups = tcg
+accel = tcg
+
+[tcg::paged-smc-irq]
+file = tcg-test.flat
+smp = $(($MAX_SMP>4?4:$MAX_SMP))
+extra_params = -append 'paged smc irq'
+groups = tcg
+accel = tcg
-- 
2.10.1

^ permalink raw reply related

* [PATCH V3 0/8] IOMMU probe deferral support
From: Sricharan @ 2016-11-24 16:10 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <918128b9-cdb0-1454-000a-146cee7a05ea@arm.com>

Hi Robin,

<snip..>

>
>>>>>> iommu_group_get_for_dev which gets called in the add_device
>>>>>> callback, increases the reference count of the iommu_group,
>>>>>> so we do an iommu_group_put after that. iommu_group_get_for_dev
>>>>>> inturn calls device_group callback and in the case of arm-smmu
>>>>>> we call generic_device_group/pci_device_group which takes
>>>>>> care of increasing the group's reference. But when we return
>>>>>> an already existing group(when multiple devices have same group)
>>>>>> the reference is not incremented, resulting in issues when the
>>>>>> remove_device callback for the devices is invoked.
>>>>>> Fixing the same here.
>>>>>
>>>>> Bah, yes, this does look like my fault - after flip-flopping between
>>>>> about 3 different ways to keep refcounts for the S2CR entries, none of
>>>>> which would quite work, I ripped it all out but apparently still got
>>>>> things wrong, oh well. Thanks for figuring it out.
>>>>>
>>>>> On the probe-deferral angle, whilst it's useful to have uncovered this
>>>>> bug, I don't think we should actually be calling remove_device() from
>>>>> DMA teardown. I think it's preferable from a user perspective if group
>>>>> numbering remains stable, rather than changing depending on the order in
>>>>> which they unbind/rebind VFIO drivers. I'm really keen to try and get
>>>>> this in shape for 4.10, so I've taken the liberty of hacking up my own
>>>>> branch (iommu/defer) based on v3 - would you mind taking a look at the
>>>>> two "iommu/of:" commits to see what you think? (Ignore the PCI changes
>>>>> to your later patches - that was an experiment which didn't really work out)
>>>>
>>>> Ok, will take a look at this now and respond more on this.
>>>>
>>> Sorry for the delayed response on this. I was OOO for the last few days.
>>> So i tested this branch and it worked fine. I tested it with a pci device
>>> for both normal and deferred probe cases.  The of/iommu patches
>>> are the cleanup/preparation patches and it looks fine. One thing is without
>>> calling the remove_device callback, the resources like (smes for example)
>>> and the group association of the device all remain allocated. That does not
>>> feel correct, given that the associated device does not exist. So to
>>> understand that, what happens with VFIO in this case which makes the
>>> group renumbering/rebinding a problem ?
>>>
>>
>> Would it be ok if i post a V4 based on your branch above ?
>
>Sure, as long as none of the hacks slip through :) - I've just pushed
>out a mild rework based on Lorenzo's v9, which I hope shouldn't break
>anything for you.
>

Ok sure, I will test and then just post the stuff from your branch,
most likely by tomorrow.

>Having thought a bit more about the add/remove thing, I'm inclined to
>agree that the group numbering itself may not be that big an issue in
>practice - sure, it could break my little script, but it looks like QEMU
>and such work with the device ID rather than the group number directly,
>so might not even notice. However, the fact remains that the callbacks
>are intended to handle a device being added to/removed from its bus, and
>will continue to do so on other platforms, so I don't like the idea of
>introducing needlessly different behaviour. If you unbind a driver, the
>stream IDs and everything don't stop existing at the hardware level; the
>struct device to which the in-kernel data belongs still exists and
>doesn't stop being associated with its bus. There's no good reason for
>freeing SMEs that we'll only reallocate again (inadequately-specced
>hardware with not enough SMRs/contexts is not a *good* reason), and

ok, so SMRs/contexts was the reason i was adding the remove_dev
callback, but if thats not good enough then there was no other
intention.

>there are also some strong arguments against letting any stream IDs the
>kernel knows about go back to bypass after a driver has been bound - by

ok, but not sure why is this so ?

>keeping groups around as expected that's something we can implement
>quite easily without having to completely lock down bypass for stream
>IDs the kernel *doesn't* know about.
>

So do you mean in this case to keep the unbound device's group/context bank
to bypass rather than resetting the streamids ?

Regards,
 Sricharan

^ permalink raw reply

* [PATCH v2 0/3] arm64: dts: r8a7796: Add CAN/CAN FD support
From: Chris Paterson @ 2016-11-24 16:13 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1479903243-1860-1-git-send-email-chris.paterson2@renesas.com>

This series adds CAN and CAN FD support to the r8a7796.

Changes since v1:
- Split device tree changes from bindings documentation.
- Rebased on renesas-devel-20161123v2-v4.9-rc6.


Chris Paterson (3):
  arm64: dts: r8a7796: Add CAN external clock support
  arm64: dts: r8a7796: Add CAN support
  arm64: dts: r8a7796: Add CAN FD support

 arch/arm64/boot/dts/renesas/r8a7796.dtsi | 61 ++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

-- 
1.9.1

^ permalink raw reply

* [PATCH v2 1/3] arm64: dts: r8a7796: Add CAN external clock support
From: Chris Paterson @ 2016-11-24 16:13 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1480004021-4037-1-git-send-email-chris.paterson2@renesas.com>

Adds external CAN clock node for r8a7796. This clock can be used as
fCAN clock of CAN and CAN FD controller.

Based on a patch for r8a7795 by Ramesh Shanmugasundaram.

Signed-off-by: Chris Paterson <chris.paterson2@renesas.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 arch/arm64/boot/dts/renesas/r8a7796.dtsi | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/arm64/boot/dts/renesas/r8a7796.dtsi b/arch/arm64/boot/dts/renesas/r8a7796.dtsi
index c34c684..61d165b 100644
--- a/arch/arm64/boot/dts/renesas/r8a7796.dtsi
+++ b/arch/arm64/boot/dts/renesas/r8a7796.dtsi
@@ -69,6 +69,13 @@
 		clock-frequency = <0>;
 	};
 
+	/* External CAN clock - to be overridden by boards that provide it */
+	can_clk: can {
+		compatible = "fixed-clock";
+		#clock-cells = <0>;
+		clock-frequency = <0>;
+	};
+
 	/* External SCIF clock - to be overridden by boards that provide it */
 	scif_clk: scif {
 		compatible = "fixed-clock";
-- 
1.9.1

^ permalink raw reply related

* [PATCH v2 2/3] arm64: dts: r8a7796: Add CAN support
From: Chris Paterson @ 2016-11-24 16:13 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1480004021-4037-1-git-send-email-chris.paterson2@renesas.com>

Adds CAN controller nodes for r8a7796.

Based on a patch for r8a7795 by Ramesh Shanmugasundaram.

Signed-off-by: Chris Paterson <chris.paterson2@renesas.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 arch/arm64/boot/dts/renesas/r8a7796.dtsi | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/arch/arm64/boot/dts/renesas/r8a7796.dtsi b/arch/arm64/boot/dts/renesas/r8a7796.dtsi
index 61d165b..47fa29c 100644
--- a/arch/arm64/boot/dts/renesas/r8a7796.dtsi
+++ b/arch/arm64/boot/dts/renesas/r8a7796.dtsi
@@ -362,6 +362,36 @@
 			status = "disabled";
 		};
 
+		can0: can@e6c30000 {
+			compatible = "renesas,can-r8a7796",
+				     "renesas,rcar-gen3-can";
+			reg = <0 0xe6c30000 0 0x1000>;
+			interrupts = <GIC_SPI 186 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&cpg CPG_MOD 916>,
+			       <&cpg CPG_CORE R8A7796_CLK_CANFD>,
+			       <&can_clk>;
+			clock-names = "clkp1", "clkp2", "can_clk";
+			assigned-clocks = <&cpg CPG_CORE R8A7796_CLK_CANFD>;
+			assigned-clock-rates = <40000000>;
+			power-domains = <&sysc R8A7796_PD_ALWAYS_ON>;
+			status = "disabled";
+		};
+
+		can1: can@e6c38000 {
+			compatible = "renesas,can-r8a7796",
+				     "renesas,rcar-gen3-can";
+			reg = <0 0xe6c38000 0 0x1000>;
+			interrupts = <GIC_SPI 187 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&cpg CPG_MOD 915>,
+			       <&cpg CPG_CORE R8A7796_CLK_CANFD>,
+			       <&can_clk>;
+			clock-names = "clkp1", "clkp2", "can_clk";
+			assigned-clocks = <&cpg CPG_CORE R8A7796_CLK_CANFD>;
+			assigned-clock-rates = <40000000>;
+			power-domains = <&sysc R8A7796_PD_ALWAYS_ON>;
+			status = "disabled";
+		};
+
 		scif2: serial@e6e88000 {
 			compatible = "renesas,scif-r8a7796",
 				     "renesas,rcar-gen3-scif", "renesas,scif";
-- 
1.9.1

^ permalink raw reply related

* [PATCH v2 3/3] arm64: dts: r8a7796: Add CAN FD support
From: Chris Paterson @ 2016-11-24 16:13 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1480004021-4037-1-git-send-email-chris.paterson2@renesas.com>

Adds CAN FD controller node for r8a7796.

Based on a patch for r8a7795 by Ramesh Shanmugasundaram.

Signed-off-by: Chris Paterson <chris.paterson2@renesas.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 arch/arm64/boot/dts/renesas/r8a7796.dtsi | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/arch/arm64/boot/dts/renesas/r8a7796.dtsi b/arch/arm64/boot/dts/renesas/r8a7796.dtsi
index 47fa29c..1ed1886 100644
--- a/arch/arm64/boot/dts/renesas/r8a7796.dtsi
+++ b/arch/arm64/boot/dts/renesas/r8a7796.dtsi
@@ -392,6 +392,30 @@
 			status = "disabled";
 		};
 
+		canfd: can at e66c0000 {
+			compatible = "renesas,r8a7796-canfd",
+				     "renesas,rcar-gen3-canfd";
+			reg = <0 0xe66c0000 0 0x8000>;
+			interrupts = <GIC_SPI 29 IRQ_TYPE_LEVEL_HIGH>,
+				   <GIC_SPI 30 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&cpg CPG_MOD 914>,
+			       <&cpg CPG_CORE R8A7796_CLK_CANFD>,
+			       <&can_clk>;
+			clock-names = "fck", "canfd", "can_clk";
+			assigned-clocks = <&cpg CPG_CORE R8A7796_CLK_CANFD>;
+			assigned-clock-rates = <40000000>;
+			power-domains = <&sysc R8A7796_PD_ALWAYS_ON>;
+			status = "disabled";
+
+			channel0 {
+				status = "disabled";
+			};
+
+			channel1 {
+				status = "disabled";
+			};
+		};
+
 		scif2: serial at e6e88000 {
 			compatible = "renesas,scif-r8a7796",
 				     "renesas,rcar-gen3-scif", "renesas,scif";
-- 
1.9.1

^ permalink raw reply related

* [PATCH V7 2/3] ACPI: Add support for ResourceSource/IRQ domain mapping
From: Lorenzo Pieralisi @ 2016-11-24 16:15 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1479074375-2629-3-git-send-email-agustinv@codeaurora.org>

Hi Agustin,

On Sun, Nov 13, 2016 at 04:59:34PM -0500, Agustin Vega-Frias wrote:
> When an Extended IRQ Resource contains a valid ResourceSource
> use it to map the IRQ on the domain associated with the ACPI
> device referenced.
> 
> With this in place an irqchip driver can create its domain using
> irq_domain_create_linear and pass the device fwnode to create
> the domain mapping. When dependent devices are probed these
> changes allow the ACPI core to find the domain and map the IRQ.
> 
> Signed-off-by: Agustin Vega-Frias <agustinv@codeaurora.org>
> ---
>  drivers/acpi/Makefile         |  2 +-
>  drivers/acpi/{gsi.c => irq.c} | 98 +++++++++++++++++++++++++++++++++++++------
>  drivers/acpi/resource.c       | 29 +++++++------
>  include/linux/acpi.h          | 19 +++++++++
>  4 files changed, 121 insertions(+), 27 deletions(-)
>  rename drivers/acpi/{gsi.c => irq.c} (53%)

It looks to me the direction is the right one but I have a question
for you and others below.

> diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
> index 9ed0878..a391bbc 100644
> --- a/drivers/acpi/Makefile
> +++ b/drivers/acpi/Makefile
> @@ -55,7 +55,7 @@ acpi-$(CONFIG_DEBUG_FS)		+= debugfs.o
>  acpi-$(CONFIG_ACPI_NUMA)	+= numa.o
>  acpi-$(CONFIG_ACPI_PROCFS_POWER) += cm_sbs.o
>  acpi-y				+= acpi_lpat.o
> -acpi-$(CONFIG_ACPI_GENERIC_GSI) += gsi.o
> +acpi-$(CONFIG_ACPI_GENERIC_GSI) += irq.o
>  acpi-$(CONFIG_ACPI_WATCHDOG)	+= acpi_watchdog.o
>  
>  # These are (potentially) separate modules
> diff --git a/drivers/acpi/gsi.c b/drivers/acpi/irq.c
> similarity index 53%
> rename from drivers/acpi/gsi.c
> rename to drivers/acpi/irq.c
> index ee9e0f2..c6ecaab 100644
> --- a/drivers/acpi/gsi.c
> +++ b/drivers/acpi/irq.c
> @@ -18,6 +18,45 @@
>  static struct fwnode_handle *acpi_gsi_domain_id;
>  
>  /**
> + * acpi_get_irq_source_fwhandle() - Retrieve the fwhandle of the given
> + *                                  acpi_resource_source which is used
> + *                                  to be used as an IRQ domain id
> + * @source: acpi_resource_source to use for the lookup
> + *
> + * Returns: The appropriate IRQ fwhandle domain id
> + *          NULL on failure
> + */
> +struct fwnode_handle *
> +acpi_get_irq_source_fwhandle(const struct acpi_resource_source *source)
> +{
> +	struct fwnode_handle *result;
> +	struct acpi_device *device;
> +	acpi_handle handle;
> +	acpi_status status;
> +
> +	if (!source->string_length)
> +		return acpi_gsi_domain_id;
> +
> +	status = acpi_get_handle(NULL, source->string_ptr, &handle);
> +	if (ACPI_FAILURE(status)) {
> +		pr_warn("Could not find handle for %s\n", source->string_ptr);
> +		return NULL;
> +	}
> +
> +	device = acpi_bus_get_acpi_device(handle);
> +	if (!device) {
> +		pr_warn("Could not get device for %s\n", source->string_ptr);
> +		return NULL;
> +	}
> +
> +	result = &device->fwnode;
> +	acpi_bus_put_acpi_device(device);
> +
> +	return result;
> +}
> +EXPORT_SYMBOL_GPL(acpi_get_irq_source_fwhandle);
> +
> +/**
>   * acpi_gsi_to_irq() - Retrieve the linux irq number for a given GSI
>   * @gsi: GSI IRQ number to map
>   * @irq: pointer where linux IRQ number is stored
> @@ -42,6 +81,50 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
>  EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
>  
>  /**
> + * acpi_register_irq() - Map a hardware to a linux IRQ number
> + * @source: IRQ source
> + * @hwirq: Hardware IRQ number
> + * @trigger: trigger type of the IRQ number to be mapped
> + * @polarity: polarity of the IRQ to be mapped
> + *
> + * Returns: a valid linux IRQ number on success
> + *          -EINVAL on failure

Nit: You need to update the return values list.

> + */
> +int acpi_register_irq(struct fwnode_handle *source, u32 hwirq, int trigger,
> +		      int polarity)
> +{
> +	struct irq_fwspec fwspec;
> +
> +	if (!source)
> +		return -EINVAL;
> +
> +	if (irq_find_matching_fwnode(source, DOMAIN_BUS_ANY) == NULL)
> +		return -EPROBE_DEFER;
> +
> +	fwspec.fwnode = source;
> +	fwspec.param[0] = hwirq;
> +	fwspec.param[1] = acpi_dev_get_irq_type(trigger, polarity);
> +	fwspec.param_count = 2;
> +
> +	return irq_create_fwspec_mapping(&fwspec);
> +}
> +EXPORT_SYMBOL_GPL(acpi_register_irq);
> +
> +/**
> + * acpi_unregister_irq() - Free a Hardware IRQ<->linux IRQ number mapping
> + * @hwirq: Hardware IRQ number
> + */
> +void acpi_unregister_irq(struct fwnode_handle *source, u32 hwirq)
> +{
> +	struct irq_domain *d = irq_find_matching_fwnode(source,
> +							DOMAIN_BUS_ANY);
> +	int irq = irq_find_mapping(d, hwirq);
> +
> +	irq_dispose_mapping(irq);
> +}
> +EXPORT_SYMBOL_GPL(acpi_unregister_irq);
> +
> +/**
>   * acpi_register_gsi() - Map a GSI to a linux IRQ number
>   * @dev: device for which IRQ has to be mapped
>   * @gsi: GSI IRQ number
> @@ -54,19 +137,12 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
>  int acpi_register_gsi(struct device *dev, u32 gsi, int trigger,
>  		      int polarity)
>  {
> -	struct irq_fwspec fwspec;
> -
>  	if (WARN_ON(!acpi_gsi_domain_id)) {
>  		pr_warn("GSI: No registered irqchip, giving up\n");
>  		return -EINVAL;
>  	}
>  
> -	fwspec.fwnode = acpi_gsi_domain_id;
> -	fwspec.param[0] = gsi;
> -	fwspec.param[1] = acpi_dev_get_irq_type(trigger, polarity);
> -	fwspec.param_count = 2;
> -
> -	return irq_create_fwspec_mapping(&fwspec);
> +	return acpi_register_irq(acpi_gsi_domain_id, gsi, trigger, polarity);
>  }
>  EXPORT_SYMBOL_GPL(acpi_register_gsi);
>  
> @@ -76,11 +152,7 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger,
>   */
>  void acpi_unregister_gsi(u32 gsi)
>  {
> -	struct irq_domain *d = irq_find_matching_fwnode(acpi_gsi_domain_id,
> -							DOMAIN_BUS_ANY);
> -	int irq = irq_find_mapping(d, gsi);
> -
> -	irq_dispose_mapping(irq);
> +	acpi_unregister_irq(acpi_gsi_domain_id, gsi);
>  }
>  EXPORT_SYMBOL_GPL(acpi_unregister_gsi);
>  
> diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c
> index 4beda15..83cff00 100644
> --- a/drivers/acpi/resource.c
> +++ b/drivers/acpi/resource.c
> @@ -374,21 +374,22 @@ unsigned int acpi_dev_get_irq_type(int triggering, int polarity)
>  }
>  EXPORT_SYMBOL_GPL(acpi_dev_get_irq_type);
>  
> -static void acpi_dev_irqresource_disabled(struct resource *res, u32 gsi)
> +static void acpi_dev_irqresource_disabled(struct resource *res, u32 hwirq)
>  {
> -	res->start = gsi;
> -	res->end = gsi;
> +	res->start = hwirq;
> +	res->end = hwirq;
>  	res->flags = IORESOURCE_IRQ | IORESOURCE_DISABLED | IORESOURCE_UNSET;
>  }
>  
> -static void acpi_dev_get_irqresource(struct resource *res, u32 gsi,
> +static void acpi_dev_get_irqresource(struct resource *res, u32 hwirq,
> +				     struct fwnode_handle *source,
>  				     u8 triggering, u8 polarity, u8 shareable,
>  				     bool legacy)
>  {
>  	int irq, p, t;
>  
> -	if (!valid_IRQ(gsi)) {
> -		acpi_dev_irqresource_disabled(res, gsi);
> +	if (!source && !valid_IRQ(hwirq)) {
> +		acpi_dev_irqresource_disabled(res, hwirq);
>  		return;
>  	}
>  
> @@ -402,25 +403,25 @@ static void acpi_dev_get_irqresource(struct resource *res, u32 gsi,
>  	 * using extended IRQ descriptors we take the IRQ configuration
>  	 * from _CRS directly.
>  	 */
> -	if (legacy && !acpi_get_override_irq(gsi, &t, &p)) {
> +	if (legacy && !acpi_get_override_irq(hwirq, &t, &p)) {
>  		u8 trig = t ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE;
>  		u8 pol = p ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH;
>  
>  		if (triggering != trig || polarity != pol) {
> -			pr_warning("ACPI: IRQ %d override to %s, %s\n", gsi,
> -				   t ? "level" : "edge", p ? "low" : "high");
> +			pr_warn("ACPI: IRQ %d override to %s, %s\n", hwirq,
> +				t ? "level" : "edge", p ? "low" : "high");
>  			triggering = trig;
>  			polarity = pol;
>  		}
>  	}
>  
>  	res->flags = acpi_dev_irq_flags(triggering, polarity, shareable);
> -	irq = acpi_register_gsi(NULL, gsi, triggering, polarity);
> +	irq = acpi_register_irq(source, hwirq, triggering, polarity);
>  	if (irq >= 0) {
>  		res->start = irq;
>  		res->end = irq;
>  	} else {
> -		acpi_dev_irqresource_disabled(res, gsi);
> +		acpi_dev_irqresource_disabled(res, hwirq);
>  	}
>  }
>  
> @@ -448,6 +449,7 @@ bool acpi_dev_resource_interrupt(struct acpi_resource *ares, int index,
>  {
>  	struct acpi_resource_irq *irq;
>  	struct acpi_resource_extended_irq *ext_irq;
> +	struct fwnode_handle *src;
>  
>  	switch (ares->type) {
>  	case ACPI_RESOURCE_TYPE_IRQ:
> @@ -460,7 +462,7 @@ bool acpi_dev_resource_interrupt(struct acpi_resource *ares, int index,
>  			acpi_dev_irqresource_disabled(res, 0);
>  			return false;
>  		}
> -		acpi_dev_get_irqresource(res, irq->interrupts[index],
> +		acpi_dev_get_irqresource(res, irq->interrupts[index], NULL,
>  					 irq->triggering, irq->polarity,
>  					 irq->sharable, true);
>  		break;
> @@ -470,7 +472,8 @@ bool acpi_dev_resource_interrupt(struct acpi_resource *ares, int index,
>  			acpi_dev_irqresource_disabled(res, 0);
>  			return false;
>  		}
> -		acpi_dev_get_irqresource(res, ext_irq->interrupts[index],
> +		src = acpi_get_irq_source_fwhandle(&ext_irq->resource_source);

Is there a reason why we need to do the domain look-up here ?

I would like to understand if, by reshuffling the code (and by returning
the resource_source to the calling code - somehow), it would be possible
to just mirror what the OF code does in of_irq_get(), namely:

(1) parse the irq entry -> of_irq_parse_one()
(2) look the domain up -> irq_find_host()
(3) create the mapping -> irq_create_of_mapping()

You wrote the code already, I think it is just a matter of shuffling
it around (well, minus returning the resource_source to the caller
which is phandle equivalent in DT).

You abstracted away (2) and (3) behind acpi_register_irq(), that
on anything that does not use ACPI_GENERIC_GSI is just glue code
to acpi_register_gsi().

Also, it is not a question on this patch but I ask it here because it
is related. On ACPI you are doing the reverse of what is done in
DT in platform_get_irq():

- get the resources already parsed -> platform_get_resource()
- if they are disabled -> acpi_irq_get()

and I think the ordering is tied to my question above because
you carry out the domain look up in acpi_dev_resource_interrupt()
so that if for any reason it fails the corresponding resource
is disabled so that we try to get it again through acpi_irq_get().

I suspect you did it this way to make sure:

a) keep the current ACPI IRQ parsing interface changes to a minimum
b) avoid changing the behaviour on x86/ia64; in particular, calling
   acpi_register_gsi() for the _same_ mapping (an IRQ that was already
   registered at device creation resource parsing) multiple times can
   trigger issues on x86/ia64

I think that's a reasonable approach but I wanted to get these
clarifications, I do not think you are far from getting this
done but since it is a significant change I think it is worth
discussing the points I raised above because I think the DT code
sequence in of_irq_get() (1-2-3 above) is cleaner from an IRQ
layer perspective (instead of having the domain look-up buried
inside the ACPI IRQ resource parsing API).

Thanks !
Lorenzo

> +		acpi_dev_get_irqresource(res, ext_irq->interrupts[index], src,
>  					 ext_irq->triggering, ext_irq->polarity,
>  					 ext_irq->sharable, false);
>  		break;
> diff --git a/include/linux/acpi.h b/include/linux/acpi.h
> index 325bdb9..1099b51 100644
> --- a/include/linux/acpi.h
> +++ b/include/linux/acpi.h
> @@ -321,6 +321,25 @@ void acpi_set_irq_model(enum acpi_irq_model_id model,
>   */
>  void acpi_unregister_gsi (u32 gsi);
>  
> +#ifdef CONFIG_ACPI_GENERIC_GSI
> +struct fwnode_handle *
> +acpi_get_irq_source_fwhandle(const struct acpi_resource_source *source);
> +int acpi_register_irq(struct fwnode_handle *source, u32 hwirq, int trigger,
> +		      int polarity);
> +void acpi_unregister_irq(struct fwnode_handle *source, u32 hwirq);
> +#else
> +#define acpi_get_irq_source_fwhandle(source) (NULL)
> +static inline int acpi_register_irq(struct fwnode_handle *source, u32 hwirq,
> +				    int trigger, int polarity)
> +{
> +	return acpi_register_gsi(NULL, hwirq, trigger, polarity);
> +}
> +static inline void acpi_unregister_irq(struct fwnode_handle *source, u32 hwirq)
> +{
> +	acpi_unregister_gsi(hwirq);
> +}
> +#endif
> +
>  struct pci_dev;
>  
>  int acpi_pci_irq_enable (struct pci_dev *dev);
> -- 
> Qualcomm Datacenter Technologies, Inc. on behalf of the Qualcomm Technologies, Inc.
> Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.
> 

^ permalink raw reply

* [PATCH] ARM: pxa: ezx: fix a910 camera data
From: Arnd Bergmann @ 2016-11-24 16:29 UTC (permalink / raw)
  To: linux-arm-kernel

The camera_supply_dummy_device definition is shared between a780 and a910,
but only provided when the first is enabled and fails to build for a
configuration with only a910:

arch/arm/mach-pxa/ezx.c:1097:3: error: 'camera_supply_dummy_device' undeclared here (not in a function)

This moves the definition into its own section.

Fixes: 6c1b417adc8f ("ARM: pxa: ezx: use the new pxa_camera platform_data")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arm/mach-pxa/ezx.c | 56 ++++++++++++++++++++++++++-----------------------
 1 file changed, 30 insertions(+), 26 deletions(-)

diff --git a/arch/arm/mach-pxa/ezx.c b/arch/arm/mach-pxa/ezx.c
index 0b8300e6fca3..a057cf9c0e7b 100644
--- a/arch/arm/mach-pxa/ezx.c
+++ b/arch/arm/mach-pxa/ezx.c
@@ -696,32 +696,7 @@ static struct pxa27x_keypad_platform_data e2_keypad_platform_data = {
 };
 #endif /* CONFIG_MACH_EZX_E2 */
 
-#ifdef CONFIG_MACH_EZX_A780
-/* gpio_keys */
-static struct gpio_keys_button a780_buttons[] = {
-	[0] = {
-		.code       = SW_LID,
-		.gpio       = GPIO12_A780_FLIP_LID,
-		.active_low = 0,
-		.desc       = "A780 flip lid",
-		.type       = EV_SW,
-		.wakeup     = 1,
-	},
-};
-
-static struct gpio_keys_platform_data a780_gpio_keys_platform_data = {
-	.buttons  = a780_buttons,
-	.nbuttons = ARRAY_SIZE(a780_buttons),
-};
-
-static struct platform_device a780_gpio_keys = {
-	.name = "gpio-keys",
-	.id   = -1,
-	.dev  = {
-		.platform_data = &a780_gpio_keys_platform_data,
-	},
-};
-
+#if defined(CONFIG_MACH_EZX_A780) || defined(CONFIG_MACH_EZX_A910)
 /* camera */
 static struct regulator_consumer_supply camera_dummy_supplies[] = {
 	REGULATOR_SUPPLY("vdd", "0-005d"),
@@ -750,6 +725,35 @@ static struct platform_device camera_supply_dummy_device = {
 		.platform_data = &camera_dummy_config,
 	},
 };
+#endif
+
+#ifdef CONFIG_MACH_EZX_A780
+/* gpio_keys */
+static struct gpio_keys_button a780_buttons[] = {
+	[0] = {
+		.code       = SW_LID,
+		.gpio       = GPIO12_A780_FLIP_LID,
+		.active_low = 0,
+		.desc       = "A780 flip lid",
+		.type       = EV_SW,
+		.wakeup     = 1,
+	},
+};
+
+static struct gpio_keys_platform_data a780_gpio_keys_platform_data = {
+	.buttons  = a780_buttons,
+	.nbuttons = ARRAY_SIZE(a780_buttons),
+};
+
+static struct platform_device a780_gpio_keys = {
+	.name = "gpio-keys",
+	.id   = -1,
+	.dev  = {
+		.platform_data = &a780_gpio_keys_platform_data,
+	},
+};
+
+/* camera */
 static int a780_camera_reset(struct device *dev)
 {
 	gpio_set_value(GPIO19_GEN1_CAM_RST, 0);
-- 
2.9.0

^ permalink raw reply related

* Tearing down DMA transfer setup after DMA client has finished
From: Måns Rullgård @ 2016-11-24 16:37 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <58370530.5080605@free.fr>

Mason <slash.tmp@free.fr> writes:

> On 24/11/2016 15:17, Måns Rullgård wrote:
>
>> Mason wrote:
>> 
>>> [   35.085854] SETUP DMA
>>> [   35.088272] START NAND TRANSFER
>>> [   35.091670] tangox_dma_pchan_start from tangox_dma_irq
>>> [   35.096882] tango_dma_callback from vchan_complete
>>> [   45.102513] DONE FAKE SPINNING
>>>
>>> So the IRQ rolls in, the ISR calls tangox_dma_pchan_start,
>>> which calls tangox_dma_pchan_detach to tear down the sbox
>>> setup; and only sometime later does the DMA framework call
>>> my callback function.
>> 
>> Yes, I realised this soon after I said it.  The dma driver could be
>> rearranged to make it work though.
>
> There is a way to make the tasklet run and invoke the callback
> before the interrupt service routine proceeds?

No, but it would be possible to defer the teardown to the tasklet.
Having said that, I'm not sure it's such a great idea since the tasklet
could be held up for an arbitrary length of time waiting for the target
to finish.

>>> So far, the work-arounds I've tested are:
>>>
>>> 1) delay sbox tear-down by 10 µs in tangox_dma_pchan_detach.
>>> 2) statically setup sbox in probe, and never touch it henceforth.
>>>
>>> WA1 is fragile, it might break for devices other than NFC.
>>> WA2 is what I used when I wrote the NFC driver.
>>>
>>> Can tangox_dma_irq() be changed to have the framework call
>>> the client's callback *before* tangox_dma_pchan_start?
>>>
>>> (Thinking out loud) The DMA_PREP_INTERRUPT requests that the
>>> DMA framework invoke the callback from tasklet context,
>>> maybe a different flag DMA_PREP_INTERRUPT_EX can request
>>> calling the call-back directly from within the ISR?
>>>
>>> (Looking at existing flags) Could I use DMA_CTRL_ACK?
>>> Description sounds like some kind of hand-shake between
>>> client and dmaengine.
>>>
>>> Grepping for DMA_PREP_INTERRUPT, I don't see where the framework
>>> checks that flag to spawn the tasklet? Or is that up to each
>>> driver individually?
>> 
>> Those flags all have defined meanings and abusing them for other things
>> is a bad idea.  As far as possible, device drivers should work with any
>> dma driver.
>
> I was asking about introducing a new flag, not abusing existing
> flags. (I don't understand the semantics of DMA_CTRL_ACK.)

This needs more than a new flag anyhow.

> (FWIW, both the NFC and the MBUS agent are custom designs,
> not third-party IP blocks.)

Sure, but who knows what will be in the next chip?

-- 
Måns Rullgård

^ permalink raw reply

* [PATCH 0/3] arm64: dts: r8a7796: Add CAN/CAN FD support
From: Geert Uytterhoeven @ 2016-11-24 16:41 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <HK2PR0601MB1329C747CA5C6B5222C02C3FB7B60@HK2PR0601MB1329.apcprd06.prod.outlook.com>

Hi Chris,

On Thu, Nov 24, 2016 at 3:25 PM, Chris Paterson
<Chris.Paterson2@renesas.com> wrote:
> From: Simon Horman [mailto:horms at verge.net.au]
> Sent: 24 November 2016 10:18
>> On Thu, Nov 24, 2016 at 10:05:08AM +0000, Chris Paterson wrote:
>> > From: Simon Horman [mailto:horms at verge.net.au]
>> > > Regarding the arch/arm64/boot/dts/renesas/ portion, I would like
>> > > some consideration given to what effect enabling memory above 4Gb
>> > > (64bit
>> > > addressing) would have.
>> >
>> > Can you give me some guidance here? I'm not sure what you're referring
>> > to. As far as I know the DT reg definition here is 64-bit, or are you
>> > referring to DMA usage? If the latter, neither CAN driver uses DMA.
>>
>> Sorry for not being clearer.
>>
>> What I would like to know is if there are any problems in the CAN driver or
>> hardware that would prevent it from functioning with memory that requires
>> 64bit addressing present.
>>
>> If the CAN hardware cannot use DMA then DMA doesn't need to be taken
>> into account. But if DMA could be enabled in future for CAN, for example
>> after some driver enhancements, then it would be good to know if 64bit
>> memory can be supported - if not it would imply DMA cannot be enabled.
>
> Thank you for the clarification.
>
> The CAN interface for r8a7795/6 does not support DMA.
>
> With CAN FD there is currently a H/W issue that means DMA is unusable.

Is that issue present on R-Car M3-W, or only on R-Car H3 ES1.x?

> Potentially this issue could be fixed in the future and DMA support could
> be added to the driver. If this happens I can see no reason why the CAN FD
> IP wouldn't be able to handle DMA transfers when using 64bit addressing.

Yep, AFAIK it uses SYS-DMAC, which supports 64-bit addressing.

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert at linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* [net-next PATCH v1 1/2] net: dt-bindings: add RGMII TX delay configuration to meson8b-dwmac
From: Martin Blumenstingl @ 2016-11-24 16:52 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <20161124154858.GB20455@lunn.ch>

Hi Andrew,

On Thu, Nov 24, 2016 at 4:48 PM, Andrew Lunn <andrew@lunn.ch> wrote:
>> The configuration values are provided as preprocessor macros to make the
>> devicetree files easier to read.
>
> Hi Martin
>
> If i'm reading the code/comments correctly, you can set the delay to
> 0, 2, 4 or 6ns? So calling this property amlogic,tx-delay-ns would be
> even easier to read.
indeed, this sounds like a very nice idea (as it moves the calculation
from the programmer's brain to dwmac-meson8b.c)!

I'll send an updated version once I received enough feedback (in case
something else is wrong with the patches)

^ permalink raw reply

* [linux-sunxi] Re: [RFC PATCH 0/5] arm64: Allwinner H5 support
From: Ian Campbell @ 2016-11-24 17:03 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <560f873c-7ad9-5614-21f6-489879380ebe@arm.com>

On Thu, 2016-11-24 at 11:05 +0000, Andre Przywara wrote:
> 
> > I don't have any major comments but I guess it all depends on the DT
> > maintainers view on the symbolic link to share the DTSI.
> 
> I am curious too ;-)
> But I saw symlinks for the RaspberryPi 3 (check
> arch/arm64/boot/dts/broadcom) and VExpress, so I picked that low hanging
> fruit ;-)

See http://git.kernel.org/torvalds/linux/c/8ee57b8182c4 and the ML
discussion around the posting of that for some background on why the
symlinks are being used in preference to #include or /include/.

IIRC there was some further discussion on some lists when
http://git.kernel.org/torvalds/linux/c/76aa75916880 was posted too.

Ian.

^ permalink raw reply

* [net-next PATCH v1 0/2] stmmac: dwmac-meson8b: configurable RGMII TX delay
From: Martin Blumenstingl @ 2016-11-24 17:05 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <1480002964.17538.131.camel@baylibre.com>

On Thu, Nov 24, 2016 at 4:56 PM, Jerome Brunet <jbrunet@baylibre.com> wrote:
> On Thu, 2016-11-24 at 15:34 +0100, Martin Blumenstingl wrote:
>> Currently the dwmac-meson8b stmmac glue driver uses a hardcoded 1/4
>> cycle TX clock delay. This seems to work fine for many boards (for
>> example Odroid-C2 or Amlogic's reference boards) but there are some
>> others where TX traffic is simply broken.
>> There are probably multiple reasons why it's working on some boards
>> while it's broken on others:
>> - some of Amlogic's reference boards are using a Micrel PHY
>> - hardware circuit design
>> - maybe more...
>>
>> This raises a question though:
>> Which device is supposed to enable the TX delay when both MAC and PHY
>> support it? And should we implement it for each PHY / MAC separately
>> or should we think about a more generic solution (currently it's not
>> possible to disable the TX delay generated by the RTL8211F PHY via
>> devicetree when using phy-mode "rgmii")?
>
> Actually you can skip the part which activates the Tx-delay on the phy
> by setting phy-mode = "rgmii-id" instead of "rgmii"
>
> phy->interface will no longer be PHY_INTERFACE_MODE_RGMII
> but PHY_INTERFACE_MODE_RGMII_ID.
unfortunately this is not true for RTL8211F (I did my previous tests
with the same expectation in mind)!
the code seems to suggest that TX-delay is disabled whenever mode !=
PHY_INTERFACE_MODE_RGMII.
BUT: on my device RTL8211F_TX_DELAY is set even before
"phy_write(phydev, 0x11, reg);"!

Based on what I found it seems that rgmii-id, rgmii-txid and
rgmii-rxid are supposed to be handled by the PHY.
That would mean that we have two problems here:
1) drivers/net/phy/realtek.c:rtl8211f_config_init should check for
PHY_INTERFACE_MODE_RGMII_ID or PHY_INTERFACE_MODE_RGMII_TXID and
enable the TX-delay in that case - otherwise explicitly disable it
2) dwmac-meson8b.c should only use the configured TX-delay for
PHY_INTERFACE_MODE_RGMII
@Florian: could you please share your thoughts on this (who handles
the TX delay in which case)?


Regards,
Martin

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox