Netdev List
 help / color / mirror / Atom feed
* Re: Crash in netlink/sk_filter_trim_cap on ARMv7 on 4.18rc1
From: Daniel Borkmann @ 2018-06-25  8:48 UTC (permalink / raw)
  To: Peter Robinson, Eric Dumazet; +Cc: netdev, linux-arm-kernel, labbott
In-Reply-To: <CALeDE9Nkz+cbueruiez_43ZS1ebAiQHDaVZRARE8raypYz323A@mail.gmail.com>

On 06/24/2018 11:24 AM, Peter Robinson wrote:
>>> I'm seeing this netlink/sk_filter_trim_cap crash on ARMv7 across quite
>>> a few ARMv7 platforms on Fedora with 4.18rc1. I've tested RPi2/RPi3
>>> (doesn't happen on aarch64), AllWinner H3, BeagleBone and a few
>>> others, both LPAE/normal kernels.
>>>
>>> I'm a bit out of my depth in this part of the kernel but I'm wondering
>>> if it's known, I couldn't find anything that looked obvious on a few
>>> mailing lists.
>>>
>>> Peter
>>
>> Hi Peter
>>
>> Could you provide symbolic information ?
> 
> I passed in through scripts/decode_stacktrace.sh is that what you were after:
> 
> [    8.673880] Internal error: Oops: a06 [#10] SMP ARM
> [    8.673949] ---[ end trace 049df4786ea3140a ]---
> [    8.678754] Modules linked in:
> [    8.678766] CPU: 1 PID: 206 Comm: systemd-udevd Tainted: G      D
>         4.18.0-0.rc1.git0.1.fc29.armv7hl+lpae #1
> [    8.678769] Hardware name: Allwinner sun8i Family
> [    8.678781] PC is at sk_filter_trim_cap ()
> [    8.678790] LR is at   (null)
> [    8.709463] pc : lr : psr: 60000013 ()
> [    8.715722] sp : c996bd60  ip : 00000000  fp : 00000000
> [    8.720939] r10: ee79dc00  r9 : c12c9f80  r8 : 00000000
> [    8.726157] r7 : 00000000  r6 : 00000001  r5 : f1648000  r4 : 00000000
> [    8.732674] r3 : 00000007  r2 : 00000000  r1 : 00000000  r0 : 00000000
> [    8.739193] Flags: nZCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment user
> [    8.746318] Control: 30c5387d  Table: 6e7bc880  DAC: ffe75ece
> [    8.752055] Process systemd-udevd (pid: 206, stack limit = 0x(ptrval))
> [    8.758574] Stack: (0xc996bd60 to 0xc996c000)
[...]

Should be fixed by (PR to Linus with fix is pending):

https://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git/commit/?id=9262478220eac908ae6e168c3df2c453c87e2da3

Sorry for that,
Daniel

^ permalink raw reply

* Re: [PATCH net-next] route: add support for directed broadcast forwarding
From: Ido Schimmel @ 2018-06-25  9:09 UTC (permalink / raw)
  To: Xin Long; +Cc: network dev, davem, David Ahern
In-Reply-To: <671e900d14a124f1de7785ee44c437d0826b9e2a.1529894708.git.lucien.xin@gmail.com>

Hi Xin,

On Mon, Jun 25, 2018 at 10:45:08AM +0800, Xin Long wrote:
> This patch implements the feature described in rfc1812#section-5.3.5.2
> and rfc2644. It allows the router to forward directed broadcast when
> sysctl mc_forwarding is enabled.

You mean bc_forwarding?

> 
> Note that this feature could be done by iptables -j TEE, but it would
> cause some problems:
>   - target TEE's gateway param has to be set with a specific address,
>     and it's not flexible especially when the route wants forward all
>     directed broadcasts.
>   - this duplicates the directed broadcasts so this may cause side
>     effects to applications.
> 
> Besides, to keep consistent with other os router like BSD, it's also
> necessary to implement it in the route rx path.

Regarding the test you posted later in the thread, did you consider
adding it as part of tools/testing/selftests/net/forwarding/ ? I believe
you can write the test using VRFs instead of name spaces.

Thanks

^ permalink raw reply

* Re: [PATCH net-next] route: add support for directed broadcast forwarding
From: Xin Long @ 2018-06-25  9:13 UTC (permalink / raw)
  To: network dev; +Cc: davem, David Ahern, Davide Caratti
In-Reply-To: <671e900d14a124f1de7785ee44c437d0826b9e2a.1529894708.git.lucien.xin@gmail.com>

On Mon, Jun 25, 2018 at 10:45 AM, Xin Long <lucien.xin@gmail.com> wrote:
> This patch implements the feature described in rfc1812#section-5.3.5.2
> and rfc2644. It allows the router to forward directed broadcast when
> sysctl mc_forwarding is enabled.
>
> Note that this feature could be done by iptables -j TEE, but it would
> cause some problems:
>   - target TEE's gateway param has to be set with a specific address,
>     and it's not flexible especially when the route wants forward all
>     directed broadcasts.
>   - this duplicates the directed broadcasts so this may cause side
>     effects to applications.
>
> Besides, to keep consistent with other os router like BSD, it's also
> necessary to implement it in the route rx path.
>
> Signed-off-by: Xin Long <lucien.xin@gmail.com>
> ---
>  include/linux/inetdevice.h   | 1 +
>  include/uapi/linux/ip.h      | 1 +
>  include/uapi/linux/netconf.h | 1 +
>  net/ipv4/devinet.c           | 7 +++++++
>  net/ipv4/route.c             | 6 +++++-
>  5 files changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
> index 27650f1..c759d1c 100644
> --- a/include/linux/inetdevice.h
> +++ b/include/linux/inetdevice.h
> @@ -93,6 +93,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
>
>  #define IN_DEV_FORWARD(in_dev)         IN_DEV_CONF_GET((in_dev), FORWARDING)
>  #define IN_DEV_MFORWARD(in_dev)                IN_DEV_ANDCONF((in_dev), MC_FORWARDING)
> +#define IN_DEV_BFORWARD(in_dev)                IN_DEV_ANDCONF((in_dev), BC_FORWARDING)
>  #define IN_DEV_RPFILTER(in_dev)                IN_DEV_MAXCONF((in_dev), RP_FILTER)
>  #define IN_DEV_SRC_VMARK(in_dev)       IN_DEV_ORCONF((in_dev), SRC_VMARK)
>  #define IN_DEV_SOURCE_ROUTE(in_dev)    IN_DEV_ANDCONF((in_dev), \
> diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h
> index b24a742..2b756b5 100644
> --- a/include/uapi/linux/ip.h
> +++ b/include/uapi/linux/ip.h
> @@ -139,6 +139,7 @@ enum
>  {
>         IPV4_DEVCONF_FORWARDING=1,
>         IPV4_DEVCONF_MC_FORWARDING,
> +       IPV4_DEVCONF_BC_FORWARDING,
>         IPV4_DEVCONF_PROXY_ARP,
>         IPV4_DEVCONF_ACCEPT_REDIRECTS,
>         IPV4_DEVCONF_SECURE_REDIRECTS,
> diff --git a/include/uapi/linux/netconf.h b/include/uapi/linux/netconf.h
> index c84fcdf..a5cd70e 100644
> --- a/include/uapi/linux/netconf.h
> +++ b/include/uapi/linux/netconf.h
> @@ -15,6 +15,7 @@ enum {
>         NETCONFA_FORWARDING,
>         NETCONFA_RP_FILTER,
>         NETCONFA_MC_FORWARDING,
> +       NETCONFA_BC_FORWARDING,
>         NETCONFA_PROXY_NEIGH,
>         NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
>         NETCONFA_INPUT,
As Davide Caratti noticed, this breaks UAPI, I will append it instead
in next version if the rest part is ok. Thanks Davide.

> diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
> index d7585ab..ea30ab6 100644
> --- a/net/ipv4/devinet.c
> +++ b/net/ipv4/devinet.c
> @@ -1827,6 +1827,8 @@ static int inet_netconf_msgsize_devconf(int type)
>                 size += nla_total_size(4);
>         if (all || type == NETCONFA_MC_FORWARDING)
>                 size += nla_total_size(4);
> +       if (all || type == NETCONFA_BC_FORWARDING)
> +               size += nla_total_size(4);
>         if (all || type == NETCONFA_PROXY_NEIGH)
>                 size += nla_total_size(4);
>         if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
> @@ -1873,6 +1875,10 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
>             nla_put_s32(skb, NETCONFA_MC_FORWARDING,
>                         IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
>                 goto nla_put_failure;
> +       if ((all || type == NETCONFA_BC_FORWARDING) &&
> +           nla_put_s32(skb, NETCONFA_BC_FORWARDING,
> +                       IPV4_DEVCONF(*devconf, BC_FORWARDING)) < 0)
> +               goto nla_put_failure;
>         if ((all || type == NETCONFA_PROXY_NEIGH) &&
>             nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
>                         IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
> @@ -2259,6 +2265,7 @@ static struct devinet_sysctl_table {
>                 DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
>                                              devinet_sysctl_forward),
>                 DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
> +               DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"),
>
>                 DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
>                 DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"),
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 1df6e97..b678466 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -1996,8 +1996,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
>                 goto no_route;
>         }
>
> -       if (res->type == RTN_BROADCAST)
> +       if (res->type == RTN_BROADCAST) {
> +               if (IN_DEV_BFORWARD(in_dev))
> +                       goto make_route;
>                 goto brd_input;
> +       }
>
>         if (res->type == RTN_LOCAL) {
>                 err = fib_validate_source(skb, saddr, daddr, tos,
> @@ -2014,6 +2017,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
>         if (res->type != RTN_UNICAST)
>                 goto martian_destination;
>
> +make_route:
>         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
>  out:   return err;
>
> --
> 2.1.0
>

^ permalink raw reply

* Re: [PATCH RFT] net: dsa: Allow configuring CPU port VLANs
From: Petr Machata @ 2018-06-25  9:13 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: netdev, jiri, ilias.apalodimas, Andrew Lunn, Vivien Didelot,
	David S. Miller, open list
In-Reply-To: <20180624153339.13572-1-f.fainelli@gmail.com>

Florian Fainelli <f.fainelli@gmail.com> writes:

>  	if (netif_is_bridge_master(vlan->obj.orig_dev))
> -		return -EOPNOTSUPP;
> +		info.port = dp->cpu_dp->index;

The condition above will trigger also when a VLAN is added on a member
port, and there's no other port with that VLAN. In that case the VLAN
comes without the BRIDGE_VLAN_INFO_BRENTRY flag. In mlxsw we have this
to get the bridge VLANs:

	if (netif_is_bridge_master(orig_dev)) {
		[...]
		if ((vlan->flags & BRIDGE_VLAN_INFO_BRENTRY) &&
		[...]

This doesn't appear to be done in DSA unless I'm missing something.

Thanks,
Petr

^ permalink raw reply

* Re: [PATCH net-next] route: add support for directed broadcast forwarding
From: Xin Long @ 2018-06-25  9:15 UTC (permalink / raw)
  To: Ido Schimmel; +Cc: network dev, davem, David Ahern
In-Reply-To: <20180625090956.GA19457@splinter.mtl.com>

On Mon, Jun 25, 2018 at 5:09 PM, Ido Schimmel <idosch@idosch.org> wrote:
> Hi Xin,
>
> On Mon, Jun 25, 2018 at 10:45:08AM +0800, Xin Long wrote:
>> This patch implements the feature described in rfc1812#section-5.3.5.2
>> and rfc2644. It allows the router to forward directed broadcast when
>> sysctl mc_forwarding is enabled.
>
> You mean bc_forwarding?
yes, typo, bc_forwarding.

>
>>
>> Note that this feature could be done by iptables -j TEE, but it would
>> cause some problems:
>>   - target TEE's gateway param has to be set with a specific address,
>>     and it's not flexible especially when the route wants forward all
>>     directed broadcasts.
>>   - this duplicates the directed broadcasts so this may cause side
>>     effects to applications.
>>
>> Besides, to keep consistent with other os router like BSD, it's also
>> necessary to implement it in the route rx path.
>
> Regarding the test you posted later in the thread, did you consider
> adding it as part of tools/testing/selftests/net/forwarding/ ? I believe
> you can write the test using VRFs instead of name spaces.
didn't know that, but sounds good, will have a try. :)
Thanks.

>
> Thanks

^ permalink raw reply

* Re: [PATCH RFT] net: dsa: Allow configuring CPU port VLANs
From: Ilias Apalodimas @ 2018-06-25  9:17 UTC (permalink / raw)
  To: Petr Machata
  Cc: Florian Fainelli, netdev, jiri, Andrew Lunn, Vivien Didelot,
	David S. Miller, open list
In-Reply-To: <wih4lhrgrfd.fsf@dev-r-vrt-156.mtr.labs.mlnx>

On Mon, Jun 25, 2018 at 12:13:10PM +0300, Petr Machata wrote:
> Florian Fainelli <f.fainelli@gmail.com> writes:
> 
> >  	if (netif_is_bridge_master(vlan->obj.orig_dev))
> > -		return -EOPNOTSUPP;
> > +		info.port = dp->cpu_dp->index;
> 
> The condition above will trigger also when a VLAN is added on a member
> port, and there's no other port with that VLAN. In that case the VLAN
> comes without the BRIDGE_VLAN_INFO_BRENTRY flag. In mlxsw we have this
> to get the bridge VLANs:
> 
> 	if (netif_is_bridge_master(orig_dev)) {
> 		[...]
> 		if ((vlan->flags & BRIDGE_VLAN_INFO_BRENTRY) &&
> 		[...]
> 
> This doesn't appear to be done in DSA unless I'm missing something.
Petr's right. This will trigger for VLANs added on 'not cpu ports' if the VLAN
is not already a member. 

This command has BRIDGE_VLAN_INFO_BRENTRY set:
bridge vlan add dev br0 vid 100 pvid untagged self 
I had the same issue on my CPSW RFC and solved it
exactly the same was as Petr suggested.

> 
> Thanks,
> Petr

Regards
Ilias

^ permalink raw reply

* RE: bnx2x: kernel panic in the bnx2x driver
From: Kalluru, Sudarsana @ 2018-06-25  9:20 UTC (permalink / raw)
  To: Vishwanath Pai, Elior, Ariel, Dept-Eng Everest Linux L2
  Cc: davem@davemloft.net, netdev@vger.kernel.org, dbanerje@akamai.com,
	pai.vishwain@gmail.com
In-Reply-To: <20180622182738.GA13219@akamai.com>

Hi Vishwanath,
    Thanks for your help. Will plan to post the patch to net branch.

Thanks,
Sudarsana

-----Original Message-----
From: Vishwanath Pai [mailto:vpai@akamai.com] 
Sent: 22 June 2018 23:58
To: Kalluru, Sudarsana <Sudarsana.Kalluru@cavium.com>; Elior, Ariel <Ariel.Elior@cavium.com>; Dept-Eng Everest Linux L2 <Dept-EngEverestLinuxL2@cavium.com>
Cc: davem@davemloft.net; netdev@vger.kernel.org; dbanerje@akamai.com; pai.vishwain@gmail.com
Subject: Re: bnx2x: kernel panic in the bnx2x driver

The patch below worked for me (on 4.14.51 LTS kernel):

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
index 1e33abd..2b3863a 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
@@ -3387,14 +3387,20 @@ static int bnx2x_set_rss_flags(struct bnx2x *bp, struct ethtool_rxnfc *info)
                        DP(BNX2X_MSG_ETHTOOL,
                           "rss re-configured, UDP 4-tupple %s\n",
                           udp_rss_requested ? "enabled" : "disabled");
-                       return bnx2x_rss(bp, &bp->rss_conf_obj, false, true);
+                       if (bp->state == BNX2X_STATE_OPEN)
+                               return bnx2x_rss(bp, &bp->rss_conf_obj, false, true);
+                       else
+                               return 0;
                } else if ((info->flow_type == UDP_V6_FLOW) &&
                           (bp->rss_conf_obj.udp_rss_v6 != udp_rss_requested)) {
                        bp->rss_conf_obj.udp_rss_v6 = udp_rss_requested;
                        DP(BNX2X_MSG_ETHTOOL,
                           "rss re-configured, UDP 4-tupple %s\n",
                           udp_rss_requested ? "enabled" : "disabled");
-                       return bnx2x_rss(bp, &bp->rss_conf_obj, false, true);
+                       if (bp->state == BNX2X_STATE_OPEN)
+                               return bnx2x_rss(bp, &bp->rss_conf_obj, false, true);
+                       else
+                               return 0;
                }
                return 0;

Although I think there might be another place where we may need to fix this as well:
bnx2x_config_rss_eth()

Thanks,
Vishwanath

On 06/22/2018 10:57 AM, Vishwanath Pai wrote:
> Ah, that is great! I will test it out on my machine and let you know.
>
> Thanks,
> Vishwanath
>
> On 06/22/2018 10:21 AM, Kalluru, Sudarsana wrote:
>> Hi Vishwanath,
>>     The config will be cached in the device structure (bp->rss_conf_obj.udp_rss_v4) in this scenario, and will be applied in the load path (bnx2x_nic_load() --> bnx2x_init_rss()). Have unit tested the change on my setup.
>>
>> Thanks,
>> Sudarsana
>>
>> -----Original Message-----
>> From: Vishwanath Pai [mailto:vpai@akamai.com]
>> Sent: 22 June 2018 18:52
>> To: Kalluru, Sudarsana <Sudarsana.Kalluru@cavium.com>; Elior, Ariel 
>> <Ariel.Elior@cavium.com>; Dept-Eng Everest Linux L2 
>> <Dept-EngEverestLinuxL2@cavium.com>
>> Cc: davem@davemloft.net; netdev@vger.kernel.org; dbanerje@akamai.com; 
>> pai.vishwain@gmail.com
>> Subject: Re: bnx2x: kernel panic in the bnx2x driver
>>
>> Hi Sudarsana,
>>
>> Thanks for taking a look at my email. The fix you suggested would definitely fix the kernel panic, but at the same time wouldn't it also silently ignore the request by ethtool to set rx-flow-hash?
>>
>> Thanks,
>> Vishwanath
>>
>> On 06/22/2018 06:20 AM, Kalluru, Sudarsana wrote:
>>> Hi Vishwanath,
>>>     Thanks for your mail, and the analysis.
>>> The fix would be to invoke bnx2x_rss() only when the device is opened,
>>>       if (bp->state == BNX2X_STATE_OPEN)
>>>               return bnx2x_rss(bp, &bp->rss_conf_obj, false, true);
>>>       else
>>>               return 0;
>>> Ariel,
>>>    Could you please review the path (bnx2x_set_rss_flags()--> bnx2x_rss()) and confirm/correct on the above.
>>>
>>> Thanks,
>>> Sudarsana
>>>
>>> -----Original Message-----
>>> From: Vishwanath Pai [mailto:vpai@akamai.com]
>>> Sent: 22 June 2018 10:37
>>> To: Elior, Ariel <Ariel.Elior@cavium.com>; Dept-Eng Everest Linux L2 
>>> <Dept-EngEverestLinuxL2@cavium.com>
>>> Cc: davem@davemloft.net; netdev@vger.kernel.org; 
>>> dbanerje@akamai.com; pai.vishwain@gmail.com
>>> Subject: bnx2x: kernel panic in the bnx2x driver
>>>
>>> External Email
>>>
>>> Hi,
>>>
>>> We recently noticed a kernel panic in the bnx2x driver when trying 
>>> to set rx-flow-hash parameters via ethtool during if-pre-up.d. I am 
>>> running kernel
>>> v4.17.2 from ubuntu-mainline-ppa. I have added the stack trace below:
>>>
>>> [   18.280209] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
>>> [   18.280212] PGD 8000000407a79067 P4D 8000000407a79067 PUD 40ce8a067 PMD 0
>>> [   18.280214] Oops: 0010 [#1] SMP PTI
>>> [   18.280215] Modules linked in: intel_rapl x86_pkg_temp_thermal intel_powerclamp kvm_intel joydev input_led kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc hid_eneric aesni_intel gpio_ich aes_x86_64 usbhid lpc_ich crpto_simd ie31200_edac cryptd glue_helper intel_cstate mac_hid intel_rapl_perf bnx2x mdio tcp_bbr netconsole ipmi_devintf ipmi_msghandler i2c_i801 coretemp autofs4 raid10 raid456 libcrc32c async_raid6_recov async_memcpy async_pq async_xor xor async_tx raid6_pq raid1 raid0 multipath linear sha26_mb mcryptd sha256_ssse3 hid ast i2c_algo_bit ttm drm_kms_helper syscopyarea sysfillrect sysimgblt mpt3sas fb_sys_fops drm raid_class scsi_transport_sas ahci libahci shpchp video
>>> [   18.280241] CPU: 6 PID: 1081 Comm: ethtool Not tainted 4.17.2-041702-generic #201806160433
>>> [   18.280242] Hardware name: Foxconn CangJie/CangJie, BIOS CC1F108D 02/26/2014
>>> [   18.280243] RIP: 0010:          (null)
>>> [   18.280243] RSP: 0018:ffffb84bc260b9c0 EFLAGS: 00010246
>>> [   18.280244] RAX: 0000000000000000 RBX: ffff92f987f020f0 RCX: 0000000000000000
>>> [   18.280245] RDX: 0000000000000000 RSI: ffffb84bc260b9f8 RDI: ffff92f987f020f0
>>> [   18.280245] RBP: ffffb8bc260b9e8 R08: 0000000000000001 R09: 0000000000000000
>>> [   18.280246] R10: ffffb84bc260bd20 R11: 0000000000000000 R12: ffffb84bc260b9f8
>>> [   18.280246] R13: ffff92f987f008c0 R14: 00007ffdb75bec40 R15: 0000000000000000
>>> [   18.280247] FS:  00007fc0e8798700(0000) GS:ffff92f99fd80000(0000) knlGS:0000000000000000
>>> [   18.280248] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>>> [   18.280249] CR2: 0000000000000000 CR3: 0000000409b4c003 CR4: 00000000001606e0
>>> [   18.280249] Call Trace:
>>> [   18.280263]  ? bnx2x_config_rss+0x2f/0xd0 [bnx2x]
>>> [   18.280270]  bnx2x_rss+0x1d9/0x210 [bnx2x]
>>> [   18.280276]  bnx2x_set_rxnfc+0x17d/0x380 [bnx2x]
>>> [   18.280279]  ethtool_set_rxnfc+0x9b/0x110
>>> [   18.280281]  ? __do_page_cache_readahead+0x1da/0x2c0
>>> [   18.280283]  ? security_capable+0x3c/0x60
>>> [   18.280284]  dev_ethtool+0350/0x2610
>>> [   18.280286]  ? page_cache_async_readahead+0x71/0x80
>>> [   18.280288]  ? page_add_file_rmap+0x5d/0x220
>>> [   18.280290]  ? inet_ioctl+0x182/0x1a0
>>> [   18.280291]  dev_ioctl+0x203/0x3f0
>>> [   18.280293]  ? dev_ioctl+0x203/0x3f0
>>> [   18.280294]  sock_do_ioctl+0xae/0x150
>>> [   18.280296]  sock_ioctl+0x1e2/0x330
>>> [   18.280296]  ? sock_ioctl+0x1e2/0x330
>>> [   18.280299]  do_vfs_ioctl+0xa8/0x620
>>> [   18.280300]  ? dlci_ioctl_set+0x30/0x30
>>> [   18.280301]  ? do_vfs_ioctl+0xa8/0x620
>>> [   18.280302]  ? handle_mm_fault+0xe3/0x220
>>> [   18.280304]  ksys_ioctl+0x75/0x80
>>> [   18.280305]  __x64_sys_ioctl+0x1a/0x20
>>> [   18.280307]  do_syscall_64+0x5a/0x120
>>> [   18.280309]  entry_SYSCALL_64_aftr_hwframe+0x44/0xa9
>>> [   18.280310] RIP: 0033:0x7fc0e7fba107
>>> [   18.280311] RSP: 002b:00007ffdb75beb78 EFLAGS: 00000206 ORIG_RAX: 0000000000000010
>>> [   18.280312] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fc0e7fba107
>>> [   18.280312] RDX: 00007ffdb75bed60 RSI: 0000000000008946 RDI: 0000000000000003
>>> [   18.280313] RBP: 00007ffdb75bed50 R08: 00007ffdb75bed60 R09: 0000000000000001
>>> [   18.280313] R10: 0000000000000541 R11: 0000000000000206 R12: 00007ffdb75beed0
>>> [   18.280314] R13: 0000000000421020 R14: 000000000041fe28 R15: 0000000000000003
>>> [   18.280315] Code:  Bad RIP value.
>>> [   18.280317] RIP:           (null) RSP: ffffb84bc260b9c0
>>> [  18.280318] CR2: 0000000000000000
>>> [   18.280319] ---[ end trace 5f361db3fb9059f1 ]---
>>>
>>> To reproduce this I created a bash script in "/etc/network/if-pre-up.d/" with these two lines:
>>> ethtool -N $IFACE rx-flow-hash udp4 "sdfn"
>>> ethtool -N $IFACE rx-flow-hash udp6 "sdfn"
>>>
>>> The problem here is that rss_obj in bnx2x struct for the device 
>>> hasn't been initialized yet, which causes an exception in 
>>> bnx2x_config_rss() when calling "r->set_pending(r)" because 
>>> r->set_pending is NULL. It looks like a lot many things haven't been 
>>> initialized at this point, most of that happens in this
>>> function: "bnx2x_init_bp_objs()" which isn't called until ifup. Any thoughts on how this can be fixed? Would it be possible to safely move bnx2x_init_bp_objs() to maybe bnx2x_init_one() which runs much before ifup?
>>>
>>> Thanks,
>>> Vishwanath
>>>
>

^ permalink raw reply related

* Re: [virtio-dev] Re: [Qemu-devel] [PATCH] qemu: Introduce VIRTIO_NET_F_STANDBY feature bit to virtio_net
From: Cornelia Huck @ 2018-06-25  9:55 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Alexander Duyck, virtio-dev, Jiri Pirko, konrad.wilk,
	Jakub Kicinski, Samudrala, Sridhar, qemu-devel, virtualization,
	Siwei Liu, Venu Busireddy, Netdev, boris.ostrovsky, aaron.f.brown,
	Joao Martins
In-Reply-To: <20180622214259-mutt-send-email-mst@kernel.org>

On Fri, 22 Jun 2018 22:05:50 +0300
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Fri, Jun 22, 2018 at 05:09:55PM +0200, Cornelia Huck wrote:
> > On Thu, 21 Jun 2018 21:20:13 +0300
> > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> >   
> > > On Thu, Jun 21, 2018 at 04:59:13PM +0200, Cornelia Huck wrote:  
> > > > OK, so what about the following:
> > > > 
> > > > - introduce a new feature bit, VIRTIO_NET_F_STANDBY_UUID that indicates
> > > >   that we have a new uuid field in the virtio-net config space
> > > > - in QEMU, add a property for virtio-net that allows to specify a uuid,
> > > >   offer VIRTIO_NET_F_STANDBY_UUID if set
> > > > - when configuring, set the property to the group UUID of the vfio-pci
> > > >   device
> > > > - in the guest, use the uuid from the virtio-net device's config space
> > > >   if applicable; else, fall back to matching by MAC as done today
> > > > 
> > > > That should work for all virtio transports.    
> > > 
> > > True. I'm a bit unhappy that it's virtio net specific though
> > > since down the road I expect we'll have a very similar feature
> > > for scsi (and maybe others).
> > > 
> > > But we do not have a way to have fields that are portable
> > > both across devices and transports, and I think it would
> > > be a useful addition. How would this work though? Any idea?  
> > 
> > Can we introduce some kind of device-independent config space area?
> > Pushing back the device-specific config space by a certain value if the
> > appropriate feature is negotiated and use that for things like the uuid?  
> 
> So config moves back and forth?
> Reminds me of the msi vector mess we had with pci.

Yes, that would be a bit unfortunate.

> I'd rather have every transport add a new config.

You mean via different mechanisms?

> 
> > But regardless of that, I'm not sure whether extending this approach to
> > other device types is the way to go. Tying together two different
> > devices is creating complicated situations at least in the hypervisor
> > (even if it's fairly straightforward in the guest). [I have not come
> > around again to look at the "how to handle visibility in QEMU"
> > questions due to lack of cycles, sorry about that.]
> > 
> > So, what's the goal of this approach? Only to allow migration with
> > vfio-pci, or also to plug in a faster device and use it instead of an
> > already attached paravirtualized device?  
> 
> These are two sides of the same coin, I think the second approach
> is closer to what we are doing here.

Thinking about it, do we need any knob to keep the vfio device
invisible if the virtio device is not present? IOW, how does the
hypervisor know that the vfio device is supposed to be paired with a
virtio device? It seems we need an explicit tie-in.

> 
> > What about migration of vfio devices that are not easily replaced by a
> > paravirtualized device? I'm thinking of vfio-ccw, where our main (and
> > currently only) supported device is dasd (disks) -- which can do a lot
> > of specialized things that virtio-blk does not support (and should not
> > or even cannot support).  
> 
> But maybe virtio-scsi can?

I don't think so. Dasds have some channel commands that don't map
easily to scsi commands.

> 
> > Would it be more helpful to focus on generic
> > migration support for vfio instead of going about it device by device?
> > 
> > This network device approach already seems far along, so it makes sense
> > to continue with it. But I'm not sure whether we want to spend time and
> > energy on that for other device types rather than working on a general
> > solution for vfio migration.  
> 
> I'm inclined to say finalizing this feature would be a good first step
> and will teach us how we can move forward.

I'm not opposed to figuring out this one, but I'm not sure whether we
want to extend it to more device types.

Are people looking into generic migration support? I have it on my
things-to-look-at list (figuring out what needs to be device specific
and what can be generic, figuring out how we can support vfio-ccw,
etc.).

^ permalink raw reply

* [PATCH v3,net-next] vlan: implement vlan id and protocol changes
From: Chas Williams @ 2018-06-25 10:30 UTC (permalink / raw)
  To: davem; +Cc: netdev, Chas Williams
In-Reply-To: <20180610231912.1543-1-3chas3@gmail.com>

vlan_changelink silently ignores attempts to change the vlan id
or protocol id of an existing vlan interface.  Implement by adding
the new vlan id and protocol to the interface's vlan group and then
removing the old vlan id and protocol from the vlan group.

Signed-off-by: Chas Williams <3chas3@gmail.com>
---
 include/linux/netdevice.h |  1 +
 net/8021q/vlan.c          |  4 ++--
 net/8021q/vlan.h          |  2 ++
 net/8021q/vlan_netlink.c  | 38 ++++++++++++++++++++++++++++++++++++++
 net/core/dev.c            |  1 +
 5 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3ec9850c7936..a95ae238addf 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2409,6 +2409,7 @@ enum netdev_cmd {
 	NETDEV_CVLAN_FILTER_DROP_INFO,
 	NETDEV_SVLAN_FILTER_PUSH_INFO,
 	NETDEV_SVLAN_FILTER_DROP_INFO,
+	NETDEV_CHANGEVLAN,
 };
 const char *netdev_cmd_to_name(enum netdev_cmd cmd);
 
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 73a65789271b..b5e0ad1a581a 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -51,8 +51,8 @@ const char vlan_version[] = DRV_VERSION;
 
 /* End of global variables definitions. */
 
-static int vlan_group_prealloc_vid(struct vlan_group *vg,
-				   __be16 vlan_proto, u16 vlan_id)
+int vlan_group_prealloc_vid(struct vlan_group *vg,
+			    __be16 vlan_proto, u16 vlan_id)
 {
 	struct net_device **array;
 	unsigned int pidx, vidx;
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 44df1c3df02d..c734dd21d70d 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -116,6 +116,8 @@ int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack);
 void unregister_vlan_dev(struct net_device *dev, struct list_head *head);
 bool vlan_dev_inherit_address(struct net_device *dev,
 			      struct net_device *real_dev);
+int vlan_group_prealloc_vid(struct vlan_group *vg,
+			    __be16 vlan_proto, u16 vlan_id);
 
 static inline u32 vlan_get_ingress_priority(struct net_device *dev,
 					    u16 vlan_tci)
diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
index 9b60c1e399e2..ee27781939e0 100644
--- a/net/8021q/vlan_netlink.c
+++ b/net/8021q/vlan_netlink.c
@@ -107,10 +107,48 @@ static int vlan_changelink(struct net_device *dev, struct nlattr *tb[],
 			   struct nlattr *data[],
 			   struct netlink_ext_ack *extack)
 {
+	struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
 	struct ifla_vlan_flags *flags;
 	struct ifla_vlan_qos_mapping *m;
 	struct nlattr *attr;
 	int rem;
+	__be16 vlan_proto = vlan->vlan_proto;
+	u16 vlan_id = vlan->vlan_id;
+
+	if (data[IFLA_VLAN_ID])
+		vlan_id = nla_get_u16(data[IFLA_VLAN_ID]);
+
+	if (data[IFLA_VLAN_PROTOCOL])
+		vlan_proto = nla_get_be16(data[IFLA_VLAN_PROTOCOL]);
+
+	if (vlan->vlan_id != vlan_id || vlan->vlan_proto != vlan_proto) {
+		struct net_device *real_dev = vlan->real_dev;
+		struct vlan_info *vlan_info;
+		struct vlan_group *grp;
+		__be16 old_vlan_proto = vlan->vlan_proto;
+		u16 old_vlan_id = vlan->vlan_id;
+		int err;
+
+		err = vlan_vid_add(real_dev, vlan_proto, vlan_id);
+		if (err)
+			return err;
+		vlan_info = rtnl_dereference(real_dev->vlan_info);
+		grp = &vlan_info->grp;
+		err = vlan_group_prealloc_vid(grp, vlan_proto, vlan_id);
+		if (err < 0) {
+			vlan_vid_del(real_dev, vlan_proto, vlan_id);
+			return err;
+		}
+		vlan_group_set_device(grp, vlan_proto, vlan_id, dev);
+		vlan->vlan_proto = vlan_proto;
+		vlan->vlan_id = vlan_id;
+
+		vlan_group_set_device(grp, old_vlan_proto, old_vlan_id, NULL);
+		vlan_vid_del(real_dev, old_vlan_proto, old_vlan_id);
+
+		err = call_netdevice_notifiers(NETDEV_CHANGEVLAN, dev);
+		notifier_to_errno(err);
+	}
 
 	if (data[IFLA_VLAN_FLAGS]) {
 		flags = nla_data(data[IFLA_VLAN_FLAGS]);
diff --git a/net/core/dev.c b/net/core/dev.c
index a5aa1c7444e6..4bda5d2e3c85 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1587,6 +1587,7 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd)
 	N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
 	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
 	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
+	N(CHANGEVLAN)
 	}
 #undef N
 	return "UNKNOWN_NETDEV_EVENT";
-- 
2.14.4

^ permalink raw reply related

* [PATCH net-next 2/3] rds: Enable RDS IPv6 support
From: Ka-Cheong Poon @ 2018-06-25 10:38 UTC (permalink / raw)
  To: netdev; +Cc: santosh.shilimkar, davem, rds-devel
In-Reply-To: <cover.1529922794.git.ka-cheong.poon@oracle.com>

This patch enables RDS to use IPv6 addresses. For RDS/TCP, the
listener is now an IPv6 endpoint which accepts both IPv4 and IPv6
connection requests.  RDS/RDMA/IB uses a private data (struct
rds_ib_connect_private) exchange between endpoints at RDS connection
establishment time to support RDMA. This private data exchange uses a
32 bit integer to represent an IP address. This needs to be changed in
order to support IPv6. A new private data struct
rds6_ib_connect_private is introduced to handle this. To ensure
backward compatibility, an IPv6 capable RDS stack uses another RDMA
listener port (RDS_CM_PORT) to accept IPv6 connection. And it
continues to use the original RDS_PORT for IPv4 RDS connections. When
it needs to communicate with an IPv6 peer, it uses the RDS_CM_PORT to
send the connection set up request.

Signed-off-by: Ka-Cheong Poon <ka-cheong.poon@oracle.com>
---
 net/rds/bind.c           | 21 +++++++++++++++---
 net/rds/connection.c     | 43 ++++++++++++++++++++++++-------------
 net/rds/ib.c             | 55 +++++++++++++++++++++++++++++++++++++++++-------
 net/rds/ib_cm.c          | 15 +++++++------
 net/rds/rdma_transport.c | 32 ++++++++++++++++++++++++++--
 net/rds/rdma_transport.h |  2 ++
 net/rds/rds.h            | 12 ++++++-----
 net/rds/send.c           | 23 ++++++++++++++++++--
 net/rds/tcp.c            | 54 +++++++++++++++++++++++++++++------------------
 net/rds/tcp.h            |  4 +---
 net/rds/tcp_connect.c    | 54 ++++++++++++++++++++++++++++++++++++-----------
 net/rds/tcp_listen.c     | 40 +++++++++++++++++++++++++++--------
 12 files changed, 269 insertions(+), 86 deletions(-)

diff --git a/net/rds/bind.c b/net/rds/bind.c
index 3a1097e..4c2bf9c 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -164,11 +164,12 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	struct in6_addr v6addr, *binding_addr;
 	struct rds_transport *trans;
 	__u32 scope_id = 0;
+	int addr_type;
 	int ret = 0;
 	__be16 port;
 
-	/* We only allow an RDS socket to be bound to and IPv4 address. IPv6
-	 * address support will be added later.
+	/* We allow an RDS socket to be bound to either IPv4 or IPv6
+	 * address.
 	 */
 	if (addr_len == sizeof(struct sockaddr_in)) {
 		struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
@@ -180,7 +181,21 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		binding_addr = &v6addr;
 		port = sin->sin_port;
 	} else if (addr_len == sizeof(struct sockaddr_in6)) {
-		return -EPROTONOSUPPORT;
+		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)uaddr;
+
+		addr_type = ipv6_addr_type(&sin6->sin6_addr);
+		if (sin6->sin6_family != AF_INET6 ||
+		    !(addr_type & IPV6_ADDR_UNICAST)) {
+			return -EINVAL;
+		}
+		/* The scope ID must be specified for link local address. */
+		if (addr_type & IPV6_ADDR_LINKLOCAL) {
+			if (sin6->sin6_scope_id == 0)
+				return -EINVAL;
+			scope_id = sin6->sin6_scope_id;
+		}
+		binding_addr = &sin6->sin6_addr;
+		port = sin6->sin6_port;
 	} else {
 		return -EINVAL;
 	}
diff --git a/net/rds/connection.c b/net/rds/connection.c
index ca72563..8c5d093 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -486,10 +486,17 @@ void rds_conn_destroy(struct rds_connection *conn)
 }
 EXPORT_SYMBOL_GPL(rds_conn_destroy);
 
-static void rds_conn_message_info(struct socket *sock, unsigned int len,
-				  struct rds_info_iterator *iter,
-				  struct rds_info_lengths *lens,
-				  int want_send)
+static void __rds_inc_msg_cp(struct rds_incoming *inc,
+			     struct rds_info_iterator *iter,
+			     void *saddr, void *daddr, int flip)
+{
+	rds_inc_info_copy(inc, iter, *(__be32 *)saddr, *(__be32 *)daddr, flip);
+}
+
+static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len,
+				      struct rds_info_iterator *iter,
+				      struct rds_info_lengths *lens,
+				      int want_send)
 {
 	struct hlist_head *head;
 	struct list_head *list;
@@ -524,18 +531,13 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
 
 				/* XXX too lazy to maintain counts.. */
 				list_for_each_entry(rm, list, m_conn_item) {
-					__be32 laddr;
-					__be32 faddr;
-
 					total++;
-					laddr = conn->c_laddr.s6_addr32[3];
-					faddr = conn->c_faddr.s6_addr32[3];
 					if (total <= len)
-						rds_inc_info_copy(&rm->m_inc,
-								  iter,
-								  laddr,
-								  faddr,
-								  0);
+						__rds_inc_msg_cp(&rm->m_inc,
+								 iter,
+								 &conn->c_laddr,
+								 &conn->c_faddr,
+								 0);
 				}
 
 				spin_unlock_irqrestore(&cp->cp_lock, flags);
@@ -548,6 +550,14 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
 	lens->each = sizeof(struct rds_info_message);
 }
 
+static void rds_conn_message_info(struct socket *sock, unsigned int len,
+				  struct rds_info_iterator *iter,
+				  struct rds_info_lengths *lens,
+				  int want_send)
+{
+	rds_conn_message_info_cmn(sock, len, iter, lens, want_send);
+}
+
 static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
 				       struct rds_info_iterator *iter,
 				       struct rds_info_lengths *lens)
@@ -655,6 +665,9 @@ static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
 	struct rds_info_connection *cinfo = buffer;
 	struct rds_connection *conn = cp->cp_conn;
 
+	if (conn->c_isv6)
+		return 0;
+
 	cinfo->next_tx_seq = cp->cp_next_tx_seq;
 	cinfo->next_rx_seq = cp->cp_next_rx_seq;
 	cinfo->laddr = conn->c_laddr.s6_addr32[3];
diff --git a/net/rds/ib.c b/net/rds/ib.c
index c712a84..756225c 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -39,6 +39,7 @@
 #include <linux/delay.h>
 #include <linux/slab.h>
 #include <linux/module.h>
+#include <net/addrconf.h>
 
 #include "rds_single_path.h"
 #include "rds.h"
@@ -295,6 +296,8 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
 	/* We will only ever look at IB transports */
 	if (conn->c_trans != &rds_ib_transport)
 		return 0;
+	if (conn->c_isv6)
+		return 0;
 
 	iinfo->src_addr = conn->c_laddr.s6_addr32[3];
 	iinfo->dst_addr = conn->c_faddr.s6_addr32[3];
@@ -330,7 +333,6 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len,
 				sizeof(struct rds_info_rdma_connection));
 }
 
-
 /*
  * Early RDS/IB was built to only bind to an address if there is an IPoIB
  * device with that address set.
@@ -346,8 +348,12 @@ static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr,
 {
 	int ret;
 	struct rdma_cm_id *cm_id;
+	struct sockaddr_in6 sin6;
 	struct sockaddr_in sin;
+	struct sockaddr *sa;
+	bool isv4;
 
+	isv4 = ipv6_addr_v4mapped(addr);
 	/* Create a CMA ID and try to bind it. This catches both
 	 * IB and iWARP capable NICs.
 	 */
@@ -356,20 +362,53 @@ static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr,
 	if (IS_ERR(cm_id))
 		return PTR_ERR(cm_id);
 
-	memset(&sin, 0, sizeof(sin));
-	sin.sin_family = AF_INET;
-	sin.sin_addr.s_addr = addr->s6_addr32[3];
+	if (isv4) {
+		memset(&sin, 0, sizeof(sin));
+		sin.sin_family = AF_INET;
+		sin.sin_addr.s_addr = addr->s6_addr32[3];
+		sa = (struct sockaddr *)&sin;
+	} else {
+		memset(&sin6, 0, sizeof(sin6));
+		sin6.sin6_family = AF_INET6;
+		sin6.sin6_addr = *addr;
+		sin6.sin6_scope_id = scope_id;
+		sa = (struct sockaddr *)&sin6;
+
+		/* XXX Do a special IPv6 link local address check here.  The
+		 * reason is that rdma_bind_addr() always succeeds with IPv6
+		 * link local address regardless it is indeed configured in a
+		 * system.
+		 */
+		if (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL) {
+			struct net_device *dev;
+
+			if (scope_id == 0)
+				return -EADDRNOTAVAIL;
+
+			/* Use init_net for now as RDS is not network
+			 * name space aware.
+			 */
+			dev = dev_get_by_index(&init_net, scope_id);
+			if (!dev)
+				return -EADDRNOTAVAIL;
+			if (!ipv6_chk_addr(&init_net, addr, dev, 1)) {
+				dev_put(dev);
+				return -EADDRNOTAVAIL;
+			}
+			dev_put(dev);
+		}
+	}
 
 	/* rdma_bind_addr will only succeed for IB & iWARP devices */
-	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+	ret = rdma_bind_addr(cm_id, sa);
 	/* due to this, we will claim to support iWARP devices unless we
 	   check node_type. */
 	if (ret || !cm_id->device ||
 	    cm_id->device->node_type != RDMA_NODE_IB_CA)
 		ret = -EADDRNOTAVAIL;
 
-	rdsdebug("addr %pI6c ret %d node type %d\n",
-		 addr, ret,
+	rdsdebug("addr %pI6c%%%u ret %d node type %d\n",
+		 addr, scope_id, ret,
 		 cm_id->device ? cm_id->device->node_type : -1);
 
 	rdma_destroy_id(cm_id);
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 31ffa70..03279f3 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -40,7 +40,6 @@
 #include "rds_single_path.h"
 #include "rds.h"
 #include "ib.h"
-#include "tcp.h"
 
 /*
  * Set the selected protocol version
@@ -679,7 +678,7 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6)
 	return version;
 }
 
-/* Given an IPv6 address, find the IB net_device which hosts that address and
+/* Given an IPv6 address, find the net_device which hosts that address and
  * return its index.  This is used by the rds_ib_cm_handle_connect() code to
  * find the interface index of where an incoming request comes from when
  * the request is using a link local address.
@@ -696,8 +695,7 @@ static u32 __rds_find_ifindex(struct net *net, const struct in6_addr *addr)
 
 	rcu_read_lock();
 	for_each_netdev_rcu(net, dev) {
-		if (dev->type == ARPHRD_INFINIBAND &&
-		    ipv6_chk_addr(net, addr, dev, 0)) {
+		if (ipv6_chk_addr(net, addr, dev, 0)) {
 			idx = dev->ifindex;
 			break;
 		}
@@ -887,7 +885,10 @@ int rds_ib_conn_path_connect(struct rds_conn_path *cp)
 
 	/* XXX I wonder what affect the port space has */
 	/* delegate cm event handler to rdma_transport */
-	handler = rds_rdma_cm_event_handler;
+	if (conn->c_isv6)
+		handler = rds6_rdma_cm_event_handler;
+	else
+		handler = rds_rdma_cm_event_handler;
 	ic->i_cm_id = rdma_create_id(&init_net, handler, conn,
 				     RDMA_PS_TCP, IB_QPT_RC);
 	if (IS_ERR(ic->i_cm_id)) {
@@ -923,7 +924,7 @@ int rds_ib_conn_path_connect(struct rds_conn_path *cp)
 		sin6 = (struct sockaddr_in6 *)&dest;
 		sin6->sin6_family = AF_INET6;
 		sin6->sin6_addr = conn->c_faddr;
-		sin6->sin6_port = (__force u16)htons(RDS_TCP_PORT);
+		sin6->sin6_port = (__force u16)htons(RDS_CM_PORT);
 		sin6->sin6_scope_id = conn->c_dev_if;
 	}
 
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index d7da115..6a696b8 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -37,7 +37,9 @@
 #include "rdma_transport.h"
 #include "ib.h"
 
+/* Global IPv4 and IPv6 RDS RDMA listener cm_id */
 static struct rdma_cm_id *rds_rdma_listen_id;
+static struct rdma_cm_id *rds6_rdma_listen_id;
 
 int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
 				  struct rdma_cm_event *event,
@@ -153,6 +155,12 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
 	return rds_rdma_cm_event_handler_cmn(cm_id, event, false);
 }
 
+int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+			       struct rdma_cm_event *event)
+{
+	return rds_rdma_cm_event_handler_cmn(cm_id, event, true);
+}
+
 static int rds_rdma_listen_init_common(rdma_cm_event_handler handler,
 				       struct sockaddr *sa,
 				       struct rdma_cm_id **ret_cm_id)
@@ -199,13 +207,14 @@ static int rds_rdma_listen_init_common(rdma_cm_event_handler handler,
 
 /* Initialize the RDS RDMA listeners.  We create two listeners for
  * compatibility reason.  The one on RDS_PORT is used for IPv4
- * requests only.  The one on RDS_TCP_PORT is used for IPv6 requests
+ * requests only.  The one on RDS_CM_PORT is used for IPv6 requests
  * only.  So only IPv6 enabled RDS module will communicate using this
  * port.
  */
 static int rds_rdma_listen_init(void)
 {
 	int ret;
+	struct sockaddr_in6 sin6;
 	struct sockaddr_in sin;
 
 	sin.sin_family = PF_INET;
@@ -214,7 +223,21 @@ static int rds_rdma_listen_init(void)
 	ret = rds_rdma_listen_init_common(rds_rdma_cm_event_handler,
 					  (struct sockaddr *)&sin,
 					  &rds_rdma_listen_id);
-	return ret;
+	if (ret != 0)
+		return ret;
+
+	sin6.sin6_family = PF_INET6;
+	sin6.sin6_addr = in6addr_any;
+	sin6.sin6_port = htons(RDS_CM_PORT);
+	sin6.sin6_scope_id = 0;
+	sin6.sin6_flowinfo = 0;
+	ret = rds_rdma_listen_init_common(rds6_rdma_cm_event_handler,
+					  (struct sockaddr *)&sin6,
+					  &rds6_rdma_listen_id);
+	/* Keep going even when IPv6 is not enabled in the system. */
+	if (ret != 0)
+		rdsdebug("Cannot set up IPv6 RDMA listener\n");
+	return 0;
 }
 
 static void rds_rdma_listen_stop(void)
@@ -224,6 +247,11 @@ static void rds_rdma_listen_stop(void)
 		rdma_destroy_id(rds_rdma_listen_id);
 		rds_rdma_listen_id = NULL;
 	}
+	if (rds6_rdma_listen_id) {
+		rdsdebug("cm %p\n", rds6_rdma_listen_id);
+		rdma_destroy_id(rds6_rdma_listen_id);
+		rds6_rdma_listen_id = NULL;
+	}
 }
 
 static int rds_rdma_init(void)
diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h
index d309c44..bc3c639 100644
--- a/net/rds/rdma_transport.h
+++ b/net/rds/rdma_transport.h
@@ -11,6 +11,8 @@
 int rds_rdma_conn_connect(struct rds_connection *conn);
 int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
 			      struct rdma_cm_event *event);
+int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+			       struct rdma_cm_event *event);
 
 /* from ib.c */
 extern struct rds_transport rds_ib_transport;
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 859808a..f5f99d1 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -24,13 +24,15 @@
 #define RDS_PROTOCOL_MINOR(v)	((v) & 255)
 #define RDS_PROTOCOL(maj, min)	(((maj) << 8) | min)
 
-/*
- * XXX randomly chosen, but at least seems to be unused:
- * #               18464-18768 Unassigned
- * We should do better.  We want a reserved port to discourage unpriv'ed
- * userspace from listening.
+/* The following ports, 16385, 18634, 18635, are registered with IANA as
+ * the ports to be used for RDS over TCP and UDP.  18634 is the historical
+ * value used for the RDMA_CM listener port.  RDS/TCP uses port 16385.  After
+ * IPv6 work, RDMA_CM also uses 16385 as the listener port.  18634 is kept
+ * to ensure compatibility with older RDS modules.
  */
 #define RDS_PORT	18634
+#define RDS_CM_PORT	16385
+#define RDS_TCP_PORT	RDS_CM_PORT
 
 #ifdef ATOMIC64_INIT
 #define KERNEL_HAS_ATOMIC64
diff --git a/net/rds/send.c b/net/rds/send.c
index cc91860..3bc806b 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1105,8 +1105,27 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 			break;
 
 		case sizeof(*sin6): {
-			ret = -EPROTONOSUPPORT;
-			goto out;
+			int addr_type;
+
+			if (sin6->sin6_family != AF_INET6) {
+				ret = -EINVAL;
+				goto out;
+			}
+			addr_type = ipv6_addr_type(&sin6->sin6_addr);
+			if (!(addr_type & IPV6_ADDR_UNICAST)) {
+				ret = -EINVAL;
+				goto out;
+			}
+			if (addr_type & IPV6_ADDR_LINKLOCAL &&
+			    sin6->sin6_scope_id == 0) {
+				ret = -EINVAL;
+				goto out;
+			}
+
+			daddr = sin6->sin6_addr;
+			dport = sin6->sin6_port;
+			scope_id = sin6->sin6_scope_id;
+			break;
 		}
 
 		default:
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index beaff17..fb0dac1 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -46,7 +46,12 @@
 /* only for info exporting */
 static DEFINE_SPINLOCK(rds_tcp_tc_list_lock);
 static LIST_HEAD(rds_tcp_tc_list);
+
+/* rds_tcp_tc_count counts only IPv4 connections.
+ * rds6_tcp_tc_count counts both IPv4 and IPv6 connections.
+ */
 static unsigned int rds_tcp_tc_count;
+static unsigned int rds6_tcp_tc_count;
 
 /* Track rds_tcp_connection structs so they can be cleaned up */
 static DEFINE_SPINLOCK(rds_tcp_conn_lock);
@@ -113,7 +118,9 @@ void rds_tcp_restore_callbacks(struct socket *sock,
 	/* done under the callback_lock to serialize with write_space */
 	spin_lock(&rds_tcp_tc_list_lock);
 	list_del_init(&tc->t_list_item);
-	rds_tcp_tc_count--;
+	rds6_tcp_tc_count--;
+	if (!tc->t_cpath->cp_conn->c_isv6)
+		rds_tcp_tc_count--;
 	spin_unlock(&rds_tcp_tc_list_lock);
 
 	tc->t_sock = NULL;
@@ -200,7 +207,9 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp)
 	/* done under the callback_lock to serialize with write_space */
 	spin_lock(&rds_tcp_tc_list_lock);
 	list_add_tail(&tc->t_list_item, &rds_tcp_tc_list);
-	rds_tcp_tc_count++;
+	rds6_tcp_tc_count++;
+	if (!tc->t_cpath->cp_conn->c_isv6)
+		rds_tcp_tc_count++;
 	spin_unlock(&rds_tcp_tc_list_lock);
 
 	/* accepted sockets need our listen data ready undone */
@@ -221,6 +230,9 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp)
 	write_unlock_bh(&sock->sk->sk_callback_lock);
 }
 
+/* Handle RDS_INFO_TCP_SOCKETS socket option.  It only returns IPv4
+ * connections for backward compatibility.
+ */
 static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
 			    struct rds_info_iterator *iter,
 			    struct rds_info_lengths *lens)
@@ -228,8 +240,6 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
 	struct rds_info_tcp_socket tsinfo;
 	struct rds_tcp_connection *tc;
 	unsigned long flags;
-	struct sockaddr_in sin;
-	struct socket *sock;
 
 	spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
 
@@ -237,16 +247,15 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
 		goto out;
 
 	list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
+		struct inet_sock *inet = inet_sk(tc->t_sock->sk);
 
-		sock = tc->t_sock;
-		if (sock) {
-			sock->ops->getname(sock, (struct sockaddr *)&sin, 0);
-			tsinfo.local_addr = sin.sin_addr.s_addr;
-			tsinfo.local_port = sin.sin_port;
-			sock->ops->getname(sock, (struct sockaddr *)&sin, 1);
-			tsinfo.peer_addr = sin.sin_addr.s_addr;
-			tsinfo.peer_port = sin.sin_port;
-		}
+		if (tc->t_cpath->cp_conn->c_isv6)
+			continue;
+
+		tsinfo.local_addr = inet->inet_saddr;
+		tsinfo.local_port = inet->inet_sport;
+		tsinfo.peer_addr = inet->inet_daddr;
+		tsinfo.peer_port = inet->inet_dport;
 
 		tsinfo.hdr_rem = tc->t_tinc_hdr_rem;
 		tsinfo.data_rem = tc->t_tinc_data_rem;
@@ -495,13 +504,18 @@ static __net_init int rds_tcp_init_net(struct net *net)
 		err = -ENOMEM;
 		goto fail;
 	}
-	rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net);
+	rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, true);
 	if (!rtn->rds_tcp_listen_sock) {
-		pr_warn("could not set up listen sock\n");
-		unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
-		rtn->rds_tcp_sysctl = NULL;
-		err = -EAFNOSUPPORT;
-		goto fail;
+		pr_warn("could not set up IPv6 listen sock\n");
+
+		/* Try IPv4 as some systems disable IPv6 */
+		rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, false);
+		if (!rtn->rds_tcp_listen_sock) {
+			unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
+			rtn->rds_tcp_sysctl = NULL;
+			err = -EAFNOSUPPORT;
+			goto fail;
+		}
 	}
 	INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker);
 	return 0;
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index c6fa080..6a948c1 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -2,8 +2,6 @@
 #ifndef _RDS_TCP_H
 #define _RDS_TCP_H
 
-#define RDS_TCP_PORT	16385
-
 struct rds_tcp_incoming {
 	struct rds_incoming	ti_inc;
 	struct sk_buff_head	ti_skb_list;
@@ -67,7 +65,7 @@ void rds_tcp_restore_callbacks(struct socket *sock,
 void rds_tcp_state_change(struct sock *sk);
 
 /* tcp_listen.c */
-struct socket *rds_tcp_listen_init(struct net *);
+struct socket *rds_tcp_listen_init(struct net *net, bool isv6);
 void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor);
 void rds_tcp_listen_data_ready(struct sock *sk);
 int rds_tcp_accept_one(struct socket *sock);
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index 0101033..039bd04 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -89,9 +89,11 @@ void rds_tcp_state_change(struct sock *sk)
 int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
 {
 	struct socket *sock = NULL;
+	struct sockaddr_in6 sin6;
 	struct sockaddr_in sin;
 	struct sockaddr *addr;
 	int addrlen;
+	bool isv6;
 	int ret;
 	struct rds_connection *conn = cp->cp_conn;
 	struct rds_tcp_connection *tc = cp->cp_transport_data;
@@ -108,18 +110,36 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
 		mutex_unlock(&tc->t_conn_path_lock);
 		return 0;
 	}
-	ret = sock_create_kern(rds_conn_net(conn), PF_INET,
-			       SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (ipv6_addr_v4mapped(&conn->c_laddr)) {
+		ret = sock_create_kern(rds_conn_net(conn), PF_INET,
+				       SOCK_STREAM, IPPROTO_TCP, &sock);
+		isv6 = false;
+	} else {
+		ret = sock_create_kern(rds_conn_net(conn), PF_INET6,
+				       SOCK_STREAM, IPPROTO_TCP, &sock);
+		isv6 = true;
+	}
+
 	if (ret < 0)
 		goto out;
 
 	rds_tcp_tune(sock);
 
-	sin.sin_family = AF_INET;
-	sin.sin_addr.s_addr = (__force u32)conn->c_laddr.s6_addr32[3];
-	sin.sin_port = (__force u16)htons(0);
-	addr = (struct sockaddr *)&sin;
-	addrlen = sizeof(sin);
+	if (isv6) {
+		sin6.sin6_family = AF_INET6;
+		sin6.sin6_addr = conn->c_laddr;
+		sin6.sin6_port = 0;
+		sin6.sin6_flowinfo = 0;
+		sin6.sin6_scope_id = conn->c_dev_if;
+		addr = (struct sockaddr *)&sin6;
+		addrlen = sizeof(sin6);
+	} else {
+		sin.sin_family = AF_INET;
+		sin.sin_addr.s_addr = (__force u32)conn->c_laddr.s6_addr32[3];
+		sin.sin_port = (__force u16)htons(0);
+		addr = (struct sockaddr *)&sin;
+		addrlen = sizeof(sin);
+	}
 
 	ret = sock->ops->bind(sock, addr, addrlen);
 	if (ret) {
@@ -128,11 +148,21 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
 		goto out;
 	}
 
-	sin.sin_family = AF_INET;
-	sin.sin_addr.s_addr = (__force u32)conn->c_faddr.s6_addr32[3];
-	sin.sin_port = (__force u16)htons(RDS_TCP_PORT);
-	addr = (struct sockaddr *)&sin;
-	addrlen = sizeof(sin);
+	if (isv6) {
+		sin6.sin6_family = AF_INET6;
+		sin6.sin6_addr = conn->c_faddr;
+		sin6.sin6_port = htons(RDS_TCP_PORT);
+		sin6.sin6_flowinfo = 0;
+		sin6.sin6_scope_id = conn->c_dev_if;
+		addr = (struct sockaddr *)&sin6;
+		addrlen = sizeof(sin6);
+	} else {
+		sin.sin_family = AF_INET;
+		sin.sin_addr.s_addr = (__force u32)conn->c_faddr.s6_addr32[3];
+		sin.sin_port = (__force u16)htons(RDS_TCP_PORT);
+		addr = (struct sockaddr *)&sin;
+		addrlen = sizeof(sin);
+	}
 
 	/*
 	 * once we call connect() we can start getting callbacks and they
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 4fdf5b3..0f996e4 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -256,15 +256,22 @@ void rds_tcp_listen_data_ready(struct sock *sk)
 		ready(sk);
 }
 
-struct socket *rds_tcp_listen_init(struct net *net)
+struct socket *rds_tcp_listen_init(struct net *net, bool isv6)
 {
-	struct sockaddr_in sin;
 	struct socket *sock = NULL;
+	struct sockaddr_storage ss;
+	struct sockaddr_in6 *sin6;
+	struct sockaddr_in *sin;
+	int addr_len;
 	int ret;
 
-	ret = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
-	if (ret < 0)
+	ret = sock_create_kern(net, isv6 ? PF_INET6 : PF_INET, SOCK_STREAM,
+			       IPPROTO_TCP, &sock);
+	if (ret < 0) {
+		rdsdebug("could not create %s listener socket: %d\n",
+			 isv6 ? "IPv6" : "IPv4", ret);
 		goto out;
+	}
 
 	sock->sk->sk_reuse = SK_CAN_REUSE;
 	rds_tcp_nonagle(sock);
@@ -274,13 +281,28 @@ struct socket *rds_tcp_listen_init(struct net *net)
 	sock->sk->sk_data_ready = rds_tcp_listen_data_ready;
 	write_unlock_bh(&sock->sk->sk_callback_lock);
 
-	sin.sin_family = PF_INET;
-	sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
-	sin.sin_port = (__force u16)htons(RDS_TCP_PORT);
+	if (isv6) {
+		sin6 = (struct sockaddr_in6 *)&ss;
+		sin6->sin6_family = PF_INET6;
+		sin6->sin6_addr = in6addr_any;
+		sin6->sin6_port = (__force u16)htons(RDS_TCP_PORT);
+		sin6->sin6_scope_id = 0;
+		sin6->sin6_flowinfo = 0;
+		addr_len = sizeof(*sin6);
+	} else {
+		sin = (struct sockaddr_in *)&ss;
+		sin->sin_family = PF_INET;
+		sin->sin_addr.s_addr = INADDR_ANY;
+		sin->sin_port = (__force u16)htons(RDS_TCP_PORT);
+		addr_len = sizeof(*sin);
+	}
 
-	ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
-	if (ret < 0)
+	ret = sock->ops->bind(sock, (struct sockaddr *)&ss, addr_len);
+	if (ret < 0) {
+		rdsdebug("could not bind %s listener socket: %d\n",
+			 isv6 ? "IPv6" : "IPv4", ret);
 		goto out;
+	}
 
 	ret = sock->ops->listen(sock, 64);
 	if (ret < 0)
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next 1/3] rds: Changing IP address internal representation to struct in6_addr
From: Ka-Cheong Poon @ 2018-06-25 10:38 UTC (permalink / raw)
  To: netdev; +Cc: santosh.shilimkar, davem, rds-devel
In-Reply-To: <cover.1529922794.git.ka-cheong.poon@oracle.com>

This patch changes the internal representation of an IP address to use
struct in6_addr.  IPv4 address is stored as an IPv4 mapped address.
All the functions which take an IP address as argument are also
changed to use struct in6_addr.  But RDS socket layer is not modified
such that it still does not accept IPv6 address from an application.
And RDS layer does not accept nor initiate IPv6 connections.

Signed-off-by: Ka-Cheong Poon <ka-cheong.poon@oracle.com>
---
 net/rds/af_rds.c         | 138 ++++++++++++++++------
 net/rds/bind.c           |  91 +++++++++-----
 net/rds/cong.c           |  23 ++--
 net/rds/connection.c     | 132 +++++++++++++--------
 net/rds/ib.c             |  17 +--
 net/rds/ib.h             |  45 +++++--
 net/rds/ib_cm.c          | 300 +++++++++++++++++++++++++++++++++++------------
 net/rds/ib_rdma.c        |  15 +--
 net/rds/ib_recv.c        |  18 +--
 net/rds/ib_send.c        |  10 +-
 net/rds/loop.c           |   7 +-
 net/rds/rdma.c           |   6 +-
 net/rds/rdma_transport.c |  56 ++++++---
 net/rds/rds.h            |  69 +++++++----
 net/rds/recv.c           |  51 +++++---
 net/rds/send.c           |  67 ++++++++---
 net/rds/tcp.c            |  33 +++++-
 net/rds/tcp_connect.c    |  34 +++---
 net/rds/tcp_listen.c     |  18 +--
 net/rds/tcp_recv.c       |   9 +-
 net/rds/tcp_send.c       |   4 +-
 net/rds/threads.c        |  69 +++++++++--
 net/rds/transport.c      |  15 ++-
 23 files changed, 858 insertions(+), 369 deletions(-)

diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index ab751a1..75241ed 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -35,6 +35,7 @@
 #include <linux/kernel.h>
 #include <linux/gfp.h>
 #include <linux/in.h>
+#include <linux/ipv6.h>
 #include <linux/poll.h>
 #include <net/sock.h>
 
@@ -113,26 +114,63 @@ void rds_wake_sk_sleep(struct rds_sock *rs)
 static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
 		       int peer)
 {
-	struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
 	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
-
-	memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+	struct sockaddr_in6 *sin6;
+	struct sockaddr_in *sin;
+	int uaddr_len;
 
 	/* racey, don't care */
 	if (peer) {
-		if (!rs->rs_conn_addr)
+		if (ipv6_addr_any(&rs->rs_conn_addr))
 			return -ENOTCONN;
 
-		sin->sin_port = rs->rs_conn_port;
-		sin->sin_addr.s_addr = rs->rs_conn_addr;
+		if (ipv6_addr_v4mapped(&rs->rs_conn_addr)) {
+			sin = (struct sockaddr_in *)uaddr;
+			memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+			sin->sin_family = AF_INET;
+			sin->sin_port = rs->rs_conn_port;
+			sin->sin_addr.s_addr = rs->rs_conn_addr_v4;
+			uaddr_len = sizeof(*sin);
+		} else {
+			sin6 = (struct sockaddr_in6 *)uaddr;
+			sin6->sin6_family = AF_INET6;
+			sin6->sin6_port = rs->rs_conn_port;
+			sin6->sin6_addr = rs->rs_conn_addr;
+			sin6->sin6_flowinfo = 0;
+			/* scope_id is the same as in the bound address. */
+			sin6->sin6_scope_id = rs->rs_bound_scope_id;
+			uaddr_len = sizeof(*sin6);
+		}
 	} else {
-		sin->sin_port = rs->rs_bound_port;
-		sin->sin_addr.s_addr = rs->rs_bound_addr;
+		/* If socket is not yet bound, set the return address family
+		 * to be AF_UNSPEC (value 0) and the address size to be that
+		 * of an IPv4 address.
+		 */
+		if (ipv6_addr_any(&rs->rs_bound_addr)) {
+			sin = (struct sockaddr_in *)uaddr;
+			memset(sin, 0, sizeof(*sin));
+			sin->sin_family = AF_UNSPEC;
+			return sizeof(*sin);
+		}
+		if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) {
+			sin = (struct sockaddr_in *)uaddr;
+			memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+			sin->sin_family = AF_INET;
+			sin->sin_port = rs->rs_bound_port;
+			sin->sin_addr.s_addr = rs->rs_bound_addr_v4;
+			uaddr_len = sizeof(*sin);
+		} else {
+			sin6 = (struct sockaddr_in6 *)uaddr;
+			sin6->sin6_family = AF_INET6;
+			sin6->sin6_port = rs->rs_bound_port;
+			sin6->sin6_addr = rs->rs_bound_addr;
+			sin6->sin6_flowinfo = 0;
+			sin6->sin6_scope_id = rs->rs_bound_scope_id;
+			uaddr_len = sizeof(*sin6);
+		}
 	}
 
-	sin->sin_family = AF_INET;
-
-	return sizeof(*sin);
+	return uaddr_len;
 }
 
 /*
@@ -203,11 +241,12 @@ static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
 			      int len)
 {
+	struct sockaddr_in6 sin6;
 	struct sockaddr_in sin;
 	int ret = 0;
 
 	/* racing with another thread binding seems ok here */
-	if (rs->rs_bound_addr == 0) {
+	if (ipv6_addr_any(&rs->rs_bound_addr)) {
 		ret = -ENOTCONN; /* XXX not a great errno */
 		goto out;
 	}
@@ -215,14 +254,23 @@ static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
 	if (len < sizeof(struct sockaddr_in)) {
 		ret = -EINVAL;
 		goto out;
+	} else if (len < sizeof(struct sockaddr_in6)) {
+		/* Assume IPv4 */
+		if (copy_from_user(&sin, optval, sizeof(struct sockaddr_in))) {
+			ret = -EFAULT;
+			goto out;
+		}
+		ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr);
+		sin6.sin6_port = sin.sin_port;
+	} else {
+		if (copy_from_user(&sin6, optval,
+				   sizeof(struct sockaddr_in6))) {
+			ret = -EFAULT;
+			goto out;
+		}
 	}
 
-	if (copy_from_user(&sin, optval, sizeof(sin))) {
-		ret = -EFAULT;
-		goto out;
-	}
-
-	rds_send_drop_to(rs, &sin);
+	rds_send_drop_to(rs, &sin6);
 out:
 	return ret;
 }
@@ -435,31 +483,41 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
 		       int addr_len, int flags)
 {
 	struct sock *sk = sock->sk;
-	struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+	struct sockaddr_in *sin;
 	struct rds_sock *rs = rds_sk_to_rs(sk);
 	int ret = 0;
 
 	lock_sock(sk);
 
-	if (addr_len != sizeof(struct sockaddr_in)) {
-		ret = -EINVAL;
-		goto out;
-	}
+	switch (addr_len) {
+	case sizeof(struct sockaddr_in):
+		sin = (struct sockaddr_in *)uaddr;
+		if (sin->sin_family != AF_INET) {
+			ret = -EAFNOSUPPORT;
+			break;
+		}
+		if (sin->sin_addr.s_addr == INADDR_ANY) {
+			ret = -EDESTADDRREQ;
+			break;
+		}
+		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) ||
+		    sin->sin_addr.s_addr == INADDR_BROADCAST) {
+			ret = -EINVAL;
+			break;
+		}
+		ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &rs->rs_conn_addr);
+		rs->rs_conn_port = sin->sin_port;
+		break;
 
-	if (sin->sin_family != AF_INET) {
-		ret = -EAFNOSUPPORT;
-		goto out;
-	}
+	case sizeof(struct sockaddr_in6):
+		ret = -EPROTONOSUPPORT;
+		break;
 
-	if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
-		ret = -EDESTADDRREQ;
-		goto out;
+	default:
+		ret = -EINVAL;
+		break;
 	}
 
-	rs->rs_conn_addr = sin->sin_addr.s_addr;
-	rs->rs_conn_port = sin->sin_port;
-
-out:
 	release_sock(sk);
 	return ret;
 }
@@ -578,8 +636,10 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len,
 		list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
 			total++;
 			if (total <= len)
-				rds_inc_info_copy(inc, iter, inc->i_saddr,
-						  rs->rs_bound_addr, 1);
+				rds_inc_info_copy(inc, iter,
+						  inc->i_saddr.s6_addr32[3],
+						  rs->rs_bound_addr_v4,
+						  1);
 		}
 
 		read_unlock(&rs->rs_recv_lock);
@@ -608,8 +668,8 @@ static void rds_sock_info(struct socket *sock, unsigned int len,
 	list_for_each_entry(rs, &rds_sock_list, rs_item) {
 		sinfo.sndbuf = rds_sk_sndbuf(rs);
 		sinfo.rcvbuf = rds_sk_rcvbuf(rs);
-		sinfo.bound_addr = rs->rs_bound_addr;
-		sinfo.connected_addr = rs->rs_conn_addr;
+		sinfo.bound_addr = rs->rs_bound_addr_v4;
+		sinfo.connected_addr = rs->rs_conn_addr_v4;
 		sinfo.bound_port = rs->rs_bound_port;
 		sinfo.connected_port = rs->rs_conn_port;
 		sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));
diff --git a/net/rds/bind.c b/net/rds/bind.c
index 5aa3a64..3a1097e 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -33,6 +33,7 @@
 #include <linux/kernel.h>
 #include <net/sock.h>
 #include <linux/in.h>
+#include <linux/ipv6.h>
 #include <linux/if_arp.h>
 #include <linux/jhash.h>
 #include <linux/ratelimit.h>
@@ -42,42 +43,58 @@
 
 static const struct rhashtable_params ht_parms = {
 	.nelem_hint = 768,
-	.key_len = sizeof(u64),
+	.key_len = RDS_BOUND_KEY_LEN,
 	.key_offset = offsetof(struct rds_sock, rs_bound_key),
 	.head_offset = offsetof(struct rds_sock, rs_bound_node),
 	.max_size = 16384,
 	.min_size = 1024,
 };
 
+/* Create a key for the bind hash table manipulation.  Port is in network byte
+ * order.
+ */
+static inline void __rds_create_bind_key(u8 *key, const struct in6_addr *addr,
+					 __be16 port, __u32 scope_id)
+{
+	memcpy(key, addr, sizeof(*addr));
+	key += sizeof(*addr);
+	memcpy(key, &port, sizeof(port));
+	key += sizeof(port);
+	memcpy(key, &scope_id, sizeof(scope_id));
+}
+
 /*
  * Return the rds_sock bound at the given local address.
  *
  * The rx path can race with rds_release.  We notice if rds_release() has
  * marked this socket and don't return a rs ref to the rx path.
  */
-struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
+struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port,
+				__u32 scope_id)
 {
-	u64 key = ((u64)addr << 32) | port;
+	u8 key[RDS_BOUND_KEY_LEN];
 	struct rds_sock *rs;
 
-	rs = rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms);
+	__rds_create_bind_key(key, addr, port, scope_id);
+	rs = rhashtable_lookup_fast(&bind_hash_table, key, ht_parms);
 	if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
 		rds_sock_addref(rs);
 	else
 		rs = NULL;
 
-	rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
-		ntohs(port));
+	rdsdebug("returning rs %p for %pI6c:%u\n", rs, addr,
+		 ntohs(port));
 
 	return rs;
 }
 
 /* returns -ve errno or +ve port */
-static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
+static int rds_add_bound(struct rds_sock *rs, const struct in6_addr *addr,
+			 __be16 *port, __u32 scope_id)
 {
 	int ret = -EADDRINUSE;
 	u16 rover, last;
-	u64 key;
+	u8 key[RDS_BOUND_KEY_LEN];
 
 	if (*port != 0) {
 		rover = be16_to_cpu(*port);
@@ -95,12 +112,13 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
 
 		if (rover == RDS_FLAG_PROBE_PORT)
 			continue;
-		key = ((u64)addr << 32) | cpu_to_be16(rover);
-		if (rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms))
+		__rds_create_bind_key(key, addr, cpu_to_be16(rover),
+				      scope_id);
+		if (rhashtable_lookup_fast(&bind_hash_table, key, ht_parms))
 			continue;
 
-		rs->rs_bound_key = key;
-		rs->rs_bound_addr = addr;
+		memcpy(rs->rs_bound_key, key, sizeof(rs->rs_bound_key));
+		rs->rs_bound_addr = *addr;
 		net_get_random_once(&rs->rs_hash_initval,
 				    sizeof(rs->rs_hash_initval));
 		rs->rs_bound_port = cpu_to_be16(rover);
@@ -114,7 +132,7 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
 			  rs, &addr, (int)ntohs(*port));
 			break;
 		} else {
-			rs->rs_bound_addr = 0;
+			rs->rs_bound_addr = in6addr_any;
 			rds_sock_put(rs);
 			ret = -ENOMEM;
 			break;
@@ -127,44 +145,61 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
 void rds_remove_bound(struct rds_sock *rs)
 {
 
-	if (!rs->rs_bound_addr)
+	if (ipv6_addr_any(&rs->rs_bound_addr))
 		return;
 
-	rdsdebug("rs %p unbinding from %pI4:%d\n",
+	rdsdebug("rs %p unbinding from %pI6c:%d\n",
 		 rs, &rs->rs_bound_addr,
 		 ntohs(rs->rs_bound_port));
 
 	rhashtable_remove_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms);
 	rds_sock_put(rs);
-	rs->rs_bound_addr = 0;
+	rs->rs_bound_addr = in6addr_any;
 }
 
 int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 {
 	struct sock *sk = sock->sk;
-	struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
 	struct rds_sock *rs = rds_sk_to_rs(sk);
+	struct in6_addr v6addr, *binding_addr;
 	struct rds_transport *trans;
+	__u32 scope_id = 0;
 	int ret = 0;
+	__be16 port;
 
+	/* We only allow an RDS socket to be bound to and IPv4 address. IPv6
+	 * address support will be added later.
+	 */
+	if (addr_len == sizeof(struct sockaddr_in)) {
+		struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+
+		if (sin->sin_family != AF_INET ||
+		    sin->sin_addr.s_addr == INADDR_ANY)
+			return -EINVAL;
+		ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &v6addr);
+		binding_addr = &v6addr;
+		port = sin->sin_port;
+	} else if (addr_len == sizeof(struct sockaddr_in6)) {
+		return -EPROTONOSUPPORT;
+	} else {
+		return -EINVAL;
+	}
 	lock_sock(sk);
 
-	if (addr_len != sizeof(struct sockaddr_in) ||
-	    sin->sin_family != AF_INET ||
-	    rs->rs_bound_addr ||
-	    sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
+	/* RDS socket does not allow re-binding. */
+	if (!ipv6_addr_any(&rs->rs_bound_addr)) {
 		ret = -EINVAL;
 		goto out;
 	}
 
-	ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port);
+	ret = rds_add_bound(rs, binding_addr, &port, scope_id);
 	if (ret)
 		goto out;
 
 	if (rs->rs_transport) { /* previously bound */
 		trans = rs->rs_transport;
 		if (trans->laddr_check(sock_net(sock->sk),
-				       sin->sin_addr.s_addr) != 0) {
+				       binding_addr, scope_id) != 0) {
 			ret = -ENOPROTOOPT;
 			rds_remove_bound(rs);
 		} else {
@@ -172,13 +207,13 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		}
 		goto out;
 	}
-	trans = rds_trans_get_preferred(sock_net(sock->sk),
-					sin->sin_addr.s_addr);
+	trans = rds_trans_get_preferred(sock_net(sock->sk), binding_addr,
+					scope_id);
 	if (!trans) {
 		ret = -EADDRNOTAVAIL;
 		rds_remove_bound(rs);
-		pr_info_ratelimited("RDS: %s could not find a transport for %pI4, load rds_tcp or rds_rdma?\n",
-				    __func__, &sin->sin_addr.s_addr);
+		pr_info_ratelimited("RDS: %s could not find a transport for %pI6c, load rds_tcp or rds_rdma?\n",
+				    __func__, binding_addr);
 		goto out;
 	}
 
diff --git a/net/rds/cong.c b/net/rds/cong.c
index 63da9d2..ccdff09 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007 Oracle.  All rights reserved.
+ * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -101,7 +101,7 @@
 static DEFINE_SPINLOCK(rds_cong_lock);
 static struct rb_root rds_cong_tree = RB_ROOT;
 
-static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
+static struct rds_cong_map *rds_cong_tree_walk(const struct in6_addr *addr,
 					       struct rds_cong_map *insert)
 {
 	struct rb_node **p = &rds_cong_tree.rb_node;
@@ -109,12 +109,15 @@ static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
 	struct rds_cong_map *map;
 
 	while (*p) {
+		int diff;
+
 		parent = *p;
 		map = rb_entry(parent, struct rds_cong_map, m_rb_node);
 
-		if (addr < map->m_addr)
+		diff = rds_addr_cmp(addr, &map->m_addr);
+		if (diff < 0)
 			p = &(*p)->rb_left;
-		else if (addr > map->m_addr)
+		else if (diff > 0)
 			p = &(*p)->rb_right;
 		else
 			return map;
@@ -132,7 +135,7 @@ static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
  * these bitmaps in the process getting pointers to them.  The bitmaps are only
  * ever freed as the module is removed after all connections have been freed.
  */
-static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
+static struct rds_cong_map *rds_cong_from_addr(const struct in6_addr *addr)
 {
 	struct rds_cong_map *map;
 	struct rds_cong_map *ret = NULL;
@@ -144,7 +147,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
 	if (!map)
 		return NULL;
 
-	map->m_addr = addr;
+	map->m_addr = *addr;
 	init_waitqueue_head(&map->m_waitq);
 	INIT_LIST_HEAD(&map->m_conn_list);
 
@@ -171,7 +174,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
 		kfree(map);
 	}
 
-	rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr));
+	rdsdebug("map %p for addr %pI6c\n", ret, addr);
 
 	return ret;
 }
@@ -202,8 +205,8 @@ void rds_cong_remove_conn(struct rds_connection *conn)
 
 int rds_cong_get_maps(struct rds_connection *conn)
 {
-	conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
-	conn->c_fcong = rds_cong_from_addr(conn->c_faddr);
+	conn->c_lcong = rds_cong_from_addr(&conn->c_laddr);
+	conn->c_fcong = rds_cong_from_addr(&conn->c_faddr);
 
 	if (!(conn->c_lcong && conn->c_fcong))
 		return -ENOMEM;
@@ -353,7 +356,7 @@ void rds_cong_remove_socket(struct rds_sock *rs)
 
 	/* update congestion map for now-closed port */
 	spin_lock_irqsave(&rds_cong_lock, flags);
-	map = rds_cong_tree_walk(rs->rs_bound_addr, NULL);
+	map = rds_cong_tree_walk(&rs->rs_bound_addr, NULL);
 	spin_unlock_irqrestore(&rds_cong_lock, flags);
 
 	if (map && rds_cong_test_bit(map, rs->rs_bound_port)) {
diff --git a/net/rds/connection.c b/net/rds/connection.c
index abef75d..ca72563 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -34,7 +34,8 @@
 #include <linux/list.h>
 #include <linux/slab.h>
 #include <linux/export.h>
-#include <net/inet_hashtables.h>
+#include <net/ipv6.h>
+#include <net/inet6_hashtables.h>
 
 #include "rds.h"
 #include "loop.h"
@@ -49,18 +50,21 @@
 static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES];
 static struct kmem_cache *rds_conn_slab;
 
-static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
+static struct hlist_head *rds_conn_bucket(const struct in6_addr *laddr,
+					  const struct in6_addr *faddr)
 {
+	static u32 rds6_hash_secret __read_mostly;
 	static u32 rds_hash_secret __read_mostly;
 
-	unsigned long hash;
+	u32 lhash, fhash, hash;
 
 	net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret));
+	net_get_random_once(&rds6_hash_secret, sizeof(rds6_hash_secret));
+
+	lhash = (__force u32)laddr->s6_addr32[3];
+	fhash = __ipv6_addr_jhash(faddr, rds6_hash_secret);
+	hash = __inet6_ehashfn(lhash, 0, fhash, 0, rds_hash_secret);
 
-	/* Pass NULL, don't need struct net for hash */
-	hash = __inet_ehashfn(be32_to_cpu(laddr), 0,
-			      be32_to_cpu(faddr), 0,
-			      rds_hash_secret);
 	return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK];
 }
 
@@ -72,20 +76,25 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
 /* rcu read lock must be held or the connection spinlock */
 static struct rds_connection *rds_conn_lookup(struct net *net,
 					      struct hlist_head *head,
-					      __be32 laddr, __be32 faddr,
-					      struct rds_transport *trans)
+					      const struct in6_addr *laddr,
+					      const struct in6_addr *faddr,
+					      struct rds_transport *trans,
+					      int dev_if)
 {
 	struct rds_connection *conn, *ret = NULL;
 
 	hlist_for_each_entry_rcu(conn, head, c_hash_node) {
-		if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
-		    conn->c_trans == trans && net == rds_conn_net(conn)) {
+		if (ipv6_addr_equal(&conn->c_faddr, faddr) &&
+		    ipv6_addr_equal(&conn->c_laddr, laddr) &&
+		    conn->c_trans == trans &&
+		    net == rds_conn_net(conn) &&
+		    conn->c_dev_if == dev_if) {
 			ret = conn;
 			break;
 		}
 	}
-	rdsdebug("returning conn %p for %pI4 -> %pI4\n", ret,
-		 &laddr, &faddr);
+	rdsdebug("returning conn %p for %pI6c -> %pI6c\n", ret,
+		 laddr, faddr);
 	return ret;
 }
 
@@ -99,8 +108,8 @@ static void rds_conn_path_reset(struct rds_conn_path *cp)
 {
 	struct rds_connection *conn = cp->cp_conn;
 
-	rdsdebug("connection %pI4 to %pI4 reset\n",
-	  &conn->c_laddr, &conn->c_faddr);
+	rdsdebug("connection %pI6c to %pI6c reset\n",
+		 &conn->c_laddr, &conn->c_faddr);
 
 	rds_stats_inc(s_conn_reset);
 	rds_send_path_reset(cp);
@@ -142,9 +151,12 @@ static void __rds_conn_path_init(struct rds_connection *conn,
  * are torn down as the module is removed, if ever.
  */
 static struct rds_connection *__rds_conn_create(struct net *net,
-						__be32 laddr, __be32 faddr,
-				       struct rds_transport *trans, gfp_t gfp,
-				       int is_outgoing)
+						const struct in6_addr *laddr,
+						const struct in6_addr *faddr,
+						struct rds_transport *trans,
+						gfp_t gfp,
+						int is_outgoing,
+						int dev_if)
 {
 	struct rds_connection *conn, *parent = NULL;
 	struct hlist_head *head = rds_conn_bucket(laddr, faddr);
@@ -154,9 +166,12 @@ static struct rds_connection *__rds_conn_create(struct net *net,
 	int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1);
 
 	rcu_read_lock();
-	conn = rds_conn_lookup(net, head, laddr, faddr, trans);
-	if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
-	    laddr == faddr && !is_outgoing) {
+	conn = rds_conn_lookup(net, head, laddr, faddr, trans, dev_if);
+	if (conn &&
+	    conn->c_loopback &&
+	    conn->c_trans != &rds_loop_transport &&
+	    ipv6_addr_equal(laddr, faddr) &&
+	    !is_outgoing) {
 		/* This is a looped back IB connection, and we're
 		 * called by the code handling the incoming connect.
 		 * We need a second connection object into which we
@@ -181,8 +196,10 @@ static struct rds_connection *__rds_conn_create(struct net *net,
 	}
 
 	INIT_HLIST_NODE(&conn->c_hash_node);
-	conn->c_laddr = laddr;
-	conn->c_faddr = faddr;
+	conn->c_laddr = *laddr;
+	conn->c_isv6 = !ipv6_addr_v4mapped(laddr);
+	conn->c_faddr = *faddr;
+	conn->c_dev_if = dev_if;
 
 	rds_conn_net_set(conn, net);
 
@@ -199,7 +216,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
 	 * can bind to the destination address then we'd rather the messages
 	 * flow through loopback rather than either transport.
 	 */
-	loop_trans = rds_trans_get_preferred(net, faddr);
+	loop_trans = rds_trans_get_preferred(net, faddr, conn->c_dev_if);
 	if (loop_trans) {
 		rds_trans_put(loop_trans);
 		conn->c_loopback = 1;
@@ -233,10 +250,10 @@ static struct rds_connection *__rds_conn_create(struct net *net,
 		goto out;
 	}
 
-	rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n",
-	  conn, &laddr, &faddr,
-	  strnlen(trans->t_name, sizeof(trans->t_name)) ? trans->t_name :
-	  "[unknown]", is_outgoing ? "(outgoing)" : "");
+	rdsdebug("allocated conn %p for %pI6c -> %pI6c over %s %s\n",
+		 conn, laddr, faddr,
+		 strnlen(trans->t_name, sizeof(trans->t_name)) ?
+		 trans->t_name : "[unknown]", is_outgoing ? "(outgoing)" : "");
 
 	/*
 	 * Since we ran without holding the conn lock, someone could
@@ -262,7 +279,8 @@ static struct rds_connection *__rds_conn_create(struct net *net,
 		/* Creating normal conn */
 		struct rds_connection *found;
 
-		found = rds_conn_lookup(net, head, laddr, faddr, trans);
+		found = rds_conn_lookup(net, head, laddr, faddr, trans,
+					dev_if);
 		if (found) {
 			struct rds_conn_path *cp;
 			int i;
@@ -295,18 +313,22 @@ static struct rds_connection *__rds_conn_create(struct net *net,
 }
 
 struct rds_connection *rds_conn_create(struct net *net,
-				       __be32 laddr, __be32 faddr,
-				       struct rds_transport *trans, gfp_t gfp)
+				       const struct in6_addr *laddr,
+				       const struct in6_addr *faddr,
+				       struct rds_transport *trans, gfp_t gfp,
+				       int dev_if)
 {
-	return __rds_conn_create(net, laddr, faddr, trans, gfp, 0);
+	return __rds_conn_create(net, laddr, faddr, trans, gfp, 0, dev_if);
 }
 EXPORT_SYMBOL_GPL(rds_conn_create);
 
 struct rds_connection *rds_conn_create_outgoing(struct net *net,
-						__be32 laddr, __be32 faddr,
-				       struct rds_transport *trans, gfp_t gfp)
+						const struct in6_addr *laddr,
+						const struct in6_addr *faddr,
+						struct rds_transport *trans,
+						gfp_t gfp, int dev_if)
 {
-	return __rds_conn_create(net, laddr, faddr, trans, gfp, 1);
+	return __rds_conn_create(net, laddr, faddr, trans, gfp, 1, dev_if);
 }
 EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
 
@@ -502,12 +524,17 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
 
 				/* XXX too lazy to maintain counts.. */
 				list_for_each_entry(rm, list, m_conn_item) {
+					__be32 laddr;
+					__be32 faddr;
+
 					total++;
+					laddr = conn->c_laddr.s6_addr32[3];
+					faddr = conn->c_faddr.s6_addr32[3];
 					if (total <= len)
 						rds_inc_info_copy(&rm->m_inc,
 								  iter,
-								  conn->c_laddr,
-								  conn->c_faddr,
+								  laddr,
+								  faddr,
 								  0);
 				}
 
@@ -584,7 +611,6 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
 	struct hlist_head *head;
 	struct rds_connection *conn;
 	size_t i;
-	int j;
 
 	rcu_read_lock();
 
@@ -595,17 +621,20 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
 	     i++, head++) {
 		hlist_for_each_entry_rcu(conn, head, c_hash_node) {
 			struct rds_conn_path *cp;
-			int npaths;
 
-			npaths = (conn->c_trans->t_mp_capable ?
-				 RDS_MPATH_WORKERS : 1);
-			for (j = 0; j < npaths; j++) {
-				cp = &conn->c_path[j];
+			/* XXX We only copy the information from the first
+			 * path for now.  The problem is that if there are
+			 * more than one underlying paths, we cannot report
+			 * information of all of them using the exisitng
+			 * API.  For example, there is only one next_tx_seq,
+			 * which path's next_tx_seq should we report?  It is
+			 * a bug in the design of MPRDS.
+			 */
+			cp = conn->c_path;
 
-				/* XXX no cp_lock usage.. */
-				if (!visitor(cp, buffer))
-					continue;
-			}
+			/* XXX no cp_lock usage.. */
+			if (!visitor(cp, buffer))
+				continue;
 
 			/* We copy as much as we can fit in the buffer,
 			 * but we count all items so that the caller
@@ -624,12 +653,13 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
 static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
 {
 	struct rds_info_connection *cinfo = buffer;
+	struct rds_connection *conn = cp->cp_conn;
 
 	cinfo->next_tx_seq = cp->cp_next_tx_seq;
 	cinfo->next_rx_seq = cp->cp_next_rx_seq;
-	cinfo->laddr = cp->cp_conn->c_laddr;
-	cinfo->faddr = cp->cp_conn->c_faddr;
-	strncpy(cinfo->transport, cp->cp_conn->c_trans->t_name,
+	cinfo->laddr = conn->c_laddr.s6_addr32[3];
+	cinfo->faddr = conn->c_faddr.s6_addr32[3];
+	strncpy(cinfo->transport, conn->c_trans->t_name,
 		sizeof(cinfo->transport));
 	cinfo->flags = 0;
 
diff --git a/net/rds/ib.c b/net/rds/ib.c
index b6ad38e..c712a84 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -296,8 +296,8 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
 	if (conn->c_trans != &rds_ib_transport)
 		return 0;
 
-	iinfo->src_addr = conn->c_laddr;
-	iinfo->dst_addr = conn->c_faddr;
+	iinfo->src_addr = conn->c_laddr.s6_addr32[3];
+	iinfo->dst_addr = conn->c_faddr.s6_addr32[3];
 
 	memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
 	memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
@@ -341,7 +341,8 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len,
  * allowed to influence which paths have priority.  We could call userspace
  * asserting this policy "routing".
  */
-static int rds_ib_laddr_check(struct net *net, __be32 addr)
+static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr,
+			      __u32 scope_id)
 {
 	int ret;
 	struct rdma_cm_id *cm_id;
@@ -357,7 +358,7 @@ static int rds_ib_laddr_check(struct net *net, __be32 addr)
 
 	memset(&sin, 0, sizeof(sin));
 	sin.sin_family = AF_INET;
-	sin.sin_addr.s_addr = addr;
+	sin.sin_addr.s_addr = addr->s6_addr32[3];
 
 	/* rdma_bind_addr will only succeed for IB & iWARP devices */
 	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
@@ -367,9 +368,9 @@ static int rds_ib_laddr_check(struct net *net, __be32 addr)
 	    cm_id->device->node_type != RDMA_NODE_IB_CA)
 		ret = -EADDRNOTAVAIL;
 
-	rdsdebug("addr %pI4 ret %d node type %d\n",
-		&addr, ret,
-		cm_id->device ? cm_id->device->node_type : -1);
+	rdsdebug("addr %pI6c ret %d node type %d\n",
+		 addr, ret,
+		 cm_id->device ? cm_id->device->node_type : -1);
 
 	rdma_destroy_id(cm_id);
 
diff --git a/net/rds/ib.h b/net/rds/ib.h
index a6f4d7d..12f96b3 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -57,16 +57,38 @@ struct rds_ib_refill_cache {
 	struct list_head	 *ready;
 };
 
+struct rds_ib_conn_priv_cmn {
+	u8			ricpc_protocol_major;
+	u8			ricpc_protocol_minor;
+	__be16			ricpc_protocol_minor_mask;	/* bitmask */
+	__be32			ricpc_reserved1;
+	__be64			ricpc_ack_seq;
+	__be32			ricpc_credit;	/* non-zero enables flow ctl */
+};
+
 struct rds_ib_connect_private {
 	/* Add new fields at the end, and don't permute existing fields. */
-	__be32			dp_saddr;
-	__be32			dp_daddr;
-	u8			dp_protocol_major;
-	u8			dp_protocol_minor;
-	__be16			dp_protocol_minor_mask; /* bitmask */
-	__be32			dp_reserved1;
-	__be64			dp_ack_seq;
-	__be32			dp_credit;		/* non-zero enables flow ctl */
+	__be32				dp_saddr;
+	__be32				dp_daddr;
+	struct rds_ib_conn_priv_cmn	dp_cmn;
+};
+
+struct rds6_ib_connect_private {
+	/* Add new fields at the end, and don't permute existing fields. */
+	struct in6_addr			dp_saddr;
+	struct in6_addr			dp_daddr;
+	struct rds_ib_conn_priv_cmn	dp_cmn;
+};
+
+#define dp_protocol_major	dp_cmn.ricpc_protocol_major
+#define dp_protocol_minor	dp_cmn.ricpc_protocol_minor
+#define dp_protocol_minor_mask	dp_cmn.ricpc_protocol_minor_mask
+#define dp_ack_seq		dp_cmn.ricpc_ack_seq
+#define dp_credit		dp_cmn.ricpc_credit
+
+union rds_ib_conn_priv {
+	struct rds_ib_connect_private	ricp_v4;
+	struct rds6_ib_connect_private	ricp_v6;
 };
 
 struct rds_ib_send_work {
@@ -351,8 +373,8 @@ static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev,
 __printf(2, 3)
 void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
 int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
-			     struct rdma_cm_event *event);
-int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id);
+			     struct rdma_cm_event *event, bool isv6);
+int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6);
 void rds_ib_cm_connect_complete(struct rds_connection *conn,
 				struct rdma_cm_event *event);
 
@@ -361,7 +383,8 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn,
 	__rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt)
 
 /* ib_rdma.c */
-int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
+int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev,
+			 struct in6_addr *ipaddr);
 void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
 void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
 void rds_ib_destroy_nodev_conns(void);
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index f1684ae..31ffa70 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -35,10 +35,12 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/ratelimit.h>
+#include <net/addrconf.h>
 
 #include "rds_single_path.h"
 #include "rds.h"
 #include "ib.h"
+#include "tcp.h"
 
 /*
  * Set the selected protocol version
@@ -95,25 +97,45 @@ static void rds_ib_set_flow_control(struct rds_connection *conn, u32 credits)
  */
 void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
 {
-	const struct rds_ib_connect_private *dp = NULL;
 	struct rds_ib_connection *ic = conn->c_transport_data;
+	const union rds_ib_conn_priv *dp = NULL;
 	struct ib_qp_attr qp_attr;
+	__be64 ack_seq = 0;
+	__be32 credit = 0;
+	u8 major = 0;
+	u8 minor = 0;
 	int err;
 
-	if (event->param.conn.private_data_len >= sizeof(*dp)) {
-		dp = event->param.conn.private_data;
-
-		/* make sure it isn't empty data */
-		if (dp->dp_protocol_major) {
-			rds_ib_set_protocol(conn,
-				RDS_PROTOCOL(dp->dp_protocol_major,
-				dp->dp_protocol_minor));
-			rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+	dp = event->param.conn.private_data;
+	if (conn->c_isv6) {
+		if (event->param.conn.private_data_len >=
+		    sizeof(struct rds6_ib_connect_private)) {
+			major = dp->ricp_v6.dp_protocol_major;
+			minor = dp->ricp_v6.dp_protocol_minor;
+			credit = dp->ricp_v6.dp_credit;
+			/* dp structure start is not guaranteed to be 8 bytes
+			 * aligned.  Since dp_ack_seq is 64-bit extended load
+			 * operations can be used so go through get_unaligned
+			 * to avoid unaligned errors.
+			 */
+			ack_seq = get_unaligned(&dp->ricp_v6.dp_ack_seq);
 		}
+	} else if (event->param.conn.private_data_len >=
+		   sizeof(struct rds_ib_connect_private)) {
+		major = dp->ricp_v4.dp_protocol_major;
+		minor = dp->ricp_v4.dp_protocol_minor;
+		credit = dp->ricp_v4.dp_credit;
+		ack_seq = get_unaligned(&dp->ricp_v4.dp_ack_seq);
+	}
+
+	/* make sure it isn't empty data */
+	if (major) {
+		rds_ib_set_protocol(conn, RDS_PROTOCOL(major, minor));
+		rds_ib_set_flow_control(conn, be32_to_cpu(credit));
 	}
 
 	if (conn->c_version < RDS_PROTOCOL(3, 1)) {
-		pr_notice("RDS/IB: Connection <%pI4,%pI4> version %u.%u no longer supported\n",
+		pr_notice("RDS/IB: Connection <%pI6c,%pI6c> version %u.%u no longer supported\n",
 			  &conn->c_laddr, &conn->c_faddr,
 			  RDS_PROTOCOL_MAJOR(conn->c_version),
 			  RDS_PROTOCOL_MINOR(conn->c_version));
@@ -121,7 +143,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 		rds_conn_destroy(conn);
 		return;
 	} else {
-		pr_notice("RDS/IB: %s conn connected <%pI4,%pI4> version %u.%u%s\n",
+		pr_notice("RDS/IB: %s conn connected <%pI6c,%pI6c> version %u.%u%s\n",
 			  ic->i_active_side ? "Active" : "Passive",
 			  &conn->c_laddr, &conn->c_faddr,
 			  RDS_PROTOCOL_MAJOR(conn->c_version),
@@ -150,7 +172,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 		printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
 
 	/* update ib_device with this local ipaddr */
-	err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr);
+	err = rds_ib_update_ipaddr(ic->rds_ibdev, &conn->c_laddr);
 	if (err)
 		printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
 			err);
@@ -158,14 +180,8 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 	/* If the peer gave us the last packet it saw, process this as if
 	 * we had received a regular ACK. */
 	if (dp) {
-		/* dp structure start is not guaranteed to be 8 bytes aligned.
-		 * Since dp_ack_seq is 64-bit extended load operations can be
-		 * used so go through get_unaligned to avoid unaligned errors.
-		 */
-		__be64 dp_ack_seq = get_unaligned(&dp->dp_ack_seq);
-
-		if (dp_ack_seq)
-			rds_send_drop_acked(conn, be64_to_cpu(dp_ack_seq),
+		if (ack_seq)
+			rds_send_drop_acked(conn, be64_to_cpu(ack_seq),
 					    NULL);
 	}
 
@@ -173,11 +189,12 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 }
 
 static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
-			struct rdma_conn_param *conn_param,
-			struct rds_ib_connect_private *dp,
-			u32 protocol_version,
-			u32 max_responder_resources,
-			u32 max_initiator_depth)
+				      struct rdma_conn_param *conn_param,
+				      union rds_ib_conn_priv *dp,
+				      u32 protocol_version,
+				      u32 max_responder_resources,
+				      u32 max_initiator_depth,
+				      bool isv6)
 {
 	struct rds_ib_connection *ic = conn->c_transport_data;
 	struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
@@ -193,24 +210,49 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
 
 	if (dp) {
 		memset(dp, 0, sizeof(*dp));
-		dp->dp_saddr = conn->c_laddr;
-		dp->dp_daddr = conn->c_faddr;
-		dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
-		dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
-		dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
-		dp->dp_ack_seq = cpu_to_be64(rds_ib_piggyb_ack(ic));
+		if (isv6) {
+			dp->ricp_v6.dp_saddr = conn->c_laddr;
+			dp->ricp_v6.dp_daddr = conn->c_faddr;
+			dp->ricp_v6.dp_protocol_major =
+			    RDS_PROTOCOL_MAJOR(protocol_version);
+			dp->ricp_v6.dp_protocol_minor =
+			    RDS_PROTOCOL_MINOR(protocol_version);
+			dp->ricp_v6.dp_protocol_minor_mask =
+			    cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
+			dp->ricp_v6.dp_ack_seq =
+			    cpu_to_be64(rds_ib_piggyb_ack(ic));
+
+			conn_param->private_data = &dp->ricp_v6;
+			conn_param->private_data_len = sizeof(dp->ricp_v6);
+		} else {
+			dp->ricp_v4.dp_saddr = conn->c_laddr.s6_addr32[3];
+			dp->ricp_v4.dp_daddr = conn->c_faddr.s6_addr32[3];
+			dp->ricp_v4.dp_protocol_major =
+			    RDS_PROTOCOL_MAJOR(protocol_version);
+			dp->ricp_v4.dp_protocol_minor =
+			    RDS_PROTOCOL_MINOR(protocol_version);
+			dp->ricp_v4.dp_protocol_minor_mask =
+			    cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
+			dp->ricp_v4.dp_ack_seq =
+			    cpu_to_be64(rds_ib_piggyb_ack(ic));
+
+			conn_param->private_data = &dp->ricp_v4;
+			conn_param->private_data_len = sizeof(dp->ricp_v4);
+		}
 
 		/* Advertise flow control */
 		if (ic->i_flowctl) {
 			unsigned int credits;
 
-			credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
-			dp->dp_credit = cpu_to_be32(credits);
-			atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits);
+			credits = IB_GET_POST_CREDITS
+				(atomic_read(&ic->i_credits));
+			if (isv6)
+				dp->ricp_v6.dp_credit = cpu_to_be32(credits);
+			else
+				dp->ricp_v4.dp_credit = cpu_to_be32(credits);
+			atomic_sub(IB_SET_POST_CREDITS(credits),
+				   &ic->i_credits);
 		}
-
-		conn_param->private_data = dp;
-		conn_param->private_data_len = sizeof(*dp);
 	}
 }
 
@@ -349,7 +391,7 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
 		break;
 	default:
 		rdsdebug("Fatal QP Event %u (%s) "
-			"- connection %pI4->%pI4, reconnecting\n",
+			"- connection %pI6c->%pI6c, reconnecting\n",
 			event->event, ib_event_msg(event->event),
 			&conn->c_laddr, &conn->c_faddr);
 		rds_conn_drop(conn);
@@ -580,11 +622,13 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	return ret;
 }
 
-static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event)
+static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6)
 {
-	const struct rds_ib_connect_private *dp = event->param.conn.private_data;
-	u16 common;
+	const union rds_ib_conn_priv *dp = event->param.conn.private_data;
+	u8 data_len, major, minor;
 	u32 version = 0;
+	__be16 mask;
+	u16 common;
 
 	/*
 	 * rdma_cm private data is odd - when there is any private data in the
@@ -603,51 +647,126 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event)
 		return 0;
 	}
 
+	if (isv6) {
+		data_len = sizeof(struct rds6_ib_connect_private);
+		major = dp->ricp_v6.dp_protocol_major;
+		minor = dp->ricp_v6.dp_protocol_minor;
+		mask = dp->ricp_v6.dp_protocol_minor_mask;
+	} else {
+		data_len = sizeof(struct rds_ib_connect_private);
+		major = dp->ricp_v4.dp_protocol_major;
+		minor = dp->ricp_v4.dp_protocol_minor;
+		mask = dp->ricp_v4.dp_protocol_minor_mask;
+	}
+
 	/* Even if len is crap *now* I still want to check it. -ASG */
-	if (event->param.conn.private_data_len < sizeof (*dp) ||
-	    dp->dp_protocol_major == 0)
+	if (event->param.conn.private_data_len < data_len || major == 0)
 		return RDS_PROTOCOL_3_0;
 
-	common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS;
-	if (dp->dp_protocol_major == 3 && common) {
+	common = be16_to_cpu(mask) & RDS_IB_SUPPORTED_PROTOCOLS;
+	if (major == 3 && common) {
 		version = RDS_PROTOCOL_3_0;
 		while ((common >>= 1) != 0)
 			version++;
-	} else
-		printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n",
-				&dp->dp_saddr,
-				dp->dp_protocol_major,
-				dp->dp_protocol_minor);
+	} else {
+		if (isv6)
+			printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI6c using incompatible protocol version %u.%u\n",
+					   &dp->ricp_v6.dp_saddr, major, minor);
+		else
+			printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n",
+					   &dp->ricp_v4.dp_saddr, major, minor);
+	}
 	return version;
 }
 
+/* Given an IPv6 address, find the IB net_device which hosts that address and
+ * return its index.  This is used by the rds_ib_cm_handle_connect() code to
+ * find the interface index of where an incoming request comes from when
+ * the request is using a link local address.
+ *
+ * Note one problem in this search.  It is possible that two interfaces have
+ * the same link local address.  Unfortunately, this cannot be solved unless
+ * the underlying layer gives us the interface which an incoming RDMA connect
+ * request comes from.
+ */
+static u32 __rds_find_ifindex(struct net *net, const struct in6_addr *addr)
+{
+	struct net_device *dev;
+	int idx = 0;
+
+	rcu_read_lock();
+	for_each_netdev_rcu(net, dev) {
+		if (dev->type == ARPHRD_INFINIBAND &&
+		    ipv6_chk_addr(net, addr, dev, 0)) {
+			idx = dev->ifindex;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return idx;
+}
+
 int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
-				    struct rdma_cm_event *event)
+			     struct rdma_cm_event *event, bool isv6)
 {
 	__be64 lguid = cm_id->route.path_rec->sgid.global.interface_id;
 	__be64 fguid = cm_id->route.path_rec->dgid.global.interface_id;
-	const struct rds_ib_connect_private *dp = event->param.conn.private_data;
-	struct rds_ib_connect_private dp_rep;
+	const struct rds_ib_conn_priv_cmn *dp_cmn;
 	struct rds_connection *conn = NULL;
 	struct rds_ib_connection *ic = NULL;
 	struct rdma_conn_param conn_param;
+	const union rds_ib_conn_priv *dp;
+	union rds_ib_conn_priv dp_rep;
+	struct in6_addr s_mapped_addr;
+	struct in6_addr d_mapped_addr;
+	const struct in6_addr *saddr6;
+	const struct in6_addr *daddr6;
+	int destroy = 1;
+	u32 ifindex = 0;
 	u32 version;
-	int err = 1, destroy = 1;
+	int err = 1;
 
 	/* Check whether the remote protocol version matches ours. */
-	version = rds_ib_protocol_compatible(event);
+	version = rds_ib_protocol_compatible(event, isv6);
 	if (!version)
 		goto out;
 
-	rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid "
-		 "0x%llx\n", &dp->dp_saddr, &dp->dp_daddr,
+	dp = event->param.conn.private_data;
+	if (isv6) {
+		dp_cmn = &dp->ricp_v6.dp_cmn;
+		saddr6 = &dp->ricp_v6.dp_saddr;
+		daddr6 = &dp->ricp_v6.dp_daddr;
+		/* If the local address is link local, need to find the
+		 * interface index in order to create a proper RDS
+		 * connection.
+		 */
+		if (ipv6_addr_type(daddr6) & IPV6_ADDR_LINKLOCAL) {
+			/* Using init_net for now ..  */
+			ifindex = __rds_find_ifindex(&init_net, daddr6);
+			/* No index found...  Need to bail out. */
+			if (ifindex == 0) {
+				err = -EOPNOTSUPP;
+				goto out;
+			}
+		}
+	} else {
+		dp_cmn = &dp->ricp_v4.dp_cmn;
+		ipv6_addr_set_v4mapped(dp->ricp_v4.dp_saddr, &s_mapped_addr);
+		ipv6_addr_set_v4mapped(dp->ricp_v4.dp_daddr, &d_mapped_addr);
+		saddr6 = &s_mapped_addr;
+		daddr6 = &d_mapped_addr;
+	}
+
+	rdsdebug("saddr %pI6c daddr %pI6c RDSv%u.%u lguid 0x%llx fguid "
+		 "0x%llx\n", saddr6, daddr6,
 		 RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version),
 		 (unsigned long long)be64_to_cpu(lguid),
 		 (unsigned long long)be64_to_cpu(fguid));
 
 	/* RDS/IB is not currently netns aware, thus init_net */
-	conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr,
-			       &rds_ib_transport, GFP_KERNEL);
+	conn = rds_conn_create(&init_net, daddr6, saddr6,
+			       &rds_ib_transport, GFP_KERNEL, ifindex);
 	if (IS_ERR(conn)) {
 		rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
 		conn = NULL;
@@ -678,12 +797,13 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 	ic = conn->c_transport_data;
 
 	rds_ib_set_protocol(conn, version);
-	rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+	rds_ib_set_flow_control(conn, be32_to_cpu(dp_cmn->ricpc_credit));
 
 	/* If the peer gave us the last packet it saw, process this as if
 	 * we had received a regular ACK. */
-	if (dp->dp_ack_seq)
-		rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
+	if (dp_cmn->ricpc_ack_seq)
+		rds_send_drop_acked(conn, be64_to_cpu(dp_cmn->ricpc_ack_seq),
+				    NULL);
 
 	BUG_ON(cm_id->context);
 	BUG_ON(ic->i_cm_id);
@@ -702,8 +822,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 	}
 
 	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
-		event->param.conn.responder_resources,
-		event->param.conn.initiator_depth);
+				  event->param.conn.responder_resources,
+				  event->param.conn.initiator_depth, isv6);
 
 	/* rdma_accept() calls rdma_reject() internally if it fails */
 	if (rdma_accept(cm_id, &conn_param))
@@ -718,12 +838,12 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 }
 
 
-int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
+int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
 {
 	struct rds_connection *conn = cm_id->context;
 	struct rds_ib_connection *ic = conn->c_transport_data;
 	struct rdma_conn_param conn_param;
-	struct rds_ib_connect_private dp;
+	union rds_ib_conn_priv dp;
 	int ret;
 
 	/* If the peer doesn't do protocol negotiation, we must
@@ -738,7 +858,7 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
 	}
 
 	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION,
-		UINT_MAX, UINT_MAX);
+				  UINT_MAX, UINT_MAX, isv6);
 	ret = rdma_connect(cm_id, &conn_param);
 	if (ret)
 		rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
@@ -758,13 +878,17 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
 int rds_ib_conn_path_connect(struct rds_conn_path *cp)
 {
 	struct rds_connection *conn = cp->cp_conn;
-	struct rds_ib_connection *ic = conn->c_transport_data;
-	struct sockaddr_in src, dest;
+	struct sockaddr_storage src, dest;
+	rdma_cm_event_handler handler;
+	struct rds_ib_connection *ic;
 	int ret;
 
+	ic = conn->c_transport_data;
+
 	/* XXX I wonder what affect the port space has */
 	/* delegate cm event handler to rdma_transport */
-	ic->i_cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, conn,
+	handler = rds_rdma_cm_event_handler;
+	ic->i_cm_id = rdma_create_id(&init_net, handler, conn,
 				     RDMA_PS_TCP, IB_QPT_RC);
 	if (IS_ERR(ic->i_cm_id)) {
 		ret = PTR_ERR(ic->i_cm_id);
@@ -775,13 +899,33 @@ int rds_ib_conn_path_connect(struct rds_conn_path *cp)
 
 	rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
 
-	src.sin_family = AF_INET;
-	src.sin_addr.s_addr = (__force u32)conn->c_laddr;
-	src.sin_port = (__force u16)htons(0);
+	if (ipv6_addr_v4mapped(&conn->c_faddr)) {
+		struct sockaddr_in *sin;
+
+		sin = (struct sockaddr_in *)&src;
+		sin->sin_family = AF_INET;
+		sin->sin_addr.s_addr = conn->c_laddr.s6_addr32[3];
+		sin->sin_port = (__force u16)htons(0);
 
-	dest.sin_family = AF_INET;
-	dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
-	dest.sin_port = (__force u16)htons(RDS_PORT);
+		sin = (struct sockaddr_in *)&dest;
+		sin->sin_family = AF_INET;
+		sin->sin_addr.s_addr = conn->c_faddr.s6_addr32[3];
+		sin->sin_port = (__force u16)htons(RDS_PORT);
+	} else {
+		struct sockaddr_in6 *sin6;
+
+		sin6 = (struct sockaddr_in6 *)&src;
+		sin6->sin6_family = AF_INET6;
+		sin6->sin6_addr = conn->c_laddr;
+		sin6->sin6_port = (__force u16)htons(0);
+		sin6->sin6_scope_id = conn->c_dev_if;
+
+		sin6 = (struct sockaddr_in6 *)&dest;
+		sin6->sin6_family = AF_INET6;
+		sin6->sin6_addr = conn->c_faddr;
+		sin6->sin6_port = (__force u16)htons(RDS_TCP_PORT);
+		sin6->sin6_scope_id = conn->c_dev_if;
+	}
 
 	ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
 				(struct sockaddr *)&dest,
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index e678699..0ec9df0 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -100,18 +100,19 @@ static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
 		kfree_rcu(to_free, rcu);
 }
 
-int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
+int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev,
+			 struct in6_addr *ipaddr)
 {
 	struct rds_ib_device *rds_ibdev_old;
 
-	rds_ibdev_old = rds_ib_get_device(ipaddr);
+	rds_ibdev_old = rds_ib_get_device(ipaddr->s6_addr32[3]);
 	if (!rds_ibdev_old)
-		return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
+		return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]);
 
 	if (rds_ibdev_old != rds_ibdev) {
-		rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
+		rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr->s6_addr32[3]);
 		rds_ib_dev_put(rds_ibdev_old);
-		return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
+		return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]);
 	}
 	rds_ib_dev_put(rds_ibdev_old);
 
@@ -544,7 +545,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
 	struct rds_ib_connection *ic = rs->rs_conn->c_transport_data;
 	int ret;
 
-	rds_ibdev = rds_ib_get_device(rs->rs_bound_addr);
+	rds_ibdev = rds_ib_get_device(rs->rs_bound_addr.s6_addr32[3]);
 	if (!rds_ibdev) {
 		ret = -ENODEV;
 		goto out;
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index b4e421a..f101bed 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -266,7 +266,7 @@ static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *i
 		rds_ib_stats_inc(s_ib_rx_total_incs);
 	}
 	INIT_LIST_HEAD(&ibinc->ii_frags);
-	rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
+	rds_inc_init(&ibinc->ii_inc, ic->conn, &ic->conn->c_faddr);
 
 	return ibinc;
 }
@@ -420,7 +420,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
 		ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
 		if (ret) {
 			rds_ib_conn_error(conn, "recv post on "
-			       "%pI4 returned %d, disconnecting and "
+			       "%pI6c returned %d, disconnecting and "
 			       "reconnecting\n", &conn->c_faddr,
 			       ret);
 			break;
@@ -850,7 +850,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
 
 	if (data_len < sizeof(struct rds_header)) {
 		rds_ib_conn_error(conn, "incoming message "
-		       "from %pI4 didn't include a "
+		       "from %pI6c didn't include a "
 		       "header, disconnecting and "
 		       "reconnecting\n",
 		       &conn->c_faddr);
@@ -863,7 +863,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
 	/* Validate the checksum. */
 	if (!rds_message_verify_checksum(ihdr)) {
 		rds_ib_conn_error(conn, "incoming message "
-		       "from %pI4 has corrupted header - "
+		       "from %pI6c has corrupted header - "
 		       "forcing a reconnect\n",
 		       &conn->c_faddr);
 		rds_stats_inc(s_recv_drop_bad_checksum);
@@ -943,10 +943,10 @@ static void rds_ib_process_recv(struct rds_connection *conn,
 		ic->i_recv_data_rem = 0;
 		ic->i_ibinc = NULL;
 
-		if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
+		if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) {
 			rds_ib_cong_recv(conn, ibinc);
-		else {
-			rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
+		} else {
+			rds_recv_incoming(conn, &conn->c_faddr, &conn->c_laddr,
 					  &ibinc->ii_inc, GFP_ATOMIC);
 			state->ack_next = be64_to_cpu(hdr->h_sequence);
 			state->ack_next_valid = 1;
@@ -990,7 +990,7 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
 	} else {
 		/* We expect errors as the qp is drained during shutdown */
 		if (rds_conn_up(conn) || rds_conn_connecting(conn))
-			rds_ib_conn_error(conn, "recv completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n",
+			rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c> had status %u (%s), disconnecting and reconnecting\n",
 					  &conn->c_laddr, &conn->c_faddr,
 					  wc->status,
 					  ib_wc_status_msg(wc->status));
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 8557a1c..c4cdfe49 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -305,7 +305,7 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
 
 	/* We expect errors as the qp is drained during shutdown */
 	if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) {
-		rds_ib_conn_error(conn, "send completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n",
+		rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c> had status %u (%s), disconnecting and reconnecting\n",
 				  &conn->c_laddr, &conn->c_faddr, wc->status,
 				  ib_wc_status_msg(wc->status));
 	}
@@ -730,7 +730,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 		 first, &first->s_wr, ret, failed_wr);
 	BUG_ON(failed_wr != &first->s_wr);
 	if (ret) {
-		printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 "
+		printk(KERN_WARNING "RDS/IB: ib_post_send to %pI6c "
 		       "returned %d\n", &conn->c_faddr, ret);
 		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
 		rds_ib_sub_signaled(ic, nr_sig);
@@ -827,7 +827,7 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
 		 send, &send->s_atomic_wr, ret, failed_wr);
 	BUG_ON(failed_wr != &send->s_atomic_wr.wr);
 	if (ret) {
-		printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 "
+		printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI6c "
 		       "returned %d\n", &conn->c_faddr, ret);
 		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
 		rds_ib_sub_signaled(ic, nr_sig);
@@ -967,7 +967,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
 		 first, &first->s_rdma_wr.wr, ret, failed_wr);
 	BUG_ON(failed_wr != &first->s_rdma_wr.wr);
 	if (ret) {
-		printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 "
+		printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI6c "
 		       "returned %d\n", &conn->c_faddr, ret);
 		rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
 		rds_ib_sub_signaled(ic, nr_sig);
diff --git a/net/rds/loop.c b/net/rds/loop.c
index dac6218..0dca895 100644
--- a/net/rds/loop.c
+++ b/net/rds/loop.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -33,6 +33,7 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/in.h>
+#include <linux/ipv6.h>
 
 #include "rds_single_path.h"
 #include "rds.h"
@@ -75,11 +76,11 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
 
 	BUG_ON(hdr_off || sg || off);
 
-	rds_inc_init(&rm->m_inc, conn, conn->c_laddr);
+	rds_inc_init(&rm->m_inc, conn, &conn->c_laddr);
 	/* For the embedded inc. Matching put is in loop_inc_free() */
 	rds_message_addref(rm);
 
-	rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc,
+	rds_recv_incoming(conn, &conn->c_laddr, &conn->c_faddr, &rm->m_inc,
 			  GFP_KERNEL);
 
 	rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence),
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 634cfcb..7b39980 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007 Oracle.  All rights reserved.
+ * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -183,7 +183,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
 	long i;
 	int ret;
 
-	if (rs->rs_bound_addr == 0 || !rs->rs_transport) {
+	if (ipv6_addr_any(&rs->rs_bound_addr) || !rs->rs_transport) {
 		ret = -ENOTCONN; /* XXX not a great errno */
 		goto out;
 	}
@@ -574,7 +574,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 
 	args = CMSG_DATA(cmsg);
 
-	if (rs->rs_bound_addr == 0) {
+	if (ipv6_addr_any(&rs->rs_bound_addr)) {
 		ret = -ENOTCONN; /* XXX not a great errno */
 		goto out_ret;
 	}
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index fc59821..d7da115 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2009 Oracle.  All rights reserved.
+ * Copyright (c) 2009, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -39,8 +39,9 @@
 
 static struct rdma_cm_id *rds_rdma_listen_id;
 
-int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
-			      struct rdma_cm_event *event)
+int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
+				  struct rdma_cm_event *event,
+				  bool isv6)
 {
 	/* this can be null in the listening path */
 	struct rds_connection *conn = cm_id->context;
@@ -72,7 +73,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
 
 	switch (event->event) {
 	case RDMA_CM_EVENT_CONNECT_REQUEST:
-		ret = trans->cm_handle_connect(cm_id, event);
+		ret = trans->cm_handle_connect(cm_id, event, isv6);
 		break;
 
 	case RDMA_CM_EVENT_ADDR_RESOLVED:
@@ -90,7 +91,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
 
 			ibic = conn->c_transport_data;
 			if (ibic && ibic->i_cm_id == cm_id)
-				ret = trans->cm_initiate_connect(cm_id);
+				ret = trans->cm_initiate_connect(cm_id, isv6);
 			else
 				rds_conn_drop(conn);
 		}
@@ -116,14 +117,14 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
 
 	case RDMA_CM_EVENT_DISCONNECTED:
 		rdsdebug("DISCONNECT event - dropping connection "
-			"%pI4->%pI4\n", &conn->c_laddr,
+			 "%pI6c->%pI6c\n", &conn->c_laddr,
 			 &conn->c_faddr);
 		rds_conn_drop(conn);
 		break;
 
 	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
 		if (conn) {
-			pr_info("RDS: RDMA_CM_EVENT_TIMEWAIT_EXIT event: dropping connection %pI4->%pI4\n",
+			pr_info("RDS: RDMA_CM_EVENT_TIMEWAIT_EXIT event: dropping connection %pI6c->%pI6c\n",
 				&conn->c_laddr, &conn->c_faddr);
 			rds_conn_drop(conn);
 		}
@@ -146,13 +147,20 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
 	return ret;
 }
 
-static int rds_rdma_listen_init(void)
+int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+			      struct rdma_cm_event *event)
+{
+	return rds_rdma_cm_event_handler_cmn(cm_id, event, false);
+}
+
+static int rds_rdma_listen_init_common(rdma_cm_event_handler handler,
+				       struct sockaddr *sa,
+				       struct rdma_cm_id **ret_cm_id)
 {
-	struct sockaddr_in sin;
 	struct rdma_cm_id *cm_id;
 	int ret;
 
-	cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, NULL,
+	cm_id = rdma_create_id(&init_net, handler, NULL,
 			       RDMA_PS_TCP, IB_QPT_RC);
 	if (IS_ERR(cm_id)) {
 		ret = PTR_ERR(cm_id);
@@ -161,15 +169,11 @@ static int rds_rdma_listen_init(void)
 		return ret;
 	}
 
-	sin.sin_family = AF_INET;
-	sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
-	sin.sin_port = (__force u16)htons(RDS_PORT);
-
 	/*
 	 * XXX I bet this binds the cm_id to a device.  If we want to support
 	 * fail-over we'll have to take this into consideration.
 	 */
-	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+	ret = rdma_bind_addr(cm_id, sa);
 	if (ret) {
 		printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
 		       "rdma_bind_addr() returned %d\n", ret);
@@ -185,7 +189,7 @@ static int rds_rdma_listen_init(void)
 
 	rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT);
 
-	rds_rdma_listen_id = cm_id;
+	*ret_cm_id = cm_id;
 	cm_id = NULL;
 out:
 	if (cm_id)
@@ -193,6 +197,26 @@ static int rds_rdma_listen_init(void)
 	return ret;
 }
 
+/* Initialize the RDS RDMA listeners.  We create two listeners for
+ * compatibility reason.  The one on RDS_PORT is used for IPv4
+ * requests only.  The one on RDS_TCP_PORT is used for IPv6 requests
+ * only.  So only IPv6 enabled RDS module will communicate using this
+ * port.
+ */
+static int rds_rdma_listen_init(void)
+{
+	int ret;
+	struct sockaddr_in sin;
+
+	sin.sin_family = PF_INET;
+	sin.sin_addr.s_addr = htonl(INADDR_ANY);
+	sin.sin_port = htons(RDS_PORT);
+	ret = rds_rdma_listen_init_common(rds_rdma_cm_event_handler,
+					  (struct sockaddr *)&sin,
+					  &rds_rdma_listen_id);
+	return ret;
+}
+
 static void rds_rdma_listen_stop(void)
 {
 	if (rds_rdma_listen_id) {
diff --git a/net/rds/rds.h b/net/rds/rds.h
index f2272fb..859808a 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -10,6 +10,7 @@
 #include <linux/rds.h>
 #include <linux/rhashtable.h>
 #include <linux/refcount.h>
+#include <linux/in6.h>
 
 #include "info.h"
 
@@ -61,7 +62,7 @@ void rdsdebug(char *fmt, ...)
 
 struct rds_cong_map {
 	struct rb_node		m_rb_node;
-	__be32			m_addr;
+	struct in6_addr		m_addr;
 	wait_queue_head_t	m_waitq;
 	struct list_head	m_conn_list;
 	unsigned long		m_page_addrs[RDS_CONG_MAP_PAGES];
@@ -136,11 +137,13 @@ struct rds_conn_path {
 /* One rds_connection per RDS address pair */
 struct rds_connection {
 	struct hlist_node	c_hash_node;
-	__be32			c_laddr;
-	__be32			c_faddr;
+	struct in6_addr		c_laddr;
+	struct in6_addr		c_faddr;
+	int			c_dev_if; /* c_laddrs's interface index */
 	unsigned int		c_loopback:1,
+				c_isv6:1,
 				c_ping_triggered:1,
-				c_pad_to_32:30;
+				c_pad_to_32:29;
 	int			c_npaths;
 	struct rds_connection	*c_passive;
 	struct rds_transport	*c_trans;
@@ -269,7 +272,7 @@ struct rds_incoming {
 	struct rds_conn_path	*i_conn_path;
 	struct rds_header	i_hdr;
 	unsigned long		i_rx_jiffies;
-	__be32			i_saddr;
+	struct in6_addr		i_saddr;
 
 	rds_rdma_cookie_t	i_rdma_cookie;
 	struct timeval		i_rx_tstamp;
@@ -386,7 +389,7 @@ struct rds_message {
 	struct list_head	m_conn_item;
 	struct rds_incoming	m_inc;
 	u64			m_ack_seq;
-	__be32			m_daddr;
+	struct in6_addr		m_daddr;
 	unsigned long		m_flags;
 
 	/* Never access m_rs without holding m_rs_lock.
@@ -519,7 +522,8 @@ struct rds_transport {
 				t_mp_capable:1;
 	unsigned int		t_type;
 
-	int (*laddr_check)(struct net *net, __be32 addr);
+	int (*laddr_check)(struct net *net, const struct in6_addr *addr,
+			   __u32 scope_id);
 	int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
 	void (*conn_free)(void *data);
 	int (*conn_path_connect)(struct rds_conn_path *cp);
@@ -535,8 +539,8 @@ struct rds_transport {
 	void (*inc_free)(struct rds_incoming *inc);
 
 	int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
-				 struct rdma_cm_event *event);
-	int (*cm_initiate_connect)(struct rdma_cm_id *cm_id);
+				 struct rdma_cm_event *event, bool isv6);
+	int (*cm_initiate_connect)(struct rdma_cm_id *cm_id, bool isv6);
 	void (*cm_connect_complete)(struct rds_connection *conn,
 				    struct rdma_cm_event *event);
 
@@ -551,6 +555,12 @@ struct rds_transport {
 	bool (*t_unloading)(struct rds_connection *conn);
 };
 
+/* Bind hash table key length.  It is the sum of the size of a struct
+ * in6_addr, a scope_id  and a port.
+ */
+#define RDS_BOUND_KEY_LEN \
+	(sizeof(struct in6_addr) + sizeof(__u32) + sizeof(__be16))
+
 struct rds_sock {
 	struct sock		rs_sk;
 
@@ -562,10 +572,14 @@ struct rds_sock {
 	 * support.
 	 */
 	struct rhash_head	rs_bound_node;
-	u64			rs_bound_key;
-	__be32			rs_bound_addr;
-	__be32			rs_conn_addr;
-	__be16			rs_bound_port;
+	u8			rs_bound_key[RDS_BOUND_KEY_LEN];
+	struct sockaddr_in6	rs_bound_sin6;
+#define rs_bound_addr		rs_bound_sin6.sin6_addr
+#define rs_bound_addr_v4	rs_bound_sin6.sin6_addr.s6_addr32[3]
+#define rs_bound_port		rs_bound_sin6.sin6_port
+#define rs_bound_scope_id	rs_bound_sin6.sin6_scope_id
+	struct in6_addr		rs_conn_addr;
+#define rs_conn_addr_v4		rs_conn_addr.s6_addr32[3]
 	__be16			rs_conn_port;
 	struct rds_transport    *rs_transport;
 
@@ -701,7 +715,8 @@ static inline void __rds_wake_sk_sleep(struct sock *sk)
 /* bind.c */
 int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
 void rds_remove_bound(struct rds_sock *rs);
-struct rds_sock *rds_find_bound(__be32 addr, __be16 port);
+struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port,
+				__u32 scope_id);
 int rds_bind_lock_init(void);
 void rds_bind_lock_destroy(void);
 
@@ -725,11 +740,15 @@ static inline void __rds_wake_sk_sleep(struct sock *sk)
 int rds_conn_init(void);
 void rds_conn_exit(void);
 struct rds_connection *rds_conn_create(struct net *net,
-				       __be32 laddr, __be32 faddr,
-				       struct rds_transport *trans, gfp_t gfp);
+				       const struct in6_addr *laddr,
+				       const struct in6_addr *faddr,
+				       struct rds_transport *trans, gfp_t gfp,
+				       int dev_if);
 struct rds_connection *rds_conn_create_outgoing(struct net *net,
-						__be32 laddr, __be32 faddr,
-			       struct rds_transport *trans, gfp_t gfp);
+						const struct in6_addr *laddr,
+						const struct in6_addr *faddr,
+						struct rds_transport *trans,
+						gfp_t gfp, int dev_if);
 void rds_conn_shutdown(struct rds_conn_path *cpath);
 void rds_conn_destroy(struct rds_connection *conn);
 void rds_conn_drop(struct rds_connection *conn);
@@ -840,11 +859,12 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
 
 /* recv.c */
 void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
-		  __be32 saddr);
+		  struct in6_addr *saddr);
 void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *conn,
-		       __be32 saddr);
+		       struct in6_addr *saddr);
 void rds_inc_put(struct rds_incoming *inc);
-void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
+void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr,
+		       struct in6_addr *daddr,
 		       struct rds_incoming *inc, gfp_t gfp);
 int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 		int msg_flags);
@@ -859,7 +879,7 @@ void rds_inc_info_copy(struct rds_incoming *inc,
 void rds_send_path_reset(struct rds_conn_path *conn);
 int rds_send_xmit(struct rds_conn_path *cp);
 struct sockaddr_in;
-void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
+void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest);
 typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
 void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
 			 is_acked_func is_acked);
@@ -946,11 +966,14 @@ void rds_stats_info_copy(struct rds_info_iterator *iter,
 void rds_recv_worker(struct work_struct *);
 void rds_connect_path_complete(struct rds_conn_path *conn, int curr);
 void rds_connect_complete(struct rds_connection *conn);
+int rds_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2);
 
 /* transport.c */
 void rds_trans_register(struct rds_transport *trans);
 void rds_trans_unregister(struct rds_transport *trans);
-struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr);
+struct rds_transport *rds_trans_get_preferred(struct net *net,
+					      const struct in6_addr *addr,
+					      __u32 scope_id);
 void rds_trans_put(struct rds_transport *trans);
 unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
 				       unsigned int avail);
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 192ac6f..4217961 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -41,14 +41,14 @@
 #include "rds.h"
 
 void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
-		  __be32 saddr)
+		 struct in6_addr *saddr)
 {
 	int i;
 
 	refcount_set(&inc->i_refcount, 1);
 	INIT_LIST_HEAD(&inc->i_item);
 	inc->i_conn = conn;
-	inc->i_saddr = saddr;
+	inc->i_saddr = *saddr;
 	inc->i_rdma_cookie = 0;
 	inc->i_rx_tstamp.tv_sec = 0;
 	inc->i_rx_tstamp.tv_usec = 0;
@@ -59,13 +59,13 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
 EXPORT_SYMBOL_GPL(rds_inc_init);
 
 void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp,
-		       __be32 saddr)
+		       struct in6_addr  *saddr)
 {
 	refcount_set(&inc->i_refcount, 1);
 	INIT_LIST_HEAD(&inc->i_item);
 	inc->i_conn = cp->cp_conn;
 	inc->i_conn_path = cp;
-	inc->i_saddr = saddr;
+	inc->i_saddr = *saddr;
 	inc->i_rdma_cookie = 0;
 	inc->i_rx_tstamp.tv_sec = 0;
 	inc->i_rx_tstamp.tv_usec = 0;
@@ -110,7 +110,7 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
 
 	now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
 
-	rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d "
+	rdsdebug("rs %p (%pI6c:%u) recv bytes %d buf %d "
 	  "now_cong %d delta %d\n",
 	  rs, &rs->rs_bound_addr,
 	  ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
@@ -260,7 +260,7 @@ static void rds_start_mprds(struct rds_connection *conn)
 	struct rds_conn_path *cp;
 
 	if (conn->c_npaths > 1 &&
-	    IS_CANONICAL(conn->c_laddr, conn->c_faddr)) {
+	    rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) < 0) {
 		for (i = 0; i < conn->c_npaths; i++) {
 			cp = &conn->c_path[i];
 			rds_conn_path_connect_if_down(cp);
@@ -284,7 +284,8 @@ static void rds_start_mprds(struct rds_connection *conn)
  * conn.  This lets loopback, who only has one conn for both directions,
  * tell us which roles the addrs in the conn are playing for this message.
  */
-void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
+void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr,
+		       struct in6_addr *daddr,
 		       struct rds_incoming *inc, gfp_t gfp)
 {
 	struct rds_sock *rs = NULL;
@@ -339,7 +340,8 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
 
 	if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
 		if (inc->i_hdr.h_sport == 0) {
-			rdsdebug("ignore ping with 0 sport from 0x%x\n", saddr);
+			rdsdebug("ignore ping with 0 sport from %pI6c\n",
+				 saddr);
 			goto out;
 		}
 		rds_stats_inc(s_recv_ping);
@@ -362,7 +364,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
 		goto out;
 	}
 
-	rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
+	rs = rds_find_bound(daddr, inc->i_hdr.h_dport, conn->c_dev_if);
 	if (!rs) {
 		rds_stats_inc(s_recv_drop_no_sock);
 		goto out;
@@ -625,6 +627,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 	struct rds_sock *rs = rds_sk_to_rs(sk);
 	long timeo;
 	int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
+	DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
 	DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
 	struct rds_incoming *inc = NULL;
 
@@ -673,7 +676,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 			break;
 		}
 
-		rdsdebug("copying inc %p from %pI4:%u to user\n", inc,
+		rdsdebug("copying inc %p from %pI6c:%u to user\n", inc,
 			 &inc->i_conn->c_faddr,
 			 ntohs(inc->i_hdr.h_sport));
 		ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter);
@@ -707,12 +710,26 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 
 		rds_stats_inc(s_recv_delivered);
 
-		if (sin) {
-			sin->sin_family = AF_INET;
-			sin->sin_port = inc->i_hdr.h_sport;
-			sin->sin_addr.s_addr = inc->i_saddr;
-			memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
-			msg->msg_namelen = sizeof(*sin);
+		if (msg->msg_name) {
+			if (ipv6_addr_v4mapped(&inc->i_saddr)) {
+				sin = (struct sockaddr_in *)msg->msg_name;
+
+				sin->sin_family = AF_INET;
+				sin->sin_port = inc->i_hdr.h_sport;
+				sin->sin_addr.s_addr =
+				    inc->i_saddr.s6_addr32[3];
+				memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+				msg->msg_namelen = sizeof(*sin);
+			} else {
+				sin6 = (struct sockaddr_in6 *)msg->msg_name;
+
+				sin6->sin6_family = AF_INET6;
+				sin6->sin6_port = inc->i_hdr.h_sport;
+				sin6->sin6_addr = inc->i_saddr;
+				sin6->sin6_flowinfo = 0;
+				sin6->sin6_scope_id = rs->rs_bound_scope_id;
+				msg->msg_namelen = sizeof(*sin6);
+			}
 		}
 		break;
 	}
diff --git a/net/rds/send.c b/net/rds/send.c
index 94c7f74..cc91860 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -709,7 +709,7 @@ void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
 }
 EXPORT_SYMBOL_GPL(rds_send_drop_acked);
 
-void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
+void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest)
 {
 	struct rds_message *rm, *tmp;
 	struct rds_connection *conn;
@@ -721,8 +721,9 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
 	spin_lock_irqsave(&rs->rs_lock, flags);
 
 	list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
-		if (dest && (dest->sin_addr.s_addr != rm->m_daddr ||
-			     dest->sin_port != rm->m_inc.i_hdr.h_dport))
+		if (dest &&
+		    (!ipv6_addr_equal(&dest->sin6_addr, &rm->m_daddr) ||
+		     dest->sin6_port != rm->m_inc.i_hdr.h_dport))
 			continue;
 
 		list_move(&rm->m_sock_item, &list);
@@ -1059,8 +1060,8 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 {
 	struct sock *sk = sock->sk;
 	struct rds_sock *rs = rds_sk_to_rs(sk);
+	DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
 	DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
-	__be32 daddr;
 	__be16 dport;
 	struct rds_message *rm = NULL;
 	struct rds_connection *conn;
@@ -1069,10 +1070,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 	int nonblock = msg->msg_flags & MSG_DONTWAIT;
 	long timeo = sock_sndtimeo(sk, nonblock);
 	struct rds_conn_path *cpath;
+	struct in6_addr daddr;
+	__u32 scope_id = 0;
 	size_t total_payload_len = payload_len, rdma_payload_len = 0;
 	bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) &&
 		      sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY));
 	int num_sgs = ceil(payload_len, PAGE_SIZE);
+	int namelen;
 
 	/* Mirror Linux UDP mirror of BSD error message compatibility */
 	/* XXX: Perhaps MSG_MORE someday */
@@ -1081,27 +1085,59 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 		goto out;
 	}
 
-	if (msg->msg_namelen) {
-		/* XXX fail non-unicast destination IPs? */
-		if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) {
+	namelen = msg->msg_namelen;
+	if (namelen != 0) {
+		if (namelen < sizeof(*usin)) {
+			ret = -EINVAL;
+			goto out;
+		}
+		switch (namelen) {
+		case sizeof(*usin):
+			if (usin->sin_family != AF_INET ||
+			    usin->sin_addr.s_addr == INADDR_ANY ||
+			    usin->sin_addr.s_addr == INADDR_BROADCAST ||
+			    IN_MULTICAST(ntohl(usin->sin_addr.s_addr))) {
+				ret = -EINVAL;
+				goto out;
+			}
+			ipv6_addr_set_v4mapped(usin->sin_addr.s_addr, &daddr);
+			dport = usin->sin_port;
+			break;
+
+		case sizeof(*sin6): {
+			ret = -EPROTONOSUPPORT;
+			goto out;
+		}
+
+		default:
 			ret = -EINVAL;
 			goto out;
 		}
-		daddr = usin->sin_addr.s_addr;
-		dport = usin->sin_port;
 	} else {
 		/* We only care about consistency with ->connect() */
 		lock_sock(sk);
 		daddr = rs->rs_conn_addr;
 		dport = rs->rs_conn_port;
+		scope_id = rs->rs_bound_scope_id;
 		release_sock(sk);
 	}
 
 	lock_sock(sk);
-	if (daddr == 0 || rs->rs_bound_addr == 0) {
+	if (ipv6_addr_any(&rs->rs_bound_addr) || ipv6_addr_any(&daddr)) {
 		release_sock(sk);
-		ret = -ENOTCONN; /* XXX not a great errno */
+		ret = -ENOTCONN;
 		goto out;
+	} else if (namelen != 0) {
+		/* Cannot send to an IPv4 address using an IPv6 source
+		 * address and cannot send to an IPv6 address using an
+		 * IPv4 source address.
+		 */
+		if (ipv6_addr_v4mapped(&daddr) ^
+		    ipv6_addr_v4mapped(&rs->rs_bound_addr)) {
+			release_sock(sk);
+			ret = -EOPNOTSUPP;
+			goto out;
+		}
 	}
 	release_sock(sk);
 
@@ -1155,13 +1191,14 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 
 	/* rds_conn_create has a spinlock that runs with IRQ off.
 	 * Caching the conn in the socket helps a lot. */
-	if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
+	if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr))
 		conn = rs->rs_conn;
 	else {
 		conn = rds_conn_create_outgoing(sock_net(sock->sk),
-						rs->rs_bound_addr, daddr,
-					rs->rs_transport,
-					sock->sk->sk_allocation);
+						&rs->rs_bound_addr, &daddr,
+						rs->rs_transport,
+						sock->sk->sk_allocation,
+						scope_id);
 		if (IS_ERR(conn)) {
 			ret = PTR_ERR(conn);
 			goto out;
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 351a284..beaff17 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -37,6 +37,8 @@
 #include <net/tcp.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#include <net/tcp.h>
+#include <net/addrconf.h>
 
 #include "rds.h"
 #include "tcp.h"
@@ -262,9 +264,34 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
 	spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
 }
 
-static int rds_tcp_laddr_check(struct net *net, __be32 addr)
+static int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr,
+			       __u32 scope_id)
 {
-	if (inet_addr_type(net, addr) == RTN_LOCAL)
+	struct net_device *dev = NULL;
+	int ret;
+
+	if (ipv6_addr_v4mapped(addr)) {
+		if (inet_addr_type(net, addr->s6_addr32[3]) == RTN_LOCAL)
+			return 0;
+		return -EADDRNOTAVAIL;
+	}
+
+	/* If the scope_id is specified, check only those addresses
+	 * hosted on the specified interface.
+	 */
+	if (scope_id != 0) {
+		rcu_read_lock();
+		dev = dev_get_by_index_rcu(net, scope_id);
+		/* scope_id is not valid... */
+		if (!dev) {
+			rcu_read_unlock();
+			return -EADDRNOTAVAIL;
+		}
+	}
+	ret = ipv6_chk_addr(net, addr, dev, 0);
+	if (scope_id != 0)
+		rcu_read_unlock();
+	if (ret)
 		return 0;
 	return -EADDRNOTAVAIL;
 }
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index d999e70..0101033 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -66,7 +66,8 @@ void rds_tcp_state_change(struct sock *sk)
 		 * RDS connection as RDS_CONN_UP until the reconnect,
 		 * to avoid RDS datagram loss.
 		 */
-		if (!IS_CANONICAL(cp->cp_conn->c_laddr, cp->cp_conn->c_faddr) &&
+		if (rds_addr_cmp(&cp->cp_conn->c_laddr,
+				 &cp->cp_conn->c_faddr) >= 0 &&
 		    rds_conn_path_transition(cp, RDS_CONN_CONNECTING,
 					     RDS_CONN_ERROR)) {
 			rds_conn_path_drop(cp, false);
@@ -88,7 +89,9 @@ void rds_tcp_state_change(struct sock *sk)
 int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
 {
 	struct socket *sock = NULL;
-	struct sockaddr_in src, dest;
+	struct sockaddr_in sin;
+	struct sockaddr *addr;
+	int addrlen;
 	int ret;
 	struct rds_connection *conn = cp->cp_conn;
 	struct rds_tcp_connection *tc = cp->cp_transport_data;
@@ -112,30 +115,33 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
 
 	rds_tcp_tune(sock);
 
-	src.sin_family = AF_INET;
-	src.sin_addr.s_addr = (__force u32)conn->c_laddr;
-	src.sin_port = (__force u16)htons(0);
+	sin.sin_family = AF_INET;
+	sin.sin_addr.s_addr = (__force u32)conn->c_laddr.s6_addr32[3];
+	sin.sin_port = (__force u16)htons(0);
+	addr = (struct sockaddr *)&sin;
+	addrlen = sizeof(sin);
 
-	ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src));
+	ret = sock->ops->bind(sock, addr, addrlen);
 	if (ret) {
-		rdsdebug("bind failed with %d at address %pI4\n",
+		rdsdebug("bind failed with %d at address %pI6c\n",
 			 ret, &conn->c_laddr);
 		goto out;
 	}
 
-	dest.sin_family = AF_INET;
-	dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
-	dest.sin_port = (__force u16)htons(RDS_TCP_PORT);
+	sin.sin_family = AF_INET;
+	sin.sin_addr.s_addr = (__force u32)conn->c_faddr.s6_addr32[3];
+	sin.sin_port = (__force u16)htons(RDS_TCP_PORT);
+	addr = (struct sockaddr *)&sin;
+	addrlen = sizeof(sin);
 
 	/*
 	 * once we call connect() we can start getting callbacks and they
 	 * own the socket
 	 */
 	rds_tcp_set_callbacks(sock, cp);
-	ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest),
-				 O_NONBLOCK);
+	ret = sock->ops->connect(sock, addr, addrlen, O_NONBLOCK);
 
-	rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret);
+	rdsdebug("connect to address %pI6c returned %d\n", &conn->c_faddr, ret);
 	if (ret == -EINPROGRESS)
 		ret = 0;
 	if (ret == 0) {
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 2257118..4fdf5b3 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006, 2018 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -83,13 +83,12 @@ int rds_tcp_keepalive(struct socket *sock)
 struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
 {
 	int i;
-	bool peer_is_smaller = IS_CANONICAL(conn->c_faddr, conn->c_laddr);
 	int npaths = max_t(int, 1, conn->c_npaths);
 
 	/* for mprds, all paths MUST be initiated by the peer
 	 * with the smaller address.
 	 */
-	if (!peer_is_smaller) {
+	if (rds_addr_cmp(&conn->c_faddr, &conn->c_laddr) >= 0) {
 		/* Make sure we initiate at least one path if this
 		 * has not already been done; rds_start_mprds() will
 		 * take care of additional paths, if necessary.
@@ -164,13 +163,16 @@ int rds_tcp_accept_one(struct socket *sock)
 
 	inet = inet_sk(new_sock->sk);
 
-	rdsdebug("accepted tcp %pI4:%u -> %pI4:%u\n",
-		 &inet->inet_saddr, ntohs(inet->inet_sport),
-		 &inet->inet_daddr, ntohs(inet->inet_dport));
+	rdsdebug("accepted tcp %pI6c:%u -> %pI6c:%u\n",
+		 &new_sock->sk->sk_v6_rcv_saddr, ntohs(inet->inet_sport),
+		 &new_sock->sk->sk_v6_daddr, ntohs(inet->inet_dport));
 
 	conn = rds_conn_create(sock_net(sock->sk),
-			       inet->inet_saddr, inet->inet_daddr,
-			       &rds_tcp_transport, GFP_KERNEL);
+			       &new_sock->sk->sk_v6_rcv_saddr,
+			       &new_sock->sk->sk_v6_daddr,
+			       &rds_tcp_transport, GFP_KERNEL,
+			       new_sock->sk->sk_bound_dev_if);
+
 	if (IS_ERR(conn)) {
 		ret = PTR_ERR(conn);
 		goto out;
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index b9fbd2e..42c5ff1 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -179,7 +179,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
 			tc->t_tinc = tinc;
 			rdsdebug("alloced tinc %p\n", tinc);
 			rds_inc_path_init(&tinc->ti_inc, cp,
-					  cp->cp_conn->c_faddr);
+					  &cp->cp_conn->c_faddr);
 			tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
 					local_clock();
 
@@ -239,8 +239,9 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
 			if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
 				rds_tcp_cong_recv(conn, tinc);
 			else
-				rds_recv_incoming(conn, conn->c_faddr,
-						  conn->c_laddr, &tinc->ti_inc,
+				rds_recv_incoming(conn, &conn->c_faddr,
+						  &conn->c_laddr,
+						  &tinc->ti_inc,
 						  arg->gfp);
 
 			tc->t_tinc_hdr_rem = sizeof(struct rds_header);
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 7df869d..78a2554 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -153,7 +153,7 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
 			 * an incoming RST.
 			 */
 			if (rds_conn_path_up(cp)) {
-				pr_warn("RDS/tcp: send to %pI4 on cp [%d]"
+				pr_warn("RDS/tcp: send to %pI6c on cp [%d]"
 					"returned %d, "
 					"disconnecting and reconnecting\n",
 					&conn->c_faddr, cp->cp_index, ret);
diff --git a/net/rds/threads.c b/net/rds/threads.c
index c52861d..cfa2496 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -82,8 +82,8 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr)
 		return;
 	}
 
-	rdsdebug("conn %p for %pI4 to %pI4 complete\n",
-	  cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr);
+	rdsdebug("conn %p for %pI6c to %pI6c complete\n",
+		 cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr);
 
 	cp->cp_reconnect_jiffies = 0;
 	set_bit(0, &cp->cp_conn->c_map_queued);
@@ -125,13 +125,13 @@ void rds_queue_reconnect(struct rds_conn_path *cp)
 	unsigned long rand;
 	struct rds_connection *conn = cp->cp_conn;
 
-	rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n",
-	  conn, &conn->c_laddr, &conn->c_faddr,
-	  cp->cp_reconnect_jiffies);
+	rdsdebug("conn %p for %pI6c to %pI6c reconnect jiffies %lu\n",
+		 conn, &conn->c_laddr, &conn->c_faddr,
+		 cp->cp_reconnect_jiffies);
 
 	/* let peer with smaller addr initiate reconnect, to avoid duels */
 	if (conn->c_trans->t_type == RDS_TRANS_TCP &&
-	    !IS_CANONICAL(conn->c_laddr, conn->c_faddr))
+	    rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) >= 0)
 		return;
 
 	set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
@@ -145,7 +145,7 @@ void rds_queue_reconnect(struct rds_conn_path *cp)
 	}
 
 	get_random_bytes(&rand, sizeof(rand));
-	rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n",
+	rdsdebug("%lu delay %lu ceil conn %p for %pI6c -> %pI6c\n",
 		 rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies,
 		 conn, &conn->c_laddr, &conn->c_faddr);
 	rcu_read_lock();
@@ -167,14 +167,14 @@ void rds_connect_worker(struct work_struct *work)
 	int ret;
 
 	if (cp->cp_index > 0 &&
-	    !IS_CANONICAL(cp->cp_conn->c_laddr, cp->cp_conn->c_faddr))
+	    rds_addr_cmp(&cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr) > 0)
 		return;
 	clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
 	ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING);
 	if (ret) {
 		ret = conn->c_trans->conn_path_connect(cp);
-		rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n",
-			conn, &conn->c_laddr, &conn->c_faddr, ret);
+		rdsdebug("conn %p for %pI6c to %pI6c dispatched, ret %d\n",
+			 conn, &conn->c_laddr, &conn->c_faddr, ret);
 
 		if (ret) {
 			if (rds_conn_path_transition(cp,
@@ -259,3 +259,50 @@ int rds_threads_init(void)
 
 	return 0;
 }
+
+/* Compare two IPv6 addresses.  Return 0 if the two addresses are equal.
+ * Return 1 if the first is greater.  Return -1 if the second is greater.
+ */
+int rds_addr_cmp(const struct in6_addr *addr1,
+		 const struct in6_addr *addr2)
+{
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
+	const __be64 *a1, *a2;
+	__be64 x, y;
+
+	a1 = (__be64 *)addr1;
+	a2 = (__be64 *)addr2;
+
+	if (*a1 != *a2) {
+		if (be64_to_cpu(*a1) < be64_to_cpu(*a2))
+			return -1;
+		else
+			return 1;
+	} else {
+		x = be64_to_cpu(*++a1);
+		y = be64_to_cpu(*++a2);
+		if (x < y)
+			return -1;
+		else if (x > y)
+			return 1;
+		else
+			return 0;
+	}
+#else
+	u32 a, b;
+	int i;
+
+	for (i = 0; i < 4; i++) {
+		if (addr1->s6_addr32[i] != addr2->s6_addr32[i]) {
+			a = ntohl(addr1->s6_addr32[i]);
+			b = ntohl(addr2->s6_addr32[i]);
+			if (a < b)
+				return -1;
+			else if (a > b)
+				return 1;
+		}
+	}
+	return 0;
+#endif
+}
+EXPORT_SYMBOL_GPL(rds_addr_cmp);
diff --git a/net/rds/transport.c b/net/rds/transport.c
index 0b188dd..c9788db 100644
--- a/net/rds/transport.c
+++ b/net/rds/transport.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Oracle.  All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -33,6 +33,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/in.h>
+#include <linux/ipv6.h>
 
 #include "rds.h"
 #include "loop.h"
@@ -75,20 +76,26 @@ void rds_trans_put(struct rds_transport *trans)
 		module_put(trans->t_owner);
 }
 
-struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr)
+struct rds_transport *rds_trans_get_preferred(struct net *net,
+					      const struct in6_addr *addr,
+					      __u32 scope_id)
 {
 	struct rds_transport *ret = NULL;
 	struct rds_transport *trans;
 	unsigned int i;
 
-	if (IN_LOOPBACK(ntohl(addr)))
+	if (ipv6_addr_v4mapped(addr)) {
+		if (*(u_int8_t *)&addr->s6_addr32[3] == IN_LOOPBACKNET)
+			return &rds_loop_transport;
+	} else if (ipv6_addr_loopback(addr)) {
 		return &rds_loop_transport;
+	}
 
 	down_read(&rds_trans_sem);
 	for (i = 0; i < RDS_TRANS_COUNT; i++) {
 		trans = transports[i];
 
-		if (trans && (trans->laddr_check(net, addr) == 0) &&
+		if (trans && (trans->laddr_check(net, addr, scope_id) == 0) &&
 		    (!trans->t_owner || try_module_get(trans->t_owner))) {
 			ret = trans;
 			break;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next 3/3] rds: Extend RDS API for IPv6 support
From: Ka-Cheong Poon @ 2018-06-25 10:38 UTC (permalink / raw)
  To: netdev; +Cc: santosh.shilimkar, davem, rds-devel
In-Reply-To: <cover.1529922794.git.ka-cheong.poon@oracle.com>

There are many data structures (RDS socket options) used by RDS apps
which use a 32 bit integer to store IP address. To support IPv6,
struct in6_addr needs to be used. To ensure backward compatibility, a
new data structure is introduced for each of those data structures
which use a 32 bit integer to represent an IP address. And new socket
options are introduced to use those new structures. This means that
existing apps should work without a problem with the new RDS module.
For apps which want to use IPv6, those new data structures and socket
options can be used. IPv4 mapped address is used to represent IPv4
address in the new data structures.

Signed-off-by: Ka-Cheong Poon <ka-cheong.poon@oracle.com>
---
 include/uapi/linux/rds.h | 71 +++++++++++++++++++++++++++++++++--
 net/rds/connection.c     | 97 ++++++++++++++++++++++++++++++++++++++++++++----
 net/rds/ib.c             | 52 ++++++++++++++++++++++++++
 net/rds/ib_mr.h          |  2 +
 net/rds/ib_rdma.c        | 11 +++++-
 net/rds/rds.h            |  4 ++
 net/rds/recv.c           | 25 +++++++++++++
 net/rds/tcp.c            | 44 ++++++++++++++++++++++
 8 files changed, 295 insertions(+), 11 deletions(-)

diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h
index 20c6bd0..518d40f 100644
--- a/include/uapi/linux/rds.h
+++ b/include/uapi/linux/rds.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */
 /*
- * Copyright (c) 2008 Oracle.  All rights reserved.
+ * Copyright (c) 2008, 2018 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -52,7 +52,7 @@
 #define RDS_RECVERR			5
 #define RDS_CONG_MONITOR		6
 #define RDS_GET_MR_FOR_DEST		7
-#define SO_RDS_TRANSPORT		8
+#define SO_RDS_TRANSPORT		9
 
 /* Socket option to tap receive path latency
  *	SO_RDS: SO_RDS_MSG_RXPATH_LATENCY
@@ -118,7 +118,17 @@
 #define RDS_INFO_IB_CONNECTIONS		10008
 #define RDS_INFO_CONNECTION_STATS	10009
 #define RDS_INFO_IWARP_CONNECTIONS	10010
-#define RDS_INFO_LAST			10010
+
+/* PF_RDS6 options */
+#define RDS6_INFO_CONNECTIONS		10011
+#define RDS6_INFO_SEND_MESSAGES		10012
+#define RDS6_INFO_RETRANS_MESSAGES	10013
+#define RDS6_INFO_RECV_MESSAGES		10014
+#define RDS6_INFO_SOCKETS		10015
+#define RDS6_INFO_TCP_SOCKETS		10016
+#define RDS6_INFO_IB_CONNECTIONS	10017
+
+#define RDS_INFO_LAST			10017
 
 struct rds_info_counter {
 	__u8	name[32];
@@ -140,6 +150,15 @@ struct rds_info_connection {
 	__u8		flags;
 } __attribute__((packed));
 
+struct rds6_info_connection {
+	__u64		next_tx_seq;
+	__u64		next_rx_seq;
+	struct in6_addr	laddr;
+	struct in6_addr	faddr;
+	__u8		transport[TRANSNAMSIZ];		/* null term ascii */
+	__u8		flags;
+} __attribute__((packed));
+
 #define RDS_INFO_MESSAGE_FLAG_ACK               0x01
 #define RDS_INFO_MESSAGE_FLAG_FAST_ACK          0x02
 
@@ -153,6 +172,17 @@ struct rds_info_message {
 	__u8		flags;
 } __attribute__((packed));
 
+struct rds6_info_message {
+	__u64	seq;
+	__u32	len;
+	struct in6_addr	laddr;
+	struct in6_addr	faddr;
+	__be16		lport;
+	__be16		fport;
+	__u8		flags;
+	__u8		tos;
+} __attribute__((packed));
+
 struct rds_info_socket {
 	__u32		sndbuf;
 	__be32		bound_addr;
@@ -163,6 +193,16 @@ struct rds_info_socket {
 	__u64		inum;
 } __attribute__((packed));
 
+struct rds6_info_socket {
+	__u32		sndbuf;
+	struct in6_addr	bound_addr;
+	struct in6_addr	connected_addr;
+	__be16		bound_port;
+	__be16		connected_port;
+	__u32		rcvbuf;
+	__u64		inum;
+} __attribute__((packed));
+
 struct rds_info_tcp_socket {
 	__be32          local_addr;
 	__be16          local_port;
@@ -175,6 +215,18 @@ struct rds_info_tcp_socket {
 	__u32           last_seen_una;
 } __attribute__((packed));
 
+struct rds6_info_tcp_socket {
+	struct in6_addr	local_addr;
+	__be16		local_port;
+	struct in6_addr	peer_addr;
+	__be16		peer_port;
+	__u64		hdr_rem;
+	__u64		data_rem;
+	__u32		last_sent_nxt;
+	__u32		last_expected_una;
+	__u32		last_seen_una;
+} __attribute__((packed));
+
 #define RDS_IB_GID_LEN	16
 struct rds_info_rdma_connection {
 	__be32		src_addr;
@@ -189,6 +241,19 @@ struct rds_info_rdma_connection {
 	__u32		rdma_mr_size;
 };
 
+struct rds6_info_rdma_connection {
+	struct in6_addr	src_addr;
+	struct in6_addr	dst_addr;
+	__u8		src_gid[RDS_IB_GID_LEN];
+	__u8		dst_gid[RDS_IB_GID_LEN];
+
+	__u32		max_send_wr;
+	__u32		max_recv_wr;
+	__u32		max_send_sge;
+	__u32		rdma_mr_max;
+	__u32		rdma_mr_size;
+};
+
 /* RDS message Receive Path Latency points */
 enum rds_message_rxpath_latency {
 	RDS_MSG_RX_HDR_TO_DGRAM_START = 0,
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 8c5d093..5819b64 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -488,15 +488,19 @@ void rds_conn_destroy(struct rds_connection *conn)
 
 static void __rds_inc_msg_cp(struct rds_incoming *inc,
 			     struct rds_info_iterator *iter,
-			     void *saddr, void *daddr, int flip)
+			     void *saddr, void *daddr, int flip, bool isv6)
 {
-	rds_inc_info_copy(inc, iter, *(__be32 *)saddr, *(__be32 *)daddr, flip);
+	if (isv6)
+		rds6_inc_info_copy(inc, iter, saddr, daddr, flip);
+	else
+		rds_inc_info_copy(inc, iter, *(__be32 *)saddr,
+				  *(__be32 *)daddr, flip);
 }
 
 static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len,
 				      struct rds_info_iterator *iter,
 				      struct rds_info_lengths *lens,
-				      int want_send)
+				      int want_send, bool isv6)
 {
 	struct hlist_head *head;
 	struct list_head *list;
@@ -507,7 +511,10 @@ static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len,
 	size_t i;
 	int j;
 
-	len /= sizeof(struct rds_info_message);
+	if (isv6)
+		len /= sizeof(struct rds6_info_message);
+	else
+		len /= sizeof(struct rds_info_message);
 
 	rcu_read_lock();
 
@@ -537,7 +544,7 @@ static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len,
 								 iter,
 								 &conn->c_laddr,
 								 &conn->c_faddr,
-								 0);
+								 0, isv6);
 				}
 
 				spin_unlock_irqrestore(&cp->cp_lock, flags);
@@ -547,7 +554,10 @@ static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len,
 	rcu_read_unlock();
 
 	lens->nr = total;
-	lens->each = sizeof(struct rds_info_message);
+	if (isv6)
+		lens->each = sizeof(struct rds6_info_message);
+	else
+		lens->each = sizeof(struct rds_info_message);
 }
 
 static void rds_conn_message_info(struct socket *sock, unsigned int len,
@@ -555,7 +565,15 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
 				  struct rds_info_lengths *lens,
 				  int want_send)
 {
-	rds_conn_message_info_cmn(sock, len, iter, lens, want_send);
+	rds_conn_message_info_cmn(sock, len, iter, lens, want_send, false);
+}
+
+static void rds6_conn_message_info(struct socket *sock, unsigned int len,
+				   struct rds_info_iterator *iter,
+				   struct rds_info_lengths *lens,
+				   int want_send)
+{
+	rds_conn_message_info_cmn(sock, len, iter, lens, want_send, true);
 }
 
 static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
@@ -565,6 +583,13 @@ static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
 	rds_conn_message_info(sock, len, iter, lens, 1);
 }
 
+static void rds6_conn_message_info_send(struct socket *sock, unsigned int len,
+					struct rds_info_iterator *iter,
+					struct rds_info_lengths *lens)
+{
+	rds6_conn_message_info(sock, len, iter, lens, 1);
+}
+
 static void rds_conn_message_info_retrans(struct socket *sock,
 					  unsigned int len,
 					  struct rds_info_iterator *iter,
@@ -573,6 +598,14 @@ static void rds_conn_message_info_retrans(struct socket *sock,
 	rds_conn_message_info(sock, len, iter, lens, 0);
 }
 
+static void rds6_conn_message_info_retrans(struct socket *sock,
+					   unsigned int len,
+					   struct rds_info_iterator *iter,
+					   struct rds_info_lengths *lens)
+{
+	rds6_conn_message_info(sock, len, iter, lens, 0);
+}
+
 void rds_for_each_conn_info(struct socket *sock, unsigned int len,
 			  struct rds_info_iterator *iter,
 			  struct rds_info_lengths *lens,
@@ -688,6 +721,34 @@ static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
 	return 1;
 }
 
+static int rds6_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
+{
+	struct rds6_info_connection *cinfo6 = buffer;
+	struct rds_connection *conn = cp->cp_conn;
+
+	cinfo6->next_tx_seq = cp->cp_next_tx_seq;
+	cinfo6->next_rx_seq = cp->cp_next_rx_seq;
+	cinfo6->laddr = conn->c_laddr;
+	cinfo6->faddr = conn->c_faddr;
+	strncpy(cinfo6->transport, conn->c_trans->t_name,
+		sizeof(cinfo6->transport));
+	cinfo6->flags = 0;
+
+	rds_conn_info_set(cinfo6->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags),
+			  SENDING);
+	/* XXX Future: return the state rather than these funky bits */
+	rds_conn_info_set(cinfo6->flags,
+			  atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING,
+			  CONNECTING);
+	rds_conn_info_set(cinfo6->flags,
+			  atomic_read(&cp->cp_state) == RDS_CONN_UP,
+			  CONNECTED);
+	/* Just return 1 as there is no error case. This is a helper function
+	 * for rds_walk_conn_path_info() and it wants a return value.
+	 */
+	return 1;
+}
+
 static void rds_conn_info(struct socket *sock, unsigned int len,
 			  struct rds_info_iterator *iter,
 			  struct rds_info_lengths *lens)
@@ -700,6 +761,18 @@ static void rds_conn_info(struct socket *sock, unsigned int len,
 				sizeof(struct rds_info_connection));
 }
 
+static void rds6_conn_info(struct socket *sock, unsigned int len,
+			   struct rds_info_iterator *iter,
+			   struct rds_info_lengths *lens)
+{
+	u64 buffer[(sizeof(struct rds6_info_connection) + 7) / 8];
+
+	rds_walk_conn_path_info(sock, len, iter, lens,
+				rds6_conn_info_visitor,
+				buffer,
+				sizeof(struct rds6_info_connection));
+}
+
 int rds_conn_init(void)
 {
 	rds_conn_slab = kmem_cache_create("rds_connection",
@@ -713,6 +786,11 @@ int rds_conn_init(void)
 			       rds_conn_message_info_send);
 	rds_info_register_func(RDS_INFO_RETRANS_MESSAGES,
 			       rds_conn_message_info_retrans);
+	rds_info_register_func(RDS6_INFO_CONNECTIONS, rds6_conn_info);
+	rds_info_register_func(RDS6_INFO_SEND_MESSAGES,
+			       rds6_conn_message_info_send);
+	rds_info_register_func(RDS6_INFO_RETRANS_MESSAGES,
+			       rds6_conn_message_info_retrans);
 
 	return 0;
 }
@@ -730,6 +808,11 @@ void rds_conn_exit(void)
 				 rds_conn_message_info_send);
 	rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES,
 				 rds_conn_message_info_retrans);
+	rds_info_deregister_func(RDS6_INFO_CONNECTIONS, rds6_conn_info);
+	rds_info_deregister_func(RDS6_INFO_SEND_MESSAGES,
+				 rds6_conn_message_info_send);
+	rds_info_deregister_func(RDS6_INFO_RETRANS_MESSAGES,
+				 rds6_conn_message_info_retrans);
 }
 
 /*
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 756225c..63d95ea 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -321,6 +321,43 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
 	return 1;
 }
 
+/* IPv6 version of rds_ib_conn_info_visitor(). */
+static int rds6_ib_conn_info_visitor(struct rds_connection *conn,
+				     void *buffer)
+{
+	struct rds6_info_rdma_connection *iinfo6 = buffer;
+	struct rds_ib_connection *ic;
+
+	/* We will only ever look at IB transports */
+	if (conn->c_trans != &rds_ib_transport)
+		return 0;
+
+	iinfo6->src_addr = conn->c_laddr;
+	iinfo6->dst_addr = conn->c_faddr;
+
+	memset(&iinfo6->src_gid, 0, sizeof(iinfo6->src_gid));
+	memset(&iinfo6->dst_gid, 0, sizeof(iinfo6->dst_gid));
+
+	if (rds_conn_state(conn) == RDS_CONN_UP) {
+		struct rds_ib_device *rds_ibdev;
+		struct rdma_dev_addr *dev_addr;
+
+		ic = conn->c_transport_data;
+		dev_addr = &ic->i_cm_id->route.addr.dev_addr;
+		rdma_addr_get_sgid(dev_addr,
+				   (union ib_gid *)&iinfo6->src_gid);
+		rdma_addr_get_dgid(dev_addr,
+				   (union ib_gid *)&iinfo6->dst_gid);
+
+		rds_ibdev = ic->rds_ibdev;
+		iinfo6->max_send_wr = ic->i_send_ring.w_nr;
+		iinfo6->max_recv_wr = ic->i_recv_ring.w_nr;
+		iinfo6->max_send_sge = rds_ibdev->max_sge;
+		rds6_ib_get_mr_info(rds_ibdev, iinfo6);
+	}
+	return 1;
+}
+
 static void rds_ib_ic_info(struct socket *sock, unsigned int len,
 			   struct rds_info_iterator *iter,
 			   struct rds_info_lengths *lens)
@@ -333,6 +370,19 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len,
 				sizeof(struct rds_info_rdma_connection));
 }
 
+/* IPv6 version of rds_ib_ic_info(). */
+static void rds6_ib_ic_info(struct socket *sock, unsigned int len,
+			    struct rds_info_iterator *iter,
+			    struct rds_info_lengths *lens)
+{
+	u64 buffer[(sizeof(struct rds6_info_rdma_connection) + 7) / 8];
+
+	rds_for_each_conn_info(sock, len, iter, lens,
+			       rds6_ib_conn_info_visitor,
+			       buffer,
+			       sizeof(struct rds6_info_rdma_connection));
+}
+
 /*
  * Early RDS/IB was built to only bind to an address if there is an IPoIB
  * device with that address set.
@@ -441,6 +491,7 @@ void rds_ib_exit(void)
 	rds_ib_set_unloading();
 	synchronize_rcu();
 	rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+	rds_info_deregister_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info);
 	rds_ib_unregister_client();
 	rds_ib_destroy_nodev_conns();
 	rds_ib_sysctl_exit();
@@ -502,6 +553,7 @@ int rds_ib_init(void)
 	rds_trans_register(&rds_ib_transport);
 
 	rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+	rds_info_register_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info);
 
 	goto out;
 
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
index 0ea4ab0..f440ace 100644
--- a/net/rds/ib_mr.h
+++ b/net/rds/ib_mr.h
@@ -113,6 +113,8 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev,
 					     int npages);
 void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
 			struct rds_info_rdma_connection *iinfo);
+void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
+			 struct rds6_info_rdma_connection *iinfo6);
 void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
 void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
 		    struct rds_sock *rs, u32 *key_ret);
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 0ec9df0..e3c8bbb 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -180,6 +180,15 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co
 	iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages;
 }
 
+void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
+			 struct rds6_info_rdma_connection *iinfo6)
+{
+	struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool;
+
+	iinfo6->rdma_mr_max = pool_1m->max_items;
+	iinfo6->rdma_mr_size = pool_1m->fmr_attr.max_pages;
+}
+
 struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
 {
 	struct rds_ib_mr *ibmr = NULL;
diff --git a/net/rds/rds.h b/net/rds/rds.h
index f5f99d1..fd63b86 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -875,6 +875,10 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 void rds_inc_info_copy(struct rds_incoming *inc,
 		       struct rds_info_iterator *iter,
 		       __be32 saddr, __be32 daddr, int flip);
+void rds6_inc_info_copy(struct rds_incoming *inc,
+			struct rds_info_iterator *iter,
+			struct in6_addr *saddr, struct in6_addr *daddr,
+			int flip);
 
 /* send.c */
 int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len);
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 4217961..10a2d0f 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -792,3 +792,28 @@ void rds_inc_info_copy(struct rds_incoming *inc,
 
 	rds_info_copy(iter, &minfo, sizeof(minfo));
 }
+
+void rds6_inc_info_copy(struct rds_incoming *inc,
+			struct rds_info_iterator *iter,
+			struct in6_addr *saddr, struct in6_addr *daddr,
+			int flip)
+{
+	struct rds6_info_message minfo6;
+
+	minfo6.seq = be64_to_cpu(inc->i_hdr.h_sequence);
+	minfo6.len = be32_to_cpu(inc->i_hdr.h_len);
+
+	if (flip) {
+		minfo6.laddr = *daddr;
+		minfo6.faddr = *saddr;
+		minfo6.lport = inc->i_hdr.h_dport;
+		minfo6.fport = inc->i_hdr.h_sport;
+	} else {
+		minfo6.laddr = *saddr;
+		minfo6.faddr = *daddr;
+		minfo6.lport = inc->i_hdr.h_sport;
+		minfo6.fport = inc->i_hdr.h_dport;
+	}
+
+	rds_info_copy(iter, &minfo6, sizeof(minfo6));
+}
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index fb0dac1..c556f0d 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -273,6 +273,48 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
 	spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
 }
 
+/* Handle RDS6_INFO_TCP_SOCKETS socket option. It returns both IPv4 and
+ * IPv6 connections. IPv4 connection address is returned in an IPv4 mapped
+ * address.
+ */
+static void rds6_tcp_tc_info(struct socket *sock, unsigned int len,
+			     struct rds_info_iterator *iter,
+			     struct rds_info_lengths *lens)
+{
+	struct rds6_info_tcp_socket tsinfo6;
+	struct rds_tcp_connection *tc;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
+
+	if (len / sizeof(tsinfo6) < rds6_tcp_tc_count)
+		goto out;
+
+	list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
+		struct sock *sk = tc->t_sock->sk;
+		struct inet_sock *inet = inet_sk(sk);
+
+		tsinfo6.local_addr = sk->sk_v6_rcv_saddr;
+		tsinfo6.local_port = inet->inet_sport;
+		tsinfo6.peer_addr = sk->sk_v6_daddr;
+		tsinfo6.peer_port = inet->inet_dport;
+
+		tsinfo6.hdr_rem = tc->t_tinc_hdr_rem;
+		tsinfo6.data_rem = tc->t_tinc_data_rem;
+		tsinfo6.last_sent_nxt = tc->t_last_sent_nxt;
+		tsinfo6.last_expected_una = tc->t_last_expected_una;
+		tsinfo6.last_seen_una = tc->t_last_seen_una;
+
+		rds_info_copy(iter, &tsinfo6, sizeof(tsinfo6));
+	}
+
+out:
+	lens->nr = rds6_tcp_tc_count;
+	lens->each = sizeof(tsinfo6);
+
+	spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
+}
+
 static int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr,
 			       __u32 scope_id)
 {
@@ -629,6 +671,7 @@ static void rds_tcp_exit(void)
 	rds_tcp_set_unloading();
 	synchronize_rcu();
 	rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
+	rds_info_deregister_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info);
 	unregister_pernet_device(&rds_tcp_net_ops);
 	rds_tcp_destroy_conns();
 	rds_trans_unregister(&rds_tcp_transport);
@@ -660,6 +703,7 @@ static int rds_tcp_init(void)
 	rds_trans_register(&rds_tcp_transport);
 
 	rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
+	rds_info_register_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info);
 
 	goto out;
 out_recv:
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net-next 0/3] rds: IPv6 support
From: Ka-Cheong Poon @ 2018-06-25 10:38 UTC (permalink / raw)
  To: netdev; +Cc: santosh.shilimkar, davem, rds-devel

This patch set adds IPv6 support to the kernel RDS and related
modules.  Existing RDS apps using IPv4 address continue to run without
any problem.  New RDS apps which want to use IPv6 address can do so by
passing the address in struct sockaddr_in6 to bind(), connect() or
sendmsg().  And those apps also need to use the new IPv6 equivalents
of some of the existing socket options as the existing options use a
32 bit integer to store IP address.

All RDS code now use struct in6_addr to store IP address.  IPv4
address is stored as an IPv4 mapped address.

Header file changes

There are many data structures (RDS socket options) used by RDS apps
which use a 32 bit integer to store IP address. To support IPv6,
struct in6_addr needs to be used. To ensure backward compatibility, a
new data structure is introduced for each of those data structures
which use a 32 bit integer to represent an IP address. And new socket
options are introduced to use those new structures. This means that
existing apps should work without a problem with the new RDS module.
For apps which want to use IPv6, those new data structures and socket
options can be used. IPv4 mapped address is used to represent IPv4
address in the new data structures.

Internally, all RDS data structures which contain an IP address are
changed to use struct in6_addr to store the address. IPv4 address is
stored as an IPv4 mapped address. All the functions which take an IP
address as argument are also changed to use struct in6_addr.

RDS/RDMA/IB uses a private data (struct rds_ib_connect_private)
exchange between endpoints at RDS connection establishment time to
support RDMA. This private data exchange uses a 32 bit integer to
represent an IP address. This needs to be changed in order to support
IPv6. A new private data struct rds6_ib_connect_private is introduced
to handle this. To ensure backward compatibility, an IPv6 capable RDS
stack uses another RDMA listener port (RDS_CM_PORT) to accept IPv6
connection. And it continues to use the original RDS_PORT for IPv4 RDS
connections. When it needs to communicate with an IPv6 peer, it uses
the RDS_TCP_PORT to send the connection set up request.

RDS/TCP changes

TCP related code is changed to support IPv6.  Note that only an IPv6
TCP listener on port RDS_TCP_PORT is created as it can accept both
IPv4 and IPv6 connection requests.

IB/RDMA changes

The initial private data exchange between IB endpoints using RDMA is
changed to support IPv6 address instead, if the peer address is IPv6.
To ensure backward compatibility, annother RDMA listener port
(RDS_CM_PORT) is used to accept IPv6 connection. An IPv6 capable RDS
module continues to use the original RDS_PORT for IPv4 RDS
connections. When it needs to communicate with an IPv6 peer, it uses
the RDS_CM_PORT to send the connection set up request.

Ka-Cheong Poon (3):
  rds: Changing IP address internal representation to struct in6_addr
  rds: Enable RDS IPv6 support
  rds: Extend RDS API for IPv6 support

 include/uapi/linux/rds.h |  71 ++++++++++-
 net/rds/af_rds.c         | 138 ++++++++++++++++------
 net/rds/bind.c           | 106 ++++++++++++-----
 net/rds/cong.c           |  23 ++--
 net/rds/connection.c     | 246 ++++++++++++++++++++++++++++----------
 net/rds/ib.c             | 114 ++++++++++++++++--
 net/rds/ib.h             |  45 +++++--
 net/rds/ib_cm.c          | 301 +++++++++++++++++++++++++++++++++++------------
 net/rds/ib_mr.h          |   2 +
 net/rds/ib_rdma.c        |  24 ++--
 net/rds/ib_recv.c        |  18 +--
 net/rds/ib_send.c        |  10 +-
 net/rds/loop.c           |   7 +-
 net/rds/rdma.c           |   6 +-
 net/rds/rdma_transport.c |  84 ++++++++++---
 net/rds/rdma_transport.h |   2 +
 net/rds/rds.h            |  85 ++++++++-----
 net/rds/recv.c           |  76 +++++++++---
 net/rds/send.c           |  86 +++++++++++---
 net/rds/tcp.c            | 129 ++++++++++++++++----
 net/rds/tcp.h            |   4 +-
 net/rds/tcp_connect.c    |  68 ++++++++---
 net/rds/tcp_listen.c     |  58 ++++++---
 net/rds/tcp_recv.c       |   9 +-
 net/rds/tcp_send.c       |   4 +-
 net/rds/threads.c        |  69 +++++++++--
 net/rds/transport.c      |  15 ++-
 27 files changed, 1378 insertions(+), 422 deletions(-)

-- 
1.8.3.1

^ permalink raw reply

* Re: [PATCH net-next 1/5] ipv4: add __ip_queue_xmit() that supports tos param
From: Neil Horman @ 2018-06-25 11:13 UTC (permalink / raw)
  To: David Miller; +Cc: lucien.xin, netdev, linux-sctp, marcelo.leitner
In-Reply-To: <20180625.162654.1306497955900954874.davem@davemloft.net>

On Mon, Jun 25, 2018 at 04:26:54PM +0900, David Miller wrote:
> From: Xin Long <lucien.xin@gmail.com>
> Date: Mon, 25 Jun 2018 10:14:33 +0800
> 
> > +EXPORT_SYMBOL(__ip_queue_xmit);
> > +
> > +int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
> > +{
> > +	return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
> > +}
> >  EXPORT_SYMBOL(ip_queue_xmit);
> 
> Maybe better to only export __ip_queue_xmit() and make ip_queue_xmit() an
> inline function in net/ip.h?
> 
I concur.  No need to export both here

Neil

^ permalink raw reply

* Re: [PATCH] igb: allow optionally getting mac address from device tree
From: Marcel Ziswiler @ 2018-06-25 11:25 UTC (permalink / raw)
  To: davem@davemloft.net
  Cc: netdev@vger.kernel.org, jeffrey.t.kirsher@intel.com,
	intel-wired-lan@lists.osuosl.org, linux-kernel@vger.kernel.org
In-Reply-To: <20180625.170547.658198698048994134.davem@davemloft.net>

Hi David

On Mon, 2018-06-25 at 17:05 +0900, David Miller wrote:
> From: Marcel Ziswiler <marcel@ziswiler.com>
> Date: Mon, 25 Jun 2018 10:00:42 +0200
> 
> > @@ -3122,7 +3124,11 @@ static int igb_probe(struct pci_dev *pdev,
> > const struct pci_device_id *ent)
> >  		break;
> >  	}
> >  
> > -	if (eth_platform_get_mac_address(&pdev->dev, hw-
> > >mac.addr)) {
> > +	/* try to get the MAC address from device tree data */
> > +	iap = of_get_mac_address(pdev->dev.of_node);
> > +	if (iap)
> > +		memcpy(hw->mac.addr, iap, ETH_ALEN);
> 
> This is exactly what eth_platform_get_mac_address() shoule be doing.

Yes, you are absolutely right. That indeed already works just fine given a
proper device tree. Struggling with the device tree part I probably missed
noticing this.

Sorry for the noise and thanks for the tip!

Cheers

Marcel

^ permalink raw reply

* Re: [PATCH] net: usb: asix: allow optionally getting mac address from device tree
From: Marcel Ziswiler @ 2018-06-25 11:26 UTC (permalink / raw)
  To: davem@davemloft.net
  Cc: netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	Dean_Jenkins@mentor.com, andreyknvl@google.com, oneukum@suse.com,
	linux-usb@vger.kernel.org
In-Reply-To: <20180625080108.32441-1-marcel@ziswiler.com>

Hi David

On Mon, 2018-06-25 at 10:01 +0200, Marcel Ziswiler wrote:
> From: Marcel Ziswiler <marcel.ziswiler@toradex.com>
> 
> For Embedded use where e.g. AX88772B chips may be used without
> external
> EEPROMs the boot loader may choose to pass the MAC address to be used
> via device tree. Therefore, allow for optionally getting the MAC
> address from device tree data e.g. as follows (excerpt from a T30
> based
> board, local-mac-address to be filled in by boot loader):
> 
> /* EHCI instance 1: USB2_DP/N -> AX88772B */
> usb@7d004000 {
> 	status = "okay";
> 	#address-cells = <1>;
> 	#size-cells = <0>;
> 	asix@1 {
> 		reg = <1>;
> 		local-mac-address = [00 00 00 00 00 00];
> 	};
> };
> 
> Signed-off-by: Marcel Ziswiler <marcel.ziswiler@toradex.com>
> 
> ---
> 
>  drivers/net/usb/asix.h         |  1 +
>  drivers/net/usb/asix_devices.c | 39 ++++++++++++++++++++++++------
> ---------
>  2 files changed, 25 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/net/usb/asix.h b/drivers/net/usb/asix.h
> index 9a4171b90947..c8161960cef2 100644
> --- a/drivers/net/usb/asix.h
> +++ b/drivers/net/usb/asix.h
> @@ -37,6 +37,7 @@
>  #include <linux/usb/usbnet.h>
>  #include <linux/slab.h>
>  #include <linux/if_vlan.h>
> +#include <linux/of_net.h>
>  
>  #define DRIVER_VERSION "22-Dec-2011"
>  #define DRIVER_NAME "asix"
> diff --git a/drivers/net/usb/asix_devices.c
> b/drivers/net/usb/asix_devices.c
> index 3d4f7959dabb..a88d65dfc64c 100644
> --- a/drivers/net/usb/asix_devices.c
> +++ b/drivers/net/usb/asix_devices.c
> @@ -690,25 +690,34 @@ static int ax88772_bind(struct usbnet *dev,
> struct usb_interface *intf)
>  	u8 buf[ETH_ALEN], chipcode = 0;
>  	u32 phyid;
>  	struct asix_common_private *priv;
> +	const u8 *mac_addr;
>  
> -	usbnet_get_endpoints(dev,intf);
> +	usbnet_get_endpoints(dev, intf);
>  
> -	/* Get the MAC address */
> -	if (dev->driver_info->data & FLAG_EEPROM_MAC) {
> -		for (i = 0; i < (ETH_ALEN >> 1); i++) {
> -			ret = asix_read_cmd(dev, AX_CMD_READ_EEPROM,
> 0x04 + i,
> -					    0, 2, buf + i * 2, 0);
> -			if (ret < 0)
> -				break;
> -		}
> +	/* Maybe the boot loader passed the MAC address via device
> tree */
> +	mac_addr = of_get_mac_address(dev->udev->dev.of_node);
> +	if (mac_addr) {
> +		memcpy(buf, mac_addr, ETH_ALEN);
>  	} else {
> -		ret = asix_read_cmd(dev, AX_CMD_READ_NODE_ID,
> -				0, 0, ETH_ALEN, buf, 0);
> -	}
> +		/* Try getting the MAC address from EEPROM */
> +		if (dev->driver_info->data & FLAG_EEPROM_MAC) {
> +			for (i = 0; i < (ETH_ALEN >> 1); i++) {
> +				ret = asix_read_cmd(dev,
> AX_CMD_READ_EEPROM,
> +						    0x04 + i, 0, 2,
> buf + i * 2,
> +						    0);
> +				if (ret < 0)
> +					break;
> +			}
> +		} else {
> +			ret = asix_read_cmd(dev,
> AX_CMD_READ_NODE_ID,
> +					    0, 0, ETH_ALEN, buf, 0);
> +		}
>  
> -	if (ret < 0) {
> -		netdev_dbg(dev->net, "Failed to read MAC address:
> %d\n", ret);
> -		return ret;
> +		if (ret < 0) {
> +			netdev_dbg(dev->net, "Failed to read MAC
> address: %d\n",
> +				   ret);
> +			return ret;
> +		}
>  	}
>  
>  	asix_set_netdev_dev_addr(dev, buf);

Now that you mentioned eth_platform_get_mac_address() on my igb attempt I am
wondering whether that would not also be the better approach for ASIX USB net
use. Looks like at least lan78xx already does it that way as well.

What do you think?

Cheers

Marcel

^ permalink raw reply

* Re: [PATCH net-next 3/5] sctp: add spp_ipv6_flowlabel and spp_dscp for sctp_paddrparams
From: Neil Horman @ 2018-06-25 11:28 UTC (permalink / raw)
  To: David Miller; +Cc: lucien.xin, netdev, linux-sctp, marcelo.leitner
In-Reply-To: <20180625.163126.1464757436422028440.davem@davemloft.net>

On Mon, Jun 25, 2018 at 04:31:26PM +0900, David Miller wrote:
> From: Xin Long <lucien.xin@gmail.com>
> Date: Mon, 25 Jun 2018 10:14:35 +0800
> 
> >  struct sctp_paddrparams {
> > @@ -773,6 +775,8 @@ struct sctp_paddrparams {
> >  	__u32			spp_pathmtu;
> >  	__u32			spp_sackdelay;
> >  	__u32			spp_flags;
> > +	__u32			spp_ipv6_flowlabel;
> > +	__u8			spp_dscp;
> >  } __attribute__((packed, aligned(4)));
> 
> I don't think you can change the size of this structure like this.
> 
> This check in sctp_setsockopt_peer_addr_params():
> 
> 	if (optlen != sizeof(struct sctp_paddrparams))
> 		return -EINVAL;
> 
> is going to trigger in old kernels when executing programs
> built against the new struct definition.
> 
I think thats also the reason its a packed aligned attribute, it can't be
changed, or older kernels won't be able to fill it out properly.
Neil

^ permalink raw reply

* [PATCH ipsec-next] xfrm: policy: remove pcpu policy cache
From: Florian Westphal @ 2018-06-25 11:57 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal

Kristian Evensen says:
  In a project I am involved in, we are running ipsec (Strongswan) on
  different mt7621-based routers. Each router is configured as an
  initiator and has around ~30 tunnels to different responders (running
  on misc. devices). Before the flow cache was removed (kernel 4.9), we
  got a combined throughput of around 70Mbit/s for all tunnels on one
  router. However, we recently switched to kernel 4.14 (4.14.48), and
  the total throughput is somewhere around 57Mbit/s (best-case). I.e., a
  drop of around 20%. Reverting the flow cache removal restores, as
  expected, performance levels to that of kernel 4.9.

When pcpu xdst exists, it has to be validated first before it can be
used.

A negative hit thus increases cost vs. no-cache.

As number of tunnels increases, hit rate decreases so this pcpu caching
isn't a viable strategy.

Furthermore, the xdst cache also needs to run with BH off, so when
removing this the bh disable/enable pairs can be removed too.

Kristian tested a 4.14.y backport of this change and reported
increased performance:

  In our tests, the throughput reduction has been reduced from around -20%
  to -5%. We also see that the overall throughput is independent of the
  number of tunnels, while before the throughput was reduced as the number
  of tunnels increased.

Reported-by: Kristian Evensen <kristian.evensen@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/net/xfrm.h     |   1 -
 net/xfrm/xfrm_device.c |  10 ----
 net/xfrm/xfrm_policy.c | 139 +------------------------------------------------
 net/xfrm/xfrm_state.c  |   5 +-
 4 files changed, 3 insertions(+), 152 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 557122846e0e..b4c79692d915 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -323,7 +323,6 @@ int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int fam
 void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo);
 void km_policy_notify(struct xfrm_policy *xp, int dir,
 		      const struct km_event *c);
-void xfrm_policy_cache_flush(void);
 void km_state_notify(struct xfrm_state *x, const struct km_event *c);
 
 struct xfrm_tmpl;
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 175941e15a6e..b03ef5d2b4c0 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -306,12 +306,6 @@ static int xfrm_dev_register(struct net_device *dev)
 	return xfrm_api_check(dev);
 }
 
-static int xfrm_dev_unregister(struct net_device *dev)
-{
-	xfrm_policy_cache_flush();
-	return NOTIFY_DONE;
-}
-
 static int xfrm_dev_feat_change(struct net_device *dev)
 {
 	return xfrm_api_check(dev);
@@ -322,7 +316,6 @@ static int xfrm_dev_down(struct net_device *dev)
 	if (dev->features & NETIF_F_HW_ESP)
 		xfrm_dev_state_flush(dev_net(dev), dev, true);
 
-	xfrm_policy_cache_flush();
 	return NOTIFY_DONE;
 }
 
@@ -334,9 +327,6 @@ static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void
 	case NETDEV_REGISTER:
 		return xfrm_dev_register(dev);
 
-	case NETDEV_UNREGISTER:
-		return xfrm_dev_unregister(dev);
-
 	case NETDEV_FEAT_CHANGE:
 		return xfrm_dev_feat_change(dev);
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 5f48251c1319..77d998c99541 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -45,8 +45,6 @@ struct xfrm_flo {
 	u8 flags;
 };
 
-static DEFINE_PER_CPU(struct xfrm_dst *, xfrm_last_dst);
-static struct work_struct *xfrm_pcpu_work __read_mostly;
 static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
 static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
 						__read_mostly;
@@ -1714,108 +1712,6 @@ static int xfrm_expand_policies(const struct flowi *fl, u16 family,
 
 }
 
-static void xfrm_last_dst_update(struct xfrm_dst *xdst, struct xfrm_dst *old)
-{
-	this_cpu_write(xfrm_last_dst, xdst);
-	if (old)
-		dst_release(&old->u.dst);
-}
-
-static void __xfrm_pcpu_work_fn(void)
-{
-	struct xfrm_dst *old;
-
-	old = this_cpu_read(xfrm_last_dst);
-	if (old && !xfrm_bundle_ok(old))
-		xfrm_last_dst_update(NULL, old);
-}
-
-static void xfrm_pcpu_work_fn(struct work_struct *work)
-{
-	local_bh_disable();
-	rcu_read_lock();
-	__xfrm_pcpu_work_fn();
-	rcu_read_unlock();
-	local_bh_enable();
-}
-
-void xfrm_policy_cache_flush(void)
-{
-	struct xfrm_dst *old;
-	bool found = false;
-	int cpu;
-
-	might_sleep();
-
-	local_bh_disable();
-	rcu_read_lock();
-	for_each_possible_cpu(cpu) {
-		old = per_cpu(xfrm_last_dst, cpu);
-		if (old && !xfrm_bundle_ok(old)) {
-			if (smp_processor_id() == cpu) {
-				__xfrm_pcpu_work_fn();
-				continue;
-			}
-			found = true;
-			break;
-		}
-	}
-
-	rcu_read_unlock();
-	local_bh_enable();
-
-	if (!found)
-		return;
-
-	get_online_cpus();
-
-	for_each_possible_cpu(cpu) {
-		bool bundle_release;
-
-		rcu_read_lock();
-		old = per_cpu(xfrm_last_dst, cpu);
-		bundle_release = old && !xfrm_bundle_ok(old);
-		rcu_read_unlock();
-
-		if (!bundle_release)
-			continue;
-
-		if (cpu_online(cpu)) {
-			schedule_work_on(cpu, &xfrm_pcpu_work[cpu]);
-			continue;
-		}
-
-		rcu_read_lock();
-		old = per_cpu(xfrm_last_dst, cpu);
-		if (old && !xfrm_bundle_ok(old)) {
-			per_cpu(xfrm_last_dst, cpu) = NULL;
-			dst_release(&old->u.dst);
-		}
-		rcu_read_unlock();
-	}
-
-	put_online_cpus();
-}
-
-static bool xfrm_xdst_can_reuse(struct xfrm_dst *xdst,
-				struct xfrm_state * const xfrm[],
-				int num)
-{
-	const struct dst_entry *dst = &xdst->u.dst;
-	int i;
-
-	if (xdst->num_xfrms != num)
-		return false;
-
-	for (i = 0; i < num; i++) {
-		if (!dst || dst->xfrm != xfrm[i])
-			return false;
-		dst = xfrm_dst_child(dst);
-	}
-
-	return xfrm_bundle_ok(xdst);
-}
-
 static struct xfrm_dst *
 xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
 			       const struct flowi *fl, u16 family,
@@ -1824,7 +1720,7 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
 	struct net *net = xp_net(pols[0]);
 	struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
 	struct xfrm_dst *bundle[XFRM_MAX_DEPTH];
-	struct xfrm_dst *xdst, *old;
+	struct xfrm_dst *xdst;
 	struct dst_entry *dst;
 	int err;
 
@@ -1836,22 +1732,6 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
 		return ERR_PTR(err);
 	}
 
-	xdst = this_cpu_read(xfrm_last_dst);
-	if (xdst &&
-	    xdst->u.dst.dev == dst_orig->dev &&
-	    xdst->num_pols == num_pols &&
-	    memcmp(xdst->pols, pols,
-		   sizeof(struct xfrm_policy *) * num_pols) == 0 &&
-	    xfrm_xdst_can_reuse(xdst, xfrm, err)) {
-		dst_hold(&xdst->u.dst);
-		xfrm_pols_put(pols, num_pols);
-		while (err > 0)
-			xfrm_state_put(xfrm[--err]);
-		return xdst;
-	}
-
-	old = xdst;
-
 	dst = xfrm_bundle_create(pols[0], xfrm, bundle, err, fl, dst_orig);
 	if (IS_ERR(dst)) {
 		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
@@ -1864,9 +1744,6 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
 	memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);
 	xdst->policy_genid = atomic_read(&pols[0]->genid);
 
-	atomic_set(&xdst->u.dst.__refcnt, 2);
-	xfrm_last_dst_update(xdst, old);
-
 	return xdst;
 }
 
@@ -2067,11 +1944,8 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir,
 	if (num_xfrms <= 0)
 		goto make_dummy_bundle;
 
-	local_bh_disable();
 	xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family,
 					      xflo->dst_orig);
-	local_bh_enable();
-
 	if (IS_ERR(xdst)) {
 		err = PTR_ERR(xdst);
 		if (err != -EAGAIN)
@@ -2158,11 +2032,9 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
 				goto no_transform;
 			}
 
-			local_bh_disable();
 			xdst = xfrm_resolve_and_create_bundle(
 					pols, num_pols, fl,
 					family, dst_orig);
-			local_bh_enable();
 
 			if (IS_ERR(xdst)) {
 				xfrm_pols_put(pols, num_pols);
@@ -2986,15 +2858,6 @@ static struct pernet_operations __net_initdata xfrm_net_ops = {
 
 void __init xfrm_init(void)
 {
-	int i;
-
-	xfrm_pcpu_work = kmalloc_array(NR_CPUS, sizeof(*xfrm_pcpu_work),
-				       GFP_KERNEL);
-	BUG_ON(!xfrm_pcpu_work);
-
-	for (i = 0; i < NR_CPUS; i++)
-		INIT_WORK(&xfrm_pcpu_work[i], xfrm_pcpu_work_fn);
-
 	register_pernet_subsys(&xfrm_net_ops);
 	xfrm_dev_init();
 	seqcount_init(&xfrm_policy_hash_generation);
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 8308281f3253..16a0efec727d 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -735,10 +735,9 @@ int xfrm_state_flush(struct net *net, u8 proto, bool task_valid)
 	}
 out:
 	spin_unlock_bh(&net->xfrm.xfrm_state_lock);
-	if (cnt) {
+	if (cnt)
 		err = 0;
-		xfrm_policy_cache_flush();
-	}
+
 	return err;
 }
 EXPORT_SYMBOL(xfrm_state_flush);
-- 
2.16.4

^ permalink raw reply related

* Re: Crash in netlink/sk_filter_trim_cap on ARMv7 on 4.18rc1
From: Peter Robinson @ 2018-06-25 12:03 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: Eric Dumazet, netdev, linux-arm-kernel, labbott
In-Reply-To: <04192986-0f46-f7a1-7f3f-86849aa73f1e@iogearbox.net>

On Mon, Jun 25, 2018 at 9:48 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> On 06/24/2018 11:24 AM, Peter Robinson wrote:
>>>> I'm seeing this netlink/sk_filter_trim_cap crash on ARMv7 across quite
>>>> a few ARMv7 platforms on Fedora with 4.18rc1. I've tested RPi2/RPi3
>>>> (doesn't happen on aarch64), AllWinner H3, BeagleBone and a few
>>>> others, both LPAE/normal kernels.
>>>>
>>>> I'm a bit out of my depth in this part of the kernel but I'm wondering
>>>> if it's known, I couldn't find anything that looked obvious on a few
>>>> mailing lists.
>>>>
>>>> Peter
>>>
>>> Hi Peter
>>>
>>> Could you provide symbolic information ?
>>
>> I passed in through scripts/decode_stacktrace.sh is that what you were after:
>>
>> [    8.673880] Internal error: Oops: a06 [#10] SMP ARM
>> [    8.673949] ---[ end trace 049df4786ea3140a ]---
>> [    8.678754] Modules linked in:
>> [    8.678766] CPU: 1 PID: 206 Comm: systemd-udevd Tainted: G      D
>>         4.18.0-0.rc1.git0.1.fc29.armv7hl+lpae #1
>> [    8.678769] Hardware name: Allwinner sun8i Family
>> [    8.678781] PC is at sk_filter_trim_cap ()
>> [    8.678790] LR is at   (null)
>> [    8.709463] pc : lr : psr: 60000013 ()
>> [    8.715722] sp : c996bd60  ip : 00000000  fp : 00000000
>> [    8.720939] r10: ee79dc00  r9 : c12c9f80  r8 : 00000000
>> [    8.726157] r7 : 00000000  r6 : 00000001  r5 : f1648000  r4 : 00000000
>> [    8.732674] r3 : 00000007  r2 : 00000000  r1 : 00000000  r0 : 00000000
>> [    8.739193] Flags: nZCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment user
>> [    8.746318] Control: 30c5387d  Table: 6e7bc880  DAC: ffe75ece
>> [    8.752055] Process systemd-udevd (pid: 206, stack limit = 0x(ptrval))
>> [    8.758574] Stack: (0xc996bd60 to 0xc996c000)
> [...]
>
> Should be fixed by (PR to Linus with fix is pending):
>
> https://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git/commit/?id=9262478220eac908ae6e168c3df2c453c87e2da3

Unfortunately it's not, building against the git checkout of the first
failed Fedora kernel (see rc2 issue below) it has the same effect :-(

I thought it might have been [1] because it touches bits of that code
but if I trying with rc2 and that reverted I got no output at all,
checking the vanilla Fedora build from friday (so almost rc2) it
doesn't boot at all either so I've got a second thing to investigate.

Peter

[1] https://lkml.org/lkml/2018/4/29/30

^ permalink raw reply

* [PATCH ipsec] xfrm: free skb if nlsk pointer is NULL
From: Florian Westphal @ 2018-06-25 12:00 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal

nlmsg_multicast() always frees the skb, so in case we cannot call
it we must do that ourselves.

Fixes: 21ee543edc0dea ("xfrm: fix race between netns cleanup and state expire notification")
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/xfrm/xfrm_user.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 080035f056d9..5fd44fadb195 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1025,10 +1025,12 @@ static inline int xfrm_nlmsg_multicast(struct net *net, struct sk_buff *skb,
 {
 	struct sock *nlsk = rcu_dereference(net->xfrm.nlsk);
 
-	if (nlsk)
-		return nlmsg_multicast(nlsk, skb, pid, group, GFP_ATOMIC);
-	else
-		return -1;
+	if (!nlsk) {
+		kfree_skb(skb);
+		return -EPIPE;
+	}
+
+	return nlmsg_multicast(nlsk, skb, pid, group, GFP_ATOMIC);
 }
 
 static inline unsigned int xfrm_spdinfo_msgsize(void)
-- 
2.16.1

^ permalink raw reply related

* [PATCH RESEND bpf-next v6 1/2] trace_helpers.c: Add helpers to poll multiple perf FDs for events
From: Toke Høiland-Jørgensen @ 2018-06-25 12:25 UTC (permalink / raw)
  To: netdev

Add two new helper functions to trace_helpers that supports polling
multiple perf file descriptors for events. These are used to the XDP
perf_event_output example, which needs to work with one perf fd per CPU.

Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
 tools/testing/selftests/bpf/trace_helpers.c |   48 ++++++++++++++++++++++++++-
 tools/testing/selftests/bpf/trace_helpers.h |    4 ++
 2 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
index 3868dcb63420..cabe2a3a3b30 100644
--- a/tools/testing/selftests/bpf/trace_helpers.c
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -88,7 +88,7 @@ static int page_size;
 static int page_cnt = 8;
 static struct perf_event_mmap_page *header;
 
-int perf_event_mmap(int fd)
+int perf_event_mmap_header(int fd, struct perf_event_mmap_page **header)
 {
 	void *base;
 	int mmap_size;
@@ -102,10 +102,15 @@ int perf_event_mmap(int fd)
 		return -1;
 	}
 
-	header = base;
+	*header = base;
 	return 0;
 }
 
+int perf_event_mmap(int fd)
+{
+	return perf_event_mmap_header(fd, &header);
+}
+
 static int perf_event_poll(int fd)
 {
 	struct pollfd pfd = { .fd = fd, .events = POLLIN };
@@ -163,3 +168,42 @@ int perf_event_poller(int fd, perf_event_print_fn output_fn)
 
 	return ret;
 }
+
+int perf_event_poller_multi(int *fds, struct perf_event_mmap_page **headers,
+			    int num_fds, perf_event_print_fn output_fn)
+{
+	enum bpf_perf_event_ret ret;
+	struct pollfd *pfds;
+	void *buf = NULL;
+	size_t len = 0;
+	int i;
+
+	pfds = calloc(num_fds, sizeof(*pfds));
+	if (!pfds)
+		return LIBBPF_PERF_EVENT_ERROR;
+
+	for (i = 0; i < num_fds; i++) {
+		pfds[i].fd = fds[i];
+		pfds[i].events = POLLIN;
+	}
+
+	for (;;) {
+		poll(pfds, num_fds, 1000);
+		for (i = 0; i < num_fds; i++) {
+			if (!pfds[i].revents)
+				continue;
+
+			ret = bpf_perf_event_read_simple(headers[i],
+							 page_cnt * page_size,
+							 page_size, &buf, &len,
+							 bpf_perf_event_print,
+							 output_fn);
+			if (ret != LIBBPF_PERF_EVENT_CONT)
+				break;
+		}
+	}
+	free(buf);
+	free(pfds);
+
+	return ret;
+}
diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h
index 3b4bcf7f5084..18924f23db1b 100644
--- a/tools/testing/selftests/bpf/trace_helpers.h
+++ b/tools/testing/selftests/bpf/trace_helpers.h
@@ -3,6 +3,7 @@
 #define __TRACE_HELPER_H
 
 #include <libbpf.h>
+#include <linux/perf_event.h>
 
 struct ksym {
 	long addr;
@@ -16,6 +17,9 @@ long ksym_get_addr(const char *name);
 typedef enum bpf_perf_event_ret (*perf_event_print_fn)(void *data, int size);
 
 int perf_event_mmap(int fd);
+int perf_event_mmap_header(int fd, struct perf_event_mmap_page **header);
 /* return LIBBPF_PERF_EVENT_DONE or LIBBPF_PERF_EVENT_ERROR */
 int perf_event_poller(int fd, perf_event_print_fn output_fn);
+int perf_event_poller_multi(int *fds, struct perf_event_mmap_page **headers,
+			    int num_fds, perf_event_print_fn output_fn);
 #endif

^ permalink raw reply related

* [PATCH RESEND bpf-next v6 2/2] samples/bpf: Add xdp_sample_pkts example
From: Toke Høiland-Jørgensen @ 2018-06-25 12:25 UTC (permalink / raw)
  To: netdev
In-Reply-To: <152992950237.15897.9894201421576854943.stgit@alrua-kau>

Add an example program showing how to sample packets from XDP using the
perf event buffer. The example userspace program just prints the ethernet
header for every packet sampled.

Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
 samples/bpf/Makefile               |    4 +
 samples/bpf/xdp_sample_pkts_kern.c |   66 ++++++++++++++
 samples/bpf/xdp_sample_pkts_user.c |  169 ++++++++++++++++++++++++++++++++++++
 3 files changed, 239 insertions(+)
 create mode 100644 samples/bpf/xdp_sample_pkts_kern.c
 create mode 100644 samples/bpf/xdp_sample_pkts_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 1303af10e54d..9ea2f7b64869 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -52,6 +52,7 @@ hostprogs-y += xdp_adjust_tail
 hostprogs-y += xdpsock
 hostprogs-y += xdp_fwd
 hostprogs-y += task_fd_query
+hostprogs-y += xdp_sample_pkts
 
 # Libbpf dependencies
 LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
@@ -107,6 +108,7 @@ xdp_adjust_tail-objs := xdp_adjust_tail_user.o
 xdpsock-objs := bpf_load.o xdpsock_user.o
 xdp_fwd-objs := bpf_load.o xdp_fwd_user.o
 task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
+xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS)
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -163,6 +165,7 @@ always += xdp_adjust_tail_kern.o
 always += xdpsock_kern.o
 always += xdp_fwd_kern.o
 always += task_fd_query_kern.o
+always += xdp_sample_pkts_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -179,6 +182,7 @@ HOSTCFLAGS_spintest_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_trace_event_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_sampleip_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_task_fd_query_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_xdp_sample_pkts_user.o += -I$(srctree)/tools/lib/bpf/
 
 HOST_LOADLIBES		+= $(LIBBPF) -lelf
 HOSTLOADLIBES_tracex4		+= -lrt
diff --git a/samples/bpf/xdp_sample_pkts_kern.c b/samples/bpf/xdp_sample_pkts_kern.c
new file mode 100644
index 000000000000..f7ca8b850978
--- /dev/null
+++ b/samples/bpf/xdp_sample_pkts_kern.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ptrace.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+#define SAMPLE_SIZE 64ul
+#define MAX_CPUS 128
+
+#define bpf_printk(fmt, ...)					\
+({								\
+	       char ____fmt[] = fmt;				\
+	       bpf_trace_printk(____fmt, sizeof(____fmt),	\
+				##__VA_ARGS__);			\
+})
+
+struct bpf_map_def SEC("maps") my_map = {
+	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(u32),
+	.max_entries = MAX_CPUS,
+};
+
+SEC("xdp_sample")
+int xdp_sample_prog(struct xdp_md *ctx)
+{
+	void *data_end = (void *)(long)ctx->data_end;
+	void *data = (void *)(long)ctx->data;
+
+	/* Metadata will be in the perf event before the packet data. */
+	struct S {
+		u16 cookie;
+		u16 pkt_len;
+	} __packed metadata;
+
+	if (data < data_end) {
+		/* The XDP perf_event_output handler will use the upper 32 bits
+		 * of the flags argument as a number of bytes to include of the
+		 * packet payload in the event data. If the size is too big, the
+		 * call to bpf_perf_event_output will fail and return -EFAULT.
+		 *
+		 * See bpf_xdp_event_output in net/core/filter.c.
+		 *
+		 * The BPF_F_CURRENT_CPU flag means that the event output fd
+		 * will be indexed by the CPU number in the event map.
+		 */
+		u64 flags = BPF_F_CURRENT_CPU;
+		u16 sample_size;
+		int ret;
+
+		metadata.cookie = 0xdead;
+		metadata.pkt_len = (u16)(data_end - data);
+		sample_size = min(metadata.pkt_len, SAMPLE_SIZE);
+		flags |= (u64)sample_size << 32;
+
+		ret = bpf_perf_event_output(ctx, &my_map, flags,
+					    &metadata, sizeof(metadata));
+		if (ret)
+			bpf_printk("perf_event_output failed: %d\n", ret);
+	}
+
+	return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/xdp_sample_pkts_user.c b/samples/bpf/xdp_sample_pkts_user.c
new file mode 100644
index 000000000000..8dd87c1eb560
--- /dev/null
+++ b/samples/bpf/xdp_sample_pkts_user.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/perf_event.h>
+#include <linux/bpf.h>
+#include <net/if.h>
+#include <errno.h>
+#include <assert.h>
+#include <sys/sysinfo.h>
+#include <sys/ioctl.h>
+#include <signal.h>
+#include <libbpf.h>
+#include <bpf/bpf.h>
+
+#include "perf-sys.h"
+#include "trace_helpers.h"
+
+#define MAX_CPUS 128
+static int pmu_fds[MAX_CPUS], if_idx;
+static struct perf_event_mmap_page *headers[MAX_CPUS];
+static char *if_name;
+
+static int do_attach(int idx, int fd, const char *name)
+{
+	int err;
+
+	err = bpf_set_link_xdp_fd(idx, fd, 0);
+	if (err < 0)
+		printf("ERROR: failed to attach program to %s\n", name);
+
+	return err;
+}
+
+static int do_detach(int idx, const char *name)
+{
+	int err;
+
+	err = bpf_set_link_xdp_fd(idx, -1, 0);
+	if (err < 0)
+		printf("ERROR: failed to detach program from %s\n", name);
+
+	return err;
+}
+
+#define SAMPLE_SIZE 64
+
+static int print_bpf_output(void *data, int size)
+{
+	struct {
+		__u16 cookie;
+		__u16 pkt_len;
+		__u8  pkt_data[SAMPLE_SIZE];
+	} __packed *e = data;
+	int i;
+
+	if (e->cookie != 0xdead) {
+		printf("BUG cookie %x sized %d\n",
+		       e->cookie, size);
+		return LIBBPF_PERF_EVENT_ERROR;
+	}
+
+	printf("Pkt len: %-5d bytes. Ethernet hdr: ", e->pkt_len);
+	for (i = 0; i < 14 && i < e->pkt_len; i++)
+		printf("%02x ", e->pkt_data[i]);
+	printf("\n");
+
+	return LIBBPF_PERF_EVENT_CONT;
+}
+
+static void test_bpf_perf_event(int map_fd, int num)
+{
+	struct perf_event_attr attr = {
+		.sample_type = PERF_SAMPLE_RAW,
+		.type = PERF_TYPE_SOFTWARE,
+		.config = PERF_COUNT_SW_BPF_OUTPUT,
+		.wakeup_events = 1, /* get an fd notification for every event */
+	};
+	int i;
+
+	for (i = 0; i < num; i++) {
+		int key = i;
+
+		pmu_fds[i] = sys_perf_event_open(&attr, -1/*pid*/, i/*cpu*/,
+						 -1/*group_fd*/, 0);
+
+		assert(pmu_fds[i] >= 0);
+		assert(bpf_map_update_elem(map_fd, &key,
+					   &pmu_fds[i], BPF_ANY) == 0);
+		ioctl(pmu_fds[i], PERF_EVENT_IOC_ENABLE, 0);
+	}
+}
+
+static void sig_handler(int signo)
+{
+	do_detach(if_idx, if_name);
+	exit(0);
+}
+
+int main(int argc, char **argv)
+{
+	struct bpf_prog_load_attr prog_load_attr = {
+		.prog_type	= BPF_PROG_TYPE_XDP,
+	};
+	struct bpf_object *obj;
+	struct bpf_map *map;
+	int prog_fd, map_fd;
+	char filename[256];
+	int ret, err, i;
+	int numcpus;
+
+	if (argc < 2) {
+		printf("Usage: %s <ifname>\n", argv[0]);
+		return 1;
+	}
+
+	numcpus = get_nprocs();
+	if (numcpus > MAX_CPUS)
+		numcpus = MAX_CPUS;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	prog_load_attr.file = filename;
+
+	if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
+		return 1;
+
+	if (!prog_fd) {
+		printf("load_bpf_file: %s\n", strerror(errno));
+		return 1;
+	}
+
+	map = bpf_map__next(NULL, obj);
+	if (!map) {
+		printf("finding a map in obj file failed\n");
+		return 1;
+	}
+	map_fd = bpf_map__fd(map);
+
+	if_idx = if_nametoindex(argv[1]);
+	if (!if_idx)
+		if_idx = strtoul(argv[1], NULL, 0);
+
+	if (!if_idx) {
+		fprintf(stderr, "Invalid ifname\n");
+		return 1;
+	}
+	if_name = argv[1];
+	err = do_attach(if_idx, prog_fd, argv[1]);
+	if (err)
+		return err;
+
+	if (signal(SIGINT, sig_handler) ||
+	    signal(SIGHUP, sig_handler) ||
+	    signal(SIGTERM, sig_handler)) {
+		perror("signal");
+		return 1;
+	}
+
+	test_bpf_perf_event(map_fd, numcpus);
+
+	for (i = 0; i < numcpus; i++)
+		if (perf_event_mmap_header(pmu_fds[i], &headers[i]) < 0)
+			return 1;
+
+	ret = perf_event_poller_multi(pmu_fds, headers, numcpus,
+				      print_bpf_output);
+	kill(0, SIGINT);
+	return ret;
+}

^ permalink raw reply related

* [v4, 00/10] Support DPAA PTP clock and timestamping
From: Yangbo Lu @ 2018-06-25 12:37 UTC (permalink / raw)
  To: netdev, madalin.bucur, Richard Cochran, Rob Herring, Shawn Guo,
	David S . Miller
  Cc: devicetree, linuxppc-dev, linux-arm-kernel, linux-kernel,
	Yangbo Lu

This patchset is to support DPAA FMAN PTP clock and HW timestamping.
It had been verified on both ARM platform and PPC platform.
- The patch #1 to patch #5 are to support DPAA FMAN 1588 timer in
  ptp_qoriq driver.
- The patch #6 to patch #10 are to add HW timestamping support in
  DPAA ethernet driver.

Yangbo Lu (10):
  fsl/fman: share the event interrupt
  ptp: support DPAA FMan 1588 timer in ptp_qoriq
  dt-binding: ptp_qoriq: add DPAA FMan support
  powerpc/mpc85xx: move ptp timer out of fman in dts
  arm64: dts: fsl: move ptp timer out of fman
  fsl/fman: add set_tstamp interface
  fsl/fman_port: support getting timestamp
  fsl/fman: define frame description command UPD
  dpaa_eth: add support for hardware timestamping
  dpaa_eth: add the get_ts_info interface for ethtool

 Documentation/devicetree/bindings/net/fsl-fman.txt |   25 +-----
 .../devicetree/bindings/ptp/ptp-qoriq.txt          |   15 +++-
 arch/arm64/boot/dts/freescale/qoriq-fman3-0.dtsi   |   14 ++-
 arch/powerpc/boot/dts/fsl/qoriq-fman-0.dtsi        |   14 ++-
 arch/powerpc/boot/dts/fsl/qoriq-fman-1.dtsi        |   14 ++-
 arch/powerpc/boot/dts/fsl/qoriq-fman3-0.dtsi       |   14 ++-
 arch/powerpc/boot/dts/fsl/qoriq-fman3-1.dtsi       |   14 ++-
 arch/powerpc/boot/dts/fsl/qoriq-fman3l-0.dtsi      |   14 ++-
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.c     |   88 ++++++++++++++++-
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.h     |    3 +
 drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c |   39 ++++++++
 drivers/net/ethernet/freescale/fman/fman.c         |    3 +-
 drivers/net/ethernet/freescale/fman/fman.h         |    1 +
 drivers/net/ethernet/freescale/fman/fman_dtsec.c   |   27 +++++
 drivers/net/ethernet/freescale/fman/fman_dtsec.h   |    1 +
 drivers/net/ethernet/freescale/fman/fman_memac.c   |    5 +
 drivers/net/ethernet/freescale/fman/fman_memac.h   |    1 +
 drivers/net/ethernet/freescale/fman/fman_port.c    |   12 +++
 drivers/net/ethernet/freescale/fman/fman_port.h    |    2 +
 drivers/net/ethernet/freescale/fman/fman_tgec.c    |   21 ++++
 drivers/net/ethernet/freescale/fman/fman_tgec.h    |    1 +
 drivers/net/ethernet/freescale/fman/mac.c          |    3 +
 drivers/net/ethernet/freescale/fman/mac.h          |    1 +
 drivers/ptp/Kconfig                                |    2 +-
 drivers/ptp/ptp_qoriq.c                            |  104 ++++++++++++-------
 include/linux/fsl/ptp_qoriq.h                      |   38 ++++++--
 26 files changed, 361 insertions(+), 115 deletions(-)

^ permalink raw reply

* [v4, 01/10] fsl/fman: share the event interrupt
From: Yangbo Lu @ 2018-06-25 12:37 UTC (permalink / raw)
  To: netdev, madalin.bucur, Richard Cochran, Rob Herring, Shawn Guo,
	David S . Miller
  Cc: devicetree, linuxppc-dev, linux-arm-kernel, linux-kernel,
	Yangbo Lu
In-Reply-To: <20180625123716.28993-1-yangbo.lu@nxp.com>

This patch is to share fman event interrupt because
the 1588 timer driver will also use this interrupt.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Acked-by: Madalin Bucur <madalin.bucur@nxp.com>
---
Changes for v2:
	- None.
Changes for v3:
	- None.
Changes for v4:
	- Added ACKs.
---
 drivers/net/ethernet/freescale/fman/fman.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fman/fman.c b/drivers/net/ethernet/freescale/fman/fman.c
index 9530405..c415ac6 100644
--- a/drivers/net/ethernet/freescale/fman/fman.c
+++ b/drivers/net/ethernet/freescale/fman/fman.c
@@ -2801,7 +2801,8 @@ static irqreturn_t fman_irq(int irq, void *handle)
 	of_node_put(muram_node);
 	of_node_put(fm_node);
 
-	err = devm_request_irq(&of_dev->dev, irq, fman_irq, 0, "fman", fman);
+	err = devm_request_irq(&of_dev->dev, irq, fman_irq, IRQF_SHARED,
+			       "fman", fman);
 	if (err < 0) {
 		dev_err(&of_dev->dev, "%s: irq %d allocation failed (error = %d)\n",
 			__func__, irq, err);
-- 
1.7.1

^ permalink raw reply related

* [v4, 02/10] ptp: support DPAA FMan 1588 timer in ptp_qoriq
From: Yangbo Lu @ 2018-06-25 12:37 UTC (permalink / raw)
  To: netdev, madalin.bucur, Richard Cochran, Rob Herring, Shawn Guo,
	David S . Miller
  Cc: devicetree, linuxppc-dev, linux-arm-kernel, linux-kernel,
	Yangbo Lu
In-Reply-To: <20180625123716.28993-1-yangbo.lu@nxp.com>

This patch is to support DPAA (Data Path Acceleration Architecture)
1588 timer by adding "fsl,fman-ptp-timer" compatible, sharing
interrupt with FMan, adding FSL_DPAA_ETH dependency, and fixing
up register offset.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Acked-by: Madalin Bucur <madalin.bucur@nxp.com>
---
Changes for v2:
	- None.
Changes for v3:
	- None.
Changes for v4:
	- Added ACKs.
---
 drivers/ptp/Kconfig           |    2 +-
 drivers/ptp/ptp_qoriq.c       |  104 ++++++++++++++++++++++++++---------------
 include/linux/fsl/ptp_qoriq.h |   38 ++++++++++++---
 3 files changed, 98 insertions(+), 46 deletions(-)

diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig
index 474c988..d137c48 100644
--- a/drivers/ptp/Kconfig
+++ b/drivers/ptp/Kconfig
@@ -43,7 +43,7 @@ config PTP_1588_CLOCK_DTE
 
 config PTP_1588_CLOCK_QORIQ
 	tristate "Freescale QorIQ 1588 timer as PTP clock"
-	depends on GIANFAR
+	depends on GIANFAR || FSL_DPAA_ETH
 	depends on PTP_1588_CLOCK
 	default y
 	help
diff --git a/drivers/ptp/ptp_qoriq.c b/drivers/ptp/ptp_qoriq.c
index e8652c1..a14c317 100644
--- a/drivers/ptp/ptp_qoriq.c
+++ b/drivers/ptp/ptp_qoriq.c
@@ -39,11 +39,12 @@
 /* Caller must hold qoriq_ptp->lock. */
 static u64 tmr_cnt_read(struct qoriq_ptp *qoriq_ptp)
 {
+	struct qoriq_ptp_registers *regs = &qoriq_ptp->regs;
 	u64 ns;
 	u32 lo, hi;
 
-	lo = qoriq_read(&qoriq_ptp->regs->tmr_cnt_l);
-	hi = qoriq_read(&qoriq_ptp->regs->tmr_cnt_h);
+	lo = qoriq_read(&regs->ctrl_regs->tmr_cnt_l);
+	hi = qoriq_read(&regs->ctrl_regs->tmr_cnt_h);
 	ns = ((u64) hi) << 32;
 	ns |= lo;
 	return ns;
@@ -52,16 +53,18 @@ static u64 tmr_cnt_read(struct qoriq_ptp *qoriq_ptp)
 /* Caller must hold qoriq_ptp->lock. */
 static void tmr_cnt_write(struct qoriq_ptp *qoriq_ptp, u64 ns)
 {
+	struct qoriq_ptp_registers *regs = &qoriq_ptp->regs;
 	u32 hi = ns >> 32;
 	u32 lo = ns & 0xffffffff;
 
-	qoriq_write(&qoriq_ptp->regs->tmr_cnt_l, lo);
-	qoriq_write(&qoriq_ptp->regs->tmr_cnt_h, hi);
+	qoriq_write(&regs->ctrl_regs->tmr_cnt_l, lo);
+	qoriq_write(&regs->ctrl_regs->tmr_cnt_h, hi);
 }
 
 /* Caller must hold qoriq_ptp->lock. */
 static void set_alarm(struct qoriq_ptp *qoriq_ptp)
 {
+	struct qoriq_ptp_registers *regs = &qoriq_ptp->regs;
 	u64 ns;
 	u32 lo, hi;
 
@@ -70,16 +73,18 @@ static void set_alarm(struct qoriq_ptp *qoriq_ptp)
 	ns -= qoriq_ptp->tclk_period;
 	hi = ns >> 32;
 	lo = ns & 0xffffffff;
-	qoriq_write(&qoriq_ptp->regs->tmr_alarm1_l, lo);
-	qoriq_write(&qoriq_ptp->regs->tmr_alarm1_h, hi);
+	qoriq_write(&regs->alarm_regs->tmr_alarm1_l, lo);
+	qoriq_write(&regs->alarm_regs->tmr_alarm1_h, hi);
 }
 
 /* Caller must hold qoriq_ptp->lock. */
 static void set_fipers(struct qoriq_ptp *qoriq_ptp)
 {
+	struct qoriq_ptp_registers *regs = &qoriq_ptp->regs;
+
 	set_alarm(qoriq_ptp);
-	qoriq_write(&qoriq_ptp->regs->tmr_fiper1, qoriq_ptp->tmr_fiper1);
-	qoriq_write(&qoriq_ptp->regs->tmr_fiper2, qoriq_ptp->tmr_fiper2);
+	qoriq_write(&regs->fiper_regs->tmr_fiper1, qoriq_ptp->tmr_fiper1);
+	qoriq_write(&regs->fiper_regs->tmr_fiper2, qoriq_ptp->tmr_fiper2);
 }
 
 /*
@@ -89,16 +94,17 @@ static void set_fipers(struct qoriq_ptp *qoriq_ptp)
 static irqreturn_t isr(int irq, void *priv)
 {
 	struct qoriq_ptp *qoriq_ptp = priv;
+	struct qoriq_ptp_registers *regs = &qoriq_ptp->regs;
 	struct ptp_clock_event event;
 	u64 ns;
 	u32 ack = 0, lo, hi, mask, val;
 
-	val = qoriq_read(&qoriq_ptp->regs->tmr_tevent);
+	val = qoriq_read(&regs->ctrl_regs->tmr_tevent);
 
 	if (val & ETS1) {
 		ack |= ETS1;
-		hi = qoriq_read(&qoriq_ptp->regs->tmr_etts1_h);
-		lo = qoriq_read(&qoriq_ptp->regs->tmr_etts1_l);
+		hi = qoriq_read(&regs->etts_regs->tmr_etts1_h);
+		lo = qoriq_read(&regs->etts_regs->tmr_etts1_l);
 		event.type = PTP_CLOCK_EXTTS;
 		event.index = 0;
 		event.timestamp = ((u64) hi) << 32;
@@ -108,8 +114,8 @@ static irqreturn_t isr(int irq, void *priv)
 
 	if (val & ETS2) {
 		ack |= ETS2;
-		hi = qoriq_read(&qoriq_ptp->regs->tmr_etts2_h);
-		lo = qoriq_read(&qoriq_ptp->regs->tmr_etts2_l);
+		hi = qoriq_read(&regs->etts_regs->tmr_etts2_h);
+		lo = qoriq_read(&regs->etts_regs->tmr_etts2_l);
 		event.type = PTP_CLOCK_EXTTS;
 		event.index = 1;
 		event.timestamp = ((u64) hi) << 32;
@@ -130,16 +136,16 @@ static irqreturn_t isr(int irq, void *priv)
 			hi = ns >> 32;
 			lo = ns & 0xffffffff;
 			spin_lock(&qoriq_ptp->lock);
-			qoriq_write(&qoriq_ptp->regs->tmr_alarm2_l, lo);
-			qoriq_write(&qoriq_ptp->regs->tmr_alarm2_h, hi);
+			qoriq_write(&regs->alarm_regs->tmr_alarm2_l, lo);
+			qoriq_write(&regs->alarm_regs->tmr_alarm2_h, hi);
 			spin_unlock(&qoriq_ptp->lock);
 			qoriq_ptp->alarm_value = ns;
 		} else {
-			qoriq_write(&qoriq_ptp->regs->tmr_tevent, ALM2);
+			qoriq_write(&regs->ctrl_regs->tmr_tevent, ALM2);
 			spin_lock(&qoriq_ptp->lock);
-			mask = qoriq_read(&qoriq_ptp->regs->tmr_temask);
+			mask = qoriq_read(&regs->ctrl_regs->tmr_temask);
 			mask &= ~ALM2EN;
-			qoriq_write(&qoriq_ptp->regs->tmr_temask, mask);
+			qoriq_write(&regs->ctrl_regs->tmr_temask, mask);
 			spin_unlock(&qoriq_ptp->lock);
 			qoriq_ptp->alarm_value = 0;
 			qoriq_ptp->alarm_interval = 0;
@@ -153,7 +159,7 @@ static irqreturn_t isr(int irq, void *priv)
 	}
 
 	if (ack) {
-		qoriq_write(&qoriq_ptp->regs->tmr_tevent, ack);
+		qoriq_write(&regs->ctrl_regs->tmr_tevent, ack);
 		return IRQ_HANDLED;
 	} else
 		return IRQ_NONE;
@@ -169,6 +175,7 @@ static int ptp_qoriq_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
 	u32 tmr_add;
 	int neg_adj = 0;
 	struct qoriq_ptp *qoriq_ptp = container_of(ptp, struct qoriq_ptp, caps);
+	struct qoriq_ptp_registers *regs = &qoriq_ptp->regs;
 
 	if (scaled_ppm < 0) {
 		neg_adj = 1;
@@ -186,7 +193,7 @@ static int ptp_qoriq_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
 
 	tmr_add = neg_adj ? tmr_add - diff : tmr_add + diff;
 
-	qoriq_write(&qoriq_ptp->regs->tmr_add, tmr_add);
+	qoriq_write(&regs->ctrl_regs->tmr_add, tmr_add);
 
 	return 0;
 }
@@ -250,6 +257,7 @@ static int ptp_qoriq_enable(struct ptp_clock_info *ptp,
 			      struct ptp_clock_request *rq, int on)
 {
 	struct qoriq_ptp *qoriq_ptp = container_of(ptp, struct qoriq_ptp, caps);
+	struct qoriq_ptp_registers *regs = &qoriq_ptp->regs;
 	unsigned long flags;
 	u32 bit, mask;
 
@@ -266,23 +274,23 @@ static int ptp_qoriq_enable(struct ptp_clock_info *ptp,
 			return -EINVAL;
 		}
 		spin_lock_irqsave(&qoriq_ptp->lock, flags);
-		mask = qoriq_read(&qoriq_ptp->regs->tmr_temask);
+		mask = qoriq_read(&regs->ctrl_regs->tmr_temask);
 		if (on)
 			mask |= bit;
 		else
 			mask &= ~bit;
-		qoriq_write(&qoriq_ptp->regs->tmr_temask, mask);
+		qoriq_write(&regs->ctrl_regs->tmr_temask, mask);
 		spin_unlock_irqrestore(&qoriq_ptp->lock, flags);
 		return 0;
 
 	case PTP_CLK_REQ_PPS:
 		spin_lock_irqsave(&qoriq_ptp->lock, flags);
-		mask = qoriq_read(&qoriq_ptp->regs->tmr_temask);
+		mask = qoriq_read(&regs->ctrl_regs->tmr_temask);
 		if (on)
 			mask |= PP1EN;
 		else
 			mask &= ~PP1EN;
-		qoriq_write(&qoriq_ptp->regs->tmr_temask, mask);
+		qoriq_write(&regs->ctrl_regs->tmr_temask, mask);
 		spin_unlock_irqrestore(&qoriq_ptp->lock, flags);
 		return 0;
 
@@ -313,10 +321,12 @@ static int qoriq_ptp_probe(struct platform_device *dev)
 {
 	struct device_node *node = dev->dev.of_node;
 	struct qoriq_ptp *qoriq_ptp;
+	struct qoriq_ptp_registers *regs;
 	struct timespec64 now;
 	int err = -ENOMEM;
 	u32 tmr_ctrl;
 	unsigned long flags;
+	void __iomem *base;
 
 	qoriq_ptp = kzalloc(sizeof(*qoriq_ptp), GFP_KERNEL);
 	if (!qoriq_ptp)
@@ -351,7 +361,7 @@ static int qoriq_ptp_probe(struct platform_device *dev)
 		pr_err("irq not in device tree\n");
 		goto no_node;
 	}
-	if (request_irq(qoriq_ptp->irq, isr, 0, DRIVER, qoriq_ptp)) {
+	if (request_irq(qoriq_ptp->irq, isr, IRQF_SHARED, DRIVER, qoriq_ptp)) {
 		pr_err("request_irq failed\n");
 		goto no_node;
 	}
@@ -368,12 +378,27 @@ static int qoriq_ptp_probe(struct platform_device *dev)
 
 	spin_lock_init(&qoriq_ptp->lock);
 
-	qoriq_ptp->regs = ioremap(qoriq_ptp->rsrc->start,
-				resource_size(qoriq_ptp->rsrc));
-	if (!qoriq_ptp->regs) {
+	base = ioremap(qoriq_ptp->rsrc->start,
+		       resource_size(qoriq_ptp->rsrc));
+	if (!base) {
 		pr_err("ioremap ptp registers failed\n");
 		goto no_ioremap;
 	}
+
+	qoriq_ptp->base = base;
+
+	if (of_device_is_compatible(node, "fsl,fman-ptp-timer")) {
+		qoriq_ptp->regs.ctrl_regs = base + FMAN_CTRL_REGS_OFFSET;
+		qoriq_ptp->regs.alarm_regs = base + FMAN_ALARM_REGS_OFFSET;
+		qoriq_ptp->regs.fiper_regs = base + FMAN_FIPER_REGS_OFFSET;
+		qoriq_ptp->regs.etts_regs = base + FMAN_ETTS_REGS_OFFSET;
+	} else {
+		qoriq_ptp->regs.ctrl_regs = base + CTRL_REGS_OFFSET;
+		qoriq_ptp->regs.alarm_regs = base + ALARM_REGS_OFFSET;
+		qoriq_ptp->regs.fiper_regs = base + FIPER_REGS_OFFSET;
+		qoriq_ptp->regs.etts_regs = base + ETTS_REGS_OFFSET;
+	}
+
 	ktime_get_real_ts64(&now);
 	ptp_qoriq_settime(&qoriq_ptp->caps, &now);
 
@@ -383,13 +408,14 @@ static int qoriq_ptp_probe(struct platform_device *dev)
 
 	spin_lock_irqsave(&qoriq_ptp->lock, flags);
 
-	qoriq_write(&qoriq_ptp->regs->tmr_ctrl,   tmr_ctrl);
-	qoriq_write(&qoriq_ptp->regs->tmr_add,    qoriq_ptp->tmr_add);
-	qoriq_write(&qoriq_ptp->regs->tmr_prsc,   qoriq_ptp->tmr_prsc);
-	qoriq_write(&qoriq_ptp->regs->tmr_fiper1, qoriq_ptp->tmr_fiper1);
-	qoriq_write(&qoriq_ptp->regs->tmr_fiper2, qoriq_ptp->tmr_fiper2);
+	regs = &qoriq_ptp->regs;
+	qoriq_write(&regs->ctrl_regs->tmr_ctrl,   tmr_ctrl);
+	qoriq_write(&regs->ctrl_regs->tmr_add,    qoriq_ptp->tmr_add);
+	qoriq_write(&regs->ctrl_regs->tmr_prsc,   qoriq_ptp->tmr_prsc);
+	qoriq_write(&regs->fiper_regs->tmr_fiper1, qoriq_ptp->tmr_fiper1);
+	qoriq_write(&regs->fiper_regs->tmr_fiper2, qoriq_ptp->tmr_fiper2);
 	set_alarm(qoriq_ptp);
-	qoriq_write(&qoriq_ptp->regs->tmr_ctrl,   tmr_ctrl|FIPERST|RTPE|TE|FRD);
+	qoriq_write(&regs->ctrl_regs->tmr_ctrl,   tmr_ctrl|FIPERST|RTPE|TE|FRD);
 
 	spin_unlock_irqrestore(&qoriq_ptp->lock, flags);
 
@@ -405,7 +431,7 @@ static int qoriq_ptp_probe(struct platform_device *dev)
 	return 0;
 
 no_clock:
-	iounmap(qoriq_ptp->regs);
+	iounmap(qoriq_ptp->base);
 no_ioremap:
 	release_resource(qoriq_ptp->rsrc);
 no_resource:
@@ -419,12 +445,13 @@ static int qoriq_ptp_probe(struct platform_device *dev)
 static int qoriq_ptp_remove(struct platform_device *dev)
 {
 	struct qoriq_ptp *qoriq_ptp = platform_get_drvdata(dev);
+	struct qoriq_ptp_registers *regs = &qoriq_ptp->regs;
 
-	qoriq_write(&qoriq_ptp->regs->tmr_temask, 0);
-	qoriq_write(&qoriq_ptp->regs->tmr_ctrl,   0);
+	qoriq_write(&regs->ctrl_regs->tmr_temask, 0);
+	qoriq_write(&regs->ctrl_regs->tmr_ctrl,   0);
 
 	ptp_clock_unregister(qoriq_ptp->clock);
-	iounmap(qoriq_ptp->regs);
+	iounmap(qoriq_ptp->base);
 	release_resource(qoriq_ptp->rsrc);
 	free_irq(qoriq_ptp->irq, qoriq_ptp);
 	kfree(qoriq_ptp);
@@ -434,6 +461,7 @@ static int qoriq_ptp_remove(struct platform_device *dev)
 
 static const struct of_device_id match_table[] = {
 	{ .compatible = "fsl,etsec-ptp" },
+	{ .compatible = "fsl,fman-ptp-timer" },
 	{},
 };
 MODULE_DEVICE_TABLE(of, match_table);
diff --git a/include/linux/fsl/ptp_qoriq.h b/include/linux/fsl/ptp_qoriq.h
index b462d9e..dc3dac4 100644
--- a/include/linux/fsl/ptp_qoriq.h
+++ b/include/linux/fsl/ptp_qoriq.h
@@ -11,9 +11,8 @@
 
 /*
  * qoriq ptp registers
- * Generated by regen.tcl on Thu May 13 01:38:57 PM CEST 2010
  */
-struct qoriq_ptp_registers {
+struct ctrl_regs {
 	u32 tmr_ctrl;     /* Timer control register */
 	u32 tmr_tevent;   /* Timestamp event register */
 	u32 tmr_temask;   /* Timer event mask register */
@@ -28,22 +27,47 @@ struct qoriq_ptp_registers {
 	u8  res1[4];
 	u32 tmroff_h;     /* Timer offset high */
 	u32 tmroff_l;     /* Timer offset low */
-	u8  res2[8];
+};
+
+struct alarm_regs {
 	u32 tmr_alarm1_h; /* Timer alarm 1 high register */
 	u32 tmr_alarm1_l; /* Timer alarm 1 high register */
 	u32 tmr_alarm2_h; /* Timer alarm 2 high register */
 	u32 tmr_alarm2_l; /* Timer alarm 2 high register */
-	u8  res3[48];
+};
+
+struct fiper_regs {
 	u32 tmr_fiper1;   /* Timer fixed period interval */
 	u32 tmr_fiper2;   /* Timer fixed period interval */
 	u32 tmr_fiper3;   /* Timer fixed period interval */
-	u8  res4[20];
+};
+
+struct etts_regs {
 	u32 tmr_etts1_h;  /* Timestamp of general purpose external trigger */
 	u32 tmr_etts1_l;  /* Timestamp of general purpose external trigger */
 	u32 tmr_etts2_h;  /* Timestamp of general purpose external trigger */
 	u32 tmr_etts2_l;  /* Timestamp of general purpose external trigger */
 };
 
+struct qoriq_ptp_registers {
+	struct ctrl_regs __iomem *ctrl_regs;
+	struct alarm_regs __iomem *alarm_regs;
+	struct fiper_regs __iomem *fiper_regs;
+	struct etts_regs __iomem *etts_regs;
+};
+
+/* Offset definitions for the four register groups */
+#define CTRL_REGS_OFFSET	0x0
+#define ALARM_REGS_OFFSET	0x40
+#define FIPER_REGS_OFFSET	0x80
+#define ETTS_REGS_OFFSET	0xa0
+
+#define FMAN_CTRL_REGS_OFFSET	0x80
+#define FMAN_ALARM_REGS_OFFSET	0xb8
+#define FMAN_FIPER_REGS_OFFSET	0xd0
+#define FMAN_ETTS_REGS_OFFSET	0xe0
+
+
 /* Bit definitions for the TMR_CTRL register */
 #define ALM1P                 (1<<31) /* Alarm1 output polarity */
 #define ALM2P                 (1<<30) /* Alarm2 output polarity */
@@ -105,10 +129,10 @@ struct qoriq_ptp_registers {
 #define DRIVER		"ptp_qoriq"
 #define DEFAULT_CKSEL	1
 #define N_EXT_TS	2
-#define REG_SIZE	sizeof(struct qoriq_ptp_registers)
 
 struct qoriq_ptp {
-	struct qoriq_ptp_registers __iomem *regs;
+	void __iomem *base;
+	struct qoriq_ptp_registers regs;
 	spinlock_t lock; /* protects regs */
 	struct ptp_clock *clock;
 	struct ptp_clock_info caps;
-- 
1.7.1

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox