public inbox for linux-rdma@vger.kernel.org
 help / color / mirror / Atom feed
From: Sasha Khapyorsky <sashak-smomgflXvOZWk0Htik3J/w@public.gmane.org>
To: Hal Rosenstock <hnrose-Wuw85uim5zDR7s880joybQ@public.gmane.org>
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Subject: Re: [PATCHv4] opensm: Reduce heap consumption by unicast routing tables (LFTs)
Date: Tue, 13 Oct 2009 19:51:17 +0200	[thread overview]
Message-ID: <20091013175117.GR13830@me> (raw)
In-Reply-To: <20091013130002.GA20173-Wuw85uim5zDR7s880joybQ@public.gmane.org>

On 09:00 Tue 13 Oct     , Hal Rosenstock wrote:
> 
> Heap memory consumption by the unicast and multicast routing tables can be
> reduced.
> 
> Using valgrind --tool=massif (for heap profiling), there are couple of places that consume most of the heap memory:
> ->38.75% (11,206,656B) 0x43267E: osm_switch_new (osm_switch.c:134)
> ->12.89% (3,728,256B) 0x40F8C9: osm_mcast_tbl_init (osm_mcast_tbl.c:96)
> 
> osm_switch_new (osm_switch.c:108):
>        p_sw->lft = malloc(IB_LID_UCAST_END_HO + 1);
> 
> From ib_types.h
>  #define IB_LID_UCAST_END_HO 0xBFFF
> 
> The LFT can be allocated after LID assignment, once the number of LIDs
> assigned is known.
> 
> So it looks like for cluster of 2-4K with an LMC of 0 about 40% (!!!) of the
> heap memory can be saved:
> 
>  - 39% used by LFTs, each with 48K entries - SM can allocate 4K entries instead.
> 
> A similar subsequent change will do this for MFTs.
> 
> Signed-off-by: Hal Rosenstock <hal.rosenstock-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

Applied with few changes (see below). Thanks.

> ---
> Changes since v3:
> Fixed comparisons to lft_size to use max_lid_ho
> Modifications to alloc_lft
> - Simpler calculation of lft_size in units of LFT block size
> - Eliminated unneeded malloc case for LFT when no LFT
> 
> Changes since v2:
> Fixed alloc_lft handling of an existing lft
> Moved alloc_lft into osm_switch_prepare_path_rebuild 
> 
> Changes since v1:
> Basic approach changed to not do in chunks but rather allocate
> LFT once LID assignment is completed
> 
> diff --git a/opensm/include/opensm/osm_switch.h b/opensm/include/opensm/osm_switch.h
> index 502e5a7..655491d 100644
> --- a/opensm/include/opensm/osm_switch.h
> +++ b/opensm/include/opensm/osm_switch.h
> @@ -102,6 +102,7 @@ typedef struct osm_switch {
>  	osm_port_profile_t *p_prof;
>  	uint8_t *lft;
>  	uint8_t *new_lft;
> +	uint16_t lft_size;
>  	osm_mcast_tbl_t mcast_tbl;
>  	int32_t mft_block_num;
>  	uint32_t mft_position;
> @@ -405,7 +406,7 @@ uint8_t osm_switch_get_port_least_hops(IN const osm_switch_t * p_sw,
>  static inline uint8_t osm_switch_get_port_by_lid(IN const osm_switch_t * p_sw,
>  						 IN uint16_t lid_ho)
>  {
> -	if (lid_ho == 0 || lid_ho > IB_LID_UCAST_END_HO)
> +	if (lid_ho == 0 || lid_ho > p_sw->max_lid_ho)
>  		return OSM_NO_PATH;
>  	return p_sw->lft[lid_ho];
>  }
> @@ -711,7 +712,7 @@ osm_switch_set_lft_block(IN osm_switch_t * p_sw, IN const uint8_t * p_block,
>  		(uint16_t) (block_num * IB_SMP_DATA_SIZE);
>  	CL_ASSERT(p_sw);
>  
> -	if (lid_start + IB_SMP_DATA_SIZE > IB_LID_UCAST_END_HO)
> +	if (lid_start + IB_SMP_DATA_SIZE > p_sw->lft_size)
>  		return IB_INVALID_PARAMETER;
>  
>  	memcpy(&p_sw->lft[lid_start], p_block, IB_SMP_DATA_SIZE);
> diff --git a/opensm/opensm/osm_state_mgr.c b/opensm/opensm/osm_state_mgr.c
> index 185c700..c40c9d3 100644
> --- a/opensm/opensm/osm_state_mgr.c
> +++ b/opensm/opensm/osm_state_mgr.c
> @@ -1,6 +1,6 @@
>  /*
>   * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
> - * Copyright (c) 2002-2008 Mellanox Technologies LTD. All rights reserved.
> + * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved.
>   * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
>   * Copyright (c) 2009 HNR Consulting. All rights reserved.
>   *
> @@ -1011,9 +1011,9 @@ static void cleanup_switch(cl_map_item_t * item, void *log)
>  	if (!sw->new_lft)
>  		return;
>  
> -	if (memcmp(sw->lft, sw->new_lft, IB_LID_UCAST_END_HO + 1))
> +	if (memcmp(sw->lft, sw->new_lft, sw->lft_size))
>  		osm_log(log, OSM_LOG_ERROR, "ERR 331D: "
> -			"LFT of switch 0x%016" PRIx64 " is not up to date.\n",
> +			"LFT of switch 0x%016" PRIx64 " is not up to date\n",
>  			cl_ntoh64(sw->p_node->node_info.node_guid));
>  	else {
>  		free(sw->new_lft);
> diff --git a/opensm/opensm/osm_switch.c b/opensm/opensm/osm_switch.c
> index ad1018f..00aad5a 100644
> --- a/opensm/opensm/osm_switch.c
> +++ b/opensm/opensm/osm_switch.c
> @@ -1,6 +1,6 @@
>  /*
>   * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
> - * Copyright (c) 2002-2008 Mellanox Technologies LTD. All rights reserved.
> + * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved.
>   * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
>   * Copyright (c) 2009 HNR Consulting. All rights reserved.
>   *
> @@ -130,12 +130,6 @@ osm_switch_t *osm_switch_new(IN osm_node_t * p_node,
>  	p_sw->num_ports = num_ports;
>  	p_sw->need_update = 2;
>  
> -	p_sw->lft = malloc(IB_LID_UCAST_END_HO + 1);
> -	if (!p_sw->lft)
> -		goto err;
> -
> -	memset(p_sw->lft, OSM_NO_PATH, IB_LID_UCAST_END_HO + 1);
> -
>  	p_sw->p_prof = malloc(sizeof(*p_sw->p_prof) * num_ports);
>  	if (!p_sw->p_prof)
>  		goto err;
> @@ -166,7 +160,7 @@ boolean_t osm_switch_get_lft_block(IN const osm_switch_t * p_sw,
>  	CL_ASSERT(p_sw);
>  	CL_ASSERT(p_block);
>  
> -	if (base_lid_ho > p_sw->max_lid_ho)
> +	if (base_lid_ho >= p_sw->lft_size)

I'm reverting this change for discussed reason - we cannot refer
potentially outdated LFT entries (in case when sw->lft_size >
sw->max_lid_ho + 1).

>  		return FALSE;
>  
>  	CL_ASSERT(base_lid_ho + IB_SMP_DATA_SIZE <= IB_LID_UCAST_END_HO);
> @@ -498,21 +492,46 @@ void osm_switch_clear_hops(IN osm_switch_t * p_sw)
>  
>  /**********************************************************************
>   **********************************************************************/
> +static int alloc_lft(IN osm_switch_t * p_sw, uint16_t lids)
> +{
> +	uint16_t lft_size;
> +	uint8_t *new_lft;
> +
> +	lft_size = lids;

This line is not needed anymore.

Sasha

> +	/* Ensure LFT is in units of LFT block size */
> +	lft_size = (lids + IB_SMP_DATA_SIZE - 1) / IB_SMP_DATA_SIZE * IB_SMP_DATA_SIZE;
> +
> +	if (lft_size > p_sw->lft_size) {
> +		new_lft = realloc(p_sw->lft, lft_size);
> +		if (!new_lft)
> +			return -1;
> +		memset(new_lft + p_sw->lft_size, OSM_NO_PATH,
> +		       lft_size - p_sw->lft_size);
> +		p_sw->lft = new_lft;
> +		p_sw->lft_size = lft_size;
> +	}
> +	return 0;
> +}
> +
> +/**********************************************************************
> + **********************************************************************/
>  int osm_switch_prepare_path_rebuild(IN osm_switch_t * p_sw, IN uint16_t max_lids)
>  {
>  	uint8_t **hops;
>  	unsigned i;
>  
> +	if (alloc_lft(p_sw, max_lids))
> +		return -1;
> +
>  	for (i = 0; i < p_sw->num_ports; i++)
>  		osm_port_prof_construct(&p_sw->p_prof[i]);
>  
>  	osm_switch_clear_hops(p_sw);
>  
> -	if (!p_sw->new_lft &&
> -	    !(p_sw->new_lft = malloc(IB_LID_UCAST_END_HO + 1)))
> -		return IB_INSUFFICIENT_MEMORY;
> +	if (!(p_sw->new_lft = realloc(p_sw->new_lft, p_sw->lft_size)))
> +		return -1;
>  
> -	memset(p_sw->new_lft, OSM_NO_PATH, IB_LID_UCAST_END_HO + 1);
> +	memset(p_sw->new_lft, OSM_NO_PATH, p_sw->lft_size);
>  
>  	if (!p_sw->hops) {
>  		hops = malloc((max_lids + 1) * sizeof(hops[0]));
> diff --git a/opensm/opensm/osm_ucast_cache.c b/opensm/opensm/osm_ucast_cache.c
> index 6d3c53e..31a5333 100644
> --- a/opensm/opensm/osm_ucast_cache.c
> +++ b/opensm/opensm/osm_ucast_cache.c
> @@ -1079,10 +1079,10 @@ int osm_ucast_cache_process(osm_ucast_mgr_t * p_mgr)
>  			/* no new routing was recently calculated for this
>  			   switch, but the LFT needs to be updated anyway */
>  			p_sw->new_lft = p_sw->lft;
> -			p_sw->lft = malloc(IB_LID_UCAST_END_HO + 1);
> +			p_sw->lft = malloc(p_sw->lft_size);
>  			if (!p_sw->lft)
>  				return IB_INSUFFICIENT_MEMORY;
> -			memset(p_sw->lft, OSM_NO_PATH, IB_LID_UCAST_END_HO + 1);
> +			memset(p_sw->lft, OSM_NO_PATH, p_sw->lft_size);
>  		}
>  
>  	}
> diff --git a/opensm/opensm/osm_ucast_file.c b/opensm/opensm/osm_ucast_file.c
> index 5b73ca5..7c1b5ba 100644
> --- a/opensm/opensm/osm_ucast_file.c
> +++ b/opensm/opensm/osm_ucast_file.c
> @@ -193,8 +193,7 @@ static int do_ucast_file_load(void *context)
>  					cl_ntoh64(sw_guid));
>  				continue;
>  			}
> -			memset(p_sw->new_lft, OSM_NO_PATH,
> -			       IB_LID_UCAST_END_HO + 1);
> +			memset(p_sw->new_lft, OSM_NO_PATH, p_sw->lft_size);
>  		} else if (p_sw && !strncmp(p, "0x", 2)) {
>  			p += 2;
>  			lid = (uint16_t) strtoul(p, &q, 16);
> diff --git a/opensm/opensm/osm_ucast_ftree.c b/opensm/opensm/osm_ucast_ftree.c
> index 1defd95..885c834 100644
> --- a/opensm/opensm/osm_ucast_ftree.c
> +++ b/opensm/opensm/osm_ucast_ftree.c
> @@ -566,7 +566,7 @@ static ftree_sw_t *sw_create(IN ftree_fabric_t * p_ftree,
>  		return NULL;
>  
>  	/* initialize lft buffer */
> -	memset(p_osm_sw->new_lft, OSM_NO_PATH, IB_LID_UCAST_END_HO + 1);
> +	memset(p_osm_sw->new_lft, OSM_NO_PATH, p_osm_sw->lft_size);
>  	p_sw->hops = malloc((p_osm_sw->max_lid_ho + 1) * sizeof(*(p_sw->hops)));
>  	if (p_sw->hops == NULL)
>  		return NULL;
> diff --git a/opensm/opensm/osm_ucast_lash.c b/opensm/opensm/osm_ucast_lash.c
> index 8b0a190..8015226 100644
> --- a/opensm/opensm/osm_ucast_lash.c
> +++ b/opensm/opensm/osm_ucast_lash.c
> @@ -994,7 +994,7 @@ static void populate_fwd_tbls(lash_t * p_lash)
>  
>  	p_next_sw = (osm_switch_t *) cl_qmap_head(&p_subn->sw_guid_tbl);
>  
> -	/* Go through each swtich individually */
> +	/* Go through each switch individually */
>  	while (p_next_sw != (osm_switch_t *) cl_qmap_end(&p_subn->sw_guid_tbl)) {
>  		uint64_t current_guid;
>  		switch_t *sw;
> @@ -1005,7 +1005,7 @@ static void populate_fwd_tbls(lash_t * p_lash)
>  		current_guid = p_sw->p_node->node_info.port_guid;
>  		sw = p_sw->priv;
>  
> -		memset(p_sw->new_lft, OSM_NO_PATH, IB_LID_UCAST_END_HO + 1);
> +		memset(p_sw->new_lft, OSM_NO_PATH, p_sw->lft_size);
>  
>  		for (lid = 1; lid <= max_lid_ho; lid++) {
>  			port = cl_ptr_vector_get(&p_subn->port_lid_tbl, lid);
> diff --git a/opensm/opensm/osm_ucast_mgr.c b/opensm/opensm/osm_ucast_mgr.c
> index be37df9..d0e85b1 100644
> --- a/opensm/opensm/osm_ucast_mgr.c
> +++ b/opensm/opensm/osm_ucast_mgr.c
> @@ -372,7 +372,7 @@ static void ucast_mgr_process_tbl(IN cl_map_item_t * p_map_item,
>  		cl_ntoh64(osm_node_get_node_guid(p_sw->p_node)));
>  
>  	/* Initialize LIDs in buffer to invalid port number. */
> -	memset(p_sw->new_lft, OSM_NO_PATH, IB_LID_UCAST_END_HO + 1);
> +	memset(p_sw->new_lft, OSM_NO_PATH, p_sw->max_lid_ho + 1);
>  
>  	if (p_mgr->p_subn->opt.lmc)
>  		alloc_ports_priv(p_mgr);
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

  parent reply	other threads:[~2009-10-13 17:51 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-10-13 13:00 [PATCHv4] opensm: Reduce heap consumption by unicast routing tables (LFTs) Hal Rosenstock
     [not found] ` <20091013130002.GA20173-Wuw85uim5zDR7s880joybQ@public.gmane.org>
2009-10-13 17:51   ` Sasha Khapyorsky [this message]
2009-10-13 17:56   ` [PATCH] opensm/osm_state_mgr.c: in cleanup_switch() check only relevant LFT part Sasha Khapyorsky

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20091013175117.GR13830@me \
    --to=sashak-smomgflxvozwk0htik3j/w@public.gmane.org \
    --cc=hnrose-Wuw85uim5zDR7s880joybQ@public.gmane.org \
    --cc=linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox