exofs/ore: allocation of _ore_get_io

linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* exofs/ore: allocation of _ore_get_io_state()
@ 2012-04-16 14:53 Idan Kedar
  2012-05-23 21:00 ` Boaz Harrosh
  0 siblings, 1 reply; 4+ messages in thread
From: Idan Kedar @ 2012-04-16 14:53 UTC (permalink / raw)
  To: Linux FS Maling List

_ore_get_io_state is supposed to allocate a struct ore_io_state, which is
variable length. Setting aside the uglification argument which gave birth
to the patch "pnfs-obj: Uglify objio_segment allocation for the sake of
the principle :-(", the above function checks if the memory it needs to
allocate is bigger than a page, and if so, separates it into two
allocations. why is this done?

from include/linux/slab.h:
>/*
> * The largest kmalloc size supported by the slab allocators is
> * 32 megabyte (2^25) or the maximum allocatable page order if that is
> * less than 32 MB.
> *
> * WARNING: Its not easy to increase this value since the allocators have
> * to do various tricks to work around compiler limitations in order to
> * ensure proper constant folding.
> */
>#define KMALLOC_SHIFT_HIGH      ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \
>                                (MAX_ORDER + PAGE_SHIFT - 1) : 25)
>
>#define KMALLOC_MAX_SIZE        (1UL << KMALLOC_SHIFT_HIGH)
>#define KMALLOC_MAX_ORDER       (KMALLOC_SHIFT_HIGH - PAGE_SHIFT)

If kmalloc can allocate 32MB, why check one page, like so?

from fs/exofs/ore.c:
>struct __alloc_all_io_state {
>	struct ore_io_state ios;
>	struct ore_per_dev_state per_dev[numdevs];
>	union {
>		struct osd_sg_entry sglist[sgs_per_dev * numdevs];
>		struct page *pages[num_par_pages];
>	};
>} *_aios;
>
>if (likely(sizeof(*_aios) <= PAGE_SIZE)) {
>	_aios = kzalloc(sizeof(*_aios), GFP_KERNEL);
>	if (unlikely(!_aios)) {
>		EXOFS_DBGMSG("Failed kzalloc bytes=%zd\n",
>			   sizeof(*_aios));
>		*pios = NULL;
>		return -ENOMEM;
>	}
>	pages = num_par_pages ? _aios->pages : NULL;
>	sgilist = sgs_per_dev ? _aios->sglist : NULL;
>	ios = &_aios->ios;
>} else {
>	struct __alloc_small_io_state {
>		struct ore_io_state ios;
>		struct ore_per_dev_state per_dev[numdevs];
>	} *_aio_small;
>	union __extra_part {
>		struct osd_sg_entry sglist[sgs_per_dev * numdevs];
>		struct page *pages[num_par_pages];
>	} *extra_part;
>		_aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL);
>	if (unlikely(!_aio_small)) {
>		EXOFS_DBGMSG("Failed alloc first part bytes=%zd\n",
>			   sizeof(*_aio_small));
>		*pios = NULL;
>		return -ENOMEM;
>	}
>	extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL);
>	if (unlikely(!extra_part)) {
>		EXOFS_DBGMSG("Failed alloc second part bytes=%zd\n",
>			   sizeof(*extra_part));
>		kfree(_aio_small);
>		*pios = NULL;
>		return -ENOMEM;
>	}
>		pages = num_par_pages ? extra_part->pages : NULL;
>	sgilist = sgs_per_dev ? extra_part->sglist : NULL;
>	/* In this case the per_dev[0].sgilist holds the pointer to
>	 * be freed
>	 */
>	ios = &_aio_small->ios;
>	ios->extra_part_alloc = true;
>}

Is there any point to check if the memory is greater than 32MB?


-- 
Idan Kedar
Tonian

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: exofs/ore: allocation of _ore_get_io_state()
  2012-04-16 14:53 exofs/ore: allocation of _ore_get_io_state() Idan Kedar
@ 2012-05-23 21:00 ` Boaz Harrosh
  2012-05-24 11:23   ` Idan Kedar
  0 siblings, 1 reply; 4+ messages in thread
From: Boaz Harrosh @ 2012-05-23 21:00 UTC (permalink / raw)
  To: Idan Kedar; +Cc: Linux FS Maling List

On 04/16/2012 05:53 PM, Idan Kedar wrote:

> _ore_get_io_state is supposed to allocate a struct ore_io_state, which is
> variable length. Setting aside the uglification argument which gave birth
> to the patch "pnfs-obj: Uglify objio_segment allocation for the sake of
> the principle :-(", the above function checks if the memory it needs to
> allocate is bigger than a page, and if so, separates it into two
> allocations. why is this done?
> 
> from include/linux/slab.h:
>> /*
>> * The largest kmalloc size supported by the slab allocators is
>> * 32 megabyte (2^25) or the maximum allocatable page order if that is
>> * less than 32 MB.
>> *
>> * WARNING: Its not easy to increase this value since the allocators have
>> * to do various tricks to work around compiler limitations in order to
>> * ensure proper constant folding.
>> */
>> #define KMALLOC_SHIFT_HIGH      ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \
>>                                (MAX_ORDER + PAGE_SHIFT - 1) : 25)
>>
>> #define KMALLOC_MAX_SIZE        (1UL << KMALLOC_SHIFT_HIGH)
>> #define KMALLOC_MAX_ORDER       (KMALLOC_SHIFT_HIGH - PAGE_SHIFT)
> 
> If kmalloc can allocate 32MB, why check one page, like so?
> 
> from fs/exofs/ore.c:
>> struct __alloc_all_io_state {
>> 	struct ore_io_state ios;
>> 	struct ore_per_dev_state per_dev[numdevs];
>> 	union {
>> 		struct osd_sg_entry sglist[sgs_per_dev * numdevs];
>> 		struct page *pages[num_par_pages];
>> 	};
>> } *_aios;
>>
>> if (likely(sizeof(*_aios) <= PAGE_SIZE)) {
>> 	_aios = kzalloc(sizeof(*_aios), GFP_KERNEL);
>> 	if (unlikely(!_aios)) {
>> 		EXOFS_DBGMSG("Failed kzalloc bytes=%zd\n",
>> 			   sizeof(*_aios));
>> 		*pios = NULL;
>> 		return -ENOMEM;
>> 	}
>> 	pages = num_par_pages ? _aios->pages : NULL;
>> 	sgilist = sgs_per_dev ? _aios->sglist : NULL;
>> 	ios = &_aios->ios;
>> } else {
>> 	struct __alloc_small_io_state {
>> 		struct ore_io_state ios;
>> 		struct ore_per_dev_state per_dev[numdevs];
>> 	} *_aio_small;
>> 	union __extra_part {
>> 		struct osd_sg_entry sglist[sgs_per_dev * numdevs];
>> 		struct page *pages[num_par_pages];
>> 	} *extra_part;
>> 		_aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL);
>> 	if (unlikely(!_aio_small)) {
>> 		EXOFS_DBGMSG("Failed alloc first part bytes=%zd\n",
>> 			   sizeof(*_aio_small));
>> 		*pios = NULL;
>> 		return -ENOMEM;
>> 	}
>> 	extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL);
>> 	if (unlikely(!extra_part)) {
>> 		EXOFS_DBGMSG("Failed alloc second part bytes=%zd\n",
>> 			   sizeof(*extra_part));
>> 		kfree(_aio_small);
>> 		*pios = NULL;
>> 		return -ENOMEM;
>> 	}
>> 		pages = num_par_pages ? extra_part->pages : NULL;
>> 	sgilist = sgs_per_dev ? extra_part->sglist : NULL;
>> 	/* In this case the per_dev[0].sgilist holds the pointer to
>> 	 * be freed
>> 	 */
>> 	ios = &_aio_small->ios;
>> 	ios->extra_part_alloc = true;
>> }
> 
> Is there any point to check if the memory is greater than 32MB?
> 
> 


In theory it can allocate 32MB, in slab. I'm not sure about slob and slub.

But in practice contiguous physical pages allocation tends to fail very
fast on a system that was up a couple of hours. So we avoid it as plage.

Past testing with tables bigger than PAGE_SIZE on the IO path gave
catastrophic results. (Again once the system is up for a while and
had a chance to fragment physical address space)

The all Kernel point of the use of sg-lists is so not to allocate
contiguous physical pages and to not have to use virtual-memory.

This is done all over the Kernel. MAX_BIO_SIZE max-sg-table ...

(BTW I saw this mail by chance. If you direct it to me I see it
 for sure)

Cheers
Boaz

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: exofs/ore: allocation of _ore_get_io_state()
  2012-05-23 21:00 ` Boaz Harrosh
@ 2012-05-24 11:23   ` Idan Kedar
  2012-05-24 14:05     ` Boaz Harrosh
  0 siblings, 1 reply; 4+ messages in thread
From: Idan Kedar @ 2012-05-24 11:23 UTC (permalink / raw)
  To: Boaz Harrosh; +Cc: Linux FS Maling List

On Thu, May 24, 2012 at 12:00 AM, Boaz Harrosh <bharrosh@panasas.com> wrote:
> On 04/16/2012 05:53 PM, Idan Kedar wrote:
>
>> _ore_get_io_state is supposed to allocate a struct ore_io_state, which is
>> variable length. Setting aside the uglification argument which gave birth
>> to the patch "pnfs-obj: Uglify objio_segment allocation for the sake of
>> the principle :-(", the above function checks if the memory it needs to
>> allocate is bigger than a page, and if so, separates it into two
>> allocations. why is this done?
>>
>> from include/linux/slab.h:
>>> /*
>>> * The largest kmalloc size supported by the slab allocators is
>>> * 32 megabyte (2^25) or the maximum allocatable page order if that is
>>> * less than 32 MB.
>>> *
>>> * WARNING: Its not easy to increase this value since the allocators have
>>> * to do various tricks to work around compiler limitations in order to
>>> * ensure proper constant folding.
>>> */
>>> #define KMALLOC_SHIFT_HIGH      ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \
>>>                                (MAX_ORDER + PAGE_SHIFT - 1) : 25)
>>>
>>> #define KMALLOC_MAX_SIZE        (1UL << KMALLOC_SHIFT_HIGH)
>>> #define KMALLOC_MAX_ORDER       (KMALLOC_SHIFT_HIGH - PAGE_SHIFT)
>>
>> If kmalloc can allocate 32MB, why check one page, like so?
>>
>> from fs/exofs/ore.c:
>>> struct __alloc_all_io_state {
>>>      struct ore_io_state ios;
>>>      struct ore_per_dev_state per_dev[numdevs];
>>>      union {
>>>              struct osd_sg_entry sglist[sgs_per_dev * numdevs];
>>>              struct page *pages[num_par_pages];
>>>      };
>>> } *_aios;
>>>
>>> if (likely(sizeof(*_aios) <= PAGE_SIZE)) {
>>>      _aios = kzalloc(sizeof(*_aios), GFP_KERNEL);
>>>      if (unlikely(!_aios)) {
>>>              EXOFS_DBGMSG("Failed kzalloc bytes=%zd\n",
>>>                         sizeof(*_aios));
>>>              *pios = NULL;
>>>              return -ENOMEM;
>>>      }
>>>      pages = num_par_pages ? _aios->pages : NULL;
>>>      sgilist = sgs_per_dev ? _aios->sglist : NULL;
>>>      ios = &_aios->ios;
>>> } else {
>>>      struct __alloc_small_io_state {
>>>              struct ore_io_state ios;
>>>              struct ore_per_dev_state per_dev[numdevs];
>>>      } *_aio_small;
>>>      union __extra_part {
>>>              struct osd_sg_entry sglist[sgs_per_dev * numdevs];
>>>              struct page *pages[num_par_pages];
>>>      } *extra_part;
>>>              _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL);
>>>      if (unlikely(!_aio_small)) {
>>>              EXOFS_DBGMSG("Failed alloc first part bytes=%zd\n",
>>>                         sizeof(*_aio_small));
>>>              *pios = NULL;
>>>              return -ENOMEM;
>>>      }
>>>      extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL);
>>>      if (unlikely(!extra_part)) {
>>>              EXOFS_DBGMSG("Failed alloc second part bytes=%zd\n",
>>>                         sizeof(*extra_part));
>>>              kfree(_aio_small);
>>>              *pios = NULL;
>>>              return -ENOMEM;
>>>      }
>>>              pages = num_par_pages ? extra_part->pages : NULL;
>>>      sgilist = sgs_per_dev ? extra_part->sglist : NULL;
>>>      /* In this case the per_dev[0].sgilist holds the pointer to
>>>       * be freed
>>>       */
>>>      ios = &_aio_small->ios;
>>>      ios->extra_part_alloc = true;
>>> }
>>
>> Is there any point to check if the memory is greater than 32MB?
>>
>>
>
>
> In theory it can allocate 32MB, in slab. I'm not sure about slob and slub.
>
> But in practice contiguous physical pages allocation tends to fail very
> fast on a system that was up a couple of hours. So we avoid it as plage.
>
> Past testing with tables bigger than PAGE_SIZE on the IO path gave
> catastrophic results. (Again once the system is up for a while and
> had a chance to fragment physical address space)

What allocation sizes (of struct __alloc_all_io_state) are we talking
about? how many devices per I/O did you encounter?

>
> The all Kernel point of the use of sg-lists is so not to allocate
> contiguous physical pages and to not have to use virtual-memory.
>
> This is done all over the Kernel. MAX_BIO_SIZE max-sg-table ...

Why not use virtual memory? Is this limitation imposed by the OSD
initiator or by some other layer in the OSD stack?

>
> (BTW I saw this mail by chance. If you direct it to me I see it
>  for sure)
>
> Cheers
> Boaz

-- 
Idan Kedar
Tonian
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: exofs/ore: allocation of _ore_get_io_state()
  2012-05-24 11:23   ` Idan Kedar
@ 2012-05-24 14:05     ` Boaz Harrosh
  0 siblings, 0 replies; 4+ messages in thread
From: Boaz Harrosh @ 2012-05-24 14:05 UTC (permalink / raw)
  To: Idan Kedar; +Cc: Linux FS Maling List

On 05/24/2012 02:23 PM, Idan Kedar wrote:

> On Thu, May 24, 2012 at 12:00 AM, Boaz Harrosh <bharrosh@panasas.com> wrote:
>>> Is there any point to check if the memory is greater than 32MB?
>>>
>>>
>>
>>
>> In theory it can allocate 32MB, in slab. I'm not sure about slob and slub.
>>
>> But in practice contiguous physical pages allocation tends to fail very
>> fast on a system that was up a couple of hours. So we avoid it as plage.
>>
>> Past testing with tables bigger than PAGE_SIZE on the IO path gave
>> catastrophic results. (Again once the system is up for a while and
>> had a chance to fragment physical address space)
> 
> What allocation sizes (of struct __alloc_all_io_state) are we talking
> about? how many devices per I/O did you encounter?
> 

Personally I had it with scsi-lib's sg_table bigger than PAGE_SIZE
allocation. (Because of a bug) It is currently MAXed at PAGE_SIZE.
Other people reported same failures and great performance degradation
when allocating BIOs and BIO_VECs larger then PAGE_SIZE. 

It's simply the old and known page-fragmentation problem. It's
why virtual memory was invented in the first place.
kmalloc is not a virtual allocator.

>>
>> The all Kernel point of the use of sg-lists is so not to allocate
>> contiguous physical pages and to not have to use virtual-memory.
>>
>> This is done all over the Kernel. MAX_BIO_SIZE max-sg-table ...
> 
> Why not use virtual memory? Is this limitation imposed by the OSD
> initiator or by some other layer in the OSD stack?
> 

Welcome to Linux Kernel 101. vmalloc is ten fold slower than
kmalloc. And in principal the same will happen, multiple discrete
pages will be allocated, and collected together but now you will need
to set up a TLB entries, and make sure they are mapped in when needed.
(Every interrupt every context switch)

This single fact of "Linux Kernel code does not use VM" is a
10 fold speed gain over Windows Kernel, measured.

>>
>> (BTW I saw this mail by chance. If you direct it to me I see it
>>  for sure)
>>
>> Cheers
>> Boaz
> 

Boaz

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2012-05-24 14:05 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-04-16 14:53 exofs/ore: allocation of _ore_get_io_state() Idan Kedar
2012-05-23 21:00 ` Boaz Harrosh
2012-05-24 11:23   ` Idan Kedar
2012-05-24 14:05     ` Boaz Harrosh

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).