linux-raid.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* strange raid5
@ 2005-07-23  1:58 Ming Zhang
  2005-07-23  6:14 ` Tyler
  0 siblings, 1 reply; 5+ messages in thread
From: Ming Zhang @ 2005-07-23  1:58 UTC (permalink / raw)
  To: Linux RAID

[-- Attachment #1: Type: text/plain, Size: 1587 bytes --]

i created a 32KB chunk size 3 disk raid5. then write this disk with a
small code i wrote. i found that even i write it with 1048756 in unit,
which is multiple of stripe size, it still has a lot of read when seen
from iostat. 

any idea? thanks!

i attached the code for reference.

[root@bakstor2u root]# cat /proc/mdstat
Personalities : [linear] [raid0] [raid1] [raid5] [multipath] [raid6]
[raid10] [faulty]
md0 : active raid5 sdc[2] sdb[1] sda[0]
      781422592 blocks level 5, 32k chunk, algorithm 2 [3/3] [UUU]

unused devices: <none>
[root@bakstor2u root]# ./write /dev/md0 1048576 1000
 1048576Bytes *     1000 :    34.745MB/s


avg-cpu:  %user   %nice    %sys %iowait   %idle
           0.00    0.00   17.17   82.83    0.00

Device:            tps   Blk_read/s   Blk_wrtn/s   Blk_read   Blk_wrtn
hda               0.00         0.00         0.00          0          0
hdc               0.00         0.00         0.00          0          0
md0            8791.92         0.00     70335.35          0      69632
sda             605.05       387.88     35143.43        384      34792
sdb             611.11       323.23     35143.43        320      34792
sdc             602.02       387.88     35143.43        384      34792
sdd               0.00         0.00         0.00          0          0
sde               0.00         0.00         0.00          0          0
sdf               0.00         0.00         0.00          0          0
sdg               0.00         0.00         0.00          0          0
sdh               0.00         0.00         0.00          0          0



[-- Attachment #2: write.c --]
[-- Type: text/x-csrc, Size: 1632 bytes --]

/*
 * I need a small program to write in various mode.
 */
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <unistd.h>

/*
 * Parse a strictly positive decimal integer that fits in an int.
 * Returns 0 and stores the value in *out on success, -1 on any
 * malformed, non-positive or out-of-range input.
 */
static int parse_positive_int(const char *s, int *out)
{
	char *end;
	long v;

	errno = 0;
	v = strtol(s, &end, 10);
	if (end == s || *end != '\0' || errno == ERANGE || v <= 0 || v > INT_MAX)
		return -1;
	*out = (int)v;
	return 0;
}

/*
 * Write exactly len bytes from buf to fd, retrying after short writes
 * and EINTR.  Returns 0 on success, -1 on failure with errno set.
 */
static int write_full(int fd, const char *buf, size_t len)
{
	while (len > 0) {
		ssize_t n = write(fd, buf, len);

		if (n < 0) {
			if (errno == EINTR)
				continue;
			return -1;
		}
		if (n == 0) {
			/* No progress and no error: bail out instead of spinning. */
			errno = EIO;
			return -1;
		}
		buf += n;
		len -= (size_t)n;
	}
	return 0;
}

/*
 * Sequential write benchmark.
 *
 * Usage: <prog> <fn> <strip size> <strip count> [s/a [t]]
 *   fn          file or block device to write to (created if missing)
 *   strip size  bytes per write() call, must be > 0
 *   strip count number of writes, must be > 0
 *   s/a         'a' opens without O_SYNC; anything else keeps O_SYNC
 *   t           record and print the latency of each write in microseconds
 *
 * Prints the overall throughput in MB/s (1 MB = 1048576 bytes); the timed
 * interval includes the final close().  Returns 0 on success, 1 on any
 * argument, allocation, open, write or close failure.
 */
int main(int argc, char *argv[])
{
	int size, cnt;
	char *buf;
	const char *fn;
	int fid, i;
	struct timeval tv1, tv2, ttv1, ttv2;
	double secs, rate;
	int use_sync = 1;	/* not named "sync": that would shadow sync(2) */
	int timing = 0;
	unsigned long *t = NULL;

	if (argc < 4) {
		fprintf(stderr, "%s <fn> <strip size> <strip count> [s/a [t]]\n", argv[0]);
		return 1;
	}
	fn = argv[1];
	if (parse_positive_int(argv[2], &size) != 0 ||
	    parse_positive_int(argv[3], &cnt) != 0) {
		fprintf(stderr, "invalid strip size or strip count\n");
		return 1;
	}
	if ((argc >= 5) && (argv[4][0] == 'a'))
		use_sync = 0;
	if ((argc >= 6) && (argv[5][0] == 't'))
		timing = 1;

	if (timing) {
		/* calloc checks cnt * sizeof *t for overflow. */
		t = calloc((size_t)cnt, sizeof *t);
		if (!t) {
			fprintf(stderr, "fail to get mem for t\n");
			return 1;
		}
	}
	/* Zero-filled so we never push stale heap contents to the device. */
	buf = calloc((size_t)size, sizeof *buf);
	if (!buf) {
		fprintf(stderr, "fail to get memory\n");
		free(t);
		return 1;
	}
	fid = open(fn, O_CREAT | O_WRONLY | (use_sync ? O_SYNC : 0), S_IRWXU);
	if (fid == -1) {
		fprintf(stderr, "open file fail\n");
		free(buf);
		free(t);
		return 1;
	}

	gettimeofday(&tv1, NULL);
	for (i = 0; i < cnt; i++) {
		int rc;

		if (timing) {
			gettimeofday(&ttv1, NULL);
			rc = write_full(fid, buf, (size_t)size);
			gettimeofday(&ttv2, NULL);
			t[i] = (unsigned long)((ttv2.tv_sec - ttv1.tv_sec) * 1000000L
					       + (ttv2.tv_usec - ttv1.tv_usec));
		} else {
			rc = write_full(fid, buf, (size_t)size);
		}
		if (rc != 0) {
			fprintf(stderr, "write fail at strip %d: %s\n", i, strerror(errno));
			close(fid);
			free(buf);
			free(t);
			return 1;
		}
	}
	if (close(fid) != 0) {
		fprintf(stderr, "close file fail: %s\n", strerror(errno));
		free(buf);
		free(t);
		return 1;
	}
	gettimeofday(&tv2, NULL);

	secs = (double)(tv2.tv_sec - tv1.tv_sec)
		+ (double)(tv2.tv_usec - tv1.tv_usec) * 0.000001;
	/* Multiply in double: size * cnt can exceed INT_MAX. */
	rate = (secs > 0.0) ? (((double)size * (double)cnt) / 1048576.0) / secs : 0.0;
	printf("%8dBytes * %8d :%10.3fMB/s\n", size, cnt, rate);

	if (timing) {
		for (i = 0; i < cnt; i++) {
			printf("%8lu", t[i]);
			/* Break the line after every tenth sample. */
			if ((i % 10) == 9)
				printf("\n");
		}
		/* Terminate a partially filled last row. */
		if ((cnt % 10) != 0)
			printf("\n");
	}

	free(buf);
	free(t);
	return 0;
}


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: strange raid5
  2005-07-23  1:58 strange raid5 Ming Zhang
@ 2005-07-23  6:14 ` Tyler
  2005-07-23 13:38   ` Ming Zhang
  0 siblings, 1 reply; 5+ messages in thread
From: Tyler @ 2005-07-23  6:14 UTC (permalink / raw)
  To: mingz; +Cc: Linux RAID

By my calculations, 1048756 is *not* a multiple of 32768 (32 
Kilobytes).  Did I miscalculate?

Regards,
Tyler.

Ming Zhang wrote:

>i created a 32KB chunk size 3 disk raid5. then write this disk with a
>small code i wrote. i found that even i write it with 1048756 in unit,
>which is multiple of stripe size, it still has a lot of read when seen
>from iostat. 
>
>any idea? thanks!
>
>i attached the code for reference.
>
>[root@bakstor2u root]# cat /proc/mdstat
>Personalities : [linear] [raid0] [raid1] [raid5] [multipath] [raid6]
>[raid10] [faulty]
>md0 : active raid5 sdc[2] sdb[1] sda[0]
>      781422592 blocks level 5, 32k chunk, algorithm 2 [3/3] [UUU]
>
>unused devices: <none>
>[root@bakstor2u root]# ./write /dev/md0 1048576 1000
> 1048576Bytes *     1000 :    34.745MB/s
>
>
>avg-cpu:  %user   %nice    %sys %iowait   %idle
>           0.00    0.00   17.17   82.83    0.00
>
>Device:            tps   Blk_read/s   Blk_wrtn/s   Blk_read   Blk_wrtn
>hda               0.00         0.00         0.00          0          0
>hdc               0.00         0.00         0.00          0          0
>md0            8791.92         0.00     70335.35          0      69632
>sda             605.05       387.88     35143.43        384      34792
>sdb             611.11       323.23     35143.43        320      34792
>sdc             602.02       387.88     35143.43        384      34792
>sdd               0.00         0.00         0.00          0          0
>sde               0.00         0.00         0.00          0          0
>sdf               0.00         0.00         0.00          0          0
>sdg               0.00         0.00         0.00          0          0
>sdh               0.00         0.00         0.00          0          0
>
>
>  
>
>------------------------------------------------------------------------
>
>/*
> * I need a small program to write in various mode.
> */
>#include <stdio.h>
>#include <stdlib.h>
>#include <sys/types.h>
>#include <sys/stat.h>
>#include <sys/time.h>
>#include <fcntl.h>
>
>int main(int argc, char *argv[])
>{
>	int size, cnt;
>	char *buf;
>	char *fn;
>	int fid, i;
>	struct timeval tv1, tv2, ttv1, ttv2;
>	double x;
>	int sync = 1;
>	int timing = 0;
>	unsigned long *t;
>
>	if (argc < 4) {
>		printf("%s <fn> <strip size> <strip count> [s/a [t]]\n", argv[0]);
>		exit(1);
>	}
>	fn = argv[1];
>	size = atoi(argv[2]);
>	cnt = atoi(argv[3]);
>	if ((argc >=5) && (argv[4][0] == 'a'))
>		sync = 0;
>	if ((argc >=6) && (argv[5][0] == 't'))
>		timing = 1;
>	if (timing) {
>		t = (unsigned long *)malloc(sizeof(double) * cnt);
>		if (!t) {
>			printf("fail to get mem for t\n");
>			exit(1);
>		}
>	}
>	buf = malloc(size * sizeof(char));
>	if (!buf) {
>		printf("fail to get memory\n");
>		exit(1);
>	}
>	fid = open(fn, O_CREAT|O_WRONLY|(sync ? O_SYNC : 0), S_IRWXU);
>	if (fid == -1) {
>		printf("open file fail\n");
>		exit(1);
>	}
>	gettimeofday(&tv1, NULL);
>	for (i = 0; i < cnt; i++) {
>		if (timing) {
>			gettimeofday(&ttv1, NULL);
>			write(fid, buf, size);
>			gettimeofday(&ttv2, NULL);
>			t[i] = (ttv2.tv_sec - ttv1.tv_sec) * 1000000 + ttv2.tv_usec - ttv1.tv_usec;
>		} else
>			write(fid, buf, size);
>	}
>	close(fid);
>	gettimeofday(&tv2, NULL);
>	x = (tv2.tv_sec - tv1.tv_sec) + ((double)(tv2.tv_usec - tv1.tv_usec)) 
>		* 0.000001;
>	x = ((double)(size * cnt) / 1048576.0) / x;
>	printf("%8dBytes * %8d :%10.3fMB/s\n", size, cnt, x);
>	if (timing) {
>		for (i = 0; i < cnt; i++) {
>			printf("%8ld", t[i]);
>			if (!(i % 10))
>				printf("\n");
>		}
>		printf("\n");
>	}
>}
>
>  
>
>------------------------------------------------------------------------
>
>No virus found in this incoming message.
>Checked by AVG Anti-Virus.
>Version: 7.0.323 / Virus Database: 267.9.2/55 - Release Date: 7/21/2005
>  
>

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: strange raid5
  2005-07-23  6:14 ` Tyler
@ 2005-07-23 13:38   ` Ming Zhang
  2005-07-23 21:37     ` Neil Brown
  0 siblings, 1 reply; 5+ messages in thread
From: Ming Zhang @ 2005-07-23 13:38 UTC (permalink / raw)
  To: Tyler; +Cc: Linux RAID

1048576 = 1024 * 1024 = 32 * 32768. :)

so it should be 32 stripe writes.

ming

On Fri, 2005-07-22 at 23:14 -0700, Tyler wrote:
> By my calculations, 1048756 is *not* a multiple of 32768 (32 
> Kilobytes).  Did I miscalculate?
> 
> Regards,
> Tyler.
> 
> Ming Zhang wrote:
> 
> >i created a 32KB chunk size 3 disk raid5. then write this disk with a
> >small code i wrote. i found that even i write it with 1048756 in unit,
> >which is multiple of stripe size, it still has a lot of read when seen
> >from iostat. 
> >
> >any idea? thanks!
> >
> >i attached the code for reference.
> >
> >[root@bakstor2u root]# cat /proc/mdstat
> >Personalities : [linear] [raid0] [raid1] [raid5] [multipath] [raid6]
> >[raid10] [faulty]
> >md0 : active raid5 sdc[2] sdb[1] sda[0]
> >      781422592 blocks level 5, 32k chunk, algorithm 2 [3/3] [UUU]
> >
> >unused devices: <none>
> >[root@bakstor2u root]# ./write /dev/md0 1048576 1000
> > 1048576Bytes *     1000 :    34.745MB/s
> >
> >
> >avg-cpu:  %user   %nice    %sys %iowait   %idle
> >           0.00    0.00   17.17   82.83    0.00
> >
> >Device:            tps   Blk_read/s   Blk_wrtn/s   Blk_read   Blk_wrtn
> >hda               0.00         0.00         0.00          0          0
> >hdc               0.00         0.00         0.00          0          0
> >md0            8791.92         0.00     70335.35          0      69632
> >sda             605.05       387.88     35143.43        384      34792
> >sdb             611.11       323.23     35143.43        320      34792
> >sdc             602.02       387.88     35143.43        384      34792
> >sdd               0.00         0.00         0.00          0          0
> >sde               0.00         0.00         0.00          0          0
> >sdf               0.00         0.00         0.00          0          0
> >sdg               0.00         0.00         0.00          0          0
> >sdh               0.00         0.00         0.00          0          0
> >
> >
> >  
> >
> >------------------------------------------------------------------------
> >
> >/*
> > * I need a small program to write in various mode.
> > */
> >#include <stdio.h>
> >#include <stdlib.h>
> >#include <sys/types.h>
> >#include <sys/stat.h>
> >#include <sys/time.h>
> >#include <fcntl.h>
> >
> >int main(int argc, char *argv[])
> >{
> >	int size, cnt;
> >	char *buf;
> >	char *fn;
> >	int fid, i;
> >	struct timeval tv1, tv2, ttv1, ttv2;
> >	double x;
> >	int sync = 1;
> >	int timing = 0;
> >	unsigned long *t;
> >
> >	if (argc < 4) {
> >		printf("%s <fn> <strip size> <strip count> [s/a [t]]\n", argv[0]);
> >		exit(1);
> >	}
> >	fn = argv[1];
> >	size = atoi(argv[2]);
> >	cnt = atoi(argv[3]);
> >	if ((argc >=5) && (argv[4][0] == 'a'))
> >		sync = 0;
> >	if ((argc >=6) && (argv[5][0] == 't'))
> >		timing = 1;
> >	if (timing) {
> >		t = (unsigned long *)malloc(sizeof(double) * cnt);
> >		if (!t) {
> >			printf("fail to get mem for t\n");
> >			exit(1);
> >		}
> >	}
> >	buf = malloc(size * sizeof(char));
> >	if (!buf) {
> >		printf("fail to get memory\n");
> >		exit(1);
> >	}
> >	fid = open(fn, O_CREAT|O_WRONLY|(sync ? O_SYNC : 0), S_IRWXU);
> >	if (fid == -1) {
> >		printf("open file fail\n");
> >		exit(1);
> >	}
> >	gettimeofday(&tv1, NULL);
> >	for (i = 0; i < cnt; i++) {
> >		if (timing) {
> >			gettimeofday(&ttv1, NULL);
> >			write(fid, buf, size);
> >			gettimeofday(&ttv2, NULL);
> >			t[i] = (ttv2.tv_sec - ttv1.tv_sec) * 1000000 + ttv2.tv_usec - ttv1.tv_usec;
> >		} else
> >			write(fid, buf, size);
> >	}
> >	close(fid);
> >	gettimeofday(&tv2, NULL);
> >	x = (tv2.tv_sec - tv1.tv_sec) + ((double)(tv2.tv_usec - tv1.tv_usec)) 
> >		* 0.000001;
> >	x = ((double)(size * cnt) / 1048576.0) / x;
> >	printf("%8dBytes * %8d :%10.3fMB/s\n", size, cnt, x);
> >	if (timing) {
> >		for (i = 0; i < cnt; i++) {
> >			printf("%8ld", t[i]);
> >			if (!(i % 10))
> >				printf("\n");
> >		}
> >		printf("\n");
> >	}
> >}
> >
> >  
> >
> >------------------------------------------------------------------------
> >
> >No virus found in this incoming message.
> >Checked by AVG Anti-Virus.
> >Version: 7.0.323 / Virus Database: 267.9.2/55 - Release Date: 7/21/2005
> >  
> >


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: strange raid5
  2005-07-23 13:38   ` Ming Zhang
@ 2005-07-23 21:37     ` Neil Brown
  2005-07-23 22:02       ` Ming Zhang
  0 siblings, 1 reply; 5+ messages in thread
From: Neil Brown @ 2005-07-23 21:37 UTC (permalink / raw)
  To: mingz; +Cc: Tyler, Linux RAID

On Saturday July 23, mingz@ele.uri.edu wrote:
> 1048576 = 1024 * 1024 = 32 * 32768. :)
      ^^^
> 
> so it should be 32 stripe writes.
> 
> ming
> 
> On Fri, 2005-07-22 at 23:14 -0700, Tyler wrote:
> > By my calculations, 1048756 is *not* a multiple of 32768 (32 
                            ^^^
> > Kilobytes).  Did I miscalculate?
> > 

A typo somewhere :-)


> > Regards,
> > Tyler.
> > 
> > Ming Zhang wrote:
> > 
> > >i created a 32KB chunk size 3 disk raid5. then write this disk with a
> > >small code i wrote. i found that even i write it with 1048756 in unit,
> > >which is multiple of stripe size, it still has a lot of read when seen
> > >from iostat. 

> > >sda             605.05       387.88     35143.43        384      34792
> > >sdb             611.11       323.23     35143.43        320      34792
> > >sdc             602.02       387.88     35143.43        384      34792

I wouldn't call this "a lot of read".  The read requests are only 1%
of the write requests.  So I would call it "some read".

There is quite a lot of complexity between the 'write' system call and
the data actually getting to the device.  Presumably the Linux VM
system is flushing dirty data to the device at times other than the
end of the write request.
I think the block layer also automatically flushes devices every
200msecs.
This may be triggering flush requests which aren't stripe-aligned.

You are certainly getting the vast majority of stripes written as
whole stripes.

NeilBrown

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: strange raid5
  2005-07-23 21:37     ` Neil Brown
@ 2005-07-23 22:02       ` Ming Zhang
  0 siblings, 0 replies; 5+ messages in thread
From: Ming Zhang @ 2005-07-23 22:02 UTC (permalink / raw)
  To: Neil Brown; +Cc: Tyler, Linux RAID

On Sun, 2005-07-24 at 07:37 +1000, Neil Brown wrote:
> On Saturday July 23, mingz@ele.uri.edu wrote:
> > 1048576 = 1024 * 1024 = 32 * 32768. :)
>       ^^^
> > 
> > so it should be 32 stripe writes.
> > 
> > ming
> > 
> > On Fri, 2005-07-22 at 23:14 -0700, Tyler wrote:
> > > By my calculations, 1048756 is *not* a multiple of 32768 (32 
>                             ^^^
> > > Kilobytes).  Did I miscalculate?
> > > 
> 
> A typo somewhere :-)
yes, my dumb stupidness. a typo here.

@Tyler, sorry about this. :P


i checked again and what i did is 
./write /dev/md0 1048576 1024 s

and i still see plenty read.

i build raid with this no resync. but should be ok, right?

mkraid -c raidtab -R --dangerous-no-resync /dev/md0

my raidtab file is like this

raiddev                 /dev/md0
raid-level              5
nr-raid-disks           3
chunk-size              32
parity-algorithm        left-symmetric

device                  /dev/sda
raid-disk               0

device                  /dev/sdb
raid-disk               1

device                  /dev/sdc
raid-disk               2


> 
> 
> > > Regards,
> > > Tyler.
> > > 
> > > Ming Zhang wrote:
> > > 
> > > >i created a 32KB chunk size 3 disk raid5. then write this disk with a
> > > >small code i wrote. i found that even i write it with 1048756 in unit,
> > > >which is multiple of stripe size, it still has a lot of read when seen
> > > >from iostat. 
> 
> > > >sda             605.05       387.88     35143.43        384      34792
> > > >sdb             611.11       323.23     35143.43        320      34792
> > > >sdc             602.02       387.88     35143.43        384      34792
> 
> I wouldn't call this "a lot of read".  The read requests are only 1%
> of the write requests.  So I would call it "some read".
> 
> There is quite a lot of complexity between the 'write' system call and
> the data actually getting to the device.  Presumably the Linux VM
> system is flushing dirty data to the device at times other than the
> end of the write request.
> I think the block layer also automatically flushes devices every
> 200msecs.
> This may be triggering flush requests which aren't stripe-aligned.
> 
> You are certainly getting the vast majority of stripes written as
> whole stripes.
> 
> NeilBrown


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2005-07-23 22:02 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-07-23  1:58 strange raid5 Ming Zhang
2005-07-23  6:14 ` Tyler
2005-07-23 13:38   ` Ming Zhang
2005-07-23 21:37     ` Neil Brown
2005-07-23 22:02       ` Ming Zhang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).