* Constant access (write) time.
@ 2010-03-06 22:02 Paul Chavent
2010-03-06 23:13 ` tytso
0 siblings, 1 reply; 3+ messages in thread
From: Paul Chavent @ 2010-03-06 22:02 UTC (permalink / raw)
To: linux-ext4
[-- Attachment #1: Type: text/plain, Size: 1408 bytes --]
Hello.
I'm writing a real-time application that have to stream pictures to a SSD.
The pictures are 640x480x1 pnm that are stored in one tar file. I have one picture every 100ms.
I decided to code a "write" thread that open a file on an ext4 filesystem created with :
# mke2fs -t ext4 -L DATA -O large_file,^has_journal,extent -v
It is mounted with :
# mount -t ext4 /dev/sda3 /var/data/
My file descriptor is open with these flags :
O_WRONLY | O_CREAT | O_TRUNC | O_SYNC | O_DIRECT
I use a noop io scheduler.
The problem is that the access (write) time (from a userspace point of view) is not constant. I join you a piece of code that reproduce the problem.
This leads me to ask the following questions : the solution to this problem is
(1) the kernel make such a job that the write time seems constant from a userspace point of view
(2) the userspace thread haven't a constant execution time but is bounded to a maximum
(3) the userspace thread have a constant execution time but use an other userspace thread by calling aio_write
In all case, the job (some block allocation i guess) have to be done. But i suppose that if it is done anticipatory by the kernel it can be preemted by realtime task and it is better.
In the third solution we can preempt the aio thread, but the allocation isn't anticipated.
So if you could give me your point of view...
Thank you for your advices.
Paul.
[-- Attachment #2: main.c --]
[-- Type: text/plain, Size: 5165 bytes --]
/* gcc -Wall -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -o main main.c -lrt */
/* open */
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
/* write,close,pathconf */
#include <unistd.h>
/* posix_memalign */
#include <stdlib.h>
/* perror */
#include <stdio.h>
/* signal */
#include <signal.h>
/* clock_* */
#include <time.h>
/* sched_setscheduler */
#include <sched.h>
/* memset */
#include <string.h>
/* mlockall */
#include <sys/mman.h> // Needed for ()
/* getrusage */
#include <sys/time.h>
#include <sys/resource.h>
/* iob */
#include <sys/io.h>
#define PP_DATA 0x378
static volatile int test_is_running = 1;
void sig_handler(int sig_num)
{
test_is_running = 0;
}
int main(int argc, char **argv)
{
/*
* I stream 640x480x1 pnm images (307215 bytes each) to a tar file.
* The write buffer is multiple of 512.
* So 307712 bytes.
*/
const int buffer_size = 307712;
/*
* Alignement for direct io
*/
int buffer_alignment;
/*
* The buffer will be allocated dynamicaly for alignement
*/
void *buffer;
int fd;
/*
* Set the scheduler
*/
struct sched_param param;
/*
* Monitoring variable
*/
unsigned long long sample = 0;
struct timespec start_time;
struct timespec stop_time;
unsigned long long diff_cur;
unsigned long long diff_min;
unsigned long long diff_max;
unsigned long long diff_avg;
struct timespec ts;
unsigned long long period_ns = 200000000;
struct rusage usage_before;
struct rusage usage_after;
/* handle ctrl-c */
struct sigaction sigact;
sigact.sa_handler= sig_handler;
sigact.sa_flags = SA_RESETHAND;
sigaction(SIGINT, &sigact, NULL);
/* for pp monitoring */
ioperm(PP_DATA, 1, 1);
/* declare ourself as a real time task */
param.sched_priority = 49;
if(sched_setscheduler(0, SCHED_FIFO, ¶m) == -1)
{
perror("sched_setscheduler failed");
return EXIT_FAILURE;
}
/* lock all current and future pages from preventing of being paged */
if(mlockall(MCL_CURRENT | MCL_FUTURE ))
{
perror("mlockall failed");
return EXIT_FAILURE;
}
/* open */
fd = open("test.log", O_WRONLY | O_CREAT | O_TRUNC | O_SYNC | O_DIRECT, 0644);
if(fd < 0)
{
perror("open failed");
return EXIT_FAILURE;
}
/* compute alignement constraints for direct io */
buffer_alignment = pathconf("test.log", _PC_REC_XFER_ALIGN);
if(buffer_alignment < 0)
{
perror("pathconf failed");
return EXIT_FAILURE;
}
/* alloc aligned buffer */
if(posix_memalign((void **)&buffer, buffer_alignment, buffer_size))
{
perror("posix_memalign failed");
return EXIT_FAILURE;
}
memset(buffer, 0, buffer_size);
fprintf(stderr, "%*s%*s\n", 16, "file size", 16, "duration");
getrusage(RUSAGE_SELF, &usage_before);
clock_gettime(CLOCK_MONOTONIC, &ts);
while(test_is_running)
{
int nb_write;
outb((inb(PP_DATA) | (0x0001)), PP_DATA);
clock_gettime(CLOCK_MONOTONIC, &start_time);
nb_write = write(fd, buffer, buffer_size);
clock_gettime(CLOCK_MONOTONIC, &stop_time);
outb((inb(PP_DATA) & ~(0x0001)), PP_DATA);
/* error handling */
if(nb_write != buffer_size)
{
perror("write failed");
return EXIT_FAILURE;
}
/* compute stats */
if(stop_time.tv_nsec < start_time.tv_nsec)
{
stop_time.tv_sec--;
stop_time.tv_nsec+=1000000000;
}
diff_cur = (stop_time.tv_sec - start_time.tv_sec) * 1000000000ULL + (stop_time.tv_nsec - start_time.tv_nsec);
if(sample == 0)
{
diff_min = diff_cur;
diff_max = diff_cur;
diff_avg = diff_cur;
}
else
{
if(diff_cur < diff_min)
{
diff_min = diff_cur;
}
if(diff_max < diff_cur)
{
diff_max = diff_cur;
}
if(diff_cur < diff_avg)
{
diff_avg = diff_avg - (diff_avg - diff_cur) / sample;
}
else
{
diff_avg = diff_avg + (diff_cur - diff_avg) / sample;
}
}
sample++;
/* print suspect write */
if((2 * diff_avg) < diff_cur)
{
struct stat buf;
fstat(fd, &buf);
fprintf(stderr, "%*llu%*llu\n", 16, buf.st_size, 16, diff_cur);
}
/* sleep */
ts.tv_nsec += period_ns;
while(ts.tv_nsec >= 1000000000)
{
ts.tv_nsec -= 1000000000;
ts.tv_sec++;
}
clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &ts, NULL);
}
getrusage(RUSAGE_SELF, &usage_after);
close(fd);
fprintf(stderr, "\n");
fprintf(stderr, "diff min : %llu\n", diff_min);
fprintf(stderr, "diff moy : %llu\n", diff_avg);
fprintf(stderr, "diff max : %llu\n", diff_max);
fprintf(stderr, "%llu iterations\n", sample);
printf("major pagefaults : %ld\nminor pagefaults : %ld\n", usage_after.ru_majflt - usage_before.ru_majflt, usage_after.ru_minflt - usage_before.ru_minflt);
return EXIT_SUCCESS;
}
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: Constant access (write) time.
2010-03-06 22:02 Constant access (write) time Paul Chavent
@ 2010-03-06 23:13 ` tytso
2010-03-08 20:17 ` Paul Chavent
0 siblings, 1 reply; 3+ messages in thread
From: tytso @ 2010-03-06 23:13 UTC (permalink / raw)
To: Paul Chavent; +Cc: linux-ext4
On Sat, Mar 06, 2010 at 11:02:53PM +0100, Paul Chavent wrote:
> Hello.
>
> I'm writing a real-time application that have to stream pictures to a SSD.
>
> The pictures are 640x480x1 pnm that are stored in one tar file. I
> have one picture every 100ms.
>
> The problem is that the access (write) time (from a userspace point
> of view) is not constant. I join you a piece of code that reproduce
> the problem.
What are you seeing/expecting? Here's what I'm getting with the X25-M
Intel SSD:
file size duration
diff min : 3666548
diff moy : 6639261
diff max : 9429924
101 iterations
major pagefaults : 0
minor pagefaults : 0
It's rarely if ever triggering your "suspect write" fprintf.
>
> This leads me to ask the following questions : the solution to this
> problem is
>
If you are seeing something seriously worse, you might want to test
what happens if you write to the SSD directly; it may be the answer is
(4) Buy a competently engineered SSD. :-)
- Ted
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: Constant access (write) time.
2010-03-06 23:13 ` tytso
@ 2010-03-08 20:17 ` Paul Chavent
0 siblings, 0 replies; 3+ messages in thread
From: Paul Chavent @ 2010-03-08 20:17 UTC (permalink / raw)
To: tytso; +Cc: linux-ext4
Hello.
Thank you for your reply.
I have an intel SSD 2.5 inch X25-M 80GB SATA300. And i have the following log :
# /test_ext4
file size duration
58157568 11809470
62773248 10658968
266786304 11514789
287403008 10423548
308327424 13095972
329251840 11036123
350176256 12643903
370792960 9767548
391717376 10987771
411410944 13204409
^C
diff min : 2298627
diff moy : 3971804
diff max : 13204409
1342 iterations
major pagefaults : 0
minor pagefaults : 0
Could you give me more detail on your configuration ?
Here, more details on mine :
# mke2fs -t ext4 -L DATA -O large_file,^has_journal,extent -v /dev/sda3
mke2fs 1.41.9 (22-Aug-2009)
fs_types for mke2fs.conf resolution: 'ext4', 'default'
Filesystem label=DATA
OS type: Linux
Block size=4096 (log=2)
Fragment size=4096 (log=2)
3670016 inodes, 14653288 blocks
732664 blocks (5.00%) reserved for the super user
First data block=0
Maximum filesystem blocks=0
448 block groups
32768 blocks per group, 32768 fragments per group
8192 inodes per group
Superblock backups stored on blocks:
32768, 98304, 163840, 229376, 294912, 819200, 884736, 1605632, 2654208,
4096000, 7962624, 11239424
Writing inode tables: done
Writing superblocks and filesystem accounting information: done
This filesystem will be automatically checked every 37 mounts or
180 days, whichever comes first. Use tune2fs -c or -i to override.
# dumpe2fs -h /dev/sda3
dumpe2fs 1.41.9 (22-Aug-2009)
Filesystem volume name: DATA
Last mounted on: <not available>
Filesystem UUID: 13b72368-2a9e-11df-8a9f-0060c2140392
Filesystem magic number: 0xEF53
Filesystem revision #: 1 (dynamic)
Filesystem features: ext_attr resize_inode dir_index filetype extent flex_bg sparse_super large_file huge_file uninit_bg dir_nlink extra_isize
Filesystem flags: signed_directory_hash
Default mount options: (none)
Filesystem state: clean
Errors behavior: Continue
Filesystem OS type: Linux
Inode count: 3670016
Block count: 14653288
Reserved block count: 732664
Free blocks: 14409685
Free inodes: 3670005
First block: 0
Block size: 4096
Fragment size: 4096
Reserved GDT blocks: 1020
Blocks per group: 32768
Fragments per group: 32768
Inodes per group: 8192
Inode blocks per group: 512
Flex block group size: 16
Filesystem created: Mon Mar 8 10:33:46 2010
Last mount time: n/a
Last write time: Mon Mar 8 10:34:00 2010
Mount count: 0
Maximum mount count: 37
Last checked: Mon Mar 8 10:33:46 2010
Check interval: 15552000 (6 months)
Next check after: Sat Sep 4 10:33:46 2010
Lifetime writes: 900 MB
Reserved blocks uid: 0 (user root)
Reserved blocks gid: 0 (group root)
First inode: 11
Inode size: 256
Required extra isize: 28
Desired extra isize: 28
Default directory hash: half_md4
Directory Hash Seed: 13b723f4-2a9e-11df-8a9f-0060c2140392
# mount -t ext4 -o noatime,nodiratime /dev/sda3 /var/data
EXT4-fs (sda3): no journal
EXT4-fs (sda3): delayed allocation enabled
EXT4-fs: file extents enabled
EXT4-fs: mballoc enabled
EXT4-fs (sda3): mounted filesystem without journal
# hdparm -i /dev/sda
/dev/sda:
hdparm: ioctl 0x304 failed: Inappropriate ioctl for device
Model=INTEL SSDSA2MH080G1GC , FwRev=045C8820, SerialNo=CVEM8385003E080DGN
Config={ Fixed }
RawCHS=16383/16/63, TrkSize=0, SectSize=0, ECCbytes=0
BuffType=(0) unknown, BuffSize=0kB, MaxMultSect=16, MultSect=?1?
CurCHS=16383/16/63, CurSects=16514064, LBA=yes, LBAsects=156301488
IORDY=on/off, tPIO={min:120,w/IORDY:120}, tDMA={min:120,rec:120}
PIO modes: pio0 pio3 pio4
DMA modes: mdma0 mdma1 mdma2
UDMA modes: udma0 udma1 udma2
AdvancedPM=no WriteCache=enabled
Drive conforms to: ATA/ATAPI-7 T13 1532D rev.1: ATA/ATAPI-2 ATA/ATAPI-3 ATA/ATAPI-4 ATA/ATAPI-5 ATA/ATAPI-6 ATA/ATAPI-7
* current active mode
tytso@mit.edu wrote:
> On Sat, Mar 06, 2010 at 11:02:53PM +0100, Paul Chavent wrote:
>> Hello.
>>
>> I'm writing a real-time application that have to stream pictures to a SSD.
>>
>> The pictures are 640x480x1 pnm that are stored in one tar file. I
>> have one picture every 100ms.
>>
>> The problem is that the access (write) time (from a userspace point
>> of view) is not constant. I join you a piece of code that reproduce
>> the problem.
>
> What are you seeing/expecting? Here's what I'm getting with the X25-M
> Intel SSD:
>
> file size duration
>
> diff min : 3666548
> diff moy : 6639261
> diff max : 9429924
> 101 iterations
> major pagefaults : 0
> minor pagefaults : 0
>
> It's rarely if ever triggering your "suspect write" fprintf.
>
>> This leads me to ask the following questions : the solution to this
>> problem is
>>
>
> If you are seeing something seriously worse, you might want to test
> what happens if you write to the SSD directly; it may be the answer is
>
> (4) Buy a competently engineered SSD. :-)
>
> - Ted
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
>
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2010-03-08 20:12 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-03-06 22:02 Constant access (write) time Paul Chavent
2010-03-06 23:13 ` tytso
2010-03-08 20:17 ` Paul Chavent
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).