* PROBLEM: High TCP latency
@ 2005-06-06 9:35 Hans Henrik Happe
2005-06-09 13:47 ` Andi Kleen
0 siblings, 1 reply; 3+ messages in thread
From: Hans Henrik Happe @ 2005-06-06 9:35 UTC (permalink / raw)
To: netdev
[-- Attachment #1: Type: text/plain, Size: 2049 bytes --]
Short: TCP puts the system into the idle state even though there is data in
transit.
While coding a distributed application I discovered a TCP latency issue. The
application does a lot of request forwarding, like P2P protocols.
I have tried to track down the problem and have written a small program
(random-tcp.c) that shows the long latencies. In this program one message is
passed around between a number of processes. Each time a process receives the
message it randomly chooses a process to forward it to next.
This I have compared to a program that doesn't give long latencies
(ring-tcp.c). In this program each process always forwards to the same
process (ring topology).
I have also made the same programs using SCTP and this protocol has no issue
in the random case.
The following is a test with 16 processes forwarding the message 100000 times.
The avg. forwarding time from process to process is measured.
$ ./random-tcp 16 100000
avg forwarding time: 0.000326
$ ./ring-tcp 16 100000
avg forwarding time: 0.000044
$ ./random-sctp 16 100000
avg forwarding time: 0.000068
$ ./ring-sctp 16 100000
avg forwarding time: 0.000067
Using 'top' I have observed that the system spends time in the idle state when
running 'random-tcp'. This I have observed with just 3 processes. With 16
processes the CPU is only 20% loaded on my Mobile Intel(R) Celeron(R) CPU
1.60GHz.
I have also tried with socketpair()'s which didn't have the problem. Therefore
my conclusion is that it must be a TCP issue.
Now this local use of TCP is not that useful. Therefore, I tried an MPI
version and tested it in a 16-node cluster. Here the random case is 5 times
slower than the ring.
I have tested on many kernel versions from 2.4.25 up until 2.6.12-rc5 and all
had this issue.
A few people on lkml also confirmed it, but I have not got any reply from
someone with a greater knowledge of the inner working of Linux TCP (at least
they didn't tell me that they had this knowledge :-).
I hope this is helpful.
Regards
Hans Henrik Happe
[-- Attachment #2: random-sctp.c --]
[-- Type: text/x-csrc, Size: 4434 bytes --]
/* By Hans Henrik Happe
*
* compile: gcc -o random-sctp random-sctp.c -lsctp
*
* usage: random-sctp <# processes> <# forwards>
*/
#include <asm/msr.h>
#include <stdio.h>
#include <poll.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/tcp.h>
#include <fcntl.h>
#include <netdb.h>
#include <netinet/sctp.h>
#include <sys/time.h>
/*
 * Return the current wall-clock time in seconds, as a double made of the
 * whole seconds plus the microsecond fraction reported by gettimeofday().
 */
double second(void) {
    struct timeval now;

    /* The timezone argument is obsolete and was never read; POSIX leaves the
     * behaviour unspecified when it is non-NULL, so pass NULL. */
    gettimeofday(&now, NULL);
    return (double)now.tv_sec + (double)now.tv_usec / 1.0e6;
}
/* Destination address of one peer process; both fields are filled in by get_adr(). */
typedef struct {
struct sockaddr sockadr; /* socket address copied from the getaddrinfo() result */
int len; /* byte length that do_send() passes along with sockadr */
} adr_t;
/*
 * Resolve "localhost":<port> and store the resulting socket address in *adr.
 *
 * adr:  output; receives the address bytes and their length.
 * port: port number of the peer to look up.
 *
 * Returns 0 on success, -1 if the lookup fails or the address does not fit
 * into adr->sockadr (a message is printed to stderr in both cases).
 */
int get_adr(adr_t *adr, int port) {
    struct addrinfo hints = {0};
    struct addrinfo *res = NULL;
    char service[16];
    int rc;

    /*
     * Ask for IPv4 only. With PF_UNSPEC the first result may be an IPv6
     * address, which is larger than struct sockaddr (it would be truncated
     * by the copy below) and does not match the PF_INET sockets used here.
     */
    hints.ai_family = AF_INET;
    hints.ai_socktype = SOCK_STREAM;

    snprintf(service, sizeof(service), "%d", port);
    rc = getaddrinfo("localhost", service, &hints, &res);
    if (rc != 0) {
        fprintf(stderr,
                "getaddrinfo error: [%s]\n",
                gai_strerror(rc));
        return -1;
    }
    if (res->ai_addrlen > sizeof(adr->sockadr)) {
        fprintf(stderr, "getaddrinfo error: [address too large]\n");
        freeaddrinfo(res);
        return -1;
    }
    /* Copy exactly the number of bytes the resolver returned. */
    memcpy(&adr->sockadr, res->ai_addr, res->ai_addrlen);
    adr->len = (int)res->ai_addrlen;
    freeaddrinfo(res);
    return 0;
}
/*
 * Create an IPv4 SCTP socket of type SOCK_SEQPACKET, bind it to the given
 * port on INADDR_ANY and put it into the listening state.
 *
 * port: local port number to bind to.
 *
 * Returns the socket descriptor, or -1 after reporting the failing call
 * with perror().
 */
int init_listen(int port) {
    /* Zero the whole structure so the sin_zero padding handed to bind()
     * is not indeterminate. */
    struct sockaddr_in name = {0};
    int sock;

    sock = socket(PF_INET, SOCK_SEQPACKET, IPPROTO_SCTP);
    if (sock == -1) {
        perror("socket");
        return -1;
    }
    name.sin_family = AF_INET;
    name.sin_port = htons(port);
    name.sin_addr.s_addr = htonl(INADDR_ANY);
    if (bind(sock, (struct sockaddr *)&name, sizeof(name)) == -1) {
        perror("bind");
        return -1;
    }
    if (listen(sock, 10) == -1) {
        perror("listen");
        return -1;
    }
    return sock;
}
/*
 * Receive one SCTP message of n bytes from sock into buf.
 *
 * Prints "recv: <error>" when sctp_recvmsg() fails and "recv incomplete"
 * whenever the result differs from n (which includes the failure case).
 *
 * Returns whatever sctp_recvmsg() returned: the byte count, or -1 on error.
 */
int do_recv(int sock, void *buf, int n) {
    struct sockaddr peer;
    struct sctp_sndrcvinfo info;
    socklen_t peer_len = sizeof(peer);  /* the API takes a socklen_t *, not an int * */
    /* Must be initialised: the pointer is in/out, and the library may use
     * the incoming value as receive flags. */
    int flags = 0;
    int res;

    res = sctp_recvmsg(sock, buf, n, &peer, &peer_len, &info, &flags);
    if (res == -1) {
        perror("recv");
    }
    if (res != n) {
        fprintf(stderr, "recv incomplete\n");
    }
    return res;
}
/*
 * Send n bytes from buf as one SCTP message on sock to the peer in *adr.
 *
 * The trailing sctp_sendmsg() arguments are: payload protocol id 666,
 * flags MSG_ADDR_OVER, stream 0, time-to-live 0, context 444.
 *
 * Reports a failed call with perror("send") and prints "send incomplete"
 * whenever the result is not n. Returns the sctp_sendmsg() result.
 */
int do_send(int sock, adr_t *adr, void *buf, int n) {
    const int sent = sctp_sendmsg(sock, buf, n, &adr->sockadr, adr->len,
                                  666, MSG_ADDR_OVER, 0, 0, 444);

    if (sent == -1)
        perror("send");
    if (sent != n)
        fprintf(stderr, "send incomplete\n");
    return sent;
}
/*
 * random-sctp driver.
 *
 * Forks cnt-1 children so that cnt processes (ranks 0..cnt-1) exist, each
 * listening on port 11100+rank. One single-byte message is then forwarded
 * `its` times; every hop goes to a pseudo-randomly chosen other process.
 * Rank 0 waits for all children and prints the average time per forward.
 *
 * argv[1]: number of processes (2..127, since rank is stored in a char).
 * argv[2]: number of forwards (>= 1).
 *
 * Returns 0 on success, 1 on bad arguments or a setup failure.
 */
int main(int argc, char *argv[]) {
    int i, cnt, pid, src, dest, its, status;
    int lsock;
    char rank;
    char data = 0;      /* payload byte; the value is irrelevant but must be defined */
    int port = 11100;
    double t0, t1;

    if (argc < 3) {
        fprintf(stderr, "usage: random-sctp <# processes> <# forwards>\n");
        return 1;
    }
    /* # processes */
    cnt = atoi(argv[1]);
    /* # forwards */
    its = atoi(argv[2]);
    /* cnt < 2 would make dest = 1 out of range (or random()%cnt divide by
     * zero); cnt > 127 would not fit in the char rank. */
    if (cnt < 2 || cnt > 127 || its < 1) {
        fprintf(stderr, "need 2..127 processes and at least 1 forward\n");
        return 1;
    }
    {
        adr_t dests[cnt];

        /* Create processes */
        rank = 0;
        for (i = 1; i < cnt; i++) {
            pid = fork();
            if (pid == -1) {
                /* A missing rank would leave the others blocked forever. */
                perror("fork");
                return 1;
            }
            if (pid == 0) {
                rank = cnt - i;
                break;
            }
        }
        /* Listen */
        lsock = init_listen(port + rank);
        if (lsock == -1) {
            return 1;
        }
        sleep(2); /* "Ensure" that all processes are listening, HACK!!! */
        for (i = 0; i < cnt; i++) {
            if (get_adr(dests + i, port + i) == -1) {
                return 1;
            }
        }
        /* Same seed everywhere: every process computes the identical
         * src/dest sequence and only acts on the hops that involve it. */
        srandom(666);
        src = 0;
        dest = 1;
        /* Send startup message */
        if (rank == src) {
            do_send(lsock, &dests[dest], &data, 1);
        }
        /* Receive message and forward to a random destination */
        t0 = second();
        for (i = 0; i < its; i++) {
            if (rank == dest) {
                do_recv(lsock, &data, 1);
            }
            src = dest;
            dest = random() % cnt;
            /* Do not send to self */
            if (dest == src) {
                dest = (dest + 1) % cnt;
            }
            if (src == rank) {
                do_send(lsock, &dests[dest], &data, 1);
            }
        }
        if (rank == 0) {
            /* Reap every child before stopping the clock. */
            for (i = 1; i < cnt; i++) {
                wait(&status);
            }
            t1 = second();
            printf("avg forwarding time: %lf\n", ((t1 - t0) / its));
        }
    }
    return 0;
}
[-- Attachment #3: random-tcp.c --]
[-- Type: text/x-csrc, Size: 5224 bytes --]
/* By Hans Henrik Happe
*
* usage: random-tcp <# processes> <# forwards>
*/
#include <asm/msr.h>
#include <stdio.h>
#include <poll.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/tcp.h>
#include <fcntl.h>
#include <netdb.h>
#include <sys/time.h>
#include <sys/wait.h>
/*
 * Return the current wall-clock time in seconds, as a double made of the
 * whole seconds plus the microsecond fraction reported by gettimeofday().
 */
double second(void) {
    struct timeval now;

    /* The timezone argument is obsolete and was never read; POSIX leaves the
     * behaviour unspecified when it is non-NULL, so pass NULL. */
    gettimeofday(&now, NULL);
    return (double)now.tv_sec + (double)now.tv_usec / 1.0e6;
}
/*
 * Open a TCP connection to "localhost":<port> with TCP_NODELAY enabled.
 *
 * port: port number of the listening peer.
 *
 * Returns the connected socket descriptor, or -1 after printing the failing
 * step to stderr. The getaddrinfo() result is released on every path.
 * NOTE(review): the descriptor itself is not closed on the setsockopt/connect
 * error paths.
 */
int do_connect(int port) {
    struct addrinfo hints = {0};
    struct addrinfo *res = NULL;
    char service[16];
    int rc, sock, on = 1;
    int result = -1;

    /* IPv4 only, so the resolved address matches the AF_INET socket created
     * below; PF_UNSPEC may return an IPv6 address first. */
    hints.ai_family = AF_INET;
    hints.ai_socktype = SOCK_STREAM;

    snprintf(service, sizeof(service), "%d", port);
    rc = getaddrinfo("localhost", service, &hints, &res);
    if (rc != 0) {
        fprintf(stderr,
                "getaddrinfo error: [%s]\n",
                gai_strerror(rc));
        return -1;
    }

    sock = socket(AF_INET, SOCK_STREAM, 0);
    if (sock == -1) {
        perror("socket");
    } else if (setsockopt(sock, SOL_TCP, TCP_NODELAY, &on, sizeof(on)) == -1) {
        perror("setsockopt");
    } else if (connect(sock, res->ai_addr, res->ai_addrlen) == -1) {
        /* ai_addrlen is the real length of the resolved address */
        perror("connect");
    } else {
        result = sock;
    }

    freeaddrinfo(res);
    return result;
}
/*
 * Create an IPv4 TCP socket with SO_REUSEADDR set, bind it to the given
 * port on INADDR_ANY and put it into the listening state.
 *
 * port: local port number to bind to.
 *
 * Returns the listening descriptor, or -1 after reporting the failing call
 * with perror().
 */
int start_listen(int port) {
    /* Zero the whole structure so the sin_zero padding handed to bind()
     * is not indeterminate. */
    struct sockaddr_in name = {0};
    int on = 1;
    int sock;

    sock = socket(AF_INET, SOCK_STREAM, 0);
    if (sock == -1) {
        perror("socket");
        return -1;
    }
    if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) {
        perror("setsockopt");
        return -1;
    }
    name.sin_family = AF_INET;
    name.sin_port = htons(port);
    name.sin_addr.s_addr = htonl(INADDR_ANY);
    if (bind(sock, (struct sockaddr *)&name, sizeof(name)) == -1) {
        perror("bind");
        return -1;
    }
    if (listen(sock, 10) == -1) {
        perror("listen");
        return -1;
    }
    return sock;
}
/*
 * Accept one pending connection on the listening socket lsock and enable
 * TCP_NODELAY on it.
 *
 * Returns the new connection's descriptor, or -1 after perror() if either
 * accept() or setsockopt() fails.
 */
int do_accept(int lsock) {
    struct sockaddr peer;
    socklen_t peer_len = sizeof(peer);
    int nodelay = 1;
    int conn;

    conn = accept(lsock, &peer, &peer_len);
    if (conn == -1) {
        perror("accept");
        return -1;
    }

    if (setsockopt(conn, SOL_TCP, TCP_NODELAY, &nodelay, sizeof(nodelay)) != -1) {
        return conn;
    }
    perror("setsockopt");
    return -1;
}
/*
 * Read exactly n bytes from fd into buf.
 *
 * A stream socket may return fewer bytes than requested from a single
 * read(), so the call is repeated until n bytes have arrived, the peer
 * closes the stream, or an error occurs.
 *
 * Returns the number of bytes read (n on success, less on end of stream),
 * or -1 on error. Prints "read: <error>" on error and "read incomplete"
 * whenever fewer than n bytes were obtained.
 */
int do_read(int fd, void *buf, int n) {
    char *dst = buf;
    int total = 0;

    while (total < n) {
        ssize_t got = read(fd, dst + total, n - total);

        if (got == -1) {
            perror("read");
            fprintf(stderr, "read incomplete\n");
            return -1;
        }
        if (got == 0) {
            break;      /* end of stream: no more data will arrive */
        }
        total += (int)got;
    }
    if (total != n) {
        fprintf(stderr, "read incomplete\n");
    }
    return total;
}
/*
 * Write exactly n bytes from buf to fd.
 *
 * write() may accept fewer bytes than requested, so the call is repeated
 * until everything has been written or an error occurs.
 *
 * Returns the number of bytes written (n on success), or -1 on error.
 * Prints "write: <error>" on error and "write incomplete" whenever fewer
 * than n bytes were written.
 */
int do_write(int fd, void *buf, int n) {
    const char *src = buf;
    int total = 0;

    while (total < n) {
        ssize_t put = write(fd, src + total, n - total);

        if (put == -1) {
            perror("write");
            fprintf(stderr, "write incomplete\n");
            return -1;
        }
        if (put == 0) {
            break;      /* no progress; stop instead of spinning */
        }
        total += (int)put;
    }
    if (total != n) {
        fprintf(stderr, "write incomplete\n");
    }
    return total;
}
/*
 * random-tcp driver.
 *
 * Forks cnt-1 children so that cnt processes (ranks 0..cnt-1) exist, each
 * listening on port 11100+rank, and connects every pair of processes with
 * one TCP connection. One single-byte message is then forwarded `its`
 * times; every hop goes to a pseudo-randomly chosen other process.
 * Rank 0 waits for all children and prints the average time per forward.
 *
 * argv[1]: number of processes (2..127, since ranks are exchanged as a char).
 * argv[2]: number of forwards (>= 1).
 *
 * Returns 0 on success, 1 on bad arguments or a setup failure.
 */
int main(int argc, char *argv[]) {
    int i, cnt, pid, dest, src, its, status;
    int lsock, sock;
    char id, rank;
    char data = 0;      /* payload byte; the value is irrelevant but must be defined */
    int port = 11100;
    double t0, t1;

    if (argc < 3) {
        fprintf(stderr, "usage: random-tcp <# processes> <# forwards>\n");
        return 1;
    }
    /* # processes */
    cnt = atoi(argv[1]);
    /* # forwards */
    its = atoi(argv[2]);
    /* cnt < 2 would make dest = 1 out of range (or random()%cnt divide by
     * zero); cnt > 127 would not fit in the char rank/id. */
    if (cnt < 2 || cnt > 127 || its < 1) {
        fprintf(stderr, "need 2..127 processes and at least 1 forward\n");
        return 1;
    }
    {
        int socks[cnt];

        for (i = 0; i < cnt; i++) {
            socks[i] = -1;      /* own slot stays -1; it is never used */
        }
        /* Create processes */
        rank = 0;
        for (i = 1; i < cnt; i++) {
            pid = fork();
            if (pid == -1) {
                /* A missing rank would leave the others blocked forever. */
                perror("fork");
                return 1;
            }
            if (pid == 0) {
                rank = i;
                break;
            }
        }
        /* Listen */
        lsock = start_listen(port + rank);
        if (lsock == -1) {
            return 1;
        }
        sleep(2); /* "Ensure" that all processes are listening, HACK!!! */
        /* Accept one connection from every lower rank; the peer sends its
         * rank first and gets ours back. */
        for (i = 0; i < rank; i++) {
            sock = do_accept(lsock);
            if (sock == -1) {
                return 1;
            }
            if (do_read(sock, &id, 1) != 1 || do_write(sock, &rank, 1) != 1) {
                return 1;
            }
            if (id < 0 || id >= cnt) {
                fprintf(stderr, "unexpected peer id\n");
                return 1;
            }
            socks[id] = sock;
        }
        /* Connect to every higher rank, mirroring the exchange above. */
        for (i = rank; i < cnt - 1; i++) {
            sock = do_connect(port + i + 1);
            if (sock == -1) {
                return 1;
            }
            if (do_write(sock, &rank, 1) != 1 || do_read(sock, &id, 1) != 1) {
                return 1;
            }
            if (id < 0 || id >= cnt) {
                fprintf(stderr, "unexpected peer id\n");
                return 1;
            }
            socks[id] = sock;
        }
        /* Same seed everywhere: every process computes the identical
         * src/dest sequence and only acts on the hops that involve it. */
        srandom(666);
        src = 0;
        dest = 1;
        /* Write startup message */
        if (rank == src) {
            do_write(socks[dest], &data, 1);
        }
        /* Receive message and forward to a random destination */
        t0 = second();
        for (i = 0; i < its; i++) {
            if (rank == dest) {
                do_read(socks[src], &data, 1);
            }
            src = dest;
            dest = random() % cnt;
            /* Do not send to self */
            if (dest == src) {
                dest = (dest + 1) % cnt;
            }
            if (src == rank) {
                do_write(socks[dest], &data, 1);
            }
        }
        if (rank == 0) {
            /* Reap every child before stopping the clock. */
            for (i = 1; i < cnt; i++) {
                wait(&status);
            }
            t1 = second();
            printf("avg forwarding time: %lf\n", ((t1 - t0) / its));
        }
    }
    return 0;
}
[-- Attachment #4: ring-sctp.c --]
[-- Type: text/x-csrc, Size: 4276 bytes --]
/* By Hans Henrik Happe
*
* compile: gcc -o ring-sctp ring-sctp.c -lsctp
*
* usage: ring-sctp <# processes> <# forwards>
*/
#include <asm/msr.h>
#include <stdio.h>
#include <poll.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/tcp.h>
#include <fcntl.h>
#include <netdb.h>
#include <netinet/sctp.h>
#include <sys/time.h>
/*
 * Return the current wall-clock time in seconds, as a double made of the
 * whole seconds plus the microsecond fraction reported by gettimeofday().
 */
double second(void) {
    struct timeval now;

    /* The timezone argument is obsolete and was never read; POSIX leaves the
     * behaviour unspecified when it is non-NULL, so pass NULL. */
    gettimeofday(&now, NULL);
    return (double)now.tv_sec + (double)now.tv_usec / 1.0e6;
}
/* Destination address of one peer process; both fields are filled in by get_adr(). */
typedef struct {
struct sockaddr sockadr; /* socket address copied from the getaddrinfo() result */
int len; /* byte length that do_send() passes along with sockadr */
} adr_t;
/*
 * Resolve "localhost":<port> and store the resulting socket address in *adr.
 *
 * adr:  output; receives the address bytes and their length.
 * port: port number of the peer to look up.
 *
 * Returns 0 on success, -1 if the lookup fails or the address does not fit
 * into adr->sockadr (a message is printed to stderr in both cases).
 */
int get_adr(adr_t *adr, int port) {
    struct addrinfo hints = {0};
    struct addrinfo *res = NULL;
    char service[16];
    int rc;

    /*
     * Ask for IPv4 only. With PF_UNSPEC the first result may be an IPv6
     * address, which is larger than struct sockaddr (it would be truncated
     * by the copy below) and does not match the PF_INET sockets used here.
     */
    hints.ai_family = AF_INET;
    hints.ai_socktype = SOCK_STREAM;

    snprintf(service, sizeof(service), "%d", port);
    rc = getaddrinfo("localhost", service, &hints, &res);
    if (rc != 0) {
        fprintf(stderr,
                "getaddrinfo error: [%s]\n",
                gai_strerror(rc));
        return -1;
    }
    if (res->ai_addrlen > sizeof(adr->sockadr)) {
        fprintf(stderr, "getaddrinfo error: [address too large]\n");
        freeaddrinfo(res);
        return -1;
    }
    /* Copy exactly the number of bytes the resolver returned. */
    memcpy(&adr->sockadr, res->ai_addr, res->ai_addrlen);
    adr->len = (int)res->ai_addrlen;
    freeaddrinfo(res);
    return 0;
}
/*
 * Create an IPv4 SCTP socket of type SOCK_SEQPACKET, bind it to the given
 * port on INADDR_ANY and put it into the listening state.
 *
 * port: local port number to bind to.
 *
 * Returns the socket descriptor, or -1 after reporting the failing call
 * with perror().
 */
int init_listen(int port) {
    /* Zero the whole structure so the sin_zero padding handed to bind()
     * is not indeterminate. */
    struct sockaddr_in name = {0};
    int sock;

    sock = socket(PF_INET, SOCK_SEQPACKET, IPPROTO_SCTP);
    if (sock == -1) {
        perror("socket");
        return -1;
    }
    name.sin_family = AF_INET;
    name.sin_port = htons(port);
    name.sin_addr.s_addr = htonl(INADDR_ANY);
    if (bind(sock, (struct sockaddr *)&name, sizeof(name)) == -1) {
        perror("bind");
        return -1;
    }
    if (listen(sock, 10) == -1) {
        perror("listen");
        return -1;
    }
    return sock;
}
/*
 * Receive one SCTP message of n bytes from sock into buf.
 *
 * Prints "recv: <error>" when sctp_recvmsg() fails and "recv incomplete"
 * whenever the result differs from n (which includes the failure case).
 *
 * Returns whatever sctp_recvmsg() returned: the byte count, or -1 on error.
 */
int do_recv(int sock, void *buf, int n) {
    struct sockaddr peer;
    struct sctp_sndrcvinfo info;
    socklen_t peer_len = sizeof(peer);  /* the API takes a socklen_t *, not an int * */
    /* Must be initialised: the pointer is in/out, and the library may use
     * the incoming value as receive flags. */
    int flags = 0;
    int res;

    res = sctp_recvmsg(sock, buf, n, &peer, &peer_len, &info, &flags);
    if (res == -1) {
        perror("recv");
    }
    if (res != n) {
        fprintf(stderr, "recv incomplete\n");
    }
    return res;
}
/*
 * Send n bytes from buf as one SCTP message on sock to the peer in *adr.
 *
 * The trailing sctp_sendmsg() arguments are: payload protocol id 666,
 * flags MSG_ADDR_OVER, stream 0, time-to-live 0, context 444.
 *
 * Reports a failed call with perror("send") and prints "send incomplete"
 * whenever the result is not n. Returns the sctp_sendmsg() result.
 */
int do_send(int sock, adr_t *adr, void *buf, int n) {
    const int sent = sctp_sendmsg(sock, buf, n, &adr->sockadr, adr->len,
                                  666, MSG_ADDR_OVER, 0, 0, 444);

    if (sent == -1)
        perror("send");
    if (sent != n)
        fprintf(stderr, "send incomplete\n");
    return sent;
}
/*
 * ring-sctp driver.
 *
 * Forks cnt-1 children so that cnt processes (ranks 0..cnt-1) exist, each
 * listening on port 11100+rank. One single-byte message is then forwarded
 * `its` times around the ring 0 -> 1 -> ... -> cnt-1 -> 0.
 * Rank 0 waits for all children and prints the average time per forward.
 *
 * argv[1]: number of processes (2..127, since rank is stored in a char).
 * argv[2]: number of forwards (>= 1).
 *
 * Returns 0 on success, 1 on bad arguments or a setup failure.
 */
int main(int argc, char *argv[]) {
    int i, cnt, pid, src, dest, its, status;
    int lsock;
    char rank;
    char data = 0;      /* payload byte; the value is irrelevant but must be defined */
    int port = 11100;
    double t0, t1;

    if (argc < 3) {
        fprintf(stderr, "usage: ring-sctp <# processes> <# forwards>\n");
        return 1;
    }
    /* # processes */
    cnt = atoi(argv[1]);
    /* # forwards */
    its = atoi(argv[2]);
    /* cnt < 2 would make dest = 1 out of range (or %cnt divide by zero);
     * cnt > 127 would not fit in the char rank. */
    if (cnt < 2 || cnt > 127 || its < 1) {
        fprintf(stderr, "need 2..127 processes and at least 1 forward\n");
        return 1;
    }
    {
        adr_t dests[cnt];

        /* Create processes */
        rank = 0;
        for (i = 1; i < cnt; i++) {
            pid = fork();
            if (pid == -1) {
                /* A missing rank would leave the others blocked forever. */
                perror("fork");
                return 1;
            }
            if (pid == 0) {
                rank = cnt - i;
                break;
            }
        }
        /* Listen */
        lsock = init_listen(port + rank);
        if (lsock == -1) {
            return 1;
        }
        sleep(2); /* "Ensure" that all processes are listening, HACK!!! */
        for (i = 0; i < cnt; i++) {
            if (get_adr(dests + i, port + i) == -1) {
                return 1;
            }
        }
        srandom(666);
        src = 0;
        dest = 1;
        /* Send startup message */
        if (rank == src) {
            do_send(lsock, &dests[dest], &data, 1);
        }
        /* Receive message and forward to the next process in the ring */
        t0 = second();
        for (i = 0; i < its; i++) {
            if (rank == dest) {
                do_recv(lsock, &data, 1);
            }
            src = dest;
            dest = (dest + 1) % cnt;
            if (src == rank) {
                do_send(lsock, &dests[dest], &data, 1);
            }
        }
        if (rank == 0) {
            /* Reap every child before stopping the clock. */
            for (i = 1; i < cnt; i++) {
                wait(&status);
            }
            t1 = second();
            printf("avg forwarding time: %lf\n", ((t1 - t0) / its));
        }
    }
    return 0;
}
[-- Attachment #5: ring-tcp.c --]
[-- Type: text/x-csrc, Size: 5064 bytes --]
/* By Hans Henrik Happe
*
* usage: ring-tcp <# processes> <# forwards>
*/
#include <asm/msr.h>
#include <stdio.h>
#include <poll.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/tcp.h>
#include <fcntl.h>
#include <netdb.h>
#include <sys/time.h>
#include <sys/wait.h>
/*
 * Return the current wall-clock time in seconds, as a double made of the
 * whole seconds plus the microsecond fraction reported by gettimeofday().
 */
double second(void) {
    struct timeval now;

    /* The timezone argument is obsolete and was never read; POSIX leaves the
     * behaviour unspecified when it is non-NULL, so pass NULL. */
    gettimeofday(&now, NULL);
    return (double)now.tv_sec + (double)now.tv_usec / 1.0e6;
}
/*
 * Open a TCP connection to "localhost":<port> with TCP_NODELAY enabled.
 *
 * port: port number of the listening peer.
 *
 * Returns the connected socket descriptor, or -1 after printing the failing
 * step to stderr. The getaddrinfo() result is released on every path.
 * NOTE(review): the descriptor itself is not closed on the setsockopt/connect
 * error paths.
 */
int do_connect(int port) {
    struct addrinfo hints = {0};
    struct addrinfo *res = NULL;
    char service[16];
    int rc, sock, on = 1;
    int result = -1;

    /* IPv4 only, so the resolved address matches the AF_INET socket created
     * below; PF_UNSPEC may return an IPv6 address first. */
    hints.ai_family = AF_INET;
    hints.ai_socktype = SOCK_STREAM;

    snprintf(service, sizeof(service), "%d", port);
    rc = getaddrinfo("localhost", service, &hints, &res);
    if (rc != 0) {
        fprintf(stderr,
                "getaddrinfo error: [%s]\n",
                gai_strerror(rc));
        return -1;
    }

    sock = socket(AF_INET, SOCK_STREAM, 0);
    if (sock == -1) {
        perror("socket");
    } else if (setsockopt(sock, SOL_TCP, TCP_NODELAY, &on, sizeof(on)) == -1) {
        perror("setsockopt");
    } else if (connect(sock, res->ai_addr, res->ai_addrlen) == -1) {
        /* ai_addrlen is the real length of the resolved address */
        perror("connect");
    } else {
        result = sock;
    }

    freeaddrinfo(res);
    return result;
}
/*
 * Create an IPv4 TCP socket with SO_REUSEADDR set, bind it to the given
 * port on INADDR_ANY and put it into the listening state.
 *
 * port: local port number to bind to.
 *
 * Returns the listening descriptor, or -1 after reporting the failing call
 * with perror().
 */
int start_listen(int port) {
    /* Zero the whole structure so the sin_zero padding handed to bind()
     * is not indeterminate. */
    struct sockaddr_in name = {0};
    int on = 1;
    int sock;

    sock = socket(AF_INET, SOCK_STREAM, 0);
    if (sock == -1) {
        perror("socket");
        return -1;
    }
    if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) {
        perror("setsockopt");
        return -1;
    }
    name.sin_family = AF_INET;
    name.sin_port = htons(port);
    name.sin_addr.s_addr = htonl(INADDR_ANY);
    if (bind(sock, (struct sockaddr *)&name, sizeof(name)) == -1) {
        perror("bind");
        return -1;
    }
    if (listen(sock, 10) == -1) {
        perror("listen");
        return -1;
    }
    return sock;
}
/*
 * Accept one pending connection on the listening socket lsock and enable
 * TCP_NODELAY on it.
 *
 * Returns the new connection's descriptor, or -1 after perror() if either
 * accept() or setsockopt() fails.
 */
int do_accept(int lsock) {
    struct sockaddr peer;
    socklen_t peer_len = sizeof(peer);
    int nodelay = 1;
    int conn;

    conn = accept(lsock, &peer, &peer_len);
    if (conn == -1) {
        perror("accept");
        return -1;
    }

    if (setsockopt(conn, SOL_TCP, TCP_NODELAY, &nodelay, sizeof(nodelay)) != -1) {
        return conn;
    }
    perror("setsockopt");
    return -1;
}
/*
 * Read exactly n bytes from fd into buf.
 *
 * A stream socket may return fewer bytes than requested from a single
 * read(), so the call is repeated until n bytes have arrived, the peer
 * closes the stream, or an error occurs.
 *
 * Returns the number of bytes read (n on success, less on end of stream),
 * or -1 on error. Prints "read: <error>" on error and "read incomplete"
 * whenever fewer than n bytes were obtained.
 */
int do_read(int fd, void *buf, int n) {
    char *dst = buf;
    int total = 0;

    while (total < n) {
        ssize_t got = read(fd, dst + total, n - total);

        if (got == -1) {
            perror("read");
            fprintf(stderr, "read incomplete\n");
            return -1;
        }
        if (got == 0) {
            break;      /* end of stream: no more data will arrive */
        }
        total += (int)got;
    }
    if (total != n) {
        fprintf(stderr, "read incomplete\n");
    }
    return total;
}
/*
 * Write exactly n bytes from buf to fd.
 *
 * write() may accept fewer bytes than requested, so the call is repeated
 * until everything has been written or an error occurs.
 *
 * Returns the number of bytes written (n on success), or -1 on error.
 * Prints "write: <error>" on error and "write incomplete" whenever fewer
 * than n bytes were written.
 */
int do_write(int fd, void *buf, int n) {
    const char *src = buf;
    int total = 0;

    while (total < n) {
        ssize_t put = write(fd, src + total, n - total);

        if (put == -1) {
            perror("write");
            fprintf(stderr, "write incomplete\n");
            return -1;
        }
        if (put == 0) {
            break;      /* no progress; stop instead of spinning */
        }
        total += (int)put;
    }
    if (total != n) {
        fprintf(stderr, "write incomplete\n");
    }
    return total;
}
/*
 * ring-tcp driver.
 *
 * Forks cnt-1 children so that cnt processes (ranks 0..cnt-1) exist, each
 * listening on port 11100+rank, and connects every pair of processes with
 * one TCP connection. One single-byte message is then forwarded `its`
 * times around the ring 0 -> 1 -> ... -> cnt-1 -> 0.
 * Rank 0 waits for all children and prints the average time per forward.
 *
 * argv[1]: number of processes (2..127, since ranks are exchanged as a char).
 * argv[2]: number of forwards (>= 1).
 *
 * Returns 0 on success, 1 on bad arguments or a setup failure.
 */
int main(int argc, char *argv[]) {
    int i, cnt, pid, dest, src, its, status;
    int lsock, sock;
    char id, rank;
    char data = 0;      /* payload byte; the value is irrelevant but must be defined */
    int port = 11100;
    double t0, t1;

    if (argc < 3) {
        fprintf(stderr, "usage: ring-tcp <# processes> <# forwards>\n");
        return 1;
    }
    /* # processes */
    cnt = atoi(argv[1]);
    /* # forwards */
    its = atoi(argv[2]);
    /* cnt < 2 would make dest = 1 out of range (or %cnt divide by zero);
     * cnt > 127 would not fit in the char rank/id. */
    if (cnt < 2 || cnt > 127 || its < 1) {
        fprintf(stderr, "need 2..127 processes and at least 1 forward\n");
        return 1;
    }
    {
        int socks[cnt];

        for (i = 0; i < cnt; i++) {
            socks[i] = -1;      /* own slot stays -1; it is never used */
        }
        /* Create processes */
        rank = 0;
        for (i = 1; i < cnt; i++) {
            pid = fork();
            if (pid == -1) {
                /* A missing rank would leave the others blocked forever. */
                perror("fork");
                return 1;
            }
            if (pid == 0) {
                rank = i;
                break;
            }
        }
        /* Listen */
        lsock = start_listen(port + rank);
        if (lsock == -1) {
            return 1;
        }
        sleep(2); /* "Ensure" that all processes are listening, HACK!!! */
        /* Accept one connection from every lower rank; the peer sends its
         * rank first and gets ours back. */
        for (i = 0; i < rank; i++) {
            sock = do_accept(lsock);
            if (sock == -1) {
                return 1;
            }
            if (do_read(sock, &id, 1) != 1 || do_write(sock, &rank, 1) != 1) {
                return 1;
            }
            if (id < 0 || id >= cnt) {
                fprintf(stderr, "unexpected peer id\n");
                return 1;
            }
            socks[id] = sock;
        }
        /* Connect to every higher rank, mirroring the exchange above. */
        for (i = rank; i < cnt - 1; i++) {
            sock = do_connect(port + i + 1);
            if (sock == -1) {
                return 1;
            }
            if (do_write(sock, &rank, 1) != 1 || do_read(sock, &id, 1) != 1) {
                return 1;
            }
            if (id < 0 || id >= cnt) {
                fprintf(stderr, "unexpected peer id\n");
                return 1;
            }
            socks[id] = sock;
        }
        srandom(666);
        src = 0;
        dest = 1;
        /* Write startup message */
        if (rank == src) {
            do_write(socks[dest], &data, 1);
        }
        /* Receive message and forward to the next process in the ring */
        t0 = second();
        for (i = 0; i < its; i++) {
            if (rank == dest) {
                do_read(socks[src], &data, 1);
            }
            src = dest;
            dest = (dest + 1) % cnt;
            if (src == rank) {
                do_write(socks[dest], &data, 1);
            }
        }
        if (rank == 0) {
            /* Reap every child before stopping the clock. */
            for (i = 1; i < cnt; i++) {
                wait(&status);
            }
            t1 = second();
            printf("avg forwarding time: %lf\n", ((t1 - t0) / its));
        }
    }
    return 0;
}
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2005-06-09 14:04 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-06-06 9:35 PROBLEM: High TCP latency Hans Henrik Happe
2005-06-09 13:47 ` Andi Kleen
2005-06-09 14:04 ` Hans Henrik Happe
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).