* Code critique: checking for syntax errors
@ 2006-01-21 19:10 James Colannino
2006-01-22 9:34 ` Jesse Ruffin
0 siblings, 1 reply; 8+ messages in thread
From: James Colannino @ 2006-01-21 19:10 UTC (permalink / raw)
To: linux-c-programming
Hey everyone. I was wondering if anybody here had the time to glance at
my code here and tell me what they think of it, good or bad. I am
worried that my style is not very good, and I'd really like to improve.
I'm working with the Kernigan and Ritchie Book, "C Programming, 2nd
Edition." This is the 1-24 exercise, which asks the programmer to write
a program that checks C source for rudimentary syntax errors. I check
for unbalanced parenthesis, brackets, braces, double/single quotes (I
properly consider escape characters - I hope :-P) and unterminated
comments. I've run this on multiple C source files, and all of them
(after having tested them with the compiler first) check out as they
should. I've done a little testing to make sure it properly recognizes
unbalanced stuff, but my testing in that area by comparison has been a
bit weak.
Note that the book doesn't go into working with whole strings at once
yet (this is only chapter 1), so that's why I'm using fgetc() for every
individual character in the file.
Thank you everyone for any input you can provide :)
James
/* 1-24-syntax.c - checks C source code for rudimentary syntax
errors such as unbalanced parenthesis, brackets, quotes, etc. */
#include <stdio.h>
#include <stdlib.h>
/* Each of the below functions returns 0 on success and -1 on syntax
error */
int check_brackets(FILE *input);
int check_braces(FILE*INPUT);
int check_parenthesis(FILE *input);
int check_doublequotes(FILE *input);
int check_singlequotes(FILE *input);
int check_comments(FILE *input);
/* keeps track of the current line number */
int ln = 1;
int main (int argc, char *argv[]) {
FILE *input;
if (argc > 2) {
fprintf(stderr, "error: too many arguments\n");
exit(1);
}
if (argc < 2) input = stdin;
else {
if ((input = fopen(argv[1], "ra")) == NULL) {
fprintf(stderr, "error: could not open %s", argv[1]);
exit(1);
}
}
int character;
while ((character = fgetc(input)) != EOF) {
switch(character) {
case '\n':
ln++;
break;
case '{':
if (check_brackets(input) == -1) exit(1);
break;
case '[':
if (check_braces(input) == -1) exit(1);
break;
case '(':
if (check_parenthesis(input) == -1) exit(1);
break;
case '\"':
if (check_doublequotes(input) == -1) exit(1);
break;
case '\'':
if (check_singlequotes(input) == -1) exit(1);
break;
case ']':
fprintf(stderr, "error: line %d: unbalanced ']'\n", ln);
exit(1);
case ')':
fprintf(stderr, "error: line %d: unbalanaced ')'\n", ln);
exit(1);
case '}':
fprintf(stderr, "error: line %d: unbalanaced '}'\n", ln);
exit(1);
default:
break;
}
if (character == '/') {
if ((character = fgetc(input)) == '*') {
if (check_comments(input) == -1) exit(1);
}
else {
if (ungetc(character, input) == NULL) {
fprintf(stderr, "error: could not ungetc\n");
exit(1);
}
}
}
}
printf("Done!\n");
return 0;
}
int check_brackets(FILE *input) {
#ifdef DEBUG
printf("check_brackets()\n");
#endif
int character;
while ((character = fgetc(input)) != EOF) {
switch(character) {
case '\n':
ln++;
break;
case '}':
return 0;
case '{':
if (check_brackets(input) == -1) return -1;
break;
case '[':
if (check_braces(input) == -1) return -1;
break;
case '(':
if (check_parenthesis(input) == -1) return -1;
break;
case '\"':
if (check_doublequotes(input) == -1) return -1;
break;
case '\'':
if (check_singlequotes(input) == -1) return -1;
break;
case ']':
fprintf(stderr, "error: line %d: unbalanced ']'\n", ln);
return -1;
case ')':
fprintf(stderr, "error: line %d: unbalanaced ')'\n", ln);
return -1;
default:
break;
}
if (character == '/') {
if ((character = fgetc(input)) == '*') {
if (check_comments(input) == -1) return -1;
}
else {
if (ungetc(character, input) == NULL) {
fprintf(stderr, "error: could not ungetc\n");
return -1;
}
}
}
}
fprintf(stderr, "error: line %d: unbalanced '{'\n", ln);
return -1;
}
int check_braces(FILE *input) {
#ifdef DEBUG
printf("check_braces()\n");
#endif
int character;
while ((character = fgetc(input)) != EOF) {
switch(character) {
case '\n':
ln++;
break;
case ']':
return 0;
case '{':
if (check_brackets(input) == -1) return -1;
break;
case '[':
if (check_braces(input) == -1) return -1;
break;
case '(':
if (check_parenthesis(input) == -1) return -1;
break;
case '\"':
if (check_doublequotes(input) == -1) return -1;
break;
case '\'':
if (check_singlequotes(input) == -1) return -1;
break;
case '}':
fprintf(stderr, "error: line %d: unbalanced ']'\n", ln);
return -1;
case ')':
fprintf(stderr, "error: line %d: unbalanaced ')'\n", ln);
return -1;
default:
break;
}
if (character == '/') {
if ((character = fgetc(input)) == '*') {
if (check_comments(input) == -1) return -1;
}
else {
if (ungetc(character, input) == NULL) {
fprintf(stderr, "error: could not ungetc\n");
return -1;
}
}
}
}
fprintf(stderr, "error: line %d: unbalanced '['\n", ln);
return -1;
}
int check_parenthesis(FILE *input) {
#ifdef DEBUG
printf("check_parenthesis()\n");
#endif
int character;
while ((character = fgetc(input)) != EOF) {
switch(character) {
case '\n':
ln++;
break;
case ')':
return 0;
case '{':
if (check_brackets(input) == -1) return -1;
break;
case '[':
if (check_braces(input) == -1) return -1;
break;
case '(':
if (check_parenthesis(input) == -1) return -1;
break;
case '\"':
if (check_doublequotes(input) == -1) return -1;
break;
case '\'':
if (check_singlequotes(input) == -1) return -1;
break;
case ']':
fprintf(stderr, "error: line %d: unbalanced ']'\n", ln);
return -1;
case '}':
fprintf(stderr, "error: line %d: unbalanaced ')'\n", ln);
return -1;
default:
break;
}
if (character == '/') {
if ((character = fgetc(input)) == '*') {
if (check_comments(input) == -1) return -1;
}
else {
if (ungetc(character, input) == NULL) {
fprintf(stderr, "error: could not ungetc\n");
return -1;
}
}
}
}
fprintf(stderr, "error: line %d: unbalanced '('\n", ln);
return -1;
}
int check_doublequotes(FILE *input) {
#ifdef DEBUG
printf("check_doublequotes()\n");
#endif
int character;
while ((character = fgetc(input)) != EOF) {
switch(character) {
case '\n':
ln++;
break;
case '\"':
return 0;
default:
break;
}
}
fprintf(stderr, "error: line %d: unbalanced double quotes\n", ln);
return -1;
}
int check_singlequotes(FILE *input) {
#ifdef DEBUG
printf("check_singlequotes()\n");
#endif
int character;
for (int count = 0; (character = fgetc(input)) != EOF; count++) {
switch(character) {
case '\n':
ln++;
break;
case '\\':
character = fgetc(input);
if (character == 'a') continue;
else if (character == 'b') continue;
else if (character == 't') continue;
else if (character == 'n') continue;
else if (character == 'v') continue;
else if (character == 'f') continue;
else if (character == 'r') continue;
else if (character == '\"') continue;
else if (character == '\'') continue;
else if (character == '\?') continue;
else if (character == '\\') continue;
else {
if (ungetc(character, input) == NULL) {
fprintf(stderr, "error: could not ungetc\n");
return -1;
}
}
break;
case '\'':
if (count < 1) {
fprintf(stderr, "error: line %d: too \
few characters between ''\n", ln);
return -1;
}
else return 0;
default:
break;
}
}
fprintf(stderr, "error: line %d: unbalanced single quotes\n", ln);
return -1;
}
int check_comments(FILE *input) {
#ifdef DEBUG
printf("check_comments()\n");
#endif
int character;
while ((character = fgetc(input)) != EOF) {
if (character == '*') {
if ((character = fgetc(input)) == '/') return 0;
else {
if (ungetc(character, input) == NULL) {
fprintf(stderr, "error: could not ungetc\n");
return -1;
}
}
}
}
fprintf(stderr, "error: line %d: unterminated comment\n", ln);
return -1;
}
--
My blog: http://www.crazydrclaw.com/
My homepage: http://james.colannino.org/
^ permalink raw reply [flat|nested] 8+ messages in thread* Re: Code critique: checking for syntax errors
2006-01-21 19:10 Code critique: checking for syntax errors James Colannino
@ 2006-01-22 9:34 ` Jesse Ruffin
2006-01-23 19:47 ` James Colannino
0 siblings, 1 reply; 8+ messages in thread
From: Jesse Ruffin @ 2006-01-22 9:34 UTC (permalink / raw)
To: linux-c-programming
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Overall this code is readable and consistent. The comments are well
placed and appropriately few. The structure does a good job of
implementing a stack. There are some places where your control
structures could be simplified or made a little easier to read. It's
clear that you are being explicit, and that is a very good practice.
However, there was one thing that caused compilation failure without
specifying some options, and more that caused warnings and bugs (only on
binary files though).
line 303:
| for (int count = 0; (character = fgetc(input)) != EOF; count++) {
If you can at all manage it, and most times you can, put all of your
variable declarations right at the beginning of the function. This also
implies moving your debug statements below them as they will be compiled
in sometime. Doing so will aid in portability to older, or more strict,
compilers. And try extra hard to avoid variable declarations in the loop
definition, as this is often seen as an error by compilers unless you
are using the C99 standard.
Multiple locations:
| if (ungetc(character, input) == NULL) {
ungetc() returns an EOF on error; NULL may, and often does, differ in
value. This causes the program to improperly report an unget error on
files with a NULL character in them. This happened to me when I ran the
program against an entire package I was writing; in particular, a .png
file. This isn't very bad in this case as it is not supposed to balance
png's. However, if it were meant to process binary files it would be a
serious bug.
Multiple locations:
| exit(1);
C defines two values, EXIT_SUCCESS and EXIT_FAILURE, that should be used
for the exit() value. These will pass the proper value to the host
environment. As that value may differ, you should use them to make your
code as portable as possible.
Once again, I think that this is good code overall. It's good to see
people being careful and using extra parenthesis and braces, rather than
hunting down logic bugs caused by a lack of them.
Jesse Ruffin
AJP Services
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.2 (GNU/Linux)
iD8DBQFD01GUx26WdJ5m3EsRAhnLAJ9pZ4osq/jjNPNa7sHKrdWL53hIUACfbBVU
GRQlggDJtZfgShpOg5vKrKk=
=iIAt
-----END PGP SIGNATURE-----
^ permalink raw reply [flat|nested] 8+ messages in thread* Re: Code critique: checking for syntax errors
2006-01-22 9:34 ` Jesse Ruffin
@ 2006-01-23 19:47 ` James Colannino
2006-01-23 22:37 ` Jesse Ruffin
0 siblings, 1 reply; 8+ messages in thread
From: James Colannino @ 2006-01-23 19:47 UTC (permalink / raw)
To: linux-c-programming
Jesse Ruffin wrote:
> If you can at all manage it, and most times you can, put all of your
> variable declarations right at the beginning of the function. This also
> implies moving your debug statements below them as they will be compiled
> in sometime. Doing so will aid in portability to older, or more strict,
> compilers.
Do stricter and older compilers complain if variables aren't declared at
the beginning of the function?
> And try extra hard to avoid variable declarations in the loop
> definition, as this is often seen as an error by compilers unless you
> are using the C99 standard.
Actually, I was banking on the C99 standard (I guess perhaps it would
have been wise for me to say so when I posted the code.)
>
> Multiple locations:
> | if (ungetc(character, input) == NULL) {
>
> ungetc() returns an EOF on error[...]
You're right. I changed that :)
> C defines two values, EXIT_SUCCESS and EXIT_FAILURE, that should be used
> for the exit() value. These will pass the proper value to the host
> environment. As that value may differ, you should use them to make your
> code as portable as possible.
I can understand EXIT_FAILURE, but why do all the C books tell you to
return 0 instead of EXIT_SUCCESS? I always thought that both return()
and exit() did the same basic thing, so I'm confused as to why a book
teaching how to write portable code would say to simply use 0 (that's
with return() and not exit().) I have implemented exit(EXIT_FAILURE)
instead of exit(1).
>
> Once again, I think that this is good code overall. It's good to see
> people being careful and using extra parenthesis and braces, rather than
> hunting down logic bugs caused by a lack of them.
Thanks. I do try hard to be explicit and to comment when necessary. I
still have a lot of work to do, but hopefully I'm getting there.
James
--
My blog: http://www.crazydrclaw.com/
My homepage: http://james.colannino.org/
"If Carpenters made houses the way programmers design programs, the first woodpecker to come along would destroy all of civilization." --Computer Proverb
^ permalink raw reply [flat|nested] 8+ messages in thread* Re: Code critique: checking for syntax errors
2006-01-23 19:47 ` James Colannino
@ 2006-01-23 22:37 ` Jesse Ruffin
2006-01-23 22:59 ` James Colannino
0 siblings, 1 reply; 8+ messages in thread
From: Jesse Ruffin @ 2006-01-23 22:37 UTC (permalink / raw)
To: linux-c-programming
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
James Colannino wrote:
| Do stricter and older compilers complain if variables aren't declared at
| the beginning of the function?
| Actually, I was banking on the C99 standard (I guess perhaps it would
| have been wise for me to say so when I posted the code.)
If you are using a C99 compiler it won't, but older C specifications do
not allow variable definition anywhere but the beginning or between the
function declaration and definition (K&R style). Although C99 is gaining
a lot of ground, I believe that there still are some compilers that
don't support it at all, and many that don't by default.
| I can understand EXIT_FAILURE, but why do all the C books tell you to
| return 0 instead of EXIT_SUCCESS? I always thought that both return()
| and exit() did the same basic thing, so I'm confused as to why a book
| teaching how to write portable code would say to simply use 0 (that's
| with return() and not exit().) I have implemented exit(EXIT_FAILURE)
| instead of exit(1).
The man page for exit has this to say about EXIT_*:
| The use of EXIT_SUCCESS and EXIT_FAILURE is slightly more portable (to
| non-Unix environments) than that of 0 and some non-zero value like 1 or
| -1. In particular, VMS uses a different convention.
and the info pages on exit vs. return:
| A process terminates normally when its program signals it is done by
| calling `exit'. Returning from `main' is equivalent to calling `exit',
| and the value that `main' returns is used as the argument to `exit'.
Hope that helps a bit.
Also, #defines are almost always better than hard coded values. If
nothing else they're more readable, and often centralize changes.
Jesse Ruffin
AJP Services
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.2 (GNU/Linux)
iD8DBQFD1VqQ8GGeAXLl3osRApXRAJ9k1BzmmmB5prkONd2YDTn064lHewCfcZ5m
S9uAmyjy7LXnYpDmp92H04o=
=l0AT
-----END PGP SIGNATURE-----
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: Code critique: checking for syntax errors
2006-01-23 22:37 ` Jesse Ruffin
@ 2006-01-23 22:59 ` James Colannino
2006-01-24 0:44 ` Jesse Ruffin
0 siblings, 1 reply; 8+ messages in thread
From: James Colannino @ 2006-01-23 22:59 UTC (permalink / raw)
To: linux-c-programming
Jesse Ruffin wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
>
> James Colannino wrote:
> | Do stricter and older compilers complain if variables aren't
> declared at
> | the beginning of the function?
> | Actually, I was banking on the C99 standard (I guess perhaps it would
> | have been wise for me to say so when I posted the code.)
>
> If you are using a C99 compiler it won't, but older C specifications do
> not allow variable definition anywhere but the beginning or between the
> function declaration and definition (K&R style). Although C99 is gaining
> a lot of ground, I believe that there still are some compilers that
> don't support it at all, and many that don't by default.
What's weird about that is that in order to do variable declarations
within the definition of a for loop, I need to pass -std=c99 to GCC.
However, I don't need to do that in order to simply declare variables
later in the function. That would lead me to believe that C99 is not
required to do so. Could it possibly be a C89 thing? How far back is
considered ANSI C?
James
--
My blog: http://www.crazydrclaw.com/
My homepage: http://james.colannino.org/
"If Carpenters made houses the way programmers design programs, the first woodpecker to come along would destroy all of civilization." --Computer Proverb
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: Code critique: checking for syntax errors
2006-01-23 22:59 ` James Colannino
@ 2006-01-24 0:44 ` Jesse Ruffin
2006-01-24 2:06 ` James Colannino
2006-01-24 9:34 ` Glynn Clements
0 siblings, 2 replies; 8+ messages in thread
From: Jesse Ruffin @ 2006-01-24 0:44 UTC (permalink / raw)
To: linux-c-programming
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
James Colannino wrote:
|
| What's weird about that is that in order to do variable declarations
| within the definition of a for loop, I need to pass -std=c99 to GCC.
| However, I don't need to do that in order to simply declare variables
| later in the function. That would lead me to believe that C99 is not
| required to do so. Could it possibly be a C89 thing? How far back is
| considered ANSI C?
|
The gcc compiler by default uses -std=gnu89, which is C89 (which is the
ANSI/ISO standard) plus extensions. One of them is this:
| ISO C99 and ISO C++ allow declarations and code to be freely mixed
| within compound statements. As an extension, GCC also allows this in
| C89 mode. For example, you could do:
|
| int i;
| /* ... */
| i++;
| int j = i + 2;
However, this does not extend to variables in loop declarations. C99
does allow declarations in loops, but C89 requires declarations with
constant expressions only before any evaluated expression.
If you want to see gcc do ANSI c, try:
gcc -Wall -ansi -pedantic
The pedantic is necessary to get some extra warnings about ANSI issues.
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.2 (GNU/Linux)
iD8DBQFD1Xhz8GGeAXLl3osRAsQ6AKCJcs/9ozAQvbCJ/YLd2ZHbLiA1CQCfcjxm
XWnVeMnOw5Z6OREybtXpZpE=
=3zmi
-----END PGP SIGNATURE-----
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2006-01-24 9:34 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-01-21 19:10 Code critique: checking for syntax errors James Colannino
2006-01-22 9:34 ` Jesse Ruffin
2006-01-23 19:47 ` James Colannino
2006-01-23 22:37 ` Jesse Ruffin
2006-01-23 22:59 ` James Colannino
2006-01-24 0:44 ` Jesse Ruffin
2006-01-24 2:06 ` James Colannino
2006-01-24 9:34 ` Glynn Clements
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).