netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Thomas Graf <tgraf@suug.ch>
To: netdev@oss.sgi.com
Cc: Jamal Hadi Salim <hadi@cyberus.ca>
Subject: [PATCH 3/5] [LIB] Naive regular expression string-matching algorithm
Date: Sat, 28 May 2005 00:48:37 +0200	[thread overview]
Message-ID: <20050527224837.GJ15391@postel.suug.ch> (raw)
In-Reply-To: <20050527224725.GG15391@postel.suug.ch>


Signed-off-by: Thomas Graf <tgraf@suug.ch>

---
commit 81553cdea2a7ff762cb44aeced743d3a77efd2d7
tree 266c4ad512e5ae71bd7cd88c94bfb18fb2f761d6
parent 5b70ca8eab4c7d7ef884582d9713cdbffa0f4cd4
author Thomas Graf <tgraf@suug.ch> Fri, 27 May 2005 23:44:24 +0200
committer Thomas Graf <tgraf@suug.ch> Fri, 27 May 2005 23:44:24 +0200

 include/linux/textsearch_regexp.h |   47 ++++++
 lib/Kconfig                       |    9 +
 lib/Makefile                      |    1 
 lib/ts_regexp.c                   |  272 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 329 insertions(+)

Index: include/linux/textsearch_regexp.h
===================================================================
--- /dev/null  (tree:4d90ca82120da7b308b9a6bf11a1069473ca5d30)
+++ 266c4ad512e5ae71bd7cd88c94bfb18fb2f761d6/include/linux/textsearch_regexp.h  (mode:100644)
@@ -0,0 +1,47 @@
+#ifndef __LINUX_TEXTSEARCH_REGEXP_H
+#define __LINUX_TEXTSEARCH_REGEXP_H
+
+#include <linux/types.h>
+
+enum {
+	TS_RE_SPECIFIC,		/* specific character */
+	TS_RE_WILDCARD,		/* any character */
+	TS_RE_DIGIT,		/* isdigit() */
+	TS_RE_XDIGIT,		/* isxdigit() */
+	TS_RE_PRINT,		/* isprint() */
+	TS_RE_ALPHA,		/* isalpha() */
+	TS_RE_ALNUM,		/* isalnum() */
+	TS_RE_ASCII,		/* isascii() */
+	TS_RE_CNTRL,		/* iscntrl() */
+	TS_RE_GRAPH,		/* isgraph() */
+	TS_RE_LOWER,		/* islower() */
+	TS_RE_UPPER,		/* isupper() */
+	TS_RE_PUNCT,		/* ispunct() */
+	TS_RE_SPACE,		/* isspace() */
+	__TS_RE_TYPE_MAX,
+};
+#define TS_RE_TYPE_MAX (__TS_RE_TYPE_MAX - 1)
+
+enum {
+	TS_RE_SINGLE,		/* 1 occurrence */
+	TS_RE_PERHAPS,		/* 1 or 0 occurrence */
+	TS_RE_ANY,		/* 0..n occurrences */
+	TS_RE_MULTI,		/* 1..n occurrences */
+	__TS_RE_RECUR_MAX,
+};
+#define TS_RE_RECUR_MAX (__TS_RE_RECUR_MAX - 1)
+
+/**
+ * struct ts_regexp_token - regular expression token
+ * @type: type of token
+ * @recur: number of recurrences
+ * @value: character value for TS_RE_SPECIFIC
+ */
+struct ts_regexp_token
+{
+	__u16		type;
+	__u8		recur;
+	__u8		value;
+};
+
+#endif
Index: lib/Kconfig
===================================================================
--- 4d90ca82120da7b308b9a6bf11a1069473ca5d30/lib/Kconfig  (mode:100644)
+++ 266c4ad512e5ae71bd7cd88c94bfb18fb2f761d6/lib/Kconfig  (mode:100644)
@@ -68,6 +68,15 @@
 	  To compile this code as a module, choose M here: the
 	  module will be called ts_kmp.
 
+config TEXTSEARCH_REGEXP
+	tristate "Regular Expression"
+	help
+	  Say Y here if you want to be able to search text using a
+	  naive and limited regular expression textsearch algorithm.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called ts_regexp.
+
 endmenu
 
 endmenu
Index: lib/Makefile
===================================================================
--- 4d90ca82120da7b308b9a6bf11a1069473ca5d30/lib/Makefile  (mode:100644)
+++ 266c4ad512e5ae71bd7cd88c94bfb18fb2f761d6/lib/Makefile  (mode:100644)
@@ -34,6 +34,7 @@
 obj-$(CONFIG_REED_SOLOMON) += reed_solomon/
 
 obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o
+obj-$(CONFIG_TEXTSEARCH_REGEXP) += ts_regexp.o
 
 hostprogs-y	:= gen_crc32table
 clean-files	:= crc32table.h
Index: lib/ts_regexp.c
===================================================================
--- /dev/null  (tree:4d90ca82120da7b308b9a6bf11a1069473ca5d30)
+++ 266c4ad512e5ae71bd7cd88c94bfb18fb2f761d6/lib/ts_regexp.c  (mode:100644)
@@ -0,0 +1,272 @@
+/*
+ * lib/ts_regexp.c	Naive and very limited regular expression
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/textsearch.h>
+#include <linux/textsearch_regexp.h>
+
+struct ts_regexp
+{
+	int			ntokens;
+	struct ts_regexp_token	tokens[0];
+};
+
+/* other values derived from ctype.h */
+#define _A		0x100 /* ascii */
+#define _W		0x200 /* wildcard */
+
+/* Map to _ctype flags and some magic numbers */
+static u16 token_map[TS_RE_TYPE_MAX+1] = {
+	[TS_RE_SPECIFIC]  = 0,
+	[TS_RE_WILDCARD]  = _W,
+	[TS_RE_CNTRL]	  = _C,
+	[TS_RE_LOWER]	  = _L,
+	[TS_RE_UPPER]	  = _U,
+	[TS_RE_PUNCT]	  = _P,
+	[TS_RE_SPACE]	  = _S,
+	[TS_RE_DIGIT]	  = _D,
+	[TS_RE_XDIGIT]	  = _D | _X,
+	[TS_RE_ALPHA]	  = _U | _L,
+	[TS_RE_ALNUM]	  = _U | _L | _D,
+	[TS_RE_PRINT]	  = _P | _U | _L | _D | _SP,
+	[TS_RE_GRAPH]	  = _P | _U | _L | _D,
+	[TS_RE_ASCII]	  = _A,
+};
+
+static u16 token_lookup_tbl[256] = {
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*   0-  3 */
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*   4-  7 */
+_W|_A|_C,      _W|_A|_C|_S,  _W|_A|_C|_S,  _W|_A|_C|_S,		/*   8- 11 */
+_W|_A|_C|_S,   _W|_A|_C|_S,  _W|_A|_C,     _W|_A|_C,		/*  12- 15 */
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*  16- 19 */
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*  20- 23 */
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*  24- 27 */
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*  28- 31 */
+_W|_A|_S|_SP,  _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  32- 35 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  36- 39 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  40- 43 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  44- 47 */
+_W|_A|_D,      _W|_A|_D,     _W|_A|_D,     _W|_A|_D,		/*  48- 51 */
+_W|_A|_D,      _W|_A|_D,     _W|_A|_D,     _W|_A|_D,		/*  52- 55 */
+_W|_A|_D,      _W|_A|_D,     _W|_A|_P,     _W|_A|_P,		/*  56- 59 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  60- 63 */
+_W|_A|_P,      _W|_A|_U|_X,  _W|_A|_U|_X,  _W|_A|_U|_X,		/*  64- 67 */
+_W|_A|_U|_X,   _W|_A|_U|_X,  _W|_A|_U|_X,  _W|_A|_U,		/*  68- 71 */
+_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,		/*  72- 75 */
+_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,		/*  76- 79 */
+_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,		/*  80- 83 */
+_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,		/*  84- 87 */
+_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_P,		/*  88- 91 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  92- 95 */
+_W|_A|_P,      _W|_A|_L|_X,  _W|_A|_L|_X,  _W|_A|_L|_X,		/*  96- 99 */
+_W|_A|_L|_X,   _W|_A|_L|_X,  _W|_A|_L|_X,  _W|_A|_L,		/* 100-103 */
+_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,		/* 104-107 */
+_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,		/* 108-111 */
+_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,		/* 112-115 */
+_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,		/* 116-119 */
+_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_P,		/* 120-123 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_C,		/* 124-127 */
+_W,            _W,           _W,           _W,			/* 128-131 */
+_W,            _W,           _W,           _W,			/* 132-135 */
+_W,            _W,           _W,           _W,			/* 136-139 */
+_W,            _W,           _W,           _W,			/* 140-143 */
+_W,            _W,           _W,           _W,			/* 144-147 */
+_W,            _W,           _W,           _W,			/* 148-151 */
+_W,            _W,           _W,           _W,			/* 152-155 */
+_W,            _W,           _W,           _W,			/* 156-159 */
+_W|_S|_SP,     _W|_P,        _W|_P,        _W|_P,		/* 160-163 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 164-167 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 168-171 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 172-175 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 176-179 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 180-183 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 184-187 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 188-191 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 192-195 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 196-199 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 200-203 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 204-207 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 208-211 */
+_W|_U,         _W|_U,        _W|_U,        _W|_P,		/* 212-215 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 216-219 */
+_W|_U,         _W|_U,        _W|_U,        _W|_L,		/* 220-223 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 224-227 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 228-231 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 232-235 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 236-239 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 240-243 */
+_W|_L,         _W|_L,        _W|_L,        _W|_P,		/* 244-247 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 248-251 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L};		/* 252-255 */
+
+static inline int match_token(struct ts_regexp_token *t, u8 d)
+{
+	if (t->type)
+		return (token_lookup_tbl[d] & t->type) != 0;
+	else
+		return t->value == d;
+}
+
+static int regexp_find(struct ts_config *conf, struct ts_state *state)
+{
+	struct ts_regexp *regexp = ts_config_priv(conf);
+	int i, q, consumed = state->offset;
+	int match_start = state->offset;
+	struct ts_regexp_token *t = NULL, *next;
+	size_t text_len = 0;
+	unsigned char *text;
+
+#define GET_TEXT() \
+	({ i = 0; conf->get_text(consumed, &text, conf, state); })
+
+#define end_of_text()  (i >= text_len && !GET_TEXT())
+#define more_text() (i < text_len || GET_TEXT())
+
+#define NEXT_CHAR() do { i++; consumed++; } while (0)
+	
+	for (i = 0, q = 0; q < regexp->ntokens; q++) {
+		t = &regexp->tokens[q];
+
+		if (likely(q < (regexp->ntokens - 1)))
+			next = &regexp->tokens[q+1];
+		else
+			next = NULL;
+
+		switch (t->recur) {
+			case TS_RE_SINGLE:
+				if (unlikely(end_of_text()))
+					goto no_match;
+
+				if (!match_token(t, text[i]))
+					goto no_match;
+				break;
+
+			case TS_RE_PERHAPS:
+				if (likely(more_text()))
+					if (match_token(t, text[i]))
+						break;
+				continue;
+
+			case TS_RE_MULTI:
+				if (unlikely(end_of_text()))
+					goto no_match;
+
+				if (!match_token(t, text[i]))
+					goto no_match;
+
+				NEXT_CHAR();
+				/* fall through */
+
+			case TS_RE_ANY:
+				if (next == NULL)
+					goto found_match;
+
+				if (likely(more_text())) {
+					while (!match_token(next, text[i])) {
+						if (!match_token(t, text[i]))
+							goto no_match;
+						NEXT_CHAR();
+						if (unlikely(end_of_text()))
+							goto no_match;
+					}
+				}
+				continue;
+		}
+
+		NEXT_CHAR();
+	}
+
+	if (q >= (regexp->ntokens - 1))
+		goto found_match;
+
+no_match:
+	return -1;
+
+found_match:
+	state->offset = consumed + i + 1;
+	return match_start;
+}
+
+static struct ts_config *regexp_init(const unsigned char *pattern, size_t len,
+				     int gfp_mask)
+{
+	int i, err = -EINVAL;
+	struct ts_config *conf;
+	struct ts_regexp *regexp;
+	struct ts_regexp_token *tokens = (struct ts_regexp_token *) pattern;
+
+	if (len  % sizeof(struct ts_regexp_token))
+		goto errout;
+
+	for (i = 0; i < len / sizeof(struct ts_regexp_token); i++) {
+		struct ts_regexp_token *t = &tokens[i];
+
+		if (t->type > TS_RE_TYPE_MAX ||
+		    t->type > TS_RE_RECUR_MAX)
+			goto errout;
+
+		t->type = token_map[t->type];
+	}
+
+	conf = alloc_ts_config(len, gfp_mask);
+	if (IS_ERR(conf))
+		return conf;
+
+	regexp = ts_config_priv(conf);
+	regexp->ntokens = len / sizeof(struct ts_regexp_token);
+	memcpy(regexp->tokens, pattern, len);
+
+	return conf;
+
+errout:
+	return ERR_PTR(err);
+}
+
+static unsigned char *regexp_get_pattern(struct ts_config *conf)
+{
+	struct ts_regexp *regexp = ts_config_priv(conf);
+	return (unsigned char *) regexp->tokens;
+}
+
+static unsigned int regexp_get_pattern_len(struct ts_config *conf)
+{
+	struct ts_regexp *regexp = ts_config_priv(conf);
+	return regexp->ntokens * sizeof(struct ts_regexp_token);
+}
+
+static struct ts_ops regexp_ops = {
+	.name		  = "regexp",
+	.find		  = regexp_find,
+	.init		  = regexp_init,
+	.get_pattern	  = regexp_get_pattern,
+	.get_pattern_len  = regexp_get_pattern_len,
+	.owner		  = THIS_MODULE,
+	.list		  = LIST_HEAD_INIT(regexp_ops.list)
+};
+
+static int __init init_regexp(void)
+{
+	return textsearch_register(&regexp_ops);
+}
+
+static void __exit exit_regexp(void)
+{
+	textsearch_unregister(&regexp_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_regexp);
+module_exit(exit_regexp);

  parent reply	other threads:[~2005-05-27 22:48 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2005-05-27 22:47 [RFC] textsearch infrastructure et al v2 Thomas Graf
2005-05-27 22:47 ` [PATCH 1/5] [LIB] textsearch infrastructure Thomas Graf
2005-05-27 22:48 ` [PATCH 2/5] [LIB] Knuth-Morris-Pratt string-matching algorithm Thomas Graf
2005-05-27 22:48 ` Thomas Graf [this message]
2005-05-27 22:48 ` [PATCH 4/5] [NET] Add skb_find_text() to search for a text pattern in skbs Thomas Graf
2005-05-28  3:11   ` Pablo Neira
2005-05-28 11:32     ` Thomas Graf
2005-05-27 22:49 ` [PATCH 5/5] [PKT_SCHED] textsearch ematch Thomas Graf
2005-05-28 11:59 ` [RFC] textsearch infrastructure et al v2 jamal
2005-05-28 12:35   ` Thomas Graf
2005-05-28 12:56     ` Pablo Neira
2005-05-28 12:58       ` Pablo Neira
2005-05-28 12:58       ` Pablo Neira
2005-05-28 13:58       ` Thomas Graf
2005-05-31 22:05       ` David S. Miller
2005-05-31 21:56 ` David S. Miller
2005-05-31 22:44   ` Thomas Graf
2005-05-31 22:50     ` David S. Miller

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20050527224837.GJ15391@postel.suug.ch \
    --to=tgraf@suug.ch \
    --cc=hadi@cyberus.ca \
    --cc=netdev@oss.sgi.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).