[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: Patch to gperf 2.7.2 for case-independent keyword matching
From: |
Bruce Lilly |
Subject: |
Re: Patch to gperf 2.7.2 for case-independent keyword matching |
Date: |
Wed, 20 Feb 2002 09:28:05 -0500 |
Bruce Lilly wrote:
> >
> > In several important situations, keywords must be matched
> > independent of case (email header field names, domain names,
> > etc.).
> >
> > The attached patch to gperf 2.7.2 provides the option to
> > generate output which does case-independent matching and
> > documents that option.
> >
> > Best regards,
> > Bruce Lilly
>
This is a revised patch to the original gperf 2.7.2 files.
The first set of patches sometimes generated a collision
for large keyword sets (could generally be worked around
with -r) and had a typo in the usage message. In addition,
this runs a bit faster.
As with the earlier patches, the performance of the case-
independent lookups is an improvement over alternative
methods (try it several ways to convince yourself).
*** doc/gperf.1.orig Fri Feb 15 00:33:37 2002
--- doc/gperf.1 Fri Feb 15 00:37:30 2002
***************
*** 54,62 ****
\fB\-7\fR, \fB\-\-seven\-bit\fR
Assume 7-bit characters.
.TP
\fB\-c\fR, \fB\-\-compare\-strncmp\fR
Generate comparison code using strncmp rather than
! strcmp.
.TP
\fB\-C\fR, \fB\-\-readonly\-tables\fR
Make the contents of generated lookup tables
--- 54,65 ----
\fB\-7\fR, \fB\-\-seven\-bit\fR
Assume 7-bit characters.
.TP
+ \fB\-A\fR, \fB\-\-ignore\-case\fR
+ Match keys independent of case.
+ .TP
\fB\-c\fR, \fB\-\-compare\-strncmp\fR
Generate comparison code using strncmp rather than
! strcmp (strncasecmp if -A is specified).
.TP
\fB\-C\fR, \fB\-\-readonly\-tables\fR
Make the contents of generated lookup tables
*** doc/gperf.texi.orig Fri Feb 15 00:33:59 2002
--- doc/gperf.texi Fri Feb 15 00:40:18 2002
***************
*** 671,680 ****
default in versions of @code{gperf} earlier than 2.7; now the default is
to assume 8-bit characters.
@item -c
@itemx --compare-strncmp
Generates C code that uses the @code{strncmp} function to perform
! string comparisons. The default action is to use @code{strcmp}.
@item -C
@itemx --readonly-tables
--- 671,685 ----
default in versions of @code{gperf} earlier than 2.7; now the default is
to assume 8-bit characters.
+ @item -A
+ @itemx --ignore-case
+ Matches keywords in a case-independent manner.
+
@item -c
@itemx --compare-strncmp
Generates C code that uses the @code{strncmp} function to perform
! string comparisons (@code{strncasecmp} if -A is specified). The default
! action is to use @code{strcmp} (@code{strcasecmp} if -A is specified).
@item -C
@itemx --readonly-tables
*** src/gen-perf.cc.orig Thu Feb 14 22:19:30 2002
--- src/gen-perf.cc Wed Feb 20 09:00:10 2002
***************
*** 19,24 ****
--- 19,25 ----
along with GNU GPERF; see the file COPYING. If not, write to the Free
Software Foundation, 59 Temple Place - Suite 330, Boston, MA 02111, USA. */
+ #include <ctype.h>
#include <stdio.h>
#include <stdlib.h> /* declares rand(), srand() */
#include <time.h> /* declares time() */
***************
*** 61,67 ****
srand ((long) time (0));
for (int i = 0; i < ALPHA_SIZE; i++)
! asso_values[i] = (rand () & asso_value_max - 1);
}
else
{
--- 62,71 ----
srand ((long) time (0));
for (int i = 0; i < ALPHA_SIZE; i++)
! if ((option[NOCASE] != 0) && (islower(i) != 0))
! asso_values[i] = asso_values[toupper(i)];
! else
! asso_values[i] = (rand () & asso_value_max - 1);
}
else
{
***************
*** 187,192 ****
--- 191,201 ----
asso_values[(unsigned char)c] =
(asso_values[(unsigned char)c] + (option.get_jump () ?
option.get_jump () : rand ()))
& (option.get_asso_max () - 1);
+ if (option[NOCASE])
+ if (isupper((unsigned char)c))
+ asso_values[tolower(c)] = asso_values[(unsigned char)c];
+ else if (islower((unsigned char)c))
+ asso_values[toupper(c)] = asso_values[(unsigned char)c];
/* Iteration Number array is a win, O(1) intialization time! */
reset ();
***************
*** 208,213 ****
--- 217,227 ----
/* Restore original values, no more tries. */
asso_values[(unsigned char)c] = original_char;
+ if (option[NOCASE])
+ if (isupper((unsigned char)c))
+ asso_values[tolower(c)] = original_char;
+ else if (islower((unsigned char)c))
+ asso_values[toupper(c)] = original_char;
/* If we're this far it's time to try the next character.... */
return 1;
}
*** src/key-list.cc.orig Thu Feb 14 22:10:58 2002
--- src/key-list.cc Fri Feb 15 01:32:32 2002
***************
*** 972,977 ****
--- 972,1023 ----
printf ("[len] == '\\0'");
}
+ /* This class outputs a comparison using strcasecmp. */
+
+ struct Output_Compare_Strcasecmp : public Output_Compare
+ {
+ virtual void output_comparison (const Output_Expr& expr1,
+ const Output_Expr& expr2) const;
+ Output_Compare_Strcasecmp () {}
+ virtual ~Output_Compare_Strcasecmp () {}
+ };
+
+ void Output_Compare_Strcasecmp::output_comparison (const Output_Expr& expr1,
+ const Output_Expr& expr2) const
+ {
+ T (Trace t ("Output_Compare_Strcasecmp::output_comparison");)
+ printf ("!strcasecmp (");
+ expr1.output_expr ();
+ printf (", ");
+ expr2.output_expr ();
+ printf (")");
+ }
+
+ /* This class outputs a comparison using strncasecmp.
+ Note that the length of expr1 will be available through the local variable
+ `len'. */
+
+ struct Output_Compare_Strncasecmp : public Output_Compare
+ {
+ virtual void output_comparison (const Output_Expr& expr1,
+ const Output_Expr& expr2) const;
+ Output_Compare_Strncasecmp () {}
+ virtual ~Output_Compare_Strncasecmp () {}
+ };
+
+ void Output_Compare_Strncasecmp::output_comparison (const Output_Expr& expr1,
+ const Output_Expr& expr2)
const
+ {
+ T (Trace t ("Output_Compare_Strncasecmp::output_comparison");)
+ printf ("!strncasecmp (");
+ expr1.output_expr ();
+ printf (", ");
+ expr2.output_expr ();
+ printf (", len) && ");
+ expr2.output_expr ();
+ printf ("[len] == '\\0'");
+ }
+
/* This class outputs a comparison using memcmp.
Note that the length of expr1 (available through the local variable `len')
must be verified to be equal to the length of expr2 prior to this
***************
*** 1009,1015 ****
Key_List::output_hash_function (void)
{
T (Trace t ("Key_List::output_hash_function");)
! const int max_column = 10;
int field_width;
/* Calculate maximum number of digits required for MAX_HASH_VALUE. */
--- 1055,1061 ----
Key_List::output_hash_function (void)
{
T (Trace t ("Key_List::output_hash_function");)
! const int max_column = 8;
int field_width;
/* Calculate maximum number of digits required for MAX_HASH_VALUE. */
***************
*** 1063,1070 ****
{
if (count > 0)
printf (",");
! if (!(count % max_column))
printf ("\n ");
printf ("%*d", field_width,
occurrences[count] ? asso_values[count] : max_hash_value + 1);
}
--- 1109,1119 ----
{
if (count > 0)
printf (",");
! if (!(count % max_column)) {
! if (count && isalnum(count-1) && isalnum(count-max_column))
! printf ("\t/* %c - %c */", count-max_column, count-1);
printf ("\n ");
+ }
printf ("%*d", field_width,
occurrences[count] ? asso_values[count] : max_hash_value + 1);
}
***************
*** 2014,2021 ****
output_lookup_function_body (Output_Compare_Memcmp ());
else
{
! if (option[COMP])
! output_lookup_function_body (Output_Compare_Strncmp ());
else
output_lookup_function_body (Output_Compare_Strcmp ());
}
--- 2063,2075 ----
output_lookup_function_body (Output_Compare_Memcmp ());
else
{
! if (option[COMP]) {
! if (option[NOCASE])
! output_lookup_function_body (Output_Compare_Strncasecmp ());
! else
! output_lookup_function_body (Output_Compare_Strncmp ());
! } else if (option[NOCASE])
! output_lookup_function_body (Output_Compare_Strcasecmp ());
else
output_lookup_function_body (Output_Compare_Strcmp ());
}
*** src/list-node.cc.orig Thu Feb 14 23:09:40 2002
--- src/list-node.cc Fri Feb 15 00:27:06 2002
***************
*** 20,25 ****
--- 20,26 ----
#include "list-node.h"
+ #include <ctype.h>
#include <stdio.h>
#include <stdlib.h> /* declares exit() */
#include "options.h"
***************
*** 66,73 ****
int i;
if (option[ALLCHARS]) /* Use all the character positions in the
KEY. */
! for (i = len; i > 0; k++, ptr++, i--)
++occurrences[(unsigned char)(*ptr = *k)];
else /* Only use those character positions
specified by the user. */
{
/* Iterate through the list of key_positions, initializing occurrences
table
--- 67,80 ----
int i;
if (option[ALLCHARS]) /* Use all the character positions in the
KEY. */
! for (i = len; i > 0; k++, ptr++, i--) {
++occurrences[(unsigned char)(*ptr = *k)];
+ if (option[NOCASE])
+ if (isupper(*k))
+ ++occurrences[tolower(*k)];
+ else if (islower(*k))
+ ++occurrences[toupper(*k)];
+ }
else /* Only use those character positions
specified by the user. */
{
/* Iterate through the list of key_positions, initializing occurrences
table
***************
*** 81,87 ****
*ptr = key[i - 1];
else /* Out of range of KEY length, so we'll just
skip it. */
continue;
! ++occurrences[(unsigned char)(*ptr++)];
}
/* Didn't get any hits and user doesn't want to consider the
--- 88,100 ----
*ptr = key[i - 1];
else /* Out of range of KEY length, so we'll just
skip it. */
continue;
! ++occurrences[(unsigned char)(*ptr)];
! if (option[NOCASE])
! if (isupper(*ptr))
! ++occurrences[tolower(*ptr)];
! else if (islower(*ptr))
! ++occurrences[toupper(*ptr)];
! ++ptr;
}
/* Didn't get any hits and user doesn't want to consider the
*** src/options.cc.orig Thu Feb 14 21:02:42 2002
--- src/options.cc Thu Feb 14 21:19:56 2002
***************
*** 83,89 ****
Options::short_usage (FILE * strm)
{
T (Trace t ("Options::short_usage");)
! fprintf (strm, "Usage: %s
[-cCdDef[num]F<initializers>GhH<hashname>i<init>Ijk<keys>K<keyname>lL<language>nN<function
name>ors<size>S<switches>tTvW<wordlistname>Z<class name>7] [input-file]\n"
"Try `%s --help' for more information.\n",
program_name, program_name);
}
--- 83,89 ----
Options::short_usage (FILE * strm)
{
T (Trace t ("Options::short_usage");)
! fprintf (strm, "Usage: %s
[-aAcCdDef[num]F<initializers>GhH<hashname>i<init>Ijk<keys>K<keyname>lL<language>nN<function
name>ors<size>S<switches>tTvW<wordlistname>Z<class name>7] [input-file]\n"
"Try `%s --help' for more information.\n",
program_name, program_name);
}
***************
*** 133,139 ****
" `Perfect_Hash'.\n"
" -7, --seven-bit Assume 7-bit characters.\n"
" -c, --compare-strncmp Generate comparison code using strncmp
rather than\n"
! " strcmp.\n"
" -C, --readonly-tables Make the contents of generated lookup
tables\n"
" constant, i.e., readonly.\n"
" -E, --enum Define constant values using an enum
local to the\n"
--- 133,140 ----
" `Perfect_Hash'.\n"
" -7, --seven-bit Assume 7-bit characters.\n"
" -c, --compare-strncmp Generate comparison code using strncmp
rather than\n"
! " strcmp, or strncasecmp rather than
strcasecmp if -A\n"
! " is specified.\n"
" -C, --readonly-tables Make the contents of generated lookup
tables\n"
" constant, i.e., readonly.\n"
" -E, --enum Define constant values using an enum
local to the\n"
***************
*** 160,165 ****
--- 161,170 ----
" Prevents the transfer of the type
declaration to the\n"
" output file. Use this option if the type
is already\n"
" defined elsewhere.\n"
+ " -A, --ignore-case Match keywords in a case-independent
manner. String\n"
+ " comparisons are made with strcasecmp
(strncasecmp if\n"
+ " -c is specified) and the associate
values are symmetric\n"
+ " with respect to lower-case and
upper-case ASCII indices.\n"
"\n"
"Algorithm employed by gperf:\n"
" -k, --key-positions=KEYS\n"
***************
*** 355,360 ****
--- 360,366 ----
"\nENUM is........: %s"
"\nINCLUDE is.....: %s"
"\nSEVENBIT is....: %s"
+ "\nNOCASE is......: %s"
"\niterations = %d"
"\nlookup function name = %s"
"\nhash function name = %s"
***************
*** 387,392 ****
--- 393,399 ----
option_word & ENUM ? "enabled" : "disabled",
option_word & INCLUDE ? "enabled" : "disabled",
option_word & SEVENBIT ? "enabled" : "disabled",
+ option_word & NOCASE ? "enabled" : "disabled",
iterations,
function_name, hash_name, wordlist_name, key_name,
initializer_suffix, jump, size - 1, initial_asso_value,
***************
*** 421,426 ****
--- 428,434 ----
{ "lookup-fn-name", required_argument, 0, 'N' },
{ "class-name", required_argument, 0, 'Z' },
{ "seven-bit", no_argument, 0, '7' },
+ { "ignore-case", no_argument, 0, 'A' },
{ "compare-strncmp", no_argument, 0, 'c' },
{ "readonly-tables", no_argument, 0, 'C' },
{ "enum", no_argument, 0, 'E' },
***************
*** 457,463 ****
while ((option_char =
getopt_long (argument_count, argument_vector,
! "adcCDe:Ef:F:gGhH:i:Ij:k:K:lL:nN:oprs:S:tTvW:Z:7",
long_options, (int *)0))
!= -1)
{
--- 465,471 ----
while ((option_char =
getopt_long (argument_count, argument_vector,
! "aAdcCDe:Ef:F:gGhH:i:Ij:k:K:lL:nN:oprs:S:tTvW:Z:7",
long_options, (int *)0))
!= -1)
{
***************
*** 465,470 ****
--- 473,483 ----
{
case 'a': /* Generated code uses the ANSI prototype
format. */
break; /* This is now the default. */
+ case 'A': /* Generate strcasecmp rather than strcmp,
make associate values case-symmetric. */
+ {
+ option_word |= NOCASE;
+ break;
+ }
case 'c': /* Generate strncmp rather than strcmp. */
{
option_word |= COMP;
*** src/options.h.orig Thu Feb 14 21:09:03 2002
--- src/options.h Thu Feb 14 21:11:08 2002
***************
*** 59,65 ****
CPLUSPLUS = 01000000, /* Generate C++ code: prototypes, const,
class, inline, enum. */
ENUM = 02000000, /* Use enum for constants. */
INCLUDE = 04000000, /* Generate #include statements. */
! SEVENBIT = 010000000 /* Assume 7-bit, not 8-bit, characters. */
};
/* Define some useful constants (these don't really belong here, but I'm
--- 59,66 ----
CPLUSPLUS = 01000000, /* Generate C++ code: prototypes, const,
class, inline, enum. */
ENUM = 02000000, /* Use enum for constants. */
INCLUDE = 04000000, /* Generate #include statements. */
! SEVENBIT = 010000000, /* Assume 7-bit, not 8-bit, characters. */
! NOCASE = 020000000 /* Case-independent comparisons */
};
/* Define some useful constants (these don't really belong here, but I'm