eliot-dev
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Eliot-dev] eliot dic/Makefile.am dic/compdic.cpp dic/dic.h... [cppdic]


From: eliot-dev
Subject: [Eliot-dev] eliot dic/Makefile.am dic/compdic.cpp dic/dic.h... [cppdic]
Date: Tue, 20 Nov 2007 12:54:04 +0000

CVSROOT:        /cvsroot/eliot
Module name:    eliot
Branch:         cppdic
Changes by:     Olivier Teulière <ipkiss>      07/11/20 12:54:04

Modified files:
        dic            : Makefile.am compdic.cpp dic.h dic_internals.h 
                         encoding.cpp hashtable.cpp hashtable.h 
                         header.cpp header.h 
        test           : regression.pl 
        utils          : eliottxt.cpp 
Added files:
        dic            : hashtable.i 

Log message:
         - The dictionary header now contains the list of letters of the 
dictionary, encoded in UTF-8.
         - HashTable is now a template class, and is safer (type safe + bug fix 
concerning the size of unsigned ints).
         - The compdic binary handles a list of words in UTF-8. The list of 
letters is still hardcoded though.
         - Several other simplifications in compdic.

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/Makefile.am?cvsroot=eliot&only_with_tag=cppdic&r1=1.14.4.3&r2=1.14.4.4
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/compdic.cpp?cvsroot=eliot&only_with_tag=cppdic&r1=1.1.2.6&r2=1.1.2.7
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/dic.h?cvsroot=eliot&only_with_tag=cppdic&r1=1.13.2.2&r2=1.13.2.3
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/dic_internals.h?cvsroot=eliot&only_with_tag=cppdic&r1=1.7.2.3&r2=1.7.2.4
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/encoding.cpp?cvsroot=eliot&only_with_tag=cppdic&r1=1.1.2.2&r2=1.1.2.3
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/hashtable.cpp?cvsroot=eliot&only_with_tag=cppdic&r1=1.1.2.1&r2=1.1.2.2
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/hashtable.h?cvsroot=eliot&only_with_tag=cppdic&r1=1.6.2.1&r2=1.6.2.2
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/header.cpp?cvsroot=eliot&only_with_tag=cppdic&r1=1.1.2.5&r2=1.1.2.6
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/header.h?cvsroot=eliot&only_with_tag=cppdic&r1=1.1.2.3&r2=1.1.2.4
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/hashtable.i?cvsroot=eliot&only_with_tag=cppdic&rev=1.1.2.1
http://cvs.savannah.gnu.org/viewcvs/eliot/test/regression.pl?cvsroot=eliot&only_with_tag=cppdic&r1=1.1&r2=1.1.6.1
http://cvs.savannah.gnu.org/viewcvs/eliot/utils/eliottxt.cpp?cvsroot=eliot&only_with_tag=cppdic&r1=1.16.2.4&r2=1.16.2.5

Patches:
Index: dic/Makefile.am
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Makefile.am,v
retrieving revision 1.14.4.3
retrieving revision 1.14.4.4
diff -u -b -r1.14.4.3 -r1.14.4.4
--- dic/Makefile.am     11 Nov 2007 19:56:58 -0000      1.14.4.3
+++ dic/Makefile.am     20 Nov 2007 12:54:03 -0000      1.14.4.4
@@ -65,6 +65,7 @@
 compdic_SOURCES= \
        dic_internals.h \
        hashtable.cpp hashtble.h \
+       encoding.cpp encoding.h \
        header.cpp header.h \
        compdic.cpp
 

Index: dic/compdic.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Attic/compdic.cpp,v
retrieving revision 1.1.2.6
retrieving revision 1.1.2.7
diff -u -b -r1.1.2.6 -r1.1.2.7
--- dic/compdic.cpp     18 Nov 2007 14:30:43 -0000      1.1.2.6
+++ dic/compdic.cpp     20 Nov 2007 12:54:03 -0000      1.1.2.7
@@ -25,17 +25,20 @@
  */
 
 #include <fstream>
+#include <vector>
+#include <map>
+#include <memory>
+#include <iconv.h>
 #include <time.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
+#include <wctype.h>
 #include <stdlib.h>
 #include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-#include <assert.h>
 
 #include "hashtable.h"
+#include "encoding.h"
 #include "header.h"
 #include "dic_internals.h"
 
@@ -45,34 +48,41 @@
 #define CHECK_RECURSION
 
 
-char* load_uncompressed(const string &iFileName, unsigned int *dic_size)
+wchar_t* load_uncompressed(const string &iFileName, unsigned int iDicSize)
 {
-    char *uncompressed;
-    FILE* file_desc;
+    // TODO: we should throw exceptions everywhere instead of returning NULL
 
-    if ((file_desc = fopen(iFileName.c_str(), "r")) == NULL)
+    ifstream file(iFileName.c_str());
+    if (!file.is_open())
         return NULL;
 
-    if ((uncompressed = (char*)malloc(sizeof(char)*(*dic_size))) == NULL)
-    {
-        fclose(file_desc);
+    // Place the buffer in an auto_ptr to avoid worrying about memory handling
+    std::auto_ptr<char> buffer(new char[iDicSize]);
+    // Load the file data, everything in one shot
+    file.read(buffer.get(), iDicSize);
+    file.close();
+
+    // The words are supposed to be in utf-8, so convert everything to
+    // wide characters
+    iconv_t handle = iconv_open("WCHAR_T", "UTF-8");
+    if (handle == (iconv_t)(-1))
         return NULL;
-    }
 
-    unsigned r = fread (uncompressed, 1, *dic_size, file_desc);
-    if (r < *dic_size)
-    {
-        /* \n is 2 chars under MS OS */
-        printf("\n");
-        printf("** The number of bytes read is less than the size of the file 
**\n");
-        printf("** this may be OK if you run a Microsoft OS but not on Unix   
**\n");
-        printf("** please check the results.                                  
**\n");
-        printf("\n");
-        *dic_size = r;
-    }
+    // Buffer for the wide characters (it will use at most as many characters
+    // as the utf-8 version)
+    std::auto_ptr<wchar_t> wide_buf(new wchar_t[iDicSize]);
+
+    size_t inChars = iDicSize;
+    size_t outChars = sizeof(wchar_t) * iDicSize;
+    char *in = buffer.get();
+    char *out = (char*)wide_buf.get();
+    size_t res = iconv(handle, &in, &inChars, &out, &outChars);
+    iconv_close(handle);
+    // Problem during encoding conversion?
+    if (res == (size_t)(-1))
+        return NULL;
 
-    fclose(file_desc);
-    return uncompressed;
+    return wide_buf.release();
 }
 
 
@@ -115,7 +125,7 @@
 }
 
 
-void write_node(const Dawg_edge *edges, int size, int num, ostream &outfile)
+void write_node(const Dawg_edge *edges, int num, ostream &outfile)
 {
 #ifdef DEBUG_OUTPUT
     printf("writing %d edges\n", num);
@@ -129,7 +139,7 @@
         outfile.write((char*)(edges + i), sizeof(Dawg_edge));
     }
 #else
-    outfile.write((char*)edges, num * size);
+    outfile.write((char*)edges, num * sizeof(Dawg_edge));
 #endif
 }
 
@@ -140,15 +150,59 @@
 /* ods3: ??   */
 /* ods4: 1746 */
 
+// Hashing function for a vector of Dawg_edge, based on the hashing function
+// of the HashTable
+struct HashVector
+{
+    unsigned int operator()(const vector<Dawg_edge> &iKey) const
+    {
+        if (iKey.empty())
+            return 0;
+        return HashPtr(&iKey.front(), iKey.size() * sizeof(Dawg_edge));
+    }
+};
+
+#ifdef CHECK_RECURSION
+class IncDec
+{
+    public:
+        IncDec(int &ioCounter)
+            : m_counter(ioCounter)
+        {
+            m_counter++;
+        }
+
+        ~IncDec()
+        {
+            m_counter--;
+        }
+    private:
+        int &m_counter;
+};
+
+int current_rec = 0;
+int max_rec = 0;
+#endif
+
 /* global variables */
 ofstream    global_outfile;
 Dict_header_info global_header_info;
-Hash_table  global_hashtable;
+HashTable<vector<Dawg_edge>, unsigned int, HashVector> *global_hashtable;
+
+wchar_t  global_stringbuf[MAX_STRING_LENGTH]; /* Space for current string */
+wchar_t* global_endstring;                    /* Marks END of current string */
+wchar_t* global_input;
+wchar_t* global_endofinput;
+#ifdef CHECK_RECURSION
+map<int, vector<Dawg_edge> > global_mapfordepth;
+#endif
+map<wchar_t, unsigned int> global_chartocode;
 
-char        global_stringbuf[MAX_STRING_LENGTH]; /* Space for current string */
-char*       global_endstring;                    /* Marks END of current 
string */
-char*       global_input;
-char*       global_endofinput;
+unsigned int charToCode(wchar_t iChar)
+{
+    // TODO: throw an exception if the result is 0
+    return global_chartocode[iChar];
+}
 
 /*
  * Makenode takes a prefix (as position relative to stringbuf) and
@@ -157,38 +211,47 @@
  * to stringbuf) indicating how much of prefix is matched in the
  * input.
  */
-#ifdef CHECK_RECURSION
-int current_rec = 0;
-int max_rec = 0;
-#endif
-
-unsigned int makenode(char *prefix)
+unsigned int makenode(wchar_t *prefix)
 {
 #ifdef CHECK_RECURSION
-    current_rec++;
+    IncDec inc(current_rec);
     if (current_rec > max_rec)
         max_rec = current_rec;
 #endif
 
-    Dawg_edge  edges[MAX_EDGES];
-    Dawg_edge *edgeptr = edges;
+#ifdef CHECK_RECURSION
+    // Instead of creating a vector, try to reuse an existing one
+    vector<Dawg_edge> &edges = global_mapfordepth[current_rec];
+    edges.reserve(MAX_EDGES);
+    edges.clear();
+#else
+    vector<Dawg_edge> edges;
+    // Optimize allocation
+    edges.reserve(MAX_EDGES);
+#endif
+    Dawg_edge newEdge;
 
     while (prefix == global_endstring)
     {
-        /* More edges out of node */
-        edgeptr->ptr  = 0;
-        edgeptr->term = 0;
-        edgeptr->last = 0;
-        edgeptr->fill = 0;
-        edgeptr->chr  = 0;
-
-        (*(edgeptr++)).chr = (*global_endstring++ = *global_input++) & 
DIC_CHAR_MASK;
-        if (*global_input == '\n')                 /* End of a word */
+        // More edges out of node
+        newEdge.ptr  = 0;
+        newEdge.term = 0;
+        newEdge.last = 0;
+        newEdge.fill = 0;
+        newEdge.chr = charToCode(*global_endstring++ = *global_input++);
+        edges.push_back(newEdge);
+
+        // End of a word?
+        // TODO: handle \r as well
+        if (*global_input == L'\n')
         {
             global_header_info.nwords++;
-            edgeptr[-1].term = 1;                  /* Mark edge as word */
-            *global_endstring++ = *global_input++; /* Skip \n */
-            if (global_input == global_endofinput) /* At end of input? */
+            // Mark edge as word
+            edges.back().term = 1;
+            // Skip \n
+            *global_endstring++ = *global_input++;
+            // At the end of input?
+            if (global_input == global_endofinput)
                 break;
 
             global_endstring = global_stringbuf;
@@ -198,47 +261,36 @@
                 global_input++;
             }
         }
-        /* make dawg pointed to by this edge */
-        edgeptr[-1].ptr = makenode(prefix + 1);
+        // make dawg pointed to by this edge
+        edges.back().ptr = makenode(prefix + 1);
     }
 
-    int numedges = edgeptr - edges;
+    int numedges = edges.size();
     if (numedges == 0)
     {
-#ifdef CHECK_RECURSION
-        current_rec --;
-#endif
-        return 0;             /* Special node zero - no edges */
+        // Special node zero - no edges
+        return 0;
     }
 
-    edgeptr[-1].last = 1;     /* Mark the last edge */
+    // Mark the last edge
+    edges.back().last = 1;
 
-    unsigned *saved_position = (unsigned int*) hash_find(global_hashtable,
-                                               (void*)edges,
-                                               numedges*sizeof(Dawg_edge));
+    const unsigned int *saved_position = global_hashtable->find(edges);
     if (saved_position)
     {
         global_header_info.edgessaved += numedges;
         global_header_info.nodessaved++;
 
-#ifdef CHECK_RECURSION
-        current_rec --;
-#endif
         return *saved_position;
     }
     else
     {
         unsigned int node_pos = global_header_info.edgesused;
-        hash_add(global_hashtable,
-                 (void*)edges, numedges*sizeof(Dawg_edge),
-                 (void*)(&global_header_info.edgesused), 
sizeof(global_header_info.edgesused));
+        global_hashtable->add(edges, global_header_info.edgesused);
         global_header_info.edgesused += numedges;
         global_header_info.nodesused++;
-        write_node(edges, sizeof(Dawg_edge), numedges, global_outfile);
+        write_node(&edges.front(), numedges, global_outfile);
 
-#ifdef CHECK_RECURSION
-        current_rec --;
-#endif
         return node_pos;
     }
 }
@@ -246,8 +298,6 @@
 
 int main(int argc, char* argv[])
 {
-    char *uncompressed;
-
     if (argc < 2)
     {
         fprintf(stderr, "usage: %s uncompressed_dic [compressed_dic]\n", 
argv[0]);
@@ -271,44 +321,58 @@
     }
 
     unsigned int dicsize = size;
-    if ((uncompressed = load_uncompressed(argv[1], &dicsize)) == NULL)
+    wchar_t *uncompressed;
+    clock_t startLoadTime = clock();
+    if ((uncompressed = load_uncompressed(argv[1], dicsize)) == NULL)
     {
         fprintf(stderr, "Cannot load uncompressed dictionary into memory\n");
         exit(1);
     }
+    clock_t endLoadTime = clock();
 
     global_input = uncompressed;
     global_endofinput = global_input + dicsize;
 
 #define SCALE 0.6
-    global_hashtable = hash_init((unsigned int)(dicsize * SCALE));
+    global_hashtable = new HashTable<vector<Dawg_edge>, unsigned int, 
HashVector>((unsigned int)(dicsize * SCALE));
 #undef SCALE
 
     skip_init_header(global_outfile, &global_header_info);
 
+    // TODO: remove hard-coding
+    global_header_info.letters = convertToWc("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
+    // Init the map char --> code
+    for (string::size_type i = 0; i < global_header_info.letters.size(); ++i)
+    {
+        global_chartocode[global_header_info.letters[i]] = i + 1;
+        global_chartocode[towlower(global_header_info.letters[i])] = i + 1;
+    }
+
     Dawg_edge specialnode = {0, 0, 0, 0, 0};
     specialnode.last = 1;
-    write_node(&specialnode, sizeof(specialnode), 1, global_outfile);
+    write_node(&specialnode, 1, global_outfile);
+
     /*
      * Call makenode with null (relative to stringbuf) prefix;
      * Initialize string to null; Put index of start node on output
      */
-    clock_t starttime = clock();
+    clock_t startBuildTime = clock();
     Dawg_edge rootnode = {0, 0, 0, 0, 0};
     rootnode.ptr = makenode(global_endstring = global_stringbuf);
-    clock_t endtime = clock();
-    write_node(&rootnode, sizeof(rootnode), 1, global_outfile);
+    clock_t endBuildTime = clock();
+    write_node(&rootnode, 1, global_outfile);
 
     fix_header(global_outfile, &global_header_info);
 
     Header aHeader(global_header_info);
     aHeader.print();
 
-    hash_destroy(global_hashtable);
-    free(uncompressed);
+    delete global_hashtable;
+    delete[] uncompressed;
     global_outfile.close();
 
-    printf(" Elapsed time is                : %f s\n", 1.0 * (endtime - 
starttime) / CLOCKS_PER_SEC);
+    printf(" Load time: %f s\n", 1.0 * (endLoadTime - startLoadTime) / 
CLOCKS_PER_SEC);
+    printf(" Compression time: %f s\n", 1.0 * (endBuildTime - startBuildTime) 
/ CLOCKS_PER_SEC);
 #ifdef CHECK_RECURSION
     printf(" Maximum recursion level reached: %d\n", max_rec);
 #endif

Index: dic/dic.h
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/dic.h,v
retrieving revision 1.13.2.2
retrieving revision 1.13.2.3
diff -u -b -r1.13.2.2 -r1.13.2.3
--- dic/dic.h   24 Dec 2006 20:49:42 -0000      1.13.2.2
+++ dic/dic.h   20 Nov 2007 12:54:03 -0000      1.13.2.3
@@ -37,7 +37,8 @@
  */
 #define DIC_WORD_MAX 16
 
-typedef struct _Dawg_edge Dawg_edge;
+//typedef struct _Dawg_edge Dawg_edge;
+class Dawg_edge;
 typedef unsigned int dic_elt_t;
 typedef unsigned char dic_code_t;
 

Index: dic/dic_internals.h
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/dic_internals.h,v
retrieving revision 1.7.2.3
retrieving revision 1.7.2.4
diff -u -b -r1.7.2.3 -r1.7.2.4
--- dic/dic_internals.h 11 Nov 2007 19:34:32 -0000      1.7.2.3
+++ dic/dic_internals.h 20 Nov 2007 12:54:03 -0000      1.7.2.4
@@ -51,27 +51,32 @@
  */
 
 #if defined(WORDS_BIGENDIAN)
-struct __attribute__ ((packed)) _Dawg_edge {
+struct __attribute__ ((packed)) Dawg_edge
+{
   uint32_t
     chr  :  5,
-    fill :  1,
-    last :  1,
-    term :  1,
+    fill:  1,
+    last:  1,
+    term:  1,
     ptr  : 24;
 };
 #else
-struct __attribute__ ((packed)) _Dawg_edge {
+struct __attribute__ ((packed)) Dawg_edge
+{
+    public:
   uint32_t
     ptr  : 24,
-    term :  1,
-    last :  1,
-    fill :  1,
+        term:  1,
+        last:  1,
+        fill:  1,
     chr  :  5;
+      bool operator==(const Dawg_edge &iOther) const
+      {
+          return memcmp(this, &iOther, sizeof(*this)) == 0;
+      }
 };
 #endif
 
-typedef struct _Dawg_edge Dawg_edge;
-
 
 #endif /* _DIC_INTERNALS_H */
 

Index: dic/encoding.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Attic/encoding.cpp,v
retrieving revision 1.1.2.2
retrieving revision 1.1.2.3
diff -u -b -r1.1.2.2 -r1.1.2.3
--- dic/encoding.cpp    5 Nov 2006 17:27:03 -0000       1.1.2.2
+++ dic/encoding.cpp    20 Nov 2007 12:54:03 -0000      1.1.2.3
@@ -19,7 +19,7 @@
 
 /**
  *  \file   encoding.cpp
- *  \brief  Utility functions to ease manipulation of wide-character strings
+ *  \brief  Utility functions to ease handling of wide-character strings
  *  \author Olivier Teuliere
  *  \date   2005
  */
@@ -33,7 +33,7 @@
 
 int _wtoi(const wchar_t *iWStr)
 {
-    return wcstol(iWStr,NULL,10);
+    return wcstol(iWStr, NULL, 10);
 }
 
 
@@ -53,6 +53,7 @@
 }
 
 
+#define _MAX_SIZE_FOR_STACK_ 30
 wstring convertToWc(const string& iStr)
 {
     // Get the needed length (we _can't_ use string::size())
@@ -60,14 +61,22 @@
     if (len == (size_t)-1)
         return L"";
 
-//     wchar_t *tmp = new wchar_t[len + 1];
-    wchar_t tmp[100];
+    // Change the allocation method depending on the length of the string
+    if (len < _MAX_SIZE_FOR_STACK_)
+    {
+        // Without multi-thread, we can use static storage
+        static wchar_t tmp[_MAX_SIZE_FOR_STACK_];
     len = mbstowcs(tmp, iStr.c_str(), len + 1);
-//     wstring res = tmp;
-//     delete[] tmp;
-
     return tmp;
-//     return res;
+    }
+    else
+    {
+        wchar_t *tmp = new wchar_t[len + 1];
+        len = mbstowcs(tmp, iStr.c_str(), len + 1);
+        wstring res = tmp;
+        delete[] tmp;
+        return res;
+    }
 }
 
 
@@ -78,15 +87,24 @@
     if (len == (size_t)-1)
         return "";
 
-//     char *tmp = new char[len + 1];
-    char tmp[100];
+    // Change the allocation method depending on the length of the string
+    if (len < _MAX_SIZE_FOR_STACK_)
+    {
+        // Without multi-thread, we can use static storage
+        static char tmp[_MAX_SIZE_FOR_STACK_];
     len = wcstombs(tmp, iWStr.c_str(), len + 1);
-//     string res = tmp;
-//     delete[] tmp;
-
     return tmp;
-//     return res;
+    }
+    else
+    {
+        char *tmp = new char[len + 1];
+        len = wcstombs(tmp, iWStr.c_str(), len + 1);
+        string res = tmp;
+        delete[] tmp;
+        return res;
+    }
 }
+#undef _MAX_SIZE_FOR_STACK_
 
 
 string convertToMb(wchar_t iWChar)

Index: dic/hashtable.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Attic/hashtable.cpp,v
retrieving revision 1.1.2.1
retrieving revision 1.1.2.2
diff -u -b -r1.1.2.1 -r1.1.2.2
--- dic/hashtable.cpp   15 Oct 2006 11:07:55 -0000      1.1.2.1
+++ dic/hashtable.cpp   20 Nov 2007 12:54:03 -0000      1.1.2.2
@@ -24,140 +24,27 @@
  *  \date   1999
  */
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
 #include "hashtable.h"
 
-typedef struct _Hash_node {
-  struct _Hash_node *next;
-  void* key;
-  unsigned int keysize;
-  void* value;
-  unsigned int valuesize;
-} Hash_node;
 
-struct _Hash_table {
-  unsigned int size;
-  Hash_node** nodes;
-};
-
-
-Hash_table
-hash_init(unsigned int size)
-{
-  Hash_table ht;
-
-  ht = (Hash_table) calloc(1,sizeof(struct _Hash_table));
-  ht->size = size;
-  ht->nodes = (Hash_node  **) calloc (size, sizeof (Hash_node*));
-  return ht;
-}
-
-void
-hash_rec_free(Hash_node* node)
+unsigned int HashPtr(const void *iPtr, unsigned int iSize)
 {
-  if (node)
-    {
-      if (node->next)
-       hash_rec_free(node->next);
-      if (node->key)
-       free(node->key);
-      if (node->value)
-       free(node->value);
-      free(node);
-    }
-}
-
-int
-hash_destroy(Hash_table hashtable)
-{
-    unsigned int i;
-    if (hashtable)
-    {
-        for(i=0; i<hashtable->size; i++)
-            if (hashtable->nodes[i])
-                hash_rec_free(hashtable->nodes[i]);
-        if (hashtable->nodes)
-            free(hashtable->nodes);
-        free(hashtable);
-    }
-    return 0;
-}
-
-
-static unsigned int
-hash_key(Hash_table hashtable, void* ptr, unsigned int size)
-{
-    unsigned int i;
     unsigned int key = 0;
 
-    if (size % 4 == 0)
+    if (iSize % sizeof(unsigned int) == 0)
     {
-        unsigned int *v = (unsigned int*)ptr;
-        for (i = 0; i < (size / 4); i++)
-            key ^= (key << 3) ^ (key >> 1) ^ v[i];
+        const unsigned int *ptr =
+            reinterpret_cast<const unsigned int*>(iPtr);
+        for (unsigned int i = 0; i < (iSize / sizeof(unsigned int)); ++i)
+            key ^= (key << 3) ^ (key >> 1) ^ ptr[i];
     }
     else
     {
-        unsigned char *v = (unsigned char*)ptr;
-        for (i = 0; i < size; i++)
-            key ^= (key << 3) ^ (key >> 1) ^ v[i];
+        const unsigned char *ptr =
+            reinterpret_cast<const unsigned char*>(iPtr);
+        for (unsigned int i = 0; i < iSize; ++i)
+            key ^= (key << 3) ^ (key >> 1) ^ ptr[i];
     }
-    key %= hashtable->size;
     return key;
 }
 
-
-void*
-hash_find(Hash_table hashtable, void* key, unsigned int keysize)
-{
-    Hash_node *entry;
-    unsigned int h_key;
-
-    h_key = hash_key(hashtable,key,keysize);
-    for (entry = hashtable->nodes[h_key]; entry; entry = entry -> next)
-    {
-        if ((entry -> keysize == keysize) &&
-            (memcmp(entry->key,key,keysize) == 0))
-        {
-            return entry->value;
-        }
-    }
-    return NULL;
-}
-
-
-static Hash_node*
-new_entry(void* key, unsigned int keysize, void* value, unsigned int
-         valuesize)
-{
-  Hash_node *n;
-  n = (Hash_node*)calloc(1,sizeof(Hash_node));
-  n->key = (void*)malloc(keysize);
-  n->value = (void*)malloc(valuesize);
-  n->keysize = keysize;
-  n->valuesize = valuesize;
-  memcpy(n->key,key,keysize);
-  memcpy(n->value,value,valuesize);
-  return n;
-}
-
-
-int
-hash_add(Hash_table hashtable,
-        void* key, unsigned int keysize,
-        void* value, unsigned int valuesize)
-{
-    Hash_node *entry;
-    unsigned int h_key;
-
-    h_key = hash_key(hashtable,key,keysize);
-    entry = new_entry(key,keysize,value,valuesize);
-    entry->next = hashtable->nodes[h_key];
-    hashtable->nodes[h_key] = entry;
-
-    return 0;
-}
-
-

Index: dic/hashtable.h
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/hashtable.h,v
retrieving revision 1.6.2.1
retrieving revision 1.6.2.2
diff -u -b -r1.6.2.1 -r1.6.2.2
--- dic/hashtable.h     15 Oct 2006 11:07:55 -0000      1.6.2.1
+++ dic/hashtable.h     20 Nov 2007 12:54:03 -0000      1.6.2.2
@@ -27,13 +27,54 @@
 #ifndef _HASHTABLE_H
 #define _HASHTABLE_H
 
-typedef struct _Hash_table* Hash_table;
 
-Hash_table hash_init(unsigned int);
-int        hash_destroy(Hash_table);
-int        hash_size(Hash_table);
-void*      hash_find(Hash_table,void* key,unsigned keysize);
-int        hash_add (Hash_table,void* key,unsigned keysize,
-                     void* value,unsigned valuesize);
+/// Compute a hash for the data pointed to by iPtr
+/**
+ * This function is useful to define the HASH_FCN template parameter
+ * of HashTable.
+ */
+unsigned int HashPtr(const void *iPtr, unsigned int iSize);
+
+
+template<typename KEY, typename VALUE, typename HASH_FCN>
+class HashTable
+{
+    public:
+        /// Constructor taking the number of records in the table
+        HashTable(unsigned int iSize);
+
+        /// Destructor
+        ~HashTable();
+
+        /// Return the number of records in the table
+        unsigned int size() const { return m_size; }
+
+        /// Return the value corresponding to the given key, or NULL if not 
found
+        const VALUE *find(const KEY &iKey) const;
+
+        /// Add a new key/value pair (both the key and the value are copied)
+        void add(const KEY& iKey, const VALUE &iValue);
+
+    private:
+        /// Maximum number of records
+        unsigned int m_size;
+
+        /// Definition of a record
+        class Node
+        {
+            public:
+                Node(const KEY &iKey, const VALUE &iValue, const Node *iNext);
+                ~Node();
+                KEY m_key;
+                VALUE m_value;
+                const Node *m_next;
+        };
+
+        /// All the nodes
+        const Node **m_nodes;
+};
+
+// Include the implementation of the template
+#include "hashtable.i"
 
 #endif /* _HASHTABLE_H_ */

Index: dic/header.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Attic/header.cpp,v
retrieving revision 1.1.2.5
retrieving revision 1.1.2.6
diff -u -b -r1.1.2.5 -r1.1.2.6
--- dic/header.cpp      11 Nov 2007 19:56:58 -0000      1.1.2.5
+++ dic/header.cpp      20 Nov 2007 12:54:03 -0000      1.1.2.6
@@ -21,6 +21,7 @@
 
 #include "config.h"
 #include "header.h"
+#include <iconv.h>
 
 
 // FIXME: duplicated in dic.cpp
@@ -73,26 +74,34 @@
 // Do not change these values, as they impact the size of the structure
 #define _USER_HOST_MAX_ 24
 #define _DIC_NAME_MAX_ 16
+#define _MAX_LETTERS_SIZE_ 80
 
 /** Extension of the old format */
 struct Dict_header_ext
 {
-    /// Build information
-    //@{
+    // Build information
     char compileUserHost[_USER_HOST_MAX_];
-    /// Number of seconds since the Epoch
+    // Number of seconds since the Epoch
     uint64_t compileDate;
-    //@}
-    /// Compression algorithm (1 = DAWG, 2 = GADDAG)
+
+    // Compression algorithm (1 = DAWG, 2 = GADDAG)
     char algorithm;
-    /// Endianness
+    // Endianness (XXX: currently unused)
     char bigEndian;
 
-    /// Dictionary official name and version
-    // @{
+    // Dictionary official name and version
     char dicName[_DIC_NAME_MAX_];
     char dicVersion;
     char dicRevision;
+
+    // Letters used in the dictionary
+    // We should have: nbLetters <= lettersSize <= _MAX_LETTERS_SIZE_
+    // Number of letters (XXX: in theory useless, but allows a sanity check)
+    uint32_t nbLetters;
+    // Size taken by the letters
+    uint32_t lettersSize;
+    // The letters themselves, in UTF-8
+    char letters[_MAX_LETTERS_SIZE_];
     //@}
 };
 
@@ -113,6 +122,10 @@
     m_nodesSaved = iInfo.nodessaved;
     m_edgesSaved = iInfo.edgessaved;
     m_type = iInfo.dawg ? kDAWG : kGADDAG;
+    if (iInfo.letters.size() < _MAX_LETTERS_SIZE_)
+    {
+        wcscpy(m_letters, iInfo.letters.c_str());
+    }
 }
 
 
@@ -162,7 +175,29 @@
         m_dicName = aHeaderExt.dicName;
         m_dicVersion = aHeaderExt.dicVersion;
         m_dicRevision = aHeaderExt.dicRevision;
-        // TODO
+
+        // Convert the letters from UTF-8 to wchar_t*
+        iconv_t handle = iconv_open("WCHAR_T", "UTF-8");
+        if (handle == (iconv_t)(-1))
+            return false;
+        size_t inChars = aHeaderExt.lettersSize;
+        size_t outChars = sizeof(wchar_t) * (_MAX_NB_LETTERS_ - 1);
+        char *in = aHeaderExt.letters;
+        char *out = (char*)m_letters;
+        size_t res = iconv(handle, &in, &inChars, &out, &outChars);
+        iconv_close(handle);
+        // Problem during encoding conversion?
+        if (res == (size_t)(-1))
+            return false;
+        // Safety check: correct number of letters?
+        if (outChars + sizeof(wchar_t) * aHeaderExt.nbLetters !=
+            sizeof(wchar_t) * (_MAX_NB_LETTERS_ - 1))
+        {
+            return false;
+        }
+        // Make sure there is a \0 at the end of the letters array
+        wchar_t *wout = (wchar_t*)out;
+        *wout = '\0';
     }
     return true;
 }
@@ -199,7 +234,32 @@
         aHeaderExt.dicName[_DIC_NAME_MAX_ - 1] = '\0';
         aHeaderExt.dicVersion = m_dicVersion;
         aHeaderExt.dicRevision = m_dicRevision;
-        // TODO
+
+        size_t length = wcslen(m_letters);
+        // Sanity check
+        if (length >= _MAX_NB_LETTERS_)
+            return false;
+        aHeaderExt.nbLetters = (uint32_t)length;
+
+        // Convert the dictionary letters to UTF-8
+        iconv_t handle = iconv_open("UTF-8", "WCHAR_T");
+        if (handle == (iconv_t)(-1))
+            return false;
+        size_t inChars = sizeof(wchar_t) * length;
+        size_t outChars = _MAX_LETTERS_SIZE_;
+        char *in = (char*)m_letters;
+        char *out = aHeaderExt.letters;
+        size_t res = iconv(handle, &in, &inChars, &out, &outChars);
+        iconv_close(handle);
+        // Problem during encoding conversion?
+        if (res == (size_t)(-1))
+            return false;
+        aHeaderExt.lettersSize = _MAX_LETTERS_SIZE_ - outChars;
+
+        // Write the extension
+        oStream.write((char*)&aHeaderExt, sizeof(Dict_header_ext));
+        if (!oStream.good())
+            return false;
     }
     return true;
 }
@@ -219,6 +279,8 @@
     printf("\n");
     printf("nodes: %d+%d\n", m_nodesUsed, m_nodesSaved);
     printf("edges: %d+%d\n", m_edgesUsed, m_edgesSaved);
+    printf("number of letters: %d\n", wcslen(m_letters));
+    printf("letters: %ls\n", m_letters);
     printf("============================\n");
 
 #if 0

Index: dic/header.h
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Attic/header.h,v
retrieving revision 1.1.2.3
retrieving revision 1.1.2.4
diff -u -b -r1.1.2.3 -r1.1.2.4
--- dic/header.h        11 Nov 2007 19:34:32 -0000      1.1.2.3
+++ dic/header.h        20 Nov 2007 12:54:04 -0000      1.1.2.4
@@ -1,5 +1,5 @@
 /* Eliot                                                                     */
-/* Copyright (C) 2006  Olivier Teuliere                                      */
+/* Copyright (C) 2006-2007  Olivier Teuliere                                 */
 /*                                                                           */
 /* This file is part of Eliot.                                               */
 /*                                                                           */
@@ -24,6 +24,9 @@
 
 using namespace std;
 
+/// Maximum number of distinct letters allowed for the dictionary
+#define _MAX_NB_LETTERS_ 64
+
 
 /**
  * Structure used to create a Header object.
@@ -39,13 +42,15 @@
     unsigned int nodessaved;
     unsigned int edgessaved;
     bool dawg;
+    wstring letters;
 };
 
 
 /**
  * Dictionary header.
  * There are 2 ways to create a Header object:
- *  - fill a Dict_header_info structure and give it in the constructor of the 
Header class
+ *  - fill a Dict_header_info structure and give it in the constructor of the
+ *    Header class (usually to call write() afterwards)
  *  - use the default constructor, then call read() on a dictionary file
  *
  * Serialization:
@@ -80,6 +85,7 @@
     unsigned int getNbNodesSaved() const { return m_nodesSaved; }
     unsigned int getNbEdgesSaved() const { return m_edgesSaved; }
     DictType     getType()         const { return m_type; }
+    wstring      getLetters()      const { return m_letters; }
     //@}
 
     /**
@@ -118,6 +124,10 @@
     char m_dicVersion;
     /// Dictionary revision (e.g.: 1 for ODS 4.1)
     char m_dicRevision;
+
+    /// The letters themselves. The array is terminated with \0.
+    // XXX: use a wstring instead?
+    wchar_t m_letters[_MAX_NB_LETTERS_];
 };
 
 

Index: test/regression.pl
===================================================================
RCS file: /cvsroot/eliot/eliot/test/regression.pl,v
retrieving revision 1.1
retrieving revision 1.1.6.1
diff -u -b -r1.1 -r1.1.6.1
--- test/regression.pl  16 Apr 2005 15:47:59 -0000      1.1
+++ test/regression.pl  20 Nov 2007 12:54:04 -0000      1.1.6.1
@@ -16,7 +16,7 @@
 # Find the dictionary
 if (not -f $ods)
 {
-    die "Cannot find dictionary: $!";
+    die "Cannot find dictionary $ods: $!";
 }
 
 # Find the text interface

Index: utils/eliottxt.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/utils/eliottxt.cpp,v
retrieving revision 1.16.2.4
retrieving revision 1.16.2.5
diff -u -b -r1.16.2.4 -r1.16.2.5
--- utils/eliottxt.cpp  11 Nov 2007 19:56:59 -0000      1.16.2.4
+++ utils/eliottxt.cpp  20 Nov 2007 12:54:04 -0000      1.16.2.5
@@ -964,8 +964,6 @@
     // Let the user choose the locale
     setlocale(LC_ALL, "");
 
-    Dictionary dic;
-
     if (argc != 2 && argc != 3)
     {
         fprintf(stdout, "Usage: eliot /chemin/vers/ods4.dawg [random_seed]\n");
@@ -976,6 +974,7 @@
         dicPath = argv[1];
     }
 
+    Dictionary dic;
     switch (dic.load(dicPath))
     {
         case 0:

Index: dic/hashtable.i
===================================================================
RCS file: dic/hashtable.i
diff -N dic/hashtable.i
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ dic/hashtable.i     20 Nov 2007 12:54:03 -0000      1.1.2.1
@@ -0,0 +1,94 @@
+/* Eliot                                                                     */
+/* Copyright (C) 1999  Antoine Fraboulet                                     */
+/*                                                                           */
+/* This file is part of Eliot.                                               */
+/*                                                                           */
+/* Eliot is free software; you can redistribute it and/or modify             */
+/* it under the terms of the GNU General Public License as published by      */
+/* the Free Software Foundation; either version 2 of the License, or         */
+/* (at your option) any later version.                                       */
+/*                                                                           */
+/* Eliot is distributed in the hope that it will be useful,                  */
+/* but WITHOUT ANY WARRANTY; without even the implied warranty of            */
+/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the             */
+/* GNU General Public License for more details.                              */
+/*                                                                           */
+/* You should have received a copy of the GNU General Public License         */
+/* along with this program; if not, write to the Free Software               */
+/* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA 
*/
+
+/**
+ *  \file   hashtable.c
+ *  \brief  Simple hashtable type
+ *  \author Antoine Fraboulet
+ *  \date   1999
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "hashtable.h"
+
+
+template<typename KEY, typename VALUE, typename HASH_FCN>
+HashTable<KEY, VALUE, HASH_FCN>::HashTable(unsigned int iSize)
+    : m_size(iSize)
+{
+    m_nodes = new const Node*[m_size];
+    for (unsigned int i = 0; i < m_size; ++i)
+    {
+        m_nodes[i] = NULL;
+    }
+}
+
+
+template<typename KEY, typename VALUE, typename HASH_FCN>
+HashTable<KEY, VALUE, HASH_FCN>::~HashTable()
+{
+    for (unsigned int i = 0; i < m_size; ++i)
+        delete m_nodes[i];
+    delete[] m_nodes;
+}
+
+
+template<typename KEY, typename VALUE, typename HASH_FCN>
+HashTable<KEY, VALUE, HASH_FCN>::Node::Node(const KEY &iKey, const VALUE 
&iValue, const Node *iNext)
+    : m_key(iKey), m_value(iValue), m_next(iNext)
+{
+}
+
+
+template<typename KEY, typename VALUE, typename HASH_FCN>
+HashTable<KEY, VALUE, HASH_FCN>::Node::~Node()
+{
+    delete m_next;
+}
+
+
+template<typename KEY, typename VALUE, typename HASH_FCN>
+const VALUE *HashTable<KEY, VALUE, HASH_FCN>::find(const KEY &iKey) const
+{
+    HASH_FCN aHashFunc;
+    unsigned int h_key = aHashFunc(iKey) % m_size;
+    for (const Node *entry = m_nodes[h_key]; entry; entry = entry->m_next)
+    {
+        // Note: we need to be able to call == on a type KEY
+        if (entry->m_key == iKey)
+        {
+            return &entry->m_value;
+        }
+    }
+    return NULL;
+}
+
+
+template<typename KEY, typename VALUE, typename HASH_FCN>
+void HashTable<KEY, VALUE, HASH_FCN>::add(const KEY &iKey, const VALUE &iValue)
+{
+    HASH_FCN aHashFunc;
+    unsigned int h_key = aHashFunc(iKey) % m_size;
+    const Node *entry = new Node(iKey, iValue, m_nodes[h_key]);
+    m_nodes[h_key] = entry;
+}
+
+// vim: ft=cpp




reply via email to

[Prev in Thread] Current Thread [Next in Thread]