[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Eliot-dev] eliot dic/Makefile.am dic/compdic.cpp dic/dic.h... [cppdic]
From: |
eliot-dev |
Subject: |
[Eliot-dev] eliot dic/Makefile.am dic/compdic.cpp dic/dic.h... [cppdic] |
Date: |
Tue, 20 Nov 2007 12:54:04 +0000 |
CVSROOT: /cvsroot/eliot
Module name: eliot
Branch: cppdic
Changes by: Olivier Teulière <ipkiss> 07/11/20 12:54:04
Modified files:
dic : Makefile.am compdic.cpp dic.h dic_internals.h
encoding.cpp hashtable.cpp hashtable.h
header.cpp header.h
test : regression.pl
utils : eliottxt.cpp
Added files:
dic : hashtable.i
Log message:
- The dictionary header now contains the list of letters of the
dictionary, encoded in UTF-8.
- HashTable is now a template class, and is safer (type safe + bug fix
concerning the size of unsigned ints).
- The compdic binary handles a list of words in UTF-8. The list of
letters is still hardcoded though.
- Several other simplifications in compdic.
CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/Makefile.am?cvsroot=eliot&only_with_tag=cppdic&r1=1.14.4.3&r2=1.14.4.4
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/compdic.cpp?cvsroot=eliot&only_with_tag=cppdic&r1=1.1.2.6&r2=1.1.2.7
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/dic.h?cvsroot=eliot&only_with_tag=cppdic&r1=1.13.2.2&r2=1.13.2.3
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/dic_internals.h?cvsroot=eliot&only_with_tag=cppdic&r1=1.7.2.3&r2=1.7.2.4
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/encoding.cpp?cvsroot=eliot&only_with_tag=cppdic&r1=1.1.2.2&r2=1.1.2.3
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/hashtable.cpp?cvsroot=eliot&only_with_tag=cppdic&r1=1.1.2.1&r2=1.1.2.2
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/hashtable.h?cvsroot=eliot&only_with_tag=cppdic&r1=1.6.2.1&r2=1.6.2.2
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/header.cpp?cvsroot=eliot&only_with_tag=cppdic&r1=1.1.2.5&r2=1.1.2.6
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/header.h?cvsroot=eliot&only_with_tag=cppdic&r1=1.1.2.3&r2=1.1.2.4
http://cvs.savannah.gnu.org/viewcvs/eliot/dic/hashtable.i?cvsroot=eliot&only_with_tag=cppdic&rev=1.1.2.1
http://cvs.savannah.gnu.org/viewcvs/eliot/test/regression.pl?cvsroot=eliot&only_with_tag=cppdic&r1=1.1&r2=1.1.6.1
http://cvs.savannah.gnu.org/viewcvs/eliot/utils/eliottxt.cpp?cvsroot=eliot&only_with_tag=cppdic&r1=1.16.2.4&r2=1.16.2.5
Patches:
Index: dic/Makefile.am
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Makefile.am,v
retrieving revision 1.14.4.3
retrieving revision 1.14.4.4
diff -u -b -r1.14.4.3 -r1.14.4.4
--- dic/Makefile.am 11 Nov 2007 19:56:58 -0000 1.14.4.3
+++ dic/Makefile.am 20 Nov 2007 12:54:03 -0000 1.14.4.4
@@ -65,6 +65,7 @@
compdic_SOURCES= \
dic_internals.h \
hashtable.cpp hashtble.h \
+ encoding.cpp encoding.h \
header.cpp header.h \
compdic.cpp
Index: dic/compdic.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Attic/compdic.cpp,v
retrieving revision 1.1.2.6
retrieving revision 1.1.2.7
diff -u -b -r1.1.2.6 -r1.1.2.7
--- dic/compdic.cpp 18 Nov 2007 14:30:43 -0000 1.1.2.6
+++ dic/compdic.cpp 20 Nov 2007 12:54:03 -0000 1.1.2.7
@@ -25,17 +25,20 @@
*/
#include <fstream>
+#include <vector>
+#include <map>
+#include <memory>
+#include <iconv.h>
#include <time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
+#include <wctype.h>
#include <stdlib.h>
#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-#include <assert.h>
#include "hashtable.h"
+#include "encoding.h"
#include "header.h"
#include "dic_internals.h"
@@ -45,34 +48,41 @@
#define CHECK_RECURSION
-char* load_uncompressed(const string &iFileName, unsigned int *dic_size)
+wchar_t* load_uncompressed(const string &iFileName, unsigned int iDicSize)
{
- char *uncompressed;
- FILE* file_desc;
+ // TODO: we should throw exceptions everywhere instead of returning NULL
- if ((file_desc = fopen(iFileName.c_str(), "r")) == NULL)
+ ifstream file(iFileName.c_str());
+ if (!file.is_open())
return NULL;
- if ((uncompressed = (char*)malloc(sizeof(char)*(*dic_size))) == NULL)
- {
- fclose(file_desc);
+ // Place the buffer in an auto_ptr to avoid worrying about memory handling
+ std::auto_ptr<char> buffer(new char[iDicSize]);
+ // Load the file data, everything in one shot
+ file.read(buffer.get(), iDicSize);
+ file.close();
+
+ // The words are supposed to be in utf-8, so convert everything to
+ // wide characters
+ iconv_t handle = iconv_open("WCHAR_T", "UTF-8");
+ if (handle == (iconv_t)(-1))
return NULL;
- }
- unsigned r = fread (uncompressed, 1, *dic_size, file_desc);
- if (r < *dic_size)
- {
- /* \n is 2 chars under MS OS */
- printf("\n");
- printf("** The number of bytes read is less than the size of the file
**\n");
- printf("** this may be OK if you run a Microsoft OS but not on Unix
**\n");
- printf("** please check the results.
**\n");
- printf("\n");
- *dic_size = r;
- }
+ // Buffer for the wide characters (it will use at most as many characters
+ // as the utf-8 version)
+ std::auto_ptr<wchar_t> wide_buf(new wchar_t[iDicSize]);
+
+ size_t inChars = iDicSize;
+ size_t outChars = sizeof(wchar_t) * iDicSize;
+ char *in = buffer.get();
+ char *out = (char*)wide_buf.get();
+ size_t res = iconv(handle, &in, &inChars, &out, &outChars);
+ iconv_close(handle);
+ // Problem during encoding conversion?
+ if (res == (size_t)(-1))
+ return NULL;
- fclose(file_desc);
- return uncompressed;
+ return wide_buf.release();
}
@@ -115,7 +125,7 @@
}
-void write_node(const Dawg_edge *edges, int size, int num, ostream &outfile)
+void write_node(const Dawg_edge *edges, int num, ostream &outfile)
{
#ifdef DEBUG_OUTPUT
printf("writing %d edges\n", num);
@@ -129,7 +139,7 @@
outfile.write((char*)(edges + i), sizeof(Dawg_edge));
}
#else
- outfile.write((char*)edges, num * size);
+ outfile.write((char*)edges, num * sizeof(Dawg_edge));
#endif
}
@@ -140,15 +150,59 @@
/* ods3: ?? */
/* ods4: 1746 */
+// Hashing function for a vector of Dawg_edge, based on the hashing function
+// of the HashTable
+struct HashVector
+{
+ unsigned int operator()(const vector<Dawg_edge> &iKey) const
+ {
+ if (iKey.empty())
+ return 0;
+ return HashPtr(&iKey.front(), iKey.size() * sizeof(Dawg_edge));
+ }
+};
+
+#ifdef CHECK_RECURSION
+class IncDec
+{
+ public:
+ IncDec(int &ioCounter)
+ : m_counter(ioCounter)
+ {
+ m_counter++;
+ }
+
+ ~IncDec()
+ {
+ m_counter--;
+ }
+ private:
+ int &m_counter;
+};
+
+int current_rec = 0;
+int max_rec = 0;
+#endif
+
/* global variables */
ofstream global_outfile;
Dict_header_info global_header_info;
-Hash_table global_hashtable;
+HashTable<vector<Dawg_edge>, unsigned int, HashVector> *global_hashtable;
+
+wchar_t global_stringbuf[MAX_STRING_LENGTH]; /* Space for current string */
+wchar_t* global_endstring; /* Marks END of current string */
+wchar_t* global_input;
+wchar_t* global_endofinput;
+#ifdef CHECK_RECURSION
+map<int, vector<Dawg_edge> > global_mapfordepth;
+#endif
+map<wchar_t, unsigned int> global_chartocode;
-char global_stringbuf[MAX_STRING_LENGTH]; /* Space for current string */
-char* global_endstring; /* Marks END of current
string */
-char* global_input;
-char* global_endofinput;
+unsigned int charToCode(wchar_t iChar)
+{
+ // TODO: throw an exception if the result is 0
+ return global_chartocode[iChar];
+}
/*
* Makenode takes a prefix (as position relative to stringbuf) and
@@ -157,38 +211,47 @@
* to stringbuf) indicating how much of prefix is matched in the
* input.
*/
-#ifdef CHECK_RECURSION
-int current_rec = 0;
-int max_rec = 0;
-#endif
-
-unsigned int makenode(char *prefix)
+unsigned int makenode(wchar_t *prefix)
{
#ifdef CHECK_RECURSION
- current_rec++;
+ IncDec inc(current_rec);
if (current_rec > max_rec)
max_rec = current_rec;
#endif
- Dawg_edge edges[MAX_EDGES];
- Dawg_edge *edgeptr = edges;
+#ifdef CHECK_RECURSION
+ // Instead of creating a vector, try to reuse an existing one
+ vector<Dawg_edge> &edges = global_mapfordepth[current_rec];
+ edges.reserve(MAX_EDGES);
+ edges.clear();
+#else
+ vector<Dawg_edge> edges;
+ // Optimize allocation
+ edges.reserve(MAX_EDGES);
+#endif
+ Dawg_edge newEdge;
while (prefix == global_endstring)
{
- /* More edges out of node */
- edgeptr->ptr = 0;
- edgeptr->term = 0;
- edgeptr->last = 0;
- edgeptr->fill = 0;
- edgeptr->chr = 0;
-
- (*(edgeptr++)).chr = (*global_endstring++ = *global_input++) &
DIC_CHAR_MASK;
- if (*global_input == '\n') /* End of a word */
+ // More edges out of node
+ newEdge.ptr = 0;
+ newEdge.term = 0;
+ newEdge.last = 0;
+ newEdge.fill = 0;
+ newEdge.chr = charToCode(*global_endstring++ = *global_input++);
+ edges.push_back(newEdge);
+
+ // End of a word?
+ // TODO: handle \r as well
+ if (*global_input == L'\n')
{
global_header_info.nwords++;
- edgeptr[-1].term = 1; /* Mark edge as word */
- *global_endstring++ = *global_input++; /* Skip \n */
- if (global_input == global_endofinput) /* At end of input? */
+ // Mark edge as word
+ edges.back().term = 1;
+ // Skip \n
+ *global_endstring++ = *global_input++;
+ // At the end of input?
+ if (global_input == global_endofinput)
break;
global_endstring = global_stringbuf;
@@ -198,47 +261,36 @@
global_input++;
}
}
- /* make dawg pointed to by this edge */
- edgeptr[-1].ptr = makenode(prefix + 1);
+ // make dawg pointed to by this edge
+ edges.back().ptr = makenode(prefix + 1);
}
- int numedges = edgeptr - edges;
+ int numedges = edges.size();
if (numedges == 0)
{
-#ifdef CHECK_RECURSION
- current_rec --;
-#endif
- return 0; /* Special node zero - no edges */
+ // Special node zero - no edges
+ return 0;
}
- edgeptr[-1].last = 1; /* Mark the last edge */
+ // Mark the last edge
+ edges.back().last = 1;
- unsigned *saved_position = (unsigned int*) hash_find(global_hashtable,
- (void*)edges,
- numedges*sizeof(Dawg_edge));
+ const unsigned int *saved_position = global_hashtable->find(edges);
if (saved_position)
{
global_header_info.edgessaved += numedges;
global_header_info.nodessaved++;
-#ifdef CHECK_RECURSION
- current_rec --;
-#endif
return *saved_position;
}
else
{
unsigned int node_pos = global_header_info.edgesused;
- hash_add(global_hashtable,
- (void*)edges, numedges*sizeof(Dawg_edge),
- (void*)(&global_header_info.edgesused),
sizeof(global_header_info.edgesused));
+ global_hashtable->add(edges, global_header_info.edgesused);
global_header_info.edgesused += numedges;
global_header_info.nodesused++;
- write_node(edges, sizeof(Dawg_edge), numedges, global_outfile);
+ write_node(&edges.front(), numedges, global_outfile);
-#ifdef CHECK_RECURSION
- current_rec --;
-#endif
return node_pos;
}
}
@@ -246,8 +298,6 @@
int main(int argc, char* argv[])
{
- char *uncompressed;
-
if (argc < 2)
{
fprintf(stderr, "usage: %s uncompressed_dic [compressed_dic]\n",
argv[0]);
@@ -271,44 +321,58 @@
}
unsigned int dicsize = size;
- if ((uncompressed = load_uncompressed(argv[1], &dicsize)) == NULL)
+ wchar_t *uncompressed;
+ clock_t startLoadTime = clock();
+ if ((uncompressed = load_uncompressed(argv[1], dicsize)) == NULL)
{
fprintf(stderr, "Cannot load uncompressed dictionary into memory\n");
exit(1);
}
+ clock_t endLoadTime = clock();
global_input = uncompressed;
global_endofinput = global_input + dicsize;
#define SCALE 0.6
- global_hashtable = hash_init((unsigned int)(dicsize * SCALE));
+ global_hashtable = new HashTable<vector<Dawg_edge>, unsigned int,
HashVector>((unsigned int)(dicsize * SCALE));
#undef SCALE
skip_init_header(global_outfile, &global_header_info);
+ // TODO: remove hard-coding
+ global_header_info.letters = convertToWc("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
+ // Init the map char --> code
+ for (string::size_type i = 0; i < global_header_info.letters.size(); ++i)
+ {
+ global_chartocode[global_header_info.letters[i]] = i + 1;
+ global_chartocode[towlower(global_header_info.letters[i])] = i + 1;
+ }
+
Dawg_edge specialnode = {0, 0, 0, 0, 0};
specialnode.last = 1;
- write_node(&specialnode, sizeof(specialnode), 1, global_outfile);
+ write_node(&specialnode, 1, global_outfile);
+
/*
* Call makenode with null (relative to stringbuf) prefix;
* Initialize string to null; Put index of start node on output
*/
- clock_t starttime = clock();
+ clock_t startBuildTime = clock();
Dawg_edge rootnode = {0, 0, 0, 0, 0};
rootnode.ptr = makenode(global_endstring = global_stringbuf);
- clock_t endtime = clock();
- write_node(&rootnode, sizeof(rootnode), 1, global_outfile);
+ clock_t endBuildTime = clock();
+ write_node(&rootnode, 1, global_outfile);
fix_header(global_outfile, &global_header_info);
Header aHeader(global_header_info);
aHeader.print();
- hash_destroy(global_hashtable);
- free(uncompressed);
+ delete global_hashtable;
+ delete[] uncompressed;
global_outfile.close();
- printf(" Elapsed time is : %f s\n", 1.0 * (endtime -
starttime) / CLOCKS_PER_SEC);
+ printf(" Load time: %f s\n", 1.0 * (endLoadTime - startLoadTime) /
CLOCKS_PER_SEC);
+ printf(" Compression time: %f s\n", 1.0 * (endBuildTime - startBuildTime)
/ CLOCKS_PER_SEC);
#ifdef CHECK_RECURSION
printf(" Maximum recursion level reached: %d\n", max_rec);
#endif
Index: dic/dic.h
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/dic.h,v
retrieving revision 1.13.2.2
retrieving revision 1.13.2.3
diff -u -b -r1.13.2.2 -r1.13.2.3
--- dic/dic.h 24 Dec 2006 20:49:42 -0000 1.13.2.2
+++ dic/dic.h 20 Nov 2007 12:54:03 -0000 1.13.2.3
@@ -37,7 +37,8 @@
*/
#define DIC_WORD_MAX 16
-typedef struct _Dawg_edge Dawg_edge;
+//typedef struct _Dawg_edge Dawg_edge;
+class Dawg_edge;
typedef unsigned int dic_elt_t;
typedef unsigned char dic_code_t;
Index: dic/dic_internals.h
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/dic_internals.h,v
retrieving revision 1.7.2.3
retrieving revision 1.7.2.4
diff -u -b -r1.7.2.3 -r1.7.2.4
--- dic/dic_internals.h 11 Nov 2007 19:34:32 -0000 1.7.2.3
+++ dic/dic_internals.h 20 Nov 2007 12:54:03 -0000 1.7.2.4
@@ -51,27 +51,32 @@
*/
#if defined(WORDS_BIGENDIAN)
-struct __attribute__ ((packed)) _Dawg_edge {
+struct __attribute__ ((packed)) Dawg_edge
+{
uint32_t
chr : 5,
- fill : 1,
- last : 1,
- term : 1,
+ fill: 1,
+ last: 1,
+ term: 1,
ptr : 24;
};
#else
-struct __attribute__ ((packed)) _Dawg_edge {
+struct __attribute__ ((packed)) Dawg_edge
+{
+ public:
uint32_t
ptr : 24,
- term : 1,
- last : 1,
- fill : 1,
+ term: 1,
+ last: 1,
+ fill: 1,
chr : 5;
+ bool operator==(const Dawg_edge &iOther) const
+ {
+ return memcmp(this, &iOther, sizeof(*this)) == 0;
+ }
};
#endif
-typedef struct _Dawg_edge Dawg_edge;
-
#endif /* _DIC_INTERNALS_H */
Index: dic/encoding.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Attic/encoding.cpp,v
retrieving revision 1.1.2.2
retrieving revision 1.1.2.3
diff -u -b -r1.1.2.2 -r1.1.2.3
--- dic/encoding.cpp 5 Nov 2006 17:27:03 -0000 1.1.2.2
+++ dic/encoding.cpp 20 Nov 2007 12:54:03 -0000 1.1.2.3
@@ -19,7 +19,7 @@
/**
* \file encoding.cpp
- * \brief Utility functions to ease manipulation of wide-character strings
+ * \brief Utility functions to ease handling of wide-character strings
* \author Olivier Teuliere
* \date 2005
*/
@@ -33,7 +33,7 @@
int _wtoi(const wchar_t *iWStr)
{
- return wcstol(iWStr,NULL,10);
+ return wcstol(iWStr, NULL, 10);
}
@@ -53,6 +53,7 @@
}
+#define _MAX_SIZE_FOR_STACK_ 30
wstring convertToWc(const string& iStr)
{
// Get the needed length (we _can't_ use string::size())
@@ -60,14 +61,22 @@
if (len == (size_t)-1)
return L"";
-// wchar_t *tmp = new wchar_t[len + 1];
- wchar_t tmp[100];
+ // Change the allocation method depending on the length of the string
+ if (len < _MAX_SIZE_FOR_STACK_)
+ {
+ // Without multi-thread, we can use static storage
+ static wchar_t tmp[_MAX_SIZE_FOR_STACK_];
len = mbstowcs(tmp, iStr.c_str(), len + 1);
-// wstring res = tmp;
-// delete[] tmp;
-
return tmp;
-// return res;
+ }
+ else
+ {
+ wchar_t *tmp = new wchar_t[len + 1];
+ len = mbstowcs(tmp, iStr.c_str(), len + 1);
+ wstring res = tmp;
+ delete[] tmp;
+ return res;
+ }
}
@@ -78,15 +87,24 @@
if (len == (size_t)-1)
return "";
-// char *tmp = new char[len + 1];
- char tmp[100];
+ // Change the allocation method depending on the length of the string
+ if (len < _MAX_SIZE_FOR_STACK_)
+ {
+ // Without multi-thread, we can use static storage
+ static char tmp[_MAX_SIZE_FOR_STACK_];
len = wcstombs(tmp, iWStr.c_str(), len + 1);
-// string res = tmp;
-// delete[] tmp;
-
return tmp;
-// return res;
+ }
+ else
+ {
+ char *tmp = new char[len + 1];
+ len = wcstombs(tmp, iWStr.c_str(), len + 1);
+ string res = tmp;
+ delete[] tmp;
+ return res;
+ }
}
+#undef _MAX_SIZE_FOR_STACK_
string convertToMb(wchar_t iWChar)
Index: dic/hashtable.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Attic/hashtable.cpp,v
retrieving revision 1.1.2.1
retrieving revision 1.1.2.2
diff -u -b -r1.1.2.1 -r1.1.2.2
--- dic/hashtable.cpp 15 Oct 2006 11:07:55 -0000 1.1.2.1
+++ dic/hashtable.cpp 20 Nov 2007 12:54:03 -0000 1.1.2.2
@@ -24,140 +24,27 @@
* \date 1999
*/
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
#include "hashtable.h"
-typedef struct _Hash_node {
- struct _Hash_node *next;
- void* key;
- unsigned int keysize;
- void* value;
- unsigned int valuesize;
-} Hash_node;
-struct _Hash_table {
- unsigned int size;
- Hash_node** nodes;
-};
-
-
-Hash_table
-hash_init(unsigned int size)
-{
- Hash_table ht;
-
- ht = (Hash_table) calloc(1,sizeof(struct _Hash_table));
- ht->size = size;
- ht->nodes = (Hash_node **) calloc (size, sizeof (Hash_node*));
- return ht;
-}
-
-void
-hash_rec_free(Hash_node* node)
+unsigned int HashPtr(const void *iPtr, unsigned int iSize)
{
- if (node)
- {
- if (node->next)
- hash_rec_free(node->next);
- if (node->key)
- free(node->key);
- if (node->value)
- free(node->value);
- free(node);
- }
-}
-
-int
-hash_destroy(Hash_table hashtable)
-{
- unsigned int i;
- if (hashtable)
- {
- for(i=0; i<hashtable->size; i++)
- if (hashtable->nodes[i])
- hash_rec_free(hashtable->nodes[i]);
- if (hashtable->nodes)
- free(hashtable->nodes);
- free(hashtable);
- }
- return 0;
-}
-
-
-static unsigned int
-hash_key(Hash_table hashtable, void* ptr, unsigned int size)
-{
- unsigned int i;
unsigned int key = 0;
- if (size % 4 == 0)
+ if (iSize % sizeof(unsigned int) == 0)
{
- unsigned int *v = (unsigned int*)ptr;
- for (i = 0; i < (size / 4); i++)
- key ^= (key << 3) ^ (key >> 1) ^ v[i];
+ const unsigned int *ptr =
+ reinterpret_cast<const unsigned int*>(iPtr);
+ for (unsigned int i = 0; i < (iSize / sizeof(unsigned int)); ++i)
+ key ^= (key << 3) ^ (key >> 1) ^ ptr[i];
}
else
{
- unsigned char *v = (unsigned char*)ptr;
- for (i = 0; i < size; i++)
- key ^= (key << 3) ^ (key >> 1) ^ v[i];
+ const unsigned char *ptr =
+ reinterpret_cast<const unsigned char*>(iPtr);
+ for (unsigned int i = 0; i < iSize; ++i)
+ key ^= (key << 3) ^ (key >> 1) ^ ptr[i];
}
- key %= hashtable->size;
return key;
}
-
-void*
-hash_find(Hash_table hashtable, void* key, unsigned int keysize)
-{
- Hash_node *entry;
- unsigned int h_key;
-
- h_key = hash_key(hashtable,key,keysize);
- for (entry = hashtable->nodes[h_key]; entry; entry = entry -> next)
- {
- if ((entry -> keysize == keysize) &&
- (memcmp(entry->key,key,keysize) == 0))
- {
- return entry->value;
- }
- }
- return NULL;
-}
-
-
-static Hash_node*
-new_entry(void* key, unsigned int keysize, void* value, unsigned int
- valuesize)
-{
- Hash_node *n;
- n = (Hash_node*)calloc(1,sizeof(Hash_node));
- n->key = (void*)malloc(keysize);
- n->value = (void*)malloc(valuesize);
- n->keysize = keysize;
- n->valuesize = valuesize;
- memcpy(n->key,key,keysize);
- memcpy(n->value,value,valuesize);
- return n;
-}
-
-
-int
-hash_add(Hash_table hashtable,
- void* key, unsigned int keysize,
- void* value, unsigned int valuesize)
-{
- Hash_node *entry;
- unsigned int h_key;
-
- h_key = hash_key(hashtable,key,keysize);
- entry = new_entry(key,keysize,value,valuesize);
- entry->next = hashtable->nodes[h_key];
- hashtable->nodes[h_key] = entry;
-
- return 0;
-}
-
-
Index: dic/hashtable.h
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/hashtable.h,v
retrieving revision 1.6.2.1
retrieving revision 1.6.2.2
diff -u -b -r1.6.2.1 -r1.6.2.2
--- dic/hashtable.h 15 Oct 2006 11:07:55 -0000 1.6.2.1
+++ dic/hashtable.h 20 Nov 2007 12:54:03 -0000 1.6.2.2
@@ -27,13 +27,54 @@
#ifndef _HASHTABLE_H
#define _HASHTABLE_H
-typedef struct _Hash_table* Hash_table;
-Hash_table hash_init(unsigned int);
-int hash_destroy(Hash_table);
-int hash_size(Hash_table);
-void* hash_find(Hash_table,void* key,unsigned keysize);
-int hash_add (Hash_table,void* key,unsigned keysize,
- void* value,unsigned valuesize);
+/// Compute a hash for the data pointed to by iPtr
+/**
+ * This function is useful to define the HASH_FCN template parameter
+ * of HashTable.
+ */
+unsigned int HashPtr(const void *iPtr, unsigned int iSize);
+
+
+template<typename KEY, typename VALUE, typename HASH_FCN>
+class HashTable
+{
+ public:
+ /// Constructor taking the number of records in the table
+ HashTable(unsigned int iSize);
+
+ /// Destructor
+ ~HashTable();
+
+ /// Return the number of records in the table
+ unsigned int size() const { return m_size; }
+
+ /// Return the value corresponding to the given key, or NULL if not
found
+ const VALUE *find(const KEY &iKey) const;
+
+ /// Add a new key/value pair (both the key and the value are copied)
+ void add(const KEY& iKey, const VALUE &iValue);
+
+ private:
+ /// Maximum number of records
+ unsigned int m_size;
+
+ /// Definition of a record
+ class Node
+ {
+ public:
+ Node(const KEY &iKey, const VALUE &iValue, const Node *iNext);
+ ~Node();
+ KEY m_key;
+ VALUE m_value;
+ const Node *m_next;
+ };
+
+ /// All the nodes
+ const Node **m_nodes;
+};
+
+// Include the implementation of the template
+#include "hashtable.i"
#endif /* _HASHTABLE_H_ */
Index: dic/header.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Attic/header.cpp,v
retrieving revision 1.1.2.5
retrieving revision 1.1.2.6
diff -u -b -r1.1.2.5 -r1.1.2.6
--- dic/header.cpp 11 Nov 2007 19:56:58 -0000 1.1.2.5
+++ dic/header.cpp 20 Nov 2007 12:54:03 -0000 1.1.2.6
@@ -21,6 +21,7 @@
#include "config.h"
#include "header.h"
+#include <iconv.h>
// FIXME: duplicated in dic.cpp
@@ -73,26 +74,34 @@
// Do not change these values, as they impact the size of the structure
#define _USER_HOST_MAX_ 24
#define _DIC_NAME_MAX_ 16
+#define _MAX_LETTERS_SIZE_ 80
/** Extension of the old format */
struct Dict_header_ext
{
- /// Build information
- //@{
+ // Build information
char compileUserHost[_USER_HOST_MAX_];
- /// Number of seconds since the Epoch
+ // Number of seconds since the Epoch
uint64_t compileDate;
- //@}
- /// Compression algorithm (1 = DAWG, 2 = GADDAG)
+
+ // Compression algorithm (1 = DAWG, 2 = GADDAG)
char algorithm;
- /// Endianness
+ // Endianness (XXX: currently unused)
char bigEndian;
- /// Dictionary official name and version
- // @{
+ // Dictionary official name and version
char dicName[_DIC_NAME_MAX_];
char dicVersion;
char dicRevision;
+
+ // Letters used in the dictionary
+ // We should have: nbLetters <= lettersSize <= _MAX_LETTERS_SIZE_
+ // Number of letters (XXX: in theory useless, but allows a sanity check)
+ uint32_t nbLetters;
+ // Size taken by the letters
+ uint32_t lettersSize;
+ // The letters themselves, in UTF-8
+ char letters[_MAX_LETTERS_SIZE_];
//@}
};
@@ -113,6 +122,10 @@
m_nodesSaved = iInfo.nodessaved;
m_edgesSaved = iInfo.edgessaved;
m_type = iInfo.dawg ? kDAWG : kGADDAG;
+ if (iInfo.letters.size() < _MAX_LETTERS_SIZE_)
+ {
+ wcscpy(m_letters, iInfo.letters.c_str());
+ }
}
@@ -162,7 +175,29 @@
m_dicName = aHeaderExt.dicName;
m_dicVersion = aHeaderExt.dicVersion;
m_dicRevision = aHeaderExt.dicRevision;
- // TODO
+
+ // Convert the letters from UTF-8 to wchar_t*
+ iconv_t handle = iconv_open("WCHAR_T", "UTF-8");
+ if (handle == (iconv_t)(-1))
+ return false;
+ size_t inChars = aHeaderExt.lettersSize;
+ size_t outChars = sizeof(wchar_t) * (_MAX_NB_LETTERS_ - 1);
+ char *in = aHeaderExt.letters;
+ char *out = (char*)m_letters;
+ size_t res = iconv(handle, &in, &inChars, &out, &outChars);
+ iconv_close(handle);
+ // Problem during encoding conversion?
+ if (res == (size_t)(-1))
+ return false;
+ // Safety check: correct number of letters?
+ if (outChars + sizeof(wchar_t) * aHeaderExt.nbLetters !=
+ sizeof(wchar_t) * (_MAX_NB_LETTERS_ - 1))
+ {
+ return false;
+ }
+ // Make sure there is a \0 at the end of the letters array
+ wchar_t *wout = (wchar_t*)out;
+ *wout = '\0';
}
return true;
}
@@ -199,7 +234,32 @@
aHeaderExt.dicName[_DIC_NAME_MAX_ - 1] = '\0';
aHeaderExt.dicVersion = m_dicVersion;
aHeaderExt.dicRevision = m_dicRevision;
- // TODO
+
+ size_t length = wcslen(m_letters);
+ // Sanity check
+ if (length >= _MAX_NB_LETTERS_)
+ return false;
+ aHeaderExt.nbLetters = (uint32_t)length;
+
+ // Convert the dictionary letters to UTF-8
+ iconv_t handle = iconv_open("UTF-8", "WCHAR_T");
+ if (handle == (iconv_t)(-1))
+ return false;
+ size_t inChars = sizeof(wchar_t) * length;
+ size_t outChars = _MAX_LETTERS_SIZE_;
+ char *in = (char*)m_letters;
+ char *out = aHeaderExt.letters;
+ size_t res = iconv(handle, &in, &inChars, &out, &outChars);
+ iconv_close(handle);
+ // Problem during encoding conversion?
+ if (res == (size_t)(-1))
+ return false;
+ aHeaderExt.lettersSize = _MAX_LETTERS_SIZE_ - outChars;
+
+ // Write the extension
+ oStream.write((char*)&aHeaderExt, sizeof(Dict_header_ext));
+ if (!oStream.good())
+ return false;
}
return true;
}
@@ -219,6 +279,8 @@
printf("\n");
printf("nodes: %d+%d\n", m_nodesUsed, m_nodesSaved);
printf("edges: %d+%d\n", m_edgesUsed, m_edgesSaved);
+ printf("number of letters: %d\n", wcslen(m_letters));
+ printf("letters: %ls\n", m_letters);
printf("============================\n");
#if 0
Index: dic/header.h
===================================================================
RCS file: /cvsroot/eliot/eliot/dic/Attic/header.h,v
retrieving revision 1.1.2.3
retrieving revision 1.1.2.4
diff -u -b -r1.1.2.3 -r1.1.2.4
--- dic/header.h 11 Nov 2007 19:34:32 -0000 1.1.2.3
+++ dic/header.h 20 Nov 2007 12:54:04 -0000 1.1.2.4
@@ -1,5 +1,5 @@
/* Eliot */
-/* Copyright (C) 2006 Olivier Teuliere */
+/* Copyright (C) 2006-2007 Olivier Teuliere */
/* */
/* This file is part of Eliot. */
/* */
@@ -24,6 +24,9 @@
using namespace std;
+/// Maximum number of distinct letters allowed for the dictionary
+#define _MAX_NB_LETTERS_ 64
+
/**
* Structure used to create a Header object.
@@ -39,13 +42,15 @@
unsigned int nodessaved;
unsigned int edgessaved;
bool dawg;
+ wstring letters;
};
/**
* Dictionary header.
* There are 2 ways to create a Header object:
- * - fill a Dict_header_info structure and give it in the constructor of the
Header class
+ * - fill a Dict_header_info structure and give it in the constructor of the
+ * Header class (usually to call write() afterwards)
* - use the default constructor, then call read() on a dictionary file
*
* Serialization:
@@ -80,6 +85,7 @@
unsigned int getNbNodesSaved() const { return m_nodesSaved; }
unsigned int getNbEdgesSaved() const { return m_edgesSaved; }
DictType getType() const { return m_type; }
+ wstring getLetters() const { return m_letters; }
//@}
/**
@@ -118,6 +124,10 @@
char m_dicVersion;
/// Dictionary revision (e.g.: 1 for ODS 4.1)
char m_dicRevision;
+
+ /// The letters themselves. The array is terminated with \0.
+ // XXX: use a wstring instead?
+ wchar_t m_letters[_MAX_NB_LETTERS_];
};
Index: test/regression.pl
===================================================================
RCS file: /cvsroot/eliot/eliot/test/regression.pl,v
retrieving revision 1.1
retrieving revision 1.1.6.1
diff -u -b -r1.1 -r1.1.6.1
--- test/regression.pl 16 Apr 2005 15:47:59 -0000 1.1
+++ test/regression.pl 20 Nov 2007 12:54:04 -0000 1.1.6.1
@@ -16,7 +16,7 @@
# Find the dictionary
if (not -f $ods)
{
- die "Cannot find dictionary: $!";
+ die "Cannot find dictionary $ods: $!";
}
# Find the text interface
Index: utils/eliottxt.cpp
===================================================================
RCS file: /cvsroot/eliot/eliot/utils/eliottxt.cpp,v
retrieving revision 1.16.2.4
retrieving revision 1.16.2.5
diff -u -b -r1.16.2.4 -r1.16.2.5
--- utils/eliottxt.cpp 11 Nov 2007 19:56:59 -0000 1.16.2.4
+++ utils/eliottxt.cpp 20 Nov 2007 12:54:04 -0000 1.16.2.5
@@ -964,8 +964,6 @@
// Let the user choose the locale
setlocale(LC_ALL, "");
- Dictionary dic;
-
if (argc != 2 && argc != 3)
{
fprintf(stdout, "Usage: eliot /chemin/vers/ods4.dawg [random_seed]\n");
@@ -976,6 +974,7 @@
dicPath = argv[1];
}
+ Dictionary dic;
switch (dic.load(dicPath))
{
case 0:
Index: dic/hashtable.i
===================================================================
RCS file: dic/hashtable.i
diff -N dic/hashtable.i
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ dic/hashtable.i 20 Nov 2007 12:54:03 -0000 1.1.2.1
@@ -0,0 +1,94 @@
+/* Eliot */
+/* Copyright (C) 1999 Antoine Fraboulet */
+/* */
+/* This file is part of Eliot. */
+/* */
+/* Eliot is free software; you can redistribute it and/or modify */
+/* it under the terms of the GNU General Public License as published by */
+/* the Free Software Foundation; either version 2 of the License, or */
+/* (at your option) any later version. */
+/* */
+/* Eliot is distributed in the hope that it will be useful, */
+/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
+/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
+/* GNU General Public License for more details. */
+/* */
+/* You should have received a copy of the GNU General Public License */
+/* along with this program; if not, write to the Free Software */
+/* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
+
+/**
+ * \file hashtable.c
+ * \brief Simple hashtable type
+ * \author Antoine Fraboulet
+ * \date 1999
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "hashtable.h"
+
+
+template<typename KEY, typename VALUE, typename HASH_FCN>
+HashTable<KEY, VALUE, HASH_FCN>::HashTable(unsigned int iSize)
+ : m_size(iSize)
+{
+ m_nodes = new const Node*[m_size];
+ for (unsigned int i = 0; i < m_size; ++i)
+ {
+ m_nodes[i] = NULL;
+ }
+}
+
+
+template<typename KEY, typename VALUE, typename HASH_FCN>
+HashTable<KEY, VALUE, HASH_FCN>::~HashTable()
+{
+ for (unsigned int i = 0; i < m_size; ++i)
+ delete m_nodes[i];
+ delete[] m_nodes;
+}
+
+
+template<typename KEY, typename VALUE, typename HASH_FCN>
+HashTable<KEY, VALUE, HASH_FCN>::Node::Node(const KEY &iKey, const VALUE
&iValue, const Node *iNext)
+ : m_key(iKey), m_value(iValue), m_next(iNext)
+{
+}
+
+
+template<typename KEY, typename VALUE, typename HASH_FCN>
+HashTable<KEY, VALUE, HASH_FCN>::Node::~Node()
+{
+ delete m_next;
+}
+
+
+template<typename KEY, typename VALUE, typename HASH_FCN>
+const VALUE *HashTable<KEY, VALUE, HASH_FCN>::find(const KEY &iKey) const
+{
+ HASH_FCN aHashFunc;
+ unsigned int h_key = aHashFunc(iKey) % m_size;
+ for (const Node *entry = m_nodes[h_key]; entry; entry = entry->m_next)
+ {
+ // Note: we need to be able to call == on a type KEY
+ if (entry->m_key == iKey)
+ {
+ return &entry->m_value;
+ }
+ }
+ return NULL;
+}
+
+
+template<typename KEY, typename VALUE, typename HASH_FCN>
+void HashTable<KEY, VALUE, HASH_FCN>::add(const KEY &iKey, const VALUE &iValue)
+{
+ HASH_FCN aHashFunc;
+ unsigned int h_key = aHashFunc(iKey) % m_size;
+ const Node *entry = new Node(iKey, iValue, m_nodes[h_key]);
+ m_nodes[h_key] = entry;
+}
+
+// vim: ft=cpp
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [Eliot-dev] eliot dic/Makefile.am dic/compdic.cpp dic/dic.h... [cppdic],
eliot-dev <=