Home | History | Annotate | Download | only in lexicon
      1 #ifndef __SUNPINYIN_PYTRIE_H__
      2 #define __SUNPINYIN_PYTRIE_H__
      3 
      4 #ifdef HAVE_CONFIG_H
      5 #include <config.h>
      6 #endif
      7 
      8 #include "../portability.h"
      9 #include "syllable.h"
     10 #include <map>
     11 
     12 #define WORD_ID_WIDTH       18
     13 
     14 class CPinyinTrie {
     15 public:
     16     friend class CPinyinTrieMaker;
     17 
     18     struct TTransUnit {
     19         TSyllable       m_Syllable;
     20         unsigned        m_Offset;
     21     };
     22 
     23     struct TWordIdInfo {
     24     #ifdef WORDS_BIGENDIAN
     25         unsigned        m_bSeen    : 1;
     26         unsigned        m_cost     : 5;
     27         unsigned        m_len      : 6;
     28         unsigned        m_csLevel  : 2;
     29         unsigned        m_id       : WORD_ID_WIDTH;
     30     #else
     31         unsigned        m_id       : WORD_ID_WIDTH;
     32         unsigned        m_csLevel  : 2;
     33         unsigned        m_len      : 6;
     34         unsigned        m_cost     : 5;
     35         unsigned        m_bSeen    : 1;
     36     #endif
     37 
     38         TWordIdInfo() { memset(this, 0, sizeof(TWordIdInfo)); }
     39 
     40         TWordIdInfo(unsigned id, unsigned len=0, unsigned seen=0, unsigned cost = 0, unsigned cslvl = 0)
     41             : m_id(id), m_csLevel(cslvl), m_len(len), m_cost(cost), m_bSeen(seen) { }
     42 
     43         operator unsigned int() const { return m_id; }
     44     };
     45 
     46     struct TNode {
     47     #ifdef WORDS_BIGENDIAN
     48         unsigned        m_other      : 5;
     49         unsigned        m_bFullSyllableTransfer: 1;
     50         unsigned        m_csLevel    : 2;
     51         unsigned        m_nTransfer  : 12;
     52         unsigned        m_nWordId    : 12;
     53     #else
     54         unsigned        m_nWordId    : 12;
     55         unsigned        m_nTransfer  : 12;
     56         unsigned        m_csLevel    : 2;
     57         unsigned        m_bFullSyllableTransfer: 1;
     58         unsigned        m_other      : 5;
     59     #endif
     60 
     61         static unsigned int
     62         size_for(unsigned int nTransfer, unsigned int nWordId)
     63             { return sizeof(TNode) + sizeof(TTransUnit)*nTransfer +
     64                      sizeof(TWordIdInfo)*nWordId; }
     65 
     66         TNode()
     67             { *((unsigned*)this) = 0; }
     68 
     69         bool
     70         hasPinyinChild(void) const
     71             { return (m_nTransfer > 1);}
     72 
     73         const TTransUnit*
     74         getTrans() const
     75             { return (TTransUnit*)(this+1); }
     76 
     77         const TWordIdInfo*
     78         getWordIdPtr() const
     79             { return (TWordIdInfo*)(((char*)(this+1))+sizeof(TTransUnit)*m_nTransfer); }
     80 
     81         unsigned int
     82         transfer(unsigned s) const
     83             {
     84                 unsigned int b = 0, e = m_nTransfer;
     85                 const TTransUnit* ptrans = getTrans();
     86                 while (b < e) {
     87                     int m = b + (e-b)/2;
     88                     if (ptrans[m].m_Syllable == s)
     89                         return ptrans[m].m_Offset;
     90                     if (ptrans[m].m_Syllable < s)
     91                         b = m + 1;
     92                     else
     93                         e = m;
     94                 }
     95                 return 0;
     96             }
     97     };
     98 
     99 public:
    100     CPinyinTrie() : m_Size(0), m_mem(NULL), m_words(NULL) { }
    101 
    102     ~CPinyinTrie()
    103         { free(); }
    104 
    105     bool
    106     load(const char* fileName);
    107 
    108     void
    109     free(void);
    110 
    111     bool
    112     isValid(const TNode* pnode, bool allowNonComplete, unsigned csLevel=0);
    113 
    114     unsigned int
    115     getRootOffset() const
    116         { return 3 * sizeof(unsigned int); }
    117 
    118     const TNode*
    119     getRootNode() const
    120         { return (TNode*)(m_mem+getRootOffset()); }
    121 
    122     const TNode*
    123     nodeFromOffset(unsigned int offset) const
    124         { return (offset < getRootOffset())?NULL:((TNode*)(m_mem+offset)); }
    125 
    126     unsigned int
    127     getWordCount(void) const
    128         { return *(unsigned int*)m_mem; }
    129 
    130     unsigned int
    131     getNodeCount(void) const
    132         { return *(unsigned int*)(m_mem+sizeof(unsigned int)); }
    133 
    134     unsigned int
    135     getStringOffset(void) const
    136         { return *(unsigned int*)(m_mem+2*sizeof(unsigned int)); }
    137 
    138     inline const TNode*
    139     transfer(const TNode* pnode, unsigned s) const
    140         { return nodeFromOffset(pnode->transfer(s)); }
    141 
    142     inline const TNode*
    143     transfer(unsigned s) const
    144         { return transfer(getRootNode(), s); }
    145 
    146     unsigned int
    147     getSymbolId(const TWCHAR* wstr);
    148 
    149     unsigned int
    150     getSymbolId(const wstring & wstr);
    151 
    152     const TWCHAR*
    153     operator[](unsigned int idx) const
    154         { return m_words[idx]; }
    155 
    156     int
    157     lengthAt(unsigned int idx) const;
    158 
    159     void
    160     print(FILE *fp) const;
    161 
    162 protected:
    163     unsigned int           m_Size;
    164     char                  *m_mem;
    165     TWCHAR               **m_words;
    166 
    167     std::map<wstring, unsigned>  m_SymbolMap;
    168 
    169     void
    170     print(const TNode* pRoot, std::string& prefix, FILE *fp) const;
    171 };
    172 
    173 #endif /* __SUNPINYIN_PYTRIE_H__*/
    174