Home | History | Annotate | Download | only in lexicon
      1 /*
      2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3  *
      4  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5  *
      6  * The contents of this file are subject to the terms of either the GNU Lesser
      7  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12  * specific language governing permissions and limitations under the License. When
     13  * distributing the software, include this License Header Notice in each file and
     14  * include the full text of the License in the License file as well as the
     15  * following notice:
     16  *
     17  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18  * (CDDL)
     19  * For Covered Software in this distribution, this License shall be governed by the
     20  * laws of the State of California (excluding conflict-of-law provisions).
     21  * Any litigation relating to this License shall be subject to the jurisdiction of
     22  * the Federal Courts of the Northern District of California and the state courts
     23  * of the State of California, with venue lying in Santa Clara County, California.
     24  *
     25  * Contributor(s):
     26  *
     27  * If you wish your version of this file to be governed by only the CDDL or only
     28  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30  * license." If you don't indicate a single choice of license, a recipient has the
     31  * option to distribute your version of this file under either the CDDL or the LGPL
     32  * Version 2.1, or to extend the choice of license to its licensees as provided
     33  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34  * Version 2 license, then the option applies only if the new code is made subject
     35  * to such option by the copyright holder.
     36  */
     37 
     38 #ifndef _SIM_PYTRIE_H
     39 #define _SIM_PYTRIE_H
     40 
     41 #include <stdio.h>
     42 
     43 #include "../portability.h"
     44 
     45 #include <stdint.h>
     46 #include <string>
     47 #include <vector>
     48 #include <map>
     49 #include <set>
     50 #include <list>
     51 
/* Width, in bits, of the word-id bit-field CPinyinTrie::TWordIdInfo::m_id. */
#define WORD_ID_WIDTH       18
     53 
     54 class CPinyinTrie {
     55 public:
     56     static const unsigned char SYLLABLE_BREAKER  = '\'';
     57     friend class CPinyinTrieMaker;
     58 
     59     class TTransUnit {
     60     public:
     61 #if !defined(WORDS_BIGENDIAN)
     62         unsigned            m_Char  : 8;
     63         unsigned            m_Offset: 24;
     64 #else
     65         unsigned            m_Offset: 24;
     66         unsigned            m_Char  : 8;
     67 #endif
     68     };
     69 
     70     class TWordIdInfo {
     71     public:
     72 #if !defined(WORDS_BIGENDIAN)
     73         unsigned            m_id       : WORD_ID_WIDTH;
     74         unsigned            m_bGBK     : 1;
     75         unsigned            m_bGB18030 : 1;
     76         unsigned            m_len      : 6;
     77         unsigned            m_cost     : 5;
     78         unsigned            m_bSeen    : 1;
     79 #else
     80         unsigned            m_bSeen    : 1;
     81         unsigned            m_cost     : 5;
     82         unsigned            m_len      : 6;
     83         unsigned            m_bGB18030 : 1;
     84         unsigned            m_bGBK     : 1;
     85         unsigned            m_id       : WORD_ID_WIDTH;
     86 #endif
     87 
     88     public:
     89         TWordIdInfo()
     90             { memset(this, 0, sizeof(TWordIdInfo)); }
     91 
     92         TWordIdInfo(unsigned id, unsigned len=0, unsigned seen=0, unsigned cost = 0, unsigned gbk = 0, unsigned gb18030 = 0)
     93             : m_id(id), m_bGBK(gbk), m_bGB18030(gb18030), m_len(len), m_cost(cost), m_bSeen(seen) { }
     94 
     95         operator
     96         unsigned int() const { return m_id; }
     97     };
     98 
     99     class TNode {
    100     public:
    101 #if !defined(WORDS_BIGENDIAN)
    102         unsigned            m_nWordId    : 16;
    103         unsigned            m_nTransfer  : 8;
    104         unsigned            m_bGBK       : 1;
    105         unsigned            m_bGB18030   : 1;
    106         unsigned            m_bFullSyllableTransfer: 1;
    107         unsigned            m_bOther     : 5;
    108 
    109 #else
    110         unsigned            m_bOther     : 5;
    111         unsigned            m_bFullSyllableTransfer: 1;
    112         unsigned            m_bGB18030   : 1;
    113         unsigned            m_bGBK       : 1;
    114         unsigned            m_nTransfer  : 8;
    115         unsigned            m_nWordId    : 16;
    116 
    117 #endif
    118 
    119     public:
    120         static unsigned int
    121         size_for(unsigned int nTransfer, unsigned int nWordId)
    122             { return sizeof(TNode) + sizeof(TTransUnit)*nTransfer +
    123                 sizeof(TWordIdInfo)*nWordId; }
    124 
    125     public:
    126         TNode() { *((uint32_t*)this) = 0; }
    127 
    128         bool
    129         hasPinyinChild(void) const
    130             { return (m_nTransfer > 1) ||
    131                 (m_nTransfer ==1 && getTrans()->m_Char != SYLLABLE_BREAKER); }
    132 
    133         const TTransUnit*
    134         getTrans() const
    135             { return (TTransUnit*)(this+1); }
    136 
    137         const TWordIdInfo*
    138         getWordIdPtr() const
    139             { return (TWordIdInfo*)(((char*)(this+1))+sizeof(TTransUnit)*m_nTransfer); }
    140 
    141         unsigned int
    142         transfer(unsigned char c) const;
    143 
    144         inline unsigned int
    145         transfer(unsigned c) const
    146             { return transfer((unsigned char)(c)); }
    147     };
    148 
    149 public:
    150     CPinyinTrie() : m_Size(0), m_mem(NULL), m_words(NULL) { }
    151 
    152     ~CPinyinTrie() { free(); }
    153 
    154     bool
    155     load(const char* fileName);
    156 
    157     bool
    158     isValid(const TNode* pnode, bool allowNonComplete, bool allowGBK=true);
    159 
    160     unsigned int
    161     getRootOffset() const { return 3 * sizeof(unsigned int); }
    162 
    163     const TNode*
    164     getRootNode() const { return (TNode*)(m_mem+getRootOffset()); }
    165 
    166     const TNode*
    167     nodeFromOffset(unsigned int offset) const
    168         { return (offset < getRootOffset())?NULL:((TNode*)(m_mem+offset)); }
    169 
    170     //@{
    171     /** transfer on an char or a string from a specific node*/
    172     //inline const TNode*
    173     //transfer(const TNode* pnode, char c) const
    174     //    { return transfer(pnode, (unsigned char)c); }
    175 
    176     inline const TNode*
    177     transfer(const TNode* pnode, unsigned char c) const
    178         { return nodeFromOffset(pnode->transfer(c)); }
    179 
    180     inline const TNode*
    181     transfer(const TNode* pnode, TWCHAR wc) const
    182         { return nodeFromOffset(pnode->transfer(unsigned(wc))); }
    183 
    184     /*
    185     inline const TNode*
    186     transfer(const TNode* pnode, const char* str) const
    187         { return transfer(pnode, (const unsigned char*)str); }
    188     */
    189 
    190     const TNode*
    191     transfer(const TNode* pnode, const unsigned char* str) const;
    192 
    193     const TNode*
    194     transfer(const TNode* pnode, const TWCHAR* wstr) const;
    195 
    196     /*
    197     inline const TNode*
    198     transfer(const TNode* pnode, const char* str, int nlen) const
    199         { return transfer(pnode, (const unsigned char*)str, nlen); }
    200     */
    201 
    202     const TNode*
    203     transfer(const TNode* pnode, const unsigned char* str, int nlen) const;
    204 
    205     const TNode*
    206     transfer(const TNode* pnode, const TWCHAR* wstr, int nlen) const;
    207     //@}
    208 
    209     //@{
    210     /** transfer on an char or a string from root node*/
    211     /*
    212     inline const TNode*
    213     transfer(const char* str) const
    214         { return transfer(getRootNode(), str); }
    215     */
    216 
    217     inline const TNode*
    218     transfer(const unsigned char* str) const
    219         { return transfer(getRootNode(), str); }
    220 
    221     inline const TNode*
    222     transfer(const TWCHAR* wstr) const
    223         { return transfer(getRootNode(), wstr); }
    224 
    225     /*
    226     inline const TNode*
    227     transfer(const char* str, int nlen) const
    228         { return transfer(getRootNode(), str, nlen); }
    229     */
    230 
    231     inline const TNode*
    232     transfer(const unsigned char* str, int nlen) const
    233         { return transfer(getRootNode(), str, nlen); }
    234 
    235     inline const TNode*
    236     transfer(const TWCHAR* wstr, int nlen) const
    237         { return transfer(getRootNode(), wstr, nlen); }
    238     //@}
    239 
    240     unsigned int
    241     getWordCount(void) const { return *(unsigned int*)m_mem; }
    242 
    243     unsigned int
    244     getNodeCount(void) const { return *(unsigned int*)(m_mem+sizeof(unsigned int)); }
    245 
    246     unsigned int
    247     getStringOffset(void) const { return *(unsigned int*)(m_mem+2*sizeof(unsigned int)); }
    248 
    249     unsigned int
    250     getSimbolId(const TWCHAR* wstr);
    251 
    252     unsigned int
    253     getSimbolId(const wstring & wstr);
    254 
    255     const TWCHAR*
    256     operator[](unsigned int idx) const { return m_words[idx]; }
    257 
    258     int
    259     lengthAt(unsigned int idx) const;
    260 
    261     void
    262     free(void);
    263 
    264     void
    265     print(FILE *fp) const;
    266 
    267 protected:
    268     unsigned int           m_Size;
    269     char                  *m_mem;
    270     TWCHAR               **m_words;
    271 
    272     std::map<wstring, unsigned>  m_SimbolMap;
    273 
    274 protected:
    275     void
    276     print(const TNode* pRoot, std::string& prefix, FILE *fp) const;
    277 };
    278 
    279 #endif
    280