Home | History | Annotate | Download | only in lexicon
      1    0   yongsun /*
      2   82   yongsun  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3   82   yongsun  *
      4   82   yongsun  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5   82   yongsun  *
      6   82   yongsun  * The contents of this file are subject to the terms of either the GNU Lesser
      7   82   yongsun  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8   82   yongsun  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9   82   yongsun  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10   82   yongsun  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11   82   yongsun  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12   82   yongsun  * specific language governing permissions and limitations under the License. When
     13   82   yongsun  * distributing the software, include this License Header Notice in each file and
     14   82   yongsun  * include the full text of the License in the License file as well as the
     15   82   yongsun  * following notice:
     16   82   yongsun  *
     17   82   yongsun  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18   82   yongsun  * (CDDL)
     19   82   yongsun  * For Covered Software in this distribution, this License shall be governed by the
     20   82   yongsun  * laws of the State of California (excluding conflict-of-law provisions).
     21   82   yongsun  * Any litigation relating to this License shall be subject to the jurisdiction of
     22   82   yongsun  * the Federal Courts of the Northern District of California and the state courts
     23   82   yongsun  * of the State of California, with venue lying in Santa Clara County, California.
     24   82   yongsun  *
     25   82   yongsun  * Contributor(s):
     26   82   yongsun  *
     27   82   yongsun  * If you wish your version of this file to be governed by only the CDDL or only
     28   82   yongsun  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29   82   yongsun  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30   82   yongsun  * license." If you don't indicate a single choice of license, a recipient has the
     31   82   yongsun  * option to distribute your version of this file under either the CDDL or the LGPL
     32   82   yongsun  * Version 2.1, or to extend the choice of license to its licensees as provided
     33   82   yongsun  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34   82   yongsun  * Version 2 license, then the option applies only if the new code is made subject
     35   82   yongsun  * to such option by the copyright holder.
     36    0   yongsun  */
     37   82   yongsun 
     38    0   yongsun #ifndef _SIM_PYTRIE_H
     39    0   yongsun #define _SIM_PYTRIE_H
     40    0   yongsun 
     41    0   yongsun #include <stdio.h>
     42    0   yongsun 
     43    0   yongsun #include "../portability.h"
     44    0   yongsun 
     45  314   yongsun #include <stdint.h>
     46    0   yongsun #include <string>
     47    0   yongsun #include <vector>
     48    0   yongsun #include <map>
     49    0   yongsun #include <set>
     50    0   yongsun #include <list>
     51    0   yongsun 
     52    0   yongsun #define WORD_ID_WIDTH       18
     53    0   yongsun 
     54    0   yongsun class CPinyinTrie {
     55    0   yongsun public:
     56    0   yongsun     static const unsigned char SYLLABLE_BREAKER  = '\'';
     57    0   yongsun     friend class CPinyinTrieMaker;
     58    0   yongsun 
     59    0   yongsun     class TTransUnit {
     60    0   yongsun     public:
     61  198  tchaikov #if !defined(WORDS_BIGENDIAN)
     62    0   yongsun         unsigned            m_Char  : 8;
     63    0   yongsun         unsigned            m_Offset: 24;
     64  198  tchaikov #else
     65    0   yongsun         unsigned            m_Offset: 24;
     66    0   yongsun         unsigned            m_Char  : 8;
     67    0   yongsun #endif
     68    0   yongsun     };
     69    0   yongsun 
     70    0   yongsun     class TWordIdInfo {
     71    0   yongsun     public:
     72  198  tchaikov #if !defined(WORDS_BIGENDIAN)
     73    0   yongsun         unsigned            m_id       : WORD_ID_WIDTH;
     74    0   yongsun         unsigned            m_bGBK     : 1;
     75    0   yongsun         unsigned            m_bGB18030 : 1;
     76    0   yongsun         unsigned            m_len      : 6;
     77    0   yongsun         unsigned            m_cost     : 5;
     78    0   yongsun         unsigned            m_bSeen    : 1;
     79  198  tchaikov #else
     80    0   yongsun         unsigned            m_bSeen    : 1;
     81    0   yongsun         unsigned            m_cost     : 5;
     82    0   yongsun         unsigned            m_len      : 6;
     83    0   yongsun         unsigned            m_bGB18030 : 1;
     84    0   yongsun         unsigned            m_bGBK     : 1;
     85    0   yongsun         unsigned            m_id       : WORD_ID_WIDTH;
     86    0   yongsun #endif
     87    0   yongsun 
     88    0   yongsun     public:
     89    0   yongsun         TWordIdInfo()
     90    0   yongsun             { memset(this, 0, sizeof(TWordIdInfo)); }
     91    0   yongsun 
     92    0   yongsun         TWordIdInfo(unsigned id, unsigned len=0, unsigned seen=0, unsigned cost = 0, unsigned gbk = 0, unsigned gb18030 = 0)
     93   34  tchaikov             : m_id(id), m_bGBK(gbk), m_bGB18030(gb18030), m_len(len), m_cost(cost), m_bSeen(seen) { }
     94    0   yongsun 
     95    0   yongsun         operator
     96    0   yongsun         unsigned int() const { return m_id; }
     97    0   yongsun     };
     98    0   yongsun 
     99    0   yongsun     class TNode {
    100    0   yongsun     public:
    101  198  tchaikov #if !defined(WORDS_BIGENDIAN)
    102    0   yongsun         unsigned            m_nWordId    : 16;
    103    0   yongsun         unsigned            m_nTransfer  : 8;
    104    0   yongsun         unsigned            m_bGBK       : 1;
    105    0   yongsun         unsigned            m_bGB18030   : 1;
    106    0   yongsun         unsigned            m_bFullSyllableTransfer: 1;
    107    0   yongsun         unsigned            m_bOther     : 5;
    108    0   yongsun 
    109  198  tchaikov #else
    110    0   yongsun         unsigned            m_bOther     : 5;
    111    0   yongsun         unsigned            m_bFullSyllableTransfer: 1;
    112    0   yongsun         unsigned            m_bGB18030   : 1;
    113    0   yongsun         unsigned            m_bGBK       : 1;
    114    0   yongsun         unsigned            m_nTransfer  : 8;
    115    0   yongsun         unsigned            m_nWordId    : 16;
    116    0   yongsun 
    117    0   yongsun #endif
    118    0   yongsun 
    119    0   yongsun     public:
    120    0   yongsun         static unsigned int
    121    0   yongsun         size_for(unsigned int nTransfer, unsigned int nWordId)
    122    0   yongsun             { return sizeof(TNode) + sizeof(TTransUnit)*nTransfer +
    123    0   yongsun                 sizeof(TWordIdInfo)*nWordId; }
    124    0   yongsun 
    125    0   yongsun     public:
    126  198  tchaikov         TNode() { *((uint32_t*)this) = 0; }
    127    0   yongsun 
    128    0   yongsun         bool
    129    0   yongsun         hasPinyinChild(void) const
    130    0   yongsun             { return (m_nTransfer > 1) ||
    131    0   yongsun                 (m_nTransfer ==1 && getTrans()->m_Char != SYLLABLE_BREAKER); }
    132    0   yongsun 
    133    0   yongsun         const TTransUnit*
    134    0   yongsun         getTrans() const
    135    0   yongsun             { return (TTransUnit*)(this+1); }
    136    0   yongsun 
    137    0   yongsun         const TWordIdInfo*
    138    0   yongsun         getWordIdPtr() const
    139    0   yongsun             { return (TWordIdInfo*)(((char*)(this+1))+sizeof(TTransUnit)*m_nTransfer); }
    140    0   yongsun 
    141    0   yongsun         unsigned int
    142    0   yongsun         transfer(unsigned char c) const;
    143    0   yongsun 
    144    0   yongsun         inline unsigned int
    145    0   yongsun         transfer(unsigned c) const
    146    0   yongsun             { return transfer((unsigned char)(c)); }
    147    0   yongsun     };
    148    0   yongsun 
    149    0   yongsun public:
    150   34  tchaikov     CPinyinTrie() : m_Size(0), m_mem(NULL), m_words(NULL) { }
    151    0   yongsun 
    152    0   yongsun     ~CPinyinTrie() { free(); }
    153    0   yongsun 
    154    0   yongsun     bool
    155    0   yongsun     load(const char* fileName);
    156    0   yongsun 
    157    0   yongsun     bool
    158    0   yongsun     isValid(const TNode* pnode, bool allowNonComplete, bool allowGBK=true);
    159    0   yongsun 
    160    0   yongsun     unsigned int
    161    0   yongsun     getRootOffset() const { return 3 * sizeof(unsigned int); }
    162    0   yongsun 
    163    0   yongsun     const TNode*
    164    0   yongsun     getRootNode() const { return (TNode*)(m_mem+getRootOffset()); }
    165    0   yongsun 
    166    0   yongsun     const TNode*
    167    0   yongsun     nodeFromOffset(unsigned int offset) const
    168    0   yongsun         { return (offset < getRootOffset())?NULL:((TNode*)(m_mem+offset)); }
    169    0   yongsun 
    170    0   yongsun     //@{
    171    0   yongsun     /** transfer on an char or a string from a specific node*/
    172    0   yongsun     //inline const TNode*
    173    0   yongsun     //transfer(const TNode* pnode, char c) const
    174    0   yongsun     //    { return transfer(pnode, (unsigned char)c); }
    175    0   yongsun 
    176    0   yongsun     inline const TNode*
    177    0   yongsun     transfer(const TNode* pnode, unsigned char c) const
    178    0   yongsun         { return nodeFromOffset(pnode->transfer(c)); }
    179    0   yongsun 
    180    0   yongsun     inline const TNode*
    181    0   yongsun     transfer(const TNode* pnode, TWCHAR wc) const
    182    0   yongsun         { return nodeFromOffset(pnode->transfer(unsigned(wc))); }
    183    0   yongsun 
    184    0   yongsun     /*
    185    0   yongsun     inline const TNode*
    186    0   yongsun     transfer(const TNode* pnode, const char* str) const
    187    0   yongsun         { return transfer(pnode, (const unsigned char*)str); }
    188    0   yongsun     */
    189    0   yongsun 
    190    0   yongsun     const TNode*
    191    0   yongsun     transfer(const TNode* pnode, const unsigned char* str) const;
    192    0   yongsun 
    193    0   yongsun     const TNode*
    194    0   yongsun     transfer(const TNode* pnode, const TWCHAR* wstr) const;
    195    0   yongsun 
    196    0   yongsun     /*
    197    0   yongsun     inline const TNode*
    198    0   yongsun     transfer(const TNode* pnode, const char* str, int nlen) const
    199    0   yongsun         { return transfer(pnode, (const unsigned char*)str, nlen); }
    200    0   yongsun     */
    201    0   yongsun 
    202    0   yongsun     const TNode*
    203    0   yongsun     transfer(const TNode* pnode, const unsigned char* str, int nlen) const;
    204    0   yongsun 
    205    0   yongsun     const TNode*
    206    0   yongsun     transfer(const TNode* pnode, const TWCHAR* wstr, int nlen) const;
    207    0   yongsun     //@}
    208    0   yongsun 
    209    0   yongsun     //@{
    210    0   yongsun     /** transfer on an char or a string from root node*/
    211    0   yongsun     /*
    212    0   yongsun     inline const TNode*
    213    0   yongsun     transfer(const char* str) const
    214    0   yongsun         { return transfer(getRootNode(), str); }
    215    0   yongsun     */
    216    0   yongsun 
    217    0   yongsun     inline const TNode*
    218    0   yongsun     transfer(const unsigned char* str) const
    219    0   yongsun         { return transfer(getRootNode(), str); }
    220    0   yongsun 
    221    0   yongsun     inline const TNode*
    222    0   yongsun     transfer(const TWCHAR* wstr) const
    223    0   yongsun         { return transfer(getRootNode(), wstr); }
    224    0   yongsun 
    225    0   yongsun     /*
    226    0   yongsun     inline const TNode*
    227    0   yongsun     transfer(const char* str, int nlen) const
    228    0   yongsun         { return transfer(getRootNode(), str, nlen); }
    229    0   yongsun     */
    230    0   yongsun 
    231    0   yongsun     inline const TNode*
    232    0   yongsun     transfer(const unsigned char* str, int nlen) const
    233    0   yongsun         { return transfer(getRootNode(), str, nlen); }
    234    0   yongsun 
    235    0   yongsun     inline const TNode*
    236    0   yongsun     transfer(const TWCHAR* wstr, int nlen) const
    237    0   yongsun         { return transfer(getRootNode(), wstr, nlen); }
    238    0   yongsun     //@}
    239    0   yongsun 
    240    0   yongsun     unsigned int
    241    0   yongsun     getWordCount(void) const { return *(unsigned int*)m_mem; }
    242    0   yongsun 
    243    0   yongsun     unsigned int
    244    0   yongsun     getNodeCount(void) const { return *(unsigned int*)(m_mem+sizeof(unsigned int)); }
    245    0   yongsun 
    246    0   yongsun     unsigned int
    247    0   yongsun     getStringOffset(void) const { return *(unsigned int*)(m_mem+2*sizeof(unsigned int)); }
    248    0   yongsun 
    249    0   yongsun     unsigned int
    250    0   yongsun     getSimbolId(const TWCHAR* wstr);
    251    0   yongsun 
    252    0   yongsun     unsigned int
    253    0   yongsun     getSimbolId(const wstring & wstr);
    254    0   yongsun 
    255    0   yongsun     const TWCHAR*
    256    0   yongsun     operator[](unsigned int idx) const { return m_words[idx]; }
    257    0   yongsun 
    258    0   yongsun     int
    259    0   yongsun     lengthAt(unsigned int idx) const;
    260    0   yongsun 
    261    0   yongsun     void
    262    0   yongsun     free(void);
    263    0   yongsun 
    264    0   yongsun     void
    265    0   yongsun     print(FILE *fp) const;
    266    0   yongsun 
    267    0   yongsun protected:
    268    0   yongsun     unsigned int           m_Size;
    269    0   yongsun     char                  *m_mem;
    270    0   yongsun     TWCHAR               **m_words;
    271    0   yongsun 
    272    0   yongsun     std::map<wstring, unsigned>  m_SimbolMap;
    273    0   yongsun 
    274    0   yongsun protected:
    275    0   yongsun     void
    276    0   yongsun     print(const TNode* pRoot, std::string& prefix, FILE *fp) const;
    277    0   yongsun };
    278    0   yongsun 
    279    0   yongsun #endif
    280