Home | History | Annotate | Download | only in lexicon
      1   0   yongsun /*
      2  82   yongsun  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3  82   yongsun  *
      4  82   yongsun  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5  82   yongsun  *
      6  82   yongsun  * The contents of this file are subject to the terms of either the GNU Lesser
      7  82   yongsun  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8  82   yongsun  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9  82   yongsun  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10  82   yongsun  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11  82   yongsun  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12  82   yongsun  * specific language governing permissions and limitations under the License. When
     13  82   yongsun  * distributing the software, include this License Header Notice in each file and
     14  82   yongsun  * include the full text of the License in the License file as well as the
     15  82   yongsun  * following notice:
     16  82   yongsun  *
     17  82   yongsun  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18  82   yongsun  * (CDDL)
     19  82   yongsun  * For Covered Software in this distribution, this License shall be governed by the
     20  82   yongsun  * laws of the State of California (excluding conflict-of-law provisions).
     21  82   yongsun  * Any litigation relating to this License shall be subject to the jurisdiction of
     22  82   yongsun  * the Federal Courts of the Northern District of California and the state courts
     23  82   yongsun  * of the State of California, with venue lying in Santa Clara County, California.
     24  82   yongsun  *
     25  82   yongsun  * Contributor(s):
     26  82   yongsun  *
     27  82   yongsun  * If you wish your version of this file to be governed by only the CDDL or only
     28  82   yongsun  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29  82   yongsun  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30  82   yongsun  * license." If you don't indicate a single choice of license, a recipient has the
     31  82   yongsun  * option to distribute your version of this file under either the CDDL or the LGPL
     32  82   yongsun  * Version 2.1, or to extend the choice of license to its licensees as provided
     33  82   yongsun  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34  82   yongsun  * Version 2 license, then the option applies only if the new code is made subject
     35  82   yongsun  * to such option by the copyright holder.
     36   0   yongsun  */
     37  82   yongsun 
     38   0   yongsun #ifdef HAVE_CONFIG_H
     39   0   yongsun #include "config.h"
     40   0   yongsun #endif
     41   0   yongsun 
     42   0   yongsun #ifdef HAVE_ASSERT_H
     43   0   yongsun #include <assert.h>
     44   0   yongsun #endif
     45   0   yongsun 
     46   0   yongsun #include <algorithm>
     47  90   tonylee 
     48  90   tonylee #ifdef HAVE_ICONV_H
     49   0   yongsun #include <iconv.h>
     50  90   tonylee #endif
     51   0   yongsun 
     52   0   yongsun #include "pytrie_gen.h"
     53   0   yongsun 
     54   0   yongsun static const char*
     55   0   yongsun skipSpace(const char* p)
     56   0   yongsun {
     57   0   yongsun     while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r')
     58   0   yongsun         ++p;
     59   0   yongsun     return p;
     60   0   yongsun }
     61   0   yongsun 
     62   0   yongsun static const char*
     63   0   yongsun skipNonSpace(const char* p)
     64   0   yongsun {
     65   0   yongsun     while (*p != '\0' && *p != ' ' && *p != '\t' && *p != '\n' && *p != '\r')
     66   0   yongsun         ++p;
     67   0   yongsun     return p;
     68   0   yongsun }
     69   0   yongsun 
     70   0   yongsun static void
     71   0   yongsun insertWordId(CPinyinTrieMaker::CWordSet& idset, CPinyinTrieMaker::TWordId id)
     72   0   yongsun {
     73   0   yongsun     CPinyinTrieMaker::CWordSet::iterator it = idset.find(id);
     74   0   yongsun     if (it == idset.end())
     75   0   yongsun         idset.insert(id);
     76   0   yongsun     else {
     77   0   yongsun         const CPinyinTrieMaker::TWordId& a = *it;
     78   0   yongsun         if ((a.anony.m_bHide && !id.anony.m_bHide) || (a.anony.m_bHide == id.anony.m_bHide && a.anony.m_cost > id.anony.m_cost)) {
     79   0   yongsun             idset.erase(it);
     80   0   yongsun             idset.insert(id);
     81   0   yongsun         }
     82   0   yongsun     }
     83   0   yongsun }
     84   0   yongsun 
     85   0   yongsun struct TSyllableInfo {
     86   0   yongsun     std::string   m_py;
     87   0   yongsun     int           m_cost;
     88   0   yongsun 
     89   0   yongsun     TSyllableInfo(const char* py=NULL, int cost=0) : m_py(py), m_cost(cost) {}
     90   0   yongsun     bool operator< (const TSyllableInfo& b) const { return m_py < b.m_py; }
     91   0   yongsun };
     92   0   yongsun 
     93  90   tonylee #ifdef HAVE_ICONV_H
     94   0   yongsun bool isCorrectConverted(const char* utf8, iconv_t ic, iconv_t ric)
     95   0   yongsun {
     96   0   yongsun     static char gbstr[256];
     97   0   yongsun     static char utstr[256];
     98   0   yongsun 
     99   0   yongsun     TIConvSrcPtr src = (TIConvSrcPtr)utf8;
    100   0   yongsun     size_t srclen = strlen((char*)src)+1;
    101   0   yongsun     char* dst = (char *)gbstr;
    102   0   yongsun     size_t dstlen = 256;
    103   0   yongsun     size_t res = iconv(ic, &src, &srclen, &dst, &dstlen);
    104   0   yongsun 
    105   0   yongsun     if (res != size_t(-1) && srclen == 0) {
    106   0   yongsun         // do revert convertion and compare them
    107   0   yongsun         src = (TIConvSrcPtr)gbstr;
    108   0   yongsun         srclen = strlen((char*)src)+1;
    109   0   yongsun         dst = (char *)utstr;
    110   0   yongsun         dstlen = 256;
    111   0   yongsun         res = iconv(ric, &src, &srclen, &dst, &dstlen);
    112   0   yongsun         if (res != size_t(-1) && srclen == 0)
    113   0   yongsun             return (strcmp(utf8, utstr) == 0);
    114   0   yongsun     }
    115   0   yongsun     return false;
    116   0   yongsun }
    117   0   yongsun 
    118   0   yongsun //return: bit 0x1: contains some gbk out of gb2312, bit 0x2: contains some gb18030 outof gbk
    119   0   yongsun unsigned getPureGBEncoding(const char* utf8str)
    120   0   yongsun {
    121   0   yongsun     static iconv_t ic_gb = iconv_open("GB2312", "UTF-8");
    122   0   yongsun     static iconv_t ic_gbk = iconv_open("GBK", "UTF-8");
    123   0   yongsun     static iconv_t ric_gb = iconv_open("UTF-8", "GB2312");
    124   0   yongsun     static iconv_t ric_gbk = iconv_open("UTF-8", "GBK");
    125   0   yongsun 
    126   0   yongsun     unsigned ret = 0;
    127   0   yongsun 
    128   0   yongsun     if (!isCorrectConverted(utf8str, ic_gb, ric_gb)) {
    129   0   yongsun         ret = 1; // at least it is contains some GBK char
    130   0   yongsun         if (!isCorrectConverted(utf8str, ic_gbk, ric_gbk))
    131   0   yongsun             ret = 3; //contains some GB18030-only char
    132   0   yongsun 
    133   0   yongsun         #ifdef DEBUG
    134   0   yongsun             fprintf(stderr, "==> GB category of (%s) is (0x%x)\n ", utf8str, ret);
    135   0   yongsun             fflush(stderr);
    136   0   yongsun         #endif
    137   0   yongsun     }
    138   0   yongsun     return ret;
    139   0   yongsun }
    140  90   tonylee #else // !HAVE_ICONV_H
    141  90   tonylee unsigned getPureGBEncoding(const char* utf8str)
    142  90   tonylee {
    143  90   tonylee     // FIXME
    144  90   tonylee     return 0x3;
    145  90   tonylee }
    146  90   tonylee #endif // HAVE_ICONV_H
    147   0   yongsun 
    148   0   yongsun bool
    149   0   yongsun parseLine(char* buf, char* word_buf, int& id, std::set<TSyllableInfo>& pyset)
    150   0   yongsun {
    151   0   yongsun     pyset.clear();
    152   5  ys148558 
    153   5  ys148558     /* ignore the empty lines and comment lines */
    154   5  ys148558     if (*buf == '\n' || *buf == '#')
    155   5  ys148558         return 0;
    156   5  ys148558 
    157   0   yongsun     char* word_start = word_buf;
    158   0   yongsun     char* p = (char*)skipSpace(buf);
    159   0   yongsun     char* t = (char*)skipNonSpace(p);
    160   0   yongsun     while(p < t) *word_buf++ = *p++;
    161   0   yongsun     *word_buf = 0;
    162   0   yongsun 
    163   0   yongsun     p = (char*)skipSpace(p);
    164   0   yongsun     t = (char*)skipNonSpace(p);
    165   0   yongsun     if (*t)
    166   0   yongsun         *t++ = 0;
    167   0   yongsun     id = atoi(p);
    168   0   yongsun     p = (char*)skipSpace(t);
    169   0   yongsun     while (*p) {
    170   0   yongsun         const char* s = p;
    171   0   yongsun         t = (char*)skipNonSpace(p);
    172   0   yongsun         if (*t)
    173   0   yongsun             *t++ = 0;
    174   0   yongsun         while ((*p >= 'a' && *p <= 'z') || (*p == CPinyinTrie::SYLLABLE_BREAKER))
    175   0   yongsun             ++p;
    176   0   yongsun         if ((p > s) && ((*p == 0) || (*p == ':'))) {
    177   0   yongsun             int  cost = 0;
    178   0   yongsun             if (*p == ':') {
    179   0   yongsun                 *p++ = 0;
    180   0   yongsun                 cost = atoi(p);
    181   0   yongsun             }
    182   0   yongsun             pyset.insert(TSyllableInfo(s, cost));
    183   0   yongsun         }
    184   0   yongsun         p = (char*)skipSpace(t);
    185   0   yongsun     }
    186   0   yongsun     return pyset.size() > 0;
    187   0   yongsun }
    188   0   yongsun 
    189   0   yongsun 
    190   0   yongsun CPinyinTrieMaker::CPinyinTrieMaker()
    191   0   yongsun     : m_RootNode(), m_FullSyllables(), m_StateMap(), m_AllNodes()
    192   0   yongsun {
    193   0   yongsun     m_AllNodes.push_back(&m_RootNode);
    194   0   yongsun     m_RootNode.m_bExpanded = true;
    195   0   yongsun     m_RootNode.m_PrimitiveNodes.insert(&m_RootNode);
    196   0   yongsun     m_StateMap[&(m_RootNode.m_PrimitiveNodes)] = &m_RootNode;
    197   0   yongsun }
    198   0   yongsun /**********************************************************
    199   0   yongsun     lexicon
    200   0   yongsun         TAB(1)
    201   0   yongsun          word id
    202   0   yongsun         '
    203   0   yongsun         4095;
    204   0   yongsun **********************************************************/
    205  44   yongsun #define RARE_MULTI_PHONETIC_STARTING_ID 140000 /* FIXME */
    206   0   yongsun bool
    207   0   yongsun CPinyinTrieMaker::constructFromLexicon(const char* fileName)
    208   0   yongsun {
    209  44   yongsun     static int  rmp_id = RARE_MULTI_PHONETIC_STARTING_ID;
    210   0   yongsun     static char buf[4096];
    211   0   yongsun     static char word_buf[2048];
    212   0   yongsun 
    213   0   yongsun     int id;
    214   0   yongsun     bool suc = true;
    215   0   yongsun     std::set<TSyllableInfo> pyset;
    216   0   yongsun     FILE *fp = fopen(fileName, "r");
    217   0   yongsun     printf("Adding pinyin and corresponding words..."); fflush(stdout);
    218   0   yongsun     while (fgets(buf, 4096, fp) != NULL) {
    219   0   yongsun         if (!parseLine(buf, word_buf, id, pyset)) {
    220   0   yongsun             if (word_buf[0] != L'<' && word_buf[0] != 0) {
    221   0   yongsun                 if (m_Lexicon.size() < id+1) m_Lexicon.resize(id+1);
    222   0   yongsun                 m_Lexicon[id] = std::string(word_buf);
    223   0   yongsun             }
    224   0   yongsun             continue;
    225   0   yongsun         }
    226   0   yongsun         unsigned gbcategory = getPureGBEncoding(word_buf);
    227   0   yongsun 
    228   0   yongsun         std::set<TSyllableInfo>::iterator its = pyset.begin();
    229   0   yongsun         std::set<TSyllableInfo>::iterator ite = pyset.end();
    230   0   yongsun         for (; its != ite; ++its) {
    231   0   yongsun             const char *t = its->m_py.c_str();
    232   0   yongsun             int cost = its->m_cost;
    233  44   yongsun             int myid = id;
    234  44   yongsun 
    235   0   yongsun             if (cost < 0) {
    236   0   yongsun                 cost = 30 / (-cost);
    237  44   yongsun                 myid = rmp_id ++;
    238  44   yongsun             }
    239  44   yongsun 
    240  44   yongsun             if (m_Lexicon.size() < myid+1) m_Lexicon.resize(myid+1);
    241  44   yongsun             m_Lexicon[myid] = std::string(word_buf);
    242  44   yongsun 
    243  44   yongsun             CPinyinTrieMaker::TWordId wid(myid, cost, its->m_cost < 0, gbcategory & 0x1, gbcategory & 0x2);
    244   0   yongsun             suc = insertFullPinyinPair(t, wid) && suc;
    245   0   yongsun 
    246   0   yongsun             while (*t) {
    247   0   yongsun                 char *p = buf;
    248   0   yongsun                 while (*t != 0 && *t != CPinyinTrie::SYLLABLE_BREAKER)
    249   0   yongsun                     *p++ = *t++;
    250   0   yongsun                 *p = 0;
    251   0   yongsun                 registerFullSyllable(buf);
    252   0   yongsun                 if (*t == CPinyinTrie::SYLLABLE_BREAKER)
    253   0   yongsun                     ++t;
    254   0   yongsun             }
    255   0   yongsun         }
    256   0   yongsun     }
    257   0   yongsun     fclose(fp);
    258   0   yongsun 
    259   0   yongsun     std::string pyPrefix = "";
    260   0   yongsun 
    261   0   yongsun     printf("\n    %d primitive nodes", m_AllNodes.size());  fflush(stdout);
    262   0   yongsun 
    263   0   yongsun     /*
    264   0   yongsun     printf("\n    Printing it to stderr...");
    265   0   yongsun     print(stderr, &m_RootNode, pyPrefix);
    266   0   yongsun     */
    267   0   yongsun 
    268   0   yongsun     printf("\nThreading non-complete pinyin...");  fflush(stdout);
    269   0   yongsun     suc = threadNonCompletePinyin() && suc;
    270   0   yongsun     printf("\n    %d total nodes", m_AllNodes.size());  fflush(stdout);
    271   0   yongsun 
    272   0   yongsun     /*
    273   0   yongsun     printf("\n    Printing it to stderr...");
    274   0   yongsun     fprintf(stderr, "\n\n\n\n-----------------------------\n\n\n\n");
    275   0   yongsun     print(stderr, &m_RootNode, pyPrefix);
    276   0   yongsun     */
    277   0   yongsun 
    278   0   yongsun     printf("\n");  fflush(stdout);
    279   0   yongsun 
    280   0   yongsun     return suc;
    281   0   yongsun }
    282   0   yongsun 
    283   0   yongsun CPinyinTrieMaker::TNode::TNode()
    284   0   yongsun     : m_bFullSyllableTransfer(false), m_bExpanded(false), m_WordIdSet(),
    285   0   yongsun       m_Trans(), m_PrimitiveNodes(), m_SyllablePrefix()
    286   0   yongsun {
    287   0   yongsun }
    288   0   yongsun 
    289   0   yongsun bool
    290   0   yongsun CPinyinTrieMaker::PNodeSet::operator< (const PNodeSet& another) const
    291   0   yongsun {
    292   0   yongsun     CNodeSet::const_iterator t1 = m_pns->begin();
    293   0   yongsun     CNodeSet::const_iterator t2 = m_pns->end();
    294   0   yongsun     CNodeSet::const_iterator a1 = another.m_pns->begin();
    295   0   yongsun     CNodeSet::const_iterator a2 = another.m_pns->end();
    296   0   yongsun     for (; t1 != t2 && a1 != a2; ++t1, ++a1) {
    297   0   yongsun         if (*t1 < *a1) return true;
    298   0   yongsun         if (*t1 > *a1) return false;
    299   0   yongsun     }
    300   0   yongsun     return (a1 != a2);
    301   0   yongsun }
    302   0   yongsun 
    303   0   yongsun bool
    304   0   yongsun CPinyinTrieMaker::PNodeSet::operator==(const PNodeSet& another) const
    305   0   yongsun {
    306   0   yongsun     CNodeSet::const_iterator t1 = m_pns->begin();
    307   0   yongsun     CNodeSet::const_iterator t2 = m_pns->end();
    308   0   yongsun     CNodeSet::const_iterator a1 = another.m_pns->begin();
    309   0   yongsun     CNodeSet::const_iterator a2 = another.m_pns->end();
    310   0   yongsun     for (; t1 != t2 && a1 != a2; ++t1, ++a1) {
    311   0   yongsun         if (*t1 != *a1) return false;
    312   0   yongsun     }
    313   0   yongsun     return (a1 == a2 && t1 != t2);
    314   0   yongsun }
    315   0   yongsun 
    316   0   yongsun 
    317   0   yongsun void
    318   0   yongsun CPinyinTrieMaker::print(FILE* fp, TNode* root, std::string& pinyin)
    319   0   yongsun {
    320   0   yongsun     if (root && root->m_WordIdSet.size() > 0) {
    321   0   yongsun         fprintf(fp, "%s", pinyin.c_str());
    322   0   yongsun         CWordSet::iterator itId = root->m_WordIdSet.begin();
    323   0   yongsun         CWordSet::iterator itIdLast = root->m_WordIdSet.end();
    324   0   yongsun         for (; itId != itIdLast; ++itId) {
    325   0   yongsun             fprintf(fp, " %s", m_Lexicon[itId->anony.m_id].c_str());
    326   0   yongsun         }
    327   0   yongsun         fprintf(fp, "\n");
    328   0   yongsun     }
    329   0   yongsun     if (root) {
    330   0   yongsun         CTrans::iterator itTrans = root->m_Trans.begin();
    331   0   yongsun         CTrans::iterator itTransLast = root->m_Trans.end();
    332   0   yongsun         for (; itTrans != itTransLast; ++itTrans) {
    333   0   yongsun             pinyin += itTrans->first;
    334   0   yongsun             print(fp, itTrans->second, pinyin);
    335   0   yongsun             pinyin.resize(pinyin.size() -1);
    336   0   yongsun         }
    337   0   yongsun     }
    338   0   yongsun }
    339   0   yongsun 
    340   0   yongsun 
    341   0   yongsun /***********************************************************
    342   0   yongsun     
    343   0   yongsun     pinyin trie
    344   0   yongsun ***********************************************************/
    345   0   yongsun bool
    346   0   yongsun CPinyinTrieMaker::registerFullSyllable(const char* pinyin)
    347   0   yongsun {
    348   0   yongsun     CPinyinString str(pinyin);
    349   0   yongsun     m_FullSyllables.insert(str);
    350   0   yongsun     return true;
    351   0   yongsun }
    352   0   yongsun 
    353   0   yongsun 
    354   0   yongsun CPinyinTrieMaker::TNode*
    355   0   yongsun CPinyinTrieMaker::insertTransfer(TNode* pnode, unsigned char c)
    356   0   yongsun {
    357   0   yongsun     CTrans::iterator itt = pnode->m_Trans.find(c);
    358   0   yongsun     CTrans::iterator ite = pnode->m_Trans.end();
    359   0   yongsun     if (itt == ite) {
    360   0   yongsun         TNode *p = new TNode();
    361   0   yongsun         m_AllNodes.push_back(p);
    362   0   yongsun         pnode->m_Trans[c] = p;
    363   0   yongsun         if (c != CPinyinTrie::SYLLABLE_BREAKER) {
    364   0   yongsun             p->m_SyllablePrefix = pnode->m_SyllablePrefix;
    365   0   yongsun             p->m_SyllablePrefix += c;
    366   0   yongsun         }
    367   0   yongsun         return p;
    368   0   yongsun     }
    369   0   yongsun     return itt->second;
    370   0   yongsun }
    371   0   yongsun 
    372   0   yongsun /***********************************************************
    373   0   yongsun     pinyin:   != NULL, 26'
    374   0   yongsun                >0''
    375   0   yongsun     wid   :   word id
    376   0   yongsun ***********************************************************/
    377   0   yongsun bool
    378   0   yongsun CPinyinTrieMaker::insertFullPinyinPair(const char* pinyin, TWordId wid)
    379   0   yongsun {
    380   0   yongsun     const char* p = pinyin;
    381   0   yongsun     TNode *pnode = &m_RootNode;
    382   0   yongsun     for (; *p; ++p) {
    383   0   yongsun         if (*p == CPinyinTrie::SYLLABLE_BREAKER)
    384   0   yongsun             pnode->m_bFullSyllableTransfer = true;
    385   0   yongsun         pnode = insertTransfer(pnode, (unsigned char)*p);
    386   0   yongsun         pnode->m_bExpanded = true;
    387   0   yongsun         pnode->m_PrimitiveNodes.insert(pnode);
    388   0   yongsun         m_StateMap[&(pnode->m_PrimitiveNodes)] = pnode;
    389   0   yongsun     }
    390   0   yongsun     if (*p-1 != CPinyinTrie::SYLLABLE_BREAKER) {
    391   0   yongsun         pnode->m_bFullSyllableTransfer = true;
    392   0   yongsun         pnode = insertTransfer(pnode, CPinyinTrie::SYLLABLE_BREAKER);
    393   0   yongsun         pnode->m_bExpanded = true;
    394   0   yongsun         pnode->m_PrimitiveNodes.insert(pnode);
    395   0   yongsun         m_StateMap[&(pnode->m_PrimitiveNodes)] = pnode;
    396   0   yongsun     }
    397   0   yongsun     insertWordId(pnode->m_WordIdSet, wid);
    398   0   yongsun     return true;
    399   0   yongsun }
    400   0   yongsun 
    401   0   yongsun bool
    402   0   yongsun CPinyinTrieMaker::threadNonCompletePinyin(void)
    403   0   yongsun {
    404   0   yongsun     CNodeList::iterator itNode = m_AllNodes.begin();
    405   0   yongsun     for (; itNode != m_AllNodes.end(); ++itNode) {
    406   0   yongsun         TNode* pnode = *itNode;
    407   0   yongsun         if (!pnode->m_bExpanded)
    408   0   yongsun             expandNode(pnode);
    409   0   yongsun 
    410   0   yongsun         if (pnode->m_SyllablePrefix.size() > 0 &&
    411   0   yongsun                 m_FullSyllables.find(pnode->m_SyllablePrefix) == m_FullSyllables.end() &&
    412   0   yongsun                 pnode->m_Trans.find(CPinyinTrie::SYLLABLE_BREAKER) == pnode->m_Trans.end()) {
    413   0   yongsun             addNonCompleteSyllableTransfer(pnode);
    414   0   yongsun         }
    415   0   yongsun     }
    416   0   yongsun     return true;
    417   0   yongsun }
    418   0   yongsun 
    419   0   yongsun /**
    420   0   yongsun * For those node which are added after fullComplete PINYIN string,
    421   0   yongsun * give their succ nodes, for example, for "dian'ying dui'yu dian'hua", an
    422   0   yongsun * new state "d'" would be expand to "d'y d'h"
    423   0   yongsun */
    424   0   yongsun void
    425   0   yongsun CPinyinTrieMaker::expandNode(TNode* pnode)
    426   0   yongsun {
    427   0   yongsun     std::map<char, CNodeSet> combTrans;
    428   0   yongsun 
    429   0   yongsun     CNodeSet::iterator itNode = pnode->m_PrimitiveNodes.begin();
    430   0   yongsun     CNodeSet::iterator itNodeLast = pnode->m_PrimitiveNodes.end();
    431   0   yongsun     for (; itNode != itNodeLast; ++itNode) {
    432   0   yongsun          CTrans::iterator itTrans = (*itNode)->m_Trans.begin();
    433   0   yongsun          CTrans::iterator itTransLast = (*itNode)->m_Trans.end();
    434   0   yongsun          for (; itTrans != itTransLast; ++itTrans) {
    435   0   yongsun              if (itTrans->first == CPinyinTrie::SYLLABLE_BREAKER &&
    436   0   yongsun                         !(*itNode)->m_bFullSyllableTransfer) {
    437   0   yongsun                  continue;
    438   0   yongsun              }
    439   0   yongsun              combTrans[itTrans->first].insert(itTrans->second);
    440   0   yongsun          }
    441   0   yongsun     }
    442   0   yongsun 
    443   0   yongsun     std::map<char, CNodeSet>::iterator itCombTrans = combTrans.begin();
    444   0   yongsun     std::map<char, CNodeSet>::iterator itCombTransLast = combTrans.end();
    445   0   yongsun     for (; itCombTrans != itCombTransLast; ++itCombTrans) {
    446   0   yongsun         //if a new state, ie new node set appear, create the new state --> ps
    447   0   yongsun         //esle let ps = the founded old state, let transfer(c) = state
    448   0   yongsun         unsigned char c = itCombTrans->first;
    449   0   yongsun //        if (c == CPinyinTrie::SYLLABLE_BREAKER && !pnode->m_bFullSyllableTransfer) {
    450   0   yongsun //            continue;
    451   0   yongsun //        }
    452   0   yongsun         TNode* pChildNode = NULL;
    453   0   yongsun         CStateMap::iterator itStateMap = m_StateMap.find(&itCombTrans->second);
    454   0   yongsun         if (itStateMap != m_StateMap.end()) {
    455   0   yongsun             pChildNode = itStateMap->second;
    456   0   yongsun         } else {
    457   0   yongsun             pChildNode = new TNode();
    458   0   yongsun             m_AllNodes.push_back(pChildNode);
    459   0   yongsun             pChildNode->m_PrimitiveNodes = itCombTrans->second;
    460   0   yongsun             m_StateMap[&pChildNode->m_PrimitiveNodes] = pChildNode;
    461   0   yongsun             if (c != CPinyinTrie::SYLLABLE_BREAKER) {
    462   0   yongsun                 pChildNode->m_SyllablePrefix = pnode->m_SyllablePrefix;
    463   0   yongsun                 pChildNode->m_SyllablePrefix += c;
    464   0   yongsun             } else {
    465   0   yongsun                 CNodeSet::iterator itps = pChildNode->m_PrimitiveNodes.begin();
    466   0   yongsun                 CNodeSet::iterator itpse = pChildNode->m_PrimitiveNodes.end();
    467   0   yongsun                 for (; itps != itpse; ++itps) {
    468   0   yongsun                     CWordSet::iterator ita = (*itps)->m_WordIdSet.begin();
    469   0   yongsun                     CWordSet::iterator itb = (*itps)->m_WordIdSet.end();
    470   0   yongsun                     for (; ita != itb; ++ita)
    471   0   yongsun                         insertWordId(pChildNode->m_WordIdSet, *ita);
    472   0   yongsun                     //pChildNode->m_WordIdSet.insert((*itps)->m_WordIdSet.begin(), (*itps)->m_WordIdSet.end());
    473   0   yongsun                 }
    474   0   yongsun                 pnode->m_bFullSyllableTransfer = false;
    475   0   yongsun             }
    476   0   yongsun         }
    477   0   yongsun         pnode->m_Trans[c] = pChildNode;
    478   0   yongsun     }
    479   0   yongsun     pnode->m_bExpanded = true;
    480   0   yongsun }
    481   0   yongsun 
    482   0   yongsun void
    483   0   yongsun CPinyinTrieMaker::addNonCompleteSyllableTransfer(TNode* pnode)
    484   0   yongsun {
    485   0   yongsun     CNodeSet syChildren;
    486   0   yongsun     TNode* pChildNode = NULL;
    487   0   yongsun 
    488   0   yongsun     findSyllableChildren(pnode, syChildren);
    489   0   yongsun     if (syChildren.size() == 0)  //z, c, s with only zh, ch, sh children
    490   0   yongsun         return;
    491   0   yongsun     CStateMap::iterator itStateMap = m_StateMap.find(&syChildren);
    492   0   yongsun     if (itStateMap != m_StateMap.end()) {
    493   0   yongsun         pChildNode = itStateMap->second;
    494   0   yongsun     } else {
    495   0   yongsun         pChildNode = new TNode();
    496   0   yongsun         m_AllNodes.push_back(pChildNode);
    497   0   yongsun         pChildNode->m_PrimitiveNodes = syChildren;
    498   0   yongsun         m_StateMap[&pChildNode->m_PrimitiveNodes] = pChildNode;
    499   0   yongsun         CNodeSet::iterator itps = pChildNode->m_PrimitiveNodes.begin();
    500   0   yongsun         CNodeSet::iterator itpse = pChildNode->m_PrimitiveNodes.end();
    501   0   yongsun         for (; itps != itpse; ++itps) {
    502   0   yongsun             CWordSet::iterator ita = (*itps)->m_WordIdSet.begin();
    503   0   yongsun             CWordSet::iterator itb = (*itps)->m_WordIdSet.end();
    504   0   yongsun             for (; ita != itb; ++ita)
    505   0   yongsun                 insertWordId(pChildNode->m_WordIdSet, *ita);
    506   0   yongsun             //pChildNode->m_WordIdSet.insert((*itps)->m_WordIdSet.begin(), (*itps)->m_WordIdSet.end());
    507   0   yongsun         }
    508   0   yongsun     }
    509   0   yongsun     pnode->m_Trans[CPinyinTrie::SYLLABLE_BREAKER] = pChildNode;
    510   0   yongsun     // this is default: pnode->m_bFullSyllableTransfer = false;
    511   0   yongsun }
    512   0   yongsun 
    513   0   yongsun int
    514   0   yongsun CPinyinTrieMaker::findSyllableChildren(const TNode *pn, CNodeSet& children)
    515   0   yongsun {
    516   0   yongsun     CNodeSet::iterator itNode = pn->m_PrimitiveNodes.begin();
    517   0   yongsun     CNodeSet::iterator itNodeLast = pn->m_PrimitiveNodes.end();
    518   0   yongsun     for (children.clear(); itNode != itNodeLast; ++itNode) {
    519   0   yongsun         findPrimitiveSyllableChildren(*itNode, children);
    520   0   yongsun     }
    521   0   yongsun     return children.size();
    522   0   yongsun }
    523   0   yongsun 
    524   0   yongsun void
    525   0   yongsun CPinyinTrieMaker::findPrimitiveSyllableChildren(const TNode *pn, CNodeSet& children)
    526   0   yongsun {
    527   0   yongsun     CTrans::const_iterator it = pn->m_Trans.begin();
    528   0   yongsun     CTrans::const_iterator ite= pn->m_Trans.end();
    529   0   yongsun     for (; it != ite; ++it) {
    530   0   yongsun         if (it->first != CPinyinTrie::SYLLABLE_BREAKER) {
    531   0   yongsun             if (it->first == 'h' &&
    532   0   yongsun                 (pn->m_SyllablePrefix == "c" || pn->m_SyllablePrefix == "z" ||
    533   0   yongsun                  pn->m_SyllablePrefix == "s" ) ) {
    534   0   yongsun                 continue;
    535   0   yongsun             }
    536   0   yongsun             findPrimitiveSyllableChildren(it->second, children);
    537   0   yongsun         } else {
    538   0   yongsun             if (pn->m_bFullSyllableTransfer) {
    539   0   yongsun                 children.insert(it->second);
    540   0   yongsun             }
    541   0   yongsun         }
    542   0   yongsun     }
    543   0   yongsun }
    544   0   yongsun 
    545   0   yongsun bool
    546   0   yongsun CPinyinTrieMaker::write(const char* fileName, CWordEvaluator* psrt)
    547   0   yongsun {
    548   0   yongsun     bool suc = false;
    549   0   yongsun     FILE* fp = fopen(fileName, "wb");
    550   0   yongsun     if (fp != NULL) {
    551   0   yongsun         suc = write(fp, psrt);
    552   0   yongsun         fclose(fp);
    553   0   yongsun     }
    554   0   yongsun     return suc;
    555   0   yongsun }
    556   0   yongsun 
    557   0   yongsun bool
    558   0   yongsun CPinyinTrieMaker::write(FILE *fp, CWordEvaluator* psrt)
    559   0   yongsun {
    560   0   yongsun     bool suc = true;
    561   0   yongsun     static TWCHAR wbuf[1024];
    562   0   yongsun 
    563   0   yongsun     std::map<TNode*, unsigned int> nodeOffsetMap;
    564   0   yongsun 
    565   0   yongsun     /*the file started with m_nWord, the size itself do not included here*/
    566   0   yongsun     unsigned int nWord = m_Lexicon.size();
    567   0   yongsun     unsigned int nNode = m_AllNodes.size();
    568   0   yongsun     unsigned int lexiconOffset;
    569   0   yongsun     unsigned int offset = sizeof(unsigned int) * 3;
    570   0   yongsun 
    571   0   yongsun     CNodeList::const_iterator itNode = m_AllNodes.begin();
    572   0   yongsun     CNodeList::const_iterator itNodeLast = m_AllNodes.end();
    573   0   yongsun     for (; itNode != itNodeLast; ++itNode) {
    574   0   yongsun         nodeOffsetMap[*itNode] = offset;
    575   0   yongsun         offset += CPinyinTrie::TNode::size_for((*itNode)->m_Trans.size(),
    576   0   yongsun                                                (*itNode)->m_WordIdSet.size());
    577   0   yongsun     }
    578   0   yongsun     lexiconOffset = offset;
    579   0   yongsun     CLexicon::const_iterator itWordStr = m_Lexicon.begin();
    580   0   yongsun     CLexicon::const_iterator itWordStrLast = m_Lexicon.end();
    581   0   yongsun     for (; itWordStr != itWordStrLast; ++itWordStr) {
    582   0   yongsun         MBSTOWCS(wbuf, itWordStr->c_str(), 1024);
    583   0   yongsun         int sz = WCSLEN(wbuf);
    584   0   yongsun         offset += (sz+1)*sizeof(TWCHAR);
    585   0   yongsun     }
    586   0   yongsun 
    587   0   yongsun     suc = (fwrite(&offset, sizeof(unsigned int), 1, fp) == 1);
    588   0   yongsun     suc = (fwrite(&nWord, sizeof(unsigned int), 1, fp) == 1);
    589   0   yongsun     suc = (fwrite(&nNode, sizeof(unsigned int), 1, fp) == 1);
    590   0   yongsun     suc = (fwrite(&lexiconOffset, sizeof(unsigned int), 1, fp) == 1);
    591   0   yongsun 
    592   0   yongsun     itNode = m_AllNodes.begin();
    593   0   yongsun     itNodeLast = m_AllNodes.end();
    594   0   yongsun     for (; itNode != itNodeLast && suc; ++itNode) {
    595   0   yongsun         CPinyinTrie::TNode outNode;
    596   0   yongsun         outNode.m_nTransfer = (*itNode)->m_Trans.size();
    597   0   yongsun         outNode.m_nWordId = (*itNode)->m_WordIdSet.size();
    598   0   yongsun         outNode.m_bFullSyllableTransfer = (*itNode)->m_bFullSyllableTransfer;
    599   0   yongsun 
    600   0   yongsun         //determine this node's GB category, have some pure gb2312 words, or all GBK/GB18030 words
    601   0   yongsun         outNode.m_bGBK = 1;
    602   0   yongsun         outNode.m_bGB18030 = 1;
    603   0   yongsun 
    604   0   yongsun         TNode* itequ = *itNode;
    605   0   yongsun         if (outNode.m_nWordId == 0) {
    606   0   yongsun             if ((*itNode)->m_Trans.find(CPinyinTrie::SYLLABLE_BREAKER) != (*itNode)->m_Trans.end()) {
    607   0   yongsun                 itequ = (*itNode)->m_Trans[CPinyinTrie::SYLLABLE_BREAKER];
    608   0   yongsun                 if (itequ->m_WordIdSet.size() == 0) {
    609   0   yongsun                     outNode.m_bGBK = 0;
    610   0   yongsun                     outNode.m_bGB18030 = 0;
    611   0   yongsun                 }
    612   0   yongsun             } else {
    613   0   yongsun                 outNode.m_bGBK = 0;
    614   0   yongsun                 outNode.m_bGB18030 = 0;
    615   0   yongsun             }
    616   0   yongsun         }
    617   0   yongsun         CWordSet::iterator itId = itequ->m_WordIdSet.begin();
    618   0   yongsun         CWordSet::iterator itIdLast = itequ->m_WordIdSet.end();
    619   0   yongsun         for (; itId != itIdLast && outNode.m_bGBK; ++itId) {
    620   0   yongsun             outNode.m_bGB18030 &= itId->anony.m_bGB18030;
    621   0   yongsun             outNode.m_bGBK &= itId->anony.m_bGBK;
    622   0   yongsun         }
    623   0   yongsun         #ifdef DEBUG
    624   0   yongsun             if (outNode.m_bGBK) {
    625   0   yongsun                 CWordSet::iterator itId = (*itNode)->m_WordIdSet.begin();
    626   0   yongsun                 CWordSet::iterator itIdLast = (*itNode)->m_WordIdSet.end();
    627   0   yongsun                 fprintf(stderr, "========>(");
    628   0   yongsun                 for (; itId != itIdLast; ++itId) {
    629   0   yongsun                     fprintf(stderr, " %d-%1d", itId->anony.m_id, itId->anony.m_bGBK);
    630   0   yongsun                 }
    631   0   yongsun                 fprintf(stderr, " )\n\n");
    632   0   yongsun                 fflush(stderr);
    633   0   yongsun             }
    634   0   yongsun         #endif
    635   0   yongsun         suc = (fwrite(&outNode, sizeof(outNode), 1, fp) == 1);
    636   0   yongsun 
    637   0   yongsun         CTrans::iterator itTrans = (*itNode)->m_Trans.begin();
    638   0   yongsun         CTrans::iterator itTransLast = (*itNode)->m_Trans.end();
    639   0   yongsun         for (; itTrans != itTransLast && suc; ++itTrans) {
    640   0   yongsun             CPinyinTrie::TTransUnit tru;
    641   0   yongsun             tru.m_Char = itTrans->first;
    642   0   yongsun             tru.m_Offset = nodeOffsetMap[itTrans->second];
    643   0   yongsun             assert(tru.m_Offset != 0);
    644   0   yongsun             suc = (fwrite(&tru, sizeof(tru), 1, fp) == 1);
    645   0   yongsun         }
    646   0   yongsun 
    647   0   yongsun         CWordVec vec;
    648   0   yongsun         itId = (*itNode)->m_WordIdSet.begin();
    649   0   yongsun         itIdLast = (*itNode)->m_WordIdSet.end();
    650   0   yongsun         for (; itId != itIdLast; ++itId)
    651   0   yongsun             vec.push_back(TWordInfo(*itId, psrt->getCost(*itId), psrt->isSeen(*itId)));
    652   0   yongsun         std::make_heap(vec.begin(), vec.end());
    653   0   yongsun         std::sort_heap(vec.begin(), vec.end());
    654   0   yongsun 
    655   0   yongsun         CWordVec::iterator itv = vec.begin();
    656   0   yongsun         CWordVec::iterator itve = vec.end();
    657   0   yongsun         for (; itv != itve && suc; ++itv) {
    658   0   yongsun             CPinyinTrie::TWordIdInfo wi;
    659   0   yongsun             wi.m_id = itv->m_id.anony.m_id;
    660   0   yongsun             wi.m_bGBK = itv->m_id.anony.m_bGBK;
    661   0   yongsun             wi.m_bGB18030 = itv->m_id.anony.m_bGB18030;
    662   0   yongsun             wi.m_len = m_Lexicon[itv->m_id.anony.m_id].size();
    663   0   yongsun             wi.m_bSeen = ((itv->m_bSeen)?(1):(0));
    664   0   yongsun             wi.m_cost = itv->m_id.anony.m_cost;
    665   0   yongsun             suc = (fwrite(&wi, sizeof(wi), 1, fp) == 1);
    666   0   yongsun         }
    667   0   yongsun     }
    668   0   yongsun     itWordStr = m_Lexicon.begin();
    669   0   yongsun     itWordStrLast = m_Lexicon.end();
    670   0   yongsun     for (; itWordStr != itWordStrLast && suc; ++itWordStr) {
    671   0   yongsun         MBSTOWCS(wbuf, itWordStr->c_str(), 1024);
    672   0   yongsun         int sz = WCSLEN(wbuf);
    673   0   yongsun         suc = (fwrite(wbuf, (sz+1)*sizeof(TWCHAR), 1, fp) == 1);
    674   0   yongsun     }
    675   0   yongsun     return suc;
    676   0   yongsun }
    677