Home | History | Annotate | Download | only in slm
      1    0   yongsun /*
      2   82   yongsun  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3   82   yongsun  *
      4   82   yongsun  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5   82   yongsun  *
      6   82   yongsun  * The contents of this file are subject to the terms of either the GNU Lesser
      7   82   yongsun  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8   82   yongsun  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9   82   yongsun  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10   82   yongsun  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11   82   yongsun  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12   82   yongsun  * specific language governing permissions and limitations under the License. When
     13   82   yongsun  * distributing the software, include this License Header Notice in each file and
     14   82   yongsun  * include the full text of the License in the License file as well as the
     15   82   yongsun  * following notice:
     16   82   yongsun  *
     17   82   yongsun  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18   82   yongsun  * (CDDL)
     19   82   yongsun  * For Covered Software in this distribution, this License shall be governed by the
     20   82   yongsun  * laws of the State of California (excluding conflict-of-law provisions).
     21   82   yongsun  * Any litigation relating to this License shall be subject to the jurisdiction of
     22   82   yongsun  * the Federal Courts of the Northern District of California and the state courts
     23   82   yongsun  * of the State of California, with venue lying in Santa Clara County, California.
     24   82   yongsun  *
     25   82   yongsun  * Contributor(s):
     26   82   yongsun  *
     27   82   yongsun  * If you wish your version of this file to be governed by only the CDDL or only
     28   82   yongsun  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29   82   yongsun  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30   82   yongsun  * license." If you don't indicate a single choice of license, a recipient has the
     31   82   yongsun  * option to distribute your version of this file under either the CDDL or the LGPL
     32   82   yongsun  * Version 2.1, or to extend the choice of license to its licensees as provided
     33   82   yongsun  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34   82   yongsun  * Version 2 license, then the option applies only if the new code is made subject
     35   82   yongsun  * to such option by the copyright holder.
     36    0   yongsun  */
     37   82   yongsun 
     38    0   yongsun #include <stdio.h>
     39  208  tchaikov #include <stdlib.h>
     40    0   yongsun 
     41    0   yongsun #include "sim_dict.h"
     42    0   yongsun 
     43    0   yongsun 
     44    0   yongsun void CSIMDict::freeSubTree(CSIMDict::TState& root)
     45    0   yongsun {
     46    0   yongsun         if (root.follow != NULL) {
     47    0   yongsun                 Map_Type &map = *(root.follow);
     48    0   yongsun                 for (Map_Type::iterator it=map.begin(), last=map.end(); it != last; ++it)
     49    0   yongsun                         freeSubTree(it->second);
     50    0   yongsun                 delete root.follow;
     51    0   yongsun                 root.follow = NULL;
     52    0   yongsun         }
     53    0   yongsun }
     54    0   yongsun 
     55    0   yongsun const CSIMDict::TState* CSIMDict::step(const CSIMDict::TState* root, TWCHAR wch)
     56    0   yongsun {
     57    0   yongsun         if ((root != NULL) && (root->follow != NULL) && wch != WCH_NULL) {
     58    0   yongsun                 Map_Type::iterator it = root->follow->find(TSIMChar(wch));
     59    0   yongsun                 if (it != root->follow->end())
     60    0   yongsun                         return &(it->second);
     61    0   yongsun         }
     62    0   yongsun         return NULL;
     63    0   yongsun }
     64    0   yongsun 
     65    0   yongsun int	CSIMDict::matchLongest(const CSIMDict::TState* root, CSIMDict::PState &  result, const TWCHAR* str)
     66    0   yongsun {
     67    0   yongsun         int lastWordLen = 0, len = 0;
     68    0   yongsun         result = root;
     69    0   yongsun         while (root != NULL) {
     70    0   yongsun                 if (root->word_id != SIM_ID_NOT_WORD) {
     71    0   yongsun                         result = root;
     72    0   yongsun                         lastWordLen = len;
     73    0   yongsun                 }
     74    0   yongsun                 ++len;
     75    0   yongsun                 root = step(root, *str++);
     76    0   yongsun         }
     77    0   yongsun         return lastWordLen;
     78    0   yongsun }
     79    0   yongsun 
     80    0   yongsun bool
     81    0   yongsun CSIMDict::parseText(const char* filename)
     82    0   yongsun {
     83    0   yongsun     FILE * fp = NULL;
     84    0   yongsun     static char buf[1024];
     85    0   yongsun     static TWCHAR wword[sizeof(buf)];
     86    0   yongsun     unsigned int id;
     87    0   yongsun 
     88    0   yongsun     try {
     89    0   yongsun         if ((fp = fopen(filename, "r")) == NULL)
     90    0   yongsun           return false;
     91    0   yongsun         while (fgets(buf, 1024, fp) != NULL) {
     92    8  ys148558             if (*buf == '\n' || *buf == '#')
     93    8  ys148558                 continue;
     94    0   yongsun 
     95    8  ys148558             char* p = buf;
     96    8  ys148558             while (*p == ' ' || *p == '\t')
     97    8  ys148558                 ++p;
     98    8  ys148558             char* pstart = p;
     99    8  ys148558             while (*p != 0 && *p != ' ' && *p != '\t')
    100    8  ys148558                 ++p;
    101    8  ys148558             if (*p == 0)
    102    8  ys148558                 continue;
    103    8  ys148558             *p++ = 0;
    104    8  ys148558             while (*p == ' ' || *p == '\t')
    105    8  ys148558                 ++p;
    106    8  ys148558             if (!(*p >= '0' && *p <= '9')) continue;
    107    8  ys148558             for (id=0; *p >= '0' && *p <= '9'; ++p)
    108    8  ys148558                 id = 10*id + (*p - '0');
    109    8  ys148558 
    110    8  ys148558             if (id < SIM_ID_REALWORD_START)
    111    8  ys148558                 continue;
    112    8  ys148558             if (MBSTOWCS(wword, pstart, sizeof(buf)) != (size_t)-1) {
    113    8  ys148558                 insertWord(wword, TSIMWordId(id));
    114    8  ys148558             } else {
    115    8  ys148558                 fprintf(stderr, "mbs to wcs conversion error for : %s %d\n", buf, id);
    116    8  ys148558                 exit(100);
    117    0   yongsun             }
    118    0   yongsun         }
    119    0   yongsun         fclose(fp);
    120    0   yongsun     } catch (...) {
    121    0   yongsun         if (fp != NULL)
    122    0   yongsun             fclose(fp);
    123    0   yongsun         buf[sizeof(buf)-1] = 0;
    124    0   yongsun         fprintf(stderr, "Catch exception when loading dictionary at %s, existing...", buf);
    125    0   yongsun         exit(200);
    126    0   yongsun     }
    127    0   yongsun     return true;
    128    0   yongsun }
    129    0   yongsun 
    130    0   yongsun void CSIMDict::insertWord(const TWCHAR* wstr, TSIMWordId id)
    131    0   yongsun {
    132    0   yongsun         TState* ps = &m_root;
    133    0   yongsun         while (*wstr) {
    134    0   yongsun                 TSIMChar ch(*wstr++);
    135    0   yongsun                 TSIMWordId nodeId = (*wstr)?SIM_ID_NOT_WORD:id;
    136    0   yongsun                 if (ps->follow == NULL) {
    137    0   yongsun                         ps->follow = new Map_Type();
    138    0   yongsun                 }
    139    0   yongsun                 Map_Type & map = *(ps->follow);
    140    0   yongsun                 Map_Type::iterator it = map.find(ch);
    141    0   yongsun                 if (it != map.end() && nodeId != SIM_ID_NOT_WORD &&
    142    0   yongsun                     it->second.word_id != SIM_ID_NOT_WORD && it->second.word_id != nodeId) {
    143    0   yongsun                         throw new int(100);
    144    0   yongsun                 }
    145    0   yongsun                 if (it != map.end()){
    146    0   yongsun                         if (nodeId != SIM_ID_NOT_WORD)
    147    0   yongsun                                 it->second.word_id = nodeId;
    148    0   yongsun                         ps = &(it->second);
    149    0   yongsun                 } else {
    150    0   yongsun                         ps = &(map[ch] = TState(nodeId));
    151    0   yongsun                 }
    152    0   yongsun         }
    153    0   yongsun }
    154    0   yongsun 
    155    0   yongsun void CSIMDict::InnerPrint(FILE* fp, wstring & wstr, const TState* pnode)
    156    0   yongsun {
    157    0   yongsun         if (pnode != NULL && pnode->word_id != SIM_ID_NOT_WORD) {
    158    0   yongsun                 char* buf = new char[wstr.size()*2+2];
    159    0   yongsun                 WCSTOMBS(buf, wstr.c_str(), wstr.size()*2+2);
    160    0   yongsun                 fprintf(fp, "%s %d\n", buf, unsigned(pnode->word_id));
    161    0   yongsun                 delete[] buf;
    162    0   yongsun         }
    163    0   yongsun         if (pnode != NULL && pnode->follow != NULL) {
    164    0   yongsun                 Map_Type::iterator it, ite = pnode->follow->end();
    165    0   yongsun                 for (it = pnode->follow->begin(); it != ite; ++it) {
    166    0   yongsun                         TWCHAR wch = TWCHAR(it->first);
    167    0   yongsun                         wstr.push_back(wch);
    168    0   yongsun                         InnerPrint(fp, wstr, &(it->second));
    169    0   yongsun                         wstr.erase(wstr.size()-1, 1);
    170    0   yongsun                 }
    171    0   yongsun         }
    172    0   yongsun }
    173