Home | History | Annotate | Download | only in slm
      1 /*
      2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3  *
      4  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5  *
      6  * The contents of this file are subject to the terms of either the GNU Lesser
      7  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12  * specific language governing permissions and limitations under the License. When
     13  * distributing the software, include this License Header Notice in each file and
     14  * include the full text of the License in the License file as well as the
     15  * following notice:
     16  *
     17  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18  * (CDDL)
     19  * For Covered Software in this distribution, this License shall be governed by the
     20  * laws of the State of California (excluding conflict-of-law provisions).
     21  * Any litigation relating to this License shall be subject to the jurisdiction of
     22  * the Federal Courts of the Northern District of California and the state courts
     23  * of the State of California, with venue lying in Santa Clara County, California.
     24  *
     25  * Contributor(s):
     26  *
     27  * If you wish your version of this file to be governed by only the CDDL or only
     28  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30  * license." If you don't indicate a single choice of license, a recipient has the
     31  * option to distribute your version of this file under either the CDDL or the LGPL
     32  * Version 2.1, or to extend the choice of license to its licensees as provided
     33  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34  * Version 2 license, then the option applies only if the new code is made subject
     35  * to such option by the copyright holder.
     36  */
     37 
     38 #include <stdio.h>
     39 #include <stdlib.h>
     40 
     41 #include "sim_dict.h"
     42 
     43 
     44 void CSIMDict::freeSubTree(CSIMDict::TState& root)
     45 {
     46         if (root.follow != NULL) {
     47                 Map_Type &map = *(root.follow);
     48                 for (Map_Type::iterator it=map.begin(), last=map.end(); it != last; ++it)
     49                         freeSubTree(it->second);
     50                 delete root.follow;
     51                 root.follow = NULL;
     52         }
     53 }
     54 
     55 const CSIMDict::TState* CSIMDict::step(const CSIMDict::TState* root, TWCHAR wch)
     56 {
     57         if ((root != NULL) && (root->follow != NULL) && wch != WCH_NULL) {
     58                 Map_Type::iterator it = root->follow->find(TSIMChar(wch));
     59                 if (it != root->follow->end())
     60                         return &(it->second);
     61         }
     62         return NULL;
     63 }
     64 
     65 int	CSIMDict::matchLongest(const CSIMDict::TState* root, CSIMDict::PState &  result, const TWCHAR* str)
     66 {
     67         int lastWordLen = 0, len = 0;
     68         result = root;
     69         while (root != NULL) {
     70                 if (root->word_id != SIM_ID_NOT_WORD) {
     71                         result = root;
     72                         lastWordLen = len;
     73                 }
     74                 ++len;
     75                 root = step(root, *str++);
     76         }
     77         return lastWordLen;
     78 }
     79 
     80 bool
     81 CSIMDict::parseText(const char* filename)
     82 {
     83     FILE * fp = NULL;
     84     static char buf[1024];
     85     static TWCHAR wword[sizeof(buf)];
     86     unsigned int id;
     87 
     88     try {
     89         if ((fp = fopen(filename, "r")) == NULL)
     90           return false;
     91         while (fgets(buf, 1024, fp) != NULL) {
     92             if (*buf == '\n' || *buf == '#')
     93                 continue;
     94 
     95             char* p = buf;
     96             while (*p == ' ' || *p == '\t')
     97                 ++p;
     98             char* pstart = p;
     99             while (*p != 0 && *p != ' ' && *p != '\t')
    100                 ++p;
    101             if (*p == 0)
    102                 continue;
    103             *p++ = 0;
    104             while (*p == ' ' || *p == '\t')
    105                 ++p;
    106             if (!(*p >= '0' && *p <= '9')) continue;
    107             for (id=0; *p >= '0' && *p <= '9'; ++p)
    108                 id = 10*id + (*p - '0');
    109 
    110             if (id < SIM_ID_REALWORD_START)
    111                 continue;
    112             if (MBSTOWCS(wword, pstart, sizeof(buf)) != (size_t)-1) {
    113                 insertWord(wword, TSIMWordId(id));
    114             } else {
    115                 fprintf(stderr, "mbs to wcs conversion error for : %s %d\n", buf, id);
    116                 exit(100);
    117             }
    118         }
    119         fclose(fp);
    120     } catch (...) {
    121         if (fp != NULL)
    122             fclose(fp);
    123         buf[sizeof(buf)-1] = 0;
    124         fprintf(stderr, "Catch exception when loading dictionary at %s, existing...", buf);
    125         exit(200);
    126     }
    127     return true;
    128 }
    129 
    130 void CSIMDict::insertWord(const TWCHAR* wstr, TSIMWordId id)
    131 {
    132         TState* ps = &m_root;
    133         while (*wstr) {
    134                 TSIMChar ch(*wstr++);
    135                 TSIMWordId nodeId = (*wstr)?SIM_ID_NOT_WORD:id;
    136                 if (ps->follow == NULL) {
    137                         ps->follow = new Map_Type();
    138                 }
    139                 Map_Type & map = *(ps->follow);
    140                 Map_Type::iterator it = map.find(ch);
    141                 if (it != map.end() && nodeId != SIM_ID_NOT_WORD &&
    142                     it->second.word_id != SIM_ID_NOT_WORD && it->second.word_id != nodeId) {
    143                         throw new int(100);
    144                 }
    145                 if (it != map.end()){
    146                         if (nodeId != SIM_ID_NOT_WORD)
    147                                 it->second.word_id = nodeId;
    148                         ps = &(it->second);
    149                 } else {
    150                         ps = &(map[ch] = TState(nodeId));
    151                 }
    152         }
    153 }
    154 
    155 void CSIMDict::InnerPrint(FILE* fp, wstring & wstr, const TState* pnode)
    156 {
    157         if (pnode != NULL && pnode->word_id != SIM_ID_NOT_WORD) {
    158                 char* buf = new char[wstr.size()*2+2];
    159                 WCSTOMBS(buf, wstr.c_str(), wstr.size()*2+2);
    160                 fprintf(fp, "%s %d\n", buf, unsigned(pnode->word_id));
    161                 delete[] buf;
    162         }
    163         if (pnode != NULL && pnode->follow != NULL) {
    164                 Map_Type::iterator it, ite = pnode->follow->end();
    165                 for (it = pnode->follow->begin(); it != ite; ++it) {
    166                         TWCHAR wch = TWCHAR(it->first);
    167                         wstr.push_back(wch);
    168                         InnerPrint(fp, wstr, &(it->second));
    169                         wstr.erase(wstr.size()-1, 1);
    170                 }
    171         }
    172 }
    173