Home | History | Annotate | Download | only in mmseg
      1 /*
      2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3  *
      4  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5  *
      6  * The contents of this file are subject to the terms of either the GNU Lesser
      7  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12  * specific language governing permissions and limitations under the License. When
     13  * distributing the software, include this License Header Notice in each file and
     14  * include the full text of the License in the License file as well as the
     15  * following notice:
     16  *
     17  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18  * (CDDL)
     19  * For Covered Software in this distribution, this License shall be governed by the
     20  * laws of the State of California (excluding conflict-of-law provisions).
     21  * Any litigation relating to this License shall be subject to the jurisdiction of
     22  * the Federal Courts of the Northern District of California and the state courts
     23  * of the State of California, with venue lying in Santa Clara County, California.
     24  *
     25  * Contributor(s):
     26  *
     27  * If you wish your version of this file to be governed by only the CDDL or only
     28  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30  * license." If you don't indicate a single choice of license, a recipient has the
     31  * option to distribute your version of this file under either the CDDL or the LGPL
     32  * Version 2.1, or to extend the choice of license to its licensees as provided
     33  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34  * Version 2 license, then the option applies only if the new code is made subject
     35  * to such option by the copyright holder.
     36  */
     37 
     38 #ifdef HAVE_CONFIG_H
     39 #include "config.h"
     40 #endif
     41 
     42 #ifdef HAVE_ASSERT_H
     43 #include <assert.h>
     44 #endif
     45 
     46 #ifdef HAVE_GETOPT_H
     47 #include <getopt.h>
     48 #endif
     49 
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
     53 
     54 #include "../sim_dict.h"
     55 #include "../sim_sen.h"
     56 
     57 static struct option long_options[] =
     58 {
     59     {"dict", 1, 0, 'd'},
     60     {"format", 1, 0, 'f'},
     61     {"show-id", 0, 0, 'i'},
     62     {"s-tok", 1, 0, 's'},
     63     {"ambiguious-id", 1, 0, 'a'},
     64     {0, 0, 0, 0}
     65 };
     66 
     67 static char* s_strDictFile = NULL;
     68 static bool s_bTextOut = false;
     69 static bool s_bShowId = false;
     70 static TSIMWordId s_iSTOKID = 10;
     71 static TSIMWordId s_iAmbiID = 0;
     72 
     73 static CSIMDict *s_dict = NULL;
     74 
     75 static void
     76 ShowUsage()
     77 {
     78     fprintf(stderr, "\nUsage:\n");
     79     fprintf(stderr, "mmseg -d dict_file [-f (text|bin)] [-i] [-s STOK_ID] [-a AMBI_ID]\n\n");
     80     fprintf(stderr, "  -f --format:\n");
     81     fprintf(stderr, "    Output Format, can be 'text' or 'bin'. default 'bin'\n");
     82     fprintf(stderr, "    Normally, in text mode, word text are output, while in binary mode,\n");
     83     fprintf(stderr, "    binary short integer of the word-ids are written to stdout.\n");
     84     fprintf(stderr, "  -s --stok:\n");
     85     fprintf(stderr, "    Sentence token id. Default 10.\n");
     86     fprintf(stderr, "    It will be written to output in binary mode after every sentence.\n");
     87     fprintf(stderr, "  -i --show-id:\n");
     88     fprintf(stderr, "    Show Id info. Under text output format mode, attach id after known.\n");
     89     fprintf(stderr, "    words. If under binary mode, print id(s) in text.\n");
     90     fprintf(stderr, "  -a --ambiguious-id:\n");
     91     fprintf(stderr, "    Ambiguious means ABC => A BC or AB C. If specified (AMBI-ID != 0), \n");
     92     fprintf(stderr, "    The sequence ABC will not be segmented, in binary mode, the AMBI-ID \n");
     93     fprintf(stderr, "    is written out; in text mode, <ambi>ABC</ambi> will be output. Default \n");
     94     fprintf(stderr, "    is 0.\n");
     95     fprintf(stderr, "\n");
     96     fprintf(stderr, "Notes:\n");
     97     fprintf(stderr, "  Under binary mode, consecutive id of 0 are merged into one 0.\n");
     98     fprintf(stderr, "  Under text mode, no space are inserted between unknown-words. \n");
     99     fprintf(stderr, "\n");
    100     fprintf(stderr, "\n");
    101     exit(1000);
    102 }
    103 
    104 static void
    105 getParameters(int argc, char* argv[])
    106 {
    107     int c;
    108     while ((c=getopt_long(argc, argv, "d:if:s:a:", long_options, NULL)) != -1)
    109     {
    110         switch (c) {
    111         case 'd':
    112             s_strDictFile = strdup(optarg);
    113             break;
    114         case 'i':
    115             s_bShowId = true;
    116             break;
    117         case 'f':
    118             s_bTextOut = (strcmp(optarg, "text") == 0);
    119             break;
    120         case 's':
    121             s_iSTOKID = atoi(optarg);
    122             break;
    123         case 'a':
    124             s_iAmbiID = atoi(optarg);
    125             break;
    126         default:
    127             ShowUsage();
    128             break;
    129         }
    130     }
    131     if (s_strDictFile == NULL)
    132         ShowUsage();
    133 }
    134 
    135 static void
    136 output_stok(int& nWords)
    137 {
    138     if (s_bShowId) {
    139         if (nWords > 0)
    140             printf(" ");
    141         printf("%d", unsigned(s_iSTOKID));
    142     } else {
    143         fwrite(&s_iSTOKID, sizeof(TSIMWordId), 1, stdout);
    144     }
    145     ++nWords;
    146 }
    147 
    148 static void
    149 output(int len, const TWCHAR* p, TSIMWordId idprev, TSIMWordId idcur, int& nWords)
    150 {
    151     static char mbword[1024];
    152     static TWCHAR wcword[1024];
    153 
    154     bool bRealGap = (idcur != SIM_ID_NOT_WORD || idprev != SIM_ID_NOT_WORD);
    155     if (s_bTextOut) {
    156         for (int i=0; i < len; ++i, ++p)
    157             wcword[i] = *p;
    158         wcword[len] = 0;
    159         WCSTOMBS(mbword, wcword, sizeof(mbword));
    160         if (bRealGap && idprev == SIM_ID_NOT_WORD)
    161             printf("(%d)", unsigned(idprev));
    162         if (bRealGap && (nWords > 0))
    163             printf(" ");
    164 	(s_iAmbiID && idcur == s_iAmbiID)? printf ("<ambi>%s</ambi>", mbword):
    165                                            printf("%s", mbword);
    166         if (s_bShowId && idcur != SIM_ID_NOT_WORD)
    167             printf("(%d)", unsigned(idcur));
    168     } else {
    169         if (bRealGap) {
    170             if (s_bShowId) {
    171                 if (nWords > 0)
    172                     printf(" ");
    173                 printf("%d", unsigned(idcur));
    174             } else
    175                 fwrite(&idcur, sizeof(TSIMWordId), 1, stdout);
    176         }
    177     }
    178     if (bRealGap)
    179         ++nWords;
    180 }
    181 
    182 /**
    183 * Return . For example, ABCDEF if ABC CD DEF are words.
    184 * if return len > word_len, then ambiguious exists at word [p p+len)...
    185 */
    186 int
    187 getAmbiLen(const TWCHAR* p, int word_len)
    188 {
    189     const CSIMDict::TState* pstate;
    190 
    191     for (int i=1; i<word_len && *(p+i) != WCH_NULL; ++i) {
    192         int len = s_dict->matchLongest(s_dict->getRoot(), pstate, p+i);
    193         if (word_len < i+len)
    194             word_len = i+len;
    195     }
    196 
    197     return word_len;
    198 }
    199 
    200 static bool
    201 processSingleFile(FILE* fp, int &nWords, int &nAmbis)
    202 {
    203     nWords = 0;
    204     nAmbis = 0;
    205 
    206     wstring sntnc;
    207     CSIMCharReader *pReader = new CSIMCharReader(fp);
    208     CSIMCharReader::iterator iter = pReader->begin();
    209     TSIMWordId idcur, idprev = s_iSTOKID;
    210 
    211     if (!s_bTextOut)
    212         output_stok(nWords);
    213 
    214     while (true){
    215         if (ReadSentence(sntnc, iter, false) == false)
    216             break;
    217 
    218         for (const TWCHAR *p = sntnc.c_str(); (*p); ) {
    219 
    220             const CSIMDict::TState* pstate;
    221             int len = s_dict->matchLongest(s_dict->getRoot(), pstate, p);
    222             if (len <= 0) {
    223                 idcur = SIM_ID_NOT_WORD;
    224                 len = 1;
    225             } else
    226                 idcur = pstate->word_id;
    227 
    228             if (s_iAmbiID != WCH_NULL) {
    229                 int ambiLen=getAmbiLen(p, len);
    230                 if (ambiLen > len) {
    231                     len = ambiLen;
    232                     idcur = s_iAmbiID;
    233                     ++nAmbis;
    234                 }
    235             }
    236 
    237             output(len, p, idprev, idcur, nWords);
    238 
    239             idprev = idcur;
    240             p += len;
    241         }
    242 
    243         if (!s_bTextOut) {
    244             output_stok(nWords);
    245             idprev = s_iSTOKID;
    246         }
    247     }
    248 
    249     fflush(stdout);
    250     return true;
    251 }
    252 
    253 int
    254 main(int argc, char *argv[])
    255 {
    256     int nWords, nAmbis;
    257 
    258     setlocale(LC_ALL, "");
    259     getParameters(argc, argv);
    260     argc -= optind;
    261     argv += optind;
    262 
    263     fprintf(stderr, "Loading lexicon..."); fflush(stderr);
    264     s_dict = new CSIMDict();
    265     if (!s_dict->parseText(s_strDictFile)) {
    266       fprintf(stderr, "fail\n"); fflush(stderr);
    267       exit(1001);
    268     }
    269     fprintf(stderr, "done"); fflush(stderr);
    270 
    271     if (argc == 0) {
    272         fprintf(stderr, "\nProcessing from stdin..."); fflush(stderr);
    273         processSingleFile(stdin, nWords, nAmbis);
    274         fprintf(stderr, "%d words, %d ambiguious. Done!\n", nWords, nAmbis); fflush(stderr);
    275     } else {
    276         for (int i=0; i < argc; ++i) {
    277             fprintf(stderr, "\nProcessing %s...", argv[i]); fflush(stderr);
    278             FILE *fp = fopen(argv[i], "r");
    279             if (fp != NULL) {
    280                 processSingleFile(fp, nWords, nAmbis);
    281                 fprintf(stderr, "@Offset %u, %d words, %d ambiguious. Done!\n", ftell(fp), nWords, nAmbis); fflush(stderr);
    282             } else {
    283                 fprintf(stderr, "Can not Open!!!!!!!\n"); fflush(stderr);
    284             }
    285             fclose(fp);
    286         }
    287     }
    288 
    289     s_dict->close();
    290     delete s_dict;
    291     s_dict = NULL;
    292     return 0;
    293 }
    294