Home | History | Annotate | Download | only in mmseg
      1    0   yongsun /*
      2   82   yongsun  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3   82   yongsun  *
      4   82   yongsun  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5   82   yongsun  *
      6   82   yongsun  * The contents of this file are subject to the terms of either the GNU Lesser
      7   82   yongsun  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8   82   yongsun  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9   82   yongsun  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10   82   yongsun  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11   82   yongsun  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12   82   yongsun  * specific language governing permissions and limitations under the License. When
     13   82   yongsun  * distributing the software, include this License Header Notice in each file and
     14   82   yongsun  * include the full text of the License in the License file as well as the
     15   82   yongsun  * following notice:
     16   82   yongsun  *
     17   82   yongsun  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18   82   yongsun  * (CDDL)
     19   82   yongsun  * For Covered Software in this distribution, this License shall be governed by the
     20   82   yongsun  * laws of the State of California (excluding conflict-of-law provisions).
     21   82   yongsun  * Any litigation relating to this License shall be subject to the jurisdiction of
     22   82   yongsun  * the Federal Courts of the Northern District of California and the state courts
     23   82   yongsun  * of the State of California, with venue lying in Santa Clara County, California.
     24   82   yongsun  *
     25   82   yongsun  * Contributor(s):
     26   82   yongsun  *
     27   82   yongsun  * If you wish your version of this file to be governed by only the CDDL or only
     28   82   yongsun  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29   82   yongsun  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30   82   yongsun  * license." If you don't indicate a single choice of license, a recipient has the
     31   82   yongsun  * option to distribute your version of this file under either the CDDL or the LGPL
     32   82   yongsun  * Version 2.1, or to extend the choice of license to its licensees as provided
     33   82   yongsun  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34   82   yongsun  * Version 2 license, then the option applies only if the new code is made subject
     35   82   yongsun  * to such option by the copyright holder.
     36    0   yongsun  */
     37   82   yongsun 
     38    0   yongsun #ifdef HAVE_CONFIG_H
     39    0   yongsun #include "config.h"
     40    0   yongsun #endif
     41    0   yongsun 
     42    0   yongsun #ifdef HAVE_ASSERT_H
     43    0   yongsun #include <assert.h>
     44    0   yongsun #endif
     45    0   yongsun 
     46    0   yongsun #ifdef HAVE_GETOPT_H
     47    0   yongsun #include <getopt.h>
     48    0   yongsun #endif
     49    0   yongsun 
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
     53    0   yongsun 
     54    0   yongsun #include "../sim_dict.h"
     55    0   yongsun #include "../sim_sen.h"
     56    0   yongsun 
     57    0   yongsun static struct option long_options[] =
     58    0   yongsun {
     59    0   yongsun     {"dict", 1, 0, 'd'},
     60    0   yongsun     {"format", 1, 0, 'f'},
     61    0   yongsun     {"show-id", 0, 0, 'i'},
     62    0   yongsun     {"s-tok", 1, 0, 's'},
     63    0   yongsun     {"ambiguious-id", 1, 0, 'a'},
     64    0   yongsun     {0, 0, 0, 0}
     65    0   yongsun };
     66    0   yongsun 
     67    0   yongsun static char* s_strDictFile = NULL;
     68    0   yongsun static bool s_bTextOut = false;
     69    0   yongsun static bool s_bShowId = false;
     70    0   yongsun static TSIMWordId s_iSTOKID = 10;
     71    0   yongsun static TSIMWordId s_iAmbiID = 0;
     72    0   yongsun 
     73    0   yongsun static CSIMDict *s_dict = NULL;
     74    0   yongsun 
     75    0   yongsun static void
     76    0   yongsun ShowUsage()
     77    0   yongsun {
     78    0   yongsun     fprintf(stderr, "\nUsage:\n");
     79    0   yongsun     fprintf(stderr, "mmseg -d dict_file [-f (text|bin)] [-i] [-s STOK_ID] [-a AMBI_ID]\n\n");
     80    0   yongsun     fprintf(stderr, "  -f --format:\n");
     81    0   yongsun     fprintf(stderr, "    Output Format, can be 'text' or 'bin'. default 'bin'\n");
     82    0   yongsun     fprintf(stderr, "    Normally, in text mode, word text are output, while in binary mode,\n");
     83  209  tchaikov     fprintf(stderr, "    binary short integer of the word-ids are written to stdout.\n");
     84    0   yongsun     fprintf(stderr, "  -s --stok:\n");
     85    0   yongsun     fprintf(stderr, "    Sentence token id. Default 10.\n");
     86  209  tchaikov     fprintf(stderr, "    It will be written to output in binary mode after every sentence.\n");
     87    0   yongsun     fprintf(stderr, "  -i --show-id:\n");
     88   11   yongsun     fprintf(stderr, "    Show Id info. Under text output format mode, attach id after known.\n");
     89   11   yongsun     fprintf(stderr, "    words. If under binary mode, print id(s) in text.\n");
     90    0   yongsun     fprintf(stderr, "  -a --ambiguious-id:\n");
     91   11   yongsun     fprintf(stderr, "    Ambiguious means ABC => A BC or AB C. If specified (AMBI-ID != 0), \n");
     92   11   yongsun     fprintf(stderr, "    The sequence ABC will not be segmented, in binary mode, the AMBI-ID \n");
     93   11   yongsun     fprintf(stderr, "    is written out; in text mode, <ambi>ABC</ambi> will be output. Default \n");
     94   11   yongsun     fprintf(stderr, "    is 0.\n");
     95    0   yongsun     fprintf(stderr, "\n");
     96    0   yongsun     fprintf(stderr, "Notes:\n");
     97    0   yongsun     fprintf(stderr, "  Under binary mode, consecutive id of 0 are merged into one 0.\n");
     98  209  tchaikov     fprintf(stderr, "  Under text mode, no space are inserted between unknown-words. \n");
     99    0   yongsun     fprintf(stderr, "\n");
    100    0   yongsun     fprintf(stderr, "\n");
    101    0   yongsun     exit(1000);
    102    0   yongsun }
    103    0   yongsun 
    104    0   yongsun static void
    105    0   yongsun getParameters(int argc, char* argv[])
    106    0   yongsun {
    107    0   yongsun     int c;
    108    0   yongsun     while ((c=getopt_long(argc, argv, "d:if:s:a:", long_options, NULL)) != -1)
    109    0   yongsun     {
    110    0   yongsun         switch (c) {
    111    0   yongsun         case 'd':
    112    0   yongsun             s_strDictFile = strdup(optarg);
    113    0   yongsun             break;
    114    0   yongsun         case 'i':
    115    0   yongsun             s_bShowId = true;
    116    0   yongsun             break;
    117    0   yongsun         case 'f':
    118    0   yongsun             s_bTextOut = (strcmp(optarg, "text") == 0);
    119    0   yongsun             break;
    120    0   yongsun         case 's':
    121    0   yongsun             s_iSTOKID = atoi(optarg);
    122    0   yongsun             break;
    123    0   yongsun         case 'a':
    124    0   yongsun             s_iAmbiID = atoi(optarg);
    125    0   yongsun             break;
    126    0   yongsun         default:
    127    0   yongsun             ShowUsage();
    128    0   yongsun             break;
    129    0   yongsun         }
    130    0   yongsun     }
    131    0   yongsun     if (s_strDictFile == NULL)
    132    0   yongsun         ShowUsage();
    133    0   yongsun }
    134    0   yongsun 
    135    0   yongsun static void
    136    0   yongsun output_stok(int& nWords)
    137    0   yongsun {
    138    0   yongsun     if (s_bShowId) {
    139    0   yongsun         if (nWords > 0)
    140    0   yongsun             printf(" ");
    141    0   yongsun         printf("%d", unsigned(s_iSTOKID));
    142    0   yongsun     } else {
    143    0   yongsun         fwrite(&s_iSTOKID, sizeof(TSIMWordId), 1, stdout);
    144    0   yongsun     }
    145    0   yongsun     ++nWords;
    146    0   yongsun }
    147    0   yongsun 
    148    0   yongsun static void
    149    0   yongsun output(int len, const TWCHAR* p, TSIMWordId idprev, TSIMWordId idcur, int& nWords)
    150    0   yongsun {
    151    0   yongsun     static char mbword[1024];
    152    0   yongsun     static TWCHAR wcword[1024];
    153    0   yongsun 
    154    0   yongsun     bool bRealGap = (idcur != SIM_ID_NOT_WORD || idprev != SIM_ID_NOT_WORD);
    155    0   yongsun     if (s_bTextOut) {
    156    0   yongsun         for (int i=0; i < len; ++i, ++p)
    157    0   yongsun             wcword[i] = *p;
    158    0   yongsun         wcword[len] = 0;
    159    0   yongsun         WCSTOMBS(mbword, wcword, sizeof(mbword));
    160    0   yongsun         if (bRealGap && idprev == SIM_ID_NOT_WORD)
    161    0   yongsun             printf("(%d)", unsigned(idprev));
    162    0   yongsun         if (bRealGap && (nWords > 0))
    163    0   yongsun             printf(" ");
    164   11   yongsun 	(s_iAmbiID && idcur == s_iAmbiID)? printf ("<ambi>%s</ambi>", mbword):
    165   11   yongsun                                            printf("%s", mbword);
    166    0   yongsun         if (s_bShowId && idcur != SIM_ID_NOT_WORD)
    167    0   yongsun             printf("(%d)", unsigned(idcur));
    168    0   yongsun     } else {
    169    0   yongsun         if (bRealGap) {
    170    0   yongsun             if (s_bShowId) {
    171    0   yongsun                 if (nWords > 0)
    172    0   yongsun                     printf(" ");
    173    0   yongsun                 printf("%d", unsigned(idcur));
    174    0   yongsun             } else
    175    0   yongsun                 fwrite(&idcur, sizeof(TSIMWordId), 1, stdout);
    176    0   yongsun         }
    177    0   yongsun     }
    178    0   yongsun     if (bRealGap)
    179    0   yongsun         ++nWords;
    180    0   yongsun }
    181    0   yongsun 
    182    0   yongsun /**
    183    0   yongsun * Return . For example, ABCDEF if ABC CD DEF are words.
    184    0   yongsun * if return len > word_len, then ambiguious exists at word [p p+len)...
    185    0   yongsun */
    186    0   yongsun int
    187    0   yongsun getAmbiLen(const TWCHAR* p, int word_len)
    188    0   yongsun {
    189    0   yongsun     const CSIMDict::TState* pstate;
    190    0   yongsun 
    191    0   yongsun     for (int i=1; i<word_len && *(p+i) != WCH_NULL; ++i) {
    192    0   yongsun         int len = s_dict->matchLongest(s_dict->getRoot(), pstate, p+i);
    193    0   yongsun         if (word_len < i+len)
    194    0   yongsun             word_len = i+len;
    195    0   yongsun     }
    196    0   yongsun 
    197    0   yongsun     return word_len;
    198    0   yongsun }
    199    0   yongsun 
    200    0   yongsun static bool
    201    0   yongsun processSingleFile(FILE* fp, int &nWords, int &nAmbis)
    202    0   yongsun {
    203    0   yongsun     nWords = 0;
    204    0   yongsun     nAmbis = 0;
    205    0   yongsun 
    206    0   yongsun     wstring sntnc;
    207    0   yongsun     CSIMCharReader *pReader = new CSIMCharReader(fp);
    208    0   yongsun     CSIMCharReader::iterator iter = pReader->begin();
    209    0   yongsun     TSIMWordId idcur, idprev = s_iSTOKID;
    210    0   yongsun 
    211    0   yongsun     if (!s_bTextOut)
    212    0   yongsun         output_stok(nWords);
    213    0   yongsun 
    214    0   yongsun     while (true){
    215    0   yongsun         if (ReadSentence(sntnc, iter, false) == false)
    216    0   yongsun             break;
    217    0   yongsun 
    218    0   yongsun         for (const TWCHAR *p = sntnc.c_str(); (*p); ) {
    219    0   yongsun 
    220    0   yongsun             const CSIMDict::TState* pstate;
    221    0   yongsun             int len = s_dict->matchLongest(s_dict->getRoot(), pstate, p);
    222    0   yongsun             if (len <= 0) {
    223    0   yongsun                 idcur = SIM_ID_NOT_WORD;
    224    0   yongsun                 len = 1;
    225    0   yongsun             } else
    226    0   yongsun                 idcur = pstate->word_id;
    227    0   yongsun 
    228   11   yongsun             if (s_iAmbiID != WCH_NULL) {
    229    0   yongsun                 int ambiLen=getAmbiLen(p, len);
    230    0   yongsun                 if (ambiLen > len) {
    231    0   yongsun                     len = ambiLen;
    232    0   yongsun                     idcur = s_iAmbiID;
    233    0   yongsun                     ++nAmbis;
    234    0   yongsun                 }
    235    0   yongsun             }
    236    0   yongsun 
    237    0   yongsun             output(len, p, idprev, idcur, nWords);
    238    0   yongsun 
    239    0   yongsun             idprev = idcur;
    240    0   yongsun             p += len;
    241    0   yongsun         }
    242    0   yongsun 
    243    0   yongsun         if (!s_bTextOut) {
    244    0   yongsun             output_stok(nWords);
    245    0   yongsun             idprev = s_iSTOKID;
    246    0   yongsun         }
    247    0   yongsun     }
    248    0   yongsun 
    249    0   yongsun     fflush(stdout);
    250    0   yongsun     return true;
    251    0   yongsun }
    252    0   yongsun 
    253    0   yongsun int
    254    0   yongsun main(int argc, char *argv[])
    255    0   yongsun {
    256    0   yongsun     int nWords, nAmbis;
    257    0   yongsun 
    258    0   yongsun     setlocale(LC_ALL, "");
    259    0   yongsun     getParameters(argc, argv);
    260    0   yongsun     argc -= optind;
    261    0   yongsun     argv += optind;
    262    0   yongsun 
    263    0   yongsun     fprintf(stderr, "Loading lexicon..."); fflush(stderr);
    264    0   yongsun     s_dict = new CSIMDict();
    265    0   yongsun     if (!s_dict->parseText(s_strDictFile)) {
    266    0   yongsun       fprintf(stderr, "fail\n"); fflush(stderr);
    267    0   yongsun       exit(1001);
    268    0   yongsun     }
    269    0   yongsun     fprintf(stderr, "done"); fflush(stderr);
    270    0   yongsun 
    271    0   yongsun     if (argc == 0) {
    272    0   yongsun         fprintf(stderr, "\nProcessing from stdin..."); fflush(stderr);
    273    0   yongsun         processSingleFile(stdin, nWords, nAmbis);
    274    0   yongsun         fprintf(stderr, "%d words, %d ambiguious. Done!\n", nWords, nAmbis); fflush(stderr);
    275    0   yongsun     } else {
    276    0   yongsun         for (int i=0; i < argc; ++i) {
    277    0   yongsun             fprintf(stderr, "\nProcessing %s...", argv[i]); fflush(stderr);
    278    0   yongsun             FILE *fp = fopen(argv[i], "r");
    279    0   yongsun             if (fp != NULL) {
    280    0   yongsun                 processSingleFile(fp, nWords, nAmbis);
    281    0   yongsun                 fprintf(stderr, "@Offset %u, %d words, %d ambiguious. Done!\n", ftell(fp), nWords, nAmbis); fflush(stderr);
    282    0   yongsun             } else {
    283    0   yongsun                 fprintf(stderr, "Can not Open!!!!!!!\n"); fflush(stderr);
    284    0   yongsun             }
    285    0   yongsun             fclose(fp);
    286    0   yongsun         }
    287    0   yongsun     }
    288    0   yongsun 
    289    0   yongsun     s_dict->close();
    290    0   yongsun     delete s_dict;
    291    0   yongsun     s_dict = NULL;
    292    0   yongsun     return 0;
    293    0   yongsun }
    294