1 /* 2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 * 4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 * 6 * The contents of this file are subject to the terms of either the GNU Lesser 7 * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 * file except in compliance with the License. You can obtain a copy of the CDDL at 10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 * specific language governing permissions and limitations under the License. When 13 * distributing the software, include this License Header Notice in each file and 14 * include the full text of the License in the License file as well as the 15 * following notice: 16 * 17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 * (CDDL) 19 * For Covered Software in this distribution, this License shall be governed by the 20 * laws of the State of California (excluding conflict-of-law provisions). 21 * Any litigation relating to this License shall be subject to the jurisdiction of 22 * the Federal Courts of the Northern District of California and the state courts 23 * of the State of California, with venue lying in Santa Clara County, California. 24 * 25 * Contributor(s): 26 * 27 * If you wish your version of this file to be governed by only the CDDL or only 28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 * license." If you don't indicate a single choice of license, a recipient has the 31 * option to distribute your version of this file under either the CDDL or the LGPL 32 * Version 2.1, or to extend the choice of license to its licensees as provided 33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 * Version 2 license, then the option applies only if the new code is made subject 35 * to such option by the copyright holder. 36 */ 37 38 #ifdef HAVE_CONFIG_H 39 #include "config.h" 40 #endif 41 42 #ifdef HAVE_ASSERT_H 43 #include <assert.h> 44 #endif 45 46 #ifdef HAVE_GETOPT_H 47 #include <getopt.h> 48 #endif 49 50 #include <stdio.h> 51 #include <unistd.h> 52 #include <locale.h> 53 54 #include "../sim_dict.h" 55 #include "../sim_sen.h" 56 57 static struct option long_options[] = 58 { 59 {"dict", 1, 0, 'd'}, 60 {"format", 1, 0, 'f'}, 61 {"show-id", 0, 0, 'i'}, 62 {"s-tok", 1, 0, 's'}, 63 {"ambiguious-id", 1, 0, 'a'}, 64 {0, 0, 0, 0} 65 }; 66 67 static char* s_strDictFile = NULL; 68 static bool s_bTextOut = false; 69 static bool s_bShowId = false; 70 static TSIMWordId s_iSTOKID = 10; 71 static TSIMWordId s_iAmbiID = 0; 72 73 static CSIMDict *s_dict = NULL; 74 75 static void 76 ShowUsage() 77 { 78 fprintf(stderr, "\nUsage:\n"); 79 fprintf(stderr, "mmseg -d dict_file [-f (text|bin)] [-i] [-s STOK_ID] [-a AMBI_ID]\n\n"); 80 fprintf(stderr, " -f --format:\n"); 81 fprintf(stderr, " Output Format, can be 'text' or 'bin'. default 'bin'\n"); 82 fprintf(stderr, " Normally, in text mode, word text are output, while in binary mode,\n"); 83 fprintf(stderr, " binary short integer of the word-ids are written to stdout.\n"); 84 fprintf(stderr, " -s --stok:\n"); 85 fprintf(stderr, " Sentence token id. Default 10.\n"); 86 fprintf(stderr, " It will be written to output in binary mode after every sentence.\n"); 87 fprintf(stderr, " -i --show-id:\n"); 88 fprintf(stderr, " Show Id info. Under text output format mode, attach id after known.\n"); 89 fprintf(stderr, " words. If under binary mode, print id(s) in text.\n"); 90 fprintf(stderr, " -a --ambiguious-id:\n"); 91 fprintf(stderr, " Ambiguious means ABC => A BC or AB C. If specified (AMBI-ID != 0), \n"); 92 fprintf(stderr, " The sequence ABC will not be segmented, in binary mode, the AMBI-ID \n"); 93 fprintf(stderr, " is written out; in text mode, <ambi>ABC</ambi> will be output. Default \n"); 94 fprintf(stderr, " is 0.\n"); 95 fprintf(stderr, "\n"); 96 fprintf(stderr, "Notes:\n"); 97 fprintf(stderr, " Under binary mode, consecutive id of 0 are merged into one 0.\n"); 98 fprintf(stderr, " Under text mode, no space are inserted between unknown-words. \n"); 99 fprintf(stderr, "\n"); 100 fprintf(stderr, "\n"); 101 exit(1000); 102 } 103 104 static void 105 getParameters(int argc, char* argv[]) 106 { 107 int c; 108 while ((c=getopt_long(argc, argv, "d:if:s:a:", long_options, NULL)) != -1) 109 { 110 switch (c) { 111 case 'd': 112 s_strDictFile = strdup(optarg); 113 break; 114 case 'i': 115 s_bShowId = true; 116 break; 117 case 'f': 118 s_bTextOut = (strcmp(optarg, "text") == 0); 119 break; 120 case 's': 121 s_iSTOKID = atoi(optarg); 122 break; 123 case 'a': 124 s_iAmbiID = atoi(optarg); 125 break; 126 default: 127 ShowUsage(); 128 break; 129 } 130 } 131 if (s_strDictFile == NULL) 132 ShowUsage(); 133 } 134 135 static void 136 output_stok(int& nWords) 137 { 138 if (s_bShowId) { 139 if (nWords > 0) 140 printf(" "); 141 printf("%d", unsigned(s_iSTOKID)); 142 } else { 143 fwrite(&s_iSTOKID, sizeof(TSIMWordId), 1, stdout); 144 } 145 ++nWords; 146 } 147 148 static void 149 output(int len, const TWCHAR* p, TSIMWordId idprev, TSIMWordId idcur, int& nWords) 150 { 151 static char mbword[1024]; 152 static TWCHAR wcword[1024]; 153 154 bool bRealGap = (idcur != SIM_ID_NOT_WORD || idprev != SIM_ID_NOT_WORD); 155 if (s_bTextOut) { 156 for (int i=0; i < len; ++i, ++p) 157 wcword[i] = *p; 158 wcword[len] = 0; 159 WCSTOMBS(mbword, wcword, sizeof(mbword)); 160 if (bRealGap && idprev == SIM_ID_NOT_WORD) 161 printf("(%d)", unsigned(idprev)); 162 if (bRealGap && (nWords > 0)) 163 printf(" "); 164 (s_iAmbiID && idcur == s_iAmbiID)? printf ("<ambi>%s</ambi>", mbword): 165 printf("%s", mbword); 166 if (s_bShowId && idcur != SIM_ID_NOT_WORD) 167 printf("(%d)", unsigned(idcur)); 168 } else { 169 if (bRealGap) { 170 if (s_bShowId) { 171 if (nWords > 0) 172 printf(" "); 173 printf("%d", unsigned(idcur)); 174 } else 175 fwrite(&idcur, sizeof(TSIMWordId), 1, stdout); 176 } 177 } 178 if (bRealGap) 179 ++nWords; 180 } 181 182 /** 183 * Return . For example, ABCDEF if ABC CD DEF are words. 184 * if return len > word_len, then ambiguious exists at word [p p+len)... 185 */ 186 int 187 getAmbiLen(const TWCHAR* p, int word_len) 188 { 189 const CSIMDict::TState* pstate; 190 191 for (int i=1; i<word_len && *(p+i) != WCH_NULL; ++i) { 192 int len = s_dict->matchLongest(s_dict->getRoot(), pstate, p+i); 193 if (word_len < i+len) 194 word_len = i+len; 195 } 196 197 return word_len; 198 } 199 200 static bool 201 processSingleFile(FILE* fp, int &nWords, int &nAmbis) 202 { 203 nWords = 0; 204 nAmbis = 0; 205 206 wstring sntnc; 207 CSIMCharReader *pReader = new CSIMCharReader(fp); 208 CSIMCharReader::iterator iter = pReader->begin(); 209 TSIMWordId idcur, idprev = s_iSTOKID; 210 211 if (!s_bTextOut) 212 output_stok(nWords); 213 214 while (true){ 215 if (ReadSentence(sntnc, iter, false) == false) 216 break; 217 218 for (const TWCHAR *p = sntnc.c_str(); (*p); ) { 219 220 const CSIMDict::TState* pstate; 221 int len = s_dict->matchLongest(s_dict->getRoot(), pstate, p); 222 if (len <= 0) { 223 idcur = SIM_ID_NOT_WORD; 224 len = 1; 225 } else 226 idcur = pstate->word_id; 227 228 if (s_iAmbiID != WCH_NULL) { 229 int ambiLen=getAmbiLen(p, len); 230 if (ambiLen > len) { 231 len = ambiLen; 232 idcur = s_iAmbiID; 233 ++nAmbis; 234 } 235 } 236 237 output(len, p, idprev, idcur, nWords); 238 239 idprev = idcur; 240 p += len; 241 } 242 243 if (!s_bTextOut) { 244 output_stok(nWords); 245 idprev = s_iSTOKID; 246 } 247 } 248 249 fflush(stdout); 250 return true; 251 } 252 253 int 254 main(int argc, char *argv[]) 255 { 256 int nWords, nAmbis; 257 258 setlocale(LC_ALL, ""); 259 getParameters(argc, argv); 260 argc -= optind; 261 argv += optind; 262 263 fprintf(stderr, "Loading lexicon..."); fflush(stderr); 264 s_dict = new CSIMDict(); 265 if (!s_dict->parseText(s_strDictFile)) { 266 fprintf(stderr, "fail\n"); fflush(stderr); 267 exit(1001); 268 } 269 fprintf(stderr, "done"); fflush(stderr); 270 271 if (argc == 0) { 272 fprintf(stderr, "\nProcessing from stdin..."); fflush(stderr); 273 processSingleFile(stdin, nWords, nAmbis); 274 fprintf(stderr, "%d words, %d ambiguious. Done!\n", nWords, nAmbis); fflush(stderr); 275 } else { 276 for (int i=0; i < argc; ++i) { 277 fprintf(stderr, "\nProcessing %s...", argv[i]); fflush(stderr); 278 FILE *fp = fopen(argv[i], "r"); 279 if (fp != NULL) { 280 processSingleFile(fp, nWords, nAmbis); 281 fprintf(stderr, "@Offset %u, %d words, %d ambiguious. Done!\n", ftell(fp), nWords, nAmbis); fflush(stderr); 282 } else { 283 fprintf(stderr, "Can not Open!!!!!!!\n"); fflush(stderr); 284 } 285 fclose(fp); 286 } 287 } 288 289 s_dict->close(); 290 delete s_dict; 291 s_dict = NULL; 292 return 0; 293 } 294