1 /* 2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 * 4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 * 6 * The contents of this file are subject to the terms of either the GNU Lesser 7 * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 * file except in compliance with the License. You can obtain a copy of the CDDL at 10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 * specific language governing permissions and limitations under the License. When 13 * distributing the software, include this License Header Notice in each file and 14 * include the full text of the License in the License file as well as the 15 * following notice: 16 * 17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 * (CDDL) 19 * For Covered Software in this distribution, this License shall be governed by the 20 * laws of the State of California (excluding conflict-of-law provisions). 21 * Any litigation relating to this License shall be subject to the jurisdiction of 22 * the Federal Courts of the Northern District of California and the state courts 23 * of the State of California, with venue lying in Santa Clara County, California. 24 * 25 * Contributor(s): 26 * 27 * If you wish your version of this file to be governed by only the CDDL or only 28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 * license." If you don't indicate a single choice of license, a recipient has the 31 * option to distribute your version of this file under either the CDDL or the LGPL 32 * Version 2.1, or to extend the choice of license to its licensees as provided 33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 * Version 2 license, then the option applies only if the new code is made subject 35 * to such option by the copyright holder. 36 */ 37 38 #include <cassert> 39 #include <algorithm> 40 #include "pinyin_seg.h" 41 #include "quanpin_trie.h" 42 43 void CGetFuzzySyllablesOp::initFuzzyMap (const char * const* fuzzyPairs, unsigned num) 44 { 45 m_fuzzyMap.clear(); 46 47 for (int c=0; c<num; ++c) { 48 const char * i = fuzzyPairs [c*2]; 49 const char * j = fuzzyPairs [c*2+1]; 50 51 m_fuzzyMap.insert (std::pair<const std::string, std::string> (i, j)); 52 m_fuzzyMap.insert (std::pair<const std::string, std::string> (j, i)); 53 } 54 } 55 56 CSyllables CGetFuzzySyllablesOp::operator () (TSyllable s) 57 { 58 CSyllables ret; 59 static char buf[128]; 60 61 if (m_fuzzyMap.empty()) { 62 unsigned num; 63 const char ** sys_fuzzy_pairs = CPinyinData::getFuzzyPairs (num); 64 initFuzzyMap (sys_fuzzy_pairs, num); 65 } 66 67 const char *i, *f; 68 CPinyinData::decodeSyllable (s, &i, &f); 69 70 std::vector<const char *> iset; 71 std::vector<const char *> fset; 72 73 iset.push_back (i); 74 fset.push_back (f); 75 76 CFuzzyMap::const_iterator it; 77 for (it = m_fuzzyMap.lower_bound(i); it != m_fuzzyMap.upper_bound(i); ++it) 78 iset.push_back ((it->second).c_str()); 79 80 for (it = m_fuzzyMap.lower_bound(f); it != m_fuzzyMap.upper_bound(f); ++it) 81 fset.push_back ((it->second).c_str()); 82 83 std::vector<const char *>::const_iterator iset_it = iset.begin(); 84 for (; iset_it != iset.end(); ++iset_it) { 85 std::vector<const char *>::const_iterator fset_it = fset.begin(); 86 for (; fset_it != fset.end(); ++ fset_it) { 87 snprintf (buf, sizeof(buf), "%s%s", *iset_it, *fset_it); 88 TSyllable ts = CPinyinData::encodeSyllable (buf); 89 if (ts && ts != s) 90 ret.push_back (ts); 91 } 92 } 93 94 return ret; 95 } 96 97 const char * CGetCorrectionPairOp::operator () (std::string& pystr, unsigned& matched_len) 98 { 99 CCorrectionPairVec::iterator it = m_correctionPairs.begin (); 100 CCorrectionPairVec::iterator ite = m_correctionPairs.end (); 101 102 for (; it != ite; ++it) { 103 std::string& k = it->first; 104 std::string& v = it->second; 105 unsigned l = k.size (); 106 107 if (pystr.size() >= l && !pystr.compare (pystr.size()-l, l, k)) { 108 matched_len = l; 109 return v.c_str(); 110 } 111 } 112 113 return NULL; 114 } 115 116 CQuanpinSegmentor::CQuanpinSegmentor () 117 : m_updatedFrom(0), m_pGetFuzzySyllablesOp(NULL), m_pGetCorrectionPairOp(NULL), 118 m_pytrie(base, check, value, sizeof(base)/sizeof(*base)) 119 { 120 m_segs.reserve (32); 121 } 122 123 bool CQuanpinSegmentor::load(const char * pyTrieFileName) 124 { 125 return m_pytrie.load (pyTrieFileName); 126 } 127 128 #ifdef DEBUG 129 void print_pystr(const std::string pystr) 130 { 131 for (const char* c = pystr.c_str(); c != pystr.c_str() + pystr.length(); ++c) 132 { 133 printf("%c", *c & 0x7f); 134 } 135 printf("<\n"); 136 } 137 #endif 138 139 unsigned CQuanpinSegmentor::push (unsigned ch) 140 { 141 m_inputBuf.push_back (ch); 142 143 if (m_pGetCorrectionPairOp && m_pGetCorrectionPairOp->isEnabled()) { 144 m_pystr.push_back (ch); 145 unsigned l = 0; 146 const char * v = (*m_pGetCorrectionPairOp) (m_pystr, l); 147 148 if (v) { 149 unsigned orig_size = m_segs.size(); 150 _clear (m_pystr.size() - l); 151 m_updatedFrom = _updateWith (v); 152 153 if (m_segs.size () >= orig_size) { 154 // does not get better segmentation, revert to original 155 _clear (m_pystr.size() - strlen(v)); 156 std::string new_pystr; 157 std::copy(m_inputBuf.end() - l, m_inputBuf.end(), back_inserter(new_pystr)); 158 m_updatedFrom = _updateWith (new_pystr); 159 } else { 160 if (l != strlen(v)) { 161 // e.g. uen -> un 162 m_segs.back().m_len += l - strlen(v); 163 m_pystr.resize(m_inputBuf.length()); 164 } 165 std::copy(m_inputBuf.end() - l, m_inputBuf.end(), m_pystr.end() - l); 166 } 167 return m_updatedFrom; 168 } 169 170 m_pystr.resize (m_pystr.size() - 1); 171 } 172 173 return m_updatedFrom = _push (ch); 174 } 175 176 unsigned CQuanpinSegmentor::pop () 177 { 178 if (m_pystr.empty()) 179 return m_updatedFrom = 0; 180 181 unsigned size = m_inputBuf.size (); 182 m_inputBuf.resize (size - 1); 183 m_pystr.resize (size - 1); 184 185 unsigned l = m_segs.back().m_len; 186 m_segs.pop_back (); 187 188 if (l == 1) 189 return m_updatedFrom = size - 1; 190 191 std::string new_pystr = m_pystr.substr (size-l); 192 m_pystr.resize (size-l); 193 194 m_updatedFrom = _updateWith(new_pystr); 195 196 return m_updatedFrom; 197 } 198 199 unsigned CQuanpinSegmentor::insertAt (unsigned idx, unsigned ch) 200 { 201 unsigned i, j; 202 locateSegment (idx, i, j); 203 204 m_inputBuf.insert (idx, 1, ch); 205 m_pystr.insert (idx, 1, ch); 206 207 std::string new_pystr = m_pystr.substr (i); 208 m_pystr.resize (i); 209 m_segs.erase (m_segs.begin()+j, m_segs.end()); 210 211 m_updatedFrom = _updateWith (new_pystr); 212 213 return m_updatedFrom; 214 } 215 216 unsigned CQuanpinSegmentor::deleteAt (unsigned idx, bool backward) 217 { 218 unsigned i, j; 219 if (!backward) idx += 1; 220 locateSegment (idx, i, j); 221 222 m_inputBuf.erase (idx, 1); 223 m_pystr.erase (idx, 1); 224 225 std::string new_pystr = m_pystr.substr (i); 226 m_pystr.resize (i); 227 m_segs.erase (m_segs.begin()+j, m_segs.end()); 228 229 m_updatedFrom = _updateWith (new_pystr); 230 231 return m_updatedFrom; 232 } 233 234 unsigned CQuanpinSegmentor::clear (unsigned from) 235 { 236 m_inputBuf.resize (from); 237 return _clear (from); 238 } 239 240 unsigned CQuanpinSegmentor::_clear (unsigned from) 241 { 242 unsigned i, j; 243 locateSegment (from, i, j); 244 245 246 std::string new_pystr = m_pystr.substr (i, from-i); 247 m_pystr.resize (i); 248 m_segs.erase (m_segs.begin()+j, m_segs.end()); 249 250 m_updatedFrom = _updateWith (new_pystr, from); 251 252 return m_updatedFrom; 253 } 254 255 void CQuanpinSegmentor::locateSegment (unsigned idx, unsigned &strIdx, unsigned &segIdx) 256 { 257 strIdx = segIdx = 0; 258 259 TSegmentVec::iterator it = m_segs.begin(); 260 TSegmentVec::iterator ite = m_segs.end(); 261 262 for (; it != ite; ++it) { 263 if (strIdx + (*it).m_len > idx) 264 break; 265 266 strIdx += (*it).m_len; 267 segIdx += 1; 268 } 269 } 270 271 unsigned CQuanpinSegmentor::_push (unsigned ch) 272 { 273 unsigned l, ret; 274 m_pystr.push_back (ch); 275 int v = m_pytrie.match_longest (m_pystr.rbegin(), m_pystr.rend(), l); 276 277 if (l == 0) { // not a valid syllable character, e.g., \', i, u, or A-Z 278 IPySegmentor::ESegmentType seg_type; 279 if (ch == '\'' && m_inputBuf.size() > 1) 280 seg_type = IPySegmentor::SYLLABLE_SEP; 281 else if (islower (ch)) 282 seg_type = IPySegmentor::INVALID; 283 else 284 seg_type = IPySegmentor::STRING; 285 286 ret = m_pystr.size () - 1; 287 m_segs.push_back (TSegment (ch, ret, 1, seg_type)); 288 } 289 290 else if (l == 1) { // possible a new segment 291 int last_idx = m_pystr.size () - 2; 292 if ( last_idx >= 0 && (m_pystr[last_idx] & 0x80)) { 293 // check if the last syllable character's highest bitmask is set 294 // e.g., feN, so [feN] + g -> [feng] 295 m_pystr[last_idx] &= 0x7f; 296 unsigned l; 297 int v = m_pytrie.match_longest (m_pystr.rbegin(), m_pystr.rend(), l); 298 299 TSegment &last_seg = m_segs.back(); 300 if (l == last_seg.m_len + 1) { 301 last_seg.m_len += 1; 302 last_seg.m_syllables[0] = v; 303 ret = m_pystr.size() - l; 304 goto RETURN; 305 } 306 307 // in case not extensible, change highest bitmask back 308 m_pystr[last_idx] |= 0x80; 309 } 310 311 // push the new 1-length segment 312 ret = m_pystr.size () - 1; 313 m_segs.push_back (TSegment (v, ret, 1)); 314 } 315 316 else if (l == m_segs.back().m_len + 1) { // current segment is extensible, e.g., [xia] + n -> [xian] 317 TSegment &last_seg = m_segs.back (); 318 last_seg.m_len += 1; 319 last_seg.m_syllables[0] = v; 320 ret = m_pystr.size() - l; 321 } 322 323 else { // other cases 324 TSegment &last_seg = m_segs.back (); 325 int i = 0, isum = last_seg.m_len + 1, lsum = l; 326 TSegmentVec new_segs(1, TSegment(v, m_pystr.size()-l, l)); 327 328 // e.g., [zh] [o] [n] + g -> [zhonG], 329 if (isum < lsum) { 330 unsigned end_idx = m_pystr.size () - 1; 331 m_pystr[end_idx] |= 0x80; 332 } 333 334 while (isum != lsum) { 335 if (lsum < isum) { // e.g., [die] + r -> [di] [er] 336 v = m_pytrie.match_longest (m_pystr.rbegin()+lsum, m_pystr.rend(), l); 337 TSegment &last_seg = new_segs.back (); 338 new_segs.push_back (TSegment(v, last_seg.m_start-l, l)); 339 lsum += l; 340 } else { 341 i += 1; 342 isum += (m_segs.rbegin() + i)->m_len; 343 } 344 } 345 346 m_segs.erase (m_segs.end()-(i+1), m_segs.end()); 347 std::copy (new_segs.rbegin(), new_segs.rend(), back_inserter (m_segs)); 348 ret = m_pystr.size()-lsum; 349 } 350 351 RETURN:; 352 353 if (m_pGetFuzzySyllablesOp && m_pGetFuzzySyllablesOp->isEnabled()) 354 if ( m_segs.back().m_type == SYLLABLE) 355 _addFuzzySyllables (m_segs.back ()); 356 357 return ret; 358 } 359 360 void CQuanpinSegmentor::_addFuzzySyllables (TSegment& seg) 361 { 362 assert (seg.m_type == SYLLABLE); 363 364 seg.m_syllables.resize (1); 365 366 CSyllables fuzzy_set = (*m_pGetFuzzySyllablesOp) (seg.m_syllables.front()); 367 CSyllables::const_iterator it = fuzzy_set.begin (); 368 CSyllables::const_iterator ite = fuzzy_set.end (); 369 370 for (; it != ite; ++it) 371 seg.m_syllables.push_back (*it); 372 } 373 374 unsigned CQuanpinSegmentor::_updateWith (const std::string& new_pystr, unsigned from) 375 { 376 unsigned minUpdatedFrom = from; 377 std::string::const_iterator it = new_pystr.begin(); 378 for (; it != new_pystr.end(); ++it) { 379 unsigned updatedFrom = _push(*it & 0x7f); 380 381 if (updatedFrom < minUpdatedFrom) minUpdatedFrom = updatedFrom; 382 } 383 return minUpdatedFrom; 384 } 385
