OpenGrok

Cross Reference: pinyin_seg.cpp
xref: /nv-g11n/inputmethod/sunpinyin2/src/pinyin/pinyin_seg.cpp
Home | History | Annotate | Line # | Download | only in pinyin
      1 /*
      2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3  *
      4  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5  *
      6  * The contents of this file are subject to the terms of either the GNU Lesser
      7  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12  * specific language governing permissions and limitations under the License. When
     13  * distributing the software, include this License Header Notice in each file and
     14  * include the full text of the License in the License file as well as the
     15  * following notice:
     16  *
     17  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18  * (CDDL)
     19  * For Covered Software in this distribution, this License shall be governed by the
     20  * laws of the State of California (excluding conflict-of-law provisions).
     21  * Any litigation relating to this License shall be subject to the jurisdiction of
     22  * the Federal Courts of the Northern District of California and the state courts
     23  * of the State of California, with venue lying in Santa Clara County, California.
     24  *
     25  * Contributor(s):
     26  *
     27  * If you wish your version of this file to be governed by only the CDDL or only
     28  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30  * license." If you don't indicate a single choice of license, a recipient has the
     31  * option to distribute your version of this file under either the CDDL or the LGPL
     32  * Version 2.1, or to extend the choice of license to its licensees as provided
     33  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34  * Version 2 license, then the option applies only if the new code is made subject
     35  * to such option by the copyright holder.
     36  */
     37 
     38 #include <cassert>
     39 #include <algorithm>
     40 #include "pinyin_seg.h"
     41 #include "quanpin_trie.h"
     42 
     43 void CGetFuzzySyllablesOp::initFuzzyMap (const char * const* fuzzyPairs, unsigned num)
     44 {
     45     m_fuzzyMap.clear();
     46 
     47     for (int c=0; c<num; ++c) {
     48         const char * i = fuzzyPairs [c*2];
     49         const char * j = fuzzyPairs [c*2+1];
     50 
     51         m_fuzzyMap.insert (std::pair<const std::string, std::string> (i, j));
     52         m_fuzzyMap.insert (std::pair<const std::string, std::string> (j, i));
     53     }
     54 }
     55 
     56 CSyllables CGetFuzzySyllablesOp::operator () (TSyllable s)
     57 {
     58     CSyllables ret;
     59     static char buf[128];
     60 
     61     if (m_fuzzyMap.empty()) {
     62         unsigned num;
     63         const char ** sys_fuzzy_pairs = CPinyinData::getFuzzyPairs (num);
     64         initFuzzyMap (sys_fuzzy_pairs, num);
     65     }
     66 
     67     const char *i, *f;
     68     CPinyinData::decodeSyllable (s, &i, &f);
     69 
     70     std::vector<const char *> iset;
     71     std::vector<const char *> fset;
     72 
     73     iset.push_back (i);
     74     fset.push_back (f);
     75 
     76     CFuzzyMap::const_iterator it;
     77     for (it = m_fuzzyMap.lower_bound(i); it != m_fuzzyMap.upper_bound(i); ++it)
     78         iset.push_back ((it->second).c_str());
     79 
     80     for (it = m_fuzzyMap.lower_bound(f); it != m_fuzzyMap.upper_bound(f); ++it)
     81         fset.push_back ((it->second).c_str());
     82 
     83     std::vector<const char *>::const_iterator iset_it = iset.begin();
     84     for (; iset_it != iset.end(); ++iset_it) {
     85         std::vector<const char *>::const_iterator fset_it = fset.begin();
     86         for (; fset_it != fset.end(); ++ fset_it) {
     87             snprintf (buf, sizeof(buf), "%s%s", *iset_it, *fset_it);
     88             TSyllable ts = CPinyinData::encodeSyllable (buf);
     89             if (ts && ts != s)
     90                 ret.push_back (ts);
     91         }
     92     }
     93 
     94     return ret;
     95 }
     96 
     97 const char * CGetCorrectionPairOp::operator () (std::string& pystr, unsigned& matched_len)
     98 {
     99     CCorrectionPairVec::iterator it  = m_correctionPairs.begin ();
    100     CCorrectionPairVec::iterator ite = m_correctionPairs.end ();
    101 
    102     for (; it != ite; ++it) {
    103         std::string& k = it->first;
    104         std::string& v = it->second;
    105         unsigned l = k.size ();
    106 
    107         if (pystr.size() >= l && !pystr.compare (pystr.size()-l, l, k)) {
    108             matched_len = l;
    109             return v.c_str();
    110         }
    111     }
    112 
    113     return NULL;
    114 }
    115 
    116 CQuanpinSegmentor::CQuanpinSegmentor ()
    117     : m_updatedFrom(0), m_pGetFuzzySyllablesOp(NULL), m_pGetCorrectionPairOp(NULL),
    118       m_pytrie(base, check, value, sizeof(base)/sizeof(*base))
    119 {
    120     m_segs.reserve (32);
    121 }
    122 
    123 bool CQuanpinSegmentor::load(const char * pyTrieFileName)
    124 {
    125     return m_pytrie.load (pyTrieFileName);
    126 }
    127 
    128 #ifdef DEBUG
    129 void print_pystr(const std::string pystr)
    130 {
    131     for (const char* c = pystr.c_str(); c != pystr.c_str() + pystr.length(); ++c)
    132     {
    133         printf("%c", *c & 0x7f);
    134     }
    135     printf("<\n");
    136 }
    137 #endif
    138 
    139 unsigned CQuanpinSegmentor::push (unsigned ch)
    140 {
    141     m_inputBuf.push_back (ch);
    142 
    143     if (m_pGetCorrectionPairOp && m_pGetCorrectionPairOp->isEnabled()) {
    144         m_pystr.push_back (ch);
    145         unsigned l = 0;
    146         const char * v = (*m_pGetCorrectionPairOp) (m_pystr, l);
    147 
    148         if (v) {
    149             unsigned orig_size = m_segs.size();
    150             _clear (m_pystr.size() - l);
    151             m_updatedFrom = _updateWith (v);
    152 
    153             if (m_segs.size () >= orig_size) {
    154                 // does not get better segmentation, revert to original
    155                 _clear (m_pystr.size() - strlen(v));
    156                 std::string new_pystr;
    157                 std::copy(m_inputBuf.end() - l, m_inputBuf.end(), back_inserter(new_pystr));
    158                 m_updatedFrom = _updateWith (new_pystr);
    159             } else {
    160                 if (l != strlen(v)) {
    161                     // e.g. uen -> un
    162                     m_segs.back().m_len += l - strlen(v);
    163                     m_pystr.resize(m_inputBuf.length());
    164                 }
    165                 std::copy(m_inputBuf.end() - l, m_inputBuf.end(), m_pystr.end() - l);
    166             }
    167             return m_updatedFrom;
    168         }
    169 
    170         m_pystr.resize (m_pystr.size() - 1);
    171     }
    172 
    173     return m_updatedFrom = _push (ch);
    174 }
    175 
    176 unsigned CQuanpinSegmentor::pop ()
    177 {
    178     if (m_pystr.empty())
    179         return m_updatedFrom = 0;
    180 
    181     unsigned size = m_inputBuf.size ();
    182     m_inputBuf.resize (size - 1);
    183     m_pystr.resize (size - 1);
    184 
    185     unsigned l = m_segs.back().m_len;
    186     m_segs.pop_back ();
    187 
    188     if (l == 1)
    189         return m_updatedFrom = size - 1;
    190 
    191     std::string new_pystr = m_pystr.substr (size-l);
    192     m_pystr.resize (size-l);
    193 
    194     m_updatedFrom = _updateWith(new_pystr);
    195 
    196     return m_updatedFrom;
    197 }
    198 
    199 unsigned CQuanpinSegmentor::insertAt (unsigned idx, unsigned ch)
    200 {
    201     unsigned i, j;
    202     locateSegment (idx, i, j);
    203 
    204     m_inputBuf.insert (idx, 1, ch);
    205     m_pystr.insert (idx, 1, ch);
    206 
    207     std::string new_pystr = m_pystr.substr (i);
    208     m_pystr.resize (i);
    209     m_segs.erase (m_segs.begin()+j, m_segs.end());
    210 
    211     m_updatedFrom = _updateWith (new_pystr);
    212 
    213     return m_updatedFrom;
    214 }
    215 
    216 unsigned CQuanpinSegmentor::deleteAt (unsigned idx, bool backward)
    217 {
    218     unsigned i, j;
    219     if (!backward) idx += 1;
    220     locateSegment (idx, i, j);
    221 
    222     m_inputBuf.erase (idx, 1);
    223     m_pystr.erase (idx, 1);
    224 
    225     std::string new_pystr = m_pystr.substr (i);
    226     m_pystr.resize (i);
    227     m_segs.erase (m_segs.begin()+j, m_segs.end());
    228 
    229     m_updatedFrom = _updateWith (new_pystr);
    230 
    231     return m_updatedFrom;
    232 }
    233 
    234 unsigned CQuanpinSegmentor::clear (unsigned from)
    235 {
    236     m_inputBuf.resize (from);
    237     return _clear (from);
    238 }
    239 
    240 unsigned CQuanpinSegmentor::_clear (unsigned from)
    241 {
    242     unsigned i, j;
    243     locateSegment (from, i, j);
    244 
    245 
    246     std::string new_pystr = m_pystr.substr (i, from-i);
    247     m_pystr.resize (i);
    248     m_segs.erase (m_segs.begin()+j, m_segs.end());
    249 
    250     m_updatedFrom = _updateWith (new_pystr, from);
    251 
    252     return m_updatedFrom;
    253 }
    254 
    255 void CQuanpinSegmentor::locateSegment (unsigned idx, unsigned &strIdx, unsigned &segIdx)
    256 {
    257     strIdx = segIdx = 0;
    258 
    259     TSegmentVec::iterator it  = m_segs.begin();
    260     TSegmentVec::iterator ite = m_segs.end();
    261 
    262     for (; it != ite; ++it) {
    263         if (strIdx + (*it).m_len > idx)
    264            break;
    265 
    266         strIdx += (*it).m_len;
    267         segIdx += 1;
    268     }
    269 }
    270 
    271 unsigned CQuanpinSegmentor::_push (unsigned ch)
    272 {
    273     unsigned l, ret;
    274     m_pystr.push_back (ch);
    275     int v = m_pytrie.match_longest (m_pystr.rbegin(), m_pystr.rend(), l);
    276 
    277     if (l == 0) { // not a valid syllable character, e.g., \', i, u, or A-Z
    278         IPySegmentor::ESegmentType seg_type;
    279         if (ch == '\'' && m_inputBuf.size() > 1)
    280             seg_type = IPySegmentor::SYLLABLE_SEP;
    281         else if (islower (ch))
    282             seg_type = IPySegmentor::INVALID;
    283         else
    284             seg_type = IPySegmentor::STRING;
    285 
    286         ret = m_pystr.size () - 1;
    287         m_segs.push_back (TSegment (ch, ret, 1, seg_type));
    288     }
    289 
    290     else if (l == 1) { // possible a new segment
    291         int last_idx = m_pystr.size () - 2;
    292         if ( last_idx >= 0 && (m_pystr[last_idx] & 0x80)) {
    293             // check if the last syllable character's highest bitmask is set
    294             // e.g., feN, so [feN] + g -> [feng]
    295             m_pystr[last_idx] &= 0x7f;
    296             unsigned l;
    297             int v = m_pytrie.match_longest (m_pystr.rbegin(), m_pystr.rend(), l);
    298 
    299             TSegment &last_seg = m_segs.back();
    300             if (l == last_seg.m_len + 1) {
    301                 last_seg.m_len += 1;
    302                 last_seg.m_syllables[0] = v;
    303                 ret = m_pystr.size() - l;
    304                 goto RETURN;
    305             }
    306 
    307             // in case not extensible, change highest bitmask back
    308             m_pystr[last_idx] |= 0x80;
    309         }
    310 
    311         // push the new 1-length segment
    312         ret = m_pystr.size () - 1;
    313         m_segs.push_back (TSegment (v, ret, 1));
    314     }
    315 
    316     else if (l == m_segs.back().m_len + 1) { // current segment is extensible, e.g., [xia] + n -> [xian]
    317         TSegment &last_seg = m_segs.back ();
    318         last_seg.m_len += 1;
    319         last_seg.m_syllables[0] = v;
    320         ret = m_pystr.size() - l;
    321     }
    322 
    323     else { // other cases
    324         TSegment &last_seg = m_segs.back ();
    325         int i = 0, isum = last_seg.m_len + 1, lsum = l;
    326         TSegmentVec new_segs(1, TSegment(v, m_pystr.size()-l, l));
    327 
    328         // e.g., [zh] [o] [n] + g -> [zhonG],
    329         if (isum < lsum) {
    330             unsigned end_idx = m_pystr.size () - 1;
    331             m_pystr[end_idx] |= 0x80;
    332         }
    333 
    334         while (isum != lsum) {
    335             if (lsum < isum) { // e.g., [die] + r -> [di] [er]
    336                 v = m_pytrie.match_longest (m_pystr.rbegin()+lsum, m_pystr.rend(), l);
    337                 TSegment &last_seg = new_segs.back ();
    338                 new_segs.push_back (TSegment(v, last_seg.m_start-l, l));
    339                 lsum += l;
    340             } else {
    341                 i += 1;
    342                 isum += (m_segs.rbegin() + i)->m_len;
    343             }
    344         }
    345 
    346         m_segs.erase (m_segs.end()-(i+1), m_segs.end());
    347         std::copy (new_segs.rbegin(), new_segs.rend(), back_inserter (m_segs));
    348         ret = m_pystr.size()-lsum;
    349     }
    350 
    351 RETURN:;
    352 
    353     if (m_pGetFuzzySyllablesOp && m_pGetFuzzySyllablesOp->isEnabled())
    354         if ( m_segs.back().m_type == SYLLABLE)
    355             _addFuzzySyllables (m_segs.back ());
    356 
    357     return ret;
    358 }
    359 
    360 void CQuanpinSegmentor::_addFuzzySyllables (TSegment& seg)
    361 {
    362     assert (seg.m_type == SYLLABLE);
    363 
    364     seg.m_syllables.resize (1);
    365 
    366     CSyllables fuzzy_set = (*m_pGetFuzzySyllablesOp) (seg.m_syllables.front());
    367     CSyllables::const_iterator it  = fuzzy_set.begin ();
    368     CSyllables::const_iterator ite = fuzzy_set.end ();
    369 
    370     for (; it != ite; ++it)
    371         seg.m_syllables.push_back (*it);
    372 }
    373 
    374 unsigned CQuanpinSegmentor::_updateWith (const std::string& new_pystr, unsigned from)
    375 {
    376     unsigned minUpdatedFrom = from;
    377     std::string::const_iterator it = new_pystr.begin();
    378     for (; it != new_pystr.end(); ++it) {
    379         unsigned updatedFrom = _push(*it & 0x7f);
    380 
    381         if (updatedFrom < minUpdatedFrom) minUpdatedFrom = updatedFrom;
    382     }
    383     return minUpdatedFrom;
    384 }
    385