OpenGrok

Cross Reference: pinyin_seg.h
xref: /nv-g11n/inputmethod/sunpinyin2/src/pinyin/pinyin_seg.h
Home | History | Annotate | Line # | Download | only in pinyin
      1 /*
      2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3  *
      4  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5  *
      6  * The contents of this file are subject to the terms of either the GNU Lesser
      7  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12  * specific language governing permissions and limitations under the License. When
     13  * distributing the software, include this License Header Notice in each file and
     14  * include the full text of the License in the License file as well as the
     15  * following notice:
     16  *
     17  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18  * (CDDL)
     19  * For Covered Software in this distribution, this License shall be governed by the
     20  * laws of the State of California (excluding conflict-of-law provisions).
     21  * Any litigation relating to this License shall be subject to the jurisdiction of
     22  * the Federal Courts of the Northern District of California and the state courts
     23  * of the State of California, with venue lying in Santa Clara County, California.
     24  *
     25  * Contributor(s):
     26  *
     27  * If you wish your version of this file to be governed by only the CDDL or only
     28  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30  * license." If you don't indicate a single choice of license, a recipient has the
     31  * option to distribute your version of this file under either the CDDL or the LGPL
     32  * Version 2.1, or to extend the choice of license to its licensees as provided
     33  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34  * Version 2 license, then the option applies only if the new code is made subject
     35  * to such option by the copyright holder.
     36  */
     37 
     38 #ifndef SUNPY_PINYIN_SEG_H
     39 #define SUNPY_PINYIN_SEG_H
     40 
     41 #include "portability.h"
     42 #include "syllable.h"
     43 #include "pinyin_data.h"
     44 #include "datrie.h"
     45 #include "utils.h"
     46 
     47 #include <climits>
     48 #include <vector>
     49 
// Abstract interface of a pinyin segmentor: it exposes an input buffer, the
// list of segments derived from it, and editing operations on that input.
struct IPySegmentor
{
    // Kind of content a TSegment carries.
    enum ESegmentType
        {SYLLABLE, SYLLABLE_SEP, INVALID, STRING};

    // One segment: a run of m_len units beginning at m_start, plus the
    // syllable value(s) attached to it.
    // NOTE(review): m_start/m_len presumably index the input buffer -- confirm
    // against the implementation.
    struct TSegment {
        // Creates an empty segment of the given type. m_start and m_len are
        // explicitly zeroed so that a default-constructed segment never
        // carries indeterminate bit-field values.
        TSegment (ESegmentType type=SYLLABLE)
            : m_start(0), m_len(0), m_type(type) {}

        // Creates a segment holding a single syllable. Note that start is
        // stored in 16 bits and length in 8 bits; larger values are truncated.
        TSegment (unsigned syllable, unsigned start, unsigned length, ESegmentType type=SYLLABLE)
            : m_start(start), m_len(length), m_type(type)
            {m_syllables.push_back (syllable);}

        // If the segment is of STRING type, m_syllables may contain the string
        // buffer without the terminating '\0'.
        // When one segment holds several syllables, every element other than
        // the 0th is treated as a fuzzy syllable.
        std::vector<unsigned>   m_syllables;
        unsigned                m_start        : 16;
        unsigned                m_len          : 8;
        ESegmentType            m_type         : 8;
    };

    // The segments are required to be sorted by their m_start field.
    typedef std::vector<TSegment>  TSegmentVec;

    virtual ~IPySegmentor () {}

    // Accessors for the current segments, the input buffer and the
    // syllable-separator characters.
    virtual TSegmentVec& getSegments () = 0;
    virtual const wstring& getInputBuffer () const = 0;
    virtual const char* getSylSeps () = 0;

    // Editing operations on the input.
    // NOTE(review): the meaning of the unsigned return values is not visible
    // in this header -- confirm with the implementations.
    virtual unsigned push (unsigned ch) = 0;
    virtual unsigned pop () = 0;
    virtual unsigned insertAt (unsigned idx, unsigned ch) = 0;
    virtual unsigned deleteAt (unsigned idx, bool backward=true) = 0;
    virtual unsigned clear (unsigned from=0) = 0;

    virtual unsigned updatedFrom () = 0;

    // Reports results through the strIdx and segIdx output references.
    virtual void locateSegment (unsigned idx, unsigned &strIdx, unsigned &segIdx) = 0;
};
     86 
     87 class CGetFuzzySyllablesOp : private CNonCopyable
     88 {
     89 public:
     90     typedef std::multimap<const std::string, std::string> CFuzzyMap;
     91 
     92     CGetFuzzySyllablesOp () : m_bEnabled(false) {}
     93     void initFuzzyMap (const char* const* fuzzyPairs, unsigned num);
     94 
     95     void setEnable (bool value=true) {m_bEnabled = value;}
     96     bool isEnabled () {return m_bEnabled;}
     97 
     98     CSyllables operator () (TSyllable s);
     99 
    100 private:
    101     CFuzzyMap   m_fuzzyMap;
    102     bool        m_bEnabled;
    103 };
    104 
    105 class CGetCorrectionPairOp : private CNonCopyable
    106 {
    107 public:
    108     typedef std::pair<std::string, std::string> CCorrectionPair;
    109     typedef std::vector<CCorrectionPair> CCorrectionPairVec;
    110 
    111     CGetCorrectionPairOp () : m_bEnabled(false) {m_correctionPairs.reserve(8);}
    112 
    113     void setEnable (bool value=true) {m_bEnabled = value;}
    114     bool isEnabled () {return m_bEnabled;}
    115 
    116     void setCorrectionPairs (const char* const* pairs, unsigned num)
    117     {
    118         m_correctionPairs.clear ();
    119         for (unsigned i=0; i<num; ++i) {
    120             const char * k = pairs [i*2];
    121             const char * v = pairs [i*2+1];
    122 
    123             m_correctionPairs.push_back (std::pair<std::string, std::string> (k, v));
    124         }
    125     }
    126 
    127     const char * operator () (std::string& pystr, unsigned& matched_len);
    128 
    129 private:
    130     CCorrectionPairVec  m_correctionPairs;
    131     bool                m_bEnabled;
    132 };
    133 
    134 class CQuanpinSegmentor : public IPySegmentor
    135 {
    136 public:
    137     CQuanpinSegmentor ();
    138 
    139     virtual TSegmentVec& getSegments () {return m_segs;}
    140     virtual const wstring& getInputBuffer () const {return m_inputBuf;}
    141     virtual const char* getSylSeps () {return "'";}
    142 
    143     virtual unsigned push (unsigned ch);
    144     virtual unsigned pop ();
    145     virtual unsigned insertAt (unsigned idx, unsigned ch);
    146     virtual unsigned deleteAt (unsigned idx, bool backward=true);
    147     virtual unsigned clear (unsigned from=0);
    148 
    149     virtual unsigned updatedFrom () {return m_updatedFrom;}
    150     virtual void locateSegment (unsigned idx, unsigned &strIdx, unsigned &segIdx);
    151 
    152     bool load(const char * pyTrieFileName);
    153 
    154     void setGetFuzzySyllablesOp (CGetFuzzySyllablesOp *op) {m_pGetFuzzySyllablesOp = op;}
    155     void setGetCorrectionPairOp (CGetCorrectionPairOp *op) {m_pGetCorrectionPairOp = op;}
    156 
    157 private:
    158     inline unsigned _push  (unsigned ch);
    159     inline unsigned _clear (unsigned from);
    160     inline void _addFuzzySyllables (TSegment &seg);
    161     inline unsigned _updateWith (const std::string& new_pystr, unsigned from = UINT_MAX);
    162 
    163     CGetFuzzySyllablesOp       *m_pGetFuzzySyllablesOp;
    164     CGetCorrectionPairOp       *m_pGetCorrectionPairOp;
    165 
    166     CDATrie<short>              m_pytrie;
    167     std::string                 m_pystr;
    168     wstring                     m_inputBuf;
    169     TSegmentVec                 m_segs;
    170 
    171     unsigned                    m_updatedFrom;
    172 };
    173 
    174 #endif /* SUNPY_PINYIN_SEG_H */
    175