1 /* 2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 * 4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 * 6 * The contents of this file are subject to the terms of either the GNU Lesser 7 * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 * file except in compliance with the License. You can obtain a copy of the CDDL at 10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 * specific language governing permissions and limitations under the License. When 13 * distributing the software, include this License Header Notice in each file and 14 * include the full text of the License in the License file as well as the 15 * following notice: 16 * 17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 * (CDDL) 19 * For Covered Software in this distribution, this License shall be governed by the 20 * laws of the State of California (excluding conflict-of-law provisions). 21 * Any litigation relating to this License shall be subject to the jurisdiction of 22 * the Federal Courts of the Northern District of California and the state courts 23 * of the State of California, with venue lying in Santa Clara County, California. 24 * 25 * Contributor(s): 26 * 27 * If you wish your version of this file to be governed by only the CDDL or only 28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 * license." If you don't indicate a single choice of license, a recipient has the 31 * option to distribute your version of this file under either the CDDL or the LGPL 32 * Version 2.1, or to extend the choice of license to its licensees as provided 33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 * Version 2 license, then the option applies only if the new code is made subject 35 * to such option by the copyright holder. 36 */ 37 38 #ifndef SUNPY_PINYIN_SEG_H 39 #define SUNPY_PINYIN_SEG_H 40 41 #include "portability.h" 42 #include "syllable.h" 43 #include "pinyin_data.h" 44 #include "datrie.h" 45 #include "utils.h" 46 47 #include <climits> 48 #include <vector> 49 50 struct IPySegmentor 51 { 52 enum ESegmentType 53 {SYLLABLE, SYLLABLE_SEP, INVALID, STRING}; 54 55 struct TSegment { 56 TSegment (ESegmentType type=SYLLABLE) : m_type(type) {} 57 TSegment (unsigned syllable, unsigned start, unsigned length, ESegmentType type=SYLLABLE) 58 : m_start(start), m_len(length), m_type(type) 59 {m_syllables.push_back (syllable);} 60 61 // if segment is a STRING type, m_syllables may contain the string buffer without the '\0' 62 // for multiple syllables in one seg, the non-0th elements are treated as fuzzy syllables 63 std::vector<unsigned> m_syllables; 64 unsigned m_start : 16; 65 unsigned m_len : 8; 66 ESegmentType m_type : 8; 67 }; 68 69 // it requires the segments are sorted by its m_start field 70 typedef std::vector<TSegment> TSegmentVec; 71 72 virtual ~IPySegmentor () {} 73 virtual TSegmentVec& getSegments () = 0; 74 virtual const wstring& getInputBuffer () const = 0; 75 virtual const char* getSylSeps () = 0; 76 77 virtual unsigned push (unsigned ch) = 0; 78 virtual unsigned pop () = 0; 79 virtual unsigned insertAt (unsigned idx, unsigned ch) = 0; 80 virtual unsigned deleteAt (unsigned idx, bool backward=true) = 0; 81 virtual unsigned clear (unsigned from=0) = 0; 82 83 virtual unsigned updatedFrom () = 0; 84 virtual void locateSegment (unsigned idx, unsigned &strIdx, unsigned &segIdx) = 0; 85 }; 86 87 class CGetFuzzySyllablesOp : private CNonCopyable 88 { 89 public: 90 typedef std::multimap<const std::string, std::string> CFuzzyMap; 91 92 CGetFuzzySyllablesOp () : m_bEnabled(false) {} 93 void initFuzzyMap (const char* const* fuzzyPairs, unsigned num); 94 95 void setEnable (bool value=true) {m_bEnabled = value;} 96 bool isEnabled () {return m_bEnabled;} 97 98 CSyllables operator () (TSyllable s); 99 100 private: 101 CFuzzyMap m_fuzzyMap; 102 bool m_bEnabled; 103 }; 104 105 class CGetCorrectionPairOp : private CNonCopyable 106 { 107 public: 108 typedef std::pair<std::string, std::string> CCorrectionPair; 109 typedef std::vector<CCorrectionPair> CCorrectionPairVec; 110 111 CGetCorrectionPairOp () : m_bEnabled(false) {m_correctionPairs.reserve(8);} 112 113 void setEnable (bool value=true) {m_bEnabled = value;} 114 bool isEnabled () {return m_bEnabled;} 115 116 void setCorrectionPairs (const char* const* pairs, unsigned num) 117 { 118 m_correctionPairs.clear (); 119 for (unsigned i=0; i<num; ++i) { 120 const char * k = pairs [i*2]; 121 const char * v = pairs [i*2+1]; 122 123 m_correctionPairs.push_back (std::pair<std::string, std::string> (k, v)); 124 } 125 } 126 127 const char * operator () (std::string& pystr, unsigned& matched_len); 128 129 private: 130 CCorrectionPairVec m_correctionPairs; 131 bool m_bEnabled; 132 }; 133 134 class CQuanpinSegmentor : public IPySegmentor 135 { 136 public: 137 CQuanpinSegmentor (); 138 139 virtual TSegmentVec& getSegments () {return m_segs;} 140 virtual const wstring& getInputBuffer () const {return m_inputBuf;} 141 virtual const char* getSylSeps () {return "'";} 142 143 virtual unsigned push (unsigned ch); 144 virtual unsigned pop (); 145 virtual unsigned insertAt (unsigned idx, unsigned ch); 146 virtual unsigned deleteAt (unsigned idx, bool backward=true); 147 virtual unsigned clear (unsigned from=0); 148 149 virtual unsigned updatedFrom () {return m_updatedFrom;} 150 virtual void locateSegment (unsigned idx, unsigned &strIdx, unsigned &segIdx); 151 152 bool load(const char * pyTrieFileName); 153 154 void setGetFuzzySyllablesOp (CGetFuzzySyllablesOp *op) {m_pGetFuzzySyllablesOp = op;} 155 void setGetCorrectionPairOp (CGetCorrectionPairOp *op) {m_pGetCorrectionPairOp = op;} 156 157 private: 158 inline unsigned _push (unsigned ch); 159 inline unsigned _clear (unsigned from); 160 inline void _addFuzzySyllables (TSegment &seg); 161 inline unsigned _updateWith (const std::string& new_pystr, unsigned from = UINT_MAX); 162 163 CGetFuzzySyllablesOp *m_pGetFuzzySyllablesOp; 164 CGetCorrectionPairOp *m_pGetCorrectionPairOp; 165 166 CDATrie<short> m_pytrie; 167 std::string m_pystr; 168 wstring m_inputBuf; 169 TSegmentVec m_segs; 170 171 unsigned m_updatedFrom; 172 }; 173 174 #endif /* SUNPY_PINYIN_SEG_H */ 175
