#!/usr/bin/python

# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
#
# Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved.
#
# The contents of this file are subject to the terms of either the GNU Lesser
# General Public License Version 2.1 only ("LGPL") or the Common Development and
# Distribution License ("CDDL")(collectively, the "License"). You may not use this
# file except in compliance with the License. You can obtain a copy of the CDDL at
# http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
# http://www.opensource.org/licenses/lgpl-license.php. See the License for the
# specific language governing permissions and limitations under the License. When
# distributing the software, include this License Header Notice in each file and
# include the full text of the License in the License file as well as the
# following notice:
#
# NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
# (CDDL)
# For Covered Software in this distribution, this License shall be governed by the
# laws of the State of California (excluding conflict-of-law provisions).
# Any litigation relating to this License shall be subject to the jurisdiction of
# the Federal Courts of the Northern District of California and the state courts
# of the State of California, with venue lying in Santa Clara County, California.
#
# Contributor(s):
#
# If you wish your version of this file to be governed by only the CDDL or only
# the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
# include this software in this distribution under the [CDDL or LGPL Version 2.1]
# license." If you don't indicate a single choice of license, a recipient has the
# option to distribute your version of this file under either the CDDL or the LGPL
# Version 2.1, or to extend the choice of license to its licensees as provided
# above.
# However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
# Version 2 license, then the option applies only if the new code is made subject
# to such option by the copyright holder.

from trie import DATrie, match_longest


class IPySegmentor:
    """Interface-style base class for incremental pinyin segmentors.

    Every method below is an empty placeholder; QuanPinSegmentor overrides
    all of them.
    """

    class CSegment:
        """One segment of the input: a value plus the number of characters
        the segment covers."""

        def __init__(self, value, length):
            self.value = value
            self.length = length

    def get_segments(self):
        pass

    def updated_from(self):
        pass

    def push(self, ch):
        pass

    def pop(self):
        pass

    def insert(self, ch, idx):
        pass

    def delete(self, idx, backward=True):
        pass

    def clear(self):
        pass


class QuanPinSegmentor(IPySegmentor):
    """Incremental segmentor driven by a trie of reversed syllables.

    Internal state is stored newest-first: ``__pystr[0]`` is the most
    recently pushed character and ``__segs[0]`` the most recent segment.
    ``get_segments`` reverses the list so callers see input order.
    """

    def __init__(self, fname):
        self.clear()
        self.load(fname)

    def clear(self):
        """Reset the buffer, the segment list and the update marker."""
        self.__updated_from = 0
        self.__segs = []
        self.__pystr = []

    def load(self, fname):
        """Load the trie from ``fname``; rebuild and save it on failure.

        The fallback builds a trie keyed by each syllable reversed
        (``s[::-1]``), which is why the character buffer is kept
        newest-first.
        """
        self.pytrie = DATrie()

        try:
            self.pytrie.load(fname)
        except Exception:
            # Deliberately broad: any load failure triggers a rebuild.
            # ``Exception`` (rather than a bare ``except``) lets
            # KeyboardInterrupt / SystemExit propagate.
            from trie import Trie
            from pinyin_data import valid_syllables

            trie = Trie()
            for s in valid_syllables:
                trie.add(s[::-1], valid_syllables[s])

            self.pytrie.construct_from_trie(trie)
            self.pytrie.save(fname)

    def updated_from(self):
        """Return the offset recorded by the last push/pop/insert/delete.

        ``push`` and ``pop`` set it to the number of characters that
        precede the newest segment.
        """
        return self.__updated_from

    def get_segments(self):
        """Return the segments in input order (oldest first)."""
        return self.__segs[::-1]

    def __locate_indices(self, idx):
        """Map a position in the input to indices into pystr and segs.

        Walks the segments from the oldest one, skipping each segment that
        ends before ``idx``. Returns ``(i, j)``: ``i`` is ``len(pystr)``
        minus the characters skipped, ``j`` is the index in ``__segs`` of
        the segment where the walk stopped.

        Raises IndexError when there are no segments or ``idx`` lies
        beyond the buffered input.
        """
        i = 0
        j = -1
        while i + self.__segs[j].length < idx:
            i += self.__segs[j].length
            j -= 1
        j += len(self.__segs)
        i = len(self.__pystr) - i
        return i, j

    def insert(self, ch, idx):
        """Insert ``ch`` at input position ``idx`` and re-segment from the
        affected segment onwards."""
        i, j = self.__locate_indices(idx)
        self.__updated_from = len(self.__pystr) - i

        # The buffer is newest-first, so input position 0 is the END of the
        # list; ``insert(-0, ch)`` would hit the front instead.
        if idx == 0:
            self.__pystr.append(ch)
        else:
            self.__pystr.insert(-idx, ch)

        # Detach everything from the affected segment to the newest
        # character, then feed it back through __push in input order.
        newpystr = self.__pystr[:i + 1]
        self.__pystr = self.__pystr[i + 1:]
        self.__segs = self.__segs[j + 1:]

        for c in newpystr[::-1]:
            self.__push(c.lower())

    def delete(self, idx, backward=True):
        """Delete one character next to input position ``idx``.

        With ``backward`` true the character just before ``idx`` is
        removed, otherwise the character at ``idx``. The input is then
        re-segmented from the affected segment onwards.
        """
        if not backward:
            idx += 1
        if idx == 0:
            # Nothing precedes position 0. Without this guard ``pop(-0)``
            # would silently remove the newest character instead.
            return

        i, j = self.__locate_indices(idx)
        self.__updated_from = len(self.__pystr) - i

        self.__pystr.pop(-idx)

        newpystr = self.__pystr[:i - 1]
        self.__pystr = self.__pystr[i - 1:]
        self.__segs = self.__segs[j + 1:]

        for c in newpystr[::-1]:
            self.__push(c.lower())

    def __push(self, ch):
        """Prepend ``ch`` to the buffer and update the segment list."""
        self.__pystr.insert(0, ch)
        value, length = match_longest(self.pytrie, self.__pystr)

        if length == 0:
            # No match: a one-character segment with value 0.
            self.__segs.insert(0, IPySegmentor.CSegment(0, 1))
            return

        if length == 1:
            # Upper case serves as an in-band marker on buffered characters
            # (set further down). Retry the match with the previous
            # character lowered, and restore the marker if that does not
            # extend the newest segment by exactly one character.
            if len(self.__pystr) > 1 and self.__pystr[1].isupper():
                self.__pystr[1] = self.__pystr[1].lower()
                vv, ll = match_longest(self.pytrie, self.__pystr)
                if ll == self.__segs[0].length + 1:
                    self.__segs[0].length += 1
                    self.__segs[0].value = vv
                    return
                self.__pystr[1] = self.__pystr[1].upper()

            self.__segs.insert(0, IPySegmentor.CSegment(value, length))
            return

        if length == self.__segs[0].length + 1:
            # The new character simply extends the newest segment.
            self.__segs[0].length += 1
            self.__segs[0].value = value
            return

        # The new match does not line up with the old newest segment.
        # ``isum`` counts characters covered by the old segments consumed so
        # far (plus the new character), ``lsum`` those covered by the new
        # segments. Extend whichever side is behind until both agree.
        i, isum = 0, self.__segs[0].length + 1
        lsum, new_segs = length, [IPySegmentor.CSegment(value, length)]
        if isum < lsum:
            self.__pystr[0] = self.__pystr[0].upper()
        while isum != lsum:
            if lsum < isum:
                value, length = match_longest(self.pytrie, self.__pystr[lsum:])
                if length == 0:
                    # Same treatment as the no-match case above. A
                    # zero-length segment would never advance ``lsum`` and
                    # the loop would not terminate.
                    value, length = 0, 1
                new_segs.append(IPySegmentor.CSegment(value, length))
                lsum += length
            if isum < lsum:
                i += 1
                isum += self.__segs[i].length
        self.__segs = new_segs + self.__segs[i + 1:]

    def push(self, ch):
        """Append ``ch`` to the input and record where the newest segment
        starts."""
        self.__push(ch)
        self.__updated_from = len(self.__pystr) - self.__segs[0].length

    def __pop(self):
        """Drop the newest character and re-segment the rest of its
        segment."""
        if not self.__pystr:
            return

        i = self.__segs.pop(0).length
        if i == 1:
            self.__pystr.pop(0)
            return

        # Skip index 0 (the character being removed) and push the remaining
        # characters of that segment back in input order.
        newpystr, self.__pystr = self.__pystr[1:i], self.__pystr[i:]

        for c in newpystr[::-1]:
            self.__push(c.lower())

    def pop(self):
        """Remove the most recently pushed character.

        Popping from an empty segmentor is a no-op.
        """
        if not self.__segs:
            return
        self.__updated_from = len(self.__pystr) - self.__segs[0].length
        self.__pop()


def test():
    """Manual smoke test: segment a sample string and print each step.

    Uses single-argument ``print(...)`` so the output is identical under
    Python 2 and the function also parses under Python 3.
    """
    from pinyin_data import valid_syllables
    r_valid_syllables = dict(zip(valid_syllables.values(), valid_syllables.keys()))

    pyseg = QuanPinSegmentor('pinyin_suffix.dat')
    pystr = "min'ganceshizhongdierlongenfengniao"
    print(pystr)

    def print_segs():
        parts = ["[0x%x:%d:%s]" % (s.value, s.length, r_valid_syllables[s.value] if s.value else '?')
                 for s in pyseg.get_segments()]
        parts.append('UPDATED_FROM: %s' % pyseg.updated_from())
        print(' '.join(parts))

    for ch in pystr:
        pyseg.push(ch)
        print_segs()

    pyseg.pop()
    print_segs()

    pyseg.pop()
    print_segs()

    pyseg.insert('i', 2)
    print_segs()

    pyseg.delete(2, True)
    print_segs()


if __name__ == "__main__":
    test()