Home | History | Annotate | Download | only in python
      1 #!/usr/bin/python
      2 
      3 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      4 # 
      5 # Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved.
      6 # 
      7 # The contents of this file are subject to the terms of either the GNU Lesser
      8 # General Public License Version 2.1 only ("LGPL") or the Common Development and
      9 # Distribution License ("CDDL")(collectively, the "License"). You may not use this
     10 # file except in compliance with the License. You can obtain a copy of the CDDL at
     11 # http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     12 # http://www.opensource.org/licenses/lgpl-license.php. See the License for the 
     13 # specific language governing permissions and limitations under the License. When
     14 # distributing the software, include this License Header Notice in each file and
     15 # include the full text of the License in the License file as well as the
     16 # following notice:
     17 # 
     18 # NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     19 # (CDDL)
     20 # For Covered Software in this distribution, this License shall be governed by the
     21 # laws of the State of California (excluding conflict-of-law provisions).
     22 # Any litigation relating to this License shall be subject to the jurisdiction of
     23 # the Federal Courts of the Northern District of California and the state courts
     24 # of the State of California, with venue lying in Santa Clara County, California.
     25 # 
     26 # Contributor(s):
     27 # 
     28 # If you wish your version of this file to be governed by only the CDDL or only
     29 # the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     30 # include this software in this distribution under the [CDDL or LGPL Version 2.1]
     31 # license." If you don't indicate a single choice of license, a recipient has the
     32 # option to distribute your version of this file under either the CDDL or the LGPL
     33 # Version 2.1, or to extend the choice of license to its licensees as provided
     34 # above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     35 # Version 2 license, then the option applies only if the new code is made subject
     36 # to such option by the copyright holder. 
     37 
     38 from trie import DATrie, match_longest
     39 
     40 class IPySegmentor:
     41     class CSegment:
     42         def __init__ (self, value, length):
     43             self.value = value
     44             self.length = length
     45 
     46     def get_segments (self): pass
     47     def updated_from (self): pass
     48     def push (self, ch): pass
     49     def pop (self): pass
     50     def insert (self, ch, idx): pass
     51     def delete (self, idx, backward=True): pass
     52     def clear (self): pass
     53 
     54 class QuanPinSegmentor (IPySegmentor):
     55     def __init__ (self, fname):
     56         self.clear ()
     57         self.load (fname)
     58 
     59     def clear (self):
     60         self.__updated_from = 0
     61         self.__segs = []
     62         self.__pystr = []
     63 
     64     def load (self, fname):
     65         self.pytrie = DATrie ()
     66 
     67         try:
     68             self.pytrie.load (fname)
     69         except:
     70             from trie import Trie
     71             from pinyin_data import valid_syllables
     72     
     73             trie = Trie ()
     74             for s in valid_syllables:
     75                 trie.add (s[::-1], valid_syllables[s])
     76     
     77             self.pytrie.construct_from_trie (trie)
     78             self.pytrie.save (fname)
     79 
     80     def updated_from (self):
     81         return self.__updated_from
     82 
     83     def get_segments (self):
     84         return self.__segs[::-1]
     85 
     86     def __locate_indices (self, idx):
     87         'return the indices of pystr and segs'
     88         i = 0; j = -1
     89         while i+self.__segs[j].length < idx:
     90             i += self.__segs[j].length
     91             j -= 1
     92         j += len(self.__segs)
     93         i = len(self.__pystr) - i
     94         return i, j
     95 
     96     def insert (self, ch, idx):
     97         i, j = self.__locate_indices (idx)
     98         self.__updated_from = len(self.__pystr) - i
     99 
    100         if (idx == 0): self.__pystr.append(ch)
    101         else:          self.__pystr.insert(-idx, ch)
    102 
    103         newpystr = self.__pystr[:i+1]
    104         self.__pystr = self.__pystr[i+1:]
    105         self.__segs = self.__segs[j+1:]
    106 
    107         for ch in newpystr[::-1]:
    108             self.__push (ch.lower())
    109 
    110     def delete (self, idx, backward=True):
    111         if not backward: idx += 1
    112         i, j = self.__locate_indices (idx)
    113         self.__updated_from = len(self.__pystr) - i
    114 
    115         self.__pystr.pop(-idx)
    116 
    117         newpystr = self.__pystr[:i-1]
    118         self.__pystr = self.__pystr[i-1:]
    119         self.__segs = self.__segs[j+1:]
    120 
    121         for ch in newpystr[::-1]:
    122             self.__push (ch.lower())
    123 
    124     def __push (self, ch):
    125         self.__pystr.insert(0, ch)
    126         v, l = match_longest (self.pytrie, self.__pystr)
    127 
    128         if l == 0:
    129             self.__segs.insert(0, IPySegmentor.CSegment(0, 1))
    130             return
    131 
    132         if l == 1:
    133             if len(self.__pystr)>1 and self.__pystr[1].isupper():
    134                 self.__pystr[1] = self.__pystr[1].lower()
    135                 vv, ll = match_longest (self.pytrie, self.__pystr)
    136                 if ll == self.__segs[0].length + 1:
    137                     self.__segs[0].length += 1
    138                     self.__segs[0].value = vv
    139                     return
    140                 self.__pystr[1] = self.__pystr[1].upper()
    141 
    142             self.__segs.insert(0, IPySegmentor.CSegment(v, l))
    143             return
    144 
    145         if l == self.__segs[0].length + 1:
    146             self.__segs[0].length += 1
    147             self.__segs[0].value = v
    148             return
    149 
    150         i, isum = 0, self.__segs[0].length + 1
    151         lsum, new_segs = l, [IPySegmentor.CSegment(v, l)]
    152         if isum < lsum: self.__pystr[0] = self.__pystr[0].upper()
    153         while isum != lsum:
    154             if lsum < isum:
    155                 v, l = match_longest (self.pytrie, self.__pystr[lsum:])
    156                 new_segs.append(IPySegmentor.CSegment(v, l))
    157                 lsum += l
    158             if isum < lsum:
    159                 i += 1
    160                 isum += self.__segs[i].length
    161         self.__segs = new_segs + self.__segs[i+1:]
    162 
    163     def push (self, ch):
    164         self.__push (ch)
    165         self.__updated_from = len(self.__pystr) - self.__segs[0].length
    166 
    167     def __pop (self):
    168         if not self.__pystr:
    169             return
    170 
    171         i = self.__segs.pop(0).length
    172         if i == 1:
    173             self.__pystr.pop(0)
    174             return
    175 
    176         newpystr, self.__pystr = self.__pystr[1:i], self.__pystr[i:]
    177 
    178         for ch in newpystr[::-1]:
    179             self.__push (ch.lower())
    180 
    181     def pop (self):
    182         self.__updated_from = len(self.__pystr) - self.__segs[0].length
    183         self.__pop ()
    184 
    185 def test ():
    186     from pinyin_data import valid_syllables
    187     r_valid_syllables = dict (zip (valid_syllables.values(), valid_syllables.keys()))
    188 
    189     pyseg = QuanPinSegmentor ('pinyin_suffix.dat')
    190     pystr = "min'ganceshizhongdierlongenfengniao"
    191     print pystr
    192 
    193     def print_segs ():
    194         for s in pyseg.get_segments ():
    195             print "[0x%x:%d:%s]" % (s.value, s.length, r_valid_syllables[s.value] if s.value else '?'),
    196         print 'UPDATED_FROM:', pyseg.updated_from()
    197 
    198     for ch in pystr:
    199         pyseg.push (ch)
    200         print_segs ()
    201 
    202     pyseg.pop ()
    203     print_segs ()
    204 
    205     pyseg.pop ()
    206     print_segs ()
    207 
    208     pyseg.insert ('i', 2)
    209     print_segs ()
    210 
    211     pyseg.delete (2, True)
    212     print_segs ()
    213 
    214 if __name__ == "__main__":
    215     test ()
    216