#!/usr/bin/python
# -*- coding: UTF-8 -*-

# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
#
# Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
#
# The contents of this file are subject to the terms of either the GNU Lesser
# General Public License Version 2.1 only ("LGPL") or the Common Development and
# Distribution License ("CDDL")(collectively, the "License"). You may not use this
# file except in compliance with the License. You can obtain a copy of the CDDL at
# http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
# http://www.opensource.org/licenses/lgpl-license.php. See the License for the
# specific language governing permissions and limitations under the License. When
# distributing the software, include this License Header Notice in each file and
# include the full text of the License in the License file as well as the
# following notice:
#
# NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
# (CDDL)
# For Covered Software in this distribution, this License shall be governed by the
# laws of the State of California (excluding conflict-of-law provisions).
# Any litigation relating to this License shall be subject to the jurisdiction of
# the Federal Courts of the Northern District of California and the state courts
# of the State of California, with venue lying in Santa Clara County, California.
#
# Contributor(s):
#
# If you wish your version of this file to be governed by only the CDDL or only
# the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
# include this software in this distribution under the [CDDL or LGPL Version 2.1]
# license."
# If you don't indicate a single choice of license, a recipient has the
# option to distribute your version of this file under either the CDDL or the LGPL
# Version 2.1, or to extend the choice of license to its licensees as provided
# above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
# Version 2 license, then the option applies only if the new code is made subject
# to such option by the copyright holder.

import os
import mmap
import struct
import heapq
import tempfile


class NGram(object):
    """An n-gram key paired with its frequency.

    Instances are ordered, compared and hashed by ``key`` only; ``freq`` never
    takes part in a comparison.
    """

    key = ()
    freq = 0

    def __init__(self, key, freq):
        self.key = key
        self.freq = freq

    def __cmp__(self, other):
        """Return -1, 0 or 1 according to how ``key`` compares to ``other.key``.

        Python 3 never calls this implicitly; it is kept for code that calls
        it directly. The rich comparison methods below carry the ordering.
        """
        return (self.key > other.key) - (self.key < other.key)

    def __eq__(self, other):
        if not isinstance(other, NGram):
            return NotImplemented
        return self.key == other.key

    def __ne__(self, other):
        if not isinstance(other, NGram):
            return NotImplemented
        return self.key != other.key

    def __lt__(self, other):
        if not isinstance(other, NGram):
            return NotImplemented
        return self.key < other.key

    def __le__(self, other):
        if not isinstance(other, NGram):
            return NotImplemented
        return self.key <= other.key

    def __gt__(self, other):
        if not isinstance(other, NGram):
            return NotImplemented
        return self.key > other.key

    def __ge__(self, other):
        if not isinstance(other, NGram):
            return NotImplemented
        return self.key >= other.key

    def __hash__(self):
        # Must agree with __eq__, which looks at the key only.
        return hash(self.key)

    def __str__(self):
        return "ngram: " + str(self.key) + " freq: " + str(self.freq)


def read_ch_sentences(file, openers=u"", closers=u"", terminators=u""):
    """Yield sentences from an iterable of text lines, whitespace removed.

    A sentence ends at a character from ``terminators`` seen while no bracket
    from ``openers`` is still waiting for its ``closers`` counterpart.  At the
    start of each new line, pending text is also emitted when the line begins
    with whitespace or when the pending text is at most 40 characters long.

    NOTE(review): the three character sets are empty string literals in the
    copy of this file that was reviewed, which means no character ever matches
    and only the line-based rule splits text.  They look like non-ASCII
    punctuation that was lost in transit -- restore the defaults from the
    upstream file.  Until then callers can pass the sets explicitly.

    :param file: iterable yielding lines of text.
    :param openers: characters that increase the nesting depth.
    :param closers: characters that decrease the nesting depth.
    :param terminators: characters that end a sentence at nesting depth 0.
    """
    nesting = 0
    pending = []  # one entry per kept character, so len() is the text length

    for line in file:
        # line[:1] instead of line[0]: an empty line must not raise IndexError.
        if pending and (line[:1].isspace() or len(pending) <= 40):
            yield "".join(pending)
            pending, nesting = [], 0

        for ch in line:
            if ch.isspace():
                continue

            if ch in openers:
                nesting += 1
            elif ch in closers:
                nesting -= 1

            if ch in terminators and nesting == 0:
                # A terminator with nothing before it is dropped.
                if pending:
                    yield "".join(pending) + ch
                pending, nesting = [], 0
            else:
                pending.append(ch)

    if pending:
        yield "".join(pending)


def mergesort(iters):
    """Lazily merge several individually sorted iterables into one sorted stream.

    Equal values are emitted in the order of the inputs they came from, so the
    inputs themselves are never compared with each other.

    :param iters: iterable of sorted iterables.
    """
    for value in heapq.merge(*iters):
        yield value


def read_ngrams(fname, n):
    """Yield the ``NGram`` records stored in the binary file ``fname``.

    Every record is ``n + 1`` integers of 4 bytes each in native byte order:
    ``n`` key ids followed by the frequency.  An empty file yields nothing and
    a trailing partial record is ignored.

    :param fname: path of the n-gram file.
    :param n: number of ids in each key.
    """
    # '=' selects the standard 4-byte size for 'l'.  The native size is 8 on
    # LP64 platforms, which does not match the 4-byte fields read here.
    record = struct.Struct('=%dl' % (n + 1))

    # mmap refuses to map an empty file, so handle tiny files up front.
    if os.path.getsize(fname) < record.size:
        return

    with open(fname, "rb") as ngram_file:
        mem = mmap.mmap(ngram_file.fileno(), 0, access=mmap.ACCESS_READ)
        try:
            while True:
                chunk = mem.read(record.size)
                if len(chunk) < record.size:
                    break
                data = record.unpack(chunk)
                yield NGram(data[:n], data[n])
        finally:
            # Runs even when the consumer abandons the generator early.
            mem.close()


class MMArray(object):
    """Growable array of fixed-size byte records kept in a memory-mapped file.

    Without ``fname`` the records live in a temporary file that is deleted by
    ``close()`` (also called from ``__del__``).  With ``fname`` the existing
    file is mapped read-write and, on ``close()``, cut down to the bytes of the
    records currently in the array.

    :param elmsize: size of one record in bytes.
    :param fname: existing file to map; a temporary file is used when falsy.
    :param capsize: initial capacity in records (temporary file only).
    """

    __file = __mem = None
    __realsize = __capsize = 0
    __fname = None      # path of the file currently mapped
    __istemp = False    # True when that file is ours to delete

    def __init__(self, elmsize=1, fname=None, capsize=1024*1024):
        if elmsize <= 0:
            raise ValueError("elmsize must be positive, got %r" % (elmsize,))
        self.__elmsize = elmsize

        if not fname:
            fno, self.__fname = tempfile.mkstemp("-mmarray", "pyslm-")
            self.__istemp = True
            self.__file = os.fdopen(fno, "w+b")
            # At least one record, otherwise there is nothing to map.
            self.__enlarge(max(1, capsize))
        else:
            self.fromfile(fname)

    def fromfile(self, fname):
        """Map the existing, non-empty file ``fname`` in place of the current one.

        :raises IOError: if ``fname`` does not exist.
        :raises ValueError: if ``fname`` is empty.
        """
        if not os.path.exists(fname):
            raise IOError("The file '%s' does not exist!" % fname)

        fsize = os.path.getsize(fname)
        if fsize == 0:
            raise ValueError("The size of file '%s' is zero!" % fname)

        # Drop the previous mapping; a previous temporary file is deleted.
        self.__release(False)

        self.__file = open(fname, "r+b")
        self.__mem = mmap.mmap(self.__file.fileno(), fsize)
        self.__fname, self.__istemp = fname, False
        self.__realsize = self.__capsize = fsize // self.__elmsize

    def tofile(self, fname):
        """Write the records currently in the array to ``fname``.

        :raises ValueError: if ``fname`` is the file backing this array.
        """
        if self.__fname and \
                os.path.abspath(fname) == os.path.abspath(self.__fname):
            raise ValueError(
                "Can not dump the array to currently mapping file!")

        bsize = self.__realsize * self.__elmsize
        with open(fname, "wb") as target:
            target.write(self.__mem[:bsize])

    def __enlarge(self, capsize):
        """Grow the backing file to ``capsize`` records and map it again."""
        if self.__capsize >= capsize:
            return

        # Writing the last byte extends the file to its new length.
        self.__file.seek(self.__elmsize * capsize - 1)
        self.__file.write(b'\0')
        self.__file.flush()
        self.__capsize = capsize

        if self.__mem is not None:
            self.__mem.close()
        self.__mem = mmap.mmap(self.__file.fileno(), self.__file.tell())

    def __release(self, truncate):
        """Close the map and the file; delete the file if it is temporary.

        When ``truncate`` is true, a non-temporary file is first cut down to
        the bytes of the records currently in the array.
        """
        mem, fobj = self.__mem, self.__file
        self.__mem = self.__file = None

        # The map is closed before the file is resized.
        if mem is not None:
            mem.close()
        if fobj is None:
            return

        try:
            if truncate and not self.__istemp:
                fobj.truncate(self.__realsize * self.__elmsize)
        finally:
            fobj.close()

        if self.__istemp:
            self.__istemp = False
            try:
                os.remove(self.__fname)
            except OSError:
                # Best effort: the temporary file may already be gone.
                pass

    def close(self):
        """Release the mapping and the file.  Calling it again does nothing."""
        self.__release(True)
        self.__realsize = self.__capsize = 0

    def __del__(self):
        self.close()

    def __check_index(self, idx):
        if idx < -self.__realsize or idx >= self.__realsize:
            raise IndexError("MMArray index out of range")

    def __check_record(self, buf):
        if not isinstance(buf, bytes):
            raise TypeError("Not a string, or the buffer size is incorrect!")
        if len(buf) != self.__elmsize:
            raise ValueError("Not a string, or the buffer size is incorrect!")

    def __getitem__(self, idx):
        """Return record ``idx`` as bytes; negative indexes count from the end."""
        self.__check_index(idx)
        return self.__access(idx)

    def __setitem__(self, idx, buf):
        """Overwrite record ``idx`` with ``buf`` (exactly ``elmsize`` bytes)."""
        self.__check_index(idx)
        self.__check_record(buf)
        self.__access(idx, buf)

    def __access(self, idx, buf=None):
        """Read record ``idx``, or write ``buf`` into it when ``buf`` is given."""
        if idx < 0:
            idx = self.__realsize + idx
        start = idx * self.__elmsize
        end = start + self.__elmsize
        if buf is None:
            return self.__mem[start:end]
        self.__mem[start:end] = buf

    def size(self):
        """Return the number of records in the array."""
        return self.__realsize

    def append(self, buf):
        """Add the record ``buf`` at the end, doubling the capacity when full."""
        self.__check_record(buf)

        if self.__realsize >= self.__capsize:
            # max(1, ...) lets an array whose capacity is 0 grow as well.
            self.__enlarge(max(1, self.__capsize * 2))

        self.__access(self.__realsize, buf)
        self.__realsize += 1

    def __iter__(self):
        """Iterate over the records that exist when iteration starts."""
        count = self.__realsize
        idx = 0
        while idx < count:
            yield self.__access(idx)
            idx += 1

    def truncate(self, tsize):
        """Shrink the array to ``tsize`` records; larger sizes are ignored.

        :raises ValueError: if ``tsize`` is negative.
        """
        if tsize < 0:
            raise ValueError("tsize must not be negative, got %r" % (tsize,))
        if self.__realsize >= tsize:
            self.__realsize = tsize