Home | History | Annotate | Download | only in python
      1 #!/usr/bin/python 
      2 # -*- coding: UTF-8 -*-
      3 
      4 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      5 # 
      6 # Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      7 # 
      8 # The contents of this file are subject to the terms of either the GNU Lesser
      9 # General Public License Version 2.1 only ("LGPL") or the Common Development and
     10 # Distribution License ("CDDL")(collectively, the "License"). You may not use this
     11 # file except in compliance with the License. You can obtain a copy of the CDDL at
     12 # http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     13 # http://www.opensource.org/licenses/lgpl-license.php. See the License for the 
     14 # specific language governing permissions and limitations under the License. When
     15 # distributing the software, include this License Header Notice in each file and
     16 # include the full text of the License in the License file as well as the
     17 # following notice:
     18 # 
     19 # NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     20 # (CDDL)
     21 # For Covered Software in this distribution, this License shall be governed by the
     22 # laws of the State of California (excluding conflict-of-law provisions).
     23 # Any litigation relating to this License shall be subject to the jurisdiction of
     24 # the Federal Courts of the Northern District of California and the state courts
     25 # of the State of California, with venue lying in Santa Clara County, California.
     26 # 
     27 # Contributor(s):
     28 # 
     29 # If you wish your version of this file to be governed by only the CDDL or only
     30 # the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     31 # include this software in this distribution under the [CDDL or LGPL Version 2.1]
     32 # license." If you don't indicate a single choice of license, a recipient has the
     33 # option to distribute your version of this file under either the CDDL or the LGPL
     34 # Version 2.1, or to extend the choice of license to its licensees as provided
     35 # above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     36 # Version 2 license, then the option applies only if the new code is made subject
     37 # to such option by the copyright holder. 
     38 
     39 import os
     40 import mmap
     41 import struct
     42 import heapq
     43 import tempfile
     44 
     45 class NGram:
     46     key = ()
     47     freq = 0
     48 
     49     def __init__(self, key, freq):
     50         self.key = key
     51         self.freq = freq
     52 
     53     def __cmp__(self,other):
     54         return cmp(self.key, other.key)
     55 
     56     def __str__(self):
     57         return "ngram: " + self.key.__str__() + " freq: " + str(self.freq)
     58 
     59 def read_ch_sentences(file):
     60     nesting = 0
     61     buf = ""
     62     for line in file:
     63         if buf and (line[0].isspace() or len(buf) <= 40):
     64             yield buf
     65             buf, nesting = "", 0
     66 
     67         for ch in line:
     68             if ch.isspace():
     69                 continue
     70 
     71             if ch in u"":
     72 		nesting +=1
     73             elif ch in u"":
     74 		nesting -=1
     75 
     76             if ch in u"" and nesting == 0:
     77                 if buf:
     78                     yield buf + ch
     79                     buf, nesting = "", 0
     80             else:
     81                 buf += ch
     82     if buf:
     83         yield buf
     84 
     85 def mergesort (iters):
     86         heap=[]
     87 
     88         for it in iters:
     89             try:
     90                 heap.append((it.next(), it))
     91             except StopIteration:
     92                 pass
     93 
     94         heapq.heapify(heap)
     95 
     96         while heap:
     97             val, it = heap[0]
     98             yield val
     99 
    100             try:
    101                 heapq.heapreplace(heap, (it.next(),it))
    102             except StopIteration:
    103                 heapq.heappop(heap)
    104 
    105 def read_ngrams (fname, n):
    106     file = open(fname, "r")
    107     fsize = os.path.getsize(fname)
    108     mem = mmap.mmap(file.fileno(), fsize, mmap.MAP_SHARED, mmap.PROT_READ)
    109 
    110     while True:
    111         ngram = mem.read((n+1)*4)
    112         if ngram:
    113             data = struct.unpack('%dl' % (n+1), ngram)
    114             yield NGram(data[:n], data[n])
    115         else:
    116             break
    117 
    118     mem.close()
    119     file.close()
    120 
    121 class MMArray:
    122     __file = __mem = None
    123     __realsize = __capsize = 0
    124 
    125     def __init__(self, elmsize=1, fname=None, capsize=1024*1024):
    126         self.__elmsize = elmsize
    127 
    128         if not fname:
    129             fno, self.__fname = tempfile.mkstemp("-mmarray", "pyslm-")
    130             self.__file = os.fdopen (fno, "w+")
    131             self.__enlarge(capsize)
    132         else:
    133             self.fromfile(fname)
    134 
    135     def fromfile(self, fname):
    136         if not os.path.exists(fname):
    137             raise "The file '%s' does not exist!"
    138 
    139         fsize = os.path.getsize(fname)
    140         if fsize == 0:
    141             raise "The size of file '%s' is zero!" % fname
    142 
    143         if self.__mem: self.__mem.close()
    144         if self.__file: self.__file.close()
    145 
    146         self.__file = open (fname, "r+")
    147         self.__mem = mmap.mmap(self.__file.fileno(), fsize)
    148         self.__realsize = self.__capsize = fsize/self.__elmsize
    149 
    150     def tofile(self, fname):
    151         if fname == self.__file.name:
    152             raise "Can not dump the array to currently mapping file!"
    153         tf = open(fname, "w+")
    154         bsize = self.__realsize * self.__elmsize
    155         tf.write (self.__mem[:bsize])
    156         tf.close()
    157 
    158     def __enlarge(self, capsize):
    159         if self.__capsize >= capsize:
    160             return
    161         
    162         self.__capsize = capsize
    163         self.__file.seek(self.__elmsize * self.__capsize - 1)
    164         self.__file.write('\0')
    165         self.__file.flush()
    166 
    167         if (self.__mem): self.__mem.close()
    168         self.__mem = mmap.mmap(self.__file.fileno(), self.__file.tell())
    169 
    170     def __del__ (self):
    171         bsize = self.__realsize * self.__elmsize
    172         self.__file.truncate (bsize)
    173         self.__file.close()
    174         if self.__mem: self.__mem.close()
    175         os.remove(self.__fname)
    176 
    177     def __getitem__(self, idx):
    178         if idx < -self.__realsize or idx >= self.__realsize:
    179             raise IndexError
    180         return self.__access(idx)
    181 
    182     def __setitem__(self, idx, buf):
    183         if idx < -self.__realsize or idx >= self.__realsize:
    184             raise IndexError
    185         if type(buf) != type("") or len(buf) != self.__elmsize:
    186             raise "Not a string, or the buffer size is incorrect!"
    187         self.__access(idx, buf)
    188 
    189     def __access (self, idx, buf=None):
    190         if idx < 0: idx = self.__realsize + idx
    191         start = idx * self.__elmsize
    192         end = start + self.__elmsize
    193         if not buf: return self.__mem[start:end]
    194         self.__mem[start:end] = buf
    195 
    196     def size(self):
    197         return self.__realsize
    198 
    199     def append(self, buf):
    200         if type(buf) != type("") or len(buf) != self.__elmsize:
    201             raise "Not a string, or the buffer size is incorrect!"
    202 
    203         if self.__realsize >= self.__capsize:
    204             self.__enlarge(self.__capsize*2)
    205 
    206         self.__access(self.__realsize, buf)
    207         self.__realsize += 1
    208 
    209     def __iter__(self):
    210         for i in xrange(0, self.__realsize):
    211             yield self.__access(i)
    212 
    213     def truncate(self, tsize):
    214         if self.__realsize >= tsize:
    215             self.__realsize = tsize
    216