Home | History | Annotate | Download | only in python
      1 #!/usr/bin/python
      2 
      3 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      4 # 
      5 # Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      6 # 
      7 # The contents of this file are subject to the terms of either the GNU Lesser
      8 # General Public License Version 2.1 only ("LGPL") or the Common Development and
      9 # Distribution License ("CDDL")(collectively, the "License"). You may not use this
     10 # file except in compliance with the License. You can obtain a copy of the CDDL at
     11 # http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     12 # http://www.opensource.org/licenses/lgpl-license.php. See the License for the 
     13 # specific language governing permissions and limitations under the License. When
     14 # distributing the software, include this License Header Notice in each file and
     15 # include the full text of the License in the License file as well as the
     16 # following notice:
     17 # 
     18 # NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     19 # (CDDL)
     20 # For Covered Software in this distribution, this License shall be governed by the
     21 # laws of the State of California (excluding conflict-of-law provisions).
     22 # Any litigation relating to this License shall be subject to the jurisdiction of
     23 # the Federal Courts of the Northern District of California and the state courts
     24 # of the State of California, with venue lying in Santa Clara County, California.
     25 # 
     26 # Contributor(s):
     27 # 
     28 # If you wish your version of this file to be governed by only the CDDL or only
     29 # the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     30 # include this software in this distribution under the [CDDL or LGPL Version 2.1]
     31 # license." If you don't indicate a single choice of license, a recipient has the
     32 # option to distribute your version of this file under either the CDDL or the LGPL
     33 # Version 2.1, or to extend the choice of license to its licensees as provided
     34 # above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     35 # Version 2 license, then the option applies only if the new code is made subject
     36 # to such option by the copyright holder. 
     37 
     38 import sys
     39 import getopt
     40 import codecs
     41 import struct
     42 
     43 from imdict import IMDict
     44 from trie import match_longest, get_ambiguious_length
     45 from utils import read_ch_sentences
     46 
     47 def usage():
     48     print '''
     49 Usage:
     50 mmseg.py -d dict_file [-f (text|bin)] [-i] [-s STOK_ID] [-a AMBI_ID] corpus_file
     51 
     52   -d --dict:
     53     The dictionary file (in UTF-8 encoding) to be used.
     54   -f --format:
     55     Output format, can be 'text' or 'bin'. Default is 'bin'.
     56     Normally, in text mode, word text are output, while in binary mode,
     57     the integer of the word-ids are writed to stdout.
     58   -i --show-id:
     59     Show Id info. In text output format, attach id after known words. 
     60   -s --stok-id:
     61     Sentence token id. Default 10.
     62     It will be write to output in binary mode after every sentence.
     63   -a --ambi-id:
     64     Ambiguious means ABC => A BC or AB C. If specified (AMBI-ID != 0), 
     65     The sequence ABC will not be segmented, in binary mode, the AMBI-ID 
     66     is written out; in text mode, <ambi>ABC</ambi> will be output. Default 
     67     is 9.
     68 '''
     69 
     70 options={'show-id':       False, 
     71          'format' :       'bin', 
     72          'stok-id':       10,
     73          'ambi-id':       9}
     74 
     75 def parse_options(args):
     76     try:
     77         opts, args = getopt.getopt(args, "hid:f:s:a:", ["help", "show-id", "dict=", "format=", "stok-id=", "ambi-id="])
     78     except getopt.GetoptError, err:
     79         print str(err) 
     80         sys.exit(1)
     81 
     82     for opt,val in opts:
     83         if opt in ('-h', '--help'):
     84             usage()
     85             sys.exit()
     86         elif opt in ('-d', '--dict'):
     87             options['dict'] = val
     88         elif opt in ('-i', '--show-id'):
     89             options['show-id'] = True
     90         elif opt in ('-f', '--format'):
     91             if val in ('bin', 'text'):
     92                 options['format'] = val
     93         elif opt in ('-s', '--stok-id'):
     94             options['stok-id'] = int(val)
     95         elif opt in ('-val', '--ambi-id'):
     96             options['ambi-id'] = int(val)
     97 
     98     if 'dict' not in options:
     99         usage()
    100         sys.exit(1)
    101 
    102     if args:
    103         options['corpus'] = args[0]
    104 
    105 def output_word(wid, word):
    106     if options['format'] == 'text':
    107         if wid == options['ambi-id']:
    108             word = '<ambi>'+word+'</ambi>'
    109         if options['show-id']:
    110             word = word+'('+str(wid)+')'
    111         sys.stdout.write('%s ' % word.encode('UTF-8'))
    112     else:
    113         sys.stdout.write(struct.pack('l', wid))
    114 
    115 def process_file(file, dict):
    116     for line in read_ch_sentences(file):
    117         length = len(line)
    118         i = 0
    119         while (i < length):
    120             strbuf = line[i:]
    121             wid, l = match_longest(dict, strbuf)
    122             if wid == 0:
    123                 l = 1
    124             else:
    125                 ambi_len = get_ambiguious_length(dict, strbuf, l)
    126                 if ambi_len > l:
    127                     wid, l = options['ambi-id'], ambi_len
    128 
    129             output_word (wid, strbuf[:l])
    130             i += l
    131 
    132         output_word (options['stok-id'], '\n')
    133 
    134 if __name__ == "__main__":
    135     parse_options(sys.argv[1:])
    136 
    137     dict = IMDict(options['dict'])
    138     
    139     try:    file = codecs.open(options['corpus'], "r", "UTF-8")
    140     except: file = codecs.getreader('UTF-8')(sys.stdin)
    141 
    142     process_file (file, dict)
    143     file.close()
    144