1 #!/usr/bin/python 2 3 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 4 # 5 # Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 6 # 7 # The contents of this file are subject to the terms of either the GNU Lesser 8 # General Public License Version 2.1 only ("LGPL") or the Common Development and 9 # Distribution License ("CDDL")(collectively, the "License"). You may not use this 10 # file except in compliance with the License. You can obtain a copy of the CDDL at 11 # http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 12 # http://www.opensource.org/licenses/lgpl-license.php. See the License for the 13 # specific language governing permissions and limitations under the License. When 14 # distributing the software, include this License Header Notice in each file and 15 # include the full text of the License in the License file as well as the 16 # following notice: 17 # 18 # NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 19 # (CDDL) 20 # For Covered Software in this distribution, this License shall be governed by the 21 # laws of the State of California (excluding conflict-of-law provisions). 22 # Any litigation relating to this License shall be subject to the jurisdiction of 23 # the Federal Courts of the Northern District of California and the state courts 24 # of the State of California, with venue lying in Santa Clara County, California. 25 # 26 # Contributor(s): 27 # 28 # If you wish your version of this file to be governed by only the CDDL or only 29 # the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 30 # include this software in this distribution under the [CDDL or LGPL Version 2.1] 31 # license." If you don't indicate a single choice of license, a recipient has the 32 # option to distribute your version of this file under either the CDDL or the LGPL 33 # Version 2.1, or to extend the choice of license to its licensees as provided 34 # above. 
# However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
# Version 2 license, then the option applies only if the new code is made subject
# to such option by the copyright holder.

import sys
import getopt
import codecs
import struct

from imdict import IMDict
from trie import match_longest, get_ambiguious_length
from utils import read_ch_sentences


def usage():
    """Print the command-line help text to stdout."""
    print('''
Usage:
mmseg.py -d dict_file [-f (text|bin)] [-i] [-s STOK_ID] [-a AMBI_ID] corpus_file

  -d --dict:
        The dictionary file (in UTF-8 encoding) to be used.
  -f --format:
        Output format, can be 'text' or 'bin'. Default is 'bin'.
        Normally, in text mode, word text are output, while in binary mode,
        the integer of the word-ids are writed to stdout.
  -i --show-id:
        Show Id info. In text output format, attach id after known words.
  -s --stok-id:
        Sentence token id. Default 10.
        It will be write to output in binary mode after every sentence.
  -a --ambi-id:
        Ambiguious means ABC => A BC or AB C. If specified (AMBI-ID != 0),
        The sequence ABC will not be segmented, in binary mode, the AMBI-ID
        is written out; in text mode, <ambi>ABC</ambi> will be output. Default
        is 9.
''')


# Module-wide settings. parse_options() fills in 'dict' and, when a
# positional argument is given, 'corpus'; the entries below are the defaults.
options = {'show-id': False,
           'format': 'bin',
           'stok-id': 10,
           'ambi-id': 9}


def parse_options(args):
    """Parse command-line arguments into the module-level ``options`` dict.

    Args:
        args: The argument list without the program name
            (typically ``sys.argv[1:]``).

    Exits with status 1 on an unrecognised option or when no dictionary
    file was given, and with status 0 after printing help for -h/--help.

    Raises:
        ValueError: If the value of -s/--stok-id or -a/--ambi-id is not an
            integer literal.
    """
    try:
        opts, args = getopt.getopt(
            args, "hid:f:s:a:",
            ["help", "show-id", "dict=", "format=", "stok-id=", "ambi-id="])
    except getopt.GetoptError as err:
        # stdout carries the segmentation result, so diagnostics go to stderr.
        sys.stderr.write('%s\n' % err)
        sys.exit(1)

    for opt, val in opts:
        if opt in ('-h', '--help'):
            usage()
            sys.exit()
        elif opt in ('-d', '--dict'):
            options['dict'] = val
        elif opt in ('-i', '--show-id'):
            options['show-id'] = True
        elif opt in ('-f', '--format'):
            if val in ('bin', 'text'):
                options['format'] = val
            else:
                # Keep the current format, but tell the user instead of
                # silently falling back.
                sys.stderr.write(
                    "mmseg.py: unknown format '%s', using '%s'\n"
                    % (val, options['format']))
        elif opt in ('-s', '--stok-id'):
            options['stok-id'] = int(val)
        elif opt in ('-a', '--ambi-id'):
            # The short form used to be compared against '-val', which getopt
            # never produces, so "-a N" was accepted but ignored.
            options['ambi-id'] = int(val)

    if 'dict' not in options:
        usage()
        sys.exit(1)

    # The first positional argument, if any, names the corpus file; without
    # it the caller falls back to reading stdin.
    if args:
        options['corpus'] = args[0]


def output_word(wid, word):
    """Write one segmented token to stdout in the configured format.

    Args:
        wid: Integer id of the token.
        word: Unicode text of the token (used in text format only).

    In text format the token text is written UTF-8 encoded and followed by
    a space; it is wrapped in ``<ambi>...</ambi>`` when ``wid`` is the
    (non-zero) ambi-id and suffixed with ``(wid)`` when show-id is set.
    In binary format only ``wid`` is written, packed with ``struct``.
    """
    if options['format'] == 'text':
        ambi_id = options['ambi-id']
        # An ambi-id of 0 means ambiguity marking is switched off (see
        # usage()); without this guard every unmatched character, which also
        # carries id 0, would be tagged as ambiguous.
        if ambi_id and wid == ambi_id:
            word = '<ambi>' + word + '</ambi>'
        if options['show-id']:
            word = word + '(' + str(wid) + ')'
        sys.stdout.write('%s ' % word.encode('UTF-8'))
    else:
        # NOTE(review): 'l' is a native C long, so the record width depends on
        # the platform -- confirm it matches what the consumer of this stream
        # expects before changing it.
        sys.stdout.write(struct.pack('l', wid))


def process_file(file, dict):
    """Segment every sentence read from ``file`` and write the tokens out.

    Args:
        file: Readable text source handed to ``read_ch_sentences``.
        dict: Dictionary object handed to ``match_longest`` and
            ``get_ambiguious_length``.

    For each sentence the longest dictionary match is taken at the current
    position. A word id of 0 is treated as "no match" and consumes a single
    character. When ambi-id is non-zero and the ambiguous span reported by
    ``get_ambiguious_length`` is longer than the match, the whole span is
    emitted as one token carrying the ambi-id. The sentence token id is
    written after every sentence.
    """
    ambi_id = options['ambi-id']

    for line in read_ch_sentences(file):
        length = len(line)
        i = 0
        while i < length:
            strbuf = line[i:]
            wid, l = match_longest(dict, strbuf)
            if wid == 0:
                l = 1
            elif ambi_id:
                # Only look for ambiguity when it is enabled; the usage text
                # documents AMBI-ID == 0 as "segment normally".
                ambi_len = get_ambiguious_length(dict, strbuf, l)
                if ambi_len > l:
                    wid, l = ambi_id, ambi_len

            output_word(wid, strbuf[:l])
            i += l

        output_word(options['stok-id'], '\n')


if __name__ == "__main__":
    parse_options(sys.argv[1:])

    word_dict = IMDict(options['dict'])

    if 'corpus' in options:
        try:
            corpus = codecs.open(options['corpus'], "r", "UTF-8")
        except IOError as err:
            # Report an unreadable corpus instead of silently switching to
            # stdin, which would leave the program waiting for input.
            sys.stderr.write('%s\n' % err)
            sys.exit(1)
    else:
        # No corpus argument: read UTF-8 text from standard input.
        corpus = codecs.getreader('UTF-8')(sys.stdin)

    try:
        process_file(corpus, word_dict)
    finally:
        corpus.close()