1 0 yongsun /* 2 82 yongsun * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 82 yongsun * 4 82 yongsun * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 82 yongsun * 6 82 yongsun * The contents of this file are subject to the terms of either the GNU Lesser 7 82 yongsun * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 82 yongsun * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 82 yongsun * file except in compliance with the License. You can obtain a copy of the CDDL at 10 82 yongsun * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 82 yongsun * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 82 yongsun * specific language governing permissions and limitations under the License. When 13 82 yongsun * distributing the software, include this License Header Notice in each file and 14 82 yongsun * include the full text of the License in the License file as well as the 15 82 yongsun * following notice: 16 82 yongsun * 17 82 yongsun * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 82 yongsun * (CDDL) 19 82 yongsun * For Covered Software in this distribution, this License shall be governed by the 20 82 yongsun * laws of the State of California (excluding conflict-of-law provisions). 21 82 yongsun * Any litigation relating to this License shall be subject to the jurisdiction of 22 82 yongsun * the Federal Courts of the Northern District of California and the state courts 23 82 yongsun * of the State of California, with venue lying in Santa Clara County, California. 24 82 yongsun * 25 82 yongsun * Contributor(s): 26 82 yongsun * 27 82 yongsun * If you wish your version of this file to be governed by only the CDDL or only 28 82 yongsun * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 82 yongsun * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 82 yongsun * license." If you don't indicate a single choice of license, a recipient has the 31 82 yongsun * option to distribute your version of this file under either the CDDL or the LGPL 32 82 yongsun * Version 2.1, or to extend the choice of license to its licensees as provided 33 82 yongsun * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 82 yongsun * Version 2 license, then the option applies only if the new code is made subject 35 82 yongsun * to such option by the copyright holder. 36 0 yongsun */ 37 82 yongsun 38 0 yongsun #ifndef _SUNPINYIN_CONTEXT_HISTORY_H 39 0 yongsun #define _SUNPINYIN_CONTEXT_HISTORY_H 40 0 yongsun 41 0 yongsun #include "portability.h" 42 0 yongsun 43 0 yongsun #include <map> 44 0 yongsun #include <deque> 45 0 yongsun #include <set> 46 0 yongsun 47 0 yongsun /** 48 0 yongsun * A forget all history memory 49 0 yongsun */ 50 0 yongsun class CICHistory { 51 0 yongsun public: 52 0 yongsun /** don't care word id, or seperator word id */ 53 0 yongsun static const unsigned int DCWID; 54 0 yongsun 55 0 yongsun virtual ~CICHistory(); 56 0 yongsun 57 0 yongsun virtual bool seenBefore(unsigned int wid); 58 0 yongsun 59 0 yongsun /** 60 0 yongsun * memorize the context stream pointed by [its_wid, ite_wid) 61 0 yongsun */ 62 0 yongsun virtual bool memorize(unsigned int* its_wid, unsigned int* ite_wid); 63 0 yongsun 64 0 yongsun /** 65 0 yongsun * @param its_wid is the first word pointer of the context stream 66 0 yongsun * @param ite_wid is the last (exclusive) word pointer of the context stream 67 0 yongsun * @return pr(*(ite_wid-1) | *its_wid, ..., *(ite_wid-2)) 68 0 yongsun * The return value could be zero, i.e. no need to smooth the probabilities 69 0 yongsun */ 70 0 yongsun virtual double pr(unsigned int* its_wid, unsigned int* ite_wid); 71 0 yongsun 72 0 yongsun /** 73 0 yongsun * @param its_wid is the first word pointer of the history stream 74 0 yongsun * @param ite_wid is the last (exclusive) word pointer of the history stream 75 0 yongsun * @return pr(*wid | *its_wid, ..., *(ite_wid-1)) 76 0 yongsun * The return value could be zero, i.e. no need to smooth the probabilities 77 0 yongsun */ 78 0 yongsun virtual double pr(unsigned int* its_wid, unsigned int* ite_wid, unsigned int wid); 79 0 yongsun 80 0 yongsun /** 81 0 yongsun * allocate a buffer, and put the context memory's contect into it 82 0 yongsun * @param buf_ptr would be stored the buffer pointer 83 0 yongsun * @param sz would be the size in byte of the buffer allocated 84 0 yongsun * @return false on error 85 0 yongsun * Note: the buf_ptr should be used free(*buf_ptr) to free after usage 86 0 yongsun */ 87 0 yongsun virtual bool 88 0 yongsun bufferize(void** buf_ptr, size_t* sz); 89 0 yongsun 90 0 yongsun /** 91 0 yongsun * Load context memory according to the buf 92 0 yongsun * @param buf_ptr uffer pointer 93 0 yongsun * @param sz is the size in byte of the buffer 94 0 yongsun * @return false on error 95 0 yongsun * call with buf_ptr with NULL value would clear the context memory 96 0 yongsun */ 97 0 yongsun virtual bool 98 0 yongsun loadFromBuffer(void* buf_ptr, size_t sz); 99 0 yongsun }; 100 0 yongsun 101 0 yongsun class CBigramHistory : public CICHistory { 102 0 yongsun public: 103 0 yongsun static void initClass(); 104 0 yongsun 105 0 yongsun CBigramHistory(); 106 0 yongsun 107 0 yongsun virtual ~CBigramHistory(); 108 0 yongsun 109 0 yongsun virtual bool seenBefore(unsigned int wid); 110 0 yongsun 111 0 yongsun virtual bool memorize(unsigned int* its_wid, unsigned int* ite_wid); 112 0 yongsun 113 0 yongsun /** 114 0 yongsun * @param its_wid is the first word pointer of the context stream 115 0 yongsun * @param ite_wid is the last (exclusive) word pointer of the context stream 116 0 yongsun * @return pr(*(ite_wid-1) | *(ite_wid-2)) 117 0 yongsun */ 118 0 yongsun virtual double pr(unsigned int* its_wid, unsigned int* ite_wid); 119 0 yongsun 120 0 yongsun /** 121 0 yongsun * @param its_wid is the first word pointer of the history stream 122 0 yongsun * @param ite_wid is the last (exclusive) word pointer of the history stream 123 0 yongsun * @return pr(*wid | *(ite_wid-1)) 124 0 yongsun */ 125 0 yongsun virtual double pr(unsigned int* its_wid, unsigned int* ite_wid, unsigned int wid); 126 0 yongsun 127 0 yongsun virtual bool 128 0 yongsun bufferize(void** buf_ptr, size_t* sz); 129 0 yongsun 130 0 yongsun virtual bool 131 0 yongsun loadFromBuffer(void* buf_ptr, size_t sz); 132 0 yongsun 133 0 yongsun protected: 134 0 yongsun typedef unsigned TWordId; 135 0 yongsun typedef std::pair<TWordId, TWordId> TBigram; 136 0 yongsun typedef TWordId TUnigram; 137 0 yongsun typedef std::map<TBigram, int> TBigramPool; 138 0 yongsun typedef std::map<TUnigram, int> TUnigramPool; 139 0 yongsun typedef std::deque<TWordId> TContextMemory; 140 0 yongsun 141 0 yongsun static const size_t contxt_memory_size; 142 0 yongsun 143 0 yongsun TContextMemory m_memory; 144 0 yongsun TUnigramPool m_unifreq; 145 0 yongsun TBigramPool m_bifreq; 146 0 yongsun 147 0 yongsun static std::set<unsigned int> s_stopWords; 148 0 yongsun 149 0 yongsun protected: 150 0 yongsun double pr(TBigram& bg); 151 0 yongsun int uniFreq(TUnigram& ug); 152 0 yongsun int biFreq(TBigram& bg); 153 0 yongsun 154 0 yongsun void decUniFreq(TUnigram& ug); 155 0 yongsun void decBiFreq(TBigram& bg); 156 0 yongsun void incUniFreq(TUnigram& ug); 157 0 yongsun void incBiFreq(TBigram& bg); 158 0 yongsun }; 159 0 yongsun 160 0 yongsun #endif 161