1 /* 2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 * 4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 * 6 * The contents of this file are subject to the terms of either the GNU Lesser 7 * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 * file except in compliance with the License. You can obtain a copy of the CDDL at 10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 * specific language governing permissions and limitations under the License. When 13 * distributing the software, include this License Header Notice in each file and 14 * include the full text of the License in the License file as well as the 15 * following notice: 16 * 17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 * (CDDL) 19 * For Covered Software in this distribution, this License shall be governed by the 20 * laws of the State of California (excluding conflict-of-law provisions). 21 * Any litigation relating to this License shall be subject to the jurisdiction of 22 * the Federal Courts of the Northern District of California and the state courts 23 * of the State of California, with venue lying in Santa Clara County, California. 24 * 25 * Contributor(s): 26 * 27 * If you wish your version of this file to be governed by only the CDDL or only 28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 * license." If you don't indicate a single choice of license, a recipient has the 31 * option to distribute your version of this file under either the CDDL or the LGPL 32 * Version 2.1, or to extend the choice of license to its licensees as provided 33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 * Version 2 license, then the option applies only if the new code is made subject 35 * to such option by the copyright holder. 36 */ 37 38 #ifndef _SUNPINYIN_CONTEXT_HISTORY_H 39 #define _SUNPINYIN_CONTEXT_HISTORY_H 40 41 #include "portability.h" 42 43 #include <map> 44 #include <deque> 45 #include <set> 46 47 /** 48 * A forget all history memory 49 */ 50 class CICHistory { 51 public: 52 /** don't care word id, or seperator word id */ 53 static const unsigned int DCWID; 54 55 virtual ~CICHistory(); 56 57 virtual bool seenBefore(unsigned int wid); 58 59 /** 60 * memorize the context stream pointed by [its_wid, ite_wid) 61 */ 62 virtual bool memorize(unsigned int* its_wid, unsigned int* ite_wid); 63 64 /** 65 * @param its_wid is the first word pointer of the context stream 66 * @param ite_wid is the last (exclusive) word pointer of the context stream 67 * @return pr(*(ite_wid-1) | *its_wid, ..., *(ite_wid-2)) 68 * The return value could be zero, i.e. no need to smooth the probabilities 69 */ 70 virtual double pr(unsigned int* its_wid, unsigned int* ite_wid); 71 72 /** 73 * @param its_wid is the first word pointer of the history stream 74 * @param ite_wid is the last (exclusive) word pointer of the history stream 75 * @return pr(*wid | *its_wid, ..., *(ite_wid-1)) 76 * The return value could be zero, i.e. no need to smooth the probabilities 77 */ 78 virtual double pr(unsigned int* its_wid, unsigned int* ite_wid, unsigned int wid); 79 80 /** 81 * allocate a buffer, and put the context memory's contect into it 82 * @param buf_ptr would be stored the buffer pointer 83 * @param sz would be the size in byte of the buffer allocated 84 * @return false on error 85 * Note: the buf_ptr should be used free(*buf_ptr) to free after usage 86 */ 87 virtual bool 88 bufferize(void** buf_ptr, size_t* sz); 89 90 /** 91 * Load context memory according to the buf 92 * @param buf_ptr uffer pointer 93 * @param sz is the size in byte of the buffer 94 * @return false on error 95 * call with buf_ptr with NULL value would clear the context memory 96 */ 97 virtual bool 98 loadFromBuffer(void* buf_ptr, size_t sz); 99 }; 100 101 class CBigramHistory : public CICHistory { 102 public: 103 static void initClass(); 104 105 CBigramHistory(); 106 107 virtual ~CBigramHistory(); 108 109 virtual bool seenBefore(unsigned int wid); 110 111 virtual bool memorize(unsigned int* its_wid, unsigned int* ite_wid); 112 113 /** 114 * @param its_wid is the first word pointer of the context stream 115 * @param ite_wid is the last (exclusive) word pointer of the context stream 116 * @return pr(*(ite_wid-1) | *(ite_wid-2)) 117 */ 118 virtual double pr(unsigned int* its_wid, unsigned int* ite_wid); 119 120 /** 121 * @param its_wid is the first word pointer of the history stream 122 * @param ite_wid is the last (exclusive) word pointer of the history stream 123 * @return pr(*wid | *(ite_wid-1)) 124 */ 125 virtual double pr(unsigned int* its_wid, unsigned int* ite_wid, unsigned int wid); 126 127 virtual bool 128 bufferize(void** buf_ptr, size_t* sz); 129 130 virtual bool 131 loadFromBuffer(void* buf_ptr, size_t sz); 132 133 protected: 134 typedef unsigned TWordId; 135 typedef std::pair<TWordId, TWordId> TBigram; 136 typedef TWordId TUnigram; 137 typedef std::map<TBigram, int> TBigramPool; 138 typedef std::map<TUnigram, int> TUnigramPool; 139 typedef std::deque<TWordId> TContextMemory; 140 141 static const size_t contxt_memory_size; 142 143 TContextMemory m_memory; 144 TUnigramPool m_unifreq; 145 TBigramPool m_bifreq; 146 147 static std::set<unsigned int> s_stopWords; 148 149 protected: 150 double pr(TBigram& bg); 151 int uniFreq(TUnigram& ug); 152 int biFreq(TBigram& bg); 153 154 void decUniFreq(TUnigram& ug); 155 void decBiFreq(TBigram& bg); 156 void incUniFreq(TUnigram& ug); 157 void incBiFreq(TBigram& bg); 158 }; 159 160 #endif 161
