Home | History | Annotate | Download | only in slm
      1    0   yongsun /*
      2   82   yongsun  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3   82   yongsun  *
      4   82   yongsun  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5   82   yongsun  *
      6   82   yongsun  * The contents of this file are subject to the terms of either the GNU Lesser
      7   82   yongsun  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8   82   yongsun  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9   82   yongsun  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10   82   yongsun  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11   82   yongsun  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12   82   yongsun  * specific language governing permissions and limitations under the License. When
     13   82   yongsun  * distributing the software, include this License Header Notice in each file and
     14   82   yongsun  * include the full text of the License in the License file as well as the
     15   82   yongsun  * following notice:
     16   82   yongsun  *
     17   82   yongsun  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18   82   yongsun  * (CDDL)
     19   82   yongsun  * For Covered Software in this distribution, this License shall be governed by the
     20   82   yongsun  * laws of the State of California (excluding conflict-of-law provisions).
     21   82   yongsun  * Any litigation relating to this License shall be subject to the jurisdiction of
     22   82   yongsun  * the Federal Courts of the Northern District of California and the state courts
     23   82   yongsun  * of the State of California, with venue lying in Santa Clara County, California.
     24   82   yongsun  *
     25   82   yongsun  * Contributor(s):
     26   82   yongsun  *
     27   82   yongsun  * If you wish your version of this file to be governed by only the CDDL or only
     28   82   yongsun  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29   82   yongsun  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30   82   yongsun  * license." If you don't indicate a single choice of license, a recipient has the
     31   82   yongsun  * option to distribute your version of this file under either the CDDL or the LGPL
     32   82   yongsun  * Version 2.1, or to extend the choice of license to its licensees as provided
     33   82   yongsun  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34   82   yongsun  * Version 2 license, then the option applies only if the new code is made subject
     35   82   yongsun  * to such option by the copyright holder.
     36    0   yongsun  */
     37   82   yongsun 
     38    0   yongsun #ifdef HAVE_CONFIG_H
     39    0   yongsun #include <config.h>
     40    0   yongsun #endif
     41    0   yongsun 
     42    0   yongsun #include <unistd.h>
     43    0   yongsun #include <fcntl.h>
     44    0   yongsun #include <sys/types.h>
     45    0   yongsun #include <sys/stat.h>
     46    0   yongsun #include <math.h>
     47    0   yongsun 
     48    0   yongsun #include "slm.h"
     49   90   tonylee 
     50   90   tonylee #ifdef HAVE_SYS_MMAN_H
     51   90   tonylee #include <sys/mman.h>
     52   90   tonylee #elif defined(BEOS_OS)
     53   90   tonylee #include <be/kernel/OS.h>
     54   90   tonylee #endif
     55    0   yongsun 
     56    0   yongsun bool
     57    0   yongsun CThreadSlm::load(const char* fname, bool MMap)
     58    0   yongsun {
     59    0   yongsun     int fd = open(fname, O_RDONLY);
     60    0   yongsun     m_bufSize = lseek(fd, 0, SEEK_END);
     61    0   yongsun     lseek(fd, 0, SEEK_SET);
     62    0   yongsun 
     63    0   yongsun     m_bMMap = MMap;
     64    0   yongsun     if (m_bMMap) {
     65   90   tonylee #ifdef HAVE_SYS_MMAN_H
     66    0   yongsun         void* p = mmap(NULL, m_bufSize, PROT_READ, MAP_SHARED, fd, 0);
     67    0   yongsun         if (p == MAP_FAILED) {
     68    0   yongsun             close(fd);
     69    0   yongsun             return false;
     70    0   yongsun         }
     71    0   yongsun         m_buf = (char *)p;
     72   90   tonylee #elif defined(BEOS_OS)
     73   90   tonylee         char *p = NULL;
     74   90   tonylee         area_id area = create_area("tmp", (void**)&p, B_ANY_ADDRESS,
     75   90   tonylee                                    (m_bufSize + (B_PAGE_SIZE - 1)) & ~(B_PAGE_SIZE - 1),
     76   90   tonylee                                    B_NO_LOCK, B_READ_AREA | B_WRITE_AREA);
     77   90   tonylee         if (area < 0) {
     78   90   tonylee             close(fd);
     79   90   tonylee             return false;
     80   90   tonylee         }
     81   90   tonylee         m_buf = p;
     82   90   tonylee 
     83   90   tonylee         for (ssize_t len = m_bufSize; len > 0; ) {
     84   90   tonylee             ssize_t n = read(fd, p, len);
     85   90   tonylee             if (n < 0) break;
     86   90   tonylee             p += n;
     87   90   tonylee             len -= n;
     88   90   tonylee         }
     89   90   tonylee #else // Other OS
     90   90   tonylee         #error "No implementation for mmap()"
     91   90   tonylee #endif // HAVE_SYS_MMAN_H
     92    0   yongsun     } else {
     93    0   yongsun         if ((m_buf = new char[m_bufSize]) == NULL) {
     94    0   yongsun             close(fd);
     95    0   yongsun             return false;
     96    0   yongsun         }
     97    0   yongsun         if (read(fd, m_buf, m_bufSize) != m_bufSize) {
     98    0   yongsun             delete [] m_buf; m_buf = NULL;
     99    0   yongsun             close(fd);
    100    0   yongsun             return false;
    101    0   yongsun         }
    102    0   yongsun     }
    103    0   yongsun     close(fd);
    104    0   yongsun 
    105    0   yongsun     m_N = *(unsigned*)m_buf;
    106    0   yongsun     m_UseLogPr = *(((unsigned*)m_buf)+1);
    107    0   yongsun     m_LevelSizes = ((unsigned*)m_buf)+2;
    108    0   yongsun     m_prTable = (float*)(m_buf + 2*sizeof(unsigned) + (m_N+1)*sizeof(unsigned));
    109    0   yongsun     m_bowTable = m_prTable + (1 << BITS_PR);
    110    0   yongsun 
    111    0   yongsun     TNode* pn = (TNode*)(m_bowTable + (1 << BITS_BOW));
    112    0   yongsun 
    113    0   yongsun     //Solaris CC would cause error in runtime if using some thing like
    114    0   yongsun     //following even using (void**) conversion. So add PtrVoid definition
    115    0   yongsun     //m_Levels = new (void*) [m_N + 1];
    116    0   yongsun     m_Levels = new PtrVoid[m_N+1];
    117    0   yongsun 
    118  186  tchaikov     for (unsigned lvl = 0; lvl <= m_N; ++lvl) {
    119    0   yongsun         m_Levels[lvl] = (void*)pn;
    120    0   yongsun         pn += m_LevelSizes[lvl];
    121    0   yongsun     }
    122    0   yongsun     return true;
    123    0   yongsun }
    124    0   yongsun 
    125    0   yongsun void
    126    0   yongsun CThreadSlm::free()
    127    0   yongsun {
    128  186  tchaikov     delete [] m_Levels;
    129    0   yongsun     if (m_buf) {
    130    0   yongsun         if (m_bMMap) {
    131   90   tonylee #ifdef HAVE_SYS_MMAN_H
    132    0   yongsun             munmap(m_buf, m_bufSize);
    133   90   tonylee #elif defined(BEOS_OS)
    134   90   tonylee             delete_area(area_for(m_buf));
    135   90   tonylee #else // Other OS
    136   90   tonylee             #error "No implementation for munmap()"
    137   90   tonylee #endif // HAVE_SYS_MMAN_H
    138    0   yongsun         } else {
    139    0   yongsun             delete [] m_buf;
    140    0   yongsun         }
    141    0   yongsun     }
    142    0   yongsun     m_buf = NULL;
    143    0   yongsun     m_Levels = NULL;
    144    0   yongsun }
    145    0   yongsun 
    146    0   yongsun template<class NodeT>
    147    0   yongsun unsigned int
    148    0   yongsun find_id(NodeT* base, unsigned int h, unsigned int t, unsigned int id)
    149    0   yongsun {
    150    0   yongsun     unsigned int tail = t;
    151    0   yongsun     while (h < t) {
    152    0   yongsun         int m = (h+t)/2;
    153    0   yongsun         NodeT* pm = base+m;
    154    0   yongsun         unsigned int thisId = pm->wid();
    155    0   yongsun         if (thisId < id)
    156    0   yongsun             h = m+1;
    157    0   yongsun         else if (thisId > id)
    158    0   yongsun             t = m;
    159    0   yongsun         else
    160    0   yongsun             return m;
    161    0   yongsun     }
    162    0   yongsun     return tail;
    163    0   yongsun }
    164    0   yongsun 
    165    0   yongsun /**
    166    0   yongsun * return value as the model suggested. The history state must be historified
    167    0   yongsun * or the history's level should be 0. when level == 0 but idx != 0, the
    168    0   yongsun * history is a psuedo unigram state used for this model to combine another
    169    0   yongsun * bigram cache language model
    170    0   yongsun */
    171    0   yongsun double
    172    0   yongsun CThreadSlm::rawTransfer(TState history, unsigned int wid, TState& result)
    173    0   yongsun {
    174    0   yongsun     unsigned int lvl = history.getLevel();
    175    0   yongsun     unsigned int pos = history.getIdx();
    176    0   yongsun 
    177    0   yongsun     double cost = (m_UseLogPr)?0.0:1.0;
    178    0   yongsun 
    179    0   yongsun     // NON_Word id must be dealed with special, let it transfer to root
    180    0   yongsun     // without any cost
    181    0   yongsun     if (ID_NOT_WORD == wid) {
    182    0   yongsun         result = 0;
    183    0   yongsun         return cost;
    184    0   yongsun     }
    185    0   yongsun 
    186    0   yongsun     while (true) {
    187    0   yongsun         //for psuedo cache model unigram state
    188    0   yongsun         TNode* pn = ((TNode *)m_Levels[lvl]) + ((lvl)?pos:0);
    189    0   yongsun 
    190    0   yongsun         unsigned int t = (pn+1)->ch();
    191    0   yongsun 
    192    0   yongsun         if (lvl < m_N-1) {
    193    0   yongsun             TNode* pBase =(TNode*)m_Levels[lvl+1];
    194    0   yongsun             unsigned int idx = find_id(pBase, pn->ch(), t, wid);
    195    0   yongsun             if (idx != t) {
    196    0   yongsun                 result.setIdx(idx);
    197    0   yongsun                 result.setLevel(lvl+1);
    198    0   yongsun                 double pr = m_prTable[pBase[idx].pr()];
    199    0   yongsun                 return (m_UseLogPr)?(cost+pr):(cost*pr);
    200    0   yongsun             }
    201    0   yongsun 
    202    0   yongsun         } else {
    203    0   yongsun             TLeaf* pBase =(TLeaf*)m_Levels[lvl+1];
    204    0   yongsun             unsigned int idx = find_id(pBase, pn->ch(), t, wid);
    205    0   yongsun             if (idx != t) {
    206    0   yongsun                 result.setIdx(idx);
    207    0   yongsun                 result.setLevel(lvl+1);
    208    0   yongsun                 double pr = m_prTable[pBase[idx].pr()];
    209    0   yongsun                 return (m_UseLogPr)?(cost+pr):(cost*pr);
    210    0   yongsun             }
    211    0   yongsun 
    212    0   yongsun         }
    213    0   yongsun 
    214    0   yongsun         if (m_UseLogPr)
    215    0   yongsun             cost += m_bowTable[pn->bow()];
    216    0   yongsun         else
    217    0   yongsun             cost *= m_bowTable[pn->bow()];
    218    0   yongsun         if (lvl == 0)
    219    0   yongsun             break;
    220    0   yongsun         lvl = pn->bol();
    221    0   yongsun         pos = pn->bon();
    222    0   yongsun     }
    223    0   yongsun     result.setLevel(0);
    224    0   yongsun     result.setIdx(0);
    225    0   yongsun     if (m_UseLogPr)
    226    0   yongsun         return cost + m_prTable[((TNode *)m_Levels[0])->pr()];
    227    0   yongsun     else
    228    0   yongsun         return cost * m_prTable[((TNode *)m_Levels[0])->pr()];
    229    0   yongsun }
    230    0   yongsun 
    231    0   yongsun double
    232    0   yongsun CThreadSlm::transferNegLog(TState history, unsigned int wid, TState& result)
    233    0   yongsun {
    234    0   yongsun     double cost = rawTransfer(history, wid, result);
    235    0   yongsun     if (m_UseLogPr)
    236    0   yongsun         return cost;
    237    0   yongsun     else
    238    0   yongsun         return -log(cost);
    239    0   yongsun }
    240    0   yongsun 
    241    0   yongsun double
    242    0   yongsun CThreadSlm::transfer(TState history, unsigned int wid, TState& result)
    243    0   yongsun {
    244    0   yongsun     double cost = rawTransfer(history, wid, result);
    245    0   yongsun     if (!m_UseLogPr)
    246    0   yongsun         return cost;
    247    0   yongsun     else
    248    0   yongsun         return exp(-cost);
    249    0   yongsun }
    250    0   yongsun 
    251    0   yongsun unsigned int
    252    0   yongsun CThreadSlm::lastWordId(TState st)
    253    0   yongsun {
    254    0   yongsun     unsigned int lvl = st.getLevel();
    255    0   yongsun     if (lvl >= m_N) {
    256    0   yongsun         const TLeaf* pn = ((const TLeaf *)m_Levels[m_N]) + st.getIdx();
    257    0   yongsun         return pn->wid();
    258    0   yongsun     } else if (lvl > 0) {
    259    0   yongsun         const TNode *pn = ((const TNode *)m_Levels[st.getLevel()]) + st.getIdx();
    260    0   yongsun         return pn->wid();
    261    0   yongsun     } else {
    262    0   yongsun         unsigned int idx = st.getIdx();
    263    0   yongsun         if (idx == 0) {
    264    0   yongsun             const TNode *pn = ((const TNode *)m_Levels[st.getLevel()]) + st.getIdx();
    265    0   yongsun             return pn->wid();
    266    0   yongsun         }
    267    0   yongsun         return idx; // return the psuedo state word id
    268    0   yongsun     }
    269    0   yongsun }
    270    0   yongsun 
    271    0   yongsun CThreadSlm::TState
    272    0   yongsun CThreadSlm::history_state_of(TState st)
    273    0   yongsun {
    274    0   yongsun     if (st.getLevel() >= m_N) {
    275    0   yongsun         TLeaf* pl = ((TLeaf *)m_Levels[m_N]) + st.getIdx();
    276    0   yongsun         return TState(pl->bol(), pl->bon());
    277    0   yongsun     } else {
    278    0   yongsun         TNode* pn = ((TNode *)m_Levels[st.getLevel()]) + st.getIdx();
    279    0   yongsun         if (pn->ch() == (pn+1)->ch())
    280    0   yongsun             return TState(pn->bol(), pn->bon());
    281    0   yongsun         else
    282    0   yongsun             return st;
    283    0   yongsun     }
    284    0   yongsun }
    285    0   yongsun 
    286    0   yongsun CThreadSlm::TState&
    287    0   yongsun CThreadSlm::historify(TState& st)
    288    0   yongsun {
    289    0   yongsun     if (st.getLevel() >= m_N) {
    290    0   yongsun         TLeaf* pl = ((TLeaf *)m_Levels[m_N]) + st.getIdx();
    291    0   yongsun         st.setLevel(pl->bol());
    292    0   yongsun         st.setIdx(pl->bon());
    293    0   yongsun     } else {
    294    0   yongsun         TNode* pn = ((TNode *)m_Levels[st.getLevel()]) + st.getIdx();
    295    0   yongsun         if (pn->ch() == (pn+1)->ch()) {
    296    0   yongsun             st.setLevel(pn->bol());
    297    0   yongsun             st.setIdx(pn->bon());
    298    0   yongsun         }
    299    0   yongsun     }
    300    0   yongsun     return st;
    301    0   yongsun }
    302