Home | History | Annotate | Download | only in ids2ngram
      1   0  yongsun /*
      2  82  yongsun  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3  82  yongsun  *
      4  82  yongsun  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5  82  yongsun  *
      6  82  yongsun  * The contents of this file are subject to the terms of either the GNU Lesser
      7  82  yongsun  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8  82  yongsun  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9  82  yongsun  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10  82  yongsun  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11  82  yongsun  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12  82  yongsun  * specific language governing permissions and limitations under the License. When
     13  82  yongsun  * distributing the software, include this License Header Notice in each file and
     14  82  yongsun  * include the full text of the License in the License file as well as the
     15  82  yongsun  * following notice:
     16  82  yongsun  *
     17  82  yongsun  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18  82  yongsun  * (CDDL)
     19  82  yongsun  * For Covered Software in this distribution, this License shall be governed by the
     20  82  yongsun  * laws of the State of California (excluding conflict-of-law provisions).
     21  82  yongsun  * Any litigation relating to this License shall be subject to the jurisdiction of
     22  82  yongsun  * the Federal Courts of the Northern District of California and the state courts
     23  82  yongsun  * of the State of California, with venue lying in Santa Clara County, California.
     24  82  yongsun  *
     25  82  yongsun  * Contributor(s):
     26  82  yongsun  *
     27  82  yongsun  * If you wish your version of this file to be governed by only the CDDL or only
     28  82  yongsun  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29  82  yongsun  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30  82  yongsun  * license." If you don't indicate a single choice of license, a recipient has the
     31  82  yongsun  * option to distribute your version of this file under either the CDDL or the LGPL
     32  82  yongsun  * Version 2.1, or to extend the choice of license to its licensees as provided
     33  82  yongsun  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34  82  yongsun  * Version 2 license, then the option applies only if the new code is made subject
     35  82  yongsun  * to such option by the copyright holder.
     36   0  yongsun  */
     37  82  yongsun 
     38   0  yongsun #ifndef _SIM_IDNGRAM_MERGE_H
     39   0  yongsun #define _SIM_IDNGRAM_MERGE_H
     40   0  yongsun 
     41   0  yongsun #include <stdio.h>
     42   0  yongsun #include <map>
     43   0  yongsun #include <vector>
     44   0  yongsun #include <algorithm>
     45   0  yongsun 
     46   0  yongsun #include "../sim_fmerge.h"
     47   0  yongsun #include "idngram.h"
     48   0  yongsun 
     49   0  yongsun template<int N>
     50   0  yongsun void DoIdngramMerge(FILE*out, CMultiWayFileMerger<CSIM_IdngramFreq<N> > &merger)
     51   0  yongsun {
     52   0  yongsun 	merger.start();
     53   0  yongsun 	CSIM_IdngramFreq<N> prevItem;
     54   0  yongsun 	while (true) {
     55   0  yongsun 		file_para<CSIM_IdngramFreq<N> >	* ppara = merger.getBest();
     56   0  yongsun 		TUnitAndParaInfo<CSIM_IdngramFreq<N> > & upi = *(*ppara);
     57   0  yongsun 		if (upi.runOut) {
     58   0  yongsun 			if (prevItem.freq != 0) {
     59   0  yongsun 				fwrite(prevItem.ids, sizeof(TSIMWordId), N, out);
     60   0  yongsun 				fwrite(&(prevItem.freq), sizeof(unsigned int), 1, out);
     61   0  yongsun 			}
     62   0  yongsun 			break;
     63   0  yongsun 		}
     64   0  yongsun 		CSIM_IdngramFreq<N>& ng = upi.unit;
     65   0  yongsun 		if (!(prevItem == ng)) {
     66   0  yongsun 			if (prevItem.freq != 0) {
     67   0  yongsun 				fwrite(prevItem.ids, sizeof(TSIMWordId), N, out);
     68   0  yongsun 				fwrite(&(prevItem.freq), sizeof(unsigned int), 1, out);
     69   0  yongsun 			}
     70   0  yongsun 			prevItem = ng;
     71   0  yongsun 		} else {
     72   0  yongsun 			prevItem.freq += ng.freq;
     73   0  yongsun 		}
     74   0  yongsun 		merger.next();
     75   0  yongsun 	}
     76   0  yongsun }
     77   0  yongsun 
     78   0  yongsun template<int N>
     79   0  yongsun void ProcessingIdngramMerge(FILE *swap, FILE* out, std::vector<long>& para_offsets)
     80   0  yongsun {
     81   0  yongsun 	CMultiWayFileMerger<CSIM_IdngramFreq<N> > merger;
     82   0  yongsun 	long s = 0;
     83   0  yongsun 	for (int i=0; i < para_offsets.size(); ++i) {
     84   0  yongsun 		merger.addPara(swap, s, para_offsets[i]);
     85   0  yongsun 		s = para_offsets[i];
     86   0  yongsun 	}
     87   0  yongsun 	DoIdngramMerge<N>(out, merger);
     88   0  yongsun }
     89   0  yongsun 
     90   0  yongsun template<int N>
     91   0  yongsun void ProcessingIdngramMerge(FILE* out, std::vector<FILE* >& file_list)
     92   0  yongsun {
     93   0  yongsun 	CMultiWayFileMerger<CSIM_IdngramFreq<N> > merger;
     94   0  yongsun 	for (int i=0; i < file_list.size(); ++i) {
     95   0  yongsun 		fseek(file_list[i], 0, SEEK_END);
     96   0  yongsun 		merger.addPara(file_list[i], 0, ftell(file_list[i]));
     97   0  yongsun 	}
     98   0  yongsun 	DoIdngramMerge<N>(out, merger);
     99   0  yongsun }
    100   0  yongsun 
    101   0  yongsun #endif
    102   0  yongsun 
    103