/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
 *
 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
 *
 * The contents of this file are subject to the terms of either the GNU Lesser
 * General Public License Version 2.1 only ("LGPL") or the Common Development and
 * Distribution License ("CDDL")(collectively, the "License"). You may not use this
 * file except in compliance with the License. You can obtain a copy of the CDDL at
 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
 * specific language governing permissions and limitations under the License. When
 * distributing the software, include this License Header Notice in each file and
 * include the full text of the License in the License file as well as the
 * following notice:
 *
 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
 * (CDDL)
 * For Covered Software in this distribution, this License shall be governed by the
 * laws of the State of California (excluding conflict-of-law provisions).
 * Any litigation relating to this License shall be subject to the jurisdiction of
 * the Federal Courts of the Northern District of California and the state courts
 * of the State of California, with venue lying in Santa Clara County, California.
 *
 * Contributor(s):
 *
 * If you wish your version of this file to be governed by only the CDDL or only
 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
 * include this software in this distribution under the [CDDL or LGPL Version 2.1]
 * license." If you don't indicate a single choice of license, a recipient has the
 * option to distribute your version of this file under either the CDDL or the LGPL
 * Version 2.1, or to extend the choice of license to its licensees as provided
 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
 * Version 2 license, then the option applies only if the new code is made subject
 * to such option by the copyright holder.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifdef HAVE_ASSERT_H
#include <assert.h>
#endif

#ifdef HAVE_GETOPT_H
#include <getopt.h>
#endif

#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <map>
#include <vector>

#include "../sim_fmerge.h"
#include "idngram.h"
#include "idngram_merge.h"

     59    0   yongsun template<int N>
     60    0   yongsun void WriteOut(FILE* out, std::map<CSIM_Idngram<N>, unsigned int> & map)
     61    0   yongsun {
     62    0   yongsun 	typedef typename std::map<CSIM_Idngram<N>,unsigned int>::iterator TMapIterator;
     63    0   yongsun 	TMapIterator its=map.begin(), ite=map.end();
     64    0   yongsun 	for (; its != ite; ++its) {
     65    0   yongsun 		fwrite(its->first.ids, sizeof(TSIMWordId), N, out);
     66    0   yongsun 		fwrite(&(its->second), sizeof(unsigned int), 1, out);
     67    0   yongsun 	}
     68    0   yongsun 	map.clear();
     69    0   yongsun }
     70    0   yongsun 
     71    0   yongsun template<int N>
     72    0   yongsun void ProcessingRead(FILE *fp, FILE* swap, std::vector<long>& para_offsets, size_t paraMax)
     73    0   yongsun {
     74    0   yongsun 	typedef CSIM_Idngram<N> TNgram;
     75    0   yongsun 	typedef typename std::map<CSIM_Idngram<N>, unsigned int> TMap;
     76    0   yongsun 
     77    0   yongsun 	TMap map;
     78    0   yongsun 	TNgram ngram;
     79    0   yongsun 
     80    0   yongsun 	TSIMWordId* ids = ngram.ids;
     81    0   yongsun 	fread(ids, sizeof(TSIMWordId), N-1, fp);
     82    0   yongsun 	while (fread(ids+N-1, sizeof(TSIMWordId), 1, fp) == 1) {
     83    0   yongsun 		++map[ngram];
     84    0   yongsun 		if (map.size() >= paraMax)
     85    0   yongsun 		{
     86    0   yongsun 			printf("."); fflush(stdout);
     87    0   yongsun 			WriteOut(swap, map);
     88    0   yongsun 			para_offsets.push_back(ftell(swap));
     89    0   yongsun 		}
     90    0   yongsun 		for (int i=0; i<N-1; ++i) ids[i] = ids[i+1];
     91    0   yongsun 	}
     92    0   yongsun 	if (map.size() > 0) {
     93    0   yongsun 		printf("."); fflush(stdout);
     94    0   yongsun 		WriteOut(swap, map);
     95    0   yongsun 		para_offsets.push_back(ftell(swap));
     96    0   yongsun 	}
     97    0   yongsun }
     98    0   yongsun 
     99    0   yongsun static struct option long_options[] =
    100    0   yongsun {
    101    0   yongsun 	{"NMax", 1, 0, 'n'},
    102    0   yongsun 	{"out", 1, 0, 'o'},
    103    0   yongsun 	{"swap", 1, 0, 's'},
    104    0   yongsun 	{"para", 1, 0, 'p'},
    105    0   yongsun 	{0, 0, 0, 0}
    106    0   yongsun };
    107    0   yongsun 
    108    0   yongsun static int N=0;
    109    0   yongsun static int paraMax=0;
    110    0   yongsun static char* output=NULL;
    111    0   yongsun static char* swapfile=NULL;
    112    0   yongsun 
    113    0   yongsun void ShowUsage()
    114    0   yongsun {
    115  209  tchaikov 	printf("Usage:\n\tids2ngram options idsfile[ idsfile...]\n");
    116    0   yongsun 	printf("\nDescription\n");
    117    0   yongsun 	printf("   This program generate idngram file, which is a sorted [id1,..idN,freq] array, from binary id stream files.\n");
    118    0   yongsun 	printf("\nInput:\n");
    119    0   yongsun 	printf("\tBinary id stream files looks like [id0,...,idX]\n");
    120    0   yongsun 	printf("\nOptions:\n");
    121    0   yongsun 	printf("\t  -n N               # N-gram\n");
    122    0   yongsun 	printf("\t  -s swapfile        # intermedia temporary file\n");
    123    0   yongsun 	printf("\t  -o outputfile      # result idngram file [id1, ... idN, freq]*\n");
    124    0   yongsun 	printf("\t  -p para_size       # maxium ngram-items per para\n");
    125    0   yongsun 	printf("\nExample:\n");
    126    0   yongsun 	printf("   Following example will use three input idstream file idsfile[1,2,3] to generate the idngram file all.id3gram. Each para (internal map size or hash size) would be 1024000, using swap file for temp result. All temp para result would final be merged to got the final result.\n");
    127    0   yongsun 	printf("\tids2idngram -n 3 -s /tmp/swap -o all.id3gram -p 1024000 idsfile1 idsfile2 idsfile3\n\n");
    128    0   yongsun 	exit(100);
    129    0   yongsun }
    130    0   yongsun 
    131    0   yongsun static void getParameters(int argc, char* const argv[])
    132    0   yongsun {
    133    0   yongsun 	int option_index = 0;
    134    0   yongsun 	int c;
    135    0   yongsun 	while ((c=getopt_long(argc, argv, "p:n:s:o:", long_options, &option_index)) != -1)
    136    0   yongsun 	{
    137    0   yongsun 		switch (c) {
    138    0   yongsun 		case 'n':
    139    0   yongsun 			N = atoi(strdup(optarg));
    140    0   yongsun 			break;
    141    0   yongsun 		case 'p':
    142    0   yongsun 			paraMax = atoi(strdup(optarg));
    143    0   yongsun 			break;
    144    0   yongsun 		case 'o':
    145    0   yongsun 			output = strdup(optarg);
    146    0   yongsun 			break;
    147    0   yongsun 		case 's':
    148    0   yongsun 			swapfile = strdup(optarg);
    149    0   yongsun 			break;
    150    0   yongsun 		default:
    151    0   yongsun 			ShowUsage();
    152    0   yongsun 		}
    153    0   yongsun 	}
    154    0   yongsun 	if (N < 1 || N > 3 || paraMax < 1024 || output == NULL || swapfile == NULL)
    155    0   yongsun 		ShowUsage();
    156    0   yongsun }
    157    0   yongsun 
    158    0   yongsun static std::vector<long> para_offsets;
    159    0   yongsun 
    160    0   yongsun int main(int argc, char* argv[])
    161    0   yongsun {
    162    0   yongsun 	getParameters(argc, argv);
    163    0   yongsun 	FILE *swap = fopen(swapfile, "wb+");
    164    0   yongsun 	FILE *out = fopen(output, "wb+");
    165    0   yongsun 	if (optind >= argc) ShowUsage();
    166    0   yongsun    while (optind < argc) {
    167    0   yongsun     	printf("Processing %s:", argv[optind]); fflush(stdout);
    168    0   yongsun     	FILE *fp = fopen(argv[optind], "rb");
    169    0   yongsun     	switch (N) {
    170    0   yongsun     	case 1:
    171    0   yongsun   			ProcessingRead<1>(fp, swap, para_offsets, paraMax);
    172    0   yongsun   			break;
    173    0   yongsun     	case 2:
    174    0   yongsun   			ProcessingRead<2>(fp, swap, para_offsets, paraMax);
    175    0   yongsun   			break;
    176    0   yongsun     	case 3:
    177    0   yongsun   			ProcessingRead<3>(fp, swap, para_offsets, paraMax);
    178    0   yongsun   			break;
    179    0   yongsun     	}
    180    0   yongsun 		fclose(fp);
    181    0   yongsun       printf ("\n"); fflush(stdout);
    182    0   yongsun 		++optind;
    183    0   yongsun    }
    184    0   yongsun    printf("Merging..."); fflush(stdout);
    185    0   yongsun 	switch (N) {
    186    0   yongsun   	case 1:
    187    0   yongsun 		ProcessingIdngramMerge<1>(swap, out, para_offsets);
    188    0   yongsun 		break;
    189    0   yongsun   	case 2:
    190    0   yongsun 		ProcessingIdngramMerge<2>(swap, out, para_offsets);
    191    0   yongsun 		break;
    192    0   yongsun   	case 3:
    193    0   yongsun 		ProcessingIdngramMerge<3>(swap, out, para_offsets);
    194    0   yongsun 		break;
    195    0   yongsun   	}
    196    0   yongsun    printf ("Done\n"); fflush(stdout);
    197    0   yongsun    fclose(out);
    198    0   yongsun    fclose(swap);
    199    0   yongsun   	return 0;
    200    0   yongsun }
    201    0   yongsun 
    202