Posted to commits@joshua.apache.org by mj...@apache.org on 2016/06/01 02:51:42 UTC

[47/94] [abbrv] [partial] incubator-joshua git commit: Pulled JOSHUA-252 changes and Resolved Merge Conflicts

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ef91969a/ext/giza-pp/GIZA++-v2/main.cpp
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/main.cpp b/ext/giza-pp/GIZA++-v2/main.cpp
deleted file mode 100644
index d1b588f..0000000
--- a/ext/giza-pp/GIZA++-v2/main.cpp
+++ /dev/null
@@ -1,719 +0,0 @@
-/*
-
-EGYPT Toolkit for Statistical Machine Translation
-Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful, 
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, 
-USA.
-
-*/
-
-#include <sstream>
-#include "getSentence.h"
-#include "TTables.h"
-#include "model1.h"
-#include "model2.h"
-#include "model3.h"
-#include "hmm.h"
-#include "file_spec.h"
-#include "defs.h"
-#include "vocab.h"
-#include "Perplexity.h"
-#include "Dictionary.h"
-#include "utility.h" 
-#include "Parameter.h"
-#include "myassert.h"
-#include "D4Tables.h"
-#include "D5Tables.h"
-#include "transpair_model4.h"
-#include "transpair_model5.h"
-
-#define ITER_M2 0
-#define ITER_MH 5
-
-GLOBAL_PARAMETER3(int,Model1_Iterations,"Model1_Iterations","NO. ITERATIONS MODEL 1","m1","number of iterations for Model 1",PARLEV_ITER,5);
-GLOBAL_PARAMETER3(int,Model2_Iterations,"Model2_Iterations","NO. ITERATIONS MODEL 2","m2","number of iterations for Model 2",PARLEV_ITER,ITER_M2);
-GLOBAL_PARAMETER3(int,HMM_Iterations,"HMM_Iterations","NO. ITERATIONS HMM","mh","number of iterations for HMM alignment model",PARLEV_ITER,ITER_MH);
-GLOBAL_PARAMETER3(int,Model3_Iterations,"Model3_Iterations","NO. ITERATIONS MODEL 3","m3","number of iterations for Model 3",PARLEV_ITER,5);
-GLOBAL_PARAMETER3(int,Model4_Iterations,"Model4_Iterations","NO. ITERATIONS MODEL 4","m4","number of iterations for Model 4",PARLEV_ITER,5);
-GLOBAL_PARAMETER3(int,Model5_Iterations,"Model5_Iterations","NO. ITERATIONS MODEL 5","m5","number of iterations for Model 5",PARLEV_ITER,0);
-GLOBAL_PARAMETER3(int,Model6_Iterations,"Model6_Iterations","NO. ITERATIONS MODEL 6","m6","number of iterations for Model 6",PARLEV_ITER,0);
-
-
-GLOBAL_PARAMETER(float, PROB_SMOOTH,"probSmooth","probability smoothing (floor) value ",PARLEV_OPTHEUR,1e-7);
-GLOBAL_PARAMETER(float, MINCOUNTINCREASE,"minCountIncrease","minimal count increase",PARLEV_OPTHEUR,1e-7);
-
-GLOBAL_PARAMETER2(int,Transfer_Dump_Freq,"TRANSFER DUMP FREQUENCY","t2to3","output: dump of transfer from Model 2 to 3",PARLEV_OUTPUT,0);
-GLOBAL_PARAMETER2(bool,Verbose,"verbose","v","0: not verbose; 1: verbose",PARLEV_OUTPUT,0);
-GLOBAL_PARAMETER(bool,Log,"log","0: no logfile; 1: logfile",PARLEV_OUTPUT,0);
-
-
-GLOBAL_PARAMETER(double,P0,"p0","fixed value for parameter p_0 in IBM-3/4 (if negative then it is determined in training)",PARLEV_EM,-1.0);
-GLOBAL_PARAMETER(double,M5P0,"m5p0","fixed value for parameter p_0 in IBM-5 (if negative then it is determined in training)",PARLEV_EM,-1.0);
-GLOBAL_PARAMETER3(bool,Peg,"pegging","p","DO PEGGING? (Y/N)","0: no pegging; 1: do pegging",PARLEV_EM,0);
-
-GLOBAL_PARAMETER(short,OldADBACKOFF,"adbackoff","",-1,0);
-GLOBAL_PARAMETER2(unsigned int,MAX_SENTENCE_LENGTH,"ml","MAX SENTENCE LENGTH","maximum sentence length",0,MAX_SENTENCE_LENGTH_ALLOWED);
-
-
-GLOBAL_PARAMETER(short, DeficientDistortionForEmptyWord,"DeficientDistortionForEmptyWord","0: IBM-3/IBM-4 as described in (Brown et al. 1993); 1: distortion model of empty word is deficient; 2: distortion model of empty word is deficient (differently); setting this parameter also helps to avoid aligning too many words with the empty word during IBM-3 and IBM-4 training",PARLEV_MODELS,0);
-short OutputInAachenFormat=0;
-bool Transfer=TRANSFER;
-bool Transfer2to3=0;
-short NoEmptyWord=0;
-bool FEWDUMPS=0;
-GLOBAL_PARAMETER(bool,ONLYALDUMPS,"ONLYALDUMPS","1: do not write any files",PARLEV_OUTPUT,0);
-GLOBAL_PARAMETER(short,CompactAlignmentFormat,"CompactAlignmentFormat","0: detailed alignment format, 1: compact alignment format",PARLEV_OUTPUT,0);
-GLOBAL_PARAMETER2(bool,NODUMPS,"NODUMPS","NO FILE DUMPS? (Y/N)","1: do not write any files",PARLEV_OUTPUT,0);
-
-GLOBAL_PARAMETER(WordIndex,MAX_FERTILITY,"MAX_FERTILITY","maximal fertility for fertility models",PARLEV_EM,10);
-
-Vector<map< pair<int,int>,char > > ReferenceAlignment;
-
-
-bool useDict = false;
-string CoocurrenceFile;
-string Prefix, LogFilename, OPath, Usage, 
-  SourceVocabFilename, TargetVocabFilename, CorpusFilename, 
-  TestCorpusFilename, t_Filename, a_Filename, p0_Filename, d_Filename, 
-  n_Filename, dictionary_Filename;
-
-ofstream logmsg ;
-// decimal string representation of a non-negative int
-const string str2Num(int n){
-  string number = "";
-  do{
-    number.insert((size_t)0, 1, (char)(n % 10 + '0'));
-  } while((n /= 10) > 0);
-  return(number) ;
-}
-
-
-double LAMBDA=1.09;
-sentenceHandler *testCorpus=0,*corpus=0;
-Perplexity trainPerp, testPerp, trainViterbiPerp, testViterbiPerp ;
-
-string ReadTablePrefix;
-
-
-void printGIZAPars(ostream&out)
-{
-  out << "general parameters:\n"
-         "-------------------\n";
-  printPars(out,getGlobalParSet(),0);
-  out << '\n';
-
-  out << "No. of iterations:\n"
-         "------------------\n";
-  printPars(out,getGlobalParSet(),PARLEV_ITER);
-  out << '\n';
-
-  out << "parameter for various heuristics in GIZA++ for efficient training:\n"
-         "------------------------------------------------------------------\n";
-  printPars(out,getGlobalParSet(),PARLEV_OPTHEUR);
-  out << '\n';
-
-  out << "parameters for describing the type and amount of output:\n"
-         "-----------------------------------------------------------\n";
-  printPars(out,getGlobalParSet(),PARLEV_OUTPUT);
-  out << '\n';
-
-  out << "parameters describing input files:\n"
-         "----------------------------------\n";
-  printPars(out,getGlobalParSet(),PARLEV_INPUT);
-  out << '\n';
-
-  out << "smoothing parameters:\n"
-         "---------------------\n";
-  printPars(out,getGlobalParSet(),PARLEV_SMOOTH);
-  out << '\n';
-
-  out << "parameters modifying the models:\n"
-         "--------------------------------\n";
-  printPars(out,getGlobalParSet(),PARLEV_MODELS);
-  out << '\n';
-
-  out << "parameters modifying the EM-algorithm:\n"
-         "--------------------------------------\n";
-  printPars(out,getGlobalParSet(),PARLEV_EM);
-  out << '\n';
-}
-
-const char*stripPath(const char*fullpath)
-  // strip the path info from the file name 
-{
-  const char *ptr = fullpath + strlen(fullpath) - 1 ;
-  while(ptr && ptr > fullpath && *ptr != '/'){ptr--;}
-  if( *ptr=='/' )
-    return(ptr+1);
-  else
-    return ptr;
-}
-
-
-void printDecoderConfigFile()
-{
-  string decoder_config_file = Prefix + ".Decoder.config" ;
-  cerr << "writing decoder configuration file to " <<  decoder_config_file.c_str() <<'\n';
-  ofstream decoder(decoder_config_file.c_str());
-  if(!decoder){
-    cerr << "\nCannot write to " << decoder_config_file <<'\n';
-    exit(1);
-  }
-  decoder << "# Template for Configuration File for the Rewrite Decoder\n# Syntax:\n" 
-	  << "#         <Variable> = <value>\n#         '#' is the comment character\n"
-	  << "#================================================================\n"
-	  << "#================================================================\n"
-	  << "# LANGUAGE MODEL FILE\n# The full path and file name of the language model file:\n";
-  decoder << "LanguageModelFile =\n";
-  decoder << "#================================================================\n"
-	  << "#================================================================\n"
-	  << "# TRANSLATION MODEL FILES\n# The directory where the translation model tables as created\n"
-	  << "# by Giza are located:\n#\n"
-	  << "# Notes: - All translation model \"source\" files are assumed to be in\n"
-	  << "#          TM_RawDataDir, the binaries will be put in TM_BinDataDir\n"
-	  << "#\n#        - Attention: RELATIVE PATH NAMES DO NOT WORK!!!\n"
-	  << "#\n#        - Absolute paths (file name starts with /) will override\n"
-	  << "#          the default directory.\n\n";
-  // strip file prefix info and leave only the path name in Prefix
-  string path = Prefix.substr(0, Prefix.find_last_of("/")+1);
-  if( path=="" )
-    path=".";
-  decoder << "TM_RawDataDir = " << path << '\n';
-  decoder << "TM_BinDataDir = " << path << '\n' << '\n';
-  decoder << "# file names of the TM tables\n# Notes:\n"
-	  << "# 1. TTable and InverseTTable are expected to use word IDs not\n"
-	  << "#    strings (Giza produces both, where the *.actual.* files\n"
-	  << "#    use strings and are THE WRONG CHOICE).\n"
-	  << "# 2. FZeroWords, on the other hand, is a simple list of strings\n"
-	  << "#    with one word per line. This file is typically edited\n"
-	  << "#    manually. However, the one listed here is generated by GIZA\n\n";
-  
-  int lastmodel;
-  if (Model5_Iterations>0)
-    lastmodel = 5 ;
-  else if (Model4_Iterations>0)
-    lastmodel = 4 ;
-  else if (Model3_Iterations>0)
-    lastmodel = 3 ;
-  else if (Model2_Iterations>0)
-    lastmodel = 2 ;
-  else lastmodel = 1 ;
-  string lastModelName = str2Num(lastmodel);
-  string p=Prefix + ".t" + /*lastModelName*/"3" +".final";
-  decoder << "TTable = " << stripPath(p.c_str()) << '\n';
-  p = Prefix + ".ti.final" ;
-  decoder << "InverseTTable = " << stripPath(p.c_str()) << '\n';
-  p=Prefix + ".n" + /*lastModelName*/"3" + ".final";
-  decoder << "NTable = " << stripPath(p.c_str())  << '\n';
-  p=Prefix + ".d" + /*lastModelName*/"3" + ".final";
-  decoder << "D3Table = " << stripPath(p.c_str())  << '\n';
-  p=Prefix + ".D4.final";
-  decoder << "D4Table = " << stripPath(p.c_str()) << '\n';
-  p=Prefix + ".p0_"+ /*lastModelName*/"3" + ".final";
-  decoder << "PZero = " << stripPath(p.c_str()) << '\n';
-  decoder << "Source.vcb = " << SourceVocabFilename << '\n';
-  decoder << "Target.vcb = " << TargetVocabFilename << '\n';
-  //  decoder << "Source.classes = " << SourceVocabFilename + ".classes" << '\n';
-  //  decoder << "Target.classes = " << TargetVocabFilename + ".classes" <<'\n';
-  decoder << "Source.classes = " << SourceVocabFilename+".classes" << '\n';
-  decoder << "Target.classes = " << TargetVocabFilename + ".classes" <<'\n';
-  p=Prefix + ".fe0_"+ /*lastModelName*/"3" + ".final";
-  decoder << "FZeroWords       = " <<stripPath(p.c_str()) << '\n' ;
-
-  /*  decoder << "# Translation Parameters\n"
-      << "# Note: TranslationModel and LanguageModelMode must have NUMBERS as\n"
-      << "# values, not words\n"
-      << "# CORRECT: LanguageModelMode = 2\n"
-      << "# WRONG:   LanguageModelMode = bigrams # WRONG, WRONG, WRONG!!!\n";
-      decoder << "TMWeight          = 0.6 # weight of TM for calculating alignment probability\n";
-      decoder << "TranslationModel  = "<<lastmodel<<"   # which model to use (3 or 4)\n";
-      decoder << "LanguageModelMode = 2   # (2 (bigrams) or 3 (trigrams)\n\n";
-      decoder << "# Output Options\n"
-      << "TellWhatYouAreDoing = TRUE # print diagnostic messages to stderr\n"
-      << "PrintOriginal       = TRUE # repeat original sentence in the output\n"
-      << "TopTranslations     = 3    # number of n best translations to be returned\n"
-      << "PrintProbabilities  = TRUE # give the probabilities for the translations\n\n";
-      
-      decoder << "# LOGGING OPTIONS\n"
-      << "LogFile = - # empty means: no log, dash means: STDOUT\n"
-      << "LogLM = true # log language model lookups\n"
-      << "LogTM = true # log translation model lookups\n";
-      */
-}
-
-
-void printAllTables(vcbList& eTrainVcbList, vcbList& eTestVcbList,
-		    vcbList& fTrainVcbList, vcbList& fTestVcbList, model1& m1)
-{
-  cerr << "writing Final tables to Disk \n";
-  string t_inv_file = Prefix + ".ti.final" ;
-  if( !FEWDUMPS)
-    m1.getTTable().printProbTableInverse(t_inv_file.c_str(), m1.getEnglishVocabList(), 
-					 m1.getFrenchVocabList(), 
-					 m1.getETotalWCount(), 
-					 m1.getFTotalWCount());
-  t_inv_file = Prefix + ".actual.ti.final" ;
-  if( !FEWDUMPS )
-    m1.getTTable().printProbTableInverse(t_inv_file.c_str(), 
-					 eTrainVcbList.getVocabList(), 
-					 fTrainVcbList.getVocabList(), 
-					 m1.getETotalWCount(), 
-					 m1.getFTotalWCount(), true);
-  
-  string perp_filename = Prefix + ".perp" ;
-  ofstream of_perp(perp_filename.c_str());
-  
-  cout << "Writing PERPLEXITY report to: " << perp_filename << '\n';
-  if(!of_perp){
-    cerr << "\nERROR: Cannot write to " << perp_filename <<'\n';
-    exit(1);
-  }
-  
-  if (testCorpus)
-    generatePerplexityReport(trainPerp, testPerp, trainViterbiPerp, 
-			     testViterbiPerp, of_perp, (*corpus).getTotalNoPairs1(), 
-			     (*testCorpus).getTotalNoPairs1(),
-			     true);
-  else 
-    generatePerplexityReport(trainPerp, testPerp, trainViterbiPerp, testViterbiPerp, 
-			     of_perp, (*corpus).getTotalNoPairs1(), 0, true);
-  
-  string eTrainVcbFile = Prefix + ".trn.src.vcb" ;
-  ofstream of_eTrainVcb(eTrainVcbFile.c_str());
-  cout << "Writing source vocabulary list to : " << eTrainVcbFile << '\n';
-  if(!of_eTrainVcb){
-    cerr << "\nERROR: Cannot write to " << eTrainVcbFile <<'\n';
-    exit(1);
-  }
-  eTrainVcbList.printVocabList(of_eTrainVcb) ;
-  
-  string fTrainVcbFile = Prefix + ".trn.trg.vcb" ;
-  ofstream of_fTrainVcb(fTrainVcbFile.c_str());
-  cout << "Writing target vocabulary list to : " << fTrainVcbFile << '\n';
-  if(!of_fTrainVcb){
-    cerr << "\nERROR: Cannot write to " << fTrainVcbFile <<'\n';
-    exit(1);
-  }
-  fTrainVcbList.printVocabList(of_fTrainVcb) ;
-  
-  //print test vocabulary list 
-  
-  string eTestVcbFile = Prefix + ".tst.src.vcb" ;
-  ofstream of_eTestVcb(eTestVcbFile.c_str());
-  cout << "Writing source vocabulary list to : " << eTestVcbFile << '\n';
-  if(!of_eTestVcb){
-    cerr << "\nERROR: Cannot write to " << eTestVcbFile <<'\n';
-    exit(1);
-  }
-  eTestVcbList.printVocabList(of_eTestVcb) ;
-  
-  string fTestVcbFile = Prefix + ".tst.trg.vcb" ;
-  ofstream of_fTestVcb(fTestVcbFile.c_str());
-  cout << "Writing target vocabulary list to : " << fTestVcbFile << '\n';
-  if(!of_fTestVcb){
-    cerr << "\nERROR: Cannot write to " << fTestVcbFile <<'\n';
-    exit(1);
-  }
-  fTestVcbList.printVocabList(of_fTestVcb) ;
-  printDecoderConfigFile();
-  if (testCorpus)
-    printOverlapReport(m1.getTTable(), *testCorpus, eTrainVcbList, 
-		       fTrainVcbList, eTestVcbList, fTestVcbList);
-  
-}
-
-bool readNextSent(istream&is,map< pair<int,int>,char >&s,int&number)
-{
-  string x;
-  if( !(is >> x) ) return 0;
-  if( x=="SENT:" ) is >> x;
-  int n=atoi(x.c_str());
-  if( number==-1 )
-    number=n;
-  else
-    if( number!=n )
-      {
-	cerr << "ERROR: readNextSent: DIFFERENT NUMBERS: " << number << " " << n << '\n';
-	return 0;
-      }
-  int nS,nP,nO;
-  nS=nP=nO=0;
-  while( is >> x )
-    {
-      if( x=="SENT:" )
-	return 1;
-      int n1,n2;
-      is >> n1 >> n2;
-      map< pair<int,int>,char >::const_iterator i=s.find(pair<int,int>(n1,n2));
-      if( i==s.end()||i->second=='P' )
-	s[pair<int,int>(n1,n2)]=x[0];
-      massert(x[0]=='S'||x[0]=='P');
-      nS+= (x[0]=='S');
-      nP+= (x[0]=='P');
-      nO+= (!(x[0]=='S'||x[0]=='P'));
-    }
-  return 1;
-}
-
-bool emptySent(map< pair<int,int>,char >&x)
-// clears x and always returns true, so it can be chained in a loop condition
-{
-  x = map< pair<int,int>,char >();
-  return 1;
-}
-
-void ReadAlignment(const string&x,Vector<map< pair<int,int>,char > >&a)
-{
-  ifstream infile(x.c_str());
-  a.clear();
-  map< pair<int,int>,char >sent;
-  int number=0;
-  while( emptySent(sent) && (readNextSent(infile,sent,number)) )
-    {
-      if( int(a.size())!=number )
-	cerr << "ERROR: ReadAlignment: " << a.size() << " " << number << '\n';
-      a.push_back(sent);
-      number++;
-    }
-  cout << "Read: " << a.size() << " sentences in reference alignment." << '\n';
-}
-    
-
-void initGlobals(void)
-{
-  NODUMPS = false ;
-  Prefix = Get_File_Spec();
-  LogFilename= Prefix + ".log";
-  MAX_SENTENCE_LENGTH = MAX_SENTENCE_LENGTH_ALLOWED ;
-}
-
-void convert(const map< pair<int,int>,char >&reference,alignment&x)
-{
-  int l=x.get_l();
-  int m=x.get_m();
-  for(map< pair<int,int>,char >::const_iterator i=reference.begin();i!=reference.end();++i)
-    {
-      if( i->first.first+1>int(m) )
-	{
-	  cerr << "ERROR: m too big: " << i->first.first << " " << i->first.second+1 << " " << l << " " << m << " is wrong.\n";
-	  continue;
-	}
-      if( i->first.second+1>int(l) )
-	{
-	  cerr << "ERROR: l too big: " << i->first.first << " " << i->first.second+1 << " " << l << " " << m << " is wrong.\n";
-	  continue;
-	}
-      if( x(i->first.first+1)!=0 )
-	cerr << "ERROR: position " << i->first.first+1 << " already set\n";
-      x.set(i->first.first+1,i->first.second+1);
-    }
-}
-double ErrorsInAlignment(const map< pair<int,int>,char >&reference,const Vector<WordIndex>&test,int l,int&missing,int&toomuch,int&eventsMissing,int&eventsToomuch,int pair_no)
-{
-  int err=0;
-  for(unsigned int j=1;j<test.size();j++)
-    {
-      if( test[j]>0 )
-	{
-	  map< pair<int,int>,char >::const_iterator i=reference.find(make_pair(test[j]-1,j-1));
-	  if( i==reference.end() )
-	    {
-	      toomuch++;
-	      err++;
-	    }
-	  else
-	    if( !(i->second=='S' || i->second=='P'))
-	      cerr << "ERROR: wrong symbol in reference alignment '" << i->second << ' ' << int(i->second) << " no:" << pair_no<< "'\n";
-	  eventsToomuch++;
-	}
-    }
-  for(map< pair<int,int>,char >::const_iterator i=reference.begin();i!=reference.end();++i)
-    {
-      if( i->second=='S' )
-	{
-	  unsigned int J=i->first.second+1;
-	  unsigned int I=i->first.first+1;
-	  if( int(J)>=int(test.size())||int(I)>int(l)||int(J)<1||int(I)<1 )
-	    cerr << "ERROR: alignment outside of range in reference alignment " << J << " " << test.size() << " (" << I << " " << l << ") no:" << pair_no << '\n';
-	  else
-	    {
-	      if(test[J]!=I)
-		{
-		  missing++;
-		  err++;
-		}
-	    }
-	  eventsMissing++;
-	}
-    }
-  if( Verbose )
-    cout << err << " errors in sentence\n";
-  if( eventsToomuch+eventsMissing )
-    return (toomuch+missing)/double(eventsToomuch+eventsMissing);
-  else
-    return 1.0;
-}
-
-
-vcbList *globeTrainVcbList,*globfTrainVcbList;
-
-double StartTraining(int&result)
-{ 
-  double errors=0.0;
-  vcbList eTrainVcbList, fTrainVcbList;
-  globeTrainVcbList=&eTrainVcbList;
-  globfTrainVcbList=&fTrainVcbList;
-
-
-  string repFilename = Prefix + ".gizacfg" ;
-  ofstream of2(repFilename.c_str());
-  writeParameters(of2,getGlobalParSet(),-1) ;
-
-  cout << "reading vocabulary files \n";
-  eTrainVcbList.setName(SourceVocabFilename.c_str());
-  fTrainVcbList.setName(TargetVocabFilename.c_str());
-  eTrainVcbList.readVocabList();
-  fTrainVcbList.readVocabList();
-  cout << "Source vocabulary list has " << eTrainVcbList.uniqTokens() << " unique tokens \n";
-  cout << "Target vocabulary list has " << fTrainVcbList.uniqTokens() << " unique tokens \n";
-  
-  vcbList eTestVcbList(eTrainVcbList) ;
-  vcbList fTestVcbList(fTrainVcbList) ;
-  
-  corpus = new sentenceHandler(CorpusFilename.c_str(), &eTrainVcbList, &fTrainVcbList);
-
-  if (TestCorpusFilename == "NONE")
-    TestCorpusFilename = "";
-
-  if (TestCorpusFilename != ""){
-    cout << "Test corpus will be read from: " << TestCorpusFilename << '\n';
-      testCorpus= new sentenceHandler(TestCorpusFilename.c_str(), 
-						       &eTestVcbList, &fTestVcbList);
-      cout << " Test total # sentence pairs : " <<(*testCorpus).getTotalNoPairs1()<<" weighted:"<<(*testCorpus).getTotalNoPairs2() <<'\n';
-
-      cout << "Size of the source portion of test corpus: " << eTestVcbList.totalVocab() << " tokens\n";
-      cout << "Size of the target portion of test corpus: " << fTestVcbList.totalVocab() << " tokens \n";
-      cout << "In source portion of the test corpus, only " << eTestVcbList.uniqTokensInCorpus() << " unique tokens appeared\n";
-      cout << "In target portion of the test corpus, only " << fTestVcbList.uniqTokensInCorpus() << " unique tokens appeared\n";
-      cout << "ratio (target/source) : " << double(fTestVcbList.totalVocab()) /
-	eTestVcbList.totalVocab() << '\n';
-  }
-  
-  cout << " Train total # sentence pairs (weighted): " << corpus->getTotalNoPairs2() << '\n';
-  cout << "Size of source portion of the training corpus: " << eTrainVcbList.totalVocab()-corpus->getTotalNoPairs2() << " tokens\n";
-  cout << "Size of the target portion of the training corpus: " << fTrainVcbList.totalVocab() << " tokens \n";
-  cout << "In source portion of the training corpus, only " << eTrainVcbList.uniqTokensInCorpus() << " unique tokens appeared\n";
-  cout << "In target portion of the training corpus, only " << fTrainVcbList.uniqTokensInCorpus() << " unique tokens appeared\n";
-  cout << "lambda for PP calculation in IBM-1,IBM-2,HMM:= " << double(fTrainVcbList.totalVocab()) << "/(" << eTrainVcbList.totalVocab() << "-" << corpus->getTotalNoPairs2() << ")=";
-  LAMBDA = double(fTrainVcbList.totalVocab()) / (eTrainVcbList.totalVocab()-corpus->getTotalNoPairs2());
-  cout << LAMBDA << '\n';
-  // load dictionary
-  Dictionary *dictionary;  
-  useDict = !dictionary_Filename.empty();
-  if (useDict) dictionary = new Dictionary(dictionary_Filename.c_str());
-  else dictionary = new Dictionary("");
-  int minIter=0;
-#ifdef BINARY_SEARCH_FOR_TTABLE
-  if( CoocurrenceFile.length()==0 )
-    {
-      cerr << "ERROR: NO COOCURRENCE FILE GIVEN!\n";
-      abort();
-    }
-  //ifstream coocs(CoocurrenceFile.c_str());
-  tmodel<COUNT, PROB> tTable(CoocurrenceFile);
-#else
-  tmodel<COUNT, PROB> tTable;
-#endif
-
-  model1 m1(CorpusFilename.c_str(), eTrainVcbList, fTrainVcbList,tTable,trainPerp, 
-	    *corpus,&testPerp, testCorpus, 
-	    trainViterbiPerp, &testViterbiPerp);
-   amodel<PROB>  aTable(false);
-   amodel<COUNT> aCountTable(false);
-   model2 m2(m1,aTable,aCountTable);
-   hmm h(m2);
-   model3 m3(m2); 
-   if(ReadTablePrefix.length() )
-     {
-       string number = "final";
-       string tfile, afile, nfile, dfile, d4file, p0file; //d5file
-       tfile = ReadTablePrefix + ".t3." + number ;
-       afile = ReadTablePrefix + ".a3." + number ;
-       nfile = ReadTablePrefix + ".n3." + number ;
-       dfile = ReadTablePrefix + ".d3." + number ;
-       d4file = ReadTablePrefix + ".d4." + number ;
-       //d5file = ReadTablePrefix + ".d5." + number ;
-       p0file = ReadTablePrefix + ".p0_3." + number ;
-       tTable.readProbTable(tfile.c_str());
-       aTable.readTable(afile.c_str());
-       m3.dTable.readTable(dfile.c_str());
-       m3.nTable.readNTable(nfile.c_str());
-       sentPair sent ;
-       double p0;
-       ifstream p0f(p0file.c_str());
-       p0f >> p0;
-       d4model d4m(MAX_SENTENCE_LENGTH);
-       d4m.makeWordClasses(m1.Elist,m1.Flist,SourceVocabFilename+".classes",TargetVocabFilename+".classes");
-       d4m.readProbTable(d4file.c_str());
-       //d5model d5m(d4m);
-       //d5m.makeWordClasses(m1.Elist,m1.Flist,SourceVocabFilename+".classes",TargetVocabFilename+".classes");
-       //d5m.readProbTable(d5file.c_str());
-       makeSetCommand("model4smoothfactor","0.0",getGlobalParSet(),2);
-       //makeSetCommand("model5smoothfactor","0.0",getGlobalParSet(),2);
-       if( corpus||testCorpus )
-	 {
-	   sentenceHandler *x=corpus;
-	   if(x==0)
-	     x=testCorpus;
-	   cout << "Text corpus exists.\n";
-	   x->rewind();
-	   while(x&&x->getNextSentence(sent)){
-	     Vector<WordIndex>& es = sent.eSent;
-	     Vector<WordIndex>& fs = sent.fSent;
-	     int l=es.size()-1;
-	     int m=fs.size()-1;
-	     transpair_model4 tm4(es,fs,m1.tTable,m2.aTable,m3.dTable,m3.nTable,1-p0,p0,&d4m);
-	     alignment al(l,m);
-	     cout << "I use the alignment " << sent.sentenceNo-1 << '\n';
-	     //convert(ReferenceAlignment[sent.sentenceNo-1],al);
-	     transpair_model3 tm3(es,fs,m1.tTable,m2.aTable,m3.dTable,m3.nTable,1-p0,p0,0);
-	     double p=tm3.prob_of_target_and_alignment_given_source(al,1);
-	     cout << "Sentence " << sent.sentenceNo << " has IBM-3 prob " << p << '\n';
-	     p=tm4.prob_of_target_and_alignment_given_source(al,3,1);
-	     cout << "Sentence " << sent.sentenceNo << " has IBM-4 prob " << p << '\n';
-	     //transpair_model5 tm5(es,fs,m1.tTable,m2.aTable,m3.dTable,m3.nTable,1-p0,p0,&d5m);
-	     //p=tm5.prob_of_target_and_alignment_given_source(al,3,1);
-	     //cout << "Sentence " << sent.sentenceNo << " has IBM-5 prob " << p << '\n';
-	   }
-	 }
-       else
-	 {
-	   cout << "No corpus exists.\n";
-	 }
-    }
-   else 
-     {
-       // initialize model1
-       bool seedModel1 = false ;
-       if(Model1_Iterations > 0){
-	 if (t_Filename != "NONE" && t_Filename != ""){
-	   seedModel1 = true ;
-	   m1.load_table(t_Filename.c_str());
-	 }
-	 minIter=m1.em_with_tricks(Model1_Iterations,seedModel1,*dictionary, useDict);
-	 errors=m1.errorsAL();
-       }
-       
-	 {
-	   if(Model2_Iterations > 0){
-	     m2.initialize_table_uniformly(*corpus);
-	     minIter=m2.em_with_tricks(Model2_Iterations);
-	     errors=m2.errorsAL();
-	   }
-	   if(HMM_Iterations > 0){
-	     cout << "NOTE: I am doing iterations with the HMM model!\n";
-	     h.makeWordClasses(m1.Elist,m1.Flist,SourceVocabFilename+".classes",TargetVocabFilename+".classes");
-	     h.initialize_table_uniformly(*corpus);
-	     minIter=h.em_with_tricks(HMM_Iterations);
-	     errors=h.errorsAL();
-	   }
-	   
-	   if(Transfer2to3||HMM_Iterations==0){
-	     if( HMM_Iterations>0 )
-	       cout << "WARNING: transfer is not needed, as results are overwritten by the transfer from the HMM.\n";
-	     string test_alignfile = Prefix +".tst.A2to3";
-	     if (testCorpus)
-	       m2.em_loop(testPerp, *testCorpus,Transfer_Dump_Freq==1&&!NODUMPS,test_alignfile.c_str(), testViterbiPerp, true);
-	     if (testCorpus)
-	       cout << "\nTransfer: TEST CROSS-ENTROPY " << testPerp.cross_entropy() << " PERPLEXITY " << testPerp.perplexity() << "\n\n";
-	     if (Transfer == TRANSFER_SIMPLE)
-	       m3.transferSimple(*corpus, Transfer_Dump_Freq==1&&!NODUMPS,trainPerp, trainViterbiPerp);
-	     else  
-	       m3.transfer(*corpus, Transfer_Dump_Freq==1&&!NODUMPS, trainPerp, trainViterbiPerp);
-	     errors=m3.errorsAL();
-	   }
-	   
-	   if( HMM_Iterations>0 )
-	     m3.setHMM(&h);
-	   if(Model3_Iterations > 0 || Model4_Iterations > 0 || Model5_Iterations > 0 || Model6_Iterations > 0)
-	     {
-	       minIter=m3.viterbi(Model3_Iterations,Model4_Iterations,Model5_Iterations,Model6_Iterations);
-	       errors=m3.errorsAL();
-	     }
-	   if (FEWDUMPS||!NODUMPS)
-	     {
-	       printAllTables(eTrainVcbList,eTestVcbList,fTrainVcbList,fTestVcbList,m1 );
-	     }
-	 }
-     }
-   result=minIter;
-   return errors;
-}
-
-int main(int argc, char* argv[])
-{
-#ifdef BINARY_SEARCH_FOR_TTABLE
-  getGlobalParSet().insert(new Parameter<string>("CoocurrenceFile",ParameterChangedFlag,"",CoocurrenceFile,PARLEV_SPECIAL));
-#endif
-  getGlobalParSet().insert(new Parameter<string>("ReadTablePrefix",ParameterChangedFlag,"optimized",ReadTablePrefix,-1));
-  getGlobalParSet().insert(new Parameter<string>("S",ParameterChangedFlag,"source vocabulary file name",SourceVocabFilename,PARLEV_INPUT));
-  getGlobalParSet().insert(new Parameter<string>("SOURCE VOCABULARY FILE",ParameterChangedFlag,"source vocabulary file name",SourceVocabFilename,-1));
-  getGlobalParSet().insert(new Parameter<string>("T",ParameterChangedFlag,"target vocabulary file name",TargetVocabFilename,PARLEV_INPUT));
-  getGlobalParSet().insert(new Parameter<string>("TARGET VOCABULARY FILE",ParameterChangedFlag,"target vocabulary file name",TargetVocabFilename,-1));
-  getGlobalParSet().insert(new Parameter<string>("C",ParameterChangedFlag,"training corpus file name",CorpusFilename,PARLEV_INPUT));
-  getGlobalParSet().insert(new Parameter<string>("CORPUS FILE",ParameterChangedFlag,"training corpus file name",CorpusFilename,-1));
-  getGlobalParSet().insert(new Parameter<string>("TC",ParameterChangedFlag,"test corpus file name",TestCorpusFilename,PARLEV_INPUT));
-  getGlobalParSet().insert(new Parameter<string>("TEST CORPUS FILE",ParameterChangedFlag,"test corpus file name",TestCorpusFilename,-1));
-  getGlobalParSet().insert(new Parameter<string>("d",ParameterChangedFlag,"dictionary file name",dictionary_Filename,PARLEV_INPUT));
-  getGlobalParSet().insert(new Parameter<string>("DICTIONARY",ParameterChangedFlag,"dictionary file name",dictionary_Filename,-1));
-  getGlobalParSet().insert(new Parameter<string>("l",ParameterChangedFlag,"log file name",LogFilename,PARLEV_OUTPUT));
-  getGlobalParSet().insert(new Parameter<string>("LOG FILE",ParameterChangedFlag,"log file name",LogFilename,-1));
-
-  getGlobalParSet().insert(new Parameter<string>("o",ParameterChangedFlag,"output file prefix",Prefix,PARLEV_OUTPUT));
-  getGlobalParSet().insert(new Parameter<string>("OUTPUT FILE PREFIX",ParameterChangedFlag,"output file prefix",Prefix,-1));
-  getGlobalParSet().insert(new Parameter<string>("OUTPUT PATH",ParameterChangedFlag,"output path",OPath,PARLEV_OUTPUT));
-
-  time_t st1, fn;
-  st1 = time(NULL);                    // starting time
-
-  string temp(argv[0]);
-  Usage = temp + " <config_file> [options]\n";
-  if(argc < 2)
-    {
-      printHelp();    
-      exit(1);
-    }
-  
-  initGlobals() ;
-  parseArguments(argc, argv);
-  
-  if (Log)
-    logmsg.open(LogFilename.c_str(), ios::out);
-  
-  printGIZAPars(cout);
-  int a=-1;
-  double errors=0.0;
-  if( OldADBACKOFF!=0 )
-    cerr << "WARNING: Parameter -adBackOff no longer exists; use CompactADTable instead.\n";
-  if( MAX_SENTENCE_LENGTH > MAX_SENTENCE_LENGTH_ALLOWED )
-    cerr << "ERROR: MAX_SENTENCE_LENGTH is too big " << MAX_SENTENCE_LENGTH << " > " << MAX_SENTENCE_LENGTH_ALLOWED << '\n';
-  errors=StartTraining(a);
-  fn = time(NULL);    // finish time
-  cout << '\n' << "Entire Training took: " << difftime(fn, st1) << " seconds\n";
-  cout << "Program Finished at: "<< ctime(&fn) << '\n';
-  cout << "==========================================================\n";
-  return 0;
-}
-
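
A note on the reference-alignment format consumed by ReadAlignment/readNextSent above: each sentence block starts with "SENT:" and a sentence number, followed by one link per line, where a link is a label ('S' for sure, 'P' for possible) and two token indices; when the same link appears with both labels, 'S' wins. A small hypothetical file, inferred from the parsing code rather than from any GIZA++ documentation:

    SENT: 0
    S 0 0
    S 1 2
    P 2 1
    SENT: 1
    S 0 1
    P 1 1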

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ef91969a/ext/giza-pp/GIZA++-v2/model1.cpp
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/model1.cpp b/ext/giza-pp/GIZA++-v2/model1.cpp
deleted file mode 100644
index b1b6d92..0000000
--- a/ext/giza-pp/GIZA++-v2/model1.cpp
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
-
-EGYPT Toolkit for Statistical Machine Translation
-Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful, 
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, 
-USA.
-
-*/
-#include "model1.h"
-#include "Globals.h"
-#include "utility.h"
-#include "Parameter.h"
-
-extern short NoEmptyWord;
-extern int VerboseSentence;
-
-GLOBAL_PARAMETER2(int,Model1_Dump_Freq,"MODEL 1 DUMP FREQUENCY","t1","dump frequency of Model 1",PARLEV_OUTPUT,0);
-int NumberOfVALIalignments=100;
-
-model1::model1(const char* efname, vcbList& evcblist, vcbList& fvcblist,tmodel<COUNT, PROB>&_tTable,Perplexity& _perp,
-	      sentenceHandler& _sHandler1,
-	      Perplexity* _testPerp,
-	      sentenceHandler* _testHandler,
-	      Perplexity& _trainViterbiPerp,
-	      Perplexity* _testViterbiPerp):
-  report_info(_perp,_sHandler1,_testPerp,_testHandler,_trainViterbiPerp,_testViterbiPerp),
-  efFilename(efname), Elist(evcblist), Flist(fvcblist), 
-  eTotalWCount(Elist.totalVocab()), fTotalWCount(Flist.totalVocab()), 
-  noEnglishWords(Elist.size()), noFrenchWords(Flist.size()), tTable(_tTable),
-  evlist(Elist.getVocabList()), fvlist(Flist.getVocabList())
-{}
-
-void model1::initialize_table_uniformly(sentenceHandler& sHandler1){
-  WordIndex i, j;
-
-  cout << "Initialize tTable\n";
-
-  sentPair sent ;
-  sHandler1.rewind();
-  while(sHandler1.getNextSentence(sent)){
-    Vector<WordIndex>& es = sent.eSent;
-    Vector<WordIndex>& fs = sent.fSent;
-    PROB uniform = 1.0/es.size() ;
-    for( i=0; i < es.size(); i++)
-      for(j=1; j < fs.size(); j++)
-	tTable.insert(es[i],fs[j],0,uniform);
-  }
-}
-
-
-int model1::em_with_tricks(int noIterations, /*Perplexity& perp, sentenceHandler& sHandler1, */
-			    bool seedModel1, Dictionary& dictionary, bool useDict /*Perplexity* testPerp, sentenceHandler* testHandler, 
-										     Perplexity& trainViterbiPerp, Perplexity* testViterbiPerp */ )
-{
-  double minErrors=1.0;int minIter=0;
-  string modelName="Model1",shortModelName="1";
-  time_t st, it_st, fn, it_fn;
-  string tfile, number, alignfile, test_alignfile;
-  int pair_no;
-  bool dump_files = false ;
-  st = time(NULL);
-  sHandler1.rewind();
-  cout << "==========================================================\n";
-  cout << modelName << " Training Started at: "<< ctime(&st) << "\n";  
-  for(int it = 1; it <= noIterations; it++){
-    pair_no = 0 ;
-    it_st = time(NULL);
-    cout <<  "-----------\n" << modelName << ": Iteration " << it << '\n';
-    dump_files = (Model1_Dump_Freq != 0) &&  ((it % Model1_Dump_Freq)  == 0) && !NODUMPS ;
-    number = "";
-    int n = it;
-    do{
-      number.insert((size_t)0, 1, (char)(n % 10 + '0'));
-    } while((n /= 10) > 0);
-    tfile = Prefix + ".t" + shortModelName + "." + number ;
-    alignfile = Prefix + ".A" + shortModelName + "." + number ;
-    test_alignfile = Prefix +".tst.A" + shortModelName + "." + number ;
-    initAL();
-    em_loop(it,perp, sHandler1, seedModel1, dump_files, alignfile.c_str(), dictionary, useDict, trainViterbiPerp); 
-    if (testPerp && testHandler) // calculate test perplexity
-      em_loop(it,*testPerp, *testHandler, seedModel1, dump_files, test_alignfile.c_str(), dictionary, useDict, *testViterbiPerp, true); 
-    if( errorsAL()<minErrors )
-      {
-	minErrors=errorsAL();
-        minIter=it;
-      }
-    if (dump_files){
-      if( OutputInAachenFormat==1 )
-	tTable.printCountTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),1);
-    }
-    tTable.normalizeTable(Elist, Flist);
-    cout << modelName << ": ("<<it<<") TRAIN CROSS-ENTROPY " << perp.cross_entropy()
-	 << " PERPLEXITY " << perp.perplexity() << '\n';
-    if (testPerp && testHandler)
-      cout << modelName << ": ("<<it<<") TEST CROSS-ENTROPY " << (*testPerp).cross_entropy()
-	   << " PERPLEXITY " << (*testPerp).perplexity() 
-	   << '\n';
-    cout << modelName << ": ("<<it<<") VITERBI TRAIN CROSS-ENTROPY " << trainViterbiPerp.cross_entropy()
-	 << " PERPLEXITY " << trainViterbiPerp.perplexity() << '\n';
-    if (testPerp && testHandler)
-      cout << modelName << ": ("<<it<<") VITERBI TEST CROSS-ENTROPY " << (*testViterbiPerp).cross_entropy()
-	   << " PERPLEXITY " << (*testViterbiPerp).perplexity() 
-	   << '\n';
-    if (dump_files){
-      if( OutputInAachenFormat==0 )
-	tTable.printProbTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),OutputInAachenFormat);
-    }
-    it_fn = time(NULL);
-    cout << "Model 1 Iteration: " << it<< " took: " << difftime(it_fn, it_st) << " seconds\n";
-  }
-  fn = time(NULL) ;
-  cout <<  "Entire " << modelName << " Training took: " << difftime(fn, st) << " seconds\n";
-  return minIter;
-}
-
-void model1::load_table(const char* tname){
-  /* This function loads the t table from the given file; use it
-     when you want to load results from previous t training
-     without doing any new training.
-     NAS, 7/11/99
-  */
-  cout << "Model1: loading t table \n" ;
-  tTable.readProbTable(tname);
-}
-
-  
-extern float MINCOUNTINCREASE;
-void model1::em_loop(int it,Perplexity& perp, sentenceHandler& sHandler1, bool seedModel1, 
-		     bool dump_alignment, const char* alignfile, Dictionary& dict, bool useDict, Perplexity& viterbi_perp, bool test)
-{
-  WordIndex i, j, l, m ;
-  double cross_entropy;
-  int pair_no=0 ;
-  perp.clear();
-  viterbi_perp.clear();
-  ofstream of2;
-  // for each sentence pair in the corpus
-  if (dump_alignment||FEWDUMPS)
-    of2.open(alignfile);
-  PROB uniform = 1.0/noFrenchWords ;
-  sentPair sent ;
-  sHandler1.rewind();
-  while(sHandler1.getNextSentence(sent)){
-    Vector<WordIndex>& es = sent.eSent;
-    Vector<WordIndex>& fs = sent.fSent;
-    const float so  = sent.getCount();
-    l = es.size() - 1;
-    m = fs.size() - 1;
-    cross_entropy = log(1.0);
-    Vector<WordIndex> viterbi_alignment(fs.size());
-    double viterbi_score = 1 ;
-
-    bool eindict[l + 1];
-    bool findict[m + 1];
-    bool indict[m + 1][l + 1];
-    if(it == 1 && useDict){
-      for(unsigned int dummy = 0; dummy <= l; dummy++) eindict[dummy] = false;
-      for(unsigned int dummy = 0; dummy <= m; dummy++){
-	findict[dummy] = false;
-	for(unsigned int dummy2 = 0; dummy2 <= l; dummy2++) 
-	  indict[dummy][dummy2] = false;
-      }
-      for(j = 0; j <= m; j++)
-	for(i = 0; i <= l; i++)
-	  if(dict.indict(fs[j], es[i])){
-	    eindict[i] = findict[j] = indict[j][i] = true;
-	  }
-    }
-
-    for(j=1; j <= m; j++){
-      // cache pointers to table entries that map fs[j] to all possible ei in this sentence
-      Vector<LpPair<COUNT,PROB> *> sPtrCache(es.size(),0);
-      LpPair<COUNT,PROB> **sPtrCachePtr;
-
-      PROB denom = 0.0;
-      WordIndex best_i = 0 ; // i for which fj is best mapped to ei
-      PROB word_best_score = 0 ;  // score for the best mapping of fj
-      if (it == 1 && !seedModel1){
-	denom = uniform  * es.size() ;
-	word_best_score = uniform ;
-      }
-      else 
-	for((i=0),(sPtrCachePtr=&sPtrCache[0]); i <= l; i++,sPtrCachePtr++){
-	  PROB e(0.0) ;
-	  (*sPtrCachePtr) = tTable.getPtr(es[i], fs[j]) ;
-	  if ((*sPtrCachePtr) != 0 && (*((*sPtrCachePtr))).prob > PROB_SMOOTH) 
-	    e = (*((*sPtrCachePtr))).prob;
-	  else e = PROB_SMOOTH ;
-	  denom += e  ;
-	  if (e > word_best_score){
-	    word_best_score = e ;
-	    best_i = i ;
-	  }
-	}
-      viterbi_alignment[j] = best_i ;
-      viterbi_score *= word_best_score ; /// denom ;
-      if (denom == 0){
-	if (test)
-	  cerr << "WARNING: denom is zero (TEST)\n";
-	else 
-	  cerr << "WARNING: denom is zero (TRAIN)\n";
-      }
-      cross_entropy += log(denom) ;
-      if (!test){
-	if(denom > 0){	  
-	  COUNT val = COUNT(so) / (COUNT) double(denom) ;
-	  /* this if block implements a constraint on counting:
-	     count(es[i], fs[j]) is incremented if and only if
-	     es[i] and fs[j] occur together in the dictionary,
-	     OR
-	     es[i] does not occur in the dictionary with any fs[x] and
-	     fs[j] does not occur in the dictionary with any es[y]
-	  */
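-	  /* hypothetical illustration: if the dictionary contains only the
-	     pair (house, maison), then count(house, maison) is updated
-	     (indict holds), count(house, voiture) is skipped (house is
-	     bound to other dictionary entries, so eindict blocks it), and
-	     count(zebra, gnou) is updated (neither word occurs in the
-	     dictionary, so !findict && !eindict holds) */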
-	  if(it == 1 && useDict){
-	    for((i=0),(sPtrCachePtr=&sPtrCache[0]); i <= l; i++,sPtrCachePtr++){
-	      if(indict[j][i] || (!findict[j] && !eindict[i])){
-		PROB e(0.0) ;
-		if (it == 1 && !seedModel1)
-		  e =  uniform  ;
-		else if ((*sPtrCachePtr) != 0 &&  (*((*sPtrCachePtr))).prob > PROB_SMOOTH) 
-		  e = (*((*sPtrCachePtr))).prob;
-		else e = PROB_SMOOTH ;
-		COUNT x=e*val;
-		if( it==1||x>MINCOUNTINCREASE )
-		  if ((*sPtrCachePtr) != 0)
-		    (*((*sPtrCachePtr))).count += x;
-		  else 	      
-		    tTable.incCount(es[i], fs[j], x);
-	      } /* end of if */
-	    } /* end of for i */
-	  } /* end of it == 1 */
-	  // Old code:
-	  else{
-	    for((i=0),(sPtrCachePtr=&sPtrCache[0]); i <= l; i++,sPtrCachePtr++){
-	      //for(i=0; i <= l; i++) {	    
-	      PROB e(0.0) ;
-	      if (it == 1 && !seedModel1)
-		e =  uniform  ;
-	      else if ((*sPtrCachePtr) != 0 &&  (*((*sPtrCachePtr))).prob > PROB_SMOOTH) 
-		e = (*((*sPtrCachePtr))).prob;
-	      else e = PROB_SMOOTH ;
-	      //if( !(i==0) )
-	      //cout << "COUNT(e): " << e << " " << MINCOUNTINCREASE << endl;
-	      COUNT x=e*val;
-	      if( pair_no==VerboseSentence )
-		cout << i << "(" << evlist[es[i]].word << ")," << j << "(" << fvlist[fs[j]].word << ")=" << x << endl;
-	      if( it==1||x>MINCOUNTINCREASE )
-		if( NoEmptyWord==0 || i!=0 )
-		  if ((*sPtrCachePtr) != 0) 
-		    (*((*sPtrCachePtr))).count += x;
-		  else 	      
-		    tTable.incCount(es[i], fs[j], x);
-	    } /* end of for i */
-	  } // end of else
-	} // end of if (denom > 0)
-      }// if (!test)
-    } // end of for (j) ;
-    sHandler1.setProbOfSentence(sent,cross_entropy);
-    //cerr << sent << "CE: " << cross_entropy << " " << so << endl;
-    perp.addFactor(cross_entropy-m*log(l+1.0), so, l, m,1);
-    viterbi_perp.addFactor(log(viterbi_score)-m*log(l+1.0), so, l, m,1);
-    if (dump_alignment||(FEWDUMPS&&sent.sentenceNo<1000))
-      printAlignToFile(es, fs, evlist, fvlist, of2, viterbi_alignment, sent.sentenceNo, viterbi_score);
-    addAL(viterbi_alignment,sent.sentenceNo,l);
-    pair_no++;
-  } /* of while */
-  sHandler1.rewind();
-  perp.record("Model1");
-  viterbi_perp.record("Model1");
-  errorReportAL(cout, "IBM-1");
-}
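
In isolation, the E-step that model1::em_loop above implements is: for each target position j, compute denom = sum_i max(t(e_i, f_j), PROB_SMOOTH), add the posterior t/denom (scaled by the sentence count) to each link's count, and hand the IBM-1 log-likelihood sum_j log(denom) - m*log(l+1) to perp.addFactor. A minimal self-contained sketch, with a plain std::map standing in for the tmodel/LpPair machinery (hypothetical names, not the GIZA++ API):

    #include <algorithm>
    #include <cmath>
    #include <map>
    #include <utility>
    #include <vector>

    const double PROB_SMOOTH = 1e-7;
    using Link = std::pair<int,int>;          // (source word e, target word f)

    // E-step for one sentence pair: 'prob' holds t(f|e), 'count' collects
    // fractional counts to be normalized afterwards (cf. normalizeTable).
    double estep(const std::vector<int>& es, const std::vector<int>& fs,
                 const std::map<Link,double>& prob,
                 std::map<Link,double>& count) {
      const int l = (int)es.size() - 1, m = (int)fs.size() - 1;
      double logLik = 0.0;
      for (int j = 1; j <= m; ++j) {          // position 0 is the NULL word
        double denom = 0.0;
        for (int e : es) {
          auto it = prob.find({e, fs[j]});
          denom += std::max(it == prob.end() ? 0.0 : it->second, PROB_SMOOTH);
        }
        logLik += std::log(denom);
        for (int e : es) {                    // posterior P(a_j = i | e, f)
          auto it = prob.find({e, fs[j]});
          double t = std::max(it == prob.end() ? 0.0 : it->second, PROB_SMOOTH);
          count[{e, fs[j]}] += t / denom;
        }
      }
      return logLik - m * std::log(l + 1.0);  // cross_entropy - m*log(l+1.0)
    }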

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ef91969a/ext/giza-pp/GIZA++-v2/model1.h
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/model1.h b/ext/giza-pp/GIZA++-v2/model1.h
deleted file mode 100644
index 7273049..0000000
--- a/ext/giza-pp/GIZA++-v2/model1.h
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
-
-EGYPT Toolkit for Statistical Machine Translation
-Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful, 
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, 
-USA.
-
-*/
-#ifndef _model1_h
-#define _model1_h 1
-
-#include <cassert>
- 
-#include <iostream>
-#include <sstream>
-#include <algorithm>
-#include <functional>
-#include <map>
-#include <set>
-#include <utility>
-
-#include <ctime>
-#include <fstream>
-#include <cmath>
-#include <cstdio>
-
-#include "Vector.h"
-#include "vocab.h"
-#include "TTables.h"
-#include "getSentence.h"
-#include "Perplexity.h"
-#include "vocab.h"
-#include "Dictionary.h"
-
-extern int NumberOfVALIalignments;
-
-class report_info{
- protected:
-  Perplexity& perp;
-  sentenceHandler& sHandler1;
-  Perplexity* testPerp;
-  sentenceHandler* testHandler;
-  Perplexity& trainViterbiPerp; 
-  Perplexity* testViterbiPerp;
-  report_info(Perplexity& _perp,
-	      sentenceHandler& _sHandler1,
-	      Perplexity* _testPerp,
-	      sentenceHandler* _testHandler,
-	      Perplexity& _trainViterbiPerp,
-	      Perplexity* _testViterbiPerp)
-    : perp(_perp),sHandler1(_sHandler1),testPerp(_testPerp),testHandler(_testHandler),trainViterbiPerp(_trainViterbiPerp),testViterbiPerp(_testViterbiPerp)
-    {}
-};
-
-class model1 : public report_info{
-public:
-  string efFilename;
-  vcbList&  Elist ;
-  vcbList&  Flist ;
-  double eTotalWCount ; // size of source corpus in number of words
-  double fTotalWCount ; // size of target corpus in number of words 
-  int noEnglishWords;
-  int noFrenchWords;
-  tmodel<COUNT, PROB>&tTable;
-  Vector<WordEntry>& evlist ;
-  Vector<WordEntry>& fvlist ;
-public:
-  int ALmissing,ALtoomuch,ALeventsMissing,ALeventsToomuch;
-  int ALmissingVALI,ALtoomuchVALI,ALeventsMissingVALI,ALeventsToomuchVALI;
-  int ALmissingTEST,ALtoomuchTEST,ALeventsMissingTEST,ALeventsToomuchTEST;
-  model1 (const char* efname, vcbList& evcblist, vcbList& fvcblist,tmodel<COUNT, PROB>&_tTable,Perplexity& _perp,
-	      sentenceHandler& _sHandler1,
-	      Perplexity* _testPerp,
-	      sentenceHandler* _testHandler,
-	      Perplexity& _trainViterbiPerp,
-	      Perplexity* _testViterbiPerp);
-  void initialize_table_uniformly(sentenceHandler& sHandler1);
-  int em_with_tricks(int noIterations, 
-		      bool seedModel1, Dictionary& dictionary, bool useDict);
-  void load_table(const char* tname);
-  void readVocabFile(const char* fname, Vector<WordEntry>& vlist, int& vsize, 
-		     int& total);
-  inline Vector<WordEntry>& getEnglishVocabList(void)const {return Elist.getVocabList();};
-  inline Vector<WordEntry>& getFrenchVocabList(void)const  {return Flist.getVocabList();};
-  inline double getETotalWCount(void) const {return eTotalWCount;};
-  inline double getFTotalWCount(void) const {return fTotalWCount;};
-  inline int getNoEnglishWords(void) const  {return noEnglishWords;};
-  inline int getNoFrenchWords(void)  const {return noFrenchWords;};
-  inline tmodel<COUNT, PROB>& getTTable(void) {return tTable;};
-  inline string& getEFFilename(void) {return efFilename;};
- private:
-  void em_loop(int it,Perplexity& perp, sentenceHandler& sHandler1, bool seedModel1, bool , const char*, Dictionary& dictionary, bool useDict, 
-	       Perplexity& viterbiperp, bool=false);
-  friend class model2;
-  friend class hmm;
- public:
-  void addAL(const Vector<WordIndex>& viterbi_alignment,int pair_no,int l)
-    {
-      if( pair_no<=int(ReferenceAlignment.size()) )
-	{
-	  //cerr << "AL: " << viterbi_alignment << " " << pair_no << endl;
-	  ErrorsInAlignment(ReferenceAlignment[pair_no-1],viterbi_alignment,l,ALmissing,ALtoomuch,ALeventsMissing,ALeventsToomuch,pair_no);
-	  if( pair_no<=NumberOfVALIalignments )
-	    ErrorsInAlignment(ReferenceAlignment[pair_no-1],viterbi_alignment,l,ALmissingVALI,ALtoomuchVALI,ALeventsMissingVALI,ALeventsToomuchVALI,pair_no);
-	  if( pair_no>NumberOfVALIalignments )
-	    ErrorsInAlignment(ReferenceAlignment[pair_no-1],viterbi_alignment,l,ALmissingTEST,ALtoomuchTEST,ALeventsMissingTEST,ALeventsToomuchTEST,pair_no);
-	}
-    }
-  void initAL()
-    {ALmissingVALI=ALtoomuchVALI=ALeventsMissingVALI=ALeventsToomuchVALI=ALmissingTEST=ALtoomuchTEST=ALeventsMissingTEST=ALeventsToomuchTEST=ALmissing=ALtoomuch=ALeventsMissing=ALeventsToomuch=0;}
-  double errorsAL()const
-    {
-      if( ALeventsMissingVALI+ALeventsToomuchVALI )
-	return (ALmissingVALI+ALtoomuchVALI)/double(ALeventsMissingVALI+ALeventsToomuchVALI);
-      else
-	return 0.0;
-    }
-  void errorReportAL(ostream&out,string m)const
-    {
-      if( ALeventsMissing+ALeventsToomuch )
-	out << "alignmentErrors (" << m << "): " 
-	    << 100.0*(ALmissing+ALtoomuch)/double(ALeventsMissing+ALeventsToomuch) 
-	    << " recall: " << 100.0*(1.0-ALmissing/double(ALeventsMissing))
-	    << " precision: " << 100.0*(1.0-ALtoomuch/double(ALeventsToomuch))
-	    << " (missing:" << ALmissing << "/" << ALeventsMissing << " " << ALtoomuch 
-	    << " " << ALeventsToomuch << ")\n";
-      if( ALeventsMissingVALI+ALeventsToomuchVALI )
-	out << "alignmentErrors VALI (" << m << "): " 
-	    << 100.0*(ALmissingVALI+ALtoomuchVALI)/double(ALeventsMissingVALI+ALeventsToomuchVALI) 
-	    << " recall: " << 100.0*(1.0-ALmissingVALI/double(ALeventsMissingVALI))
-	    << " precision: " << 100.0*(1.0-ALtoomuchVALI/double(ALeventsToomuchVALI))
-	    << " (missing:" << ALmissingVALI << "/" << ALeventsMissingVALI << " " << ALtoomuchVALI 
-	    << " " << ALeventsToomuchVALI << ")\n";
-      if( ALeventsMissingTEST+ALeventsToomuchTEST )
-	out << "alignmentErrors TEST(" << m << "): " 
-	    << 100.0*(ALmissingTEST+ALtoomuchTEST)/double(ALeventsMissingTEST+ALeventsToomuchTEST) 
-	    << " recall: " << 100.0*(1.0-ALmissingTEST/double(ALeventsMissingTEST)) 
-	    << " precision: " << 100.0*(1.0-ALtoomuchTEST/double(ALeventsToomuchTEST))
-	    << " (missing:" << ALmissingTEST << "/" << ALeventsMissingTEST << " " << ALtoomuchTEST 
-	    << " " << ALeventsToomuchTEST << ")\n";
-      
-    }
-};
-
-#endif
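
For reference, the three percentages printed per block by errorReportAL above reduce to (writing M = ALmissing, T = ALtoomuch, Em = ALeventsMissing, Et = ALeventsToomuch):

    error rate = 100 * (M + T) / (Em + Et)
    recall     = 100 * (1 - M / Em)
    precision  = 100 * (1 - T / Et)

Here 'missing' counts sure reference links that the Viterbi alignment failed to produce and 'toomuch' counts hypothesized links absent from the reference (see ErrorsInAlignment in main.cpp); the VALI and TEST variants restrict the tally to the first NumberOfVALIalignments sentence pairs and to the remainder, respectively.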

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ef91969a/ext/giza-pp/GIZA++-v2/model2.cpp
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/model2.cpp b/ext/giza-pp/GIZA++-v2/model2.cpp
deleted file mode 100644
index 945b91e..0000000
--- a/ext/giza-pp/GIZA++-v2/model2.cpp
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
-
-EGYPT Toolkit for Statistical Machine Translation
-Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful, 
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, 
-USA.
-
-*/
-#include "model2.h"
-#include "Globals.h"
-#include "utility.h"
-#include "Parameter.h"
-#include "defs.h"
-
-extern short NoEmptyWord;
-
-
-GLOBAL_PARAMETER2(int,Model2_Dump_Freq,"MODEL 2 DUMP FREQUENCY","t2","dump frequency of Model 2",PARLEV_OUTPUT,0);
-
-model2::model2(model1& m,amodel<PROB>&_aTable,amodel<COUNT>&_aCountTable): 
-  model1(m),aTable(_aTable),aCountTable(_aCountTable)
-{  }
-
-void model2::initialize_table_uniformly(sentenceHandler& sHandler1){
-  // initialize the aTable uniformly (run this before running em_with_tricks)
-  int n=0;
-  sentPair sent ;
-  sHandler1.rewind();
-   while(sHandler1.getNextSentence(sent)){
-    Vector<WordIndex>& es = sent.eSent;
-    Vector<WordIndex>& fs = sent.fSent;
-    WordIndex l = es.size() - 1;
-    WordIndex m = fs.size() - 1;
-    n++;
-    if(1<=m&&aTable.getValue(l,m,l,m)<=PROB_SMOOTH)
-      {
-	PROB uniform_val = 1.0 / (l+1) ;
-	for(WordIndex j=1; j <= m; j++)
-	  for(WordIndex i=0; i <= l; i++)
-	    aTable.setValue(i,j, l, m, uniform_val);
-      }
-  }
-}
-
-int model2::em_with_tricks(int noIterations)
-{
-  double minErrors=1.0;int minIter=0;
-  string modelName="Model2",shortModelName="2";
-  time_t it_st, st, it_fn, fn;
-  string tfile, afile, number, alignfile, test_alignfile;
-  int pair_no = 0;
-  bool dump_files = false ;
-  ofstream of2 ;
-  st = time(NULL) ;
-  sHandler1.rewind();
-  cout << "\n==========================================================\n";
-  cout << modelName << " Training Started at: " << ctime(&st) << " iter: " << noIterations << "\n";
-  for(int it=1; it <= noIterations ; it++){
-    pair_no = 0;
-    it_st = time(NULL) ;
-    cout << endl << "-----------\n" << modelName << ": Iteration " << it << '\n';
-    dump_files = (Model2_Dump_Freq != 0) && ((it % Model2_Dump_Freq) == 0) && !NODUMPS;
-    number = "";
-    int n = it;
-    do{
-      number.insert((size_t)0, 1, (char)(n % 10 + '0'));
-    } while((n /= 10) > 0);
-    tfile = Prefix + ".t" + shortModelName + "." + number ;
-    afile = Prefix + ".a" + shortModelName + "." + number ;
-    alignfile = Prefix + ".A" + shortModelName + "." + number ;
-    test_alignfile = Prefix + ".tst.A" + shortModelName + "." + number ;
-    aCountTable.clear();
-    initAL();
-    em_loop(perp, sHandler1, dump_files, alignfile.c_str(), trainViterbiPerp, false);
-    if( errorsAL()<minErrors )
-      {
-	minErrors=errorsAL();
-        minIter=it;
-      }
-    if (testPerp && testHandler)
-      em_loop(*testPerp, *testHandler, dump_files, test_alignfile.c_str(), *testViterbiPerp, true); 
-    if (dump_files&&OutputInAachenFormat==1)
-      tTable.printCountTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),1);
-    tTable.normalizeTable(Elist, Flist);
-    aCountTable.normalize(aTable);
-    cout << modelName << ": ("<<it<<") TRAIN CROSS-ENTROPY " << perp.cross_entropy()
-	 << " PERPLEXITY " << perp.perplexity() << '\n';
-     if (testPerp && testHandler)
-       cout << modelName << ": ("<<it<<") TEST CROSS-ENTROPY " << (*testPerp).cross_entropy()
-	    << " PERPLEXITY " << (*testPerp).perplexity() 
-	    << '\n';
-     cout << modelName << ": ("<<it<<") VITERBI TRAIN CROSS-ENTROPY " << trainViterbiPerp.cross_entropy()
-	 << " PERPLEXITY " << trainViterbiPerp.perplexity() << '\n';
-    if (testPerp && testHandler)
-       cout << modelName << ": ("<<it<<") VITERBI TEST CROSS-ENTROPY " << testViterbiPerp->cross_entropy()
-	    << " PERPLEXITY " << testViterbiPerp->perplexity() 
-	    << '\n';
-    if (dump_files)
-      {
-	if(OutputInAachenFormat==0)
-	  tTable.printProbTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),OutputInAachenFormat);
-	aCountTable.printTable(afile.c_str());
-      }
-    it_fn = time(NULL) ;
-    cout << modelName << " Iteration: " << it<< " took: " << difftime(it_fn, it_st) << " seconds\n";
-  } // end of iterations 
-  aCountTable.clear();
-  fn = time(NULL) ;
-  cout << endl << "Entire " << modelName << " Training took: " << difftime(fn, st) << " seconds\n";
-  //  cout << "tTable contains " << tTable.getHash().bucket_count() 
-  //     << " buckets and  " << tTable.getHash().size() << " entries." ;
-  cout << "==========================================================\n";
-  return minIter;
-}
-
-void model2::load_table(const char* aname){
-  /* This function loads the a table from the given file; use it
-     when you want to load results from previous a training without
-     doing any new training.
-     NAS, 7/11/99
-  */
-  cout << "Model2: loading a table \n";
-  aTable.readTable(aname);
-}
-
-
-void model2::em_loop(Perplexity& perp, sentenceHandler& sHandler1, 
-		     bool dump_alignment, const char* alignfile, Perplexity& viterbi_perp, 
-		     bool test)
-{
-  massert( aTable.is_distortion==0 );
-  massert( aCountTable.is_distortion==0 );
-  WordIndex i, j, l, m ;
-  double cross_entropy;
-  int pair_no=0 ;
-  perp.clear();
-  viterbi_perp.clear();
-  ofstream of2;
-  // for each sentence pair in the corpus
-  if (dump_alignment||FEWDUMPS )
-    of2.open(alignfile);
-  sentPair sent ;
-
-  vector<double> ferts(evlist.size());
-  
-  sHandler1.rewind();
-  while(sHandler1.getNextSentence(sent)){
-    Vector<WordIndex>& es = sent.eSent;
-    Vector<WordIndex>& fs = sent.fSent;
-    const float so  = sent.getCount();
-    l = es.size() - 1;
-    m = fs.size() - 1;
-    cross_entropy = log(1.0);
-    Vector<WordIndex> viterbi_alignment(fs.size());
-    double viterbi_score = 1;
-    for(j=1; j <= m; j++){
-      // cache pointers to table entries that map fs[j] to all possible ei in this sentence
-      Vector<LpPair<COUNT,PROB> *> sPtrCache(es.size(),0);
-      PROB denom = 0.0;
-      PROB e = 0.0, word_best_score = 0;
-      WordIndex best_i = 0 ; // i for which fj is best mapped to ei
-      for(i=0; i <= l; i++){
-	sPtrCache[i] = tTable.getPtr(es[i], fs[j]) ;
-	if (sPtrCache[i] != 0 &&(*(sPtrCache[i])).prob > PROB_SMOOTH ) 
-	  e = (*(sPtrCache[i])).prob * aTable.getValue(i,j, l, m) ;
-	else e = PROB_SMOOTH * aTable.getValue(i,j, l, m);
-	denom += e ;
-	if (e > word_best_score){
-	  word_best_score = e ;
-	  best_i = i ;
-	}
-      }
-      viterbi_alignment[j] = best_i ;
-      viterbi_score *= word_best_score; ///denom ;
-      cross_entropy += log(denom) ;
-      if (denom == 0){
-	if (test)
-	  cerr << "WARNING: denom is zero (TEST)\n";
-	else 
-	  cerr << "WARNING: denom is zero (TRAIN)\n";
-      }      
-      if (!test){
-	if(denom > 0){	  
-	  COUNT val = COUNT(so) / (COUNT) double(denom) ;
-	  for( i=0; i <= l; i++){
-	    PROB e(0.0);
-	    if (sPtrCache[i] != 0 &&  (*(sPtrCache[i])).prob > PROB_SMOOTH)
-	      e = (*(sPtrCache[i])).prob ;
-	    else e = PROB_SMOOTH  ;
-	    e *= aTable.getValue(i,j, l, m);
-	    COUNT temp = COUNT(e) * val ;
-	    if( NoEmptyWord==0 || i!=0 ){
-	      if (sPtrCache[i] != 0)
-		(*(sPtrCache[i])).count += temp ;
-	      else
-		tTable.incCount(es[i], fs[j], temp);
-	    }
-	    aCountTable.getRef(i,j, l, m)+= temp ; 
-	  } /* end of for i */
-	} // end of if (denom > 0)
-      }// if (!test)
-    } // end of for (j) ;
-    sHandler1.setProbOfSentence(sent,cross_entropy);
-    perp.addFactor(cross_entropy, so, l, m,1);
-    viterbi_perp.addFactor(log(viterbi_score), so, l, m,1);
-    if (dump_alignment||(FEWDUMPS&&sent.sentenceNo<1000) )
-      printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(), of2, viterbi_alignment, sent.sentenceNo, viterbi_score);
-    addAL(viterbi_alignment,sent.sentenceNo,l);
-    pair_no++;
-  } /* of while */
-  sHandler1.rewind();
-  perp.record("Model2");
-  viterbi_perp.record("Model2");
-  errorReportAL(cout,"IBM-2");
-}
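
For reference, the E-step implemented by em_loop above computes, for every
target position j, the Model 2 alignment posterior (reconstructed from the
code; the PROB_SMOOTH floor is left out for clarity):

    p(a_j = i \mid f_1^m, e_0^l) =
      \frac{ t(f_j \mid e_i) \, a(i \mid j, l, m) }
           { \sum_{i'=0}^{l} t(f_j \mid e_{i'}) \, a(i' \mid j, l, m) }

Each posterior, scaled by the sentence weight so, is added to the t-count and
a-count tables, and the log of each denominator is accumulated into
cross_entropy, so the per-sentence log-likelihood is
\sum_{j=1}^{m} \log \sum_{i=0}^{l} t(f_j \mid e_i) \, a(i \mid j, l, m).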
-
-
-
-
-

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ef91969a/ext/giza-pp/GIZA++-v2/model2.h
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/model2.h b/ext/giza-pp/GIZA++-v2/model2.h
deleted file mode 100644
index ada807e..0000000
--- a/ext/giza-pp/GIZA++-v2/model2.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
-
-EGYPT Toolkit for Statistical Machine Translation
-Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful, 
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, 
-USA.
-
-*/
-#ifndef _model2_h
-#define _model2_h 1
-
-#include <cassert>
- 
-#include <iostream>
-#include <algorithm>
-#include <functional>
-#include <map>
-#include <set>
-#include "Vector.h"
-#include <utility>
-
-#include <fstream>
-#include <cmath>
-#include <ctime>
-
-#include "TTables.h"
-#include "ATables.h" 
-#include "getSentence.h"
-#include "defs.h"
-#include "model1.h"
-#include "Perplexity.h"
-#include "vocab.h"
-
-class model2 : public model1
-{
- public:
-  amodel<PROB>& aTable;
-  amodel<COUNT>& aCountTable;
-  model2(model1& m1,amodel<PROB>&,amodel<COUNT>&);
-  void initialize_table_uniformly(sentenceHandler&);
-  int em_with_tricks(int);
-  void load_table(const char* aname);
-  inline amodel<PROB>& getATable(void) {return aTable;}
-  inline amodel<COUNT>& getACountTable(void) {return aCountTable;}
-  void em_loop(Perplexity& perp,sentenceHandler& sHandler1, bool dump_files,const char* alignfile, Perplexity&, bool test);
-  friend class model3;
-};
-
-#endif
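
A minimal sketch of how this class is driven (call sequence reconstructed from
main.cpp; the corpus handler setup is elided and the variable names here are
assumptions):

    amodel<PROB>  aTable(false);        // false: a-table, not a distortion table
    amodel<COUNT> aCountTable(false);
    model2 m2(m1, aTable, aCountTable); // shares m1's t-table via inheritance
    m2.initialize_table_uniformly(corpus);
    m2.em_with_tricks(Model2_Iterations);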

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ef91969a/ext/giza-pp/GIZA++-v2/model2to3.cpp
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/model2to3.cpp b/ext/giza-pp/GIZA++-v2/model2to3.cpp
deleted file mode 100644
index 22cbf50..0000000
--- a/ext/giza-pp/GIZA++-v2/model2to3.cpp
+++ /dev/null
@@ -1,398 +0,0 @@
-/*
-
-EGYPT Toolkit for Statistical Machine Translation
-Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful, 
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, 
-USA.
-
-*/
-#include "model3.h"
-#include "utility.h"
-#include "Globals.h"
-
-#define _MAX_FERTILITY 10
-
-double get_sum_of_partitions(int n, int source_pos, double alpha[_MAX_FERTILITY][MAX_SENTENCE_LENGTH_ALLOWED])
-{
-  int done, init ;
-  double sum = 0, prod ;
-  int s, w, u, v;
-  WordIndex k, k1, i ;
-  WordIndex num_parts = 0  ;
-  int total_partitions_considered = 0;
-
-  int part[_MAX_FERTILITY], mult[_MAX_FERTILITY];
-
-  done = false ;
-  init = true ;    
-  for (i = 0 ; i < _MAX_FERTILITY ; i++){
-    part[i] = mult[i] = 0 ;
-  }
-  
-  //printf("Entering get sum of partitions\n");
-  while(! done){
-    total_partitions_considered++;
-    if (init){
-      part[1] = n ;
-      mult[1] = 1 ;
-      num_parts = 1  ;  
-      init = false ;
-    }
-    else {
-      if ((part[num_parts] > 1) || (num_parts > 1)){
-	if (part[num_parts] == 1){
-	  s = part[num_parts-1] + mult[num_parts];
-	  k = num_parts - 1;
-	}
-	else {
-	  s = part[num_parts];
-	  k = num_parts ;
-	}
-	w = part[k] - 1 ;
-	u = s / w ;
-	v = s % w ;
-	mult[k] -= 1 ;
-	if (mult[k] == 0)
-	  k1 = k ;
-	else k1 = k + 1 ;
-	mult[k1] = u ;
-	part[k1] = w ;
-	if (v == 0){
-	  num_parts = k1 ;
-	}
-	else {
-	  mult[k1+1] = 1 ;
-	  part[k1+1] = v ;
-	  num_parts = k1 + 1;
-	}
-      } /* of if num_parts > 1 || part[num_parts] > 1 */    
-      else {
-	done = true ;
-      }
-    }
-    /* of else of if(init) */
-    if (!done){
-      prod = 1.0 ;
-      if (n != 0)
-	for (i = 1 ; i <= num_parts ; i++){
-	  prod *= pow(alpha[part[i]][source_pos], mult[i]) / factorial(mult[i]) ;
-	}
-      sum += prod ;
-    }
-  } /* of while */  
-  if (sum < 0) sum = 0 ;
-  return(sum) ;
-}
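
As a standalone illustration (not part of GIZA++), the following minimal
sketch isolates the partition enumeration used by get_sum_of_partitions: a
partition of n >= 1 is held as distinct part sizes part[k] with
multiplicities mult[k], and successive partitions are generated in reverse
lexicographic order. For n = 4 it prints 4, 3+1, 2+2, 2+1+1, 1+1+1+1:

    #include <cstdio>

    int main() {
      const int n = 4;                // illustrate the partitions of 4
      int part[n + 2] = {0}, mult[n + 2] = {0};
      int num_parts = 1;
      part[1] = n; mult[1] = 1;       // start from the one-part partition {n}
      for (;;) {
        for (int i = 1; i <= num_parts; i++)   // print the current partition
          for (int c = 0; c < mult[i]; c++)
            std::printf("%d ", part[i]);
        std::printf("\n");
        if (num_parts == 1 && part[1] == 1)    // reached 1+1+...+1: done
          break;
        int s, k;
        if (part[num_parts] == 1) {   // fold trailing 1s back into the remainder
          s = part[num_parts - 1] + mult[num_parts];
          k = num_parts - 1;
        } else {
          s = part[num_parts];
          k = num_parts;
        }
        int w = part[k] - 1;          // shrink the smallest part > 1 by one
        int u = s / w, v = s % w;     // and redistribute the remainder
        mult[k] -= 1;
        int k1 = (mult[k] == 0) ? k : k + 1;
        mult[k1] = u; part[k1] = w;
        if (v == 0) {
          num_parts = k1;
        } else {
          mult[k1 + 1] = 1; part[k1 + 1] = v;
          num_parts = k1 + 1;
        }
      }
      return 0;
    }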
-
-void model3::estimate_t_a_d(sentenceHandler& sHandler1, Perplexity& perp, Perplexity& trainVPerp, 
-			    bool simple, bool dump_files,bool updateT) 
-{
-  string tfile, nfile, dfile, p0file, afile, alignfile;
-  WordIndex i, j, l, m, max_fertility_here, k ;
-  PROB val, temp_mult[MAX_SENTENCE_LENGTH_ALLOWED][MAX_SENTENCE_LENGTH_ALLOWED];
-  double cross_entropy;
-  double beta, sum, 
-      alpha[_MAX_FERTILITY][MAX_SENTENCE_LENGTH_ALLOWED];
-  double total, temp, r ;
-
-  dCountTable.clear();
-  aCountTable.clear();
-  initAL();
-  nCountTable.clear() ;
-  if (simple)
-    nTable.clear();
-  perp.clear() ;
-  trainVPerp.clear() ;
-  ofstream of2;
-  if (dump_files){
-    alignfile = Prefix +".A2to3";
-    of2.open(alignfile.c_str());
-  }
-  if (simple) cerr <<"Using simple estimation for fertilities\n";
-  sHandler1.rewind() ;
-  sentPair sent ;
-  while(sHandler1.getNextSentence(sent)){
-    Vector<WordIndex>& es = sent.eSent;
-    Vector<WordIndex>& fs = sent.fSent;
-    const float count  = sent.getCount();
-    Vector<WordIndex> viterbi_alignment(fs.size());
-    l = es.size() - 1;
-    m = fs.size() - 1;
-    cross_entropy = log(1.0); 
-    double viterbi_score = 1 ;
-    PROB word_best_score ;  // score for the best mapping of fj
-    for(j = 1 ; j <= m ; j++){
-      word_best_score = 0 ;
-      Vector<LpPair<COUNT,PROB> *> sPtrCache(es.size(),0);
-      total = 0 ;
-      WordIndex best_i = 0 ;
-      for(i = 0; i <= l ; i++){
-	sPtrCache[i] = tTable.getPtr(es[i], fs[j]) ;
-	if (sPtrCache[i] != 0 &&  (*(sPtrCache[i])).prob > PROB_SMOOTH) // if valid pointer
-	  temp_mult[i][j]= (*(sPtrCache[i])).prob * aTable.getValue(i, j, l, m) ;
-	else 
-	  temp_mult[i][j] = PROB_SMOOTH *  aTable.getValue(i, j, l, m) ;
-	total += temp_mult[i][j] ;
-	if (temp_mult[i][j] > word_best_score){
-	    word_best_score = temp_mult[i][j] ;
-	    best_i = i ;
-	  }
-      } // end of for (i) 
-      viterbi_alignment[j] = best_i ;
-      viterbi_score *= word_best_score ; /// total ;
-      cross_entropy += log(total) ; 
-      if (total == 0){
-	  cerr << "WARNING: total is zero (TRAIN)\n";
-	  viterbi_score = 0 ;
-      }
-      if (total > 0){
-	for(i = 0; i <= l ; i++){
-	  temp_mult[i][j] /= total ;
-	  if (temp_mult[i][j] == 1) // smooth to prevent underflow
-	    temp_mult[i][j] = 0.99 ;
-	  else  if (temp_mult[i][j] == 0)
-	    temp_mult[i][j] = PROB_SMOOTH ;
-	  val = temp_mult[i][j] * PROB(count) ;
-	  if ( val > PROB_SMOOTH) {
-	    if( updateT )
-	      {
-		if (sPtrCache[i] != 0) 
-		  (*(sPtrCache[i])).count += val ;
-		else
-		  tTable.incCount(es[i], fs[j], val);
-	      }
-	    aCountTable.getRef(i, j, l, m)+=val;
-	    if (0 != i)
-	      dCountTable.getRef(j, i, l, m)+=val;
-	  }
-	} // end of for (i ...)
-      } // end of if (total > 0)
-    } // end of for (j  ...)
-    if (dump_files)
-      printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(), of2, viterbi_alignment, sent.sentenceNo, viterbi_score);
-    addAL(viterbi_alignment,sent.sentenceNo,l);
-    if (!simple){
-      max_fertility_here = min(WordIndex(m+1), MAX_FERTILITY);
-      for (i = 1; i <= l ; i++) { 
-	for ( k = 1; k < max_fertility_here; k++){
-	  beta = 0 ;
-	  alpha[k][i] = 0 ;
-	  for (j = 1 ; j <= m ; j++){
-	    temp =  temp_mult[i][j];	    	   
-	    if (temp > 0.95) temp = 0.95; // smooth to prevent under/over flow
-	    else if (temp < 0.05) temp = 0.05;
-	    beta += pow(temp/(1.0-temp), (double) k);
-	  }
-	  alpha[k][i] = beta * pow((double) -1, (double) (k+1)) / (double) k ;
-	}
-      }
-      for (i = 1; i <= l ; i++){ 
-	r = 1;
-	for (j = 1 ; j <= m ; j++)
-	  r *= (1 - temp_mult[i][j]); 
-	for (k = 0  ; k <  max_fertility_here ; k++){
-	  sum = get_sum_of_partitions(k, i, alpha);
-	  temp = r * sum * count;
-	  nCountTable.getRef(es[i], k)+=temp;	  
-	} // end of for (k ..)
-      } // end of for (i == ..)
-    } // end of  if (!simple)
-    perp.addFactor(cross_entropy, count, l, m,1);
-    trainVPerp.addFactor(log(viterbi_score), count, l, m,1);
-  } // end of while 
-  sHandler1.rewind();
-  cerr << "Normalizing t, a, d, n count tables now ... " ;
-  if( dump_files && OutputInAachenFormat==1 )
-    {
-      tfile = Prefix + ".t2to3" ;
-      tTable.printCountTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),1);
-    }
-  if( updateT )
-    tTable.normalizeTable(Elist, Flist);
-  aCountTable.normalize(aTable);
-  dCountTable.normalize(dTable);
-  if (!simple)
-    nCountTable.normalize(nTable,&Elist.getVocabList());
-  else {
-    for (i = 0 ; i< Elist.uniqTokens() ; i++){
-      if (0 < MAX_FERTILITY){
-	nTable.getRef(i,0)=PROB(0.2);
-	if (1 < MAX_FERTILITY){
-	  nTable.getRef(i,1)=PROB(0.65);
-	  if (2 < MAX_FERTILITY){
-	    nTable.getRef(i,2)=PROB(0.1);
-	    if (3 < MAX_FERTILITY)
-	      nTable.getRef(i,3)=PROB(0.04);
-	    PROB val = 0.01/(MAX_FERTILITY-4);
-	    for (k = 4 ; k < MAX_FERTILITY ; k++)
-	      nTable.getRef(i, k)=val;
-	  }
-	}
-      }
-    }
-  } // end of else (!simple) 
-  p0 = 0.95;
-  p1 = 0.05;
-  if (dump_files){
-    tfile = Prefix + ".t2to3" ;
-    afile = Prefix + ".a2to3" ;
-    nfile = Prefix + ".n2to3" ;
-    dfile = Prefix + ".d2to3" ;
-    p0file = Prefix + ".p0_2to3" ;
-    
-    if( OutputInAachenFormat==0 )
-      tTable.printProbTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),OutputInAachenFormat);
-    aTable.printTable(afile.c_str());
-    dTable.printTable(dfile.c_str());
-    nCountTable.printNTable(Elist.uniqTokens(), nfile.c_str(), Elist.getVocabList(),OutputInAachenFormat);
-    ofstream of(p0file.c_str());
-    of << p0;
-    of.close();
-  }
-  errorReportAL(cerr,"IBM-2");
-  if(simple)
-    {
-      perp.record("T2To3");
-      trainVPerp.record("T2To3");
-    }
-  else
-    {
-      perp.record("ST2To3");
-      trainVPerp.record("ST2To3");
-    }
-}
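
For reference, the fertility counts accumulated in the !simple branch above
can be written out as follows (reconstructed from the code; p_ij denotes the
Model 2 posterior temp_mult[i][j], clamped to [0.05, 0.95]):

    \beta_{i,k} = \sum_{j=1}^{m} \Bigl( \frac{p_{ij}}{1 - p_{ij}} \Bigr)^{k},
    \qquad
    \alpha_{i,k} = \frac{(-1)^{k+1}}{k} \, \beta_{i,k}

    c(\phi = k \mid e_i) \mathrel{+}= \mathrm{count} \cdot
      \Bigl( \prod_{j=1}^{m} (1 - p_{ij}) \Bigr)
      \sum_{\nu \vdash k} \prod_{s} \frac{\alpha_{i,\nu_s}^{m_s}}{m_s!}

where the last sum runs over the partitions \nu of k into distinct part sizes
\nu_s with multiplicities m_s, which is exactly the quantity that
get_sum_of_partitions(k, i, alpha) returns.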
-
-void model3::transferSimple(/*model1& m1, model2& m2, */ sentenceHandler& sHandler1, 
-			    bool dump_files, Perplexity& perp, Perplexity& trainVPerp,bool updateT)
-{
-  /* 
-     This function performs simple Model 2 -> Model 3 transfer.
-     It sets values for n and p without considering Model 2's ideas.
-     It sets d values based on a.
-  */
-  time_t st, fn;
-  // just inherit these from the previous models, to avoid data duplication
- 
-  st = time(NULL);
-  cerr << "==========================================================\n";
-  cerr << "\nTransfer started at: "<< ctime(&st) << '\n';  
-  
-  cerr << "Simple tranfer of Model2 --> Model3 (i.e. estimating initial parameters of Model3 from Model2 tables)\n";
-  
-  estimate_t_a_d(sHandler1, perp, trainVPerp, true, dump_files,updateT) ;
-  fn = time(NULL) ;
-  cerr << "\nTransfer: TRAIN CROSS-ENTROPY " << perp.cross_entropy()
-       << " PERPLEXITY " << perp.perplexity() << '\n';
-  cerr << "\nTransfer took: " << difftime(fn, st) << " seconds\n";
-  cerr << "\nTransfer Finished at: "<< ctime(&fn) << '\n';  
-  cerr << "==========================================================\n";
-  
-}
-
-
-void model3::transfer(sentenceHandler& sHandler1,bool dump_files, Perplexity& perp, Perplexity& trainVPerp,bool updateT)
-{
-  if (Transfer == TRANSFER_SIMPLE)
-    transferSimple(sHandler1,dump_files,perp, trainVPerp,updateT);
-  else
-  {
-    time_t st, fn ;
-    
-    st = time(NULL);
-    cerr << "==========================================================\n";
-    cerr << "\nTransfer started at: "<< ctime(&st) << '\n';  
-    cerr << "Transfering Model2 --> Model3 (i.e. estimating initial parameters of Model3 from Model2 tables)\n";
-    
-    p1_count = p0_count = 0 ;
-    
-    estimate_t_a_d(sHandler1, perp, trainVPerp, false, dump_files,updateT);
-    
-    
-    
-    /* Below is a made-up stab at transferring t & a probs to p0/p1.
-       (Method not documented in IBM paper).
-       It seems to give p0 = .96, which may be right for Model 2, or may not.
-       I'm commenting it out for now and hardwiring p0 = .95 as above. -Kevin
-       
-       // compute p0, p1 counts
-       Vector<LogProb> nm(Elist.uniqTokens(),0.0);
-       
-       for(i=0; i < Elist.uniqTokens(); i++){
-       for(k=1; k < MAX_FERTILITY; k++){
-       nm[i] += nTable.getValue(i, k) * (LogProb) k;
-       }
-       }
-       
-       LogProb mprime;
-       //  sentenceHandler sHandler1(efFilename.c_str());
-       //  sentPair sent ;
-       
-       while(sHandler1.getNextSentence(sent)){
-       Vector<WordIndex>& es = sent.eSent;
-       Vector<WordIndex>& fs = sent.fSent;
-       const float count  = sent.noOccurrences;
-       
-       l = es.size() - 1;
-       m = fs.size() - 1;
-       mprime = 0 ;
-       for (i = 1; i <= l ; i++){
-       mprime +=  nm[es[i]] ;
-       }
-       mprime = LogProb((int((double) mprime + 0.5)));  // round mprime to nearest integer 
-       if ((mprime < m) && (2 * mprime >= m)) {
-       //      cerr << "updating both p0_count and p1_count, mprime: " << mprime << 
-       //	"m = " << m << "\n";
-       p1_count +=  (m - (double) mprime)  * count ;
-       p0_count +=  (2 * (double) mprime - m)  * count ;
-       //  cerr << "p0_count = "<<p0_count << " , p1_count = " << p1_count << endl ;
-       }
-       else {
-       //      p1_count += 0 ;
-       //      cerr << "updating only p0_count, mprime: " << mprime << 
-       //	"m = " << m << "\n";
-       p0_count +=  double(m  * count) ;
-       //  cerr << "p0_count = "<<p0_count << " , p1_count = " << p1_count << endl ;
-       }
-       }
-       
-       // normalize p1, p0 
-       
-       cerr << "p0_count = "<<p0_count << " , p1_count = " << p1_count << endl ;
-       p1 = p1_count / (p1_count + p0_count ) ;
-       p0 = 1 - p1;
-       cerr << "p0 = "<<p0 << " , p1 = " << p1 << endl ;
-       // Smooth p0 probability to avoid getting zero probability.
-       if (0 == p0){
-       p0 = (LogProb) SMOOTH_THRESHOLD ;
-       p1 = p1 - (LogProb) SMOOTH_THRESHOLD ;
-       }
-       if (0 == p1){
-       p1 = (LogProb) SMOOTH_THRESHOLD ;
-       p0 = p0 - (LogProb) SMOOTH_THRESHOLD ;
-       }
-    */
-    
-    fn = time(NULL) ;
-    cerr << "\nTransfer: TRAIN CROSS-ENTROPY " << perp.cross_entropy()
-	 << " PERPLEXITY " << perp.perplexity() << '\n';
-    //    cerr << "tTable contains " << tTable.getHash().bucket_count() 
-    //	 << " buckets and  " << tTable.getHash().size() << " entries." ;
-    cerr << "\nTransfer took: " << difftime(fn, st) << " seconds\n";
-    cerr << "\nTransfer Finished at: "<< ctime(&fn) << endl;  
-    cerr << "==========================================================\n";
-    
-  }
-
-}