You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by le...@apache.org on 2016/05/26 04:22:24 UTC
[07/14] incubator-joshua git commit: JOSHUA-252 Make it possible to
use Maven to build Joshua
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9475d943/ext/giza-pp/GIZA++-v2/parse.cpp
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/parse.cpp b/ext/giza-pp/GIZA++-v2/parse.cpp
deleted file mode 100644
index ebb136e..0000000
--- a/ext/giza-pp/GIZA++-v2/parse.cpp
+++ /dev/null
@@ -1,151 +0,0 @@
-/*
-
-EGYPT Toolkit for Statistical Machine Translation
-Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-
-/* FJO 01/2001: completely reorganized parameter processing */
-
-#include <sstream>
-#include <string>
-#include <fstream>
-#include "defs.h"
-#include "utility.h"
-#include "Globals.h"
-#include "D4Tables.h"
-#include "D5Tables.h"
-#include "ATables.h"
-#include "Parameter.h"
-
-extern bool ONLYALDUMPS;
-
-void parseConfigFile (char * fname )
- // Reads the configuration file `fname` and applies every setting it
- // contains. The settings are global variables that are defined in
- // main.cc and used all over the place in the program.
- // The function exits with status 1 if the file cannot be opened.
- // FORMAT:
- // The character '\n' separates lines.
- // A line whose first whitespace-delimited token is exactly "//" is a
- // comment and is ignored. (The test below compares whole tokens, so
- // "//text" without a following space is NOT treated as a comment.)
- // Any other line is considered an attribute setting instruction and it
- // is divided into halves by a stand-alone colon token ":". The first half
- // is the attribute name, which consists of all non-white space tokens
- // before the colon, joined by single spaces.
- // The attribute value is the first token after the colon (anything after
- // it is ignored).
- // For example:
- // if the configuration file has the following entry:
- //
- // NO. ITERATIONS MODEL 2 : 10
- //
- // then the attribute is "NO. ITERATIONS MODEL 2", and the attribute value
- // is "10" (these do not include the quotation marks).
-
-{
-
- string line, word, attrib, attribval ;
- ifstream Config_File(fname);
- if(!Config_File){
- cerr << "ERROR: Cannot open configuration file " << fname << "!\n" ;
- exit(1);
- }
-
- cout << "The following options are from the config file and will be overwritten by any command line options.\n";
-
- while(getline(Config_File, line)){
-
- istringstream buffer(line);
- word = attrib = attribval = "" ;
- buffer >> word ;
- if (word != "//"){ // the first token is not "//", i.e. the line is not a comment
- attrib = word ;
- // Collect every token up to (not including) the stand-alone ":" token.
- while((buffer >> word) && (word != ":")){
- attrib += " " + word ;
- }
- // No value could be read after the colon (for instance because the line
- // has no stand-alone ":" at all): re-read the line as "<name> <value>",
- // taking its first two tokens.
- if(!(buffer >> attribval))
- {
- istringstream buffer2(line);
- buffer2>>attrib;
- buffer2>>attribval;
- }
-
- // Dispatch on the attribute name: the five table file names are stored
- // directly; every other attribute is handed to makeSetCommand().
-
- if(attrib == "t FILE"){
- t_Filename = attribval;
- cout << "\tt file: " << t_Filename << '\n';
- }
- else if(attrib == "a FILE"){
- a_Filename = attribval;
- cout << "\ta file: " << a_Filename << '\n';
- }
- else if(attrib == "d FILE"){
- d_Filename = attribval;
- cout << "\td file: " << d_Filename << '\n';
- }
- else if(attrib == "n FILE"){
- n_Filename = attribval;
- cout << "\tn file: " << n_Filename << '\n';
- }
- else if(attrib == "p0 FILE"){
- p0_Filename = attribval;
- cout << "\tp0 file: " << p0_Filename << '\n';
- }
- else if ( line == ""){} // completely empty line: nothing to do
- else if( !makeSetCommand(attrib,attribval,getGlobalParSet(),2) )
- cerr << "ERROR: Unrecognized attribute :" << attrib << '\n';
- }
- }
-}
-
-
-// Parses the command line and applies the settings it contains.
-//
-//   argv[1]  either "--h"/"--help" (print help and exit), an option that
-//            starts with '-', or the name of a configuration file which is
-//            read with parseConfigFile() before the remaining options.
-//   --name   switch; handed to makeSetCommand() as "-name" with value "1".
-//   name val option with a value; handed to makeSetCommand() as (name, val).
-//
-// After all options are processed, OPath (if non-empty) gets a trailing '/'
-// and is prepended to Prefix and LogFilename, and the resulting parameter
-// set is printed to cout.
-void parseArguments(int argc, char *argv[])
-{
-  // Without any argument there is nothing to parse; argv[1] would be the
-  // terminating null pointer and must not be passed to strcmp().
-  if( argc<2 ){
-    printHelp();
-    exit(1);
-  }
-
-  int arg = 1;
-
-  if(!strcmp(argv[1], "--h") || !strcmp(argv[1], "--help")){
-    printHelp();
-    exit(0);
-  }
-  // The loop below pre-increments arg. Starting from 0 makes it look at
-  // argv[1] again (it is an option); starting from 1 skips argv[1] (it was
-  // the configuration file).
-  if( argv[1][0]=='-' )
-    arg=0;
-  else
-    parseConfigFile(argv[1]);
-  while(++arg<argc){
-    if( strlen(argv[arg])>2 && argv[arg][0]=='-' && argv[arg][1]=='-' )
-      {
-        // "--name": drop the first dash and set the switch to "1".
-        if( !makeSetCommand(argv[arg]+1,"1",getGlobalParSet(),2))
-          cerr << "WARNING: ignoring unrecognized option: "<< argv[arg] << '\n' ;
-      }
-    else if( arg+1<argc && !makeSetCommand(argv[arg],argv[arg+1],getGlobalParSet(),2))
-      cerr << "WARNING: ignoring unrecognized option: "<< argv[arg] << '\n' ;
-    else
-      {
-        // The option was accepted (or it was the last argument): step over
-        // its value.
-        arg++;
-      }
-  }
-  if( OPath.length() )
-    OPath+="/";
-  Prefix = (OPath + Prefix);
-  LogFilename = (OPath + LogFilename);
-  printGIZAPars(cout);
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9475d943/ext/giza-pp/GIZA++-v2/plain2snt.cpp
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/plain2snt.cpp b/ext/giza-pp/GIZA++-v2/plain2snt.cpp
deleted file mode 100644
index 66ae677..0000000
--- a/ext/giza-pp/GIZA++-v2/plain2snt.cpp
+++ /dev/null
@@ -1,116 +0,0 @@
-#include <iostream>
-#include <string>
-#include <sstream>
-#include <fstream>
-#include <map>
-#include <vector>
-#include <cstdlib>
-
-using namespace std;
-
-// plain2snt: converts pairs of plain text files into GIZA++ snt-format.
-//
-// Usage: plain2snt txt1 txt2 [txt3 txt4 -weight w]
-//
-// File names are taken in pairs (source, target). The k-th "-weight w"
-// option gives the weight written for every sentence pair of the k-th file
-// pair; pairs without a weight use 1.0. Output file names are derived from
-// the first file pair: <w1>.vcb, <w2>.vcb, <w1>_<w2base>.snt and
-// <w2>_<w1base>.snt.
-int main(int argc,char**argv)
-{
-  vector<double>weights;
-  vector<string>filenames;
-  for(int i=1;i<argc;++i)
-    if(string(argv[i])=="-weight")
-      {
-        // "-weight" needs a value. Without this check a trailing "-weight"
-        // would hand argv[argc] (a null pointer) to atof().
-        if( i+1>=argc )
-          {
-            cerr << "ERROR: option -weight requires a value.\n";
-            exit(1);
-          }
-        weights.push_back(atof(argv[++i]));
-      }
-    else
-      filenames.push_back(argv[i]);
-
-  if((filenames.size()%2)==1||filenames.size()==0 )
-    {
-      cerr << argv[0] << " txt1 txt2 [txt3 txt4 -weight w]\n";
-      cerr << " Converts plain text into GIZA++ snt-format.\n";
-      exit(1);
-    }
-  string line1,line2,word;
-  map<string,int> v1,v2;       // word -> number of occurrences
-  map<string,int> id1,id2;     // word -> numeric id
-  // id -> word; two empty slots up front so that the first real word
-  // receives id 2.
-  vector<string> iid1(2),iid2(2);
-
-  string w1(filenames[0]);
-  string w2(filenames[1]);
-
-  // Strip a common ".tok" or ".txt" extension from the first file pair.
-  if( w1.length()>4&&w2.length()>4&&((w1.substr(w1.length()-4,w1.length())==".tok" && w2.substr(w2.length()-4,w2.length())==".tok" )||
-      (w1.substr(w1.length()-4,w1.length())==".txt" && w2.substr(w2.length()-4,w2.length())==".txt" ) ))
-    {
-      w1=w1.substr(0,w1.length()-4);
-      w2=w2.substr(0,w2.length()-4);
-      cerr << "w1:"<< w1 << " w2:" << w2 << endl;
-    }
-
-
-  string vocab1(w1),vocab2(w2),snt1,snt2;
-  // Base name = part after the last '/'. When there is no '/', rfind()
-  // returns npos and npos+1 wraps around to 0, i.e. the whole string.
-  unsigned int slashpos=vocab1.rfind('/')+1;
-  if( slashpos>=vocab1.length() ) slashpos=0;
-  string vocab1x(vocab1.substr(slashpos,vocab1.length()));
-  cout << vocab1 << " -> " << vocab1x << endl;
-  slashpos=vocab2.rfind('/')+1;
-  if( slashpos>=vocab2.length() ) slashpos=0;
-  string vocab2x(vocab2.substr(slashpos,vocab2.length()));
-  cout << vocab2 << " -> " << vocab2x << endl;
-  snt1=vocab1+"_"+vocab2x+string(".snt");
-  snt2=vocab2+"_"+vocab1x+string(".snt");
-  vocab1+=string(".vcb");
-  vocab2+=string(".vcb");
-
-  ofstream ovocab1(vocab1.c_str()),ovocab2(vocab2.c_str()),osnt1(snt1.c_str()),osnt2(snt2.c_str());
-  for(unsigned int i=0;i<filenames.size();i+=2)
-    {
-      ifstream i1(filenames[i].c_str()),i2(filenames[i+1].c_str());
-      if(!i1)cerr << "WARNING: " << filenames[i] << " cannot be read.\n";
-      if(!i2)cerr << "WARNING: " << filenames[i+1] << " cannot be read.\n";
-      while(getline(i1,line1) && getline(i2,line2) )
-        {
-          // Tokenize both sentences, counting words and assigning ids to
-          // words seen for the first time.
-          vector<string> t1,t2;
-          istringstream ii1(line1);
-          while(ii1>>word)
-            {
-              t1.push_back(word);
-              v1[word]++;
-              if( id1.find(word)==id1.end() )
-                {
-                  iid1.push_back(word);
-                  id1[word]=iid1.size()-1;
-                }
-            }
-          istringstream ii2(line2);
-          while(ii2>>word)
-            {
-              t2.push_back(word);
-              v2[word]++;
-              if( id2.find(word)==id2.end() )
-                {
-                  iid2.push_back(word);
-                  id2[word]=iid2.size()-1;
-                }
-            }
-          double w=1.0;
-          if( i/2<weights.size() )
-            w=weights[i/2];
-          if( t1.size()&&t2.size() )
-            {
-              // Three lines per sentence pair: weight, first sentence,
-              // second sentence. The second snt file has the two
-              // sentences swapped.
-              osnt1 << w << "\n";
-              for(unsigned int j=0;j<t1.size();++j)osnt1 << id1[t1[j]] << ' ';
-              osnt1 << '\n';
-              for(unsigned int j=0;j<t2.size();++j)osnt1 << id2[t2[j]] << ' ';
-              osnt1 << '\n';
-
-              osnt2 << w << "\n";
-              for(unsigned int j=0;j<t2.size();++j)osnt2 << id2[t2[j]] << ' ';
-              osnt2 << '\n';
-              for(unsigned int j=0;j<t1.size();++j)osnt2 << id1[t1[j]] << ' ';
-              osnt2 << '\n';
-            }
-          else
-            cerr << "WARNING: filtered out empty sentence (source: " << filenames[i] << " " << t1.size() <<
-              " target: " << filenames[i+1] << " " << t2.size() << ").\n";
-        }
-    }
-
-  // Vocabulary files: "<id> <word> <count>", one word per line.
-  for(unsigned int i=2;i<iid1.size();++i)
-    ovocab1 << i << ' ' << iid1[i] << ' ' << v1[iid1[i]] << '\n';
-  for(unsigned int i=2;i<iid2.size();++i)
-    ovocab2 << i << ' ' << iid2[i] << ' ' << v2[iid2[i]] << '\n';
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9475d943/ext/giza-pp/GIZA++-v2/reports.cpp
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/reports.cpp b/ext/giza-pp/GIZA++-v2/reports.cpp
deleted file mode 100644
index 621e21a..0000000
--- a/ext/giza-pp/GIZA++-v2/reports.cpp
+++ /dev/null
@@ -1,211 +0,0 @@
-/*
-
-EGYPT Toolkit for Statistical Machine Translation
-Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-#include <sstream>
-#include <time.h>
-#include <set>
-#include "defs.h"
-#include "vocab.h"
-#include "Perplexity.h"
-#include "getSentence.h"
-#include "TTables.h"
-#include "Globals.h"
-#include "Parameter.h"
-
-// Writes the usage banner and the list of command line switches to cerr,
-// then prints the current parameter set to cout via printGIZAPars().
-void printHelp(void)
-{
-  cerr << "Usage:\n\n" << Usage << '\n'
-       << "Options (these override parameters set in the config file):\n\n"
-       << "\t--v \t\t print verbose message, Warning this is not very descriptive and not systematic.\n"
-       << "\t--NODUMPS \t Do not write any files to disk (This will over write dump frequency options).\n"
-       << "\t--h[elp]\t\tprint this help\n"
-       << "\t--p\t\tUse pegging when generating alignments for Model3 training. (Default NO PEGGING)\n"
-       << "\t--st\t\tto use a fixed ditribution for the fertility parameters when tranfering from model 2 to model 3 (Default complicated estimation)\n";
-  printGIZAPars(cout);
-}
-
-
-// Writes a tab-separated perplexity table to `of`, one row per iteration.
-//
-// Columns: training corpus size, test corpus size, iteration index, model
-// id (taken from trainperp), training perplexity, test perplexity, training
-// viterbi perplexity, test viterbi perplexity. The number of rows is the
-// largest size among the four Perplexity objects; a value that an object
-// does not have for a given row is printed as "N/A".
-// The trailing unnamed bool parameter is unused.
-void generatePerplexityReport(const Perplexity& trainperp,
-                              const Perplexity& testperp,
-                              const Perplexity& trainVperp,
-                              const Perplexity& testVperp,
-                              ostream& of, int trainsize, int testsize,
-                              bool)
-{
-  unsigned int i, m;
-  unsigned int m1 = max(trainperp.size(), testperp.size());
-  unsigned int m2 = max(trainVperp.size(), testVperp.size());
-  m = max(m1,m2);
-  of << "#trnsz\ttstsz\titer\tmodel\ttrn-pp\t\ttest-pp\t\ttrn-vit-pp\t\ttst-vit-pp\n";
-  for (i = 0 ; i < m ; i++){
-    of << trainsize << '\t' << testsize << '\t' << i << '\t';
-    // m may be larger than the number of entries in trainperp, so the model
-    // id needs the same bounds check as the perplexity values below.
-    if (i < trainperp.modelid.size())
-      of << trainperp.modelid[i] << '\t';
-    else
-      of << "N/A\t";
-    if (i < trainperp.perp.size())
-      of << trainperp.perp[i] << "\t\t" ;
-    else
-      of << "N/A\t\t";
-    if (i < testperp.perp.size())
-      of << testperp.perp[i] << "\t\t" ;
-    else
-      of << "N/A\t\t";
-    if (i < trainVperp.perp.size())
-      of << trainVperp.perp[i] << "\t\t" ;
-    else
-      of << "N/A\t\t";   // two tabs, like the value branch, to keep the last column aligned
-    if (i < testVperp.perp.size())
-      of << testVperp.perp[i] << '\n' ;
-    else
-      of << "N/A\n";
-  }
-}
-
-// Writes a sentence pair to the given output stream: a header line with
-// both lengths, then the source token ids on one line and the target token
-// ids on the next. Token ids are written, not the actual tokens. Position 0
-// of each sentence is not printed and is not counted in the lengths.
-void printSentencePair(Vector<WordIndex>& es,
-                       Vector<WordIndex>& fs,
-                       ostream& of)
-{
-  const WordIndex srcLen = es.size() - 1;
-  const WordIndex trgLen = fs.size() - 1;
-
-  of << "Source sentence length : " << srcLen << " , target : " << trgLen << "\n";
-
-  for (WordIndex pos = 1 ; pos <= srcLen ; ++pos)
-    {
-      of << es[pos] << ' ';
-    }
-  of << "\n";
-
-  for (WordIndex pos = 1 ; pos <= trgLen ; ++pos)
-    {
-      of << fs[pos] << ' ';
-    }
-  of << "\n";
-}
-
-extern short CompactAlignmentFormat;
-void printAlignToFile(const Vector<WordIndex>& es,
- const Vector<WordIndex>& fs,
- const Vector<WordEntry>& evlist,
- const Vector<WordEntry>& fvlist,
- ostream& of2,
- const Vector<WordIndex>& viterbi_alignment,
- int pair_no, double alignment_score)
-
- // Prints the given alignment to the alignments stream `of2`.
- //
- // If the global CompactAlignmentFormat is non-zero, a single line is
- // written that holds, for every target position j with a non-zero
- // alignment, the pair "viterbi_alignment[j]-1 j-1".
- //
- // Otherwise the alignment is written in a format recognizable by the
- // draw-alignment tool, for example (each line triple is one sentence pair):
- // # sentence caption
- // target_word_1 target_word_2 ..... target_word_m
- // source_word_1 ({ x y z }) source_word_2 ({ }) .. source_word_n ({w})
- // where x, y, z, and w are positions of target words that each source word
- // is connected to.
- // Note that the source line is written starting at position 0 of `es`,
- // while the target line starts at position 1 of `fs`.
-
-{
- WordIndex l, m;
- Vector<Vector<WordIndex> > translations(es.size()); // for each source position, the
- // target positions aligned to it (zero or more).
- l = es.size() - 1;
- m = fs.size() - 1;
- if( CompactAlignmentFormat )
- {
- for (WordIndex j = 1 ; j <= m ; j++)
- if( viterbi_alignment[j] )
- of2 << viterbi_alignment[j]-1 << ' ' << j-1 << ' ';
- of2 << '\n';
- }
- else
- {
- of2 << "# Sentence pair (" << pair_no <<") source length " << l << " target length "<< m <<
- " alignment score : "<< alignment_score << '\n';
- // Print the target words and, in the same pass, invert the alignment:
- // viterbi_alignment maps target position -> source position.
- for (WordIndex j = 1 ; j <= m ; j++){
- of2 << fvlist[fs[j]].word << " " ;
- translations[viterbi_alignment[j]].push_back(j);
- }
- of2 << '\n';
-
- // One "word ({ positions })" group per source position, including position 0.
- for (WordIndex i = 0 ; i <= l ; i++){
- of2 << evlist[es[i]].word << " ({ " ;
- for (WordIndex j = 0 ; j < translations[i].size() ; j++)
- of2 << translations[i][j] << " " ;
- of2 << "}) ";
- }
- of2 << '\n';
- }
-}
-
-
-// Compares the test corpus with the trained model and writes three files:
-//   <Prefix>.tst.trg.unk  target words that occur in the test vocabulary but
-//                         have no positive frequency in the training one
-//   <Prefix>.tst.src.unk  the same for source words
-//   <Prefix>.tst.stats    summary counts, including how many distinct
-//                         (source, target) word pairs of the test sentences
-//                         have a t-table probability above PROB_SMOOTH
-// Each .unk line has the form "<index> <word> <test frequency>".
-void printOverlapReport(const tmodel<COUNT, PROB>& tTable,
-                        sentenceHandler& testHandler, vcbList& trainEList,
-                        vcbList& trainFList, vcbList& testEList, vcbList& testFList)
-{
-  set<pair<WordIndex, WordIndex> > testCoocur ;
-  sentPair s ;
-  /* string unseenCoocurFile = Prefix + ".tst.unseen.cooc" ;
-     ofstream of_unseenCoocur(unseenCoocurFile.c_str());
-
-     string seenCoocurFile = Prefix + ".tst.seen.cooc" ;
-     ofstream of_seenCoocur(seenCoocurFile.c_str());
-  */
-  // Collect every distinct word pair that co-occurs in a test sentence
-  // (position 0 of each sentence is skipped).
-  testHandler.rewind();
-  int seen_coocur = 0, unseen_coocur = 0, srcUnk = 0, trgUnk = 0 ;
-  while(testHandler.getNextSentence(s)){
-    for (WordIndex i = 1 ; i < s.eSent.size() ; i++)
-      for (WordIndex j = 1 ; j < s.fSent.size() ; j++)
-        testCoocur.insert(pair<WordIndex, WordIndex> (s.eSent[i], s.fSent[j])) ;
-  }
-  set<pair<WordIndex, WordIndex> >::const_iterator i ;
-  for (i = testCoocur.begin() ; i != testCoocur.end() ; ++i){
-    if (tTable.getProb((*i).first, (*i).second) > PROB_SMOOTH){
-      seen_coocur ++ ;
-      // of_seenCoocur << (*i).first << ' ' << (*i).second << '\n';
-    }
-    else {
-      unseen_coocur++;
-      // of_unseenCoocur << (*i).first << ' ' << (*i).second << '\n';
-    }
-  }
-
-  string trgUnkFile = Prefix + ".tst.trg.unk" ;
-  ofstream of_trgUnk(trgUnkFile.c_str());
-
-  for (WordIndex i = 0 ; i < testFList.getVocabList().size() && i < testFList.uniqTokens();i++){
-    // The loop bound comes from the test list only. An index beyond the end
-    // of the training list means the word was never seen in training, and
-    // the training list must not be read at that index.
-    const bool unseenInTraining = i >= trainFList.getVocabList().size()
-      || trainFList.getVocabList()[i].freq <= 0;
-    if (testFList.getVocabList()[i].freq > 0 && unseenInTraining){
-      of_trgUnk << i << ' ' << testFList.getVocabList()[i].word << ' ' << testFList.getVocabList()[i].freq
-                << '\n';
-      trgUnk++ ;
-    }
-  }
-  string srcUnkFile = Prefix + ".tst.src.unk" ;
-  ofstream of_srcUnk(srcUnkFile.c_str());
-
-  for (WordIndex j = 0 ; j < testEList.getVocabList().size() && j < testEList.uniqTokens();j++){
-    // Same bounds consideration as for the target side above.
-    const bool unseenInTraining = j >= trainEList.getVocabList().size()
-      || trainEList.getVocabList()[j].freq <= 0;
-    if (testEList.getVocabList()[j].freq > 0 && unseenInTraining){
-      srcUnk++ ;
-      of_srcUnk << j << ' ' << testEList.getVocabList()[j].word << ' ' << testEList.getVocabList()[j].freq
-                << '\n';
-    }
-  }
-  string summaryFile = Prefix + ".tst.stats" ;
-  ofstream of_summary(summaryFile.c_str());
-  of_summary << "\t\t STATISTICS ABOUT TEST CORPUS\n\n";
-  of_summary << "source unique tokens: " << testEList.uniqTokens() << '\n';
-  of_summary << "target unique tokens: " << testFList.uniqTokens() << '\n';
-  of_summary << "unique unseen source tokens: " << srcUnk << '\n';
-  of_summary << "unique unseen target tokens: " << trgUnk << '\n';
-  of_summary << "cooccurrences not found in the final t table: " << unseen_coocur << '\n';
-  of_summary << "cooccurrences found in the final t table: " << seen_coocur << '\n';
-
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9475d943/ext/giza-pp/GIZA++-v2/small_snt2cooc.cpp
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/small_snt2cooc.cpp b/ext/giza-pp/GIZA++-v2/small_snt2cooc.cpp
deleted file mode 100644
index 1ce7648..0000000
--- a/ext/giza-pp/GIZA++-v2/small_snt2cooc.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-#include <iostream>
-#include <string>
-#include <sstream>
-#include <fstream>
-#include <vector>
-#include <algorithm>
-
-
-using namespace std;
-
-// Minimal set of ints stored as a sorted vector without duplicates.
-// Only insertion and in-order iteration are supported.
-class myset {
- private:
-  vector<int> data;   // always sorted ascending, no duplicates
-
-  // Grows the capacity ahead of an insertion when fewer than three free
-  // slots remain: the capacity is doubled while it is small (with a floor
-  // of 4) and increased by 15 once it has reached 18.
-  void check_cap() {
-    size_t cap = data.capacity();
-    if (cap - data.size() >= 3) { return; }
-    if (cap < 4) { cap = 2; }
-    cap = (cap < 18) ? cap * 2 : cap + 15;
-    data.reserve(cap);
-  }
- public:
-  typedef vector<int>::iterator iterator;
-
-  // Inserts x at its sorted position; does nothing if x is already stored.
-  void insert(int x) {
-    const iterator pos = lower_bound(data.begin(), data.end(), x);
-    if (pos != data.end() && *pos == x) { return; }
-    // Remember the offset: check_cap() may reallocate and invalidate pos.
-    const size_t offset = pos - data.begin();
-    if (!data.empty()) { check_cap(); }
-    data.insert(data.begin() + offset, x);
-  }
-  iterator begin() { return data.begin(); }
-  iterator end() { return data.end(); }
-};
-
-//#include <set>
-// typedef std::set<int> intset;
-//#include <ext/hash_set>
-// typedef __gnu_cxx::hash_set<int> intset;
-typedef myset intset;
-
-
-// small_snt2cooc: reads a GIZA++ snt file (three lines per sentence pair:
-// count, first sentence, second sentence, the sentences being lists of
-// numeric word ids) and prints one line "<id1> <id2>" to stdout for every
-// distinct pair of a first-sentence id and a second-sentence id that occur
-// in the same sentence pair. Id 0 is paired with every second-sentence id.
-int main(int argc,char **argv)
-{
-  if( argc!=2 )
-    {
-      cerr << "Usage: " << argv[0] << " snt12 \n";
-      cerr << "Converts GIZA++ snt-format into plain text.\n";
-      exit(1);
-    }
-  ifstream t(argv[1]);
-  if( !t )
-    {
-      // Without this check an unreadable file silently produced no output.
-      cerr << "ERROR: cannot read " << argv[1] << ".\n";
-      exit(1);
-    }
-  string line1,line2,line3;
-  vector<intset> vsi(400000);   // indexed by first-sentence word id
-  int nLine=0;
-  while(getline(t,line1)&&getline(t,line2)&&getline(t,line3))
-    {
-      istringstream eingabe1(line1.c_str()),eingabe2(line2.c_str()),eingabe3(line3.c_str());
-      double count;
-      string word;
-      eingabe1>>count;   // the count line is read but not used
-      vector<int>l1,l2;
-      while(eingabe2>>word)
-        l1.push_back(atoi(word.c_str()));
-      while(eingabe3>>word)
-        l2.push_back(atoi(word.c_str()));
-      if( ((++nLine)%1000)==0 )
-        cerr << "line " << nLine << '\n';
-      for(unsigned int j=0;j<l2.size();++j)
-        vsi[0].insert(l2[j]);
-      for(unsigned int i=0;i<l1.size();++i)
-        {
-          if( l1[i]<0 )
-            {
-              // A negative id cannot be used as an index into vsi.
-              cerr << "WARNING: ignoring negative word id " << l1[i] << '\n';
-              continue;
-            }
-          if( l1[i]>=int(vsi.size()) )
-            {
-              cerr << "I have to resize: " << l1[i] << endl;
-              vsi.resize(l1[i]+1000);
-            }
-          intset&theset=vsi[l1[i]];
-          for(unsigned int j=0;j<l2.size();++j)
-            theset.insert(l2[j]);
-        }
-    }
-  int vi = 0;
-  for(vector<intset>::iterator i=vsi.begin();i != vsi.end(); ++i) {
-    for(intset::iterator j=i->begin();j!=i->end();++j)
-      cout << vi << " " << *j << endl;
-    ++vi;
-  }
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9475d943/ext/giza-pp/GIZA++-v2/snt2cooc.cpp
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/snt2cooc.cpp b/ext/giza-pp/GIZA++-v2/snt2cooc.cpp
deleted file mode 100644
index c6af6d4..0000000
--- a/ext/giza-pp/GIZA++-v2/snt2cooc.cpp
+++ /dev/null
@@ -1,107 +0,0 @@
-#include <iostream>
-#include <string>
-#include <sstream>
-#include <fstream>
-#include <map>
-#include <vector>
-#include <set>
-#include <cstdlib>
-
-using namespace std;
-
-// Reads a vocabulary from `in` into `voc`. Every line must start with two
-// whitespace-separated tokens; the first becomes the key and the second the
-// mapped value (anything after them is ignored). The entry "1" -> "UNK" is
-// set before reading, so the stream may override it.
-// A line with fewer than two tokens is reported on cerr and skipped, so that
-// it cannot add an entry built from an incomplete line.
-void readVoc(istream&in,map<string,string>&voc)
-{
-  string line,s1,s2;
-  voc["1"]="UNK";
-  if( !in )cerr <<"Vocabulary does not exist.\n";
-  while(getline(in,line))
-    {
-      istringstream eingabe(line);
-      if( !(eingabe>>s1>>s2))
-        {
-          cerr << "ERROR in vocabulary '" << line << "'\n";
-          continue;
-        }
-      voc[s1]=s2;
-    }
-}
-
-// Upper bound on the number of stored cooccurrence entries before they are
-// written out and dropped; 0 means "no limit". Set in main().
-int maxElems=0;
-
-// Writes every stored cooccurrence to stdout: "<count> <id1> <id2>" when
-// `counts` is set, "<id1> <id2>" otherwise.
-static void printCooccurrences(const vector<map<int,int> >&vsi,bool counts)
-{
-  for(unsigned int i=0;i<vsi.size();++i)
-    for(map<int,int>::const_iterator j=vsi[i].begin();j!=vsi[i].end();++j)
-      {
-        if( counts )
-          cout << j->second << " " << i << " " << j->first << '\n';
-        else
-          cout << i << " " << j->first << '\n';
-      }
-}
-
-// snt2cooc: reads two vocabulary files and a GIZA++ snt file (three lines
-// per sentence pair: count, first sentence, second sentence, as numeric word
-// ids) and prints the word id pairs that occur together in a sentence pair.
-// Id 0 is paired with every second-sentence id. With the optional fifth
-// argument "-counts" each pair is prefixed by the number of times it was
-// seen, and the table is written out and emptied whenever it grows beyond
-// maxElems entries.
-int main(int argc,char **argv)
-{
-  if( argc!=4&&argc!=5 )
-    {
-      cerr << "Usage: " << argv[0] << " vcb1 vcb2 snt12 \n";
-      cerr << "Converts GIZA++ snt-format into plain text.\n";
-      exit(1);
-    }
-  bool counts=0;
-  if( argc==5 )
-    {
-      // The option is argv[4]. argv[5] is argv[argc], a null pointer, and
-      // must not be written to a stream.
-      if(string(argv[4])!="-counts")
-        cerr << "ERROR: wrong option " << argv[4] << endl;
-      counts=1;
-      maxElems=10000000;
-    }
-  ifstream v1(argv[1]),v2(argv[2]),t(argv[3]);
-  map<string,string>voc1,voc2;
-  readVoc(v1,voc1);
-  readVoc(v2,voc2);
-  string line1,line2,line3;
-  vector<map<int,int> > vsi(voc1.size()+1000);   // indexed by first-sentence word id
-  int nLine=0;
-  int totalElems=0;                              // number of entries over all maps
-  while(getline(t,line1)&&getline(t,line2)&&getline(t,line3))
-    {
-      istringstream eingabe1(line1),eingabe2(line2),eingabe3(line3);
-      double count;
-      string word;
-      eingabe1>>count;   // the count line is read but not used
-      vector<int>l1,l2;
-      while(eingabe2>>word)
-        l1.push_back(atoi(word.c_str()));
-      while(eingabe3>>word)
-        l2.push_back(atoi(word.c_str()));
-      if( ((++nLine)%1000)==0 )
-        cerr << "line " << nLine << '\n';
-      // totalElems is kept up to date by subtracting a map's size before it
-      // is modified and adding the new size afterwards.
-      totalElems-=vsi[0].size();
-      for(unsigned int j=0;j<l2.size();++j)
-        vsi[0][l2[j]]++;
-      totalElems+=vsi[0].size();
-      for(unsigned int i=0;i<l1.size();++i)
-        {
-          if( l1[i]<0 )
-            {
-              // A negative id cannot be used as an index into vsi.
-              cerr << "WARNING: ignoring negative word id " << l1[i] << '\n';
-              continue;
-            }
-          if( l1[i]>=int(vsi.size()) )
-            {
-              cerr << "I have to resize: " << l1[i] << endl;
-              vsi.resize(l1[i]+1);
-            }
-          map<int,int>&theset=vsi[l1[i]];
-          totalElems-=theset.size();
-          for(unsigned int j=0;j<l2.size();++j)
-            theset[l2[j]]++;
-          totalElems+=theset.size();
-        }
-      if( totalElems>maxElems&&maxElems )
-        {
-          // Table too large: write what has been collected and start over.
-          cerr << "INFO: print out " << totalElems << " entries.\n";
-          printCooccurrences(vsi,counts);
-          totalElems=0;
-          vsi.clear();
-          vsi.resize(voc1.size()+1000);
-        }
-    }
-  cerr << "END.\n";
-  printCooccurrences(vsi,counts);
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9475d943/ext/giza-pp/GIZA++-v2/snt2plain.cpp
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/snt2plain.cpp b/ext/giza-pp/GIZA++-v2/snt2plain.cpp
deleted file mode 100644
index 3eb99ad..0000000
--- a/ext/giza-pp/GIZA++-v2/snt2plain.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-#include <iostream>
-#include <string>
-#include <sstream>
-#include <fstream>
-#include <map>
-#include <vector>
-#include <cstdlib>
-
-using namespace std;
-
-// Reads a vocabulary from `in` into `voc`. Every line must start with two
-// whitespace-separated tokens; the first becomes the key and the second the
-// mapped value (anything after them is ignored). The entry "1" -> "UNK" is
-// set before reading, so the stream may override it.
-// A line with fewer than two tokens is reported on cerr and skipped, so that
-// it cannot add an entry built from an incomplete line.
-void readVoc(istream&in,map<string,string>&voc)
-{
-  string line,s1,s2;
-  voc["1"]="UNK";
-  if( !in )cerr <<"Vocabulary does not exist.\n";
-  while(getline(in,line))
-    {
-      istringstream eingabe(line);
-      if( !(eingabe>>s1>>s2))
-        {
-          cerr << "ERROR in vocabulary '" << line << "'\n";
-          continue;
-        }
-      voc[s1]=s2;
-    }
-}
-
-// snt2plain: converts a GIZA++ snt file back into two plain text files.
-//
-// Usage: snt2plain vcb1 vcb2 snt12 output_prefix [ -counts ]
-//
-// The snt file holds three lines per sentence pair: count, first sentence,
-// second sentence, the sentences being lists of word ids. Each id is
-// replaced by the word stored for it in the matching vocabulary and the
-// sentences are written to <output_prefix>1.txt and <output_prefix>2.txt.
-// An id missing from the vocabulary is written unchanged, and the first such
-// id is reported on cerr. With "-counts" the count of every sentence pair is
-// printed to stdout.
-int main(int argc,char **argv)
-{
-  if( argc!=5&&argc!=6 )
-    {
-      cerr << "Usage: " << argv[0] << " vcb1 vcb2 snt12 output_prefix [ -counts ]\n";
-      cerr << "Converts GIZA++ snt-format into plain text.\n";
-      exit(1);
-    }
-  bool counts=0;
-  if( argc==6 )
-    {
-      if(string(argv[5])!="-counts")
-        cerr << "ERROR: wrong option " << argv[5] << endl;
-      counts=1;
-    }
-  ifstream v1(argv[1]),v2(argv[2]),t(argv[3]);
-  string prefix(argv[4]);
-  string outfil1=prefix+"1.txt";
-  string outfil2=prefix+"2.txt";
-  ofstream out1(outfil1.c_str());
-  ofstream out2(outfil2.c_str());
-  map<string,string>voc1,voc2;
-  readVoc(v1,voc1);
-  readVoc(v2,voc2);
-  string line1,line2,line3;
-  int printed=0;   // number of unknown ids so far; only the first is reported
-  while(getline(t,line1)&&getline(t,line2)&&getline(t,line3))
-    {
-      istringstream eingabe1(line1),eingabe2(line2),eingabe3(line3);
-      double count;
-      string word;
-      eingabe1>>count;
-      vector<string>l1,l2;
-      while(eingabe2>>word)
-        l1.push_back(word);
-      while(eingabe3>>word)
-        l2.push_back(word);
-      if( counts )
-        cout << count << '\n';
-      for(unsigned int p=0;p<l1.size();p++)
-        {
-          if(voc1.count(l1[p])==0)
-            {
-              if( printed++==0)
-                cerr << "ERROR: source vocabulary entry " << l1[p] << " unknown.\n";
-              out1 << l1[p]<<' ';
-            }
-          else
-            out1 << voc1[l1[p]] << ' ';
-        }
-      for(unsigned int p=0;p<l2.size();p++)
-        {
-          if(voc2.count(l2[p])==0)
-            {
-              if( printed++ ==0)
-                cerr << "ERROR: target vocabulary entry " << l2[p] << " unknown.\n";
-              out2 <<l2[p]<<' ';
-            }
-          // This "else" was missing: an unknown id was written once as-is
-          // and then looked up anyway, which inserted an empty entry into
-          // voc2 and wrote a second, empty token.
-          else
-            out2 << voc2[l2[p]] << ' ';
-        }
-      out1<<'\n';
-      out2<<'\n';
-    }
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9475d943/ext/giza-pp/GIZA++-v2/trainGIZA++.sh
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/trainGIZA++.sh b/ext/giza-pp/GIZA++-v2/trainGIZA++.sh
deleted file mode 100755
index 09f6851..0000000
--- a/ext/giza-pp/GIZA++-v2/trainGIZA++.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#! /bin/csh
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Exactly three arguments are required: the two vocabulary files and the
-# sentence (snt) file.
-if( $# != 3 ) then
-
- echo Usage: trainGIZA++.sh vcb1 vcb2 snt
- echo " "
- echo Performs a training of word classes and a standard GIZA training.
-
-else
-
-# Convert the snt file back to plain text with the output prefix PLAIN; the
-# mkcls calls below read PLAIN1.txt and PLAIN2.txt.
- snt2plain.out $1 $2 $3 PLAIN
-
-# Train word classes for each side (output presumably goes to
-# <vcb>.classes - confirm against the mkcls documentation), then remove the
-# temporary plain text file. stdout and stderr go to the log file.
- mkcls -m2 -pPLAIN1.txt -c50 -V$1.classes opt >& mkcls1.log
- rm PLAIN1.txt
- mkcls -m2 -pPLAIN2.txt -c50 -V$2.classes opt >& mkcls2.log
- rm PLAIN2.txt
-# Run GIZA++ on the vocabularies and the snt file; everything it prints is
-# collected in GIZA++.log.
- GIZA++ -S $1 -T $2 -C $3 -p0 0.98 -o GIZA++ >& GIZA++.log
-
-endif
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9475d943/ext/giza-pp/GIZA++-v2/transpair_model1.h
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/transpair_model1.h b/ext/giza-pp/GIZA++-v2/transpair_model1.h
deleted file mode 100644
index dd1425d..0000000
--- a/ext/giza-pp/GIZA++-v2/transpair_model1.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
-
-Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
-
-This file is part of GIZA++ ( extension of GIZA ).
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-#ifndef transpair_model1_h_fjo_defined
-#define transpair_model1_h_fjo_defined
-//#include "logprob.h"
-#include "defs.h"
-#include "Array2.h"
-#include "defs.h"
-#include "Vector.h"
-#include "NTables.h"
-#include "ATables.h"
-#include "TTables.h"
-#include "alignment.h"
-#include <cmath>
-#include <algorithm>
-#include "Array2.h"
-#include "mystl.h"
-
-// Representation of one sentence pair for Model 1: it caches the
-// translation probabilities t(i,j) of the pair in a 2-D array so that they
-// can be read without going back to the t table.
-// Index i runs over the positions of es (0..l), index j over the positions
-// 1..m of fs.
-class transpair_model1
-{
- public:
- bool verboseTP;
- Array2<PROB, Vector<PROB> > t; // t(i,j): cached tTable.getProb(es[i], fs[j])
- WordIndex l, m; // es.size()-1 and fs.size()-1
- Vector<WordIndex> E,F; // copies of the two sentences
- // No-op in this class.
- void setMode(bool)
- {}
- // Copies both sentences and fills t(i,j) for i in 0..l and j in 1..m from
- // tTable. A value that is not >= PROB_SMOOTH is reported on cerr.
- transpair_model1(const Vector<WordIndex>&es, const Vector<WordIndex>&fs, const tmodel<COUNT, PROB>&tTable)
- : verboseTP(0),t(es.size(), fs.size()),l(es.size()-1), m(fs.size()-1),E(es),F(fs)
- {
- // These locals shadow the members l and m; they hold the same values.
- WordIndex l=es.size()-1,m=fs.size()-1;
- for(WordIndex i=0;i<=l;i++)
- for(WordIndex j=1;j<=m;j++)
- {
- t(i, j)=tTable.getProb(es[i], fs[j]);
- if( !(t(i,j)>=PROB_SMOOTH) )
- cerr << "ERROR IN PROBABILITY: " << t(i,j) << " " << PROB_SMOOTH << endl;
- }
- }
- // Disabled alternative constructor (kept for reference): it sets t(i,j)
- // from a string comparison of the two words instead of a t table.
- /* transpair_model1(const Vector<WordIndex>&es, const Vector<WordIndex>&fs)
- : verboseTP(0),t(es.size(), fs.size()),l(es.size()-1), m(fs.size()-1),E(es),F(fs)
- {
- WordIndex l=es.size()-1,m=fs.size()-1;
- for(WordIndex i=0;i<=l;i++)
- for(WordIndex j=1;j<=m;j++)
- {
- const string&estr=globeTrainVcbList->getVocabList()[es[i]].word;
- const string&fstr=globfTrainVcbList->getVocabList()[fs[j]].word;
- if( lev(estr,fstr)==0 )
- t(i,j)=1.0;
- else
- t(i,j)=1/100.0;
- massert( t(i,j)>=PROB_SMOOTH );
- }
-}*/
- WordIndex get_l()const
- {return l;}
- WordIndex get_m()const
- {return m;}
- // Cached translation probability for positions (i, j).
- const PROB&get_t(WordIndex i, WordIndex j)const
- {massert( t(i,j)>=PROB_SMOOTH);
- return t(i, j);}
- WordIndex get_es(int i)const {return E[i];}
- WordIndex get_fs(int j)const {return F[j];}
- // Always returns 0 in this class.
- bool greedyHillClimbing()const
- {return 0;}
- // No-op in this class.
- void computeScores(const alignment&,vector<double>&)const
- {}
- // Ratio t(new_i,j)/t(old_i,j), where old_i = a(j) is the position that j
- // is currently aligned to. The unnamed double parameter is unused.
- LogProb scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double=-1.0)const
- {
- int old_i=a(j);
- return (t(new_i, j) /t(old_i, j));
- }
- // Ratio of the t values after and before exchanging the alignments of j1
- // and j2. The unnamed double parameter is unused.
- LogProb scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double=-1.0)const
- {
- WordIndex i1=a(j1), i2=a(j2);
- return (t(i2, j1)/t(i1, j1))*(t(i1, j2)/t(i2, j2));
- }
- // Product over j = 1..al.get_m() of t(al(j), j) / (al.get_l() + 1).
- LogProb prob_of_target_and_alignment_given_source(const alignment&al)const
- {
- LogProb prob=1.0;
- int lp1=al.get_l()+1;
- for(unsigned int j=1;j<=al.get_m();++j)
- prob*=t(al(j),j)/lp1;
- return prob;
- }
-};
-#endif
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9475d943/ext/giza-pp/GIZA++-v2/transpair_model2.h
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/transpair_model2.h b/ext/giza-pp/GIZA++-v2/transpair_model2.h
deleted file mode 100644
index 751ce52..0000000
--- a/ext/giza-pp/GIZA++-v2/transpair_model2.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
-
-Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
-
-This file is part of GIZA++ ( extension of GIZA ).
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-#ifndef transpair_model2_defined_h
-#define transpair_model2_defined_h
-
-#include "defs.h"
-#include "Vector.h"
-#include "NTables.h"
-#include "ATables.h"
-#include "TTables.h"
-#include "alignment.h"
-#include <cmath>
-#include "transpair_model1.h"
-
-
-// Sentence pair representation for Model 2: extends transpair_model1 with a
-// cache a(i,j) of the values returned by aTable.getValue(i, j, l, m).
-class transpair_model2 : public transpair_model1
-{
- protected:
-  Array2<PROB, Vector<PROB> > a;
- public:
-  // Builds the Model 1 part from (es, fs, tTable) and then fills a(i,j) for
-  // i in 0..l and j in 1..m.
-  transpair_model2(const Vector<WordIndex>&es, const Vector<WordIndex>&fs, const tmodel<COUNT, PROB>&tTable,
-                   const amodel<PROB>&aTable)
-    : transpair_model1(es,fs,tTable),a(es.size(),fs.size())
-  {
-    for(WordIndex srcPos=0;srcPos<=l;++srcPos)
-      {
-        for(WordIndex trgPos=1;trgPos<=m;++trgPos)
-          {
-            a(srcPos, trgPos)=aTable.getValue(srcPos, trgPos, l, m);
-          }
-      }
-  }
-  // Cached value for positions (i, j).
-  const PROB&get_a(WordIndex i, WordIndex j)const
-  {
-    return a(i, j);
-  }
-};
-#endif
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9475d943/ext/giza-pp/GIZA++-v2/transpair_model3.cpp
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/transpair_model3.cpp b/ext/giza-pp/GIZA++-v2/transpair_model3.cpp
deleted file mode 100644
index 0ab4c54..0000000
--- a/ext/giza-pp/GIZA++-v2/transpair_model3.cpp
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
-
-EGYPT Toolkit for Statistical Machine Translation
-Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-/*--
-transpair_model3: representation of a translation pair for model3 training
-allowing for fast access (esp. to t table).
-
-Franz Josef Och (30/07/99)
---*/
-#include "transpair_model3.h"
-#include <algorithm>
-
-// Builds the Model 3 translation pair: once the Model 2 base is set up, the
-// distortion values from dTable and the fertility values from nTable are
-// copied into the per-sentence caches d and n. The trailing unnamed void*
-// argument is ignored.
-transpair_model3::transpair_model3(const Vector<WordIndex>&es, const Vector<WordIndex>&fs, tmodel<COUNT, PROB>&tTable, amodel<PROB>&aTable, amodel<PROB>&dTable, nmodel<PROB>&nTable, double _p1, double _p0, void*)
-  : transpair_model2(es,fs,tTable,aTable),d(es.size(), fs.size()),n(es.size(), MAX_FERTILITY+1), p0(_p0), p1(_p1)
-{
-  const WordIndex lastE=es.size()-1;
-  const WordIndex lastF=fs.size()-1;
-  for(WordIndex ei=0;ei<=lastE;++ei)
-  {
-    // Distortion cache; note that dTable is queried with (j, i) order.
-    for(WordIndex fj=1;fj<=lastF;++fj)
-      d(ei, fj)=dTable.getValue(fj, ei, lastE, lastF);
-    // Row 0 of the fertility cache is left untouched.
-    if( ei==0 )
-      continue;
-    for(WordIndex fert=0;fert<MAX_FERTILITY;++fert)
-      n(ei, fert)=nTable.getValue(es[ei], fert);
-    // The last column is set to the smoothing constant.
-    n(ei,MAX_FERTILITY)=PROB_SMOOTH;
-  }
-}
-
-// Returns the multiplicative change of the alignment score when target
-// position j is moved from its current word a(j) to new_i.
-// The unnamed double parameter is ignored. With forModel3 false, the
-// distortion ratios d(.,.) and the fertility-count factors are replaced by 1.0.
-LogProb transpair_model3::scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j, double,bool forModel3)const
-{
- LogProb change;
- const WordIndex old_i=a(j);
- // f0: current fertility of word 0.
- WordIndex f0=a.fert(0);
- // Case 1: nothing moves, so the score is unchanged.
- if (old_i == new_i)
- change=1.0;
- // Case 2: j leaves word 0 and attaches to new_i (fertility of new_i grows by one).
- else if (old_i == 0)
- change=((double)p0*p0/p1) *
- (( (DeficientDistortionForEmptyWord?(max(2,int(m))/DeficientDistortionForEmptyWord):f0)*(m-f0+1.0)) / ((m-2*f0+1)*(m-2*f0+2.0))) *
- ((PROB)(forModel3?(a.fert(new_i)+1.0):1.0)) *
- (get_fertility(new_i, a.fert(new_i)+1) / get_fertility(new_i, a.fert(new_i)))*
- (t(new_i, j)/t(old_i, j))*
- (forModel3?d(new_i, j):1.0);
- // Case 3: j leaves old_i and attaches to word 0 (fertility of old_i shrinks by one).
- else if (new_i == 0)
- change=(double(p1) / (p0*p0)) *
- (double((m-2*f0)*(m-2*f0-1))/( (DeficientDistortionForEmptyWord?(max(2,int(m))/DeficientDistortionForEmptyWord):(1+f0))*(m-f0))) *
- (forModel3?(1.0/a.fert(old_i)):1.0) *
- (get_fertility(old_i, a.fert(old_i)-1) /get_fertility(old_i, a.fert(old_i)))*
- (t(new_i, j) /t(old_i, j)) *
- (forModel3?(1.0 / d(old_i, j)):1.0);
- // Case 4: both old_i and new_i are non-zero; only fertility, translation and
- // (for Model 3) distortion ratios are involved.
- else
- change=(forModel3?((a.fert(new_i)+1.0)/a.fert(old_i)):1.0) *
- (get_fertility(old_i,a.fert(old_i)-1) / get_fertility(old_i,a.fert(old_i))) *
- (get_fertility(new_i,a.fert(new_i)+1) /get_fertility(new_i,a.fert(new_i))) *
- (t(new_i,j)/t(old_i,j)) *
- (forModel3?(d(new_i,j)/d(old_i,j)):1.0);
- return change;
-}
-
-// Returns the multiplicative change of the alignment score when the words
-// aligned to target positions j1 and j2 (j1 < j2 is asserted) are exchanged.
-// The unnamed double parameter is ignored; distortion ratios are applied only
-// when forModel3 is set, and never for word 0.
-LogProb transpair_model3::scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2, double,bool forModel3)const
-{
-  assert(j1<j2);
-  const WordIndex i1=a(j1);
-  const WordIndex i2=a(j2);
-  PROB ratio=1;
-  // Both positions point at the same word: swapping changes nothing.
-  if (i1==i2)
-    return ratio;
-  ratio=(t(i2, j1)/t(i1, j1))*(t(i1, j2)/t(i2, j2));
-  if( !forModel3 )
-    return ratio;
-  if (i1)
-    ratio *= d(i1, j2)/d(i1, j1);
-  if (i2)
-    ratio *= d(i2, j1)/d(i2, j2);
-  return ratio;
-}
-
-// Dumps a transpair_model3 for debugging: one "EF-I:" line per i in
-// [0, get_l()] with the (t,d) pairs for every j in [1, get_m()] and, for
-// i > 0, the fertility values for 1..MAX_FERTILITY-1; then one summary line
-// with the raw tables and p0/p1.
-// NOTE(review): p0 and p1 are written back to back without a separator.
-ostream&operator<<(ostream&out, const transpair_model3&m)
-{
-  for(WordIndex ei=0;ei<=m.get_l();++ei)
-  {
-    out << "EF-I:" << ei << ' ';
-    for(WordIndex fj=1;fj<=m.get_m();++fj)
-      out << "(" << m.t(ei,fj) << "," << m.d(ei,fj) << ")";
-    // Fertilities are only printed for rows other than row 0.
-    if( ei>0 )
-    {
-      for(WordIndex fert=1;fert<MAX_FERTILITY;++fert)
-        out << "(fert:" << m.get_fertility(ei,fert) << ")";
-    }
-    out << '\n';
-  }
-  out << "T:" << m.t << "D:" << m.d << "A:" << m.a << "N:" << m.n << m.p0 << m.p1 << '\n';
-  return out;
-}
-
-// Reference implementation of the move score: copies the alignment, applies
-// the move of j to new_i and returns the ratio of the two full probabilities.
-// If the original probability is zero the result is 1e20 when the moved one
-// is non-zero and 1.0 otherwise. The unnamed double parameter is ignored.
-LogProb transpair_model3::_scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double)const
-{
-  alignment moved(a);
-  moved.set(j, new_i);
-  LogProb before=prob_of_target_and_alignment_given_source(a);
-  LogProb after=prob_of_target_and_alignment_given_source(moved);
-  if( before )
-    return after/before;
-  if( after )
-    return 1e20;
-  return 1.0;
-}
-
-// Reference implementation of the swap score: copies the alignment, swaps
-// the words of j1 and j2 and returns the ratio of the two full probabilities.
-// A negative thisValue means "not supplied" and triggers recomputation of the
-// original probability; massert then checks that it matches.
-LogProb transpair_model3::_scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double thisValue)const
-{
-  alignment swapped(a);
-  swapped.set(j1, a(j2));
-  swapped.set(j2, a(j1));
-  LogProb before=thisValue;
-  if( before<0.0 )
-    before=prob_of_target_and_alignment_given_source(a);
-  massert(before==prob_of_target_and_alignment_given_source(a));
-  LogProb after=prob_of_target_and_alignment_given_source(swapped);
-  if( before )
-    return after/before;
-  if( after )
-    return 1e20;
-  return 1.0;
-}
-
-// Computes the Model 3 score of alignment al as one running product:
-// the p1 term for word 0, the per-slot factor for word 0's fertility, the
-// fertility terms (times factorial) for words 1..l, and for every target
-// position the translation value plus, for non-zero words, the distortion value.
-// With verb set, every intermediate product is traced to cerr.
-// A zero result is replaced by 1E-299.
-LogProb transpair_model3::prob_of_target_and_alignment_given_source(const alignment&al,bool verb)const
-{
- LogProb total = 1.0 ;
- // Substitute returned instead of an exact zero.
- static const LogProb zero = 1E-299 ;
- total *= pow(double(1-p1), m-2.0 * al.fert(0)) * pow(double(p1), double(al.fert(0)));
- if( verb) cerr << "IBM-3: (1-p1)^(m-2 f0)*p1^f0: " << total << '\n';
- // One factor per target position aligned to word 0; the divisor is i unless
- // DeficientDistortionForEmptyWord is non-zero.
- for (WordIndex i = 1 ; i <= al.fert(0) ; i++)
- total *= double(m - al.fert(0) - i + 1) / (double(DeficientDistortionForEmptyWord?(max(2,int(m))/DeficientDistortionForEmptyWord):i)) ;
- if( verb) cerr << "IBM-3: +NULL:binomial+distortion " << total << '\n';
- // Fertility term for every word other than word 0.
- for (WordIndex i = 1 ; i <= l ; i++)
- {
- total *= get_fertility(i, al.fert(i)) * (LogProb) factorial(al.fert(i));
- if( verb) cerr << "IBM-3: fertility of " << i << " with factorial " << get_fertility(i, al.fert(i)) * (LogProb) factorial(al.fert(i)) << " -> " << total << '\n';
- }
- // Translation term for every target position, distortion only when al(j) != 0.
- for (WordIndex j = 1 ; j <= m ; j++)
- {
- total*= get_t(al(j), j) ;
- massert( get_t(al(j), j)>=PROB_SMOOTH );
- if( verb) cerr << "IBM-3: t of " << j << " " << al(j) << ": " << get_t(al(j), j) << " -> " << total << '\n';
- if (al(j))
- {
- total *= get_d(al(j), j);
- if( verb) cerr << "IBM-3: d of " << j << ": " << get_d(al(j), j) << " -> " << total << '\n';
- }
- }
- return total?total:zero;
-}
-
-
-// Appends four partial Model 3 scores of alignment al to d, in this order:
-// word-0 term, fertility term, translation term, distortion term.
-// The output parameter d shadows the member table d; get_d() reads the member.
-void transpair_model3::computeScores(const alignment&al,vector<double>&d)const
-{
-  LogProb nullScore = 1.0;
-  LogProb fertScore = 1.0;
-  LogProb transScore = 1.0;
-  LogProb distScore = 1.0;
-
-  nullScore *= pow(double(1-p1), m-2.0 * al.fert(0)) * pow(double(p1), double(al.fert(0)));
-  for (WordIndex k = 1 ; k <= al.fert(0) ; ++k)
-    nullScore *= double(m - al.fert(0) - k + 1) / (double(DeficientDistortionForEmptyWord?(max(2,int(m))/DeficientDistortionForEmptyWord):k)) ;
-
-  for (WordIndex ei = 1 ; ei <= l ; ++ei)
-    fertScore *= get_fertility(ei, al.fert(ei)) * (LogProb) factorial(al.fert(ei));
-
-  for (WordIndex fj = 1 ; fj <= m ; ++fj)
-  {
-    transScore *= get_t(al(fj), fj) ;
-    massert( get_t(al(fj), fj)>=PROB_SMOOTH );
-    // Distortion applies only to positions not aligned to word 0.
-    if (al(fj))
-      distScore *= get_d(al(fj), fj);
-  }
-
-  d.push_back(nullScore);//5
-  d.push_back(fertScore);//6
-  d.push_back(transScore);//7
-  d.push_back(distScore);//8
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9475d943/ext/giza-pp/GIZA++-v2/transpair_model3.h
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/transpair_model3.h b/ext/giza-pp/GIZA++-v2/transpair_model3.h
deleted file mode 100644
index 9c07fd9..0000000
--- a/ext/giza-pp/GIZA++-v2/transpair_model3.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
-
-EGYPT Toolkit for Statistical Machine Translation
-Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-/*--
-transpair_model3: representation of a translation pair for model3 training
-allowing for fast access (esp. to t table).
-
-Franz Josef Och (30/07/99)
---*/
-#ifndef transpair_model3_h_fjo_defined
-#define transpair_model3_h_fjo_defined
-#include "Array2.h"
-#include "defs.h"
-#include "Vector.h"
-#include "NTables.h"
-#include "ATables.h"
-#include "TTables.h"
-#include "alignment.h"
-#include <cmath>
-#include "transpair_model2.h"
-
-extern double factorial(int n);
-// Returns true when a and b are exactly equal, or when their ratio a/b
-// differs from 1 by less than 1e-10. Otherwise writes a "DIFFERENT:" line
-// with both values, their ratio and the relative difference to cerr and
-// returns false.
-inline bool doubleEqual(const double a, const double b)
-{
-  // The exact comparison also covers a == b == 0, where a/b would be NaN.
-  if( a==b )
-    return true;
-  const double relDiff=1.0-a/b;
-  if( fabs(relDiff)<1e-10 )
-    return true;
-  cerr << "DIFFERENT: " << a << " " << b << " " << a/b << " " << relDiff << endl;
-  return false;
-}
-
-
-// Translation pair for Model 3: adds cached distortion (d) and fertility (n)
-// tables plus the p0/p1 parameters on top of transpair_model2.
-class transpair_model3 : public transpair_model2
-{
- protected:
- // d(i, j): cached distortion values. n(i, f): cached fertility values;
- // column MAX_FERTILITY is the fallback used by get_fertility below.
- Array2<PROB, Vector<PROB> > d, n;
- PROB p0, p1;
- public:
- typedef transpair_model3 simpler_transpair_model;
- // Note the parameter order: _p1 comes before _p0. The trailing void*
- // defaults to 0.
- transpair_model3(const Vector<WordIndex>&es, const Vector<WordIndex>&fs, tmodel<COUNT, PROB>&tTable,
- amodel<PROB>&aTable, amodel<PROB>&dTable, nmodel<PROB>&nTable,
- double _p1, double _p0, void*x=0);
- // Cached distortion value for (i, j).
- const PROB&get_d(WordIndex i, WordIndex j)const
- {return d(i, j);}
- // Cached alignment value for (i, j), read from the inherited table a.
- const PROB&get_a(WordIndex i, WordIndex j)const
- {return a(i, j);}
- // Cached fertility value for word i (i > 0 is asserted); any f at or above
- // MAX_FERTILITY maps to the shared column MAX_FERTILITY.
- const PROB&get_fertility(WordIndex i, WordIndex f)const
- {massert(i>0);return (f>=MAX_FERTILITY)?n(i, MAX_FERTILITY):n(i, f);}
- int modelnr()const{return 3;}
- // Always -1.0 for this model; the score functions below treat a negative
- // thisValue as "not supplied".
- LogProb scoreOfAlignmentForChange(const alignment&)const
- {return -1.0; }
- // Incremental score ratios for moving j to new_i / swapping j1 and j2.
- LogProb scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j, double thisValue=-1.0,bool withDistortions=1)const;
- LogProb scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2, double thisValue=-1.0,bool withDistortions=1)const ;
- // Same ratios computed from two full probability evaluations.
- LogProb _scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double thisValue=-1.0)const;
- LogProb _scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double thisValue=-1.0)const;
- friend ostream&operator<<(ostream&out, const transpair_model3&m);
- // Full Model 3 score of al; verb enables tracing to cerr.
- LogProb prob_of_target_and_alignment_given_source(const alignment&al,bool verb=0)const;
- bool isSubOptimal()const{return 1;}
- // Appends four partial scores of al to d.
- void computeScores(const alignment&al,vector<double>&d)const;
-};
-#endif
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9475d943/ext/giza-pp/GIZA++-v2/transpair_model4.cpp
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/transpair_model4.cpp b/ext/giza-pp/GIZA++-v2/transpair_model4.cpp
deleted file mode 100644
index ebc2666..0000000
--- a/ext/giza-pp/GIZA++-v2/transpair_model4.cpp
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
-
-Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
-
-This file is part of GIZA++ ( extension of GIZA ).
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-#include "transpair_model4.h"
-#include "Parameter.h"
-
-// Declares the float parameter d4modelsmooth_factor under the option name
-// "model4SmoothFactor" with the value 0.2 as last argument.
-// NOTE(review): GLOBAL_PARAMETER is defined elsewhere (presumably Parameter.h);
-// the last argument is presumably the default value -- confirm.
-GLOBAL_PARAMETER(float,d4modelsmooth_factor,"model4SmoothFactor","smooting parameter for alignment probabilities in Model 4",PARLEV_SMOOTH,0.2);
-
-// Reference implementation of the Model 4 move score: evaluates the full
-// probability before and after moving j to new_i (on a copy of the alignment)
-// and returns their ratio. If the original probability is zero the result is
-// 1e20 when the moved one is non-zero and 1.0 otherwise. The unnamed double
-// parameter is ignored.
-LogProb transpair_model4::_scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double)const
-{
-  LogProb before=prob_of_target_and_alignment_given_source(a);
-  alignment moved(a);
-  moved.set(j, new_i);
-  LogProb after=prob_of_target_and_alignment_given_source(moved);
-  if( before )
-    return after/before;
-  if( after )
-    return 1e20;
-  return 1.0;
-}
-// Reference implementation of the Model 4 swap score: evaluates the full
-// probability before and after exchanging the words of j1 and j2 (on a copy
-// of the alignment) and returns their ratio, with the same zero handling as
-// _scoreOfMove. The unnamed double parameter is ignored.
-LogProb transpair_model4::_scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double)const
-{
-  LogProb before=prob_of_target_and_alignment_given_source(a);
-  alignment swapped(a);
-  swapped.set(j1, a(j2));
-  swapped.set(j2, a(j1));
-  LogProb after=prob_of_target_and_alignment_given_source(swapped);
-  if( before )
-    return after/before;
-  if( after )
-    return 1e20;
-  return 1.0;
-}
-//increasing efficiency: no copy of alignment (calc. everything incrementally)
-// Returns the score ratio for moving target position j to new_i.
-// The non-distortion part comes from transpair_model3::scoreOfMove (called
-// with its last argument 0), the distortion part from the ratio of the
-// distortion-only probabilities (distortionType 2) after and before the move.
-// A negative thisValue means the "before" value must be computed here.
-// WARNING: the const alignment is temporarily modified through const_cast and
-// restored before returning; the statement order below must be kept.
-LogProb transpair_model4::scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double thisValue)const
-{
- // Moving j onto the word it is already aligned to changes nothing.
- if( a(j)==new_i )
- return 1.0;
- LogProb change=transpair_model3::scoreOfMove(a,new_i,j,-1.0,0);
- LogProb a_prob=thisValue;
- if(a_prob<0.0 )
- a_prob=prob_of_target_and_alignment_given_source(a,2);
- massert(a_prob==prob_of_target_and_alignment_given_source(a,2));
- // Remember the old word so the alignment can be restored below.
- WordIndex old_i=a(j);
- //alignment b(a);
- // Apply the move in place, score it, then undo it.
- const_cast<alignment&>(a).set(j,new_i);
- LogProb b_prob=prob_of_target_and_alignment_given_source(a,2);
- const_cast<alignment&>(a).set(j,old_i);
- change*=b_prob/a_prob;
- return change;
-}
-//increasing efficiency: no copy of alignment (calc. everything incrementally)
-// Returns the score ratio for exchanging the words aligned to j1 and j2.
-// The non-distortion part comes from transpair_model3::scoreOfSwap (called
-// with its last argument 0), the distortion part from the ratio of the
-// distortion-only probabilities (distortionType 2) after and before the swap.
-// A negative thisValue means the "before" value must be computed here.
-// WARNING: the const alignment is temporarily modified through const_cast and
-// restored afterwards; the statement order below must be kept.
-LogProb transpair_model4::scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double thisValue)const
-{
- WordIndex aj1=a(j1),aj2=a(j2);
- // Both positions point at the same word: swapping changes nothing.
- if( aj1==aj2 )
- return 1.0;
- LogProb change=transpair_model3::scoreOfSwap(a,j1,j2,-1.0,0);
- LogProb a_prob=thisValue;
- if( a_prob<0.0 )
- a_prob=prob_of_target_and_alignment_given_source(a,2);
- massert(a_prob==prob_of_target_and_alignment_given_source(a,2));
-
- //alignment b(a);
- // Apply the swap in place, score it, then restore the original words.
- const_cast<alignment&>(a).set(j1,aj2);
- const_cast<alignment&>(a).set(j2,aj1);
- LogProb b_prob=prob_of_target_and_alignment_given_source(a,2);
- const_cast<alignment&>(a).set(j1,aj1);
- const_cast<alignment&>(a).set(j2,aj2);
-
- if( verboseTP )
- cerr << "scoreOfSwap: " << change << ' ' << a_prob << ' ' << b_prob << ' ' << endl;
- change*=b_prob/a_prob;
- // In verbose mode, also print the value from the reference implementation.
- if( verboseTP )
- cerr << "resulting: " << change << " should be " << _scoreOfSwap(a,j1,j2) << endl;
- return change;
-}
-
-// Non-distortion part of the Model 4 score of alignment al: the p1 term for
-// word 0, the per-slot factor for word 0's fertility, the fertility values of
-// words 1..l (without factorial) and the translation value of every target
-// position. With verb set, each intermediate product is traced to cerr.
-LogProb transpair_model4::prob_of_target_and_alignment_given_source_1(const alignment&al,bool verb)const
-{
-  LogProb score = 1.0 ;
-  score *= pow(double(1-p1), m-2.0 * al.fert(0)) * pow(double(p1), double(al.fert(0)));
-  if( verb )
-    cerr << "IBM-4: (1-p1)^(m-2 f0)*p1^f0: " << score << endl;
-
-  for (WordIndex k = 1 ; k <= al.fert(0) ; ++k)
-    score *= double(m - al.fert(0) - k + 1) / (double(DeficientDistortionForEmptyWord?(max(2,int(m))/DeficientDistortionForEmptyWord):k)) ;
-  if( verb )
-    cerr << "IBM-4: +NULL:binomial+distortion " << score << endl;
-
-  for (WordIndex ei = 1 ; ei <= l ; ++ei)
-  {
-    score *= get_fertility(ei, al.fert(ei));// * (LogProb) factorial(al.fert(ei));
-    if( verb )
-      cerr << "IBM-4: fertility of " << ei << " " << get_fertility(ei, al.fert(ei)) << " -> " << score << endl;
-  }
-
-  for (WordIndex fj = 1 ; fj <= m ; ++fj)
-  {
-    score *= get_t(al(fj), fj) ;
-    if( verb )
-      cerr << "IBM-4: t of j:" << fj << " i:" << al(fj) << ": " << get_t(al(fj), fj) << " -> " << score << endl;
-  }
-  return score;
-}
-
-// Model 4 score of alignment al. distortionType is a bit mask: bit 1 adds the
-// non-distortion part (prob_of_target_and_alignment_given_source_1), bit 2
-// adds the distortion part read from the probFirst/probSecond caches.
-// With verb set, intermediate products are traced to cerr.
-// A zero result is replaced by 1E-299.
-LogProb transpair_model4::prob_of_target_and_alignment_given_source(const alignment&al, short distortionType,bool verb)const
-{
- LogProb total = 1.0 ;
- static const LogProb almostZero = 1E-299 ;
- if( distortionType&1 )
- {
- total *= prob_of_target_and_alignment_given_source_1(al,verb);
- }
- if( distortionType&2 )
- {
- // Positions aligned to word 0 contribute no distortion factor.
- // The else below pairs with the inner if (head test), not with if( al(j) ).
- for(WordIndex j=1;j<=m;j++)
- if( al(j) )
- if( al.get_head(al(j))==j)
- {
- // j is the head position of its word: use the table of the previous
- // cept ep, indexed by j and that cept's center.
- int ep=al.prev_cept(al(j));
- float x2=probFirst[ep](j,al.get_center(ep));
- massert(x2<=1.0);
- total*=x2;
- if( verb) cerr << "IBM-4: d=1 of " << j << ": " << x2 << " -> " << total << endl;
- }
- else
- {
- // j is a later position of its word: indexed by j and the previous
- // position in the same cept.
- float x2=probSecond(j,al.prev_in_cept(j));
- massert(x2<=1.0);
- total*=x2;
- if( verb) cerr << "IBM-4: d>1 of " << j << ": " << x2 << " -> " << total << endl;
- }
- }
- return total?total:almostZero;
-}
-
-// Appends four partial Model 4 scores of alignment al to d, in this order:
-// word-0 term, fertility term (without factorial), translation term and
-// distortion term. The output parameter d shadows the inherited table d.
-void transpair_model4::computeScores(const alignment&al,vector<double>&d)const
-{
- LogProb total1 = 1.0,total2=1.0,total3=1.0,total4=1.0 ;
- // total1: p1 term and per-slot factor for word 0.
- total1 *= pow(double(1-p1), m-2.0 * al.fert(0)) * pow(double(p1), double(al.fert(0)));
- for (WordIndex i = 1 ; i <= al.fert(0) ; i++)
- total1 *= double(m - al.fert(0) - i + 1) / (double(DeficientDistortionForEmptyWord?(max(2,int(m))/DeficientDistortionForEmptyWord):i)) ;
- // total2: fertility values of words 1..l.
- for (WordIndex i = 1 ; i <= l ; i++)
- total2 *= get_fertility(i, al.fert(i));// * (LogProb) factorial(al.fert(i));
- // total3: translation values of all target positions.
- for (WordIndex j = 1 ; j <= m ; j++)
- total3*= get_t(al(j), j) ;
- // total4: distortion values; the else pairs with the inner if (head test).
- for(WordIndex j=1;j<=m;j++)
- if( al(j) )
- if( al.get_head(al(j))==j)
- {
- int ep=al.prev_cept(al(j));
- float x2=probFirst[ep](j,al.get_center(ep));
- total4*=x2;
- }
- else
- {
- float x2=probSecond(j,al.prev_in_cept(j));
- total4*=x2;
- }
- d.push_back(total1);//9
- d.push_back(total2);//10
- d.push_back(total3);//11
- d.push_back(total4);//12
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9475d943/ext/giza-pp/GIZA++-v2/transpair_model4.h
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/transpair_model4.h b/ext/giza-pp/GIZA++-v2/transpair_model4.h
deleted file mode 100644
index 730fbe7..0000000
--- a/ext/giza-pp/GIZA++-v2/transpair_model4.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
-
-Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
-
-This file is part of GIZA++ ( extension of GIZA ).
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-#ifndef transpair_model4_h_fjo_defined
-#define transpair_model4_h_fjo_defined
-#include "Array2.h"
-#include "defs.h"
-#include "Vector.h"
-#include "NTables.h"
-#include "ATables.h"
-#include "TTables.h"
-#include "alignment.h"
-#include "D4Tables.h"
-#include "transpair_model3.h"
-
-extern double factorial(int n);
-
-// Translation pair for Model 4: on top of transpair_model3 it caches, per
-// sentence pair, the distortion values obtained from a d4model.
-class transpair_model4 : public transpair_model3
-{
- private:
- // Distortion model the caches are filled from (held by reference).
- d4model&d4m;
- // probSecond(j1, j2): value of d4m.getProb_bigger for j2 < j1; other cells stay 0.0.
- Array2<double> probSecond;
- // probFirst[i](j1, j2): value of d4m.getProb_first_withiterator for word index i.
- Vector<Array2<double> > probFirst;
- public:
- typedef transpair_model3 simpler_transpair_model;
- // Builds the Model 3 base and fills both caches. _d4m is dereferenced
- // unconditionally, so it must not be null.
- transpair_model4(const Vector<WordIndex>&es, const Vector<WordIndex>&fs, tmodel<COUNT, PROB>&tTable, amodel<PROB>&aTable, amodel<PROB>&dTable, nmodel<PROB>&nTable, double _p1, double _p0,d4model*_d4m)
- : transpair_model3(es, fs, tTable, aTable, dTable, nTable, _p1, _p0),
- d4m(*_d4m),probSecond(m+1,m+1,0.0),probFirst(l+1)
- {
- // Only the cells with j2 < j1 are filled.
- for(unsigned int j1=1;j1<=m;++j1)
- for(unsigned int j2=1;j2<j1;++j2)
- {
- probSecond(j1,j2)=d4m.getProb_bigger(j1,j2,0,d4m.fwordclasses.getClass(get_fs(j1)),l,m);
- }
- for(unsigned int i=0;i<=l;++i)
- {
- Array2<double> &pf=probFirst[i]=Array2<double>(m+1,m+1,0.0);
- for(unsigned int j1=1;j1<=m;++j1)
- {
- // The iterator is looked up once per (i, j1) and reused for every j2.
- map<m4_key,d4model::Vpff,compare1 >::const_iterator ci=d4m.getProb_first_iterator(d4m.ewordclasses.getClass(get_es(i)),d4m.fwordclasses.getClass(get_fs(j1)),l,m);
- for(unsigned int j2=0;j2<=m;++j2)
- {
- pf(j1,j2)=d4m.getProb_first_withiterator(j1,j2,m,ci);
- // Cross-check against the lookup that does not use the iterator.
- massert(pf(j1,j2)==d4m.getProb_first(j1,j2,d4m.ewordclasses.getClass(get_es(i)),d4m.fwordclasses.getClass(get_fs(j1)),l,m));
- }
- }
- }
- }
- // Non-distortion part of the score.
- LogProb prob_of_target_and_alignment_given_source_1(const alignment&al,bool verb)const;
- // Distortion-only score (distortionType 2) of a.
- LogProb scoreOfAlignmentForChange(const alignment&a)const
- {return prob_of_target_and_alignment_given_source(a,2); }
- // Incremental score ratios for moving j to new_i / swapping j1 and j2.
- LogProb scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double thisValue=-1.0)const;
- LogProb scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double thisValue=-1.0)const ;
- // Same ratios computed from two full probability evaluations.
- LogProb _scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double thisValue=-1.0)const;
- LogProb _scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double thisValue=-1.0)const ;
- int modelnr()const{return 4;}
- // Score of al; distortionType is a bit mask (1: non-distortion part,
- // 2: distortion part), verb enables tracing to cerr.
- LogProb prob_of_target_and_alignment_given_source(const alignment&al, short distortionType=3,bool verb=0)const;
- // Appends four partial scores of al to d.
- void computeScores(const alignment&al,vector<double>&d)const;
-};
-#endif
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9475d943/ext/giza-pp/GIZA++-v2/transpair_model5.cpp
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/transpair_model5.cpp b/ext/giza-pp/GIZA++-v2/transpair_model5.cpp
deleted file mode 100644
index 7baa5ca..0000000
--- a/ext/giza-pp/GIZA++-v2/transpair_model5.cpp
+++ /dev/null
@@ -1,243 +0,0 @@
-/*
-
-Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
-
-This file is part of GIZA++ ( extension of GIZA ).
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-#include "transpair_model5.h"
-#include "Parameter.h"
-
-// Global counters, both starting at zero; they are not touched in this file.
-// NOTE(review): presumably hit/miss statistics for a Model 5 score cache -- confirm.
-int m5scorefound=0,m5scorenotfound=0;
-
-// Declares the float parameter d5modelsmooth_factor under the option name
-// "model5SmoothFactor" with the value 0.1 as last argument.
-// NOTE(review): GLOBAL_PARAMETER is defined elsewhere (presumably Parameter.h);
-// the last argument is presumably the default value -- confirm.
-GLOBAL_PARAMETER(float,d5modelsmooth_factor,"model5SmoothFactor","smooting parameter for distortion probabilities in Model 5 (linear interpolation with constant)",PARLEV_SMOOTH,0.1);
-// Plain global (not registered through GLOBAL_PARAMETER), initialised to 0.0.
-float d5modelsmooth_countoffset=0.0;
-
-// Reference implementation of the Model 5 move score. Delegates to
-// transpair_model4 when doModel4Scoring is set; otherwise evaluates the full
-// probability before and after moving j to new_i (on a copy of the alignment)
-// and returns their ratio. If the original probability is zero the result is
-// 1e20 when the moved one is non-zero and 1.0 otherwise. The unnamed double
-// parameter is ignored.
-LogProb transpair_model5::_scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double)const
-{
-  if( doModel4Scoring )
-    return transpair_model4::_scoreOfMove(a,new_i,j);
-
-  alignment moved(a);
-  moved.set(j, new_i);
-  LogProb before=prob_of_target_and_alignment_given_source(a);
-  LogProb after=prob_of_target_and_alignment_given_source(moved);
-  if( before )
-    return after/before;
-  if( after )
-    return 1e20;
-  return 1.0;
-}
-// Reference implementation of the Model 5 swap score. Delegates to
-// transpair_model4 when doModel4Scoring is set; otherwise evaluates the full
-// probability before and after exchanging the words of j1 and j2 (on a copy
-// of the alignment) and returns their ratio. Both probabilities are asserted
-// to be non-zero; the zero handling afterwards only matters when assert is
-// compiled out.
-LogProb transpair_model5::_scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double thisValue)const
-{
-  if( doModel4Scoring )
-    return transpair_model4::_scoreOfSwap(a,j1,j2,thisValue);
-
-  alignment swapped(a);
-  swapped.set(j1, a(j2));
-  swapped.set(j2, a(j1));
-  LogProb before=prob_of_target_and_alignment_given_source(a);
-  LogProb after=prob_of_target_and_alignment_given_source(swapped);
-  assert(before);
-  assert(after);
-  if( before )
-    return after/before;
-  if( after )
-    return 1e20;
-  return 1.0;
-}
-
-//increasing efficiency: no copy of alignment (calc. everything incrementally)
-// Returns the score ratio for moving target position j to new_i.
-// Delegates to transpair_model4 when doModel4Scoring is set. Otherwise the
-// non-distortion part is computed in closed form below and multiplied by the
-// ratio of the distortion-only probabilities (distortionType 2) of the moved
-// and the original alignment. A negative thisValue means the "before" value
-// must be computed here.
-// NOTE(review): despite the comment above, this version does copy the
-// alignment (b).
-LogProb transpair_model5::scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double thisValue)const
-{
- if( doModel4Scoring )
- return transpair_model4::scoreOfMove(a,new_i,j,thisValue);
- alignment b(a);
- b.set(j,new_i);
-
- LogProb change;
- const WordIndex old_i=a(j);
- // f0: current fertility of word 0.
- WordIndex f0=a.fert(0);
- // Case 1: nothing moves.
- if (old_i == new_i)
- change=1.0;
- // Case 2: j leaves word 0 and attaches to new_i.
- else if (old_i == 0)
- change=((double)p0*p0/p1) *
- ((f0*(m-f0+1.0)) / ((m-2*f0+1)*(m-2*f0+2.0))) *
- ((PROB)(1.0)) *
- (get_fertility(new_i, a.fert(new_i)+1) / get_fertility(new_i, a.fert(new_i)))*
- (t(new_i, j)/t(old_i, j))*
- 1.0;
- // Case 3: j leaves old_i and attaches to word 0.
- else if (new_i == 0)
- change=(double(p1) / (p0*p0)) *
- (double((m-2*f0)*(m-2*f0-1))/((1+f0)*(m-f0))) *
- (1.0) *
- (get_fertility(old_i, a.fert(old_i)-1) /get_fertility(old_i, a.fert(old_i)))*
- (t(new_i, j) /t(old_i, j)) *
- (1.0);
- // Case 4: both words are non-zero; only fertility and translation ratios.
- else
- change=(1.0) *
- (get_fertility(old_i,a.fert(old_i)-1) / get_fertility(old_i,a.fert(old_i))) *
- (get_fertility(new_i,a.fert(new_i)+1) /get_fertility(new_i,a.fert(new_i))) *
- (t(new_i,j)/t(old_i,j)) *
- (1.0);
- LogProb a_prob=thisValue;
- if( a_prob<0.0 )
- a_prob=prob_of_target_and_alignment_given_source(a,2);
- massert(a_prob==prob_of_target_and_alignment_given_source(a,2));
-
- // Distortion-only probability of the moved alignment.
- LogProb b_prob=prob_of_target_and_alignment_given_source(b,2);
- change*=b_prob/a_prob;
- return change;
-}
-// Returns the score ratio for exchanging the words aligned to j1 and j2.
-// Delegates to transpair_model4 when doModel4Scoring is set. Otherwise the
-// non-distortion part comes from transpair_model3::scoreOfSwap (called with
-// its last argument 0) and is multiplied by the ratio of the distortion-only
-// probabilities (distortionType 2) of the swapped and the original alignment.
-// A negative thisValue means the "before" value must be computed here.
-LogProb transpair_model5::scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double thisValue)const
-{
-  if( doModel4Scoring )
-    return transpair_model4::scoreOfSwap(a,j1,j2,thisValue);
-
-  alignment swapped(a);
-  swapped.set(j1,a(j2));
-  swapped.set(j2,a(j1));
-
-  LogProb ratio=transpair_model3::scoreOfSwap(a,j1,j2,-1.0,0);
-
-  LogProb before=thisValue;
-  if( before<0.0 )
-    before=prob_of_target_and_alignment_given_source(a,2);
-  massert(before==prob_of_target_and_alignment_given_source(a,2));
-
-  LogProb after=prob_of_target_and_alignment_given_source(swapped,2);
-  ratio*=after/before;
-  return ratio;
-}
-
-// Model 5 score of alignment al. distortionType is a bit mask: bit 1 adds the
-// word-0, fertility and translation factors, bit 2 adds the vacancy-based
-// distortion factors obtained from d5m. With verb set, each intermediate
-// product is traced to cerr. A zero result is replaced by 1E-299 (the score
-// functions in this file divide by the returned value).
-// When doModel4Scoring is set the computation is delegated to
-// transpair_model4; the verb flag is now forwarded as well (it used to be
-// dropped, which silently disabled tracing on that path).
-LogProb transpair_model5::prob_of_target_and_alignment_given_source(const alignment&al, short distortionType,bool verb)const
-{
-  if( doModel4Scoring )
-    return transpair_model4::prob_of_target_and_alignment_given_source(al,distortionType,verb);
-  LogProb total = 1.0 ;
-  static const LogProb almostZero = 1E-299 ;
-  double x2;
-  if( distortionType&1 )
-    {
-      total *= pow(double(1-p1), m-2.0 * al.fert(0)) * pow(double(p1), double(al.fert(0)));
-      if( verb) cerr << "IBM-5: (1-p1)^(m-2 f0)*p1^f0: " << total << endl;
-      for (WordIndex i = 1 ; i <= al.fert(0) ; i++)
-        total *= double(m - al.fert(0) - i + 1) / i ; // IBM-5 is not deficient!
-      if( verb) cerr << "IBM-5: +NULL:binomial+distortion " << total << endl;
-      for (WordIndex i = 1 ; i <= l ; i++)
-        {
-          total *= get_fertility(i, al.fert(i));
-          if( verb) cerr << "IBM-5: fertility of " << i << " " << get_fertility(i, al.fert(i)) << " -> " << total << endl;
-        }
-      for (WordIndex j = 1 ; j <= m ; j++)
-        {
-          total*= get_t(al(j), j) ;
-          if( verb) cerr << "IBM-5: t of j:" << j << " i:" << al(j) << ": " << get_t(al(j), j) << " -> " << total << endl;
-        }
-    }
-  if( distortionType&2 )
-    {
-      // prev_cept: last word seen so far that has at least one target position.
-      PositionIndex prev_cept=0;
-      // vac_all: number of target positions not yet marked as filled.
-      PositionIndex vac_all=m;
-      // vac[j] becomes 1 once target position j has been placed.
-      Vector<char> vac(m+1,0);
-      for(WordIndex i=1;i<=l;i++)
-        {
-          PositionIndex cur_j=al.als_i[i];
-          PositionIndex prev_j=0;
-          // k counts the positions of word i processed so far.
-          PositionIndex k=0;
-          if(cur_j) { // process first word of cept
-            k++;
-            // previous position
-            total*= (x2=d5m.getProb_first(vacancies(vac,cur_j),vacancies(vac,al.get_center(prev_cept)),d5m.fwordclasses.getClass(get_fs(cur_j)),l,m,vac_all-al.fert(i)+k));
-
-            vac_all--;
-            assert(vac[cur_j]==0);
-            vac[cur_j]=1;
-
-            if( verb) cerr << "IBM-5: d=1 of " << cur_j << ": " << x2 << " -> " << total << endl;
-            prev_j=cur_j;
-            cur_j=al.als_j[cur_j].next;
-          }
-          while(cur_j) { // process following words of cept
-            k++;
-            // previous position
-            int vprev=vacancies(vac,prev_j);
-            total*= (x2=d5m.getProb_bigger(vacancies(vac,cur_j),vprev,d5m.fwordclasses.getClass(get_fs(cur_j)),l,m,vac_all-vprev/*war weg*/-al.fert(i)+k));
-
-            vac_all--;
-            vac[cur_j]=1;
-
-            if( verb) cerr << "IBM-5: d>1 of " << cur_j << ": " << x2 << " -> " << total << endl;
-            prev_j=cur_j;
-            cur_j=al.als_j[cur_j].next;
-          }
-          assert(k==al.fert(i));
-          if( k )
-            prev_cept=i;
-        }
-      // Whatever is still vacant must be exactly the positions of word 0.
-      assert(vac_all==al.fert(0));
-    }
-  total = total?total:almostZero;
-  return total;
-}
-
-
-// Appends four partial Model 5 scores of alignment al to d, in this order:
-// word-0 term, fertility term, translation term and vacancy-based distortion
-// term. Unlike prob_of_target_and_alignment_given_source, this function does
-// not look at doModel4Scoring.
-void transpair_model5::computeScores(const alignment&al,vector<double>&d)const
-{
- LogProb total1 = 1.0,total2=1.0,total3=1.0,total4=1.0 ;
- // total1: p1 term and per-slot factor for word 0.
- total1 *= pow(double(1-p1), m-2.0 * al.fert(0)) * pow(double(p1), double(al.fert(0)));
- for (WordIndex i = 1 ; i <= al.fert(0) ; i++)
- total1 *= double(m - al.fert(0) - i + 1) / i ; // IBM-5 is not deficient!
- // total2: fertility values of words 1..l.
- for (WordIndex i = 1 ; i <= l ; i++)
- total2 *= get_fertility(i, al.fert(i));
- // total3: translation values of all target positions.
- for (WordIndex j = 1 ; j <= m ; j++)
- total3*= get_t(al(j), j) ;
- // total4: distortion values. vac[j] is set to 1 once position j is placed;
- // vac_all counts the positions still vacant.
- PositionIndex prev_cept=0;
- PositionIndex vac_all=m;
- Vector<char> vac(m+1,0);
- for(WordIndex i=1;i<=l;i++)
- {
- PositionIndex cur_j=al.als_i[i];
- PositionIndex prev_j=0;
- // k counts the positions of word i processed so far.
- PositionIndex k=0;
- if(cur_j) { // process first word of cept
- k++;
- total4*=d5m.getProb_first(vacancies(vac,cur_j),vacancies(vac,al.get_center(prev_cept)),d5m.fwordclasses.getClass(get_fs(cur_j)),l,m,vac_all-al.fert(i)+k);
- vac_all--;
- assert(vac[cur_j]==0);
- vac[cur_j]=1;
- prev_j=cur_j;
- cur_j=al.als_j[cur_j].next;
- }
- while(cur_j) { // process following words of cept
- k++;
- int vprev=vacancies(vac,prev_j);
- total4*=d5m.getProb_bigger(vacancies(vac,cur_j),vprev,d5m.fwordclasses.getClass(get_fs(cur_j)),l,m,vac_all-vprev/*war weg*/-al.fert(i)+k);
- vac_all--;
- vac[cur_j]=1;
- prev_j=cur_j;
- cur_j=al.als_j[cur_j].next;
- }
- assert(k==al.fert(i));
- // Only words with at least one position become the "previous cept".
- if( k )
- prev_cept=i;
- }
- // Whatever is still vacant must be exactly the positions of word 0.
- assert(vac_all==al.fert(0));
- d.push_back(total1);//13
- d.push_back(total2);//14
- d.push_back(total3);//15
- d.push_back(total4);//16
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9475d943/ext/giza-pp/GIZA++-v2/transpair_model5.h
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/transpair_model5.h b/ext/giza-pp/GIZA++-v2/transpair_model5.h
deleted file mode 100644
index 5ecf49d..0000000
--- a/ext/giza-pp/GIZA++-v2/transpair_model5.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
-
-Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
-
-This file is part of GIZA++ ( extension of GIZA ).
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-#ifndef transpair_model5_h_fjo_defined
-#define transpair_model5_h_fjo_defined
-#include "Array2.h"
-#include "defs.h"
-#include "Vector.h"
-#include "NTables.h"
-#include "ATables.h"
-#include "TTables.h"
-#include "alignment.h"
-#include "D5Tables.h"
-#include "transpair_model4.h"
-
-extern double factorial(int n);
-
-// Counts the zero entries of vac at indices 1..u (index 0 is skipped).
-// The elements are read through raw pointers, so u must stay below the
-// vector's size; nothing here checks it.
-inline int vacancies(const Vector<char>&vac,int u)
-{
-  const char *const base=&(vac[0]);
-  int freeCount=0;
-  for(const char *p=base+1;p<base+u+1;++p)
-    {
-      if( *p==0 )
-        ++freeCount;
-    }
-  return freeCount;
-}
-
-// Translation pair for Model 5: uses a d5model for the distortion part and
-// can fall back to the inherited Model 4 scoring via doModel4Scoring.
-class transpair_model5 : public transpair_model4
-{
- private:
- // Distortion model used by the Model 5 scoring functions (held by reference).
- const d5model&d5m;
- // When set, the scoring functions delegate to transpair_model4. The
- // constructor below initialises it to 0 and nothing in this class changes it.
- bool doModel4Scoring;
- public:
- typedef transpair_model3 simpler_transpair_model;
- // NOTE(review): not referenced by the member functions declared here;
- // presumably a score cache used by other code -- confirm.
- mutable map<Vector<PositionIndex>,LogProb> scores[4];
- // Builds the Model 4 base from the d4m member of *_d5m and keeps a reference
- // to *_d5m. _d5m is dereferenced unconditionally, so it must not be null.
- transpair_model5(const Vector<WordIndex>&es, const Vector<WordIndex>&fs, tmodel<COUNT, PROB>&tTable,
- amodel<PROB>&aTable, amodel<PROB>&dTable, nmodel<PROB>&nTable, double _p1, double _p0,
- const d5model*_d5m)
- : transpair_model4(es, fs, tTable, aTable, dTable, nTable, _p1, _p0,&_d5m->d4m),d5m(*_d5m),doModel4Scoring(0) {}
- // Distortion-only score (distortionType 2) of a, from Model 4 or Model 5
- // depending on doModel4Scoring.
- LogProb scoreOfAlignmentForChange(const alignment&a)const
- {
- if( doModel4Scoring )
- return transpair_model4::prob_of_target_and_alignment_given_source(a,2);
- else
- return prob_of_target_and_alignment_given_source(a,2);
- }
- // Incremental score ratios for moving j to new_i / swapping j1 and j2.
- LogProb scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double thisValue=-1.0)const;
- LogProb scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double thisValue=-1.0)const ;
- // Same ratios computed from two full probability evaluations.
- LogProb _scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double thisValue=-1.0)const;
- LogProb _scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double thisValue=-1.0)const ;
- int modelnr()const{return 5;}
- // Score of al; distortionType is a bit mask (1: non-distortion part,
- // 2: distortion part), verb enables tracing to cerr.
- LogProb prob_of_target_and_alignment_given_source(const alignment&al, short distortionType=3,bool verb=0)const;
- // Appends four partial scores of al to d.
- void computeScores(const alignment&al,vector<double>&d)const;
-};
-#endif
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9475d943/ext/giza-pp/GIZA++-v2/transpair_modelhmm.h
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/transpair_modelhmm.h b/ext/giza-pp/GIZA++-v2/transpair_modelhmm.h
deleted file mode 100644
index d836ad4..0000000
--- a/ext/giza-pp/GIZA++-v2/transpair_modelhmm.h
+++ /dev/null
@@ -1,223 +0,0 @@
-/*
-
-Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
-
-This file is part of GIZA++ ( extension of GIZA ).
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-#ifndef transpair_modelhmm_h_fjo_defined
-#define transpair_modelhmm_h_fjo_defined
-#include "Array2.h"
-#include "defs.h"
-#include "Vector.h"
-#include "NTables.h"
-#include "ATables.h"
-#include "TTables.h"
-#include "alignment.h"
-#include <cmath>
-#include "transpair_model2.h"
-#include "ForwardBackward.h"
-#include "hmm.h"
-
-class transpair_modelhmm : public transpair_model2
-{
- public:
- typedef transpair_modelhmm simpler_transpair_model;
- HMMNetwork*net;
- transpair_modelhmm(const Vector<WordIndex>&es, const Vector<WordIndex>&fs, const tmodel<COUNT, PROB>&tTable,
- const amodel<PROB>&aTable,const amodel<PROB>&,const nmodel<PROB>&,
- double, double,const hmm*h)
- : transpair_model2(es,fs,tTable,aTable),net(h->makeHMMNetwork(es,fs,0))
- {}
- ~transpair_modelhmm() { delete net; }
- int modelnr()const{return 6;}
- LogProb scoreOfMove(const alignment&a, WordIndex _new_i, WordIndex j,double=-1.0)const
- {
- int new_i=_new_i;
- LogProb change=1.0;
- int old_i=a(j);
- if (old_i == new_i)
- change=1.0;
- else
- {
- int theJ=j-1;
- old_i--;
- new_i--;
- int jj=j-1;
- while(jj>0&&a(jj)==0)
- jj--;
- int theIPrev= (jj>0)?(a(jj)-1):0;
- if( j>1&&a(j-1)==0 )
- theIPrev+=l;
- if( old_i==-1 ){old_i = theIPrev;if(old_i<int(l))old_i+=l;}
- if( new_i==-1 ){new_i = theIPrev;if(new_i<int(l))new_i+=l;}
- int theIPrevOld=theIPrev,theIPrevNew=theIPrev;
- if( theJ==0 )
- {
- change*=net->getAlphainit(new_i)/net->getAlphainit(old_i);
- }
- do
- {
- if( new_i!=old_i )
- {
- change*=net->nodeProb(new_i,theJ)/net->nodeProb(old_i,theJ);
- }
- if( theJ>0)
- change*=net->outProb(theJ,theIPrevNew,new_i)/net->outProb(theJ,theIPrevOld,old_i);
- theIPrevOld=old_i;
- theIPrevNew=new_i;
- theJ++;
- if( theJ<int(m) && a(theJ+1)==0 )
- {
- if( new_i<int(l)) new_i+=l;
- if( old_i<int(l)) old_i+=l;
- }
- } while( theJ<int(m) && a(theJ+1)==0 );
- if(theJ==int(m))
- {
- change*=net->getBetainit(new_i)/net->getBetainit(old_i);
- }
- else
- {
- new_i=a(theJ+1)-1;
- if( new_i==-1)
- new_i=theIPrevNew;
- change*=net->outProb(theJ,theIPrevNew,new_i)/net->outProb(theJ,theIPrevOld,new_i);
- }
- }
- return change;
- }
- LogProb scoreOfAlignmentForChange(const alignment&)const
- {return -1.0; }
- LogProb scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double=-1.0)const
- {
- return _scoreOfSwap(a,j1,j2);
- }
- LogProb _scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double=-1.0)const
- {
- alignment b(a);
- b.set(j, new_i);
- LogProb a_prob=prob_of_target_and_alignment_given_source(a);
- LogProb b_prob=prob_of_target_and_alignment_given_source(b);
- if( a_prob )
- return b_prob/a_prob;
- else if( b_prob )
- return 1e20;
- else
- return 1.0;
- }
- LogProb _scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double=-1.0)const
- {
- WordIndex aj1=a(j1),aj2=a(j2);
- if( aj1==aj2 )
- return 1.0;
- LogProb a_prob=prob_of_target_and_alignment_given_source(a);
-
- /*alignment b(a);
- b.set(j1, a(j2));
- b.set(j2, a(j1));
- LogProb b_prob=prob_of_target_and_alignment_given_source(b);*/
-
- const_cast<alignment&>(a).set(j1,aj2);
- const_cast<alignment&>(a).set(j2,aj1);
- LogProb b_prob=prob_of_target_and_alignment_given_source(a);
- const_cast<alignment&>(a).set(j1,aj1);
- const_cast<alignment&>(a).set(j2,aj2);
-
- if( a_prob )
- return b_prob/a_prob;
- else if( b_prob )
- return 1e20;
- else
- return 1.0;
- }
- inline friend ostream&operator<<(ostream&out, const transpair_modelhmm&)
- {
- return out << "NO-OUTPUT for transpair_modelhmm\n";
- }
- LogProb prob_of_target_and_alignment_given_source(const alignment&al,bool verbose=0)const
- {
- double prob=1.0;
- int theIPrev=0;
- for(unsigned int j=1;j<=m;j++)
- {
- int theJ=j-1;
- int theI=al(j)-1;
- if( theI==-1 )
- theI=(theIPrev%l)+l;
- prob*=net->nodeProb(theI,theJ);
- if( verbose )
- cout << "NP " << net->nodeProb(theI,theJ) << ' ';
- if( j==1 )
- {
- prob*=net->getAlphainit(theI);
- if( verbose )
- cout << "AP0 " << net->getAlphainit(theI) << ' ';
- }
- else
- {
- prob*=net->outProb(theJ,theIPrev,theI);
- if( verbose )
- cout << "AP1 " << net->outProb(theJ,theIPrev,theI) << ' ';
- }
- theIPrev=theI;
- if( j==m )
- {
- prob*=net->getBetainit(theI);
- if( verbose )
- cout << "AP2 " << net->getBetainit(theI) << ' ';
- }
- if( verbose )
- cout << "j:"<<theJ<<" i:"<<theI << "; ";
- }
- if( verbose )
- cout << '\n';
- return prob*net->finalMultiply;
- }
- void computeScores(const alignment&al,vector<double>&d)const
- {
- double prob1=1.0,prob2=1.0;
- int theIPrev=0;
- for(unsigned int j=1;j<=m;j++)
- {
- int theJ=j-1;
- int theI=al(j)-1;
- if( theI==-1 )
- theI=(theIPrev%l)+l;
- prob1*=net->nodeProb(theI,theJ);
- if( j==1 )
- {
- prob2*=net->getAlphainit(theI);
- }
- else
- {
- prob2*=net->outProb(theJ,theIPrev,theI);
- }
- theIPrev=theI;
- if( j==m )
- {
- prob2*=net->getBetainit(theI);
- }
- }
- d.push_back(prob1);
- d.push_back(prob2);
- }
-
- bool isSubOptimal()const{return 0;}
-};
-#endif
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9475d943/ext/giza-pp/GIZA++-v2/utility.cpp
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/utility.cpp b/ext/giza-pp/GIZA++-v2/utility.cpp
deleted file mode 100644
index 4e9607a..0000000
--- a/ext/giza-pp/GIZA++-v2/utility.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
-
-EGYPT Toolkit for Statistical Machine Translation
-Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-#include "mymath.h"
-
-double factorial(int n)
-{
- double f=1;
- for(int i=2; i <= n; i++)
- f *= i;
- return f;
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9475d943/ext/giza-pp/GIZA++-v2/utility.h
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/utility.h b/ext/giza-pp/GIZA++-v2/utility.h
deleted file mode 100644
index 078a2a0..0000000
--- a/ext/giza-pp/GIZA++-v2/utility.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
-
-EGYPT Toolkit for Statistical Machine Translation
-Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-#ifndef utility_h
-#define utility_h
-#include <iostream>
-#include "Perplexity.h"
-#include "Vector.h"
-#include "TTables.h"
-#include "getSentence.h"
-#include "vocab.h"
-
-extern void printHelp(void);
-extern void parseConfigFile (char * fname );
-extern void parseArguments(int argc, char *argv[]);
-extern void generatePerplexityReport(const Perplexity& trainperp,
- const Perplexity& testperp,
- const Perplexity& trainVperp,
- const Perplexity& testVperp,
- ostream& of, int trainsize,
- int testsize, unsigned int last, bool);
-
-extern void printSentencePair(Vector<WordIndex>& es, Vector<WordIndex>& fs, ostream& of);
-
-extern void printOverlapReport(const tmodel<COUNT, PROB>& tTable,
- sentenceHandler& testHandler, vcbList& trainEList,
- vcbList& trainFList, vcbList& testEList, vcbList& testFList);
-
-extern void printAlignToFile(const Vector<WordIndex>& es, const Vector<WordIndex>& fs,
- const Vector<WordEntry>& evlist, const Vector<WordEntry>& fvlist,
- ostream& of2, const Vector<WordIndex>& viterbi_alignment, int pair_no,
- double viterbi_score);
-
-extern double factorial(int) ;
-
-#endif
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9475d943/ext/giza-pp/GIZA++-v2/vocab.cpp
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/vocab.cpp b/ext/giza-pp/GIZA++-v2/vocab.cpp
deleted file mode 100644
index a91c572..0000000
--- a/ext/giza-pp/GIZA++-v2/vocab.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
-
-EGYPT Toolkit for Statistical Machine Translation
-Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-#include "vocab.h"
-
-void vcbList::readVocabList()
- // reads a vocabulary file from fname. It expects the following format:
- //
- // token_id token_string frequency
-{
-
- int freq=0;
- WordIndex word_id ;
- WordEntry entry("NULL",0) ;
-
- string line, word ;
- cerr << "Reading vocabulary file from:" << fname << "\n";
- // total = 0 ;
- ifstream vFile(fname);
- if(!vFile){
- cerr << "\nCannot open vocabulary file " << fname << "file";
- exit(1);
- }
-
- list.push_back(entry);
- s2i[entry.word]=list.size()-1;
-
- while(getline(vFile, line)){
- istringstream buffer(line);
- if(!(buffer >> word_id >> word >> freq))
- cerr << "ERROR: reading vocabulary; " << word_id << ' ' << word << ' ' << freq << endl;
- if (word_id == 0){
- cerr << "ERROR: TOKEN ID 0 is reserved for special token NULL, in line: \n"<< line<<"\n" ;
- exit(-1);
- }
- else if (word_id >= MAX_VOCAB_SIZE){
- cerr << "ERROR: TOKEN ID is greater than maximum vocabulary size "
- << MAX_VOCAB_SIZE << " in line :\n"<< line <<"\n" ;
- exit(-1);
- }
- else if (freq < 0){
- cerr << "ERROR: frequency must be a positive integer, in line :\n"
- << line <<"\n";
- exit(-1);
- }
- else if(word_id >= list.size()){
- list.resize(word_id+1);
- list[word_id].word = word ;
- s2i[word]=word_id;
- list[word_id].freq = 0 ;
- noUniqueTokens = word_id + 1 ;
- // noUniqueTokens++ ;
- // total += freq ;
- }
- else if(list[word_id].word != "\0"){
- cerr << "ERROR: TOKEN ID must be unique for each token, in line :\n"
- << line <<"\n";
- cerr << "TOKEN ID " << word_id << " has already been assigned to: " <<
- list[word_id].word << "\n";
- exit(-1);
- }
- else { // line has valid information
- list[word_id].word = word ;
- s2i[word]=word_id;
- list[word_id].freq = 0 ;
- // noUniqueTokens++ ;
- noUniqueTokens = word_id + 1 ;
- // total += freq ;
- }
- } // end of while
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9475d943/ext/giza-pp/GIZA++-v2/vocab.h
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/vocab.h b/ext/giza-pp/GIZA++-v2/vocab.h
deleted file mode 100644
index 988edc6..0000000
--- a/ext/giza-pp/GIZA++-v2/vocab.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
-
-EGYPT Toolkit for Statistical Machine Translation
-Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-#ifndef _vocab_h
-#define _vocab_h 1
-
-#include "defs.h"
-#include "Vector.h"
-
-#include <fstream>
-#include <sstream>
-#include <map>
-
-class WordEntry {
- public:
- string word ;
- double freq ;
- WordEntry():word("\0"), freq(0){};
- WordEntry(string w, int f):word(w), freq(f){};
-};
-
-class vcbList{
- private:
- Vector<WordEntry> list ;
- map<string,int> s2i;
- double total;
- WordIndex noUniqueTokens ;
- WordIndex noUniqueTokensInCorpus ;
- const char* fname ;
- public:
- vcbList(const char* f=0):list(), total(0), noUniqueTokens(0), noUniqueTokensInCorpus(0), fname(f){};
- void setName(const char*f)
- { fname=f; }
- vcbList(const vcbList& a):list(a.list), total(a.total), noUniqueTokens(a.noUniqueTokens), noUniqueTokensInCorpus(0), fname(a.fname){};
- inline WordIndex size()const {return (list.size());};
- inline WordIndex uniqTokens()const {return noUniqueTokens;};
- inline WordIndex uniqTokensInCorpus()const {return noUniqueTokensInCorpus;};
- inline double totalVocab() const {return total;};
- inline Vector<WordEntry>& getVocabList() { return(list);};
- inline const Vector<WordEntry>& getVocabList()const { return(list);};
- void readVocabList();
- void incFreq(WordIndex id , double f){
- if(id < list.size()){
- if (list[id].freq == 0)
- noUniqueTokensInCorpus++;
- list[id].freq += f ;
- total += f ;
- }
- };
- void clearAllFreq(){
- for (WordIndex id = 0 ; id < list.size() ; id++)
- list[id].freq = 0 ;
- total = 0 ;
- noUniqueTokensInCorpus = 0 ;
- };
- int operator()(const string&x)const
- {
- map<string,int>::const_iterator i=s2i.find(x);
- if( i!=s2i.end() )
- return i->second;
- else
- {
- cerr << "ERROR: no word index for '"<<x<<"'\n";
- return 0;
- }
- }
- const string operator()(WordIndex id) const { // Yaser - 2000-12-13
- if (id < list.size())
- return list[id].word ;
- else return 0 ;
- }
- const string operator[](WordIndex id) const { // Yaser - 2000-12-13
- if (id < list.size())
- return list[id].word ;
- else return 0 ;
- }
- void printVocabList(ostream& of){
- for (WordIndex i = 1 ; i < list.size() ; i++){
- if (list[i].word != "" && list[i].freq > 0)
- of << i << ' ' << list[i].word << ' ' << list[i].freq << '\n';
- }
- }
-
-};
-
-#endif
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9475d943/ext/giza-pp/Makefile
----------------------------------------------------------------------
diff --git a/ext/giza-pp/Makefile b/ext/giza-pp/Makefile
deleted file mode 100644
index cb78185..0000000
--- a/ext/giza-pp/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-
-.PHONY: gizapp mkcls-v2 install clean
-
-all: gizapp mkcls-v2
-
-gizapp:
- @echo $(JOSHUA)
- $(MAKE) -C GIZA++-v2
-
-mkcls-v2:
- @echo $(JOSHUA)
- $(MAKE) -C mkcls-v2
-
-install: gizapp mkcls-v2
- @cp GIZA++-v2/GIZA++ GIZA++-v2/snt2cooc.out mkcls-v2/mkcls $(JOSHUA)/bin/
-
-clean:
- $(MAKE) -C GIZA++-v2 clean
- $(MAKE) -C mkcls-v2 clean
- @rm -f $(JOSHUA)/bin/GIZA++ $(JOSHUA)/bin/mkcls $(JOSHUA)/bin/snt2cooc.out
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/9475d943/ext/giza-pp/README
----------------------------------------------------------------------
diff --git a/ext/giza-pp/README b/ext/giza-pp/README
deleted file mode 100644
index c4b4e34..0000000
--- a/ext/giza-pp/README
+++ /dev/null
@@ -1,8 +0,0 @@
-This package contains the GIZA++ toolkit and the mkcls tool, originally
-written by F.J. Och and several other authors.
-
-For more information, refer to the README files and the following pages:
- http://www.fjoch.com/mkcls.html
- http://www.fjoch.com/GIZA++.html
-
-