You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/06/01 02:51:44 UTC
[49/94] [abbrv] [partial] incubator-joshua git commit: Pulled
JOSHUA-252 changes and Resolved Merge Conflicts
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ef91969a/ext/giza-pp/GIZA++-v2/TTables.cpp
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/TTables.cpp b/ext/giza-pp/GIZA++-v2/TTables.cpp
deleted file mode 100644
index 25c126f..0000000
--- a/ext/giza-pp/GIZA++-v2/TTables.cpp
+++ /dev/null
@@ -1,323 +0,0 @@
-/*
-
-EGYPT Toolkit for Statistical Machine Translation
-Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-#include "TTables.h"
-#include "Parameter.h"
-
-GLOBAL_PARAMETER(float,PROB_CUTOFF,"PROB CUTOFF","Probability cutoff threshold for lexicon probabilities",PARLEV_OPTHEUR,1e-7); // hash-map normalizeTable() erases entries whose normalized probability is not above this value
-GLOBAL_PARAMETER2(float, COUNTINCREASE_CUTOFF,"COUNTINCREASE CUTOFF","countCutoff","Counts increment cutoff threshold",PARLEV_OPTHEUR,1e-6); // hash-map printCountTable() only writes entries whose count exceeds this value
-
-#ifdef BINARY_SEARCH_FOR_TTABLE
-template <class COUNT, class PROB>
-void tmodel<COUNT, PROB>::printCountTable(const char *, // BINARY_SEARCH_FOR_TTABLE build: stub that writes nothing
- const Vector<WordEntry>&,
- const Vector<WordEntry>&,
- const bool) const
-{ // empty body: every parameter is unnamed and ignored
-}
-
-template <class COUNT, class PROB>
-void tmodel<COUNT, PROB>::printProbTable(const char *filename, // BINARY_SEARCH_FOR_TTABLE build: dump the probability table to `filename`
- const Vector<WordEntry>& evlist, // source vocabulary, indexed by e; only read when `actual` is true
- const Vector<WordEntry>& fvlist, // target vocabulary, indexed by f; only read when `actual` is true
- const bool actual) const // true: write word strings, false: write numeric ids
-{
- ofstream of(filename); // NOTE(review): the stream state is never checked after opening
- /* for(unsigned int i=0;i<es.size()-1;++i)
- for(unsigned int j=es[i];j<es[i+1];++j)
- {
- const CPPair&x=fs[j].second;
- WordIndex e=i,f=fs[j].first;
- if( actual )
- of << evlist[e].word << ' ' << fvlist[f].word << ' ' << x.prob << '\n';
- else
- of << e << ' ' << f << ' ' << x.prob << '\n';
- }*/
- for(unsigned int i=0;i<lexmat.size();++i) // row index i is the source word id
- {
- if( lexmat[i] ) // rows are stored as pointers and may be null
- for(unsigned int j=0;j<lexmat[i]->size();++j)
- {
- const CPPair&x=(*lexmat[i])[j].second;
- WordIndex e=i,f=(*lexmat[i])[j].first; // .first of each row element is the target word id
- if( x.prob>PROB_SMOOTH ) // entries at or below the PROB_SMOOTH floor are not written
- if( actual ) // the `else` below pairs with this inner `if`
- of << evlist[e].word << ' ' << fvlist[f].word << ' ' << x.prob << '\n';
- else
- of << e << ' ' << f << ' ' << x.prob << '\n';
- }
- }
-}
-
-template <class COUNT, class PROB>
-void tmodel<COUNT, PROB>::printProbTableInverse(const char *, // BINARY_SEARCH_FOR_TTABLE build: stub that writes nothing
- const Vector<WordEntry>&,
- const Vector<WordEntry>&,
- const double,
- const double,
- const bool ) const
-{ // empty body: every parameter is unnamed and ignored
-}
-template <class COUNT, class PROB>
-void tmodel<COUNT, PROB>::normalizeTable(const vcbList&, const vcbList&, int) // BINARY_SEARCH_FOR_TTABLE build: all three arguments are ignored
-{
- for(unsigned int i=0;i<lexmat.size();++i) // each row of lexmat is normalized independently
- {
- double c=0.0; // sum of the counts stored in row i
- if( lexmat[i] ) // rows are stored as pointers and may be null
- {
- unsigned int lSize=lexmat[i]->size();
- for(unsigned int j=0;j<lSize;++j)
- c+=(*lexmat[i])[j].second.count;
- for(unsigned int j=0;j<lSize;++j)
- {
- if( c==0 )
- (*lexmat[i])[j].second.prob=1.0/(lSize); // row has no count mass: assign a uniform probability
- else
- (*lexmat[i])[j].second.prob=(*lexmat[i])[j].second.count/c; // relative frequency within the row
- (*lexmat[i])[j].second.count=0; // the count is cleared once it has been converted
- }
- }
- }
-}
-
-template <class COUNT, class PROB>
-void tmodel<COUNT, PROB>::readProbTable(const char *){ // BINARY_SEARCH_FOR_TTABLE build: stub, the file name is ignored and nothing is read
-}
-
-template class tmodel<COUNT,PROB> ; // explicit instantiation; COUNT and PROB here are non-template names declared outside this file section (presumably typedefs - confirm)
-#else
-/* ------------------ Method Definitions for Class tmodel --------------------*/
-
-#
-template <class COUNT, class PROB>
-void tmodel<COUNT, PROB>::printCountTable(const char *filename,
- const Vector<WordEntry>& evlist,
- const Vector<WordEntry>& fvlist,
- const bool actual) const
- // Dumps the count side of the t table to `filename`. Each line is of the following format:
- //
- // count source_word target_word prob (numeric ids replace the words when `actual` is false)
-{
- ofstream of(filename); // NOTE(review): the stream state is never checked after opening
- typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator i;
- for(i = ef.begin(); i != ef.end();++i){ // key is the (e, f) id pair, value holds count and prob
- if ( ((*i).second).count > COUNTINCREASE_CUTOFF) // entries with a count at or below the cutoff are skipped
- if (actual) // the `else` below pairs with this inner `if`
- of << ((*i).second).count << ' ' << evlist[ ((*i).first).first ].word << ' ' << fvlist[((*i).first).second].word << ' ' << (*i).second.prob << '\n';
- else
- of << ((*i).second).count << ' ' << ((*i).first).first << ' ' << ((*i).first).second << ' ' << (*i).second.prob << '\n';
- }
-}
-
-template <class COUNT, class PROB>
-void tmodel<COUNT, PROB>::printProbTable(const char *filename,
- const Vector<WordEntry>& evlist,
- const Vector<WordEntry>& fvlist,
- const bool actual) const
- // Dumps every entry of the t table to `filename` (no cutoff is applied). Line format:
- //
- // source_word target_word p(target_word/source_word) (numeric ids when `actual` is false)
-{
- ofstream of(filename); // NOTE(review): the stream state is never checked after opening
- typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator i;
- for(i = ef.begin(); i != ef.end();++i) // entries are written in the hash map's iteration order
- if( actual )
- of << evlist[((*i).first).first].word << ' ' <<
- fvlist[((*i).first).second].word << ' ' << (*i).second.prob << '\n';
- else
- of << ((*i).first).first << ' ' << ((*i).first).second << ' ' <<
- (*i).second.prob << '\n';
-}
-
-template <class COUNT, class PROB>
-void tmodel<COUNT, PROB>::printProbTableInverse(const char *filename,
- const Vector<WordEntry>& evlist,
- const Vector<WordEntry>& fvlist,
- const double, // eTotal in the class declaration; unnamed and unused here
- const double, // fTotal in the class declaration; unnamed and unused here
- const bool actual) const
- // this function dumps the inverse t table. Each line is of the format:
- //
- // target_word_id source_word_id p(source_word/target_word)
- //
- // if flag "actual" is true then print actual word entries instead of
- // token ids
-{
- cerr << "Dumping the t table inverse to file: " << filename << '\n';
- ofstream of(filename); // NOTE(review): the stream state is never checked after opening
- typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator i;
- PROB p_inv = 0 ;
- // static const PROB ratio(double(fTotal)/eTotal);
- WordIndex e, f ;
- int no_errors(0); // number of out-of-range inverse probabilities seen so far
- vector<PROB> total(fvlist.size(),PROB(0)) ; // Sum over all e of P(f/e) * p(e) - needed for normalization
-
- for(i = ef.begin(); i != ef.end(); i++){ // pass 1: accumulate the normalizer for every target word f
- e = ((*i).first).first ;
- f = ((*i).first).second ;
- total[f] += (PROB) evlist[e].freq * ((*i).second.prob); //add P(f/ei) * F(ei); NOTE(review): f is not range-checked against total.size()
- }
-
- for(i = ef.begin(); i != ef.end(); i++){ // pass 2: compute and write the inverse probability of every entry
- e = ((*i).first).first ;
- f = ((*i).first).second ;
- p_inv = ((*i).second.prob) * (PROB) evlist[e].freq / total[f] ; // NOTE(review): total[f] is not checked for zero before dividing
- if (p_inv > 1.0001 || p_inv < 0){ // the 1.0001 bound leaves a small tolerance above 1
- no_errors++;
- if (no_errors <= 10){ // only the first ten offending entries are reported
- cerr << "printProbTableInverse(): Error - P("<<evlist[e].word<<"("<<
- e<<") / "<<fvlist[f].word << "("<<f<<")) = " << p_inv <<'\n';
- cerr << "f(e) = "<<evlist[e].freq << " Sum(p(f/e).f(e)) = " << total[f] <<
- " P(f/e) = " <<((*i).second.prob) <<'\n';
- if (no_errors == 10)
- cerr<<"printProbTableInverse(): Too many P inverse errors ..\n";
- }
- }
- if (actual) // the entry is written even when it was reported as an error above
- of << fvlist[f].word << ' ' << evlist[e].word << ' ' << p_inv << '\n';
- else
- of << f << ' ' << e << ' ' << p_inv << '\n';
- }
-}
-/*
-
-
-
-{
- cerr << "Dumping the t table inverse to file: " << filename << '\n';
- ofstream of(filename);
- hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator i;
- PROB p_inv = 0 ;
- static const PROB ratio(double(fTotal)/eTotal);
- WordIndex e, f ;
- for(i = ef.begin(); i != ef.end(); i++){
- e = ((*i).first).first ;
- f = ((*i).first).second ;
- p_inv = ((*i).second.prob) * ratio * (PROB) evlist[e].freq /
- (PROB) fvlist[f].freq ;
- if (actual)
- of << fvlist[f].word << ' ' << evlist[e].word << ' ' << p_inv << '\n';
- else
- of << f << ' ' << e << ' ' << p_inv << '\n';
- }
-}
-*/
-template <class COUNT, class PROB>
-void tmodel<COUNT, PROB>::normalizeTable(const vcbList&engl, const vcbList&french, int iter)
- // normalize conditional probability P(fj/ei):
- // i.e. make sure that Sum over all j of P(fj/e) = 1
- // this method reads the counts portion of the table and normalizes it into
- // the probability portion. Then the counts are cleared (i.e. zeroed).
- // If the resulting probability of an entry is not above PROB_CUTOFF, the entry
- // is removed. The function calls itself with iter-1 until iter reaches 0.
-{
- if( iter==2 )
- {
- total2.resize(engl.uniqTokens());for(unsigned int i=0;i<total2.size();i++)total2[i]=0.0; // total2 is only reset and filled on the iter==2 pass
- }
- nFrench.resize(engl.uniqTokens());for(unsigned int i=0;i<nFrench.size();i++)nFrench[i]=0; // nFrench[e]: number of table entries for source word e
- nEng.resize(french.uniqTokens());for(unsigned int i=0;i<nEng.size();i++)nEng[i]=0; // nEng[f]: number of table entries for target word f
- Vector<double> total(engl.uniqTokens(),0.0); // total[e]: sum of counts over all entries of source word e
- //Vector<int> nFrench(engl.uniqTokens(), 0);
- //Vector<int> nEng(french.uniqTokens(), 0);
-
- typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator i;
- for(i = ef.begin(); i != ef.end(); i++){ // pass 1 over all entries: accumulate per-word totals
- if( iter==2 )
- total2[((*i).first).first] += (*i).second.count;
- total[((*i).first).first] += (*i).second.count;
- nFrench[((*i).first).first]++;
- nEng[((*i).first).second]++;
- }
- for(unsigned int k=0;k<engl.uniqTokens();++k)
- if( nFrench[k] ) // only source words that have at least one entry
- {
- double probMass=(french.uniqTokensInCorpus()-nFrench[k])*PROB_SMOOTH; // PROB_SMOOTH for each target word that has no entry with source word k
- if( probMass<0.0 ) // diagnostic only; the computation continues with the negative value
- cout << k << " french.uniqTokensInCorpus(): " << french.uniqTokensInCorpus() << " nFrench[k]:"<< nFrench[k] << '\n';
- total[k]+= total[k]*probMass/(1-probMass); // equivalent to total[k] /= (1-probMass)
- }
- typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::iterator j, k;
- PROB p ;
- int nParams=0; // NOTE(review): incremented below but never read in this function
- for(j = ef.begin(); j != ef.end(); ){ // pass 2: convert counts, erasing entries that fall under the cutoff
- k = j;
- k++ ; // remember the successor before j's entry may be erased
- if( (total[((*j).first).first])>0.0 )
- p = ((((*j).second).count) /(total[((*j).first).first])) ;
- else
- p= 0.0;
- if (p > PROB_CUTOFF)
- {
- if( iter>0 ) // intermediate pass: the normalized value is stored back as the count for the next pass
- {
- ((*j).second).prob = 0 ;
- ((*j).second).count = p ;
- }
- else // final pass: store the probability and clear the count
- {
- ((*j).second).prob = p ;
- ((*j).second).count = 0 ;
- }
- nParams++;
- }
- else {
- erase(((*j).first).first, ((*j).first).second);
- }
- j = k ;
- }
- if( iter>0 )
- return normalizeTable(engl, french, iter-1); // repeat on the re-normalized counts
- else
- {
- }
-}
-
-template <class COUNT, class PROB>
-void tmodel<COUNT, PROB>::readProbTable(const char *filename){
- /* This function reads the t table from a file.
- Each line is of the format: source_word_id target_word_id p(target_word|source_word)
- Every entry read is inserted with a count of 0.0; reading stops at the first
- token that fails to parse. NAS, 7/11/99
- */
- ifstream inf(filename);
- cerr << "Reading t prob. table from " << filename << "\n";
- if(!inf){ // an unreadable file is reported on cerr and the table is left unchanged
- cerr << "\nERROR: Cannot open " << filename << "\n";
- return;
- }
- WordIndex src_id, trg_id;
- PROB prob;
- int nEntry=0; // number of entries read, reported at the end
- while( inf >> src_id >> trg_id >> prob){
- insert(src_id, trg_id, 0.0, prob);
- nEntry++;
- }
- cerr << "Read " << nEntry << " entries in prob. table.\n";
-}
-
-template class tmodel<COUNT,PROB> ; // explicit instantiation; COUNT and PROB here are non-template names declared outside this file section (presumably typedefs - confirm)
-
-/* ---------------- End of Method Definitions of class tmodel ---------------*/
-
-
-#endif
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ef91969a/ext/giza-pp/GIZA++-v2/TTables.h
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/TTables.h b/ext/giza-pp/GIZA++-v2/TTables.h
deleted file mode 100644
index 85673ef..0000000
--- a/ext/giza-pp/GIZA++-v2/TTables.h
+++ /dev/null
@@ -1,417 +0,0 @@
-/*
-
-EGYPT Toolkit for Statistical Machine Translation
-Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-/* --------------------------------------------------------------------------*
- * *
- * Module : TTables *
- * *
- * Prototypes File: TTables.h *
- * *
- * Objective: Defines classes and methods for handling I/O for Probability & *
- * Count tables and also alignment tables *
- *****************************************************************************/
-
-#ifndef _ttables_h
-#define _ttables_h 1
-
-
-#include "defs.h"
-#include "vocab.h"
-
-#include <cassert>
-
-#include <iostream>
-#include <algorithm>
-#include <functional>
-#include <map>
-#include <set>
-#include "Vector.h"
-#include <utility>
-
-#include <fstream>
-
-#include "Globals.h"
-
-
-/* The tables defined in the following classes are defined as hash tables. For
- example. the t-table is a hash function of a word pair; an alignment is
- a hash function of a vector of integer numbers (sentence positions) and so
- on */
-
-
-/*----------- Definition of Hash Function for class tmodel ------- -----------*/
-
-typedef pair<WordIndex, WordIndex> wordPairIds;
-
-
-class hashpair : public unary_function< pair<WordIndex, WordIndex>, size_t > // hash functor for (source id, target id) keys
-{
-public:
- size_t operator() (const pair<WordIndex, WordIndex>& key) const
- {
- return (size_t) MAX_W*key.first + key.second; /* the cast applies to MAX_W only.
- The authors state this gives a
- unique id per unique pair -
- NOTE(review): presumably needs key.second < MAX_W and no size_t wrap-around; confirm */
- }
-};
-
-
-
-/* ------------------ Class Prototype Definitions ---------------------------*
- Class Name: tmodel
- Objective: This defines the underlying data structure for t Tables and t
- Count Tables. They are defined as a hash table. Each entry in the hash table
- is the probability ( P(fj/ei) ) or count collected for ( C(fj/ei) ). The
- probability and the count are represented as log integer probability as
- defined by the class LogProb .
-
- This class is used to represent t Tables (probability) and n (fertility)
- Tables and also their corresponding count tables .
-
- *---------------------------------------------------------------------------*/
-
-//typedef float COUNT ;
-//typedef LogProb PROB ;
-template <class COUNT, class PROB>
-class LpPair { // value type of a table entry: a count and a probability stored side by side
- public:
- COUNT count ;
- PROB prob ;
- public: // constructor
- LpPair():count(0), prob(0){} ; // default entry: both members zero
- LpPair(COUNT c, PROB p):count(c), prob(p){};
-} ;
-
-#ifdef BINARY_SEARCH_FOR_TTABLE
-
-
-template<class T>
-T*mbinary_search(T*x,T*y,unsigned int val) // recursive search of the half-open range [x,y) for an element whose ->first equals val
-{
- if( y-x==0 ) // empty range
- return 0;
- if( x->first==val) // only the first element of the current range is ever compared for equality
- return x;
- if( y-x<2 ) // single element that did not match
- return 0;
- T*mid=x+(y-x)/2;
- if( val < mid->first ) // halving is only correct when the range is ordered ascending by ->first
- return mbinary_search(x,mid,val);
- else
- return mbinary_search(mid,y,val);
-
-} // returns a pointer to the matching element, or 0 when none is found
-
-template<class T>
-const T*mbinary_search(const T*x,const T*y,unsigned int val) // const overload: same search over [x,y), result is pointer-to-const
-{
- if( y-x==0 ) // empty range
- return 0;
- if( x->first==val) // only the first element of the current range is ever compared for equality
- return x;
- if( y-x<2 ) // single element that did not match
- return 0;
- const T*mid=x+(y-x)/2;
- if( val < mid->first ) // halving is only correct when the range is ordered ascending by ->first
- return mbinary_search(x,mid,val);
- else
- return mbinary_search(mid,y,val);
-
-} // returns a pointer to the matching element, or 0 when none is found
-
-template <class COUNT, class PROB>
-class tmodel{ // BINARY_SEARCH_FOR_TTABLE variant: fixed set of (e,f) pairs stored as one vector per source word
- typedef LpPair<COUNT, PROB> CPPair;
- public:
- int noEnglishWords; // total number of unique source words (NOTE(review): not initialized by the constructor below)
- int noFrenchWords; // total number of unique target words (NOTE(review): not initialized by the constructor below)
- //vector<pair<unsigned int,CPPair> > fs;
- //vector<unsigned int> es;
- vector< vector<pair<unsigned int,CPPair> >* > lexmat; // lexmat[e]: row of (f, entry) pairs for source word e, or 0 when e has no row
-
- void erase(WordIndex e, WordIndex f) // resets the entry to (0,0); the slot itself is kept
- {
- CPPair *p=find(e,f);
- if(p)
- *p=CPPair(0,0);
- };
- CPPair*find(int e,int f) // returns the entry for (e,f) or 0 when f is not in row e
- {
- //pair<unsigned int,CPPair> *be=&(fs[0])+es[e];
- //pair<unsigned int,CPPair> *en=&(fs[0])+es[e+1];
- pair<unsigned int,CPPair> *be=&(*lexmat[e])[0]; // NOTE(review): lexmat[e] is dereferenced without a null or range check, although rows can be 0
- pair<unsigned int,CPPair> *en=&(*lexmat[e])[0]+(*lexmat[e]).size();
- pair<unsigned int,CPPair> *x= mbinary_search(be,en,f);
- if( x==0 )
- {
- //cerr << "A:DID NOT FIND ENTRY: " << e << " " << f << '\n';
- //abort();
- return 0;
- }
- return &(x->second);
- }
- const CPPair*find(int e,int f)const // const overload of the lookup above
- {
- const pair<unsigned int,CPPair> *be=&(*lexmat[e])[0]; // NOTE(review): same unchecked dereference of lexmat[e] as in the non-const overload
- const pair<unsigned int,CPPair> *en=&(*lexmat[e])[0]+(*lexmat[e]).size();
- //const pair<unsigned int,CPPair> *be=&(fs[0])+es[e];
- //const pair<unsigned int,CPPair> *en=&(fs[0])+es[e+1];
- const pair<unsigned int,CPPair> *x= mbinary_search(be,en,f);
- if( x==0 )
- {
- //cerr << "B:DID NOT FIND ENTRY: " << e << " " << f << '\n';
- //abort();
- return 0;
- }
-
- return &(x->second);
- }
-public:
- void insert(WordIndex e, WordIndex f, COUNT cval=0.0, PROB pval = 0.0){ // overwrites an existing entry; cannot add a new pair
- *find(e,f)=CPPair(cval,pval); // NOTE(review): find() returns 0 for an unknown pair and the result is dereferenced unchecked
- }
- CPPair*getPtr(int e,int f){return find(e,f);}
- tmodel(const string&fn) // builds lexmat from a text file of "e f" id pairs
- {
- int count=0,count2=0; // count: pairs read; count2: sum of row capacities
- ifstream infile2(fn.c_str()); // NOTE(review): the stream is never checked for a failed open
- int e,f,olde=-1,oldf=-1;
- pair<unsigned int,CPPair> cp; // cp.second stays a default (0,0) entry for every pair read
- vector< pair<unsigned int,CPPair> > cps; // pairs collected for the source word currently being read
- while(infile2>>e>>f)
- {
- cp.first=f;
- assert(e>=olde); // input must be ordered by e ...
- assert(e>olde ||f>oldf); // ... and strictly ascending in f within one e
- if( e!=olde&&olde>=0 ) // a new source word starts: store the finished row of the previous one
- {
- int oldsize=lexmat.size();
- lexmat.resize(olde+1);
- for(unsigned int i=oldsize;i<lexmat.size();++i)
- lexmat[i]=0; // rows of source ids that never occur stay null
- lexmat[olde]=new vector< pair<unsigned int,CPPair> > (cps); // NOTE(review): allocated with new; no destructor in this class deletes the rows
- cps.clear();
- if( !((*lexmat[olde]).size()==(*lexmat[olde]).capacity()) )
- cerr << "eRROR: waste of memory: " << (*lexmat[olde]).size() << " " << (*lexmat[olde]).capacity() << endl;
- count2+=lexmat[olde]->capacity();
- }
- cps.push_back(cp);
- olde=e;
- oldf=f;
- count++;
- }
- lexmat.resize(olde+1); // store the row of the last source word
- lexmat[olde]=new vector< pair<unsigned int,CPPair> > (cps); // NOTE(review): if no pair was read olde is still -1 and this indexes lexmat[-1]
- count2+=lexmat[olde]->capacity();
- cout << "There are " << count << " " << count2 << " entries in table" << '\n';
- }
-
-
- /* tmodel(const string&fn)
- {
- size_t count=0;
- {
- ifstream infile1(fn.c_str());
- if( !infile1 )
- {
- cerr << "ERROR: can't read coocurrence file " << fn << '\n';
- abort();
- }
- int e,f;
- while(infile1>>e>>f)
- count++;
- }
- cout << "There are " << count << " entries in table" << '\n';
- ifstream infile2(fn.c_str());
- fs.resize(count);
- int e,f,olde=-1,oldf=-1;
- pair<unsigned int,CPPair> cp;
- count=0;
- while(infile2>>e>>f)
- {
- assert(e>=olde);
- assert(e>olde ||f>oldf);
- if( e!=olde )
- {
- es.resize(e+1);
- for(unsigned int i=olde+1;int(i)<=e;++i)
- es[i]=count;
- }
- cp.first=f;
- assert(count<fs.size());
- fs[count]=cp;
- //fs.push_back(cp);
- olde=e;
- oldf=f;
- count++;
- }
- assert(count==fs.size());
- es.push_back(fs.size());
- cout << fs.size() << " " << count << " coocurrences read" << '\n';
- }*/
- void incCount(WordIndex e, WordIndex f, COUNT inc) // adds inc to an existing entry; unknown pairs and a zero inc are silently ignored
- {
- if( inc )
- {
- CPPair *p=find(e,f);
- if( p )
- p->count += inc ;
- }
- }
-
- PROB getProb(WordIndex e, WordIndex f) const // stored probability floored at PROB_SMOOTH; PROB_SMOOTH when the pair is not in the row
- {
- const CPPair *p=find(e,f);
- if( p )
- return max(p->prob, PROB_SMOOTH);
- else
- return PROB_SMOOTH;
- }
-
- COUNT getCount(WordIndex e, WordIndex f) const // stored count, or 0.0 when the pair is not in the row
- {
- const CPPair *p=find(e,f);
- if( p )
- return p->count;
- else
- return 0.0;
- }
-
- void printProbTable(const char* filename, const Vector<WordEntry>&, const Vector<WordEntry>&,bool actual) const;
- void printCountTable(const char* filename, const Vector<WordEntry>&, const Vector<WordEntry>&,bool actual) const;
- void printProbTableInverse(const char *filename,
- const Vector<WordEntry>& evlist,
- const Vector<WordEntry>& fvlist,
- const double eTotal,
- const double fTotal,
- const bool actual = false ) const;
- void normalizeTable(const vcbList&engl, const vcbList&french, int iter=2);
- void readProbTable(const char *filename);
-};
-
-
-#else
-
-
-template <class COUNT, class PROB>
-class tmodel{ // hash-map variant: entries keyed by the (source id, target id) pair
- typedef LpPair<COUNT, PROB> CPPair;
- public:
- int noEnglishWords; // total number of unique source words
- int noFrenchWords; // total number of unique target words
- hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> > ef; // the table itself
- void erase(WordIndex e, WordIndex f)
- // In: a source and a target token id.
- // removes the entry with that pair from the table
- {
- ef.erase(wordPairIds(e, f));
- };
-
-public:
- Vector<PROB> total2; // filled by normalizeTable() on its iter==2 pass
- Vector<int> nFrench; // per source word: number of entries, recomputed by normalizeTable()
- Vector<int> nEng; // per target word: number of entries, recomputed by normalizeTable()
-
-
- // methods;
-
- // insert: add entry P(fj/ei) to the hash table, Default value is 0.0
- void insert(WordIndex e, WordIndex f, COUNT cval=0.0, PROB pval = 0.0){
- ef[wordPairIds(e, f)].count = cval ; // creates the entry if it does not exist yet
- ef[wordPairIds(e, f)].prob = pval ; // second lookup of the same key
- }
-
- // returns a reference to the word pair; if it does not exist, it is created.
- CPPair&getRe(WordIndex e, WordIndex f)
- {return ef[wordPairIds(e, f)];}
-
- // returns a pointer to an existing word pair. if the pair does not exist,
- // the method returns the zero pointer (NULL)
-
- CPPair*getPtr(WordIndex e, WordIndex f)
- {
- // look up this pair and return its position
- typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::iterator i = ef.find(wordPairIds(e, f));
- if(i != ef.end()) // if it exists, return a pointer to it.
- return(&((*i).second));
- else return(0) ; // else return NULL pointer
- }
-
- void incCount(WordIndex e, WordIndex f, COUNT inc)
- // increments the count of the given word pair. if the pair does not exist,
- // it is created with the given value. A zero inc does nothing (no entry is created).
- {
- if( inc )
- ef[wordPairIds(e, f)].count += inc ;
- }
-
- PROB getProb(WordIndex e, WordIndex f) const
- // read probability value for P(fj/ei) from the hash table
- // if pair does not exist, return floor value PROB_SMOOTH
- {
- typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator i= ef.find(wordPairIds(e, f));
- if(i == ef.end())
- return PROB_SMOOTH;
- else
- return max(((*i).second).prob, PROB_SMOOTH); // stored values below the floor are also raised to PROB_SMOOTH
- }
-
- COUNT getCount(WordIndex e, WordIndex f) const
- /* read count value for entry pair (fj/ei) from the hash table; 0 when the pair is absent */
- {
- typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator i= ef.find(wordPairIds(e, f));
- if(i == ef.end())
- return 0;
- else
- return ((*i).second).count;
- }
-
- inline const hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >& getHash(void) const {return ef;};
- /* get a reference to the hash table */
- //inline void resize(WordIndex n) {ef.resize(n);};
- // to resize the hash table
-
- void printProbTable(const char* filename, const Vector<WordEntry>&, const Vector<WordEntry>&,bool actual) const;
- void printCountTable(const char* filename, const Vector<WordEntry>&, const Vector<WordEntry>&,bool actual) const;
- // print the t table to the given file; when `actual` is true the source and
- // target words are printed instead of their token ids
-
- void printProbTableInverse(const char *filename,
- const Vector<WordEntry>& evlist,
- const Vector<WordEntry>& fvlist,
- const double eTotal,
- const double fTotal,
- const bool actual = false ) const;
- // dump inverse of t table (i.e P(ei/fj)) to the given file name,
- // if the given flag is true then actual words are printed not token ids
-
- void normalizeTable(const vcbList&engl, const vcbList&french, int iter=2);
- // to normalize the table i.e. make sure the sum of P(fj/ei) over all j is equal to 1
-
- void readProbTable(const char *filename);
- // void readAsFertilityTable(const char *filename);
-};
-/*--------------- End of Class Definition for tmodel -----------------------*/
-
-#endif
-
-#endif
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ef91969a/ext/giza-pp/GIZA++-v2/Vector.h
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/Vector.h b/ext/giza-pp/GIZA++-v2/Vector.h
deleted file mode 100644
index 96d26ad..0000000
--- a/ext/giza-pp/GIZA++-v2/Vector.h
+++ /dev/null
@@ -1,427 +0,0 @@
-/*
-
-EGYPT Toolkit for Statistical Machine Translation
-Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-/*--
-Vector: checked vector implementation
-
-Franz Josef Och (30/07/99)
---*/
-#ifndef ARRAY_H_DEFINED
-#define ARRAY_H_DEFINED
-#include "mystl.h"
-#include <algorithm>
-#include <string>
-#include <utility>
-#include <functional>
-#include <cassert>
-
-
-#ifdef NDEBUG
-
-#include <vector>
-#define Vector vector
-template<class T> ostream& operator<<(ostream&o, const Vector<T>&a) // NDEBUG build: Vector is a macro for vector, so this prints a std vector
-{
- o << "Vector(" << a.size() << "){ "; // header carries the element count
- for(unsigned int iii=0;iii<a.size();iii++)
- o << " " << iii<< ": " << a[iii]<<" ;"; // each element is written as "index: value ;"
- return o << "}\n";
-}
-
-#else
-
-#define ARRAY_DEBUG
-#define memo_del(a, b)
-#define memo_new(a)
-
-template<class T> class Vector // bounds-checked growable array, used when NDEBUG is not defined
-{
- private:
- T *p; // heap buffer of realSize elements (0 when nothing is allocated)
- int realSize; // allocated capacity
- int maxWritten; // highest valid index; size() is maxWritten+1
-
- void copy(T *a, const T *b, int n);
- void copy(T *a, T *b, int n);
- void _expand();
-
- public:
- Vector() // empty vector without a buffer
- : p(0), realSize(0), maxWritten(-1)
- {
-#ifdef VERY_ARRAY_DEBUG
- cout << "MAKE ARRAY: " << this<<" "<<(void*)p << '\n';
-#endif
- }
- Vector(const Vector<T> &x) // copies only the used part of x; capacity becomes x.size()
- : p(new T[x.maxWritten+1]), realSize(x.maxWritten+1), maxWritten(x.maxWritten)
- {
- memo_new(p);
- copy(p, x.p, realSize);
-#ifdef VERY_ARRAY_DEBUG
- cout << "MAKE ARRAY copy: " << this << " " << realSize <<" "<<(void*)p<< '\n';
-#endif
- }
- explicit Vector(int n) // n elements as produced by new T[n]; no value is assigned here
- : p(new T[n]), realSize(n), maxWritten(n-1)
- {
- memo_new(p);
-#ifdef VERY_ARRAY_DEBUG
- cout << "MAKE ARRAY with parameter n: " << this << " " << realSize<<" "<<(void*)p << '\n';
-#endif
- }
- Vector(int n, const T&_init) // n elements, each assigned _init
- : p(new T[n]), realSize(n), maxWritten(n-1)
- {
- memo_new(p);
- for(int iii=0;iii<n;iii++)p[iii]=_init;
-#ifdef VERY_ARRAY_DEBUG
- cout << "MAKE ARRAY with parameter n and init: " << this << " " << realSize<<" "<<(void*)p << '\n';
-#endif
- }
-
- ~Vector()
- {
-#ifdef VERY_ARRAY_DEBUG
- cout << "FREE ARRAY: " << this << " " << realSize<<" "<<(void*)p << '\n';
-#endif
- delete [] p;
- memo_del(p, 1);
-#ifndef NDEBUG
- p=0;realSize=-1;maxWritten=-1; // overwrite the members after freeing the buffer
-#endif
- }
-
- Vector<T>& operator=(const Vector<T>&x) // frees the old buffer, then copies the used part of x
- {
- if( this!= &x ) // self-assignment leaves the object untouched
- {
-#ifdef VERY_ARRAY_DEBUG
- cout << "FREE ARRAY because of operator=: " << this << " " << realSize<<" "<<(void*)p << '\n';
-#endif
- delete [] p;
- memo_del(p, 1);
- realSize = x.maxWritten+1;
- maxWritten = x.maxWritten;
- p = new T[realSize];
- memo_new(p);
- copy(p, x.p, realSize);
-#ifdef VERY_ARRAY_DEBUG
- cout << "NEW ARRAY because of operator=: " << this << " " << realSize<<" "<<(void*)p << '\n';
-#endif
- }
- return *this;
- }
-
- Vector<T>& operator=(Vector<T>&x) // NOTE(review): body is the same as the const overload above
- {
- if( this!= &x )
- {
-#ifdef VERY_ARRAY_DEBUG
- cout << "FREE ARRAY because of operator=: " << this << " " << realSize<<" "<<(void*)p << '\n';
-#endif
- delete [] p;
- memo_del(p, 1);
- realSize = x.maxWritten+1;
- maxWritten = x.maxWritten;
- p = new T[realSize];
- memo_new(p);
- copy(p, x.p, realSize);
-#ifdef VERY_ARRAY_DEBUG
- cout << "NEW ARRAY because of operator=: " << this << " " << realSize<<" "<<(void*)p << '\n';
-#endif
- }
- return *this;
- }
-
- void allowAccess(int n) // makes index n valid, growing the buffer if needed; never shrinks size()
- {
- while( realSize<=n )
- _expand();
- maxWritten=max(maxWritten, n);
- assert( maxWritten<realSize );
- }
- void resize(int n) // sets size() to n; elements exposed by growing are not assigned a value here
- {
- while( realSize<n )
- _expand();
- maxWritten=n-1; // shrinking only lowers maxWritten, the buffer is kept
- }
- void clear() // size() becomes 0, capacity is kept
- {
- resize(0);
- }
- void reserve(int n) // grows capacity to at least n without changing size()
- {
- int maxOld=maxWritten;
- resize(n);
- maxWritten=maxOld;
- }
- void sort(int until=-1) // ascending sort of the first `until` elements (all elements by default)
- {
- if( until== -1 ) until=size();
- std::sort(p, p+until);
- }
- void invsort(int until=-1) // descending sort of the first `until` elements (all elements by default)
- {
- if( until== -1 ) until=size();
- std::sort(p, p+until, greater<T>());
- }
- void init(int n, const T&_init) // discards the current buffer and replaces it with n copies of _init
- {
-#ifdef VERY_ARRAY_DEBUG
- cout << "FREE ARRAY because of init: " << this << " " << realSize<<" "<<(void*)p << '\n';
-#endif
- delete []p;
- memo_del(p, 1);
- p=new T[n];
- memo_new(p);
- realSize=n;
- maxWritten=n-1;
- for(int iii=0;iii<n;iii++)p[iii]=_init;
-#ifdef VERY_ARRAY_DEBUG
- cout << "NEW ARRAY because of init: " << this << " " << realSize<<" "<<(void*)p << '\n';
-#endif
- }
- inline unsigned int size() const // number of valid elements
- {assert( maxWritten<realSize );
- return maxWritten+1;}
- inline int low() const // lowest valid index, always 0
- { return 0; }
- inline int high() const // highest valid index, -1 when empty
- { return maxWritten; }
- int findMax() const;
- int findMin() const;
- void errorAccess(int n) const;
- inline T*getPointerToData(){return p;}
- inline T*begin(){return p;} // only non-const begin()/end() are declared in this class
- inline T*end(){return p+maxWritten+1;}
- inline T& operator[](int n) // checked element access
- {
-#ifndef NDEBUG
- if( n<0 || n>maxWritten ) // index outside [0, maxWritten] is reported through errorAccess()
- errorAccess(n);
-#endif
- return p[n];
- }
- inline const T& operator[](int n) const // checked element access, const overload
- {
-#ifndef NDEBUG
- if(n<0 || n>maxWritten )
- errorAccess(n);
-#endif
- return p[n];
- }
- inline const T& get(int n) const // same check and result as the const operator[]
- {
-#ifndef NDEBUG
- if(n<0 || n>maxWritten )
- errorAccess(n);
-#endif
- return p[n];
- }
- const T&top(int n=0) const // n-th element counted from the end (n=0 is the last one)
- {return (*this)[maxWritten-n];}
- T&top(int n=0)
- {return (*this)[maxWritten-n];}
- const T&back(int n=0) const // identical to top()
- {return (*this)[maxWritten-n];}
- T&back(int n=0)
- {return (*this)[maxWritten-n];}
- T&push_back(const T&x) // appends x and returns a reference to the stored element
- {
- allowAccess(maxWritten+1);
- (*this)[maxWritten]=x;
- return top();
- }
- /*
- bool writeTo(ostream&out) const
- {
- out << "Vector ";
- out << size() << " ";
- out << a << '\n';
- for(int iv=0;iv<=maxWritten;iv++)
- {
- writeOb(out, (*this)[iv]);
- out << '\n';
- }
- return 1;
- }
- */
-
- bool readFrom(istream&in) // reads a "Vector <size> ..." record; returns 0 on a bad stream or a wrong tag
- {
- string s;
- if( !in )
- {
- cerr << "ERROR(Vector): file cannot be opened.\n";
- return 0;
- }
- in >> s;
- if( !(s=="Vector") ) // the record must start with the literal tag "Vector"
- {
- cerr << "ERROR(Vector): Vector!='"<<s<<"'\n";
- return 0;
- }
- int biggest;
- in >> biggest;
- in >> a; // NOTE(review): no `a` is declared in this function or class - confirm where it is meant to come from
- resize(biggest);
- for(int iv=0;iv<size();iv++) // NOTE(review): compares int with the unsigned result of size()
- {
- readOb(in, (*this)[iv]); // readOb is not declared in this header section; presumably provided elsewhere
- }
- return 1;
- }
-};
-
-template<class T> bool operator==(const Vector<T> &x, const Vector<T> &y) // equal when sizes match and every element pair compares equal
-{
- if( &x == &y ) // same object: equal without looking at the elements
- return 1;
- else
- {
- if( y.size()!=x.size() )
- return 0;
- else
- {
- for(unsigned int iii=0;iii<x.size();iii++)
- if( !(x[iii]==y[iii]) ) // only operator== of T is required
- return 0;
- return 1;
- }
- }
-}
-template<class T> bool operator!=(const Vector<T> &x, const Vector<T> &y) // negation of operator== above
-{
- return !(x==y);
-}
-
-template<class T> bool operator<(const Vector<T> &x, const Vector<T> &y) // element-wise ordering; a proper prefix sorts before the longer vector
-{
- if( &x == &y ) // an object is never less than itself
- return 0;
- else
- {
- if( y.size()<x.size() )
- return !(y<x); // sizes differ, so the vectors cannot be equal and exactly one ordering holds
- for(int iii=0;iii<x.size();iii++) // from here on x is not longer than y
- {
- assert( iii!=y.size() );
- if( x[iii]<y[iii] ) // first differing element decides
- return 1;
- else if( y[iii]<x[iii] )
- return 0;
- }
- return x.size()!=y.size();//?? all of x matched: x is less only if y has further elements
- }
-}
-
-
-template<class T> void Vector<T>:: errorAccess(int n) const // reports an out-of-range index n, then stops the program
-{
- cerr << "ERROR: Access to array element " << n
- << " (" << maxWritten << ", " << realSize << ", " << (void*)p << ")\n";
- cout << "ERROR: Access to array element " << n // the same message is repeated on cout
- << " (" << maxWritten << ", " << realSize << ", " << (void*)p << ")\n";
- assert(0);
-#ifndef DEBUG
- abort(); // compiled in unless DEBUG is defined
-#endif
-}
-
-template<class T> ostream& operator<<(ostream&o, const Vector<T>&a) // prints the checked Vector in the same format as the NDEBUG overload
-{
- o << "Vector(" << a.size() << "){ "; // header carries the element count
- for(unsigned int iii=0;iii<a.size();iii++)
- o << " " << iii<< ": " << a[iii]<<" ;"; // each element is written as "index: value ;"
- return o << "}\n";
-}
-
-template<class T> istream& operator>>(istream&in, Vector<T>&) // stub: reads nothing and leaves the vector unchanged
-{return in;}
-
-template<class T> int Hash(const Vector<T>&a) // position-weighted sum of the element hashes plus 47 times the size
-{
- int n=0;
- for(int iii=0;iii<a.size();iii++) // NOTE(review): int index compared with the unsigned result of size()
- n+=Hash(a[iii])*(iii+1); // needs a Hash overload for T, which is not declared in this header section
- return n+a.size()*47;
-}
-template<class T> void Vector<T>::copy(T *aa, const T *bb, int n) // assigns the first n elements of bb to aa, in ascending index order
-{
- for(int iii=0;iii<n;iii++)
- aa[iii]=bb[iii];
-}
-template<class T> void Vector<T>::copy(T *aa, T *bb, int n) // overload for a non-const source; same element-wise assignment
-{
- for(int iii=0;iii<n;iii++)
- aa[iii]=bb[iii];
-}
-
-template<class T> void Vector<T>::_expand() // grows the buffer to 2*realSize+1 elements, keeping the old contents
-{
-#ifdef VERY_ARRAY_DEBUG
- cout << "FREE ARRAY because of _expand: " << this << " " << realSize<<" "<<(void*)p << '\n';
-#endif
- T *oldp=p;
- int oldsize=realSize;
- realSize=realSize*2+1; // the +1 lets an empty vector (realSize 0) grow
- p=new T[realSize];
- memo_new(p);
- copy(p, oldp, oldsize); // only the old elements are assigned; the new tail is left as new T[] created it
- delete [] oldp;
- memo_del(oldp, 1);
-#ifdef VERY_ARRAY_DEBUG
- cout << "NEW ARRAY because of _expand: " << this << " " << realSize<<" "<<(void*)p << '\n';
-#endif
-}
-
-template<class T> int Vector<T>::findMax() const // index of the largest element, or -1 for an empty vector
-{
- if( size()==0 )
- return -1;
- else
- {
- int maxPos=0;
- for(int iii=1;iii<size();iii++)
- if( (*this)[maxPos]<(*this)[iii] ) // strict comparison: the first of several equal maxima is kept
- maxPos=iii;
- return maxPos;
- }
-}
-template<class T> int Vector<T>::findMin() const // index of the smallest element, or -1 for an empty vector
-{
- if( size()==0 )
- return -1;
- else
- {
- int minPos=0;
- for(int iii=1;iii<size();iii++)
- if( (*this)[iii]<(*this)[minPos] ) // strict comparison: the first of several equal minima is kept
- minPos=iii;
- return minPos;
- }
-}
-
-#endif
-
-#endif
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ef91969a/ext/giza-pp/GIZA++-v2/WordClasses.h
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/WordClasses.h b/ext/giza-pp/GIZA++-v2/WordClasses.h
deleted file mode 100644
index 3693906..0000000
--- a/ext/giza-pp/GIZA++-v2/WordClasses.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
-
-Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
-
-This file is part of GIZA++ ( extension of GIZA ).
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-#ifndef WordClasses_h_DEFINED
-#define WordClasses_h_DEFINED
-#include <map>
-#include <string>
-#include <set>
-
-class WordClasses
-{
- private:
- map<string,string> Sw2c;
- map<string,int> Sc2int;
- Vector<string> Sint2c;
- Vector<int> w2c;
- unsigned int classes;
- public:
- WordClasses()
- : classes(1)
- {
- Sint2c.push_back("0");
- Sc2int["0"]=0;
- }
- template<class MAPPER> bool read(istream&in,const MAPPER&m)
- {
- string sline;
- int maxword=0;
- while(getline(in,sline))
- {
- string word,wclass;
- //istringstream iline(sline.c_str());
- istringstream iline(sline);
- iline>>word>>wclass;
- maxword=max(m(word),maxword);
- assert(Sw2c.count(word)==0);
- Sw2c[word]=wclass;
- if( !Sc2int.count(wclass) )
- {
- Sc2int[wclass]=classes++;
- Sint2c.push_back(wclass);
- assert(classes==Sint2c.size());
- }
- }
- w2c=Vector<int>(maxword+1,0);
- for(map<string,string>::const_iterator i=Sw2c.begin();i!=Sw2c.end();++i)
- w2c[m(i->first)]=Sc2int[i->second];
- cout << "Read classes: #words: " << maxword << " " << " #classes: "<< classes <<endl;
- return 1;
- }
- int getClass(int w)const
- {
- if(w>=0&&int(w)<int(w2c.size()) )
- return w2c[w];
- else
- return 0;
- }
- int operator()(const string&x)const
- {
- if( Sc2int.count(x) )
- return Sc2int.find(x)->second;
- else
- {
- cerr << "WARNING: class " << x << " not found.\n";
- return 0;
- }
- }
- string classString(unsigned int cnr)const
- {
- if( cnr<Sint2c.size())
- return Sint2c[cnr];
- else
- return string("0");
- }
-};
-
-#endif
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ef91969a/ext/giza-pp/GIZA++-v2/alignment.cpp
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/alignment.cpp b/ext/giza-pp/GIZA++-v2/alignment.cpp
deleted file mode 100644
index 55a2e5c..0000000
--- a/ext/giza-pp/GIZA++-v2/alignment.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
-
-EGYPT Toolkit for Statistical Machine Translation
-Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-/*--
-alignment: 'checked' alignment representation with automatic calculation
- of fertilities
-Franz Josef Och (30/07/99)
---*/
-#include "alignment.h"
-
-ostream&operator<<(ostream&out, const alignment&a)
-{
- int m=a.a.size()-1,l=a.f.size()-1;
- out << "AL(l:"<<l<<",m:"<<m<<")(a: ";
- for(int j=1;j<=m;j++)out << a(j) << ' ';
- out << ")(fert: ";
- for(int i=0;i<=l;i++)out << a.fert(i) << ' ';
- return out << ") c:"<<"\n";
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ef91969a/ext/giza-pp/GIZA++-v2/alignment.h
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/alignment.h b/ext/giza-pp/GIZA++-v2/alignment.h
deleted file mode 100644
index 03cf028..0000000
--- a/ext/giza-pp/GIZA++-v2/alignment.h
+++ /dev/null
@@ -1,227 +0,0 @@
-/*
-
-EGYPT Toolkit for Statistical Machine Translation
-Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-/*--
-alignment: 'checked' alignment representation with autom. calc. of fertilities
-Franz Josef Och (30/07/99)
---*/
-#ifndef alignment_h_fjo_defined
-#define alignment_h_fjo_defined
-#include "Vector.h"
-#include <cassert>
-#include "defs.h"
-#include "myassert.h"
-
-class al_struct
-{
- public:
- al_struct()
- : prev(0),next(0){}
- PositionIndex prev,next;
-};
-
-
-class alignment
-{
- private:
- Vector<PositionIndex> a;
- Vector<PositionIndex> positionSum,f;
- public:
- Vector<PositionIndex> als_i;
- Vector<al_struct> als_j;
- PositionIndex l,m;
- alignment()
- {}
- alignment(PositionIndex _l, PositionIndex _m)
- : a(_m+1, (PositionIndex)0),
- positionSum(_l+1, (PositionIndex)0), f(_l+1, (PositionIndex)0), als_i(_l+1,0),als_j(_m+1),l(_l), m(_m)
- {
- f[0]=m;
- for(PositionIndex j=1;j<=m;j++)
- {
- if( j>1 )
- als_j[j].prev= j-1;
- if( j<m )
- als_j[j].next= j+1;
- }
- als_i[0]=1;
- }
- PositionIndex get_l()const
- {return l;}
- PositionIndex get_m()const
- {return m;}
- void doMove(int i,int j)
- {
- set(j,i);
- }
- void doSwap(int j1,int j2)
- {
- int aj1=a[j1],aj2=a[j2];
- set(j1,aj2);
- set(j2,aj1);
- }
- void set(PositionIndex j, PositionIndex aj)
- {
- PositionIndex old_aj=a[j];
- massert(j<a.size());massert(aj<f.size());
- massert(old_aj<f.size());massert(f[old_aj]>0);
- massert(j>0);
- positionSum[old_aj]-=j;
- // ausfuegen
- PositionIndex prev=als_j[j].prev;
- PositionIndex next=als_j[j].next;
- if( next )
- als_j[next].prev=prev;
- if( prev )
- als_j[prev].next=next;
- else
- als_i[old_aj]=next;
-
- // neue Position suchen
- PositionIndex lfd=als_i[aj],llfd=0;
- while( lfd && lfd<j )
- lfd = als_j[llfd=lfd].next;
-
- // einfuegen
- als_j[j].prev=llfd;
- als_j[j].next=lfd;
- if( llfd )
- als_j[llfd].next=j;
- else
- als_i[aj]=j;
- if( lfd )
- als_j[lfd].prev=j;
-
- f[old_aj]--;
- positionSum[aj]+=j;
- f[aj]++;
- a[j]=aj;
- }
- const Vector<PositionIndex>& getAlignment() const
- {return a ;}
- PositionIndex get_al(PositionIndex j)const
- {
- massert(j<a.size());
- return a[j];
- }
- PositionIndex operator()(PositionIndex j)const
- {
- massert(j<a.size());
- return a[j];
- }
- PositionIndex fert(PositionIndex i)const
- {
- massert(i<f.size());
- return f[i];
- }
- PositionIndex get_head(PositionIndex i)const
- {
- massert( als_i[i]==_get_head(i) );
- return als_i[i];
- }
- PositionIndex get_center(PositionIndex i)const
- {
- if( i==0 )return 0;
- massert(((positionSum[i]+f[i]-1)/f[i]==_get_center(i)));
- return (positionSum[i]+f[i]-1)/f[i];
- }
- PositionIndex _get_head(PositionIndex i)const
- {
- if( fert(i)==0 )return 0;
- for(PositionIndex j=1;j<=m;j++)
- if( a[j]==i )
- return j;
- return 0;
- }
- PositionIndex _get_center(PositionIndex i)const
- {
- if( i==0 )return 0;
- massert(fert(i));
- PositionIndex sum=0;
- for(PositionIndex j=1;j<=m;j++)
- if( a[j]==i )
- sum+=j;
- return (sum+fert(i)-1)/fert(i);
- }
- PositionIndex prev_cept(PositionIndex i)const
- {
- if( i==0 )return 0;
- PositionIndex k=i-1;
- while(k&&fert(k)==0)
- k--;
- return k;
- }
- PositionIndex next_cept(PositionIndex i)const
- {
- PositionIndex k=i+1;
- while(k<l+1&&fert(k)==0)
- k++;
- return k;
- }
- PositionIndex prev_in_cept(PositionIndex j)const
- {
- //PositionIndex k=j-1;
- //while(k&&a[k]!=a[j])
- //k--;
- //assert( als_j[j].prev==k );
- //assert(k);
- //return k;
- massert(als_j[j].prev==0||a[als_j[j].prev]==a[j]);
- return als_j[j].prev;
- }
- friend ostream &operator<<(ostream&out, const alignment&a);
- friend bool operator==(const alignment&a, const alignment&b)
- {
- massert(a.a.size()==b.a.size());
- for(PositionIndex j=1;j<=a.get_m();j++)
- if(a(j)!=b(j))
- return 0;
- return 1;
- }
- friend bool operator<(const alignment&x, const alignment&y)
- {
- massert(x.get_m()==y.get_m());
- for(PositionIndex j=1;j<=x.get_m();j++)
- if( x(j)<y(j) )
- return 1;
- else if( y(j)<x(j) )
- return 0;
- return 0;
- }
- friend int differences(const alignment&x, const alignment&y){
- int count=0;
- massert(x.get_m()==y.get_m());
- for(PositionIndex j=1;j<=x.get_m();j++)
- count += (x(j)!=y(j));
- return count;
- }
- bool valid()const
- {
- if( 2*f[0]>m )
- return 0;
- for(unsigned int i=1;i<=l;i++)
- if( f[i]>=MAX_FERTILITY )
- return 0;
- return 1;
- }
- friend class transpair_model5;
-};
-#endif
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ef91969a/ext/giza-pp/GIZA++-v2/collCounts.cpp
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/collCounts.cpp b/ext/giza-pp/GIZA++-v2/collCounts.cpp
deleted file mode 100644
index 6e6ef69..0000000
--- a/ext/giza-pp/GIZA++-v2/collCounts.cpp
+++ /dev/null
@@ -1,293 +0,0 @@
-/*
-
-Copyright (C) 1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
-
-This file is part of GIZA++ ( extension of GIZA ).
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-#include "alignment.h"
-#include "transpair_model3.h"
-#include <map>
-#include "collCounts.h"
-#include "MoveSwapMatrix.h"
-#include "D5Tables.h"
-#include "transpair_model5.h"
-#include "transpair_modelhmm.h"
-#include "Parameter.h"
-
-extern float COUNTINCREASE_CUTOFF_AL;
-// unifies collectCountsOverAlignments and findAlignmentNeighborhood FJO-20/07/99
-template<class TRANSPAIR>
-int collectCountsOverNeighborhood(const MoveSwapMatrix<TRANSPAIR>&msc,LogProb ascore,Array2<LogProb,Vector<LogProb> >&dtcount,Array2<LogProb,Vector<LogProb> >&ncount,LogProb&p1count,LogProb&p0count,LogProb&total_count)
-{
- int nAl=0;
- const PositionIndex l=msc.get_l(),m=msc.get_m();
- Array2<LogProb,Vector<LogProb> > cmove(l+1,m+1),cswap(l+1,m+1);
- Vector<LogProb> negmove(m+1),negswap(m+1),plus1fert(l+1),minus1fert(l+1);
- LogProb total_move,total_swap;
- if( msc.isCenterDeleted()==0 )
- {
- total_move+=ascore;
- nAl++;
- }
- for(PositionIndex j=1;j<=m;j++)
- for(PositionIndex i=0;i<=l;i++)
- if( msc(j)!=i && !msc.isDelMove(i,j) )
- {
- LogProb newscore=ascore*msc.cmove(i,j);
- total_move+=newscore;
- nAl++;
- cmove(i,j)+=newscore;
- negmove[j]+=newscore;
- plus1fert[i]+=newscore;
- minus1fert[msc(j)]+=newscore;
- }
- for(PositionIndex j1=1;j1<=m;j1++)
- for(PositionIndex j2=j1+1;j2<=m;j2++)
- if( msc(j1)!=msc(j2) && !msc.isDelSwap(j1,j2) )
- {
- LogProb newscore=ascore*msc.cswap(j1,j2);
- total_swap+=newscore;
- nAl++;
- cswap(msc(j1),j2)+=newscore;
- cswap(msc(j2),j1)+=newscore;
- negswap[j1]+=newscore;
- negswap[j2]+=newscore;
- }
- total_count+=total_move+total_swap;
- for(PositionIndex j=1;j<=m;j++)
- for(PositionIndex i=0;i<=l;i++)
- dtcount(i,j) += ((i==msc(j)) ? (total_count-(negmove[j]+negswap[j])) : (cswap(i,j)+cmove(i,j)));
- for(PositionIndex i=1;i<=l;i++)
- {
- LogProb temp=minus1fert[i]+plus1fert[i];
- if( msc.fert(i)<MAX_FERTILITY )
- ncount(i,msc.fert(i))+=total_count-temp;
- if(msc.fert(i)>0&&msc.fert(i)-1<MAX_FERTILITY)
- ncount(i,msc.fert(i)-1)+=minus1fert[i];
- else
- if( minus1fert[i]!=0.0 )
- cerr << "ERROR: M1Fa: " << minus1fert[i] << ' ' << i << ' ' << msc.fert(i)<< endl;
- if(msc.fert(i)+1<MAX_FERTILITY)
- ncount(i,msc.fert(i)+1)+=plus1fert[i];
- }
- LogProb temp=minus1fert[0]+plus1fert[0];
- p1count += (total_count-temp)*(LogProb)msc.fert(0);
- p0count += (total_count-temp)*(LogProb)(m-2*msc.fert(0));
- if( msc.fert(0)>0 )
- {
- p1count += (minus1fert[0])*(LogProb)(msc.fert(0)-1);
- p0count += (minus1fert[0])*(LogProb)(m-2*(msc.fert(0)-1));
- }
- else
- if( minus1fert[0]!=0.0 )
- cerr << "ERROR: M1Fb: " << minus1fert[0] << endl;
- if(int(m)-2*(int(msc.fert(0))+1)>=0)
- {
- p1count += (plus1fert[0])*(LogProb)(msc.fert(0)+1);
- p0count += (plus1fert[0])*(LogProb)(m-2*(msc.fert(0)+1));
- }
- msc.check();
- return nAl;
-};
-
-template<class TRANSPAIR>
-double collectCountsOverNeighborhoodForSophisticatedModels(const MoveSwapMatrix<TRANSPAIR>&,LogProb,void*)
-{
- return 0.0;
-}
-
-template<class TRANSPAIR>
-void _collectCountsOverNeighborhoodForSophisticatedModels(const MoveSwapMatrix<TRANSPAIR>&Mmsc,const alignment&msc,const TRANSPAIR&ef,LogProb normalized_ascore,d4model*d4Table)
-{
- Mmsc.check();
- const PositionIndex m=msc.get_m(),l=msc.get_l();
- for(PositionIndex j=1;j<=m;++j)
- if( msc(j)!=0 )
- if( msc.get_head(msc(j))==j)
- {
- int ep=msc.prev_cept(msc(j));
- //massert( &d4Table->getCountRef_first(j,msc.get_center(ep),d4Table->ewordclasses.getClass(ef.get_es(ep)),d4Table->fwordclasses.getClass(ef.get_fs(j)),l,m) == ef.getCountFirst(ep,j,msc.get_center(ep)));
- d4Table->getCountRef_first(j,msc.get_center(ep),d4Table->ewordclasses.getClass(ef.get_es(ep)),d4Table->fwordclasses.getClass(ef.get_fs(j)),l,m)+=normalized_ascore;
- }
- else
- {
- //massert( &d4Table->getCountRef_bigger(j,msc.prev_in_cept(j),0,d4Table->fwordclasses.getClass(ef.get_fs(j)),l,m) == ef.getCountSecond(j,msc.prev_in_cept(j) ));
- d4Table->getCountRef_bigger(j,msc.prev_in_cept(j),0,d4Table->fwordclasses.getClass(ef.get_fs(j)),l,m)+=normalized_ascore;
- }
-}
-
-template<class TRANSPAIR>
-void _collectCountsOverNeighborhoodForSophisticatedModels(const MoveSwapMatrix<TRANSPAIR>&Mmsc,const alignment&msc,const TRANSPAIR&ef,LogProb normalized_ascore,d5model*d5Table)
-{
- Mmsc.check();
- _collectCountsOverNeighborhoodForSophisticatedModels(Mmsc,msc,ef,normalized_ascore,&d5Table->d4m);
- Mmsc.check();
- const PositionIndex m=msc.get_m(),l=msc.get_l();
- PositionIndex prev_cept=0;
- PositionIndex vac_all=m;
- Vector<char> vac(m+1,0);
- for(PositionIndex i=1;i<=l;i++)
- {
- PositionIndex cur_j=msc.als_i[i];
- PositionIndex prev_j=0;
- PositionIndex k=0;
- if(cur_j) { // process first word of cept
- k++;
- d5Table->getCountRef_first(vacancies(vac,cur_j),vacancies(vac,msc.get_center(prev_cept)),
- d5Table->fwordclasses.getClass(ef.get_fs(cur_j)),l,m,vac_all-msc.fert(i)+k)+=normalized_ascore;
- vac_all--;
- assert(vac[cur_j]==0);
- vac[cur_j]=1;
- Mmsc.check();
- prev_j=cur_j;
- cur_j=msc.als_j[cur_j].next;
- }
- while(cur_j) { // process following words of cept
- k++;
- int vprev=vacancies(vac,prev_j);
- d5Table->getCountRef_bigger(vacancies(vac,cur_j),vprev,d5Table->fwordclasses.getClass(ef.get_fs(cur_j)),l,m,vac_all-vprev/*war weg*/-msc.fert(i)+k)+=normalized_ascore;
- vac_all--;
- vac[cur_j]=1;
- Mmsc.check();
- prev_j=cur_j;
- cur_j=msc.als_j[cur_j].next;
- }
- assert(k==msc.fert(i));
- if( k )
- prev_cept=i;
- }
- assert(vac_all==msc.fert(0));
-}
-
-extern int NumberOfAlignmentsInSophisticatedCountCollection;
-
-template<class TRANSPAIR,class MODEL>
-double collectCountsOverNeighborhoodForSophisticatedModels(const MoveSwapMatrix<TRANSPAIR>&msc,LogProb normalized_ascore,MODEL*d5Table)
-{
- const PositionIndex m=msc.get_m(),l=msc.get_l();
- alignment x(msc);
- double sum=0;
- msc.check();
- if( !msc.isCenterDeleted() )
- {
- _collectCountsOverNeighborhoodForSophisticatedModels<TRANSPAIR>(msc,x,msc.get_ef(),normalized_ascore,d5Table);
- NumberOfAlignmentsInSophisticatedCountCollection++;
- sum+=normalized_ascore;
- }
- msc.check();
- for(WordIndex j=1;j<=m;j++)for(WordIndex i=0;i<=l;i++)
- {
- WordIndex old=x(j);
- if( i!=old&& !msc.isDelMove(i,j) )
- {
- msc.check();
- double c=msc.cmove(i,j)*normalized_ascore;
- if(c > COUNTINCREASE_CUTOFF_AL )
- {
- x.set(j,i);
- _collectCountsOverNeighborhoodForSophisticatedModels<TRANSPAIR>(msc,x,msc.get_ef(),c,d5Table);
- NumberOfAlignmentsInSophisticatedCountCollection++;
- x.set(j,old);
- sum+=c;
- }
- msc.check();
- }
- }
- for(PositionIndex j1=1;j1<=m;j1++)
- for(PositionIndex j2=j1+1;j2<=m;j2++)
- if( msc(j1)!=msc(j2) && !msc.isDelSwap(j1,j2) )
- {
- double c=msc.cswap(j1,j2)*normalized_ascore;
- msc.check();
- if(c > COUNTINCREASE_CUTOFF_AL )
- {
- int old1=msc(j1),old2=msc(j2);
- x.set(j1,old2);
- x.set(j2,old1);
- _collectCountsOverNeighborhoodForSophisticatedModels<TRANSPAIR>(msc,x,msc.get_ef(),c,d5Table);
- NumberOfAlignmentsInSophisticatedCountCollection++;
- x.set(j1,old1);
- x.set(j2,old2);
- sum+=c;
- }
- msc.check();
- }
- msc.check();
- return sum;
-}
-
-template<class TRANSPAIR,class MODEL>
-int collectCountsOverNeighborhood(const Vector<pair<MoveSwapMatrix<TRANSPAIR>*,LogProb> >&smsc,Vector<WordIndex>&es,Vector<WordIndex>&fs,tmodel<COUNT,PROB>&tTable,amodel<COUNT>&aCountTable,amodel<COUNT>&dCountTable,nmodel<COUNT>&nCountTable,double&p1count,double&p0count,LogProb&_total,float count,bool addCounts,MODEL*d4Table)
-{
- int nAl=0;
- const PositionIndex l=es.size()-1,m=fs.size()-1;
- Array2<LogProb,Vector<LogProb> > dtcount(l+1,m+1),ncount(l+1,MAX_FERTILITY+1);
- LogProb p0=0,p1=0,all_total=0;
- for(unsigned int i=0;i<smsc.size();++i)
- {
- LogProb this_total=0;
- nAl+=collectCountsOverNeighborhood(*smsc[i].first,smsc[i].second,dtcount,ncount,p1,p0,this_total);
- all_total+=this_total;
- }
- _total=all_total;
- all_total/=(double)count;
- double sum2=0;
- if( addCounts && d4Table )
- {
- for(unsigned int i=0;i<smsc.size();++i)
- {
- //for(WordIndex j=1;j<=m;j++)for(WordIndex ii=0;ii<=l;ii++)
- // (*smsc[i].first).cmove(ii,j);
- sum2+=collectCountsOverNeighborhoodForSophisticatedModels(*smsc[i].first,smsc[i].second/all_total,d4Table);
- }
- if(!(fabs(count-sum2)<0.05))
- cerr << "WARNING: DIFFERENT SUMS: (" << count << ") (" << sum2 << ")\n";
- }
- if( addCounts )
- {
- for(PositionIndex i=0;i<=l;i++)
- {
- for(PositionIndex j=1;j<=m;j++)
- {
- LogProb ijadd=dtcount(i,j)/all_total;
- if( ijadd>COUNTINCREASE_CUTOFF_AL )
- {
- tTable.incCount(es[i],fs[j],ijadd);
- dCountTable.getRef(j,i,l,m)+=ijadd;
- aCountTable.getRef(i,j,l,m)+=ijadd;
- }
- }
- if( i>0 )
- for(PositionIndex n=0;n<MAX_FERTILITY;n++)
- nCountTable.getRef(es[i],n)+=ncount(i,n)/all_total;
- }
- p0count+=p0/all_total;
- p1count+=p1/all_total;
- }
- return nAl;
-}
-
-
-
-
-
-
-
-
-
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ef91969a/ext/giza-pp/GIZA++-v2/collCounts.h
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/collCounts.h b/ext/giza-pp/GIZA++-v2/collCounts.h
deleted file mode 100644
index 9a0529b..0000000
--- a/ext/giza-pp/GIZA++-v2/collCounts.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
-
-Copyright (C) 1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
-
-This file is part of GIZA++ ( extension of GIZA ).
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-#ifndef collCounts_h_defined
-#define collCounts_h_defined
-#include "alignment.h"
-#include "transpair_model3.h"
-#include <map>
-#include "MoveSwapMatrix.h"
-#include "D4Tables.h"
-#include "transpair_model4.h"
-
-class OneMoveSwap
-{
- public:
- short type;
- short a,b;
- OneMoveSwap(short _type,short _a,short _b)
- : type(_type),a(_a),b(_b)
- {}
- OneMoveSwap()
- : type(0){}
-};
-
-inline bool operator<(const OneMoveSwap&a,const OneMoveSwap&b)
-{
- if(a.type<b.type)return 1;
- else if(b.type<a.type)return 0;
- else if(a.a<b.a)return 1;
- else if(b.a<a.a)return 0;
- else return a.b<b.b;
-}
-
-inline bool operator==(const OneMoveSwap&a,const OneMoveSwap&b)
-{
- return a.type==b.type&&a.a==b.a&&a.b==b.b;
-}
-
-inline ostream&operator<<(ostream&out,const OneMoveSwap&o)
-{
- return out << '(' << o.type << "," << o.a << "," << o.b << ")";
-}
-
-inline ostream &operator<<(ostream &out,const set<OneMoveSwap>&s)
-{
- for(set<OneMoveSwap>::const_iterator i=s.begin();i!=s.end();++i)
- cout << *i << ' ';
- return out;
-}
-
-bool makeOneMoveSwap(const alignment&a,const alignment&b,set<OneMoveSwap>&oms);
-
-template<class TRANSPAIR,class MODEL>
-int collectCountsOverNeighborhood(const Vector<pair<MoveSwapMatrix<TRANSPAIR>*,LogProb> >&smsc,
- Vector<WordIndex>&es,
- Vector<WordIndex>&fs,tmodel<COUNT,PROB>&tTable,
- amodel<COUNT>&aCountTable,amodel<COUNT>&dCountTable,
- nmodel<COUNT>&nCountTable,double&p1count,double&p0count,
- LogProb&_total,float count,bool addCounts,MODEL*d4Table=0);
-
-#endif
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ef91969a/ext/giza-pp/GIZA++-v2/defs.h
----------------------------------------------------------------------
diff --git a/ext/giza-pp/GIZA++-v2/defs.h b/ext/giza-pp/GIZA++-v2/defs.h
deleted file mode 100644
index e94addd..0000000
--- a/ext/giza-pp/GIZA++-v2/defs.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
-
-EGYPT Toolkit for Statistical Machine Translation
-Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
-
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License
-as published by the Free Software Foundation; either version 2
-of the License, or (at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
-USA.
-
-*/
-#ifndef _defs_h
-#define _defs_h 1
-#include <string>
-#include <math.h>
-#include <limits.h>
-
-const int TRANSFER_SIMPLE=1;
-const int TRANSFER=0;
-
-const unsigned int MAX_SENTENCE_LENGTH_ALLOWED=101;
-const int TRAIN_BUFFER_SIZE= 50000;
-//#ifdef WORDINDEX_WITH_4_BYTE
-typedef unsigned int WordIndex;
-const unsigned int MAX_VOCAB_SIZE=UINT_MAX;
-typedef unsigned int PositionIndex;
-//#else
-//typedef unsigned short WordIndex;
-//const unsigned int MAX_VOCAB_SIZE=USHRT_MAX;
-//typedef unsigned short PositionIndex;
-//#endif
-extern WordIndex MAX_FERTILITY;
-
-const int MAX_W=457979;
-extern double LAMBDA; // Lambda that is used to scale cross_entropy factor
-
-typedef float PROB ;
-typedef float COUNT ;
-
-class LogProb {
- private:
- double x ;
- public:
- LogProb():x(0){}
- LogProb(double y):x(y){}
- LogProb(float y):x(y){}
- LogProb(int y):x(y){}
- LogProb(WordIndex y):x(y){}
- operator double() const {return x;}
- LogProb operator *= (double y) { x *= y ; return *this;}
- LogProb operator *= (LogProb y) { x *= y.x ; return *this;}
- LogProb operator /= (double y) { x /= y ; return *this;}
- LogProb operator /= (LogProb y) { x /= y.x ; return *this;}
- LogProb operator += (double y) { x += y ; return *this;}
- LogProb operator += (LogProb y) { x += y.x ; return *this;}
-};
-
-const int PARLEV_ITER=1;
-const int PARLEV_OPTHEUR=2;
-const int PARLEV_OUTPUT=3;
-const int PARLEV_SMOOTH=4;
-const int PARLEV_EM=5;
-const int PARLEV_MODELS=6;
-const int PARLEV_SPECIAL=7;
-const int PARLEV_INPUT=8;
-
-#endif
-