You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by le...@apache.org on 2016/05/16 06:26:59 UTC
[43/66] [partial] incubator-joshua git commit: JOSHUA-252 Make it possible to use Maven to build Joshua

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java b/src/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
deleted file mode 100644
index f07b668..0000000
--- a/src/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.lm;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.concurrent.ConcurrentHashMap;
-
-import joshua.corpus.Vocabulary;
-import joshua.decoder.JoshuaConfiguration;
-import joshua.decoder.chart_parser.SourcePath;
-import joshua.decoder.ff.FeatureVector;
-import joshua.decoder.ff.lm.KenLM;
-import joshua.decoder.ff.lm.KenLM.StateProbPair;
-import joshua.decoder.ff.state_maintenance.DPState;
-import joshua.decoder.ff.state_maintenance.KenLMState;
-import joshua.decoder.ff.tm.Rule;
-import joshua.decoder.hypergraph.HGNode;
-import joshua.decoder.segment_file.Sentence;
-
-/**
- * Wrapper for KenLM LMs with left-state minimization. We inherit from the regular
- * 
- * @author Matt Post <po...@cs.jhu.edu>
- * @author Juri Ganitkevitch <ju...@cs.jhu.edu>
- */
-public class StateMinimizingLanguageModel extends LanguageModelFF {
-
-  // maps from sentence numbers to KenLM-side pools used to allocate state
-  private static final ConcurrentHashMap<Integer, Long> poolMap = new ConcurrentHashMap<Integer, Long>();
-
-  public StateMinimizingLanguageModel(FeatureVector weights, String[] args, JoshuaConfiguration config) {
-    super(weights, args, config);
-    this.type = "kenlm";
-    if (parsedArgs.containsKey("lm_type") && ! parsedArgs.get("lm_type").equals("kenlm")) {
-      System.err.println("* FATAL: StateMinimizingLanguageModel only supports 'kenlm' lm_type backend");
-      System.err.println("*        Remove lm_type from line or set to 'kenlm'");
-      System.exit(-1);
-    }
-  }
-  
-  @Override
-  public ArrayList<String> reportDenseFeatures(int index) {
-    denseFeatureIndex = index;
-    
-    ArrayList<String> names = new ArrayList<String>();
-    names.add(name);
-    return names;
-  }
-
-  /**
-   * Initializes the underlying language model.
-   * 
-   * @param config
-   * @param type
-   * @param path
-   */
-  @Override
-  public void initializeLM() {
-    
-    // Override type (only KenLM supports left-state minimization)
-    this.languageModel = new KenLM(ngramOrder, path);
-
-    Vocabulary.registerLanguageModel(this.languageModel);
-    Vocabulary.id(config.default_non_terminal);
-    
-  }
-  
-  /**
-   * Estimates the cost of a rule. We override here since KenLM can do it more efficiently
-   * than the default {@link LanguageModelFF} class.
-   *    
-   * Most of this function implementation is redundant with compute().
-   */
-  @Override
-  public float estimateCost(Rule rule, Sentence sentence) {
-    
-    int[] ruleWords = rule.getEnglish();
-
-    // The IDs we'll pass to KenLM
-    long[] words = new long[ruleWords.length];
-
-    for (int x = 0; x < ruleWords.length; x++) {
-      int id = ruleWords[x];
-
-      if (Vocabulary.nt(id)) {
-        // For the estimate, we can just mark negative values
-        words[x] = -1;
-
-      } else {
-        // Terminal: just add it
-        words[x] = id;
-      }
-    }
-    
-    // Get the probability of applying the rule and the new state
-    return weight * ((KenLM) languageModel).estimateRule(words);
-  }
-  
-  /**
-   * Computes the features incurred along this edge. Note that these features are unweighted costs
-   * of the feature; they are the feature cost, not the model cost, or the inner product of them.
-   */
-  @Override
-  public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
-      Sentence sentence, Accumulator acc) {
-
-    int[] ruleWords = config.source_annotations 
-        ? getTags(rule, i, j, sentence)
-        : rule.getEnglish();
-
-    // The IDs we'll pass to KenLM
-    long[] words = new long[ruleWords.length];
-
-    for (int x = 0; x < ruleWords.length; x++) {
-      int id = ruleWords[x];
-
-      if (Vocabulary.nt(id)) {
-        // Nonterminal: retrieve the KenLM long that records the state
-        int index = -(id + 1);
-        KenLMState state = (KenLMState) tailNodes.get(index).getDPState(stateIndex);
-        words[x] = -state.getState();
-
-      } else {
-        // Terminal: just add it
-        words[x] = id;
-      }
-    }
-    
-    int sentID = sentence.id();
-    // Since sentId is unique across threads, next operations are safe, but not atomic!
-    if (!poolMap.containsKey(sentID)) {
-      poolMap.put(sentID, KenLM.createPool());
-    }
-
-    // Get the probability of applying the rule and the new state
-    StateProbPair pair = ((KenLM) languageModel).probRule(words, poolMap.get(sentID));
-
-    // Record the prob
-//    acc.add(name, pair.prob);
-    acc.add(denseFeatureIndex, pair.prob);
-
-    // Return the state
-    return pair.state;
-  }
-
-  /**
-   * Destroys the pool created to allocate state for this sentence. Called from the
-   * {@link joshua.decoder.Translation} class after outputting the sentence or k-best list. Hosting
-   * this map here in KenLMFF statically allows pools to be shared across KenLM instances.
-   * 
-   * @param sentId
-   */
-  public void destroyPool(int sentId) {
-    if (poolMap.containsKey(sentId))
-      KenLM.destroyPool(poolMap.get(sentId));
-    poolMap.remove(sentId);
-  }
-
-  /**
-   * This function differs from regular transitions because we incorporate the cost of incomplete
-   * left-hand ngrams, as well as including the start- and end-of-sentence markers (if they were
-   * requested when the object was created).
-   * 
-   * KenLM already includes the prefix probabilities (of shorter n-grams on the left-hand side), so
-   * there's nothing that needs to be done.
-   */
-  @Override
-  public DPState computeFinal(HGNode tailNode, int i, int j, SourcePath sourcePath, Sentence sentence,
-      Accumulator acc) {
-
-    // KenLMState state = (KenLMState) tailNode.getDPState(getStateIndex());
-
-    // This is unnecessary
-    // acc.add(name, 0.0f);
-
-    // The state is the same since no rule was applied
-    return new KenLMState();
-  }
-
-  /**
-   * KenLM probs already include the prefix probabilities (they are substracted out when merging
-   * states), so this doesn't need to do anything.
-   */
-  @Override
-  public float estimateFutureCost(Rule rule, DPState currentState, Sentence sentence) {
-    return 0.0f;
-  }
-}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/lm/berkeley_lm/LICENSE
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/lm/berkeley_lm/LICENSE b/src/joshua/decoder/ff/lm/berkeley_lm/LICENSE
deleted file mode 100644
index 2aaeb08..0000000
--- a/src/joshua/decoder/ff/lm/berkeley_lm/LICENSE
+++ /dev/null
@@ -1,13 +0,0 @@
-Copyright 2013 University of California, Berkeley
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeley.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeley.java b/src/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeley.java
deleted file mode 100644
index 2716576..0000000
--- a/src/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeley.java
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.lm.berkeley_lm;
-
-import java.io.File;
-import java.util.Arrays;
-import java.util.logging.Handler;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-
-import com.google.common.annotations.VisibleForTesting;
-
-import joshua.corpus.Vocabulary;
-import joshua.decoder.ff.lm.DefaultNGramLanguageModel;
-import joshua.decoder.Decoder;
-import edu.berkeley.nlp.lm.ArrayEncodedNgramLanguageModel;
-import edu.berkeley.nlp.lm.ConfigOptions;
-import edu.berkeley.nlp.lm.StringWordIndexer;
-import edu.berkeley.nlp.lm.WordIndexer;
-import edu.berkeley.nlp.lm.cache.ArrayEncodedCachingLmWrapper;
-import edu.berkeley.nlp.lm.io.LmReaders;
-import edu.berkeley.nlp.lm.util.StrUtils;
-
-/**
- * This class wraps Berkeley LM.
- *
- * @author adpauls@gmail.com
- */
-public class LMGrammarBerkeley extends DefaultNGramLanguageModel {
-
-  private ArrayEncodedNgramLanguageModel<String> lm;
-
-  private static final Logger logger = Logger.getLogger(LMGrammarBerkeley.class.getName());
-
-  private int[] vocabIdToMyIdMapping;
-
-  private ThreadLocal<int[]> arrayScratch = new ThreadLocal<int[]>() {
-
-    @Override
-    protected int[] initialValue() {
-      return new int[5];
-    }
-  };
-
-  private int mappingLength = 0;
-
-  private final int unkIndex;
-
-  private static boolean logRequests = false;
-
-  private static Handler logHandler = null;
-
-  public LMGrammarBerkeley(int order, String lm_file) {
-    super(order);
-    vocabIdToMyIdMapping = new int[10];
-
-    if (!new File(lm_file).exists()) {
-      System.err.println("Can't read lm_file '" + lm_file + "'");
-      System.exit(1);
-    }
-
-    if (logRequests) {
-      logger.addHandler(logHandler);
-      logger.setLevel(Level.FINEST);
-      logger.setUseParentHandlers(false);
-    }
-
-    try { // try binary format (even gzipped)
-      lm = (ArrayEncodedNgramLanguageModel<String>) LmReaders.<String>readLmBinary(lm_file);
-      Decoder.LOG(1, "Loading Berkeley LM from binary " + lm_file);
-    } catch (RuntimeException e) {
-      ConfigOptions opts = new ConfigOptions();
-      Decoder.LOG(1, "Loading Berkeley LM from ARPA file " + lm_file);
-      final StringWordIndexer wordIndexer = new StringWordIndexer();
-      ArrayEncodedNgramLanguageModel<String> berkeleyLm =
-          LmReaders.readArrayEncodedLmFromArpa(lm_file, false, wordIndexer, opts, order);
-
-      lm = ArrayEncodedCachingLmWrapper.wrapWithCacheThreadSafe(berkeleyLm);
-    }
-    this.unkIndex = lm.getWordIndexer().getOrAddIndex(lm.getWordIndexer().getUnkSymbol());
-  }
-
-  @Override
-  public boolean registerWord(String token, int id) {
-    int myid = lm.getWordIndexer().getIndexPossiblyUnk(token);
-    if (myid < 0) return false;
-    if (id >= vocabIdToMyIdMapping.length) {
-      vocabIdToMyIdMapping =
-          Arrays.copyOf(vocabIdToMyIdMapping, Math.max(id + 1, vocabIdToMyIdMapping.length * 2));
-
-    }
-    mappingLength = Math.max(mappingLength, id + 1);
-    vocabIdToMyIdMapping[id] = myid;
-
-    return false;
-  }
-
-  @Override
-  public float sentenceLogProbability(int[] sentence, int order, int startIndex) {
-    if (sentence == null) return 0;
-    int sentenceLength = sentence.length;
-    if (sentenceLength <= 0) return 0;
-
-    float probability = 0;
-    // partial ngrams at the begining
-    for (int j = startIndex; j < order && j <= sentenceLength; j++) {
-      // TODO: startIndex dependens on the order, e.g., this.ngramOrder-1 (in srilm, for 3-gram lm,
-      // start_index=2. othercase, need to check)
-      double logProb = ngramLogProbability_helper(sentence, 0, j, false);
-      if (logger.isLoggable(Level.FINE)) {
-        int[] ngram = Arrays.copyOfRange(sentence, 0, j);
-        String words = Vocabulary.getWords(ngram);
-        logger.fine("\tlogp ( " + words + " )  =  " + logProb);
-      }
-      probability += logProb;
-    }
-
-    // regular-order ngrams
-    for (int i = 0; i <= sentenceLength - order; i++) {
-      double logProb =  ngramLogProbability_helper(sentence, i, order, false);
-      if (logger.isLoggable(Level.FINE)) {
-        int[] ngram = Arrays.copyOfRange(sentence, i, i + order);
-        String words = Vocabulary.getWords(ngram);
-        logger.fine("\tlogp ( " + words + " )  =  " + logProb);
-      }
-      probability += logProb;
-    }
-
-    return probability;
-  }
-
-  @Override
-  public float ngramLogProbability_helper(int[] ngram, int order) {
-    return ngramLogProbability_helper(ngram, false);
-  }
-
-  protected float ngramLogProbability_helper(int[] ngram, boolean log) {
-    return ngramLogProbability_helper(ngram, 0, ngram.length, log);
-  }
-
-  protected float ngramLogProbability_helper(int sentence[], int ngramStartPos, int ngramLength, boolean log) {
-    int[] mappedNgram = arrayScratch.get();
-    if (mappedNgram.length < ngramLength) {
-      mappedNgram = new int[mappedNgram.length * 2];
-      arrayScratch.set(mappedNgram);
-    }
-    for (int i = 0; i < ngramLength; ++i) {
-      mappedNgram[i] = vocabIdToMyIdMapping[sentence[ngramStartPos + i]];
-    }
-
-    if (log && logRequests) {
-      dumpBuffer(mappedNgram, ngramLength);
-    }
-
-    return lm.getLogProb(mappedNgram, 0, ngramLength);
-  }
-
-  public static void setLogRequests(Handler handler) {
-    logRequests = true;
-    logHandler = handler;
-  }
-
-  @Override
-  public float ngramLogProbability(int[] ngram) {
-    return ngramLogProbability_helper(ngram,true);
-  }
-
-  @Override
-  public float ngramLogProbability(int[] ngram, int order) {
-    return ngramLogProbability(ngram);
-  }
-
-  private void dumpBuffer(int[] buffer, int len) {
-    final int[] copyOf = Arrays.copyOf(buffer, len);
-    for (int i = 0; i < copyOf.length; ++i) {
-      if (copyOf[i] < 0) {
-        copyOf[i] = unkIndex;
-      }
-    }
-    logger.finest(StrUtils.join(WordIndexer.StaticMethods.toList(lm.getWordIndexer(), copyOf)));
-  }
-
-  @VisibleForTesting
-  ArrayEncodedNgramLanguageModel<String> getLM() {
-    return lm;
-  }
-}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/lm/berkeley_lm/README
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/lm/berkeley_lm/README b/src/joshua/decoder/ff/lm/berkeley_lm/README
deleted file mode 100644
index 82bb473..0000000
--- a/src/joshua/decoder/ff/lm/berkeley_lm/README
+++ /dev/null
@@ -1,5 +0,0 @@
-To build a binary for Berkeley LM, you need to do the following:
-
-java -cp [berkelylm jar file] -server -mx[lots of memory] edu.berkeley.nlp.lm.io.MakeLmBinaryFromArpa [ARPA file] [output file]
-
-Both input and output will be appropriately GZipped if they have a .gz extension. Note that MakeLmBinaryFromArpa has options for e.g. enabling compression. 

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/lm/berkeley_lm/SymbolTableWrapper.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/lm/berkeley_lm/SymbolTableWrapper.java b/src/joshua/decoder/ff/lm/berkeley_lm/SymbolTableWrapper.java
deleted file mode 100644
index a45dd7f..0000000
--- a/src/joshua/decoder/ff/lm/berkeley_lm/SymbolTableWrapper.java
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.lm.berkeley_lm;
-
-import joshua.corpus.Vocabulary;
-import edu.berkeley.nlp.lm.WordIndexer;
-
-class SymbolTableWrapper implements WordIndexer<String> {
-  /**
-	 * 
-	 */
-  private static final long serialVersionUID = 1L;
-
-  private String startSymbol;
-
-  private String endSymbol;
-
-  private String unkSymbol;
-
-  int size = -1;
-
-  public SymbolTableWrapper() {
-
-  }
-
-  @Override
-  public int getOrAddIndex(String word) {
-    return Vocabulary.id(word);
-  }
-
-  @Override
-  public int getOrAddIndexFromString(String word) {
-    return Vocabulary.id(word);
-  }
-
-  @Override
-  public String getWord(int index) {
-    return Vocabulary.word(index);
-  }
-
-  @Override
-  public int numWords() {
-    return Vocabulary.size();
-  }
-
-  @Override
-  public String getStartSymbol() {
-    return startSymbol;
-  }
-
-  @Override
-  public String getEndSymbol() {
-    return endSymbol;
-  }
-
-  @Override
-  public String getUnkSymbol() {
-    return unkSymbol;
-  }
-
-  @Override
-  public void setStartSymbol(String sym) {
-    startSymbol = sym;
-  }
-
-  @Override
-  public void setEndSymbol(String sym) {
-    endSymbol = sym;
-  }
-
-  @Override
-  public void setUnkSymbol(String sym) {
-    unkSymbol = sym;
-  }
-
-  @Override
-  public void trimAndLock() {
-
-  }
-
-  @Override
-  public int getIndexPossiblyUnk(String word) {
-    return Vocabulary.id(word);
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilter.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilter.java b/src/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilter.java
deleted file mode 100644
index 7f0b6a4..0000000
--- a/src/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilter.java
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.lm.bloomfilter_lm;
-
-import java.io.Externalizable;
-import java.io.IOException;
-import java.io.ObjectInput;
-import java.io.ObjectOutput;
-import java.math.BigInteger;
-import java.util.BitSet;
-import java.util.Random;
-
-/**
- * A Bloom filter: a lossy data structure for set representation. A Bloom filter consists of a bit
- * set and a set of hash functions. A Bloom filter has two operations: add and query. We can add an
- * object to a Bloom filter to indicate that it should be considered part of the set that the Bloom
- * filter represents. We can query the Bloom filter to see if a given object is considered part of
- * its set.
- * <p>
- * An object is added by sending it through a number of hash functions, each of which returns an
- * index into the bit set. The bit at each of the indices is flipped on. We can query for an abject
- * by sending it through the same hash functions. Then we look the bit at each index that was
- * returned by a hash function. If any of the bits is unset, we know that the object is not in the
- * Bloom filter (for otherwise all the bits should have already been set). If all the bits are set,
- * we assume that the object is present in the Bloom filter.
- * <p>
- * We cannot know for sure that an object is in the bloom filter just because all its bits were set.
- * There may be many collisions in the hash space, and all the bits for some object might be set by
- * chance, rather than by adding that particular object.
- * <p>
- * The advantage of a Bloom filter is that its set representation can be stored in a significantly
- * smaller space than information-theoretic lossless lower bounds. The price we pay for this is a
- * certain amount of error in the query function. One nice feature of the Bloom filter is that its
- * error is one-sided. This means that while the query function may return false positives (saying
- * an object is present when it really isn't), it can never return false negatives (saying that an
- * object is not present when it was already added.
- */
-public class BloomFilter implements Externalizable {
-  /**
-   * The main bit set of the Bloom filter.
-   */
-  private BitSet bitSet;
-
-  /**
-   * The number of objects expected to be stored in the Bloom filter. The optimal number of hash
-   * functions depends on this number.
-   */
-  int expectedNumberOfObjects;
-
-  /**
-   * A prime number that should be bigger than the size of the bit set.
-   */
-  long bigPrime;
-
-  /**
-   * The size of the bit set, in bits.
-   */
-  int filterSize;
-
-  /**
-   * A random number generator for building hash functions.
-   */
-  transient private Random RANDOM = new Random();
-
-  /**
-   * Builds an empty Bloom filter, ready to build hash functions and store objects.
-   * 
-   * @param filterSize the size of Bloom filter to make, in bits
-   * @param expectedNumberOfObjects the number of objects expected to be stored in the Bloom filter
-   */
-  public BloomFilter(int filterSize, int expectedNumberOfObjects) {
-    bitSet = new BitSet(filterSize);
-    this.filterSize = filterSize;
-    this.expectedNumberOfObjects = expectedNumberOfObjects;
-    bigPrime = getPrimeLargerThan(filterSize);
-  }
-
-  /**
-   * Adds an item (represented by an integer) to the bloom filter.
-   * 
-   * @param objectToAdd the object to add
-   * @param hashFunctions an array of pairs of long, representing the hash functions to be used on
-   *        the object
-   */
-  public void add(int objectToAdd, long[][] hashFunctions) {
-    for (long[] h : hashFunctions) {
-      int i = hash(h, (long) objectToAdd);
-      bitSet.set(i);
-    }
-  }
-
-  public void add(long objectToAdd, long[][] hashFunctions) {
-    for (long[] h : hashFunctions) {
-      int i = hash(h, objectToAdd);
-      bitSet.set(i);
-    }
-  }
-
-  /**
-   * Determines whether an item (represented by an integer) is present in the bloom filter.
-   * 
-   * @param objectToQuery the object we want to query for membership
-   * @param hashFunctions an array of pairs of long, representing the hash functions to be used
-   * 
-   * @return true if the objects is assumed to be present in the Bloom filter, false if it is
-   *         definitely not present
-   */
-  public boolean query(int objectToQuery, long[][] hashFunctions) {
-    for (long[] h : hashFunctions) {
-      int i = hash(h, (long) objectToQuery);
-      if (!bitSet.get(i)) return false;
-    }
-    return true;
-  }
-
-  public boolean query(long objectToQuery, long[][] hashFunctions) {
-    for (long[] h : hashFunctions) {
-      int i = hash(h, objectToQuery);
-      if (!bitSet.get(i)) return false;
-    }
-    return true;
-  }
-
-  /**
-   * Builds an array of pairs of long that can be used as hash functions for this Bloom filter.
-   * 
-   * @return an array of pairs of long suitable for use as hash functions
-   */
-  public long[][] initializeHashFunctions() {
-    int numberOfHashFunctions;
-    int bigPrimeInt = (int) bigPrime;
-    numberOfHashFunctions =
-        (int) Math.floor(Math.log(2) * bitSet.length() / expectedNumberOfObjects);
-    if (numberOfHashFunctions == 0) numberOfHashFunctions = 1;
-    long[][] hashFunctions = new long[numberOfHashFunctions][2];
-    for (long[] h : hashFunctions) {
-      h[0] = (long) RANDOM.nextInt(bigPrimeInt) + 1;
-      h[1] = (long) RANDOM.nextInt(bigPrimeInt) + 1;
-    }
-    return hashFunctions;
-  }
-
-  /**
-   * Determines which bit of the bit set should be either set, for add operations, or checked, for
-   * query operations.
-   * 
-   * @param h a length-2 array of long used as a hash function
-   * @param objectToHash the object of interest
-   * 
-   * @return an index into the bit set of the Bloom filter
-   */
-  private int hash(long[] h, long objectToHash) {
-    long obj = (objectToHash < Integer.MAX_VALUE) ? objectToHash : objectToHash - bigPrime;
-    long h0 = h[0];
-    long h1 = (h[1] < (Long.MAX_VALUE / 2)) ? h[1] : h[1] - bigPrime;
-    long ret = (obj * h0) % bigPrime;
-    ret = (ret < (Long.MAX_VALUE / 2)) ? ret : ret - bigPrime;
-    return (int) (((ret + h1) % bigPrime) % (long) filterSize);
-  }
-
-  /**
-   * Finds a prime number that is larger than the given number. This is used to find bigPrime, a
-   * prime that has to be larger than the size of the Bloom filter.
-   * 
-   * @param n an integer
-   * 
-   * @return a prime number larger than n
-   */
-  private long getPrimeLargerThan(int n) {
-    BigInteger ret;
-    BigInteger maxLong = BigInteger.valueOf(Long.MAX_VALUE);
-    int numBits = BigInteger.valueOf(n).bitLength() + 1;
-    do {
-      ret = BigInteger.probablePrime(numBits, RANDOM);
-    } while (ret.compareTo(maxLong) > 1);
-    return ret.longValue();
-  }
-
-  /*
-   * functions for interface externalizable
-   */
-
-  public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
-    expectedNumberOfObjects = in.readInt();
-    filterSize = in.readInt();
-    bigPrime = in.readLong();
-    bitSet = (BitSet) in.readObject();
-  }
-
-  public void writeExternal(ObjectOutput out) throws IOException {
-    out.writeInt(expectedNumberOfObjects);
-    out.writeInt(filterSize);
-    out.writeLong(bigPrime);
-    out.writeObject(bitSet);
-  }
-
-  // only used for reconstruction via Externalizable
-  public BloomFilter() {}
-}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilterLanguageModel.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilterLanguageModel.java b/src/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilterLanguageModel.java
deleted file mode 100644
index c91fe38..0000000
--- a/src/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilterLanguageModel.java
+++ /dev/null
@@ -1,562 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.lm.bloomfilter_lm;
-
-import java.io.Externalizable;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.ObjectInput;
-import java.io.ObjectInputStream;
-import java.io.ObjectOutput;
-import java.io.ObjectOutputStream;
-import java.util.HashMap;
-import java.util.logging.Logger;
-import java.util.zip.GZIPInputStream;
-import java.util.zip.GZIPOutputStream;
-
-import joshua.corpus.Vocabulary;
-import joshua.decoder.ff.lm.DefaultNGramLanguageModel;
-import joshua.util.Regex;
-import joshua.util.io.LineReader;
-
-/**
- * An n-gram language model with linearly-interpolated Witten-Bell smoothing, using a Bloom filter
- * as its main data structure. A Bloom filter is a lossy data structure that can be used to test for
- * set membership.
- */
-public class BloomFilterLanguageModel extends DefaultNGramLanguageModel implements Externalizable {
-  /**
-   * An initial value used for hashing n-grams so that they can be stored in a bloom filter.
-   */
-  public static final int HASH_SEED = 17;
-
-  /**
-   * Another value used in the process of hashing n-grams.
-   */
-  public static final int HASH_OFFSET = 37;
-
-  /**
-   * The maximum score that a language model feature function can return to the Joshua decoder.
-   */
-  public static final double MAX_SCORE = 100.0;
-
-  /**
-   * The logger for this class.
-   */
-  public static final Logger logger = Logger.getLogger(BloomFilterLanguageModel.class.getName());
-
-  /**
-   * The Bloom filter data structure itself.
-   */
-  private BloomFilter bf;
-
-  /**
-   * The base of the logarithm used to quantize n-gram counts. N-gram counts are quantized
-   * logarithmically to reduce the number of times we need to query the Bloom filter.
-   */
-  private double quantizationBase;
-
-  /**
-   * Natural log of the number of tokens seen in the training corpus.
-   */
-  private double numTokens;
-
-  /**
-   * An array of pairs of long, used as hash functions for storing or retreiving the count of an
-   * n-gram in the Bloom filter.
-   */
-  private long[][] countFuncs;
-  /**
-   * An array of pairs of long, used as hash functions for storing or retreiving the number of
-   * distinct types observed after an n-gram.
-   */
-  private long[][] typesFuncs;
-
-  /**
-   * The smoothed probability of an unseen n-gram. This is also the probability of any n-gram under
-   * the zeroth-order model.
-   */
-  transient private double p0;
-
-  /**
-   * The interpolation constant between Witten-Bell models of order zero and one. Stored in a field
-   * because it can be calculated ahead of time; it doesn't depend on the particular n-gram.
-   */
-  transient private double lambda0;
-
-  /**
-   * The maximum possible quantized count of any n-gram stored in the Bloom filter. Used as an upper
-   * bound on the count that could be returned when querying the Bloom filter.
-   */
-  transient private int maxQ; // max quantized count
-
-  /**
-   * Constructor called from the Joshua decoder. This constructor assumes that the LM has already
-   * been built, and takes the name of the file where the LM is stored.
-   * 
-   * @param order the order of the language model
-   * @param filename path to the file where the language model is stored
-   */
-  public BloomFilterLanguageModel(int order, String filename) throws IOException {
-    super(order);
-    try {
-      readExternal(new ObjectInputStream(new GZIPInputStream(new FileInputStream(filename))));
-    } catch (ClassNotFoundException e) {
-      IOException ioe = new IOException("Could not rebuild bloom filter LM from file " + filename);
-      ioe.initCause(e);
-      throw ioe;
-    }
-
-    int vocabSize = Vocabulary.size();
-    p0 = -Math.log(vocabSize + 1);
-    double oneMinusLambda0 = numTokens - logAdd(Math.log(vocabSize), numTokens);
-    p0 += oneMinusLambda0;
-    lambda0 = Math.log(vocabSize) - logAdd(Math.log(vocabSize), numTokens);
-    maxQ = quantize((long) Math.exp(numTokens));
-  }
-
-  /**
-   * Constructor to be used by the main function. This constructor is used to build a new language
-   * model from scratch. An LM should be built with the main function before using it in the Joshua
-   * decoder.
-   * 
-   * @param filename path to the file of training corpus statistics
-   * @param order the order of the language model
-   * @param size the size of the Bloom filter, in bits
-   * @param base a double. The base of the logarithm for quantization.
-   */
-  private BloomFilterLanguageModel(String filename, int order, int size, double base) {
-    super(order);
-    quantizationBase = base;
-    populateBloomFilter(size, filename);
-  }
-
-  /**
-   * calculates the linearly-interpolated Witten-Bell probability for a given ngram. this is
-   * calculated as: p(w|h) = pML(w|h)L(h) - (1 - L(h))p(w|h') where: w is a word and h is a history
-   * h' is the history h with the first word removed pML is the maximum-likelihood estimate of the
-   * probability L(.) is lambda, the interpolation factor, which depends only on the history h: L(h)
-   * = s(h) / s(h) + c(h) where s(.) is the observed number of distinct types after h, and c is the
-   * observed number of counts of h in the training corpus.
-   * <p>
-   * in fact this model calculates the probability starting from the lowest order and working its
-   * way up, to take advantage of the one- sided error rate inherent in using a bloom filter data
-   * structure.
-   * 
-   * @param ngram the ngram whose probability is to be calculated
-   * @param ngramOrder the order of the ngram.
-   * 
-   * @return the linearly-interpolated Witten-Bell smoothed probability of an ngram
-   */
-  private float wittenBell(int[] ngram, int ngramOrder) {
-    int end = ngram.length;
-    double p = p0; // current calculated probability
-    // note that p0 and lambda0 are independent of the given
-    // ngram so they are calculated ahead of time.
-    int MAX_QCOUNT = getCount(ngram, ngram.length - 1, ngram.length, maxQ);
-    if (MAX_QCOUNT == 0) // OOV!
-      return (float) p;
-    double pML = Math.log(unQuantize(MAX_QCOUNT)) - numTokens;
-
-    // p += lambda0 * pML;
-    p = logAdd(p, (lambda0 + pML));
-    if (ngram.length == 1) { // if it's a unigram, we're done
-      return (float) p;
-    }
-    // otherwise we calculate the linear interpolation
-    // with higher order models.
-    for (int i = end - 2; i >= end - ngramOrder && i >= 0; i--) {
-      int historyCnt = getCount(ngram, i, end, MAX_QCOUNT);
-      // if the count for the history is zero, all higher
-      // terms in the interpolation must be zero, so we
-      // are done here.
-      if (historyCnt == 0) {
-        return (float) p;
-      }
-      int historyTypesAfter = getTypesAfter(ngram, i, end, historyCnt);
-      // unQuantize the counts we got from the BF
-      double HC = unQuantize(historyCnt);
-      double HTA = 1 + unQuantize(historyTypesAfter);
-      // interpolation constant
-      double lambda = Math.log(HTA) - Math.log(HTA + HC);
-      double oneMinusLambda = Math.log(HC) - Math.log(HTA + HC);
-      // p *= 1 - lambda
-      p += oneMinusLambda;
-      int wordCount = getCount(ngram, i + 1, end, historyTypesAfter);
-      double WC = unQuantize(wordCount);
-      // p += lambda * p_ML(w|h)
-      if (WC == 0) return (float) p;
-      p = logAdd(p, lambda + Math.log(WC) - Math.log(HC));
-      MAX_QCOUNT = wordCount;
-    }
-    return (float) p;
-  }
-
-  /**
-   * Retrieve the count of a ngram from the Bloom filter. That is, how many times did we see this
-   * ngram in the training corpus? This corresponds roughly to algorithm 2 in Talbot and Osborne's
-   * "Tera-Scale LMs on the Cheap."
-   * 
-   * @param ngram array containing the ngram as a sub-array
-   * @param start the index of the first word of the ngram
-   * @param end the index after the last word of the ngram
-   * @param qcount the maximum possible count to be returned
-   * 
-   * @return the number of times the ngram was seen in the training corpus, quantized
-   */
-  private int getCount(int[] ngram, int start, int end, int qcount) {
-    for (int i = 1; i <= qcount; i++) {
-      int hash = hashNgram(ngram, start, end, i);
-      if (!bf.query(hash, countFuncs)) {
-        return i - 1;
-      }
-    }
-    return qcount;
-  }
-
-  /**
-   * Retrieve the number of distinct types that follow an ngram in the training corpus.
-   * 
-   * This is another version of algorithm 2. As noted in the paper, we have different algorithms for
-   * getting ngram counts versus suffix counts because c(x) = 1 is a proxy item for s(x) = 1
-   * 
-   * @param ngram an array the contains the ngram as a sub-array
-   * @param start the index of the first word of the ngram
-   * @param end the index after the last word of the ngram
-   * @param qcount the maximum possible return value
-   * 
-   * @return the number of distinct types observed to follow an ngram in the training corpus,
-   *         quantized
-   */
-  private int getTypesAfter(int[] ngram, int start, int end, int qcount) {
-    // first we check c(x) >= 1
-    int hash = hashNgram(ngram, start, end, 1);
-    if (!bf.query(hash, countFuncs)) {
-      return 0;
-    }
-    // if c(x) >= 1, we check for the stored suffix count
-    for (int i = 1; i < qcount; i++) {
-      hash = hashNgram(ngram, start, end, i);
-      if (!bf.query(hash, typesFuncs)) {
-        return i - 1;
-      }
-    }
-    return qcount;
-  }
-
-  /**
-   * Logarithmically quantizes raw counts. The quantization scheme is described in Talbot and
-   * Osborne's paper "Tera-Scale LMs on the Cheap."
-   * 
-   * @param x long giving the raw count to be quantized
-   * 
-   * @return the quantized count
-   */
-  private int quantize(long x) {
-    return 1 + (int) Math.floor(Math.log(x) / Math.log(quantizationBase));
-  }
-
-  /**
-   * Unquantizes a quantized count.
-   * 
-   * @param x the quantized count
-   * 
-   * @return the expected raw value of the quantized count
-   */
-  private double unQuantize(int x) {
-    if (x == 0) {
-      return 0;
-    } else {
-      return ((quantizationBase + 1) * Math.pow(quantizationBase, x - 1) - 1) / 2;
-    }
-  }
-
-  /**
-   * Converts an n-gram and a count into a value that can be stored into a Bloom filter. This is
-   * adapted directly from <code>AbstractPhrase.hashCode()</code> elsewhere in the Joshua code base.
-   * 
-   * @param ngram an array containing the ngram as a sub-array
-   * @param start the index of the first word of the ngram
-   * @param end the index after the last word of the ngram
-   * @param val the count of the ngram
-   * 
-   * @return a value suitable to be stored in a Bloom filter
-   */
-  private int hashNgram(int[] ngram, int start, int end, int val) {
-    int result = HASH_OFFSET * HASH_SEED + val;
-    for (int i = start; i < end; i++)
-      result = HASH_OFFSET * result + ngram[i];
-    return result;
-  }
-
-  /**
-   * Adds two numbers that are in the log domain, avoiding underflow.
-   * 
-   * @param x one summand
-   * @param y the other summand
-   * 
-   * @return the log of the sum of the exponent of the two numbers.
-   */
-  private static double logAdd(double x, double y) {
-    if (y <= x) {
-      return x + Math.log1p(Math.exp(y - x));
-    } else {
-      return y + Math.log1p(Math.exp(x - y));
-    }
-  }
-
-  /**
-   * Builds a language model and stores it in a file.
-   * 
-   * @param argv command-line arguments
-   */
-  public static void main(String[] argv) {
-    if (argv.length < 5) {
-      System.err
-          .println("usage: BloomFilterLanguageModel <statistics file> <order> <size> <quantization base> <output file>");
-      return;
-    }
-    int order = Integer.parseInt(argv[1]);
-    int size = (int) (Integer.parseInt(argv[2]) * Math.pow(2, 23));
-    double base = Double.parseDouble(argv[3]);
-
-    try {
-      BloomFilterLanguageModel lm = new BloomFilterLanguageModel(argv[0], order, size, base);
-
-      ObjectOutputStream out =
-          new ObjectOutputStream(new GZIPOutputStream(new FileOutputStream(argv[4])));
-
-      lm.writeExternal(out);
-      out.close();
-    } catch (FileNotFoundException e) {
-      System.err.println(e.getMessage());
-    } catch (IOException e) {
-      System.err.println(e.getMessage());
-    }
-  }
-  
-  /**
-   * Adds ngram counts and counts of distinct types after ngrams, read from a file, to the Bloom
-   * filter.
-   * <p>
-   * The file format should look like this: ngram1 count types-after ngram2 count types-after ...
-   * 
-   * @param bloomFilterSize the size of the Bloom filter, in bits
-   * @param filename path to the statistics file
-   */
-  private void populateBloomFilter(int bloomFilterSize, String filename) {
-    HashMap<String, Long> typesAfter = new HashMap<String, Long>();
-    try {
-      FileInputStream file_in = new FileInputStream(filename);
-      FileInputStream file_in_copy = new FileInputStream(filename);
-      InputStream in;
-      InputStream estimateStream;
-      if (filename.endsWith(".gz")) {
-        in = new GZIPInputStream(file_in);
-        estimateStream = new GZIPInputStream(file_in_copy);
-      } else {
-        in = file_in;
-        estimateStream = file_in_copy;
-      }
-      int numObjects = estimateNumberOfObjects(estimateStream);
-      System.err.println("Estimated number of objects: " + numObjects);
-      bf = new BloomFilter(bloomFilterSize, numObjects);
-      countFuncs = bf.initializeHashFunctions();
-      populateFromInputStream(in, typesAfter);
-      in.close();
-    } catch (FileNotFoundException e) {
-      System.err.println(e.getMessage());
-      return;
-    } catch (IOException e) {
-      System.err.println(e.getMessage());
-      return;
-    }
-    typesFuncs = bf.initializeHashFunctions();
-    for (String history : typesAfter.keySet()) {
-      String[] toks = Regex.spaces.split(history);
-      int[] hist = new int[toks.length];
-      for (int i = 0; i < toks.length; i++)
-        hist[i] = Vocabulary.id(toks[i]);
-      add(hist, typesAfter.get(history), typesFuncs);
-    }
-    return;
-  }
-
-  /**
-   * Estimate the number of objects that will be stored in the Bloom filter. The optimum number of
-   * hash functions depends on the number of items that will be stored, so we want a guess before we
-   * begin to read the statistics file and store it.
-   * 
-   * @param source an InputStream pointing to the training corpus stats
-   * 
-   * @return an estimate of the number of objects to be stored in the Bloom filter
-   */
-  private int estimateNumberOfObjects(InputStream source) {
-    int numLines = 0;
-    long maxCount = 0;
-    for (String line: new LineReader(source)) {
-      if (line.trim().equals("")) continue;
-      String[] toks = Regex.spaces.split(line);
-      if (toks.length > ngramOrder + 1) continue;
-      try {
-        long cnt = Long.parseLong(toks[toks.length - 1]);
-        if (cnt > maxCount) maxCount = cnt;
-      } catch (NumberFormatException e) {
-        System.err.println("NumberFormatException! Line: " + line);
-        break;
-      }
-      numLines++;
-    }
-    double estimate = Math.log(maxCount) / Math.log(quantizationBase);
-    return (int) Math.round(numLines * estimate);
-  }
-
-  /**
-   * Reads the statistics from a source and stores them in the Bloom filter. The ngram counts are
-   * stored immediately in the Bloom filter, but the counts of distinct types following each ngram
-   * are accumulated from the file as we go.
-   * 
-   * @param source an InputStream pointing to the statistics
-   * @param types a HashMap that will stores the accumulated counts of distinct types observed to
-   *        follow each ngram
-   */
-  private void populateFromInputStream(InputStream source, HashMap<String, Long> types) {
-    numTokens = Double.NEGATIVE_INFINITY; // = log(0)
-    for (String line: new LineReader(source)) {
-      String[] toks = Regex.spaces.split(line);
-      if ((toks.length < 2) || (toks.length > ngramOrder + 1)) continue;
-      int[] ngram = new int[toks.length - 1];
-      StringBuilder history = new StringBuilder();
-      for (int i = 0; i < toks.length - 1; i++) {
-        ngram[i] = Vocabulary.id(toks[i]);
-        if (i < toks.length - 2) history.append(toks[i]).append(" ");
-      }
-
-      long cnt = Long.parseLong(toks[toks.length - 1]);
-      add(ngram, cnt, countFuncs);
-      if (toks.length == 2) { // unigram
-        numTokens = logAdd(numTokens, Math.log(cnt));
-        // no need to count types after ""
-        // that's what vocabulary.size() is for.
-        continue;
-      }
-      if (types.get(history) == null)
-        types.put(history.toString(), 1L);
-      else {
-        long x = (Long) types.get(history);
-        types.put(history.toString(), x + 1);
-      }
-    }
-    return;
-  }
-
-  /**
-   * Adds an ngram, along with an associated value, to the Bloom filter. This corresponds to Talbot
-   * and Osborne's "Tera-scale LMs on the cheap", algorithm 1.
-   * 
-   * @param ngram an array representing the ngram
-   * @param value the value to be associated with the ngram
-   * @param funcs an array of long to be used as hash functions
-   */
-  private void add(int[] ngram, long value, long[][] funcs) {
-    if (ngram == null) return;
-    int qValue = quantize(value);
-    for (int i = 1; i <= qValue; i++) {
-      int hash = hashNgram(ngram, 0, ngram.length, i);
-      bf.add(hash, funcs);
-    }
-  }
-
-  /**
-   * Read a Bloom filter LM from an external file.
-   * 
-   * @param in an ObjectInput stream to read from
-   */
-  public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
-    int vocabSize = in.readInt();
-    for (int i = 0; i < vocabSize; i++) {
-      String line = in.readUTF();
-      Vocabulary.id(line);
-    }
-    numTokens = in.readDouble();
-    countFuncs = new long[in.readInt()][2];
-    for (int i = 0; i < countFuncs.length; i++) {
-      countFuncs[i][0] = in.readLong();
-      countFuncs[i][1] = in.readLong();
-    }
-    typesFuncs = new long[in.readInt()][2];
-    for (int i = 0; i < typesFuncs.length; i++) {
-      typesFuncs[i][0] = in.readLong();
-      typesFuncs[i][1] = in.readLong();
-    }
-    quantizationBase = in.readDouble();
-    bf = new BloomFilter();
-    bf.readExternal(in);
-  }
-
-  /**
-   * Write a Bloom filter LM to some external location.
-   * 
-   * @param out an ObjectOutput stream to write to
-   * 
-   * @throws IOException if an input or output exception occurred
-   */
-  public void writeExternal(ObjectOutput out) throws IOException {
-    out.writeInt(Vocabulary.size());
-    for (int i = 0; i < Vocabulary.size(); i++) {
-      // out.writeBytes(vocabulary.getWord(i));
-      // out.writeChar('\n'); // newline
-      out.writeUTF(Vocabulary.word(i));
-    }
-    out.writeDouble(numTokens);
-    out.writeInt(countFuncs.length);
-    for (int i = 0; i < countFuncs.length; i++) {
-      out.writeLong(countFuncs[i][0]);
-      out.writeLong(countFuncs[i][1]);
-    }
-    out.writeInt(typesFuncs.length);
-    for (int i = 0; i < typesFuncs.length; i++) {
-      out.writeLong(typesFuncs[i][0]);
-      out.writeLong(typesFuncs[i][1]);
-    }
-    out.writeDouble(quantizationBase);
-    bf.writeExternal(out);
-  }
-
-  /**
-   * Returns the language model score for an n-gram. This is called from the rest of the Joshua
-   * decoder.
-   * 
-   * @param ngram the ngram to score
-   * @param order the order of the model
-   * 
-   * @return the language model score of the ngram
-   */
-  @Override
-  protected float ngramLogProbability_helper(int[] ngram, int order) {
-    int[] lm_ngram = new int[ngram.length];
-    for (int i = 0; i < ngram.length; i++) {
-      lm_ngram[i] = Vocabulary.id(Vocabulary.word(ngram[i]));
-    }
-    return wittenBell(lm_ngram, order);
-  }
-}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/lm/bloomfilter_lm/package.html
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/lm/bloomfilter_lm/package.html b/src/joshua/decoder/ff/lm/bloomfilter_lm/package.html
deleted file mode 100644
index 883594a..0000000
--- a/src/joshua/decoder/ff/lm/bloomfilter_lm/package.html
+++ /dev/null
@@ -1,19 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
-<html>
-<head></head>
-<body bgcolor="white">
-
-<!--
-##### THIS IS THE TEMPLATE FOR THE PACKAGE DOC COMMENTS. #####
-##### TYPE YOUR PACKAGE COMMENTS HERE.  BEGIN WITH A     #####
-##### ONE-SENTENCE SUMMARY STARTING WITH A VERB LIKE:    #####
--->
-
-Provides an implementation of a bloom filter language model, and 
-an associated implementation of the language model feature function typically used in
-hierarchical phrase-based decoding for statistical machine translation.
-
-<!-- Put @see and @since tags down here. -->
-
-</body>
-</html>

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/lm/package.html
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/lm/package.html b/src/joshua/decoder/ff/lm/package.html
deleted file mode 100644
index b99a245..0000000
--- a/src/joshua/decoder/ff/lm/package.html
+++ /dev/null
@@ -1,35 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
-<html>
-<head></head>
-<body bgcolor="white">
-
-<!--
-##### THIS IS THE TEMPLATE FOR THE PACKAGE DOC COMMENTS. #####
-##### TYPE YOUR PACKAGE COMMENTS HERE.  BEGIN WITH A     #####
-##### ONE-SENTENCE SUMMARY STARTING WITH A VERB LIKE:    #####
--->
-
-Provides abstraction and support for the language model feature function typically used in
-hierarchical phrase-based decoding for statistical machine translation.
-
-The classes contained within this directory are responsible for two tasks: implementing the feature
-function, and representing the language model itself.  The class `LanguageModelFF` implements the
-feature function by exending the class `DefaultStatefulFF`.  One of these is instantiated for each
-language model present in the decoder.
-
-The language models themselves are implemented as a combination of an interface
-(`NGramLanguageModel`), a default implementation (`DefaultNgramLangaugeModel`), and an abstract
-implementation of the default (`AbstractLM`).
-
-<pre>
-  DefaultStatefulFF
-  |- LanguageModelFF
-
-  DefaultNgramLanguageModel implements interface NGramLanguageModel
-  |- AbstractLM
-</pre>
-
-<!-- Put @see and @since tags down here. -->
-
-</body>
-</html>

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/package.html
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/package.html b/src/joshua/decoder/ff/package.html
deleted file mode 100644
index b0aa63e..0000000
--- a/src/joshua/decoder/ff/package.html
+++ /dev/null
@@ -1,37 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
-<html>
-<head></head>
-<body bgcolor="white">
-
-<!--
-##### THIS IS THE TEMPLATE FOR THE PACKAGE DOC COMMENTS. #####
-##### TYPE YOUR PACKAGE COMMENTS HERE.  BEGIN WITH A     #####
-##### ONE-SENTENCE SUMMARY STARTING WITH A VERB LIKE:    #####
--->
-
-Provides an implementation of the linear feature functions typically used in
-hierarchical phrase-based decoding for statistical machine translation.
-
-The following is a note from Juri describing some of the functionality of the feature functions
-interfaces and default abstract classes.
-
-<pre>
-The equality that I intended for is ff.transitionLogP() =
-ff.estimateLogP() + ff.reEstimateTransitionLogP(). The re-estimate
-fixes the estimate to be the true transition cost that takes into
-account the state. Before decoding the cost of applying a rule is
-estimated via estimateLogP() and yields the phrasal feature costs plus
-an LM estimate of the cost of the lexical portions of the rule.
-transitionLogP() takes rule and state and computes everything from
-scratch, whereas reEstimateTransitionLogP() adds in the cost of new
-n-grams that result from combining the rule with the LM states and
-subtracts out the cost of superfluous less-than-n-grams that were
-overridden by the updated cost calculation.
-
-Hope this helps.
-</pre>
-
-<!-- Put @see and @since tags down here. -->
-
-</body>
-</html>

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/phrase/Distortion.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/phrase/Distortion.java b/src/joshua/decoder/ff/phrase/Distortion.java
deleted file mode 100644
index 15aced8..0000000
--- a/src/joshua/decoder/ff/phrase/Distortion.java
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.phrase;
-
-import java.util.ArrayList;
-import java.util.List;	
-
-import joshua.decoder.JoshuaConfiguration;
-import joshua.decoder.chart_parser.SourcePath;
-import joshua.decoder.ff.FeatureVector;
-import joshua.decoder.ff.StatelessFF;
-import joshua.decoder.ff.state_maintenance.DPState;
-import joshua.decoder.ff.tm.Rule;
-import joshua.decoder.hypergraph.HGNode;
-import joshua.decoder.phrase.Hypothesis;
-import joshua.decoder.segment_file.Sentence;
-
-public class Distortion extends StatelessFF {
-
-  public Distortion(FeatureVector weights, String[] args, JoshuaConfiguration config) {
-    super(weights, "Distortion", args, config);
-    
-    if (! config.search_algorithm.equals("stack")) {
-      System.err.println("* FATAL: Distortion feature only application for phrase-based decoding");
-      System.err.println("         Use -search phrase or remove this feature");
-      System.exit(1);
-    }
-  }
-  
-  @Override
-  public ArrayList<String> reportDenseFeatures(int index) {
-    denseFeatureIndex = index;
-    
-    ArrayList<String> names = new ArrayList<String>();
-    names.add(name);
-    return names;
-  }
-
-  @Override
-  public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
-      Sentence sentence, Accumulator acc) {
-
-    if (rule != Hypothesis.BEGIN_RULE && rule != Hypothesis.END_RULE) {
-        int start_point = j - rule.getFrench().length + rule.getArity();
-
-        int jump_size = Math.abs(tailNodes.get(0).j - start_point);
-//        acc.add(name, -jump_size);
-        acc.add(denseFeatureIndex, -jump_size); 
-    }
-    
-//    System.err.println(String.format("DISTORTION(%d, %d) from %d = %d", i, j, tailNodes != null ? tailNodes.get(0).j : -1, jump_size));
-
-    return null;
-  }
-}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java b/src/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java
deleted file mode 100644
index 3497001..0000000
--- a/src/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java
+++ /dev/null
@@ -1,277 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.similarity;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.PrintWriter;
-import java.net.Socket;
-import java.net.UnknownHostException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-
-import com.google.common.base.Throwables;
-
-import joshua.corpus.Vocabulary;
-import joshua.decoder.JoshuaConfiguration;
-import joshua.decoder.chart_parser.SourcePath;
-import joshua.decoder.ff.FeatureVector;
-import joshua.decoder.ff.StatefulFF;
-import joshua.decoder.ff.SourceDependentFF;
-import joshua.decoder.ff.state_maintenance.DPState;
-import joshua.decoder.ff.state_maintenance.NgramDPState;
-import joshua.decoder.ff.tm.Rule;
-import joshua.decoder.hypergraph.HGNode;
-import joshua.decoder.segment_file.Sentence;
-import joshua.util.Cache;
-
-public class EdgePhraseSimilarityFF extends StatefulFF implements SourceDependentFF {
-
-  private static Cache<String, Float> cache = new Cache<String, Float>(100000000);
-
-  private String host;
-  private int port;
-
-  private Socket socket;
-  private PrintWriter serverAsk;
-  private BufferedReader serverReply;
-
-  private int[] source;
-
-  private final int MAX_PHRASE_LENGTH = 4;
-  private final int GAP = 0;
-
-  public EdgePhraseSimilarityFF(FeatureVector weights, String[] args, JoshuaConfiguration config) throws NumberFormatException, UnknownHostException, IOException {
-    super(weights, "EdgePhraseSimilarity", args, config);
-
-    this.host = parsedArgs.get("host");
-    this.port = Integer.parseInt(parsedArgs.get("port"));
-
-    initializeConnection();
-  }
-
-  private void initializeConnection() throws NumberFormatException, UnknownHostException,
-      IOException {
-    System.err.println("Opening connection.");
-    socket = new Socket(host, port);
-    serverAsk = new PrintWriter(socket.getOutputStream(), true);
-    serverReply = new BufferedReader(new InputStreamReader(socket.getInputStream()));
-  }
-
-  @Override
-  public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
-      Sentence sentence, Accumulator acc) {
-
-    float value = computeScore(rule, tailNodes);
-    acc.add(name, value);
-
-    // TODO 07/2013: EdgePhraseSimilarity needs to know its order rather than inferring it from tail
-    // nodes.
-    return new NgramDPState(new int[1], new int[1]);
-  }
-  
-  @Override
-  public DPState computeFinal(HGNode tailNode, int i, int j, SourcePath path, Sentence sentence, Accumulator acc) {
-    return null;
-  }
-
-  public float computeScore(Rule rule, List<HGNode> tailNodes) {
-    if (tailNodes == null || tailNodes.isEmpty())
-      return 0;
-
-    // System.err.println("RULE [" + spanStart + ", " + spanEnd + "]: " + rule.toString());
-
-    int[] target = rule.getEnglish();
-    int lm_state_size = 0;
-    for (HGNode node : tailNodes) {
-      NgramDPState state = (NgramDPState) node.getDPState(stateIndex);
-      lm_state_size += state.getLeftLMStateWords().length + state.getRightLMStateWords().length;
-    }
-
-    ArrayList<int[]> batch = new ArrayList<int[]>();
-
-    // Build joined target string.
-    int[] join = new int[target.length + lm_state_size];
-
-    int idx = 0, num_gaps = 1, num_anchors = 0;
-    int[] anchors = new int[rule.getArity() * 2];
-    int[] indices = new int[rule.getArity() * 2];
-    int[] gaps = new int[rule.getArity() + 2];
-    gaps[0] = 0;
-    for (int t = 0; t < target.length; t++) {
-      if (target[t] < 0) {
-        HGNode node = tailNodes.get(-(target[t] + 1));
-        if (t != 0) {
-          indices[num_anchors] = node.i;
-          anchors[num_anchors++] = idx;
-        }
-        NgramDPState state = (NgramDPState) node.getDPState(stateIndex);
-        // System.err.print("LEFT:  ");
-        // for (int w : state.getLeftLMStateWords()) System.err.print(Vocabulary.word(w) + " ");
-        // System.err.println();
-        for (int w : state.getLeftLMStateWords())
-          join[idx++] = w;
-        join[idx++] = GAP;
-        gaps[num_gaps++] = idx;
-        // System.err.print("RIGHT:  ");
-        // for (int w : state.getRightLMStateWords()) System.err.print(Vocabulary.word(w) + " ");
-        // System.err.println();
-        for (int w : state.getRightLMStateWords())
-          join[idx++] = w;
-        if (t != target.length - 1) {
-          indices[num_anchors] = node.j;
-          anchors[num_anchors++] = idx;
-        }
-      } else {
-        join[idx++] = target[t];
-      }
-    }
-    gaps[gaps.length - 1] = join.length + 1;
-
-    // int c = 0;
-    // System.err.print("> ");
-    // for (int k = 0; k < join.length; k++) {
-    // if (c < num_anchors && anchors[c] == k) {
-    // c++;
-    // System.err.print("| ");
-    // }
-    // System.err.print(Vocabulary.word(join[k]) + " ");
-    // }
-    // System.err.println("<");
-
-    int g = 0;
-    for (int a = 0; a < num_anchors; a++) {
-      if (a > 0 && anchors[a - 1] == anchors[a])
-        continue;
-      if (anchors[a] > gaps[g + 1])
-        g++;
-      int left = Math.max(gaps[g], anchors[a] - MAX_PHRASE_LENGTH + 1);
-      int right = Math.min(gaps[g + 1] - 1, anchors[a] + MAX_PHRASE_LENGTH - 1);
-
-      int[] target_phrase = new int[right - left];
-      System.arraycopy(join, left, target_phrase, 0, target_phrase.length);
-      int[] source_phrase = getSourcePhrase(indices[a]);
-
-      if (source_phrase != null && target_phrase.length != 0) {
-        // System.err.println("ANCHOR: " + indices[a]);
-        batch.add(source_phrase);
-        batch.add(target_phrase);
-      }
-    }
-    return getSimilarity(batch);
-  }
-
-  @Override
-  public float estimateFutureCost(Rule rule, DPState currentState, Sentence sentence) {
-    return 0.0f;
-  }
-
-  /**
-   * From SourceDependentFF interface.
-   */
-  @Override
-  public void setSource(Sentence sentence) {
-    if (! sentence.isLinearChain())
-      throw new RuntimeException("EdgePhraseSimilarity not defined for lattices");
-    this.source = sentence.getWordIDs();
-  }
-
-  public EdgePhraseSimilarityFF clone() {
-    try {
-      return new EdgePhraseSimilarityFF(this.weights, args, config);
-    } catch (Exception e) {
-      throw Throwables.propagate(e);
-    }
-  }
-
-  @Override
-  public float estimateCost(Rule rule, Sentence sentence) {
-    return 0.0f;
-  }
-
-  private final int[] getSourcePhrase(int anchor) {
-    int idx;
-    int length = Math.min(anchor, MAX_PHRASE_LENGTH - 1)
-        + Math.min(source.length - anchor, MAX_PHRASE_LENGTH - 1);
-    if (length <= 0)
-      return null;
-    int[] phrase = new int[length];
-    idx = 0;
-    for (int p = Math.max(0, anchor - MAX_PHRASE_LENGTH + 1); p < Math.min(source.length, anchor
-        + MAX_PHRASE_LENGTH - 1); p++)
-      phrase[idx++] = source[p];
-    return phrase;
-  }
-
-  private float getSimilarity(List<int[]> batch) {
-    float similarity = 0.0f;
-    int count = 0;
-    StringBuilder query = new StringBuilder();
-    List<String> to_cache = new ArrayList<String>();
-    query.append("xb");
-    for (int i = 0; i < batch.size(); i += 2) {
-      int[] source = batch.get(i);
-      int[] target = batch.get(i + 1);
-
-      if (Arrays.equals(source, target)) {
-        similarity += 1;
-        count++;
-      } else {
-        String source_string = Vocabulary.getWords(source);
-        String target_string = Vocabulary.getWords(target);
-
-        String both;
-        if (source_string.compareTo(target_string) > 0)
-          both = source_string + " ||| " + target_string;
-        else
-          both = target_string + " ||| " + source_string;
-
-        Float cached = cache.get(both);
-        if (cached != null) {
-          // System.err.println("SIM: " + source_string + " X " + target_string + " = " + cached);
-          similarity += cached;
-          count++;
-        } else {
-          query.append("\t").append(source_string);
-          query.append("\t").append(target_string);
-          to_cache.add(both);
-        }
-      }
-    }
-    if (!to_cache.isEmpty()) {
-      try {
-        serverAsk.println(query.toString());
-        String response = serverReply.readLine();
-        String[] scores = response.split("\\s+");
-        for (int i = 0; i < scores.length; i++) {
-          Float score = Float.parseFloat(scores[i]);
-          cache.put(to_cache.get(i), score);
-          similarity += score;
-          count++;
-        }
-      } catch (Exception e) {
-        return 0;
-      }
-    }
-    return (count == 0 ? 0 : similarity / count);
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/state_maintenance/DPState.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/state_maintenance/DPState.java b/src/joshua/decoder/ff/state_maintenance/DPState.java
deleted file mode 100644
index 1a02a90..0000000
--- a/src/joshua/decoder/ff/state_maintenance/DPState.java
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.state_maintenance;
-
-/**
- * Abstract class enforcing explicit implementation of the standard methods.
- * 
- * @author Zhifei Li, <zh...@gmail.com>
- * @author Juri Ganitkevitch, <ju...@cs.jhu.edu>
- */
-public abstract class DPState {
-
-  public abstract String toString();
-
-  public abstract int hashCode();
-
-  public abstract boolean equals(Object other);
-}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/state_maintenance/KenLMState.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/state_maintenance/KenLMState.java b/src/joshua/decoder/ff/state_maintenance/KenLMState.java
deleted file mode 100644
index 906f8d8..0000000
--- a/src/joshua/decoder/ff/state_maintenance/KenLMState.java
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.state_maintenance;
-
-/**
- * Maintains a state pointer used by KenLM to implement left-state minimization. 
- * 
- * @author Matt Post <po...@cs.jhu.edu>
- * @author Juri Ganitkevitch <ju...@cs.jhu.edu>
- */
-public class KenLMState extends DPState {
-
-  private long state = 0;
-
-  public KenLMState() {
-  }
-
-  public KenLMState(long stateId) {
-    this.state = stateId;
-  }
-
-  public long getState() {
-    return state;
-  }
-
-  @Override
-  public int hashCode() {
-    return (int) ((getState() >> 32) ^ getState());
-  }
-
-  @Override
-  public boolean equals(Object other) {
-    return (other instanceof KenLMState && this.getState() == ((KenLMState) other).getState());
-  }
-
-  @Override
-  public String toString() {
-    return String.format("[KenLMState %d]", getState());
-  }
-}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/state_maintenance/NgramDPState.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/state_maintenance/NgramDPState.java b/src/joshua/decoder/ff/state_maintenance/NgramDPState.java
deleted file mode 100644
index b72a5ba..0000000
--- a/src/joshua/decoder/ff/state_maintenance/NgramDPState.java
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.state_maintenance;
-
-import java.util.Arrays;
-
-import joshua.corpus.Vocabulary;
-
-/**
- * @author Zhifei Li, <zh...@gmail.com>
- * @author Juri Ganitkevitch, <ju...@cs.jhu.edu>
- */
-public class NgramDPState extends DPState {
-
-  private int[] left;
-  private int[] right;
-
-  private int hash = 0;
-
-  public NgramDPState(int[] l, int[] r) {
-    left = l;
-    right = r;
-    assertLengths();
-  }
-
-  public void setLeftLMStateWords(int[] words) {
-    left = words;
-    assertLengths();
-  }
-
-  public int[] getLeftLMStateWords() {
-    return left;
-  }
-
-  public void setRightLMStateWords(int[] words) {
-    right = words;
-    assertLengths();
-  }
-
-  public int[] getRightLMStateWords() {
-    return right;
-  }
-
-  private final void assertLengths() {
-    if (left.length != right.length)
-      throw new RuntimeException("Unequal lengths in left and right state: < "
-          + Vocabulary.getWords(left) + " | " + Vocabulary.getWords(right) + " >");
-  }
-
-  @Override
-  public int hashCode() {
-    if (hash == 0) {
-      hash = 31 + Arrays.hashCode(left);
-      hash = hash * 19 + Arrays.hashCode(right);
-    }
-    return hash;
-  }
-
-  @Override
-  public boolean equals(Object other) {
-    if (other instanceof NgramDPState) {
-      NgramDPState that = (NgramDPState) other;
-      if (this.left.length == that.left.length && this.right.length == that.right.length) {
-        for (int i = 0; i < left.length; ++i)
-          if (this.left[i] != that.left[i] || this.right[i] != that.right[i])
-            return false;
-        return true;
-      }
-    }
-    return false;
-  }
-
-  public String toString() {
-    StringBuilder sb = new StringBuilder();
-    sb.append("<");
-    for (int id : left)
-      sb.append(" " + Vocabulary.word(id));
-    sb.append(" |");
-    for (int id : right)
-      sb.append(" " + Vocabulary.word(id));
-    sb.append(" >");
-    return sb.toString();
-  }
-}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/tm/AbstractGrammar.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/AbstractGrammar.java b/src/joshua/decoder/ff/tm/AbstractGrammar.java
deleted file mode 100644
index 8cfb2ad..0000000
--- a/src/joshua/decoder/ff/tm/AbstractGrammar.java
+++ /dev/null
@@ -1,225 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.tm;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.List;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-
-import joshua.corpus.Vocabulary;
-import joshua.decoder.JoshuaConfiguration;
-import joshua.decoder.ff.FeatureFunction;
-import joshua.decoder.segment_file.Token;
-import joshua.lattice.Arc;
-import joshua.lattice.Lattice;
-import joshua.lattice.Node;
-
-/**
- * Partial implementation of the <code>Grammar</code> interface that provides logic for sorting a
- * grammar.
- * <p>
- * <em>Note</em>: New classes implementing the <code>Grammar</code> interface should probably
- * inherit from this class, unless a specific sorting technique different from that implemented by
- * this class is required.
- * 
- * @author Zhifei Li
- * @author Lane Schwartz
- * @author Matt Post <post@cs.jhu.edu
- */
-public abstract class AbstractGrammar implements Grammar {
-
-  /** Logger for this class. */
-  private static final Logger logger = Logger.getLogger(AbstractGrammar.class.getName());
-
-  /**
-   * Indicates whether the rules in this grammar have been sorted based on the latest feature
-   * function values.
-   */
-  protected boolean sorted = false;
-
-  /*
-   * The grammar's owner, used to determine which weights are applicable to the dense features found
-   * within.
-   */
-  protected int owner = -1;
-  
-  /*
-   * The maximum length of a source-side phrase. Mostly used by the phrase-based decoder.
-   */
-  protected int maxSourcePhraseLength = -1;
-  
-    /**
-   * Returns the longest source phrase read.
-   * 
-   * @return the longest source phrase read (nonterminal + terminal symbols).
-   */
-  @Override
-  public int getMaxSourcePhraseLength() {
-    return maxSourcePhraseLength;
-  }
-  
-  @Override
-  public int getOwner() {
-    return owner;
-  }
-
-  /* The maximum span of the input this rule can be applied to. */
-  protected int spanLimit = 1;
-
-  protected JoshuaConfiguration joshuaConfiguration;
-
-  /**
-   * Constructs an empty, unsorted grammar.
-   * 
-   * @see Grammar#isSorted()
-   */
-  public AbstractGrammar(JoshuaConfiguration config) {
-    this.joshuaConfiguration = config;
-    this.sorted = false;
-  }
-
-  public AbstractGrammar(int owner, int spanLimit) {
-    this.sorted = false;
-    this.owner = owner;
-    this.spanLimit = spanLimit;
-  }
-
-  public static final int OOV_RULE_ID = 0;
-
-  /**
-   * Cube-pruning requires that the grammar be sorted based on the latest feature functions. To
-   * avoid synchronization, this method should be called before multiple threads are initialized for
-   * parallel decoding
-   */
-  public void sortGrammar(List<FeatureFunction> models) {
-    Trie root = getTrieRoot();
-    if (root != null) {
-      sort(root, models);
-      setSorted(true);
-    }
-  }
-
-  /* See Javadoc comments for Grammar interface. */
-  public boolean isSorted() {
-    return sorted;
-  }
-
-  /**
-   * Sets the flag indicating whether this grammar is sorted.
-   * <p>
-   * This method is called by {@link #sortGrammar(ArrayList)} to indicate that the grammar has been
-   * sorted.
-   * 
-   * Its scope is protected so that child classes that override <code>sortGrammar</code> will also
-   * be able to call this method to indicate that the grammar has been sorted.
-   * 
-   * @param sorted
-   */
-  protected void setSorted(boolean sorted) {
-    this.sorted = sorted;
-    logger.fine("This grammar is now sorted: " + this);
-  }
-
-  /**
-   * Recursively sorts the grammar using the provided feature functions.
-   * <p>
-   * This method first sorts the rules stored at the provided node, then recursively calls itself on
-   * the child nodes of the provided node.
-   * 
-   * @param node Grammar node in the <code>Trie</code> whose rules should be sorted.
-   * @param models Feature function models to use during sorting.
-   */
-  private void sort(Trie node, List<FeatureFunction> models) {
-
-    if (node != null) {
-      if (node.hasRules()) {
-        RuleCollection rules = node.getRuleCollection();
-        if (logger.isLoggable(Level.FINE))
-          logger.fine("Sorting node " + Arrays.toString(rules.getSourceSide()));
-
-        /* This causes the rules at this trie node to be sorted */
-        rules.getSortedRules(models);
-
-        if (logger.isLoggable(Level.FINEST)) {
-          StringBuilder s = new StringBuilder();
-          for (Rule r : rules.getSortedRules(models)) {
-            s.append("\n\t" + r.getLHS() + " ||| " + Arrays.toString(r.getFrench()) + " ||| "
-                + Arrays.toString(r.getEnglish()) + " ||| " + r.getFeatureVector() + " ||| "
-                + r.getEstimatedCost() + "  " + r.getClass().getName() + "@"
-                + Integer.toHexString(System.identityHashCode(r)));
-          }
-          logger.finest(s.toString());
-        }
-      }
-
-      if (node.hasExtensions()) {
-        for (Trie child : node.getExtensions()) {
-          sort(child, models);
-        }
-      } else if (logger.isLoggable(Level.FINE)) {
-        logger.fine("Node has 0 children to extend: " + node);
-      }
-    }
-  }
-
-  // write grammar to disk
-  public void writeGrammarOnDisk(String file) {
-  }
-  
-  /**
-   * Adds OOV rules for all words in the input lattice to the current grammar. Uses addOOVRule() so that
-   * sub-grammars can define different types of OOV rules if needed (as is used in {@link PhraseTable}).
-   * 
-   * @param inputLattice the lattice representing the input sentence
-   * @param featureFunctions a list of feature functions used for scoring
-   */
-  public static void addOOVRules(Grammar grammar, Lattice<Token> inputLattice, 
-      List<FeatureFunction> featureFunctions, boolean onlyTrue) {
-    /*
-     * Add OOV rules; This should be called after the manual constraints have
-     * been set up.
-     */
-    HashSet<Integer> words = new HashSet<Integer>();
-    for (Node<Token> node : inputLattice) {
-      for (Arc<Token> arc : node.getOutgoingArcs()) {
-        // create a rule, but do not add into the grammar trie
-        // TODO: which grammar should we use to create an OOV rule?
-        int sourceWord = arc.getLabel().getWord();
-        if (sourceWord == Vocabulary.id(Vocabulary.START_SYM)
-            || sourceWord == Vocabulary.id(Vocabulary.STOP_SYM))
-          continue;
-
-        // Determine if word is actual OOV.
-        if (onlyTrue && ! Vocabulary.hasId(sourceWord))
-          continue;
-
-        words.add(sourceWord);
-      }
-    }
-
-    for (int sourceWord: words) 
-      grammar.addOOVRules(sourceWord, featureFunctions);
-
-    // Sort all the rules (not much to actually do, this just marks it as sorted)
-    grammar.sortGrammar(featureFunctions);
-  }
-}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/tm/BasicRuleCollection.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/BasicRuleCollection.java b/src/joshua/decoder/ff/tm/BasicRuleCollection.java
deleted file mode 100644
index 6dda7f7..0000000
--- a/src/joshua/decoder/ff/tm/BasicRuleCollection.java
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.tm;
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-
-import joshua.decoder.ff.FeatureFunction;
-
-/**
- * Basic collection of translation rules.
- * 
- * @author Lane Schwartz
- * @author Zhifei Li
- */
-public class BasicRuleCollection implements RuleCollection {
-
-  /**
-   * Indicates whether the rules in this collection have been sorted based on the latest feature
-   * function values.
-   */
-  protected boolean sorted;
-
-  /** List of rules stored in this collection. */
-  protected final List<Rule> rules;
-
-  /** Number of nonterminals in the source pattern. */
-  protected int arity;
-
-  /**
-   * Sequence of terminals and nonterminals in the source pattern.
-   */
-  protected int[] sourceTokens;
-
-  /**
-   * Constructs an initially empty rule collection.
-   * 
-   * @param arity Number of nonterminals in the source pattern
-   * @param sourceTokens Sequence of terminals and nonterminals in the source pattern
-   */
-  public BasicRuleCollection(int arity, int[] sourceTokens) {
-    this.rules = new ArrayList<Rule>();
-    this.sourceTokens = sourceTokens;
-    this.arity = arity;
-    this.sorted = false;
-  }
-
-  public int getArity() {
-    return this.arity;
-  }
-
-  /**
-   * Returns a list of the rules, without ensuring that they are first sorted.
-   */
-  @Override
-  public List<Rule> getRules() {
-    return this.rules;
-  }
-  
-  @Override
-  public boolean isSorted() {
-    return sorted;
-  }
-
-  /**
-   * Return a list of rules sorted according to their estimated model costs.
-   */
-  @Override
-  public synchronized List<Rule> getSortedRules(List<FeatureFunction> models) {
-    if (! isSorted()) {
-      for (Rule rule: getRules())
-        rule.estimateRuleCost(models);
-
-      Collections.sort(rules, Rule.EstimatedCostComparator);
-      this.sorted = true;      
-    }
-    
-    return this.rules;
-  }
-
-  public int[] getSourceSide() {
-    return this.sourceTokens;
-  }
-}