You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/05/03 05:07:38 UTC
[1/6] incubator-joshua git commit: Clean up some code smells detected by findbugs (our static code analyzer)
Repository: incubator-joshua
Updated Branches:
refs/heads/master cf5fbb5ac -> 2c02feafe
Clean up some code smells detected by findbugs (our static code analyzer)
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/7fd3cfcb
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/7fd3cfcb
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/7fd3cfcb
Branch: refs/heads/master
Commit: 7fd3cfcbbd12f07d0b4eb58ef02c39f4af7ef7e3
Parents: cf5fbb5
Author: Pavel Danchenko <da...@amazon.com>
Authored: Mon Dec 14 10:34:16 2015 +0100
Committer: Kellen Sunderland <ke...@amazon.com>
Committed: Mon May 2 12:20:53 2016 -0700
----------------------------------------------------------------------
src/joshua/corpus/Span.java | 10 ++++-----
src/joshua/decoder/BLEU.java | 13 ++++++------
src/joshua/decoder/Decoder.java | 4 +++-
src/joshua/decoder/JoshuaConfiguration.java | 2 +-
src/joshua/decoder/chart_parser/DotChart.java | 10 +++++----
src/joshua/decoder/ff/FeatureVector.java | 16 +++++++-------
src/joshua/decoder/ff/LabelCombinationFF.java | 8 +++----
src/joshua/decoder/ff/RuleShape.java | 8 +++----
src/joshua/decoder/ff/fragmentlm/Tree.java | 22 +++++++++-----------
.../ff/lm/StateMinimizingLanguageModel.java | 4 +++-
.../ff/similarity/EdgePhraseSimilarityFF.java | 8 ++++---
.../decoder/ff/tm/SentenceFilteredGrammar.java | 4 +---
.../GrammarBuilderWalkerFunction.java | 5 -----
src/joshua/decoder/hypergraph/HyperGraph.java | 14 +++++--------
.../decoder/hypergraph/KBestExtractor.java | 6 +++---
.../decoder/io/TranslationRequestStream.java | 2 +-
src/joshua/decoder/segment_file/Sentence.java | 15 ++++++-------
src/joshua/lattice/Lattice.java | 6 +++---
src/joshua/metrics/BLEU.java | 9 ++++----
src/joshua/metrics/EvaluationMetric.java | 6 +++---
src/joshua/metrics/MinimumChangeBLEU.java | 11 +++++-----
src/joshua/metrics/Precis.java | 9 ++++----
src/joshua/util/JoshuaEval.java | 5 ++---
src/joshua/util/Lists.java | 3 +++
.../util/encoding/FeatureTypeAnalyzer.java | 14 ++++---------
25 files changed, 102 insertions(+), 112 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fd3cfcb/src/joshua/corpus/Span.java
----------------------------------------------------------------------
diff --git a/src/joshua/corpus/Span.java b/src/joshua/corpus/Span.java
index d26d5ea..a51a9d2 100644
--- a/src/joshua/corpus/Span.java
+++ b/src/joshua/corpus/Span.java
@@ -21,6 +21,7 @@ package joshua.corpus;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
+import java.util.NoSuchElementException;
/**
* Represents a span with an inclusive starting index and an exclusive ending index.
@@ -113,14 +114,13 @@ public class Span implements Iterable<Integer>, Comparable<Span> {
int next = start;
public boolean hasNext() {
- if (next < end) {
- return true;
- } else {
- return false;
- }
+ return next < end;
}
public Integer next() {
+ if (!hasNext()) {
+ throw new NoSuchElementException();
+ }
return next++;
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fd3cfcb/src/joshua/decoder/BLEU.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/BLEU.java b/src/joshua/decoder/BLEU.java
index 129b792..1b3e3f8 100644
--- a/src/joshua/decoder/BLEU.java
+++ b/src/joshua/decoder/BLEU.java
@@ -145,9 +145,10 @@ public class BLEU {
float resBleu = 0.0f;
int[] numNgramMatch = new int[bleuOrder];
- for (String ngram : hypNgramTbl.keySet()) {// each ngram in hyp
+ for (Map.Entry<String, Integer> entry : hypNgramTbl.entrySet()) {// each ngram in hyp
+ String ngram = entry.getKey();
if (maxRefCountTbl.containsKey(ngram)) {
- int hypNgramCount = hypNgramTbl.get(ngram);
+ int hypNgramCount = entry.getValue();
int effectiveNumMatch = hypNgramCount;
@@ -187,14 +188,14 @@ public class BLEU {
float resBleu = 0;
int[] numNgramMatch = new int[bleuOrder];
- for (Iterator<String> it = hypNgramTbl.keySet().iterator(); it.hasNext();) {
- String ngram = it.next();
+ for (Map.Entry<String, Integer> entry : hypNgramTbl.entrySet()) {
+ String ngram = entry.getKey();
if (refNgramTbl.containsKey(ngram)) {
if (doNgramClip) {
numNgramMatch[Regex.spaces.split(ngram).length - 1] += Support.findMin(
- refNgramTbl.get(ngram), hypNgramTbl.get(ngram)); // ngram clip
+ refNgramTbl.get(ngram), entry.getValue()); // ngram clip
} else {
- numNgramMatch[Regex.spaces.split(ngram).length - 1] += hypNgramTbl.get(ngram);// without
+ numNgramMatch[Regex.spaces.split(ngram).length - 1] += entry.getValue();// without
// ngram
// count
// clipping
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fd3cfcb/src/joshua/decoder/Decoder.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/Decoder.java b/src/joshua/decoder/Decoder.java
index aab6d36..0057f87 100644
--- a/src/joshua/decoder/Decoder.java
+++ b/src/joshua/decoder/Decoder.java
@@ -35,6 +35,8 @@ import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
+import com.google.common.base.Strings;
+
import joshua.corpus.Vocabulary;
import joshua.decoder.ff.FeatureVector;
import joshua.decoder.JoshuaConfiguration.INPUT_TYPE;
@@ -639,7 +641,7 @@ public class Decoder {
/* Add command-line-passed weights to the weights array for processing below */
- if (joshuaConfiguration.weight_overwrite != "") {
+ if (!Strings.isNullOrEmpty(joshuaConfiguration.weight_overwrite)) {
String[] tokens = joshuaConfiguration.weight_overwrite.split("\\s+");
for (int i = 0; i < tokens.length; i += 2) {
String feature = tokens[i];
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fd3cfcb/src/joshua/decoder/JoshuaConfiguration.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/JoshuaConfiguration.java b/src/joshua/decoder/JoshuaConfiguration.java
index a825268..7a3de23 100644
--- a/src/joshua/decoder/JoshuaConfiguration.java
+++ b/src/joshua/decoder/JoshuaConfiguration.java
@@ -538,7 +538,7 @@ public class JoshuaConfiguration {
logger.finest(String.format("mark_oovs: %s", mark_oovs));
} else if (parameter.equals(normalize_key("pop-limit"))) {
- pop_limit = Integer.valueOf(fds[1]);
+ pop_limit = Integer.parseInt(fds[1]);
logger.finest(String.format("pop-limit: %s", pop_limit));
} else if (parameter.equals(normalize_key("input-type"))) {
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fd3cfcb/src/joshua/decoder/chart_parser/DotChart.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/chart_parser/DotChart.java b/src/joshua/decoder/chart_parser/DotChart.java
index 64972d5..b82b68c 100644
--- a/src/joshua/decoder/chart_parser/DotChart.java
+++ b/src/joshua/decoder/chart_parser/DotChart.java
@@ -22,6 +22,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
+import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
@@ -302,18 +303,19 @@ class DotChart {
*/
private ArrayList<Trie> matchAll(DotNode dotNode, int wordID) {
- ArrayList<Trie> trieList = new ArrayList<Trie>();
+ ArrayList<Trie> trieList = new ArrayList<>();
HashMap<Integer, ? extends Trie> childrenTbl = dotNode.trieNode.getChildren();
if (childrenTbl != null && wordID >= 0) {
// get all the extensions, map to string, check for *, build regexp
- for (Integer arcID : childrenTbl.keySet()) {
+ for (Map.Entry<Integer, ? extends Trie> entry : childrenTbl.entrySet()) {
+ Integer arcID = entry.getKey();
if (arcID == wordID) {
- trieList.add(childrenTbl.get(arcID));
+ trieList.add(entry.getValue());
} else {
String arcWord = Vocabulary.word(arcID);
if (Vocabulary.word(wordID).matches(arcWord)) {
- trieList.add(childrenTbl.get(arcID));
+ trieList.add(entry.getValue());
}
}
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fd3cfcb/src/joshua/decoder/ff/FeatureVector.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/FeatureVector.java b/src/joshua/decoder/ff/FeatureVector.java
index f95a93c..dcbcda2 100644
--- a/src/joshua/decoder/ff/FeatureVector.java
+++ b/src/joshua/decoder/ff/FeatureVector.java
@@ -315,13 +315,13 @@ public class FeatureVector {
* to be compatible with their tuners.
*/
public String mosesString() {
- String outputString = "";
+ StringBuilder outputString = new StringBuilder();
HashSet<String> printed_keys = new HashSet<String>();
// First print all the dense feature names in order
for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++) {
- outputString += String.format("%s=%.3f ", DENSE_FEATURE_NAMES.get(i).replaceAll("_", "-"), getDense(i));
+ outputString.append(String.format("%s=%.3f ", DENSE_FEATURE_NAMES.get(i).replaceAll("_", "-"), getDense(i)));
printed_keys.add(DENSE_FEATURE_NAMES.get(i));
}
@@ -334,10 +334,10 @@ public class FeatureVector {
if (key.equals("OOVPenalty"))
// force moses to see it as sparse
key = "OOV_Penalty";
- outputString += String.format("%s=%.3f ", key, value);
+ outputString.append(String.format("%s=%.3f ", key, value));
}
}
- return outputString.trim();
+ return outputString.toString().trim();
}
/***
@@ -346,13 +346,13 @@ public class FeatureVector {
*/
@Override
public String toString() {
- String outputString = "";
+ StringBuilder outputString = new StringBuilder();
HashSet<String> printed_keys = new HashSet<String>();
// First print all the dense feature names in order
for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++) {
- outputString += String.format("%s=%.3f ", DENSE_FEATURE_NAMES.get(i), getDense(i));
+ outputString.append(String.format("%s=%.3f ", DENSE_FEATURE_NAMES.get(i), getDense(i)));
printed_keys.add(DENSE_FEATURE_NAMES.get(i));
}
@@ -361,8 +361,8 @@ public class FeatureVector {
Collections.sort(keys);
for (String key: keys)
if (! printed_keys.contains(key))
- outputString += String.format("%s=%.3f ", key, sparseFeatures.get(key));
+ outputString.append(String.format("%s=%.3f ", key, sparseFeatures.get(key)));
- return outputString.trim();
+ return outputString.toString().trim();
}
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fd3cfcb/src/joshua/decoder/ff/LabelCombinationFF.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/LabelCombinationFF.java b/src/joshua/decoder/ff/LabelCombinationFF.java
index 37e83f9..38a85db 100644
--- a/src/joshua/decoder/ff/LabelCombinationFF.java
+++ b/src/joshua/decoder/ff/LabelCombinationFF.java
@@ -42,13 +42,13 @@ public class LabelCombinationFF extends StatelessFF {
}
private final String computeRuleLabelCombinationDescriptor(Rule rule) {
- String result = getLowerCasedFeatureName() + "_";
- result += RulePropertiesQuerying.getLHSAsString(rule);
+ StringBuilder result = new StringBuilder(getLowerCasedFeatureName() + "_");
+ result.append(RulePropertiesQuerying.getLHSAsString(rule));
// System.out.println("Rule: " + rule);
for (String foreignNonterminalString : RulePropertiesQuerying.getRuleSourceNonterminalStrings(rule)) {
- result += "_" + foreignNonterminalString;
+ result.append("_").append(foreignNonterminalString);
}
- return result;
+ return result.toString();
}
@Override
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fd3cfcb/src/joshua/decoder/ff/RuleShape.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/RuleShape.java b/src/joshua/decoder/ff/RuleShape.java
index ff022ef..e243528 100644
--- a/src/joshua/decoder/ff/RuleShape.java
+++ b/src/joshua/decoder/ff/RuleShape.java
@@ -43,20 +43,20 @@ public class RuleShape extends StatelessFF {
}
private String pattern(int[] ids) {
- String pattern = "";
+ StringBuilder pattern = new StringBuilder();
int curtype = gettype(ids[0]);
int curcount = 1;
for (int i = 1; i < ids.length; i++) {
if (gettype(ids[i]) != curtype) {
- pattern += String.format("%s%s_", curtype < 0 ? "N" : "x", curcount > 1 ? "+" : "");
+ pattern.append(String.format("%s%s_", curtype < 0 ? "N" : "x", curcount > 1 ? "+" : ""));
curtype = gettype(ids[i]);
curcount = 1;
} else {
curcount++;
}
}
- pattern += String.format("%s%s_", curtype < 0 ? "N" : "x", curcount > 1 ? "+" : "");
- return pattern;
+ pattern.append(String.format("%s%s_", curtype < 0 ? "N" : "x", curcount > 1 ? "+" : ""));
+ return pattern.toString();
}
@Override
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fd3cfcb/src/joshua/decoder/ff/fragmentlm/Tree.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/fragmentlm/Tree.java b/src/joshua/decoder/ff/fragmentlm/Tree.java
index a4aa5a8..b52ccce 100644
--- a/src/joshua/decoder/ff/fragmentlm/Tree.java
+++ b/src/joshua/decoder/ff/fragmentlm/Tree.java
@@ -115,14 +115,14 @@ public class Tree implements Serializable {
* @return
*/
public String getRule() {
- String ruleString = null;
- if (!isLeaf()) {
- ruleString = "(" + Vocabulary.word(getLabel());
- for (Tree child : getChildren())
- ruleString += " " + Vocabulary.word(child.getLabel());
+ if (isLeaf()) {
+ return null;
}
-
- return ruleString;
+ StringBuilder ruleString = new StringBuilder("(" + Vocabulary.word(getLabel()));
+ for (Tree child : getChildren()) {
+ ruleString.append(" ").append(Vocabulary.word(child.getLabel()));
+ }
+ return ruleString.toString();
}
/*
@@ -537,7 +537,7 @@ public class Tree implements Serializable {
*
* @param rule
* @param tailNodes
- * @param derivation
+ * @param derivation - should not be null
* @param maxDepth
* @return
*/
@@ -551,10 +551,8 @@ public class Tree implements Serializable {
tree = tree.shallowClone();
System.err.println(String.format("buildTree(%s)", tree));
- if (derivationStates != null) {
- for (int i = 0; i < derivationStates.length; i++) {
- System.err.println(String.format(" -> %d: %s", i, derivationStates[i]));
- }
+ for (int i = 0; i < derivationStates.length; i++) {
+ System.err.println(String.format(" -> %d: %s", i, derivationStates[i]));
}
List<Tree> frontier = tree.getNonterminalYield();
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fd3cfcb/src/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java b/src/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
index 5e406de..f07b668 100644
--- a/src/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
+++ b/src/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
@@ -144,8 +144,10 @@ public class StateMinimizingLanguageModel extends LanguageModelFF {
}
int sentID = sentence.id();
- if (!poolMap.containsKey(sentID))
+ // Since sentId is unique across threads, next operations are safe, but not atomic!
+ if (!poolMap.containsKey(sentID)) {
poolMap.put(sentID, KenLM.createPool());
+ }
// Get the probability of applying the rule and the new state
StateProbPair pair = ((KenLM) languageModel).probRule(words, poolMap.get(sentID));
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fd3cfcb/src/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java b/src/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java
index 22c3733..3497001 100644
--- a/src/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java
+++ b/src/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java
@@ -25,8 +25,11 @@ import java.io.PrintWriter;
import java.net.Socket;
import java.net.UnknownHostException;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.List;
+import com.google.common.base.Throwables;
+
import joshua.corpus.Vocabulary;
import joshua.decoder.JoshuaConfiguration;
import joshua.decoder.chart_parser.SourcePath;
@@ -195,8 +198,7 @@ public class EdgePhraseSimilarityFF extends StatefulFF implements SourceDependen
try {
return new EdgePhraseSimilarityFF(this.weights, args, config);
} catch (Exception e) {
- e.printStackTrace();
- return null;
+ throw Throwables.propagate(e);
}
}
@@ -229,7 +231,7 @@ public class EdgePhraseSimilarityFF extends StatefulFF implements SourceDependen
int[] source = batch.get(i);
int[] target = batch.get(i + 1);
- if (source.equals(target)) {
+ if (Arrays.equals(source, target)) {
similarity += 1;
count++;
} else {
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fd3cfcb/src/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/SentenceFilteredGrammar.java b/src/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
index 694430b..d540727 100644
--- a/src/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
+++ b/src/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
@@ -56,10 +56,8 @@ public class SentenceFilteredGrammar extends MemoryBasedBatchGrammar {
int origCount = getNumRules(baseGrammar.getTrieRoot());
long startTime = System.currentTimeMillis();
- /* Filter the rules */
+ /* Filter the rules; returns non-null object */
this.filteredTrie = filter(baseGrammar.getTrieRoot());
- if (filteredTrie == null)
- filteredTrie = new SentenceFilteredTrie(baseGrammar.getTrieRoot());
int filteredCount = getNumRules();
float seconds = (System.currentTimeMillis() - startTime) / 1000.0f;
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fd3cfcb/src/joshua/decoder/hypergraph/GrammarBuilderWalkerFunction.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/GrammarBuilderWalkerFunction.java b/src/joshua/decoder/hypergraph/GrammarBuilderWalkerFunction.java
index 5a3b422..12e79c5 100644
--- a/src/joshua/decoder/hypergraph/GrammarBuilderWalkerFunction.java
+++ b/src/joshua/decoder/hypergraph/GrammarBuilderWalkerFunction.java
@@ -90,11 +90,6 @@ public class GrammarBuilderWalkerFunction implements WalkerFunction {
private Rule getRuleWithSpans(HyperEdge edge, HGNode head) {
Rule edgeRule = edge.getRule();
- // System.err.printf("EdgeRule: %s\n", edgeRule);
- if (!(edgeRule instanceof Rule)) {
- // System.err.println("edge rule is not a bilingual rule");
- return null;
- }
int headLabel = getLabelWithSpan(head);
// System.err.printf("Head label: %s\n", headLabel);
// if (edge.getAntNodes() != null) {
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fd3cfcb/src/joshua/decoder/hypergraph/HyperGraph.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/HyperGraph.java b/src/joshua/decoder/hypergraph/HyperGraph.java
index 2d1cfb0..003c930 100644
--- a/src/joshua/decoder/hypergraph/HyperGraph.java
+++ b/src/joshua/decoder/hypergraph/HyperGraph.java
@@ -144,19 +144,15 @@ public class HyperGraph {
* @param fileName
*/
public void dump(String fileName, List<FeatureFunction> model) {
- PrintWriter out = null;
- try {
- out = new PrintWriter(fileName, "UTF-8");
+ try ( PrintWriter out = new PrintWriter(fileName, "UTF-8") ) {
+ count();
+ out.println("# target ||| features");
+ out.println(String.format("%d %d", numNodes, numEdges));
+ new ForestWalker(TRAVERSAL.POSTORDER).walk(this.goalNode, new HyperGraphDumper(out, model));
} catch (IOException e) {
System.err.println("* Can't dump hypergraph to file '" + fileName + "'");
e.printStackTrace();
}
-
- count();
- out.println("# target ||| features");
- out.println(String.format("%d %d", numNodes, numEdges));
- new ForestWalker(TRAVERSAL.POSTORDER).walk(this.goalNode, new HyperGraphDumper(out, model));
- out.close();
}
public float bestScore() {
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fd3cfcb/src/joshua/decoder/hypergraph/KBestExtractor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/KBestExtractor.java b/src/joshua/decoder/hypergraph/KBestExtractor.java
index 98f6f15..6dd3207 100644
--- a/src/joshua/decoder/hypergraph/KBestExtractor.java
+++ b/src/joshua/decoder/hypergraph/KBestExtractor.java
@@ -912,12 +912,12 @@ public class KBestExtractor {
* @return
*/
private String quoteTerminals(String words) {
- String quotedWords = "";
+ StringBuilder quotedWords = new StringBuilder();
for (String word: words.split("\\s+"))
if (word.startsWith("[") && word.endsWith("]"))
- quotedWords += String.format("%s ", word);
+ quotedWords.append(String.format("%s ", word));
else
- quotedWords += String.format("\"%s\" ", word);
+ quotedWords.append(String.format("\"%s\" ", word));
return quotedWords.substring(0, quotedWords.length() - 1);
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fd3cfcb/src/joshua/decoder/io/TranslationRequestStream.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/io/TranslationRequestStream.java b/src/joshua/decoder/io/TranslationRequestStream.java
index 8f1c754..47f5d81 100644
--- a/src/joshua/decoder/io/TranslationRequestStream.java
+++ b/src/joshua/decoder/io/TranslationRequestStream.java
@@ -58,7 +58,7 @@ public class TranslationRequestStream {
private StreamHandler requestHandler = null;
/* Whether the request has been killed by a broken client connection. */
- private boolean isShutDown = false;
+ private volatile boolean isShutDown = false;
public TranslationRequestStream(BufferedReader reader, JoshuaConfiguration joshuaConfiguration) {
this.joshuaConfiguration = joshuaConfiguration;
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fd3cfcb/src/joshua/decoder/segment_file/Sentence.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/segment_file/Sentence.java b/src/joshua/decoder/segment_file/Sentence.java
index 985a9fe..588850b 100644
--- a/src/joshua/decoder/segment_file/Sentence.java
+++ b/src/joshua/decoder/segment_file/Sentence.java
@@ -303,11 +303,12 @@ public class Sentence {
* @return
*/
public String source() {
- String str = "";
+ StringBuilder str = new StringBuilder();
int[] ids = getWordIDs();
- for (int i = 1; i < ids.length - 1; i++)
- str += Vocabulary.word(ids[i]) + " ";
- return str.trim();
+ for (int i = 1; i < ids.length - 1; i++) {
+ str.append(Vocabulary.word(ids[i])).append(" ");
+ }
+ return str.toString().trim();
}
/**
@@ -347,16 +348,16 @@ public class Sentence {
public String source(int i, int j) {
StringTokenizer st = new StringTokenizer(fullSource());
int index = 0;
- String substring = "";
+ StringBuilder substring = new StringBuilder();
while (st.hasMoreTokens()) {
String token = st.nextToken();
if (index >= j)
break;
if (index >= i)
- substring += token + " ";
+ substring.append(token).append(" ");
index++;
}
- return substring.trim();
+ return substring.toString().trim();
}
public String[] references() {
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fd3cfcb/src/joshua/lattice/Lattice.java
----------------------------------------------------------------------
diff --git a/src/joshua/lattice/Lattice.java b/src/joshua/lattice/Lattice.java
index bf2bf87..b0ef40f 100644
--- a/src/joshua/lattice/Lattice.java
+++ b/src/joshua/lattice/Lattice.java
@@ -199,8 +199,8 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
while (arcMatcher.matches()) {
numArcs++;
String arcLabel = arcMatcher.group(1);
- float arcWeight = Float.valueOf(arcMatcher.group(2));
- int destinationNodeID = nodeID + Integer.valueOf(arcMatcher.group(3));
+ float arcWeight = Float.parseFloat(arcMatcher.group(2));
+ int destinationNodeID = nodeID + Integer.parseInt(arcMatcher.group(3));
Node<Token> destinationNode;
if (destinationNodeID < nodes.size() && nodes.get(destinationNodeID) != null) {
@@ -279,7 +279,7 @@ public class Lattice<Value> implements Iterable<Node<Value>> {
while (arcMatcher.matches()) {
String arcLabel = arcMatcher.group(1);
float arcWeight = Float.valueOf(arcMatcher.group(2));
- int destinationNodeID = nodeID + Integer.valueOf(arcMatcher.group(3));
+ int destinationNodeID = nodeID + Integer.parseInt(arcMatcher.group(3));
Node<String> destinationNode;
if (nodes.containsKey(destinationNodeID)) {
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fd3cfcb/src/joshua/metrics/BLEU.java
----------------------------------------------------------------------
diff --git a/src/joshua/metrics/BLEU.java b/src/joshua/metrics/BLEU.java
index 8f3a92a..95c6cee 100644
--- a/src/joshua/metrics/BLEU.java
+++ b/src/joshua/metrics/BLEU.java
@@ -20,6 +20,7 @@ package joshua.metrics;
import java.util.HashMap;
import java.util.Iterator;
+import java.util.Map;
import java.util.logging.Logger;
public class BLEU extends EvaluationMetric {
@@ -115,11 +116,9 @@ public class BLEU extends EvaluationMetric {
// ...and update as necessary from the other reference translations
for (int r = 1; r < refsPerSen; ++r) {
HashMap<String, Integer> nextNgramCounts = getNgramCountsAll(refSentences[i][r]);
- Iterator<String> it = (nextNgramCounts.keySet()).iterator();
-
- while (it.hasNext()) {
- gram = it.next();
- nextCount = nextNgramCounts.get(gram);
+ for (Map.Entry<String, Integer> entry : nextNgramCounts.entrySet()) {
+ gram = entry.getKey();
+ nextCount = entry.getValue();
if (maxNgramCounts[i].containsKey(gram)) { // update if necessary
oldCount = maxNgramCounts[i].get(gram);
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fd3cfcb/src/joshua/metrics/EvaluationMetric.java
----------------------------------------------------------------------
diff --git a/src/joshua/metrics/EvaluationMetric.java b/src/joshua/metrics/EvaluationMetric.java
index f22bf47..4dd9fbd 100644
--- a/src/joshua/metrics/EvaluationMetric.java
+++ b/src/joshua/metrics/EvaluationMetric.java
@@ -284,12 +284,12 @@ public abstract class EvaluationMetric {
int[][] SS = suffStats(cand_strings, cand_indices);
for (int d = 0; d < size; ++d) {
- String stats_str = "";
+ StringBuilder stats_str = new StringBuilder();
for (int s = 0; s < suffStatsCount - 1; ++s) {
- stats_str += SS[d][s] + " ";
+ stats_str.append(SS[d][s]).append(" ");
}
- stats_str += SS[d][suffStatsCount - 1];
+ stats_str.append(SS[d][suffStatsCount - 1]);
outFile.println(stats_str);
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fd3cfcb/src/joshua/metrics/MinimumChangeBLEU.java
----------------------------------------------------------------------
diff --git a/src/joshua/metrics/MinimumChangeBLEU.java b/src/joshua/metrics/MinimumChangeBLEU.java
index 17f78ee..fa764c3 100644
--- a/src/joshua/metrics/MinimumChangeBLEU.java
+++ b/src/joshua/metrics/MinimumChangeBLEU.java
@@ -20,6 +20,7 @@ package joshua.metrics;
import java.util.HashMap;
import java.util.Iterator;
+import java.util.Map;
import java.util.logging.Logger;
import joshua.util.Algorithms;
@@ -77,11 +78,9 @@ public class MinimumChangeBLEU extends BLEU {
maxNgramCounts[i] = getNgramCountsAll(refSentences[i][r]);
} else {
HashMap<String, Integer> nextNgramCounts = getNgramCountsAll(refSentences[i][r]);
- Iterator<String> it = (nextNgramCounts.keySet()).iterator();
-
- while (it.hasNext()) {
- gram = it.next();
- nextCount = nextNgramCounts.get(gram);
+ for (Map.Entry<String, Integer> entry : nextNgramCounts.entrySet()) {
+ gram = entry.getKey();
+ nextCount = entry.getValue();
if (maxNgramCounts[i].containsKey(gram)) {
oldCount = maxNgramCounts[i].get(gram);
@@ -214,7 +213,7 @@ public class MinimumChangeBLEU extends BLEU {
public void printDetailedScore_fromStats(int[] stats, boolean oneLiner) {
double wer = stats[suffStatsCount - 1] / stats[suffStatsCount - 3];
- double wer_penalty = (wer >= thresholdWER) ? 1.0 : (wer / thresholdWER);
+ double wer_penalty = (wer >= thresholdWER) ? 1.0d : (wer / thresholdWER);
System.out.println("WER_penalty = " + wer_penalty);
System.out.println("MC_BLEU= " + score(stats));
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fd3cfcb/src/joshua/metrics/Precis.java
----------------------------------------------------------------------
diff --git a/src/joshua/metrics/Precis.java b/src/joshua/metrics/Precis.java
index 865c6cf..82f4106 100644
--- a/src/joshua/metrics/Precis.java
+++ b/src/joshua/metrics/Precis.java
@@ -20,6 +20,7 @@ package joshua.metrics;
import java.util.HashMap;
import java.util.Iterator;
+import java.util.Map;
import java.util.logging.Logger;
import joshua.util.Algorithms;
@@ -122,11 +123,9 @@ public class Precis extends BLEU {
maxNgramCounts[i] = getNgramCountsAll(refSentences[i][r]);
} else {
HashMap<String, Integer> nextNgramCounts = getNgramCountsAll(refSentences[i][r]);
- Iterator<String> it = (nextNgramCounts.keySet()).iterator();
-
- while (it.hasNext()) {
- gram = it.next();
- nextCount = nextNgramCounts.get(gram);
+ for ( Map.Entry<String, Integer> entry : nextNgramCounts.entrySet() ) {
+ gram = entry.getKey();
+ nextCount = entry.getValue();
if (maxNgramCounts[i].containsKey(gram)) {
oldCount = maxNgramCounts[i].get(gram);
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fd3cfcb/src/joshua/util/JoshuaEval.java
----------------------------------------------------------------------
diff --git a/src/joshua/util/JoshuaEval.java b/src/joshua/util/JoshuaEval.java
index 6e12de5..6c0761a 100644
--- a/src/joshua/util/JoshuaEval.java
+++ b/src/joshua/util/JoshuaEval.java
@@ -115,12 +115,11 @@ public class JoshuaEval {
String[] topCand_str = new String[numSentences];
// BUG: all of this needs to be replaced with the SegmentFileParser and related interfaces.
- try {
+ try (InputStream inStream = new FileInputStream(new File(inFileName));
+ BufferedReader inFile = new BufferedReader(new InputStreamReader(inStream, "utf8"))) {
// read the candidates
- InputStream inStream = new FileInputStream(new File(inFileName));
- BufferedReader inFile = new BufferedReader(new InputStreamReader(inStream, "utf8"));
String line, candidate_str;
if (inFileFormat.equals("plain")) {
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fd3cfcb/src/joshua/util/Lists.java
----------------------------------------------------------------------
diff --git a/src/joshua/util/Lists.java b/src/joshua/util/Lists.java
index 8f51cc0..43ffa00 100644
--- a/src/joshua/util/Lists.java
+++ b/src/joshua/util/Lists.java
@@ -67,6 +67,9 @@ public class Lists {
}
public Integer next() {
+ if (!hasNext()) {
+ throw new NoSuchElementException();
+ }
int result = next;
next += 1;
return result;
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7fd3cfcb/src/joshua/util/encoding/FeatureTypeAnalyzer.java
----------------------------------------------------------------------
diff --git a/src/joshua/util/encoding/FeatureTypeAnalyzer.java b/src/joshua/util/encoding/FeatureTypeAnalyzer.java
index c9b77e9..4a8861c 100644
--- a/src/joshua/util/encoding/FeatureTypeAnalyzer.java
+++ b/src/joshua/util/encoding/FeatureTypeAnalyzer.java
@@ -206,17 +206,11 @@ public class FeatureTypeAnalyzer {
}
FeatureType(String key) {
+ // either throws or returns non-null
FloatEncoder e = EncoderFactory.getFloatEncoder(key);
- if (e != null) {
- encoder = e;
- analyzer = null;
- bits = -1;
- } else if ("8bit".equals(key)) {
- encoder = null;
- analyzer = new Analyzer();
- bits = 8;
- } else
- throw new RuntimeException("Unsupported encoder type: " + key);
+ encoder = e;
+ analyzer = null;
+ bits = -1;
}
void inferUncompressedType() {
[6/6] incubator-joshua git commit: Merge branch 'performance'
Posted by mj...@apache.org.
Merge branch 'performance'
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/2c02feaf
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/2c02feaf
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/2c02feaf
Branch: refs/heads/master
Commit: 2c02feafed5fdc2b9aed551d8d0e13ecc2a51c6e
Parents: cf5fbb5 fb5f720
Author: Matt Post <po...@cs.jhu.edu>
Authored: Mon May 2 23:07:28 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Mon May 2 23:07:28 2016 -0400
----------------------------------------------------------------------
src/joshua/corpus/Span.java | 10 +-
src/joshua/corpus/Vocabulary.java | 188 ++++++++++---------
src/joshua/decoder/BLEU.java | 13 +-
src/joshua/decoder/Decoder.java | 4 +-
src/joshua/decoder/JoshuaConfiguration.java | 2 +-
src/joshua/decoder/chart_parser/DotChart.java | 10 +-
src/joshua/decoder/ff/FeatureVector.java | 16 +-
src/joshua/decoder/ff/LabelCombinationFF.java | 8 +-
src/joshua/decoder/ff/RuleShape.java | 8 +-
src/joshua/decoder/ff/fragmentlm/Tree.java | 22 +--
.../ff/lm/StateMinimizingLanguageModel.java | 4 +-
.../ff/lm/berkeley_lm/LMGrammarBerkeley.java | 49 +++--
.../ff/similarity/EdgePhraseSimilarityFF.java | 8 +-
.../decoder/ff/tm/SentenceFilteredGrammar.java | 4 +-
.../GrammarBuilderWalkerFunction.java | 5 -
src/joshua/decoder/hypergraph/HyperGraph.java | 14 +-
.../decoder/hypergraph/KBestExtractor.java | 6 +-
.../decoder/io/TranslationRequestStream.java | 2 +-
src/joshua/decoder/segment_file/Sentence.java | 15 +-
src/joshua/lattice/Lattice.java | 6 +-
src/joshua/metrics/BLEU.java | 9 +-
src/joshua/metrics/EvaluationMetric.java | 6 +-
src/joshua/metrics/MinimumChangeBLEU.java | 11 +-
src/joshua/metrics/Precis.java | 9 +-
src/joshua/util/JoshuaEval.java | 5 +-
src/joshua/util/Lists.java | 3 +
.../util/encoding/FeatureTypeAnalyzer.java | 14 +-
tst/joshua/corpus/VocabularyTest.java | 118 ++++++++++++
.../LMBerkeleySentenceProbablityTest.java | 29 +++
29 files changed, 377 insertions(+), 221 deletions(-)
----------------------------------------------------------------------
[3/6] incubator-joshua git commit: Reduce concurrency on Vocab by
using opt stamped rw locks Drops concurrency in Vocab class from 7min14s to
0m0s during training run of 50K
Posted by mj...@apache.org.
Reduce lock contention on Vocab by using optimistic stamped read/write locks
Drops lock contention time in the Vocab class from 7min14s to 0m0s during a training run of 50K
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/cb700140
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/cb700140
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/cb700140
Branch: refs/heads/master
Commit: cb7001406da2f601dac51669d56648342c881b45
Parents: 39f59a8
Author: Kellen Sunderland <ke...@amazon.com>
Authored: Wed Jan 27 11:55:13 2016 +0100
Committer: Kellen Sunderland <ke...@amazon.com>
Committed: Mon May 2 12:27:39 2016 -0700
----------------------------------------------------------------------
src/joshua/corpus/Vocabulary.java | 188 +++++++++++++++++++--------------
1 file changed, 106 insertions(+), 82 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/cb700140/src/joshua/corpus/Vocabulary.java
----------------------------------------------------------------------
diff --git a/src/joshua/corpus/Vocabulary.java b/src/joshua/corpus/Vocabulary.java
index 1792219..ee59507 100644
--- a/src/joshua/corpus/Vocabulary.java
+++ b/src/joshua/corpus/Vocabulary.java
@@ -1,21 +1,3 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
package joshua.corpus;
import static joshua.util.FormatUtils.isNonterminal;
@@ -32,6 +14,7 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import java.util.concurrent.locks.StampedLock;
import joshua.decoder.Decoder;
import joshua.decoder.ff.lm.NGramLanguageModel;
@@ -40,33 +23,32 @@ import joshua.util.FormatUtils;
/**
* Static singular vocabulary class.
* Supports (de-)serialization into a vocabulary file.
- *
+ *
* @author Juri Ganitkevitch
*/
public class Vocabulary {
- private final static ArrayList<NGramLanguageModel> LMs = new ArrayList<NGramLanguageModel>();
+ private final static ArrayList<NGramLanguageModel> LMs = new ArrayList<>();
private static List<String> idToString;
private static Map<String, Integer> stringToId;
-
+ private static final StampedLock lock = new StampedLock();
private static volatile List<Integer> nonTerminalIndices;
- private static final Integer lock = new Integer(0);
-
static final int UNKNOWN_ID = 0;
static final String UNKNOWN_WORD = "<unk>";
public static final String START_SYM = "<s>";
public static final String STOP_SYM = "</s>";
-
+
static {
clear();
}
public static boolean registerLanguageModel(NGramLanguageModel lm) {
- synchronized (lock) {
+ long lock_stamp = lock.writeLock();
+ try {
// Store the language model.
LMs.add(lm);
// Notify it of all the existing words.
@@ -74,39 +56,40 @@ public class Vocabulary {
for (int i = idToString.size() - 1; i > 0; i--)
collision = collision || lm.registerWord(idToString.get(i), i);
return collision;
+ } finally {
+ lock.unlockWrite(lock_stamp);
}
}
/**
* Reads a vocabulary from file. This deletes any additions to the vocabulary made prior to
* reading the file.
- *
+ *
* @param file_name
* @return Returns true if vocabulary was read without mismatches or collisions.
* @throws IOException
*/
public static boolean read(final File vocab_file) throws IOException {
- synchronized (lock) {
- DataInputStream vocab_stream =
- new DataInputStream(new BufferedInputStream(new FileInputStream(vocab_file)));
- int size = vocab_stream.readInt();
- Decoder.LOG(1, String.format("Read %d entries from the vocabulary", size));
- clear();
- for (int i = 0; i < size; i++) {
- int id = vocab_stream.readInt();
- String token = vocab_stream.readUTF();
- if (id != Math.abs(id(token))) {
- vocab_stream.close();
- return false;
- }
+ DataInputStream vocab_stream =
+ new DataInputStream(new BufferedInputStream(new FileInputStream(vocab_file)));
+ int size = vocab_stream.readInt();
+ Decoder.LOG(1, String.format("Read %d entries from the vocabulary", size));
+ clear();
+ for (int i = 0; i < size; i++) {
+ int id = vocab_stream.readInt();
+ String token = vocab_stream.readUTF();
+ if (id != Math.abs(id(token))) {
+ vocab_stream.close();
+ return false;
}
- vocab_stream.close();
- return (size + 1 == idToString.size());
}
+ vocab_stream.close();
+ return (size + 1 == idToString.size());
}
public static void write(String file_name) throws IOException {
- synchronized (lock) {
+ long lock_stamp =lock.readLock();
+ try {
File vocab_file = new File(file_name);
DataOutputStream vocab_stream =
new DataOutputStream(new BufferedOutputStream(new FileOutputStream(vocab_file)));
@@ -118,44 +101,81 @@ public class Vocabulary {
}
vocab_stream.close();
}
+ finally{
+ lock.unlockRead(lock_stamp);
+ }
}
/**
* Get the id of the token if it already exists, new id is created otherwise.
- *
- * TODO: currently locks for every call.
- * Separate constant (frozen) ids from changing (e.g. OOV) ids.
- * Constant ids could be immutable -> no locking.
- * Alternatively: could we use ConcurrentHashMap to not have to lock if actually contains it and only lock for modifications?
+ *
+ * TODO: currently locks for every call. Separate constant (frozen) ids from
+ * changing (e.g. OOV) ids. Constant ids could be immutable -> no locking.
+ * Alternatively: could we use ConcurrentHashMap to not have to lock if
+ * actually contains it and only lock for modifications?
*/
public static int id(String token) {
- synchronized (lock) {
+ // First attempt an optimistic read
+ long attempt_read_lock = lock.tryOptimisticRead();
+ if (stringToId.containsKey(token)) {
+ int resultId = stringToId.get(token);
+ if (lock.validate(attempt_read_lock)) {
+ return resultId;
+ }
+ }
+else {
+ if (nonTerminalIndices != null && nt(token)) {
+ throw new IllegalArgumentException(
+ "After the nonterminal indices have been set by calling getNonterminalIndices you can't call id on new nonterminals anymore.");
+ }
+ }
+
+ // The optimistic read failed, try a read with a stamped read lock
+ long read_lock_stamp = lock.readLock();
+ try {
if (stringToId.containsKey(token)) {
return stringToId.get(token);
} else {
if (nonTerminalIndices != null && nt(token)) {
- throw new IllegalArgumentException("After the nonterminal indices have been set by calling getNonterminalIndices you can't call id on new nonterminals anymore.");
+ throw new IllegalArgumentException(
+ "After the nonterminal indices have been set by calling getNonterminalIndices you can't call id on new nonterminals anymore.");
}
- int id = idToString.size() * (nt(token) ? -1 : 1);
-
- // register this (token,id) mapping with each language
- // model, so that they can map it to their own private
- // vocabularies
- for (NGramLanguageModel lm : LMs)
- lm.registerWord(token, Math.abs(id));
+ }
+ } finally {
+ lock.unlockRead(read_lock_stamp);
+ }
- idToString.add(token);
- stringToId.put(token, id);
- return id;
+ // Looks like the id we want is not there, let's get a write lock and add it
+ long write_lock_stamp = lock.writeLock();
+ try {
+ if (stringToId.containsKey(token)) {
+ return stringToId.get(token);
}
+ int id = idToString.size() * (nt(token) ? -1 : 1);
+
+ // register this (token,id) mapping with each language
+ // model, so that they can map it to their own private
+ // vocabularies
+ for (NGramLanguageModel lm : LMs)
+ lm.registerWord(token, Math.abs(id));
+
+ idToString.add(token);
+ stringToId.put(token, id);
+ return id;
+ } finally {
+ lock.unlockWrite(write_lock_stamp);
}
}
public static boolean hasId(int id) {
- synchronized (lock) {
+ long lock_stamp = lock.readLock();
+ try {
id = Math.abs(id);
return (id < idToString.size());
}
+ finally{
+ lock.unlockRead(lock_stamp);
+ }
}
public static int[] addAll(String sentence) {
@@ -170,10 +190,14 @@ public class Vocabulary {
}
public static String word(int id) {
- synchronized (lock) {
+ long lock_stamp = lock.readLock();
+ try {
id = Math.abs(id);
return idToString.get(id);
}
+ finally{
+ lock.unlockRead(lock_stamp);
+ }
}
public static String getWords(int[] ids) {
@@ -190,19 +214,15 @@ public class Vocabulary {
sb.append(word(id)).append(" ");
return sb.deleteCharAt(sb.length() - 1).toString();
}
-
+
/**
* This method returns a list of all (positive) indices
* corresponding to Nonterminals in the Vocabulary.
*/
- public static List<Integer> getNonterminalIndices()
+ public static synchronized List<Integer> getNonterminalIndices()
{
if (nonTerminalIndices == null) {
- synchronized (lock) {
- if (nonTerminalIndices == null) {
- nonTerminalIndices = findNonTerminalIndices();
- }
- }
+ nonTerminalIndices = findNonTerminalIndices();
}
return nonTerminalIndices;
}
@@ -211,7 +231,7 @@ public class Vocabulary {
* Iterates over the Vocabulary and finds all non terminal indices.
*/
private static List<Integer> findNonTerminalIndices() {
- List<Integer> nonTerminalIndices = new ArrayList<Integer>();
+ List<Integer> nonTerminalIndices = new ArrayList<>();
for(int i = 0; i < idToString.size(); i++) {
final String word = idToString.get(i);
if(isNonterminal(word)){
@@ -230,8 +250,8 @@ public class Vocabulary {
}
/**
- * Returns true if the Vocabulary ID represents a nonterminal.
- *
+ * Returns true if the Vocabulary ID represents a nonterminal.
+ *
* @param id
* @return
*/
@@ -244,33 +264,37 @@ public class Vocabulary {
}
public static int size() {
- synchronized (lock) {
+ long lock_stamp = lock.readLock();
+ try {
return idToString.size();
+ } finally {
+ lock.unlockRead(lock_stamp);
}
}
- public static int getTargetNonterminalIndex(int id) {
+ public static synchronized int getTargetNonterminalIndex(int id) {
return FormatUtils.getNonterminalIndex(word(id));
}
/**
- * Clears the vocabulary and initializes it with an unknown word.
- * Registered language models are left unchanged.
+ * Clears the vocabulary and initializes it with an unknown word. Registered
+ * language models are left unchanged.
*/
public static void clear() {
- synchronized (lock) {
- nonTerminalIndices = null;
-
+ long lock_stamp = lock.writeLock();
+ try {
idToString = new ArrayList<String>();
- stringToId = new HashMap<String, Integer>();
-
+ stringToId = new HashMap<String, Integer>();
+
idToString.add(UNKNOWN_ID, UNKNOWN_WORD);
stringToId.put(UNKNOWN_WORD, UNKNOWN_ID);
+ } finally {
+ lock.unlockWrite(lock_stamp);
}
}
-
+
public static void unregisterLanguageModels() {
LMs.clear();
}
-
+
}
[4/6] incubator-joshua git commit: Removed nonTerminalIndices
functionality as it is no longer in use
Posted by mj...@apache.org.
Removed nonTerminalIndices functionality as it is no longer in use
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/ef6d5686
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/ef6d5686
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/ef6d5686
Branch: refs/heads/master
Commit: ef6d5686380fc0965182bb5432adfb35eccab193
Parents: cb70014
Author: Kellen Sunderland <ke...@amazon.com>
Authored: Wed Jan 27 13:35:21 2016 +0100
Committer: Kellen Sunderland <ke...@amazon.com>
Committed: Mon May 2 12:29:06 2016 -0700
----------------------------------------------------------------------
src/joshua/corpus/Vocabulary.java | 40 ----------
tst/joshua/corpus/VocabularyTest.java | 118 +++++++++++++++++++++++++++++
2 files changed, 118 insertions(+), 40 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ef6d5686/src/joshua/corpus/Vocabulary.java
----------------------------------------------------------------------
diff --git a/src/joshua/corpus/Vocabulary.java b/src/joshua/corpus/Vocabulary.java
index ee59507..12d184d 100644
--- a/src/joshua/corpus/Vocabulary.java
+++ b/src/joshua/corpus/Vocabulary.java
@@ -1,7 +1,5 @@
package joshua.corpus;
-import static joshua.util.FormatUtils.isNonterminal;
-
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
@@ -34,7 +32,6 @@ public class Vocabulary {
private static List<String> idToString;
private static Map<String, Integer> stringToId;
private static final StampedLock lock = new StampedLock();
- private static volatile List<Integer> nonTerminalIndices;
static final int UNKNOWN_ID = 0;
static final String UNKNOWN_WORD = "<unk>";
@@ -123,23 +120,12 @@ public class Vocabulary {
return resultId;
}
}
-else {
- if (nonTerminalIndices != null && nt(token)) {
- throw new IllegalArgumentException(
- "After the nonterminal indices have been set by calling getNonterminalIndices you can't call id on new nonterminals anymore.");
- }
- }
// The optimistic read failed, try a read with a stamped read lock
long read_lock_stamp = lock.readLock();
try {
if (stringToId.containsKey(token)) {
return stringToId.get(token);
- } else {
- if (nonTerminalIndices != null && nt(token)) {
- throw new IllegalArgumentException(
- "After the nonterminal indices have been set by calling getNonterminalIndices you can't call id on new nonterminals anymore.");
- }
}
} finally {
lock.unlockRead(read_lock_stamp);
@@ -215,32 +201,6 @@ else {
return sb.deleteCharAt(sb.length() - 1).toString();
}
- /**
- * This method returns a list of all (positive) indices
- * corresponding to Nonterminals in the Vocabulary.
- */
- public static synchronized List<Integer> getNonterminalIndices()
- {
- if (nonTerminalIndices == null) {
- nonTerminalIndices = findNonTerminalIndices();
- }
- return nonTerminalIndices;
- }
-
- /**
- * Iterates over the Vocabulary and finds all non terminal indices.
- */
- private static List<Integer> findNonTerminalIndices() {
- List<Integer> nonTerminalIndices = new ArrayList<>();
- for(int i = 0; i < idToString.size(); i++) {
- final String word = idToString.get(i);
- if(isNonterminal(word)){
- nonTerminalIndices.add(i);
- }
- }
- return nonTerminalIndices;
- }
-
public static int getUnknownId() {
return UNKNOWN_ID;
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ef6d5686/tst/joshua/corpus/VocabularyTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/corpus/VocabularyTest.java b/tst/joshua/corpus/VocabularyTest.java
new file mode 100644
index 0000000..724d9c7
--- /dev/null
+++ b/tst/joshua/corpus/VocabularyTest.java
@@ -0,0 +1,118 @@
+// Copyright 2015 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+package joshua.corpus;
+
+import static org.junit.Assert.*;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+public class VocabularyTest {
+ private static final String WORD1 = "word1";
+ private static final String WORD2 = "word2";
+ private static final String NON_TERMINAL = "[X]";
+ private static final String GOAL = "[GOAL]";
+
+ @Before
+ public void init() {
+ Vocabulary.clear();
+ }
+
+ @After
+ public void deinit() {
+ Vocabulary.clear();
+ }
+
+ @Test
+ public void givenVocabulary_whenEmpty_thenOnlyContainsUnknownWord() {
+ assertTrue(Vocabulary.hasId(Vocabulary.UNKNOWN_ID));
+ assertFalse(Vocabulary.hasId(1));
+ assertFalse(Vocabulary.hasId(-1));
+ assertEquals(Vocabulary.UNKNOWN_WORD, Vocabulary.word(Vocabulary.UNKNOWN_ID));
+ assertEquals(1, Vocabulary.size());
+ }
+
+ @Test
+ public void givenVocabulary_whenNewWord_thenMappingIsAdded() {
+ final int FIRST_WORD_ID = 1;
+ assertFalse(Vocabulary.hasId(FIRST_WORD_ID));
+ assertEquals(FIRST_WORD_ID, Vocabulary.id(WORD1));
+ //should return same id after second call:
+ assertEquals(FIRST_WORD_ID, Vocabulary.id(WORD1));
+ assertTrue(Vocabulary.hasId(FIRST_WORD_ID));
+ assertEquals(WORD1, Vocabulary.word(FIRST_WORD_ID));
+ assertEquals(2, Vocabulary.size());
+ }
+
+ @Test
+ public void givenVocabulary_whenCheckingStringInBracketsOrNegativeNumber_thenIsNonTerminal() {
+ //non-terminals
+ assertTrue(Vocabulary.nt(NON_TERMINAL));
+ //terminals
+ assertFalse(Vocabulary.nt(WORD1));
+ assertFalse(Vocabulary.nt("[]"));
+ assertFalse(Vocabulary.nt("["));
+ assertFalse(Vocabulary.nt("]"));
+ assertFalse(Vocabulary.nt(""));
+
+ //negative numbers indicate non-terminals
+ assertTrue(Vocabulary.nt(-1));
+ assertTrue(Vocabulary.nt(-5));
+
+ //positive numbers indicate terminals:
+ assertFalse(Vocabulary.nt(0));
+ assertFalse(Vocabulary.nt(5));
+
+
+ }
+
+ @Test
+ public void givenVocabulary_whenNonTerminal_thenReturnsStrictlyPositiveNonTerminalIndices() {
+ final int FIRST_NON_TERMINAL_INDEX = 1;
+ assertTrue(Vocabulary.id(NON_TERMINAL) < 0);
+ assertTrue(Vocabulary.hasId(FIRST_NON_TERMINAL_INDEX));
+ assertTrue(Vocabulary.hasId(-FIRST_NON_TERMINAL_INDEX));
+
+ assertTrue(Vocabulary.id("") > 0);
+ assertTrue(Vocabulary.id(WORD1) > 0);
+
+ final int SECOND_NON_TERMINAL_INDEX = 4;
+ assertTrue(Vocabulary.id(GOAL) < 0);
+ assertTrue(Vocabulary.hasId(SECOND_NON_TERMINAL_INDEX));
+ assertTrue(Vocabulary.hasId(-SECOND_NON_TERMINAL_INDEX));
+
+ assertTrue(Vocabulary.id(WORD2) > 0);
+ }
+
+ @Rule
+ public TemporaryFolder folder = new TemporaryFolder();
+
+ @Test
+ public void givenVocabulary_whenWritenAndReading_thenVocabularyStaysTheSame() throws IOException {
+ File vocabFile = folder.newFile();
+
+ int id1 = Vocabulary.id(WORD1);
+ int id2 = Vocabulary.id(NON_TERMINAL);
+ int id3 = Vocabulary.id(WORD2);
+
+ Vocabulary.write(vocabFile.getAbsolutePath());
+
+ Vocabulary.clear();
+
+ Vocabulary.read(vocabFile);
+
+ assertEquals(4, Vocabulary.size()); //unknown word + 3 other words
+ assertTrue(Vocabulary.hasId(id1));
+ assertTrue(Vocabulary.hasId(id2));
+ assertTrue(Vocabulary.hasId(id3));
+ assertEquals(id1, Vocabulary.id(WORD1));
+ assertEquals(id2, Vocabulary.id(NON_TERMINAL));
+ assertEquals(id3, Vocabulary.id(WORD2));
+ }
+}
[5/6] incubator-joshua git commit: Update Vocabulary.java
Posted by mj...@apache.org.
Update Vocabulary.java
Restore the Apache license header (lost in a cherry-pick)
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/fb5f720c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/fb5f720c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/fb5f720c
Branch: refs/heads/master
Commit: fb5f720cde3c8b937e2473017689667b07ff4f19
Parents: ef6d568
Author: Kellen Sunderland <ke...@gmail.com>
Authored: Mon May 2 14:54:41 2016 -0700
Committer: Kellen Sunderland <ke...@amazon.com>
Committed: Mon May 2 14:59:32 2016 -0700
----------------------------------------------------------------------
src/joshua/corpus/Vocabulary.java | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fb5f720c/src/joshua/corpus/Vocabulary.java
----------------------------------------------------------------------
diff --git a/src/joshua/corpus/Vocabulary.java b/src/joshua/corpus/Vocabulary.java
index 12d184d..d79170d 100644
--- a/src/joshua/corpus/Vocabulary.java
+++ b/src/joshua/corpus/Vocabulary.java
@@ -1,3 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
package joshua.corpus;
import java.io.BufferedInputStream;
[2/6] incubator-joshua git commit: Optimized allocations with
sub-array indexes
Posted by mj...@apache.org.
Optimized allocations with sub-array indexes
------------
last 10 minutes of recordings taken
Statistics:
Before
Total TLAB: 1,391.77 GB
Allocation rate: 2.32 GB/s
After
Total TLAB: 1,320.95 GB
Allocation rate: 2.20 GB/s
------------
Results
-0.12 GB/s allocation rate (2.32 GB/s -> 2.20 GB/s)
java.util.Arrays.copyOfRange: -70 GB of allocations from joshua.decoder.ff.lm.berkeley_lm.LMGrammarBerkeley.sentenceLogProbability
10K translation performance testing
Before: avg 5.29
After: avg 5.24
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/39f59a8d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/39f59a8d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/39f59a8d
Branch: refs/heads/master
Commit: 39f59a8d7950f362cc52b2414dbd53efc130e404
Parents: 7fd3cfc
Author: Pavel Danchenko <da...@amazon.com>
Authored: Wed Feb 10 17:12:15 2016 +0100
Committer: Kellen Sunderland <ke...@amazon.com>
Committed: Mon May 2 12:23:39 2016 -0700
----------------------------------------------------------------------
.../ff/lm/berkeley_lm/LMGrammarBerkeley.java | 49 +++++++++++++-------
.../LMBerkeleySentenceProbablityTest.java | 29 ++++++++++++
2 files changed, 62 insertions(+), 16 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/39f59a8d/src/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeley.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeley.java b/src/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeley.java
index d5728cf..2716576 100644
--- a/src/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeley.java
+++ b/src/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeley.java
@@ -24,6 +24,8 @@ import java.util.logging.Handler;
import java.util.logging.Level;
import java.util.logging.Logger;
+import com.google.common.annotations.VisibleForTesting;
+
import joshua.corpus.Vocabulary;
import joshua.decoder.ff.lm.DefaultNGramLanguageModel;
import joshua.decoder.Decoder;
@@ -37,7 +39,7 @@ import edu.berkeley.nlp.lm.util.StrUtils;
/**
* This class wraps Berkeley LM.
- *
+ *
* @author adpauls@gmail.com
*/
public class LMGrammarBerkeley extends DefaultNGramLanguageModel {
@@ -120,9 +122,9 @@ public class LMGrammarBerkeley extends DefaultNGramLanguageModel {
for (int j = startIndex; j < order && j <= sentenceLength; j++) {
// TODO: startIndex dependens on the order, e.g., this.ngramOrder-1 (in srilm, for 3-gram lm,
// start_index=2. othercase, need to check)
- int[] ngram = Arrays.copyOfRange(sentence, 0, j);
- double logProb = ngramLogProbability_helper(ngram, false);
+ double logProb = ngramLogProbability_helper(sentence, 0, j, false);
if (logger.isLoggable(Level.FINE)) {
+ int[] ngram = Arrays.copyOfRange(sentence, 0, j);
String words = Vocabulary.getWords(ngram);
logger.fine("\tlogp ( " + words + " ) = " + logProb);
}
@@ -131,9 +133,9 @@ public class LMGrammarBerkeley extends DefaultNGramLanguageModel {
// regular-order ngrams
for (int i = 0; i <= sentenceLength - order; i++) {
- int[] ngram = Arrays.copyOfRange(sentence, i, i + order);
- double logProb = ngramLogProbability_helper(ngram, false);
+ double logProb = ngramLogProbability_helper(sentence, i, order, false);
if (logger.isLoggable(Level.FINE)) {
+ int[] ngram = Arrays.copyOfRange(sentence, i, i + order);
String words = Vocabulary.getWords(ngram);
logger.fine("\tlogp ( " + words + " ) = " + logProb);
}
@@ -147,26 +149,26 @@ public class LMGrammarBerkeley extends DefaultNGramLanguageModel {
public float ngramLogProbability_helper(int[] ngram, int order) {
return ngramLogProbability_helper(ngram, false);
}
-
+
protected float ngramLogProbability_helper(int[] ngram, boolean log) {
+ return ngramLogProbability_helper(ngram, 0, ngram.length, log);
+ }
+ protected float ngramLogProbability_helper(int sentence[], int ngramStartPos, int ngramLength, boolean log) {
int[] mappedNgram = arrayScratch.get();
- if (mappedNgram.length < ngram.length) {
- arrayScratch.set(mappedNgram = new int[mappedNgram.length * 2]);
+ if (mappedNgram.length < ngramLength) {
+ mappedNgram = new int[mappedNgram.length * 2];
+ arrayScratch.set(mappedNgram);
}
- for (int i = 0; i < ngram.length; ++i) {
- mappedNgram[i] = vocabIdToMyIdMapping[ngram[i]];
+ for (int i = 0; i < ngramLength; ++i) {
+ mappedNgram[i] = vocabIdToMyIdMapping[sentence[ngramStartPos + i]];
}
if (log && logRequests) {
- final int[] copyOf = Arrays.copyOf(mappedNgram, ngram.length);
- for (int i = 0; i < copyOf.length; ++i)
- if (copyOf[i] < 0) copyOf[i] = unkIndex;
- logger.finest(StrUtils.join(WordIndexer.StaticMethods.toList(lm.getWordIndexer(), copyOf)));
+ dumpBuffer(mappedNgram, ngramLength);
}
- final float res = lm.getLogProb(mappedNgram, 0, ngram.length);
- return res;
+ return lm.getLogProb(mappedNgram, 0, ngramLength);
}
public static void setLogRequests(Handler handler) {
@@ -183,4 +185,19 @@ public class LMGrammarBerkeley extends DefaultNGramLanguageModel {
public float ngramLogProbability(int[] ngram, int order) {
return ngramLogProbability(ngram);
}
+
+ private void dumpBuffer(int[] buffer, int len) {
+ final int[] copyOf = Arrays.copyOf(buffer, len);
+ for (int i = 0; i < copyOf.length; ++i) {
+ if (copyOf[i] < 0) {
+ copyOf[i] = unkIndex;
+ }
+ }
+ logger.finest(StrUtils.join(WordIndexer.StaticMethods.toList(lm.getWordIndexer(), copyOf)));
+ }
+
+ @VisibleForTesting
+ ArrayEncodedNgramLanguageModel<String> getLM() {
+ return lm;
+ }
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/39f59a8d/tst/joshua/decoder/ff/lm/berkeley_lm/LMBerkeleySentenceProbablityTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/decoder/ff/lm/berkeley_lm/LMBerkeleySentenceProbablityTest.java b/tst/joshua/decoder/ff/lm/berkeley_lm/LMBerkeleySentenceProbablityTest.java
new file mode 100644
index 0000000..74a832e
--- /dev/null
+++ b/tst/joshua/decoder/ff/lm/berkeley_lm/LMBerkeleySentenceProbablityTest.java
@@ -0,0 +1,29 @@
+package joshua.decoder.ff.lm.berkeley_lm;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.Test;
+
+import edu.berkeley.nlp.lm.ArrayEncodedNgramLanguageModel;
+
+public class LMBerkeleySentenceProbablityTest {
+
+ @Test
+ public void verifySentenceLogProbability() {
+ LMGrammarBerkeley grammar = new LMGrammarBerkeley(2, "resources/berkeley_lm/lm");
+ grammar.registerWord("the", 2);
+ grammar.registerWord("chat-rooms", 3);
+ grammar.registerWord("<unk>", 0);
+
+ ArrayEncodedNgramLanguageModel<String> lm = grammar.getLM();
+ float expected =
+ lm.getLogProb(new int[] {}, 0, 0)
+ + lm.getLogProb(new int[] {0}, 0, 1)
+ + lm.getLogProb(new int[] {0, 2}, 0, 2)
+ + lm.getLogProb(new int[] {2, 3}, 0, 2)
+ + lm.getLogProb(new int[] {3, 0}, 0, 2);
+
+ float result = grammar.sentenceLogProbability(new int[] {0, 2, 3, 0}, 2, 0);
+ assertEquals(expected, result, 0.0);
+ }
+}