You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2013/01/28 18:34:16 UTC
svn commit: r1439513 - in /lucene/dev/branches/branch_4x: ./ lucene/
lucene/benchmark/ lucene/benchmark/conf/ lucene/benchmark/scripts/
lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/
lucene/benchmark/src/java/org/apache/lucene/benchmark/...
Author: sarowe
Date: Mon Jan 28 17:34:15 2013
New Revision: 1439513
URL: http://svn.apache.org/viewvc?rev=1439513&view=rev
Log:
LUCENE-4723: Add AnalyzerFactoryTask to benchmark, and enable analyzer creation via the resulting factories using NewAnalyzerTask. (merged trunk r1439510)
Added:
lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AnalyzerFactoryTask.java
- copied unchanged from r1439510, lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AnalyzerFactoryTask.java
lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/AnalyzerFactory.java
- copied unchanged from r1439510, lucene/dev/trunk/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/AnalyzerFactory.java
lucene/dev/branches/branch_4x/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/test-mapping-ISOLatin1Accent-partial.txt
- copied unchanged from r1439510, lucene/dev/trunk/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/test-mapping-ISOLatin1Accent-partial.txt
Removed:
lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_4x/lucene/benchmark/ (props changed)
lucene/dev/branches/branch_4x/lucene/benchmark/conf/shingle.alg
lucene/dev/branches/branch_4x/lucene/benchmark/scripts/shingle.bm2jira.pl
lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java
lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewAnalyzerTask.java
lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java
lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java
lucene/dev/branches/branch_4x/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1439513&r1=1439512&r2=1439513&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Mon Jan 28 17:34:15 2013
@@ -64,6 +64,9 @@ New Features
near-real-time reader is opened that contains those changes.
(Robert Muir, Mike McCandless)
+* LUCENE-4723: Add AnalyzerFactoryTask to benchmark, and enable analyzer
+ creation via the resulting factories using NewAnalyzerTask. (Steve Rowe)
+
API Changes
* LUCENE-4709: FacetResultNode no longer has a residue field. (Shai Erera)
Modified: lucene/dev/branches/branch_4x/lucene/benchmark/conf/shingle.alg
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/benchmark/conf/shingle.alg?rev=1439513&r1=1439512&r2=1439513&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/benchmark/conf/shingle.alg (original)
+++ lucene/dev/branches/branch_4x/lucene/benchmark/conf/shingle.alg Mon Jan 28 17:34:15 2013
@@ -19,25 +19,43 @@ doc.body.tokenized=true
docs.dir=reuters-out
log.step=1000
+-AnalyzerFactory(name:shingle-bigrams-unigrams,
+ StandardTokenizer,
+ ShingleFilter(maxShingleSize:2, outputUnigrams:true))
+
+-AnalyzerFactory(name:shingle-bigrams,
+ StandardTokenizer,
+ ShingleFilter(maxShingleSize:2, outputUnigrams:false))
+
+-AnalyzerFactory(name:shingle-4grams-unigrams,
+ StandardTokenizer,
+ ShingleFilter(maxShingleSize:4, outputUnigrams:true))
+
+-AnalyzerFactory(name:shingle-4grams,
+ StandardTokenizer,
+ ShingleFilter(maxShingleSize:4, outputUnigrams:false))
+
+-AnalyzerFactory(name:standard-tokenizer-only, StandardTokenizer)
+
{ "Rounds"
- -NewShingleAnalyzer(maxShingleSize:2,outputUnigrams:true)
+ -NewAnalyzer(shingle-bigrams-unigrams)
-ResetInputs
{ "BigramsAndUnigrams" { ReadTokens > : 10000 }
- -NewShingleAnalyzer(maxShingleSize:2,outputUnigrams:false)
+ -NewAnalyzer(shingle-bigrams)
-ResetInputs
{ "BigramsOnly" { ReadTokens > : 10000 }
- -NewShingleAnalyzer(maxShingleSize:4,outputUnigrams:true)
+ -NewAnalyzer(shingle-4grams-unigrams)
-ResetInputs
{ "FourgramsAndUnigrams" { ReadTokens > : 10000 }
- -NewShingleAnalyzer(maxShingleSize:4,outputUnigrams:false)
+ -NewAnalyzer(shingle-4grams)
-ResetInputs
{ "FourgramsOnly" { ReadTokens > : 10000 }
- -NewAnalyzer(standard.StandardAnalyzer)
+ -NewAnalyzer(standard-tokenizer-only)
-ResetInputs
{ "UnigramsOnly" { ReadTokens > : 10000 }
Modified: lucene/dev/branches/branch_4x/lucene/benchmark/scripts/shingle.bm2jira.pl
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/benchmark/scripts/shingle.bm2jira.pl?rev=1439513&r1=1439512&r2=1439513&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/benchmark/scripts/shingle.bm2jira.pl (original)
+++ lucene/dev/branches/branch_4x/lucene/benchmark/scripts/shingle.bm2jira.pl Mon Jan 28 17:34:15 2013
@@ -51,7 +51,7 @@ while (<>) {
# Print out platform info
print "JAVA:\n", `java -version 2>&1`, "\nOS:\n";
-if ($^O =~ /win/i) {
+if ($^O =~ /(?<!dar)win/i) {
print "$^O\n";
eval {
require Win32;
Modified: lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java?rev=1439513&r1=1439512&r2=1439513&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java (original)
+++ lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java Mon Jan 28 17:34:15 2013
@@ -23,6 +23,7 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Locale;
+import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.benchmark.byTask.feeds.ContentSource;
@@ -34,6 +35,7 @@ import org.apache.lucene.benchmark.byTas
import org.apache.lucene.benchmark.byTask.tasks.PerfTask;
import org.apache.lucene.benchmark.byTask.tasks.ReadTask;
import org.apache.lucene.benchmark.byTask.tasks.SearchTask;
+import org.apache.lucene.benchmark.byTask.utils.AnalyzerFactory;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.FileUtils;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
@@ -55,6 +57,7 @@ import org.apache.lucene.util.IOUtils;
* <li>Directory, Writer, Reader.
* <li>Taxonomy Directory, Writer, Reader.
* <li>DocMaker, FacetSource and a few instances of QueryMaker.
+ * <li>Named AnalysisFactories.
* <li>Analyzer.
* <li>Statistics data which updated during the run.
* </ul>
@@ -78,6 +81,7 @@ public class PerfRunData implements Clos
// directory, analyzer, docMaker - created at startup.
// reader, writer, searcher - maintained by basic tasks.
private Directory directory;
+ private Map<String,AnalyzerFactory> analyzerFactories = new HashMap<String,AnalyzerFactory>();
private Analyzer analyzer;
private DocMaker docMaker;
private ContentSource contentSource;
@@ -358,7 +362,7 @@ public class PerfRunData implements Clos
}
/**
- * @return Returns the anlyzer.
+ * @return Returns the analyzer.
*/
public Analyzer getAnalyzer() {
return analyzer;
@@ -434,4 +438,7 @@ public class PerfRunData implements Clos
return qm;
}
+ public Map<String,AnalyzerFactory> getAnalyzerFactories() {
+ return analyzerFactories;
+ }
}
Modified: lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewAnalyzerTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewAnalyzerTask.java?rev=1439513&r1=1439512&r2=1439513&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewAnalyzerTask.java (original)
+++ lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewAnalyzerTask.java Mon Jan 28 17:34:15 2013
@@ -16,10 +16,16 @@ package org.apache.lucene.benchmark.byTa
*/
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.util.CharFilterFactory;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.benchmark.byTask.utils.AnalyzerFactory;
import org.apache.lucene.util.Version;
import java.io.IOException;
+import java.io.StreamTokenizer;
+import java.io.StringReader;
import java.util.*;
import java.lang.reflect.Constructor;
@@ -28,12 +34,12 @@ import java.lang.reflect.Constructor;
*
*/
public class NewAnalyzerTask extends PerfTask {
- private List<String> analyzerClassNames;
+ private List<String> analyzerNames;
private int current;
public NewAnalyzerTask(PerfRunData runData) {
super(runData);
- analyzerClassNames = new ArrayList<String>();
+ analyzerNames = new ArrayList<String>();
}
public static final Analyzer createAnalyzer(String className) throws Exception{
@@ -50,55 +56,98 @@ public class NewAnalyzerTask extends Per
@Override
public int doLogic() throws IOException {
- String className = null;
+ String analyzerName = null;
try {
- if (current >= analyzerClassNames.size()) {
+ if (current >= analyzerNames.size()) {
current = 0;
}
- className = analyzerClassNames.get(current++);
+ analyzerName = analyzerNames.get(current++);
Analyzer analyzer = null;
- if (null == className || 0 == className.length()) {
- className = "org.apache.lucene.analysis.standard.StandardAnalyzer";
+ if (null == analyzerName || 0 == analyzerName.length()) {
+ analyzerName = "org.apache.lucene.analysis.standard.StandardAnalyzer";
}
- if (-1 == className.indexOf(".")) {
- try {
- // If no package, first attempt to instantiate a core analyzer
- String coreClassName = "org.apache.lucene.analysis.core." + className;
- analyzer = createAnalyzer(coreClassName);
- className = coreClassName;
- } catch (ClassNotFoundException e) {
- // If not a core analyzer, try the base analysis package
- className = "org.apache.lucene.analysis." + className;
- analyzer = createAnalyzer(className);
- }
+ // First, lookup analyzerName as a named analyzer factory
+ AnalyzerFactory factory = getRunData().getAnalyzerFactories().get(analyzerName);
+ if (null != factory) {
+ analyzer = factory.create();
} else {
- if (className.startsWith("standard.")) {
- className = "org.apache.lucene.analysis." + className;
+ if (analyzerName.contains(".")) {
+ if (analyzerName.startsWith("standard.")) {
+ analyzerName = "org.apache.lucene.analysis." + analyzerName;
+ }
+ analyzer = createAnalyzer(analyzerName);
+ } else { // No package
+ try {
+ // Attempt to instantiate a core analyzer
+ String coreClassName = "org.apache.lucene.analysis.core." + analyzerName;
+ analyzer = createAnalyzer(coreClassName);
+ analyzerName = coreClassName;
+ } catch (ClassNotFoundException e) {
+ // If not a core analyzer, try the base analysis package
+ analyzerName = "org.apache.lucene.analysis." + analyzerName;
+ analyzer = createAnalyzer(analyzerName);
+ }
}
- analyzer = createAnalyzer(className);
}
getRunData().setAnalyzer(analyzer);
- System.out.println("Changed Analyzer to: " + className);
} catch (Exception e) {
- throw new RuntimeException("Error creating Analyzer: " + className, e);
+ throw new RuntimeException("Error creating Analyzer: " + analyzerName, e);
}
return 1;
}
/**
- * Set the params (analyzerClassName only), Comma-separate list of Analyzer class names. If the Analyzer lives in
+ * Set the params (analyzerName only), Comma-separate list of Analyzer class names. If the Analyzer lives in
* org.apache.lucene.analysis, the name can be shortened by dropping the o.a.l.a part of the Fully Qualified Class Name.
* <p/>
+ * Analyzer names may also refer to previously defined AnalyzerFactory's.
+ * <p/>
* Example Declaration: {"NewAnalyzer" NewAnalyzer(WhitespaceAnalyzer, SimpleAnalyzer, StopAnalyzer, standard.StandardAnalyzer) >
+ * <p/>
+ * Example AnalyzerFactory usage:
+ * <pre>
+ * -AnalyzerFactory(name:'whitespace tokenized',WhitespaceTokenizer)
+ * -NewAnalyzer('whitespace tokenized')
+ * </pre>
* @param params analyzerClassName, or empty for the StandardAnalyzer
*/
@Override
public void setParams(String params) {
super.setParams(params);
- for (StringTokenizer tokenizer = new StringTokenizer(params, ","); tokenizer.hasMoreTokens();) {
- String s = tokenizer.nextToken();
- analyzerClassNames.add(s.trim());
+ final StreamTokenizer stok = new StreamTokenizer(new StringReader(params));
+ stok.quoteChar('"');
+ stok.quoteChar('\'');
+ stok.eolIsSignificant(false);
+ stok.ordinaryChar(',');
+ try {
+ while (stok.nextToken() != StreamTokenizer.TT_EOF) {
+ switch (stok.ttype) {
+ case ',': {
+ // Do nothing
+ break;
+ }
+ case '\'':
+ case '\"':
+ case StreamTokenizer.TT_WORD: {
+ analyzerNames.add(stok.sval);
+ break;
+ }
+ default: {
+ throw new RuntimeException("Unexpected token: " + stok.toString());
+ }
+ }
+ }
+ } catch (RuntimeException e) {
+ if (e.getMessage().startsWith("Line #")) {
+ throw e;
+ } else {
+ throw new RuntimeException("Line #" + (stok.lineno() + getAlgLineNum()) + ": ", e);
+ }
+ } catch (Throwable t) {
+ throw new RuntimeException("Line #" + (stok.lineno() + getAlgLineNum()) + ": ", t);
}
+
+
}
/* (non-Javadoc)
Modified: lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java?rev=1439513&r1=1439512&r2=1439513&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java (original)
+++ lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java Mon Jan 28 17:34:15 2013
@@ -62,6 +62,9 @@ public abstract class PerfTask implement
private boolean runInBackground;
private int deltaPri;
+ // The first line of this task's definition in the alg file
+ private int algLineNum = 0;
+
protected static final String NEW_LINE = System.getProperty("line.separator");
/** Should not be used externally */
@@ -317,4 +320,11 @@ public abstract class PerfTask implement
this.disableCounting = disableCounting;
}
+ public void setAlgLineNum(int algLineNum) {
+ this.algLineNum = algLineNum;
+ }
+
+ public int getAlgLineNum() {
+ return algLineNum;
+ }
}
Modified: lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java?rev=1439513&r1=1439512&r2=1439513&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java (original)
+++ lucene/dev/branches/branch_4x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java Mon Jan 28 17:34:15 2013
@@ -58,11 +58,12 @@ public class Algorithm {
StreamTokenizer stok = new StreamTokenizer(new StringReader(algTxt));
stok.commentChar('#');
stok.eolIsSignificant(false);
- stok.ordinaryChar('"');
+ stok.quoteChar('"');
+ stok.quoteChar('\'');
stok.ordinaryChar('/');
stok.ordinaryChar('(');
stok.ordinaryChar(')');
- boolean colonOk = false;
+ boolean colonOk = false;
boolean isDisableCountNextTask = false; // only for primitive tasks
currSequence.setDepth(0);
@@ -74,6 +75,7 @@ public class Algorithm {
Constructor<? extends PerfTask> cnstr = taskClass(config,s)
.asSubclass(PerfTask.class).getConstructor(PerfRunData.class);
PerfTask task = cnstr.newInstance(runData);
+ task.setAlgLineNum(stok.lineno());
task.setDisableCounting(isDisableCountNextTask);
isDisableCountNextTask = false;
currSequence.addTask(task);
@@ -90,24 +92,54 @@ public class Algorithm {
if (stok.ttype!='(') {
stok.pushBack();
} else {
- // get params, for tasks that supports them, - anything until next ')'
+ // get params, for tasks that supports them - allow recursive parenthetical expressions
+ stok.eolIsSignificant(true); // Allow params tokenizer to keep track of line number
StringBuilder params = new StringBuilder();
stok.nextToken();
- while (stok.ttype!=')') {
- switch (stok.ttype) {
- case StreamTokenizer.TT_NUMBER:
- params.append(stok.nval);
- break;
- case StreamTokenizer.TT_WORD:
- params.append(stok.sval);
- break;
- case StreamTokenizer.TT_EOF:
- throw new Exception("unexpexted EOF: - "+stok.toString());
- default:
- params.append((char)stok.ttype);
+ if (stok.ttype != ')') {
+ int count = 1;
+ BALANCED_PARENS: while (true) {
+ switch (stok.ttype) {
+ case StreamTokenizer.TT_NUMBER: {
+ params.append(stok.nval);
+ break;
+ }
+ case StreamTokenizer.TT_WORD: {
+ params.append(stok.sval);
+ break;
+ }
+ case StreamTokenizer.TT_EOF: {
+ throw new RuntimeException("Unexpexted EOF: - "+stok.toString());
+ }
+ case '"':
+ case '\'': {
+ params.append((char)stok.ttype);
+ // re-escape delimiters, if any
+ params.append(stok.sval.replaceAll("" + (char)stok.ttype, "\\\\" + (char)stok.ttype));
+ params.append((char)stok.ttype);
+ break;
+ }
+ case '(': {
+ params.append((char)stok.ttype);
+ ++count;
+ break;
+ }
+ case ')': {
+ if (--count >= 1) { // exclude final closing parenthesis
+ params.append((char)stok.ttype);
+ } else {
+ break BALANCED_PARENS;
+ }
+ break;
+ }
+ default: {
+ params.append((char)stok.ttype);
+ }
+ }
+ stok.nextToken();
}
- stok.nextToken();
}
+ stok.eolIsSignificant(false);
String prm = params.toString().trim();
if (prm.length()>0) {
task.setParams(prm);
@@ -182,10 +214,8 @@ public class Algorithm {
if (stok.ttype!='"') {
stok.pushBack();
} else {
- stok.nextToken();
name = stok.sval;
- stok.nextToken();
- if (stok.ttype!='"' || name==null || name.length()==0) {
+ if (stok.ttype!='"' || name==null || name.length()==0) {
throw new Exception("sequence name problem - "+stok.toString());
}
}
Modified: lucene/dev/branches/branch_4x/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java?rev=1439513&r1=1439512&r2=1439513&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (original)
+++ lucene/dev/branches/branch_4x/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java Mon Jan 28 17:34:15 2013
@@ -73,6 +73,7 @@ public class TestPerfTasksLogic extends
public void setUp() throws Exception {
super.setUp();
copyToWorkDir("reuters.first20.lines.txt");
+ copyToWorkDir("test-mapping-ISOLatin1Accent-partial.txt");
}
/**
@@ -1022,63 +1023,79 @@ public class TestPerfTasksLogic extends
}
/**
- * Test that we can create ShingleAnalyzerWrappers.
+ * Test that we can create shingle analyzers using AnalyzerFactory.
*/
public void testShingleAnalyzer() throws Exception {
String text = "one,two,three, four five six";
- // Default analyzer, maxShingleSize, and outputUnigrams
- Benchmark benchmark = execBenchmark(getShingleConfig(""));
+ // StandardTokenizer, maxShingleSize, and outputUnigrams
+ Benchmark benchmark = execBenchmark(getAnalyzerFactoryConfig
+ ("shingle-analyzer", "StandardTokenizer,ShingleFilter"));
benchmark.getRunData().getAnalyzer().tokenStream
("bogus", new StringReader(text)).close();
- assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
- new String[] {"one", "one two", "two", "two three",
- "three", "three four", "four", "four five",
- "five", "five six", "six"});
- // Default analyzer, maxShingleSize = 3, and outputUnigrams = false
+ BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
+ new String[] { "one", "one two", "two", "two three",
+ "three", "three four", "four", "four five",
+ "five", "five six", "six" });
+ // StandardTokenizer, maxShingleSize = 3, and outputUnigrams = false
benchmark = execBenchmark
- (getShingleConfig("maxShingleSize:3,outputUnigrams:false"));
- assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
- new String[] { "one two", "one two three", "two three",
- "two three four", "three four",
- "three four five", "four five",
- "four five six", "five six" });
- // WhitespaceAnalyzer, default maxShingleSize and outputUnigrams
+ (getAnalyzerFactoryConfig
+ ("shingle-analyzer",
+ "StandardTokenizer,ShingleFilter(maxShingleSize:3,outputUnigrams:false)"));
+ BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
+ new String[] { "one two", "one two three", "two three",
+ "two three four", "three four",
+ "three four five", "four five",
+ "four five six", "five six" });
+ // WhitespaceTokenizer, default maxShingleSize and outputUnigrams
benchmark = execBenchmark
- (getShingleConfig("analyzer:WhitespaceAnalyzer"));
- assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
- new String[] { "one,two,three,", "one,two,three, four",
- "four", "four five", "five", "five six",
- "six" });
+ (getAnalyzerFactoryConfig("shingle-analyzer", "WhitespaceTokenizer,ShingleFilter"));
+ BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
+ new String[] { "one,two,three,", "one,two,three, four",
+ "four", "four five", "five", "five six",
+ "six" });
- // WhitespaceAnalyzer, maxShingleSize=3 and outputUnigrams=false
+ // WhitespaceTokenizer, maxShingleSize=3 and outputUnigrams=false
benchmark = execBenchmark
- (getShingleConfig
- ("outputUnigrams:false,maxShingleSize:3,analyzer:WhitespaceAnalyzer"));
- assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
- new String[] { "one,two,three, four",
- "one,two,three, four five",
- "four five", "four five six",
- "five six" });
+ (getAnalyzerFactoryConfig
+ ("shingle-factory",
+ "WhitespaceTokenizer,ShingleFilter(outputUnigrams:false,maxShingleSize:3)"));
+ BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
+ new String[] { "one,two,three, four",
+ "one,two,three, four five",
+ "four five", "four five six",
+ "five six" });
}
- private void assertEqualShingle
- (Analyzer analyzer, String text, String[] expected) throws Exception {
- BaseTokenStreamTestCase.assertAnalyzesTo(analyzer, text, expected);
- }
-
- private String[] getShingleConfig(String params) {
+ private String[] getAnalyzerFactoryConfig(String name, String params) {
+ final String singleQuoteEscapedName = name.replaceAll("'", "\\\\'");
String algLines[] = {
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
+ "work.dir=" + getWorkDir().getAbsolutePath(),
"content.source.forever=false",
"directory=RAMDirectory",
- "NewShingleAnalyzer(" + params + ")",
+ "AnalyzerFactory(name:'" + singleQuoteEscapedName + "', " + params + ")",
+ "NewAnalyzer('" + singleQuoteEscapedName + "')",
"CreateIndex",
"{ \"AddDocs\" AddDoc > : * "
};
return algLines;
}
+
+ public void testAnalyzerFactory() throws Exception {
+ String text = "Fortieth, Quarantième, Cuadragésimo";
+ Benchmark benchmark = execBenchmark(getAnalyzerFactoryConfig
+ ("ascii folded, pattern replaced, standard tokenized, downcased, bigrammed.'analyzer'",
+ "positionIncrementGap:100,offsetGap:1111,"
+ +"MappingCharFilter(mapping:'test-mapping-ISOLatin1Accent-partial.txt'),"
+ +"PatternReplaceCharFilterFactory(pattern:'e(\\\\\\\\S*)m',replacement:\"$1xxx$1\"),"
+ +"StandardTokenizer,LowerCaseFilter,NGramTokenFilter(minGramSize:2,maxGramSize:2)"));
+ BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
+ new String[] { "fo", "or", "rt", "ti", "ie", "et", "th",
+ "qu", "ua", "ar", "ra", "an", "nt", "ti", "ix", "xx", "xx", "xe",
+ "cu", "ua", "ad", "dr", "ra", "ag", "gs", "si", "ix", "xx", "xx", "xs", "si", "io"});
+ }
private String getReuters20LinesFile() {
return getWorkDirResourcePath("reuters.first20.lines.txt");