You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by rm...@apache.org on 2010/01/29 05:07:48 UTC
svn commit: r904371 - in /lucene/java/trunk/contrib/benchmark: ./ conf/
scripts/ src/java/org/apache/lucene/benchmark/byTask/tasks/
src/test/org/apache/lucene/benchmark/byTask/
Author: rmuir
Date: Fri Jan 29 04:07:47 2010
New Revision: 904371
URL: http://svn.apache.org/viewvc?rev=904371&view=rev
Log:
LUCENE-2223: add a ShingleFilter benchmark
Added:
lucene/java/trunk/contrib/benchmark/conf/shingle.alg
lucene/java/trunk/contrib/benchmark/scripts/compare.shingle.benchmark.tables.pl (with props)
lucene/java/trunk/contrib/benchmark/scripts/shingle.bm2jira.pl (with props)
lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java (with props)
Modified:
lucene/java/trunk/contrib/benchmark/CHANGES.txt
lucene/java/trunk/contrib/benchmark/build.xml
lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
Modified: lucene/java/trunk/contrib/benchmark/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/CHANGES.txt?rev=904371&r1=904370&r2=904371&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/benchmark/CHANGES.txt Fri Jan 29 04:07:47 2010
@@ -2,6 +2,11 @@
The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.
+1/28/2010
+ LUCENE-2223: Add a benchmark for ShingleFilter. You can wrap any
+ analyzer with ShingleAnalyzerWrapper and specify shingle parameters
+ with the NewShingleAnalyzer task. (Steven Rowe via Robert Muir)
+
1/14/2010
LUCENE-2210: TrecTopicsReader now properly reads descriptions and
narratives from trec topics files. (Robert Muir)
Modified: lucene/java/trunk/contrib/benchmark/build.xml
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/build.xml?rev=904371&r1=904370&r2=904371&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/build.xml (original)
+++ lucene/java/trunk/contrib/benchmark/build.xml Fri Jan 29 04:07:47 2010
@@ -131,6 +131,7 @@
<pathelement path="${common.dir}/build/contrib/highlighter/classes/java"/>
<pathelement path="${common.dir}/build/contrib/memory/classes/java"/>
<pathelement path="${common.dir}/build/contrib/fast-vector-highlighter/classes/java"/>
+ <pathelement path="${common.dir}/build/contrib/analyzers/common/classes/java"/>
<fileset dir="lib">
<include name="**/*.jar"/>
</fileset>
@@ -192,6 +193,32 @@
<echo>Benchmark output in JIRA table format is in file: ${collation.jira.output.file}</echo>
</target>
+ <!-- Inputs and outputs of the ShingleFilter benchmark (see the "shingle" target). -->
+ <property name="shingle.alg.file" location="conf/shingle.alg"/>
+ <property name="shingle.output.file"
+           value="${working.dir}/shingle.benchmark.output.txt"/>
+ <property name="shingle.jira.output.file"
+           value="${working.dir}/shingle.bm2jira.output.txt"/>
+
+ <!-- Classpath used to run the ShingleFilter benchmark. -->
+ <path id="shingle.runtime.classpath">
+   <path refid="run.classpath"/>
+ </path>
+
+ <!-- Runs the benchmark described by ${shingle.alg.file}, writing the raw
+      report to ${shingle.output.file}, then converts that report into a
+      JIRA-formatted table with scripts/shingle.bm2jira.pl (requires perl
+      on the PATH). -->
+ <target name="shingle" depends="compile,compile-analyzers-common,get-files">
+   <echo>Running contrib/benchmark with alg file: ${shingle.alg.file}</echo>
+   <java fork="true" classname="org.apache.lucene.benchmark.byTask.Benchmark"
+         maxmemory="${task.mem}" output="${shingle.output.file}">
+     <!-- Use the path declared above rather than run.classpath directly, so
+          benchmark-specific entries only need to be added in one place. -->
+     <classpath refid="shingle.runtime.classpath"/>
+     <arg file="${shingle.alg.file}"/>
+   </java>
+   <echo>Benchmark output is in file: ${shingle.output.file}</echo>
+   <echo>Converting to JIRA table format...</echo>
+   <exec executable="perl" output="${shingle.jira.output.file}" failonerror="true">
+     <arg value="scripts/shingle.bm2jira.pl"/>
+     <arg value="${shingle.output.file}"/>
+   </exec>
+   <echo>Benchmark output in JIRA table format is in file: ${shingle.jira.output.file}</echo>
+ </target>
+
<target name="compile-demo">
<subant target="compile-demo">
<fileset dir="${common.dir}" includes="build.xml"/>
@@ -207,6 +234,11 @@
<fileset dir="${common.dir}/contrib/icu" includes="build.xml"/>
</subant>
</target>
+ <!-- Compiles contrib/analyzers/common by invoking the "compile" target of
+      its build.xml; the "shingle" target depends on this. -->
+ <target name="compile-analyzers-common">
+ <subant target="compile">
+ <fileset dir="${common.dir}/contrib/analyzers/common" includes="build.xml"/>
+ </subant>
+ </target>
<target name="compile-memory">
<subant target="compile">
<fileset dir="${common.dir}/contrib/memory" includes="build.xml"/>
Added: lucene/java/trunk/contrib/benchmark/conf/shingle.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/shingle.alg?rev=904371&view=auto
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/shingle.alg (added)
+++ lucene/java/trunk/contrib/benchmark/conf/shingle.alg Fri Jan 29 04:07:47 2010
@@ -0,0 +1,48 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ShingleFilter benchmark (LUCENE-2223).
+#
+# The "Rounds" block below switches between five analyzer configurations:
+# ShingleAnalyzerWrapper with maxShingleSize 2 and 4, each with and without
+# unigram output, plus plain StandardAnalyzer as the no-shingle baseline.
+# Every configuration runs a named ReadTokens sequence suffixed ": 10000",
+# and the whole block is suffixed ": 5".
+#
+# The sequence names ("BigramsAndUnigrams", "UnigramsOnly", ...) are what
+# scripts/shingle.bm2jira.pl matches when it parses the report, so keep the
+# two in sync when renaming.
+
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
+doc.tokenized=false
+doc.body.tokenized=true
+docs.dir=reuters-out
+log.step=1000
+
+{ "Rounds"
+
+ -NewShingleAnalyzer(maxShingleSize:2,outputUnigrams:true)
+ -ResetInputs
+ { "BigramsAndUnigrams" { ReadTokens > : 10000 }
+
+ -NewShingleAnalyzer(maxShingleSize:2,outputUnigrams:false)
+ -ResetInputs
+ { "BigramsOnly" { ReadTokens > : 10000 }
+
+ -NewShingleAnalyzer(maxShingleSize:4,outputUnigrams:true)
+ -ResetInputs
+ { "FourgramsAndUnigrams" { ReadTokens > : 10000 }
+
+ -NewShingleAnalyzer(maxShingleSize:4,outputUnigrams:false)
+ -ResetInputs
+ { "FourgramsOnly" { ReadTokens > : 10000 }
+
+ -NewAnalyzer(standard.StandardAnalyzer)
+ -ResetInputs
+ { "UnigramsOnly" { ReadTokens > : 10000 }
+
+ NewRound
+
+} : 5
+
+RepSumByNameRound
Added: lucene/java/trunk/contrib/benchmark/scripts/compare.shingle.benchmark.tables.pl
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/scripts/compare.shingle.benchmark.tables.pl?rev=904371&view=auto
==============================================================================
--- lucene/java/trunk/contrib/benchmark/scripts/compare.shingle.benchmark.tables.pl (added)
+++ lucene/java/trunk/contrib/benchmark/scripts/compare.shingle.benchmark.tables.pl Fri Jan 29 04:07:47 2010
@@ -0,0 +1,116 @@
+#!/usr/bin/perl
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ------------------------------------------
+# compare.shingle.benchmark.tables.pl
+#
+# Takes as cmdline parameters two JIRA-formatted benchmark results, as produced
+# by shingle.bm2jira.pl (located in the same directory as this script), and
+# outputs a third JIRA-formatted comparison table.
+#
+# The difference is calculated as a percentage:
+#
+# 100 * ((unpatched-elapsed - patched-elapsed) / patched-elapsed)
+#
+# where (un)patched-elapsed values have had the no-shingle-filter
+# (StandardAnalyzer) elapsed time subtracted from them.
+#
+#
+# Example shingle.bm2jira.pl output:
+# ----------------------------------
+# JAVA:
+# java version "1.5.0_15"
+# Java(TM) 2 Runtime Environment, Standard Edition (build 1.5.0_15-b04)
+# Java HotSpot(TM) 64-Bit Server VM (build 1.5.0_15-b04, mixed mode)
+#
+# OS:
+# cygwin
+# WinVistaService Pack 2
+# Service Pack 26060022202561
+#
+# ||Max Shingle Size||Unigrams?||Elapsed||
+# |1 (Unigrams)|yes|2.19s|
+# |2|no|4.74s|
+# |2|yes|4.90s|
+# |4|no|5.82s|
+# |4|yes|5.97s|
+
+use strict;
+use warnings;
+
+my $usage = "Usage: $0 <unpatched-file> <patched-file>\n";
+
+die $usage unless ($#ARGV == 1 && -f $ARGV[0] && -f $ARGV[1]);
+
+# One data row of a shingle.bm2jira.pl table:
+#   |<max shingle size>|<unigrams? yes/no>|<elapsed seconds>s|
+my $row_regex = qr/\|([^|]+)\|([^|]+)\|([\d.]+)s\|/;
+
+# Smallest StandardAnalyzer (no shingle filter) elapsed time seen in either
+# input file; stays undefined until a "Unigrams" row has been read.
+my $standard_analyzer_elapsed;
+my %unpatched_stats = ();
+my %patched_stats = ();
+
+# Use the low-precedence "or" here: with "||" the die() binds to the (always
+# true) filename string, so a failed open would go unreported.
+open(my $unpatched_fh, '<', $ARGV[0]) or die "ERROR opening '$ARGV[0]': $!";
+my $table_encountered = 0;
+while (<$unpatched_fh>) {
+  if (!$table_encountered) {
+    if (/\Q||Max Shingle Size||Unigrams?||Elapsed||\E/) {
+      $table_encountered = 1;
+    } else {
+      print; # Pass the platform info that precedes the table through unchanged
+    }
+  } elsif (/$row_regex/) {
+    my ($max_shingle_size, $output_unigrams, $elapsed) = ($1, $2, $3);
+    if ($max_shingle_size =~ /Unigrams/) {
+      $standard_analyzer_elapsed = $elapsed;
+    } else {
+      $unpatched_stats{$max_shingle_size}{$output_unigrams} = $elapsed;
+    }
+  }
+}
+close $unpatched_fh;
+
+open(my $patched_fh, '<', $ARGV[1]) or die "ERROR opening '$ARGV[1]': $!";
+while (<$patched_fh>) {
+  if (/$row_regex/) {
+    my ($max_shingle_size, $output_unigrams, $elapsed) = ($1, $2, $3);
+    if ($max_shingle_size =~ /Unigrams/) {
+      # Keep the faster of the two StandardAnalyzer measurements as baseline
+      $standard_analyzer_elapsed = $elapsed
+        if (!defined($standard_analyzer_elapsed)
+            || $elapsed < $standard_analyzer_elapsed);
+    } else {
+      $patched_stats{$max_shingle_size}{$output_unigrams} = $elapsed;
+    }
+  }
+}
+close $patched_fh;
+
+# With no StandardAnalyzer row in either file, nothing is subtracted.
+$standard_analyzer_elapsed = 0 unless defined($standard_analyzer_elapsed);
+
+print "||Max Shingle Size||Unigrams?||Unpatched||Patched||StandardAnalyzer||Improvement||\n";
+for my $max_shingle_size (sort { $a <=> $b } keys %unpatched_stats) {
+  for my $output_unigrams (sort keys %{$unpatched_stats{$max_shingle_size}}) {
+    my $unpatched_elapsed = $unpatched_stats{$max_shingle_size}{$output_unigrams};
+    my $patched_elapsed = $patched_stats{$max_shingle_size}{$output_unigrams};
+    unless (defined($patched_elapsed)) {
+      warn "No patched result for max shingle size '$max_shingle_size',"
+          ." unigrams '$output_unigrams' - skipping\n";
+      next;
+    }
+    # Shingle-only cost of the patched run: total minus the analyzer baseline
+    my $patched_shingle_elapsed = $patched_elapsed - $standard_analyzer_elapsed;
+    if ($patched_shingle_elapsed == 0) {
+      warn "Patched elapsed time equals the StandardAnalyzer baseline for max"
+          ." shingle size '$max_shingle_size', unigrams '$output_unigrams'"
+          ." - skipping\n";
+      next;
+    }
+    my $improvement
+      = ($unpatched_elapsed - $patched_elapsed) / $patched_shingle_elapsed;
+    $improvement = int($improvement * 1000 + .5) / 10; # Round and truncate
+    # Data goes in as %s arguments so a stray '%' cannot corrupt the format
+    printf "|%s|%s|%ss|%ss|%ss|%2.1f%%|\n",
+      $max_shingle_size, $output_unigrams, $unpatched_elapsed,
+      $patched_elapsed, $standard_analyzer_elapsed, $improvement;
+  }
+}
Propchange: lucene/java/trunk/contrib/benchmark/scripts/compare.shingle.benchmark.tables.pl
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/benchmark/scripts/shingle.bm2jira.pl
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/scripts/shingle.bm2jira.pl?rev=904371&view=auto
==============================================================================
--- lucene/java/trunk/contrib/benchmark/scripts/shingle.bm2jira.pl (added)
+++ lucene/java/trunk/contrib/benchmark/scripts/shingle.bm2jira.pl Fri Jan 29 04:07:47 2010
@@ -0,0 +1,73 @@
+#!/usr/bin/perl
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ----------
+# shingle.bm2jira.pl
+#
+# Converts Lucene contrib-benchmark output produced using the
+# conf/shingle.alg file into a JIRA-formatted table.
+#
+
+use strict;
+use warnings;
+
+# Fastest elapsed time seen so far, keyed by max shingle size (1 stands for
+# the plain-analyzer "UnigramsOnly" run) and then by 'yes'/'no' for whether
+# unigrams were output.
+my %min_elapsed = ();
+
+#Operation round runCnt recsPerRun rec/s elapsedSec avgUsedMem avgTotalMem
+#BigramsAndUnigrams 0 1 255691 21,147.22 12.09 15,501,840 35,061,760
+#BigramsOnly - - 0 - - 1 - - 127383 - 16,871.92 7.55 - 31,725,312 41,746,432
+#FourgramsAndUnigrams
+#FourgramsOnly
+#UnigramsOnly
+
+while (<>) {
+  if (/^((?:Uni|Bi|Four)grams\S+)[-\s]*([^\s{].*)/) {
+    my $operation = $1;
+    my $stats = $2;
+    my $max_shingle_size
+      = ($operation =~ /^Bigrams/ ? 2 : $operation =~ /^Unigrams/ ? 1 : 4);
+    my $output_unigrams
+      = ($operation =~ /(?:AndUnigrams|UnigramsOnly)$/ ? 'yes' : 'no');
+    # elapsedSec is the fifth numeric column after the operation name
+    my ($elapsed) = $stats =~ /(?:[\d,.]+[-\s]*){4}([.\d]+)/;
+    # Ignore lines that start like a result row but lack the stats columns
+    next unless defined($elapsed);
+    $min_elapsed{$max_shingle_size}{$output_unigrams} = $elapsed
+      unless (defined($min_elapsed{$max_shingle_size}{$output_unigrams})
+              && $elapsed >= $min_elapsed{$max_shingle_size}{$output_unigrams});
+  }
+}
+
+# Print out platform info
+print "JAVA:\n", `java -version 2>&1`, "\nOS:\n";
+# Match the Windows platforms by exact name: a bare /win/i test also matches
+# "darwin" and would try to load Win32 on Mac OS X.
+if ($^O =~ /^(?:MSWin32|cygwin)$/i) {
+  print "$^O\n";
+  eval {
+    require Win32;
+    print Win32::GetOSName(), "\n", Win32::GetOSVersion(), "\n";
+  };
+  die "Error loading Win32: $@" if ($@);
+} else {
+  print `uname -a 2>&1`;
+}
+
+print "\n||Max Shingle Size||Unigrams?||Elapsed||\n";
+
+for my $max_shingle_size (sort { $a <=> $b } keys %min_elapsed) {
+  for my $output_unigrams (sort keys %{$min_elapsed{$max_shingle_size}}) {
+    my $size = (1 == $max_shingle_size ? '1 (Unigrams)' : $max_shingle_size);
+    printf "|%s|%s|%2.2fs|\n", $size, $output_unigrams,
+      $min_elapsed{$max_shingle_size}{$output_unigrams};
+  }
+}
Propchange: lucene/java/trunk/contrib/benchmark/scripts/shingle.bm2jira.pl
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java?rev=904371&view=auto
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java (added)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java Fri Jan 29 04:07:47 2010
@@ -0,0 +1,117 @@
+package org.apache.lucene.benchmark.byTask.tasks;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
+import java.util.StringTokenizer;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.util.Version;
+
+/**
+ * Task to support benchmarking ShingleFilter / ShingleAnalyzerWrapper
+ * <p>
+ * <ul>
+ * <li> <code>NewShingleAnalyzer</code> (constructs with all defaults)
+ * <li> <code>NewShingleAnalyzer(analyzer:o.a.l.analysis.StandardAnalyzer,maxShingleSize:2,outputUnigrams:true)</code>
+ * </ul>
+ * </p>
+ */
+public class NewShingleAnalyzerTask extends PerfTask {
+
+ private String analyzerClassName = "standard.StandardAnalyzer";
+ private static final String shingleAnalyzerClassName
+ = "org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper";
+ private int maxShingleSize = 2;
+ private boolean outputUnigrams = true;
+
+ public NewShingleAnalyzerTask(PerfRunData runData) {
+ super(runData);
+ }
+
+ private void setAnalyzer() throws Exception {
+ Class<? extends Analyzer> clazz = null;
+ Analyzer wrappedAnalyzer;
+ try {
+ if (analyzerClassName == null || analyzerClassName.equals("")) {
+ analyzerClassName
+ = "org.apache.lucene.analysis.standard.StandardAnalyzer";
+ }
+ if (analyzerClassName.indexOf(".") == -1
+ || analyzerClassName.startsWith("standard.")) {
+ //there is no package name, assume o.a.l.analysis
+ analyzerClassName = "org.apache.lucene.analysis." + analyzerClassName;
+ }
+ clazz = Class.forName(analyzerClassName).asSubclass(Analyzer.class);
+ // first try to use a ctor with version parameter (needed for many new
+ // Analyzers that have no default one anymore)
+ Constructor<? extends Analyzer> ctor = clazz.getConstructor(Version.class);
+ wrappedAnalyzer = ctor.newInstance(Version.LUCENE_CURRENT);
+ } catch (NoSuchMethodException e) {
+ // otherwise use default ctor
+ wrappedAnalyzer = clazz.newInstance();
+ }
+ ShingleAnalyzerWrapper analyzer
+ = new ShingleAnalyzerWrapper(wrappedAnalyzer, maxShingleSize);
+ analyzer.setOutputUnigrams(outputUnigrams);
+ getRunData().setAnalyzer(analyzer);
+ }
+
+ @Override
+ public int doLogic() throws Exception {
+ try {
+ setAnalyzer();
+ System.out.println
+ ("Changed Analyzer to: ShingleAnalyzerWrapper, wrapping ShingleFilter over"
+ + analyzerClassName);
+ } catch (Exception e) {
+ throw new RuntimeException("Error creating Analyzer", e);
+ }
+ return 1;
+ }
+
+ @Override
+ public void setParams(String params) {
+ super.setParams(params);
+ StringTokenizer st = new StringTokenizer(params, ",");
+ while (st.hasMoreTokens()) {
+ String param = st.nextToken();
+ StringTokenizer expr = new StringTokenizer(param, ":");
+ String key = expr.nextToken();
+ String value = expr.nextToken();
+ if (key.equalsIgnoreCase("analyzer")) {
+ analyzerClassName = value;
+ } else if (key.equalsIgnoreCase("outputUnigrams")) {
+ outputUnigrams = Boolean.parseBoolean(value);
+ } else if (key.equalsIgnoreCase("maxShingleSize")) {
+ maxShingleSize = (int)Double.parseDouble(value);
+ } else {
+ throw new RuntimeException("Unknown parameter " + param);
+ }
+ }
+ }
+
+ @Override
+ public boolean supportsParams() {
+ return true;
+ }
+}
Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java?rev=904371&r1=904370&r2=904371&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java Fri Jan 29 04:07:47 2010
@@ -975,6 +975,79 @@
return algLines;
}
+ /**
+  * Test that we can create ShingleAnalyzerWrappers: runs a small benchmark
+  * for several NewShingleAnalyzer parameter combinations and checks the
+  * shingles the resulting analyzer produces for a fixed input text.
+  */
+ public void testShingleAnalyzer() throws Exception {
+   String text = "one,two,three, four five six";
+
+   // Default analyzer, maxShingleSize, and outputUnigrams
+   // (no token stream is opened here; assertEqualShingle creates its own)
+   Benchmark benchmark = execBenchmark(getShingleConfig(""));
+   assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
+                      new String[] {"one", "one two", "two", "two three",
+                                    "three", "three four", "four", "four five",
+                                    "five", "five six", "six"});
+   // Default analyzer, maxShingleSize = 3, and outputUnigrams = false
+   benchmark = execBenchmark
+     (getShingleConfig("maxShingleSize:3,outputUnigrams:false"));
+   assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
+                      new String[] { "one two", "one two three", "two three",
+                                     "two three four", "three four",
+                                     "three four five", "four five",
+                                     "four five six", "five six" });
+   // WhitespaceAnalyzer, default maxShingleSize and outputUnigrams
+   benchmark = execBenchmark
+     (getShingleConfig("analyzer:WhitespaceAnalyzer"));
+   assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
+                      new String[] { "one,two,three,", "one,two,three, four",
+                                     "four", "four five", "five", "five six",
+                                     "six" });
+
+   // WhitespaceAnalyzer, maxShingleSize=3 and outputUnigrams=false
+   benchmark = execBenchmark
+     (getShingleConfig
+      ("outputUnigrams:false,maxShingleSize:3,analyzer:WhitespaceAnalyzer"));
+   assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
+                      new String[] { "one,two,three, four",
+                                     "one,two,three, four five",
+                                     "four five", "four five six",
+                                     "five six" });
+ }
+
+ /**
+  * Analyzes <code>text</code> with <code>analyzer</code> and asserts that
+  * the emitted terms are exactly <code>expected</code>, in order.
+  */
+ private void assertEqualShingle
+   (Analyzer analyzer, String text, String[] expected) throws Exception {
+   TokenStream tokens = analyzer.tokenStream("bogus", new StringReader(text));
+   tokens.reset();
+   TermAttribute term = tokens.addAttribute(TermAttribute.class);
+   int seen = 0;
+   while (tokens.incrementToken()) {
+     String actual = new String(term.termBuffer(), 0, term.termLength());
+     // Guard the index before it is used to look up the expected term
+     assertTrue("Extra output term(s), starting with '" + actual + "'",
+                seen < expected.length);
+     assertEquals("Mismatch in output term # " + seen + " - ",
+                  expected[seen], actual);
+     seen++;
+   }
+   assertEquals("Too few output terms", expected.length, seen);
+   tokens.close();
+ }
+
+ /**
+  * Builds the algorithm lines for a benchmark that installs a shingle
+  * analyzer configured with <code>params</code>, creates an index in a
+  * RAMDirectory and adds every document from the 20-line Reuters file.
+  */
+ private static String[] getShingleConfig(String params) {
+   String newAnalyzerLine = "NewShingleAnalyzer(" + params + ")";
+   String docsFileLine = "docs.file=" + getReuters20LinesFile();
+   return new String[] {
+     "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
+     docsFileLine,
+     "content.source.forever=false",
+     "directory=RAMDirectory",
+     newAnalyzerLine,
+     "CreateIndex",
+     "{ \"AddDocs\" AddDoc > : * "
+   };
+ }
+
private static String getReuters20LinesFile() {
return System.getProperty("lucene.common.dir").replace('\\','/') +
"/contrib/benchmark/src/test/org/apache/lucene/benchmark/reuters.first20.lines.txt";