You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by rm...@apache.org on 2010/01/29 05:07:48 UTC

svn commit: r904371 - in /lucene/java/trunk/contrib/benchmark: ./ conf/ scripts/ src/java/org/apache/lucene/benchmark/byTask/tasks/ src/test/org/apache/lucene/benchmark/byTask/

Author: rmuir
Date: Fri Jan 29 04:07:47 2010
New Revision: 904371

URL: http://svn.apache.org/viewvc?rev=904371&view=rev
Log:
LUCENE-2223: add a ShingleFilter benchmark

Added:
    lucene/java/trunk/contrib/benchmark/conf/shingle.alg
    lucene/java/trunk/contrib/benchmark/scripts/compare.shingle.benchmark.tables.pl   (with props)
    lucene/java/trunk/contrib/benchmark/scripts/shingle.bm2jira.pl   (with props)
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java   (with props)
Modified:
    lucene/java/trunk/contrib/benchmark/CHANGES.txt
    lucene/java/trunk/contrib/benchmark/build.xml
    lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java

Modified: lucene/java/trunk/contrib/benchmark/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/CHANGES.txt?rev=904371&r1=904370&r2=904371&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/benchmark/CHANGES.txt Fri Jan 29 04:07:47 2010
@@ -2,6 +2,11 @@
 
 The Benchmark contrib package contains code for benchmarking Lucene in a variety of ways.
 
+1/28/2010
+  LUCENE-2223: Add a benchmark for ShingleFilter. You can wrap any
+  analyzer with ShingleAnalyzerWrapper and specify shingle parameters
+  with the NewShingleAnalyzer task.  (Steven Rowe via Robert Muir)
+
 1/14/2010
   LUCENE-2210: TrecTopicsReader now properly reads descriptions and
   narratives from trec topics files.  (Robert Muir)

Modified: lucene/java/trunk/contrib/benchmark/build.xml
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/build.xml?rev=904371&r1=904370&r2=904371&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/build.xml (original)
+++ lucene/java/trunk/contrib/benchmark/build.xml Fri Jan 29 04:07:47 2010
@@ -131,6 +131,7 @@
         <pathelement path="${common.dir}/build/contrib/highlighter/classes/java"/>
         <pathelement path="${common.dir}/build/contrib/memory/classes/java"/>
         <pathelement path="${common.dir}/build/contrib/fast-vector-highlighter/classes/java"/>
+        <pathelement path="${common.dir}/build/contrib/analyzers/common/classes/java"/>
     	<fileset dir="lib">
     		<include name="**/*.jar"/>
     	</fileset>
@@ -192,6 +193,32 @@
 	    <echo>Benchmark output in JIRA table format is in file: ${collation.jira.output.file}</echo>
 	</target>
 	
+    <property name="shingle.alg.file" location="conf/shingle.alg"/>
+    <property name="shingle.output.file" 
+              value="${working.dir}/shingle.benchmark.output.txt"/>
+    <property name="shingle.jira.output.file" 
+              value="${working.dir}/shingle.bm2jira.output.txt"/>
+	
+    <path id="shingle.runtime.classpath">
+      <path refid="run.classpath"/>
+    </path>
+    <!-- Runs ${shingle.alg.file} in a forked JVM, capturing the output in ${shingle.output.file}, then converts it to a JIRA table with perl. -->
+    <target name="shingle" depends="compile,compile-analyzers-common,get-files">
+      <echo>Running contrib/benchmark with alg file: ${shingle.alg.file}</echo>
+      <java fork="true" classname="org.apache.lucene.benchmark.byTask.Benchmark" 
+            maxmemory="${task.mem}" output="${shingle.output.file}">
+        <classpath refid="shingle.runtime.classpath"/> <!-- the path declared for this target; it wraps run.classpath -->
+        <arg file="${shingle.alg.file}"/>
+      </java>
+      <echo>Benchmark output is in file: ${shingle.output.file}</echo>
+      <echo>Converting to JIRA table format...</echo>
+      <exec executable="perl" output="${shingle.jira.output.file}" failonerror="true">
+        <arg value="scripts/shingle.bm2jira.pl"/>
+        <arg value="${shingle.output.file}"/>
+      </exec>
+      <echo>Benchmark output in JIRA table format is in file: ${shingle.jira.output.file}</echo>
+    </target>
+
     <target name="compile-demo">
       <subant target="compile-demo">
          <fileset dir="${common.dir}" includes="build.xml"/>
@@ -207,6 +234,11 @@
          <fileset dir="${common.dir}/contrib/icu" includes="build.xml"/>
       </subant>
     </target>
+    <target name="compile-analyzers-common">
+      <subant target="compile"> <!-- delegates to the "compile" target of contrib/analyzers/common/build.xml -->
+        <fileset dir="${common.dir}/contrib/analyzers/common" includes="build.xml"/>
+      </subant>
+    </target>
     <target name="compile-memory">
       <subant target="compile">
          <fileset dir="${common.dir}/contrib/memory" includes="build.xml"/>

Added: lucene/java/trunk/contrib/benchmark/conf/shingle.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/shingle.alg?rev=904371&view=auto
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/shingle.alg (added)
+++ lucene/java/trunk/contrib/benchmark/conf/shingle.alg Fri Jan 29 04:07:47 2010
@@ -0,0 +1,48 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Runs ReadTokens over the Reuters docs with five analyzer setups (2- and 4-gram shingles, with and without unigrams, plus plain StandardAnalyzer), for 5 rounds.
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
+doc.tokenized=false
+doc.body.tokenized=true
+docs.dir=reuters-out
+log.step=1000
+
+{ "Rounds"
+
+    -NewShingleAnalyzer(maxShingleSize:2,outputUnigrams:true)
+    -ResetInputs
+    { "BigramsAndUnigrams" { ReadTokens > : 10000 }
+
+    -NewShingleAnalyzer(maxShingleSize:2,outputUnigrams:false)
+    -ResetInputs
+    { "BigramsOnly" { ReadTokens > : 10000 }
+
+    -NewShingleAnalyzer(maxShingleSize:4,outputUnigrams:true)
+    -ResetInputs
+    { "FourgramsAndUnigrams" { ReadTokens > : 10000 }
+
+    -NewShingleAnalyzer(maxShingleSize:4,outputUnigrams:false)
+    -ResetInputs
+    { "FourgramsOnly" { ReadTokens > : 10000 }
+
+    -NewAnalyzer(standard.StandardAnalyzer)
+    -ResetInputs
+    { "UnigramsOnly" { ReadTokens > : 10000 }
+
+    NewRound
+
+} : 5
+
+RepSumByNameRound

Added: lucene/java/trunk/contrib/benchmark/scripts/compare.shingle.benchmark.tables.pl
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/scripts/compare.shingle.benchmark.tables.pl?rev=904371&view=auto
==============================================================================
--- lucene/java/trunk/contrib/benchmark/scripts/compare.shingle.benchmark.tables.pl (added)
+++ lucene/java/trunk/contrib/benchmark/scripts/compare.shingle.benchmark.tables.pl Fri Jan 29 04:07:47 2010
@@ -0,0 +1,116 @@
+#!/usr/bin/perl
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ------------------------------------------
+# compare.shingle.benchmark.tables.pl
+#
+# Takes as cmdline parameters two JIRA-formatted benchmark results, as produced
+# by shingle.bm2jira.pl (located in the same directory as this script), and
+# outputs a third JIRA-formatted comparison table.
+#
+# The difference is calculated as a percentage:
+#
+#   100 * (unpatched-elapsed - patched-elapsed) / patched-elapsed
+#
+# where (un)patched-elapsed values have had the no-shingle-filter 
+# (StandardAnalyzer) elapsed time subtracted from them.
+#
+#
+# Example shingle.bm2jira.pl output:
+# ----------------------------------
+# JAVA:
+# java version "1.5.0_15"
+# Java(TM) 2 Runtime Environment, Standard Edition (build 1.5.0_15-b04)
+# Java HotSpot(TM) 64-Bit Server VM (build 1.5.0_15-b04, mixed mode)
+#
+# OS:
+# cygwin
+# WinVistaService Pack 2
+# Service Pack 26060022202561
+#
+# ||Max Shingle Size||Unigrams?||Elapsed||
+# |1 (Unigrams)|yes|2.19s|
+# |2|no|4.74s|
+# |2|yes|4.90s|
+# |4|no|5.82s|
+# |4|yes|5.97s|
+
+use strict;
+use warnings;
+
+my $usage = "Usage: $0 <unpatched-file> <patched-file>\n";
+
+die $usage unless ($#ARGV == 1 && -f $ARGV[0] && -f $ARGV[1]);
+
+# Pass 1, unpatched file: echo everything before the table header (the platform
+# info), then record the elapsed seconds of each table row.
+open(my $unpatched_fh, '<', $ARGV[0]) or die "ERROR opening '$ARGV[0]': $!";
+my $table_encountered = 0;
+my $standard_analyzer_elapsed = 0;
+my %unpatched_stats = ();
+my %patched_stats = ();
+while (<$unpatched_fh>) {
+  unless ($table_encountered) {
+    if (/\Q||Max Shingle Size||Unigrams?||Elapsed||\E/) {
+      $table_encountered = 1;
+    } else {
+      print;
+    }
+  } elsif (/\|([^|]+)\|([^|]+)\|([\d.]+)s\|/) {
+    my $max_shingle_size = $1;
+    my $output_unigrams = $2;
+    my $elapsed = $3;
+    if ($max_shingle_size =~ /Unigrams/) {
+      $standard_analyzer_elapsed = $elapsed;
+    } else {
+      $unpatched_stats{$max_shingle_size}{$output_unigrams} = $elapsed;
+    }
+  }
+}
+close $unpatched_fh;
+# Pass 2, patched file: same rows; the baseline becomes the faster of the two runs.
+open(my $patched_fh, '<', $ARGV[1]) or die "ERROR opening '$ARGV[1]': $!";
+while (<$patched_fh>) {
+  if (/\|([^|]+)\|([^|]+)\|([\d.]+)s\|/) {
+    my $max_shingle_size = $1;
+    my $output_unigrams = $2;
+    my $elapsed = $3;
+    if ($max_shingle_size =~ /Unigrams/) {
+      $standard_analyzer_elapsed = $elapsed if ($elapsed < $standard_analyzer_elapsed);
+    } else {
+      $patched_stats{$max_shingle_size}{$output_unigrams} = $elapsed;
+    }
+  }
+}
+close $patched_fh;
+
+print "||Max Shingle Size||Unigrams?||Unpatched||Patched||StandardAnalyzer||Improvement||\n";
+for my $max_shingle_size (sort { $a <=> $b } keys %unpatched_stats) {
+  for my $output_unigrams (sort keys %{$unpatched_stats{$max_shingle_size}}) {
+    my $unpatched = $unpatched_stats{$max_shingle_size}{$output_unigrams};
+    my $patched = $patched_stats{$max_shingle_size}{$output_unigrams};
+    unless (defined $patched) { # row is absent from the patched file
+      warn "No patched result for |$max_shingle_size|$output_unigrams|, skipping\n";
+      next;
+    }
+    my $shingle_elapsed = $patched - $standard_analyzer_elapsed; # 0 would abort the division
+    my $improvement = ($shingle_elapsed == 0 ? 'n/a'
+      : sprintf('%2.1f%%', 100 * ($unpatched - $patched) / $shingle_elapsed));
+    printf "|%s|%s|%ss|%ss|%ss|%s|\n", $max_shingle_size, $output_unigrams,
+           $unpatched, $patched, $standard_analyzer_elapsed, $improvement;
+  }
+}

Propchange: lucene/java/trunk/contrib/benchmark/scripts/compare.shingle.benchmark.tables.pl
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/benchmark/scripts/shingle.bm2jira.pl
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/scripts/shingle.bm2jira.pl?rev=904371&view=auto
==============================================================================
--- lucene/java/trunk/contrib/benchmark/scripts/shingle.bm2jira.pl (added)
+++ lucene/java/trunk/contrib/benchmark/scripts/shingle.bm2jira.pl Fri Jan 29 04:07:47 2010
@@ -0,0 +1,73 @@
+#!/usr/bin/perl
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ----------
+# shingle.bm2jira.pl
+#
+# Converts Lucene contrib-benchmark output produced using the 
+# conf/shingle.alg file into a JIRA-formatted table.
+#
+
+use strict;
+use warnings;
+
+my %min_elapsed = (); # {max shingle size}{'yes'|'no'} => smallest elapsedSec seen
+
+#Operation           round  runCnt  recsPerRun  rec/s      elapsedSec  avgUsedMem  avgTotalMem
+#BigramsAndUnigrams  0      1       255691      21,147.22  12.09       15,501,840  35,061,760
+#BigramsOnly   -  -  0 -  - 1 -  -  127383   -  16,871.92  7.55    -   31,725,312  41,746,432
+#FourgramsAndUnigrams
+#FourgramsOnly
+#UnigramsOnly
+
+while (<>) {
+  if (/^((?:Uni|Bi|Four)grams\S+)[-\s]*([^\s{].*)/) {
+    my $operation = $1;
+    my $stats = $2;
+    my $max_shingle_size = ($operation =~ /^Bigrams/ ? 2 : $operation =~ /^Unigrams/ ? 1 : 4);
+    my $output_unigrams = ($operation =~ /(?:AndUnigrams|UnigramsOnly)$/ ? 'yes' : 'no');
+    # Skip the four numeric columns before elapsedSec (round .. rec/s), then capture it.
+    my ($elapsed) = $stats =~ /(?:[\d,.]+[-\s]*){4}([.\d]+)/;
+    next unless (defined $elapsed); # row without an elapsedSec column: ignore it
+    $min_elapsed{$max_shingle_size}{$output_unigrams} = $elapsed
+      unless (defined($min_elapsed{$max_shingle_size}{$output_unigrams})
+              && $elapsed >= $min_elapsed{$max_shingle_size}{$output_unigrams});
+  }
+}
+
+# Print out platform info
+print "JAVA:\n", `java -version 2>&1`, "\nOS:\n";
+if ($^O =~ /^(?:MSWin32|cygwin)$/i) { # anchored: a bare /win/i also matches "darwin"
+  print "$^O\n";
+  eval {
+    require Win32;
+    print Win32::GetOSName(), "\n", Win32::GetOSVersion(), "\n";
+  };
+  die "Error loading Win32: $@" if ($@);
+} else {
+  print `uname -a 2>&1`;
+}
+
+print "\n||Max Shingle Size||Unigrams?||Elapsed||\n";
+
+for my $max_shingle_size (sort { $a <=> $b } keys %min_elapsed) {
+  for my $output_unigrams (sort keys %{$min_elapsed{$max_shingle_size}}) {
+    my $size = (1 == $max_shingle_size ? '1 (Unigrams)' : $max_shingle_size);
+    printf "|%s|%s|%2.2fs|\n", $size, $output_unigrams,
+           $min_elapsed{$max_shingle_size}{$output_unigrams};
+  }
+}

Propchange: lucene/java/trunk/contrib/benchmark/scripts/shingle.bm2jira.pl
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java?rev=904371&view=auto
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java (added)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java Fri Jan 29 04:07:47 2010
@@ -0,0 +1,117 @@
+package org.apache.lucene.benchmark.byTask.tasks;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
+import java.util.StringTokenizer;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.util.Version;
+
+/**
+ * Task to support benchmarking ShingleFilter / ShingleAnalyzerWrapper
+ * <p>
+ * <ul>
+ *  <li> <code>NewShingleAnalyzer</code> (constructs with all defaults)
+ *  <li> <code>NewShingleAnalyzer(analyzer:standard.StandardAnalyzer,maxShingleSize:2,outputUnigrams:true)</code>
+ * </ul>
+ * </p>
+ */
+public class NewShingleAnalyzerTask extends PerfTask {
+
+  /** Analyzer to wrap; a name without a package, or starting with "standard.", is looked up under org.apache.lucene.analysis. */
+  private String analyzerClassName = "standard.StandardAnalyzer";
+  private int maxShingleSize = 2;
+  private boolean outputUnigrams = true;
+  
+  public NewShingleAnalyzerTask(PerfRunData runData) {
+    super(runData);
+  }
+
+  /** Instantiates the configured analyzer, wraps it in a ShingleAnalyzerWrapper and installs the wrapper in the run data. */
+  private void setAnalyzer() throws Exception {
+    Class<? extends Analyzer> clazz = null;
+    Analyzer wrappedAnalyzer;
+    try {
+      if (analyzerClassName == null || analyzerClassName.equals("")) {
+        analyzerClassName = "org.apache.lucene.analysis.standard.StandardAnalyzer";
+      }
+      if (analyzerClassName.indexOf(".") == -1 || analyzerClassName.startsWith("standard.")) {
+        //there is no package name, assume o.a.l.analysis
+        analyzerClassName = "org.apache.lucene.analysis." + analyzerClassName;
+      }
+      clazz = Class.forName(analyzerClassName).asSubclass(Analyzer.class);
+      // first try a ctor with a Version parameter (many new Analyzers have no default one anymore)
+      Constructor<? extends Analyzer> ctor = clazz.getConstructor(Version.class);
+      wrappedAnalyzer = ctor.newInstance(Version.LUCENE_CURRENT);
+    } catch (NoSuchMethodException e) {
+      // otherwise use default ctor
+      wrappedAnalyzer = clazz.newInstance();
+    }
+    ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(wrappedAnalyzer, maxShingleSize);
+    analyzer.setOutputUnigrams(outputUnigrams);
+    getRunData().setAnalyzer(analyzer);
+  }
+  
+  /** Switches the run to the shingle analyzer and reports the change on stdout; returns 1. */
+  @Override
+  public int doLogic() throws Exception {
+    try {
+      setAnalyzer();
+      System.out.println("Changed Analyzer to: ShingleAnalyzerWrapper, wrapping ShingleFilter over "
+                         + analyzerClassName);
+    } catch (Exception e) {
+      throw new RuntimeException("Error creating Analyzer", e);
+    }
+    return 1;
+  }
+  
+  /** Reads comma-separated <code>key:value</code> pairs; keys are analyzer, maxShingleSize and outputUnigrams. */
+  @Override
+  public void setParams(String params) {
+    super.setParams(params);
+    StringTokenizer st = new StringTokenizer(params, ",");
+    while (st.hasMoreTokens()) {
+      String param = st.nextToken();
+      StringTokenizer expr = new StringTokenizer(param, ":");
+      if (expr.countTokens() < 2) { // otherwise nextToken() fails with a bare NoSuchElementException
+        throw new RuntimeException("Expected key:value but got " + param);
+      }
+      String key = expr.nextToken().trim();
+      String value = expr.nextToken().trim();
+      if (key.equalsIgnoreCase("analyzer")) {
+        analyzerClassName = value;
+      } else if (key.equalsIgnoreCase("outputUnigrams")) {
+        outputUnigrams = Boolean.parseBoolean(value);
+      } else if (key.equalsIgnoreCase("maxShingleSize")) {
+        maxShingleSize = (int)Double.parseDouble(value); // NOTE(review): presumably tolerates "2.0" from the alg parser; confirm
+      } else {
+        throw new RuntimeException("Unknown parameter " + param);
+      }
+    }
+  }
+
+  @Override
+  public boolean supportsParams() {
+    return true;
+  }
+}

Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewShingleAnalyzerTask.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java?rev=904371&r1=904370&r2=904371&view=diff
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java Fri Jan 29 04:07:47 2010
@@ -975,6 +975,79 @@
     return algLines;
   }
   
+  /**
+   * Test that we can create ShingleAnalyzerWrappers: runs the NewShingleAnalyzer
+   * task with several parameter sets and checks the terms each analyzer emits.
+   */
+  public void testShingleAnalyzer() throws Exception {
+    String text = "one,two,three, four five six";
+    
+    // Default analyzer, maxShingleSize, and outputUnigrams
+    Benchmark benchmark = execBenchmark(getShingleConfig(""));
+    assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
+                       new String[] {"one", "one two", "two", "two three",
+                                     "three", "three four", "four", "four five",
+                                     "five", "five six", "six"});
+
+    // Default analyzer, maxShingleSize = 3, and outputUnigrams = false
+    benchmark = execBenchmark
+      (getShingleConfig("maxShingleSize:3,outputUnigrams:false"));
+    assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
+                       new String[] { "one two", "one two three", "two three",
+                                      "two three four", "three four", 
+                                      "three four five", "four five",
+                                      "four five six", "five six" });
+    // WhitespaceAnalyzer, default maxShingleSize and outputUnigrams
+    benchmark = execBenchmark
+      (getShingleConfig("analyzer:WhitespaceAnalyzer"));
+    assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
+                       new String[] { "one,two,three,", "one,two,three, four",
+                                      "four", "four five", "five", "five six", 
+                                      "six" });
+    
+    // WhitespaceAnalyzer, maxShingleSize=3 and outputUnigrams=false
+    benchmark = execBenchmark
+      (getShingleConfig
+        ("outputUnigrams:false,maxShingleSize:3,analyzer:WhitespaceAnalyzer"));
+    assertEqualShingle(benchmark.getRunData().getAnalyzer(), text,
+                       new String[] { "one,two,three, four", 
+                                      "one,two,three, four five",
+                                      "four five", "four five six",
+                                      "five six" });
+  }
+  
+  /** Asserts that analyzing <code>text</code> yields exactly the terms in <code>expected</code>, in order. */
+  private void assertEqualShingle(Analyzer analyzer, String text, String[] expected) throws Exception {
+    TokenStream stream = analyzer.tokenStream("bogus", new StringReader(text));
+    try {
+      stream.reset();
+      TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
+      int termNum = 0;
+      while (stream.incrementToken()) {
+        String term = new String(termAtt.termBuffer(), 0, termAtt.termLength());
+        assertTrue("Extra output term(s), starting with '" + term + "'", termNum < expected.length);
+        assertEquals("Mismatch in output term # " + termNum + " - ", expected[termNum], term);
+        ++termNum;
+      }
+      assertEquals("Too few output terms", expected.length, termNum);
+    } finally {
+      stream.close(); // release the stream even when an assertion above fails
+    }
+  }
+  
+  /** Builds the alg lines for a run that applies NewShingleAnalyzer with <code>params</code> before indexing. */
+  private static String[] getShingleConfig(String params) {
+    return new String[] {
+      "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
+      "docs.file=" + getReuters20LinesFile(),
+      "content.source.forever=false",
+      "directory=RAMDirectory",
+      "NewShingleAnalyzer(" + params + ")",
+      "CreateIndex",
+      "{ \"AddDocs\"  AddDoc > : * "
+    };
+  }
+  
   private static String getReuters20LinesFile() {
     return System.getProperty("lucene.common.dir").replace('\\','/') +
       "/contrib/benchmark/src/test/org/apache/lucene/benchmark/reuters.first20.lines.txt";