You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2010/10/09 19:16:40 UTC

svn commit: r1006195 - in /lucene/dev/branches/branch_3x/lucene/contrib: ./ analyzers/common/src/java/org/apache/lucene/analysis/shingle/ analyzers/common/src/test/org/apache/lucene/analysis/shingle/

Author: sarowe
Date: Sat Oct  9 17:16:40 2010
New Revision: 1006195

URL: http://svn.apache.org/viewvc?rev=1006195&view=rev
Log:
LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles can be generated.

Modified:
    lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/   (props changed)
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/   (props changed)
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java

Modified: lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt?rev=1006195&r1=1006194&r2=1006195&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt Sat Oct  9 17:16:40 2010
@@ -133,6 +133,9 @@ API Changes
  * LUCENE-2626: FastVectorHighlighter: enable FragListBuilder and FragmentsBuilder
    to be set per-field override. (Koji Sekiguchi)
    
+ * LUCENE-1370: Added ShingleFilter option to output unigrams if no shingles
+   can be generated. (Chris Harris via Steven Rowe)
+   
 New features
 
  * LUCENE-2306: Add NumericRangeFilter and NumericRangeQuery support to XMLQueryParser.

Propchange: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/
------------------------------------------------------------------------------
--- svn:mergeinfo (added)
+++ svn:mergeinfo Sat Oct  9 17:16:40 2010
@@ -0,0 +1,8 @@
+/lucene/dev/trunk/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle:931298,931337,931502,932129-932131,932163,932304,932369,932374,932398,932417,932541,932576,932587,932698,932731-932749,932752,932773,932795,932828,932856-932857,932862,932864,932878,932963,932998-932999,933541-933575,933598,933613,933679,933879,934339,934954,935014-935048,935065,935186-935513,935521-935522,935553-935962,936522,936544,936605,936657-936726,937039,937360,938582-938646,938989,939111,939611,939649,940433,940447,940451-940452,940666,940699,940730,940878-940892,940994,941270,941363,942166,942235,942288,942676,942719,943142,943493,943931,945057,945090,945130,945245,945343,945420,946139,946330,946338,946599,948011,948082,948429,949288,949311,949445,949976,949997,950008,950042,950458,950613,951126,951355,951397,951521,953628,955547,955613,955615,955796-955797,955809-955996,956097,956125,956173,956316,956715,957465,957520,957634,957707,960367,960371,960374,960719,962555,963372
 ,963654,963720,963781,963873,963906,963909,963920,964019,964054,964430,964459,964720,964753,964832,964856,965103,965110,965222,965230,965299,965327,965330,965585,966354,966878,967080,979453,979809,980369,980428,980436,980501,980909,980911,980917,981265,981550,981598,981650,981661,981857,981936,982073,982084,982201,982725,982824,983100,983212,983216,983313,983495,983500,983530,983622,983632,983778,984187,984202,984232,984510,984968,985453,985455,985672,985875,986158,986173,986612,987122,988087,988206,988216,988259,988346,988478,988527,988543,988592,988613,988688,988710,988736,988739,989004,989010,989013,989030,989035,989315,989321,989334,989785,990160-990161,990180,990189,990281,990301,990451,990459,990766,990781,990854,991053,991191,991310,991497,992424,992469,992567,992571,992623,993106,993194,993199,993287,993408,994935,994976,994979,995247,995250,995376,995772,996268,996357,996416,996511,996611,996623,996647-996653,996720,996942,996978,997180,997230,998055,998505,998684,9
 99016,999223,999545,999842,1000424,1000581,1000675,1001006,1001420,1001661,1001796,1002032,1003614,1003631,1003645,1003841-1003852,1003873,1003877,1003906,1003938,1003954,1003978,1003990,1004038,1004082,1004179,1004200,1004215,1004241,1004335,1005310,1005356,1005363,1006146
+/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle:942235,945090,946139,950008,964019,964054,984968,986612,990459,996611,997180,998684,1002032,1003906,1003978,1003990,1006187
+/lucene/java/branches/flex_1458/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle:924791,924850,930201
+/lucene/java/branches/lucene_2_4/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle:748824
+/lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle:817269-818600,825998,829134,829816,829881,831036,896850,909334,948516
+/lucene/java/branches/lucene_2_9_back_compat_tests/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle:818601-821336
+/lucene/java/branches/lucene_3_0/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle:880793,896906
+/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle:924483-925561

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java?rev=1006195&r1=1006194&r2=1006195&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java Sat Oct  9 17:16:40 2010
@@ -38,6 +38,7 @@ public final class ShingleAnalyzerWrappe
   private int minShingleSize = ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE;
   private String tokenSeparator = ShingleFilter.TOKEN_SEPARATOR;
   private boolean outputUnigrams = true;
+  private boolean outputUnigramsIfNoShingles = false;
 
   public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
     super();
@@ -147,6 +148,24 @@ public final class ShingleAnalyzerWrappe
   public void setOutputUnigrams(boolean outputUnigrams) {
     this.outputUnigrams = outputUnigrams;
   }
+  
+  public boolean isOutputUnigramsIfNoShingles() {
+    return outputUnigramsIfNoShingles;
+  }
+  
+  /**
+   * <p>Shall we override the behavior of outputUnigrams==false for those
+   * times when no shingles are available (because there are fewer than
+   * minShingleSize tokens in the input stream)? (default: false.)
+   * <p>Note that if outputUnigrams==true, then unigrams are always output,
+   * regardless of whether any shingles are available.
+   *
+   * @param outputUnigramsIfNoShingles Whether or not to output unigrams
+   *  when no shingles are available.
+   */
+  public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) {
+    this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
+  }
 
   @Override
   public TokenStream tokenStream(String fieldName, Reader reader) {
@@ -161,6 +180,7 @@ public final class ShingleAnalyzerWrappe
     filter.setMaxShingleSize(maxShingleSize);
     filter.setTokenSeparator(tokenSeparator);
     filter.setOutputUnigrams(outputUnigrams);
+    filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
     return filter;
   }
   
@@ -192,6 +212,7 @@ public final class ShingleAnalyzerWrappe
     streams.shingle.setMinShingleSize(minShingleSize);
     streams.shingle.setTokenSeparator(tokenSeparator);
     streams.shingle.setOutputUnigrams(outputUnigrams);
+    streams.shingle.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
     return streams.shingle;
   }
 }

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java?rev=1006195&r1=1006194&r2=1006195&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java Sat Oct  9 17:16:40 2010
@@ -103,6 +103,11 @@ public final class ShingleFilter extends
   private boolean outputUnigrams = true;
 
   /**
+   * By default, we don't override behavior of outputUnigrams.
+   */
+  private boolean outputUnigramsIfNoShingles = false;
+ 
+  /**
    * maximum shingle size (number of tokens)
    */
   private int maxShingleSize;
@@ -136,6 +141,11 @@ public final class ShingleFilter extends
    * position.
    */
   private boolean isOutputHere = false;
+
+  /**
+   * true if no shingles have been output yet (for outputUnigramsIfNoShingles).
+   */
+  boolean noShingleOutput = true;
   
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@@ -212,6 +222,20 @@ public final class ShingleFilter extends
   }
 
   /**
+   * <p>Shall we override the behavior of outputUnigrams==false for those
+   * times when no shingles are available (because there are fewer than
+   * minShingleSize tokens in the input stream)? (default: false.)
+   * <p>Note that if outputUnigrams==true, then unigrams are always output,
+   * regardless of whether any shingles are available.
+   *
+   * @param outputUnigramsIfNoShingles Whether or not to output unigrams
+   * when no shingles are available.
+   */
+  public void setOutputUnigramsIfNoShingles(boolean outputUnigramsIfNoShingles) {
+    this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
+  }
+
+  /**
    * Set the max shingle size (default: 2)
    *
    * @param maxShingleSize max size of output shingles
@@ -292,6 +316,7 @@ public final class ShingleFilter extends
         termAtt.setEmpty().append(gramBuilder);
         if (gramSize.getValue() > 1) {
           typeAtt.setType(tokenType);
+          noShingleOutput = false;
         }
         offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
         isOutputHere = true;
@@ -395,6 +420,10 @@ public final class ShingleFilter extends
         }
       }
     }
+    if (outputUnigramsIfNoShingles && noShingleOutput 
+        && gramSize.minValue > 1 && inputWindow.size() < minShingleSize) {
+      gramSize.minValue = 1;
+    }
     gramSize.reset();
     isOutputHere = false;
   }
@@ -406,6 +435,11 @@ public final class ShingleFilter extends
     inputWindow.clear();
     numFillerTokensToInsert = 0;
     isOutputHere = false;
+    noShingleOutput = true;    
+    if (outputUnigramsIfNoShingles && ! outputUnigrams) {
+      // Fix up gramSize if minValue was reset for outputUnigramsIfNoShingles
+      gramSize.minValue = minShingleSize;
+    }
   }
 
 

Propchange: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/
------------------------------------------------------------------------------
--- svn:mergeinfo (added)
+++ svn:mergeinfo Sat Oct  9 17:16:40 2010
@@ -0,0 +1,8 @@
+/lucene/dev/trunk/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle:931298,931337,931502,932129-932131,932163,932304,932369,932374,932398,932417,932541,932576,932587,932698,932731-932749,932752,932773,932795,932828,932856-932857,932862,932864,932878,932963,932998-932999,933541-933575,933598,933613,933679,933879,934339,934954,935014-935048,935065,935186-935513,935521-935522,935553-935962,936522,936544,936605,936657-936726,937039,937360,938582-938646,938989,939111,939611,939649,940433,940447,940451-940452,940666,940699,940730,940878-940892,940994,941270,941363,942166,942235,942288,942676,942719,943142,943493,943931,945057,945090,945130,945245,945343,945420,946139,946330,946338,946599,948011,948082,948429,949288,949311,949445,949976,949997,950008,950042,950458,950613,951126,951355,951397,951521,953628,955547,955613,955615,955796-955797,955809-955996,956097,956125,956173,956316,956715,957465,957520,957634,957707,960367,960371,960374,960719,962555,963372
 ,963654,963720,963781,963873,963906,963909,963920,964019,964054,964430,964459,964720,964753,964832,964856,965103,965110,965222,965230,965299,965327,965330,965585,966354,966878,967080,979453,979809,980369,980428,980436,980501,980909,980911,980917,981265,981550,981598,981650,981661,981857,981936,982073,982084,982201,982725,982824,983100,983212,983216,983313,983495,983500,983530,983622,983632,983778,984187,984202,984232,984510,984968,985453,985455,985672,985875,986158,986173,986612,987122,988087,988206,988216,988259,988346,988478,988527,988543,988592,988613,988688,988710,988736,988739,989004,989010,989013,989030,989035,989315,989321,989334,989785,990160-990161,990180,990189,990281,990301,990451,990459,990766,990781,990854,991053,991191,991310,991497,992424,992469,992567,992571,992623,993106,993194,993199,993287,993408,994935,994976,994979,995247,995250,995376,995772,996268,996357,996416,996511,996611,996623,996647-996653,996720,996942,996978,997180,997230,998055,998505,998684,9
 99016,999223,999545,999842,1000424,1000581,1000675,1001006,1001420,1001661,1001796,1002032,1003614,1003631,1003645,1003841-1003852,1003873,1003877,1003906,1003938,1003954,1003978,1003990,1004038,1004082,1004179,1004200,1004215,1004241,1004335,1005310,1005356,1005363,1006146
+/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/shingle:942235,945090,946139,950008,950042,964019,964054,984968,986612,990459,996611,997180,998684,1002032,1003906,1003978,1003990,1006187
+/lucene/java/branches/flex_1458/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle:924791,924850,930201
+/lucene/java/branches/lucene_2_4/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle:748824
+/lucene/java/branches/lucene_2_9/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle:817269-818600,825998,829134,829816,829881,831036,896850,909334,948516
+/lucene/java/branches/lucene_2_9_back_compat_tests/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle:818601-821336
+/lucene/java/branches/lucene_3_0/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle:880793,896906
+/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle:924483-925561

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java?rev=1006195&r1=1006194&r2=1006195&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java Sat Oct  9 17:16:40 2010
@@ -359,4 +359,16 @@ public class ShingleAnalyzerWrapperTest 
                           new int[] { 13, 18, 27 },
                           new int[] {  1,  1,  1 });
   }
+  
+  public void testOutputUnigramsIfNoShinglesSingleToken() throws Exception {
+    ShingleAnalyzerWrapper analyzer
+      = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
+    analyzer.setOutputUnigrams(false);
+    analyzer.setOutputUnigramsIfNoShingles(true);
+    assertAnalyzesToReuse(analyzer, "please",
+                          new String[] { "please" },
+                          new int[] { 0 },
+                          new int[] { 6 },
+                          new int[] { 1 });
+  }
 }

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java?rev=1006195&r1=1006194&r2=1006195&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java Sat Oct  9 17:16:40 2010
@@ -73,6 +73,14 @@ public class ShingleFilterTest extends B
       createToken("shingles", 33, 39),
   };
 
+  public static final int[] UNIGRAM_ONLY_POSITION_INCREMENTS = new int[] {
+    1, 1, 1, 1, 1, 1
+  };
+
+  public static final String[] UNIGRAM_ONLY_TYPES = new String[] {
+    "word", "word", "word", "word", "word", "word"
+  };
+
   public static Token[] testTokenWithHoles;
 
   public static final Token[] BI_GRAM_TOKENS = new Token[] {
@@ -1018,15 +1026,44 @@ public class ShingleFilterTest extends B
       new int[]{1,0,1,0,1,0,1}
     );
   }
-  
+
+  public void testOutputUnigramsIfNoShinglesSingleTokenCase() throws IOException {
+    // Single token input with outputUnigrams==false is the primary case where
+    // enabling this option should alter program behavior.
+    this.shingleFilterTest(2, 2, TEST_SINGLE_TOKEN, SINGLE_TOKEN,
+                           SINGLE_TOKEN_INCREMENTS, SINGLE_TOKEN_TYPES,
+                           false, true);
+  }
+ 
+  public void testOutputUnigramsIfNoShinglesWithSimpleBigram() throws IOException {
+    // Here we expect the same result as with testBiGramFilter().
+    this.shingleFilterTest(2, 2, TEST_TOKEN, BI_GRAM_TOKENS,
+                           BI_GRAM_POSITION_INCREMENTS, BI_GRAM_TYPES,
+                           true, true);
+  }
+
+  public void testOutputUnigramsIfNoShinglesWithSimpleUnigramlessBigram() throws IOException {
+    // Here we expect the same result as with testBiGramFilterWithoutUnigrams().
+    this.shingleFilterTest(2, 2, TEST_TOKEN, BI_GRAM_TOKENS_WITHOUT_UNIGRAMS,
+                           BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS, BI_GRAM_TYPES_WITHOUT_UNIGRAMS,
+                           false, true);
+  }
+
+  public void testOutputUnigramsIfNoShinglesWithMultipleInputTokens() throws IOException {
+    // Test when the minimum shingle size is greater than the number of input tokens
+    this.shingleFilterTest(7, 7, TEST_TOKEN, TEST_TOKEN, 
+                           UNIGRAM_ONLY_POSITION_INCREMENTS, UNIGRAM_ONLY_TYPES,
+                           false, true);
+  }
+
   protected void shingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
                                    int[] positionIncrements, String[] types,
                                    boolean outputUnigrams)
     throws IOException {
 
     ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
-    shingleFilterTestCommon
-      (filter, tokensToCompare, positionIncrements, types, outputUnigrams);
+    filter.setOutputUnigrams(outputUnigrams);
+    shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
   }
 
   protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle, 
@@ -1035,8 +1072,20 @@ public class ShingleFilterTest extends B
     throws IOException {
     ShingleFilter filter 
       = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
-    shingleFilterTestCommon
-      (filter, tokensToCompare, positionIncrements, types, outputUnigrams);
+    filter.setOutputUnigrams(outputUnigrams);
+    shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
+  }
+
+  protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle, 
+                                   Token[] tokensToCompare, int[] positionIncrements,
+                                   String[] types, boolean outputUnigrams, 
+                                   boolean outputUnigramsIfNoShingles)
+    throws IOException {
+    ShingleFilter filter 
+      = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
+    filter.setOutputUnigrams(outputUnigrams);
+    filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
+    shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
   }
 
   protected void shingleFilterTest(String tokenSeparator, int minSize, int maxSize, Token[] tokensToShingle, 
@@ -1046,18 +1095,15 @@ public class ShingleFilterTest extends B
     ShingleFilter filter 
       = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
     filter.setTokenSeparator(tokenSeparator);
-    shingleFilterTestCommon
-      (filter, tokensToCompare, positionIncrements, types, outputUnigrams);
+    filter.setOutputUnigrams(outputUnigrams);
+    shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
   }
 
   protected void shingleFilterTestCommon(ShingleFilter filter,
                                          Token[] tokensToCompare,
                                          int[] positionIncrements,
-                                         String[] types, boolean outputUnigrams)
+                                         String[] types)
     throws IOException {
-
-    filter.setOutputUnigrams(outputUnigrams);
-
     String text[] = new String[tokensToCompare.length];
     int startOffsets[] = new int[tokensToCompare.length];
     int endOffsets[] = new int[tokensToCompare.length];