You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2015/04/16 23:53:16 UTC
svn commit: r1674159 - in /lucene/dev/branches/branch_5x: ./ lucene/ lucene/analysis/ lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/ lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/

Author: mikemccand
Date: Thu Apr 16 21:53:16 2015
New Revision: 1674159

URL: http://svn.apache.org/r1674159
Log:
LUCENE-6400: preserve original token when possible in SolrSynonymParser

Added:
    lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/BaseSynonymParserTestCase.java
      - copied unchanged from r1674155, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/BaseSynonymParserTestCase.java
Modified:
    lucene/dev/branches/branch_5x/   (props changed)
    lucene/dev/branches/branch_5x/lucene/   (props changed)
    lucene/dev/branches/branch_5x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_5x/lucene/analysis/   (props changed)
    lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java
    lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java

Modified: lucene/dev/branches/branch_5x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/CHANGES.txt?rev=1674159&r1=1674158&r2=1674159&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/lucene/CHANGES.txt Thu Apr 16 21:53:16 2015
@@ -76,6 +76,10 @@ Bug Fixes
 * LUCENE-6345: Null check terms/fields in Lucene queries (Lee
   Hinman via Mike McCandless)
 
+* LUCENE-6400: SolrSynonymParser should preserve original token instead
+  of replacing it with a synonym, when expand=true and there is no
+  explicit mapping (Ian Ribas, Robert Muir, Mike McCandless)
+
 API Changes
 
 * LUCENE-6377: SearcherFactory#newSearcher now accepts the previous reader

Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java?rev=1674159&r1=1674158&r2=1674159&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java Thu Apr 16 21:53:16 2015
@@ -84,9 +84,6 @@ public class SolrSynonymParser extends S
         continue; // ignore empty lines and comments
       }
       
-      CharsRef inputs[];
-      CharsRef outputs[];
-      
       // TODO: we could process this more efficiently.
       String sides[] = split(line, "=>");
       if (sides.length > 1) { // explicit mapping
@@ -94,37 +91,45 @@ public class SolrSynonymParser extends S
           throw new IllegalArgumentException("more than one explicit mapping specified on the same line");
         }
         String inputStrings[] = split(sides[0], ",");
-        inputs = new CharsRef[inputStrings.length];
+        CharsRef[] inputs = new CharsRef[inputStrings.length];
         for (int i = 0; i < inputs.length; i++) {
           inputs[i] = analyze(unescape(inputStrings[i]).trim(), new CharsRefBuilder());
         }
         
         String outputStrings[] = split(sides[1], ",");
-        outputs = new CharsRef[outputStrings.length];
+        CharsRef[] outputs = new CharsRef[outputStrings.length];
         for (int i = 0; i < outputs.length; i++) {
           outputs[i] = analyze(unescape(outputStrings[i]).trim(), new CharsRefBuilder());
         }
+        // these mappings are explicit and never preserve the original token
+        for (int i = 0; i < inputs.length; i++) {
+          for (int j = 0; j < outputs.length; j++) {
+            add(inputs[i], outputs[j], false);
+          }
+        }
       } else {
         String inputStrings[] = split(line, ",");
-        inputs = new CharsRef[inputStrings.length];
+        CharsRef[] inputs = new CharsRef[inputStrings.length];
         for (int i = 0; i < inputs.length; i++) {
           inputs[i] = analyze(unescape(inputStrings[i]).trim(), new CharsRefBuilder());
         }
         if (expand) {
-          outputs = inputs;
+          // all pairs
+          for (int i = 0; i < inputs.length; i++) {
+            for (int j = 0; j < inputs.length; j++) {
+              if (i != j) {
+                add(inputs[i], inputs[j], true);
+              }
+            }
+          }
         } else {
-          outputs = new CharsRef[1];
-          outputs[0] = inputs[0];
-        }
-      }
-      
-      // currently we include the term itself in the map,
-      // and use includeOrig = false always.
-      // this is how the existing filter does it, but it's actually a bug,
-      // especially if combined with ignoreCase = true
-      for (int i = 0; i < inputs.length; i++) {
-        for (int j = 0; j < outputs.length; j++) {
-          add(inputs[i], outputs[j], false);
+          // all subsequent inputs map to the first one; we also add inputs[0]
+          // itself, so that the original token is removed and added back as a
+          // synonym with the same text, which "effectively" changes that
+          // token's type to SYNONYM (matching legacy behavior):
+          for (int i = 0; i < inputs.length; i++) {
+            add(inputs[i], inputs[0], false);
+          }
         }
       }
     }

Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java?rev=1674159&r1=1674158&r2=1674159&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java Thu Apr 16 21:53:16 2015
@@ -21,7 +21,6 @@ import java.io.StringReader;
 import java.text.ParseException;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
@@ -31,7 +30,7 @@ import org.apache.lucene.analysis.en.Eng
  * Tests parser for the Solr synonyms format
  * @lucene.experimental
  */
-public class TestSolrSynonymParser extends BaseTokenStreamTestCase {
+public class TestSolrSynonymParser extends BaseSynonymParserTestCase {
   
   /** Tests some simple examples from the solr wiki */
   public void testSimple() throws Exception {
@@ -174,4 +173,61 @@ public class TestSolrSynonymParser exten
         new int[] { 1 });
     analyzer.close();
   }
+
+  /** Verify token type and positionLength after analysis. */
+  public void testPositionLengthAndTypeSimple() throws Exception {
+    String testFile =
+     "spider man, spiderman";
+
+    Analyzer analyzer = new MockAnalyzer(random());
+    SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer);
+    parser.parse(new StringReader(testFile));
+    final SynonymMap map = parser.build();
+    analyzer.close();
+
+    analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
+        return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
+      }
+    };
+
+    assertAnalyzesToPositions(analyzer, "spider man",
+        new String[]{"spider", "spiderman", "man"},
+        new String[]{"word", "SYNONYM", "word"},
+        new int[]{1, 0, 1},
+        new int[]{1, 2, 1});
+  }
+
+  /** Test parsing of simple examples. */
+  public void testParseSimple() throws Exception {
+    String testFile =
+      "spider man, spiderman\n" +
+      "usa,united states,u s a,united states of america\n"+
+      "mystyped, mistyped => mistyped\n" +
+      "foo => foo bar\n" +
+      "foo => baz";
+
+    Analyzer analyzer = new MockAnalyzer(random());
+    SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer);
+    parser.parse(new StringReader(testFile));
+    final SynonymMap map = parser.build();
+    analyzer.close();
+
+    assertEntryEquals(map, "spiderman", true, "spider man");
+    assertEntryEquals(map, "spider man", true, "spiderman");
+
+    assertEntryEquals(map, "usa", true, new String[] {"united states", "u s a", "united states of america"});
+    assertEntryEquals(map, "united states", true, new String[] {"usa", "u s a", "united states of america"});
+    assertEntryEquals(map, "u s a", true, new String[] {"usa", "united states", "united states of america"});
+    assertEntryEquals(map, "united states of america", true, new String[] {"usa", "u s a", "united states"});
+
+    assertEntryEquals(map, "mistyped", false, "mistyped");
+    assertEntryEquals(map, "mystyped", false, "mistyped");
+
+    assertEntryEquals(map, "foo", false, new String[]{"foo bar", "baz"});
+    assertEntryAbsent(map, "baz");
+    assertEntryAbsent(map, "bar");
+  }
 }