You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2015/04/16 23:53:16 UTC
svn commit: r1674159 - in /lucene/dev/branches/branch_5x: ./ lucene/
lucene/analysis/
lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/
lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/
Author: mikemccand
Date: Thu Apr 16 21:53:16 2015
New Revision: 1674159
URL: http://svn.apache.org/r1674159
Log:
LUCENE-6400: preserve original token when possible in SolrSynonymParser
Added:
lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/BaseSynonymParserTestCase.java
- copied unchanged from r1674155, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/BaseSynonymParserTestCase.java
Modified:
lucene/dev/branches/branch_5x/ (props changed)
lucene/dev/branches/branch_5x/lucene/ (props changed)
lucene/dev/branches/branch_5x/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_5x/lucene/analysis/ (props changed)
lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java
lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java
Modified: lucene/dev/branches/branch_5x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/CHANGES.txt?rev=1674159&r1=1674158&r2=1674159&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/lucene/CHANGES.txt Thu Apr 16 21:53:16 2015
@@ -76,6 +76,10 @@ Bug Fixes
* LUCENE-6345: Null check terms/fields in Lucene queries (Lee
Hinman via Mike McCandless)
+* LUCENE-6400: SolrSynonymParser should preserve original token instead
+ of replacing it with a synonym, when expand=true and there is no
+ explicit mapping (Ian Ribas, Robert Muir, Mike McCandless)
+
API Changes
* LUCENE-6377: SearcherFactory#newSearcher now accepts the previous reader
Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java?rev=1674159&r1=1674158&r2=1674159&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java Thu Apr 16 21:53:16 2015
@@ -84,9 +84,6 @@ public class SolrSynonymParser extends S
continue; // ignore empty lines and comments
}
- CharsRef inputs[];
- CharsRef outputs[];
-
// TODO: we could process this more efficiently.
String sides[] = split(line, "=>");
if (sides.length > 1) { // explicit mapping
@@ -94,37 +91,45 @@ public class SolrSynonymParser extends S
throw new IllegalArgumentException("more than one explicit mapping specified on the same line");
}
String inputStrings[] = split(sides[0], ",");
- inputs = new CharsRef[inputStrings.length];
+ CharsRef[] inputs = new CharsRef[inputStrings.length];
for (int i = 0; i < inputs.length; i++) {
inputs[i] = analyze(unescape(inputStrings[i]).trim(), new CharsRefBuilder());
}
String outputStrings[] = split(sides[1], ",");
- outputs = new CharsRef[outputStrings.length];
+ CharsRef[] outputs = new CharsRef[outputStrings.length];
for (int i = 0; i < outputs.length; i++) {
outputs[i] = analyze(unescape(outputStrings[i]).trim(), new CharsRefBuilder());
}
+ // these mappings are explicit and never preserve original
+ for (int i = 0; i < inputs.length; i++) {
+ for (int j = 0; j < outputs.length; j++) {
+ add(inputs[i], outputs[j], false);
+ }
+ }
} else {
String inputStrings[] = split(line, ",");
- inputs = new CharsRef[inputStrings.length];
+ CharsRef[] inputs = new CharsRef[inputStrings.length];
for (int i = 0; i < inputs.length; i++) {
inputs[i] = analyze(unescape(inputStrings[i]).trim(), new CharsRefBuilder());
}
if (expand) {
- outputs = inputs;
+ // all pairs
+ for (int i = 0; i < inputs.length; i++) {
+ for (int j = 0; j < inputs.length; j++) {
+ if (i != j) {
+ add(inputs[i], inputs[j], true);
+ }
+ }
+ }
} else {
- outputs = new CharsRef[1];
- outputs[0] = inputs[0];
- }
- }
-
- // currently we include the term itself in the map,
- // and use includeOrig = false always.
- // this is how the existing filter does it, but it's actually a bug,
- // especially if combined with ignoreCase = true
- for (int i = 0; i < inputs.length; i++) {
- for (int j = 0; j < outputs.length; j++) {
- add(inputs[i], outputs[j], false);
+ // all subsequent inputs map to first one; we also add inputs[0] here
+ // so that we "effectively" (because we remove the original input and
+ // add back a synonym with the same text) change that token's type to
+ // SYNONYM (matching legacy behavior):
+ for (int i = 0; i < inputs.length; i++) {
+ add(inputs[i], inputs[0], false);
+ }
}
}
}
Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java?rev=1674159&r1=1674158&r2=1674159&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSolrSynonymParser.java Thu Apr 16 21:53:16 2015
@@ -21,7 +21,6 @@ import java.io.StringReader;
import java.text.ParseException;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
@@ -31,7 +30,7 @@ import org.apache.lucene.analysis.en.Eng
* Tests parser for the Solr synonyms format
* @lucene.experimental
*/
-public class TestSolrSynonymParser extends BaseTokenStreamTestCase {
+public class TestSolrSynonymParser extends BaseSynonymParserTestCase {
/** Tests some simple examples from the solr wiki */
public void testSimple() throws Exception {
@@ -174,4 +173,61 @@ public class TestSolrSynonymParser exten
new int[] { 1 });
analyzer.close();
}
+
+ /** Verify type of token and positionLength after analyzer. */
+ public void testPositionLengthAndTypeSimple() throws Exception {
+ String testFile =
+ "spider man, spiderman";
+
+ Analyzer analyzer = new MockAnalyzer(random());
+ SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer);
+ parser.parse(new StringReader(testFile));
+ final SynonymMap map = parser.build();
+ analyzer.close();
+
+ analyzer = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
+ return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
+ }
+ };
+
+ assertAnalyzesToPositions(analyzer, "spider man",
+ new String[]{"spider", "spiderman", "man"},
+ new String[]{"word", "SYNONYM", "word"},
+ new int[]{1, 0, 1},
+ new int[]{1, 2, 1});
+ }
+
+ /** Test parsing of simple examples. */
+ public void testParseSimple() throws Exception {
+ String testFile =
+ "spider man, spiderman\n" +
+ "usa,united states,u s a,united states of america\n"+
+ "mystyped, mistyped => mistyped\n" +
+ "foo => foo bar\n" +
+ "foo => baz";
+
+ Analyzer analyzer = new MockAnalyzer(random());
+ SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer);
+ parser.parse(new StringReader(testFile));
+ final SynonymMap map = parser.build();
+ analyzer.close();
+
+ assertEntryEquals(map, "spiderman", true, "spider man");
+ assertEntryEquals(map, "spider man", true, "spiderman");
+
+ assertEntryEquals(map, "usa", true, new String[] {"united states", "u s a", "united states of america"});
+ assertEntryEquals(map, "united states", true, new String[] {"usa", "u s a", "united states of america"});
+ assertEntryEquals(map, "u s a", true, new String[] {"usa", "united states", "united states of america"});
+ assertEntryEquals(map, "united states of america", true, new String[] {"usa", "u s a", "united states"});
+
+ assertEntryEquals(map, "mistyped", false, "mistyped");
+ assertEntryEquals(map, "mystyped", false, "mistyped");
+
+ assertEntryEquals(map, "foo", false, new String[]{"foo bar", "baz"});
+ assertEntryAbsent(map, "baz");
+ assertEntryAbsent(map, "bar");
+ }
}