You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ji...@apache.org on 2017/05/14 19:12:57 UTC
lucene-solr:master: LUCENE-7824: Fix graph query analysis for
multi-word synonym rules with common terms (e.g. new york, new york city).
Repository: lucene-solr
Updated Branches:
refs/heads/master e11bc0309 -> 21362a3ba
LUCENE-7824: Fix graph query analysis for multi-word synonym rules with common terms (e.g. new york, new york city).
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/21362a3b
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/21362a3b
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/21362a3b
Branch: refs/heads/master
Commit: 21362a3ba4c1e936416635667f257b36235b00ab
Parents: e11bc03
Author: Jim Ferenczi <ji...@apache.org>
Authored: Sun May 14 21:12:42 2017 +0200
Committer: Jim Ferenczi <ji...@apache.org>
Committed: Sun May 14 21:12:42 2017 +0200
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 ++
.../graph/GraphTokenStreamFiniteStrings.java | 34 +++----------
.../TestGraphTokenStreamFiniteStrings.java | 53 ++++++++++++++++++++
3 files changed, 64 insertions(+), 26 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/21362a3b/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 10c46cf..8693e24 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -131,6 +131,9 @@ Bug Fixes
"lucene"/standard query parser, should require " TO " in range queries,
and accept "TO" as endpoints in range queries. (hossman, Steve Rowe)
+* LUCENE-7824: Fix graph query analysis for multi-word synonym rules with common terms (eg. new york, new york city).
+ (Jim Ferenczi)
+
Improvements
* LUCENE-7782: OfflineSorter now passes the total number of items it
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/21362a3b/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java b/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java
index fd85836..a700501 100644
--- a/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java
+++ b/lucene/core/src/java/org/apache/lucene/util/graph/GraphTokenStreamFiniteStrings.java
@@ -48,7 +48,6 @@ import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZ
* This class also provides helpers to explore the different paths of the {@link Automaton}.
*/
public final class GraphTokenStreamFiniteStrings {
- private final Map<BytesRef, Integer> termToID = new HashMap<>();
private final Map<Integer, BytesRef> idToTerm = new HashMap<>();
private final Map<Integer, Integer> idToInc = new HashMap<>();
private final Automaton det;
@@ -247,35 +246,18 @@ public final class GraphTokenStreamFiniteStrings {
}
/**
- * Gets an integer id for a given term.
- *
- * If there is no position gaps for this token then we can reuse the id for the same term if it appeared at another
- * position without a gap. If we have a position gap generate a new id so we can keep track of the position
- * increment.
+ * Gets an integer id for a given term and saves the position increment if needed.
*/
private int getTermID(int incr, int prevIncr, BytesRef term) {
assert term != null;
boolean isStackedGap = incr == 0 && prevIncr > 1;
- boolean hasGap = incr > 1;
- Integer id;
- if (hasGap || isStackedGap) {
- id = idToTerm.size();
- idToTerm.put(id, BytesRef.deepCopyOf(term));
-
- // stacked token should have the same increment as original token at this position
- if (isStackedGap) {
- idToInc.put(id, prevIncr);
- } else {
- idToInc.put(id, incr);
- }
- } else {
- id = termToID.get(term);
- if (id == null) {
- term = BytesRef.deepCopyOf(term);
- id = idToTerm.size();
- termToID.put(term, id);
- idToTerm.put(id, term);
- }
+ int id = idToTerm.size();
+ idToTerm.put(id, BytesRef.deepCopyOf(term));
+ // stacked token should have the same increment as original token at this position
+ if (isStackedGap) {
+ idToInc.put(id, prevIncr);
+ } else if (incr > 1) {
+ idToInc.put(id, incr);
}
return id;
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/21362a3b/lucene/core/src/test/org/apache/lucene/util/graph/TestGraphTokenStreamFiniteStrings.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/util/graph/TestGraphTokenStreamFiniteStrings.java b/lucene/core/src/test/org/apache/lucene/util/graph/TestGraphTokenStreamFiniteStrings.java
index 8c336cd..44b7b7c 100644
--- a/lucene/core/src/test/org/apache/lucene/util/graph/TestGraphTokenStreamFiniteStrings.java
+++ b/lucene/core/src/test/org/apache/lucene/util/graph/TestGraphTokenStreamFiniteStrings.java
@@ -378,6 +378,59 @@ public class TestGraphTokenStreamFiniteStrings extends LuceneTestCase {
assertArrayEquals(terms, new Term[] {new Term("field", "network")});
}
+ public void testStackedGraphWithRepeat() throws Exception {
+ TokenStream ts = new CannedTokenStream(
+ token("ny", 1, 4),
+ token("new", 0, 1),
+ token("new", 0, 3),
+ token("york", 1, 1),
+ token("city", 1, 2),
+ token("york", 1, 1),
+ token("is", 1, 1),
+ token("great", 1, 1)
+ );
+
+ GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(ts);
+
+ Iterator<TokenStream> it = graph.getFiniteStrings();
+ assertTrue(it.hasNext());
+ assertTokenStream(it.next(), new String[]{"ny", "is", "great"}, new int[]{1, 1, 1});
+ assertTrue(it.hasNext());
+ assertTokenStream(it.next(), new String[]{"new", "york", "city", "is", "great"}, new int[]{1, 1, 1, 1, 1});
+ assertTrue(it.hasNext());
+ assertTokenStream(it.next(), new String[]{"new", "york", "is", "great"}, new int[]{1, 1, 1, 1});
+ assertFalse(it.hasNext());
+
+ int[] points = graph.articulationPoints();
+ assertArrayEquals(points, new int[] {4, 5});
+
+ assertTrue(graph.hasSidePath(0));
+ it = graph.getFiniteStrings(0, 4);
+ assertTrue(it.hasNext());
+ assertTokenStream(it.next(), new String[]{"ny"}, new int[]{1});
+ assertTrue(it.hasNext());
+ assertTokenStream(it.next(), new String[]{"new", "york", "city"}, new int[]{1, 1, 1});
+ assertTrue(it.hasNext());
+ assertTokenStream(it.next(), new String[]{"new", "york"}, new int[]{1, 1});
+ assertFalse(it.hasNext());
+
+ assertFalse(graph.hasSidePath(4));
+ it = graph.getFiniteStrings(4, 5);
+ assertTrue(it.hasNext());
+ assertTokenStream(it.next(), new String[]{"is"}, new int[] {1});
+ assertFalse(it.hasNext());
+ Term[] terms = graph.getTerms("field", 4);
+ assertArrayEquals(terms, new Term[] {new Term("field", "is")});
+
+ assertFalse(graph.hasSidePath(5));
+ it = graph.getFiniteStrings(5, -1);
+ assertTrue(it.hasNext());
+ assertTokenStream(it.next(), new String[]{"great"}, new int[] {1});
+ assertFalse(it.hasNext());
+ terms = graph.getTerms("field", 5);
+ assertArrayEquals(terms, new Term[] {new Term("field", "great")});
+ }
+
public void testGraphWithRegularSynonym() throws Exception {
TokenStream ts = new CannedTokenStream(
token("fast", 1, 1),