You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ro...@apache.org on 2017/01/14 10:52:21 UTC
[3/4] lucene-solr:branch_6x: LUCENE-7627: Add
#intersect(CompiledAutomaton) to Sorted*DocValues
LUCENE-7627: Add #intersect(CompiledAutomaton) to Sorted*DocValues
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/c2c758bb
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/c2c758bb
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/c2c758bb
Branch: refs/heads/branch_6x
Commit: c2c758bb71e621b1d8c5d8b228b8dfe4ec50acfe
Parents: e37b777
Author: Alan Woodward <ro...@apache.org>
Authored: Wed Jan 11 12:07:11 2017 +0000
Committer: Alan Woodward <ro...@apache.org>
Committed: Sat Jan 14 10:32:14 2017 +0000
----------------------------------------------------------------------
lucene/CHANGES.txt | 4 +++
.../apache/lucene/index/SortedDocValues.java | 25 +++++++++++++++
.../apache/lucene/index/SortedSetDocValues.java | 24 +++++++++++++++
.../index/BaseDocValuesFormatTestCase.java | 32 ++++++++++++++++++++
4 files changed, 85 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c2c758bb/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index ff5d138..7dc7239 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -67,6 +67,10 @@ New features
concurrently across all segments in the index (Emmanuel Keller via
Mike McCandless)
+* LUCENE-7627: Added .intersect methods to SortedDocValues and
+ SortedSetDocValues to allow filtering their TermsEnums with a
+ CompiledAutomaton (Alan Woodward, Mike McCandless)
+
Bug Fixes
* LUCENE-7547: JapaneseTokenizerFactory was failing to close the
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c2c758bb/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java b/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java
index e9a55a3..87f8b7c 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java
@@ -17,7 +17,10 @@
package org.apache.lucene.index;
+import java.io.IOException;
+
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
/**
* A per-document byte[] with presorted values.
@@ -104,4 +107,26 @@ public abstract class SortedDocValues extends BinaryDocValues {
public TermsEnum termsEnum() {
return new SortedDocValuesTermsEnum(this);
}
+
+ /**
+ * Returns a {@link TermsEnum} over the values, filtered by a {@link CompiledAutomaton}.
+ * The enum supports {@link TermsEnum#ord()}; which enum is returned depends on {@code automaton.type}.
+ */
+ public TermsEnum intersect(CompiledAutomaton automaton) throws IOException {
+ TermsEnum in = termsEnum();
+ switch (automaton.type) {
+ case NONE:
+ return TermsEnum.EMPTY; // shared empty enum; the termsEnum() created above is not used
+ case ALL:
+ return in; // no wrapper: hand back the plain termsEnum() of this instance
+ case SINGLE:
+ return new SingleTermsEnum(in, automaton.term); // wrap termsEnum() using only automaton.term
+ case NORMAL:
+ return new AutomatonTermsEnum(in, automaton); // wrap termsEnum() using the whole automaton
+ default:
+ // unreachable
+ throw new RuntimeException("unhandled case");
+ }
+ }
+
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c2c758bb/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValues.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValues.java b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValues.java
index f68efcc..64abd64 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValues.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SortedSetDocValues.java
@@ -17,7 +17,10 @@
package org.apache.lucene.index;
+import java.io.IOException;
+
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
/**
* A per-document set of presorted byte[] values.
@@ -103,4 +106,25 @@ public abstract class SortedSetDocValues {
public TermsEnum termsEnum() {
return new SortedSetDocValuesTermsEnum(this);
}
+
+ /**
+ * Returns a {@link TermsEnum} over the values, filtered by a {@link CompiledAutomaton}.
+ * The enum supports {@link TermsEnum#ord()}; which enum is returned depends on {@code automaton.type}.
+ */
+ public TermsEnum intersect(CompiledAutomaton automaton) throws IOException {
+ TermsEnum in = termsEnum();
+ switch (automaton.type) {
+ case NONE:
+ return TermsEnum.EMPTY; // shared empty enum; the termsEnum() created above is not used
+ case ALL:
+ return in; // no wrapper: hand back the plain termsEnum() of this instance
+ case SINGLE:
+ return new SingleTermsEnum(in, automaton.term); // wrap termsEnum() using only automaton.term
+ case NORMAL:
+ return new AutomatonTermsEnum(in, automaton); // wrap termsEnum() using the whole automaton
+ default:
+ // unreachable
+ throw new RuntimeException("unhandled case");
+ }
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c2c758bb/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java
index 85ac12f..b573c20 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java
@@ -63,6 +63,8 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.TestUtil;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
+import org.apache.lucene.util.automaton.RegExp;
import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
@@ -879,6 +881,21 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes
termsEnum.seekExact(2);
assertEquals("world", termsEnum.term().utf8ToString());
assertEquals(2, termsEnum.ord());
+
+ // NORMAL automaton
+ termsEnum = dv.intersect(new CompiledAutomaton(new RegExp(".*l.*").toAutomaton()));
+ assertEquals("hello", termsEnum.next().utf8ToString());
+ assertEquals(1, termsEnum.ord());
+ assertEquals("world", termsEnum.next().utf8ToString());
+ assertEquals(2, termsEnum.ord());
+ assertNull(termsEnum.next());
+
+ // SINGLE automaton
+ termsEnum = dv.intersect(new CompiledAutomaton(new RegExp("hello").toAutomaton()));
+ assertEquals("hello", termsEnum.next().utf8ToString());
+ assertEquals(1, termsEnum.ord());
+ assertNull(termsEnum.next());
+
ireader.close();
directory.close();
}
@@ -1937,6 +1954,21 @@ public abstract class BaseDocValuesFormatTestCase extends BaseIndexFileFormatTes
termsEnum.seekExact(2);
assertEquals("world", termsEnum.term().utf8ToString());
assertEquals(2, termsEnum.ord());
+
+ // NORMAL automaton
+ termsEnum = dv.intersect(new CompiledAutomaton(new RegExp(".*l.*").toAutomaton()));
+ assertEquals("hello", termsEnum.next().utf8ToString());
+ assertEquals(1, termsEnum.ord());
+ assertEquals("world", termsEnum.next().utf8ToString());
+ assertEquals(2, termsEnum.ord());
+ assertNull(termsEnum.next());
+
+ // SINGLE automaton
+ termsEnum = dv.intersect(new CompiledAutomaton(new RegExp("hello").toAutomaton()));
+ assertEquals("hello", termsEnum.next().utf8ToString());
+ assertEquals(1, termsEnum.ord());
+ assertNull(termsEnum.next());
+
ireader.close();
directory.close();
}