You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2021/11/05 07:59:23 UTC
[lucene] branch branch_9x updated: LUCENE-10220: Add an utility
method to get IntervalSource from analyzed text (or token stream) (#427)
This is an automated email from the ASF dual-hosted git repository.
dweiss pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/branch_9x by this push:
new 5de05f3 LUCENE-10220: Add an utility method to get IntervalSource from analyzed text (or token stream) (#427)
5de05f3 is described below
commit 5de05f3556224fc68302f38848d60fe208e9875b
Author: Dawid Weiss <da...@carrotsearch.com>
AuthorDate: Fri Nov 5 08:57:48 2021 +0100
LUCENE-10220: Add an utility method to get IntervalSource from analyzed text (or token stream) (#427)
---
lucene/CHANGES.txt | 4 +-
.../matchhighlight/TestMatchHighlighter.java | 54 ++++
.../lucene/queries/intervals/IntervalBuilder.java | 325 +++++++++++++++++++++
.../apache/lucene/queries/intervals/Intervals.java | 70 ++++-
.../queries/intervals/TestIntervalBuilder.java | 216 ++++++++++++++
5 files changed, 658 insertions(+), 11 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 740ffce..2d3a11d 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -11,7 +11,9 @@ API Changes
New Features
---------------------
-(No changes)
+
+* LUCENE-10220: Add an utility method to get IntervalSource from analyzed text (or token stream).
+ (Uwe Schindler, Dawid Weiss, Alan Woodward)
Improvements
---------------------
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchHighlighter.java
index bafb75d..a441b90 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchHighlighter.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchHighlighter.java
@@ -32,10 +32,13 @@ import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.document.Field;
@@ -265,6 +268,57 @@ public class TestMatchHighlighter extends LuceneTestCase {
}
@Test
+ public void testAnalyzedTextIntervals() throws IOException {
+ SynonymMap synonymMap =
+ buildSynonymMap(
+ new String[][] {
+ {"moon\u0000shine", "firewater"},
+ {"firewater", "moon\u0000shine"},
+ });
+
+ Analyzer analyzer =
+ new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = new StandardTokenizer();
+ TokenStream ts = tokenizer;
+ ts = new LowerCaseFilter(ts);
+ ts = new SynonymGraphFilter(ts, synonymMap, true);
+ ts = new PorterStemFilter(ts);
+ return new TokenStreamComponents(tokenizer, ts);
+ }
+ };
+
+ new IndexBuilder(this::toField)
+ .doc(FLD_TEXT1, "Where the moon shine falls, firewater flows.")
+ .build(
+ analyzer,
+ reader -> {
+ IndexSearcher searcher = new IndexSearcher(reader);
+ Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered.
+
+ MatchHighlighter highlighter =
+ new MatchHighlighter(searcher, analyzer)
+ .appendFieldHighlighter(
+ FieldValueHighlighters.highlighted(
+ 80 * 3, 1, new PassageFormatter("...", ">", "<"), FLD_TEXT1::equals))
+ .appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
+
+ {
+ // [moon shine, firewater] are synonyms, tokens are lowercased. Porter stemming on.
+ Query query =
+ new IntervalQuery(
+ FLD_TEXT1,
+ Intervals.analyzedText("Firewater Fall", analyzer, FLD_TEXT1, 0, true));
+
+ assertHighlights(
+ toDocList(highlighter.highlight(searcher.search(query, 10, sortOrder), query)),
+ "0. text1: Where the >moon shine falls<, firewater flows.");
+ }
+ });
+ }
+
+ @Test
public void testCustomFieldHighlightHandling() throws IOException {
// Match highlighter is a showcase of individual components in this package, suitable
// to create any kind of field-display designs.
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalBuilder.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalBuilder.java
new file mode 100644
index 0000000..b6e1727
--- /dev/null
+++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalBuilder.java
@@ -0,0 +1,325 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Code adopted from ASL-licensed Elasticsearch.
+ * https://github.com/elastic/elasticsearch/blob/7.10/server/src/main/java/org/elasticsearch/index/query/IntervalBuilder.java
+ *
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.lucene.queries.intervals;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import org.apache.lucene.analysis.CachingTokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.QueryVisitor;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings;
+
+/**
+ * Constructs an {@link IntervalsSource} based on analyzed text.
+ *
+ * <p>Code adopted from ASL-licensed <a
+ * href="https://github.com/elastic/elasticsearch">Elasticsearch</a>.
+ *
+ * @see
+ * "https://github.com/elastic/elasticsearch/blob/7.10/server/src/main/java/org/elasticsearch/index/query/IntervalBuilder.java"
+ */
+final class IntervalBuilder {
+ static IntervalsSource analyzeText(CachingTokenFilter stream, int maxGaps, boolean ordered)
+ throws IOException {
+ assert stream != null;
+
+ TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
+ PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
+ PositionLengthAttribute posLenAtt = stream.addAttribute(PositionLengthAttribute.class);
+
+ if (termAtt == null) {
+ return NO_INTERVALS;
+ }
+
+ // phase 1: read through the stream and assess the situation:
+ // counting the number of tokens/positions and marking if we have any synonyms.
+
+ int numTokens = 0;
+ boolean hasSynonyms = false;
+ boolean isGraph = false;
+
+ stream.reset();
+ while (stream.incrementToken()) {
+ numTokens++;
+ int positionIncrement = posIncAtt.getPositionIncrement();
+ if (positionIncrement == 0) {
+ hasSynonyms = true;
+ }
+ int positionLength = posLenAtt.getPositionLength();
+ if (positionLength > 1) {
+ isGraph = true;
+ }
+ }
+
+ // phase 2: based on token count, presence of synonyms, and options
+ // formulate a single term, boolean, or phrase.
+
+ if (numTokens == 0) {
+ return NO_INTERVALS;
+ } else if (numTokens == 1) {
+ // single term
+ return analyzeTerm(stream);
+ } else if (isGraph) {
+ // graph
+ return combineSources(analyzeGraph(stream), maxGaps, ordered);
+ } else {
+ // phrase
+ if (hasSynonyms) {
+ // phrase with single-term synonyms
+ return analyzeSynonyms(stream, maxGaps, ordered);
+ } else {
+ // simple phrase
+ return combineSources(analyzeTerms(stream), maxGaps, ordered);
+ }
+ }
+ }
+
+ private static IntervalsSource analyzeTerm(TokenStream ts) throws IOException {
+ TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
+ ts.reset();
+ ts.incrementToken();
+ return Intervals.term(BytesRef.deepCopyOf(bytesAtt.getBytesRef()));
+ }
+
+ private static IntervalsSource combineSources(
+ List<IntervalsSource> sources, int maxGaps, boolean ordered) {
+ if (sources.size() == 0) {
+ return NO_INTERVALS;
+ }
+ if (sources.size() == 1) {
+ return sources.get(0);
+ }
+ IntervalsSource[] sourcesArray = sources.toArray(new IntervalsSource[0]);
+ if (maxGaps == 0 && ordered) {
+ return Intervals.phrase(sourcesArray);
+ }
+ IntervalsSource inner =
+ ordered ? Intervals.ordered(sourcesArray) : Intervals.unordered(sourcesArray);
+ if (maxGaps == -1) {
+ return inner;
+ }
+ return Intervals.maxgaps(maxGaps, inner);
+ }
+
+ private static List<IntervalsSource> analyzeTerms(TokenStream ts) throws IOException {
+ List<IntervalsSource> terms = new ArrayList<>();
+ TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
+ PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
+ ts.reset();
+ while (ts.incrementToken()) {
+ BytesRef term = bytesAtt.getBytesRef();
+ int precedingSpaces = posAtt.getPositionIncrement() - 1;
+ terms.add(extend(Intervals.term(BytesRef.deepCopyOf(term)), precedingSpaces));
+ }
+ ts.end();
+ return terms;
+ }
+
+ private static IntervalsSource extend(IntervalsSource source, int precedingSpaces) {
+ if (precedingSpaces == 0) {
+ return source;
+ }
+ return Intervals.extend(source, precedingSpaces, 0);
+ }
+
+ private static IntervalsSource analyzeSynonyms(TokenStream ts, int maxGaps, boolean ordered)
+ throws IOException {
+ List<IntervalsSource> terms = new ArrayList<>();
+ List<IntervalsSource> synonyms = new ArrayList<>();
+ TermToBytesRefAttribute bytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
+ PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
+ ts.reset();
+ int spaces = 0;
+ while (ts.incrementToken()) {
+ int posInc = posAtt.getPositionIncrement();
+ if (posInc > 0) {
+ if (synonyms.size() == 1) {
+ terms.add(extend(synonyms.get(0), spaces));
+ } else if (synonyms.size() > 1) {
+ terms.add(extend(Intervals.or(synonyms.toArray(new IntervalsSource[0])), spaces));
+ }
+ synonyms.clear();
+ spaces = posInc - 1;
+ }
+ synonyms.add(Intervals.term(BytesRef.deepCopyOf(bytesAtt.getBytesRef())));
+ }
+ if (synonyms.size() == 1) {
+ terms.add(extend(synonyms.get(0), spaces));
+ } else {
+ terms.add(extend(Intervals.or(synonyms.toArray(new IntervalsSource[0])), spaces));
+ }
+ return combineSources(terms, maxGaps, ordered);
+ }
+
+ private static List<IntervalsSource> analyzeGraph(TokenStream source) throws IOException {
+ source.reset();
+ GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(source);
+
+ List<IntervalsSource> clauses = new ArrayList<>();
+ int[] articulationPoints = graph.articulationPoints();
+ int lastState = 0;
+ int maxClauseCount = BooleanQuery.getMaxClauseCount();
+ for (int i = 0; i <= articulationPoints.length; i++) {
+ int start = lastState;
+ int end = -1;
+ if (i < articulationPoints.length) {
+ end = articulationPoints[i];
+ }
+ lastState = end;
+ if (graph.hasSidePath(start)) {
+ List<IntervalsSource> paths = new ArrayList<>();
+ Iterator<TokenStream> it = graph.getFiniteStrings(start, end);
+ while (it.hasNext()) {
+ TokenStream ts = it.next();
+ IntervalsSource phrase = combineSources(analyzeTerms(ts), 0, true);
+ if (paths.size() >= maxClauseCount) {
+ throw new BooleanQuery.TooManyClauses();
+ }
+ paths.add(phrase);
+ }
+ if (paths.size() > 0) {
+ clauses.add(Intervals.or(paths.toArray(new IntervalsSource[0])));
+ }
+ } else {
+ Iterator<TokenStream> it = graph.getFiniteStrings(start, end);
+ TokenStream ts = it.next();
+ clauses.addAll(analyzeTerms(ts));
+ assert it.hasNext() == false;
+ }
+ }
+ return clauses;
+ }
+
+ static final IntervalsSource NO_INTERVALS =
+ new IntervalsSource() {
+ @Override
+ public IntervalIterator intervals(String field, LeafReaderContext ctx) {
+ return new IntervalIterator() {
+ @Override
+ public int start() {
+ return NO_MORE_INTERVALS;
+ }
+
+ @Override
+ public int end() {
+ return NO_MORE_INTERVALS;
+ }
+
+ @Override
+ public int gaps() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public int nextInterval() {
+ return NO_MORE_INTERVALS;
+ }
+
+ @Override
+ public float matchCost() {
+ return 0;
+ }
+
+ @Override
+ public int docID() {
+ return NO_MORE_DOCS;
+ }
+
+ @Override
+ public int nextDoc() {
+ return NO_MORE_DOCS;
+ }
+
+ @Override
+ public int advance(int target) {
+ return NO_MORE_DOCS;
+ }
+
+ @Override
+ public long cost() {
+ return 0;
+ }
+ };
+ }
+
+ @Override
+ public IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) {
+ return null;
+ }
+
+ @Override
+ public void visit(String field, QueryVisitor visitor) {}
+
+ @Override
+ public int minExtent() {
+ return 0;
+ }
+
+ @Override
+ public Collection<IntervalsSource> pullUpDisjunctions() {
+ return Collections.emptyList();
+ }
+
+ @Override
+ public int hashCode() {
+ return 0;
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ return other == this;
+ }
+
+ @Override
+ public String toString() {
+ return "no_match";
+ }
+ };
+}
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java
index 1094dbe..b79d48f 100644
--- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java
@@ -17,9 +17,13 @@
package org.apache.lucene.queries.intervals;
+import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.function.Predicate;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CachingTokenFilter;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.WildcardQuery;
@@ -30,17 +34,17 @@ import org.apache.lucene.util.automaton.CompiledAutomaton;
* Constructor functions for {@link IntervalsSource} types
*
* <p>These sources implement minimum-interval algorithms taken from the paper <a
- * href="http://vigna.di.unimi.it/ftp/papers/EfficientAlgorithmsMinimalIntervalSemantics.pdf">
- * Efficient Optimally Lazy Algorithms for Minimal-Interval Semantics</a>
+ * href="https://vigna.di.unimi.it/ftp/papers/EfficientLazy.pdf">Efficient Optimally Lazy Algorithms
+ * for Minimal-Interval Semantics</a>
*
- * <p>By default, sources that are sensitive to internal gaps (e.g. PHRASE and MAXGAPS) will rewrite
- * their sub-sources so that disjunctions of different lengths are pulled up to the top of the
- * interval tree. For example, PHRASE(or(PHRASE("a", "b", "c"), "b"), "c") will automatically
- * rewrite itself to OR(PHRASE("a", "b", "c", "c"), PHRASE("b", "c")) to ensure that documents
- * containing "b c" are matched. This can lead to less efficient queries, as more terms need to be
- * loaded (for example, the "c" iterator above is loaded twice), so if you care more about speed
- * than about accuracy you can use the {@link #or(boolean, IntervalsSource...)} factory method to
- * prevent rewriting.
+ * <p>By default, sources that are sensitive to internal gaps (e.g. {@code PHRASE} and {@code
+ * MAXGAPS}) will rewrite their sub-sources so that disjunctions of different lengths are pulled up
+ * to the top of the interval tree. For example, {@code PHRASE(or(PHRASE("a", "b", "c"), "b"), "c")}
+ * will automatically rewrite itself to {@code OR(PHRASE("a", "b", "c", "c"), PHRASE("b", "c"))} to
+ * ensure that documents containing {@code "b c"} are matched. This can lead to less efficient
+ * queries, as more terms need to be loaded (for example, the {@code "c"} iterator above is loaded
+ * twice), so if you care more about speed than about accuracy you can use the {@link #or(boolean,
+ * IntervalsSource...)} factory method to prevent rewriting.
*/
public final class Intervals {
@@ -429,4 +433,50 @@ public final class Intervals {
source,
Intervals.extend(new OffsetIntervalsSource(reference, false), 0, Integer.MAX_VALUE));
}
+
+ /**
+ * Returns intervals that correspond to tokens from a {@link TokenStream} returned for {@code
+ * text} by applying the provided {@link Analyzer} as if {@code text} was the content of the given
+ * {@code field}. The intervals can be ordered or unordered and can have optional gaps inside.
+ *
+ * @param text The text to analyze.
+ * @param analyzer The {@link Analyzer} to use to acquire a {@link TokenStream} which is then
+ * converted into intervals.
+ * @param field The field {@code text} should be parsed as.
+ * @param maxGaps Maximum number of allowed gaps between sub-intervals resulting from tokens.
+ * @param ordered Whether sub-intervals should enforce token ordering or not.
+ * @return Returns an {@link IntervalsSource} that matches tokens acquired from analysis of {@code
+ * text}. Possibly an empty interval source, never {@code null}.
+ * @throws IOException If an I/O exception occurs.
+ */
+ public static IntervalsSource analyzedText(
+ String text, Analyzer analyzer, String field, int maxGaps, boolean ordered)
+ throws IOException {
+ try (TokenStream ts = analyzer.tokenStream(field, text)) {
+ return analyzedText(ts, maxGaps, ordered);
+ }
+ }
+
+ /**
+ * Returns intervals that correspond to tokens from the provided {@link TokenStream}. This is a
+ * low-level counterpart to {@link #analyzedText(String, Analyzer, String, int, boolean)}. The
+ * intervals can be ordered or unordered and can have optional gaps inside.
+ *
+ * @param tokenStream The token stream to produce intervals for. The token stream may be fully or
+ * partially consumed after returning from this method.
+ * @param maxGaps Maximum number of allowed gaps between sub-intervals resulting from tokens.
+ * @param ordered Whether sub-intervals should enforce token ordering or not.
+ * @return Returns an {@link IntervalsSource} that matches tokens acquired from analysis of {@code
+ * text}. Possibly an empty interval source, never {@code null}.
+ * @throws IOException If an I/O exception occurs.
+ */
+ public static IntervalsSource analyzedText(TokenStream tokenStream, int maxGaps, boolean ordered)
+ throws IOException {
+ CachingTokenFilter stream =
+ tokenStream instanceof CachingTokenFilter
+ ? (CachingTokenFilter) tokenStream
+ : new CachingTokenFilter(tokenStream);
+
+ return IntervalBuilder.analyzeText(stream, maxGaps, ordered);
+ }
}
diff --git a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervalBuilder.java b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervalBuilder.java
new file mode 100644
index 0000000..a51a690
--- /dev/null
+++ b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervalBuilder.java
@@ -0,0 +1,216 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Code adopted from ASL-licensed Elasticsearch.
+ * https://github.com/elastic/elasticsearch/blob/7.10/server/src/test/java/org/elasticsearch/index/query/IntervalBuilderTests.java
+ *
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.lucene.queries.intervals;
+
+import java.io.IOException;
+import org.apache.lucene.analysis.CachingTokenFilter;
+import org.apache.lucene.analysis.CannedTokenStream;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestIntervalBuilder extends LuceneTestCase {
+ public void testSimpleTerm() throws IOException {
+ CannedTokenStream ts = new CannedTokenStream(new Token("term1", 1, 2));
+
+ IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true);
+ IntervalsSource expected = Intervals.term("term1");
+
+ assertEquals(expected, source);
+ }
+
+ public void testOrdered() throws IOException {
+ CannedTokenStream ts =
+ new CannedTokenStream(
+ new Token("term1", 1, 2), new Token("term2", 3, 4), new Token("term3", 5, 6));
+
+ IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true);
+ IntervalsSource expected =
+ Intervals.ordered(
+ Intervals.term("term1"), Intervals.term("term2"), Intervals.term("term3"));
+
+ assertEquals(expected, source);
+ }
+
+ public void testUnordered() throws IOException {
+ CannedTokenStream ts =
+ new CannedTokenStream(
+ new Token("term1", 1, 2), new Token("term2", 3, 4), new Token("term3", 5, 6));
+
+ IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, false);
+ IntervalsSource expected =
+ Intervals.unordered(
+ Intervals.term("term1"), Intervals.term("term2"), Intervals.term("term3"));
+
+ assertEquals(expected, source);
+ }
+
+ public void testPhrase() throws IOException {
+ CannedTokenStream ts =
+ new CannedTokenStream(
+ new Token("term1", 1, 2), new Token("term2", 3, 4), new Token("term3", 5, 6));
+
+ IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), 0, true);
+ IntervalsSource expected =
+ Intervals.phrase(Intervals.term("term1"), Intervals.term("term2"), Intervals.term("term3"));
+
+ assertEquals(expected, source);
+ }
+
+ public void testPhraseWithStopword() throws IOException {
+ CannedTokenStream ts =
+ new CannedTokenStream(new Token("term1", 1, 1, 2), new Token("term3", 2, 5, 6));
+
+ IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), 0, true);
+ IntervalsSource expected =
+ Intervals.phrase(Intervals.term("term1"), Intervals.extend(Intervals.term("term3"), 1, 0));
+
+ assertEquals(expected, source);
+ }
+
+ public void testEmptyTokenStream() throws IOException {
+ CannedTokenStream ts = new CannedTokenStream();
+ IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), 0, true);
+ assertSame(IntervalBuilder.NO_INTERVALS, source);
+ }
+
+ public void testSimpleSynonyms() throws IOException {
+ CannedTokenStream ts =
+ new CannedTokenStream(
+ new Token("term1", 1, 2),
+ new Token("term2", 3, 4),
+ new Token("term4", 0, 3, 4),
+ new Token("term3", 5, 6));
+
+ IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true);
+ IntervalsSource expected =
+ Intervals.ordered(
+ Intervals.term("term1"),
+ Intervals.or(Intervals.term("term2"), Intervals.term("term4")),
+ Intervals.term("term3"));
+
+ assertEquals(expected, source);
+ }
+
+ public void testSimpleSynonymsWithGap() throws IOException {
+ // term1 [] term2/term3/term4 term5
+ CannedTokenStream ts =
+ new CannedTokenStream(
+ new Token("term1", 1, 2),
+ new Token("term2", 2, 3, 4),
+ new Token("term3", 0, 3, 4),
+ new Token("term4", 0, 3, 4),
+ new Token("term5", 5, 6));
+
+ IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true);
+ IntervalsSource expected =
+ Intervals.ordered(
+ Intervals.term("term1"),
+ Intervals.extend(
+ Intervals.or(
+ Intervals.term("term2"), Intervals.term("term3"), Intervals.term("term4")),
+ 1,
+ 0),
+ Intervals.term("term5"));
+ assertEquals(expected, source);
+ }
+
+ public void testGraphSynonyms() throws IOException {
+ // term1 term2:2/term3 term4 term5
+ CannedTokenStream ts =
+ new CannedTokenStream(
+ new Token("term1", 1, 2),
+ new Token("term2", 1, 3, 4, 2),
+ new Token("term3", 0, 3, 4),
+ new Token("term4", 5, 6),
+ new Token("term5", 6, 7));
+
+ IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true);
+ IntervalsSource expected =
+ Intervals.ordered(
+ Intervals.term("term1"),
+ Intervals.or(Intervals.term("term2"), Intervals.phrase("term3", "term4")),
+ Intervals.term("term5"));
+
+ assertEquals(expected, source);
+ }
+
+ public void testGraphSynonymsWithGaps() throws IOException {
+ // term1 [] term2:4/term3 [] [] term4 term5
+ CannedTokenStream ts =
+ new CannedTokenStream(
+ new Token("term1", 1, 2),
+ new Token("term2", 2, 3, 4, 4),
+ new Token("term3", 0, 3, 4),
+ new Token("term4", 3, 5, 6),
+ new Token("term5", 6, 7));
+
+ IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true);
+ IntervalsSource expected =
+ Intervals.ordered(
+ Intervals.term("term1"),
+ Intervals.or(
+ Intervals.extend(Intervals.term("term2"), 1, 0),
+ Intervals.phrase(
+ Intervals.extend(Intervals.term("term3"), 1, 0),
+ Intervals.extend(Intervals.term("term4"), 2, 0))),
+ Intervals.term("term5"));
+
+ assertEquals(expected, source);
+ }
+
+ public void testGraphTerminatesOnGap() throws IOException {
+ // term1 term2:2/term3 term4 [] term5
+ CannedTokenStream ts =
+ new CannedTokenStream(
+ new Token("term1", 1, 2),
+ new Token("term2", 1, 2, 3, 2),
+ new Token("term3", 0, 2, 3),
+ new Token("term4", 2, 3),
+ new Token("term5", 2, 6, 7));
+
+ IntervalsSource source = IntervalBuilder.analyzeText(new CachingTokenFilter(ts), -1, true);
+ IntervalsSource expected =
+ Intervals.ordered(
+ Intervals.term("term1"),
+ Intervals.or(Intervals.term("term2"), Intervals.phrase("term3", "term4")),
+ Intervals.extend(Intervals.term("term5"), 1, 0));
+ assertEquals(expected, source);
+ }
+}