You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2017/01/03 11:49:09 UTC
lucene-solr:branch_6x: LUCENE-6664: be more robust to broken token stream offsets
Repository: lucene-solr
Updated Branches:
refs/heads/branch_6x 5b6401b21 -> c35fbbd32
LUCENE-6664: be more robust to broken token stream offsets
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/c35fbbd3
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/c35fbbd3
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/c35fbbd3
Branch: refs/heads/branch_6x
Commit: c35fbbd328687f5e309fcb00acf6169122f2a009
Parents: 5b6401b
Author: Mike McCandless <mi...@apache.org>
Authored: Tue Jan 3 06:47:47 2017 -0500
Committer: Mike McCandless <mi...@apache.org>
Committed: Tue Jan 3 06:48:08 2017 -0500
----------------------------------------------------------------------
.../analysis/synonym/FlattenGraphFilter.java | 31 ++++++++------------
1 file changed, 12 insertions(+), 19 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c35fbbd3/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilter.java
index 7ede190..c1fa1f7 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilter.java
@@ -17,22 +17,6 @@
package org.apache.lucene.analysis.synonym;
-/**
- * This filter "casts" token graphs down into a "flat" form,
- * for indexing. This is an inherently lossy process: nodes (positions)
- * along side paths are forcefully merged.
- *
- * <p>In general this means the output graph will accept token sequences
- * that the input graph did not accept, and will also fail to accept
- * token sequences that the input graph did accept.
- *
- * <p>This is only necessary at indexing time because Lucene cannot yet index
- * an arbitrary token graph. At search time there are better options, e.g.
- * the experimental <code>TermAutomatonQuery</code> in sandbox.
- *
- * @lucene.experimental
- */
-
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
@@ -49,7 +33,12 @@ import org.apache.lucene.util.RollingBuffer;
* Converts an incoming graph token stream, such as one from
* {@link SynonymGraphFilter}, into a flat form so that
* all nodes form a single linear chain with no side paths. Every
- * path through the graph touches every node.
+ * path through the graph touches every node. This is necessary
+ * when indexing a graph token stream, because the index does not
+ * save {@link PositionLengthAttribute} and so it cannot
+ * preserve the graph structure. However, at search time,
+ * query parsers can correctly handle the graph and this token
+ * filter should <b>not</b> be used.
*
* <p>If the graph was not already flat to start, this
* is likely a lossy process, i.e. it will often cause the
@@ -234,7 +223,11 @@ public final class FlattenGraphFilter extends TokenFilter {
// which would otherwise happen if the replacement has more tokens
// than the input:
int startOffset = Math.max(lastStartOffset, output.startOffset);
- offsetAtt.setOffset(startOffset, outputEndNode.endOffset);
+
+ // We must do this in case the incoming tokens have broken offsets:
+ int endOffset = Math.max(startOffset, outputEndNode.endOffset);
+
+ offsetAtt.setOffset(startOffset, endOffset);
lastStartOffset = startOffset;
if (inputNode.nextOut == inputNode.tokens.size()) {
@@ -382,7 +375,7 @@ public final class FlattenGraphFilter extends TokenFilter {
// NOTE, shady: don't call super.end, because we did already from incrementToken
}
- clearAttributes();
+ clearAttributes();
if (done) {
// On exc, done is false, and we will not have set these:
posIncAtt.setPositionIncrement(finalPosInc);