You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2017/01/03 11:49:09 UTC
lucene-solr:branch_6x: LUCENE-6664: be more robust to broken token stream offsets

Repository: lucene-solr
Updated Branches:
  refs/heads/branch_6x 5b6401b21 -> c35fbbd32


LUCENE-6664: be more robust to broken token stream offsets


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/c35fbbd3
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/c35fbbd3
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/c35fbbd3

Branch: refs/heads/branch_6x
Commit: c35fbbd328687f5e309fcb00acf6169122f2a009
Parents: 5b6401b
Author: Mike McCandless <mi...@apache.org>
Authored: Tue Jan 3 06:47:47 2017 -0500
Committer: Mike McCandless <mi...@apache.org>
Committed: Tue Jan 3 06:48:08 2017 -0500

----------------------------------------------------------------------
 .../analysis/synonym/FlattenGraphFilter.java    | 31 ++++++++------------
 1 file changed, 12 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c35fbbd3/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilter.java
index 7ede190..c1fa1f7 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/FlattenGraphFilter.java
@@ -17,22 +17,6 @@
 
 package org.apache.lucene.analysis.synonym;
 
-/**
- * This filter "casts" token graphs down into a "flat" form,
- * for indexing.   This is an inherently lossy process: nodes (positions)
- * along side paths are forcefully merged.
- *
- * <p>In general this means the output graph will accept token sequences
- * that the input graph did not accept, and will also fail to accept
- * token sequences that the input graph did accept.
- *
- * <p>This is only necessary at indexing time because Lucene cannot yet index
- * an arbitrary token graph.  At search time there are better options, e.g.
- * the experimental <code>TermAutomatonQuery</code> in sandbox.
- *
- * @lucene.experimental
- */
-
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
@@ -49,7 +33,12 @@ import org.apache.lucene.util.RollingBuffer;
  * Converts an incoming graph token stream, such as one from
  * {@link SynonymGraphFilter}, into a flat form so that
  * all nodes form a single linear chain with no side paths.  Every
- * path through the graph touches every node.
+ * path through the graph touches every node.  This is necessary
+ * when indexing a graph token stream, because the index does not
+ * save {@link PositionLengthAttribute} and so it cannot
+ * preserve the graph structure.  However, at search time,
+ * query parsers can correctly handle the graph and this token
+ * filter should <b>not</b> be used.
  *
  * <p>If the graph was not already flat to start, this
  * is likely a lossy process, i.e. it will often cause the 
@@ -234,7 +223,11 @@ public final class FlattenGraphFilter extends TokenFilter {
         // which would otherwise happen if the replacement has more tokens
         // than the input:
         int startOffset = Math.max(lastStartOffset, output.startOffset);
-        offsetAtt.setOffset(startOffset, outputEndNode.endOffset);
+
+        // Never let endOffset fall before startOffset, which can happen when the incoming tokens have broken offsets:
+        int endOffset = Math.max(startOffset, outputEndNode.endOffset);
+        
+        offsetAtt.setOffset(startOffset, endOffset);
         lastStartOffset = startOffset;
 
         if (inputNode.nextOut == inputNode.tokens.size()) {
@@ -382,7 +375,7 @@ public final class FlattenGraphFilter extends TokenFilter {
       // NOTE, shady: don't call super.end, because we did already from incrementToken
     }
 
-   clearAttributes();
+    clearAttributes();
     if (done) {
       // On exc, done is false, and we will not have set these:
       posIncAtt.setPositionIncrement(finalPosInc);