Posted to commits@lucene.apache.org by ho...@apache.org on 2017/01/16 00:10:57 UTC

[39/50] [abbrv] lucene-solr:jira/solr-5944: LUCENE-7626: IndexWriter no longer accepts broken offsets

LUCENE-7626: IndexWriter no longer accepts broken offsets


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/64b86331
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/64b86331
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/64b86331

Branch: refs/heads/jira/solr-5944
Commit: 64b86331c29d074fa7b257d65d3fda3b662bf96a
Parents: 5b3565e
Author: Mike McCandless <mi...@apache.org>
Authored: Fri Jan 13 17:46:02 2017 -0500
Committer: Mike McCandless <mi...@apache.org>
Committed: Fri Jan 13 17:46:02 2017 -0500

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |   3 +
 .../miscellaneous/FixBrokenOffsetsFilter.java   |  78 ++++++++++++
 .../FixBrokenOffsetsFilterFactory.java          |  39 ++++++
 ...ache.lucene.analysis.util.TokenFilterFactory |   1 +
 .../TestFixBrokenOffsetsFilter.java             |  50 ++++++++
 .../apache/lucene/index/FixBrokenOffsets.java   | 125 +++++++++++++++++++
 .../java/org/apache/lucene/index/package.html   |  27 ++++
 .../lucene/index/TestFixBrokenOffsets.java      | 114 +++++++++++++++++
 .../lucene/index/index.630.brokenoffsets.zip    | Bin 0 -> 3203 bytes
 .../org/apache/lucene/index/CheckIndex.java     |  29 +++--
 .../lucene/index/DefaultIndexingChain.java      |  20 ++-
 .../org/apache/lucene/index/TestCheckIndex.java |   5 -
 .../search/highlight/TokenSourcesTest.java      |   2 +-
 .../lucene/search/TestTermAutomatonQuery.java   |   3 +
 .../index/BaseTermVectorsFormatTestCase.java    |  17 +--
 .../apache/lucene/index/BaseTestCheckIndex.java |  19 ---
 .../java/org/apache/lucene/util/TestUtil.java   |   4 +-
 .../apache/solr/schema/PreAnalyzedField.java    |  11 ++
 .../solr/index/hdfs/CheckHdfsIndexTest.java     |   5 -
 19 files changed, 480 insertions(+), 72 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/64b86331/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 040f4e0..30943d2 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -29,6 +29,9 @@ API Changes
 
 Bug Fixes
 
+* LUCENE-7626: IndexWriter will no longer accept broken token offsets
+  (Mike McCandless)
+
 Improvements
 
 * LUCENE-7489: Better storage of sparse doc-values fields with the default
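
In practical terms, indexing a token stream whose offsets go backwards now fails at addDocument time with an IllegalArgumentException instead of being silently accepted. A minimal test-style sketch, borrowing the helpers and the canned tokens from the tests later in this diff (directory, field name and offsets are illustrative):

    Directory dir = newDirectory();
    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(true);
    Field field = new Field("foo", "", ft);
    // second token's offsets (1,4) go backwards relative to the first (5,10):
    field.setTokenStream(new CannedTokenStream(new Token("bar", 5, 10), new Token("bar", 1, 4)));
    doc.add(field);
    expectThrows(IllegalArgumentException.class, () -> iw.addDocument(doc));
    iw.close();
    dir.close();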

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/64b86331/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FixBrokenOffsetsFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FixBrokenOffsetsFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FixBrokenOffsetsFilter.java
new file mode 100644
index 0000000..b0a6b1d
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FixBrokenOffsetsFilter.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+
+/** 
+ * A filter to correct offsets that illegally go backwards.
+ *
+ * @deprecated Fix the token filters that create broken offsets in the first place.
+ */
+@Deprecated
+public final class FixBrokenOffsetsFilter extends TokenFilter {
+
+  private int lastStartOffset;
+  private int lastEndOffset;
+
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+  public FixBrokenOffsetsFilter(TokenStream in) {
+    super(in);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken() == false) {
+      return false;
+    }
+    fixOffsets();
+    return true;
+  }
+
+  @Override
+  public void end() throws IOException {
+    super.end();
+    fixOffsets();
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    lastStartOffset = 0;
+    lastEndOffset = 0;
+  }
+
+  private void fixOffsets() {
+    int startOffset = offsetAtt.startOffset();
+    int endOffset = offsetAtt.endOffset();
+    if (startOffset < lastStartOffset) {
+      startOffset = lastStartOffset;
+    }
+    if (endOffset < startOffset) {
+      endOffset = startOffset;
+    }
+    offsetAtt.setOffset(startOffset, endOffset);
+    lastStartOffset = startOffset;
+    lastEndOffset = endOffset;
+  }
+}
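
Until the offending analysis components are fixed, this deprecated filter can be chained in front of IndexWriter to clamp offsets forward. A minimal sketch, using the same canned tokens as TestFixBrokenOffsetsFilter below (the wrapped stream and its offsets are illustrative):

    TokenStream broken = new CannedTokenStream(new Token("bar", 5, 10), new Token("bar", 1, 4));
    TokenStream fixed = new FixBrokenOffsetsFilter(broken);
    // startOffset is clamped to lastStartOffset, and endOffset to startOffset,
    // so the stream IndexWriter consumes never goes backwards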

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/64b86331/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FixBrokenOffsetsFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FixBrokenOffsetsFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FixBrokenOffsetsFilterFactory.java
new file mode 100644
index 0000000..8484b8c
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/FixBrokenOffsetsFilterFactory.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link FixBrokenOffsetsFilter}.
+ */
+public class FixBrokenOffsetsFilterFactory extends TokenFilterFactory {
+
+  /** Sole constructor */
+  public FixBrokenOffsetsFilterFactory(Map<String,String> args) {
+    super(args);
+  }
+
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new FixBrokenOffsetsFilter(input);
+  }
+}
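
Because the factory is registered through SPI (see the META-INF/services change below), it can also be referenced by name from analyzer-building code. A hedged sketch using CustomAnalyzer, assuming the usual name derivation from FixBrokenOffsetsFilterFactory (factory lookup is case-insensitive):

    Analyzer analyzer = CustomAnalyzer.builder()
        .withTokenizer("whitespace")
        .addTokenFilter("fixBrokenOffsets")
        .build();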

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/64b86331/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
index 73986d7..5f8894c 100644
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@@ -64,6 +64,7 @@ org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory
 org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory
 org.apache.lucene.analysis.miscellaneous.DateRecognizerFilterFactory
 org.apache.lucene.analysis.miscellaneous.FingerprintFilterFactory
+org.apache.lucene.analysis.miscellaneous.FixBrokenOffsetsFilterFactory
 org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory
 org.apache.lucene.analysis.miscellaneous.KeepWordFilterFactory
 org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/64b86331/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFixBrokenOffsetsFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFixBrokenOffsetsFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFixBrokenOffsetsFilter.java
new file mode 100644
index 0000000..ada5014
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestFixBrokenOffsetsFilter.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CannedTokenStream;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.store.Directory;
+
+public class TestFixBrokenOffsetsFilter extends BaseTokenStreamTestCase {
+
+  public void testBogusTermVectors() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
+    Document doc = new Document();
+    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
+    ft.setStoreTermVectors(true);
+    ft.setStoreTermVectorOffsets(true);
+    Field field = new Field("foo", "", ft);
+    field.setTokenStream(new FixBrokenOffsetsFilter(new CannedTokenStream(
+        new Token("bar", 5, 10), new Token("bar", 1, 4)
+        )));
+    doc.add(field);
+    iw.addDocument(doc);
+    iw.close();
+    dir.close();
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/64b86331/lucene/backward-codecs/src/java/org/apache/lucene/index/FixBrokenOffsets.java
----------------------------------------------------------------------
diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/index/FixBrokenOffsets.java b/lucene/backward-codecs/src/java/org/apache/lucene/index/FixBrokenOffsets.java
new file mode 100644
index 0000000..d4d6f85
--- /dev/null
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/index/FixBrokenOffsets.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.index;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.List;
+
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.SuppressForbidden;
+
+/**
+ * Command-line tool that reads from a source index and
+ * writes to a dest index, correcting any broken offsets
+ * in the process.
+ *
+ * @lucene.experimental
+ */
+public class FixBrokenOffsets {
+  public SegmentInfos infos;
+
+  FSDirectory fsDir;
+
+  Path dir;
+
+  @SuppressForbidden(reason = "System.out required: command line tool")
+  public static void main(String[] args) throws IOException {
+    if (args.length < 2) {
+      System.err.println("Usage: FixBrokenOffsets <srcDir> <destDir>");
+      return;
+    }
+    Path srcPath = Paths.get(args[0]);
+    if (!Files.exists(srcPath)) {
+      throw new RuntimeException("srcPath " + srcPath.toAbsolutePath() + " doesn't exist");
+    }
+    Path destPath = Paths.get(args[1]);
+    if (Files.exists(destPath)) {
+      throw new RuntimeException("destPath " + destPath.toAbsolutePath() + " already exists; please remove it and re-run");
+    }
+    Directory srcDir = FSDirectory.open(srcPath);
+    DirectoryReader reader = DirectoryReader.open(srcDir);
+
+    List<LeafReaderContext> leaves = reader.leaves();
+    CodecReader[] filtered = new CodecReader[leaves.size()];
+    for(int i=0;i<leaves.size();i++) {
+      filtered[i] = SlowCodecReaderWrapper.wrap(new FilterLeafReader(leaves.get(i).reader()) {
+          @Override
+          public Fields getTermVectors(int docID) throws IOException {
+            Fields termVectors = in.getTermVectors(docID);
+            if (termVectors == null) {
+              return null;
+            }
+            return new FilterFields(termVectors) {
+              @Override
+              public Terms terms(String field) throws IOException {
+                return new FilterTerms(super.terms(field)) {
+                  @Override
+                  public TermsEnum iterator() throws IOException {
+                    return new FilterTermsEnum(super.iterator()) {
+                      @Override
+                      public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
+                        return new FilterPostingsEnum(super.postings(reuse, flags)) {
+                          int nextLastStartOffset = 0;
+                          int lastStartOffset = 0;
+
+                          @Override
+                          public int nextPosition() throws IOException {
+                            int pos = super.nextPosition();
+                            lastStartOffset = nextLastStartOffset;
+                            nextLastStartOffset = startOffset();
+                            return pos;
+                          }
+                          
+                          @Override
+                          public int startOffset() throws IOException {
+                            int offset = super.startOffset();
+                            if (offset < lastStartOffset) {
+                              offset = lastStartOffset;
+                            }
+                            return offset;
+                          }
+                          
+                          @Override
+                          public int endOffset() throws IOException {
+                            int offset = super.endOffset();
+                            if (offset < lastStartOffset) {
+                              offset = lastStartOffset;
+                            }
+                            return offset;
+                          }
+                        };
+                      }
+                    };
+                  }
+                };
+              }
+            };
+          }
+        });
+    }
+
+    Directory destDir = FSDirectory.open(destPath);
+    IndexWriter writer = new IndexWriter(destDir, new IndexWriterConfig());
+    writer.addIndexes(filtered);
+    IOUtils.close(writer, reader, srcDir, destDir);
+  }
+}
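
Per the usage message, the tool takes a source and a destination directory, and the destination must not already exist. Driving it from Java looks like the test below does; the paths here are placeholders:

    FixBrokenOffsets.main(new String[] {"/path/to/brokenIndex", "/path/to/fixedIndex"});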

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/64b86331/lucene/backward-codecs/src/java/org/apache/lucene/index/package.html
----------------------------------------------------------------------
diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/index/package.html b/lucene/backward-codecs/src/java/org/apache/lucene/index/package.html
new file mode 100644
index 0000000..42ff91a
--- /dev/null
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/index/package.html
@@ -0,0 +1,27 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- not a package-info.java, because we already defined this package in core/ -->
+<html>
+<head>
+  <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+  <title>Tools for handling backwards compatibility issues with indices.</title>
+</head>
+<body>
+Tools for handling backwards compatibility issues with indices.
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/64b86331/lucene/backward-codecs/src/test/org/apache/lucene/index/TestFixBrokenOffsets.java
----------------------------------------------------------------------
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/index/TestFixBrokenOffsets.java b/lucene/backward-codecs/src/test/org/apache/lucene/index/TestFixBrokenOffsets.java
new file mode 100644
index 0000000..bcd5a65
--- /dev/null
+++ b/lucene/backward-codecs/src/test/org/apache/lucene/index/TestFixBrokenOffsets.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.index;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Path;
+import java.util.List;
+
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.MockDirectoryWrapper;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+
+public class TestFixBrokenOffsets extends LuceneTestCase {
+
+  // Run this in Lucene 6.x:
+  //
+  //     ant test -Dtestcase=TestFixBrokenOffsets -Dtestmethod=testCreateBrokenOffsetsIndex -Dtests.codec=default -Dtests.useSecurityManager=false
+  /*
+  public void testCreateBrokenOffsetsIndex() throws IOException {
+
+    Path indexDir = Paths.get("/tmp/brokenoffsets");
+    Files.deleteIfExists(indexDir);
+    Directory dir = newFSDirectory(indexDir);
+    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig());
+
+    Document doc = new Document();
+    FieldType fieldType = new FieldType(TextField.TYPE_STORED);
+    fieldType.setStoreTermVectors(true);
+    fieldType.setStoreTermVectorPositions(true);
+    fieldType.setStoreTermVectorOffsets(true);
+    Field field = new Field("foo", "bar", fieldType);
+    field.setTokenStream(new CannedTokenStream(new Token("foo", 10, 13), new Token("foo", 7, 9)));
+    doc.add(field);
+    writer.addDocument(doc);
+    writer.commit();
+
+    // 2nd segment
+    doc = new Document();
+    field = new Field("foo", "bar", fieldType);
+    field.setTokenStream(new CannedTokenStream(new Token("bar", 15, 17), new Token("bar", 1, 5)));
+    doc.add(field);
+    writer.addDocument(doc);
+    
+    writer.close();
+
+    dir.close();
+  }
+  */
+
+  public void testFixBrokenOffsetsIndex() throws IOException {
+    InputStream resource = getClass().getResourceAsStream("index.630.brokenoffsets.zip");
+    assertNotNull("Broken offsets index not found", resource);
+    Path path = createTempDir("brokenoffsets");
+    TestUtil.unzip(resource, path);
+    Directory dir = FSDirectory.open(path);
+
+    // OK: index is 6.3.0 so offsets not checked:
+    TestUtil.checkIndex(dir);
+    
+    MockDirectoryWrapper tmpDir = newMockDirectory();
+    tmpDir.setCheckIndexOnClose(false);
+    IndexWriter w = new IndexWriter(tmpDir, new IndexWriterConfig());
+    w.addIndexes(dir);
+    w.close();
+    // OK: addIndexes(Directory...) also keeps version as 6.3.0, so offsets not checked:
+    TestUtil.checkIndex(tmpDir);
+    tmpDir.close();
+
+    final MockDirectoryWrapper tmpDir2 = newMockDirectory();
+    tmpDir2.setCheckIndexOnClose(false);
+    w = new IndexWriter(tmpDir2, new IndexWriterConfig());
+    DirectoryReader reader = DirectoryReader.open(dir);
+    List<LeafReaderContext> leaves = reader.leaves();
+    CodecReader[] codecReaders = new CodecReader[leaves.size()];
+    for(int i=0;i<leaves.size();i++) {
+      codecReaders[i] = (CodecReader) leaves.get(i).reader();
+    }
+    w.addIndexes(codecReaders);
+    w.close();
+
+    // NOT OK: broken offsets were copied into a 7.0 segment:
+    ByteArrayOutputStream output = new ByteArrayOutputStream(1024);    
+    RuntimeException re = expectThrows(RuntimeException.class, () -> {TestUtil.checkIndex(tmpDir2, false, true, output);});
+    assertEquals("term [66 6f 6f]: doc 0: pos 1: startOffset 7 < lastStartOffset 10; consider using the FixBrokenOffsets tool in Lucene's backward-codecs module to correct your index", re.getMessage());
+    tmpDir2.close();
+
+    // Now run the tool and confirm the broken offsets are fixed:
+    Path path2 = createTempDir("fixedbrokenoffsets").resolve("subdir");
+    FixBrokenOffsets.main(new String[] {path.toString(), path2.toString()});
+    Directory tmpDir3 = FSDirectory.open(path2);
+    TestUtil.checkIndex(tmpDir3);
+    tmpDir3.close();
+    
+    dir.close();
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/64b86331/lucene/backward-codecs/src/test/org/apache/lucene/index/index.630.brokenoffsets.zip
----------------------------------------------------------------------
diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/index/index.630.brokenoffsets.zip b/lucene/backward-codecs/src/test/org/apache/lucene/index/index.630.brokenoffsets.zip
new file mode 100644
index 0000000..3cf476a
Binary files /dev/null and b/lucene/backward-codecs/src/test/org/apache/lucene/index/index.630.brokenoffsets.zip differ

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/64b86331/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
index fd8011d..3bb10d3 100644
--- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
+++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
@@ -740,13 +740,13 @@ public final class CheckIndex implements Closeable {
           segInfoStat.fieldNormStatus = testFieldNorms(reader, infoStream, failFast);
 
           // Test the Term Index
-          segInfoStat.termIndexStatus = testPostings(reader, infoStream, verbose, failFast);
+          segInfoStat.termIndexStatus = testPostings(reader, infoStream, verbose, failFast, version);
 
           // Test Stored Fields
           segInfoStat.storedFieldStatus = testStoredFields(reader, infoStream, failFast);
 
           // Test Term Vectors
-          segInfoStat.termVectorStatus = testTermVectors(reader, infoStream, verbose, crossCheckTermVectors, failFast);
+          segInfoStat.termVectorStatus = testTermVectors(reader, infoStream, verbose, crossCheckTermVectors, failFast, version);
 
           // Test Docvalues
           segInfoStat.docValuesStatus = testDocValues(reader, infoStream, failFast);
@@ -1205,7 +1205,7 @@ public final class CheckIndex implements Closeable {
    * checks Fields api is consistent with itself.
    * searcher is optional, to verify with queries. Can be null.
    */
-  private static Status.TermIndexStatus checkFields(Fields fields, Bits liveDocs, int maxDoc, FieldInfos fieldInfos, boolean doPrint, boolean isVectors, PrintStream infoStream, boolean verbose) throws IOException {
+  private static Status.TermIndexStatus checkFields(Fields fields, Bits liveDocs, int maxDoc, FieldInfos fieldInfos, boolean doPrint, boolean isVectors, PrintStream infoStream, boolean verbose, Version version) throws IOException {
     // TODO: we should probably return our own stats thing...?!
     long startNS;
     if (doPrint) {
@@ -1461,14 +1461,13 @@ public final class CheckIndex implements Closeable {
               if (hasOffsets) {
                 int startOffset = postings.startOffset();
                 int endOffset = postings.endOffset();
-                // NOTE: we cannot enforce any bounds whatsoever on vectors... they were a free-for-all before?
-                // but for offsets in the postings lists these checks are fine: they were always enforced by IndexWriter
-                if (!isVectors) {
+                // In Lucene 7 we fixed IndexWriter to also enforce term vector offsets
+                if (isVectors == false || version.onOrAfter(Version.LUCENE_7_0_0)) {
                   if (startOffset < 0) {
                     throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": startOffset " + startOffset + " is out of bounds");
                   }
                   if (startOffset < lastOffset) {
-                    throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": startOffset " + startOffset + " < lastStartOffset " + lastOffset);
+                    throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": startOffset " + startOffset + " < lastStartOffset " + lastOffset + "; consider using the FixBrokenOffsets tool in Lucene's backward-codecs module to correct your index");
                   }
                   if (endOffset < 0) {
                     throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + ": endOffset " + endOffset + " is out of bounds");
@@ -1742,15 +1741,15 @@ public final class CheckIndex implements Closeable {
    * Test the term index.
    * @lucene.experimental
    */
-  public static Status.TermIndexStatus testPostings(CodecReader reader, PrintStream infoStream) throws IOException {
-    return testPostings(reader, infoStream, false, false);
+  public static Status.TermIndexStatus testPostings(CodecReader reader, PrintStream infoStream, Version version) throws IOException {
+    return testPostings(reader, infoStream, false, false, version);
   }
   
   /**
    * Test the term index.
    * @lucene.experimental
    */
-  public static Status.TermIndexStatus testPostings(CodecReader reader, PrintStream infoStream, boolean verbose, boolean failFast) throws IOException {
+  public static Status.TermIndexStatus testPostings(CodecReader reader, PrintStream infoStream, boolean verbose, boolean failFast, Version version) throws IOException {
 
     // TODO: we should go and verify term vectors match, if
     // crossCheckTermVectors is on...
@@ -1765,7 +1764,7 @@ public final class CheckIndex implements Closeable {
 
       final Fields fields = reader.getPostingsReader().getMergeInstance();
       final FieldInfos fieldInfos = reader.getFieldInfos();
-      status = checkFields(fields, reader.getLiveDocs(), maxDoc, fieldInfos, true, false, infoStream, verbose);
+      status = checkFields(fields, reader.getLiveDocs(), maxDoc, fieldInfos, true, false, infoStream, verbose, version);
     } catch (Throwable e) {
       if (failFast) {
         IOUtils.reThrow(e);
@@ -2339,15 +2338,15 @@ public final class CheckIndex implements Closeable {
    * Test term vectors.
    * @lucene.experimental
    */
-  public static Status.TermVectorStatus testTermVectors(CodecReader reader, PrintStream infoStream) throws IOException {
-    return testTermVectors(reader, infoStream, false, false, false);
+  public static Status.TermVectorStatus testTermVectors(CodecReader reader, PrintStream infoStream, Version version) throws IOException {
+    return testTermVectors(reader, infoStream, false, false, false, version);
   }
 
   /**
    * Test term vectors.
    * @lucene.experimental
    */
-  public static Status.TermVectorStatus testTermVectors(CodecReader reader, PrintStream infoStream, boolean verbose, boolean crossCheckTermVectors, boolean failFast) throws IOException {
+  public static Status.TermVectorStatus testTermVectors(CodecReader reader, PrintStream infoStream, boolean verbose, boolean crossCheckTermVectors, boolean failFast, Version version) throws IOException {
     long startNS = System.nanoTime();
     final Status.TermVectorStatus status = new Status.TermVectorStatus();
     final FieldInfos fieldInfos = reader.getFieldInfos();
@@ -2387,7 +2386,7 @@ public final class CheckIndex implements Closeable {
           
           if (tfv != null) {
             // First run with no deletions:
-            checkFields(tfv, null, 1, fieldInfos, false, true, infoStream, verbose);
+            checkFields(tfv, null, 1, fieldInfos, false, true, infoStream, verbose, version);
             
             // Only agg stats if the doc is live:
             final boolean doStats = liveDocs == null || liveDocs.get(j);
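
With the new Version parameter, offset checks on term vectors apply only to segments written by Lucene 7.0 or later; pre-7.0 segments are still admitted as-is. Callers of the experimental per-section entry points now pass the segment version explicitly, as the TestUtil change further down does (reader and infoStream are placeholders here):

    CheckIndex.testPostings(codecReader, infoStream, false, true, Version.LUCENE_7_0_0);
    CheckIndex.testTermVectors(codecReader, infoStream, false, false, true, Version.LUCENE_7_0_0);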

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/64b86331/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
index 79c285b..197ab31 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
@@ -27,6 +27,7 @@ import java.util.Map;
 import java.util.Set;
 
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.codecs.DocValuesConsumer;
 import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.NormsConsumer;
@@ -728,10 +729,6 @@ final class DefaultIndexingChain extends DocConsumer {
 
       final boolean analyzed = fieldType.tokenized() && docState.analyzer != null;
         
-      // only bother checking offsets if something will consume them.
-      // TODO: after we fix analyzers, also check if termVectorOffsets will be indexed.
-      final boolean checkOffsets = indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
-
       /*
        * To assist people in tracking down problems in analysis components, we wish to write the field name to the infostream
        * when we fail. We expect some caller to eventually deal with the real exception, so we don't want any 'catch' clauses,
@@ -743,6 +740,7 @@ final class DefaultIndexingChain extends DocConsumer {
         stream.reset();
         invertState.setAttributeSource(stream);
         termsHashPerField.start(field, first);
+        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
 
         while (stream.incrementToken()) {
 
@@ -771,15 +769,13 @@ final class DefaultIndexingChain extends DocConsumer {
             invertState.numOverlap++;
           }
               
-          if (checkOffsets) {
-            int startOffset = invertState.offset + invertState.offsetAttribute.startOffset();
-            int endOffset = invertState.offset + invertState.offsetAttribute.endOffset();
-            if (startOffset < invertState.lastStartOffset || endOffset < startOffset) {
-              throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, and offsets must not go backwards "
-                                                 + "startOffset=" + startOffset + ",endOffset=" + endOffset + ",lastStartOffset=" + invertState.lastStartOffset + " for field '" + field.name() + "'");
-            }
-            invertState.lastStartOffset = startOffset;
+          int startOffset = invertState.offset + invertState.offsetAttribute.startOffset();
+          int endOffset = invertState.offset + invertState.offsetAttribute.endOffset();
+          if (startOffset < invertState.lastStartOffset || endOffset < startOffset) {
+            throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, and offsets must not go backwards "
+                                               + "startOffset=" + startOffset + ",endOffset=" + endOffset + ",lastStartOffset=" + invertState.lastStartOffset + " for field '" + field.name() + "'");
           }
+          invertState.lastStartOffset = startOffset;
 
           invertState.length++;
           if (invertState.length < 0) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/64b86331/lucene/core/src/test/org/apache/lucene/index/TestCheckIndex.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestCheckIndex.java b/lucene/core/src/test/org/apache/lucene/index/TestCheckIndex.java
index 7b71d3c..2559ce4 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestCheckIndex.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestCheckIndex.java
@@ -43,11 +43,6 @@ public class TestCheckIndex extends BaseTestCheckIndex {
   }
   
   @Test
-  public void testBogusTermVectors() throws IOException {
-    testBogusTermVectors(directory);
-  }
-  
-  @Test
   public void testChecksumsOnly() throws IOException {
     testChecksumsOnly(directory);
   }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/64b86331/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
index 581ff2f..d49434a 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
@@ -377,7 +377,7 @@ public class TokenSourcesTest extends BaseTokenStreamTestCase {
     }
 
     final BaseTermVectorsFormatTestCase.RandomTokenStream rTokenStream =
-        new BaseTermVectorsFormatTestCase.RandomTokenStream(TestUtil.nextInt(random(), 1, 10), terms, termBytes, false);
+        new BaseTermVectorsFormatTestCase.RandomTokenStream(TestUtil.nextInt(random(), 1, 10), terms, termBytes);
     //check to see if the token streams might have non-deterministic testable result
     final boolean storeTermVectorPositions = random().nextBoolean();
     final int[] startOffsets = rTokenStream.getStartOffsets();

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/64b86331/lucene/sandbox/src/test/org/apache/lucene/search/TestTermAutomatonQuery.java
----------------------------------------------------------------------
diff --git a/lucene/sandbox/src/test/org/apache/lucene/search/TestTermAutomatonQuery.java b/lucene/sandbox/src/test/org/apache/lucene/search/TestTermAutomatonQuery.java
index 6055e00..6ef9baf 100644
--- a/lucene/sandbox/src/test/org/apache/lucene/search/TestTermAutomatonQuery.java
+++ b/lucene/sandbox/src/test/org/apache/lucene/search/TestTermAutomatonQuery.java
@@ -45,6 +45,7 @@ import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.BitSetIterator;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FixedBitSet;
@@ -431,7 +432,9 @@ public class TestTermAutomatonQuery extends LuceneTestCase {
     @Override
     public boolean incrementToken() throws IOException {
       if (synNext) {
+        AttributeSource.State state = captureState();
         clearAttributes();
+        restoreState(state);
         posIncAtt.setPositionIncrement(0);
         termAtt.append(""+((char) 97 + random().nextInt(3)));
         synNext = false;

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/64b86331/lucene/test-framework/src/java/org/apache/lucene/index/BaseTermVectorsFormatTestCase.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseTermVectorsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseTermVectorsFormatTestCase.java
index 5e6809f..7acee87 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseTermVectorsFormatTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseTermVectorsFormatTestCase.java
@@ -200,10 +200,6 @@ public abstract class BaseTermVectorsFormatTestCase extends BaseIndexFileFormatT
     int i = 0;
 
     public RandomTokenStream(int len, String[] sampleTerms, BytesRef[] sampleTermBytes) {
-      this(len, sampleTerms, sampleTermBytes, rarely());
-    }
-
-    public RandomTokenStream(int len, String[] sampleTerms, BytesRef[] sampleTermBytes, boolean offsetsGoBackwards) {
       terms = new String[len];
       termBytes = new BytesRef[len];
       positionsIncrements = new int[len];
@@ -216,17 +212,12 @@ public abstract class BaseTermVectorsFormatTestCase extends BaseIndexFileFormatT
         terms[i] = sampleTerms[o];
         termBytes[i] = sampleTermBytes[o];
         positionsIncrements[i] = TestUtil.nextInt(random(), i == 0 ? 1 : 0, 10);
-        if (offsetsGoBackwards) {
-          startOffsets[i] = random().nextInt();
-          endOffsets[i] = random().nextInt();
+        if (i == 0) {
+          startOffsets[i] = TestUtil.nextInt(random(), 0, 1 << 16);
         } else {
-          if (i == 0) {
-            startOffsets[i] = TestUtil.nextInt(random(), 0, 1 << 16);
-          } else {
-            startOffsets[i] = startOffsets[i-1] + TestUtil.nextInt(random(), 0, rarely() ? 1 << 16 : 20);
-          }
-          endOffsets[i] = startOffsets[i] + TestUtil.nextInt(random(), 0, rarely() ? 1 << 10 : 20);
+          startOffsets[i] = startOffsets[i-1] + TestUtil.nextInt(random(), 0, rarely() ? 1 << 16 : 20);
         }
+        endOffsets[i] = startOffsets[i] + TestUtil.nextInt(random(), 0, rarely() ? 1 << 10 : 20);
       }
 
       for (int i = 0; i < len; ++i) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/64b86331/lucene/test-framework/src/java/org/apache/lucene/index/BaseTestCheckIndex.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseTestCheckIndex.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseTestCheckIndex.java
index cdec720..21ccf3b 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseTestCheckIndex.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseTestCheckIndex.java
@@ -22,11 +22,8 @@ import java.io.PrintStream;
 import java.util.ArrayList;
 import java.util.List;
 
-import org.apache.lucene.analysis.CannedTokenStream;
 import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.store.Directory;
@@ -105,22 +102,6 @@ public class BaseTestCheckIndex extends LuceneTestCase {
     checker.close();
   }
   
-  // LUCENE-4221: we have to let these thru, for now
-  public void testBogusTermVectors(Directory dir) throws IOException {
-    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
-    Document doc = new Document();
-    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
-    ft.setStoreTermVectors(true);
-    ft.setStoreTermVectorOffsets(true);
-    Field field = new Field("foo", "", ft);
-    field.setTokenStream(new CannedTokenStream(
-        new Token("bar", 5, 10), new Token("bar", 1, 4)
-    ));
-    doc.add(field);
-    iw.addDocument(doc);
-    iw.close();
-  }
-  
   public void testChecksumsOnly(Directory dir) throws IOException {
     LineFileDocs lf = new LineFileDocs(random());
     MockAnalyzer analyzer = new MockAnalyzer(random());

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/64b86331/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java
index d3351ab..0ea90fc 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java
@@ -334,9 +334,9 @@ public final class TestUtil {
     CheckIndex.testLiveDocs(codecReader, infoStream, true);
     CheckIndex.testFieldInfos(codecReader, infoStream, true);
     CheckIndex.testFieldNorms(codecReader, infoStream, true);
-    CheckIndex.testPostings(codecReader, infoStream, false, true);
+    CheckIndex.testPostings(codecReader, infoStream, false, true, Version.LUCENE_7_0_0);
     CheckIndex.testStoredFields(codecReader, infoStream, true);
-    CheckIndex.testTermVectors(codecReader, infoStream, false, crossCheckTermVectors, true);
+    CheckIndex.testTermVectors(codecReader, infoStream, false, crossCheckTermVectors, true, Version.LUCENE_7_0_0);
     CheckIndex.testDocValues(codecReader, infoStream, true);
     CheckIndex.testPoints(codecReader, infoStream, true);
     

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/64b86331/solr/core/src/java/org/apache/solr/schema/PreAnalyzedField.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/schema/PreAnalyzedField.java b/solr/core/src/java/org/apache/solr/schema/PreAnalyzedField.java
index 87d4094..5f125d9 100644
--- a/solr/core/src/java/org/apache/solr/schema/PreAnalyzedField.java
+++ b/solr/core/src/java/org/apache/solr/schema/PreAnalyzedField.java
@@ -27,6 +27,7 @@ import java.util.Map;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.IndexableField;
@@ -284,6 +285,7 @@ public class PreAnalyzedField extends TextField implements HasImplicitIndexAnaly
     private byte[] binaryValue = null;
     private PreAnalyzedParser parser;
     private IOException readerConsumptionException;
+    private int lastEndOffset;
 
     public PreAnalyzedTokenizer(PreAnalyzedParser parser) {
      // we don't pack attributes, since we are used for (de)serialization and don't want bloat.
@@ -311,6 +313,8 @@ public class PreAnalyzedField extends TextField implements HasImplicitIndexAnaly
       
       AttributeSource.State state = it.next();
       restoreState(state.clone());
+      // TODO: why can't I look up the OffsetAttribute in the ctor instead?
+      lastEndOffset = addAttribute(OffsetAttribute.class).endOffset();
       return true;
     }
 
@@ -329,6 +333,13 @@ public class PreAnalyzedField extends TextField implements HasImplicitIndexAnaly
       it = cachedStates.iterator();
     }
 
+    @Override
+    public void end() throws IOException {
+      super.end();
+      // we must set the end offset correctly so multi-valued fields don't try to send offsets backwards:
+      addAttribute(OffsetAttribute.class).setOffset(lastEndOffset, lastEndOffset);
+    }
+
     private void setReaderConsumptionException(IOException e) {
       readerConsumptionException = e;
     }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/64b86331/solr/core/src/test/org/apache/solr/index/hdfs/CheckHdfsIndexTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/index/hdfs/CheckHdfsIndexTest.java b/solr/core/src/test/org/apache/solr/index/hdfs/CheckHdfsIndexTest.java
index b4f6931..61b4305 100644
--- a/solr/core/src/test/org/apache/solr/index/hdfs/CheckHdfsIndexTest.java
+++ b/solr/core/src/test/org/apache/solr/index/hdfs/CheckHdfsIndexTest.java
@@ -121,11 +121,6 @@ public class CheckHdfsIndexTest extends AbstractFullDistribZkTestBase {
   }
 
   @Test
-  public void testBogusTermVectors() throws IOException {
-    testCheckIndex.testBogusTermVectors(directory);
-  }
-
-  @Test
   public void testChecksumsOnly() throws IOException {
     testCheckIndex.testChecksumsOnly(directory);
   }