You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ar...@apache.org on 2015/03/27 23:37:49 UTC
svn commit: r1669698 [1/2] - in /lucene/dev/trunk/lucene: ./ suggest/src/java/org/apache/lucene/search/suggest/analyzing/ suggest/src/java/org/apache/lucene/search/suggest/document/ suggest/src/resources/META-INF/services/ suggest/src/test/org/apache/l...

Author: areek
Date: Fri Mar 27 22:37:49 2015
New Revision: 1669698

URL: http://svn.apache.org/r1669698
Log:
LUCENE-6339: Added Near-real time Document Suggester via custom postings format

Added:
    lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/
    lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/Completion50PostingsFormat.java   (with props)
    lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java   (with props)
    lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionFieldsConsumer.java   (with props)
    lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionFieldsProducer.java   (with props)
    lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionPostingsFormat.java   (with props)
    lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java   (with props)
    lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggester.java   (with props)
    lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java   (with props)
    lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java   (with props)
    lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestIndexSearcher.java   (with props)
    lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestScoreDocPriorityQueue.java   (with props)
    lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/TopSuggestDocs.java   (with props)
    lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/TopSuggestDocsCollector.java   (with props)
    lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/package-info.java   (with props)
    lucene/dev/trunk/lucene/suggest/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat   (with props)
    lucene/dev/trunk/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/
    lucene/dev/trunk/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/SuggestFieldTest.java   (with props)
Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1669698&r1=1669697&r2=1669698&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Fri Mar 27 22:37:49 2015
@@ -19,6 +19,9 @@ New Features
   for counting ranges that align with the underlying terms as defined by the
   NumberRangePrefixTree (e.g. familiar date units like days).  (David Smiley)
 
+* LUCENE-6339: Added Near-real time Document Suggester via custom postings format
+  (Areek Zillur, Mike McCandless, Simon Willnauer)
+
 API Changes
 
 * LUCENE-3312: The API of oal.document was restructured to

Modified: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java?rev=1669698&r1=1669697&r2=1669698&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java (original)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java Fri Mar 27 22:37:49 2015
@@ -49,7 +49,7 @@ public class FSTUtil {
     public final FST.Arc<T> fstNode;
 
     /** Output of the path so far: */
-    T output;
+    public final T output;
 
     /** Input of the path so far: */
     public final IntsRefBuilder input;

Added: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/Completion50PostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/Completion50PostingsFormat.java?rev=1669698&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/Completion50PostingsFormat.java (added)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/Completion50PostingsFormat.java Fri Mar 27 22:37:49 2015
@@ -0,0 +1,42 @@
+package org.apache.lucene.search.suggest.document;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.lucene.codecs.PostingsFormat;
+
+/**
+ * {@link org.apache.lucene.search.suggest.document.CompletionPostingsFormat}
+ * for {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat}
+ *
+ * @lucene.experimental
+ */
+public class Completion50PostingsFormat extends CompletionPostingsFormat {
+
+  /**
+   * Sole Constructor
+   */
+  public Completion50PostingsFormat() {
+    super();
+  }
+
+  @Override
+  protected PostingsFormat delegatePostingsFormat() {
+    return PostingsFormat.forName("Lucene50");
+  }
+}

Added: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java?rev=1669698&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java (added)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java Fri Mar 27 22:37:49 2015
@@ -0,0 +1,173 @@
+package org.apache.lucene.search.suggest.document;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.AnalyzerWrapper;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.TokenStreamToAutomaton;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.Operations;
+import org.apache.lucene.util.automaton.Transition;
+
+/**
+ * Wraps an {@link org.apache.lucene.analysis.Analyzer}
+ * to provide additional completion-only tuning
+ * (e.g. preserving token separators, preserving position increments while converting
+ * a token stream to an automaton)
+ * <p>
+ * Can be used to index {@link SuggestField}
+ * and as a query analyzer to {@link SuggestIndexSearcher}
+ * <p>
+ * NOTE: In most cases, index and query analyzer should have same values for {@link #preservePositionIncrements}
+ * and {@link #preserveSep}
+ *
+ * @lucene.experimental
+ */
+public class CompletionAnalyzer extends AnalyzerWrapper {
+
+  /**
+   * Represents the separation between tokens, if
+   * <code>preserveSep</code> is <code>true</code>
+   * <p>
+   * Same label is used as a delimiter in the {@link org.apache.lucene.search.suggest.document.CompletionTokenStream}
+   * payload
+   */
+  final static int SEP_LABEL = NRTSuggesterBuilder.PAYLOAD_SEP;
+
+  /**
+   * Represent a hole character, inserted by {@link org.apache.lucene.analysis.TokenStreamToAutomaton}
+   */
+  final static int HOLE_CHARACTER = TokenStreamToAutomaton.HOLE;
+
+  final static int DEFAULT_MAX_GRAPH_EXPANSIONS = -1;
+  final static boolean DEFAULT_PRESERVE_SEP = true;
+  final static boolean DEFAULT_PRESERVE_POSITION_INCREMENTS = true;
+
+  private final Analyzer analyzer;
+
+  /**
+   * Preserve separation between tokens
+   * when converting to an automaton
+   * <p>
+   * Defaults to <code>true</code>
+   */
+  private final boolean preserveSep;
+
+  /**
+   * Preserve position increments for tokens
+   * when converting to an automaton
+   * <p>
+   * Defaults to <code>true</code>
+   */
+  private final boolean preservePositionIncrements;
+
+  /**
+   * Sets the maximum number of graph expansions of a completion automaton
+   * <p>
+   * Defaults to <code>-1</code> (no limit)
+   */
+  private final int maxGraphExpansions;
+
+  /**
+   * Wraps an analyzer to convert it's output token stream to an automaton
+   *
+   * @param analyzer token stream to be converted to an automaton
+   * @param preserveSep Preserve separation between tokens when converting to an automaton
+   * @param preservePositionIncrements Preserve position increments for tokens when converting to an automaton
+   * @param maxGraphExpansions Sets the maximum number of graph expansions of a completion automaton
+   */
+  public CompletionAnalyzer(Analyzer analyzer, boolean preserveSep, boolean preservePositionIncrements, int maxGraphExpansions) {
+    super(PER_FIELD_REUSE_STRATEGY);
+    this.analyzer = analyzer;
+    this.preserveSep = preserveSep;
+    this.preservePositionIncrements = preservePositionIncrements;
+    this.maxGraphExpansions = maxGraphExpansions;
+  }
+
+  /**
+   * Calls {@link #CompletionAnalyzer(org.apache.lucene.analysis.Analyzer, boolean, boolean, int)}
+   * preserving token separation, position increments and no limit on graph expansions
+   */
+  public CompletionAnalyzer(Analyzer analyzer) {
+    this(analyzer, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, DEFAULT_MAX_GRAPH_EXPANSIONS);
+  }
+
+  /**
+   * Calls {@link #CompletionAnalyzer(org.apache.lucene.analysis.Analyzer, boolean, boolean, int)}
+   * with no limit on graph expansions
+   */
+  public CompletionAnalyzer(Analyzer analyzer, boolean preserveSep, boolean preservePositionIncrements) {
+    this(analyzer, preserveSep, preservePositionIncrements, DEFAULT_MAX_GRAPH_EXPANSIONS);
+  }
+
+  /**
+   * Calls {@link #CompletionAnalyzer(org.apache.lucene.analysis.Analyzer, boolean, boolean, int)}
+   * preserving token separation and position increments
+   */
+  public CompletionAnalyzer(Analyzer analyzer, int maxGraphExpansions) {
+    this(analyzer, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, maxGraphExpansions);
+  }
+
+  @Override
+  protected Analyzer getWrappedAnalyzer(String fieldName) {
+    return analyzer;
+  }
+
+  @Override
+  protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
+    CompletionTokenStream tokenStream = new CompletionTokenStream(components.getTokenStream(),
+        preserveSep, preservePositionIncrements, SEP_LABEL, maxGraphExpansions);
+    return new TokenStreamComponents(components.getTokenizer(), tokenStream);
+  }
+
+  /**
+   * Converts <code>key</code> to an automaton using
+   * {@link #preservePositionIncrements}, {@link #preserveSep}
+   * and {@link #maxGraphExpansions}
+   */
+  public Automaton toAutomaton(String field, CharSequence key) throws IOException {
+    for (int i = 0; i < key.length(); i++) {
+      switch (key.charAt(i)) {
+        case HOLE_CHARACTER:
+          throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
+        case SEP_LABEL:
+          throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
+        default:
+          break;
+      }
+    }
+
+    try (TokenStream tokenStream = analyzer.tokenStream(field, key.toString())) {
+      try(CompletionTokenStream stream = new CompletionTokenStream(tokenStream,
+          preserveSep, preservePositionIncrements, SEP_LABEL, maxGraphExpansions)) {
+        return stream.toAutomaton(tokenStream);
+      }
+    }
+  }
+
+}

Added: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionFieldsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionFieldsConsumer.java?rev=1669698&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionFieldsConsumer.java (added)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionFieldsConsumer.java Fri Mar 27 22:37:49 2015
@@ -0,0 +1,192 @@
+package org.apache.lucene.search.suggest.document;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.IOUtils;
+
+import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.CODEC_NAME;
+import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.COMPLETION_VERSION_CURRENT;
+import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.DICT_EXTENSION;
+import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.INDEX_EXTENSION;
+
+/**
+ * <p>
+ * Weighted FSTs for any indexed {@link SuggestField} is built on {@link #write(Fields)}.
+ * A weighted FST maps the analyzed forms of a field to its
+ * surface form and document id. FSTs are stored in the CompletionDictionary (.lkp).
+ * </p>
+ * <p>
+ * The file offsets of a field's FST are stored in the CompletionIndex (.cmp)
+ * along with the field's internal number {@link FieldInfo#number} on {@link #close()}.
+ * </p>
+ *
+ */
+final class CompletionFieldsConsumer extends FieldsConsumer {
+
+  private final String delegatePostingsFormatName;
+  private final Map<String, Long> seenFields = new HashMap<>();
+  private final SegmentWriteState state;
+  private IndexOutput dictOut;
+  private FieldsConsumer delegateFieldsConsumer;
+
+  CompletionFieldsConsumer(PostingsFormat delegatePostingsFormat, SegmentWriteState state) throws IOException {
+    this.delegatePostingsFormatName = delegatePostingsFormat.getName();
+    this.state = state;
+    String dictFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, DICT_EXTENSION);
+    boolean success = false;
+    try {
+      this.delegateFieldsConsumer = delegatePostingsFormat.fieldsConsumer(state);
+      dictOut = state.directory.createOutput(dictFile, state.context);
+      CodecUtil.writeIndexHeader(dictOut, CODEC_NAME, COMPLETION_VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
+      success = true;
+    } finally {
+      if (success == false) {
+        IOUtils.closeWhileHandlingException(dictOut, delegateFieldsConsumer);
+      }
+    }
+  }
+
+  @Override
+  public void write(Fields fields) throws IOException {
+    delegateFieldsConsumer.write(fields);
+
+    for (String field : fields) {
+      CompletionTermWriter termWriter = new CompletionTermWriter();
+      Terms terms = fields.terms(field);
+      TermsEnum termsEnum = terms.iterator(null);
+
+      // write terms
+      BytesRef term;
+      while ((term = termsEnum.next()) != null) {
+        termWriter.write(term, termsEnum);
+      }
+
+      // store lookup, if needed
+      long filePointer = dictOut.getFilePointer();
+      if (termWriter.finish(dictOut)) {
+        seenFields.put(field, filePointer);
+      }
+    }
+  }
+
+  private boolean closed = false;
+
+  @Override
+  public void close() throws IOException {
+    if (closed) {
+      return;
+    }
+    closed = true;
+    String indexFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, INDEX_EXTENSION);
+    boolean success = false;
+    try (IndexOutput indexOut = state.directory.createOutput(indexFile, state.context)) {
+      delegateFieldsConsumer.close();
+      CodecUtil.writeIndexHeader(indexOut, CODEC_NAME, COMPLETION_VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
+      /*
+       * we write the delegate postings format name so we can load it
+       * without getting an instance in the ctor
+       */
+      indexOut.writeString(delegatePostingsFormatName);
+      // write # of seen fields
+      indexOut.writeVInt(seenFields.size());
+      // write field numbers and dictOut offsets
+      for (Map.Entry<String, Long> seenField : seenFields.entrySet()) {
+        FieldInfo fieldInfo = state.fieldInfos.fieldInfo(seenField.getKey());
+        indexOut.writeVInt(fieldInfo.number);
+        indexOut.writeVLong(seenField.getValue());
+      }
+      CodecUtil.writeFooter(indexOut);
+      CodecUtil.writeFooter(dictOut);
+      IOUtils.close(dictOut);
+      success = true;
+    } finally {
+      if (success == false) {
+        IOUtils.closeWhileHandlingException(dictOut, delegateFieldsConsumer);
+      }
+    }
+  }
+
+  // builds an FST based on the terms written
+  private static class CompletionTermWriter {
+
+    private PostingsEnum postingsEnum = null;
+    private int docCount = 0;
+
+    private final BytesRefBuilder scratch = new BytesRefBuilder();
+    private final NRTSuggesterBuilder builder;
+
+    public CompletionTermWriter() {
+      builder = new NRTSuggesterBuilder();
+    }
+
+    /**
+     * Stores the built FST in <code>output</code>
+     * Returns true if there was anything stored, false otherwise
+     */
+    public boolean finish(IndexOutput output) throws IOException {
+      boolean stored = builder.store(output);
+      assert stored || docCount == 0 : "the FST is null but docCount is != 0 actual value: [" + docCount + "]";
+      return stored;
+    }
+
+    /**
+     * Writes all postings (surface form, weight, document id) for <code>term</code>
+     */
+    public void write(BytesRef term, TermsEnum termsEnum) throws IOException {
+      postingsEnum = termsEnum.postings(null, postingsEnum, PostingsEnum.PAYLOADS);
+      builder.startTerm(term);
+      int docFreq = 0;
+      while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+        int docID = postingsEnum.docID();
+        for (int i = 0; i < postingsEnum.freq(); i++) {
+          postingsEnum.nextPosition();
+          assert postingsEnum.getPayload() != null;
+          BytesRef payload = postingsEnum.getPayload();
+          ByteArrayDataInput input = new ByteArrayDataInput(payload.bytes, payload.offset, payload.length);
+          int len = input.readVInt();
+          scratch.grow(len);
+          scratch.setLength(len);
+          input.readBytes(scratch.bytes(), 0, scratch.length());
+          builder.addEntry(docID, scratch.get(), input.readVLong() - 1);
+        }
+        docFreq++;
+        docCount = Math.max(docCount, docFreq + 1);
+      }
+      builder.finishTerm();
+    }
+  }
+}

Added: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionFieldsProducer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionFieldsProducer.java?rev=1669698&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionFieldsProducer.java (added)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionFieldsProducer.java Fri Mar 27 22:37:49 2015
@@ -0,0 +1,228 @@
+package org.apache.lucene.search.suggest.document;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FilterLeafReader;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.Accountables;
+import org.apache.lucene.util.IOUtils;
+
+import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.CODEC_NAME;
+import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.COMPLETION_CODEC_VERSION;
+import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.COMPLETION_VERSION_CURRENT;
+import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.DICT_EXTENSION;
+import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.INDEX_EXTENSION;
+
+/**
+ * <p>
+ * Completion index (.cmp) is opened and read at instantiation to read in {@link SuggestField}
+ * numbers and their FST offsets in the Completion dictionary (.lkp).
+ * </p>
+ * <p>
+ * Completion dictionary (.lkp) is opened at instantiation and a field's FST is loaded
+ * into memory the first time it is requested via {@link #terms(String)}.
+ * </p>
+ * <p>
+ * NOTE: Only the footer is validated for Completion dictionary (.lkp) and not the checksum due
+ * to random access pattern and checksum validation being too costly at instantiation
+ * </p>
+ *
+ */
+final class CompletionFieldsProducer extends FieldsProducer {
+
+  private FieldsProducer delegateFieldsProducer;
+  private Map<String, CompletionsTermsReader> readers;
+  private IndexInput dictIn;
+
+  // copy ctr for merge instance
+  private CompletionFieldsProducer(FieldsProducer delegateFieldsProducer, Map<String, CompletionsTermsReader> readers) {
+    this.delegateFieldsProducer = delegateFieldsProducer;
+    this.readers = readers;
+  }
+
+  CompletionFieldsProducer(SegmentReadState state) throws IOException {
+    String indexFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, INDEX_EXTENSION);
+    delegateFieldsProducer = null;
+    boolean success = false;
+
+    try (ChecksumIndexInput index = state.directory.openChecksumInput(indexFile, state.context)) {
+      // open up dict file containing all fsts
+      String dictFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, DICT_EXTENSION);
+      dictIn = state.directory.openInput(dictFile, state.context);
+      CodecUtil.checkIndexHeader(dictIn, CODEC_NAME, COMPLETION_CODEC_VERSION, COMPLETION_VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
+      // just validate the footer for the dictIn
+      CodecUtil.retrieveChecksum(dictIn);
+
+      // open up index file (fieldNumber, offset)
+      CodecUtil.checkIndexHeader(index, CODEC_NAME, COMPLETION_CODEC_VERSION, COMPLETION_VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
+      // load delegate PF
+      PostingsFormat delegatePostingsFormat = PostingsFormat.forName(index.readString());
+      delegateFieldsProducer = delegatePostingsFormat.fieldsProducer(state);
+
+      // read suggest field numbers and their offsets in the terms file from index
+      int numFields = index.readVInt();
+      readers = new HashMap<>(numFields);
+      for (int i = 0; i < numFields; i++) {
+        int fieldNumber = index.readVInt();
+        long offset = index.readVLong();
+        FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNumber);
+        // we don't load the FST yet
+        readers.put(fieldInfo.name, new CompletionsTermsReader(offset));
+      }
+      CodecUtil.checkFooter(index);
+      success = true;
+    } finally {
+      if (success == false) {
+        IOUtils.closeWhileHandlingException(delegateFieldsProducer, dictIn);
+      }
+    }
+  }
+
+  @Override
+  public void close() throws IOException {
+    boolean success = false;
+    try {
+      delegateFieldsProducer.close();
+      IOUtils.close(dictIn);
+      success = true;
+    } finally {
+      if (success == false) {
+        IOUtils.closeWhileHandlingException(delegateFieldsProducer, dictIn);
+      }
+    }
+  }
+
+  @Override
+  public void checkIntegrity() throws IOException {
+    delegateFieldsProducer.checkIntegrity();
+    // TODO: checkIntegrity should checksum the dictionary and index
+  }
+
+  @Override
+  public FieldsProducer getMergeInstance() throws IOException {
+    return new CompletionFieldsProducer(delegateFieldsProducer, readers);
+  }
+
+  @Override
+  public long ramBytesUsed() {
+    long ramBytesUsed = delegateFieldsProducer.ramBytesUsed();
+    for (CompletionsTermsReader reader : readers.values()) {
+      ramBytesUsed += reader.ramBytesUsed();
+    }
+    return ramBytesUsed;
+  }
+
+  @Override
+  public Collection<Accountable> getChildResources() {
+    List<Accountable> accountableList = new ArrayList<>();
+    for (Map.Entry<String, CompletionsTermsReader> readerEntry : readers.entrySet()) {
+      accountableList.add(Accountables.namedAccountable(readerEntry.getKey(), readerEntry.getValue()));
+    }
+    return Collections.unmodifiableCollection(accountableList);
+  }
+
+  @Override
+  public Iterator<String> iterator() {
+    return readers.keySet().iterator();
+  }
+
+  @Override
+  public Terms terms(String field) throws IOException {
+    return new CompletionTerms(delegateFieldsProducer.terms(field), readers.get(field));
+  }
+
+  @Override
+  public int size() {
+    return readers.size();
+  }
+
+  private class CompletionsTermsReader implements Accountable {
+    private final long offset;
+    private NRTSuggester suggester;
+
+    public CompletionsTermsReader(long offset) throws IOException {
+      assert offset >= 0l && offset < dictIn.length();
+      this.offset = offset;
+    }
+
+    public synchronized NRTSuggester suggester() throws IOException {
+      if (suggester == null) {
+        try (IndexInput dictClone = dictIn.clone()) { // let multiple fields load concurrently
+          dictClone.seek(offset);
+          suggester = NRTSuggester.load(dictClone);
+        }
+      }
+      return suggester;
+    }
+
+    @Override
+    public long ramBytesUsed() {
+      return (suggester != null) ? suggester.ramBytesUsed() : 0;
+    }
+
+    @Override
+    public Collection<Accountable> getChildResources() {
+      return Collections.emptyList();
+    }
+  }
+
+  /**
+   * Thin wrapper over {@link org.apache.lucene.index.Terms} with
+   * a {@link NRTSuggester}
+   */
+  public static class CompletionTerms extends FilterLeafReader.FilterTerms {
+
+    private final CompletionsTermsReader reader;
+
+    public CompletionTerms(Terms in, CompletionsTermsReader reader) {
+      super(in);
+      this.reader = reader;
+    }
+
+    /**
+     * Returns a {@link NRTSuggester} for the field
+     * or <code>null</code> if no FST
+     * was indexed for this field
+     */
+    public NRTSuggester suggester() throws IOException {
+      if (reader == null) {
+        return null;
+      }
+      return reader.suggester();
+    }
+  }
+
+}

Added: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionPostingsFormat.java?rev=1669698&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionPostingsFormat.java (added)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionPostingsFormat.java Fri Mar 27 22:37:49 2015
@@ -0,0 +1,121 @@
+package org.apache.lucene.search.suggest.document;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.fst.FST;
+
+/**
+ * <p>
+ * A {@link PostingsFormat} which supports document suggestion based on
+ * indexed {@link SuggestField}s.
+ * Document suggestion is based on an weighted FST which map analyzed
+ * terms of a {@link SuggestField} to its surface form and document id.
+ * </p>
+ * <p>
+ * Files:
+ * <ul>
+ *   <li><tt>.lkp</tt>: <a href="#Completiondictionary">Completion Dictionary</a></li>
+ *   <li><tt>.cmp</tt>: <a href="#Completionindex">Completion Index</a></li>
+ * </ul>
+ * <p>
+ * <a name="Completionictionary"></a>
+ * <h3>Completion Dictionary</h3>
+ * <p>The .lkp file contains an FST for each suggest field
+ * </p>
+ * <ul>
+ *   <li>CompletionDict (.lkp) --&gt; Header, FST<sup>NumSuggestFields</sup>, Footer</li>
+ *   <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
+ *   <!-- TODO: should the FST output be mentioned at all? -->
+ *   <li>FST --&gt; {@link FST FST&lt;Long, BytesRef&gt;}</li>
+ *   <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
+ * </ul>
+ * <p>Notes:</p>
+ * <ul>
+ *   <li>Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information
+ *     for the Completion implementation.</li>
+ *   <li>FST maps all analyzed forms to surface forms of a SuggestField</li>
+ * </ul>
+ * <a name="Completionindex"></a>
+ * <h3>Completion Index</h3>
+ * <p>The .cmp file contains an index into the completion dictionary, so that it can be
+ * accessed randomly.</p>
+ * <ul>
+ *   <li>CompletionIndex (.cmp) --&gt; Header, NumSuggestFields, Entry<sup>NumSuggestFields</sup>, Footer</li>
+ *   <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
+ *   <li>NumSuggestFields --&gt; {@link DataOutput#writeVInt Uint32}</li>
+ *   <li>Entry --&gt; FieldNumber, CompletionDictionaryOffset</li>
+ *   <li>FieldNumber --&gt; {@link DataOutput#writeVInt Uint32}</li>
+ *   <li>CompletionDictionaryOffset --&gt; {@link DataOutput#writeVLong  Uint64}</li>
+ *   <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
+ * </ul>
+ * <p>Notes:</p>
+ * <ul>
+ *   <li>Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information
+ *     for the Completion implementation.</li>
+ *   <li>NumSuggestFields is the number of suggest fields indexed</li>
+ *   <li>FieldNumber is the fields number from {@link FieldInfos}. (.fnm)</li>
+ *   <li>CompletionDictionaryOffset is the file offset of a field's FST in CompletionDictionary (.lkp)</li>
+ * </ul>
+ *
+ * @lucene.experimental
+ */
+public abstract class CompletionPostingsFormat extends PostingsFormat {
+
+  static final String CODEC_NAME = "completion";
+  static final int COMPLETION_CODEC_VERSION = 1;
+  static final int COMPLETION_VERSION_CURRENT = COMPLETION_CODEC_VERSION;
+  static final String INDEX_EXTENSION = "cmp";
+  static final String DICT_EXTENSION = "lkp";
+
+  /**
+   * Used only by core Lucene at read-time via Service Provider instantiation
+   */
+  public CompletionPostingsFormat() {
+    super(CODEC_NAME);
+  }
+
+  /**
+   * Concrete implementation should specify the delegating postings format
+   */
+  protected abstract PostingsFormat delegatePostingsFormat();
+
+  @Override
+  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
+    PostingsFormat delegatePostingsFormat = delegatePostingsFormat();
+    if (delegatePostingsFormat == null) {
+      throw new UnsupportedOperationException("Error - " + getClass().getName()
+          + " has been constructed without a choice of PostingsFormat");
+    }
+    return new CompletionFieldsConsumer(delegatePostingsFormat, state);
+  }
+
+  @Override
+  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
+    return new CompletionFieldsProducer(state);
+  }
+}

Added: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java?rev=1669698&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java (added)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java Fri Mar 27 22:37:49 2015
@@ -0,0 +1,358 @@
+package org.apache.lucene.search.suggest.document;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.Set;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.TokenStreamToAutomaton;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.CharsRefBuilder;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.Operations;
+import org.apache.lucene.util.automaton.Transition;
+import org.apache.lucene.util.fst.Util;
+
+import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_MAX_GRAPH_EXPANSIONS;
+import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_PRESERVE_POSITION_INCREMENTS;
+import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_PRESERVE_SEP;
+import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.SEP_LABEL;
+
+/**
+ * Token stream which converts a provided token stream to an automaton.
+ * The accepted strings enumeration from the automaton are available through the
+ * {@link org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute} attribute
+ * The token stream uses a {@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute} to store
+ * a completion's payload (see {@link CompletionTokenStream#setPayload(org.apache.lucene.util.BytesRef)})
+ *
+ */
+final class CompletionTokenStream extends TokenStream {
+
+  private final PayloadAttribute payloadAttr = addAttribute(PayloadAttribute.class);
+  private final PositionIncrementAttribute posAttr = addAttribute(PositionIncrementAttribute.class);
+  private final ByteTermAttribute bytesAtt = addAttribute(ByteTermAttribute.class);
+
+  private final TokenStream input;
+  private final boolean preserveSep;
+  private final boolean preservePositionIncrements;
+  private final int sepLabel;
+  private final int maxGraphExpansions;
+
+  private BytesRef payload;
+  private Iterator<IntsRef> finiteStrings;
+  private int posInc = -1;
+  private CharTermAttribute charTermAttribute;
+
+  /**
+   * Creates a token stream to convert <code>input</code> to a token stream
+   * of accepted strings by its automaton.
+   * <p>
+   * The token stream <code>input</code> is converted to an automaton
+   * with the default settings of {@link org.apache.lucene.search.suggest.document.CompletionAnalyzer}
+   */
+  public CompletionTokenStream(TokenStream input) {
+    this(input, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, SEP_LABEL, DEFAULT_MAX_GRAPH_EXPANSIONS);
+  }
+
+  CompletionTokenStream(TokenStream input, boolean preserveSep, boolean preservePositionIncrements, int sepLabel, int maxGraphExpansions) {
+    // Don't call the super(input) ctor - this is a true delegate and has a new attribute source since we consume
+    // the input stream entirely in toFiniteStrings(input)
+    this.input = input;
+    this.preserveSep = preserveSep;
+    this.preservePositionIncrements = preservePositionIncrements;
+    this.sepLabel = sepLabel;
+    this.maxGraphExpansions = maxGraphExpansions;
+  }
+
+  /**
+   * Returns a separator label that is reserved for the payload
+   * in {@link CompletionTokenStream#setPayload(org.apache.lucene.util.BytesRef)}
+   */
+  public int sepLabel() {
+    return sepLabel;
+  }
+
+  /**
+   * Sets a payload available throughout successive token stream enumeration
+   */
+  public void setPayload(BytesRef payload) {
+    this.payload = payload;
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    clearAttributes();
+    if (finiteStrings == null) {
+      //TODO: make this return a Iterator<IntsRef> instead?
+      Automaton automaton = toAutomaton(input);
+      Set<IntsRef> strings = Operations.getFiniteStrings(automaton, maxGraphExpansions);
+
+      posInc = strings.size();
+      finiteStrings = strings.iterator();
+    }
+    if (finiteStrings.hasNext()) {
+      posAttr.setPositionIncrement(posInc);
+      /*
+       * this posInc encodes the number of paths that this surface form
+       * produced. Multi Fields have the same surface form and therefore sum up
+       */
+      posInc = 0;
+      Util.toBytesRef(finiteStrings.next(), bytesAtt.builder()); // now we have UTF-8
+      if (charTermAttribute != null) {
+        charTermAttribute.setLength(0);
+        charTermAttribute.append(bytesAtt.toUTF16());
+      }
+      if (payload != null) {
+        payloadAttr.setPayload(this.payload);
+      }
+      return true;
+    }
+
+    return false;
+  }
+
+  @Override
+  public void end() throws IOException {
+    super.end();
+    if (posInc == -1) {
+      input.end();
+    }
+  }
+
+  @Override
+  public void close() throws IOException {
+    if (posInc == -1) {
+      input.close();
+    }
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    if (hasAttribute(CharTermAttribute.class)) {
+      // we only create this if we really need it to safe the UTF-8 to UTF-16 conversion
+      charTermAttribute = getAttribute(CharTermAttribute.class);
+    }
+    finiteStrings = null;
+    posInc = -1;
+  }
+
+  /**
+   * Converts <code>tokenStream</code> to an automaton
+   */
+  public Automaton toAutomaton(TokenStream tokenStream) throws IOException {
+    // TODO refactor this
+    // maybe we could hook up a modified automaton from TermAutomatonQuery here?
+    Automaton automaton = null;
+    try {
+      // Create corresponding automaton: labels are bytes
+      // from each analyzed token, with byte 0 used as
+      // separator between tokens:
+      final TokenStreamToAutomaton tsta;
+      if (preserveSep) {
+        tsta = new EscapingTokenStreamToAutomaton((char) SEP_LABEL);
+      } else {
+        // When we're not preserving sep, we don't steal 0xff
+        // byte, so we don't need to do any escaping:
+        tsta = new TokenStreamToAutomaton();
+      }
+      tsta.setPreservePositionIncrements(preservePositionIncrements);
+
+      automaton = tsta.toAutomaton(tokenStream);
+    } finally {
+      IOUtils.closeWhileHandlingException(tokenStream);
+    }
+
+    // TODO: we can optimize this somewhat by determinizing
+    // while we convert
+    automaton = replaceSep(automaton, preserveSep, SEP_LABEL);
+    // This automaton should not blow up during determinize:
+    return Operations.determinize(automaton, maxGraphExpansions);
+  }
+
+  /**
+   * Just escapes the 0xff byte (which we still for SEP).
+   */
+  private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton {
+
+    final BytesRefBuilder spare = new BytesRefBuilder();
+    private char sepLabel;
+
+    public EscapingTokenStreamToAutomaton(char sepLabel) {
+      this.sepLabel = sepLabel;
+    }
+
+    @Override
+    protected BytesRef changeToken(BytesRef in) {
+      int upto = 0;
+      for (int i = 0; i < in.length; i++) {
+        byte b = in.bytes[in.offset + i];
+        if (b == (byte) sepLabel) {
+          spare.grow(upto + 2);
+          spare.setByteAt(upto++, (byte) sepLabel);
+          spare.setByteAt(upto++, b);
+        } else {
+          spare.grow(upto + 1);
+          spare.setByteAt(upto++, b);
+        }
+      }
+      spare.setLength(upto);
+      return spare.get();
+    }
+  }
+
+  // Replaces SEP with epsilon or remaps them if
+  // we were asked to preserve them:
+  private static Automaton replaceSep(Automaton a, boolean preserveSep, int sepLabel) {
+
+    Automaton result = new Automaton();
+
+    // Copy all states over
+    int numStates = a.getNumStates();
+    for (int s = 0; s < numStates; s++) {
+      result.createState();
+      result.setAccept(s, a.isAccept(s));
+    }
+
+    // Go in reverse topo sort so we know we only have to
+    // make one pass:
+    Transition t = new Transition();
+    int[] topoSortStates = topoSortStates(a);
+    for (int i = 0; i < topoSortStates.length; i++) {
+      int state = topoSortStates[topoSortStates.length - 1 - i];
+      int count = a.initTransition(state, t);
+      for (int j = 0; j < count; j++) {
+        a.getNextTransition(t);
+        if (t.min == TokenStreamToAutomaton.POS_SEP) {
+          assert t.max == TokenStreamToAutomaton.POS_SEP;
+          if (preserveSep) {
+            // Remap to SEP_LABEL:
+            result.addTransition(state, t.dest, sepLabel);
+          } else {
+            result.addEpsilon(state, t.dest);
+          }
+        } else if (t.min == TokenStreamToAutomaton.HOLE) {
+          assert t.max == TokenStreamToAutomaton.HOLE;
+
+          // Just remove the hole: there will then be two
+          // SEP tokens next to each other, which will only
+          // match another hole at search time.  Note that
+          // it will also match an empty-string token ... if
+          // that's somehow a problem we can always map HOLE
+          // to a dedicated byte (and escape it in the
+          // input).
+          result.addEpsilon(state, t.dest);
+        } else {
+          result.addTransition(state, t.dest, t.min, t.max);
+        }
+      }
+    }
+
+    result.finishState();
+
+    return result;
+  }
+
+  private static int[] topoSortStates(Automaton a) {
+    int[] states = new int[a.getNumStates()];
+    final Set<Integer> visited = new HashSet<>();
+    final LinkedList<Integer> worklist = new LinkedList<>();
+    worklist.add(0);
+    visited.add(0);
+    int upto = 0;
+    states[upto] = 0;
+    upto++;
+    Transition t = new Transition();
+    while (worklist.size() > 0) {
+      int s = worklist.removeFirst();
+      int count = a.initTransition(s, t);
+      for (int i = 0; i < count; i++) {
+        a.getNextTransition(t);
+        if (!visited.contains(t.dest)) {
+          visited.add(t.dest);
+          worklist.add(t.dest);
+          states[upto++] = t.dest;
+        }
+      }
+    }
+    return states;
+  }
+
+  public interface ByteTermAttribute extends TermToBytesRefAttribute {
+    // marker interface
+
+    /**
+     * Return the builder from which the term is derived.
+     */
+    public BytesRefBuilder builder();
+
+    public CharSequence toUTF16();
+  }
+
+  public static final class ByteTermAttributeImpl extends AttributeImpl implements ByteTermAttribute, TermToBytesRefAttribute {
+    private final BytesRefBuilder bytes = new BytesRefBuilder();
+    private CharsRefBuilder charsRef;
+
+    @Override
+    public void fillBytesRef() {
+      // does nothing - we change in place
+    }
+
+    @Override
+    public BytesRefBuilder builder() {
+      return bytes;
+    }
+
+    @Override
+    public BytesRef getBytesRef() {
+      return bytes.get();
+    }
+
+    @Override
+    public void clear() {
+      bytes.clear();
+    }
+
+    @Override
+    public void copyTo(AttributeImpl target) {
+      ByteTermAttributeImpl other = (ByteTermAttributeImpl) target;
+      other.bytes.copyBytes(bytes);
+    }
+
+    @Override
+    public CharSequence toUTF16() {
+      if (charsRef == null) {
+        charsRef = new CharsRefBuilder();
+      }
+      charsRef.copyUTF8Bytes(getBytesRef());
+      return charsRef.get();
+    }
+  }
+}

Added: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggester.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggester.java?rev=1669698&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggester.java (added)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggester.java Fri Mar 27 22:37:49 2015
@@ -0,0 +1,324 @@
+package org.apache.lucene.search.suggest.document;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.search.CollectionTerminatedException;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.suggest.analyzing.FSTUtil;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.ByteArrayDataOutput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRefBuilder;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.fst.ByteSequenceOutputs;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.PairOutputs;
+import org.apache.lucene.util.fst.PairOutputs.Pair;
+import org.apache.lucene.util.fst.PositiveIntOutputs;
+import org.apache.lucene.util.fst.Util;
+
+import static org.apache.lucene.search.suggest.document.NRTSuggester.PayLoadProcessor.parseDocID;
+import static org.apache.lucene.search.suggest.document.NRTSuggester.PayLoadProcessor.parseSurfaceForm;
+
+/**
+ * <p>
+ * NRTSuggester returns Top N completions with corresponding documents matching a provided automaton.
+ * The completions are returned in descending order of their corresponding weight.
+ * Deleted documents are filtered out in near real time using the provided reader.
+ * A {@link org.apache.lucene.search.DocIdSet} can be passed in at query time to filter out documents.
+ * </p>
+ * <p>
+ * See {@link #lookup(LeafReader, Automaton, int, DocIdSet, TopSuggestDocsCollector)} for more implementation
+ * details.
+ * <p>
+ * Builder: {@link NRTSuggesterBuilder}
+ * </p>
+ * <p>
+ * FST Format:
+ * <ul>
+ *   <li>Input: analyzed forms of input terms</li>
+ *   <li>Output: Pair&lt;Long, BytesRef&gt; containing weight, surface form and docID</li>
+ * </ul>
+ * <p>
+ * NOTE:
+ * <ul>
+ *   <li>currently only {@link org.apache.lucene.search.DocIdSet} with random access capabilities are supported.</li>
+ *   <li>having too many deletions or using a very restrictive filter can make the search inadmissible due to
+ *     over-pruning of potential paths</li>
+ *   <li>when a {@link org.apache.lucene.search.DocIdSet} is used, it is assumed that the filter will roughly
+ *     filter out half the number of documents that match the provided automaton</li>
+ *   <li>lookup performance will degrade as more accepted completions lead to filtered out documents</li>
+ * </ul>
+ *
+ */
+final class NRTSuggester implements Accountable {
+
+  /**
+   * FST<Weight,Surface>:
+   * input is the analyzed form, with a null byte between terms
+   * and a {@link NRTSuggesterBuilder#END_BYTE} to denote the
+   * end of the input
+   * weight is a long
+   * surface is the original, unanalyzed form followed by the docID
+   */
+  private final FST<Pair<Long, BytesRef>> fst;
+
+  /**
+   * Highest number of analyzed paths we saw for any single
+   * input surface form. This can be > 1, when index analyzer
+   * creates graphs or if multiple surface form(s) yields the
+   * same analyzed form
+   */
+  private final int maxAnalyzedPathsPerOutput;
+
+  /**
+   * Separator used between surface form and its docID in the FST output
+   */
+  private final int payloadSep;
+
+  /**
+   * Label used to denote the end of an input in the FST and
+   * the beginning of dedup bytes
+   */
+  private final int endByte;
+
+  /**
+   * Maximum queue depth for TopNSearcher
+   *
+   * NOTE: value should be <= Integer.MAX_VALUE
+   */
+  private static final long MAX_TOP_N_QUEUE_SIZE = 1000;
+
+  private NRTSuggester(FST<Pair<Long, BytesRef>> fst, int maxAnalyzedPathsPerOutput, int payloadSep, int endByte) {
+    this.fst = fst;
+    this.maxAnalyzedPathsPerOutput = maxAnalyzedPathsPerOutput;
+    this.payloadSep = payloadSep;
+    this.endByte = endByte;
+  }
+
+  @Override
+  public long ramBytesUsed() {
+    return fst == null ? 0 : fst.ramBytesUsed();
+  }
+
+  @Override
+  public Collection<Accountable> getChildResources() {
+    return Collections.emptyList();
+  }
+
+  private static Comparator<Pair<Long, BytesRef>> getComparator() {
+    return new Comparator<Pair<Long, BytesRef>>() {
+      @Override
+      public int compare(Pair<Long, BytesRef> o1, Pair<Long, BytesRef> o2) {
+        return Long.compare(o1.output1, o2.output1);
+      }
+    };
+  }
+
+  /**
+   * Collects at most Top <code>num</code> completions, filtered by <code>filter</code> on
+   * corresponding documents, which has a prefix accepted by <code>automaton</code>
+   * <p>
+   * Supports near real time deleted document filtering using <code>reader</code>
+   * <p>
+   * {@link TopSuggestDocsCollector#collect(int, CharSequence, long)} is called
+   * for every matched completion
+   * <p>
+   * Completion collection can be early terminated by throwing {@link org.apache.lucene.search.CollectionTerminatedException}
+   */
+  public void lookup(final LeafReader reader, final Automaton automaton, final int num, final DocIdSet filter, final TopSuggestDocsCollector collector) {
+    final Bits filterDocs;
+    try {
+      if (filter != null) {
+        if (filter.iterator() == null) {
+          return;
+        }
+        if (filter.bits() == null) {
+          throw new IllegalArgumentException("DocIDSet does not provide random access interface");
+        } else {
+          filterDocs = filter.bits();
+        }
+      } else {
+        filterDocs = null;
+      }
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+
+    int queueSize = getMaxTopNSearcherQueueSize(num, reader, filterDocs != null);
+    if (queueSize == -1) {
+      return;
+    }
+
+    final Bits liveDocs = reader.getLiveDocs();
+    try {
+      final List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(automaton, fst);
+      Util.TopNSearcher<Pair<Long, BytesRef>> searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, num, queueSize, getComparator()) {
+
+        private final CharsRefBuilder spare = new CharsRefBuilder();
+
+        @Override
+        protected boolean acceptResult(IntsRef input, Pair<Long, BytesRef> output) {
+          int payloadSepIndex = parseSurfaceForm(output.output2, payloadSep, spare);
+          int docID = parseDocID(output.output2, payloadSepIndex);
+
+          // filter out deleted docs only if no filter is set
+          if (filterDocs == null && liveDocs != null && !liveDocs.get(docID)) {
+            return false;
+          }
+
+          // filter by filter context
+          if (filterDocs != null && !filterDocs.get(docID)) {
+            return false;
+          }
+
+          try {
+            collector.collect(docID, spare.toCharsRef(), decode(output.output1));
+            return true;
+          } catch (IOException e) {
+            throw new RuntimeException(e);
+          }
+        }
+      };
+
+      // TODO: add fuzzy support
+      for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
+        searcher.addStartPaths(path.fstNode, path.output, false, path.input);
+      }
+
+      try {
+        // hits are also returned by search()
+        // we do not use it, instead collect at acceptResult
+        Util.TopResults<Pair<Long, BytesRef>> search = searcher.search();
+        // search admissibility is not guaranteed
+        // see comment on getMaxTopNSearcherQueueSize
+        // assert  search.isComplete;
+      } catch (CollectionTerminatedException e) {
+        // terminate
+      }
+
+    } catch (IOException bogus) {
+      throw new RuntimeException(bogus);
+    }
+  }
+
+  /**
+   * Simple heuristics to try to avoid over-pruning potential suggestions by the
+   * TopNSearcher. Since suggestion entries can be rejected if they belong
+   * to a deleted document, the length of the TopNSearcher queue has to
+   * be increased by some factor, to account for the filtered out suggestions.
+   * This heuristic will try to make the searcher admissible, but the search
+   * can still lead to over-pruning
+   * <p>
+   * If a <code>filter</code> is applied, the queue size is increased by
+   * half the number of live documents.
+   * <p>
+   * The maximum queue size is {@link #MAX_TOP_N_QUEUE_SIZE}
+   */
+  private int getMaxTopNSearcherQueueSize(int num, LeafReader reader, boolean filterEnabled) {
+    double liveDocsRatio = calculateLiveDocRatio(reader.numDocs(), reader.maxDoc());
+    if (liveDocsRatio == -1) {
+      return -1;
+    }
+    long maxQueueSize = num * maxAnalyzedPathsPerOutput;
+    // liveDocRatio can be at most 1.0 (if no docs were deleted)
+    assert liveDocsRatio <= 1.0d;
+    maxQueueSize = (long) (maxQueueSize / liveDocsRatio);
+    if (filterEnabled) {
+      maxQueueSize = maxQueueSize + (reader.numDocs()/2);
+    }
+    return (int) Math.min(MAX_TOP_N_QUEUE_SIZE, maxQueueSize);
+  }
+
+  private static double calculateLiveDocRatio(int numDocs, int maxDocs) {
+    return (numDocs > 0) ? ((double) numDocs / maxDocs) : -1;
+  }
+
+  /**
+   * Loads a {@link NRTSuggester} from {@link org.apache.lucene.store.IndexInput}
+   */
+  public static NRTSuggester load(IndexInput input) throws IOException {
+    final FST<Pair<Long, BytesRef>> fst = new FST<>(input, new PairOutputs<>(
+        PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
+
+    /* read some meta info */
+    int maxAnalyzedPathsPerOutput = input.readVInt();
+    int endByte = input.readVInt();
+    int payloadSep = input.readVInt();
+
+    return new NRTSuggester(fst, maxAnalyzedPathsPerOutput, payloadSep, endByte);
+  }
+
+  static long encode(long input) {
+    if (input < 0) {
+      throw new UnsupportedOperationException("cannot encode value: " + input);
+    }
+    return Long.MAX_VALUE - input;
+  }
+
+  static long decode(long output) {
+    return (Long.MAX_VALUE - output);
+  }
+
+  /**
+   * Helper to encode/decode payload (surface + PAYLOAD_SEP + docID) output
+   */
+  static final class PayLoadProcessor {
+    final static private int MAX_DOC_ID_LEN_WITH_SEP = 6; // vint takes at most 5 bytes
+
+    static int parseSurfaceForm(final BytesRef output, int payloadSep, CharsRefBuilder spare) {
+      int surfaceFormLen = -1;
+      for (int i = 0; i < output.length; i++) {
+        if (output.bytes[output.offset + i] == payloadSep) {
+          surfaceFormLen = i;
+          break;
+        }
+      }
+      assert surfaceFormLen != -1 : "no payloadSep found, unable to determine surface form";
+      spare.copyUTF8Bytes(output.bytes, output.offset, surfaceFormLen);
+      return surfaceFormLen;
+    }
+
+    static int parseDocID(final BytesRef output, int payloadSepIndex) {
+      assert payloadSepIndex != -1 : "payload sep index can not be -1";
+      ByteArrayDataInput input = new ByteArrayDataInput(output.bytes, payloadSepIndex + output.offset + 1, output.length - (payloadSepIndex + output.offset));
+      return input.readVInt();
+    }
+
+    static BytesRef make(final BytesRef surface, int docID, int payloadSep) throws IOException {
+      int len = surface.length + MAX_DOC_ID_LEN_WITH_SEP;
+      byte[] buffer = new byte[len];
+      ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
+      output.writeBytes(surface.bytes, surface.length - surface.offset);
+      output.writeByte((byte) payloadSep);
+      output.writeVInt(docID);
+      return new BytesRef(buffer, 0, output.getPosition());
+    }
+  }
+}

Added: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java?rev=1669698&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java (added)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java Fri Mar 27 22:37:49 2015
@@ -0,0 +1,165 @@
+package org.apache.lucene.search.suggest.document;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.PriorityQueue;
+
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.IntsRefBuilder;
+import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.ByteSequenceOutputs;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.PairOutputs;
+import org.apache.lucene.util.fst.PositiveIntOutputs;
+import org.apache.lucene.util.fst.Util;
+
+import static org.apache.lucene.search.suggest.document.NRTSuggester.encode;
+
+/**
+ * Builder for {@link NRTSuggester}
+ *
+ */
+final class NRTSuggesterBuilder {
+
+  /**
+   * Label used to separate surface form and docID
+   * in the output
+   */
+  public static final int PAYLOAD_SEP = '\u001F';
+
+  /**
+   * Marks end of the analyzed input and start of dedup
+   * byte.
+   */
+  private static final int END_BYTE = 0x0;
+
+  private final PairOutputs<Long, BytesRef> outputs;
+  private final Builder<PairOutputs.Pair<Long, BytesRef>> builder;
+  private final IntsRefBuilder scratchInts = new IntsRefBuilder();
+  private final BytesRefBuilder analyzed = new BytesRefBuilder();
+  private final PriorityQueue<Entry> entries;
+  private final int payloadSep;
+  private final int endByte;
+
+  private int maxAnalyzedPathsPerOutput = 0;
+
+  /**
+   * Create a builder for {@link NRTSuggester}
+   */
+  public NRTSuggesterBuilder() {
+    this.payloadSep = PAYLOAD_SEP;
+    this.endByte = END_BYTE;
+    this.outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
+    this.entries = new PriorityQueue<>();
+    this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+  }
+
+  /**
+   * Initializes an FST input term to add entries against
+   */
+  public void startTerm(BytesRef analyzed) {
+    this.analyzed.copyBytes(analyzed);
+    this.analyzed.append((byte) endByte);
+  }
+
+  /**
+   * Adds an entry for the latest input term, should be called after
+   * {@link #startTerm(org.apache.lucene.util.BytesRef)} on the desired input
+   */
+  public void addEntry(int docID, BytesRef surfaceForm, long weight) throws IOException {
+    BytesRef payloadRef = NRTSuggester.PayLoadProcessor.make(surfaceForm, docID, payloadSep);
+    entries.add(new Entry(payloadRef, encode(weight)));
+  }
+
+  /**
+   * Writes all the entries for the FST input term
+   */
+  public void finishTerm() throws IOException {
+    int numArcs = 0;
+    int numDedupBytes = 1;
+    analyzed.grow(analyzed.length() + 1);
+    analyzed.setLength(analyzed.length() + 1);
+    for (Entry entry : entries) {
+      if (numArcs == maxNumArcsForDedupByte(numDedupBytes)) {
+        analyzed.setByteAt(analyzed.length() - 1, (byte) (numArcs));
+        analyzed.grow(analyzed.length() + 1);
+        analyzed.setLength(analyzed.length() + 1);
+        numArcs = 0;
+        numDedupBytes++;
+      }
+      analyzed.setByteAt(analyzed.length() - 1, (byte) numArcs++);
+      Util.toIntsRef(analyzed.get(), scratchInts);
+      builder.add(scratchInts.get(), outputs.newPair(entry.weight, entry.payload));
+    }
+    maxAnalyzedPathsPerOutput = Math.max(maxAnalyzedPathsPerOutput, entries.size());
+    entries.clear();
+  }
+
+  /**
+   * Builds and stores a FST that can be loaded with
+   * {@link NRTSuggester#load(org.apache.lucene.store.IndexInput)}
+   */
+  public boolean store(DataOutput output) throws IOException {
+    final FST<PairOutputs.Pair<Long, BytesRef>> build = builder.finish();
+    if (build == null) {
+      return false;
+    }
+    build.save(output);
+
+    /* write some more meta-info */
+    assert maxAnalyzedPathsPerOutput > 0;
+    output.writeVInt(maxAnalyzedPathsPerOutput);
+    output.writeVInt(END_BYTE);
+    output.writeVInt(PAYLOAD_SEP);
+    return true;
+  }
+
+  /**
+   * Num arcs for nth dedup byte:
+   * if n <= 5: 1 + (2 * n)
+   * else: (1 + (2 * n)) * n
+   * <p>
+   * TODO: is there a better way to make the fst built to be
+   * more TopNSearcher friendly?
+   */
+  private static int maxNumArcsForDedupByte(int currentNumDedupBytes) {
+    int maxArcs = 1 + (2 * currentNumDedupBytes);
+    if (currentNumDedupBytes > 5) {
+      maxArcs *= currentNumDedupBytes;
+    }
+    return Math.min(maxArcs, 255);
+  }
+
+  private final static class Entry implements Comparable<Entry> {
+    final BytesRef payload;
+    final long weight;
+
+    public Entry(BytesRef payload, long weight) {
+      this.payload = payload;
+      this.weight = weight;
+    }
+
+    @Override
+    public int compareTo(Entry o) {
+      return Long.compare(weight, o.weight);
+    }
+  }
+}

Added: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java?rev=1669698&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java (added)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java Fri Mar 27 22:37:49 2015
@@ -0,0 +1,123 @@
+package org.apache.lucene.search.suggest.document;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.store.OutputStreamDataOutput;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * <p>
+ * Field that indexes a string value and a weight as a weighted completion
+ * against a named suggester.
+ * Field is tokenized, not stored and stores documents, frequencies and positions.
+ * Field can be used to provide near real time document suggestions.
+ * </p>
+ * <p>
+ * Besides the usual {@link org.apache.lucene.analysis.Analyzer}s,
+ * {@link CompletionAnalyzer}
+ * can be used to tune suggest field only parameters
+ * (e.g. preserving token seperators, preserving position increments
+ * when converting the token stream to an automaton)
+ * </p>
+ * <p>
+ * Example indexing usage:
+ * <pre class="prettyprint">
+ * document.add(new SuggestField(name, "suggestion", 4));
+ * </pre>
+ * To perform document suggestions based on the this field, use
+ * {@link SuggestIndexSearcher#suggest(String, CharSequence, int, org.apache.lucene.search.Filter)}
+ * <p>
+ * Example query usage:
+ * <pre class="prettyprint">
+ * SuggestIndexSearcher indexSearcher = ..
+ * indexSearcher.suggest(name, "su", 2)
+ * </pre>
+ *
+ * @lucene.experimental
+ */
+public class SuggestField extends Field {
+
+  private static final FieldType FIELD_TYPE = new FieldType();
+
+  static {
+    FIELD_TYPE.setTokenized(true);
+    FIELD_TYPE.setStored(false);
+    FIELD_TYPE.setStoreTermVectors(false);
+    FIELD_TYPE.setOmitNorms(false);
+    FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
+    FIELD_TYPE.freeze();
+  }
+
+  private final BytesRef surfaceForm;
+  private final long weight;
+
+  /**
+   * Creates a {@link SuggestField}
+   *
+   * @param name   of the field
+   * @param value  to get suggestions on
+   * @param weight weight of the suggestion
+   */
+  public SuggestField(String name, String value, long weight) {
+    super(name, value, FIELD_TYPE);
+    if (weight < 0l) {
+      throw new IllegalArgumentException("weight must be >= 0");
+    }
+    this.surfaceForm = new BytesRef(value);
+    this.weight = weight;
+  }
+
+  @Override
+  public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) throws IOException {
+    TokenStream stream = super.tokenStream(analyzer, reuse);
+    CompletionTokenStream completionStream;
+    if (stream instanceof CompletionTokenStream) {
+      completionStream = (CompletionTokenStream) stream;
+    } else {
+      completionStream = new CompletionTokenStream(stream);
+    }
+    BytesRef suggestPayload = buildSuggestPayload(surfaceForm, weight, (char) completionStream.sepLabel());
+    completionStream.setPayload(suggestPayload);
+    return completionStream;
+  }
+
+  private BytesRef buildSuggestPayload(BytesRef surfaceForm, long weight, char sepLabel) throws IOException {
+    for (int i = 0; i < surfaceForm.length; i++) {
+      if (surfaceForm.bytes[i] == sepLabel) {
+        assert sepLabel == '\u001f';
+        throw new IllegalArgumentException(
+            "surface form cannot contain unit separator character U+001F; this character is reserved");
+      }
+    }
+    ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
+    try (OutputStreamDataOutput output = new OutputStreamDataOutput(byteArrayOutputStream)) {
+      output.writeVInt(surfaceForm.length);
+      output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
+      output.writeVLong(weight + 1);
+    }
+    return new BytesRef(byteArrayOutputStream.toByteArray());
+  }
+}

Added: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestIndexSearcher.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestIndexSearcher.java?rev=1669698&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestIndexSearcher.java (added)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestIndexSearcher.java Fri Mar 27 22:37:49 2015
@@ -0,0 +1,150 @@
+package org.apache.lucene.search.suggest.document;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.util.automaton.Automaton;
+
+import static org.apache.lucene.search.suggest.document.CompletionFieldsProducer.CompletionTerms;
+
+/**
+ * Adds document suggest capabilities to IndexSearcher
+ *
+ * @lucene.experimental
+ */
+public class SuggestIndexSearcher extends IndexSearcher {
+
+  private final Analyzer queryAnalyzer;
+
+  /**
+   * Creates a searcher with document suggest capabilities
+   * for <code>reader</code>.
+   * <p>
+   * Suggestion <code>key</code> is analyzed with <code>queryAnalyzer</code>
+   */
+  public SuggestIndexSearcher(IndexReader reader, Analyzer queryAnalyzer) {
+    super(reader);
+    this.queryAnalyzer = queryAnalyzer;
+  }
+
+  /**
+   * Calls {@link #suggest(String, CharSequence, int, Filter)}
+   * with no document filter
+   */
+  public TopSuggestDocs suggest(String field, CharSequence key, int num) throws IOException {
+    return suggest(field, key, num, (Filter) null);
+  }
+
+  /**
+   * Calls {@link #suggest(String, CharSequence, int, Filter, TopSuggestDocsCollector)}
+   * with no document filter
+   */
+  public void suggest(String field, CharSequence key, int num, TopSuggestDocsCollector collector) throws IOException {
+    suggest(field, key, num, null, collector);
+  }
+
+  /**
+   * Suggests at most <code>num</code> documents filtered by <code>filter</code>
+   * that completes to <code>key</code> for a suggest <code>field</code>
+   * <p>
+   * Returns at most Top <code>num</code> document ids with corresponding completion and weight pair
+   *
+   * @throws java.lang.IllegalArgumentException if <code>filter</code> does not provide a random access
+   *                                            interface or if <code>field</code> is not a {@link SuggestField}
+   */
+  public TopSuggestDocs suggest(String field, CharSequence key, int num, Filter filter) throws IOException {
+    TopSuggestDocsCollector collector = new TopSuggestDocsCollector(num);
+    suggest(field, key, num, filter, collector);
+    return collector.get();
+  }
+
+  /**
+   * Suggests at most <code>num</code> documents filtered by <code>filter</code>
+   * that completes to <code>key</code> for a suggest <code>field</code>
+   * <p>
+   * Collect completions with {@link TopSuggestDocsCollector}
+   * The completions are collected in order of the suggest <code>field</code> weight.
+   * There can be more than one collection of the same document, if the <code>key</code>
+   * matches multiple <code>field</code> values of the same document
+   *
+   * @throws java.lang.IllegalArgumentException if <code>filter</code> does not provide a random access
+   *                                            interface or if <code>field</code> is not a {@link SuggestField}
+   */
+  public void suggest(String field, CharSequence key, int num, Filter filter, TopSuggestDocsCollector collector) throws IOException {
+    // verify input
+    if (field == null) {
+      throw new IllegalArgumentException("'field' can not be null");
+    }
+    if (num <= 0) {
+      throw new IllegalArgumentException("'num' should be > 0");
+    }
+    if (collector == null) {
+      throw new IllegalArgumentException("'collector' can not be null");
+    }
+
+    // build query automaton
+    CompletionAnalyzer analyzer;
+    if (queryAnalyzer instanceof CompletionAnalyzer) {
+      analyzer = (CompletionAnalyzer) queryAnalyzer;
+    } else {
+      analyzer = new CompletionAnalyzer(queryAnalyzer);
+    }
+    final Automaton automaton = analyzer.toAutomaton(field, key);
+
+    // collect results
+    for (LeafReaderContext context : getIndexReader().leaves()) {
+      TopSuggestDocsCollector leafCollector = (TopSuggestDocsCollector) collector.getLeafCollector(context);
+      LeafReader reader = context.reader();
+      Terms terms = reader.terms(field);
+      if (terms == null) {
+        continue;
+      }
+      NRTSuggester suggester;
+      if (terms instanceof CompletionTerms) {
+        CompletionTerms completionTerms = (CompletionTerms) terms;
+        suggester = completionTerms.suggester();
+      } else {
+        throw new IllegalArgumentException(field + " is not a SuggestField");
+      }
+      if (suggester == null) {
+        // a segment can have a null suggester
+        // i.e. no FST was built
+        continue;
+      }
+
+      DocIdSet docIdSet = null;
+      if (filter != null) {
+        docIdSet = filter.getDocIdSet(context, reader.getLiveDocs());
+        if (docIdSet == null) {
+          // filter matches no docs in current leave
+          continue;
+        }
+      }
+      suggester.lookup(reader, automaton, num, docIdSet, leafCollector);
+    }
+  }
+}

Added: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestScoreDocPriorityQueue.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestScoreDocPriorityQueue.java?rev=1669698&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestScoreDocPriorityQueue.java (added)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestScoreDocPriorityQueue.java Fri Mar 27 22:37:49 2015
@@ -0,0 +1,56 @@
+package org.apache.lucene.search.suggest.document;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.suggest.document.TopSuggestDocs.SuggestScoreDoc;
+import org.apache.lucene.util.PriorityQueue;
+
+/**
+ * Bounded priority queue for {@link SuggestScoreDoc}s.
+ * Priority is based on {@link SuggestScoreDoc#score} and tie
+ * is broken by {@link SuggestScoreDoc#doc}
+ */
+final class SuggestScoreDocPriorityQueue extends PriorityQueue<SuggestScoreDoc> {
+  /**
+   * Creates a new priority queue of the specified size.
+   */
+  public SuggestScoreDocPriorityQueue(int size) {
+    super(size);
+  }
+
+  @Override
+  protected boolean lessThan(SuggestScoreDoc a, SuggestScoreDoc b) {
+    if (a.score == b.score) {
+      // prefer smaller doc id, in case of a tie
+      return a.doc > b.doc;
+    }
+    return a.score < b.score;
+  }
+
+  /**
+   * Returns the top N results in descending order.
+   */
+  public SuggestScoreDoc[] getResults() {
+    int size = size();
+    SuggestScoreDoc[] res = new SuggestScoreDoc[size];
+    for (int i = size - 1; i >= 0; i--) {
+      res[i] = pop();
+    }
+    return res;
+  }
+}

Added: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/TopSuggestDocs.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/TopSuggestDocs.java?rev=1669698&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/TopSuggestDocs.java (added)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/TopSuggestDocs.java Fri Mar 27 22:37:49 2015
@@ -0,0 +1,111 @@
+package org.apache.lucene.search.suggest.document;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.suggest.Lookup;
+
+/**
+ * {@link org.apache.lucene.search.TopDocs} wrapper with
+ * an additional CharSequence key per {@link org.apache.lucene.search.ScoreDoc}
+ *
+ * @lucene.experimental
+ */
+public class TopSuggestDocs extends TopDocs {
+
+  /**
+   * Singleton for empty {@link TopSuggestDocs}
+   */
+  public final static TopSuggestDocs EMPTY = new TopSuggestDocs(0, new SuggestScoreDoc[0], 0);
+
+  /**
+   * {@link org.apache.lucene.search.ScoreDoc} with an
+   * additional CharSequence key
+   */
+  public static class SuggestScoreDoc extends ScoreDoc implements Comparable<SuggestScoreDoc> {
+
+    /**
+     * Matched completion key
+     */
+    public CharSequence key;
+
+    /**
+     * Creates a SuggestScoreDoc instance
+     *
+     * @param doc   document id (hit)
+     * @param key   matched completion
+     * @param score weight of the matched completion
+     */
+    public SuggestScoreDoc(int doc, CharSequence key, long score) {
+      // loss of precision but not magnitude
+      // implicit conversion from long -> float
+      super(doc, score);
+      this.key = key;
+    }
+
+    @Override
+    public int compareTo(SuggestScoreDoc o) {
+      return Lookup.CHARSEQUENCE_COMPARATOR.compare(key, o.key);
+    }
+  }
+
+  /**
+   * {@link org.apache.lucene.search.TopDocs} wrapper with
+   * {@link TopSuggestDocs.SuggestScoreDoc}
+   * instead of {@link org.apache.lucene.search.ScoreDoc}
+   */
+  public TopSuggestDocs(int totalHits, SuggestScoreDoc[] scoreDocs, float maxScore) {
+    super(totalHits, scoreDocs, maxScore);
+  }
+
+  /**
+   * Returns {@link TopSuggestDocs.SuggestScoreDoc}s
+   * for this instance
+   */
+  public SuggestScoreDoc[] scoreLookupDocs() {
+    return (SuggestScoreDoc[]) scoreDocs;
+  }
+
+  /**
+   * Returns a new TopSuggestDocs, containing topN results across
+   * the provided TopSuggestDocs, sorting by score. Each {@link TopSuggestDocs}
+   * instance must be sorted.
+   * Analogous to {@link org.apache.lucene.search.TopDocs#merge(int, org.apache.lucene.search.TopDocs[])}
+   * for {@link TopSuggestDocs}
+   *
+   * NOTE: assumes every <code>shardHit</code> is already sorted by score
+   */
+  public static TopSuggestDocs merge(int topN, TopSuggestDocs[] shardHits) {
+    SuggestScoreDocPriorityQueue priorityQueue = new SuggestScoreDocPriorityQueue(topN);
+    for (TopSuggestDocs shardHit : shardHits) {
+      for (SuggestScoreDoc scoreDoc : shardHit.scoreLookupDocs()) {
+        if (scoreDoc == priorityQueue.insertWithOverflow(scoreDoc)) {
+          break;
+        }
+      }
+    }
+    SuggestScoreDoc[] topNResults = priorityQueue.getResults();
+    if (topNResults.length > 0) {
+      return new TopSuggestDocs(topNResults.length, topNResults, topNResults[0].score);
+    } else {
+      return TopSuggestDocs.EMPTY;
+    }
+  }
+
+}