You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ar...@apache.org on 2015/03/27 23:37:49 UTC
svn commit: r1669698 [1/2] - in /lucene/dev/trunk/lucene: ./
suggest/src/java/org/apache/lucene/search/suggest/analyzing/
suggest/src/java/org/apache/lucene/search/suggest/document/
suggest/src/resources/META-INF/services/ suggest/src/test/org/apache/l...
Author: areek
Date: Fri Mar 27 22:37:49 2015
New Revision: 1669698
URL: http://svn.apache.org/r1669698
Log:
LUCENE-6339: Added Near-real time Document Suggester via custom postings format
Added:
lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/
lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/Completion50PostingsFormat.java (with props)
lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java (with props)
lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionFieldsConsumer.java (with props)
lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionFieldsProducer.java (with props)
lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionPostingsFormat.java (with props)
lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java (with props)
lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggester.java (with props)
lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java (with props)
lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java (with props)
lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestIndexSearcher.java (with props)
lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestScoreDocPriorityQueue.java (with props)
lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/TopSuggestDocs.java (with props)
lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/TopSuggestDocsCollector.java (with props)
lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/package-info.java (with props)
lucene/dev/trunk/lucene/suggest/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat (with props)
lucene/dev/trunk/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/
lucene/dev/trunk/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/SuggestFieldTest.java (with props)
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1669698&r1=1669697&r2=1669698&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Fri Mar 27 22:37:49 2015
@@ -19,6 +19,9 @@ New Features
for counting ranges that align with the underlying terms as defined by the
NumberRangePrefixTree (e.g. familiar date units like days). (David Smiley)
+* LUCENE-6339: Added Near-real time Document Suggester via custom postings format
+ (Areek Zillur, Mike McCandless, Simon Willnauer)
+
API Changes
* LUCENE-3312: The API of oal.document was restructured to
Modified: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java?rev=1669698&r1=1669697&r2=1669698&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java (original)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FSTUtil.java Fri Mar 27 22:37:49 2015
@@ -49,7 +49,7 @@ public class FSTUtil {
public final FST.Arc<T> fstNode;
/** Output of the path so far: */
- T output;
+ public final T output;
/** Input of the path so far: */
public final IntsRefBuilder input;
Added: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/Completion50PostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/Completion50PostingsFormat.java?rev=1669698&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/Completion50PostingsFormat.java (added)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/Completion50PostingsFormat.java Fri Mar 27 22:37:49 2015
@@ -0,0 +1,42 @@
+package org.apache.lucene.search.suggest.document;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.lucene.codecs.PostingsFormat;
+
+/**
+ * {@link org.apache.lucene.search.suggest.document.CompletionPostingsFormat}
+ * for {@link org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat}
+ *
+ * @lucene.experimental
+ */
+public class Completion50PostingsFormat extends CompletionPostingsFormat {
+
+ /**
+ * Sole Constructor
+ */
+ public Completion50PostingsFormat() {
+ super();
+ }
+
+ @Override
+ protected PostingsFormat delegatePostingsFormat() {
+ return PostingsFormat.forName("Lucene50");
+ }
+}
Added: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java?rev=1669698&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java (added)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionAnalyzer.java Fri Mar 27 22:37:49 2015
@@ -0,0 +1,173 @@
+package org.apache.lucene.search.suggest.document;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.AnalyzerWrapper;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.TokenStreamToAutomaton;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.Operations;
+import org.apache.lucene.util.automaton.Transition;
+
+/**
+ * Wraps an {@link org.apache.lucene.analysis.Analyzer}
+ * to provide additional completion-only tuning
+ * (e.g. preserving token separators, preserving position increments while converting
+ * a token stream to an automaton)
+ * <p>
+ * Can be used to index {@link SuggestField}
+ * and as a query analyzer to {@link SuggestIndexSearcher}
+ * <p>
+ * NOTE: In most cases, index and query analyzer should have same values for {@link #preservePositionIncrements}
+ * and {@link #preserveSep}
+ *
+ * @lucene.experimental
+ */
+public class CompletionAnalyzer extends AnalyzerWrapper {
+
+ /**
+ * Represents the separation between tokens, if
+ * <code>preserveSep</code> is <code>true</code>
+ * <p>
+ * Same label is used as a delimiter in the {@link org.apache.lucene.search.suggest.document.CompletionTokenStream}
+ * payload
+ */
+ final static int SEP_LABEL = NRTSuggesterBuilder.PAYLOAD_SEP;
+
+ /**
+ * Represent a hole character, inserted by {@link org.apache.lucene.analysis.TokenStreamToAutomaton}
+ */
+ final static int HOLE_CHARACTER = TokenStreamToAutomaton.HOLE;
+
+ final static int DEFAULT_MAX_GRAPH_EXPANSIONS = -1;
+ final static boolean DEFAULT_PRESERVE_SEP = true;
+ final static boolean DEFAULT_PRESERVE_POSITION_INCREMENTS = true;
+
+ private final Analyzer analyzer;
+
+ /**
+ * Preserve separation between tokens
+ * when converting to an automaton
+ * <p>
+ * Defaults to <code>true</code>
+ */
+ private final boolean preserveSep;
+
+ /**
+ * Preserve position increments for tokens
+ * when converting to an automaton
+ * <p>
+ * Defaults to <code>true</code>
+ */
+ private final boolean preservePositionIncrements;
+
+ /**
+ * Sets the maximum number of graph expansions of a completion automaton
+ * <p>
+ * Defaults to <code>-1</code> (no limit)
+ */
+ private final int maxGraphExpansions;
+
+ /**
+ * Wraps an analyzer to convert it's output token stream to an automaton
+ *
+ * @param analyzer token stream to be converted to an automaton
+ * @param preserveSep Preserve separation between tokens when converting to an automaton
+ * @param preservePositionIncrements Preserve position increments for tokens when converting to an automaton
+ * @param maxGraphExpansions Sets the maximum number of graph expansions of a completion automaton
+ */
+ public CompletionAnalyzer(Analyzer analyzer, boolean preserveSep, boolean preservePositionIncrements, int maxGraphExpansions) {
+ super(PER_FIELD_REUSE_STRATEGY);
+ this.analyzer = analyzer;
+ this.preserveSep = preserveSep;
+ this.preservePositionIncrements = preservePositionIncrements;
+ this.maxGraphExpansions = maxGraphExpansions;
+ }
+
+ /**
+ * Calls {@link #CompletionAnalyzer(org.apache.lucene.analysis.Analyzer, boolean, boolean, int)}
+ * preserving token separation, position increments and no limit on graph expansions
+ */
+ public CompletionAnalyzer(Analyzer analyzer) {
+ this(analyzer, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, DEFAULT_MAX_GRAPH_EXPANSIONS);
+ }
+
+ /**
+ * Calls {@link #CompletionAnalyzer(org.apache.lucene.analysis.Analyzer, boolean, boolean, int)}
+ * with no limit on graph expansions
+ */
+ public CompletionAnalyzer(Analyzer analyzer, boolean preserveSep, boolean preservePositionIncrements) {
+ this(analyzer, preserveSep, preservePositionIncrements, DEFAULT_MAX_GRAPH_EXPANSIONS);
+ }
+
+ /**
+ * Calls {@link #CompletionAnalyzer(org.apache.lucene.analysis.Analyzer, boolean, boolean, int)}
+ * preserving token separation and position increments
+ */
+ public CompletionAnalyzer(Analyzer analyzer, int maxGraphExpansions) {
+ this(analyzer, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, maxGraphExpansions);
+ }
+
+ @Override
+ protected Analyzer getWrappedAnalyzer(String fieldName) {
+ return analyzer;
+ }
+
+ @Override
+ protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
+ CompletionTokenStream tokenStream = new CompletionTokenStream(components.getTokenStream(),
+ preserveSep, preservePositionIncrements, SEP_LABEL, maxGraphExpansions);
+ return new TokenStreamComponents(components.getTokenizer(), tokenStream);
+ }
+
+ /**
+ * Converts <code>key</code> to an automaton using
+ * {@link #preservePositionIncrements}, {@link #preserveSep}
+ * and {@link #maxGraphExpansions}
+ */
+ public Automaton toAutomaton(String field, CharSequence key) throws IOException {
+ for (int i = 0; i < key.length(); i++) {
+ switch (key.charAt(i)) {
+ case HOLE_CHARACTER:
+ throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
+ case SEP_LABEL:
+ throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
+ default:
+ break;
+ }
+ }
+
+ try (TokenStream tokenStream = analyzer.tokenStream(field, key.toString())) {
+ try(CompletionTokenStream stream = new CompletionTokenStream(tokenStream,
+ preserveSep, preservePositionIncrements, SEP_LABEL, maxGraphExpansions)) {
+ return stream.toAutomaton(tokenStream);
+ }
+ }
+ }
+
+}
Added: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionFieldsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionFieldsConsumer.java?rev=1669698&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionFieldsConsumer.java (added)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionFieldsConsumer.java Fri Mar 27 22:37:49 2015
@@ -0,0 +1,192 @@
+package org.apache.lucene.search.suggest.document;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.IOUtils;
+
+import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.CODEC_NAME;
+import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.COMPLETION_VERSION_CURRENT;
+import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.DICT_EXTENSION;
+import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.INDEX_EXTENSION;
+
+/**
+ * <p>
+ * Weighted FSTs for any indexed {@link SuggestField} is built on {@link #write(Fields)}.
+ * A weighted FST maps the analyzed forms of a field to its
+ * surface form and document id. FSTs are stored in the CompletionDictionary (.lkp).
+ * </p>
+ * <p>
+ * The file offsets of a field's FST are stored in the CompletionIndex (.cmp)
+ * along with the field's internal number {@link FieldInfo#number} on {@link #close()}.
+ * </p>
+ *
+ */
+final class CompletionFieldsConsumer extends FieldsConsumer {
+
+ private final String delegatePostingsFormatName;
+ private final Map<String, Long> seenFields = new HashMap<>();
+ private final SegmentWriteState state;
+ private IndexOutput dictOut;
+ private FieldsConsumer delegateFieldsConsumer;
+
+ CompletionFieldsConsumer(PostingsFormat delegatePostingsFormat, SegmentWriteState state) throws IOException {
+ this.delegatePostingsFormatName = delegatePostingsFormat.getName();
+ this.state = state;
+ String dictFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, DICT_EXTENSION);
+ boolean success = false;
+ try {
+ this.delegateFieldsConsumer = delegatePostingsFormat.fieldsConsumer(state);
+ dictOut = state.directory.createOutput(dictFile, state.context);
+ CodecUtil.writeIndexHeader(dictOut, CODEC_NAME, COMPLETION_VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
+ success = true;
+ } finally {
+ if (success == false) {
+ IOUtils.closeWhileHandlingException(dictOut, delegateFieldsConsumer);
+ }
+ }
+ }
+
+ @Override
+ public void write(Fields fields) throws IOException {
+ delegateFieldsConsumer.write(fields);
+
+ for (String field : fields) {
+ CompletionTermWriter termWriter = new CompletionTermWriter();
+ Terms terms = fields.terms(field);
+ TermsEnum termsEnum = terms.iterator(null);
+
+ // write terms
+ BytesRef term;
+ while ((term = termsEnum.next()) != null) {
+ termWriter.write(term, termsEnum);
+ }
+
+ // store lookup, if needed
+ long filePointer = dictOut.getFilePointer();
+ if (termWriter.finish(dictOut)) {
+ seenFields.put(field, filePointer);
+ }
+ }
+ }
+
+ private boolean closed = false;
+
+ @Override
+ public void close() throws IOException {
+ if (closed) {
+ return;
+ }
+ closed = true;
+ String indexFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, INDEX_EXTENSION);
+ boolean success = false;
+ try (IndexOutput indexOut = state.directory.createOutput(indexFile, state.context)) {
+ delegateFieldsConsumer.close();
+ CodecUtil.writeIndexHeader(indexOut, CODEC_NAME, COMPLETION_VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
+ /*
+ * we write the delegate postings format name so we can load it
+ * without getting an instance in the ctor
+ */
+ indexOut.writeString(delegatePostingsFormatName);
+ // write # of seen fields
+ indexOut.writeVInt(seenFields.size());
+ // write field numbers and dictOut offsets
+ for (Map.Entry<String, Long> seenField : seenFields.entrySet()) {
+ FieldInfo fieldInfo = state.fieldInfos.fieldInfo(seenField.getKey());
+ indexOut.writeVInt(fieldInfo.number);
+ indexOut.writeVLong(seenField.getValue());
+ }
+ CodecUtil.writeFooter(indexOut);
+ CodecUtil.writeFooter(dictOut);
+ IOUtils.close(dictOut);
+ success = true;
+ } finally {
+ if (success == false) {
+ IOUtils.closeWhileHandlingException(dictOut, delegateFieldsConsumer);
+ }
+ }
+ }
+
+ // builds an FST based on the terms written
+ private static class CompletionTermWriter {
+
+ private PostingsEnum postingsEnum = null;
+ private int docCount = 0;
+
+ private final BytesRefBuilder scratch = new BytesRefBuilder();
+ private final NRTSuggesterBuilder builder;
+
+ public CompletionTermWriter() {
+ builder = new NRTSuggesterBuilder();
+ }
+
+ /**
+ * Stores the built FST in <code>output</code>
+ * Returns true if there was anything stored, false otherwise
+ */
+ public boolean finish(IndexOutput output) throws IOException {
+ boolean stored = builder.store(output);
+ assert stored || docCount == 0 : "the FST is null but docCount is != 0 actual value: [" + docCount + "]";
+ return stored;
+ }
+
+ /**
+ * Writes all postings (surface form, weight, document id) for <code>term</code>
+ */
+ public void write(BytesRef term, TermsEnum termsEnum) throws IOException {
+ postingsEnum = termsEnum.postings(null, postingsEnum, PostingsEnum.PAYLOADS);
+ builder.startTerm(term);
+ int docFreq = 0;
+ while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+ int docID = postingsEnum.docID();
+ for (int i = 0; i < postingsEnum.freq(); i++) {
+ postingsEnum.nextPosition();
+ assert postingsEnum.getPayload() != null;
+ BytesRef payload = postingsEnum.getPayload();
+ ByteArrayDataInput input = new ByteArrayDataInput(payload.bytes, payload.offset, payload.length);
+ int len = input.readVInt();
+ scratch.grow(len);
+ scratch.setLength(len);
+ input.readBytes(scratch.bytes(), 0, scratch.length());
+ builder.addEntry(docID, scratch.get(), input.readVLong() - 1);
+ }
+ docFreq++;
+ docCount = Math.max(docCount, docFreq + 1);
+ }
+ builder.finishTerm();
+ }
+ }
+}
Added: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionFieldsProducer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionFieldsProducer.java?rev=1669698&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionFieldsProducer.java (added)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionFieldsProducer.java Fri Mar 27 22:37:49 2015
@@ -0,0 +1,228 @@
+package org.apache.lucene.search.suggest.document;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FilterLeafReader;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.Accountables;
+import org.apache.lucene.util.IOUtils;
+
+import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.CODEC_NAME;
+import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.COMPLETION_CODEC_VERSION;
+import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.COMPLETION_VERSION_CURRENT;
+import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.DICT_EXTENSION;
+import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.INDEX_EXTENSION;
+
+/**
+ * <p>
+ * Completion index (.cmp) is opened and read at instantiation to read in {@link SuggestField}
+ * numbers and their FST offsets in the Completion dictionary (.lkp).
+ * </p>
+ * <p>
+ * Completion dictionary (.lkp) is opened at instantiation and a field's FST is loaded
+ * into memory the first time it is requested via {@link #terms(String)}.
+ * </p>
+ * <p>
+ * NOTE: Only the footer is validated for Completion dictionary (.lkp) and not the checksum due
+ * to random access pattern and checksum validation being too costly at instantiation
+ * </p>
+ *
+ */
+final class CompletionFieldsProducer extends FieldsProducer {
+
+ private FieldsProducer delegateFieldsProducer;
+ private Map<String, CompletionsTermsReader> readers;
+ private IndexInput dictIn;
+
+ // copy ctr for merge instance
+ private CompletionFieldsProducer(FieldsProducer delegateFieldsProducer, Map<String, CompletionsTermsReader> readers) {
+ this.delegateFieldsProducer = delegateFieldsProducer;
+ this.readers = readers;
+ }
+
+ CompletionFieldsProducer(SegmentReadState state) throws IOException {
+ String indexFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, INDEX_EXTENSION);
+ delegateFieldsProducer = null;
+ boolean success = false;
+
+ try (ChecksumIndexInput index = state.directory.openChecksumInput(indexFile, state.context)) {
+ // open up dict file containing all fsts
+ String dictFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, DICT_EXTENSION);
+ dictIn = state.directory.openInput(dictFile, state.context);
+ CodecUtil.checkIndexHeader(dictIn, CODEC_NAME, COMPLETION_CODEC_VERSION, COMPLETION_VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
+ // just validate the footer for the dictIn
+ CodecUtil.retrieveChecksum(dictIn);
+
+ // open up index file (fieldNumber, offset)
+ CodecUtil.checkIndexHeader(index, CODEC_NAME, COMPLETION_CODEC_VERSION, COMPLETION_VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
+ // load delegate PF
+ PostingsFormat delegatePostingsFormat = PostingsFormat.forName(index.readString());
+ delegateFieldsProducer = delegatePostingsFormat.fieldsProducer(state);
+
+ // read suggest field numbers and their offsets in the terms file from index
+ int numFields = index.readVInt();
+ readers = new HashMap<>(numFields);
+ for (int i = 0; i < numFields; i++) {
+ int fieldNumber = index.readVInt();
+ long offset = index.readVLong();
+ FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNumber);
+ // we don't load the FST yet
+ readers.put(fieldInfo.name, new CompletionsTermsReader(offset));
+ }
+ CodecUtil.checkFooter(index);
+ success = true;
+ } finally {
+ if (success == false) {
+ IOUtils.closeWhileHandlingException(delegateFieldsProducer, dictIn);
+ }
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ boolean success = false;
+ try {
+ delegateFieldsProducer.close();
+ IOUtils.close(dictIn);
+ success = true;
+ } finally {
+ if (success == false) {
+ IOUtils.closeWhileHandlingException(delegateFieldsProducer, dictIn);
+ }
+ }
+ }
+
+ @Override
+ public void checkIntegrity() throws IOException {
+ delegateFieldsProducer.checkIntegrity();
+ // TODO: checkIntegrity should checksum the dictionary and index
+ }
+
+ @Override
+ public FieldsProducer getMergeInstance() throws IOException {
+ return new CompletionFieldsProducer(delegateFieldsProducer, readers);
+ }
+
+ @Override
+ public long ramBytesUsed() {
+ long ramBytesUsed = delegateFieldsProducer.ramBytesUsed();
+ for (CompletionsTermsReader reader : readers.values()) {
+ ramBytesUsed += reader.ramBytesUsed();
+ }
+ return ramBytesUsed;
+ }
+
+ @Override
+ public Collection<Accountable> getChildResources() {
+ List<Accountable> accountableList = new ArrayList<>();
+ for (Map.Entry<String, CompletionsTermsReader> readerEntry : readers.entrySet()) {
+ accountableList.add(Accountables.namedAccountable(readerEntry.getKey(), readerEntry.getValue()));
+ }
+ return Collections.unmodifiableCollection(accountableList);
+ }
+
+ @Override
+ public Iterator<String> iterator() {
+ return readers.keySet().iterator();
+ }
+
+ @Override
+ public Terms terms(String field) throws IOException {
+ return new CompletionTerms(delegateFieldsProducer.terms(field), readers.get(field));
+ }
+
+ @Override
+ public int size() {
+ return readers.size();
+ }
+
+ private class CompletionsTermsReader implements Accountable {
+ private final long offset;
+ private NRTSuggester suggester;
+
+ public CompletionsTermsReader(long offset) throws IOException {
+ assert offset >= 0l && offset < dictIn.length();
+ this.offset = offset;
+ }
+
+ public synchronized NRTSuggester suggester() throws IOException {
+ if (suggester == null) {
+ try (IndexInput dictClone = dictIn.clone()) { // let multiple fields load concurrently
+ dictClone.seek(offset);
+ suggester = NRTSuggester.load(dictClone);
+ }
+ }
+ return suggester;
+ }
+
+ @Override
+ public long ramBytesUsed() {
+ return (suggester != null) ? suggester.ramBytesUsed() : 0;
+ }
+
+ @Override
+ public Collection<Accountable> getChildResources() {
+ return Collections.emptyList();
+ }
+ }
+
+ /**
+ * Thin wrapper over {@link org.apache.lucene.index.Terms} with
+ * a {@link NRTSuggester}
+ */
+ public static class CompletionTerms extends FilterLeafReader.FilterTerms {
+
+ private final CompletionsTermsReader reader;
+
+ public CompletionTerms(Terms in, CompletionsTermsReader reader) {
+ super(in);
+ this.reader = reader;
+ }
+
+ /**
+ * Returns a {@link NRTSuggester} for the field
+ * or <code>null</code> if no FST
+ * was indexed for this field
+ */
+ public NRTSuggester suggester() throws IOException {
+ if (reader == null) {
+ return null;
+ }
+ return reader.suggester();
+ }
+ }
+
+}
Added: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionPostingsFormat.java?rev=1669698&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionPostingsFormat.java (added)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionPostingsFormat.java Fri Mar 27 22:37:49 2015
@@ -0,0 +1,121 @@
+package org.apache.lucene.search.suggest.document;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.fst.FST;
+
+/**
+ * <p>
+ * A {@link PostingsFormat} which supports document suggestion based on
+ * indexed {@link SuggestField}s.
+ * Document suggestion is based on an weighted FST which map analyzed
+ * terms of a {@link SuggestField} to its surface form and document id.
+ * </p>
+ * <p>
+ * Files:
+ * <ul>
+ * <li><tt>.lkp</tt>: <a href="#Completiondictionary">Completion Dictionary</a></li>
+ * <li><tt>.cmp</tt>: <a href="#Completionindex">Completion Index</a></li>
+ * </ul>
+ * <p>
+ * <a name="Completionictionary"></a>
+ * <h3>Completion Dictionary</h3>
+ * <p>The .lkp file contains an FST for each suggest field
+ * </p>
+ * <ul>
+ * <li>CompletionDict (.lkp) --> Header, FST<sup>NumSuggestFields</sup>, Footer</li>
+ * <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
+ * <!-- TODO: should the FST output be mentioned at all? -->
+ * <li>FST --> {@link FST FST<Long, BytesRef>}</li>
+ * <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
+ * </ul>
+ * <p>Notes:</p>
+ * <ul>
+ * <li>Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information
+ * for the Completion implementation.</li>
+ * <li>FST maps all analyzed forms to surface forms of a SuggestField</li>
+ * </ul>
+ * <a name="Completionindex"></a>
+ * <h3>Completion Index</h3>
+ * <p>The .cmp file contains an index into the completion dictionary, so that it can be
+ * accessed randomly.</p>
+ * <ul>
+ * <li>CompletionIndex (.cmp) --> Header, NumSuggestFields, Entry<sup>NumSuggestFields</sup>, Footer</li>
+ * <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
+ * <li>NumSuggestFields --> {@link DataOutput#writeVInt Uint32}</li>
+ * <li>Entry --> FieldNumber, CompletionDictionaryOffset</li>
+ * <li>FieldNumber --> {@link DataOutput#writeVInt Uint32}</li>
+ * <li>CompletionDictionaryOffset --> {@link DataOutput#writeVLong Uint64}</li>
+ * <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li>
+ * </ul>
+ * <p>Notes:</p>
+ * <ul>
+ * <li>Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information
+ * for the Completion implementation.</li>
+ * <li>NumSuggestFields is the number of suggest fields indexed</li>
+ * <li>FieldNumber is the fields number from {@link FieldInfos}. (.fnm)</li>
+ * <li>CompletionDictionaryOffset is the file offset of a field's FST in CompletionDictionary (.lkp)</li>
+ * </ul>
+ *
+ * @lucene.experimental
+ */
+public abstract class CompletionPostingsFormat extends PostingsFormat {
+
+ static final String CODEC_NAME = "completion";
+ static final int COMPLETION_CODEC_VERSION = 1;
+ static final int COMPLETION_VERSION_CURRENT = COMPLETION_CODEC_VERSION;
+ static final String INDEX_EXTENSION = "cmp";
+ static final String DICT_EXTENSION = "lkp";
+
+ /**
+ * Used only by core Lucene at read-time via Service Provider instantiation
+ */
+ public CompletionPostingsFormat() {
+ super(CODEC_NAME);
+ }
+
+ /**
+ * Concrete implementation should specify the delegating postings format
+ */
+ protected abstract PostingsFormat delegatePostingsFormat();
+
+ @Override
+ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
+ PostingsFormat delegatePostingsFormat = delegatePostingsFormat();
+ if (delegatePostingsFormat == null) {
+ throw new UnsupportedOperationException("Error - " + getClass().getName()
+ + " has been constructed without a choice of PostingsFormat");
+ }
+ return new CompletionFieldsConsumer(delegatePostingsFormat, state);
+ }
+
+ @Override
+ public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
+ return new CompletionFieldsProducer(state);
+ }
+}
Added: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java?rev=1669698&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java (added)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/CompletionTokenStream.java Fri Mar 27 22:37:49 2015
@@ -0,0 +1,358 @@
+package org.apache.lucene.search.suggest.document;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.Set;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.TokenStreamToAutomaton;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.CharsRefBuilder;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.Operations;
+import org.apache.lucene.util.automaton.Transition;
+import org.apache.lucene.util.fst.Util;
+
+import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_MAX_GRAPH_EXPANSIONS;
+import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_PRESERVE_POSITION_INCREMENTS;
+import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.DEFAULT_PRESERVE_SEP;
+import static org.apache.lucene.search.suggest.document.CompletionAnalyzer.SEP_LABEL;
+
+/**
+ * Token stream which converts a provided token stream to an automaton.
+ * The accepted strings enumeration from the automaton are available through the
+ * {@link org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute} attribute
+ * The token stream uses a {@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute} to store
+ * a completion's payload (see {@link CompletionTokenStream#setPayload(org.apache.lucene.util.BytesRef)})
+ *
+ */
+final class CompletionTokenStream extends TokenStream {
+
+ private final PayloadAttribute payloadAttr = addAttribute(PayloadAttribute.class);
+ private final PositionIncrementAttribute posAttr = addAttribute(PositionIncrementAttribute.class);
+ private final ByteTermAttribute bytesAtt = addAttribute(ByteTermAttribute.class);
+
+ private final TokenStream input;
+ private final boolean preserveSep;
+ private final boolean preservePositionIncrements;
+ private final int sepLabel;
+ private final int maxGraphExpansions;
+
+ private BytesRef payload;
+ private Iterator<IntsRef> finiteStrings;
+ private int posInc = -1;
+ private CharTermAttribute charTermAttribute;
+
+ /**
+ * Creates a token stream to convert <code>input</code> to a token stream
+ * of accepted strings by its automaton.
+ * <p>
+ * The token stream <code>input</code> is converted to an automaton
+ * with the default settings of {@link org.apache.lucene.search.suggest.document.CompletionAnalyzer}
+ */
+ public CompletionTokenStream(TokenStream input) {
+ this(input, DEFAULT_PRESERVE_SEP, DEFAULT_PRESERVE_POSITION_INCREMENTS, SEP_LABEL, DEFAULT_MAX_GRAPH_EXPANSIONS);
+ }
+
+ CompletionTokenStream(TokenStream input, boolean preserveSep, boolean preservePositionIncrements, int sepLabel, int maxGraphExpansions) {
+ // Don't call the super(input) ctor - this is a true delegate and has a new attribute source since we consume
+ // the input stream entirely in toFiniteStrings(input)
+ this.input = input;
+ this.preserveSep = preserveSep;
+ this.preservePositionIncrements = preservePositionIncrements;
+ this.sepLabel = sepLabel;
+ this.maxGraphExpansions = maxGraphExpansions;
+ }
+
+ /**
+ * Returns a separator label that is reserved for the payload
+ * in {@link CompletionTokenStream#setPayload(org.apache.lucene.util.BytesRef)}
+ */
+ public int sepLabel() {
+ return sepLabel;
+ }
+
+ /**
+ * Sets a payload available throughout successive token stream enumeration
+ */
+ public void setPayload(BytesRef payload) {
+ this.payload = payload;
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ clearAttributes();
+ if (finiteStrings == null) {
+ //TODO: make this return a Iterator<IntsRef> instead?
+ Automaton automaton = toAutomaton(input);
+ Set<IntsRef> strings = Operations.getFiniteStrings(automaton, maxGraphExpansions);
+
+ posInc = strings.size();
+ finiteStrings = strings.iterator();
+ }
+ if (finiteStrings.hasNext()) {
+ posAttr.setPositionIncrement(posInc);
+ /*
+ * this posInc encodes the number of paths that this surface form
+ * produced. Multi Fields have the same surface form and therefore sum up
+ */
+ posInc = 0;
+ Util.toBytesRef(finiteStrings.next(), bytesAtt.builder()); // now we have UTF-8
+ if (charTermAttribute != null) {
+ charTermAttribute.setLength(0);
+ charTermAttribute.append(bytesAtt.toUTF16());
+ }
+ if (payload != null) {
+ payloadAttr.setPayload(this.payload);
+ }
+ return true;
+ }
+
+ return false;
+ }
+
+ @Override
+ public void end() throws IOException {
+ super.end();
+ if (posInc == -1) {
+ input.end();
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ if (posInc == -1) {
+ input.close();
+ }
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ if (hasAttribute(CharTermAttribute.class)) {
+ // we only create this if we really need it to safe the UTF-8 to UTF-16 conversion
+ charTermAttribute = getAttribute(CharTermAttribute.class);
+ }
+ finiteStrings = null;
+ posInc = -1;
+ }
+
+ /**
+ * Converts <code>tokenStream</code> to an automaton
+ */
+ public Automaton toAutomaton(TokenStream tokenStream) throws IOException {
+ // TODO refactor this
+ // maybe we could hook up a modified automaton from TermAutomatonQuery here?
+ Automaton automaton = null;
+ try {
+ // Create corresponding automaton: labels are bytes
+ // from each analyzed token, with byte 0 used as
+ // separator between tokens:
+ final TokenStreamToAutomaton tsta;
+ if (preserveSep) {
+ tsta = new EscapingTokenStreamToAutomaton((char) SEP_LABEL);
+ } else {
+ // When we're not preserving sep, we don't steal 0xff
+ // byte, so we don't need to do any escaping:
+ tsta = new TokenStreamToAutomaton();
+ }
+ tsta.setPreservePositionIncrements(preservePositionIncrements);
+
+ automaton = tsta.toAutomaton(tokenStream);
+ } finally {
+ IOUtils.closeWhileHandlingException(tokenStream);
+ }
+
+ // TODO: we can optimize this somewhat by determinizing
+ // while we convert
+ automaton = replaceSep(automaton, preserveSep, SEP_LABEL);
+ // This automaton should not blow up during determinize:
+ return Operations.determinize(automaton, maxGraphExpansions);
+ }
+
+ /**
+ * Just escapes the 0xff byte (which we still for SEP).
+ */
+ private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton {
+
+ final BytesRefBuilder spare = new BytesRefBuilder();
+ private char sepLabel;
+
+ public EscapingTokenStreamToAutomaton(char sepLabel) {
+ this.sepLabel = sepLabel;
+ }
+
+ @Override
+ protected BytesRef changeToken(BytesRef in) {
+ int upto = 0;
+ for (int i = 0; i < in.length; i++) {
+ byte b = in.bytes[in.offset + i];
+ if (b == (byte) sepLabel) {
+ spare.grow(upto + 2);
+ spare.setByteAt(upto++, (byte) sepLabel);
+ spare.setByteAt(upto++, b);
+ } else {
+ spare.grow(upto + 1);
+ spare.setByteAt(upto++, b);
+ }
+ }
+ spare.setLength(upto);
+ return spare.get();
+ }
+ }
+
+ // Replaces SEP with epsilon or remaps them if
+ // we were asked to preserve them:
+ private static Automaton replaceSep(Automaton a, boolean preserveSep, int sepLabel) {
+
+ Automaton result = new Automaton();
+
+ // Copy all states over
+ int numStates = a.getNumStates();
+ for (int s = 0; s < numStates; s++) {
+ result.createState();
+ result.setAccept(s, a.isAccept(s));
+ }
+
+ // Go in reverse topo sort so we know we only have to
+ // make one pass:
+ Transition t = new Transition();
+ int[] topoSortStates = topoSortStates(a);
+ for (int i = 0; i < topoSortStates.length; i++) {
+ int state = topoSortStates[topoSortStates.length - 1 - i];
+ int count = a.initTransition(state, t);
+ for (int j = 0; j < count; j++) {
+ a.getNextTransition(t);
+ if (t.min == TokenStreamToAutomaton.POS_SEP) {
+ assert t.max == TokenStreamToAutomaton.POS_SEP;
+ if (preserveSep) {
+ // Remap to SEP_LABEL:
+ result.addTransition(state, t.dest, sepLabel);
+ } else {
+ result.addEpsilon(state, t.dest);
+ }
+ } else if (t.min == TokenStreamToAutomaton.HOLE) {
+ assert t.max == TokenStreamToAutomaton.HOLE;
+
+ // Just remove the hole: there will then be two
+ // SEP tokens next to each other, which will only
+ // match another hole at search time. Note that
+ // it will also match an empty-string token ... if
+ // that's somehow a problem we can always map HOLE
+ // to a dedicated byte (and escape it in the
+ // input).
+ result.addEpsilon(state, t.dest);
+ } else {
+ result.addTransition(state, t.dest, t.min, t.max);
+ }
+ }
+ }
+
+ result.finishState();
+
+ return result;
+ }
+
+ private static int[] topoSortStates(Automaton a) {
+ int[] states = new int[a.getNumStates()];
+ final Set<Integer> visited = new HashSet<>();
+ final LinkedList<Integer> worklist = new LinkedList<>();
+ worklist.add(0);
+ visited.add(0);
+ int upto = 0;
+ states[upto] = 0;
+ upto++;
+ Transition t = new Transition();
+ while (worklist.size() > 0) {
+ int s = worklist.removeFirst();
+ int count = a.initTransition(s, t);
+ for (int i = 0; i < count; i++) {
+ a.getNextTransition(t);
+ if (!visited.contains(t.dest)) {
+ visited.add(t.dest);
+ worklist.add(t.dest);
+ states[upto++] = t.dest;
+ }
+ }
+ }
+ return states;
+ }
+
+ public interface ByteTermAttribute extends TermToBytesRefAttribute {
+ // marker interface
+
+ /**
+ * Return the builder from which the term is derived.
+ */
+ public BytesRefBuilder builder();
+
+ public CharSequence toUTF16();
+ }
+
+ public static final class ByteTermAttributeImpl extends AttributeImpl implements ByteTermAttribute, TermToBytesRefAttribute {
+ private final BytesRefBuilder bytes = new BytesRefBuilder();
+ private CharsRefBuilder charsRef;
+
+ @Override
+ public void fillBytesRef() {
+ // does nothing - we change in place
+ }
+
+ @Override
+ public BytesRefBuilder builder() {
+ return bytes;
+ }
+
+ @Override
+ public BytesRef getBytesRef() {
+ return bytes.get();
+ }
+
+ @Override
+ public void clear() {
+ bytes.clear();
+ }
+
+ @Override
+ public void copyTo(AttributeImpl target) {
+ ByteTermAttributeImpl other = (ByteTermAttributeImpl) target;
+ other.bytes.copyBytes(bytes);
+ }
+
+ @Override
+ public CharSequence toUTF16() {
+ if (charsRef == null) {
+ charsRef = new CharsRefBuilder();
+ }
+ charsRef.copyUTF8Bytes(getBytesRef());
+ return charsRef.get();
+ }
+ }
+}
Added: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggester.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggester.java?rev=1669698&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggester.java (added)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggester.java Fri Mar 27 22:37:49 2015
@@ -0,0 +1,324 @@
+package org.apache.lucene.search.suggest.document;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.search.CollectionTerminatedException;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.suggest.analyzing.FSTUtil;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.ByteArrayDataOutput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRefBuilder;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.fst.ByteSequenceOutputs;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.PairOutputs;
+import org.apache.lucene.util.fst.PairOutputs.Pair;
+import org.apache.lucene.util.fst.PositiveIntOutputs;
+import org.apache.lucene.util.fst.Util;
+
+import static org.apache.lucene.search.suggest.document.NRTSuggester.PayLoadProcessor.parseDocID;
+import static org.apache.lucene.search.suggest.document.NRTSuggester.PayLoadProcessor.parseSurfaceForm;
+
+/**
+ * <p>
+ * NRTSuggester returns Top N completions with corresponding documents matching a provided automaton.
+ * The completions are returned in descending order of their corresponding weight.
+ * Deleted documents are filtered out in near real time using the provided reader.
+ * A {@link org.apache.lucene.search.DocIdSet} can be passed in at query time to filter out documents.
+ * </p>
+ * <p>
+ * See {@link #lookup(LeafReader, Automaton, int, DocIdSet, TopSuggestDocsCollector)} for more implementation
+ * details.
+ * <p>
+ * Builder: {@link NRTSuggesterBuilder}
+ * </p>
+ * <p>
+ * FST Format:
+ * <ul>
+ * <li>Input: analyzed forms of input terms</li>
+ * <li>Output: Pair<Long, BytesRef> containing weight, surface form and docID</li>
+ * </ul>
+ * <p>
+ * NOTE:
+ * <ul>
+ * <li>currently only {@link org.apache.lucene.search.DocIdSet} with random access capabilities are supported.</li>
+ * <li>having too many deletions or using a very restrictive filter can make the search inadmissible due to
+ * over-pruning of potential paths</li>
+ * <li>when a {@link org.apache.lucene.search.DocIdSet} is used, it is assumed that the filter will roughly
+ * filter out half the number of documents that match the provided automaton</li>
+ * <li>lookup performance will degrade as more accepted completions lead to filtered out documents</li>
+ * </ul>
+ *
+ */
+final class NRTSuggester implements Accountable {
+
+ /**
+ * FST<Weight,Surface>:
+ * input is the analyzed form, with a null byte between terms
+ * and a {@link NRTSuggesterBuilder#END_BYTE} to denote the
+ * end of the input
+ * weight is a long
+ * surface is the original, unanalyzed form followed by the docID
+ */
+ private final FST<Pair<Long, BytesRef>> fst;
+
+ /**
+ * Highest number of analyzed paths we saw for any single
+ * input surface form. This can be > 1, when index analyzer
+ * creates graphs or if multiple surface form(s) yields the
+ * same analyzed form
+ */
+ private final int maxAnalyzedPathsPerOutput;
+
+ /**
+ * Separator used between surface form and its docID in the FST output
+ */
+ private final int payloadSep;
+
+ /**
+ * Label used to denote the end of an input in the FST and
+ * the beginning of dedup bytes
+ */
+ private final int endByte;
+
+ /**
+ * Maximum queue depth for TopNSearcher
+ *
+ * NOTE: value should be <= Integer.MAX_VALUE
+ */
+ private static final long MAX_TOP_N_QUEUE_SIZE = 1000;
+
+ private NRTSuggester(FST<Pair<Long, BytesRef>> fst, int maxAnalyzedPathsPerOutput, int payloadSep, int endByte) {
+ this.fst = fst;
+ this.maxAnalyzedPathsPerOutput = maxAnalyzedPathsPerOutput;
+ this.payloadSep = payloadSep;
+ this.endByte = endByte;
+ }
+
+ @Override
+ public long ramBytesUsed() {
+ return fst == null ? 0 : fst.ramBytesUsed();
+ }
+
+ @Override
+ public Collection<Accountable> getChildResources() {
+ return Collections.emptyList();
+ }
+
+ private static Comparator<Pair<Long, BytesRef>> getComparator() {
+ return new Comparator<Pair<Long, BytesRef>>() {
+ @Override
+ public int compare(Pair<Long, BytesRef> o1, Pair<Long, BytesRef> o2) {
+ return Long.compare(o1.output1, o2.output1);
+ }
+ };
+ }
+
+ /**
+ * Collects at most Top <code>num</code> completions, filtered by <code>filter</code> on
+ * corresponding documents, which has a prefix accepted by <code>automaton</code>
+ * <p>
+ * Supports near real time deleted document filtering using <code>reader</code>
+ * <p>
+ * {@link TopSuggestDocsCollector#collect(int, CharSequence, long)} is called
+ * for every matched completion
+ * <p>
+ * Completion collection can be early terminated by throwing {@link org.apache.lucene.search.CollectionTerminatedException}
+ */
+ public void lookup(final LeafReader reader, final Automaton automaton, final int num, final DocIdSet filter, final TopSuggestDocsCollector collector) {
+ final Bits filterDocs;
+ try {
+ if (filter != null) {
+ if (filter.iterator() == null) {
+ return;
+ }
+ if (filter.bits() == null) {
+ throw new IllegalArgumentException("DocIDSet does not provide random access interface");
+ } else {
+ filterDocs = filter.bits();
+ }
+ } else {
+ filterDocs = null;
+ }
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+
+ int queueSize = getMaxTopNSearcherQueueSize(num, reader, filterDocs != null);
+ if (queueSize == -1) {
+ return;
+ }
+
+ final Bits liveDocs = reader.getLiveDocs();
+ try {
+ final List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(automaton, fst);
+ Util.TopNSearcher<Pair<Long, BytesRef>> searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, num, queueSize, getComparator()) {
+
+ private final CharsRefBuilder spare = new CharsRefBuilder();
+
+ @Override
+ protected boolean acceptResult(IntsRef input, Pair<Long, BytesRef> output) {
+ int payloadSepIndex = parseSurfaceForm(output.output2, payloadSep, spare);
+ int docID = parseDocID(output.output2, payloadSepIndex);
+
+ // filter out deleted docs only if no filter is set
+ if (filterDocs == null && liveDocs != null && !liveDocs.get(docID)) {
+ return false;
+ }
+
+ // filter by filter context
+ if (filterDocs != null && !filterDocs.get(docID)) {
+ return false;
+ }
+
+ try {
+ collector.collect(docID, spare.toCharsRef(), decode(output.output1));
+ return true;
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ };
+
+ // TODO: add fuzzy support
+ for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
+ searcher.addStartPaths(path.fstNode, path.output, false, path.input);
+ }
+
+ try {
+ // hits are also returned by search()
+ // we do not use it, instead collect at acceptResult
+ Util.TopResults<Pair<Long, BytesRef>> search = searcher.search();
+ // search admissibility is not guaranteed
+ // see comment on getMaxTopNSearcherQueueSize
+ // assert search.isComplete;
+ } catch (CollectionTerminatedException e) {
+ // terminate
+ }
+
+ } catch (IOException bogus) {
+ throw new RuntimeException(bogus);
+ }
+ }
+
+ /**
+ * Simple heuristics to try to avoid over-pruning potential suggestions by the
+ * TopNSearcher. Since suggestion entries can be rejected if they belong
+ * to a deleted document, the length of the TopNSearcher queue has to
+ * be increased by some factor, to account for the filtered out suggestions.
+ * This heuristic will try to make the searcher admissible, but the search
+ * can still lead to over-pruning
+ * <p>
+ * If a <code>filter</code> is applied, the queue size is increased by
+ * half the number of live documents.
+ * <p>
+ * The maximum queue size is {@link #MAX_TOP_N_QUEUE_SIZE}
+ */
+ private int getMaxTopNSearcherQueueSize(int num, LeafReader reader, boolean filterEnabled) {
+ double liveDocsRatio = calculateLiveDocRatio(reader.numDocs(), reader.maxDoc());
+ if (liveDocsRatio == -1) {
+ return -1;
+ }
+ long maxQueueSize = num * maxAnalyzedPathsPerOutput;
+ // liveDocRatio can be at most 1.0 (if no docs were deleted)
+ assert liveDocsRatio <= 1.0d;
+ maxQueueSize = (long) (maxQueueSize / liveDocsRatio);
+ if (filterEnabled) {
+ maxQueueSize = maxQueueSize + (reader.numDocs()/2);
+ }
+ return (int) Math.min(MAX_TOP_N_QUEUE_SIZE, maxQueueSize);
+ }
+
+ private static double calculateLiveDocRatio(int numDocs, int maxDocs) {
+ return (numDocs > 0) ? ((double) numDocs / maxDocs) : -1;
+ }
+
+ /**
+ * Loads a {@link NRTSuggester} from {@link org.apache.lucene.store.IndexInput}
+ */
+ public static NRTSuggester load(IndexInput input) throws IOException {
+ final FST<Pair<Long, BytesRef>> fst = new FST<>(input, new PairOutputs<>(
+ PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
+
+ /* read some meta info */
+ int maxAnalyzedPathsPerOutput = input.readVInt();
+ int endByte = input.readVInt();
+ int payloadSep = input.readVInt();
+
+ return new NRTSuggester(fst, maxAnalyzedPathsPerOutput, payloadSep, endByte);
+ }
+
+ static long encode(long input) {
+ if (input < 0) {
+ throw new UnsupportedOperationException("cannot encode value: " + input);
+ }
+ return Long.MAX_VALUE - input;
+ }
+
+ static long decode(long output) {
+ return (Long.MAX_VALUE - output);
+ }
+
+ /**
+ * Helper to encode/decode payload (surface + PAYLOAD_SEP + docID) output
+ */
+ static final class PayLoadProcessor {
+ final static private int MAX_DOC_ID_LEN_WITH_SEP = 6; // vint takes at most 5 bytes
+
+ static int parseSurfaceForm(final BytesRef output, int payloadSep, CharsRefBuilder spare) {
+ int surfaceFormLen = -1;
+ for (int i = 0; i < output.length; i++) {
+ if (output.bytes[output.offset + i] == payloadSep) {
+ surfaceFormLen = i;
+ break;
+ }
+ }
+ assert surfaceFormLen != -1 : "no payloadSep found, unable to determine surface form";
+ spare.copyUTF8Bytes(output.bytes, output.offset, surfaceFormLen);
+ return surfaceFormLen;
+ }
+
+ static int parseDocID(final BytesRef output, int payloadSepIndex) {
+ assert payloadSepIndex != -1 : "payload sep index can not be -1";
+ ByteArrayDataInput input = new ByteArrayDataInput(output.bytes, payloadSepIndex + output.offset + 1, output.length - (payloadSepIndex + output.offset));
+ return input.readVInt();
+ }
+
+ static BytesRef make(final BytesRef surface, int docID, int payloadSep) throws IOException {
+ int len = surface.length + MAX_DOC_ID_LEN_WITH_SEP;
+ byte[] buffer = new byte[len];
+ ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
+ output.writeBytes(surface.bytes, surface.length - surface.offset);
+ output.writeByte((byte) payloadSep);
+ output.writeVInt(docID);
+ return new BytesRef(buffer, 0, output.getPosition());
+ }
+ }
+}
Added: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java?rev=1669698&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java (added)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/NRTSuggesterBuilder.java Fri Mar 27 22:37:49 2015
@@ -0,0 +1,165 @@
+package org.apache.lucene.search.suggest.document;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.PriorityQueue;
+
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.IntsRefBuilder;
+import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.ByteSequenceOutputs;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.PairOutputs;
+import org.apache.lucene.util.fst.PositiveIntOutputs;
+import org.apache.lucene.util.fst.Util;
+
+import static org.apache.lucene.search.suggest.document.NRTSuggester.encode;
+
+/**
+ * Builder for {@link NRTSuggester}
+ *
+ */
+final class NRTSuggesterBuilder {
+
+ /**
+ * Label used to separate surface form and docID
+ * in the output
+ */
+ public static final int PAYLOAD_SEP = '\u001F';
+
+ /**
+ * Marks end of the analyzed input and start of dedup
+ * byte.
+ */
+ private static final int END_BYTE = 0x0;
+
+ private final PairOutputs<Long, BytesRef> outputs;
+ private final Builder<PairOutputs.Pair<Long, BytesRef>> builder;
+ private final IntsRefBuilder scratchInts = new IntsRefBuilder();
+ private final BytesRefBuilder analyzed = new BytesRefBuilder();
+ private final PriorityQueue<Entry> entries;
+ private final int payloadSep;
+ private final int endByte;
+
+ private int maxAnalyzedPathsPerOutput = 0;
+
+ /**
+ * Create a builder for {@link NRTSuggester}
+ */
+ public NRTSuggesterBuilder() {
+ this.payloadSep = PAYLOAD_SEP;
+ this.endByte = END_BYTE;
+ this.outputs = new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
+ this.entries = new PriorityQueue<>();
+ this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+ }
+
+ /**
+ * Initializes an FST input term to add entries against
+ */
+ public void startTerm(BytesRef analyzed) {
+ this.analyzed.copyBytes(analyzed);
+ this.analyzed.append((byte) endByte);
+ }
+
+ /**
+ * Adds an entry for the latest input term, should be called after
+ * {@link #startTerm(org.apache.lucene.util.BytesRef)} on the desired input
+ */
+ public void addEntry(int docID, BytesRef surfaceForm, long weight) throws IOException {
+ BytesRef payloadRef = NRTSuggester.PayLoadProcessor.make(surfaceForm, docID, payloadSep);
+ entries.add(new Entry(payloadRef, encode(weight)));
+ }
+
+ /**
+ * Writes all the entries for the FST input term
+ */
+ public void finishTerm() throws IOException {
+ int numArcs = 0;
+ int numDedupBytes = 1;
+ analyzed.grow(analyzed.length() + 1);
+ analyzed.setLength(analyzed.length() + 1);
+ for (Entry entry : entries) {
+ if (numArcs == maxNumArcsForDedupByte(numDedupBytes)) {
+ analyzed.setByteAt(analyzed.length() - 1, (byte) (numArcs));
+ analyzed.grow(analyzed.length() + 1);
+ analyzed.setLength(analyzed.length() + 1);
+ numArcs = 0;
+ numDedupBytes++;
+ }
+ analyzed.setByteAt(analyzed.length() - 1, (byte) numArcs++);
+ Util.toIntsRef(analyzed.get(), scratchInts);
+ builder.add(scratchInts.get(), outputs.newPair(entry.weight, entry.payload));
+ }
+ maxAnalyzedPathsPerOutput = Math.max(maxAnalyzedPathsPerOutput, entries.size());
+ entries.clear();
+ }
+
+ /**
+ * Builds and stores a FST that can be loaded with
+ * {@link NRTSuggester#load(org.apache.lucene.store.IndexInput)}
+ */
+ public boolean store(DataOutput output) throws IOException {
+ final FST<PairOutputs.Pair<Long, BytesRef>> build = builder.finish();
+ if (build == null) {
+ return false;
+ }
+ build.save(output);
+
+ /* write some more meta-info */
+ assert maxAnalyzedPathsPerOutput > 0;
+ output.writeVInt(maxAnalyzedPathsPerOutput);
+ output.writeVInt(END_BYTE);
+ output.writeVInt(PAYLOAD_SEP);
+ return true;
+ }
+
+ /**
+ * Num arcs for nth dedup byte:
+ * if n <= 5: 1 + (2 * n)
+ * else: (1 + (2 * n)) * n
+ * <p>
+ * TODO: is there a better way to make the fst built to be
+ * more TopNSearcher friendly?
+ */
+ private static int maxNumArcsForDedupByte(int currentNumDedupBytes) {
+ int maxArcs = 1 + (2 * currentNumDedupBytes);
+ if (currentNumDedupBytes > 5) {
+ maxArcs *= currentNumDedupBytes;
+ }
+ return Math.min(maxArcs, 255);
+ }
+
+ private final static class Entry implements Comparable<Entry> {
+ final BytesRef payload;
+ final long weight;
+
+ public Entry(BytesRef payload, long weight) {
+ this.payload = payload;
+ this.weight = weight;
+ }
+
+ @Override
+ public int compareTo(Entry o) {
+ return Long.compare(weight, o.weight);
+ }
+ }
+}
Added: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java?rev=1669698&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java (added)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestField.java Fri Mar 27 22:37:49 2015
@@ -0,0 +1,123 @@
+package org.apache.lucene.search.suggest.document;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.store.OutputStreamDataOutput;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * <p>
+ * Field that indexes a string value and a weight as a weighted completion
+ * against a named suggester.
+ * Field is tokenized, not stored and stores documents, frequencies and positions.
+ * Field can be used to provide near real time document suggestions.
+ * </p>
+ * <p>
+ * Besides the usual {@link org.apache.lucene.analysis.Analyzer}s,
+ * {@link CompletionAnalyzer}
+ * can be used to tune suggest field only parameters
+ * (e.g. preserving token seperators, preserving position increments
+ * when converting the token stream to an automaton)
+ * </p>
+ * <p>
+ * Example indexing usage:
+ * <pre class="prettyprint">
+ * document.add(new SuggestField(name, "suggestion", 4));
+ * </pre>
+ * To perform document suggestions based on the this field, use
+ * {@link SuggestIndexSearcher#suggest(String, CharSequence, int, org.apache.lucene.search.Filter)}
+ * <p>
+ * Example query usage:
+ * <pre class="prettyprint">
+ * SuggestIndexSearcher indexSearcher = ..
+ * indexSearcher.suggest(name, "su", 2)
+ * </pre>
+ *
+ * @lucene.experimental
+ */
+public class SuggestField extends Field {
+
+ private static final FieldType FIELD_TYPE = new FieldType();
+
+ static {
+ FIELD_TYPE.setTokenized(true);
+ FIELD_TYPE.setStored(false);
+ FIELD_TYPE.setStoreTermVectors(false);
+ FIELD_TYPE.setOmitNorms(false);
+ FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
+ FIELD_TYPE.freeze();
+ }
+
+ private final BytesRef surfaceForm;
+ private final long weight;
+
+ /**
+ * Creates a {@link SuggestField}
+ *
+ * @param name of the field
+ * @param value to get suggestions on
+ * @param weight weight of the suggestion
+ */
+ public SuggestField(String name, String value, long weight) {
+ super(name, value, FIELD_TYPE);
+ if (weight < 0l) {
+ throw new IllegalArgumentException("weight must be >= 0");
+ }
+ this.surfaceForm = new BytesRef(value);
+ this.weight = weight;
+ }
+
+ @Override
+ public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) throws IOException {
+ TokenStream stream = super.tokenStream(analyzer, reuse);
+ CompletionTokenStream completionStream;
+ if (stream instanceof CompletionTokenStream) {
+ completionStream = (CompletionTokenStream) stream;
+ } else {
+ completionStream = new CompletionTokenStream(stream);
+ }
+ BytesRef suggestPayload = buildSuggestPayload(surfaceForm, weight, (char) completionStream.sepLabel());
+ completionStream.setPayload(suggestPayload);
+ return completionStream;
+ }
+
+ private BytesRef buildSuggestPayload(BytesRef surfaceForm, long weight, char sepLabel) throws IOException {
+ for (int i = 0; i < surfaceForm.length; i++) {
+ if (surfaceForm.bytes[i] == sepLabel) {
+ assert sepLabel == '\u001f';
+ throw new IllegalArgumentException(
+ "surface form cannot contain unit separator character U+001F; this character is reserved");
+ }
+ }
+ ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
+ try (OutputStreamDataOutput output = new OutputStreamDataOutput(byteArrayOutputStream)) {
+ output.writeVInt(surfaceForm.length);
+ output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
+ output.writeVLong(weight + 1);
+ }
+ return new BytesRef(byteArrayOutputStream.toByteArray());
+ }
+}
Added: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestIndexSearcher.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestIndexSearcher.java?rev=1669698&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestIndexSearcher.java (added)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestIndexSearcher.java Fri Mar 27 22:37:49 2015
@@ -0,0 +1,150 @@
+package org.apache.lucene.search.suggest.document;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.util.automaton.Automaton;
+
+import static org.apache.lucene.search.suggest.document.CompletionFieldsProducer.CompletionTerms;
+
+/**
+ * Adds document suggest capabilities to IndexSearcher
+ *
+ * @lucene.experimental
+ */
+public class SuggestIndexSearcher extends IndexSearcher {
+
+ private final Analyzer queryAnalyzer;
+
+ /**
+ * Creates a searcher with document suggest capabilities
+ * for <code>reader</code>.
+ * <p>
+ * Suggestion <code>key</code> is analyzed with <code>queryAnalyzer</code>
+ */
+ public SuggestIndexSearcher(IndexReader reader, Analyzer queryAnalyzer) {
+ super(reader);
+ this.queryAnalyzer = queryAnalyzer;
+ }
+
+ /**
+ * Calls {@link #suggest(String, CharSequence, int, Filter)}
+ * with no document filter
+ */
+ public TopSuggestDocs suggest(String field, CharSequence key, int num) throws IOException {
+ return suggest(field, key, num, (Filter) null);
+ }
+
+ /**
+ * Calls {@link #suggest(String, CharSequence, int, Filter, TopSuggestDocsCollector)}
+ * with no document filter
+ */
+ public void suggest(String field, CharSequence key, int num, TopSuggestDocsCollector collector) throws IOException {
+ suggest(field, key, num, null, collector);
+ }
+
+ /**
+ * Suggests at most <code>num</code> documents filtered by <code>filter</code>
+ * that completes to <code>key</code> for a suggest <code>field</code>
+ * <p>
+ * Returns at most Top <code>num</code> document ids with corresponding completion and weight pair
+ *
+ * @throws java.lang.IllegalArgumentException if <code>filter</code> does not provide a random access
+ * interface or if <code>field</code> is not a {@link SuggestField}
+ */
+ public TopSuggestDocs suggest(String field, CharSequence key, int num, Filter filter) throws IOException {
+ TopSuggestDocsCollector collector = new TopSuggestDocsCollector(num);
+ suggest(field, key, num, filter, collector);
+ return collector.get();
+ }
+
+ /**
+ * Suggests at most <code>num</code> documents filtered by <code>filter</code>
+ * that completes to <code>key</code> for a suggest <code>field</code>
+ * <p>
+ * Collect completions with {@link TopSuggestDocsCollector}
+ * The completions are collected in order of the suggest <code>field</code> weight.
+ * There can be more than one collection of the same document, if the <code>key</code>
+ * matches multiple <code>field</code> values of the same document
+ *
+ * @throws java.lang.IllegalArgumentException if <code>filter</code> does not provide a random access
+ * interface or if <code>field</code> is not a {@link SuggestField}
+ */
+ public void suggest(String field, CharSequence key, int num, Filter filter, TopSuggestDocsCollector collector) throws IOException {
+ // verify input
+ if (field == null) {
+ throw new IllegalArgumentException("'field' can not be null");
+ }
+ if (num <= 0) {
+ throw new IllegalArgumentException("'num' should be > 0");
+ }
+ if (collector == null) {
+ throw new IllegalArgumentException("'collector' can not be null");
+ }
+
+ // build query automaton
+ CompletionAnalyzer analyzer;
+ if (queryAnalyzer instanceof CompletionAnalyzer) {
+ analyzer = (CompletionAnalyzer) queryAnalyzer;
+ } else {
+ analyzer = new CompletionAnalyzer(queryAnalyzer);
+ }
+ final Automaton automaton = analyzer.toAutomaton(field, key);
+
+ // collect results
+ for (LeafReaderContext context : getIndexReader().leaves()) {
+ TopSuggestDocsCollector leafCollector = (TopSuggestDocsCollector) collector.getLeafCollector(context);
+ LeafReader reader = context.reader();
+ Terms terms = reader.terms(field);
+ if (terms == null) {
+ continue;
+ }
+ NRTSuggester suggester;
+ if (terms instanceof CompletionTerms) {
+ CompletionTerms completionTerms = (CompletionTerms) terms;
+ suggester = completionTerms.suggester();
+ } else {
+ throw new IllegalArgumentException(field + " is not a SuggestField");
+ }
+ if (suggester == null) {
+ // a segment can have a null suggester
+ // i.e. no FST was built
+ continue;
+ }
+
+ DocIdSet docIdSet = null;
+ if (filter != null) {
+ docIdSet = filter.getDocIdSet(context, reader.getLiveDocs());
+ if (docIdSet == null) {
+ // filter matches no docs in current leave
+ continue;
+ }
+ }
+ suggester.lookup(reader, automaton, num, docIdSet, leafCollector);
+ }
+ }
+}
Added: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestScoreDocPriorityQueue.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestScoreDocPriorityQueue.java?rev=1669698&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestScoreDocPriorityQueue.java (added)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/SuggestScoreDocPriorityQueue.java Fri Mar 27 22:37:49 2015
@@ -0,0 +1,56 @@
+package org.apache.lucene.search.suggest.document;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.suggest.document.TopSuggestDocs.SuggestScoreDoc;
+import org.apache.lucene.util.PriorityQueue;
+
+/**
+ * Bounded priority queue for {@link SuggestScoreDoc}s.
+ * Priority is based on {@link SuggestScoreDoc#score} and tie
+ * is broken by {@link SuggestScoreDoc#doc}
+ */
+final class SuggestScoreDocPriorityQueue extends PriorityQueue<SuggestScoreDoc> {
+ /**
+ * Creates a new priority queue of the specified size.
+ */
+ public SuggestScoreDocPriorityQueue(int size) {
+ super(size);
+ }
+
+ @Override
+ protected boolean lessThan(SuggestScoreDoc a, SuggestScoreDoc b) {
+ if (a.score == b.score) {
+ // prefer smaller doc id, in case of a tie
+ return a.doc > b.doc;
+ }
+ return a.score < b.score;
+ }
+
+ /**
+ * Returns the top N results in descending order.
+ */
+ public SuggestScoreDoc[] getResults() {
+ int size = size();
+ SuggestScoreDoc[] res = new SuggestScoreDoc[size];
+ for (int i = size - 1; i >= 0; i--) {
+ res[i] = pop();
+ }
+ return res;
+ }
+}
Added: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/TopSuggestDocs.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/TopSuggestDocs.java?rev=1669698&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/TopSuggestDocs.java (added)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/TopSuggestDocs.java Fri Mar 27 22:37:49 2015
@@ -0,0 +1,111 @@
+package org.apache.lucene.search.suggest.document;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.suggest.Lookup;
+
+/**
+ * {@link org.apache.lucene.search.TopDocs} wrapper with
+ * an additional CharSequence key per {@link org.apache.lucene.search.ScoreDoc}
+ *
+ * @lucene.experimental
+ */
+public class TopSuggestDocs extends TopDocs {
+
+ /**
+ * Singleton for empty {@link TopSuggestDocs}
+ */
+ public final static TopSuggestDocs EMPTY = new TopSuggestDocs(0, new SuggestScoreDoc[0], 0);
+
+ /**
+ * {@link org.apache.lucene.search.ScoreDoc} with an
+ * additional CharSequence key
+ */
+ public static class SuggestScoreDoc extends ScoreDoc implements Comparable<SuggestScoreDoc> {
+
+ /**
+ * Matched completion key
+ */
+ public CharSequence key;
+
+ /**
+ * Creates a SuggestScoreDoc instance
+ *
+ * @param doc document id (hit)
+ * @param key matched completion
+ * @param score weight of the matched completion
+ */
+ public SuggestScoreDoc(int doc, CharSequence key, long score) {
+ // loss of precision but not magnitude
+ // implicit conversion from long -> float
+ super(doc, score);
+ this.key = key;
+ }
+
+ @Override
+ public int compareTo(SuggestScoreDoc o) {
+ return Lookup.CHARSEQUENCE_COMPARATOR.compare(key, o.key);
+ }
+ }
+
+ /**
+ * {@link org.apache.lucene.search.TopDocs} wrapper with
+ * {@link TopSuggestDocs.SuggestScoreDoc}
+ * instead of {@link org.apache.lucene.search.ScoreDoc}
+ */
+ public TopSuggestDocs(int totalHits, SuggestScoreDoc[] scoreDocs, float maxScore) {
+ super(totalHits, scoreDocs, maxScore);
+ }
+
+ /**
+ * Returns {@link TopSuggestDocs.SuggestScoreDoc}s
+ * for this instance
+ */
+ public SuggestScoreDoc[] scoreLookupDocs() {
+ return (SuggestScoreDoc[]) scoreDocs;
+ }
+
+ /**
+ * Returns a new TopSuggestDocs, containing topN results across
+ * the provided TopSuggestDocs, sorting by score. Each {@link TopSuggestDocs}
+ * instance must be sorted.
+ * Analogous to {@link org.apache.lucene.search.TopDocs#merge(int, org.apache.lucene.search.TopDocs[])}
+ * for {@link TopSuggestDocs}
+ *
+ * NOTE: assumes every <code>shardHit</code> is already sorted by score
+ */
+ public static TopSuggestDocs merge(int topN, TopSuggestDocs[] shardHits) {
+ SuggestScoreDocPriorityQueue priorityQueue = new SuggestScoreDocPriorityQueue(topN);
+ for (TopSuggestDocs shardHit : shardHits) {
+ for (SuggestScoreDoc scoreDoc : shardHit.scoreLookupDocs()) {
+ if (scoreDoc == priorityQueue.insertWithOverflow(scoreDoc)) {
+ break;
+ }
+ }
+ }
+ SuggestScoreDoc[] topNResults = priorityQueue.getResults();
+ if (topNResults.length > 0) {
+ return new TopSuggestDocs(topNResults.length, topNResults, topNResults[0].score);
+ } else {
+ return TopSuggestDocs.EMPTY;
+ }
+ }
+
+}