You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2013/10/08 13:25:25 UTC
svn commit: r1530229 - in /lucene/dev/trunk/lucene: CHANGES.txt
suggest/src/java/org/apache/lucene/search/suggest/DocumentDictionary.java
suggest/src/test/org/apache/lucene/search/suggest/DocumentDictionaryTest.java
Author: mikemccand
Date: Tue Oct 8 11:25:24 2013
New Revision: 1530229
URL: http://svn.apache.org/r1530229
Log:
LUCENE-5251: add DocumentDictionary
Added:
lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentDictionary.java (with props)
lucene/dev/trunk/lucene/suggest/src/test/org/apache/lucene/search/suggest/DocumentDictionaryTest.java (with props)
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1530229&r1=1530228&r2=1530229&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Oct 8 11:25:24 2013
@@ -89,6 +89,10 @@ New Features
tail" suggestions, when a primary suggester fails to find a
suggestion. (Mike McCandless)
+* LUCENE-5251: New DocumentDictionary allows building suggesters from
+ the contents of existing stored fields in an index: a term field, a
+ weight field and, optionally, a payload field (Areek Zillur via Mike McCandless)
+
Bug Fixes
* LUCENE-4998: Fixed a few places to pass IOContext.READONCE instead
Added: lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentDictionary.java?rev=1530229&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentDictionary.java (added)
+++ lucene/dev/trunk/lucene/suggest/src/java/org/apache/lucene/search/suggest/DocumentDictionary.java Tue Oct 8 11:25:24 2013
@@ -0,0 +1,174 @@
+package org.apache.lucene.search.suggest;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.StorableField;
+import org.apache.lucene.index.StoredDocument;
+import org.apache.lucene.search.spell.Dictionary;
+import org.apache.lucene.search.spell.TermFreqPayloadIterator;
+import org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester; // javadoc
+import org.apache.lucene.search.suggest.fst.FSTCompletionLookup; // javadoc
+import org.apache.lucene.search.suggest.fst.WFSTCompletionLookup; // javadoc
+import org.apache.lucene.search.suggest.jaspell.JaspellLookup; // javadoc
+import org.apache.lucene.search.suggest.tst.TSTLookup; // javadoc
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefIterator;
+
+/**
+ * Dictionary with terms, weights and optionally payload information
+ * taken from stored fields in a Lucene index.
+ *
+ * <b>NOTE: </b>
+ * <ul>
+ * <li>
+ * The term, weight and (optionally) payload fields supplied
+ * are required for ALL documents and have to be stored
+ * </li>
+ * <li>
+ * This Dictionary implementation is not compatible with the following Suggesters:
+ * {@link JaspellLookup}, {@link TSTLookup}, {@link FSTCompletionLookup},
+ * {@link WFSTCompletionLookup} and {@link AnalyzingInfixSuggester}.
+ * See https://issues.apache.org/jira/browse/LUCENE-5260
+ * </li>
+ * </ul>
+ */
+public class DocumentDictionary implements Dictionary {
+
+ private final IndexReader reader;
+ private final String field;
+ private final String weightField;
+ private final String payloadField; // null when the dictionary was created without a payload field
+
+ /**
+ * Creates a new dictionary that takes its terms from the field named <code>field</code>
+ * and the weights of those terms from the field named <code>weightField</code>.
+ * No payload field is used: <code>payloadField</code> is left <code>null</code>.
+ */
+ public DocumentDictionary(IndexReader reader, String field, String weightField) {
+ this.reader = reader;
+ this.field = field;
+ this.weightField = weightField;
+ this.payloadField = null;
+ }
+
+ /**
+ * Creates a new dictionary with the contents of the fields named <code>field</code>
+ * for the terms, <code>weightField</code> for the weights that will be used for
+ * the corresponding terms and <code>payloadField</code> for the corresponding payloads
+ * for the entry.
+ */
+ public DocumentDictionary(IndexReader reader, String field, String weightField, String payloadField) {
+ this.reader = reader;
+ this.field = field;
+ this.weightField = weightField;
+ this.payloadField = payloadField;
+ }
+
+ @Override
+ public BytesRefIterator getWordsIterator() throws IOException {
+ return new TermWeightPayloadIterator(payloadField!=null); // payloads only when a payload field was given
+ }
+
+ final class TermWeightPayloadIterator implements TermFreqPayloadIterator { // inner (non-static): uses the outer reader and field names
+ private final int docCount; // reader.maxDoc() - 1, i.e. the upper bound for currentDocId rather than a count
+ private final Set<String> relevantFields; // field names handed to reader.document(...) in next()
+ private final boolean withPayload;
+ private final Bits liveDocs; // may be null; next() then skips no documents
+ private int currentDocId = -1; // doc id last examined by next(); -1 before the first call
+ private long currentWeight; // weight of the document most recently processed by next()
+ private BytesRef currentPayload; // stays null when withPayload is false
+
+ /**
+ * Creates an iterator over term, weight and payload fields from the lucene
+ * index. Setting <code>withPayload</code> to false yields an iterator
+ * over only term and weight.
+ */
+ public TermWeightPayloadIterator(boolean withPayload) throws IOException {
+ docCount = reader.maxDoc() - 1;
+ this.withPayload = withPayload;
+ currentPayload = null;
+ liveDocs = MultiFields.getLiveDocs(reader);
+ List<String> relevantFieldList;
+ if(withPayload) {
+ relevantFieldList = Arrays.asList(field, weightField, payloadField);
+ } else {
+ relevantFieldList = Arrays.asList(field, weightField);
+ }
+ this.relevantFields = new HashSet<>(relevantFieldList);
+ }
+
+ @Override
+ public long weight() {
+ return currentWeight;
+ }
+
+ @Override
+ public BytesRef next() throws IOException { // throws IllegalArgumentException when a visited doc lacks a required field or value type
+ while (currentDocId < docCount) {
+ currentDocId++;
+ if (liveDocs != null && !liveDocs.get(currentDocId)) {
+ continue; // liveDocs reports this doc id as not live; move on
+ }
+
+ StoredDocument doc = reader.document(currentDocId, relevantFields);
+
+ if (withPayload) {
+ StorableField payload = doc.getField(payloadField);
+ if (payload == null) {
+ throw new IllegalArgumentException(payloadField + " does not exist");
+ } else if (payload.binaryValue() == null) {
+ throw new IllegalArgumentException(payloadField + " does not have binary value");
+ }
+ currentPayload = payload.binaryValue();
+ }
+
+ StorableField weight = doc.getField(weightField);
+ if (weight == null) {
+ throw new IllegalArgumentException(weightField + " does not exist");
+ } else if (weight.numericValue() == null) {
+ throw new IllegalArgumentException(weightField + " does not have numeric value");
+ }
+ currentWeight = weight.numericValue().longValue(); // longValue() drops any fractional part of the stored number
+
+ StorableField fieldVal = doc.getField(field);
+ if (fieldVal == null) {
+ throw new IllegalArgumentException(field + " does not exist");
+ } else if(fieldVal.stringValue() == null) {
+ throw new IllegalArgumentException(field + " does not have string value");
+ }
+
+ return new BytesRef(fieldVal.stringValue()); // weight() and payload() now describe this same document
+ }
+ return null; // every doc id up to docCount has been visited
+ }
+
+ @Override
+ public BytesRef payload() {
+ return currentPayload;
+ }
+
+ }
+}
Added: lucene/dev/trunk/lucene/suggest/src/test/org/apache/lucene/search/suggest/DocumentDictionaryTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/suggest/src/test/org/apache/lucene/search/suggest/DocumentDictionaryTest.java?rev=1530229&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/suggest/src/test/org/apache/lucene/search/suggest/DocumentDictionaryTest.java (added)
+++ lucene/dev/trunk/lucene/suggest/src/test/org/apache/lucene/search/suggest/DocumentDictionaryTest.java Tue Oct 8 11:25:24 2013
@@ -0,0 +1,168 @@
+package org.apache.lucene.search.suggest;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StoredField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.spell.Dictionary;
+import org.apache.lucene.search.spell.TermFreqPayloadIterator;
+import org.apache.lucene.search.suggest.DocumentDictionary;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Test;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class DocumentDictionaryTest extends LuceneTestCase { // covers DocumentDictionary iteration: with payload, without payload, with deletions
+
+ static final String FIELD_NAME = "f1";
+ static final String WEIGHT_FIELD_NAME = "w1";
+ static final String PAYLOAD_FIELD_NAME = "p1";
+
+ private Map<String, Document> generateIndexDocuments(int ndocs) { // returns ndocs documents keyed by their FIELD_NAME value
+ Map<String, Document> docs = new HashMap<>();
+ for(int i = 0; i < ndocs ; i++) {
+ Field field = new TextField(FIELD_NAME, "field_" + i, Field.Store.YES);
+ Field payload = new StoredField(PAYLOAD_FIELD_NAME, new BytesRef("payload_" + i));
+ Field weight = new StoredField(WEIGHT_FIELD_NAME, 100d + i); // stored as a double; the tests compare its longValue()
+ Document doc = new Document();
+ doc.add(field);
+ doc.add(payload);
+ doc.add(weight);
+ docs.put(field.stringValue(), doc);
+ }
+ return docs;
+ }
+
+ @Test
+ public void testBasic() throws IOException { // term, weight and payload are all read back
+ Directory dir = newDirectory();
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+ iwc.setMergePolicy(newLogMergePolicy());
+ RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
+ Map<String, Document> docs = generateIndexDocuments(10);
+ for(Document doc: docs.values()) {
+ writer.addDocument(doc);
+ }
+ writer.commit();
+ writer.close();
+ IndexReader ir = DirectoryReader.open(dir);
+ Dictionary dictionary = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME, PAYLOAD_FIELD_NAME);
+ TermFreqPayloadIterator tfp = (TermFreqPayloadIterator) dictionary.getWordsIterator();
+ BytesRef f;
+ while((f = tfp.next())!=null) {
+ Document doc = docs.remove(f.utf8ToString()); // removed so that the isEmpty() check below proves every document was seen
+ assertTrue(f.equals(new BytesRef(doc.get(FIELD_NAME))));
+ assertEquals(tfp.weight(), doc.getField(WEIGHT_FIELD_NAME).numericValue().longValue()); // NOTE(review): arguments appear to be in (actual, expected) order - confirm
+ assertTrue(tfp.payload().equals(doc.getField(PAYLOAD_FIELD_NAME).binaryValue()));
+ }
+ assertTrue(docs.isEmpty());
+ ir.close();
+ dir.close();
+ }
+
+ @Test
+ public void testWithoutPayload() throws IOException { // three-argument constructor: payload() is expected to be null
+ Directory dir = newDirectory();
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+ iwc.setMergePolicy(newLogMergePolicy());
+ RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
+ Map<String, Document> docs = generateIndexDocuments(10);
+ for(Document doc: docs.values()) {
+ writer.addDocument(doc);
+ }
+ writer.commit();
+ writer.close();
+ IndexReader ir = DirectoryReader.open(dir);
+ Dictionary dictionary = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME);
+ TermFreqPayloadIterator tfp = (TermFreqPayloadIterator) dictionary.getWordsIterator();
+ BytesRef f;
+ while((f = tfp.next())!=null) {
+ Document doc = docs.remove(f.utf8ToString());
+ assertTrue(f.equals(new BytesRef(doc.get(FIELD_NAME))));
+ assertEquals(tfp.weight(), doc.getField(WEIGHT_FIELD_NAME).numericValue().longValue());
+ assertEquals(tfp.payload(), null);
+ }
+ assertTrue(docs.isEmpty());
+ ir.close();
+ dir.close();
+ }
+
+ @Test
+ public void testWithDeletions() throws IOException { // randomly chosen documents are deleted and must not be returned
+ Directory dir = newDirectory();
+ IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+ iwc.setMergePolicy(newLogMergePolicy());
+ RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
+ Map<String, Document> docs = generateIndexDocuments(10);
+ Random rand = random();
+ List<String> termsToDel = new ArrayList<>();
+ for(Document doc : docs.values()) {
+ if(rand.nextBoolean()) {
+ termsToDel.add(doc.get(FIELD_NAME)); // remembered here, deleted after the first commit
+ }
+ writer.addDocument(doc);
+ }
+ writer.commit();
+
+ Term[] delTerms = new Term[termsToDel.size()];
+ for(int i=0; i < termsToDel.size() ; i++) {
+ delTerms[i] = new Term(FIELD_NAME, termsToDel.get(i));
+ }
+
+ for(Term delTerm: delTerms) {
+ writer.deleteDocuments(delTerm);
+ }
+ writer.commit();
+ writer.close();
+
+ for(String termToDel: termsToDel) {
+ assertTrue(null!=docs.remove(termToDel)); // drop deleted entries from the expectation map
+ }
+
+ IndexReader ir = DirectoryReader.open(dir);
+ assertEquals(ir.numDocs(), docs.size()); // numDocs() must equal the number of documents still expected
+ Dictionary dictionary = new DocumentDictionary(ir, FIELD_NAME, WEIGHT_FIELD_NAME);
+ TermFreqPayloadIterator tfp = (TermFreqPayloadIterator) dictionary.getWordsIterator();
+ BytesRef f;
+ while((f = tfp.next())!=null) {
+ Document doc = docs.remove(f.utf8ToString());
+ assertTrue(f.equals(new BytesRef(doc.get(FIELD_NAME))));
+ assertEquals(tfp.weight(), doc.getField(WEIGHT_FIELD_NAME).numericValue().longValue());
+ assertEquals(tfp.payload(), null);
+ }
+ assertTrue(docs.isEmpty());
+ ir.close();
+ dir.close();
+ }
+}