You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by ka...@apache.org on 2008/06/28 19:23:36 UTC
svn commit: r672556 - in /lucene/java/trunk/contrib/instantiated: ./
src/java/org/apache/lucene/store/instantiated/
src/test/org/apache/lucene/store/instantiated/
Author: kalle
Date: Sat Jun 28 10:23:35 2008
New Revision: 672556
URL: http://svn.apache.org/viewvc?rev=672556&view=rev
Log:
LUCENE-1312: Added full support for InstantiatedIndexReader#getFieldNames() and extended the test case to assert that deleted documents behave as they should (they did).
Added:
lucene/java/trunk/contrib/instantiated/CHANGES.txt
lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/FieldSetting.java
lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/FieldSettings.java
Modified:
lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java
lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java
lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java
lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocs.java
lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermEnum.java
lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/package.html
lucene/java/trunk/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java
Added: lucene/java/trunk/contrib/instantiated/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/instantiated/CHANGES.txt?rev=672556&view=auto
==============================================================================
--- lucene/java/trunk/contrib/instantiated/CHANGES.txt (added)
+++ lucene/java/trunk/contrib/instantiated/CHANGES.txt Sat Jun 28 10:23:35 2008
@@ -0,0 +1,33 @@
+Lucene InstantiatedIndex contrib module change Log
+
+======================= Trunk (not yet released) =======================
+
+Changes in runtime behavior
+
+ (None)
+
+API Changes
+
+ (None)
+
+Bug fixes
+
+ 1. LUCENE-1312: Added full support for InstantiatedIndexReader#getFieldNames()
+ and tests that assert that deleted documents behave as they should (they did).
+ (Jason Rutherglen, Karl Wettin)
+
+New features
+
+ (None)
+
+Documentation
+
+ (None)
+
+Build
+
+ (None)
+
+Test Cases
+
+ (None)
\ No newline at end of file
Added: lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/FieldSetting.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/FieldSetting.java?rev=672556&view=auto
==============================================================================
--- lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/FieldSetting.java (added)
+++ lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/FieldSetting.java Sat Jun 28 10:23:35 2008
@@ -0,0 +1,61 @@
+package org.apache.lucene.store.instantiated;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * For non package access see {@link org.apache.lucene.index.IndexReader#getFieldNames(org.apache.lucene.index.IndexReader.FieldOption)}
+ */
+class FieldSetting {
+ String fieldName;
+
+ boolean storeTermVector = false;
+ boolean storeOffsetWithTermVector = false;
+ boolean storePositionWithTermVector = false;
+ boolean storePayloads = false;
+
+ boolean stored = false;
+ boolean indexed = false;
+ boolean tokenized = false;
+ boolean compressed = false;
+
+ FieldSetting() {
+ }
+
+
+ FieldSetting(String fieldName) {
+ this.fieldName = fieldName;
+ }
+
+ public boolean equals(Object o) {
+ if (this == o)
+ return true;
+ if (o == null || getClass() != o.getClass())
+ return false;
+
+ final FieldSetting that = (FieldSetting) o;
+
+ return fieldName.equals(that.fieldName);
+
+ }
+
+ public int hashCode() {
+ return fieldName.hashCode();
+ }
+
+
+}
Added: lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/FieldSettings.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/FieldSettings.java?rev=672556&view=auto
==============================================================================
--- lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/FieldSettings.java (added)
+++ lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/FieldSettings.java Sat Jun 28 10:23:35 2008
@@ -0,0 +1,95 @@
+package org.apache.lucene.store.instantiated;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Collection;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Essentially a Map<FieldName, {@link org.apache.lucene.store.instantiated.FieldSetting}>
+ */
+class FieldSettings {
+
+
+ FieldSettings() {
+ }
+
+ private Map</** field name */String, FieldSetting> fieldSettings = new HashMap<String, FieldSetting>();
+
+ synchronized FieldSetting merge(FieldSetting fieldSetting) {
+ FieldSetting setting = fieldSettings.get(fieldSetting.fieldName);
+
+ if (setting == null) {
+ setting = new FieldSetting(fieldSetting.fieldName);
+ fieldSettings.put(fieldSetting.fieldName, setting);
+ }
+
+ if (fieldSetting.stored) {
+ setting.stored = true;
+ }
+ if (fieldSetting.compressed) {
+ setting.compressed = true;
+ }
+
+ if ("b3".equals(fieldSetting.fieldName)) {
+ System.currentTimeMillis();
+ }
+ if (fieldSetting.indexed) {
+ setting.indexed = true;
+ }
+ if (fieldSetting.tokenized) {
+ setting.tokenized = true;
+ }
+
+ if (fieldSetting.storeTermVector) {
+ setting.storeTermVector = true;
+ }
+ if (fieldSetting.storeOffsetWithTermVector) {
+ setting.storeOffsetWithTermVector = true;
+ }
+ if (fieldSetting.storePositionWithTermVector) {
+ setting.storePositionWithTermVector = true;
+ }
+
+ if (fieldSetting.storePayloads) {
+ setting.storePayloads = true;
+ }
+
+ return setting;
+
+ }
+
+ FieldSetting get(String name) {
+ return fieldSettings.get(name);
+ }
+
+ FieldSetting get(String name, boolean create) {
+ FieldSetting fieldSetting = fieldSettings.get(name);
+ if (create && fieldSetting == null) {
+ fieldSetting = new FieldSetting(name);
+ fieldSettings.put(name, fieldSetting);
+ }
+ return fieldSetting;
+ }
+
+ Collection<FieldSetting> values() {
+ return fieldSettings.values();
+ }
+
+}
Modified: lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java?rev=672556&r1=672555&r2=672556&view=diff
==============================================================================
--- lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java (original)
+++ lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndex.java Sat Jun 28 10:23:35 2008
@@ -16,14 +16,24 @@
* limitations under the License.
*/
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
-import org.apache.lucene.index.*;
-
-import java.io.IOException;
-import java.io.Serializable;
-import java.util.*;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.TermPositionVector;
+import org.apache.lucene.index.TermPositions;
/**
* Represented as a coupled graph of class instances, this
@@ -49,7 +59,8 @@
private long version = System.currentTimeMillis();
private InstantiatedDocument[] documentsByNumber;
- /** todo: this should be a BitSet */
+
+ /** todo: should this be a BitSet? */
private Set<Integer> deletedDocuments;
private Map<String, Map<String, InstantiatedTerm>> termsByFieldAndText;
@@ -57,6 +68,7 @@
private Map<String, byte[]> normsByFieldNameAndDocumentNumber;
+ private FieldSettings fieldSettings;
/**
* Creates an empty instantiated index for you to fill with data using an {@link org.apache.lucene.store.instantiated.InstantiatedIndexWriter}.
@@ -68,12 +80,14 @@
void initialize() {
// todo: clear index without loosing memory (uncouple stuff)
termsByFieldAndText = new HashMap<String, Map<String, InstantiatedTerm>>();
+ fieldSettings = new FieldSettings();
orderedTerms = new InstantiatedTerm[0];
documentsByNumber = new InstantiatedDocument[0];
normsByFieldNameAndDocumentNumber = new HashMap<String, byte[]>();
deletedDocuments = new HashSet<Integer>();
}
+
/**
* Creates a new instantiated index that looks just like the index in a specific state as represented by a reader.
*
@@ -83,7 +97,9 @@
public InstantiatedIndex(IndexReader sourceIndexReader) throws IOException {
this(sourceIndexReader, null);
}
+
+
/**
* Creates a new instantiated index that looks just like the index in a specific state as represented by a reader.
*
@@ -97,10 +113,63 @@
throw new IOException("Source index is not optimized.");
}
- Collection<String> allFieldNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.ALL);
initialize();
+ Collection<String> allFieldNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.ALL);
+
+ // load field options
+
+ Collection<String> indexedNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.INDEXED);
+ for (String name : indexedNames) {
+ FieldSetting setting = fieldSettings.get(name, true);
+ setting.indexed = true;
+ }
+ Collection<String> indexedNoVecNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.INDEXED_NO_TERMVECTOR);
+ for (String name : indexedNoVecNames) {
+ FieldSetting setting = fieldSettings.get(name, true);
+ setting.storeTermVector = false;
+ setting.indexed = true;
+ }
+ Collection<String> indexedVecNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.INDEXED_WITH_TERMVECTOR);
+ for (String name : indexedVecNames) {
+ FieldSetting setting = fieldSettings.get(name, true);
+ setting.storeTermVector = true;
+ setting.indexed = true;
+ }
+ Collection<String> payloadNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.STORES_PAYLOADS);
+ for (String name : payloadNames) {
+ FieldSetting setting = fieldSettings.get(name, true);
+ setting.storePayloads = true;
+ }
+ Collection<String> termVecNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR);
+ for (String name : termVecNames) {
+ FieldSetting setting = fieldSettings.get(name, true);
+ setting.storeTermVector = true;
+ }
+ Collection<String> termVecOffsetNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET);
+ for (String name : termVecOffsetNames) {
+ FieldSetting setting = fieldSettings.get(name, true);
+ setting.storeOffsetWithTermVector = true;
+ }
+ Collection<String> termVecPosNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION);
+ for (String name : termVecPosNames) {
+ FieldSetting setting = fieldSettings.get(name, true);
+ setting.storePositionWithTermVector = true;
+ }
+ Collection<String> termVecPosOffNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET);
+ for (String name : termVecPosOffNames) {
+ FieldSetting setting = fieldSettings.get(name, true);
+ setting.storeOffsetWithTermVector = true;
+ setting.storePositionWithTermVector = true;
+ }
+ Collection<String> unindexedNames = sourceIndexReader.getFieldNames(IndexReader.FieldOption.UNINDEXED);
+ for (String name : unindexedNames) {
+ FieldSetting setting = fieldSettings.get(name, true);
+ setting.indexed = false;
+ }
+
+
documentsByNumber = new InstantiatedDocument[sourceIndexReader.numDocs()];
// create documents
@@ -129,6 +198,8 @@
}
}
+
+
// create norms
for (String fieldName : allFieldNames) {
if (fields == null || fields.contains(fieldName)) {
@@ -271,4 +342,9 @@
void setVersion(long version) {
this.version = version;
}
+
+
+ FieldSettings getFieldSettings() {
+ return fieldSettings;
+ }
}
Modified: lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java?rev=672556&r1=672555&r2=672556&view=diff
==============================================================================
--- lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java (original)
+++ lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java Sat Jun 28 10:23:35 2008
@@ -16,22 +16,37 @@
* limitations under the License.
*/
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldSelector;
-import org.apache.lucene.index.*;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.TermFreqVector;
+import org.apache.lucene.index.TermPositions;
+import org.apache.lucene.index.TermVectorMapper;
import org.apache.lucene.store.Directory;
-import java.io.IOException;
-import java.util.*;
-
/**
- * An InstantiatedIndexReader is not a snapshot in time,
- * it is completely in sync with the latest commit to the store!
- *
+ * An InstantiatedIndexReader is not a snapshot in time, it is completely in
+ * sync with the latest commit to the store!
+ *
* Consider using InstantiatedIndex as if it was immutable.
*/
-public class InstantiatedIndexReader
- extends IndexReader {
+public class InstantiatedIndexReader extends IndexReader {
private final InstantiatedIndex index;
@@ -47,36 +62,32 @@
return true;
}
-
/**
- * An InstantiatedIndexReader is not a snapshot in time,
- * it is completely in sync with the latest commit to the store!
- *
+ * An InstantiatedIndexReader is not a snapshot in time, it is completely in
+ * sync with the latest commit to the store!
+ *
* @return output from {@link InstantiatedIndex#getVersion()} in associated instantiated index.
*/
public long getVersion() {
return index.getVersion();
}
-
public Directory directory() {
throw new UnsupportedOperationException();
}
-
/**
* An InstantiatedIndexReader is always current!
- *
- * Check whether this IndexReader is still using the
- * current (i.e., most recently committed) version of the
- * index. If a writer has committed any changes to the
- * index since this reader was opened, this will return
- * <code>false</code>, in which case you must open a new
- * IndexReader in order to see the changes. See the
- * description of the <a href="IndexWriter.html#autoCommit"><code>autoCommit</code></a>
- * flag which controls when the {@link IndexWriter}
- * actually commits changes to the index.
- *
+ *
+ * Check whether this IndexReader is still using the current (i.e., most
+ * recently committed) version of the index. If a writer has committed any
+ * changes to the index since this reader was opened, this will return
+ * <code>false</code>, in which case you must open a new IndexReader in
+ * order to see the changes. See the description of the <a
+ * href="IndexWriter.html#autoCommit"><code>autoCommit</code></a> flag
+ * which controls when the {@link IndexWriter} actually commits changes to the
+ * index.
+ *
* @return always true
* @throws CorruptIndexException if the index is corrupt
* @throws IOException if there is a low-level IO error
@@ -92,7 +103,7 @@
private Set<InstantiatedDocument> deletedDocuments = new HashSet<InstantiatedDocument>();
private Set<Integer> deletedDocumentNumbers = new HashSet<Integer>();
- private Map<String, List<NormUpdate>> updatedNormsByFieldNameAndDocumentNumber = null;
+ private Map<String,List<NormUpdate>> updatedNormsByFieldNameAndDocumentNumber = null;
private class NormUpdate {
private int doc;
@@ -140,7 +151,7 @@
// 1. update norms
if (updatedNormsByFieldNameAndDocumentNumber != null) {
- for (Map.Entry<String, List<NormUpdate>> e : updatedNormsByFieldNameAndDocumentNumber.entrySet()) {
+ for (Map.Entry<String,List<NormUpdate>> e : updatedNormsByFieldNameAndDocumentNumber.entrySet()) {
byte[] norms = getIndex().getNormsByFieldNameAndDocumentNumber().get(e.getKey());
for (NormUpdate normUpdate : e.getValue()) {
norms[normUpdate.doc] = normUpdate.value;
@@ -168,27 +179,67 @@
protected void doClose() throws IOException {
// ignored
+ // todo perhaps release all associated instances?
}
- public Collection getFieldNames(FieldOption fldOption) {
- if (fldOption != FieldOption.ALL) {
- throw new IllegalArgumentException("Only FieldOption.ALL implemented."); // todo
+ public Collection getFieldNames(FieldOption fieldOption) {
+ Set<String> fieldSet = new HashSet<String>();
+ for (FieldSetting fi : index.getFieldSettings().values()) {
+ if (fieldOption == IndexReader.FieldOption.ALL) {
+ fieldSet.add(fi.fieldName);
+ } else if (!fi.indexed && fieldOption == IndexReader.FieldOption.UNINDEXED) {
+ fieldSet.add(fi.fieldName);
+ } else if (fi.storePayloads && fieldOption == IndexReader.FieldOption.STORES_PAYLOADS) {
+ fieldSet.add(fi.fieldName);
+ } else if (fi.indexed && fieldOption == IndexReader.FieldOption.INDEXED) {
+ fieldSet.add(fi.fieldName);
+ } else if (fi.indexed && fi.storeTermVector == false && fieldOption == IndexReader.FieldOption.INDEXED_NO_TERMVECTOR) {
+ fieldSet.add(fi.fieldName);
+ } else if (fi.storeTermVector == true && fi.storePositionWithTermVector == false && fi.storeOffsetWithTermVector == false
+ && fieldOption == IndexReader.FieldOption.TERMVECTOR) {
+ fieldSet.add(fi.fieldName);
+ } else if (fi.indexed && fi.storeTermVector && fieldOption == IndexReader.FieldOption.INDEXED_WITH_TERMVECTOR) {
+ fieldSet.add(fi.fieldName);
+ } else if (fi.storePositionWithTermVector && fi.storeOffsetWithTermVector == false
+ && fieldOption == IndexReader.FieldOption.TERMVECTOR_WITH_POSITION) {
+ fieldSet.add(fi.fieldName);
+ } else if (fi.storeOffsetWithTermVector && fi.storePositionWithTermVector == false
+ && fieldOption == IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET) {
+ fieldSet.add(fi.fieldName);
+ } else if ((fi.storeOffsetWithTermVector && fi.storePositionWithTermVector)
+ && fieldOption == IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET) {
+ fieldSet.add(fi.fieldName);
+ }
}
- return new ArrayList<String>(getIndex().getTermsByFieldAndText().keySet());
+ return fieldSet;
}
-
/**
- * This implementation ignores the field selector! All fields are always returned
- *
- * Get the {@link org.apache.lucene.document.Document} at the <code>n</code><sup>th</sup> position.
+ * Return the {@link org.apache.lucene.document.Document} at the <code>n</code><sup>th</sup>
+ * position.
+ <p>
+ * <b>Warning!</b>
+ * The resulting document is the actual stored document instance
+ * and not a deserialized clone as returned by an IndexReader
+ * over a {@link org.apache.lucene.store.Directory}.
+ * I.e., if you need to touch the document, clone it first!
+ * <p>
+ * This can also be seen as a feature for live changes of stored values,
+ * but be careful! Adding a field with a name unknown to the index
+ * or to a field with previously no stored values will make
+ * {@link org.apache.lucene.store.instantiated.InstantiatedIndexReader#getFieldNames(org.apache.lucene.index.IndexReader.FieldOption)}
+ * out of sync, causing problems for instance when merging the
+ * instantiated index to another index.
+ <p>
+ * This implementation ignores the field selector! All stored fields are always returned!
+ * <p>
*
- * @param n Get the document at the <code>n</code><sup>th</sup> position
+ * @param n document number
* @param fieldSelector ignored
* @return The stored fields of the {@link org.apache.lucene.document.Document} at the nth position
* @throws CorruptIndexException if the index is corrupt
* @throws IOException if there is a low-level IO error
- *
+ *
* @see org.apache.lucene.document.Fieldable
* @see org.apache.lucene.document.FieldSelector
* @see org.apache.lucene.document.SetBasedFieldSelector
@@ -198,19 +249,34 @@
return document(n);
}
+ /**
+ * Returns the stored fields of the <code>n</code><sup>th</sup>
+ * <code>Document</code> in this index.
+ * <p>
+ * <b>Warning!</b>
+ * The resulting document is the actual stored document instance
+ * and not a deserialized clone as returned by an IndexReader
+ * over a {@link org.apache.lucene.store.Directory}.
+ * I.e., if you need to touch the document, clone it first!
+ * <p>
+ * This can also be seen as a feature for live changes of stored values,
+ * but be careful! Adding a field with a name unknown to the index
+ * or to a field with previously no stored values will make
+ * {@link org.apache.lucene.store.instantiated.InstantiatedIndexReader#getFieldNames(org.apache.lucene.index.IndexReader.FieldOption)}
+ * out of sync, causing problems for instance when merging the
+ * instantiated index to another index.
+ *
+ * @throws CorruptIndexException if the index is corrupt
+ * @throws IOException if there is a low-level IO error
+ */
+
public Document document(int n) throws IOException {
- if ((deletedDocumentNumbers != null
- && deletedDocumentNumbers.contains(n))
- ||
- (getIndex().getDeletedDocuments() != null
- && getIndex().getDeletedDocuments().contains(n))) {
- return null;
- }
- return getIndex().getDocumentsByNumber()[n].getDocument();
+ return isDeleted(n) ? null : getIndex().getDocumentsByNumber()[n].getDocument();
}
/**
- * never ever touch these values. it is the true values, unless norms have been touched.
+ * never ever touch these values. it is the true values, unless norms have
+ * been touched.
*/
public byte[] norms(String field) throws IOException {
byte[] norms = getIndex().getNormsByFieldNameAndDocumentNumber().get(field);
@@ -233,7 +299,7 @@
protected void doSetNorm(int doc, String field, byte value) throws IOException {
if (updatedNormsByFieldNameAndDocumentNumber == null) {
- updatedNormsByFieldNameAndDocumentNumber = new HashMap<String, List<NormUpdate>>(getIndex().getNormsByFieldNameAndDocumentNumber().size());
+ updatedNormsByFieldNameAndDocumentNumber = new HashMap<String,List<NormUpdate>>(getIndex().getNormsByFieldNameAndDocumentNumber().size());
}
List<NormUpdate> list = updatedNormsByFieldNameAndDocumentNumber.get(field);
if (list == null) {
@@ -252,7 +318,6 @@
}
}
-
public TermEnum terms() throws IOException {
return new InstantiatedTermEnum(this);
}
@@ -260,11 +325,11 @@
public TermEnum terms(Term t) throws IOException {
InstantiatedTerm it = getIndex().findTerm(t);
if (it != null) {
- return new InstantiatedTermEnum(this, it.getTermIndex());
+ return new InstantiatedTermEnum(this, it.getTermIndex());
} else {
int startPos = Arrays.binarySearch(index.getOrderedTerms(), t, InstantiatedTerm.termComparator);
if (startPos < 0) {
- startPos = -1 -startPos;
+ startPos = -1 - startPos;
}
return new InstantiatedTermEnum(this, startPos);
}
@@ -293,19 +358,16 @@
public TermFreqVector getTermFreqVector(int docNumber, String field) throws IOException {
InstantiatedDocument doc = getIndex().getDocumentsByNumber()[docNumber];
- if (doc.getVectorSpace() == null
- || doc.getVectorSpace().get(field) == null) {
+ if (doc.getVectorSpace() == null || doc.getVectorSpace().get(field) == null) {
return null;
} else {
return new InstantiatedTermPositionVector(doc, field);
}
}
-
public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
InstantiatedDocument doc = getIndex().getDocumentsByNumber()[docNumber];
- if (doc.getVectorSpace() != null
- && doc.getVectorSpace().get(field) == null) {
+ if (doc.getVectorSpace() != null && doc.getVectorSpace().get(field) == null) {
List<InstantiatedTermDocumentInformation> tv = doc.getVectorSpace().get(field);
mapper.setExpectations(field, tv.size(), true, true);
for (InstantiatedTermDocumentInformation tdi : tv) {
@@ -316,7 +378,7 @@
public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
InstantiatedDocument doc = getIndex().getDocumentsByNumber()[docNumber];
- for (Map.Entry<String, List<InstantiatedTermDocumentInformation>> e : doc.getVectorSpace().entrySet()) {
+ for (Map.Entry<String,List<InstantiatedTermDocumentInformation>> e : doc.getVectorSpace().entrySet()) {
mapper.setExpectations(e.getKey(), e.getValue().size(), true, true);
for (InstantiatedTermDocumentInformation tdi : e.getValue()) {
mapper.map(tdi.getTerm().text(), tdi.getTermPositions().length, tdi.getTermOffsets(), tdi.getTermPositions());
Modified: lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java?rev=672556&r1=672555&r2=672556&view=diff
==============================================================================
--- lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java (original)
+++ lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java Sat Jun 28 10:23:35 2008
@@ -16,6 +16,22 @@
* limitations under the License.
*/
+import java.io.IOException;
+import java.io.PrintStream;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
@@ -28,11 +44,6 @@
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.Similarity;
-import java.io.IOException;
-import java.io.PrintStream;
-import java.io.StringReader;
-import java.util.*;
-
/**
* This class, similar to {@link org.apache.lucene.index.IndexWriter}, has no locking mechanism.
*
@@ -161,6 +172,11 @@
boolean orderedTermsDirty = false;
Set<InstantiatedTerm> dirtyTerms = new HashSet<InstantiatedTerm>(1000);
+
+ Map<String, FieldSetting> fieldSettingsByFieldName = new HashMap<String, FieldSetting>();
+ for (String fieldName : fieldNameBuffer) {
+ fieldSettingsByFieldName.put(fieldName, new FieldSetting(fieldName));
+ }
InstantiatedDocument[] documentsByNumber = new InstantiatedDocument[index.getDocumentsByNumber().length + termDocumentInformationFactoryByDocument.size()];
System.arraycopy(index.getDocumentsByNumber(), 0, documentsByNumber, 0, index.getDocumentsByNumber().length);
@@ -215,7 +231,7 @@
}
termsInDocument += eFieldTermDocInfoFactoriesByTermText.getValue().size();
- if (eFieldTermDocInfoFactoriesByTermText.getKey().isIndexed && !eFieldTermDocInfoFactoriesByTermText.getKey().omitNorms) {
+ if (eFieldTermDocInfoFactoriesByTermText.getKey().indexed && !eFieldTermDocInfoFactoriesByTermText.getKey().omitNorms) {
float norm = eFieldTermDocInfoFactoriesByTermText.getKey().boost;
norm *= document.getDocument().getBoost();
norm *= similarity.lengthNorm(eFieldTermDocInfoFactoriesByTermText.getKey().fieldName, eFieldTermDocInfoFactoriesByTermText.getKey().fieldLength);
@@ -340,6 +356,7 @@
}
}
+ fieldSettingsByFieldName.putAll(documentFieldSettingsByFieldName);
}
// order document informations in dirty terms
@@ -358,6 +375,9 @@
index.setDocumentsByNumber(documentsByNumber);
index.setOrderedTerms(orderedTerms.toArray(new InstantiatedTerm[orderedTerms.size()]));
+ for (FieldSetting fieldSetting : fieldSettingsByFieldName.values()) {
+ index.getFieldSettings().merge(fieldSetting);
+ }
// set term index
if (orderedTermsDirty) {
// todo optimize, only update from start position
@@ -434,45 +454,46 @@
Map<String /* field name */, FieldSetting> fieldSettingsByFieldName = new HashMap<String, FieldSetting>();
for (Field field : (List<Field>) document.getDocument().getFields()) {
- FieldSetting fieldSettings = fieldSettingsByFieldName.get(field.name());
- if (fieldSettings == null) {
- fieldSettings = new FieldSetting();
- fieldSettings.fieldName = field.name().intern();
- fieldSettingsByFieldName.put(fieldSettings.fieldName, fieldSettings);
- fieldNameBuffer.add(fieldSettings.fieldName);
+ FieldSetting fieldSetting = fieldSettingsByFieldName.get(field.name());
+ if (fieldSetting == null) {
+ fieldSetting = new FieldSetting();
+ fieldSetting.fieldName = field.name().intern();
+ fieldSettingsByFieldName.put(fieldSetting.fieldName, fieldSetting);
+ fieldNameBuffer.add(fieldSetting.fieldName);
}
// todo: fixme: multiple fields with the same name does not mean field boost += more boost.
- fieldSettings.boost *= field.getBoost();
+ fieldSetting.boost *= field.getBoost();
//fieldSettings.dimensions++;
+
// once fieldSettings, always fieldSettings.
- if (field.getOmitNorms() != fieldSettings.omitNorms) {
- fieldSettings.omitNorms = true;
+ if (field.getOmitNorms()) {
+ fieldSetting.omitNorms = true;
}
- if (field.isIndexed() != fieldSettings.isIndexed) {
- fieldSettings.isIndexed = true;
+ if (field.isIndexed() ) {
+ fieldSetting.indexed = true;
}
- if (field.isTokenized() != fieldSettings.isTokenized) {
- fieldSettings.isTokenized = true;
+ if (field.isTokenized()) {
+ fieldSetting.tokenized = true;
}
- if (field.isCompressed() != fieldSettings.isCompressed) {
- fieldSettings.isCompressed = true;
+ if (field.isCompressed()) {
+ fieldSetting.compressed = true;
}
- if (field.isStored() != fieldSettings.isStored) {
- fieldSettings.isStored = true;
+ if (field.isStored()) {
+ fieldSetting.stored = true;
}
- if (field.isBinary() != fieldSettings.isBinary) {
- fieldSettings.isBinary = true;
+ if (field.isBinary()) {
+ fieldSetting.isBinary = true;
}
- if (field.isTermVectorStored() != fieldSettings.storeTermVector) {
- fieldSettings.storeTermVector = true;
+ if (field.isTermVectorStored()) {
+ fieldSetting.storeTermVector = true;
}
- if (field.isStorePositionWithTermVector() != fieldSettings.storePositionWithTermVector) {
- fieldSettings.storePositionWithTermVector = true;
+ if (field.isStorePositionWithTermVector()) {
+ fieldSetting.storePositionWithTermVector = true;
}
- if (field.isStoreOffsetWithTermVector() != fieldSettings.storeOffsetWithTermVector) {
- fieldSettings.storeOffsetWithTermVector = true;
+ if (field.isStoreOffsetWithTermVector()) {
+ fieldSetting.storeOffsetWithTermVector = true;
}
}
@@ -483,7 +504,7 @@
Field field = it.next();
- FieldSetting fieldSettings = fieldSettingsByFieldName.get(field.name());
+ FieldSetting fieldSetting = fieldSettingsByFieldName.get(field.name());
if (field.isIndexed()) {
@@ -505,15 +526,15 @@
next.setTermText(next.termText().intern()); // todo: not sure this needs to be interned?
tokens.add(next); // the vector will be built on commit.
next = tokenStream.next();
- fieldSettings.fieldLength++;
- if (fieldSettings.fieldLength > maxFieldLength) {
+ fieldSetting.fieldLength++;
+ if (fieldSetting.fieldLength > maxFieldLength) {
break;
}
}
} else {
// untokenized
tokens.add(new Token(field.stringValue().intern(), 0, field.stringValue().length(), "untokenized"));
- fieldSettings.fieldLength++;
+ fieldSetting.fieldLength++;
}
}
@@ -528,7 +549,7 @@
// build term vector, term positions and term offsets
for (Map.Entry<Field, LinkedList<Token>> eField_Tokens : tokensByField.entrySet()) {
- FieldSetting fieldSettings = fieldSettingsByFieldName.get(eField_Tokens.getKey().name());
+ FieldSetting fieldSetting = fieldSettingsByFieldName.get(eField_Tokens.getKey().name());
Map<String, TermDocumentInformationFactory> termDocumentInformationFactoryByTermText = termDocumentInformationFactoryByTermTextAndFieldSetting.get(fieldSettingsByFieldName.get(eField_Tokens.getKey().name()));
if (termDocumentInformationFactoryByTermText == null) {
@@ -539,9 +560,9 @@
int lastOffset = 0;
// for each new field, move positions a bunch.
- if (fieldSettings.position > 0) {
+ if (fieldSetting.position > 0) {
// todo what if no analyzer set, multiple fields with same name and index without tokenization?
- fieldSettings.position += analyzer.getPositionIncrementGap(fieldSettings.fieldName);
+ fieldSetting.position += analyzer.getPositionIncrementGap(fieldSetting.fieldName);
}
for (Token token : eField_Tokens.getValue()) {
@@ -553,26 +574,27 @@
}
//termDocumentInformationFactory.termFrequency++;
- fieldSettings.position += (token.getPositionIncrement() - 1);
- termDocumentInformationFactory.termPositions.add(fieldSettings.position++);
+ fieldSetting.position += (token.getPositionIncrement() - 1);
+ termDocumentInformationFactory.termPositions.add(fieldSetting.position++);
if (token.getPayload() != null && token.getPayload().length() > 0) {
termDocumentInformationFactory.payloads.add(token.getPayload().toByteArray());
+ fieldSetting.storePayloads = true;
} else {
termDocumentInformationFactory.payloads.add(null);
}
if (eField_Tokens.getKey().isStoreOffsetWithTermVector()) {
- termDocumentInformationFactory.termOffsets.add(new TermVectorOffsetInfo(fieldSettings.offset + token.startOffset(), fieldSettings.offset + token.endOffset()));
- lastOffset = fieldSettings.offset + token.endOffset();
+ termDocumentInformationFactory.termOffsets.add(new TermVectorOffsetInfo(fieldSetting.offset + token.startOffset(), fieldSetting.offset + token.endOffset()));
+ lastOffset = fieldSetting.offset + token.endOffset();
}
}
if (eField_Tokens.getKey().isStoreOffsetWithTermVector()) {
- fieldSettings.offset = lastOffset + 1;
+ fieldSetting.offset = lastOffset + 1;
}
}
@@ -631,51 +653,30 @@
return analyzer;
}
+ private class TermDocumentInformationFactory {
+ private LinkedList<byte[]> payloads = new LinkedList<byte[]>();
+ private LinkedList<Integer> termPositions = new LinkedList<Integer>();
+ private LinkedList<TermVectorOffsetInfo> termOffsets = new LinkedList<TermVectorOffsetInfo>();
+ }
- private class FieldSetting {
- private String fieldName;
-
- private float boost = 1;
- //private int dimensions = 0; // this is futuristic
- private int position = 0;
- private int offset;
- private int fieldLength = 0;
-
- private boolean storeTermVector = false;
- private boolean storeOffsetWithTermVector = false;
- private boolean storePositionWithTermVector = false;
- private boolean omitNorms = false;
- private boolean isTokenized = false;
-
- private boolean isStored = false;
- private boolean isIndexed = false;
- private boolean isBinary = false;
- private boolean isCompressed = false;
-
- //private float norm;
- //private byte encodedNorm;
- public boolean equals(Object o) {
- if (this == o) return true;
- if (o == null || getClass() != o.getClass()) return false;
+ static class FieldSetting extends org.apache.lucene.store.instantiated.FieldSetting {
- final FieldSetting that = (FieldSetting) o;
+ float boost = 1;
+ int position = 0;
+ int offset;
+ int fieldLength = 0;
- return fieldName.equals(that.fieldName);
+ boolean omitNorms = false;
+ boolean isBinary = false;
+ private FieldSetting() {
}
- public int hashCode() {
- return fieldName.hashCode();
+ private FieldSetting(String fieldName) {
+ super(fieldName);
}
}
- private class TermDocumentInformationFactory {
- private LinkedList<byte[]> payloads = new LinkedList<byte[]>();
- private LinkedList<Integer> termPositions = new LinkedList<Integer>();
- private LinkedList<TermVectorOffsetInfo> termOffsets = new LinkedList<TermVectorOffsetInfo>();
- }
-
-
}
Modified: lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocs.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocs.java?rev=672556&r1=672555&r2=672556&view=diff
==============================================================================
--- lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocs.java (original)
+++ lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocs.java Sat Jun 28 10:23:35 2008
@@ -121,16 +121,11 @@
} else {
return true;
}
-
-
}
/**
* Does nothing
*/
public void close() {
-
}
-
-
}
Modified: lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermEnum.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermEnum.java?rev=672556&r1=672555&r2=672556&view=diff
==============================================================================
--- lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermEnum.java (original)
+++ lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermEnum.java Sat Jun 28 10:23:35 2008
@@ -61,7 +61,7 @@
* Returns the current Term in the enumeration.
*/
public Term term() {
- return /*term == null ? null :*/ term.getTerm();
+ return term == null ? null : term.getTerm();
}
/**
Modified: lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/package.html?rev=672556&r1=672555&r2=672556&view=diff
==============================================================================
--- lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/package.html (original)
+++ lucene/java/trunk/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/package.html Sat Jun 28 10:23:35 2008
@@ -70,9 +70,10 @@
<h2>Caveats</h2>
<ul>
<li>No locks! Consider using InstantiatedIndex as if it was immutable.</li>
- <li>No documents with fields containing readers!</li>
- <li>Only FieldOption.All allowed by IndexReader#getFieldNames(FieldOption).</li>
+ <li>No documents with fields containing readers.</li>
<li>No field selection when retrieving documents, as all stored field are available in memory.</li>
+ <li>Any document returned must be cloned if it is to be touched.</li>
+ <li>Norms array returned must not be touched.</li>
</ul>
<h2>Use cases</h2>
Modified: lucene/java/trunk/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java?rev=672556&r1=672555&r2=672556&view=diff
==============================================================================
--- lucene/java/trunk/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java (original)
+++ lucene/java/trunk/contrib/instantiated/src/test/org/apache/lucene/store/instantiated/TestIndicesEquals.java Sat Jun 28 10:23:35 2008
@@ -47,7 +47,7 @@
// create dir data
IndexWriter indexWriter = new IndexWriter(dir, new StandardAnalyzer(), true);
- for (int i = 0; i < 5; i++) {
+ for (int i = 0; i < 20; i++) {
Document document = new Document();
assembleDocument(document, i);
indexWriter.addDocument(document);
@@ -59,9 +59,10 @@
InstantiatedIndex ii = new InstantiatedIndex(ir);
ir.close();
- testEquals(dir, ii);
+ testEqualBehaviour(dir, ii);
}
+
public void testInstantiatedIndexWriter() throws Exception {
@@ -86,7 +87,7 @@
}
instantiatedIndexWriter.close();
- testEquals(dir, ii);
+ testEqualBehaviour(dir, ii);
testTermDocs(dir, ii);
@@ -186,6 +187,25 @@
* @param testIndex the index that is supposed to equals the apriori index.
* @throws Exception
*/
+ protected void testEqualBehaviour(Directory aprioriIndex, InstantiatedIndex testIndex) throws Exception {
+
+ testEquals(aprioriIndex, testIndex);
+
+ // delete a few documents
+ IndexReader ir = IndexReader.open(aprioriIndex);
+ ir.deleteDocument(3);
+ ir.deleteDocument(8);
+ ir.close();
+
+ ir = testIndex.indexReaderFactory();
+ ir.deleteDocument(3);
+ ir.deleteDocument(8);
+ ir.close();
+
+ // make sure they still equal
+ testEquals(aprioriIndex, testIndex);
+ }
+
protected void testEquals(Directory aprioriIndex, InstantiatedIndex testIndex) throws Exception {
IndexReader aprioriReader = IndexReader.open(aprioriIndex);
@@ -193,6 +213,17 @@
assertEquals(aprioriReader.numDocs(), testReader.numDocs());
+ // assert field options
+ assertEquals(aprioriReader.getFieldNames(IndexReader.FieldOption.INDEXED), testReader.getFieldNames(IndexReader.FieldOption.INDEXED));
+ assertEquals(aprioriReader.getFieldNames(IndexReader.FieldOption.INDEXED_NO_TERMVECTOR), testReader.getFieldNames(IndexReader.FieldOption.INDEXED_NO_TERMVECTOR));
+ assertEquals(aprioriReader.getFieldNames(IndexReader.FieldOption.INDEXED_WITH_TERMVECTOR), testReader.getFieldNames(IndexReader.FieldOption.INDEXED_WITH_TERMVECTOR));
+ assertEquals(aprioriReader.getFieldNames(IndexReader.FieldOption.STORES_PAYLOADS), testReader.getFieldNames(IndexReader.FieldOption.STORES_PAYLOADS));
+ assertEquals(aprioriReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR), testReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR));
+ assertEquals(aprioriReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), testReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET));
+ assertEquals(aprioriReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), testReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION));
+ assertEquals(aprioriReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), testReader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET));
+ assertEquals(aprioriReader.getFieldNames(IndexReader.FieldOption.UNINDEXED), testReader.getFieldNames(IndexReader.FieldOption.UNINDEXED));
+
for (Object field : aprioriReader.getFieldNames(IndexReader.FieldOption.ALL)) {
// test norms as used by normal use