You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by go...@apache.org on 2004/09/15 14:50:23 UTC
cvs commit: jakarta-lucene/src/test/org/apache/lucene/document TestDocument.java
goller 2004/09/15 05:50:23
Modified: src/java/org/apache/lucene/index FieldsReader.java
FieldsWriter.java
src/java/org/apache/lucene/document Field.java Document.java
src/test/org/apache/lucene/document TestDocument.java
Log:
Applied patch #29370 supplied by Drew Farris and
Bernhard Messer.
Revision Changes Path
1.9 +23 -15 jakarta-lucene/src/java/org/apache/lucene/index/FieldsReader.java
Index: FieldsReader.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/FieldsReader.java,v
retrieving revision 1.8
retrieving revision 1.9
diff -u -r1.8 -r1.9
--- FieldsReader.java 1 Sep 2004 20:04:12 -0000 1.8
+++ FieldsReader.java 15 Sep 2004 12:50:22 -0000 1.9
@@ -18,10 +18,10 @@
import java.io.IOException;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.InputStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.InputStream;
/**
* Class responsible for access to stored document fields.
@@ -67,20 +67,28 @@
byte bits = fieldsStream.readByte();
- Field.Index index;
- boolean tokenize = (bits & 1) != 0;
- if (fi.isIndexed && tokenize)
- index = Field.Index.TOKENIZED;
- else if (fi.isIndexed && !tokenize)
- index = Field.Index.UN_TOKENIZED;
- else
- index = Field.Index.NO;
- doc.add(new Field(fi.name, // name
- fieldsStream.readString(), // read value
- Field.Store.YES, index,
- fi.storeTermVector ? Field.TermVector.YES : Field.TermVector.NO));
+ if ((bits & 2) != 0) {
+ final byte[] b = new byte[fieldsStream.readVInt()];
+ fieldsStream.readBytes(b, 0, b.length);
+ doc.add(new Field(fi.name, b));
+ }
+ else {
+ Field.Index index;
+ boolean tokenize = (bits & 1) != 0;
+ if (fi.isIndexed && tokenize)
+ index = Field.Index.TOKENIZED;
+ else if (fi.isIndexed && !tokenize)
+ index = Field.Index.UN_TOKENIZED;
+ else
+ index = Field.Index.NO;
+ doc.add(new Field(fi.name, // name
+ fieldsStream.readString(), // read value
+ Field.Store.YES, index,
+ fi.storeTermVector ? Field.TermVector.YES : Field.TermVector.NO));
+ }
}
return doc;
}
+
}
1.4 +68 -56 jakarta-lucene/src/java/org/apache/lucene/index/FieldsWriter.java
Index: FieldsWriter.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/FieldsWriter.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- FieldsWriter.java 29 Mar 2004 22:48:02 -0000 1.3
+++ FieldsWriter.java 15 Sep 2004 12:50:22 -0000 1.4
@@ -2,70 +2,82 @@
/**
* Copyright 2004 The Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
* Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
*/
-import java.util.Enumeration;
import java.io.IOException;
+import java.util.Enumeration;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.OutputStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.OutputStream;
-final class FieldsWriter {
- private FieldInfos fieldInfos;
- private OutputStream fieldsStream;
- private OutputStream indexStream;
-
- FieldsWriter(Directory d, String segment, FieldInfos fn)
- throws IOException {
- fieldInfos = fn;
- fieldsStream = d.createFile(segment + ".fdt");
- indexStream = d.createFile(segment + ".fdx");
- }
-
- final void close() throws IOException {
- fieldsStream.close();
- indexStream.close();
- }
-
- final void addDocument(Document doc) throws IOException {
- indexStream.writeLong(fieldsStream.getFilePointer());
-
- int storedCount = 0;
- Enumeration fields = doc.fields();
- while (fields.hasMoreElements()) {
- Field field = (Field)fields.nextElement();
- if (field.isStored())
- storedCount++;
+final class FieldsWriter
+{
+ private FieldInfos fieldInfos;
+
+ private OutputStream fieldsStream;
+
+ private OutputStream indexStream;
+
+ FieldsWriter(Directory d, String segment, FieldInfos fn) throws IOException {
+ fieldInfos = fn;
+ fieldsStream = d.createFile(segment + ".fdt");
+ indexStream = d.createFile(segment + ".fdx");
}
- fieldsStream.writeVInt(storedCount);
-
- fields = doc.fields();
- while (fields.hasMoreElements()) {
- Field field = (Field)fields.nextElement();
- if (field.isStored()) {
- fieldsStream.writeVInt(fieldInfos.fieldNumber(field.name()));
-
- byte bits = 0;
- if (field.isTokenized())
- bits |= 1;
- fieldsStream.writeByte(bits);
- fieldsStream.writeString(field.stringValue());
- }
+ final void close() throws IOException {
+ fieldsStream.close();
+ indexStream.close();
}
- }
-}
+
+ final void addDocument(Document doc) throws IOException {
+ indexStream.writeLong(fieldsStream.getFilePointer());
+
+ int storedCount = 0;
+ Enumeration fields = doc.fields();
+ while (fields.hasMoreElements()) {
+ Field field = (Field) fields.nextElement();
+ if (field.isStored())
+ storedCount++;
+ }
+ fieldsStream.writeVInt(storedCount);
+
+ fields = doc.fields();
+ while (fields.hasMoreElements()) {
+ Field field = (Field) fields.nextElement();
+ if (field.isStored()) {
+ fieldsStream.writeVInt(fieldInfos.fieldNumber(field.name()));
+
+ byte bits = 0;
+ if (field.isTokenized())
+ bits |= 1;
+ if (field.isBinary())
+ bits |= 2;
+ fieldsStream.writeByte(bits);
+
+ if (field.isBinary()) {
+ byte[] data = field.binaryValue();
+ final int len = data.length;
+ fieldsStream.writeVInt(len);
+ fieldsStream.writeBytes(data, len);
+ } else {
+ fieldsStream.writeString(field.stringValue());
+ }
+ }
+ }
+ }
+
+}
\ No newline at end of file
1.23 +43 -9 jakarta-lucene/src/java/org/apache/lucene/document/Field.java
Index: Field.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/document/Field.java,v
retrieving revision 1.22
retrieving revision 1.23
diff -u -r1.22 -r1.23
--- Field.java 1 Sep 2004 22:11:07 -0000 1.22
+++ Field.java 15 Sep 2004 12:50:22 -0000 1.23
@@ -18,9 +18,10 @@
import java.io.Reader;
import java.util.Date;
-import org.apache.lucene.index.IndexReader; // for javadoc
-import org.apache.lucene.search.Similarity; // for javadoc
-import org.apache.lucene.search.Hits; // for javadoc
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.Similarity;
/**
A field is a section of a Document. Each field has two parts, a name and a
@@ -33,12 +34,15 @@
public final class Field implements java.io.Serializable {
private String name = "body";
private String stringValue = null;
- private boolean storeTermVector = false;
private Reader readerValue = null;
+ private byte[] binaryValue = null;
+
+ private boolean storeTermVector = false;
private boolean isStored = false;
private boolean isIndexed = true;
private boolean isTokenized = true;
-
+ private boolean isBinary = false;
+
private float boost = 1.0f;
public static final class Store {
@@ -101,7 +105,7 @@
* of the document's terms and their number of occurences in that document. */
public static final TermVector YES = new TermVector("YES");
}
-
+
/** Sets the boost factor hits on this field. This value will be
* multiplied into the score of all hits on this this field of this
* document.
@@ -213,7 +217,7 @@
f.storeTermVector = storeTermVector;
return f;
}
-
+
/** The name of the field (e.g., "date", "title", "body", ...)
as an interned string. */
public String name() { return name; }
@@ -224,7 +228,11 @@
/** The value of the field as a Reader, or null. If null, the String value
is used. Exactly one of stringValue() and readerValue() must be set. */
public Reader readerValue() { return readerValue; }
-
+ /** The value of the field in Binary, or null. If null, the Reader or
+ String value is used. Exactly one of stringValue(), readerValue() and
+ binaryValue() must be set. */
+ public byte[] binaryValue() { return binaryValue; }
+
/**
* Create a field by specifying its name, value and how it will
* be saved in the index. Term vectors will not be stored in the index.
@@ -240,7 +248,7 @@
public Field(String name, String value, Store store, Index index) {
this(name, value, store, index, TermVector.NO);
}
-
+
/**
* Create a field by specifying its name, value and how it will
* be saved in the index.
@@ -340,6 +348,23 @@
this(name, string, store, index, token, false);
}
+ public Field(String name, byte[] value) {
+ if (name == null)
+ throw new IllegalArgumentException("name cannot be null");
+ if (value == null)
+ throw new IllegalArgumentException("value cannot be null");
+
+ this.name = name.intern();
+ this.binaryValue = value;
+
+ this.isBinary = true;
+ this.isStored = true;
+
+ this.isIndexed = false;
+ this.isTokenized = false;
+ this.storeTermVector = false;
+ }
+
/**
*
* @param name The name of the field
@@ -402,6 +427,9 @@
*/
public final boolean isTermVectorStored() { return storeTermVector; }
+ /** True iff the value of the field is stored as binary */
+ public final boolean isBinary() { return isBinary; }
+
/** Prints a Field for human consumption. */
public final String toString() {
StringBuffer result = new StringBuffer();
@@ -422,6 +450,12 @@
result.append(",");
result.append("termVector");
}
+ if (isBinary) {
+ if (result.length() > 0)
+ result.append(",");
+ result.append("binary");
+ }
+
result.append('<');
result.append(name);
result.append(':');
1.20 +58 -13 jakarta-lucene/src/java/org/apache/lucene/document/Document.java
Index: Document.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/document/Document.java,v
retrieving revision 1.19
retrieving revision 1.20
diff -u -r1.19 -r1.20
--- Document.java 21 Apr 2004 17:08:04 -0000 1.19
+++ Document.java 15 Sep 2004 12:50:22 -0000 1.20
@@ -144,14 +144,16 @@
/** Returns the string value of the field with the given name if any exist in
* this document, or null. If multiple fields exist with this name, this
- * method returns the first value added.
+ * method returns the first value added. If only binary fields with this name
+ * exist, returns null.
*/
public final String get(String name) {
- Field field = getField(name);
- if (field != null)
- return field.stringValue();
- else
- return null;
+ for (int i = 0; i < fields.size(); i++) {
+ Field field = (Field)fields.get(i);
+ if (field.name().equals(name) && (!field.isBinary()))
+ return field.stringValue();
+ }
+ return null;
}
/** Returns an Enumeration of all the fields in a document. */
@@ -189,16 +191,59 @@
* @return a <code>String[]</code> of field values
*/
public final String[] getValues(String name) {
- Field[] namedFields = getFields(name);
- if (namedFields == null)
- return null;
- String[] values = new String[namedFields.length];
- for (int i = 0; i < namedFields.length; i++) {
- values[i] = namedFields[i].stringValue();
+ List result = new ArrayList();
+ for (int i = 0; i < fields.size(); i++) {
+ Field field = (Field)fields.get(i);
+ if (field.name().equals(name) && (!field.isBinary()))
+ result.add(field.stringValue());
}
- return values;
+
+ if (result.size() == 0)
+ return null;
+
+ return (String[])result.toArray(new String[result.size()]);
}
+ /**
+ * Returns an array of byte arrays for all of the fields that have the name specified
+ * as the method parameter. This method will return <code>null</code> if no
+ * binary fields with the specified name are available.
+ *
+ * @param name the name of the field
+ * @return a <code>byte[][]</code> of binary field values.
+ */
+ public final byte[][] getBinaryValues(String name) {
+ List result = new ArrayList();
+ for (int i = 0; i < fields.size(); i++) {
+ Field field = (Field)fields.get(i);
+ if (field.name().equals(name) && (field.isBinary()))
+ result.add(field.binaryValue());
+ }
+
+ if (result.size() == 0)
+ return null;
+
+ return (byte[][])result.toArray(new byte[result.size()][]);
+ }
+
+ /**
+ * Returns an array of bytes for the first (or only) field that has the name
+ * specified as the method parameter. This method will return <code>null</code>
+ * if no binary fields with the specified name are available.
+ * There may be non-binary fields with the same name.
+ *
+ * @param name the name of the field.
+ * @return a <code>byte[]</code> containing the binary field value.
+ */
+ public final byte[] getBinaryValue(String name) {
+ for (int i=0; i < fields.size(); i++) {
+ Field field = (Field)fields.get(i);
+ if (field.name().equals(name) && (field.isBinary()))
+ return field.binaryValue();
+ }
+ return null;
+ }
+
/** Prints the fields of a document for human consumption. */
public final String toString() {
StringBuffer buffer = new StringBuffer();
1.8 +51 -1 jakarta-lucene/src/test/org/apache/lucene/document/TestDocument.java
Index: TestDocument.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/test/org/apache/lucene/document/TestDocument.java,v
retrieving revision 1.7
retrieving revision 1.8
diff -u -r1.7 -r1.8
--- TestDocument.java 5 Sep 2004 22:27:42 -0000 1.7
+++ TestDocument.java 15 Sep 2004 12:50:23 -0000 1.8
@@ -39,6 +39,56 @@
public class TestDocument extends TestCase
{
+ String binaryVal = "this text will be stored as a byte array in the index";
+ String binaryVal2 = "this text will be also stored as a byte array in the index";
+
+ public void testBinaryField()
+ throws Exception
+ {
+ Document doc = new Document();
+ Field stringFld = new Field("string", binaryVal, Field.Store.YES, Field.Index.NO);
+ Field binaryFld = new Field("binary", binaryVal.getBytes());
+ Field binaryFld2 = new Field("binary", binaryVal2.getBytes());
+
+ doc.add(stringFld);
+ doc.add(binaryFld);
+
+ assertEquals(2, doc.fields.size());
+
+ assertTrue(binaryFld.isBinary());
+ assertTrue(binaryFld.isStored());
+ assertFalse(binaryFld.isIndexed());
+ assertFalse(binaryFld.isTokenized());
+
+ String binaryTest = new String(doc.getBinaryValue("binary"));
+ assertTrue(binaryTest.equals(binaryVal));
+
+ String stringTest = doc.get("string");
+ assertTrue(binaryTest.equals(stringTest));
+
+ doc.add(binaryFld2);
+
+ assertEquals(3, doc.fields.size());
+
+ byte[][] binaryTests = doc.getBinaryValues("binary");
+
+ assertEquals(2, binaryTests.length);
+
+ binaryTest = new String(binaryTests[0]);
+ String binaryTest2 = new String(binaryTests[1]);
+
+ assertFalse(binaryTest.equals(binaryTest2));
+
+ assertTrue(binaryTest.equals(binaryVal));
+ assertTrue(binaryTest2.equals(binaryVal2));
+
+ doc.removeField("string");
+ assertEquals(2, doc.fields.size());
+
+ doc.removeFields("binary");
+ assertEquals(0, doc.fields.size());
+ }
+
/**
* Tests {@link Document#removeField(String)} method for a brand new Document
* that has not been indexed yet.
---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-dev-help@jakarta.apache.org
Re: cvs commit: jakarta-lucene/src/test/org/apache/lucene/document
TestDocument.java
Posted by Christoph Goller <go...@detego-software.de>.
Doug Cutting wrote:
> goller@apache.org wrote:
>
>> + if ((bits & 2) != 0) {
>> + final byte[] b = new byte[fieldsStream.readVInt()];
>> + fieldsStream.readBytes(b, 0, b.length);
>> + doc.add(new Field(fi.name, b));
>> + }
>> + else {
>> + Field.Index index;
>> + boolean tokenize = (bits & 1) != 0;
>
>
> I'd still like to see constants for these bit masks, as suggested in:
>
> http://www.mail-archive.com/lucene-dev@jakarta.apache.org/msg06693.html
>
> This will make the code much more readable as we add, e.g., compression.
>
> Doug
Yes, that would be better. I just noticed that you had already suggested
compressed non-binary values. That's what I meant by my latest comment
in Bugzilla. So I suggest waiting until Bernhard is back. Maybe he will
change his compression patch accordingly and also add the bit masks.
I will keep this in mind.
Christoph
---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-dev-help@jakarta.apache.org
Re: cvs commit: jakarta-lucene/src/test/org/apache/lucene/document
TestDocument.java
Posted by Doug Cutting <cu...@apache.org>.
goller@apache.org wrote:
> + if ((bits & 2) != 0) {
> + final byte[] b = new byte[fieldsStream.readVInt()];
> + fieldsStream.readBytes(b, 0, b.length);
> + doc.add(new Field(fi.name, b));
> + }
> + else {
> + Field.Index index;
> + boolean tokenize = (bits & 1) != 0;
I'd still like to see constants for these bit masks, as suggested in:
http://www.mail-archive.com/lucene-dev@jakarta.apache.org/msg06693.html
This will make the code much more readable as we add, e.g., compression.
Doug
---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-dev-help@jakarta.apache.org