You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by go...@apache.org on 2004/09/30 14:40:28 UTC
cvs commit: jakarta-lucene/src/test/org/apache/lucene/document TestBinaryDocument.java TestDocument.java
goller 2004/09/30 05:40:28
Modified: src/java/org/apache/lucene/index FieldsReader.java
FieldsWriter.java
src/java/org/apache/lucene/document Field.java
src/test/org/apache/lucene/document TestDocument.java
Added: src/test/org/apache/lucene/document TestBinaryDocument.java
Log:
Allow stored fields to be compressed (see Bug#31149)
Revision Changes Path
1.11 +60 -9 jakarta-lucene/src/java/org/apache/lucene/index/FieldsReader.java
Index: FieldsReader.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/FieldsReader.java,v
retrieving revision 1.10
retrieving revision 1.11
diff -u -r1.10 -r1.11
--- FieldsReader.java 16 Sep 2004 21:13:37 -0000 1.10
+++ FieldsReader.java 30 Sep 2004 12:40:26 -0000 1.11
@@ -16,7 +16,10 @@
* limitations under the License.
*/
+import java.io.ByteArrayOutputStream;
import java.io.IOException;
+import java.util.zip.DataFormatException;
+import java.util.zip.Inflater;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@@ -66,29 +69,77 @@
FieldInfo fi = fieldInfos.fieldInfo(fieldNumber);
byte bits = fieldsStream.readByte();
-
- if ((bits & 2) != 0) {
+
+ boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
+ boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
+
+ if ((bits & FieldsWriter.FIELD_IS_BINARY) != 0) {
final byte[] b = new byte[fieldsStream.readVInt()];
fieldsStream.readBytes(b, 0, b.length);
- doc.add(new Field(fi.name, b));
+ if (compressed)
+ doc.add(new Field(fi.name, uncompress(b), Field.Store.COMPRESS));
+ else
+ doc.add(new Field(fi.name, b, Field.Store.YES));
}
else {
Field.Index index;
- boolean tokenize = (bits & 1) != 0;
+ Field.Store store = Field.Store.YES;
+
if (fi.isIndexed && tokenize)
index = Field.Index.TOKENIZED;
else if (fi.isIndexed && !tokenize)
index = Field.Index.UN_TOKENIZED;
else
index = Field.Index.NO;
- doc.add(new Field(fi.name, // name
- fieldsStream.readString(), // read value
- Field.Store.YES, index,
- fi.storeTermVector ? Field.TermVector.YES : Field.TermVector.NO));
+
+ if (compressed) {
+ store = Field.Store.COMPRESS;
+ final byte[] b = new byte[fieldsStream.readVInt()];
+ fieldsStream.readBytes(b, 0, b.length);
+ doc.add(new Field(fi.name, // field name
+ new String(uncompress(b), "UTF-8"), // uncompress the value and add as string
+ store,
+ index,
+ fi.storeTermVector ? Field.TermVector.YES : Field.TermVector.NO));
+ }
+ else
+ doc.add(new Field(fi.name, // name
+ fieldsStream.readString(), // read value
+ store,
+ index,
+ fi.storeTermVector ? Field.TermVector.YES : Field.TermVector.NO));
}
}
return doc;
}
+ private final byte[] uncompress(final byte[] input)
+ throws IOException
+ {
+
+ Inflater decompressor = new Inflater();
+ decompressor.setInput(input);
+
+ // Create an expandable byte array to hold the decompressed data
+ ByteArrayOutputStream bos = new ByteArrayOutputStream(input.length);
+
+ // Decompress the data
+ byte[] buf = new byte[1024];
+ while (!decompressor.finished()) {
+ try {
+ int count = decompressor.inflate(buf);
+ bos.write(buf, 0, count);
+ }
+ catch (DataFormatException e) {
+ // this will happen if the field is not compressed
+ throw new IOException ("field data are in wrong format: " + e.toString());
+ }
+ }
+
+ decompressor.end();
+
+ // Get the decompressed data
+ return bos.toByteArray();
+ }
}
1.6 +62 -5 jakarta-lucene/src/java/org/apache/lucene/index/FieldsWriter.java
Index: FieldsWriter.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/FieldsWriter.java,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -r1.5 -r1.6
--- FieldsWriter.java 28 Sep 2004 18:15:52 -0000 1.5
+++ FieldsWriter.java 30 Sep 2004 12:40:26 -0000 1.6
@@ -16,8 +16,10 @@
* the License.
*/
+import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.Enumeration;
+import java.util.zip.Deflater;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@@ -26,6 +28,10 @@
final class FieldsWriter
{
+ static final short FIELD_IS_TOKENIZED = 1;
+ static final short FIELD_IS_BINARY = 2;
+ static final short FIELD_IS_COMPRESSED = 4;
+
private FieldInfos fieldInfos;
private IndexOutput fieldsStream;
@@ -63,21 +69,72 @@
byte bits = 0;
if (field.isTokenized())
- bits |= 1;
+ bits |= FieldsWriter.FIELD_IS_TOKENIZED;
if (field.isBinary())
- bits |= 2;
+ bits |= FieldsWriter.FIELD_IS_BINARY;
+ if (field.isCompressed())
+ bits |= FieldsWriter.FIELD_IS_COMPRESSED;
+
fieldsStream.writeByte(bits);
-
- if (field.isBinary()) {
+
+ if (field.isCompressed()) {
+ // compression is enabled for the current field
+ byte[] data = null;
+ // check if it is a binary field
+ if (field.isBinary()) {
+ data = compress(field.binaryValue());
+ }
+ else {
+ data = compress(field.stringValue().getBytes("UTF-8"));
+ }
+ final int len = data.length;
+ fieldsStream.writeVInt(len);
+ fieldsStream.writeBytes(data, len);
+ }
+ else {
+ // compression is disabled for the current field
+ if (field.isBinary()) {
byte[] data = field.binaryValue();
final int len = data.length;
fieldsStream.writeVInt(len);
fieldsStream.writeBytes(data, len);
- } else {
+ }
+ else {
fieldsStream.writeString(field.stringValue());
+ }
}
}
}
}
+ private final byte[] compress (byte[] input) {
+
+ // Create the compressor with highest level of compression
+ Deflater compressor = new Deflater();
+ compressor.setLevel(Deflater.BEST_COMPRESSION);
+
+ // Give the compressor the data to compress
+ compressor.setInput(input);
+ compressor.finish();
+
+ /*
+ * Create an expandable byte array to hold the compressed data.
+ * You cannot use an array that's the same size as the original because
+ * there is no guarantee that the compressed data will be smaller than
+ * the uncompressed data.
+ */
+ ByteArrayOutputStream bos = new ByteArrayOutputStream(input.length);
+
+ // Compress the data
+ byte[] buf = new byte[1024];
+ while (!compressor.finished()) {
+ int count = compressor.deflate(buf);
+ bos.write(buf, 0, count);
+ }
+
+ compressor.end();
+
+ // Get the compressed data
+ return bos.toByteArray();
+ }
}
1.24 +62 -27 jakarta-lucene/src/java/org/apache/lucene/document/Field.java
Index: Field.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/document/Field.java,v
retrieving revision 1.23
retrieving revision 1.24
diff -u -r1.23 -r1.24
--- Field.java 15 Sep 2004 12:50:22 -0000 1.23
+++ Field.java 30 Sep 2004 12:40:26 -0000 1.24
@@ -33,15 +33,16 @@
public final class Field implements java.io.Serializable {
private String name = "body";
- private String stringValue = null;
- private Reader readerValue = null;
- private byte[] binaryValue = null;
+
+ // the one and only data object for all the different kinds of field values
+ private Object fieldsData = null;
private boolean storeTermVector = false;
private boolean isStored = false;
private boolean isIndexed = true;
private boolean isTokenized = true;
private boolean isBinary = false;
+ private boolean isCompressed = false;
private float boost = 1.0f;
@@ -54,6 +55,10 @@
public String toString() {
return name;
}
+ /** Store the original field value in the index in a compressed form. This is
+ * useful for long documents and for binary valued fields.
+ */
+ public static final Store COMPRESS = new Store("COMPRESS");
/** Store the original field value in the index. This is useful for short texts
* like a document's title which should be displayed with the results. The
* value is stored in its original form, i.e. no analyzer is used before it is
@@ -220,18 +225,22 @@
/** The name of the field (e.g., "date", "title", "body", ...)
as an interned string. */
- public String name() { return name; }
+ public String name() { return name; }
/** The value of the field as a String, or null. If null, the Reader value
- is used. Exactly one of stringValue() and readerValue() must be set. */
- public String stringValue() { return stringValue; }
+ * or binary value is used. Exactly one of stringValue(), readerValue(), and
+ * binaryValue() must be set. */
+ public String stringValue() { try { return (String)fieldsData; } catch (ClassCastException ignore) { return null; } }
+
/** The value of the field as a Reader, or null. If null, the String value
- is used. Exactly one of stringValue() and readerValue() must be set. */
- public Reader readerValue() { return readerValue; }
+ * or binary value is used. Exactly one of stringValue(), readerValue(),
+ * and binaryValue() must be set. */
+ public Reader readerValue() { try { return (Reader)fieldsData; } catch (ClassCastException ignore) { return null; } }
+
/** The value of the field in Binary, or null. If null, the Reader or
- String value is used. Exactly one of stringValue(), readerValue() and
- binaryValue() must be set. */
- public byte[] binaryValue() { return binaryValue; }
+ * String value is used. Exactly one of stringValue(), readerValue() and
+ * binaryValue() must be set. */
+ public byte[] binaryValue() { try { return (byte[])fieldsData; } catch (ClassCastException ignore) { return null; } }
/**
* Create a field by specifying its name, value and how it will
@@ -277,12 +286,16 @@
if (index == Index.NO && termVector != TermVector.NO)
throw new IllegalArgumentException("cannot store term vector information "
+ "for a field that is not indexed");
-
+
this.name = name.intern(); // field names are interned
- this.stringValue = value;
+ this.fieldsData = value;
if (store == Store.YES)
this.isStored = true;
+ else if (store == Store.COMPRESS) {
+ this.isStored = true;
+ this.isCompressed = true;
+ }
else if (store == Store.NO)
this.isStored = false;
else
@@ -331,7 +344,7 @@
if (reader == null)
throw new NullPointerException("reader cannot be null");
this.name = name.intern(); // field names are interned
- this.readerValue = reader;
+ this.fieldsData = reader;
this.isStored = false;
this.isIndexed = true;
this.isTokenized = true;
@@ -344,18 +357,31 @@
* @deprecated use {@link #Field(String, String, Field.Store, Field.Index)} instead
*/
public Field(String name, String string,
- boolean store, boolean index, boolean token) {
+ boolean store, boolean index, boolean token) {
this(name, string, store, index, token, false);
}
- public Field(String name, byte[] value) {
+
+ /**
+ * Create a stored field with binary value. Optionally the value may be compressed.
+ *
+ * @param name The name of the field
+ * @param value The binary value
+ * @param store How <code>value</code> should be stored (compressed or not.)
+ */
+ public Field(String name, byte[] value, Store store) {
if (name == null)
throw new IllegalArgumentException("name cannot be null");
if (value == null)
throw new IllegalArgumentException("value cannot be null");
+ if (store == Store.NO)
+ throw new IllegalArgumentException("binary values can't be unstored");
+ if (store == Store.COMPRESS)
+ this.isCompressed = true;
this.name = name.intern();
- this.binaryValue = value;
+ // keep the byte[] as-is; it is stored directly as the field's data
+ this.fieldsData = value;
this.isBinary = true;
this.isStored = true;
@@ -377,7 +403,7 @@
* @deprecated use {@link #Field(String, String, Field.Store, Field.Index, Field.TermVector)} instead
*/
public Field(String name, String string,
- boolean store, boolean index, boolean token, boolean storeTermVector) {
+ boolean store, boolean index, boolean token, boolean storeTermVector) {
if (name == null)
throw new NullPointerException("name cannot be null");
if (string == null)
@@ -385,8 +411,8 @@
if (!index && storeTermVector)
throw new IllegalArgumentException("cannot store a term vector for fields that are not indexed");
- this.name = name.intern(); // field names are interned
- this.stringValue = string;
+ this.name = name.intern(); // field names are interned
+ this.fieldsData = string;
this.isStored = store;
this.isIndexed = index;
this.isTokenized = token;
@@ -406,16 +432,19 @@
/** True iff the value of the field is to be stored in the index for return
with search hits. It is an error for this to be true if a field is
Reader-valued. */
- public final boolean isStored() { return isStored; }
+ public final boolean isStored() { return isStored; }
/** True iff the value of the field is to be indexed, so that it may be
searched on. */
- public final boolean isIndexed() { return isIndexed; }
+ public final boolean isIndexed() { return isIndexed; }
/** True iff the value of the field should be tokenized as text prior to
indexing. Un-tokenized fields are indexed as a single word and may not be
Reader-valued. */
- public final boolean isTokenized() { return isTokenized; }
+ public final boolean isTokenized() { return isTokenized; }
+
+ /** True if the value of the field is stored and compressed within the index */
+ public final boolean isCompressed() { return isCompressed; }
/** True iff the term or terms used to index this field are stored as a term
* vector, available from {@link IndexReader#getTermFreqVector(int,String)}.
@@ -456,14 +485,20 @@
result.append("binary");
}
+ if (isCompressed) {
+ if (result.length() > 0)
+ result.append(",");
+ result.append("compressed");
+ }
+
result.append('<');
result.append(name);
result.append(':');
- if (readerValue != null) {
- result.append(readerValue.toString());
- } else {
- result.append(stringValue);
+
+ if (fieldsData != null) {
+ result.append(fieldsData);
}
+
result.append('>');
return result.toString();
}
1.9 +3 -3 jakarta-lucene/src/test/org/apache/lucene/document/TestDocument.java
Index: TestDocument.java
===================================================================
RCS file: /home/cvs/jakarta-lucene/src/test/org/apache/lucene/document/TestDocument.java,v
retrieving revision 1.8
retrieving revision 1.9
diff -u -r1.8 -r1.9
--- TestDocument.java 15 Sep 2004 12:50:23 -0000 1.8
+++ TestDocument.java 30 Sep 2004 12:40:28 -0000 1.9
@@ -47,8 +47,8 @@
{
Document doc = new Document();
Field stringFld = new Field("string", binaryVal, Field.Store.YES, Field.Index.NO);
- Field binaryFld = new Field("binary", binaryVal.getBytes());
- Field binaryFld2 = new Field("binary", binaryVal2.getBytes());
+ Field binaryFld = new Field("binary", binaryVal.getBytes(), Field.Store.YES);
+ Field binaryFld2 = new Field("binary", binaryVal2.getBytes(), Field.Store.YES);
doc.add(stringFld);
doc.add(binaryFld);
1.1 jakarta-lucene/src/test/org/apache/lucene/document/TestBinaryDocument.java
Index: TestBinaryDocument.java
===================================================================
package org.apache.lucene.document;
import junit.framework.TestCase;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.RAMDirectory;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Tests binary and compressed stored {@link Field} values: the fields are
 * added to a {@link Document}, written to an in-memory index and read back.
 *
 * @author Bernhard Messer
 * @version $Id: TestBinaryDocument.java,v 1.1 2004/09/30 12:40:28 goller Exp $
 */
public class TestBinaryDocument extends TestCase
{
  /**
   * Charset used for every String to byte[] conversion in this test, so the
   * result does not depend on the platform default encoding.
   */
  private static final String CHARSET = "UTF-8";

  String binaryValStored = "this text will be stored as a byte array in the index";
  String binaryValCompressed = "this text will be also stored and compressed as a byte array in the index";

  /**
   * Stores one plain binary, one compressed binary, one plain string and one
   * compressed string field in a RAM index and checks that each value read
   * back equals the value that was written.
   *
   * @throws Exception if indexing or reading the document fails
   */
  public void testBinaryFieldInIndex()
    throws Exception
  {
    Field binaryFldStored = new Field("binaryStored", binaryValStored.getBytes(CHARSET), Field.Store.YES);
    Field binaryFldCompressed = new Field("binaryCompressed", binaryValCompressed.getBytes(CHARSET), Field.Store.COMPRESS);
    Field stringFldStored = new Field("stringStored", binaryValStored, Field.Store.YES, Field.Index.NO, Field.TermVector.NO);
    Field stringFldCompressed = new Field("stringCompressed", binaryValCompressed, Field.Store.COMPRESS, Field.Index.NO, Field.TermVector.NO);

    try {
      // binary fields with store off are not allowed
      new Field("fail", binaryValCompressed.getBytes(CHARSET), Field.Store.NO);
      fail("a binary field with Field.Store.NO must be rejected");
    }
    catch (IllegalArgumentException expected) {
      // expected: the constructor refuses unstored binary values
    }

    Document doc = new Document();
    doc.add(binaryFldStored);
    doc.add(binaryFldCompressed);
    doc.add(stringFldStored);
    doc.add(stringFldCompressed);

    // test for field count
    assertEquals(4, doc.fields.size());

    // add the doc to a ram index
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true);
    try {
      writer.addDocument(doc);
    }
    finally {
      // close the writer even if adding the document fails
      writer.close();
    }

    // open a reader and fetch the document
    IndexReader reader = IndexReader.open(dir);
    try {
      Document docFromReader = reader.document(0);
      assertNotNull(docFromReader);

      // fetch the binary stored field and compare its content with the original one
      String binaryFldStoredTest = new String(docFromReader.getBinaryValue("binaryStored"), CHARSET);
      assertEquals(binaryValStored, binaryFldStoredTest);

      // fetch the binary compressed field and compare its content with the original one
      String binaryFldCompressedTest = new String(docFromReader.getBinaryValue("binaryCompressed"), CHARSET);
      assertEquals(binaryValCompressed, binaryFldCompressedTest);

      // fetch the string field and compare its content with the original one
      assertEquals(binaryValStored, docFromReader.get("stringStored"));

      // fetch the compressed string field and compare its content with the original one
      assertEquals(binaryValCompressed, docFromReader.get("stringCompressed"));

      // delete the document from index
      reader.delete(0);
      assertEquals(0, reader.numDocs());
    }
    finally {
      // release the reader even when an assertion above fails
      reader.close();
    }
  }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-dev-help@jakarta.apache.org