Posted to dev@lucene.apache.org by go...@apache.org on 2004/09/30 14:40:28 UTC

cvs commit: jakarta-lucene/src/test/org/apache/lucene/document TestBinaryDocument.java TestDocument.java

goller      2004/09/30 05:40:28

  Modified:    src/java/org/apache/lucene/index FieldsReader.java
                        FieldsWriter.java
               src/java/org/apache/lucene/document Field.java
               src/test/org/apache/lucene/document TestDocument.java
  Added:       src/test/org/apache/lucene/document TestBinaryDocument.java
  Log:
  Allow stored fields to be compressed (see Bug#31149)
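  
  For context, a minimal usage sketch of the new Field.Store.COMPRESS option
  introduced by this change (illustrative only, not part of the patch; the
  variables someLongText and someImageBytes are placeholders):
  
      import org.apache.lucene.analysis.standard.StandardAnalyzer;
      import org.apache.lucene.document.Document;
      import org.apache.lucene.document.Field;
      import org.apache.lucene.index.IndexWriter;
      import org.apache.lucene.store.RAMDirectory;
      
      RAMDirectory dir = new RAMDirectory();
      IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true);
      
      Document doc = new Document();
      // long text: stored in compressed form, still tokenized and searchable
      doc.add(new Field("body", someLongText, Field.Store.COMPRESS, Field.Index.TOKENIZED));
      // binary value: stored compressed via the new Field(String, byte[], Store) constructor
      doc.add(new Field("thumbnail", someImageBytes, Field.Store.COMPRESS));
      
      writer.addDocument(doc);
      writer.close();
  
  When such a document is read back, FieldsReader uncompresses the stored values
  transparently, so Document.get() and Document.getBinaryValue() return the
  original content.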
  
  Revision  Changes    Path
  1.11      +60 -9     jakarta-lucene/src/java/org/apache/lucene/index/FieldsReader.java
  
  Index: FieldsReader.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/FieldsReader.java,v
  retrieving revision 1.10
  retrieving revision 1.11
  diff -u -r1.10 -r1.11
  --- FieldsReader.java	16 Sep 2004 21:13:37 -0000	1.10
  +++ FieldsReader.java	30 Sep 2004 12:40:26 -0000	1.11
  @@ -16,7 +16,10 @@
    * limitations under the License.
    */
   
  +import java.io.ByteArrayOutputStream;
   import java.io.IOException;
  +import java.util.zip.DataFormatException;
  +import java.util.zip.Inflater;
   
   import org.apache.lucene.document.Document;
   import org.apache.lucene.document.Field;
  @@ -66,29 +69,77 @@
         FieldInfo fi = fieldInfos.fieldInfo(fieldNumber);
   
         byte bits = fieldsStream.readByte();
  -
  -      if ((bits & 2) != 0) {
  +      
  +      boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
  +      boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
  +      
  +      if ((bits & FieldsWriter.FIELD_IS_BINARY) != 0) {
           final byte[] b = new byte[fieldsStream.readVInt()];
           fieldsStream.readBytes(b, 0, b.length);
  -        doc.add(new Field(fi.name, b));
  +        if (compressed)
  +          doc.add(new Field(fi.name, uncompress(b), Field.Store.COMPRESS));
  +        else
  +          doc.add(new Field(fi.name, b, Field.Store.YES));
         }
         else {
           Field.Index index;
  -        boolean tokenize = (bits & 1) != 0;
  +        Field.Store store = Field.Store.YES;
  +        
           if (fi.isIndexed && tokenize)
             index = Field.Index.TOKENIZED;
           else if (fi.isIndexed && !tokenize)
             index = Field.Index.UN_TOKENIZED;
           else
             index = Field.Index.NO;
  -        doc.add(new Field(fi.name,		  // name
  -  			fieldsStream.readString(), // read value
  -  			Field.Store.YES, index,
  -  			fi.storeTermVector ? Field.TermVector.YES : Field.TermVector.NO));
  +        
  +        if (compressed) {
  +          store = Field.Store.COMPRESS;
  +          final byte[] b = new byte[fieldsStream.readVInt()];
  +          fieldsStream.readBytes(b, 0, b.length);
  +          doc.add(new Field(fi.name,      // field name
  +              new String(uncompress(b), "UTF-8"), // uncompress the value and add as string
  +              store,
  +              index,
  +              fi.storeTermVector ? Field.TermVector.YES : Field.TermVector.NO));
  +        }
  +        else
  +          doc.add(new Field(fi.name,      // name
  +                fieldsStream.readString(), // read value
  +                store,
  +                index,
  +                fi.storeTermVector ? Field.TermVector.YES : Field.TermVector.NO));
         }
       }
   
       return doc;
     }
     
  +  private final byte[] uncompress(final byte[] input)
  +    throws IOException
  +  {
  +  
  +    Inflater decompressor = new Inflater();
  +    decompressor.setInput(input);
  +  
  +    // Create an expandable byte array to hold the decompressed data
  +    ByteArrayOutputStream bos = new ByteArrayOutputStream(input.length);
  +  
  +    // Decompress the data
  +    byte[] buf = new byte[1024];
  +    while (!decompressor.finished()) {
  +      try {
  +        int count = decompressor.inflate(buf);
  +        bos.write(buf, 0, count);
  +      }
  +      catch (DataFormatException e) {
  +        // this will happen if the field is not compressed
  +        throw new IOException("field data is in the wrong format: " + e.toString());
  +      }
  +    }
  +  
  +    decompressor.end();
  +    
  +    // Get the decompressed data
  +    return bos.toByteArray();
  +  }
   }
  
  
  
  1.6       +62 -5     jakarta-lucene/src/java/org/apache/lucene/index/FieldsWriter.java
  
  Index: FieldsWriter.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/FieldsWriter.java,v
  retrieving revision 1.5
  retrieving revision 1.6
  diff -u -r1.5 -r1.6
  --- FieldsWriter.java	28 Sep 2004 18:15:52 -0000	1.5
  +++ FieldsWriter.java	30 Sep 2004 12:40:26 -0000	1.6
  @@ -16,8 +16,10 @@
    * the License.
    */
   
  +import java.io.ByteArrayOutputStream;
   import java.io.IOException;
   import java.util.Enumeration;
  +import java.util.zip.Deflater;
   
   import org.apache.lucene.document.Document;
   import org.apache.lucene.document.Field;
  @@ -26,6 +28,10 @@
   
   final class FieldsWriter
   {
  +  static final short FIELD_IS_TOKENIZED = 1;
  +  static final short FIELD_IS_BINARY = 2;
  +  static final short FIELD_IS_COMPRESSED = 4;
  +  
       private FieldInfos fieldInfos;
   
       private IndexOutput fieldsStream;
  @@ -63,21 +69,72 @@
   
                   byte bits = 0;
                   if (field.isTokenized())
  -                    bits |= 1;
  +                    bits |= FieldsWriter.FIELD_IS_TOKENIZED;
                   if (field.isBinary())
  -                    bits |= 2;
  +                    bits |= FieldsWriter.FIELD_IS_BINARY;
  +                if (field.isCompressed())
  +                    bits |= FieldsWriter.FIELD_IS_COMPRESSED;
  +                
                   fieldsStream.writeByte(bits);
  -
  -                if (field.isBinary()) {
  +                
  +                if (field.isCompressed()) {
  +                  // compression is enabled for the current field
  +                  byte[] data = null;
  +                  // check if it is a binary field
  +                  if (field.isBinary()) {
  +                    data = compress(field.binaryValue());
  +                  }
  +                  else {
  +                    data = compress(field.stringValue().getBytes("UTF-8"));
  +                  }
  +                  final int len = data.length;
  +                  fieldsStream.writeVInt(len);
  +                  fieldsStream.writeBytes(data, len);
  +                }
  +                else {
  +                  // compression is disabled for the current field
  +                  if (field.isBinary()) {
                       byte[] data = field.binaryValue();
                       final int len = data.length;
                       fieldsStream.writeVInt(len);
                       fieldsStream.writeBytes(data, len);
  -                } else {
  +                  }
  +                  else {
                       fieldsStream.writeString(field.stringValue());
  +                  }
                   }
               }
           }
       }
   
  +    private final byte[] compress (byte[] input) {
  +
  +      // Create the compressor with highest level of compression
  +      Deflater compressor = new Deflater();
  +      compressor.setLevel(Deflater.BEST_COMPRESSION);
  +
  +      // Give the compressor the data to compress
  +      compressor.setInput(input);
  +      compressor.finish();
  +
  +      /*
  +       * Create an expandable byte array to hold the compressed data.
  +       * You cannot use an array that's the same size as the original because
  +       * there is no guarantee that the compressed data will be smaller than
  +       * the uncompressed data.
  +       */
  +      ByteArrayOutputStream bos = new ByteArrayOutputStream(input.length);
  +
  +      // Compress the data
  +      byte[] buf = new byte[1024];
  +      while (!compressor.finished()) {
  +        int count = compressor.deflate(buf);
  +        bos.write(buf, 0, count);
  +      }
  +      
  +      compressor.end();
  +
  +      // Get the compressed data
  +      return bos.toByteArray();
  +    }
   }
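  
  For reference, the following standalone sketch (plain java.util.zip only, no
  Lucene classes; not part of the patch) exercises the same Deflater/Inflater
  round trip that compress() above and uncompress() in FieldsReader perform:
  
      import java.io.ByteArrayOutputStream;
      import java.util.zip.DataFormatException;
      import java.util.zip.Deflater;
      import java.util.zip.Inflater;
      
      public class ZipRoundTrip {
        public static void main(String[] args) throws DataFormatException {
          byte[] original = "some stored field value".getBytes();
      
          // compress with best compression, as FieldsWriter.compress() does
          Deflater compressor = new Deflater();
          compressor.setLevel(Deflater.BEST_COMPRESSION);
          compressor.setInput(original);
          compressor.finish();
          ByteArrayOutputStream compressed = new ByteArrayOutputStream(original.length);
          byte[] buf = new byte[1024];
          while (!compressor.finished())
            compressed.write(buf, 0, compressor.deflate(buf));
          compressor.end();
      
          // inflate the bytes back, as FieldsReader.uncompress() does
          Inflater decompressor = new Inflater();
          decompressor.setInput(compressed.toByteArray());
          ByteArrayOutputStream restored = new ByteArrayOutputStream(original.length);
          while (!decompressor.finished())
            restored.write(buf, 0, decompressor.inflate(buf));
          decompressor.end();
      
          // prints true: the round trip restores the original bytes
          System.out.println(new String(restored.toByteArray()).equals("some stored field value"));
        }
      }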
  
  
  
  1.24      +62 -27    jakarta-lucene/src/java/org/apache/lucene/document/Field.java
  
  Index: Field.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/document/Field.java,v
  retrieving revision 1.23
  retrieving revision 1.24
  diff -u -r1.23 -r1.24
  --- Field.java	15 Sep 2004 12:50:22 -0000	1.23
  +++ Field.java	30 Sep 2004 12:40:26 -0000	1.24
  @@ -33,15 +33,16 @@
   
   public final class Field implements java.io.Serializable {
     private String name = "body";
  -  private String stringValue = null;
  -  private Reader readerValue = null;
  -  private byte[] binaryValue = null;
  +  
  +  // the one and only data object for all different kind of field values
  +  private Object fieldsData = null;
     
     private boolean storeTermVector = false;
     private boolean isStored = false;
     private boolean isIndexed = true;
     private boolean isTokenized = true;
     private boolean isBinary = false;
  +  private boolean isCompressed = false;
     
     private float boost = 1.0f;
     
  @@ -54,6 +55,10 @@
       public String toString() {
         return name;
       }
  +    /** Store the original field value in the index in a compressed form. This is
  +     * useful for long documents and for binary valued fields.
  +     */
  +    public static final Store COMPRESS = new Store("COMPRESS");
       /** Store the original field value in the index. This is useful for short texts
        * like a document's title which should be displayed with the results. The
        * value is stored in its original form, i.e. no analyzer is used before it is
  @@ -220,18 +225,22 @@
     
     /** The name of the field (e.g., "date", "title", "body", ...)
       as an interned string. */
  -  public String name() 		{ return name; }
  +  public String name()    { return name; }
   
     /** The value of the field as a String, or null.  If null, the Reader value
  -    is used.  Exactly one of stringValue() and readerValue() must be set. */
  -  public String stringValue()		{ return stringValue; }
  +   * or binary value is used.  Exactly one of stringValue(), readerValue(), and
  +   * binaryValue() must be set. */
  +  public String stringValue()   { try { return (String)fieldsData; } catch (ClassCastException ignore) { return null; } }
  +  
     /** The value of the field as a Reader, or null.  If null, the String value
  -    is used.  Exactly one of stringValue() and readerValue() must be set. */
  -  public Reader readerValue()	{ return readerValue; }
  +   * or binary value is  used.  Exactly one of stringValue(), readerValue(),
  +   * and binaryValue() must be set. */
  +  public Reader readerValue()   { try { return (Reader)fieldsData; } catch (ClassCastException ignore) { return null; } }
  +  
     /** The value of the field in Binary, or null.  If null, the Reader or
  -     String value is used.  Exactly one of stringValue(), readerValue() and
  -     binaryValue() must be set. */
  -  public byte[] binaryValue() { return binaryValue; }
  +   * String value is used.  Exactly one of stringValue(), readerValue() and
  +   * binaryValue() must be set. */
  +  public byte[] binaryValue()   { try { return (byte[])fieldsData; } catch (ClassCastException ignore) { return null; } }
     
     /**
      * Create a field by specifying its name, value and how it will
  @@ -277,12 +286,16 @@
       if (index == Index.NO && termVector != TermVector.NO)
         throw new IllegalArgumentException("cannot store term vector information "
            + "for a field that is not indexed");
  -
  +          
       this.name = name.intern();        // field names are interned
  -    this.stringValue = value;
  +    this.fieldsData = value;
   
       if (store == Store.YES)
         this.isStored = true;
  +    else if (store == Store.COMPRESS) {
  +      this.isStored = true;
  +      this.isCompressed = true;
  +    }
       else if (store == Store.NO)
         this.isStored = false;
       else
  @@ -331,7 +344,7 @@
       if (reader == null)
         throw new NullPointerException("reader cannot be null");
       this.name = name.intern();        // field names are interned
  -    this.readerValue = reader;
  +    this.fieldsData = reader;
       this.isStored = false;
       this.isIndexed = true;
       this.isTokenized = true;
  @@ -344,18 +357,31 @@
      * @deprecated use {@link #Field(String, String, Field.Store, Field.Index)} instead
      */
     public Field(String name, String string,
  -	       boolean store, boolean index, boolean token) {
  +         boolean store, boolean index, boolean token) {
       this(name, string, store, index, token, false);
     }
   
  -  public Field(String name, byte[] value) {
  +  
  +  /**
  +   * Create a stored field with binary value. Optionally the value may be compressed.
  +   * 
  +   * @param name The name of the field
  +   * @param value The binary value
  +   * @param store How <code>value</code> should be stored (compressed or not).
  +   */
  +  public Field(String name, byte[] value, Store store) {
       if (name == null)
         throw new IllegalArgumentException("name cannot be null");
       if (value == null)
         throw new IllegalArgumentException("value cannot be null");
  +    if (store == Store.NO)
  +      throw new IllegalArgumentException("binary values can't be unstored");
  +    if (store == Store.COMPRESS)
  +      this.isCompressed = true;
       
       this.name = name.intern();
  -    this.binaryValue = value;
  +    // store the byte[] directly as the field's data
  +    this.fieldsData = value;
       
       this.isBinary    = true;
       this.isStored    = true;
  @@ -377,7 +403,7 @@
      * @deprecated use {@link #Field(String, String, Field.Store, Field.Index, Field.TermVector)} instead
      */ 
     public Field(String name, String string,
  -	       boolean store, boolean index, boolean token, boolean storeTermVector) {
  +         boolean store, boolean index, boolean token, boolean storeTermVector) {
       if (name == null)
         throw new NullPointerException("name cannot be null");
       if (string == null)
  @@ -385,8 +411,8 @@
       if (!index && storeTermVector)
         throw new IllegalArgumentException("cannot store a term vector for fields that are not indexed");
   
  -    this.name = name.intern();			  // field names are interned
  -    this.stringValue = string;
  +    this.name = name.intern();        // field names are interned
  +    this.fieldsData = string;
       this.isStored = store;
       this.isIndexed = index;
       this.isTokenized = token;
  @@ -406,16 +432,19 @@
     /** True iff the value of the field is to be stored in the index for return
       with search hits.  It is an error for this to be true if a field is
       Reader-valued. */
  -  public final boolean	isStored() 	{ return isStored; }
  +  public final boolean  isStored()  { return isStored; }
   
     /** True iff the value of the field is to be indexed, so that it may be
       searched on. */
  -  public final boolean 	isIndexed() 	{ return isIndexed; }
  +  public final boolean  isIndexed()   { return isIndexed; }
   
     /** True iff the value of the field should be tokenized as text prior to
       indexing.  Un-tokenized fields are indexed as a single word and may not be
       Reader-valued. */
  -  public final boolean 	isTokenized() 	{ return isTokenized; }
  +  public final boolean  isTokenized()   { return isTokenized; }
  +  
  +  /** True if the value of the field is stored and compressed within the index */
  +  public final boolean  isCompressed()   { return isCompressed; }
   
     /** True iff the term or terms used to index this field are stored as a term
      *  vector, available from {@link IndexReader#getTermFreqVector(int,String)}.
  @@ -456,14 +485,20 @@
         result.append("binary");
       }
       
  +    if (isCompressed) {
  +      if (result.length() > 0)
  +        result.append(",");
  +      result.append("compressed");
  +    }
  +    
       result.append('<');
       result.append(name);
       result.append(':');
  -    if (readerValue != null) {
  -      result.append(readerValue.toString());
  -    } else {
  -      result.append(stringValue);
  +    
  +    if (fieldsData != null) {
  +      result.append(fieldsData);
       }
  +    
       result.append('>');
       return result.toString();
     }
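  
  A short illustration (hypothetical snippet, not part of the patch) of how the
  single fieldsData object now backs the three accessors: whichever type was set
  is returned, and the other accessors return null via the caught
  ClassCastException.
  
      Field textFld = new Field("title", "some title", Field.Store.YES, Field.Index.TOKENIZED);
      textFld.stringValue();   // "some title"
      textFld.readerValue();   // null -- fieldsData holds a String, so the Reader cast fails
      textFld.binaryValue();   // null
      
      Field binFld = new Field("payload", new byte[] { 1, 2, 3 }, Field.Store.COMPRESS);
      binFld.binaryValue();    // the original byte[]; isStored() and isCompressed() are true
      binFld.stringValue();    // null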
  
  
  
  1.9       +3 -3      jakarta-lucene/src/test/org/apache/lucene/document/TestDocument.java
  
  Index: TestDocument.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/test/org/apache/lucene/document/TestDocument.java,v
  retrieving revision 1.8
  retrieving revision 1.9
  diff -u -r1.8 -r1.9
  --- TestDocument.java	15 Sep 2004 12:50:23 -0000	1.8
  +++ TestDocument.java	30 Sep 2004 12:40:28 -0000	1.9
  @@ -47,8 +47,8 @@
     {
       Document doc = new Document();
       Field stringFld = new Field("string", binaryVal, Field.Store.YES, Field.Index.NO);
  -    Field binaryFld = new Field("binary", binaryVal.getBytes());
  -    Field binaryFld2 = new Field("binary", binaryVal2.getBytes());
  +    Field binaryFld = new Field("binary", binaryVal.getBytes(), Field.Store.YES);
  +    Field binaryFld2 = new Field("binary", binaryVal2.getBytes(), Field.Store.YES);
       
       doc.add(stringFld);
       doc.add(binaryFld);
  
  
  
  1.1                  jakarta-lucene/src/test/org/apache/lucene/document/TestBinaryDocument.java
  
  Index: TestBinaryDocument.java
  ===================================================================
  package org.apache.lucene.document;
  
  import junit.framework.TestCase;
  
  import org.apache.lucene.analysis.standard.StandardAnalyzer;
  import org.apache.lucene.index.IndexReader;
  import org.apache.lucene.index.IndexWriter;
  import org.apache.lucene.store.RAMDirectory;
  
  /**
   * Copyright 2004 The Apache Software Foundation
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  
  /**
   * Tests binary and compressed field values stored in a {@link Document}.
   *
   * @author Bernhard Messer
   * @version $Id: TestBinaryDocument.java,v 1.1 2004/09/30 12:40:28 goller Exp $
   */
  public class TestBinaryDocument extends TestCase
  {
  
    String binaryValStored = "this text will be stored as a byte array in the index";
    String binaryValCompressed = "this text will be also stored and compressed as a byte array in the index";
    
    public void testBinaryFieldInIndex()
      throws Exception
    {
      Field binaryFldStored = new Field("binaryStored", binaryValStored.getBytes(), Field.Store.YES);
      Field binaryFldCompressed = new Field("binaryCompressed", binaryValCompressed.getBytes(), Field.Store.COMPRESS);
      Field stringFldStored = new Field("stringStored", binaryValStored, Field.Store.YES, Field.Index.NO, Field.TermVector.NO);
      Field stringFldCompressed = new Field("stringCompressed", binaryValCompressed, Field.Store.COMPRESS, Field.Index.NO, Field.TermVector.NO);
      
      try {
        // binary fields with store off are not allowed
        new Field("fail", binaryValCompressed.getBytes(), Field.Store.NO);
        fail();
      }
      catch (IllegalArgumentException iae) {
        ;
      }
      
      Document doc = new Document();
      
      doc.add(binaryFldStored);
      doc.add(binaryFldCompressed);
      
      doc.add(stringFldStored);
      doc.add(stringFldCompressed);
      
      /** test for field count */
      assertEquals(4, doc.fields.size());
      
      /** add the doc to a ram index */
      RAMDirectory dir = new RAMDirectory();
      IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true);
      writer.addDocument(doc);
      writer.close();
      
      /** open a reader and fetch the document */ 
      IndexReader reader = IndexReader.open(dir);
      Document docFromReader = reader.document(0);
      assertTrue(docFromReader != null);
      
    /** fetch the binary stored field and compare its content with the original one */
      String binaryFldStoredTest = new String(docFromReader.getBinaryValue("binaryStored"));
      assertTrue(binaryFldStoredTest.equals(binaryValStored));
      
    /** fetch the binary compressed field and compare its content with the original one */
      String binaryFldCompressedTest = new String(docFromReader.getBinaryValue("binaryCompressed"));
      assertTrue(binaryFldCompressedTest.equals(binaryValCompressed));
      
    /** fetch the string field and compare its content with the original one */
      String stringFldStoredTest = new String(docFromReader.get("stringStored"));
      assertTrue(stringFldStoredTest.equals(binaryValStored));
      
    /** fetch the compressed string field and compare its content with the original one */
      String stringFldCompressedTest = new String(docFromReader.get("stringCompressed"));
      assertTrue(stringFldCompressedTest.equals(binaryValCompressed));
      
      /** delete the document from index */
      reader.delete(0);
      assertEquals(0, reader.numDocs());
      
      reader.close();
      
    }
    
  }
  
  
  
