You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by ch...@apache.org on 2015/05/21 12:24:05 UTC

svn commit: r1680806 - in /jackrabbit/oak/trunk: oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/io/ oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/io/ oak-core/src/main/java/org/apache/jackrabbit/oak/util/ oak-lucene/src/main/...

Author: chetanm
Date: Thu May 21 10:24:05 2015
New Revision: 1680806

URL: http://svn.apache.org/r1680806
Log:
OAK-2895 - Avoid accessing binary content if the mimeType is excluded from indexing

-- Use TypeDetector instead of DefaultDetector to avoid Tika sniffing the mimeType by reading the input stream
-- Use a LazyInputStream to lazily load the stream if and when required

Added:
    jackrabbit/oak/trunk/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/io/
    jackrabbit/oak/trunk/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/io/LazyInputStream.java   (with props)
    jackrabbit/oak/trunk/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/io/package-info.java   (with props)
    jackrabbit/oak/trunk/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/io/
    jackrabbit/oak/trunk/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/io/LazyInputStreamTest.java   (with props)
    jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/util/BlobByteSource.java   (with props)
Modified:
    jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java
    jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/plugins/index/lucene/tika-config.xml
    jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java

Added: jackrabbit/oak/trunk/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/io/LazyInputStream.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/io/LazyInputStream.java?rev=1680806&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/io/LazyInputStream.java (added)
+++ jackrabbit/oak/trunk/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/io/LazyInputStream.java Thu May 21 10:24:05 2015
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.commons.io;
+
+import java.io.FilterInputStream;
+import java.io.IOException;
+
+import com.google.common.io.ByteSource;
+import org.apache.commons.io.input.ClosedInputStream;
+
+/**
+ * Input stream that opens its {@link ByteSource} lazily: on the first read, skip, available,
+ * mark, reset or markSupported call. Closing it before that never opens the source at all.
+ */
+public class LazyInputStream extends FilterInputStream {
+    private final ByteSource byteSource;
+    private boolean opened;
+
+    public LazyInputStream(ByteSource byteSource) {
+        super(null);
+        this.byteSource = byteSource;
+    }
+
+    @Override
+    public int read() throws IOException {
+        ensureOpen();
+        return super.read();
+    }
+
+    @Override
+    public int read(byte[] b) throws IOException {
+        ensureOpen();
+        return super.read(b);
+    }
+
+    @Override
+    public int read(byte[] b, int off, int len) throws IOException {
+        ensureOpen();
+        return super.read(b, off, len);
+    }
+
+    @Override
+    public long skip(long n) throws IOException {
+        ensureOpen();
+        return super.skip(n);
+    }
+
+    @Override
+    public int available() throws IOException {
+        ensureOpen();
+        return super.available();
+    }
+
+    @Override
+    public void close() throws IOException {
+        opened = true; // once closed, no later call may open the source
+        if (in == null) {
+            // Never opened: install a closed placeholder instead of leaving a null delegate.
+            in = ClosedInputStream.CLOSED_INPUT_STREAM;
+        } else {
+            super.close();
+        }
+    }
+
+    @Override
+    public synchronized void mark(int readlimit) {
+        ensureOpenWithUnCheckedException();
+        super.mark(readlimit);
+    }
+
+    @Override
+    public synchronized void reset() throws IOException {
+        ensureOpen();
+        super.reset();
+    }
+
+    @Override
+    public boolean markSupported() {
+        ensureOpenWithUnCheckedException();
+        return super.markSupported();
+    }
+
+    private void ensureOpen() throws IOException {
+        if (!opened) {
+            // Flag only after openStream() succeeds: a failed open is retried, not an NPE later.
+            in = byteSource.openStream();
+            opened = true;
+        }
+    }
+
+    private void ensureOpenWithUnCheckedException(){
+        try {
+            ensureOpen();
+        } catch (IOException e) {
+            throw new IllegalStateException(e);
+        }
+    }
+}

Propchange: jackrabbit/oak/trunk/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/io/LazyInputStream.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/oak/trunk/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/io/package-info.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/io/package-info.java?rev=1680806&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/io/package-info.java (added)
+++ jackrabbit/oak/trunk/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/io/package-info.java Thu May 21 10:24:05 2015
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+@Version("1.0")
+@Export(optional = "provide:=true")
+package org.apache.jackrabbit.oak.commons.io;
+
+import aQute.bnd.annotation.Export;
+import aQute.bnd.annotation.Version;
\ No newline at end of file

Propchange: jackrabbit/oak/trunk/oak-commons/src/main/java/org/apache/jackrabbit/oak/commons/io/package-info.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/oak/trunk/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/io/LazyInputStreamTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/io/LazyInputStreamTest.java?rev=1680806&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/io/LazyInputStreamTest.java (added)
+++ jackrabbit/oak/trunk/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/io/LazyInputStreamTest.java Thu May 21 10:24:05 2015
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.commons.io;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import static com.google.common.io.Files.asByteSource;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.fail;
+
+/**
+ * Tests the LazyInputStream class.
+ */
+public class LazyInputStreamTest {
+    private File file;
+
+    @Rule
+    public final TemporaryFolder temporaryFolder = new TemporaryFolder();
+
+    @Test
+    public void test() throws IOException {
+        createFile();
+
+        // test open / close (without reading)
+        LazyInputStream in = new LazyInputStream(asByteSource(file));
+        in.close();
+
+        // test reading too much and closing too much
+        in = new LazyInputStream(asByteSource(file));
+        assertEquals(0, in.read());
+        assertEquals(-1, in.read());
+        assertEquals(-1, in.read());
+        assertEquals(-1, in.read());
+        in.close();
+        in.close();
+        in.close();
+
+        // test markSupported, mark, and reset
+        in = new LazyInputStream(asByteSource(file));
+        assertFalse(in.markSupported());
+        in.mark(1);
+        assertEquals(0, in.read());
+        try {
+            in.reset();
+            fail();
+        } catch (IOException e) {
+            // expected
+        }
+        assertEquals(-1, in.read());
+        in.close();
+
+        // test read(byte[])
+        in = new LazyInputStream(asByteSource(file));
+        byte[] test = new byte[2];
+        assertEquals(1, in.read(test));
+        in.close();
+
+        // test read(byte[],int,int)
+        in = new LazyInputStream(asByteSource(file));
+        assertEquals(1, in.read(test, 0, 2));
+        in.close();
+
+        // test skip
+        in = new LazyInputStream(asByteSource(file));
+        assertEquals(2, in.skip(2));
+        assertEquals(-1, in.read(test));
+        in.close();
+
+        createFile();
+
+        // test reading a freshly created file up to EOF, then closing it
+        in = new LazyInputStream(asByteSource(file));
+        assertEquals(0, in.read());
+        assertEquals(-1, in.read());
+        in.close();
+        file.delete();
+    }
+
+    /** Points {@code file} at a new temporary file that holds a single zero byte. */
+    private void createFile() throws IOException {
+        file = temporaryFolder.newFile();
+        FileOutputStream out = new FileOutputStream(file);
+        try {
+            out.write(new byte[1]);
+        } finally {
+            // close even when the write fails, so the file handle is not leaked
+            out.close();
+        }
+    }
+}

Propchange: jackrabbit/oak/trunk/oak-commons/src/test/java/org/apache/jackrabbit/oak/commons/io/LazyInputStreamTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/util/BlobByteSource.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/util/BlobByteSource.java?rev=1680806&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/util/BlobByteSource.java (added)
+++ jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/util/BlobByteSource.java Thu May 21 10:24:05 2015
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.util;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import com.google.common.io.ByteSource;
+import org.apache.jackrabbit.oak.api.Blob;
+
+/** Adapts an Oak {@link Blob} to Guava's {@link ByteSource}; each openStream() call requests a new blob stream. */
+public final class BlobByteSource extends ByteSource {
+    private final Blob blob;
+    public BlobByteSource(Blob blob) {
+        this.blob = blob;
+    }
+
+    @Override
+    public InputStream openStream() throws IOException {
+        return blob.getNewStream();
+    }
+
+    @Override
+    public long size() throws IOException {
+        return blob.length();
+    }
+
+    @Override
+    public boolean isEmpty() throws IOException {
+        return size() == 0;
+    }
+}

Propchange: jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/util/BlobByteSource.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java?rev=1680806&r1=1680805&r2=1680806&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java Thu May 21 10:24:05 2015
@@ -46,6 +46,7 @@ import org.apache.jackrabbit.oak.api.Pro
 import org.apache.jackrabbit.oak.api.Tree;
 import org.apache.jackrabbit.oak.api.Type;
 import org.apache.jackrabbit.oak.commons.PathUtils;
+import org.apache.jackrabbit.oak.commons.io.LazyInputStream;
 import org.apache.jackrabbit.oak.plugins.index.IndexEditor;
 import org.apache.jackrabbit.oak.plugins.index.IndexUpdateCallback;
 import org.apache.jackrabbit.oak.plugins.index.PathFilter;
@@ -55,6 +56,7 @@ import org.apache.jackrabbit.oak.plugins
 import org.apache.jackrabbit.oak.spi.commit.Editor;
 import org.apache.jackrabbit.oak.spi.state.NodeBuilder;
 import org.apache.jackrabbit.oak.spi.state.NodeState;
+import org.apache.jackrabbit.oak.util.BlobByteSource;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.DoubleDocValuesField;
 import org.apache.lucene.document.DoubleField;
@@ -803,7 +805,7 @@ public class LuceneIndexEditor implement
         long start = System.currentTimeMillis();
         long size = 0;
         try {
-            CountingInputStream stream = new CountingInputStream(v.getNewStream());
+            CountingInputStream stream = new CountingInputStream(new LazyInputStream(new BlobByteSource(v)));
             try {
                 context.getParser().parse(stream, handler, metadata, new ParseContext());
             } finally {

Modified: jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/plugins/index/lucene/tika-config.xml
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/plugins/index/lucene/tika-config.xml?rev=1680806&r1=1680805&r2=1680806&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/plugins/index/lucene/tika-config.xml (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/plugins/index/lucene/tika-config.xml Thu May 21 10:24:05 2015
@@ -21,7 +21,7 @@
 
 <properties>
   <detectors>
-    <detector class="org.apache.tika.detect.DefaultDetector"/>
+    <detector class="org.apache.tika.detect.TypeDetector"/>
   </detectors>
   <parsers>
     <parser class="org.apache.tika.parser.DefaultParser"/>

Modified: jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java?rev=1680806&r1=1680805&r2=1680806&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java Thu May 21 10:24:05 2015
@@ -19,6 +19,7 @@
 
 package org.apache.jackrabbit.oak.plugins.index.lucene;
 
+import java.io.InputStream;
 import java.text.ParseException;
 import java.util.Calendar;
 import java.util.Collections;
@@ -26,16 +27,20 @@ import java.util.List;
 import java.util.Random;
 import java.util.Set;
 
+import javax.annotation.Nonnull;
 import javax.jcr.PropertyType;
 
+import com.google.common.base.Charsets;
 import com.google.common.collect.ComparisonChain;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
+import com.google.common.io.CountingInputStream;
 import org.apache.commons.io.IOUtils;
 import org.apache.jackrabbit.JcrConstants;
 import org.apache.jackrabbit.oak.Oak;
+import org.apache.jackrabbit.oak.api.Blob;
 import org.apache.jackrabbit.oak.api.CommitFailedException;
 import org.apache.jackrabbit.oak.api.ContentRepository;
 import org.apache.jackrabbit.oak.api.PropertyValue;
@@ -45,6 +50,7 @@ import org.apache.jackrabbit.oak.api.Typ
 import org.apache.jackrabbit.oak.plugins.index.IndexConstants;
 import org.apache.jackrabbit.oak.plugins.index.nodetype.NodeTypeIndexProvider;
 import org.apache.jackrabbit.oak.plugins.index.property.PropertyIndexEditorProvider;
+import org.apache.jackrabbit.oak.plugins.memory.ArrayBasedBlob;
 import org.apache.jackrabbit.oak.plugins.memory.PropertyStates;
 import org.apache.jackrabbit.oak.plugins.nodetype.write.InitialContent;
 import org.apache.jackrabbit.oak.plugins.nodetype.write.NodeTypeRegistry;
@@ -79,6 +85,7 @@ import static org.apache.jackrabbit.oak.
 import static org.apache.jackrabbit.oak.plugins.memory.PropertyStates.createProperty;
 import static org.hamcrest.CoreMatchers.not;
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertThat;
 import static org.junit.matchers.JUnitMatchers.containsString;
 
@@ -1208,6 +1215,22 @@ public class LucenePropertyIndexTest ext
     }
 
     @Test
+    public void excludedBlobContentNotAccessed() throws Exception {
+        TestUtil.useV2(createFulltextIndex(root.getTree("/"), "test"));
+
+        String xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><msg>sky is blue</msg>";
+        AccessStateProvidingBlob blob = new AccessStateProvidingBlob(xml);
+
+        Tree parent = root.getTree("/").addChild("test");
+        createFileNode(parent, "zip", blob, "application/zip");
+        root.commit();
+
+        // After the commit the blob must not have handed out a stream, nor had bytes read.
+        assertFalse(blob.isStreamAccessed());
+        assertEquals(0, blob.readByteCount());
+    }
+
+    @Test
     public void maxFieldLengthCheck() throws Exception{
         Tree idx = createFulltextIndex(root.getTree("/"), "test");
         TestUtil.useV2(idx);
@@ -1312,8 +1335,12 @@ public class LucenePropertyIndexTest ext
     }
 
     private Tree createFileNode(Tree tree, String name, String content, String mimeType){
+        return createFileNode(tree, name, new ArrayBasedBlob(content.getBytes()), mimeType);
+    }
+
+    private Tree createFileNode(Tree tree, String name, Blob content, String mimeType){
         Tree jcrContent = tree.addChild(name).addChild(JCR_CONTENT);
-        jcrContent.setProperty(JcrConstants.JCR_DATA, content.getBytes());
+        jcrContent.setProperty(JcrConstants.JCR_DATA, content);
         jcrContent.setProperty(JcrConstants.JCR_MIMETYPE, mimeType);
         return jcrContent;
     }
@@ -1483,4 +1510,38 @@ public class LucenePropertyIndexTest ext
                     '}';
         }
     }
+
+    /** Blob for tests that remembers the last stream it handed out and counts the bytes read from it. */
+    private static class AccessStateProvidingBlob extends ArrayBasedBlob {
+        private CountingInputStream lastStream;
+
+        public AccessStateProvidingBlob(byte[] value) {
+            super(value);
+        }
+
+        public AccessStateProvidingBlob(String content) {
+            this(content.getBytes(Charsets.UTF_8));
+        }
+
+        @Nonnull
+        @Override
+        public InputStream getNewStream() {
+            // Each call replaces the tracked stream, so only the newest one is counted.
+            lastStream = new CountingInputStream(super.getNewStream());
+            return lastStream;
+        }
+
+        public boolean isStreamAccessed() {
+            return lastStream != null;
+        }
+
+        public void resetState(){
+            lastStream = null;
+        }
+
+        /** Bytes read so far from the tracked stream, or 0 when no stream is tracked. */
+        public long readByteCount(){
+            return lastStream == null ? 0 : lastStream.getCount();
+        }
+    }
 }