You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by ch...@apache.org on 2015/07/10 13:46:04 UTC

svn commit: r1690249 - in /jackrabbit/oak/trunk/oak-run: ./ src/main/assembly/ src/main/java/org/apache/jackrabbit/oak/plugins/tika/ src/main/java/org/apache/jackrabbit/oak/run/ src/main/resources/ src/test/java/org/apache/jackrabbit/oak/plugins/ src/t...

Author: chetanm
Date: Fri Jul 10 11:46:03 2015
New Revision: 1690249

URL: http://svn.apache.org/r1690249
Log:
OAK-2953 - Implement text extractor as part of oak-run

-- Add Tika dependency to oak-run
-- Ensure that the various parser-related dependencies do not get pulled in while building oak-run; changed the assembly config for that
-- Exposed a new 'tika' command

Added:
    jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/
    jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResource.java   (with props)
    jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResourceProvider.java   (with props)
    jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryStats.java   (with props)
    jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BlobStoreByteSource.java   (with props)
    jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProvider.java   (with props)
    jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java   (with props)
    jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractor.java   (with props)
    jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java   (with props)
    jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelper.java   (with props)
    jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/
    jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/
    jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/BinarySourceMapper.java   (with props)
    jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProviderTest.java   (with props)
    jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java   (with props)
    jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorTest.java   (with props)
    jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelperTest.java   (with props)
Modified:
    jackrabbit/oak/trunk/oak-run/pom.xml
    jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run-jr2.xml
    jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run.xml
    jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/run/Main.java
    jackrabbit/oak/trunk/oak-run/src/main/resources/logback.xml

Modified: jackrabbit/oak/trunk/oak-run/pom.xml
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/pom.xml?rev=1690249&r1=1690248&r2=1690249&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-run/pom.xml (original)
+++ jackrabbit/oak/trunk/oak-run/pom.xml Fri Jul 10 11:46:03 2015
@@ -362,6 +362,22 @@
       <scope>compile</scope>
     </dependency>
 
+    <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika-parsers</artifactId>
+      <version>1.5</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>1.5</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-csv</artifactId>
+      <version>1.1</version>
+    </dependency>
+
     <!-- Findbugs annotations -->
     <dependency>
       <groupId>com.google.code.findbugs</groupId>

Modified: jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run-jr2.xml
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run-jr2.xml?rev=1690249&r1=1690248&r2=1690249&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run-jr2.xml (original)
+++ jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run-jr2.xml Fri Jul 10 11:46:03 2015
@@ -38,11 +38,13 @@
       <excludes>
         <exclude>org.apache.jackrabbit:oak-lucene</exclude>
         <exclude>org.apache.lucene</exclude>
+        <exclude>org.apache.tika</exclude>
       </excludes>
       <useStrictFiltering>true</useStrictFiltering>
       <useProjectArtifact>true</useProjectArtifact>
       <unpack>true</unpack>
       <useTransitiveDependencies>true</useTransitiveDependencies>
+      <useTransitiveFiltering>true</useTransitiveFiltering>
       <unpackOptions>
         <excludes>
           <exclude>META-INF/*.SF</exclude>

Modified: jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run.xml
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run.xml?rev=1690249&r1=1690248&r2=1690249&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run.xml (original)
+++ jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run.xml Fri Jul 10 11:46:03 2015
@@ -33,11 +33,14 @@
         <exclude>org.apache.jackrabbit:jackrabbit-core</exclude>
         <exclude>org.apache.lucene</exclude>
         <exclude>org.apache.derby</exclude>
+        <exclude>org.apache.tika:tika-core:*</exclude>
+        <exclude>org.apache.tika:tika-parsers:*</exclude>
       </excludes>
       <useStrictFiltering>true</useStrictFiltering>
       <useProjectArtifact>true</useProjectArtifact>
       <unpack>true</unpack>
       <useTransitiveDependencies>true</useTransitiveDependencies>
+      <useTransitiveFiltering>true</useTransitiveFiltering>
       <unpackOptions>
         <excludes>
           <exclude>META-INF/*.SF</exclude>
@@ -51,5 +54,24 @@
         </excludes>
       </unpackOptions>
     </dependencySet>
+    <!-- Exclude the transitive dependencies, as tika-parsers depends
+      on many other jars. Instead, users can include tika-app.jar in the classpath -->
+    <dependencySet>
+      <outputDirectory>/</outputDirectory>
+      <includes>
+        <include>org.apache.tika:tika-core</include>
+        <include>org.apache.tika:tika-parsers</include>
+      </includes>
+      <useStrictFiltering>true</useStrictFiltering>
+      <useTransitiveDependencies>false</useTransitiveDependencies>
+      <unpack>true</unpack>
+      <unpackOptions>
+        <excludes>
+          <exclude>META-INF/*.SF</exclude>
+          <exclude>META-INF/*.DSA</exclude>
+          <exclude>META-INF/*.RSA</exclude>
+        </excludes>
+      </unpackOptions>
+    </dependencySet>
   </dependencySets>
 </assembly>

Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResource.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResource.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResource.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResource.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import javax.annotation.CheckForNull;
+import javax.annotation.Nullable;
+
+import com.google.common.io.ByteSource;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+
+/**
+ * Immutable description of a single binary: the source of its bytes together
+ * with the mime type, encoding, path and blob id it is associated with.
+ */
+class BinaryResource {
+    private final String path;
+    private final String blobId;
+    private final String mimeType;
+    private final String encoding;
+    private final ByteSource byteSource;
+
+    /**
+     * @param byteSource source of the binary content, must not be null
+     * @param mimeType   mime type of the binary, may be null
+     * @param encoding   encoding of the binary, may be null
+     * @param path       path of the binary, must not be null
+     * @param blobId     id of the blob, must not be null
+     */
+    public BinaryResource(ByteSource byteSource,
+                          @Nullable String mimeType,
+                          @Nullable String encoding,
+                          String path,
+                          String blobId) {
+        // Mandatory arguments are verified first: byteSource, then path, then blobId
+        checkNotNull(byteSource, "ByteSource must be provided");
+        checkNotNull(path, "Path must be provided");
+        checkNotNull(blobId, "BlobId must be specified");
+
+        this.byteSource = byteSource;
+        this.path = path;
+        this.blobId = blobId;
+
+        // Optional metadata is stored as is
+        this.mimeType = mimeType;
+        this.encoding = encoding;
+    }
+
+    /** @return the path of this binary, never null */
+    public String getPath() {
+        return path;
+    }
+
+    /** @return the blob id of this binary, never null */
+    public String getBlobId() {
+        return blobId;
+    }
+
+    /** @return the source of the binary content, never null */
+    public ByteSource getByteSource() {
+        return byteSource;
+    }
+
+    /** @return the mime type, or null if none was provided */
+    @CheckForNull
+    public String getMimeType() {
+        return mimeType;
+    }
+
+    /** @return the encoding, or null if none was provided */
+    @CheckForNull
+    public String getEncoding() {
+        return encoding;
+    }
+
+    /** The string form of a binary resource is simply its path. */
+    @Override
+    public String toString() {
+        return path;
+    }
+}

Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResource.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResourceProvider.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResourceProvider.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResourceProvider.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResourceProvider.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import java.io.IOException;
+
+import com.google.common.collect.FluentIterable;
+
+/**
+ * Source of {@link BinaryResource} instances: provides an iterable over the
+ * binaries present under a given path.
+ */
+interface BinaryResourceProvider {
+
+    /**
+     * Returns the binaries present under the given path.
+     *
+     * @param path path under which the binaries are looked up
+     * @return iterable over the binaries found for that path
+     * @throws IOException declared for implementations; presumably raised when
+     *         the backing data cannot be read (confirm against implementations)
+     */
+    FluentIterable<BinaryResource> getBinaries(String path) throws IOException;
+}

Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResourceProvider.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryStats.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryStats.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryStats.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryStats.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.base.Strings;
+import com.google.common.collect.ComparisonChain;
+import com.google.common.collect.Maps;
+import org.codehaus.groovy.runtime.StringGroovyMethods;
+
+import static org.apache.jackrabbit.oak.commons.IOUtils.humanReadableByteCount;
+
+/**
+ * Aggregates size and count statistics per mime type for the binaries exposed
+ * by a {@link BinaryResourceProvider} and renders them as a plain text report.
+ */
+class BinaryStats {
+    private final TikaHelper tika;
+    private final List<MimeTypeStats> stats;
+    private long totalSize;
+    private long totalCount;
+    private long indexedSize;
+    private long indexedCount;
+
+    /**
+     * Creates the Tika helper from the given config and immediately collects
+     * the statistics for all binaries the provider returns for "/".
+     */
+    public BinaryStats(File tikaConfig, BinaryResourceProvider provider) throws IOException {
+        // tika has to be initialised first as collectStats relies on it
+        this.tika = new TikaHelper(tikaConfig);
+        this.stats = collectStats(provider);
+    }
+
+    /** @return summed size of all binaries which have a mime type */
+    public long getTotalSize() {
+        return totalSize;
+    }
+
+    /** @return number of binaries which have a mime type */
+    public long getTotalCount() {
+        return totalCount;
+    }
+
+    /** @return summed size of the binaries whose mime type is indexed */
+    public long getIndexedSize() {
+        return indexedSize;
+    }
+
+    /** @return number of binaries whose mime type is indexed */
+    public long getIndexedCount() {
+        return indexedCount;
+    }
+
+    /** @return textual report covering the totals and one row per mime type */
+    public String getSummary() throws IOException {
+        return getSummary(stats);
+    }
+
+    /**
+     * Walks over all binaries under "/" and groups them by mime type while
+     * updating the overall totals. The result is sorted in reverse natural
+     * order of {@link MimeTypeStats}.
+     */
+    private List<MimeTypeStats> collectStats(BinaryResourceProvider provider) throws IOException {
+        Map<String, MimeTypeStats> statsByType = Maps.newHashMap();
+        for (BinaryResource resource : provider.getBinaries("/")) {
+            String type = resource.getMimeType();
+            if (type == null) {
+                // Binaries without a mime type are not accounted for at all
+                continue;
+            }
+
+            MimeTypeStats entry = statsByType.get(type);
+            if (entry == null) {
+                entry = createStat(type);
+                statsByType.put(type, entry);
+            }
+
+            long length = resource.getByteSource().size();
+            entry.addSize(length);
+            totalSize += length;
+            totalCount++;
+
+            if (entry.isIndexed()) {
+                indexedSize += length;
+                indexedCount++;
+            }
+        }
+
+        List<MimeTypeStats> sorted = new ArrayList<MimeTypeStats>(statsByType.values());
+        Collections.sort(sorted, Collections.reverseOrder());
+        return sorted;
+    }
+
+    /**
+     * Renders the totals followed by a table with one row per mime type.
+     */
+    private String getSummary(List<MimeTypeStats> stats) {
+        // Width of the type column: longest mime type name plus 5 chars of padding
+        int typeWidth = 5;
+        for (MimeTypeStats entry : stats) {
+            typeWidth = Math.max(typeWidth, entry.getName().length() + 5);
+        }
+
+        StringWriter report = new StringWriter();
+        PrintWriter out = new PrintWriter(report);
+
+        out.println("MimeType Stats");
+        out.printf("\tTotal size          : %s%n", humanReadableByteCount(totalSize));
+        out.printf("\tTotal indexed size  : %s%n", humanReadableByteCount(indexedSize));
+        out.printf("\tTotal count         : %d%n", totalCount);
+        out.printf("\tTotal indexed count : %d%n", indexedCount);
+        out.println();
+
+        // Header: the type column followed by four fixed width columns
+        StringBuilder headerBuilder = new StringBuilder(center("Type", typeWidth));
+        String[] fixedColumns = {"Indexed", "Supported", "Count", "Size"};
+        for (String column : fixedColumns) {
+            headerBuilder.append(" ").append(center(column, 10));
+        }
+        String header = headerBuilder.toString();
+
+        out.println(header);
+        out.println(Strings.repeat("_", header.length() + 5));
+
+        String rowFormat = "%-" + typeWidth + "s|%10s|%10s|  %-8d|%10s%n";
+        for (MimeTypeStats entry : stats) {
+            out.printf(rowFormat,
+                    entry.getName(),
+                    entry.isIndexed(),
+                    entry.isSupported(),
+                    entry.getCount(),
+                    humanReadableByteCount(entry.getTotalSize()));
+        }
+        return report.toString();
+    }
+
+    /**
+     * Creates an empty stats entry for the mime type with the indexed and
+     * supported flags as reported by the Tika helper.
+     */
+    private MimeTypeStats createStat(String mimeType) {
+        MimeTypeStats created = new MimeTypeStats(mimeType);
+        created.setIndexed(tika.isIndexed(mimeType));
+        created.setSupported(tika.isSupportedMediaType(mimeType));
+        return created;
+    }
+
+    /** Centers the text within the given width (delegates to Groovy's string helper). */
+    private static String center(String s, int width) {
+        return StringGroovyMethods.center(s, width);
+    }
+
+    /**
+     * Mutable accumulator for a single mime type. Natural ordering puts non
+     * indexed types before indexed ones and then orders by total size.
+     */
+    private static class MimeTypeStats implements Comparable<MimeTypeStats> {
+        private final String mimeType;
+        private boolean indexed;
+        private boolean supported;
+        private int count;
+        private long totalSize;
+
+        public MimeTypeStats(String mimeType) {
+            this.mimeType = mimeType;
+        }
+
+        /** Accounts for one more binary of the given size. */
+        public void addSize(long size) {
+            totalSize += size;
+            count++;
+        }
+
+        public void setIndexed(boolean indexed) {
+            this.indexed = indexed;
+        }
+
+        public void setSupported(boolean supported) {
+            this.supported = supported;
+        }
+
+        public String getName() {
+            return mimeType;
+        }
+
+        public int getCount() {
+            return count;
+        }
+
+        public long getTotalSize() {
+            return totalSize;
+        }
+
+        public boolean isIndexed() {
+            return indexed;
+        }
+
+        public boolean isSupported() {
+            return supported;
+        }
+
+        @Override
+        public int compareTo(MimeTypeStats other) {
+            ComparisonChain chain = ComparisonChain.start();
+            chain = chain.compareFalseFirst(indexed, other.indexed);
+            chain = chain.compare(totalSize, other.totalSize);
+            return chain.result();
+        }
+    }
+}

Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryStats.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BlobStoreByteSource.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BlobStoreByteSource.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BlobStoreByteSource.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BlobStoreByteSource.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import javax.annotation.Nullable;
+
+import com.google.common.io.ByteSource;
+import org.apache.jackrabbit.oak.spi.blob.BlobStore;
+
+/**
+ * {@link ByteSource} which reads the content of a blob directly from a
+ * {@link BlobStore} using the blob id.
+ *
+ * <p>Avoiding use of BlobByteSource to avoid concurrent access to NodeState
+ */
+class BlobStoreByteSource extends ByteSource {
+    private final BlobStore blobStore;
+    private final String blobId;
+    /** Size known upfront, or null if it has to be looked up in the blob store */
+    private final Long size;
+
+    /** Creates a source whose size is determined lazily from the blob store. */
+    BlobStoreByteSource(BlobStore blobStore, String blobId) {
+        this(blobStore, blobId, null);
+    }
+
+    /**
+     * @param blobStore store the blob is read from
+     * @param blobId    id of the blob within the store
+     * @param size      size of the blob if already known, otherwise null
+     */
+    BlobStoreByteSource(BlobStore blobStore, String blobId, @Nullable Long size) {
+        this.blobStore = blobStore;
+        this.blobId = blobId;
+        this.size = size;
+    }
+
+    /** Opens a fresh stream over the blob content. */
+    @Override
+    public InputStream openStream() throws IOException {
+        return blobStore.getInputStream(blobId);
+    }
+
+    /**
+     * Returns the size passed at construction time if there was one, else
+     * asks the blob store for the length of the blob.
+     */
+    @Override
+    public long size() throws IOException {
+        return size != null ? size.longValue() : blobStore.getBlobLength(blobId);
+    }
+}

Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BlobStoreByteSource.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProvider.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProvider.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProvider.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProvider.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import java.io.Closeable;
+import java.io.File;
+import java.io.IOException;
+
+import javax.annotation.Nullable;
+
+import com.google.common.base.Charsets;
+import com.google.common.base.Function;
+import com.google.common.base.Predicate;
+import com.google.common.collect.FluentIterable;
+import com.google.common.io.Closer;
+import com.google.common.primitives.Longs;
+import org.apache.commons.csv.CSVFormat;
+import org.apache.commons.csv.CSVParser;
+import org.apache.commons.csv.CSVRecord;
+import org.apache.jackrabbit.oak.commons.PathUtils;
+import org.apache.jackrabbit.oak.spi.blob.BlobStore;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Predicates.notNull;
+import static org.apache.jackrabbit.JcrConstants.JCR_ENCODING;
+import static org.apache.jackrabbit.JcrConstants.JCR_MIMETYPE;
+import static org.apache.jackrabbit.JcrConstants.JCR_PATH;
+
+/**
+ * {@link BinaryResourceProvider} backed by a CSV file in which each record
+ * describes one binary via the columns blobId, length, mime type, encoding
+ * and path. Parsers opened by {@link #getBinaries(String)} are closed when
+ * this provider is closed.
+ */
+class CSVFileBinaryResourceProvider implements BinaryResourceProvider, Closeable {
+    private static final String BLOB_ID = "blobId";
+    private static final String LENGTH = "length";
+    static final CSVFormat FORMAT = CSVFormat.DEFAULT
+            .withCommentMarker('#')
+            .withHeader(BLOB_ID, LENGTH, JCR_MIMETYPE, JCR_ENCODING, JCR_PATH)
+            .withNullString("") //Empty string are considered as null
+            .withIgnoreSurroundingSpaces()
+            .withSkipHeaderRecord();
+
+    private final Logger log = LoggerFactory.getLogger(getClass());
+    private final Closer closer = Closer.create();
+    private final File dataFile;
+    private final BlobStore blobStore;
+
+    /**
+     * @param dataFile  CSV file to read the records from, must exist
+     * @param blobStore store used to back the byte sources, may be null
+     */
+    public CSVFileBinaryResourceProvider(File dataFile, @Nullable BlobStore blobStore) {
+        checkArgument(dataFile.exists(), "Data file %s does not exist", dataFile);
+        this.dataFile = dataFile;
+        this.blobStore = blobStore;
+    }
+
+    /**
+     * Parses the data file and returns the valid records whose path is a
+     * descendant of the given path. The parser is registered for closing.
+     */
+    @Override
+    public FluentIterable<BinaryResource> getBinaries(String path) throws IOException {
+        CSVParser records = closer.register(CSVParser.parse(dataFile, Charsets.UTF_8, FORMAT));
+        FluentIterable<BinaryResource> validResources = FluentIterable.from(records)
+                .transform(new RecordTransformer())
+                .filter(notNull());
+        return validResources.filter(new DescendantOf(path));
+    }
+
+    /** Closes every parser opened so far. */
+    @Override
+    public void close() throws IOException {
+        closer.close();
+    }
+
+    /** Accepts only those resources whose path has the given path as ancestor. */
+    private static class DescendantOf implements Predicate<BinaryResource> {
+        private final String ancestor;
+
+        DescendantOf(String ancestor) {
+            this.ancestor = ancestor;
+        }
+
+        @Override
+        public boolean apply(BinaryResource input) {
+            return PathUtils.isAncestor(ancestor, input.getPath());
+        }
+    }
+
+    /**
+     * Converts a CSV record to a {@link BinaryResource}; records lacking
+     * path, blobId or mime type are logged and mapped to null.
+     */
+    private class RecordTransformer implements Function<CSVRecord, BinaryResource> {
+
+        @Nullable
+        @Override
+        public BinaryResource apply(CSVRecord input) {
+            String path = input.get(JCR_PATH);
+            String mimeType = input.get(JCR_MIMETYPE);
+            String encoding = input.get(JCR_ENCODING);
+            String blobId = input.get(BLOB_ID);
+            String length = input.get(LENGTH);
+
+            boolean valid = path != null && blobId != null && mimeType != null;
+            if (!valid) {
+                log.warn("Ignoring invalid record {}. Either of mimeType, blobId or path is null", input);
+                return null;
+            }
+
+            // Length is optional, a missing or unparseable value results in null
+            Long knownLength = null;
+            if (length != null) {
+                knownLength = Longs.tryParse(length);
+            }
+
+            BlobStoreByteSource source = new BlobStoreByteSource(blobStore, blobId, knownLength);
+            return new BinaryResource(source, mimeType, encoding, path, blobId);
+        }
+    }
+
+}

Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProvider.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import javax.annotation.CheckForNull;
+import javax.annotation.Nullable;
+
+import com.google.common.base.Function;
+import com.google.common.collect.FluentIterable;
+import com.google.common.collect.TreeTraverser;
+import org.apache.jackrabbit.JcrConstants;
+import org.apache.jackrabbit.oak.api.Blob;
+import org.apache.jackrabbit.oak.api.PropertyState;
+import org.apache.jackrabbit.oak.api.Tree;
+import org.apache.jackrabbit.oak.api.Type;
+import org.apache.jackrabbit.oak.spi.blob.BlobStore;
+import org.apache.jackrabbit.oak.spi.state.NodeStore;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static com.google.common.base.Predicates.notNull;
+import static org.apache.jackrabbit.oak.plugins.tree.TreeFactory.createReadOnlyTree;
+import static org.apache.jackrabbit.oak.spi.state.NodeStateUtils.getNode;
+
+/**
+ * {@link BinaryResourceProvider} which discovers binaries by traversing the
+ * content of a {@link NodeStore}. The byte sources of the returned resources
+ * read from the given {@link BlobStore}.
+ */
+class NodeStoreBinaryResourceProvider implements BinaryResourceProvider {
+    private static final Logger log = LoggerFactory.getLogger(NodeStoreBinaryResourceProvider.class);
+    private final NodeStore nodeStore;
+    private final BlobStore blobStore;
+
+    /**
+     * @param nodeStore store whose root is traversed to find binaries
+     * @param blobStore store used to read the content of the found blobs
+     */
+    public NodeStoreBinaryResourceProvider(NodeStore nodeStore, BlobStore blobStore) {
+        this.nodeStore = nodeStore;
+        this.blobStore = blobStore;
+    }
+
+    /**
+     * Traverses the tree rooted at the given path in pre-order and returns a
+     * {@link BinaryResource} for every tree having a usable jcr:data property.
+     *
+     * @param path path of the tree from which the traversal starts
+     * @return iterable over the binaries found, evaluated on iteration
+     */
+    @Override
+    public FluentIterable<BinaryResource> getBinaries(String path) {
+        return new OakTreeTraverser()
+                .preOrderTraversal(createReadOnlyTree(getNode(nodeStore.getRoot(), path)))
+                .transform(new TreeToBinarySource())
+                .filter(notNull());
+    }
+
+    /**
+     * Maps a tree to a {@link BinaryResource} based on its jcr:data property.
+     * Returns null if the property is missing, multi valued or if its blob
+     * has no content identity.
+     */
+    private class TreeToBinarySource implements Function<Tree, BinaryResource> {
+        @Nullable
+        @Override
+        public BinaryResource apply(Tree tree) {
+            PropertyState data = tree.getProperty(JcrConstants.JCR_DATA);
+            if (data == null) {
+                return null;
+            }
+
+            if (data.isArray()) {
+                log.debug("Ignoring jcr:data property at {} as its a MVP", tree.getPath());
+                return null;
+            }
+
+            Blob blob = data.getValue(Type.BINARY);
+            String blobId = blob.getContentIdentity();
+            if (blobId == null) {
+                // Without a content identity the blob cannot be looked up in the blob store
+                log.debug("Ignoring jcr:data property at {} as its an inlined blob", tree.getPath());
+                return null;
+            }
+
+            // Mime type and encoding are optional and read from the same tree
+            String mimeType = getString(tree, JcrConstants.JCR_MIMETYPE);
+            String encoding = getString(tree, JcrConstants.JCR_ENCODING);
+
+            return new BinaryResource(new BlobStoreByteSource(blobStore, blobId), mimeType,
+                    encoding, tree.getPath(), blobId);
+        }
+    }
+
+    /** Traverser exposing the children of a {@link Tree}. */
+    private static class OakTreeTraverser extends TreeTraverser<Tree> {
+        @Override
+        public Iterable<Tree> children(Tree root) {
+            return root.getChildren();
+        }
+    }
+
+    /**
+     * Reads the named property of the tree as a string.
+     *
+     * @return the string value or null if the tree has no such property
+     */
+    @CheckForNull
+    private static String getString(Tree tree, String name) {
+        PropertyState prop = tree.getProperty(name);
+        return prop != null ? prop.getValue(Type.STRING) : null;
+    }
+}

Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractor.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractor.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractor.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,300 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import java.io.Closeable;
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+
+import com.google.common.io.ByteSource;
+import com.google.common.io.CountingInputStream;
+import org.apache.jackrabbit.oak.commons.IOUtils;
+import org.apache.jackrabbit.oak.commons.io.LazyInputStream;
+import org.apache.jackrabbit.oak.plugins.blob.datastore.TextWriter;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+class TextExtractor implements Closeable {
+    private static final Logger log = LoggerFactory.getLogger(TextExtractor.class);
+    // Dedicated logger for parse failures; they are reported at debug level by parseStringValue()
+    private static final Logger parserError = LoggerFactory.getLogger("org.apache.jackrabbit.oak.plugins.tika.ParserError");
+    // A progress line is logged each time the processed count is a multiple of this value
+    private static final int PROGRESS_BATCH_SIZE = 1000;
+    // Default write limit handed to the Tika WriteOutContentHandler
+    private static final int MAX_EXTRACT_LENGTH = 100000;
+    // Sentinel returned by parseStringValue() when parsing failed
+    private static final String ERROR_TEXT = "TextExtractionError";
+
+    // Destination for extracted text and for the error/empty markers
+    private final TextWriter textWriter;
+
+    // Sentinel queue entry; workers compare against it by identity and re-queue it before exiting
+    private final WorkItem SHUTDOWN_SIGNAL = new WorkItem(null);
+    // Created lazily by initialize(); null until extract() is called for the first time
+    private BlockingQueue<WorkItem> inputQueue;
+    // Created lazily by initialize(); null until extract() is called for the first time
+    private ExecutorService executorService;
+    // Number of worker threads (and Extractor tasks) started by initializeExecutorService()
+    private int threadPoolSize = Runtime.getRuntime().availableProcessors();
+    // Capacity of the bounded input queue
+    private int queueSize = 100;
+
+    // Counters below are updated by the worker threads and reported by dumpStats()
+    private final AtomicInteger errorCount = new AtomicInteger();
+    // Accumulated parse time in milliseconds
+    private final AtomicLong timeTaken = new AtomicLong();
+    private final AtomicInteger extractionCount = new AtomicInteger();
+    private final AtomicInteger textWrittenCount = new AtomicInteger();
+    private final AtomicInteger parserErrorCount = new AtomicInteger();
+    private final AtomicInteger processedCount = new AtomicInteger();
+    private final AtomicInteger emptyCount = new AtomicInteger();
+    private final AtomicInteger notSupportedCount = new AtomicInteger();
+    private final AtomicInteger alreadyExtractedCount = new AtomicInteger();
+    // Length (in chars) of all extracted text, before trimming
+    private final AtomicLong extractedTextSize = new AtomicLong();
+    // Length (in chars) of the trimmed, non empty text handed to the writer
+    private final AtomicLong nonEmptyExtractedTextSize = new AtomicLong();
+    // Number of bytes read from the binaries as counted by the CountingInputStream
+    private final AtomicLong totalSizeRead = new AtomicLong();
+
+    private int maxExtractedLength = MAX_EXTRACT_LENGTH;
+    // Optional external Tika config; passed as is (possibly null) to TikaHelper
+    private File tikaConfig;
+    private TikaHelper tika;
+    // Set once initialize() has completed
+    private boolean initialized;
+    // Optional; when set dumpProgress() adds percentage details to the progress line
+    private BinaryStats stats;
+    // Set once close() has completed so that repeated close() calls are no-ops
+    private boolean closed;
+
+    /**
+     * Creates an extractor which hands extracted text to the given writer.
+     * No threads are started here; that happens on the first call to
+     * {@link #extract(Iterable)}.
+     *
+     * @param textWriter writer receiving the extracted text and the error/empty markers
+     */
+    public TextExtractor(TextWriter textWriter) {
+        this.textWriter = textWriter;
+    }
+
+    public void extract(Iterable<BinaryResource> binaries) throws InterruptedException, IOException {
+        initialize();
+        for (BinaryResource binary : binaries) {
+            inputQueue.put(new WorkItem(binary));
+        }
+    }
+
+    /**
+     * Asks the workers to stop after the already queued items, waits for them
+     * to terminate and logs the collected statistics. Calling it again after
+     * it completed is a no-op.
+     *
+     * <p>The queue and the worker pool are created lazily by the first
+     * {@link #extract(Iterable)} call, so both may still be {@code null} here,
+     * e.g. when the extractor was registered for closing but an earlier
+     * failure prevented {@code extract} from ever being invoked.
+     */
+    @Override
+    public void close() {
+        if (closed) {
+            return;
+        }
+        // inputQueue is null when extract() was never called
+        if (inputQueue != null && !inputQueue.isEmpty()) {
+            log.info("Shutting down the extractor. Pending task count {}", inputQueue.size());
+        }
+
+        if (executorService != null) {
+            try {
+                // The signal goes to the tail of the queue so the workers
+                // first process whatever was queued before it
+                inputQueue.put(SHUTDOWN_SIGNAL);
+                executorService.shutdown();
+                //Wait long enough
+                executorService.awaitTermination(10, TimeUnit.DAYS);
+            } catch (InterruptedException e) {
+                // Restore the interrupt status for the caller
+                Thread.currentThread().interrupt();
+            }
+        }
+        dumpStats();
+        closed = true;
+    }
+
+    /**
+     * Sets the external Tika config file. It is read when the extractor is
+     * initialized by the first {@link #extract(Iterable)} call.
+     *
+     * @param tikaConfig Tika config file
+     */
+    public void setTikaConfig(File tikaConfig) {
+        this.tikaConfig = tikaConfig;
+    }
+
+    /**
+     * Sets the number of worker threads. The value is read when the pool is
+     * created by the first {@link #extract(Iterable)} call; the default is the
+     * number of available processors.
+     *
+     * @param threadPoolSize number of worker threads
+     */
+    public void setThreadPoolSize(int threadPoolSize) {
+        this.threadPoolSize = threadPoolSize;
+    }
+
+    /**
+     * Sets the binary stats used to add percentage details to the periodic
+     * progress log. Without stats only the plain processed count is logged.
+     *
+     * @param stats stats providing the total and indexed binary counts
+     */
+    public void setStats(BinaryStats stats) {
+        this.stats = stats;
+    }
+
+    private void dumpStats() {
+        StringWriter sw = new StringWriter();
+        PrintWriter pw = new PrintWriter(sw);
+        pw.println("Text extraction stats");
+        pw.printf("\t Processed Count           : %d%n", processedCount.get());
+        pw.printf("\t   Extraction Count        : %d%n", extractionCount.get());
+        pw.printf("\t     Empty Count           : %d%n", emptyCount.get());
+        pw.printf("\t     Text Written Count    : %d%n", textWrittenCount.get());
+        pw.printf("\t   Parser Error Count      : %d%n", parserErrorCount.get());
+        pw.printf("\t   Error Count             : %d%n", errorCount.get());
+        pw.printf("\t   Not Supported Count     : %d%n", notSupportedCount.get());
+        pw.printf("\t   Already processed Count : %d%n", alreadyExtractedCount.get());
+        pw.printf("\t Total bytes read          : %s%n", IOUtils.humanReadableByteCount(totalSizeRead.get()));
+        pw.printf("\t Total text extracted      : %s%n", IOUtils.humanReadableByteCount(extractedTextSize.get()));
+        pw.printf("\t   Non empty text          : %s%n", IOUtils.humanReadableByteCount(nonEmptyExtractedTextSize.get()));
+        pw.printf("\t Time taken                : %d sec%n", timeTaken.get() / 1000);
+        pw.close();
+        log.info(sw.toString());
+    }
+
+    /**
+     * Logs a progress line whenever the given count is a multiple of
+     * {@link #PROGRESS_BATCH_SIZE}. Percentages are only included when
+     * binary stats have been provided.
+     *
+     * @param count number of binaries processed so far
+     */
+    private void dumpProgress(int count) {
+        if (count % PROGRESS_BATCH_SIZE != 0) {
+            return;
+        }
+
+        String details = "";
+        if (stats != null) {
+            double processedPercent = count * 1.0 / stats.getTotalCount() * 100;
+            double indexedPercent = extractionCount.get() * 1.0 / stats.getIndexedCount() * 100;
+            details = String.format("(%1.2f%%) (Extraction stats %d/%d %1.2f%%, Ignored count %d)",
+                    processedPercent,
+                    extractionCount.get(),
+                    stats.getIndexedCount(),
+                    indexedPercent,
+                    notSupportedCount.get());
+        }
+        log.info("Processed {} {} binaries so far ...", count, details);
+    }
+
+    /**
+     * Creates the input queue, the Tika helper and the worker pool on the
+     * first invocation; later invocations do nothing.
+     *
+     * @throws IOException if the Tika helper cannot be created
+     */
+    private synchronized void initialize() throws IOException {
+        if (!initialized) {
+            inputQueue = new ArrayBlockingQueue<WorkItem>(queueSize);
+            tika = new TikaHelper(tikaConfig);
+            // Workers start polling the queue right away, so it must exist by now
+            initializeExecutorService();
+            initialized = true;
+        }
+    }
+
+    /**
+     * Extracts the text of a single binary and passes the outcome to the
+     * text writer. Binaries with a missing or unsupported mime type and blobs
+     * which the writer reports as already processed are only counted.
+     *
+     * @param source binary to process
+     * @throws IOException if the text writer fails
+     */
+    private void extractText(BinaryResource source) throws IOException {
+        String type = source.getMimeType();
+        boolean supported = type != null && tika.isSupportedMediaType(type);
+        if (!supported) {
+            log.trace("Ignoring binary content for node {} due to unsupported " +
+                    "(or null) jcr:mimeType [{}]", source, type);
+            notSupportedCount.incrementAndGet();
+            return;
+        }
+
+        String blobId = source.getBlobId();
+        if (textWriter.isProcessed(blobId)) {
+            alreadyExtractedCount.incrementAndGet();
+            return;
+        }
+
+        //TODO Handle case where same blob is being concurrently processed
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, type);
+        // The encoding is optional
+        if (source.getEncoding() != null) {
+            metadata.set(Metadata.CONTENT_ENCODING, source.getEncoding());
+        }
+
+        String parsed = parseStringValue(source.getByteSource(), metadata, source.getPath());
+        if (parsed == null) {
+            // Nothing was read from the binary, so nothing gets recorded
+            return;
+        }
+        if (ERROR_TEXT.equals(parsed)) {
+            textWriter.markError(blobId);
+            return;
+        }
+
+        String text = parsed.trim();
+        if (text.isEmpty()) {
+            textWriter.markEmpty(blobId);
+            emptyCount.incrementAndGet();
+            return;
+        }
+        nonEmptyExtractedTextSize.addAndGet(text.length());
+        textWriter.write(blobId, text);
+        textWrittenCount.incrementAndGet();
+    }
+
+    /**
+     * Creates the fixed size worker pool and submits one {@link Extractor}
+     * task per pool thread.
+     */
+    private void initializeExecutorService() {
+        executorService = Executors.newFixedThreadPool(threadPoolSize);
+        for (int pending = threadPoolSize; pending > 0; pending--) {
+            Extractor worker = new Extractor();
+            executorService.submit(worker);
+        }
+        log.info("Initialized text extractor pool with {} threads", threadPoolSize);
+    }
+
+    private class Extractor implements Runnable {
+        @Override
+        public void run() {
+            while (true) {
+                WorkItem workItem = null;
+                try {
+                    workItem = inputQueue.take();
+                    if (workItem == SHUTDOWN_SIGNAL) {
+                        inputQueue.put(SHUTDOWN_SIGNAL); //put back for other workers
+                        return;
+                    }
+                    extractText(workItem.source);
+                    dumpProgress(processedCount.incrementAndGet());
+                } catch (InterruptedException e) {
+                    Thread.currentThread().interrupt();
+                    return;
+                } catch (Exception e) {
+                    errorCount.incrementAndGet();
+                    log.warn("Error occurred while processing {}", workItem, e);
+                }
+            }
+        }
+    }
+
+    //~--------------------------------------< Tika >
+
+    /**
+     * Parses the given binary with the configured Tika parser and returns the
+     * extracted text.
+     *
+     * @param byteSource source of the binary content; opened lazily via LazyInputStream
+     * @param metadata Tika metadata passed to the parser
+     * @param path path of the binary, only used in the parser error log message
+     * @return the extracted text when at least one byte was read from the
+     *         stream, {@link #ERROR_TEXT} when parsing failed for a reason
+     *         other than a LinkageError or the write limit being reached, and
+     *         {@code null} when no bytes were read
+     */
+    private String parseStringValue(ByteSource byteSource, Metadata metadata, String path) {
+        // The handler stops accepting text once maxExtractedLength is reached
+        WriteOutContentHandler handler = new WriteOutContentHandler(maxExtractedLength);
+        long start = System.currentTimeMillis();
+        long size = 0;
+        try {
+            CountingInputStream stream = new CountingInputStream(new LazyInputStream(byteSource));
+            try {
+                tika.getParser().parse(stream, handler, metadata, new ParseContext());
+            } finally {
+                // Record the number of bytes read even when parsing failed
+                size = stream.getCount();
+                stream.close();
+            }
+        } catch (LinkageError e) {
+            // Capture and ignore errors caused by extraction libraries
+            // not being present. This is equivalent to disabling
+            // selected media types in configuration, so we can simply
+            // ignore these errors.
+        } catch (Throwable t) {
+            // Capture and report any other full text extraction problems.
+            // The special STOP exception is used for normal termination.
+            if (!handler.isWriteLimitReached(t)) {
+                parserErrorCount.incrementAndGet();
+                parserError.debug("Failed to extract text from a binary property: "
+                        + path
+                        + " This is a fairly common case, and nothing to"
+                        + " worry about. The stack trace is included to"
+                        + " help improve the text extraction feature.", t);
+                // NOTE(review): this early return skips the timeTaken update
+                // below, so failed parses are not part of the reported time
+                // - confirm this is intended
+                return ERROR_TEXT;
+            }
+        }
+        String result = handler.toString();
+        timeTaken.addAndGet(System.currentTimeMillis() - start);
+        // Only count the extraction when something was actually read
+        if (size > 0) {
+            extractedTextSize.addAndGet(result.length());
+            extractionCount.incrementAndGet();
+            totalSizeRead.addAndGet(size);
+            return result;
+        }
+
+        return null;
+    }
+
+    //~--------------------------------------< WorkItem >
+
+    /**
+     * Entry of the input queue wrapping the binary to process. The source
+     * may be {@code null}, as is the case for the shutdown signal.
+     */
+    private static class WorkItem {
+        final BinaryResource source;
+
+        private WorkItem(BinaryResource source) {
+            this.source = source;
+        }
+
+        @Override
+        public String toString() {
+            if (source == null) {
+                return "<EMPTY>";
+            }
+            return source.toString();
+        }
+    }
+
+}

Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import java.io.Closeable;
+import java.io.File;
+import java.util.List;
+
+import com.google.common.io.Closer;
+import joptsimple.OptionParser;
+import joptsimple.OptionSet;
+import joptsimple.OptionSpec;
+import org.apache.jackrabbit.core.data.FileDataStore;
+import org.apache.jackrabbit.oak.plugins.blob.datastore.DataStoreBlobStore;
+import org.apache.jackrabbit.oak.plugins.blob.datastore.DataStoreTextWriter;
+import org.apache.jackrabbit.oak.spi.blob.BlobStore;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+import static java.util.Arrays.asList;
+
+public class TextExtractorMain {
+    private static final Logger log = LoggerFactory.getLogger(TextExtractorMain.class);
+
+    /**
+     * Entry point of the {@code tika} run mode. The non option arguments
+     * select the action: {@code report} logs a summary of the binaries and
+     * {@code extract} logs the same summary and then performs the text
+     * extraction. The {@code generate} action and the {@code nodestore}
+     * option are declared but not implemented yet.
+     *
+     * <p>Prints the help and exits the JVM when help is requested or when no
+     * action is given.
+     *
+     * @param args command line arguments
+     * @throws Exception if argument validation or the selected action fails
+     */
+    public static void main(String[] args) throws Exception {
+        Closer closer = Closer.create();
+        String h = "tika [extract|report|generate]\n" +
+                "\n" +
+                "report   : Generates a summary report related to binary data\n" +
+                "extract  : Performs the text extraction\n" +
+                "generate : Generates the csv data file based on configured NodeStore/BlobStore";
+        try {
+            OptionParser parser = new OptionParser();
+            OptionSpec<?> help = parser.acceptsAll(asList("h", "?", "help"),
+                    "show help").forHelp();
+
+            OptionSpec<String> nodeStoreSpec = parser
+                    .accepts("nodestore", "NodeStore detail /path/to/oak/repository | mongodb://host:port/database")
+                    .withRequiredArg()
+                    .ofType(String.class);
+
+            OptionSpec<String> pathSpec = parser
+                    .accepts("path", "Path in repository under which the binaries would be searched")
+                    .withRequiredArg()
+                    .ofType(String.class);
+
+            OptionSpec<File> dataFileSpec = parser
+                    .accepts("data-file", "Data file in csv format containing the binary metadata")
+                    .withRequiredArg()
+                    .ofType(File.class);
+
+            OptionSpec<File> tikaConfigSpec = parser
+                    .accepts("tika-config", "Tika config file path")
+                    .withRequiredArg()
+                    .ofType(File.class);
+
+            OptionSpec<File> fdsDirSpec = parser
+                    .accepts("fds-path", "Path of directory used by FileDataStore")
+                    .withRequiredArg()
+                    .ofType(File.class);
+
+            OptionSpec<File> storeDirSpec = parser
+                    .accepts("store-path", "Path of directory used to store extracted text content")
+                    .withRequiredArg()
+                    .ofType(File.class);
+
+            OptionSpec<Integer> poolSize = parser
+                    .accepts("pool-size", "Size of the thread pool used to perform text extraction. Defaults " +
+                            "to number of cores on the system")
+                    .withRequiredArg()
+                    .ofType(Integer.class);
+
+            //TODO implement generate support
+
+            OptionSpec<String> nonOption = parser.nonOptions(h);
+
+            OptionSet options = parser.parse(args);
+            List<String> nonOptions = nonOption.values(options);
+
+            if (options.has(help)) {
+                parser.printHelpOn(System.out);
+                System.exit(0);
+            }
+
+            // At least one action is required
+            if (nonOptions.isEmpty()) {
+                parser.printHelpOn(System.err);
+                System.exit(1);
+            }
+
+            boolean report = nonOptions.contains("report");
+            boolean extract = nonOptions.contains("extract");
+            File dataFile;
+            File fdsDir;
+            File storeDir = null;
+            File tikaConfigFile = null;
+            BlobStore blobStore = null;
+            BinaryResourceProvider binaryResourceProvider = null;
+            BinaryStats stats = null;
+            String path = "/";
+
+            if (options.has(tikaConfigSpec)) {
+                tikaConfigFile = tikaConfigSpec.value(options);
+                checkArgument(tikaConfigFile.exists(), "Tika config file %s does not exist",
+                        tikaConfigFile.getAbsolutePath());
+            }
+
+            if (options.has(storeDirSpec)) {
+                storeDir = storeDirSpec.value(options);
+                // A missing store directory is accepted, an existing non directory is not
+                if (storeDir.exists()) {
+                    checkArgument(storeDir.isDirectory(), "Path [%s] specified for storing extracted " +
+                            "text content '%s' is not a directory", storeDir.getAbsolutePath(), storeDirSpec.options());
+                }
+            }
+
+            if (options.has(fdsDirSpec)) {
+                fdsDir = fdsDirSpec.value(options);
+                checkArgument(fdsDir.exists(), "FileDataStore %s does not exist", fdsDir.getAbsolutePath());
+                FileDataStore fds = new FileDataStore();
+                fds.setPath(fdsDir.getAbsolutePath());
+                fds.init(null);
+                blobStore = new DataStoreBlobStore(fds);
+            }
+
+            // The csv provider is handed the blob store configured above (may be null)
+            if (options.has(dataFileSpec)) {
+                dataFile = dataFileSpec.value(options);
+                checkArgument(dataFile.exists(), "Data file %s does not exist", dataFile.getAbsolutePath());
+                binaryResourceProvider = new CSVFileBinaryResourceProvider(dataFile, blobStore);
+            }
+
+            if (binaryResourceProvider instanceof Closeable) {
+                closer.register((Closeable) binaryResourceProvider);
+            }
+
+            if (report || extract) {
+                checkNotNull(binaryResourceProvider, "BinaryProvider source must be specified either " +
+                        "via '%s' or '%s'", dataFileSpec.options(), nodeStoreSpec.options());
+
+                stats = new BinaryStats(tikaConfigFile, binaryResourceProvider);
+                String summary = stats.getSummary();
+                log.info(summary);
+            }
+
+            if (extract) {
+                checkNotNull(storeDir, "Directory to store extracted text content " +
+                        "must be specified via %s", storeDirSpec.options());
+                checkNotNull(blobStore, "BlobStore found to be null. FileDataStore directory " +
+                        "must be specified via %s", fdsDirSpec.options());
+
+                DataStoreTextWriter writer = new DataStoreTextWriter(storeDir, false);
+                TextExtractor extractor = new TextExtractor(writer);
+
+                if (options.has(poolSize)) {
+                    extractor.setThreadPoolSize(poolSize.value(options));
+                }
+
+                if (tikaConfigFile != null) {
+                    extractor.setTikaConfig(tikaConfigFile);
+                }
+
+                if (options.has(pathSpec)) {
+                    path = pathSpec.value(options);
+                }
+
+                // Registered so that both get closed when the extraction fails;
+                // the extractor is registered last so it is closed before the writer
+                closer.register(writer);
+                closer.register(extractor);
+
+                extractor.setStats(stats);
+                log.info("Using path {}", path);
+                extractor.extract(binaryResourceProvider.getBinaries(path));
+
+                // Wait for the workers to finish before closing the writer
+                extractor.close();
+                writer.close();
+            }
+
+        } catch (Throwable e) {
+            throw closer.rethrow(e);
+        } finally {
+            closer.close();
+        }
+    }
+}

Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelper.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelper.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelper.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelper.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URL;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import javax.annotation.Nullable;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.EmptyParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.SAXException;
+
+class TikaHelper {
+    // Classpath location of the bundled Tika config used when no external file is given
+    private static final String DEFAULT_TIKA_CONFIG = "/org/apache/jackrabbit/oak/plugins/index/lucene/tika-config.xml";
+    private static final Logger log = LoggerFactory.getLogger(TikaHelper.class);
+
+    private final AutoDetectParser parser;
+    // Media types reported as supported by the parser; computed once in the constructor
+    private final Set<MediaType> supportedMediaTypes;
+    // Shared by all instances so that the supported types are logged only once.
+    // The reference never changes, only the flag it holds, hence final
+    private static final AtomicBoolean supportedTypesLogged = new AtomicBoolean();
+
+    /**
+     * Creates the parser from the given Tika config and determines the media
+     * types it supports.
+     *
+     * @param tikaConfig external Tika config file, or {@code null} to fall
+     *                   back to the bundled or the Tika default config
+     * @throws IOException if the config cannot be read
+     * @throws RuntimeException wrapping any TikaException or SAXException
+     *                          raised while loading the config
+     */
+    public TikaHelper(@Nullable File tikaConfig) throws IOException {
+        try {
+            parser =  new AutoDetectParser(getTikaConfig(tikaConfig));
+            supportedMediaTypes = parser.getSupportedTypes(new ParseContext());
+            logSupportedTypesOnce(supportedMediaTypes);
+        } catch (TikaException e) {
+            throw new RuntimeException(e);
+        } catch (SAXException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * @return the parser created from the Tika config in the constructor
+     */
+    public Parser getParser() {
+        return parser;
+    }
+
+    public boolean isSupportedMediaType(String type) {
+        return supportedMediaTypes.contains(MediaType.parse(type));
+    }
+
+    /**
+     * Tells whether the given type is supported and handled by a parser other
+     * than the {@link EmptyParser}.
+     *
+     * <p>Only meant for information purposes. Do not rely on it to determine
+     * if a type is indexed or not, as the answer is derived from Tika
+     * implementation details.
+     *
+     * @param type mimeType to check
+     * @return true if the given type is supported and indexed
+     */
+    public boolean isIndexed(String type) {
+        if (!isSupportedMediaType(type)) {
+            return false;
+        }
+
+        Parser supporting = getSupportingParser(parser, MediaType.parse(type));
+        if (supporting == null) {
+            return false;
+        }
+        // Look through any decorators before checking for the no-op parser
+        return !(unwrap(supporting) instanceof EmptyParser);
+    }
+
+    /**
+     * Loads the Tika config. An explicitly given file wins; otherwise the
+     * bundled config resource is used when it is found on the classpath, and
+     * the Tika default config when it is not.
+     *
+     * @param tikaConfig external config file, may be {@code null}
+     * @return the loaded config
+     */
+    private static TikaConfig getTikaConfig(File tikaConfig) throws TikaException, IOException, SAXException {
+        if (tikaConfig != null) {
+            log.info("Loading external Tika config from {}", tikaConfig);
+            return new TikaConfig(tikaConfig);
+        }
+
+        URL bundled = TextExtractor.class.getResource(DEFAULT_TIKA_CONFIG);
+        if (bundled == null) {
+            log.info("Using default Tika config");
+            return TikaConfig.getDefaultConfig();
+        }
+
+        log.info("Loading default Tika config from {}", bundled);
+        return new TikaConfig(bundled);
+    }
+
+    /**
+     * Descends through nested {@link CompositeParser}s, at each level picking
+     * the parser registered for the given media type.
+     *
+     * @return the first non composite parser found, or {@code null} if some
+     *         level has no parser registered for the media type
+     */
+    private static Parser getSupportingParser(Parser p, MediaType mediaType){
+        Parser candidate = p;
+        while (candidate instanceof CompositeParser) {
+            Map<MediaType, Parser> registered = ((CompositeParser) candidate).getParsers();
+            candidate = registered.get(mediaType);
+        }
+        return candidate;
+    }
+
+    /**
+     * Strips all {@link ParserDecorator} layers from the given parser.
+     *
+     * @return the innermost wrapped parser, or the parser itself if it is not decorated
+     */
+    private static Parser unwrap(Parser p){
+        Parser inner = p;
+        while (inner instanceof ParserDecorator) {
+            inner = ((ParserDecorator) inner).getWrappedParser();
+        }
+        return inner;
+    }
+
+    /**
+     * Logs the supported media types, but only for the first caller that
+     * manages to flip the shared flag.
+     */
+    private static void logSupportedTypesOnce(Set<MediaType> supportedMediaTypes) {
+        boolean firstCaller = supportedTypesLogged.compareAndSet(false, true);
+        if (firstCaller) {
+            log.info("Supported media types {}", supportedMediaTypes);
+        }
+    }
+}

Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelper.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/run/Main.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/run/Main.java?rev=1690249&r1=1690248&r2=1690249&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/run/Main.java (original)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/run/Main.java Fri Jul 10 11:46:03 2015
@@ -101,6 +101,7 @@ import org.apache.jackrabbit.oak.plugins
 import org.apache.jackrabbit.oak.plugins.segment.standby.server.StandbyServer;
 import org.apache.jackrabbit.oak.remote.content.ContentRemoteRepository;
 import org.apache.jackrabbit.oak.remote.http.RemoteServlet;
+import org.apache.jackrabbit.oak.plugins.tika.TextExtractorMain;
 import org.apache.jackrabbit.oak.scalability.ScalabilityRunner;
 import org.apache.jackrabbit.oak.spi.state.NodeState;
 import org.apache.jackrabbit.oak.spi.state.NodeStore;
@@ -189,6 +190,9 @@ public final class Main {
             case REPAIR:
                 repair(args);
                 break;
+            case TIKA:
+                TextExtractorMain.main(args);
+                break;
             case HELP:
             default:
                 System.err.print("Available run modes: ");
@@ -1180,7 +1184,8 @@ public final class Main {
         HELP("help"),
         CHECKPOINTS("checkpoints"),
         RECOVERY("recovery"),
-        REPAIR("repair");
+        REPAIR("repair"),
+        TIKA("tika");
 
         private final String name;
 

Modified: jackrabbit/oak/trunk/oak-run/src/main/resources/logback.xml
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/resources/logback.xml?rev=1690249&r1=1690248&r2=1690249&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/resources/logback.xml (original)
+++ jackrabbit/oak/trunk/oak-run/src/main/resources/logback.xml Fri Jul 10 11:46:03 2015
@@ -36,6 +36,8 @@
   <!-- Display info messages from the scalability suite -->
   <logger name="org.apache.jackrabbit.oak.scalability" level="INFO"/>
 
+  <logger name="org.apache.jackrabbit.oak.plugins.tika" level="INFO"/>
+
   <logger name="org.apache.jackrabbit.oak.plugins.segment.file.tooling.ConsistencyChecker" level="DEBUG"/>
 
   <root level="warn">

Added: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/BinarySourceMapper.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/BinarySourceMapper.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/BinarySourceMapper.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/BinarySourceMapper.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import com.google.common.base.Function;
+
+/**
+ * Functions that derive a lookup key from a {@link BinaryResource}.
+ */
+public enum BinarySourceMapper implements Function<BinaryResource, String> {
+
+    /** Maps a resource to the value of {@link BinaryResource#getBlobId()}. */
+    BY_BLOBID {
+        @Override
+        public String apply(BinaryResource resource) {
+            return resource.getBlobId();
+        }
+    },
+
+    /** Maps a resource to the value of {@link BinaryResource#getPath()}. */
+    BY_PATH {
+        @Override
+        public String apply(BinaryResource resource) {
+            return resource.getPath();
+        }
+    };
+}

Propchange: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/BinarySourceMapper.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProviderTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProviderTest.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProviderTest.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProviderTest.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import java.io.File;
+import java.util.Map;
+
+import com.google.common.base.Charsets;
+import com.google.common.io.Files;
+import org.apache.commons.csv.CSVPrinter;
+import org.apache.jackrabbit.oak.spi.blob.MemoryBlobStore;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Exercises {@link CSVFileBinaryResourceProvider} with a CSV file written to a temporary folder.
+ */
+public class CSVFileBinaryResourceProviderTest {
+
+    @Rule
+    public final TemporaryFolder temporaryFolder = new TemporaryFolder();
+
+    /**
+     * Writes four records (one of them without a blob id) and checks the binaries
+     * reported for the root path and for {@code /a}.
+     */
+    @Test
+    public void testGetBinaries() throws Exception {
+        StringBuilder sb = new StringBuilder();
+        CSVPrinter p = new CSVPrinter(sb, CSVFileBinaryResourceProvider.FORMAT);
+        // BLOB_ID, LENGTH, JCR_MIMETYPE, JCR_ENCODING, JCR_PATH
+        p.printRecord("a", 123, "text/plain", null, "/a");
+        p.printRecord("a2", 123, "text/plain", null, "/a/c");
+        p.printRecord("b", null, "text/plain", null, "/b");
+        p.printRecord(null, null, "text/plain", null, "/c");
+
+        File dataFile = temporaryFolder.newFile();
+        Files.write(sb, dataFile, Charsets.UTF_8);
+
+        CSVFileBinaryResourceProvider provider = new CSVFileBinaryResourceProvider(dataFile, new MemoryBlobStore());
+        try {
+            Map<String, BinaryResource> binaries = provider.getBinaries("/").uniqueIndex(BinarySourceMapper.BY_BLOBID);
+            // Three of the four records come back; presumably the one without a blob id is skipped
+            assertEquals(3, binaries.size());
+            assertEquals("a", binaries.get("a").getBlobId());
+            assertEquals("/a", binaries.get("a").getPath());
+
+            binaries = provider.getBinaries("/a").uniqueIndex(BinarySourceMapper.BY_BLOBID);
+            assertEquals(1, binaries.size());
+        } finally {
+            // Close the provider even when an assertion above fails
+            provider.close();
+        }
+    }
+}
\ No newline at end of file

Propchange: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProviderTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import com.google.common.base.Charsets;
+import org.apache.jackrabbit.JcrConstants;
+import org.apache.jackrabbit.oak.api.Blob;
+import org.apache.jackrabbit.oak.plugins.memory.ArrayBasedBlob;
+import org.apache.jackrabbit.oak.plugins.memory.MemoryNodeStore;
+import org.apache.jackrabbit.oak.spi.blob.BlobStore;
+import org.apache.jackrabbit.oak.spi.blob.MemoryBlobStore;
+import org.apache.jackrabbit.oak.spi.state.NodeBuilder;
+import org.apache.jackrabbit.oak.spi.state.NodeState;
+import org.apache.jackrabbit.oak.spi.state.NodeStore;
+import org.junit.Test;
+
+import static org.apache.jackrabbit.JcrConstants.JCR_CONTENT;
+import static org.apache.jackrabbit.oak.plugins.nodetype.write.InitialContent.INITIAL_CONTENT;
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Tests {@link NodeStoreBinaryResourceProvider} against a {@link MemoryNodeStore}.
+ */
+public class NodeStoreBinaryResourceProviderTest {
+    private final NodeState root = INITIAL_CONTENT;
+
+    /**
+     * Creates three file nodes, one of which carries a blob without a content identity,
+     * and checks the number of binaries reported per path as well as the metadata
+     * of the binary found below {@code /a2}.
+     */
+    @Test
+    public void countBinaries() throws Exception {
+        NodeBuilder builder = root.builder();
+        createFileNode(builder, "a", new IdBlob("hello", null), "text/plain");
+        createFileNode(builder, "b", new IdBlob("hello", "id1"), "text/plain");
+
+        createFileNode(builder.child("a2"), "c", new IdBlob("hello", "id2"), "text/foo")
+                .setProperty(JcrConstants.JCR_ENCODING, "bar");
+
+        NodeStore store = new MemoryNodeStore(builder.getNodeState());
+        BlobStore blobStore = new MemoryBlobStore();
+        NodeStoreBinaryResourceProvider extractor = new NodeStoreBinaryResourceProvider(store, blobStore);
+
+        assertEquals(2, extractor.getBinaries("/").size());
+        assertEquals(1, extractor.getBinaries("/a2").size());
+
+        BinaryResource bs = extractor.getBinaries("/a2").first().get();
+        assertEquals("text/foo", bs.getMimeType());
+        assertEquals("bar", bs.getEncoding());
+        assertEquals("id2", bs.getBlobId());
+    }
+
+    /**
+     * Adds a child {@code name} with a {@link JcrConstants#JCR_CONTENT} node holding the binary and mime type.
+     *
+     * @return the builder of the created content node
+     */
+    private NodeBuilder createFileNode(NodeBuilder base, String name, Blob content, String mimeType) {
+        NodeBuilder jcrContent = base.child(name).child(JCR_CONTENT);
+        jcrContent.setProperty(JcrConstants.JCR_DATA, content);
+        jcrContent.setProperty(JcrConstants.JCR_MIMETYPE, mimeType);
+        return jcrContent;
+    }
+
+    /** Array based blob whose content identity is supplied by the test. */
+    private static class IdBlob extends ArrayBasedBlob {
+        final String id;
+
+        public IdBlob(String value, String id) {
+            // Explicit charset keeps the blob bytes independent of the platform default
+            super(value.getBytes(Charsets.UTF_8));
+            this.id = id;
+        }
+
+        @Override
+        public String getContentIdentity() {
+            return id;
+        }
+    }
+}

Propchange: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorTest.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorTest.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorTest.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.base.Charsets;
+import com.google.common.collect.Maps;
+import com.google.common.io.ByteSource;
+import org.apache.jackrabbit.oak.plugins.blob.datastore.TextWriter;
+import org.junit.Test;
+
+import static java.util.Arrays.asList;
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Tests {@link TextExtractor} using a {@link TextWriter} that keeps written text in memory.
+ */
+public class TextExtractorTest {
+
+    /**
+     * Extracts two plain text binaries and checks that text was written for both ids.
+     */
+    @Test
+    public void basicWorking() throws Exception {
+        MapTextWriter writer = new MapTextWriter();
+        TextExtractor extractor = new TextExtractor(writer);
+
+        List<BinaryResource> binaries = asList(
+                bin("hello", "text/plain", "a"),
+                bin("foo", "text/plain", "b")
+        );
+
+        try {
+            extractor.extract(binaries);
+        } finally {
+            // Always release the extractor, even when extract() throws
+            extractor.close();
+        }
+
+        // Results are checked only after close() has returned
+        assertEquals(2, writer.data.size());
+        assertEquals("foo", writer.data.get("b").trim());
+    }
+
+    /**
+     * Creates a binary resource backed by the UTF-8 bytes of {@code text}; {@code id} is
+     * passed for both trailing constructor arguments (presumably path and blob id).
+     */
+    private static BinaryResource bin(String text, String mime, String id) {
+        return new BinaryResource(ByteSource.wrap(text.getBytes(Charsets.UTF_8)), mime, null, id, id);
+    }
+
+    /** {@link TextWriter} that records written text in a map keyed by blob id. */
+    private static class MapTextWriter implements TextWriter {
+        final Map<String, String> data = Maps.newConcurrentMap();
+
+        @Override
+        public void write(String blobId, String text) throws IOException {
+            data.put(blobId, text);
+        }
+
+        @Override
+        public void markEmpty(String blobId) {
+            // intentionally a no-op: empty results are not tracked here
+        }
+
+        @Override
+        public void markError(String blobId) {
+            // intentionally a no-op: errors are not tracked here
+        }
+
+        @Override
+        public boolean isProcessed(String blobId) {
+            return data.containsKey(blobId);
+        }
+    }
+}

Propchange: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelperTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelperTest.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelperTest.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelperTest.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import java.io.File;
+
+import com.google.common.base.Charsets;
+import com.google.common.io.Files;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Tests for {@link TikaHelper}, with and without a custom Tika configuration file.
+ */
+public class TikaHelperTest {
+    /** Tika configuration that declares {@code EmptyParser} for {@code application/xml}. */
+    private static final String XML_EMPTY_PARSER_CONFIG =
+            "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
+            "<properties>\n" +
+            "  <detectors>\n" +
+            "    <detector class=\"org.apache.tika.detect.DefaultDetector\"/>\n" +
+            "  </detectors>\n" +
+            "  <parsers>\n" +
+            "    <parser class=\"org.apache.tika.parser.DefaultParser\"/>\n" +
+            "    <parser class=\"org.apache.tika.parser.EmptyParser\">\n" +
+            "      <mime>application/xml</mime>\n" +
+            "    </parser>\n" +
+            "  </parsers>\n" +
+            "</properties>";
+
+    @Rule
+    public final TemporaryFolder temporaryFolder = new TemporaryFolder();
+
+    /** A helper created with a {@code null} config file must report {@code text/plain} as supported. */
+    @Test
+    public void supportedTypes() throws Exception {
+        TikaHelper helper = new TikaHelper(null);
+        assertTrue(helper.isSupportedMediaType("text/plain"));
+    }
+
+    /** With the custom config in place, {@code application/xml} must not be reported as indexed. */
+    @Test
+    public void indexedTypes() throws Exception {
+        File configFile = temporaryFolder.newFile();
+        Files.write(XML_EMPTY_PARSER_CONFIG, configFile, Charsets.UTF_8);
+
+        TikaHelper helper = new TikaHelper(configFile);
+        assertFalse(helper.isIndexed("application/xml"));
+    }
+}
\ No newline at end of file

Propchange: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelperTest.java
------------------------------------------------------------------------------
    svn:eol-style = native