You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by ch...@apache.org on 2015/07/10 13:46:04 UTC
svn commit: r1690249 - in /jackrabbit/oak/trunk/oak-run: ./
src/main/assembly/ src/main/java/org/apache/jackrabbit/oak/plugins/tika/
src/main/java/org/apache/jackrabbit/oak/run/ src/main/resources/
src/test/java/org/apache/jackrabbit/oak/plugins/ src/t...
Author: chetanm
Date: Fri Jul 10 11:46:03 2015
New Revision: 1690249
URL: http://svn.apache.org/r1690249
Log:
OAK-2953 - Implement text extractor as part of oak-run
-- Add Tika dependency to oak-run
-- Ensure that the various parser-related dependencies do not get pulled in while building oak-run. Changed the assembly config for that
-- Exposed a new 'tika' command
Added:
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResource.java (with props)
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResourceProvider.java (with props)
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryStats.java (with props)
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BlobStoreByteSource.java (with props)
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProvider.java (with props)
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java (with props)
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractor.java (with props)
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java (with props)
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelper.java (with props)
jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/
jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/
jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/BinarySourceMapper.java (with props)
jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProviderTest.java (with props)
jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java (with props)
jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorTest.java (with props)
jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelperTest.java (with props)
Modified:
jackrabbit/oak/trunk/oak-run/pom.xml
jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run-jr2.xml
jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run.xml
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/run/Main.java
jackrabbit/oak/trunk/oak-run/src/main/resources/logback.xml
Modified: jackrabbit/oak/trunk/oak-run/pom.xml
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/pom.xml?rev=1690249&r1=1690248&r2=1690249&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-run/pom.xml (original)
+++ jackrabbit/oak/trunk/oak-run/pom.xml Fri Jul 10 11:46:03 2015
@@ -362,6 +362,22 @@
<scope>compile</scope>
</dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parsers</artifactId>
+ <version>1.5</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>1.5</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-csv</artifactId>
+ <version>1.1</version>
+ </dependency>
+
<!-- Findbugs annotations -->
<dependency>
<groupId>com.google.code.findbugs</groupId>
Modified: jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run-jr2.xml
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run-jr2.xml?rev=1690249&r1=1690248&r2=1690249&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run-jr2.xml (original)
+++ jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run-jr2.xml Fri Jul 10 11:46:03 2015
@@ -38,11 +38,13 @@
<excludes>
<exclude>org.apache.jackrabbit:oak-lucene</exclude>
<exclude>org.apache.lucene</exclude>
+ <exclude>org.apache.tika</exclude>
</excludes>
<useStrictFiltering>true</useStrictFiltering>
<useProjectArtifact>true</useProjectArtifact>
<unpack>true</unpack>
<useTransitiveDependencies>true</useTransitiveDependencies>
+ <useTransitiveFiltering>true</useTransitiveFiltering>
<unpackOptions>
<excludes>
<exclude>META-INF/*.SF</exclude>
Modified: jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run.xml
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run.xml?rev=1690249&r1=1690248&r2=1690249&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run.xml (original)
+++ jackrabbit/oak/trunk/oak-run/src/main/assembly/oak-run.xml Fri Jul 10 11:46:03 2015
@@ -33,11 +33,14 @@
<exclude>org.apache.jackrabbit:jackrabbit-core</exclude>
<exclude>org.apache.lucene</exclude>
<exclude>org.apache.derby</exclude>
+ <exclude>org.apache.tika:tika-core:*</exclude>
+ <exclude>org.apache.tika:tika-parsers:*</exclude>
</excludes>
<useStrictFiltering>true</useStrictFiltering>
<useProjectArtifact>true</useProjectArtifact>
<unpack>true</unpack>
<useTransitiveDependencies>true</useTransitiveDependencies>
+ <useTransitiveFiltering>true</useTransitiveFiltering>
<unpackOptions>
<excludes>
<exclude>META-INF/*.SF</exclude>
@@ -51,5 +54,24 @@
</excludes>
</unpackOptions>
</dependencySet>
+ <!-- Exclude the transitive dependencies, as tika-parsers depends
+ on many other jars. Instead, users can include tika-app.jar in the classpath. -->
+ <dependencySet>
+ <outputDirectory>/</outputDirectory>
+ <includes>
+ <include>org.apache.tika:tika-core</include>
+ <include>org.apache.tika:tika-parsers</include>
+ </includes>
+ <useStrictFiltering>true</useStrictFiltering>
+ <useTransitiveDependencies>false</useTransitiveDependencies>
+ <unpack>true</unpack>
+ <unpackOptions>
+ <excludes>
+ <exclude>META-INF/*.SF</exclude>
+ <exclude>META-INF/*.DSA</exclude>
+ <exclude>META-INF/*.RSA</exclude>
+ </excludes>
+ </unpackOptions>
+ </dependencySet>
</dependencySets>
</assembly>
Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResource.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResource.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResource.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResource.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import javax.annotation.CheckForNull;
+import javax.annotation.Nullable;
+
+import com.google.common.io.ByteSource;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+/** Holder for one binary: its content source plus optional mimeType/encoding, its path and its blobId. All fields are final. */
+class BinaryResource {
+ private final ByteSource byteSource;
+ private final String mimeType; // may be null: the constructor parameter is @Nullable
+ private final String encoding; // may be null: the constructor parameter is @Nullable
+ private final String path;
+ private final String blobId;
+ /** Stores the given values; byteSource, path and blobId are passed through checkNotNull, mimeType and encoding may be null. */
+ public BinaryResource(ByteSource byteSource,
+ @Nullable String mimeType,
+ @Nullable String encoding,
+ String path,
+ String blobId) {
+ this.byteSource = checkNotNull(byteSource, "ByteSource must be provided");
+ this.mimeType = mimeType;
+ this.encoding = encoding;
+ this.path = checkNotNull(path, "Path must be provided");
+ this.blobId = checkNotNull(blobId, "BlobId must be specified");
+ }
+ /** Returns the byte source given to the constructor. */
+ public ByteSource getByteSource() {
+ return byteSource;
+ }
+ /** Returns the mimeType given to the constructor, possibly null. */
+ @CheckForNull
+ public String getMimeType() {
+ return mimeType;
+ }
+ /** Returns the encoding given to the constructor, possibly null. */
+ @CheckForNull
+ public String getEncoding() {
+ return encoding;
+ }
+ /** Returns the path given to the constructor. */
+ public String getPath() {
+ return path;
+ }
+ /** Returns the blobId given to the constructor. */
+ public String getBlobId() {
+ return blobId;
+ }
+ /** The string form is the path only. */
+ @Override
+ public String toString() {
+ return path;
+ }
+}
Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResource.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResourceProvider.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResourceProvider.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResourceProvider.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResourceProvider.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import java.io.IOException;
+
+import com.google.common.collect.FluentIterable;
+
+/**
+ * Provides an iterable over the binaries present under a given path
+ */
+interface BinaryResourceProvider {
+ /** Returns the binaries for {@code path} as a FluentIterable; the signature allows implementations to throw IOException. */
+ FluentIterable<BinaryResource> getBinaries(String path) throws IOException;
+}
Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryResourceProvider.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryStats.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryStats.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryStats.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryStats.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.base.Strings;
+import com.google.common.collect.ComparisonChain;
+import com.google.common.collect.Maps;
+import org.codehaus.groovy.runtime.StringGroovyMethods;
+
+import static org.apache.jackrabbit.oak.commons.IOUtils.humanReadableByteCount;
+/** Gathers count/size statistics per mimeType for the binaries of a BinaryResourceProvider and renders them as a text report. */
+class BinaryStats {
+ private final TikaHelper tika;
+ private final List<MimeTypeStats> stats;
+ private long totalSize;
+ private long totalCount;
+ private long indexedSize;
+ private long indexedCount;
+ /** Builds a TikaHelper from tikaConfig and collects the stats from the provider right away, inside the constructor. */
+ public BinaryStats(File tikaConfig, BinaryResourceProvider provider) throws IOException {
+ this.tika = new TikaHelper(tikaConfig);
+ this.stats = collectStats(provider);
+ }
+ /** Sum of the sizes of all binaries that had a non-null mimeType. */
+ public long getTotalSize() {
+ return totalSize;
+ }
+ /** Number of binaries that had a non-null mimeType. */
+ public long getTotalCount() {
+ return totalCount;
+ }
+ /** Sum of the sizes of the counted binaries whose mimeType entry is flagged as indexed. */
+ public long getIndexedSize() {
+ return indexedSize;
+ }
+ /** Number of counted binaries whose mimeType entry is flagged as indexed. */
+ public long getIndexedCount() {
+ return indexedCount;
+ }
+ /** Returns the formatted report for the stats collected at construction time. */
+ public String getSummary() throws IOException {
+ return getSummary(stats);
+ }
+ /** Iterates provider.getBinaries("/"), updates the running totals and returns one MimeTypeStats per mimeType in reverse natural order. */
+ private List<MimeTypeStats> collectStats(BinaryResourceProvider provider) throws IOException {
+ Map<String, MimeTypeStats> stats = Maps.newHashMap();
+ for (BinaryResource binary : provider.getBinaries("/")) {
+ String mimeType = binary.getMimeType();
+ if (mimeType != null) { // binaries without a mimeType are not counted at all
+ MimeTypeStats mimeStats = stats.get(mimeType);
+ if (mimeStats == null) {
+ mimeStats = createStat(mimeType);
+ stats.put(mimeType, mimeStats);
+ }
+
+ long size = binary.getByteSource().size();
+ mimeStats.addSize(size);
+ totalSize += size;
+ totalCount++;
+
+ if (mimeStats.isIndexed()) {
+ indexedSize += size;
+ indexedCount++;
+ }
+ }
+ }
+ // reverseOrder() flips MimeTypeStats.compareTo: indexed types come first, larger total size first within each group
+ List<MimeTypeStats> result = new ArrayList<MimeTypeStats>(stats.values());
+ Collections.sort(result, Collections.reverseOrder());
+ return result;
+ }
+ /** Formats the totals followed by one row per MimeTypeStats (name, indexed, supported, count, size). */
+ private String getSummary(List<MimeTypeStats> stats) {
+ int maxWidth = 0;
+ for (MimeTypeStats s : stats) {
+ maxWidth = Math.max(maxWidth, s.getName().length());
+ }
+
+ maxWidth += 5; // longest mimeType name plus 5; used as the width of the Type column below
+
+ StringWriter sw = new StringWriter();
+ PrintWriter pw = new PrintWriter(sw);
+ pw.println("MimeType Stats");
+ pw.printf("\tTotal size : %s%n", humanReadableByteCount(totalSize));
+ pw.printf("\tTotal indexed size : %s%n", humanReadableByteCount(indexedSize));
+ pw.printf("\tTotal count : %d%n", totalCount);
+ pw.printf("\tTotal indexed count : %d%n", indexedCount);
+ pw.println();
+
+ String header = center("Type", maxWidth) + " " +
+ center("Indexed", 10) + " " +
+ center("Supported", 10) + " " +
+ center("Count", 10) + " " +
+ center("Size", 10);
+
+ pw.println(header);
+ pw.println(Strings.repeat("_", header.length() + 5));
+
+ for (MimeTypeStats s : stats) {
+ pw.printf("%-" + maxWidth + "s|%10s|%10s| %-8d|%10s%n",
+ s.getName(),
+ s.isIndexed(),
+ s.isSupported(),
+ s.getCount(),
+ humanReadableByteCount(s.getTotalSize()));
+ }
+ return sw.toString();
+ }
+ /** Creates the stats entry for a mimeType and fills its indexed/supported flags from the TikaHelper. */
+ private MimeTypeStats createStat(String mimeType) {
+ MimeTypeStats stats = new MimeTypeStats(mimeType);
+ stats.setIndexed(tika.isIndexed(mimeType));
+ stats.setSupported(tika.isSupportedMediaType(mimeType));
+ return stats;
+ }
+ /** Delegates to Groovy's StringGroovyMethods.center(s, width). */
+ private static String center(String s, int width) {
+ return StringGroovyMethods.center(s, width);
+ }
+ /** Mutable accumulator for one mimeType: number of binaries, their total size, and the indexed/supported flags. */
+ private static class MimeTypeStats implements Comparable<MimeTypeStats> {
+ private final String mimeType;
+ private int count;
+ private long totalSize;
+ private boolean supported;
+ private boolean indexed;
+
+ public MimeTypeStats(String mimeType) {
+ this.mimeType = mimeType;
+ }
+ /** Counts one more binary and adds its size to the total. */
+ public void addSize(long size) {
+ count++;
+ totalSize += size;
+ }
+
+ public void setSupported(boolean supported) {
+ this.supported = supported;
+ }
+
+ public void setIndexed(boolean indexed) {
+ this.indexed = indexed;
+ }
+
+ public long getTotalSize() {
+ return totalSize;
+ }
+
+ public int getCount() {
+ return count;
+ }
+ /** The name of an entry is its mimeType. */
+ public String getName() {
+ return mimeType;
+ }
+
+ public boolean isIndexed() {
+ return indexed;
+ }
+
+ public boolean isSupported() {
+ return supported;
+ }
+ /** Natural order: non-indexed before indexed, then ascending total size. */
+ @Override
+ public int compareTo(MimeTypeStats o) {
+ return ComparisonChain.start()
+ .compareFalseFirst(indexed, o.indexed)
+ .compare(totalSize, o.totalSize)
+ .result();
+ }
+ }
+}
Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BinaryStats.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BlobStoreByteSource.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BlobStoreByteSource.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BlobStoreByteSource.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BlobStoreByteSource.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import javax.annotation.Nullable;
+
+import com.google.common.io.ByteSource;
+import org.apache.jackrabbit.oak.spi.blob.BlobStore;
+
+/**
+ * ByteSource backed by a blob in a BlobStore, addressed by its blobId. Avoids use of BlobByteSource to avoid concurrent access to NodeState.
+ */
+class BlobStoreByteSource extends ByteSource {
+ private final BlobStore blobStore;
+ private final String blobId;
+ private final Long size; // optional pre-known size; null means size() asks the BlobStore
+ /** Creates a source with an optional, already known size (may be null). */
+ BlobStoreByteSource(BlobStore blobStore, String blobId,@Nullable Long size) {
+ this.blobStore = blobStore;
+ this.blobId = blobId;
+ this.size = size;
+ }
+ /** Creates a source without a known size. */
+ BlobStoreByteSource(BlobStore blobStore, String blobId) {
+ this(blobStore, blobId, null);
+ }
+ /** Opens the content via blobStore.getInputStream(blobId). */
+ @Override
+ public InputStream openStream() throws IOException {
+ return blobStore.getInputStream(blobId);
+ }
+ /** Returns the size given at construction when present, otherwise blobStore.getBlobLength(blobId). */
+ @Override
+ public long size() throws IOException {
+ if (size != null) {
+ return size;
+ }
+ return blobStore.getBlobLength(blobId);
+ }
+}
Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/BlobStoreByteSource.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProvider.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProvider.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProvider.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProvider.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import java.io.Closeable;
+import java.io.File;
+import java.io.IOException;
+
+import javax.annotation.Nullable;
+
+import com.google.common.base.Charsets;
+import com.google.common.base.Function;
+import com.google.common.base.Predicate;
+import com.google.common.collect.FluentIterable;
+import com.google.common.io.Closer;
+import com.google.common.primitives.Longs;
+import org.apache.commons.csv.CSVFormat;
+import org.apache.commons.csv.CSVParser;
+import org.apache.commons.csv.CSVRecord;
+import org.apache.jackrabbit.oak.commons.PathUtils;
+import org.apache.jackrabbit.oak.spi.blob.BlobStore;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Predicates.notNull;
+import static org.apache.jackrabbit.JcrConstants.JCR_ENCODING;
+import static org.apache.jackrabbit.JcrConstants.JCR_MIMETYPE;
+import static org.apache.jackrabbit.JcrConstants.JCR_PATH;
+/** BinaryResourceProvider that reads binary metadata (blobId, length, mimeType, encoding, path) from a CSV data file. */
+class CSVFileBinaryResourceProvider implements BinaryResourceProvider, Closeable {
+ private static final String BLOB_ID = "blobId";
+ private static final String LENGTH = "length";
+ static final CSVFormat FORMAT = CSVFormat.DEFAULT // layout of the data file: '#' comments, five named columns
+ .withCommentMarker('#')
+ .withHeader(
+ BLOB_ID,
+ LENGTH,
+ JCR_MIMETYPE,
+ JCR_ENCODING,
+ JCR_PATH
+ )
+ .withNullString("") //Empty strings are considered as null
+ .withIgnoreSurroundingSpaces()
+ .withSkipHeaderRecord();
+ private final Logger log = LoggerFactory.getLogger(getClass());
+ private final File dataFile;
+ private final BlobStore blobStore;
+ private final Closer closer = Closer.create();
+ /** dataFile must exist (checkArgument); blobStore may be null and is handed unchanged to each BlobStoreByteSource created later. */
+ public CSVFileBinaryResourceProvider(File dataFile, @Nullable BlobStore blobStore) {
+ checkArgument(dataFile.exists(), "Data file %s does not exist", dataFile);
+ this.dataFile = dataFile;
+ this.blobStore = blobStore;
+ }
+ /** Opens a new UTF-8 CSVParser over dataFile on every call, registers it with the closer and returns the valid records accepted by the path filter. */
+ @Override
+ public FluentIterable<BinaryResource> getBinaries(final String path) throws IOException {
+ CSVParser parser = CSVParser.parse(dataFile, Charsets.UTF_8, FORMAT);
+ closer.register(parser);
+ return FluentIterable.from(parser)
+ .transform(new RecordTransformer())
+ .filter(notNull())
+ .filter(new Predicate<BinaryResource>() {
+ @Override
+ public boolean apply(BinaryResource input) {
+ return PathUtils.isAncestor(path, input.getPath()); // NOTE(review): whether a record at exactly `path` passes depends on PathUtils.isAncestor - confirm
+ }
+ });
+ }
+ /** Closes every parser that getBinaries registered with the closer. */
+ @Override
+ public void close() throws IOException {
+ closer.close();
+ }
+ /** Maps one CSVRecord to a BinaryResource; inner (non-static) because it uses the enclosing log and blobStore. */
+ private class RecordTransformer implements Function<CSVRecord, BinaryResource> {
+ /** Returns null, after logging a warning, when path, blobId or mimeType is missing from the record. */
+ @Nullable
+ @Override
+ public BinaryResource apply(CSVRecord input) {
+ String path = input.get(JCR_PATH);
+ String mimeType = input.get(JCR_MIMETYPE);
+ String encoding = input.get(JCR_ENCODING);
+ String blobId = input.get(BLOB_ID);
+ String length = input.get(LENGTH);
+ Long len = length != null ? Longs.tryParse(length) : null; // NOTE(review): presumably null for a non-numeric length - confirm Longs.tryParse contract
+ if (path == null || blobId == null || mimeType == null) {
+ log.warn("Ignoring invalid record {}. Either of mimeType, blobId or path is null", input);
+ return null;
+ }
+
+ return new BinaryResource(new BlobStoreByteSource(blobStore, blobId, len),
+ mimeType, encoding, path, blobId);
+ }
+ }
+
+}
Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProvider.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import javax.annotation.CheckForNull;
+import javax.annotation.Nullable;
+
+import com.google.common.base.Function;
+import com.google.common.collect.FluentIterable;
+import com.google.common.collect.TreeTraverser;
+import org.apache.jackrabbit.JcrConstants;
+import org.apache.jackrabbit.oak.api.Blob;
+import org.apache.jackrabbit.oak.api.PropertyState;
+import org.apache.jackrabbit.oak.api.Tree;
+import org.apache.jackrabbit.oak.api.Type;
+import org.apache.jackrabbit.oak.spi.blob.BlobStore;
+import org.apache.jackrabbit.oak.spi.state.NodeStore;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static com.google.common.base.Predicates.notNull;
+import static org.apache.jackrabbit.oak.plugins.tree.TreeFactory.createReadOnlyTree;
+import static org.apache.jackrabbit.oak.spi.state.NodeStateUtils.getNode;
+/** BinaryResourceProvider that finds binaries by traversing the NodeStore root tree and picking up jcr:data properties. */
+class NodeStoreBinaryResourceProvider implements BinaryResourceProvider {
+ private static final Logger log = LoggerFactory.getLogger(NodeStoreBinaryResourceProvider.class);
+ private final NodeStore nodeStore;
+ private final BlobStore blobStore;
+
+ public NodeStoreBinaryResourceProvider(NodeStore nodeStore, BlobStore blobStore) {
+ this.nodeStore = nodeStore;
+ this.blobStore = blobStore;
+ }
+ /** Pre-order traversal of the read-only tree at path (resolved from nodeStore.getRoot()); trees that yield no BinaryResource are filtered out. */
+ public FluentIterable<BinaryResource> getBinaries(String path) {
+ return new OakTreeTraverser()
+ .preOrderTraversal(createReadOnlyTree(getNode(nodeStore.getRoot(), path)))
+ .transform(new TreeToBinarySource())
+ .filter(notNull());
+ }
+ /** Maps a Tree to a BinaryResource, or to null when it has no single-valued jcr:data blob with a content identity. */
+ private class TreeToBinarySource implements Function<Tree, BinaryResource> {
+ @Nullable
+ @Override
+ public BinaryResource apply(Tree tree) {
+ PropertyState data = tree.getProperty(JcrConstants.JCR_DATA);
+ if (data == null) {
+ return null;
+ }
+
+ if (data.isArray()) {
+ log.debug("Ignoring jcr:data property at {} as its a MVP", tree.getPath());
+ return null;
+ }
+
+ Blob blob = data.getValue(Type.BINARY);
+ String blobId = blob.getContentIdentity();
+ if (blobId == null) {
+ log.debug("Ignoring jcr:data property at {} as its an inlined blob", tree.getPath());
+ return null;
+ }
+ // mimeType and encoding are read from the same tree that holds jcr:data; either may be null
+ String mimeType = getString(tree, JcrConstants.JCR_MIMETYPE);
+ String encoding = getString(tree, JcrConstants.JCR_ENCODING);
+ // the two-argument BlobStoreByteSource constructor is used, so no size is passed along here
+ return new BinaryResource(new BlobStoreByteSource(blobStore, blobId), mimeType,
+ encoding, tree.getPath(), blobId);
+ }
+ }
+ /** TreeTraverser whose children are simply Tree.getChildren(). */
+ private static class OakTreeTraverser extends TreeTraverser<Tree> {
+ @Override
+ public Iterable<Tree> children(Tree root) {
+ return root.getChildren();
+ }
+ }
+ /** Returns the STRING value of the named property, or null when the tree has no such property. */
+ @CheckForNull
+ private static String getString(Tree tree, String name) {
+ PropertyState prop = tree.getProperty(name);
+ return prop != null ? prop.getValue(Type.STRING) : null;
+ }
+}
Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractor.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractor.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractor.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,300 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import java.io.Closeable;
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+
+import com.google.common.io.ByteSource;
+import com.google.common.io.CountingInputStream;
+import org.apache.jackrabbit.oak.commons.IOUtils;
+import org.apache.jackrabbit.oak.commons.io.LazyInputStream;
+import org.apache.jackrabbit.oak.plugins.blob.datastore.TextWriter;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+ /**
+  * Extracts text from binaries with Tika on a fixed pool of worker threads
+  * and hands the outcome to a {@link TextWriter}.
+  *
+  * <p>Binaries passed to {@link #extract(Iterable)} are placed on a bounded
+  * queue which is drained by the workers. {@link #close()} queues a shutdown
+  * signal, waits for the workers to finish and logs the collected statistics.
+  */
+ class TextExtractor implements Closeable {
+     private static final Logger log = LoggerFactory.getLogger(TextExtractor.class);
+     private static final Logger parserError = LoggerFactory.getLogger("org.apache.jackrabbit.oak.plugins.tika.ParserError");
+     private static final int PROGRESS_BATCH_SIZE = 1000;
+     private static final int MAX_EXTRACT_LENGTH = 100000;
+     private static final String ERROR_TEXT = "TextExtractionError";
+
+     /** Marker item compared by identity; tells a worker to stop. */
+     private static final WorkItem SHUTDOWN_SIGNAL = new WorkItem(null);
+
+     private final TextWriter textWriter;
+
+     private BlockingQueue<WorkItem> inputQueue;
+     private ExecutorService executorService;
+     private int threadPoolSize = Runtime.getRuntime().availableProcessors();
+     private int queueSize = 100;
+
+     private final AtomicInteger errorCount = new AtomicInteger();
+     private final AtomicLong timeTaken = new AtomicLong();
+     private final AtomicInteger extractionCount = new AtomicInteger();
+     private final AtomicInteger textWrittenCount = new AtomicInteger();
+     private final AtomicInteger parserErrorCount = new AtomicInteger();
+     private final AtomicInteger processedCount = new AtomicInteger();
+     private final AtomicInteger emptyCount = new AtomicInteger();
+     private final AtomicInteger notSupportedCount = new AtomicInteger();
+     private final AtomicInteger alreadyExtractedCount = new AtomicInteger();
+     private final AtomicLong extractedTextSize = new AtomicLong();
+     private final AtomicLong nonEmptyExtractedTextSize = new AtomicLong();
+     private final AtomicLong totalSizeRead = new AtomicLong();
+
+     private int maxExtractedLength = MAX_EXTRACT_LENGTH;
+     private File tikaConfig;
+     private TikaHelper tika;
+     private boolean initialized;
+     private BinaryStats stats;
+     private boolean closed;
+
+     public TextExtractor(TextWriter textWriter) {
+         this.textWriter = textWriter;
+     }
+
+     /**
+      * Queues the given binaries for text extraction, lazily starting the
+      * worker pool on first use. Blocks while the bounded queue is full.
+      *
+      * @param binaries binaries to process
+      * @throws InterruptedException if interrupted while waiting for queue space
+      * @throws IOException if the extractor could not be initialized
+      */
+     public void extract(Iterable<BinaryResource> binaries) throws InterruptedException, IOException {
+         initialize();
+         for (BinaryResource binary : binaries) {
+             inputQueue.put(new WorkItem(binary));
+         }
+     }
+
+     /**
+      * Signals the workers to stop once the queue is drained, waits for them
+      * to terminate and logs the statistics. Safe to call more than once and
+      * safe to call when {@link #extract(Iterable)} was never invoked.
+      */
+     @Override
+     public void close() {
+         if (closed) {
+             return;
+         }
+         // The queue only exists after initialize(); close() can be reached
+         // (e.g. through a Closer) without extract() ever having run.
+         if (inputQueue != null && !inputQueue.isEmpty()) {
+             log.info("Shutting down the extractor. Pending task count {}", inputQueue.size());
+         }
+
+         if (executorService != null) {
+             try {
+                 inputQueue.put(SHUTDOWN_SIGNAL);
+                 executorService.shutdown();
+                 //Wait long enough
+                 executorService.awaitTermination(10, TimeUnit.DAYS);
+             } catch (InterruptedException e) {
+                 // The shutdown signal may never have been queued, in which case
+                 // the workers would block on take() forever; interrupt them.
+                 executorService.shutdownNow();
+                 Thread.currentThread().interrupt();
+             }
+         }
+         dumpStats();
+         closed = true;
+     }
+
+     public void setTikaConfig(File tikaConfig) {
+         this.tikaConfig = tikaConfig;
+     }
+
+     /**
+      * Sets the number of worker threads. Only has an effect when called
+      * before the first {@link #extract(Iterable)} call.
+      *
+      * @param threadPoolSize number of threads, must be positive
+      * @throws IllegalArgumentException if the size is not positive
+      */
+     public void setThreadPoolSize(int threadPoolSize) {
+         if (threadPoolSize <= 0) {
+             throw new IllegalArgumentException("Thread pool size must be positive: " + threadPoolSize);
+         }
+         this.threadPoolSize = threadPoolSize;
+     }
+
+     public void setStats(BinaryStats stats) {
+         this.stats = stats;
+     }
+
+     /** Logs a summary of all counters collected so far. */
+     private void dumpStats() {
+         StringWriter sw = new StringWriter();
+         PrintWriter pw = new PrintWriter(sw);
+         pw.println("Text extraction stats");
+         pw.printf("\t Processed Count : %d%n", processedCount.get());
+         pw.printf("\t Extraction Count : %d%n", extractionCount.get());
+         pw.printf("\t Empty Count : %d%n", emptyCount.get());
+         pw.printf("\t Text Written Count : %d%n", textWrittenCount.get());
+         pw.printf("\t Parser Error Count : %d%n", parserErrorCount.get());
+         pw.printf("\t Error Count : %d%n", errorCount.get());
+         pw.printf("\t Not Supported Count : %d%n", notSupportedCount.get());
+         pw.printf("\t Already processed Count : %d%n", alreadyExtractedCount.get());
+         pw.printf("\t Total bytes read : %s%n", IOUtils.humanReadableByteCount(totalSizeRead.get()));
+         pw.printf("\t Total text extracted : %s%n", IOUtils.humanReadableByteCount(extractedTextSize.get()));
+         pw.printf("\t Non empty text : %s%n", IOUtils.humanReadableByteCount(nonEmptyExtractedTextSize.get()));
+         pw.printf("\t Time taken : %d sec%n", timeTaken.get() / 1000);
+         pw.close();
+         log.info(sw.toString());
+     }
+
+     /**
+      * Logs a progress line every {@link #PROGRESS_BATCH_SIZE} processed
+      * binaries, including percentages when stats have been supplied.
+      */
+     private void dumpProgress(int count) {
+         if (count % PROGRESS_BATCH_SIZE == 0) {
+             String progress = "";
+             if (stats != null) {
+                 double processedPercent = percentage(count, stats.getTotalCount());
+                 double indexedPercent = percentage(extractionCount.get(), stats.getIndexedCount());
+                 progress = String.format("(%1.2f%%) (Extraction stats %d/%d %1.2f%%, Ignored count %d)",
+                         processedPercent, extractionCount.get(), stats.getIndexedCount(),
+                         indexedPercent, notSupportedCount.get());
+             }
+             log.info("Processed {} {} binaries so far ...", count, progress);
+         }
+     }
+
+     /** Returns part/total as a percentage, or 0 when total is not positive (avoids NaN/Infinity). */
+     private static double percentage(double part, double total) {
+         return total > 0 ? part * 1.0 / total * 100 : 0;
+     }
+
+     /** Creates the queue, the Tika helper and the worker pool exactly once. */
+     private synchronized void initialize() throws IOException {
+         if (initialized) {
+             return;
+         }
+         inputQueue = new ArrayBlockingQueue<WorkItem>(queueSize);
+         tika = new TikaHelper(tikaConfig);
+         initializeExecutorService();
+         initialized = true;
+     }
+
+     /**
+      * Extracts the text of a single binary and records the outcome with the
+      * text writer. Binaries with a missing or unsupported mime type and blobs
+      * the writer already knows about are skipped.
+      */
+     private void extractText(BinaryResource source) throws IOException {
+         String type = source.getMimeType();
+         if (type == null || !tika.isSupportedMediaType(type)) {
+             log.trace("Ignoring binary content for node {} due to unsupported " +
+                     "(or null) jcr:mimeType [{}]", source, type);
+             notSupportedCount.incrementAndGet();
+             return;
+         }
+
+         String blobId = source.getBlobId();
+         if (textWriter.isProcessed(blobId)) {
+             alreadyExtractedCount.incrementAndGet();
+             return;
+         }
+
+         //TODO Handle case where same blob is being concurrently processed
+         Metadata metadata = new Metadata();
+         metadata.set(Metadata.CONTENT_TYPE, type);
+         if (source.getEncoding() != null) { // not mandatory
+             metadata.set(Metadata.CONTENT_ENCODING, source.getEncoding());
+         }
+
+         String extractedContent = parseStringValue(source.getByteSource(), metadata, source.getPath());
+         if (ERROR_TEXT.equals(extractedContent)) {
+             textWriter.markError(blobId);
+         } else if (extractedContent != null) {
+             extractedContent = extractedContent.trim();
+             if (!extractedContent.isEmpty()) {
+                 nonEmptyExtractedTextSize.addAndGet(extractedContent.length());
+                 textWriter.write(blobId, extractedContent);
+                 textWrittenCount.incrementAndGet();
+             } else {
+                 textWriter.markEmpty(blobId);
+                 emptyCount.incrementAndGet();
+             }
+         }
+     }
+
+     /** Starts one long-running {@link Extractor} per pool thread. */
+     private void initializeExecutorService() {
+         executorService = Executors.newFixedThreadPool(threadPoolSize);
+         for (int i = 0; i < threadPoolSize; i++) {
+             executorService.submit(new Extractor());
+         }
+         log.info("Initialized text extractor pool with {} threads", threadPoolSize);
+     }
+
+     /**
+      * Worker loop: takes items from the queue until the shutdown signal is
+      * seen or the thread is interrupted. Failures of a single item are
+      * counted and logged without stopping the worker.
+      */
+     private class Extractor implements Runnable {
+         @Override
+         public void run() {
+             while (true) {
+                 WorkItem workItem = null;
+                 try {
+                     workItem = inputQueue.take();
+                     if (workItem == SHUTDOWN_SIGNAL) {
+                         inputQueue.put(SHUTDOWN_SIGNAL); //put back for other workers
+                         return;
+                     }
+                     extractText(workItem.source);
+                     dumpProgress(processedCount.incrementAndGet());
+                 } catch (InterruptedException e) {
+                     Thread.currentThread().interrupt();
+                     return;
+                 } catch (Exception e) {
+                     errorCount.incrementAndGet();
+                     log.warn("Error occurred while processing {}", workItem, e);
+                 }
+             }
+         }
+     }
+
+     //~--------------------------------------< Tika >
+
+     /**
+      * Runs the Tika parser over the given byte source.
+      *
+      * @return the extracted text, {@link #ERROR_TEXT} if parsing failed for a
+      *         reason other than the write limit being reached, or
+      *         {@code null} if no bytes were read from the source
+      */
+     private String parseStringValue(ByteSource byteSource, Metadata metadata, String path) {
+         WriteOutContentHandler handler = new WriteOutContentHandler(maxExtractedLength);
+         long start = System.currentTimeMillis();
+         long size = 0;
+         try {
+             CountingInputStream stream = new CountingInputStream(new LazyInputStream(byteSource));
+             try {
+                 tika.getParser().parse(stream, handler, metadata, new ParseContext());
+             } finally {
+                 size = stream.getCount();
+                 stream.close();
+             }
+         } catch (LinkageError e) {
+             // Capture and ignore errors caused by extraction libraries
+             // not being present. This is equivalent to disabling
+             // selected media types in configuration, so we can simply
+             // ignore these errors.
+         } catch (Throwable t) {
+             // Capture and report any other full text extraction problems.
+             // The special STOP exception is used for normal termination.
+             if (!handler.isWriteLimitReached(t)) {
+                 parserErrorCount.incrementAndGet();
+                 parserError.debug("Failed to extract text from a binary property: "
+                         + path
+                         + " This is a fairly common case, and nothing to"
+                         + " worry about. The stack trace is included to"
+                         + " help improve the text extraction feature.", t);
+                 return ERROR_TEXT;
+             }
+         }
+         String result = handler.toString();
+         timeTaken.addAndGet(System.currentTimeMillis() - start);
+         if (size > 0) {
+             extractedTextSize.addAndGet(result.length());
+             extractionCount.incrementAndGet();
+             totalSizeRead.addAndGet(size);
+             return result;
+         }
+
+         return null;
+     }
+
+     //~--------------------------------------< WorkItem >
+
+     /** Queue entry wrapping the binary to process; the source is null only for the shutdown signal. */
+     private static class WorkItem {
+         final BinaryResource source;
+
+         private WorkItem(BinaryResource source) {
+             this.source = source;
+         }
+
+         @Override
+         public String toString() {
+             return source != null ? source.toString() : "<EMPTY>";
+         }
+     }
+
+ }
Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import java.io.Closeable;
+import java.io.File;
+import java.util.List;
+
+import com.google.common.io.Closer;
+import joptsimple.OptionParser;
+import joptsimple.OptionSet;
+import joptsimple.OptionSpec;
+import org.apache.jackrabbit.core.data.FileDataStore;
+import org.apache.jackrabbit.oak.plugins.blob.datastore.DataStoreBlobStore;
+import org.apache.jackrabbit.oak.plugins.blob.datastore.DataStoreTextWriter;
+import org.apache.jackrabbit.oak.spi.blob.BlobStore;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+import static java.util.Arrays.asList;
+
+ /**
+  * Command line entry point of the 'tika' run mode. Depending on the
+  * non-option arguments it logs a summary report of the binaries
+  * ({@code report}) and/or extracts their text ({@code extract}).
+  */
+ public class TextExtractorMain {
+     private static final Logger log = LoggerFactory.getLogger(TextExtractorMain.class);
+
+     /**
+      * Parses the command line and runs the requested actions.
+      *
+      * @param args command line arguments
+      * @throws Exception if argument validation or any of the actions fails
+      */
+     public static void main(String[] args) throws Exception {
+         Closer closer = Closer.create();
+         String usage = "tika [extract|report|generate]\n" +
+                 "\n" +
+                 "report : Generates a summary report related to binary data\n" +
+                 "extract : Performs the text extraction\n" +
+                 "generate : Generates the csv data file based on configured NodeStore/BlobStore";
+         try {
+             OptionParser parser = new OptionParser();
+             OptionSpec<?> help = parser.acceptsAll(asList("h", "?", "help"),
+                     "show help").forHelp();
+
+             // Declared for the help output and error messages; not consumed yet (see TODO below)
+             OptionSpec<String> nodeStoreSpec = parser
+                     .accepts("nodestore", "NodeStore detail /path/to/oak/repository | mongodb://host:port/database")
+                     .withRequiredArg()
+                     .ofType(String.class);
+
+             OptionSpec<String> pathSpec = parser
+                     .accepts("path", "Path in repository under which the binaries would be searched")
+                     .withRequiredArg()
+                     .ofType(String.class);
+
+             OptionSpec<File> dataFileSpec = parser
+                     .accepts("data-file", "Data file in csv format containing the binary metadata")
+                     .withRequiredArg()
+                     .ofType(File.class);
+
+             OptionSpec<File> tikaConfigSpec = parser
+                     .accepts("tika-config", "Tika config file path")
+                     .withRequiredArg()
+                     .ofType(File.class);
+
+             OptionSpec<File> fdsDirSpec = parser
+                     .accepts("fds-path", "Path of directory used by FileDataStore")
+                     .withRequiredArg()
+                     .ofType(File.class);
+
+             OptionSpec<File> storeDirSpec = parser
+                     .accepts("store-path", "Path of directory used to store extracted text content")
+                     .withRequiredArg()
+                     .ofType(File.class);
+
+             OptionSpec<Integer> poolSize = parser
+                     .accepts("pool-size", "Size of the thread pool used to perform text extraction. Defaults " +
+                             "to number of cores on the system")
+                     .withRequiredArg()
+                     .ofType(Integer.class);
+
+             //TODO implement generate support
+
+             OptionSpec<String> nonOption = parser.nonOptions(usage);
+
+             OptionSet options = parser.parse(args);
+             List<String> nonOptions = nonOption.values(options);
+
+             if (options.has(help)) {
+                 parser.printHelpOn(System.out);
+                 System.exit(0);
+             }
+
+             // At least one action (report/extract) must be named
+             if (nonOptions.isEmpty()) {
+                 parser.printHelpOn(System.err);
+                 System.exit(1);
+             }
+
+             boolean report = nonOptions.contains("report");
+             boolean extract = nonOptions.contains("extract");
+             File dataFile;
+             File fdsDir;
+             File storeDir = null;
+             File tikaConfigFile = null;
+             BlobStore blobStore = null;
+             BinaryResourceProvider binaryResourceProvider = null;
+             BinaryStats stats = null;
+             String path = "/";
+
+             if (options.has(tikaConfigSpec)) {
+                 tikaConfigFile = tikaConfigSpec.value(options);
+                 checkArgument(tikaConfigFile.exists(), "Tika config file %s does not exist",
+                         tikaConfigFile.getAbsolutePath());
+             }
+
+             if (options.has(storeDirSpec)) {
+                 storeDir = storeDirSpec.value(options);
+                 // A not yet existing store directory is accepted; an existing path must be a directory
+                 if (storeDir.exists()) {
+                     checkArgument(storeDir.isDirectory(), "Path [%s] specified for storing extracted " +
+                             "text content '%s' is not a directory", storeDir.getAbsolutePath(), storeDirSpec.options());
+                 }
+             }
+
+             if (options.has(fdsDirSpec)) {
+                 fdsDir = fdsDirSpec.value(options);
+                 checkArgument(fdsDir.exists(), "FileDataStore %s does not exist", fdsDir.getAbsolutePath());
+                 FileDataStore fds = new FileDataStore();
+                 fds.setPath(fdsDir.getAbsolutePath());
+                 fds.init(null);
+                 blobStore = new DataStoreBlobStore(fds);
+             }
+
+             if (options.has(dataFileSpec)) {
+                 dataFile = dataFileSpec.value(options);
+                 checkArgument(dataFile.exists(), "Data file %s does not exist", dataFile.getAbsolutePath());
+                 binaryResourceProvider = new CSVFileBinaryResourceProvider(dataFile, blobStore);
+             }
+
+             if (binaryResourceProvider instanceof Closeable) {
+                 closer.register((Closeable) binaryResourceProvider);
+             }
+
+             if (report || extract) {
+                 checkNotNull(binaryResourceProvider, "BinaryProvider source must be specified either " +
+                         "via '%s' or '%s'", dataFileSpec.options(), nodeStoreSpec.options());
+
+                 stats = new BinaryStats(tikaConfigFile, binaryResourceProvider);
+                 String summary = stats.getSummary();
+                 log.info(summary);
+             }
+
+             if (extract) {
+                 checkNotNull(storeDir, "Directory to store extracted text content " +
+                         "must be specified via %s", storeDirSpec.options());
+                 checkNotNull(blobStore, "BlobStore found to be null. FileDataStore directory " +
+                         "must be specified via %s", fdsDirSpec.options());
+
+                 DataStoreTextWriter writer = new DataStoreTextWriter(storeDir, false);
+                 TextExtractor extractor = new TextExtractor(writer);
+
+                 if (options.has(poolSize)) {
+                     extractor.setThreadPoolSize(poolSize.value(options));
+                 }
+
+                 if (tikaConfigFile != null) {
+                     extractor.setTikaConfig(tikaConfigFile);
+                 }
+
+                 if (options.has(pathSpec)) {
+                     path = pathSpec.value(options);
+                 }
+
+                 // Registered so that both get closed if the extraction below fails
+                 closer.register(writer);
+                 closer.register(extractor);
+
+                 extractor.setStats(stats);
+                 log.info("Using path {}", path);
+                 extractor.extract(binaryResourceProvider.getBinaries(path));
+
+                 // Close the extractor first so the workers finish before the writer is closed
+                 extractor.close();
+                 writer.close();
+             }
+
+         } catch (Throwable e) {
+             throw closer.rethrow(e);
+         } finally {
+             closer.close();
+         }
+     }
+ }
Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelper.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelper.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelper.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelper.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URL;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import javax.annotation.Nullable;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.EmptyParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.SAXException;
+
+ /**
+  * Wraps a Tika {@link AutoDetectParser} built from either an external
+  * config file, the default config resource on the classpath, or Tika's
+  * built-in defaults, and answers questions about the media types it supports.
+  */
+ class TikaHelper {
+     private static final String DEFAULT_TIKA_CONFIG = "/org/apache/jackrabbit/oak/plugins/index/lucene/tika-config.xml";
+     private static final Logger log = LoggerFactory.getLogger(TikaHelper.class);
+     /** Ensures the supported types are logged only once, however many helpers get created. */
+     private static final AtomicBoolean supportedTypesLogged = new AtomicBoolean();
+
+     private final AutoDetectParser parser;
+     private final Set<MediaType> supportedMediaTypes;
+
+     /**
+      * @param tikaConfig external Tika config file, or {@code null} to use the defaults
+      * @throws IOException if the config could not be read
+      * @throws RuntimeException wrapping a {@link TikaException} or
+      *         {@link SAXException} raised while loading the config
+      */
+     public TikaHelper(@Nullable File tikaConfig) throws IOException {
+         try {
+             parser = new AutoDetectParser(getTikaConfig(tikaConfig));
+             supportedMediaTypes = parser.getSupportedTypes(new ParseContext());
+             logSupportedTypesOnce(supportedMediaTypes);
+         } catch (TikaException e) {
+             throw new RuntimeException(e);
+         } catch (SAXException e) {
+             throw new RuntimeException(e);
+         }
+     }
+
+     public Parser getParser() {
+         return parser;
+     }
+
+     /**
+      * @param type mimeType to check
+      * @return true if the type can be parsed as a media type and the
+      *         configured parser supports it
+      */
+     public boolean isSupportedMediaType(String type) {
+         // parse() may return null for a type it cannot parse; do not rely on
+         // the backing set tolerating a null lookup
+         MediaType mediaType = MediaType.parse(type);
+         return mediaType != null && supportedMediaTypes.contains(mediaType);
+     }
+
+     /**
+      * This method should only be used for information purpose and not be relied
+      * upon to determine if the given type is indexed or not. It relies on Tika
+      * implementation detail to determine if a given type is meant to be indexed
+      *
+      * @param type mimeType to check
+      * @return true if the given type is supported and indexed
+      */
+     public boolean isIndexed(String type) {
+         if (!isSupportedMediaType(type)) {
+             return false;
+         }
+
+         MediaType mediaType = MediaType.parse(type);
+         Parser p = getSupportingParser(parser, mediaType);
+         if (p == null) {
+             return false;
+         }
+         // A type mapped to the EmptyParser is supported but yields no text
+         return !(unwrap(p) instanceof EmptyParser);
+     }
+
+     /**
+      * Loads the Tika config: the external file when given, else the default
+      * config resource if present on the classpath, else Tika's default config.
+      */
+     private static TikaConfig getTikaConfig(File tikaConfig) throws TikaException, IOException, SAXException {
+         TikaConfig config;
+         if (tikaConfig == null) {
+             URL configUrl = TextExtractor.class.getResource(DEFAULT_TIKA_CONFIG);
+             if (configUrl != null) {
+                 log.info("Loading default Tika config from {}", configUrl);
+                 config = new TikaConfig(configUrl);
+             } else {
+                 log.info("Using default Tika config");
+                 config = TikaConfig.getDefaultConfig();
+             }
+         } else {
+             log.info("Loading external Tika config from {}", tikaConfig);
+             config = new TikaConfig(tikaConfig);
+         }
+         return config;
+     }
+
+     /**
+      * Descends through nested {@link CompositeParser}s to the parser
+      * registered for the media type; returns {@code null} if none is found.
+      */
+     private static Parser getSupportingParser(Parser p, MediaType mediaType) {
+         if (p instanceof CompositeParser) {
+             Map<MediaType, Parser> parsers = ((CompositeParser) p).getParsers();
+             return getSupportingParser(parsers.get(mediaType), mediaType);
+         }
+         return p;
+     }
+
+     /** Strips any {@link ParserDecorator} layers and returns the innermost parser. */
+     private static Parser unwrap(Parser p) {
+         if (p instanceof ParserDecorator) {
+             return unwrap(((ParserDecorator) p).getWrappedParser());
+         }
+         return p;
+     }
+
+     private static void logSupportedTypesOnce(Set<MediaType> supportedMediaTypes) {
+         boolean alreadyLogged = supportedTypesLogged.getAndSet(true);
+         if (!alreadyLogged) {
+             log.info("Supported media types {}", supportedMediaTypes);
+         }
+     }
+ }
Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelper.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/run/Main.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/run/Main.java?rev=1690249&r1=1690248&r2=1690249&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/run/Main.java (original)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/run/Main.java Fri Jul 10 11:46:03 2015
@@ -101,6 +101,7 @@ import org.apache.jackrabbit.oak.plugins
import org.apache.jackrabbit.oak.plugins.segment.standby.server.StandbyServer;
import org.apache.jackrabbit.oak.remote.content.ContentRemoteRepository;
import org.apache.jackrabbit.oak.remote.http.RemoteServlet;
+import org.apache.jackrabbit.oak.plugins.tika.TextExtractorMain;
import org.apache.jackrabbit.oak.scalability.ScalabilityRunner;
import org.apache.jackrabbit.oak.spi.state.NodeState;
import org.apache.jackrabbit.oak.spi.state.NodeStore;
@@ -189,6 +190,9 @@ public final class Main {
case REPAIR:
repair(args);
break;
+ case TIKA:
+ TextExtractorMain.main(args);
+ break;
case HELP:
default:
System.err.print("Available run modes: ");
@@ -1180,7 +1184,8 @@ public final class Main {
HELP("help"),
CHECKPOINTS("checkpoints"),
RECOVERY("recovery"),
- REPAIR("repair");
+ REPAIR("repair"),
+ TIKA("tika");
private final String name;
Modified: jackrabbit/oak/trunk/oak-run/src/main/resources/logback.xml
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/resources/logback.xml?rev=1690249&r1=1690248&r2=1690249&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/resources/logback.xml (original)
+++ jackrabbit/oak/trunk/oak-run/src/main/resources/logback.xml Fri Jul 10 11:46:03 2015
@@ -36,6 +36,8 @@
<!-- Display info messages from the scalability suite -->
<logger name="org.apache.jackrabbit.oak.scalability" level="INFO"/>
+ <logger name="org.apache.jackrabbit.oak.plugins.tika" level="INFO"/>
+
<logger name="org.apache.jackrabbit.oak.plugins.segment.file.tooling.ConsistencyChecker" level="DEBUG"/>
<root level="warn">
Added: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/BinarySourceMapper.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/BinarySourceMapper.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/BinarySourceMapper.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/BinarySourceMapper.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import com.google.common.base.Function;
+
+ /**
+  * Test helper functions mapping a {@link BinaryResource} to one of its
+  * string attributes, e.g. for use as a key function when indexing a
+  * collection of resources.
+  */
+ public enum BinarySourceMapper implements Function<BinaryResource, String> {
+ /** Maps a resource to its blob id. */
+ BY_BLOBID {
+ @Override
+ public String apply(BinaryResource input) {
+ return input.getBlobId();
+ }
+ },
+
+ /** Maps a resource to its path. */
+ BY_PATH {
+ @Override
+ public String apply(BinaryResource input) {
+ return input.getPath();
+ }
+ }
+
+ }
Propchange: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/BinarySourceMapper.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProviderTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProviderTest.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProviderTest.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProviderTest.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import java.io.File;
+import java.util.Map;
+
+import com.google.common.base.Charsets;
+import com.google.common.io.Files;
+import org.apache.commons.csv.CSVPrinter;
+import org.apache.jackrabbit.oak.spi.blob.MemoryBlobStore;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import static org.junit.Assert.assertEquals;
+
+ /**
+  * Checks that {@link CSVFileBinaryResourceProvider} reads binaries from a
+  * csv data file and restricts them to the requested path.
+  */
+ public class CSVFileBinaryResourceProviderTest {
+
+     @Rule
+     public final TemporaryFolder temporaryFolder = new TemporaryFolder();
+
+     @Test
+     public void testGetBinaries() throws Exception {
+         StringBuilder sb = new StringBuilder();
+         CSVPrinter p = new CSVPrinter(sb, CSVFileBinaryResourceProvider.FORMAT);
+         // BLOB_ID, LENGTH, JCR_MIMETYPE, JCR_ENCODING, JCR_PATH
+         p.printRecord("a", 123, "text/plain", null, "/a");
+         p.printRecord("a2", 123, "text/plain", null, "/a/c");
+         p.printRecord("b", null, "text/plain", null, "/b");
+         p.printRecord(null, null, "text/plain", null, "/c");
+
+         File dataFile = temporaryFolder.newFile();
+         Files.write(sb, dataFile, Charsets.UTF_8);
+
+         CSVFileBinaryResourceProvider provider = new CSVFileBinaryResourceProvider(dataFile, new MemoryBlobStore());
+         // Close the provider even when an assertion fails so the data file
+         // is released before the temporary folder gets cleaned up
+         try {
+             // The record without a blob id is expected to be left out
+             Map<String, BinaryResource> binaries = provider.getBinaries("/").uniqueIndex(BinarySourceMapper.BY_BLOBID);
+             assertEquals(3, binaries.size());
+             assertEquals("a", binaries.get("a").getBlobId());
+             assertEquals("/a", binaries.get("a").getPath());
+
+             binaries = provider.getBinaries("/a").uniqueIndex(BinarySourceMapper.BY_BLOBID);
+             assertEquals(1, binaries.size());
+         } finally {
+             provider.close();
+         }
+     }
+ }
\ No newline at end of file
Propchange: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileBinaryResourceProviderTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import org.apache.jackrabbit.JcrConstants;
+import org.apache.jackrabbit.oak.api.Blob;
+import org.apache.jackrabbit.oak.plugins.memory.ArrayBasedBlob;
+import org.apache.jackrabbit.oak.plugins.memory.MemoryNodeStore;
+import org.apache.jackrabbit.oak.spi.blob.BlobStore;
+import org.apache.jackrabbit.oak.spi.blob.MemoryBlobStore;
+import org.apache.jackrabbit.oak.spi.state.NodeBuilder;
+import org.apache.jackrabbit.oak.spi.state.NodeState;
+import org.apache.jackrabbit.oak.spi.state.NodeStore;
+import org.junit.Test;
+
+import static org.apache.jackrabbit.JcrConstants.JCR_CONTENT;
+import static org.apache.jackrabbit.oak.plugins.nodetype.write.InitialContent.INITIAL_CONTENT;
+import static org.junit.Assert.assertEquals;
+
+public class NodeStoreBinaryResourceProviderTest {
+    private final NodeState root = INITIAL_CONTENT;
+
+    /** Three file nodes are created but two binaries are expected under "/"; presumably the blob without an id is skipped. */
+    @Test
+    public void countBinaries() throws Exception {
+        NodeBuilder builder = root.builder();
+        createFileNode(builder, "a", new IdBlob("hello", null), "text/plain");
+        createFileNode(builder, "b", new IdBlob("hello", "id1"), "text/plain");
+        createFileNode(builder.child("a2"), "c", new IdBlob("hello", "id2"), "text/foo")
+                .setProperty(JcrConstants.JCR_ENCODING, "bar");
+
+        NodeStore store = new MemoryNodeStore(builder.getNodeState());
+        BlobStore blobStore = new MemoryBlobStore();
+        NodeStoreBinaryResourceProvider extractor = new NodeStoreBinaryResourceProvider(store, blobStore);
+
+        assertEquals(2, extractor.getBinaries("/").size());
+        assertEquals(1, extractor.getBinaries("/a2").size());
+
+        BinaryResource bs = extractor.getBinaries("/a2").first().get();
+        assertEquals("text/foo", bs.getMimeType());
+        assertEquals("bar", bs.getEncoding());
+        assertEquals("id2", bs.getBlobId());
+    }
+
+    /** Creates {@code name/JCR_CONTENT} below {@code base}, stores the binary and mime type on it and returns that builder. */
+    private NodeBuilder createFileNode(NodeBuilder base, String name, Blob content, String mimeType) {
+        NodeBuilder jcrContent = base.child(name).child(JCR_CONTENT);
+        jcrContent.setProperty(JcrConstants.JCR_DATA, content);
+        jcrContent.setProperty(JcrConstants.JCR_MIMETYPE, mimeType);
+        return jcrContent;
+    }
+
+    /** Blob with a caller-supplied content identity (may be null); bytes are UTF-8, independent of the platform charset. */
+    private static class IdBlob extends ArrayBasedBlob {
+        final String id;
+        public IdBlob(String value, String id) {
+            super(value.getBytes(java.nio.charset.Charset.forName("UTF-8")));
+            this.id = id;
+        }
+
+        @Override
+        public String getContentIdentity() {
+            return id;
+        }
+    }
+}
Propchange: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorTest.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorTest.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorTest.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.collect.Maps;
+import com.google.common.io.ByteSource;
+import org.apache.jackrabbit.oak.plugins.blob.datastore.TextWriter;
+import org.junit.Test;
+
+import static java.util.Arrays.asList;
+import static org.junit.Assert.assertEquals;
+
+public class TextExtractorTest {
+    /** Extracts two plain-text binaries and expects two writer entries, the one for "b" trimming to "foo". */
+    @Test
+    public void basicWorking() throws Exception {
+        MapTextWriter writer = new MapTextWriter();
+        TextExtractor extractor = new TextExtractor(writer);
+        List<BinaryResource> binaries = asList(
+                bin("hello", "text/plain", "a"),
+                bin("foo", "text/plain", "b")
+        );
+        try {
+            extractor.extract(binaries);
+        } finally {
+            // always closed, and still before the assertions; presumably close() waits for pending work - TODO confirm
+            extractor.close();
+        }
+        assertEquals(2, writer.data.size());
+        assertEquals("foo", writer.data.get("b").trim());
+    }
+
+    /** Builds a resource whose content is {@code text} encoded as UTF-8; {@code id} is passed for both trailing arguments. */
+    private static BinaryResource bin(String text, String mime, String id) {
+        return new BinaryResource(ByteSource.wrap(text.getBytes(java.nio.charset.Charset.forName("UTF-8"))), mime, null, id, id);
+    }
+
+    /** Writer that keeps written text in a map keyed by blob id; markEmpty and markError do nothing. */
+    private static class MapTextWriter implements TextWriter {
+        final Map<String, String> data = Maps.newConcurrentMap();
+        @Override
+        public void write(String blobId, String text) throws IOException {
+            data.put(blobId, text);
+        }
+
+        @Override
+        public void markEmpty(String blobId) {
+        }
+
+        @Override
+        public void markError(String blobId) {
+        }
+
+        @Override
+        public boolean isProcessed(String blobId) {
+            return data.containsKey(blobId);
+        }
+    }
+}
Propchange: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelperTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelperTest.java?rev=1690249&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelperTest.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelperTest.java Fri Jul 10 11:46:03 2015
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import java.io.File;
+
+import com.google.common.base.Charsets;
+import com.google.common.io.Files;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+public class TikaHelperTest {
+    @Rule
+    public final TemporaryFolder temporaryFolder = new TemporaryFolder();
+
+    /** A helper constructed with a {@code null} config file is expected to support text/plain. */
+    @Test
+    public void supportedTypes() throws Exception {
+        assertTrue(new TikaHelper(null).isSupportedMediaType("text/plain"));
+    }
+
+    /** With a config assigning EmptyParser to application/xml, that type is expected to be reported as not indexed. */
+    @Test
+    public void indexedTypes() throws Exception {
+        String tikaConfigXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
+                "<properties>\n" +
+                " <detectors>\n" +
+                " <detector class=\"org.apache.tika.detect.DefaultDetector\"/>\n" +
+                " </detectors>\n" +
+                " <parsers>\n" +
+                " <parser class=\"org.apache.tika.parser.DefaultParser\"/>\n" +
+                " <parser class=\"org.apache.tika.parser.EmptyParser\">\n" +
+                " <mime>application/xml</mime>\n" +
+                " </parser>\n" +
+                " </parsers>\n" +
+                "</properties>";
+
+        File configFile = temporaryFolder.newFile();
+        Files.write(tikaConfigXml, configFile, Charsets.UTF_8);
+        assertFalse(new TikaHelper(configFile).isIndexed("application/xml"));
+    }
+}
\ No newline at end of file
Propchange: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TikaHelperTest.java
------------------------------------------------------------------------------
svn:eol-style = native