You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by ch...@apache.org on 2015/07/13 13:27:54 UTC

svn commit: r1690636 - in /jackrabbit/oak/trunk/oak-run/src: main/java/org/apache/jackrabbit/oak/plugins/tika/ test/java/org/apache/jackrabbit/oak/plugins/tika/

Author: chetanm
Date: Mon Jul 13 11:27:54 2015
New Revision: 1690636

URL: http://svn.apache.org/r1690636
Log:
OAK-2953 - Implement text extractor as part of oak-run

Add support for generating the csv file by connecting to a NodeStore

Added:
    jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileGenerator.java   (with props)
Modified:
    jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java
    jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java
    jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java

Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileGenerator.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileGenerator.java?rev=1690636&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileGenerator.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileGenerator.java Mon Jul 13 11:27:54 2015
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import java.io.File;
+import java.io.IOException;
+
+import com.google.common.base.Charsets;
+import com.google.common.collect.FluentIterable;
+import com.google.common.io.Closer;
+import com.google.common.io.Files;
+import org.apache.commons.csv.CSVFormat;
+import org.apache.commons.csv.CSVPrinter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** Writes one CSV record (without a header row) per {@link BinaryResource} to a file. */
+public class CSVFileGenerator {
+    // Own header-less format rather than the FORMAT from CSVFileBinaryResourceProvider,
+    // as commons-csv would otherwise always add the header
+    private static final CSVFormat FORMAT = CSVFormat.DEFAULT
+            .withCommentMarker('#')
+            .withNullString("") //Empty strings are considered as null
+            .withIgnoreSurroundingSpaces();
+    private final Logger log = LoggerFactory.getLogger(getClass());
+    private final File outFile;
+
+    public CSVFileGenerator(File outFile) {
+        this.outFile = outFile;
+    }
+
+    /**
+     * Writes blob id, size, mime type, encoding and path of every binary to the output file.
+     * @param binaries resources to write, one record each
+     * @throws IOException if writing to or closing the file fails
+     */
+    public void generate(FluentIterable<BinaryResource> binaries) throws IOException {
+        Closer closer = Closer.create();
+        int count = 0;
+        try {
+            //Register the writer so that closer.close() actually releases the file
+            CSVPrinter printer = new CSVPrinter(
+                    closer.register(Files.newWriter(outFile, Charsets.UTF_8)), FORMAT);
+            for (BinaryResource br : binaries) {
+                count++;
+                printer.printRecord(br.getBlobId(), br.getByteSource().size(),
+                        br.getMimeType(), br.getEncoding(), br.getPath());
+            }
+            printer.flush();
+            log.info("Generated csv output at {} with {} entries", outFile.getAbsolutePath(), count);
+        } finally {
+            closer.close();
+        }
+    }
+}

Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileGenerator.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java?rev=1690636&r1=1690635&r2=1690636&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java (original)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProvider.java Mon Jul 13 11:27:54 2015
@@ -72,7 +72,9 @@ class NodeStoreBinaryResourceProvider im
 
             Blob blob = data.getValue(Type.BINARY);
             String blobId = blob.getContentIdentity();
-            if (blobId == null) {
+            //Check for ref being non null to ensure its not an inlined binary
+            //For Segment ContentIdentity defaults to RecordId
+            if (blob.getReference() == null || blobId == null) {
                 log.debug("Ignoring jcr:data property at {} as its an inlined blob", tree.getPath());
                 return null;
             }

Modified: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java?rev=1690636&r1=1690635&r2=1690636&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java (original)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java Mon Jul 13 11:27:54 2015
@@ -21,16 +21,26 @@ package org.apache.jackrabbit.oak.plugin
 
 import java.io.Closeable;
 import java.io.File;
+import java.io.IOException;
 import java.util.List;
 
 import com.google.common.io.Closer;
+import com.mongodb.MongoClientURI;
+import com.mongodb.MongoURI;
 import joptsimple.OptionParser;
 import joptsimple.OptionSet;
 import joptsimple.OptionSpec;
 import org.apache.jackrabbit.core.data.FileDataStore;
 import org.apache.jackrabbit.oak.plugins.blob.datastore.DataStoreBlobStore;
 import org.apache.jackrabbit.oak.plugins.blob.datastore.DataStoreTextWriter;
+import org.apache.jackrabbit.oak.plugins.document.DocumentMK;
+import org.apache.jackrabbit.oak.plugins.document.DocumentNodeStore;
+import org.apache.jackrabbit.oak.plugins.document.util.MongoConnection;
+import org.apache.jackrabbit.oak.plugins.segment.SegmentNodeStore;
+import org.apache.jackrabbit.oak.plugins.segment.file.FileStore;
+import org.apache.jackrabbit.oak.run.Main;
 import org.apache.jackrabbit.oak.spi.blob.BlobStore;
+import org.apache.jackrabbit.oak.spi.state.NodeStore;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -89,8 +99,6 @@ public class TextExtractorMain {
                     .withRequiredArg()
                     .ofType(Integer.class);
 
-            //TODO implement generate support
-
             OptionSpec<String> nonOption = parser.nonOptions(h);
 
             OptionSet options = parser.parse(args);
@@ -108,7 +116,8 @@ public class TextExtractorMain {
 
             boolean report = nonOptions.contains("report");
             boolean extract = nonOptions.contains("extract");
-            File dataFile;
+            boolean generate = nonOptions.contains("generate");
+            File dataFile = null;
             File fdsDir;
             File storeDir = null;
             File tikaConfigFile = null;
@@ -142,23 +151,35 @@ public class TextExtractorMain {
 
             if (options.has(dataFileSpec)) {
                 dataFile = dataFileSpec.value(options);
-                checkArgument(dataFile.exists(), "Data file %s does not exist", dataFile.getAbsolutePath());
-                binaryResourceProvider = new CSVFileBinaryResourceProvider(dataFile, blobStore);
             }
 
-            if (binaryResourceProvider instanceof Closeable) {
-                closer.register((Closeable) binaryResourceProvider);
-            }
+            checkNotNull(dataFile, "Data file not configured with %s", dataFileSpec);
 
             if (report || extract) {
-                checkNotNull(binaryResourceProvider, "BinaryProvider source must be specified either " +
-                        "via '%s' or '%s", dataFileSpec.options(), nodeStoreSpec.options());
+                checkArgument(dataFile.exists(),
+                        "Data file %s does not exist", dataFile.getAbsolutePath());
+
+                binaryResourceProvider = new CSVFileBinaryResourceProvider(dataFile, blobStore);
+                if (binaryResourceProvider instanceof Closeable) {
+                    closer.register((Closeable) binaryResourceProvider);
+                }
 
                 stats = new BinaryStats(tikaConfigFile, binaryResourceProvider);
                 String summary = stats.getSummary();
                 log.info(summary);
             }
 
+            if (generate){
+                String src = nodeStoreSpec.value(options);
+                checkNotNull(blobStore, "BlobStore found to be null. FileDataStore directory " +
+                        "must be specified via %s", fdsDirSpec.options());
+                checkNotNull(dataFile, "Data file path not provided");
+                NodeStore nodeStore = bootStrapNodeStore(src, blobStore, closer);
+                BinaryResourceProvider brp = new NodeStoreBinaryResourceProvider(nodeStore, blobStore);
+                CSVFileGenerator generator = new CSVFileGenerator(dataFile);
+                generator.generate(brp.getBinaries(path));
+            }
+
             if (extract) {
                 checkNotNull(storeDir, "Directory to store extracted text content " +
                         "must be specified via %s", storeDirSpec.options());
@@ -197,4 +218,56 @@ public class TextExtractorMain {
             closer.close();
         }
     }
+
+    /** Opens the NodeStore at {@code src} (MongoDB URI or segment store directory); opened resources go to {@code closer}. */
+    private static NodeStore bootStrapNodeStore(String src, BlobStore blobStore,
+                                                Closer closer) throws IOException {
+        if (src == null) {
+            throw new IllegalArgumentException("NodeStore source (MongoDB URI or segment store path) not specified");
+        }
+        if (src.startsWith(MongoURI.MONGODB_PREFIX)) {
+            MongoClientURI uri = new MongoClientURI(src);
+            if (uri.getDatabase() == null) {
+                System.err.println("Database missing in MongoDB URI: " + uri.getURI());
+                System.exit(1);
+            }
+            MongoConnection mongo = new MongoConnection(uri.getURI());
+            closer.register(asCloseable(mongo));
+            DocumentNodeStore store = new DocumentMK.Builder().setBlobStore(blobStore)
+                    .setMongoDB(mongo.getDB()).getNodeStore();
+            closer.register(asCloseable(store));
+            return store;
+        }
+        FileStore fs = FileStore.newFileStore(new File(src)).withBlobStore(blobStore)
+                .withMemoryMapping(Main.TAR_STORAGE_MEMORY_MAPPED).create();
+        closer.register(asCloseable(fs));
+        return SegmentNodeStore.newSegmentNodeStore(fs).create();
+    }
+
+    /** Adapts {@code fs} to {@link Closeable}; closing the adapter closes the FileStore. */
+    private static Closeable asCloseable(final FileStore fs) {
+        return new Closeable() {
+            @Override public void close() throws IOException {
+                fs.close();
+            }
+        };
+    }
+
+    /** Adapts {@code dns} to {@link Closeable}; closing the adapter disposes the DocumentNodeStore. */
+    private static Closeable asCloseable(final DocumentNodeStore dns) {
+        return new Closeable() {
+            @Override public void close() throws IOException {
+                dns.dispose();
+            }
+        };
+    }
+
+    /** Adapts {@code con} to {@link Closeable}; closing the adapter closes the MongoConnection. */
+    private static Closeable asCloseable(final MongoConnection con) {
+        return new Closeable() {
+            @Override public void close() throws IOException {
+                con.close();
+            }
+        };
+    }
 }

Modified: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java?rev=1690636&r1=1690635&r2=1690636&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java (original)
+++ jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/NodeStoreBinaryResourceProviderTest.java Mon Jul 13 11:27:54 2015
@@ -19,8 +19,13 @@
 
 package org.apache.jackrabbit.oak.plugins.tika;
 
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+
 import org.apache.jackrabbit.JcrConstants;
 import org.apache.jackrabbit.oak.api.Blob;
+import org.apache.jackrabbit.oak.plugins.blob.BlobStoreBlob;
 import org.apache.jackrabbit.oak.plugins.memory.ArrayBasedBlob;
 import org.apache.jackrabbit.oak.plugins.memory.MemoryNodeStore;
 import org.apache.jackrabbit.oak.spi.blob.BlobStore;
@@ -28,13 +33,18 @@ import org.apache.jackrabbit.oak.spi.blo
 import org.apache.jackrabbit.oak.spi.state.NodeBuilder;
 import org.apache.jackrabbit.oak.spi.state.NodeState;
 import org.apache.jackrabbit.oak.spi.state.NodeStore;
+import org.junit.Rule;
 import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
 
 import static org.apache.jackrabbit.JcrConstants.JCR_CONTENT;
 import static org.apache.jackrabbit.oak.plugins.nodetype.write.InitialContent.INITIAL_CONTENT;
 import static org.junit.Assert.assertEquals;
 
 public class NodeStoreBinaryResourceProviderTest {
+    @Rule
+    public final TemporaryFolder temporaryFolder = new TemporaryFolder();
+
     private NodeState root = INITIAL_CONTENT;
 
     @Test
@@ -57,6 +67,24 @@ public class NodeStoreBinaryResourceProv
         assertEquals("text/foo", bs.getMimeType());
         assertEquals("bar", bs.getEncoding());
         assertEquals("id2", bs.getBlobId());
+    }
+
+    /** Generates a CSV for two blob-store backed files and reads it back again. */
+    @Test
+    public void csvGenerator() throws Exception {
+        BlobStore blobStore = new MemoryBlobStore();
+        NodeBuilder builder = root.builder();
+        createFileNode(builder, "a", blobOf("foo", blobStore), "text/plain");
+        createFileNode(builder, "b", blobOf("hello", blobStore), "text/plain");
+        NodeStore store = new MemoryNodeStore(builder.getNodeState());
+
+        File csv = new File(temporaryFolder.getRoot(), "test.csv");
+        new CSVFileGenerator(csv).generate(
+                new NodeStoreBinaryResourceProvider(store, blobStore).getBinaries("/"));
+
+        //Reading the generated file back must yield both binaries
+        CSVFileBinaryResourceProvider csvbrp = new CSVFileBinaryResourceProvider(csv, blobStore);
+        assertEquals(2, csvbrp.getBinaries("/").size());
 
     }
 
@@ -67,6 +95,11 @@ public class NodeStoreBinaryResourceProv
         return jcrContent;
     }
 
+    /** Writes {@code content} (UTF-8 encoded) to {@code bs} and wraps the returned id in a BlobStoreBlob. */
+    private Blob blobOf(String content, BlobStore bs) throws IOException {
+        return new BlobStoreBlob(bs, bs.writeBlob(new ByteArrayInputStream(content.getBytes("UTF-8"))));
+    }
+
     private static class IdBlob extends ArrayBasedBlob {
         final String id;