You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/09/21 20:50:32 UTC

[tika] 01/02: Make the strawman app driver actually work. Add the ability to specify a list of files.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 384e97156c5c5bf54d54b452c1783b7c4e5df068
Author: tballison <ta...@mitre.org>
AuthorDate: Thu Sep 21 16:20:58 2017 -0400

    Make the strawman app driver actually work. Add the ability to specify a list of files.
---
 .../batch/fs/strawman/StrawManTikaAppDriver.java   | 139 +++++++++++----------
 1 file changed, 74 insertions(+), 65 deletions(-)

diff --git a/tika-batch/src/main/java/org/apache/tika/batch/fs/strawman/StrawManTikaAppDriver.java b/tika-batch/src/main/java/org/apache/tika/batch/fs/strawman/StrawManTikaAppDriver.java
index 8523d5c..4fef6c8 100644
--- a/tika-batch/src/main/java/org/apache/tika/batch/fs/strawman/StrawManTikaAppDriver.java
+++ b/tika-batch/src/main/java/org/apache/tika/batch/fs/strawman/StrawManTikaAppDriver.java
@@ -17,9 +17,9 @@ package org.apache.tika.batch.fs.strawman;
  * limitations under the License.
  */
 
+import java.io.BufferedReader;
 import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.FileVisitResult;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -38,7 +38,6 @@ import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
 import java.util.concurrent.atomic.AtomicInteger;
 
-import org.apache.commons.io.IOUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.slf4j.MarkerFactory;
@@ -60,12 +59,14 @@ public class StrawManTikaAppDriver implements Callable<Integer> {
     private final int threadNum;
     private Path inputRoot = null;
     private Path outputRoot = null;
+    private Path fileList = null;
     private String[] args = null;
 
     public StrawManTikaAppDriver(Path inputRoot, Path outputRoot,
-                                 int totalThreads, String[] args) {
+                                 int totalThreads, Path fileList, String[] args) {
         this.inputRoot = inputRoot;
         this.outputRoot = outputRoot;
+        this.fileList = fileList;
         this.args = args;
         threadNum = threadCount.getAndIncrement();
         this.totalThreads = totalThreads;
@@ -73,7 +74,7 @@ public class StrawManTikaAppDriver implements Callable<Integer> {
 
 
     private class TikaVisitor extends SimpleFileVisitor<Path> {
-        private int processed = 0;
+        private volatile int processed = 0;
 
         int getProcessed() {
             return processed;
@@ -87,10 +88,31 @@ public class StrawManTikaAppDriver implements Callable<Integer> {
                     return FileVisitResult.CONTINUE;
                 }
             }
-            assert(file.startsWith(inputRoot));
+            if (! file.startsWith(inputRoot)) {
+                LOG.warn("File ("+file.toAbsolutePath()+
+                        ") doesn't start with input root ("+inputRoot.toAbsolutePath()+")");
+                return FileVisitResult.CONTINUE;
+            }
             Path relPath = inputRoot.relativize(file);
+            String suffix = ".txt";
+            List<String> commandLine = new ArrayList<>();
+            for (String arg : args) {
+                commandLine.add(arg);
+                if (arg.equals("-J")) {
+                    suffix = ".json";
+                } else if (arg.contains("-x")) {
+                    suffix = ".html";
+                }
+            }
+            String fullPath = file.toAbsolutePath().toString();
+            if (fullPath.contains(" ")) {
+                fullPath = "\""+fullPath+"\"";
+            }
+            commandLine.add(fullPath);
+
+
             Path outputFile = Paths.get(outputRoot.toAbsolutePath().toString(),
-                    relPath.toString() + ".txt");
+                    relPath.toString() + suffix);
             try {
                 Files.createDirectories(outputFile.getParent());
             } catch (IOException e) {
@@ -98,23 +120,15 @@ public class StrawManTikaAppDriver implements Callable<Integer> {
                         "parent directory for {} was not made!", outputFile);
                 throw new RuntimeException("couldn't make parent file for " + outputFile);
             }
-            List<String> commandLine = new ArrayList<>();
-            for (String arg : args) {
-                commandLine.add(arg);
-            }
-            commandLine.add("-t");
-            commandLine.add("\""+outputFile.toAbsolutePath()+"\"");
-            ProcessBuilder builder = new ProcessBuilder(commandLine.toArray(new String[commandLine.size()]));
+            ProcessBuilder builder = new ProcessBuilder();
+            builder.command(commandLine);
             LOG.info("about to process: {}", file.toAbsolutePath());
+            builder.redirectOutput(outputFile.toFile());
+            builder.redirectError(ProcessBuilder.Redirect.INHERIT);
+
             Process proc = null;
-            RedirectGobbler gobbler = null;
-            Thread gobblerThread = null;
             try {
-                OutputStream os = Files.newOutputStream(outputFile);
                 proc = builder.start();
-                gobbler = new RedirectGobbler(proc.getInputStream(), os);
-                gobblerThread = new Thread(gobbler);
-                gobblerThread.start();
             } catch (IOException e) {
                 LOG.error(e.getMessage(), e);
                 return FileVisitResult.CONTINUE;
@@ -141,8 +155,12 @@ public class StrawManTikaAppDriver implements Callable<Integer> {
                 LOG.warn("Had to kill process working on: {}", file.toAbsolutePath());
                 proc.destroy();
             }
-            gobbler.close();
-            gobblerThread.interrupt();
+            try {
+                proc.getOutputStream().flush();
+                proc.getOutputStream().close();
+            } catch (IOException e) {
+                LOG.warn("couldn't close process outputstream", e);
+            }
             processed++;
             return FileVisitResult.CONTINUE;
         }
@@ -155,53 +173,33 @@ public class StrawManTikaAppDriver implements Callable<Integer> {
     public Integer call() throws Exception {
         long start = new Date().getTime();
         TikaVisitor v = new TikaVisitor();
-        Files.walkFileTree(inputRoot, v);
+        if (fileList != null) {
+            TikaVisitor tikaVisitor = new TikaVisitor();
+            try (BufferedReader reader = Files.newBufferedReader(fileList, StandardCharsets.UTF_8)) {
+                String line = reader.readLine();
+                while (line != null) {
+                    Path inputFile = inputRoot.resolve(line.trim());
+                    if (Files.isReadable(inputFile)) {
+                        try {
+                            tikaVisitor.visitFile(inputFile, Files.readAttributes(inputFile, BasicFileAttributes.class));
+                        } catch (IOException e) {
+                            LOG.warn("Problem with: "+inputFile, e);
+                        }
+                    } else {
+                        LOG.warn("Not readable: "+inputFile);
+                    }
+                    line = reader.readLine();
+                }
+            }
+        } else {
+            Files.walkFileTree(inputRoot, v);
+        }
         int processed = v.getProcessed();
         double elapsedSecs = ((double)new Date().getTime()-(double)start)/(double)1000;
         LOG.info("Finished processing {} files in {} seconds.", processed, elapsedSecs);
         return processed;
     }
 
-    private class RedirectGobbler implements Runnable {
-        private OutputStream redirectOs = null;
-        private InputStream redirectIs = null;
-
-        private RedirectGobbler(InputStream is, OutputStream os) {
-            this.redirectIs = is;
-            this.redirectOs = os;
-        }
-
-        private void close() {
-            if (redirectOs != null) {
-                try {
-                    redirectOs.flush();
-                } catch (IOException e) {
-                    LOG.error("can't flush");
-                }
-                try {
-                    redirectIs.close();
-                } catch (IOException e) {
-                    LOG.error("can't close input in redirect gobbler");
-                }
-                try {
-                    redirectOs.close();
-                } catch (IOException e) {
-                    LOG.error("can't close output in redirect gobbler");
-                }
-            }
-        }
-
-        @Override
-        public void run() {
-            try {
-                IOUtils.copy(redirectIs, redirectOs);
-            } catch (IOException e) {
-                LOG.error("IOException while gobbling");
-            }
-        }
-    }
-
-
 
     public static String usage() {
         StringBuilder sb = new StringBuilder();
@@ -220,9 +218,18 @@ public class StrawManTikaAppDriver implements Callable<Integer> {
         Path inputDir = Paths.get(args[0]);
         Path outputDir = Paths.get(args[1]);
         int totalThreads = Integer.parseInt(args[2]);
+        Path fileList = null;
+        if (args.length > 3) {
+            fileList = Paths.get(args[3]);
+            if (! Files.isReadable(fileList)) {
+                fileList = null;
+            }
+        }
 
         List<String> commandLine = new ArrayList<>();
-        commandLine.addAll(Arrays.asList(args).subList(3, args.length));
+
+        int initialParams = (fileList == null) ? 3 : 4;
+        commandLine.addAll(Arrays.asList(args).subList(initialParams, args.length));
         totalThreads = (totalThreads < 1) ? 1 : totalThreads;
         ExecutorService ex = Executors.newFixedThreadPool(totalThreads);
         ExecutorCompletionService<Integer> completionService =
@@ -230,7 +237,8 @@ public class StrawManTikaAppDriver implements Callable<Integer> {
 
         for (int i = 0; i < totalThreads; i++) {
             StrawManTikaAppDriver driver =
-                    new StrawManTikaAppDriver(inputDir, outputDir, totalThreads, commandLine.toArray(new String[commandLine.size()]));
+                    new StrawManTikaAppDriver(inputDir, outputDir, totalThreads, fileList,
+                            commandLine.toArray(new String[commandLine.size()]));
             completionService.submit(driver);
         }
 
@@ -247,5 +255,6 @@ public class StrawManTikaAppDriver implements Callable<Integer> {
         }
         double elapsedSeconds = (double)(new Date().getTime() - start) / (double)1000;
         LOG.info("Processed {} in {} seconds", totalFilesProcessed, elapsedSeconds);
+        ex.shutdownNow();
     }
 }

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.