You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by rg...@apache.org on 2012/10/31 01:37:03 UTC

svn commit: r1403941 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/embedder/ tika-parsers/src/test/java/org/apache/tika/embedder/

Author: rgauss
Date: Wed Oct 31 00:37:02 2012
New Revision: 1403941

URL: http://svn.apache.org/viewvc?rev=1403941&view=rev
Log:
TIKA-775: Embed Capabilities
   - Added an Embedder interface, similar to Parser, which defines getSupportedEmbedTypes and an embed method
   - Added a base ExternalEmbedder implementation of the Embedder interface, similar to ExternalParser, which can call a command line executable, the default being sed, to perform embedding
   - Added a base ExternalEmbedderTest which 'embeds' lines in a text file then uses a TXTParser to verify the expected embedded metadata exists

Added:
    tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/
    tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/Embedder.java   (with props)
    tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java   (with props)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/embedder/
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java   (with props)

Added: tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/Embedder.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/Embedder.java?rev=1403941&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/Embedder.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/Embedder.java Wed Oct 31 00:37:02 2012
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.embedder;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.Serializable;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+
+/**
+ * Tika embedder interface for embedding {@link Metadata} into documents.
+ *
+ * @since Apache Tika 1.3
+ */
+public interface Embedder extends Serializable {
+
+    /**
+     * Returns the set of media types supported by this embedder when used with
+     * the given parse context.
+     * <p>
+     * The name differs from the precedent of {@link Parser#getSupportedTypes(ParseContext)}
+     * so that parser implementations may also choose to implement this interface.
+     *
+     * @param context parse context
+     * @return immutable set of media types
+     */
+    Set<MediaType> getSupportedEmbedTypes(ParseContext context);
+
+    /**
+     * Embeds related document metadata from the given metadata object into the
+     * given output stream.
+     * <p>
+     * The given document stream is consumed but not closed by this method. The
+     * responsibility to close the stream remains on the caller.
+     * <p>
+     * Information about the parsing context can be passed in the context
+     * parameter. See the parser implementations for the kinds of context
+     * information they expect.
+     * <p>
+     * In general implementations should favor preserving the source file's metadata
+     * unless an update to a field is explicitly defined in the Metadata object.
+     * More specifically:
+     * <ul>
+     *  <li>Embedder implementations should only attempt to update metadata fields
+     *  present in the given Metadata object.  Other fields should be left untouched.</li>
+     *  <li>Embedder implementations should set properties as empty when the
+     *  corresponding field in the Metadata object is an empty string, i.e. ""</li>
+     *  <li>Embedder implementations should nullify or delete properties
+     *  corresponding to fields with a null value in the given Metadata object.</li>
+     *  <li>Embedder implementations should set the property
+     *  corresponding to a particular field in the given Metadata object in all
+     *  metadata containers whenever possible and appropriate for the file format at the time.
+     *  If a particular metadata container falls out of use and/or is superseded by another
+     *  (such as IIM vs XMP for IPTC) it is up to the implementation to decide if and when
+     *  to cease embedding in the alternate container.</li>
+     *  <li>Embedder implementations should attempt to embed as much of the metadata
+     *  as accurately as possible. An implementation may choose a strict approach
+     *  and throw an exception if a value to be embedded exceeds the length allowed
+     *  or may choose to truncate the value.</li>
+     * </ul>
+     *
+     * @param metadata document metadata (input and output)
+     * @param originalStream the document stream (input)
+     * @param outputStream the output stream to write the metadata embedded data to
+     * @param context parse context
+     * @throws IOException if the document stream could not be read
+     * @throws TikaException if the metadata could not be embedded into the document
+     */
+    void embed(Metadata metadata, InputStream originalStream,
+            OutputStream outputStream, ParseContext context)
+            throws IOException, TikaException;
+
+}

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/Embedder.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java?rev=1403941&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java Wed Oct 31 00:37:02 2012
@@ -0,0 +1,543 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.embedder;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.external.ExternalParser;
+
+/**
+ * Embedder that uses an external program (like sed or exiftool) to embed text
+ * content and metadata into a given document.
+ *
+ * @since Apache Tika 1.3
+ */
+public class ExternalEmbedder implements Embedder {
+
+    private static final long serialVersionUID = -2828829275642475697L;
+
+    /**
+     * Token to be replaced with a String array of metadata assignment command
+     * arguments
+     */
+    public static final String METADATA_COMMAND_ARGUMENTS_TOKEN = "${METADATA}";
+
+    /**
+     * Token to be replaced with the metadata assignment command arguments
+     * serialized into a single string
+     */
+    public static final String METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN = "${METADATA_SERIALIZED}";
+
+    /**
+     * Media types supported by the external program.
+     */
+    private Set<MediaType> supportedEmbedTypes = Collections.emptySet();
+
+    /**
+     * Mapping of Tika metadata to command line parameters.
+     */
+    private Map<Property, String[]> metadataCommandArguments = null;
+
+    /**
+     * The external command to invoke.
+     *
+     * @see Runtime#exec(String[])
+     */
+    private String[] command = new String[] {
+            "sed", "-e",
+            "$a\\\n" + METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN,
+            ExternalParser.INPUT_FILE_TOKEN
+    };
+
+    private String commandAssignmentOperator = "=";
+    private String commandAssignmentDelimeter = ", ";
+    private String commandAppendOperator = "=";
+
+    private boolean quoteAssignmentValues = false;
+
+    private TemporaryResources tmp = new TemporaryResources();
+
+    /**
+     * Returns the media types supported by the external program. The given
+     * parse context is not consulted; this simply delegates to
+     * {@link #getSupportedEmbedTypes()}.
+     *
+     * @param context parse context (ignored)
+     * @return the supported media types
+     */
+    public Set<MediaType> getSupportedEmbedTypes(ParseContext context) {
+        return getSupportedEmbedTypes();
+    }
+
+    /**
+     * Returns the media types supported by the external program, an empty set
+     * unless {@link #setSupportedEmbedTypes(Set)} has been called.
+     *
+     * @return the supported media types
+     */
+    public Set<MediaType> getSupportedEmbedTypes() {
+        return supportedEmbedTypes;
+    }
+
+    /**
+     * Sets the media types supported by the external program. An unmodifiable
+     * copy of the given set is stored, so later changes to the given set have
+     * no effect on this embedder.
+     *
+     * @param supportedEmbedTypes the supported media types
+     */
+    public void setSupportedEmbedTypes(Set<MediaType> supportedEmbedTypes) {
+        this.supportedEmbedTypes = Collections
+                .unmodifiableSet(new HashSet<MediaType>(supportedEmbedTypes));
+    }
+
+    /**
+     * Gets the command to be run. This can include either of
+     * {@link ExternalParser#INPUT_FILE_TOKEN} or
+     * {@link ExternalParser#OUTPUT_FILE_TOKEN} if the command needs filenames.
+     *
+     * @return the command and its arguments
+     */
+    public String[] getCommand() {
+        return command;
+    }
+
+    /**
+     * Sets the command to be run. This can include either of
+     * {@link ExternalParser#INPUT_FILE_TOKEN} or
+     * {@link ExternalParser#OUTPUT_FILE_TOKEN} if the command needs filenames.
+     *
+     * @see Runtime#exec(String[])
+     */
+    public void setCommand(String... command) {
+        this.command = command;
+    }
+
+    /**
+     * Gets the assignment operator for the command line tool, i.e. "=".
+     *
+     * @return the assignment operator
+     */
+    public String getCommandAssignmentOperator() {
+        return commandAssignmentOperator;
+    }
+
+    /**
+     * Sets the assignment operator for the command line tool, i.e. "=".
+     *
+     * @param commandAssignmentOperator the assignment operator
+     */
+    public void setCommandAssignmentOperator(String commandAssignmentOperator) {
+        this.commandAssignmentOperator = commandAssignmentOperator;
+    }
+
+    /**
+     * Gets the delimiter for multiple assignments for the command line tool,
+     * i.e. ", ".
+     *
+     * @return the assignment delimiter
+     */
+    public String getCommandAssignmentDelimeter() {
+        return commandAssignmentDelimeter;
+    }
+
+    /**
+     * Sets the delimiter for multiple assignments for the command line tool,
+     * i.e. ", ".
+     *
+     * @param commandAssignmentDelimeter the assignment delimiter
+     */
+    public void setCommandAssignmentDelimeter(String commandAssignmentDelimeter) {
+        this.commandAssignmentDelimeter = commandAssignmentDelimeter;
+    }
+
+    /**
+     * Gets the operator to append rather than replace a value for the command
+     * line tool, i.e. "+=".
+     *
+     * @return the append operator
+     */
+    public String getCommandAppendOperator() {
+        return commandAppendOperator;
+    }
+
+    /**
+     * Sets the operator to append rather than replace a value for the command
+     * line tool, i.e. "+=".
+     *
+     * @param commandAppendOperator the append operator
+     */
+    public void setCommandAppendOperator(String commandAppendOperator) {
+        this.commandAppendOperator = commandAppendOperator;
+    }
+
+    /**
+     * Gets whether or not to quote assignment values, i.e. tag='value'. The
+     * default is false.
+     *
+     * @return whether or not to quote assignment values
+     */
+    public boolean isQuoteAssignmentValues() {
+        return quoteAssignmentValues;
+    }
+
+    /**
+     * Sets whether or not to quote assignment values, i.e. tag='value'.
+     *
+     * @param quoteAssignmentValues whether or not to quote assignment values
+     */
+    public void setQuoteAssignmentValues(boolean quoteAssignmentValues) {
+        this.quoteAssignmentValues = quoteAssignmentValues;
+    }
+
+    /**
+     * Gets the map of Metadata keys to command line parameters.
+     *
+     * @return the metadata to CLI param map
+     */
+    public Map<Property, String[]> getMetadataCommandArguments() {
+        return metadataCommandArguments;
+    }
+
+    /**
+     * Sets the map of Metadata keys to command line parameters. Set this to
+     * null to disable Metadata embedding.
+     *
+     * @param arguments the metadata to CLI param map
+     */
+    public void setMetadataCommandArguments(Map<Property, String[]> arguments) {
+        this.metadataCommandArguments = arguments;
+    }
+
+    /**
+     * Constructs a collection of command line arguments responsible for setting
+     * individual metadata fields based on the given <code>metadata</code>.
+     * <p>
+     * For every metadata name that equals the name of a mapped
+     * {@link Property}, one segment is produced per mapped command argument
+     * and value: multi-valued fields use the append operator, single-valued
+     * fields use the assignment operator. Values are wrapped in single quotes
+     * when quoting of assignment values is enabled.
+     *
+     * @param metadata the metadata to embed, may be <code>null</code>
+     * @return the metadata-related command line arguments, empty when there is
+     *         no metadata or no metadata to command argument mapping
+     */
+    protected List<String> getCommandMetadataSegments(Metadata metadata) {
+        List<String> commandMetadataSegments = new ArrayList<String>();
+        Map<Property, String[]> argumentsByProperty = getMetadataCommandArguments();
+        // The mapping defaults to null (embedding disabled), so guard against
+        // it here as well rather than relying on every caller to check first
+        if (metadata == null || metadata.names() == null
+                || argumentsByProperty == null) {
+            return commandMetadataSegments;
+        }
+        for (String metadataName : metadata.names()) {
+            for (Map.Entry<Property, String[]> mapping : argumentsByProperty.entrySet()) {
+                String[] arguments = mapping.getValue();
+                if (!metadataName.equals(mapping.getKey().getName())
+                        || arguments == null || arguments.length == 0) {
+                    continue;
+                }
+                // Operator and values depend on the field only, not on the
+                // individual command argument, so resolve them once
+                String operator;
+                String[] values;
+                if (metadata.isMultiValued(metadataName)) {
+                    operator = commandAppendOperator;
+                    values = metadata.getValues(metadataName);
+                } else {
+                    operator = commandAssignmentOperator;
+                    values = new String[] { metadata.get(metadataName) };
+                }
+                for (String argument : arguments) {
+                    for (String value : values) {
+                        String assignmentValue = value;
+                        if (quoteAssignmentValues) {
+                            assignmentValue = "'" + assignmentValue + "'";
+                        }
+                        commandMetadataSegments.add(
+                                argument + operator + assignmentValue);
+                    }
+                }
+            }
+        }
+        return commandMetadataSegments;
+    }
+
+    /**
+     * Turns the given metadata command line arguments into one string, using
+     * the format produced by {@link Arrays#toString(Object[])}.
+     *
+     * @param metadataCommandArguments the arguments to serialize, may be
+     *        <code>null</code>
+     * @return the serialized metadata arguments string, or an empty string
+     *         when <code>null</code> was given
+     */
+    protected static String serializeMetadata(
+            List<String> metadataCommandArguments) {
+        if (metadataCommandArguments == null) {
+            return "";
+        }
+        Object[] arguments = metadataCommandArguments.toArray();
+        return Arrays.toString(arguments);
+    }
+
+    /**
+     * Executes the configured external command to embed the given metadata
+     * into the document read from the given input stream, and writes the
+     * resulting document to the given output stream.
+     * <p>
+     * The document is handed to the command through a file when the command
+     * contains {@link ExternalParser#INPUT_FILE_TOKEN} and through standard
+     * input otherwise. Likewise the result is read from a temporary file when
+     * the command contains {@link ExternalParser#OUTPUT_FILE_TOKEN} and from
+     * standard output otherwise.
+     * <p>
+     * Metadata is only embedded if {@link #setMetadataCommandArguments(Map)}
+     * has been called to set arguments. The arguments replace
+     * {@link #METADATA_COMMAND_ARGUMENTS_TOKEN} or, serialized into a single
+     * string, {@link #METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN}; when the
+     * command contains neither token they are appended to the command.
+     * <p>
+     * Note that the given output stream is closed by this method.
+     *
+     * @param metadata the metadata to embed
+     * @param inputStream the document stream
+     * @param outputStream the stream the resulting document is written to
+     * @param context parse context (not used by this implementation)
+     * @throws IOException if the command could not be started or its input
+     *         or output file could not be accessed
+     * @throws TikaException if waiting for the command was interrupted or
+     *         the command exited with a non-zero value
+     */
+    public void embed(final Metadata metadata, final InputStream inputStream,
+            final OutputStream outputStream, final ParseContext context)
+            throws IOException, TikaException {
+
+        boolean inputToStdIn = true;
+        boolean outputFromStdOut = true;
+        boolean hasMetadataCommandArguments =
+                (metadataCommandArguments != null && !metadataCommandArguments.isEmpty());
+        boolean serializeMetadataCommandArgumentsToken = false;
+        boolean replacedMetadataCommandArgumentsToken = false;
+
+        TikaInputStream tikaInputStream = TikaInputStream.get(inputStream);
+        // When the caller's own TikaInputStream is handed back its file belongs
+        // to the caller (it may even be the original document), so only the
+        // file of a wrapper created here may be deleted afterwards
+        boolean ownsInputFile = (tikaInputStream != inputStream);
+        File tempOutputFile = null;
+
+        List<String> commandMetadataSegments = null;
+        if (hasMetadataCommandArguments) {
+            commandMetadataSegments = getCommandMetadataSegments(metadata);
+        }
+
+        // Build our command
+        List<String> cmd = new ArrayList<String>();
+        for (String commandSegment : command) {
+            if (commandSegment.contains(ExternalParser.INPUT_FILE_TOKEN)) {
+                commandSegment = commandSegment.replace(
+                        ExternalParser.INPUT_FILE_TOKEN,
+                        tikaInputStream.getFile().toString());
+                inputToStdIn = false;
+            }
+            if (commandSegment.contains(ExternalParser.OUTPUT_FILE_TOKEN)) {
+                tempOutputFile = tmp.createTemporaryFile();
+                commandSegment = commandSegment.replace(
+                        ExternalParser.OUTPUT_FILE_TOKEN,
+                        tempOutputFile.toString());
+                outputFromStdOut = false;
+            }
+            if (commandSegment.contains(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN)) {
+                // Replaced below, once the whole command has been assembled
+                serializeMetadataCommandArgumentsToken = true;
+            }
+            if (commandSegment.contains(METADATA_COMMAND_ARGUMENTS_TOKEN)) {
+                // The whole segment is replaced by one argument per assignment
+                if (hasMetadataCommandArguments) {
+                    cmd.addAll(commandMetadataSegments);
+                }
+                replacedMetadataCommandArgumentsToken = true;
+            } else {
+                cmd.add(commandSegment);
+            }
+        }
+        if (hasMetadataCommandArguments) {
+            if (serializeMetadataCommandArgumentsToken) {
+                // Find all metadata tokens and replace with encapsulated metadata
+                String serializedMetadata = serializeMetadata(commandMetadataSegments);
+                for (int i = 0; i < cmd.size(); i++) {
+                    String commandSegment = cmd.get(i);
+                    if (commandSegment.contains(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN)) {
+                        cmd.set(i, commandSegment.replace(
+                                METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN,
+                                serializedMetadata));
+                    }
+                }
+            } else if (!replacedMetadataCommandArgumentsToken) {
+                // Tack metadata onto the end of the cmd as arguments
+                cmd.addAll(commandMetadataSegments);
+            }
+        }
+
+        // Execute; a command made of a single segment is run as a command line
+        // string, anything else as an argument array
+        String[] cmdArray = cmd.toArray(new String[cmd.size()]);
+        Process process;
+        if (cmdArray.length == 1) {
+            process = Runtime.getRuntime().exec(cmdArray[0]);
+        } else {
+            process = Runtime.getRuntime().exec(cmdArray);
+        }
+
+        ByteArrayOutputStream stdErrOutputStream = new ByteArrayOutputStream();
+        boolean interrupted = false;
+
+        try {
+            sendStdErrToOutputStream(process, stdErrOutputStream);
+
+            if (inputToStdIn) {
+                sendInputStreamToStdIn(inputStream, process);
+            } else {
+                // We're not writing to std in this case so close
+                process.getOutputStream().close();
+            }
+
+            if (outputFromStdOut) {
+                sendStdOutToOutputStream(process, outputStream);
+            } else {
+                // NOTE(review): presumably removes the placeholder file created
+                // above so the command can create the output file itself; the
+                // command is already running at this point - confirm ordering
+                tmp.dispose();
+                try {
+                    process.waitFor();
+                } catch (InterruptedException e) {
+                    interrupted = true;
+                }
+                if (!interrupted) {
+                    // The command is finished, read the output file into the given output stream
+                    InputStream tempOutputFileInputStream = TikaInputStream.get(tempOutputFile);
+                    try {
+                        IOUtils.copy(tempOutputFileInputStream, outputStream);
+                    } finally {
+                        IOUtils.closeQuietly(tempOutputFileInputStream);
+                    }
+                }
+            }
+        } finally {
+            if (outputFromStdOut) {
+                try {
+                    process.waitFor();
+                } catch (InterruptedException e) {
+                    interrupted = true;
+                }
+            } else if (tempOutputFile != null) {
+                // Clean up temp output files
+                tempOutputFile.delete();
+            }
+            IOUtils.closeQuietly(outputStream);
+            IOUtils.closeQuietly(stdErrOutputStream);
+            if (interrupted) {
+                // Do not leave the command running and keep the interrupt
+                // visible to the caller
+                process.destroy();
+                Thread.currentThread().interrupt();
+            }
+            if (!inputToStdIn && ownsInputFile) {
+                // Clean up temp input files
+                tikaInputStream.getFile().delete();
+            }
+        }
+
+        // Reported outside of the finally block so that a failure raised above
+        // is not replaced by the exit value error
+        if (interrupted) {
+            throw new TikaException("Interrupted while waiting for the command line to finish" +
+                    "\nExecutable Command:\n\n" + cmd);
+        }
+        if (process.exitValue() != 0) {
+            throw new TikaException("There was an error executing the command line" +
+                    "\nExecutable Command:\n\n" + cmd +
+                    "\nExecutable Error:\n\n" + stdErrOutputStream.toString("UTF-8"));
+        }
+    }
+
+    /**
+     * Creates a new thread for copying a given input stream to a given output stream.
+     *
+     * @param inputStream the source input stream
+     * @param outputStream the target output stream
+     */
+    private void multiThreadedStreamCopy(
+            final InputStream inputStream,
+            final OutputStream outputStream) {
+        new Thread(new Runnable() {
+            public void run() {
+                try {
+                    IOUtils.copy(inputStream, outputStream);
+                } catch (IOException e) {
+                    System.out.println("ERROR: " + e.getMessage());
+                }
+            }
+        }).start();
+    }
+
+    /**
+     * Sends the contents of the given input stream to the
+     * standard input of the given process. Potential exceptions are
+     * ignored.
+     * <p>
+     * Note that the given input stream is <em>not</em> closed by this method.
+     *
+     * @param process the process
+     * @param inputStream the input stream to send to standard input of the process
+     */
+    private void sendInputStreamToStdIn(
+            final InputStream inputStream,
+            final Process process) {
+        multiThreadedStreamCopy(inputStream, process.getOutputStream());
+    }
+
+    /**
+     * Sends the standard output of the given
+     * process to the given output stream. Potential exceptions are
+     * ignored.
+     * <p>
+     * Note that the given output stream is <em>not</em> closed by this method.
+     *
+     * @param process the process
+     * @param outputStream the putput stream to send to standard input of the process
+     */
+    private void sendStdOutToOutputStream(
+            final Process process,
+            final OutputStream outputStream) {
+        try {
+            IOUtils.copy(process.getInputStream(), outputStream);
+        } catch (IOException e) {
+            System.out.println("ERROR: " + e.getMessage());
+        }
+    }
+
+    /**
+     * Starts a thread that reads and discards the contents of the standard
+     * stream of the given process. Potential exceptions are ignored, and the
+     * stream is closed once fully processed.
+     *
+     * @param process the process
+     * param outputStream the output stream to send to standard error of the process
+     */
+    private void sendStdErrToOutputStream(
+            final Process process,
+            final OutputStream outputStream) {
+        multiThreadedStreamCopy(process.getErrorStream(), outputStream);
+    }
+
+    /**
+     * Checks to see if the command can be run. Typically used with something
+     * like "myapp --version" to check to see if "myapp" is installed and on the
+     * path.
+     *
+     * @param checkCmd the check command to run
+     * @param errorValue what is considered an error value?
+     * @return whether or not the check completed without error
+     */
+    public static boolean check(String checkCmd, int... errorValue) {
+        return check(new String[] { checkCmd }, errorValue);
+    }
+
+    /**
+     * Checks to see if the command can be run. Typically used with something
+     * like "myapp --version" to check to see if "myapp" is installed and on the
+     * path.
+     * <p>
+     * A command made of a single element is run as a command line string,
+     * anything else as an argument array.
+     *
+     * @param checkCmd the check command to run
+     * @param errorValue the exit values considered an error, 127 when none
+     *        are given
+     * @return <code>true</code> if the command ran and did not exit with one
+     *         of the error values; <code>false</code> if it exited with an
+     *         error value, could not be started, or waiting for it was
+     *         interrupted
+     */
+    public static boolean check(String[] checkCmd, int... errorValue) {
+        if (errorValue.length == 0) {
+            errorValue = new int[] { 127 };
+        }
+
+        Process process = null;
+        try {
+            if (checkCmd.length == 1) {
+                process = Runtime.getRuntime().exec(checkCmd[0]);
+            } else {
+                process = Runtime.getRuntime().exec(checkCmd);
+            }
+            int result = process.waitFor();
+
+            for (int err : errorValue) {
+                if (result == err) {
+                    return false;
+                }
+            }
+            return true;
+        } catch (IOException e) {
+            // The command could not be started: it is not there or is broken
+            return false;
+        } catch (InterruptedException ie) {
+            // Do not leave the check command running, and keep the interrupt
+            // visible to the caller instead of swallowing it
+            if (process != null) {
+                process.destroy();
+            }
+            Thread.currentThread().interrupt();
+            return false;
+        }
+    }
+}

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java?rev=1403941&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java Wed Oct 31 00:37:02 2012
@@ -0,0 +1,257 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.embedder;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStreamWriter;
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.tika.embedder.Embedder;
+import org.apache.tika.embedder.ExternalEmbedder;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.txt.TXTParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import junit.framework.Test;
+import junit.framework.TestCase;
+import junit.framework.TestSuite;
+
+/**
+ * Unit test for {@link ExternalEmbedder}s: embeds metadata into a copy of a
+ * test document, parses the result and checks the expected values are found.
+ */
+public class ExternalEmbedderTest extends TestCase {
+
+    private static final Log logger = LogFactory.getLog(ExternalEmbedderTest.class);
+
+    protected static final DateFormat EXPECTED_METADATA_DATE_FORMATTER =
+            new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");
+    protected static final String DEFAULT_CHARSET = "UTF-8";
+    private static final String COMMAND_METADATA_ARGUMENT_DESCRIPTION = "dc:description";
+    private static final String TEST_TXT_PATH = "/test-documents/testTXT.txt";
+
+    private TemporaryResources tmp = new TemporaryResources();
+
+    /**
+     * Create the test case
+     *
+     * @param testName name of the test case
+     */
+    public ExternalEmbedderTest(String testName) {
+        super(testName);
+    }
+
+    /**
+     * @return the suite of tests being tested
+     */
+    public static Test suite() {
+        return new TestSuite(ExternalEmbedderTest.class);
+    }
+
+    /**
+     * Gets the expected returned metadata value for the given field
+     *
+     * @param fieldName the name of the field the value is embedded for
+     * @param timestamp the time to include in the value
+     * @return the simple class name of this test, the field name and the formatted timestamp
+     */
+    protected String getExpectedMetadataValueString(String fieldName, Date timestamp) {
+        return this.getClass().getSimpleName() + " embedded " + fieldName +
+                " on " + EXPECTED_METADATA_DATE_FORMATTER.format(timestamp);
+    }
+
+    /**
+     * Gets the tika <code>Metadata</code> object containing data to be embedded.
+     *
+     * @param timestamp the time to include in the embedded values
+     * @return the populated tika metadata object
+     */
+    protected Metadata getMetadataToEmbed(Date timestamp) {
+        Metadata metadata = new Metadata();
+        metadata.add(TikaCoreProperties.DESCRIPTION,
+                getExpectedMetadataValueString(TikaCoreProperties.DESCRIPTION.toString(), timestamp));
+        return metadata;
+    }
+
+    /**
+     * Gets the <code>Embedder</code> to test.
+     *
+     * @return the embedder under test
+     */
+    protected Embedder getEmbedder() {
+        ExternalEmbedder embedder = new ExternalEmbedder();
+        Map<Property, String[]> metadataCommandArguments = new HashMap<Property, String[]>(1);
+        metadataCommandArguments.put(TikaCoreProperties.DESCRIPTION,
+                new String[] { COMMAND_METADATA_ARGUMENT_DESCRIPTION });
+        embedder.setMetadataCommandArguments(metadataCommandArguments);
+        return embedder;
+    }
+
+    /**
+     * Gets the original input stream before metadata has been embedded.
+     *
+     * @return a fresh input stream
+     */
+    protected InputStream getOriginalInputStream() {
+        return this.getClass().getResourceAsStream(TEST_TXT_PATH);
+    }
+
+    /**
+     * Gets the parser to use to verify the result of the embed operation.
+     *
+     * @return the parser to read embedded metadata
+     */
+    protected Parser getParser() {
+        return new TXTParser();
+    }
+
+    /**
+     * Whether or not the final result of reading the now embedded metadata is
+     * expected in the output of the external tool
+     *
+     * @return whether or not results are expected in command line output
+     */
+    protected boolean getIsMetadataExpectedInOutput() {
+        return true;
+    }
+
+    /**
+     * Tests embedding metadata then reading metadata to verify the results.
+     * Streams are closed and temporary resources disposed of on every path.
+     *
+     * @param isResultExpectedInOutput whether or not results are expected in command line output
+     */
+    protected void embedInTempFile(boolean isResultExpectedInOutput) {
+        Date timestamp = new Date();
+        Metadata metadataToEmbed = getMetadataToEmbed(timestamp);
+        Embedder embedder = getEmbedder();
+        try {
+            // Embed the metadata into a copy of the original input stream
+            File tempOutputFile = tmp.createTemporaryFile();
+            InputStream origInputStream = getOriginalInputStream();
+            assertNotNull("original input stream not found", origInputStream);
+            try {
+                FileOutputStream tempFileOutputStream = new FileOutputStream(tempOutputFile);
+                try {
+                    embedder.embed(metadataToEmbed, origInputStream, tempFileOutputStream, null);
+                } finally {
+                    tempFileOutputStream.close();
+                }
+            } finally {
+                origInputStream.close();
+            }
+
+            ParseContext context = new ParseContext();
+            Parser parser = getParser();
+            context.set(Parser.class, parser);
+
+            // Setup the extracting content handler
+            ByteArrayOutputStream result = new ByteArrayOutputStream();
+            OutputStreamWriter outputWriter = new OutputStreamWriter(result, DEFAULT_CHARSET);
+            ContentHandler handler = new BodyContentHandler(outputWriter);
+
+            // Re-read the now embedded temp file into a new metadata object
+            Metadata embeddedMetadata = new Metadata();
+            FileInputStream embeddedFileInputStream = new FileInputStream(tempOutputFile);
+            try {
+                parser.parse(embeddedFileInputStream, handler, embeddedMetadata, context);
+            } finally {
+                embeddedFileInputStream.close();
+            }
+
+            String outputString = null;
+            if (isResultExpectedInOutput) {
+                outputString = result.toString(DEFAULT_CHARSET);
+                logger.trace("outputString=" + outputString);
+            } else {
+                assertTrue("no metadata found", embeddedMetadata.size() > 0);
+            }
+
+            // Check each metadata property for the expected value
+            for (String metadataName : metadataToEmbed.names()) {
+                String expectedValue = metadataToEmbed.get(metadataName);
+                if (expectedValue == null) {
+                    continue;
+                }
+                logger.trace("expecting value of '" + metadataName + "' = '" + expectedValue + "'");
+                boolean foundExpectedValue = false;
+                if (isResultExpectedInOutput) {
+                    // just check that the entire output contains the expected string
+                    foundExpectedValue = outputString.contains(expectedValue);
+                } else if (embeddedMetadata.isMultiValued(metadataName)) {
+                    for (String embeddedValue : embeddedMetadata.getValues(metadataName)) {
+                        logger.debug("embedded values of '" + metadataName
+                                + "' contains '" + embeddedValue + "'");
+                        if (embeddedValue != null && embeddedValue.contains(expectedValue)) {
+                            foundExpectedValue = true;
+                            break;
+                        }
+                    }
+                } else {
+                    String embeddedValue = embeddedMetadata.get(metadataName);
+                    logger.debug("embedded value of '" + metadataName + "' = '" + embeddedValue + "'");
+                    assertNotNull("expected metadata for " + metadataName + " not found", embeddedValue);
+                    foundExpectedValue = embeddedValue.contains(expectedValue);
+                }
+                assertTrue("result did not contain expected appended metadata "
+                        + metadataName + "=" + expectedValue, foundExpectedValue);
+            }
+        } catch (IOException e) {
+            logger.error(e.getMessage(), e);
+            fail(e.getMessage());
+        } catch (TikaException e) {
+            logger.error(e.getMessage(), e);
+            fail(e.getMessage());
+        } catch (SAXException e) {
+            logger.error(e.getMessage(), e);
+            fail(e.getMessage());
+        } finally {
+            try {
+                tmp.dispose();
+            } catch (TikaException e) {
+                // A cleanup problem must not hide the outcome of the test
+                logger.warn("could not dispose of temporary resources", e);
+            }
+        }
+    }
+
+    /** Runs {@link #embedInTempFile(boolean)} with {@link #getIsMetadataExpectedInOutput()}. */
+    public void testEmbed() {
+        embedInTempFile(getIsMetadataExpectedInOutput());
+    }
+
+}

Propchange: tika/trunk/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java
------------------------------------------------------------------------------
    svn:eol-style = native