You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/11/26 17:12:11 UTC

svn commit: r1641841 - in /tika/trunk: tika-app/src/test/java/org/apache/tika/cli/ tika-core/src/main/java/org/apache/tika/metadata/ tika-core/src/main/java/org/apache/tika/parser/ tika-parsers/src/main/java/org/apache/tika/parser/pdf/

Author: tallison
Date: Wed Nov 26 16:12:11 2014
New Revision: 1641841

URL: http://svn.apache.org/r1641841
Log:
TIKA-1488 add X-Tika as namespace

Modified:
    tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java

Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java?rev=1641841&r1=1641840&r2=1641841&view=diff
==============================================================================
--- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java (original)
+++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Wed Nov 26 16:12:11 2014
@@ -356,10 +356,8 @@ public class TikaCLITest {
                 "    \"Application-Version\": \"15.0000\",\n" +
                 "    \"Character Count\": \"28\",\n" +
                 "    \"Character-Count-With-Spaces\": \"31\","));
-        assertTrue(content.endsWith("    \"tika:embedded_resource_path\": \"test_recursive_embedded.docx/embed1.zip\"\n" +
-                "  }\n" +
-                "]"));
-        assertFalse(content.contains("tika:content"));
+        assertTrue(content.contains("\"X-TIKA:embedded_resource_path\": \"test_recursive_embedded.docx/embed1.zip\""));
+        assertFalse(content.contains("X-TIKA:content"));
 
     }
 
@@ -368,7 +366,7 @@ public class TikaCLITest {
         String[] params = new String[]{"-J", "-r", resourcePrefix+"test_recursive_embedded.docx"};
         TikaCLI.main(params);
         String content = outContent.toString("UTF-8");
-        assertTrue(content.contains("\"tika:content\": \"\\u003chtml xmlns\\u003d\\\"http://www.w3.org/1999/xhtml"));
+        assertTrue(content.contains("\"X-TIKA:content\": \"\\u003chtml xmlns\\u003d\\\"http://www.w3.org/1999/xhtml"));
     }
 
     @Test

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java?rev=1641841&r1=1641840&r2=1641841&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java Wed Nov 26 16:12:11 2014
@@ -51,8 +51,21 @@ public interface TikaCoreProperties {
         ATTACHMENT
     };
 
+    /**
+     * Use this to prefix metadata properties that store information
+     * about the parsing process.  Users should be able to distinguish
+     * between metadata that was contained within the document and
+     * metadata about the parsing process.
+     * TODO: In Tika 2.0 (or earlier), change X-ParsedBy to X-TIKA-Parsed-By.
+     */
+    public static String TIKA_META_PREFIX = "X-TIKA"+Metadata.NAMESPACE_PREFIX_DELIMITER;
 
     /**
+     * Use this to prefix metadata keys that store parse exception information in the Metadata object.
+     */
+    public static String TIKA_META_EXCEPTION_PREFIX = TIKA_META_PREFIX+"EXCEPTION"+
+            Metadata.NAMESPACE_PREFIX_DELIMITER;
+    /**
      * @see DublinCore#FORMAT
      */
     public static final Property FORMAT = Property.composite(DublinCore.FORMAT, 

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java?rev=1641841&r1=1641840&r2=1641841&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java Wed Nov 26 16:12:11 2014
@@ -17,10 +17,18 @@ package org.apache.tika.parser;
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Date;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.FilenameUtils;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.metadata.TikaMetadataKeys;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.sax.ContentHandlerFactory;
@@ -28,13 +36,6 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Date;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Set;
-
 /**
  * This is a helper class that wraps a parser in a recursive handler.
  * It takes care of setting the embedded parser in the ParseContext 
@@ -75,25 +76,17 @@ public class RecursiveParserWrapper impl
      */
     private static final long serialVersionUID = 9086536568120690938L;
 
-
-    
-    public final static String TIKA_PREFIX = "tika:";
-    public final static String TIKA_EXCEPTION_PREFIX = "tika_ex:";
-    
     //move this to TikaCoreProperties?
-    public final static Property TIKA_CONTENT = Property.internalText(TIKA_PREFIX+"content");
-    public final static Property PARSE_TIME_MILLIS = Property.internalText(TIKA_PREFIX+"parse_time_millis");
+    public final static Property TIKA_CONTENT = Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"content");
+    public final static Property PARSE_TIME_MILLIS = Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"parse_time_millis");
     public final static Property WRITE_LIMIT_REACHED =
-                Property.internalBoolean(TIKA_EXCEPTION_PREFIX+"write_limit_reached");
+                Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX+"write_limit_reached");
     public final static Property EMBEDDED_RESOURCE_LIMIT_REACHED = 
-                Property.internalBoolean(TIKA_EXCEPTION_PREFIX+"embedded_resource_limit_reached");
-
-    public final static Property PARSE_EXCEPTION =
-            Property.internalBoolean(TIKA_EXCEPTION_PREFIX+"parse_exception");
+                Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX+"embedded_resource_limit_reached");
 
     //move this to TikaCoreProperties?
     public final static Property EMBEDDED_RESOURCE_PATH = 
-                Property.internalText(TIKA_PREFIX+"embedded_resource_path");
+                Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"embedded_resource_path");
  
     private final Parser wrappedParser;
     private final ContentHandlerFactory contentHandlerFactory;

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1641841&r1=1641840&r2=1641841&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Wed Nov 26 16:12:11 2014
@@ -254,7 +254,7 @@ public class PDFParser extends AbstractP
                 // TODO WARN if this XMP version is inconsistent with document header version?          
             }
         } catch (IOException e) {
-            metadata.set("pdf:metadata-xmp-parse-failed", ""+e);
+            metadata.set(TikaCoreProperties.TIKA_META_PREFIX+"pdf:metadata-xmp-parse-failed", ""+e);
         }
         //TODO: Let's try to move this into PDFBox.
         //Attempt to determine Adobe extension level, if present: