You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2012/10/31 16:07:09 UTC

svn commit: r1404184 - in /tika/trunk: tika-app/src/main/java/org/apache/tika/cli/ tika-app/src/test/java/org/apache/tika/cli/ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/

Author: mikemccand
Date: Wed Oct 31 15:07:08 2012
New Revision: 1404184

URL: http://svn.apache.org/viewvc?rev=1404184&view=rev
Log:
TIKA-1015: include the relationship ID in Metadata when parsing embedded documents inside Word (.doc) files

Modified:
    tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
    tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java

Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1404184&r1=1404183&r2=1404184&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Wed Oct 31 15:07:08 2012
@@ -705,7 +705,7 @@ public class TikaCLI {
             }
 
             String relID = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID);
-            if (relID != null) {
+            if (relID != null && !name.startsWith(relID)) {
               name = relID + "_" + name;
             }
 

Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java?rev=1404184&r1=1404183&r2=1404184&view=diff
==============================================================================
--- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java (original)
+++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Wed Oct 31 15:07:08 2012
@@ -188,7 +188,7 @@ public class TikaCLITest extends TestCas
             // ChemDraw file
             File expected1 = new File(tempFile, "MBD002B040A.cdx");
             // OLE10Native
-            File expected2 = new File(tempFile, "file5");
+            File expected2 = new File(tempFile, "MBD002B0FA6_file5");
             // Image of one of the embedded resources
             File expected3 = new File(tempFile, "file0.emf");
             

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=1404184&r1=1404183&r2=1404184&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java Wed Oct 31 15:07:08 2012
@@ -84,7 +84,8 @@ abstract class AbstractPOIFSExtractor {
     }
     
     protected void handleEmbeddedResource(TikaInputStream resource, String filename,
-          String mediaType, XHTMLContentHandler xhtml, boolean outputHtml)
+                                          String relationshipID, String mediaType, XHTMLContentHandler xhtml,
+                                          boolean outputHtml)
           throws IOException, SAXException, TikaException {
        try {
            Metadata metadata = new Metadata();
@@ -92,6 +93,9 @@ abstract class AbstractPOIFSExtractor {
                metadata.set(Metadata.TIKA_MIME_FILE, filename);
                metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
            }
+           if (relationshipID != null) {
+               metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, relationshipID);
+           }
            if(mediaType != null) {
                metadata.set(Metadata.CONTENT_TYPE, mediaType);
            }
@@ -122,7 +126,7 @@ abstract class AbstractPOIFSExtractor {
             try {
                 ZipContainerDetector detector = new ZipContainerDetector();
                 MediaType type = detector.detect(stream, new Metadata());
-                handleEmbeddedResource(stream, null, type.toString(), xhtml, true);
+                handleEmbeddedResource(stream, null, dir.getName(), type.toString(), xhtml, true);
                 return;
             } finally {
                 stream.close();
@@ -133,6 +137,7 @@ abstract class AbstractPOIFSExtractor {
 
         // What kind of document is it?
         Metadata metadata = new Metadata();
+        metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName());
         POIFSDocumentType type = POIFSDocumentType.detectType(dir);
         TikaInputStream embedded = null;
 

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=1404184&r1=1404183&r2=1404184&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Wed Oct 31 15:07:08 2012
@@ -577,7 +577,7 @@ public class ExcelExtractor extends Abst
                     
                     // Handle the embeded resource
                     extractor.handleEmbeddedResource(
-                          stream, null, mimeType,
+                          stream, null, null, mimeType,
                           handler, true
                     );
                  }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1404184&r1=1404183&r2=1404184&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Wed Oct 31 15:07:08 2012
@@ -202,7 +202,7 @@ public class HSLFExtractor extends Abstr
             }
 
             handleEmbeddedResource(
-                  TikaInputStream.get(pic.getData()), null,
+                  TikaInputStream.get(pic.getData()), null, null,
                   mediaType, xhtml, false);
         }
     }
@@ -234,7 +234,7 @@ public class HSLFExtractor extends Abstr
                         mediaType = "application/vnd.ms-excel";
                      }
                      handleEmbeddedResource(
-                           stream, Integer.toString(oleShape.getObjectID()),
+                           stream, Integer.toString(oleShape.getObjectID()), null,
                            mediaType, xhtml, false);
                   } finally {
                      stream.close();

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1404184&r1=1404183&r2=1404184&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Wed Oct 31 15:07:08 2012
@@ -234,7 +234,7 @@ public class OutlookExtractor extends Ab
                if(attachment.attachData != null) {
                   handleEmbeddedResource(
                         TikaInputStream.get(attachment.attachData.getValue()),
-                        filename,
+                        filename, null,
                         null, xhtml, true
                   );
                }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1404184&r1=1404183&r2=1404184&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Wed Oct 31 15:07:08 2012
@@ -196,8 +196,8 @@ public class WordExtractor extends Abstr
              Field field = document.getFields().getFieldByStartOffset(FieldsDocumentPart.MAIN,
                                                                       cr.getStartOffset());
              if (field != null && field.getType() == 58) {
-                // Embedded Object: add a <div
-               // embedded="name"/> so consumer can see where
+               // Embedded Object: add a <div
+               // class="embedded" id="_X"/> so consumer can see where
                // in the main text each embedded document
                // occurred:
                String id = "_" + field.getMarkSeparatorCharacterRun(r).getPicOffset();
@@ -422,7 +422,7 @@ public class WordExtractor extends Abstr
        // (Only expose each individual image once) 
        if(! pictures.hasOutput(picture)) {
           TikaInputStream stream = TikaInputStream.get(picture.getContent());
-          handleEmbeddedResource(stream, filename, mimeType, xhtml, false);
+          handleEmbeddedResource(stream, filename, null, mimeType, xhtml, false);
           pictures.recordOutput(picture);
        }
     }