You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2012/11/12 12:26:50 UTC

svn commit: r1408245 - in /tika/trunk: ./ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ tika-parsers/src/test/resources/test-documents/

Author: mikemccand
Date: Mon Nov 12 11:26:49 2012
New Revision: 1408245

URL: http://svn.apache.org/viewvc?rev=1408245&view=rev
Log:
TIKA-1019: also leave a placeholder for document links inside .doc files

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testDocumentLink.doc   (with props)
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1408245&r1=1408244&r2=1408245&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Mon Nov 12 11:26:49 2012
@@ -1,9 +1,10 @@
 Release 1.3 - Current Development
 
-  * MS Word: When a Word (.doc) document contains embedded files, Tika
-    now places a <div class="embedded" id="_XXX"/> into the XHTML so
-    you can see where in the main text the embedded document
-    occurred (TIKA-956).  Embedded Wordpad/RTF documents are now
+  * MS Word: When a Word (.doc) document contains embedded files or
+    links to external documents, Tika now places a <div
+    class="embedded" id="_XXX"/> placeholder into the XHTML so you can
+    see where in the main text the embedded document occurred
+    (TIKA-956, TIKA-1019).  Embedded Wordpad/RTF documents are now
     recognized (TIKA-982).
 
   * PDF: Text from pop-up annotations is now extracted (TIKA-981)

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1408245&r1=1408244&r2=1408245&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Mon Nov 12 11:26:49 2012
@@ -195,7 +195,9 @@ public class WordExtractor extends Abstr
           if (cr.text().getBytes()[0] == 0x13) {
              Field field = document.getFields().getFieldByStartOffset(FieldsDocumentPart.MAIN,
                                                                       cr.getStartOffset());
-             if (field != null && field.getType() == 58) {
+             // 58 is an embedded document
+             // 56 is a document link
+             if (field != null && (field.getType() == 58 || field.getType() == 56)) {
                // Embedded Object: add a <div
                // class="embedded" id="_X"/> so consumer can see where
                // in the main text each embedded document

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1408245&r1=1408244&r2=1408245&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Mon Nov 12 11:26:49 2012
@@ -194,6 +194,13 @@ public class WordParserTest extends Tika
         assertTrue(result.indexOf("_1404039792.rtf") != -1);
     }
 
+    // TIKA-1019
+    public void testDocumentLink() throws Exception {
+        String result = getXML("/test-documents/testDocumentLink.doc").xml;
+        assertTrue(result.indexOf("<div class=\"embedded\" id=\"_1327495610\"/>") != -1);
+        assertTrue(result.indexOf("_1327495610.unknown") != -1);
+    }
+
     public void testWord6Parser() throws Exception {
         InputStream input = WordParserTest.class.getResourceAsStream(
                 "/test-documents/testWORD6.doc");

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testDocumentLink.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testDocumentLink.doc?rev=1408245&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testDocumentLink.doc
------------------------------------------------------------------------------
    svn:executable = *

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testDocumentLink.doc
------------------------------------------------------------------------------
    svn:mime-type = application/msword