You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2012/08/07 23:41:30 UTC

svn commit: r1370548 - in /tika/trunk: CHANGES.txt tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java

Author: mikemccand
Date: Tue Aug  7 21:41:30 2012
New Revision: 1370548

URL: http://svn.apache.org/viewvc?rev=1370548&view=rev
Log:
TIKA-956: show where embedded docs occurred when processing Word (.doc) documents

Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1370548&r1=1370547&r2=1370548&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Tue Aug  7 21:41:30 2012
@@ -1,5 +1,9 @@
 Release 1.3 - Current Development
 
+  * MS Word: When a Word (.doc) document contains embedded files, Tika
+    now places a <div class="embedded" id="_XXX"/> into the XHTML so
+    you can see where in the main text the embedded document
+    occurred. (TIKA-956)
 
 Release 1.2 - 07/10/2012
 ---------------------------------

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1370548&r1=1370547&r2=1370548&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Tue Aug  7 21:41:30 2012
@@ -29,9 +29,11 @@ import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.hwpf.HWPFOldDocument;
 import org.apache.poi.hwpf.OldWordFileFormatException;
 import org.apache.poi.hwpf.extractor.Word6Extractor;
+import org.apache.poi.hwpf.model.FieldsDocumentPart;
 import org.apache.poi.hwpf.model.PicturesTable;
 import org.apache.poi.hwpf.model.StyleDescription;
 import org.apache.poi.hwpf.usermodel.CharacterRun;
+import org.apache.poi.hwpf.usermodel.Field;
 import org.apache.poi.hwpf.usermodel.Paragraph;
 import org.apache.poi.hwpf.usermodel.Picture;
 import org.apache.poi.hwpf.usermodel.Range;
@@ -188,6 +190,24 @@ public class WordExtractor extends Abstr
 
        for(int j=0; j<p.numCharacterRuns(); j++) {
           CharacterRun cr = p.getCharacterRun(j);
+
+          // FIELD_BEGIN_MARK:
+          if (cr.text().getBytes()[0] == 0x13) {
+             Field field = document.getFields().getFieldByStartOffset(FieldsDocumentPart.MAIN,
+                                                                      cr.getStartOffset());
+             if (field != null && field.getType() == 58) {
+               // Embedded Object: add a <div class="embedded"
+               // id="_XXX"/> so consumer can see where
+               // in the main text each embedded document
+               // occurred:
+               String id = "_" + field.getMarkSeparatorCharacterRun(r).getPicOffset();
+               AttributesImpl attributes = new AttributesImpl();
+               attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+               attributes.addAttribute("", "id", "id", "CDATA", id);
+               xhtml.startElement("div", attributes);
+               xhtml.endElement("div");
+             }
+          }
           
           if(cr.text().equals("\u0013")) {
              j += handleSpecialCharacterRuns(p, j, tas.isHeading(), pictures, xhtml);

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1370548&r1=1370547&r2=1370548&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Tue Aug  7 21:41:30 2012
@@ -171,6 +171,22 @@ public class WordParserTest extends Tika
         assertTrue("Bold text wasn't contiguous: "+xml, xml.contains("F<b>oob</b>a<b>r</b>"));
     }
 
+    public void testEmbeddedNames() throws Exception {
+        String result = getXML("/test-documents/testWORD_embedded_pdf.doc").xml;
+
+        // Make sure the embedded div comes out after "Here
+        // is the pdf file" and before "Bye Bye":
+        int i = result.indexOf("Here is the pdf file:");
+        assertTrue(i != -1);
+        int j = result.indexOf("<div class=\"embedded\" id=\"_1402837031\"/>");
+        assertTrue(j != -1);
+        int k = result.indexOf("Bye Bye");
+        assertTrue(k != -1);
+
+        assertTrue(i < j);
+        assertTrue(j < k);
+    }
+
     public void testWord6Parser() throws Exception {
         InputStream input = WordParserTest.class.getResourceAsStream(
                 "/test-documents/testWORD6.doc");