You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2012/08/07 23:41:30 UTC
svn commit: r1370548 - in /tika/trunk: CHANGES.txt
tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
Author: mikemccand
Date: Tue Aug 7 21:41:30 2012
New Revision: 1370548
URL: http://svn.apache.org/viewvc?rev=1370548&view=rev
Log:
TIKA-956: show where embedded docs occurred when processing Word (.doc) documents
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1370548&r1=1370547&r2=1370548&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Tue Aug 7 21:41:30 2012
@@ -1,5 +1,9 @@
Release 1.3 - Current Development
+ * MS Word: When a Word (.doc) document contains embedded files, Tika
+ now places a <div class="embedded" id="_XXX"/> into the XHTML so
+ you can see where in the main text the embedded document
+ occurred. (TIKA-956)
Release 1.2 - 07/10/2012
---------------------------------
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1370548&r1=1370547&r2=1370548&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Tue Aug 7 21:41:30 2012
@@ -29,9 +29,11 @@ import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.HWPFOldDocument;
import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.poi.hwpf.extractor.Word6Extractor;
+import org.apache.poi.hwpf.model.FieldsDocumentPart;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.model.StyleDescription;
import org.apache.poi.hwpf.usermodel.CharacterRun;
+import org.apache.poi.hwpf.usermodel.Field;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
@@ -188,6 +190,24 @@ public class WordExtractor extends Abstr
for(int j=0; j<p.numCharacterRuns(); j++) {
CharacterRun cr = p.getCharacterRun(j);
+
+ // FIELD_BEGIN_MARK:
+ if (cr.text().getBytes()[0] == 0x13) {
+ Field field = document.getFields().getFieldByStartOffset(FieldsDocumentPart.MAIN,
+ cr.getStartOffset());
+ if (field != null && field.getType() == 58) {
+ // Embedded Object: add a <div class="embedded"
+ // id="_XXX"/> so the consumer can see where
+ // in the main text each embedded document
+ // occurred:
+ String id = "_" + field.getMarkSeparatorCharacterRun(r).getPicOffset();
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+ attributes.addAttribute("", "id", "id", "CDATA", id);
+ xhtml.startElement("div", attributes);
+ xhtml.endElement("div");
+ }
+ }
if(cr.text().equals("\u0013")) {
j += handleSpecialCharacterRuns(p, j, tas.isHeading(), pictures, xhtml);
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1370548&r1=1370547&r2=1370548&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Tue Aug 7 21:41:30 2012
@@ -171,6 +171,22 @@ public class WordParserTest extends Tika
assertTrue("Bold text wasn't contiguous: "+xml, xml.contains("F<b>oob</b>a<b>r</b>"));
}
+ public void testEmbeddedNames() throws Exception {
+ String result = getXML("/test-documents/testWORD_embedded_pdf.doc").xml;
+
+ // Make sure the embedded div comes out after "Here
+ // is the pdf file" and before "Bye Bye":
+ int i = result.indexOf("Here is the pdf file:");
+ assertTrue(i != -1);
+ int j = result.indexOf("<div class=\"embedded\" id=\"_1402837031\"/>");
+ assertTrue(j != -1);
+ int k = result.indexOf("Bye Bye");
+ assertTrue(k != -1);
+
+ assertTrue(i < j);
+ assertTrue(j < k);
+ }
+
public void testWord6Parser() throws Exception {
InputStream input = WordParserTest.class.getResourceAsStream(
"/test-documents/testWORD6.doc");