You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2012/11/12 12:26:50 UTC
svn commit: r1408245 - in /tika/trunk: ./
tika-parsers/src/main/java/org/apache/tika/parser/microsoft/
tika-parsers/src/test/java/org/apache/tika/parser/microsoft/
tika-parsers/src/test/resources/test-documents/
Author: mikemccand
Date: Mon Nov 12 11:26:49 2012
New Revision: 1408245
URL: http://svn.apache.org/viewvc?rev=1408245&view=rev
Log:
TIKA-1019: also leave placeholder for links inside .doc
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testDocumentLink.doc (with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1408245&r1=1408244&r2=1408245&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Mon Nov 12 11:26:49 2012
@@ -1,9 +1,10 @@
Release 1.3 - Current Development
- * MS Word: When a Word (.doc) document contains embedded files, Tika
- now places a <div class="embedded" id="_XXX"/> into the XHTML so
- you can see where in the main text the embedded document
- occurred (TIKA-956). Embedded Wordpad/RTF documents are now
+ * MS Word: When a Word (.doc) document contains embedded files or
+ links to external documents, Tika now places a <div
+ class="embedded" id="_XXX"/> placeholder into the XHTML so you can
+ see where in the main text the embedded document occurred
+ (TIKA-956, TIKA-1019). Embedded Wordpad/RTF documents are now
recognized (TIKA-982).
* PDF: Text from pop-up annotations is now extracted (TIKA-981)
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1408245&r1=1408244&r2=1408245&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Mon Nov 12 11:26:49 2012
@@ -195,7 +195,9 @@ public class WordExtractor extends Abstr
if (cr.text().getBytes()[0] == 0x13) {
Field field = document.getFields().getFieldByStartOffset(FieldsDocumentPart.MAIN,
cr.getStartOffset());
- if (field != null && field.getType() == 58) {
+ // 58 is an embedded document
+ // 56 is a document link
+ if (field != null && (field.getType() == 58 || field.getType() == 56)) {
// Embedded Object: add a <div
// class="embedded" id="_X"/> so consumer can see where
// in the main text each embedded document
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1408245&r1=1408244&r2=1408245&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Mon Nov 12 11:26:49 2012
@@ -194,6 +194,13 @@ public class WordParserTest extends Tika
assertTrue(result.indexOf("_1404039792.rtf") != -1);
}
+ // TIKA-1019
+ public void testDocumentLink() throws Exception {
+ String result = getXML("/test-documents/testDocumentLink.doc").xml;
+ assertTrue(result.indexOf("<div class=\"embedded\" id=\"_1327495610\"/>") != -1);
+ assertTrue(result.indexOf("_1327495610.unknown") != -1);
+ }
+
public void testWord6Parser() throws Exception {
InputStream input = WordParserTest.class.getResourceAsStream(
"/test-documents/testWORD6.doc");
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testDocumentLink.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testDocumentLink.doc?rev=1408245&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testDocumentLink.doc
------------------------------------------------------------------------------
svn:executable = *
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testDocumentLink.doc
------------------------------------------------------------------------------
svn:mime-type = application/msword