You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2012/10/31 16:07:09 UTC
svn commit: r1404184 - in /tika/trunk:
tika-app/src/main/java/org/apache/tika/cli/
tika-app/src/test/java/org/apache/tika/cli/
tika-parsers/src/main/java/org/apache/tika/parser/microsoft/
Author: mikemccand
Date: Wed Oct 31 15:07:08 2012
New Revision: 1404184
URL: http://svn.apache.org/viewvc?rev=1404184&view=rev
Log:
TIKA-1015: include rel id in Metadata when parsing embedded documents inside Word (.doc)
Modified:
tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1404184&r1=1404183&r2=1404184&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Wed Oct 31 15:07:08 2012
@@ -705,7 +705,7 @@ public class TikaCLI {
}
String relID = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID);
- if (relID != null) {
+ if (relID != null && !name.startsWith(relID)) {
name = relID + "_" + name;
}
Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java?rev=1404184&r1=1404183&r2=1404184&view=diff
==============================================================================
--- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java (original)
+++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Wed Oct 31 15:07:08 2012
@@ -188,7 +188,7 @@ public class TikaCLITest extends TestCas
// ChemDraw file
File expected1 = new File(tempFile, "MBD002B040A.cdx");
// OLE10Native
- File expected2 = new File(tempFile, "file5");
+ File expected2 = new File(tempFile, "MBD002B0FA6_file5");
// Image of one of the embedded resources
File expected3 = new File(tempFile, "file0.emf");
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=1404184&r1=1404183&r2=1404184&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java Wed Oct 31 15:07:08 2012
@@ -84,7 +84,8 @@ abstract class AbstractPOIFSExtractor {
}
protected void handleEmbeddedResource(TikaInputStream resource, String filename,
- String mediaType, XHTMLContentHandler xhtml, boolean outputHtml)
+ String relationshipID, String mediaType, XHTMLContentHandler xhtml,
+ boolean outputHtml)
throws IOException, SAXException, TikaException {
try {
Metadata metadata = new Metadata();
@@ -92,6 +93,9 @@ abstract class AbstractPOIFSExtractor {
metadata.set(Metadata.TIKA_MIME_FILE, filename);
metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
}
+ if (relationshipID != null) {
+ metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, relationshipID);
+ }
if(mediaType != null) {
metadata.set(Metadata.CONTENT_TYPE, mediaType);
}
@@ -122,7 +126,7 @@ abstract class AbstractPOIFSExtractor {
try {
ZipContainerDetector detector = new ZipContainerDetector();
MediaType type = detector.detect(stream, new Metadata());
- handleEmbeddedResource(stream, null, type.toString(), xhtml, true);
+ handleEmbeddedResource(stream, null, dir.getName(), type.toString(), xhtml, true);
return;
} finally {
stream.close();
@@ -133,6 +137,7 @@ abstract class AbstractPOIFSExtractor {
// What kind of document is it?
Metadata metadata = new Metadata();
+ metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName());
POIFSDocumentType type = POIFSDocumentType.detectType(dir);
TikaInputStream embedded = null;
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=1404184&r1=1404183&r2=1404184&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Wed Oct 31 15:07:08 2012
@@ -577,7 +577,7 @@ public class ExcelExtractor extends Abst
// Handle the embeded resource
extractor.handleEmbeddedResource(
- stream, null, mimeType,
+ stream, null, null, mimeType,
handler, true
);
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1404184&r1=1404183&r2=1404184&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Wed Oct 31 15:07:08 2012
@@ -202,7 +202,7 @@ public class HSLFExtractor extends Abstr
}
handleEmbeddedResource(
- TikaInputStream.get(pic.getData()), null,
+ TikaInputStream.get(pic.getData()), null, null,
mediaType, xhtml, false);
}
}
@@ -234,7 +234,7 @@ public class HSLFExtractor extends Abstr
mediaType = "application/vnd.ms-excel";
}
handleEmbeddedResource(
- stream, Integer.toString(oleShape.getObjectID()),
+ stream, Integer.toString(oleShape.getObjectID()), null,
mediaType, xhtml, false);
} finally {
stream.close();
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1404184&r1=1404183&r2=1404184&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Wed Oct 31 15:07:08 2012
@@ -234,7 +234,7 @@ public class OutlookExtractor extends Ab
if(attachment.attachData != null) {
handleEmbeddedResource(
TikaInputStream.get(attachment.attachData.getValue()),
- filename,
+ filename, null,
null, xhtml, true
);
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1404184&r1=1404183&r2=1404184&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Wed Oct 31 15:07:08 2012
@@ -196,8 +196,8 @@ public class WordExtractor extends Abstr
Field field = document.getFields().getFieldByStartOffset(FieldsDocumentPart.MAIN,
cr.getStartOffset());
if (field != null && field.getType() == 58) {
- // Embedded Object: add a <div
- // embedded="name"/> so consumer can see where
+ // Embedded Object: add a <div
+ // class="embedded" id="_X"/> so consumer can see where
// in the main text each embedded document
// occurred:
String id = "_" + field.getMarkSeparatorCharacterRun(r).getPicOffset();
@@ -422,7 +422,7 @@ public class WordExtractor extends Abstr
// (Only expose each individual image once)
if(! pictures.hasOutput(picture)) {
TikaInputStream stream = TikaInputStream.get(picture.getContent());
- handleEmbeddedResource(stream, filename, mimeType, xhtml, false);
+ handleEmbeddedResource(stream, filename, null, mimeType, xhtml, false);
pictures.recordOutput(picture);
}
}