You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/02/08 23:22:11 UTC
svn commit: r742187 - in /lucene/tika/trunk: CHANGES.txt
src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
src/test/resources/test-documents/testMSG.msg
Author: jukka
Date: Sun Feb 8 22:22:10 2009
New Revision: 742187
URL: http://svn.apache.org/viewvc?rev=742187&view=rev
Log:
TIKA-197: Microsoft Outlook (msg) files get parsed multiple times
Added a somewhat ugly marker flag to prevent Outlook documents from being parsed more than once. Added a test case for this fix.
Added:
lucene/tika/trunk/src/test/resources/test-documents/testMSG.msg (with props)
Modified:
lucene/tika/trunk/CHANGES.txt
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
lucene/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
Modified: lucene/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/CHANGES.txt?rev=742187&r1=742186&r2=742187&view=diff
==============================================================================
--- lucene/tika/trunk/CHANGES.txt (original)
+++ lucene/tika/trunk/CHANGES.txt Sun Feb 8 22:22:10 2009
@@ -25,6 +25,9 @@
* Embedded text in MIDI files is now extracted. For example many karaoke
files contain song lyrics embedded as MIDI text.
+ * The text content of Microsoft Outlook message files no longer appears as
+ multiple copies in the extracted text. (TIKA-197)
+
See http://tinyurl.com/tika-0-3-changes for a list of all changes in Tika 0.3.
The following people have contributed to Tika 0.3 by submitting or commenting
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=742187&r1=742186&r2=742187&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Sun Feb 8 22:22:10 2009
@@ -61,6 +61,7 @@
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
+ boolean outlookExtracted = false;
POIFSFileSystem filesystem = new POIFSFileSystem(stream);
Iterator<?> entries = filesystem.getRoot().getEntries();
while (entries.hasNext()) {
@@ -92,7 +93,9 @@
for (String text : extractor.getAllText()) {
xhtml.element("p", text);
}
- } else if (name.startsWith("__substg1.0_")) {
+ } else if (!outlookExtracted && name.startsWith("__substg1.0_")) {
+ // TODO: Cleaner mechanism for detecting Outlook
+ outlookExtracted = true;
setType(metadata, "application/vnd.ms-outlook");
new OutlookExtractor(filesystem).parse(xhtml, metadata);
}
Modified: lucene/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java?rev=742187&r1=742186&r2=742187&view=diff
==============================================================================
--- lucene/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java (original)
+++ lucene/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java Sun Feb 8 22:22:10 2009
@@ -17,6 +17,8 @@
package org.apache.tika.parser.microsoft;
import java.io.InputStream;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import junit.framework.TestCase;
@@ -62,4 +64,33 @@
assertTrue(content.contains("Messagerie et groupes de discussion"));
}
+ /**
+ * Test case for TIKA-197
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-197">TIKA-197</a>
+ */
+ public void testMultipleCopies() throws Exception {
+ Parser parser = new AutoDetectParser();
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ InputStream stream = OutlookParserTest.class.getResourceAsStream(
+ "/test-documents/testMSG.msg");
+ try {
+ parser.parse(stream, handler, metadata);
+ } finally {
+ stream.close();
+ }
+
+ assertEquals(
+ "application/vnd.ms-outlook",
+ metadata.get(Metadata.CONTENT_TYPE));
+
+ String content = handler.toString();
+ Pattern pattern = Pattern.compile("From");
+ Matcher matcher = pattern.matcher(content);
+ assertTrue(matcher.find());
+ assertFalse(matcher.find());
+ }
+
}
Added: lucene/tika/trunk/src/test/resources/test-documents/testMSG.msg
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/resources/test-documents/testMSG.msg?rev=742187&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/tika/trunk/src/test/resources/test-documents/testMSG.msg
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream