You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2012/10/09 18:43:34 UTC
svn commit: r1396103 - in /tika/trunk: CHANGES.txt
tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
Author: mikemccand
Date: Tue Oct 9 16:43:34 2012
New Revision: 1396103
URL: http://svn.apache.org/viewvc?rev=1396103&view=rev
Log:
TIKA-999: also extract CREATION_DATE from RTF
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1396103&r1=1396102&r2=1396103&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Tue Oct 9 16:43:34 2012
@@ -25,8 +25,8 @@ Release 1.3 - Current Development
key, and TikaCLI prepends the rId (if present) onto the filename
it extracts (TIKA-989).
- * RTF: Page, word and character count metadata are now extracted for
- RTF documents (TIKA-999).
+ * RTF: Page count, word count, character count and creation date
+ metadata are now extracted for RTF documents (TIKA-999).
* MS PowerPoint (.pptx): When a PowerPoint (.pptx) document contains
embedded files, Tika now places a <div class="embedded" id="XXX"/> into the
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java?rev=1396103&r1=1396102&r2=1396103&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java Tue Oct 9 16:43:34 2012
@@ -26,6 +26,7 @@ import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
+import java.util.Calendar;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;
@@ -196,6 +197,9 @@ final class TextExtractor {
private final XHTMLContentHandler out;
private final Metadata metadata;
+ // Used when extracting CREATION date:
+ private int year, month, day, hour, minute;
+
// How many next ansi chars we should skip; this
// is 0 except when we are still in the "ansi
// shadow" after seeing a unicode escape, at which
@@ -788,6 +792,16 @@ final class TextExtractor {
metadata.add(Office.WORD_COUNT, Integer.toString(param));
} else if (equals("nofchars")) {
metadata.add(Office.CHARACTER_COUNT, Integer.toString(param));
+ } else if (equals("yr")) {
+ year = param;
+ } else if (equals("mo")) {
+ month = param;
+ } else if (equals("dy")) {
+ day = param;
+ } else if (equals("hr")) {
+ hour = param;
+ } else if (equals("min")) {
+ minute = param;
}
if (fontTableState == 1) {
@@ -931,6 +945,8 @@ final class TextExtractor {
nextMetaData = OfficeOpenXMLExtended.MANAGER;
} else if (equals("template")) {
nextMetaData = OfficeOpenXMLExtended.TEMPLATE;
+ } else if (equals("creatim")) {
+ nextMetaData = TikaCoreProperties.CREATED;
}
}
@@ -1150,7 +1166,11 @@ final class TextExtractor {
if (inHeader) {
if (nextMetaData != null) {
- if (nextMetaData.isMultiValuePermitted()) {
+ if (nextMetaData == TikaCoreProperties.CREATED) {
+ Calendar cal = Calendar.getInstance();
+ cal.set(year, month-1, day, hour, minute, 0);
+ metadata.set(nextMetaData, cal.getTime());
+ } else if (nextMetaData.isMultiValuePermitted()) {
metadata.add(nextMetaData, pendingBuffer.toString());
} else {
metadata.set(nextMetaData, pendingBuffer.toString());
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java?rev=1396103&r1=1396102&r2=1396103&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java Tue Oct 9 16:43:34 2012
@@ -326,6 +326,7 @@ public class RTFParserTest extends TikaT
assertEquals("1", xml.metadata.get(Office.PAGE_COUNT));
assertEquals("7", xml.metadata.get(Office.WORD_COUNT));
assertEquals("36", xml.metadata.get(Office.CHARACTER_COUNT));
+ assertEquals("2012-09-02T17:01:00Z", xml.metadata.get(Office.CREATION_DATE));
}
private Result getResult(String filename) throws Exception {