You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2014/09/08 21:36:14 UTC
svn commit: r1623501 - in /tika/trunk: ./
tika-parsers/src/main/java/org/apache/tika/parser/pkg/
tika-parsers/src/test/java/org/apache/tika/parser/pkg/
Author: nick
Date: Mon Sep 8 19:36:14 2014
New Revision: 1623501
URL: http://svn.apache.org/r1623501
Log:
Have PackageParser include the last-modified date from the archive in the metadata when handling embedded entries (TIKA-1246)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1623501&r1=1623500&r2=1623501&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Mon Sep 8 19:36:14 2014
@@ -1,5 +1,7 @@
Release 1.7 - Current Development
+ * PackageParser includes the last-modified date from the archive
+ in the metadata, when handling embedded entries (TIKA-1246)
Release 1.6 - 08/31/2014
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java?rev=1623501&r1=1623500&r2=1623501&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java Mon Sep 8 19:36:14 2014
@@ -40,6 +40,7 @@ import org.apache.tika.io.CloseShieldInp
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
@@ -169,7 +170,9 @@ public class PackageParser extends Abstr
throws SAXException, IOException, TikaException {
String name = entry.getName();
if (archive.canReadEntryData(entry)) {
+ // Fetch the metadata on the entry contained in the archive
Metadata entrydata = new Metadata();
+ entrydata.set(TikaCoreProperties.MODIFIED, entry.getLastModifiedDate());
if (name != null && name.length() > 0) {
entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
AttributesImpl attributes = new AttributesImpl();
@@ -180,6 +183,7 @@ public class PackageParser extends Abstr
entrydata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, name);
}
+
if (extractor.shouldParseEmbedded(entrydata)) {
// For detectors to work, we need a mark/reset supporting
// InputStream, which ArchiveInputStream isn't, so wrap
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java?rev=1623501&r1=1623500&r2=1623501&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java Mon Sep 8 19:36:14 2014
@@ -25,6 +25,7 @@ import java.util.Set;
import org.apache.tika.TikaTest;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.AutoDetectParser;
@@ -60,11 +61,13 @@ public abstract class AbstractPkgTest ex
protected static class EmbeddedTrackingParser extends AbstractParser {
protected List<String> filenames = new ArrayList<String>();
protected List<String> mediatypes = new ArrayList<String>();
+ protected List<String> modifiedAts = new ArrayList<String>();
protected byte[] lastSeenStart;
public void reset() {
filenames.clear();
mediatypes.clear();
+ modifiedAts.clear();
}
public Set<MediaType> getSupportedTypes(ParseContext context) {
@@ -77,6 +80,7 @@ public abstract class AbstractPkgTest ex
SAXException, TikaException {
filenames.add(metadata.get(Metadata.RESOURCE_NAME_KEY));
mediatypes.add(metadata.get(Metadata.CONTENT_TYPE));
+ modifiedAts.add(metadata.get(TikaCoreProperties.MODIFIED));
lastSeenStart = new byte[32];
stream.read(lastSeenStart);
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java?rev=1623501&r1=1623500&r2=1623501&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java Mon Sep 8 19:36:14 2014
@@ -87,8 +87,12 @@ public class ArParserTest extends Abstra
assertEquals(1, tracker.filenames.size());
assertEquals(1, tracker.mediatypes.size());
+ assertEquals(1, tracker.modifiedAts.size());
assertEquals("testTXT.txt", tracker.filenames.get(0));
+
+ String modifiedAt = tracker.modifiedAts.get(0);
+ assertTrue("Modified at " + modifiedAt, modifiedAt.startsWith("201"));
for (String type : tracker.mediatypes) {
assertNull(type);
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java?rev=1623501&r1=1623500&r2=1623501&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java Mon Sep 8 19:36:14 2014
@@ -91,9 +91,11 @@ public class Bzip2ParserTest extends Abs
// Should find a single entry, for the (compressed) tar file
assertEquals(1, tracker.filenames.size());
assertEquals(1, tracker.mediatypes.size());
+ assertEquals(1, tracker.modifiedAts.size());
assertEquals(null, tracker.filenames.get(0));
assertEquals(null, tracker.mediatypes.get(0));
+ assertEquals(null, tracker.modifiedAts.get(0));
// Tar file starts with the directory name
assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, "ASCII"));
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java?rev=1623501&r1=1623500&r2=1623501&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java Mon Sep 8 19:36:14 2014
@@ -90,9 +90,11 @@ public class GzipParserTest extends Abst
// Should find a single entry, for the (compressed) tar file
assertEquals(1, tracker.filenames.size());
assertEquals(1, tracker.mediatypes.size());
+ assertEquals(1, tracker.modifiedAts.size());
assertEquals(null, tracker.filenames.get(0));
assertEquals(null, tracker.mediatypes.get(0));
+ assertEquals(null, tracker.modifiedAts.get(0));
// Tar file starts with the directory name
assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, "ASCII"));
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java?rev=1623501&r1=1623500&r2=1623501&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java Mon Sep 8 19:36:14 2014
@@ -17,6 +17,7 @@
package org.apache.tika.parser.pkg;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
@@ -98,6 +99,7 @@ public class Seven7ParserTest extends Ab
// Should have found all 9 documents, but not the directory
assertEquals(9, tracker.filenames.size());
assertEquals(9, tracker.mediatypes.size());
+ assertEquals(9, tracker.modifiedAts.size());
// Should have names but not content types, as 7z doesn't
// store the content types
@@ -114,5 +116,9 @@ public class Seven7ParserTest extends Ab
for(String type : tracker.mediatypes) {
assertNull(type);
}
+ for(String mod : tracker.modifiedAts) {
+ assertNotNull(mod);
+ assertTrue("Modified at " + mod, mod.startsWith("20"));
+ }
}
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java?rev=1623501&r1=1623500&r2=1623501&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java Mon Sep 8 19:36:14 2014
@@ -17,6 +17,7 @@
package org.apache.tika.parser.pkg;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
@@ -91,6 +92,7 @@ public class TarParserTest extends Abstr
// Should have found all 9 documents, but not the directory
assertEquals(9, tracker.filenames.size());
assertEquals(9, tracker.mediatypes.size());
+ assertEquals(9, tracker.modifiedAts.size());
// Should have names but not content types, as tar doesn't
// store the content types
@@ -107,5 +109,9 @@ public class TarParserTest extends Abstr
for(String type : tracker.mediatypes) {
assertNull(type);
}
+ for(String mod : tracker.modifiedAts) {
+ assertNotNull(mod);
+ assertTrue("Modified at " + mod, mod.startsWith("20"));
+ }
}
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java?rev=1623501&r1=1623500&r2=1623501&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java Mon Sep 8 19:36:14 2014
@@ -17,6 +17,7 @@
package org.apache.tika.parser.pkg;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
@@ -100,9 +101,10 @@ public class ZipParserTest extends Abstr
// Should have found all 9 documents
assertEquals(9, tracker.filenames.size());
assertEquals(9, tracker.mediatypes.size());
+ assertEquals(9, tracker.modifiedAts.size());
- // Should have names but not content types, as zip doesn't
- // store the content types
+ // Should have names and modified dates, but not content types,
+ // as zip doesn't store the content types
assertEquals("testEXCEL.xls", tracker.filenames.get(0));
assertEquals("testHTML.html", tracker.filenames.get(1));
assertEquals("testOpenOffice2.odt", tracker.filenames.get(2));
@@ -116,6 +118,10 @@ public class ZipParserTest extends Abstr
for(String type : tracker.mediatypes) {
assertNull(type);
}
+ for(String mod : tracker.modifiedAts) {
+ assertNotNull(mod);
+ assertTrue("Modified at " + mod, mod.startsWith("20"));
+ }
}
/**