You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2014/09/08 21:36:14 UTC

svn commit: r1623501 - in /tika/trunk: ./ tika-parsers/src/main/java/org/apache/tika/parser/pkg/ tika-parsers/src/test/java/org/apache/tika/parser/pkg/

Author: nick
Date: Mon Sep  8 19:36:14 2014
New Revision: 1623501

URL: http://svn.apache.org/r1623501
Log:
Have PackageParser include the last-modified date from the archive in the metadata when handling embedded entries (TIKA-1246)

Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1623501&r1=1623500&r2=1623501&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Mon Sep  8 19:36:14 2014
@@ -1,5 +1,7 @@
 Release 1.7 - Current Development
 
+  * PackageParser includes the last-modified date from the archive
+    in the metadata, when handling embedded entries (TIKA-1246)
 
 
 Release 1.6 - 08/31/2014

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java?rev=1623501&r1=1623500&r2=1623501&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java Mon Sep  8 19:36:14 2014
@@ -40,6 +40,7 @@ import org.apache.tika.io.CloseShieldInp
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
@@ -169,7 +170,9 @@ public class PackageParser extends Abstr
             throws SAXException, IOException, TikaException {
         String name = entry.getName();
         if (archive.canReadEntryData(entry)) {
+            // Fetch the metadata on the entry contained in the archive
             Metadata entrydata = new Metadata();
+            entrydata.set(TikaCoreProperties.MODIFIED, entry.getLastModifiedDate());
             if (name != null && name.length() > 0) {
                 entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
                 AttributesImpl attributes = new AttributesImpl();
@@ -180,6 +183,7 @@ public class PackageParser extends Abstr
 
                 entrydata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, name);
             }
+            
             if (extractor.shouldParseEmbedded(entrydata)) {
                 // For detectors to work, we need a mark/reset supporting
                 // InputStream, which ArchiveInputStream isn't, so wrap

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java?rev=1623501&r1=1623500&r2=1623501&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java Mon Sep  8 19:36:14 2014
@@ -25,6 +25,7 @@ import java.util.Set;
 import org.apache.tika.TikaTest;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.AutoDetectParser;
@@ -60,11 +61,13 @@ public abstract class AbstractPkgTest ex
    protected static class EmbeddedTrackingParser extends AbstractParser {
       protected List<String> filenames = new ArrayList<String>();
       protected List<String> mediatypes = new ArrayList<String>();
+      protected List<String> modifiedAts = new ArrayList<String>();
       protected byte[] lastSeenStart;
       
       public void reset() {
          filenames.clear();
          mediatypes.clear();
+         modifiedAts.clear();
       }
       
       public Set<MediaType> getSupportedTypes(ParseContext context) {
@@ -77,6 +80,7 @@ public abstract class AbstractPkgTest ex
             SAXException, TikaException {
          filenames.add(metadata.get(Metadata.RESOURCE_NAME_KEY));
          mediatypes.add(metadata.get(Metadata.CONTENT_TYPE));
+         modifiedAts.add(metadata.get(TikaCoreProperties.MODIFIED));
          
          lastSeenStart = new byte[32];
          stream.read(lastSeenStart);

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java?rev=1623501&r1=1623500&r2=1623501&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java Mon Sep  8 19:36:14 2014
@@ -87,8 +87,12 @@ public class ArParserTest extends Abstra
 
 		assertEquals(1, tracker.filenames.size());
 		assertEquals(1, tracker.mediatypes.size());
+                assertEquals(1, tracker.modifiedAts.size());
 
 		assertEquals("testTXT.txt", tracker.filenames.get(0));
+		
+		String modifiedAt = tracker.modifiedAts.get(0);
+	        assertTrue("Modified at " + modifiedAt, modifiedAt.startsWith("201"));
 
 		for (String type : tracker.mediatypes) {
 			assertNull(type);

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java?rev=1623501&r1=1623500&r2=1623501&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java Mon Sep  8 19:36:14 2014
@@ -91,9 +91,11 @@ public class Bzip2ParserTest extends Abs
        // Should find a single entry, for the (compressed) tar file
        assertEquals(1, tracker.filenames.size());
        assertEquals(1, tracker.mediatypes.size());
+       assertEquals(1, tracker.modifiedAts.size());
        
        assertEquals(null, tracker.filenames.get(0));
        assertEquals(null, tracker.mediatypes.get(0));
+       assertEquals(null, tracker.modifiedAts.get(0));
 
        // Tar file starts with the directory name
        assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, "ASCII"));

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java?rev=1623501&r1=1623500&r2=1623501&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java Mon Sep  8 19:36:14 2014
@@ -90,9 +90,11 @@ public class GzipParserTest extends Abst
        // Should find a single entry, for the (compressed) tar file
        assertEquals(1, tracker.filenames.size());
        assertEquals(1, tracker.mediatypes.size());
+       assertEquals(1, tracker.modifiedAts.size());
        
        assertEquals(null, tracker.filenames.get(0));
        assertEquals(null, tracker.mediatypes.get(0));
+       assertEquals(null, tracker.modifiedAts.get(0));
 
        // Tar file starts with the directory name
        assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, "ASCII"));

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java?rev=1623501&r1=1623500&r2=1623501&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java Mon Sep  8 19:36:14 2014
@@ -17,6 +17,7 @@
 package org.apache.tika.parser.pkg;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 
@@ -98,6 +99,7 @@ public class Seven7ParserTest extends Ab
        // Should have found all 9 documents, but not the directory
        assertEquals(9, tracker.filenames.size());
        assertEquals(9, tracker.mediatypes.size());
+       assertEquals(9, tracker.modifiedAts.size());
        
        // Should have names but not content types, as 7z doesn't
        //  store the content types
@@ -114,5 +116,9 @@ public class Seven7ParserTest extends Ab
        for(String type : tracker.mediatypes) {
           assertNull(type);
        }
+       for(String mod : tracker.modifiedAts) {
+           assertNotNull(mod);
+           assertTrue("Modified at " + mod, mod.startsWith("20"));
+       }
     }
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java?rev=1623501&r1=1623500&r2=1623501&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java Mon Sep  8 19:36:14 2014
@@ -17,6 +17,7 @@
 package org.apache.tika.parser.pkg;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 
@@ -91,6 +92,7 @@ public class TarParserTest extends Abstr
        // Should have found all 9 documents, but not the directory
        assertEquals(9, tracker.filenames.size());
        assertEquals(9, tracker.mediatypes.size());
+       assertEquals(9, tracker.modifiedAts.size());
        
        // Should have names but not content types, as tar doesn't
        //  store the content types
@@ -107,5 +109,9 @@ public class TarParserTest extends Abstr
        for(String type : tracker.mediatypes) {
           assertNull(type);
        }
+       for(String mod : tracker.modifiedAts) {
+           assertNotNull(mod);
+           assertTrue("Modified at " + mod, mod.startsWith("20"));
+       }
     }
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java?rev=1623501&r1=1623500&r2=1623501&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java Mon Sep  8 19:36:14 2014
@@ -17,6 +17,7 @@
 package org.apache.tika.parser.pkg;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 
@@ -100,9 +101,10 @@ public class ZipParserTest extends Abstr
        // Should have found all 9 documents
        assertEquals(9, tracker.filenames.size());
        assertEquals(9, tracker.mediatypes.size());
+       assertEquals(9, tracker.modifiedAts.size());
        
-       // Should have names but not content types, as zip doesn't
-       //  store the content types
+       // Should have names and modified dates, but not content types, 
+       //  as zip doesn't store the content types
        assertEquals("testEXCEL.xls", tracker.filenames.get(0));
        assertEquals("testHTML.html", tracker.filenames.get(1));
        assertEquals("testOpenOffice2.odt", tracker.filenames.get(2));
@@ -116,6 +118,10 @@ public class ZipParserTest extends Abstr
        for(String type : tracker.mediatypes) {
           assertNull(type);
        }
+       for(String mod : tracker.modifiedAts) {
+           assertNotNull(mod);
+           assertTrue("Modified at " + mod, mod.startsWith("20"));
+       }
     }
 
     /**