You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/01/14 12:42:05 UTC

svn commit: r1651625 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/pkg/ test/java/org/apache/tika/parser/pkg/

Author: nick
Date: Wed Jan 14 11:42:05 2015
New Revision: 1651625

URL: http://svn.apache.org/r1651625
Log:
TIKA-241 Refactor to use common logic between PackageParser and RarParser for populating xhtml+metadata of embedded resources

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java?rev=1651625&r1=1651624&r2=1651625&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java Wed Jan 14 11:42:05 2015
@@ -19,6 +19,7 @@ package org.apache.tika.parser.pkg;
 import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.Date;
 import java.util.Set;
 
 import org.apache.commons.compress.archivers.ArchiveEntry;
@@ -175,20 +176,10 @@ public class PackageParser extends Abstr
         String name = entry.getName();
         if (archive.canReadEntryData(entry)) {
             // Fetch the metadata on the entry contained in the archive
-            Metadata entrydata = new Metadata();
-            entrydata.set(TikaCoreProperties.MODIFIED, entry.getLastModifiedDate());
-            entrydata.set(Metadata.CONTENT_LENGTH, Long.toString(entry.getSize()));
-            if (name != null && name.length() > 0) {
-                entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
-                AttributesImpl attributes = new AttributesImpl();
-                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
-                attributes.addAttribute("", "id", "id", "CDATA", name);
-                xhtml.startElement("div", attributes);
-                xhtml.endElement("div");
-
-                entrydata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, name);
-            }
+            Metadata entrydata = handleEntryMetadata(name, null, 
+                    entry.getLastModifiedDate(), entry.getSize(), xhtml);
             
+            // Recurse into the entry if desired
             if (extractor.shouldParseEmbedded(entrydata)) {
                 // For detectors to work, we need a mark/reset supporting
                 // InputStream, which ArchiveInputStream isn't, so wrap
@@ -204,6 +195,34 @@ public class PackageParser extends Abstr
             xhtml.element("p", name);
         }
     }
+    /** Shared by PackageParser and RarParser: builds one entry's Metadata and writes its empty "embedded" div marker to xhtml. */
+    protected static Metadata handleEntryMetadata(
+            String name, Date createAt, Date modifiedAt, // dates may be null (PackageParser passes null for createAt)
+            Long size, XHTMLContentHandler xhtml) // size may be null; null values are simply not recorded
+            throws SAXException, IOException, TikaException {
+        Metadata entrydata = new Metadata();
+        if (createAt != null) {
+            entrydata.set(TikaCoreProperties.CREATED, createAt);
+        }
+        if (modifiedAt != null) {
+            entrydata.set(TikaCoreProperties.MODIFIED, modifiedAt);
+        }
+        if (size != null) {
+            entrydata.set(Metadata.CONTENT_LENGTH, Long.toString(size));
+        }
+        if (name != null && name.length() > 0) { // unnamed entries get no name metadata and no div
+            name = name.replace("\\", "/"); // normalise backslash separators to '/', as RarParser previously did itself
+            entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
+            AttributesImpl attributes = new AttributesImpl();
+            attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+            attributes.addAttribute("", "id", "id", "CDATA", name);
+            xhtml.startElement("div", attributes); // marker element: opened and closed immediately, attributes only
+            xhtml.endElement("div");
+
+            entrydata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, name); // the normalised name doubles as the relationship id
+        }
+        return entrydata; // caller decides whether to recurse via extractor.shouldParseEmbedded(entrydata)
+    }
 
     // Pending a fix for COMPRESS-269, we have to wrap ourselves
     private static class SevenZWrapper extends ArchiveInputStream {

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java?rev=1651625&r1=1651624&r2=1651625&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java Wed Jan 14 11:42:05 2015
@@ -28,7 +28,6 @@ import org.apache.tika.extractor.Parsing
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
@@ -81,28 +80,21 @@ public class RarParser extends AbstractP
 
             FileHeader header = rar.nextFileHeader();
             while (header != null && !Thread.currentThread().isInterrupted()) {
-
                 if (!header.isDirectory()) {
-
                     InputStream subFile = null;
                     try {
-
                         subFile = rar.getInputStream(header);
 
-                        Metadata entrydata = new Metadata();
-                        entrydata.set(Metadata.RESOURCE_NAME_KEY, header
-                                .getFileNameString().replace("\\", "/"));
-                        entrydata.set(TikaCoreProperties.CREATED,
-                                header.getCTime());
-                        entrydata.set(TikaCoreProperties.MODIFIED,
-                                header.getMTime());
-                        entrydata.set(Metadata.CONTENT_LENGTH,
-                                Long.toString(header.getFullUnpackSize()));
-
-                        if (extractor.shouldParseEmbedded(entrydata))
-                            extractor.parseEmbedded(subFile, handler,
-                                    entrydata, true);
-
+                        Metadata entrydata = PackageParser.handleEntryMetadata(
+                                header.getFileNameString(),
+                                header.getCTime(), header.getMTime(),
+                                header.getFullUnpackSize(),
+                                xhtml
+                        );
+
+                        if (extractor.shouldParseEmbedded(entrydata)) {
+                            extractor.parseEmbedded(subFile, handler, entrydata, true);
+                        }
                     } finally {
                         if (subFile != null)
                             subFile.close();

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java?rev=1651625&r1=1651624&r2=1651625&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java Wed Jan 14 11:42:05 2015
@@ -61,12 +61,14 @@ public abstract class AbstractPkgTest ex
    protected static class EmbeddedTrackingParser extends AbstractParser {
       protected List<String> filenames = new ArrayList<String>();
       protected List<String> mediatypes = new ArrayList<String>();
+      protected List<String> createdAts = new ArrayList<String>();
       protected List<String> modifiedAts = new ArrayList<String>();
       protected byte[] lastSeenStart;
       
       public void reset() {
          filenames.clear();
          mediatypes.clear();
+         createdAts.clear();
          modifiedAts.clear();
       }
       
@@ -80,6 +82,7 @@ public abstract class AbstractPkgTest ex
             SAXException, TikaException {
          filenames.add(metadata.get(Metadata.RESOURCE_NAME_KEY));
          mediatypes.add(metadata.get(Metadata.CONTENT_TYPE));
+         createdAts.add(metadata.get(TikaCoreProperties.CREATED));
          modifiedAts.add(metadata.get(TikaCoreProperties.MODIFIED));
          
          lastSeenStart = new byte[32];

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java?rev=1651625&r1=1651624&r2=1651625&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java Wed Jan 14 11:42:05 2015
@@ -97,6 +97,9 @@ public class ArParserTest extends Abstra
         for (String type : tracker.mediatypes) {
             assertNull(type);
         }
+        for(String crt : tracker.createdAts) {
+            assertNull(crt);
+        }
 
         tracker.reset();
         stream = ArParserTest.class.getResourceAsStream(
@@ -118,5 +121,8 @@ public class ArParserTest extends Abstra
         for (String type : tracker.mediatypes) {
             assertNull(type);
         }
+        for(String crt : tracker.createdAts) {
+            assertNull(crt);
+        }
     }
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java?rev=1651625&r1=1651624&r2=1651625&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java Wed Jan 14 11:42:05 2015
@@ -95,6 +95,7 @@ public class Bzip2ParserTest extends Abs
        
        assertEquals(null, tracker.filenames.get(0));
        assertEquals(null, tracker.mediatypes.get(0));
+       assertEquals(null, tracker.createdAts.get(0));
        assertEquals(null, tracker.modifiedAts.get(0));
 
        // Tar file starts with the directory name

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java?rev=1651625&r1=1651624&r2=1651625&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java Wed Jan 14 11:42:05 2015
@@ -109,9 +109,24 @@ public class RarParserTest extends Abstr
        for(String type : tracker.mediatypes) {
           assertNull(type);
        }
+       for(String crt : tracker.createdAts) {
+           assertNull(crt);
+       }
        for(String mod : tracker.modifiedAts) {
            assertNotNull(mod);
            assertTrue("Modified at " + mod, mod.startsWith("20"));
        }
+       
+       // Should have filenames in the content string
+       String content = handler.toString();
+       assertContains("test-documents/testHTML.html", content);
+       assertContains("test-documents/testEXCEL.xls", content);
+       assertContains("test-documents/testOpenOffice2.odt", content);
+       assertContains("test-documents/testPDF.pdf", content);
+       assertContains("test-documents/testPPT.ppt", content);
+       assertContains("test-documents/testRTF.rtf", content);
+       assertContains("test-documents/testTXT.txt", content);
+       assertContains("test-documents/testWORD.doc", content);
+       assertContains("test-documents/testXML.xml", content);
     }
 }
\ No newline at end of file

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java?rev=1651625&r1=1651624&r2=1651625&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java Wed Jan 14 11:42:05 2015
@@ -109,6 +109,9 @@ public class TarParserTest extends Abstr
        for(String type : tracker.mediatypes) {
           assertNull(type);
        }
+       for(String crt : tracker.createdAts) {
+           assertNull(crt);
+       }
        for(String mod : tracker.modifiedAts) {
            assertNotNull(mod);
            assertTrue("Modified at " + mod, mod.startsWith("20"));

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java?rev=1651625&r1=1651624&r2=1651625&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java Wed Jan 14 11:42:05 2015
@@ -118,6 +118,9 @@ public class ZipParserTest extends Abstr
        for(String type : tracker.mediatypes) {
           assertNull(type);
        }
+       for(String crt : tracker.createdAts) {
+           assertNull(crt);
+       }
        for(String mod : tracker.modifiedAts) {
            assertNotNull(mod);
            assertTrue("Modified at " + mod, mod.startsWith("20"));