You are viewing a plain text version of this content. The canonical link was not preserved in this plain text copy.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/01/14 12:42:05 UTC
svn commit: r1651625 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/pkg/ test/java/org/apache/tika/parser/pkg/
Author: nick
Date: Wed Jan 14 11:42:05 2015
New Revision: 1651625
URL: http://svn.apache.org/r1651625
Log:
TIKA-241 Refactor to use common logic between PackageParser and RarParser for populating xhtml+metadata of embedded resources
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java?rev=1651625&r1=1651624&r2=1651625&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java Wed Jan 14 11:42:05 2015
@@ -19,6 +19,7 @@ package org.apache.tika.parser.pkg;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.util.Date;
import java.util.Set;
import org.apache.commons.compress.archivers.ArchiveEntry;
@@ -175,20 +176,10 @@ public class PackageParser extends Abstr
String name = entry.getName();
if (archive.canReadEntryData(entry)) {
// Fetch the metadata on the entry contained in the archive
- Metadata entrydata = new Metadata();
- entrydata.set(TikaCoreProperties.MODIFIED, entry.getLastModifiedDate());
- entrydata.set(Metadata.CONTENT_LENGTH, Long.toString(entry.getSize()));
- if (name != null && name.length() > 0) {
- entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
- AttributesImpl attributes = new AttributesImpl();
- attributes.addAttribute("", "class", "class", "CDATA", "embedded");
- attributes.addAttribute("", "id", "id", "CDATA", name);
- xhtml.startElement("div", attributes);
- xhtml.endElement("div");
-
- entrydata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, name);
- }
+ Metadata entrydata = handleEntryMetadata(name, null,
+ entry.getLastModifiedDate(), entry.getSize(), xhtml);
+ // Recurse into the entry if desired
if (extractor.shouldParseEmbedded(entrydata)) {
// For detectors to work, we need a mark/reset supporting
// InputStream, which ArchiveInputStream isn't, so wrap
@@ -204,6 +195,34 @@ public class PackageParser extends Abstr
xhtml.element("p", name);
}
}
+
+ protected static Metadata handleEntryMetadata( // shared helper: called by PackageParser above and by RarParser (see its hunk below)
+ String name, Date createAt, Date modifiedAt, // PackageParser passes null for createAt; RarParser passes header.getCTime()
+ Long size, XHTMLContentHandler xhtml)
+ throws SAXException, IOException, TikaException {
+ Metadata entrydata = new Metadata(); // fresh metadata object for this one entry; returned to the caller
+ if (createAt != null) { // CREATED is only set when a creation date was supplied
+ entrydata.set(TikaCoreProperties.CREATED, createAt);
+ }
+ if (modifiedAt != null) { // the replaced PackageParser code set MODIFIED unconditionally
+ entrydata.set(TikaCoreProperties.MODIFIED, modifiedAt);
+ }
+ if (size != null) { // a null size leaves CONTENT_LENGTH unset
+ entrydata.set(Metadata.CONTENT_LENGTH, Long.toString(size));
+ }
+ if (name != null && name.length() > 0) { // unnamed entries get no resource name, no div and no relationship id
+ name = name.replace("\\", "/"); // backslash -> forward slash; before this commit only RarParser did this
+ entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+ attributes.addAttribute("", "id", "id", "CDATA", name);
+ xhtml.startElement("div", attributes); // emits an empty <div class="embedded" id=name> into the XHTML output
+ xhtml.endElement("div");
+
+ entrydata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, name); // same slash-normalised name as the div id
+ }
+ return entrydata;
+ }
// Pending a fix for COMPRESS-269, we have to wrap ourselves
private static class SevenZWrapper extends ArchiveInputStream {
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java?rev=1651625&r1=1651624&r2=1651625&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/RarParser.java Wed Jan 14 11:42:05 2015
@@ -28,7 +28,6 @@ import org.apache.tika.extractor.Parsing
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
@@ -81,28 +80,21 @@ public class RarParser extends AbstractP
FileHeader header = rar.nextFileHeader();
while (header != null && !Thread.currentThread().isInterrupted()) {
-
if (!header.isDirectory()) {
-
InputStream subFile = null;
try {
-
subFile = rar.getInputStream(header);
- Metadata entrydata = new Metadata();
- entrydata.set(Metadata.RESOURCE_NAME_KEY, header
- .getFileNameString().replace("\\", "/"));
- entrydata.set(TikaCoreProperties.CREATED,
- header.getCTime());
- entrydata.set(TikaCoreProperties.MODIFIED,
- header.getMTime());
- entrydata.set(Metadata.CONTENT_LENGTH,
- Long.toString(header.getFullUnpackSize()));
-
- if (extractor.shouldParseEmbedded(entrydata))
- extractor.parseEmbedded(subFile, handler,
- entrydata, true);
-
+ Metadata entrydata = PackageParser.handleEntryMetadata(
+ header.getFileNameString(),
+ header.getCTime(), header.getMTime(),
+ header.getFullUnpackSize(),
+ xhtml
+ );
+
+ if (extractor.shouldParseEmbedded(entrydata)) {
+ extractor.parseEmbedded(subFile, handler, entrydata, true);
+ }
} finally {
if (subFile != null)
subFile.close();
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java?rev=1651625&r1=1651624&r2=1651625&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java Wed Jan 14 11:42:05 2015
@@ -61,12 +61,14 @@ public abstract class AbstractPkgTest ex
protected static class EmbeddedTrackingParser extends AbstractParser {
protected List<String> filenames = new ArrayList<String>();
protected List<String> mediatypes = new ArrayList<String>();
+ protected List<String> createdAts = new ArrayList<String>();
protected List<String> modifiedAts = new ArrayList<String>();
protected byte[] lastSeenStart;
public void reset() {
filenames.clear();
mediatypes.clear();
+ createdAts.clear();
modifiedAts.clear();
}
@@ -80,6 +82,7 @@ public abstract class AbstractPkgTest ex
SAXException, TikaException {
filenames.add(metadata.get(Metadata.RESOURCE_NAME_KEY));
mediatypes.add(metadata.get(Metadata.CONTENT_TYPE));
+ createdAts.add(metadata.get(TikaCoreProperties.CREATED));
modifiedAts.add(metadata.get(TikaCoreProperties.MODIFIED));
lastSeenStart = new byte[32];
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java?rev=1651625&r1=1651624&r2=1651625&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java Wed Jan 14 11:42:05 2015
@@ -97,6 +97,9 @@ public class ArParserTest extends Abstra
for (String type : tracker.mediatypes) {
assertNull(type);
}
+ for(String crt : tracker.createdAts) {
+ assertNull(crt);
+ }
tracker.reset();
stream = ArParserTest.class.getResourceAsStream(
@@ -118,5 +121,8 @@ public class ArParserTest extends Abstra
for (String type : tracker.mediatypes) {
assertNull(type);
}
+ for(String crt : tracker.createdAts) {
+ assertNull(crt);
+ }
}
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java?rev=1651625&r1=1651624&r2=1651625&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java Wed Jan 14 11:42:05 2015
@@ -95,6 +95,7 @@ public class Bzip2ParserTest extends Abs
assertEquals(null, tracker.filenames.get(0));
assertEquals(null, tracker.mediatypes.get(0));
+ assertEquals(null, tracker.createdAts.get(0));
assertEquals(null, tracker.modifiedAts.get(0));
// Tar file starts with the directory name
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java?rev=1651625&r1=1651624&r2=1651625&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java Wed Jan 14 11:42:05 2015
@@ -109,9 +109,24 @@ public class RarParserTest extends Abstr
for(String type : tracker.mediatypes) {
assertNull(type);
}
+ for(String crt : tracker.createdAts) {
+ assertNull(crt);
+ }
for(String mod : tracker.modifiedAts) {
assertNotNull(mod);
assertTrue("Modified at " + mod, mod.startsWith("20"));
}
+
+ // Should have filenames in the content string
+ String content = handler.toString();
+ assertContains("test-documents/testHTML.html", content);
+ assertContains("test-documents/testEXCEL.xls", content);
+ assertContains("test-documents/testOpenOffice2.odt", content);
+ assertContains("test-documents/testPDF.pdf", content);
+ assertContains("test-documents/testPPT.ppt", content);
+ assertContains("test-documents/testRTF.rtf", content);
+ assertContains("test-documents/testTXT.txt", content);
+ assertContains("test-documents/testWORD.doc", content);
+ assertContains("test-documents/testXML.xml", content);
}
}
\ No newline at end of file
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java?rev=1651625&r1=1651624&r2=1651625&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java Wed Jan 14 11:42:05 2015
@@ -109,6 +109,9 @@ public class TarParserTest extends Abstr
for(String type : tracker.mediatypes) {
assertNull(type);
}
+ for(String crt : tracker.createdAts) {
+ assertNull(crt);
+ }
for(String mod : tracker.modifiedAts) {
assertNotNull(mod);
assertTrue("Modified at " + mod, mod.startsWith("20"));
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java?rev=1651625&r1=1651624&r2=1651625&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java Wed Jan 14 11:42:05 2015
@@ -118,6 +118,9 @@ public class ZipParserTest extends Abstr
for(String type : tracker.mediatypes) {
assertNull(type);
}
+ for(String crt : tracker.createdAts) {
+ assertNull(crt);
+ }
for(String mod : tracker.modifiedAts) {
assertNotNull(mod);
assertTrue("Modified at " + mod, mod.startsWith("20"));