You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2012/12/01 19:06:09 UTC
svn commit: r1416033 - in /tika/trunk: ./
tika-parsers/src/main/java/org/apache/tika/parser/pkg/
tika-parsers/src/test/java/org/apache/tika/parser/pkg/
tika-parsers/src/test/resources/test-documents/
Author: mikemccand
Date: Sat Dec 1 18:06:08 2012
New Revision: 1416033
URL: http://svn.apache.org/viewvc?rev=1416033&view=rev
Log:
TIKA-1036: leave placeholders when we extract embedded archive members
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testEmbedded.zip (with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1416033&r1=1416032&r2=1416033&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sat Dec 1 18:06:08 2012
@@ -53,9 +53,12 @@ Release 1.3 - Current Development
certain JVMs this would incorrectly extract the BOM as the tag's
value (TIKA-1024).
- * ZIP: TikaCLI would hit FileNotFoundException when extracting files
- that were under sub-directories from a ZIP archive, because it
- failed to create the parent directories first (TIKA-1031).
+ * ZIP: placeholders (<div class="embedded" id="<entry name>"/>) are
+ now left in the XHTML so you can see where each archive member
+ appears (TIKA-1036). TikaCLI would hit FileNotFoundException when
+ extracting files that were under sub-directories from a ZIP
+ archive, because it failed to create the parent directories first
+ (TIKA-1031).
Release 1.2 - 07/10/2012
---------------------------------
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java?rev=1416033&r1=1416032&r2=1416033&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java Sat Dec 1 18:06:08 2012
@@ -16,8 +16,6 @@
*/
package org.apache.tika.parser.pkg;
-import static org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE;
-
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
@@ -46,6 +44,9 @@ import org.apache.tika.parser.ParseConte
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+import static org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE;
/**
* Parser for various packaging formats. Package entries will be written to
@@ -151,6 +152,11 @@ public class PackageParser extends Abstr
Metadata entrydata = new Metadata();
if (name != null && name.length() > 0) {
entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+ attributes.addAttribute("", "id", "id", "CDATA", name);
+ xhtml.startElement("div", attributes);
+ xhtml.endElement("div");
}
if (extractor.shouldParseEmbedded(entrydata)) {
// For detectors to work, we need a mark/reset supporting
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java?rev=1416033&r1=1416032&r2=1416033&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java Sat Dec 1 18:06:08 2012
@@ -22,8 +22,7 @@ import java.util.ArrayList;
import java.util.List;
import java.util.Set;
-import junit.framework.TestCase;
-
+import org.apache.tika.TikaTest;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -37,7 +36,7 @@ import org.xml.sax.SAXException;
/**
* Parent class for all Package based Test cases
*/
-public abstract class AbstractPkgTest extends TestCase {
+public abstract class AbstractPkgTest extends TikaTest {
protected ParseContext trackingContext;
protected ParseContext recursingContext;
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java?rev=1416033&r1=1416032&r2=1416033&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java Sat Dec 1 18:06:08 2012
@@ -117,4 +117,10 @@ public class ZipParserTest extends Abstr
assertTrue(content.contains("README"));
}
+ // TIKA-1036
+ public void testPlaceholders() throws Exception {
+ String xml = getXML("testEmbedded.zip").xml;
+ assertContains("<div class=\"embedded\" id=\"test1.txt\"/>", xml);
+ assertContains("<div class=\"embedded\" id=\"test2.txt\"/>", xml);
+ }
}
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testEmbedded.zip
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testEmbedded.zip?rev=1416033&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testEmbedded.zip
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream