You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2012/12/01 19:06:09 UTC

svn commit: r1416033 - in /tika/trunk: ./ tika-parsers/src/main/java/org/apache/tika/parser/pkg/ tika-parsers/src/test/java/org/apache/tika/parser/pkg/ tika-parsers/src/test/resources/test-documents/

Author: mikemccand
Date: Sat Dec  1 18:06:08 2012
New Revision: 1416033

URL: http://svn.apache.org/viewvc?rev=1416033&view=rev
Log:
TIKA-1036: leave placeholders when we extract embedded archive members

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testEmbedded.zip   (with props)
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1416033&r1=1416032&r2=1416033&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sat Dec  1 18:06:08 2012
@@ -53,9 +53,12 @@ Release 1.3 - Current Development
     certain JVMs this would incorrectly extract the BOM as the tag's
     value (TIKA-1024).
 
-  * ZIP: TikaCLI would hit FileNotFoundException when extracting files
-    that were under sub-directories from a ZIP archive, because it
-    failed to create the parent directories first (TIKA-1031).
+  * ZIP: placeholders (<div class="embedded" id="<entry name>"/>) are
+    now left in the XHTML so you can see where each archive member
+    appears (TIKA-1036). TikaCLI would hit FileNotFoundException when
+    extracting files that were under sub-directories from a ZIP
+    archive, because it failed to create the parent directories first
+    (TIKA-1031).
 
 Release 1.2 - 07/10/2012
 ---------------------------------

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java?rev=1416033&r1=1416032&r2=1416033&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java Sat Dec  1 18:06:08 2012
@@ -16,8 +16,6 @@
  */
 package org.apache.tika.parser.pkg;
 
-import static org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE;
-
 import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
@@ -46,6 +44,9 @@ import org.apache.tika.parser.ParseConte
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+import static org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE;
 
 /**
  * Parser for various packaging formats. Package entries will be written to
@@ -151,6 +152,11 @@ public class PackageParser extends Abstr
             Metadata entrydata = new Metadata();
             if (name != null && name.length() > 0) {
                 entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
+                AttributesImpl attributes = new AttributesImpl();
+                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+                attributes.addAttribute("", "id", "id", "CDATA", name);
+                xhtml.startElement("div", attributes);
+                xhtml.endElement("div");
             }
             if (extractor.shouldParseEmbedded(entrydata)) {
                 // For detectors to work, we need a mark/reset supporting

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java?rev=1416033&r1=1416032&r2=1416033&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java Sat Dec  1 18:06:08 2012
@@ -22,8 +22,7 @@ import java.util.ArrayList;
 import java.util.List;
 import java.util.Set;
 
-import junit.framework.TestCase;
-
+import org.apache.tika.TikaTest;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -37,7 +36,7 @@ import org.xml.sax.SAXException;
 /**
  * Parent class for all Package based Test cases
  */
-public abstract class AbstractPkgTest extends TestCase {
+public abstract class AbstractPkgTest extends TikaTest {
    protected ParseContext trackingContext;
    protected ParseContext recursingContext;
    

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java?rev=1416033&r1=1416032&r2=1416033&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java Sat Dec  1 18:06:08 2012
@@ -117,4 +117,10 @@ public class ZipParserTest extends Abstr
         assertTrue(content.contains("README"));
     }
 
+    // TIKA-1036
+    public void testPlaceholders() throws Exception {
+        String xml = getXML("testEmbedded.zip").xml;
+        assertContains("<div class=\"embedded\" id=\"test1.txt\"/>", xml);
+        assertContains("<div class=\"embedded\" id=\"test2.txt\"/>", xml);
+    }
 }

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testEmbedded.zip
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testEmbedded.zip?rev=1416033&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testEmbedded.zip
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream