You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by rg...@apache.org on 2012/12/18 18:20:46 UTC

svn commit: r1423538 - in /tika/trunk: tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java tika-core/src/main/java/org/apache/tika/sax/ExpandedTitleContentHandler.java

Author: rgauss
Date: Tue Dec 18 17:20:45 2012
New Revision: 1423538

URL: http://svn.apache.org/viewvc?rev=1423538&view=rev
Log:
TIKA-725: Empty title element makes Tika-generated HTML documents not open in Chromium
   - Added an assert to TikaCLITest which verifies the issue
   - Added ExpandedTitleContentHandler
   - Changed TikaCLI to use ExpandedTitleContentHandler for html output

Added:
    tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ExpandedTitleContentHandler.java   (with props)
Modified:
    tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
    tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java

Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1423538&r1=1423537&r2=1423538&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Tue Dec 18 17:20:45 2012
@@ -84,6 +84,7 @@ import org.apache.tika.parser.ParserDeco
 import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.parser.html.BoilerpipeContentHandler;
 import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ExpandedTitleContentHandler;
 import org.apache.tika.xmp.XMPMetadata;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -166,7 +167,7 @@ public class TikaCLI {
         @Override
         protected ContentHandler getContentHandler(
                 OutputStream output, Metadata metadata) throws Exception {
-            return getTransformerHandler(output, "html", encoding, prettyPrint);
+            return new ExpandedTitleContentHandler(getTransformerHandler(output, "html", encoding, prettyPrint));
         }
     };
 

Modified: tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java?rev=1423538&r1=1423537&r2=1423538&view=diff
==============================================================================
--- tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java (original)
+++ tika/trunk/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java Tue Dec 18 17:20:45 2012
@@ -98,6 +98,8 @@ public class TikaCLITest extends TestCas
         String[] params = {"-h", resourcePrefix + "alice.cli.test"};
         TikaCLI.main(params);
         Assert.assertTrue(outContent.toString().contains("html xmlns=\"http://www.w3.org/1999/xhtml"));
+        Assert.assertTrue("Expanded <title></title> element should be present",
+                outContent.toString().contains("<title></title>"));
     }
 
     /**

Added: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ExpandedTitleContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ExpandedTitleContentHandler.java?rev=1423538&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ExpandedTitleContentHandler.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ExpandedTitleContentHandler.java Tue Dec 18 17:20:45 2012
@@ -0,0 +1,75 @@
+package org.apache.tika.sax;
+
+import javax.xml.transform.sax.TransformerHandler;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Content handler decorator which wraps a {@link TransformerHandler} so that an
+ * empty <code>TITLE</code> element renders as <code>&lt;title&gt;&lt;/title&gt;</code>
+ * rather than <code>&lt;title/&gt;</code>. This is accomplished by calling the
+ * {@link TransformerHandler#characters(char[], int, int)} method with a zero length
+ * char array but a <code>length</code> of 1, and ignoring the resulting exception.
+ * <p>
+ * This workaround is an unfortunate consequence of the limitations imposed by the
+ * implementation of the XML serialization code in the JDK brought over from
+ * the xalan project which no longer allows for the specification of an
+ * alternate <code>content-handler</code> via xslt templates or other means.
+ * 
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-725">TIKA-725</a>
+ */
+public class ExpandedTitleContentHandler extends ContentHandlerDecorator {
+    
+    private boolean isTitleTagOpen;
+    private static final String TITLE_TAG = "TITLE";
+    
+    public ExpandedTitleContentHandler() {
+        super();
+    }
+
+    public ExpandedTitleContentHandler(ContentHandler handler) {
+        super(handler);
+    }
+
+    @Override
+    public void startDocument() throws SAXException {
+        super.startDocument();
+        isTitleTagOpen = false;
+    }
+
+    @Override
+    public void startElement(String uri, String localName, String qName,
+            Attributes atts) throws SAXException {
+        super.startElement(uri, localName, qName, atts);
+        if (TITLE_TAG.equalsIgnoreCase(localName) && XHTMLContentHandler.XHTML.equals(uri)) {
+            isTitleTagOpen = true;
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName)
+            throws SAXException {
+        super.endElement(uri, localName, qName);
+        if (TITLE_TAG.equalsIgnoreCase(localName) && XHTMLContentHandler.XHTML.equals(uri)) {
+            isTitleTagOpen = false;
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length)
+            throws SAXException {
+        if (isTitleTagOpen && length == 0) {
+            // Hack: pass an empty array with a length of 1 so the open title tag gets closed
+            try {
+                super.characters(new char[0], 0, 1);
+            } catch (ArrayIndexOutOfBoundsException e) {
+                // Expected: the array is empty; the call was only made to close the title tag
+            }
+        } else {
+            super.characters(ch, start, length);
+        }
+    }
+
+}

Propchange: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ExpandedTitleContentHandler.java
------------------------------------------------------------------------------
    svn:eol-style = native