You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/01/15 23:44:00 UTC

svn commit: r734844 - in /lucene/tika/trunk/src: main/java/org/apache/tika/parser/microsoft/ main/java/org/apache/tika/parser/mp3/ main/java/org/apache/tika/parser/pkg/ main/java/org/apache/tika/sax/ test/java/org/apache/tika/sax/

Author: jukka
Date: Thu Jan 15 14:43:56 2009
New Revision: 734844

URL: http://svn.apache.org/viewvc?rev=734844&view=rev
Log:
TIKA-188: Automatic whitespace for block elements in XHTMLContentHandler

Added:
    lucene/tika/trunk/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
Removed:
    lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLToTextContentHandler.java
Modified:
    lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
    lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
    lucene/tika/trunk/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
    lucene/tika/trunk/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
    lucene/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java
    lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=734844&r1=734843&r2=734844&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Thu Jan 15 14:43:56 2009
@@ -339,7 +339,6 @@
             if (currentSheetIndex < sheetNames.size()) {
                 handler.element("h1", sheetNames.get(currentSheetIndex));
             }
-            handler.characters("\n");
             handler.startElement("table");
             handler.startElement("tbody");
 
@@ -352,7 +351,6 @@
                 while (currentRow < entry.getKey().y) {
                     handler.endElement("td");
                     handler.endElement("tr");
-                    handler.characters("\n");
                     handler.startElement("tr");
                     handler.startElement("td");
                     currentRow++;
@@ -361,7 +359,6 @@
 
                 while (currentColumn < entry.getKey().x) {
                     handler.endElement("td");
-                    handler.characters("\t");
                     handler.startElement("td");
                     currentColumn++;
                 }
@@ -375,7 +372,6 @@
             handler.endElement("tbody");
             handler.endElement("table");
             handler.endElement("div");
-            handler.characters("\n");
         }
     }
 

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=734844&r1=734843&r2=734844&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Thu Jan 15 14:43:56 2009
@@ -55,7 +55,6 @@
         metadata.set(Metadata.SUBJECT, getChunk(CHUNKS.conversationTopic));
 
         xhtml.element("h1", subject);
-        xhtml.characters("\n");
 
         xhtml.startElement("dl");
         header(xhtml, "From", from);
@@ -63,7 +62,6 @@
         header(xhtml, "Cc", getChunk(CHUNKS.displayCCChunk));
         header(xhtml, "Bcc", getChunk(CHUNKS.displayBCCChunk));
         xhtml.endElement("dl");
-        xhtml.characters("\n");
 
         xhtml.element("p", getChunk(CHUNKS.textBodyChunk));
     }
@@ -72,9 +70,7 @@
             throws SAXException {
         if (value.length() > 0) {
             xhtml.element("dt", key);
-            xhtml.characters("\t");
             xhtml.element("dd", value);
-            xhtml.characters("\n");
         }
     }
 

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java?rev=734844&r1=734843&r2=734844&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java Thu Jan 15 14:43:56 2009
@@ -192,11 +192,7 @@
             metadata.set(Metadata.AUTHOR, artist);
 
             xhtml.element("h1", title);
-            xhtml.characters("\n");
-
             xhtml.element("p", artist);
-            xhtml.characters("\n");
-
             // ID3v1.1 Track addition
             // If the last two bytes of the comment field are zero and
             // non-zero, then the last byte is the track number
@@ -206,16 +202,9 @@
             } else {
                 xhtml.element("p", album);
             }
-            xhtml.characters("\n");
-
             xhtml.element("p", year);
-            xhtml.characters("\n");
-
             xhtml.element("p", comment);
-            xhtml.characters("\n");
-
             xhtml.element("p", GENRES[Math.min(genre, GENRES.length - 1)]);
-            xhtml.characters("\n");
         }
 
         xhtml.endDocument();

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/pkg/PackageParser.java?rev=734844&r1=734843&r2=734844&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/pkg/PackageParser.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/pkg/PackageParser.java Thu Jan 15 14:43:56 2009
@@ -87,7 +87,6 @@
         String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
         if (name != null) {
             xhtml.element("h1", name);
-            xhtml.characters("\n");
         }
 
         try {
@@ -95,7 +94,6 @@
                     new CloseShieldInputStream(stream),
                     new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
                     metadata);
-            xhtml.characters("\n");
         } catch (TikaException e) {
             // Could not parse the entry, just skip the content
         }

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java?rev=734844&r1=734843&r2=734844&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java Thu Jan 15 14:43:56 2009
@@ -60,7 +60,7 @@
      * @param writer writer
      */
     public BodyContentHandler(Writer writer) {
-        this(new XHTMLToTextContentHandler(new WriteOutContentHandler(writer)));
+        this(new WriteOutContentHandler(writer));
     }
 
     /**
@@ -70,7 +70,7 @@
      * @param stream output stream
      */
     public BodyContentHandler(OutputStream stream) {
-        this(new XHTMLToTextContentHandler(new WriteOutContentHandler(stream)));
+        this(new WriteOutContentHandler(stream));
     }
 
     /**
@@ -79,7 +79,7 @@
      * using the {@link #toString()} method.
      */
     public BodyContentHandler() {
-        this(new XHTMLToTextContentHandler(new WriteOutContentHandler()));
+        this(new WriteOutContentHandler());
     }
 
 }

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java?rev=734844&r1=734843&r2=734844&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java Thu Jan 15 14:43:56 2009
@@ -16,6 +16,9 @@
  */
 package org.apache.tika.sax;
 
+import java.util.HashSet;
+import java.util.Set;
+
 import org.apache.tika.metadata.Metadata;
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
@@ -34,6 +37,58 @@
     public static final String XHTML = "http://www.w3.org/1999/xhtml";
 
     /**
+     * The newline character that gets inserted after block elements.
+     */
+    private static final char[] NL = new char[] { '\n' };
+
+    /**
+     * The tab character that gets inserted before table cells and list items.
+     */
+    private static final char[] TAB = new char[] { '\t' };
+
+    /**
+     * The elements that get prepended with the {@link #TAB} character.
+     */
+    private static final Set<String> INDENT = new HashSet<String>() {{
+        add("li");
+        add("dd");
+        add("dt");
+        add("td");
+        add("th");
+    }};
+
+    /**
+     * The elements that get appended with the {@link #NL} character.
+     */
+    private static final Set<String> ENDLINE = new HashSet<String>() {{
+        add("p");
+        add("h1");
+        add("h2");
+        add("h3");
+        add("h4");
+        add("h5");
+        add("h6");
+        add("div");
+        add("ul");
+        add("ol");
+        add("dl");
+        add("pre");
+        add("hr");
+        add("blockquote");
+        add("address");
+        add("fieldset");
+        add("table");
+        add("form");
+        add("noscript");
+        add("li");
+        add("dt");
+        add("dd");
+        add("noframes");
+        add("br");
+        add("tr");
+    }};
+
+    /**
      * Metadata associated with the document. Used to fill in the
      * &lt;head/&gt; section.
      */
@@ -103,14 +158,36 @@
         super.endDocument();
     }
 
+    /**
+     * Starts the given element. Table cells and list items are automatically
+     * indented by emitting a tab character as ignorable whitespace.
+     */
     @Override
     public void startElement(
             String uri, String local, String name, Attributes attributes)
             throws SAXException {
         lazyStartDocument();
+        if (XHTML.equals(uri) && INDENT.contains(local)) {
+            ignorableWhitespace(TAB, 0, TAB.length);
+        }
         super.startElement(uri, local, name, attributes);
     }
 
+    /**
+     * Ends the given element. Block elements are automatically followed
+     * by a newline character.
+     */
+    @Override
+    public void endElement(String uri, String local, String name)
+            throws SAXException {
+        super.endElement(uri, local, name);
+        if (XHTML.equals(uri) && ENDLINE.contains(local)) {
+            ignorableWhitespace(NL, 0, NL.length);
+        }
+    }
+
+    //------------------------------------------< public convenience methods >
+
     public void startElement(String name) throws SAXException {
         startElement(XHTML, name, name, new AttributesImpl());
     }

Added: lucene/tika/trunk/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java?rev=734844&view=auto
==============================================================================
--- lucene/tika/trunk/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java (added)
+++ lucene/tika/trunk/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java Thu Jan 15 14:43:56 2009
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit tests for the {@link XHTMLContentHandler} class.
+ */
+public class XHTMLContentHandlerTest extends TestCase {
+
+    private ContentHandler output;
+
+    private XHTMLContentHandler xhtml;
+
+    protected void setUp() {
+        output = new BodyContentHandler();
+        xhtml = new XHTMLContentHandler(output, new Metadata());
+    }
+
+    /**
+     * Test that content in block elements is properly separated in text
+     * output.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-188">TIKA-188</a>
+     */
+    public void testExtraWhitespace() throws SAXException {
+        xhtml.startDocument();
+
+        xhtml.element("p", "foo");
+        xhtml.startElement("p");
+        xhtml.characters("b");
+        xhtml.element("b", "a"); // inlines should not cause extra whitespace
+        xhtml.characters("r");
+        xhtml.endElement("p");
+
+        xhtml.startElement("table");
+        xhtml.startElement("tr");
+        xhtml.element("th", "x");
+        xhtml.element("th", "y");
+        xhtml.endElement("tr");
+        xhtml.startElement("tr");
+        xhtml.element("td", "a");
+        xhtml.element("td", "b");
+        xhtml.endElement("tr");
+        xhtml.endElement("table");
+        xhtml.endDocument();
+
+        String[] words = output.toString().split("\\s+");
+        assertEquals(6, words.length);
+        assertEquals("foo", words[0]);
+        assertEquals("bar", words[1]);
+        assertEquals("x", words[2]);
+        assertEquals("y", words[3]);
+        assertEquals("a", words[4]);
+        assertEquals("b", words[5]);
+    }
+
+}