You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/01/15 23:44:00 UTC
svn commit: r734844 - in /lucene/tika/trunk/src:
main/java/org/apache/tika/parser/microsoft/
main/java/org/apache/tika/parser/mp3/ main/java/org/apache/tika/parser/pkg/
main/java/org/apache/tika/sax/ test/java/org/apache/tika/sax/
Author: jukka
Date: Thu Jan 15 14:43:56 2009
New Revision: 734844
URL: http://svn.apache.org/viewvc?rev=734844&view=rev
Log:
TIKA-188: Automatic whitespace for block elements in XHTMLContentHandler
Added:
lucene/tika/trunk/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
Removed:
lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLToTextContentHandler.java
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
lucene/tika/trunk/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
lucene/tika/trunk/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
lucene/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java
lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=734844&r1=734843&r2=734844&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Thu Jan 15 14:43:56 2009
@@ -339,7 +339,6 @@
if (currentSheetIndex < sheetNames.size()) {
handler.element("h1", sheetNames.get(currentSheetIndex));
}
- handler.characters("\n");
handler.startElement("table");
handler.startElement("tbody");
@@ -352,7 +351,6 @@
while (currentRow < entry.getKey().y) {
handler.endElement("td");
handler.endElement("tr");
- handler.characters("\n");
handler.startElement("tr");
handler.startElement("td");
currentRow++;
@@ -361,7 +359,6 @@
while (currentColumn < entry.getKey().x) {
handler.endElement("td");
- handler.characters("\t");
handler.startElement("td");
currentColumn++;
}
@@ -375,7 +372,6 @@
handler.endElement("tbody");
handler.endElement("table");
handler.endElement("div");
- handler.characters("\n");
}
}
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=734844&r1=734843&r2=734844&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Thu Jan 15 14:43:56 2009
@@ -55,7 +55,6 @@
metadata.set(Metadata.SUBJECT, getChunk(CHUNKS.conversationTopic));
xhtml.element("h1", subject);
- xhtml.characters("\n");
xhtml.startElement("dl");
header(xhtml, "From", from);
@@ -63,7 +62,6 @@
header(xhtml, "Cc", getChunk(CHUNKS.displayCCChunk));
header(xhtml, "Bcc", getChunk(CHUNKS.displayBCCChunk));
xhtml.endElement("dl");
- xhtml.characters("\n");
xhtml.element("p", getChunk(CHUNKS.textBodyChunk));
}
@@ -72,9 +70,7 @@
throws SAXException {
if (value.length() > 0) {
xhtml.element("dt", key);
- xhtml.characters("\t");
xhtml.element("dd", value);
- xhtml.characters("\n");
}
}
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java?rev=734844&r1=734843&r2=734844&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java Thu Jan 15 14:43:56 2009
@@ -192,11 +192,7 @@
metadata.set(Metadata.AUTHOR, artist);
xhtml.element("h1", title);
- xhtml.characters("\n");
-
xhtml.element("p", artist);
- xhtml.characters("\n");
-
// ID3v1.1 Track addition
// If the last two bytes of the comment field are zero and
// non-zero, then the last byte is the track number
@@ -206,16 +202,9 @@
} else {
xhtml.element("p", album);
}
- xhtml.characters("\n");
-
xhtml.element("p", year);
- xhtml.characters("\n");
-
xhtml.element("p", comment);
- xhtml.characters("\n");
-
xhtml.element("p", GENRES[Math.min(genre, GENRES.length - 1)]);
- xhtml.characters("\n");
}
xhtml.endDocument();
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/pkg/PackageParser.java?rev=734844&r1=734843&r2=734844&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/pkg/PackageParser.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/pkg/PackageParser.java Thu Jan 15 14:43:56 2009
@@ -87,7 +87,6 @@
String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
if (name != null) {
xhtml.element("h1", name);
- xhtml.characters("\n");
}
try {
@@ -95,7 +94,6 @@
new CloseShieldInputStream(stream),
new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
metadata);
- xhtml.characters("\n");
} catch (TikaException e) {
// Could not parse the entry, just skip the content
}
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java?rev=734844&r1=734843&r2=734844&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/sax/BodyContentHandler.java Thu Jan 15 14:43:56 2009
@@ -60,7 +60,7 @@
* @param writer writer
*/
public BodyContentHandler(Writer writer) {
- this(new XHTMLToTextContentHandler(new WriteOutContentHandler(writer)));
+ this(new WriteOutContentHandler(writer));
}
/**
@@ -70,7 +70,7 @@
* @param stream output stream
*/
public BodyContentHandler(OutputStream stream) {
- this(new XHTMLToTextContentHandler(new WriteOutContentHandler(stream)));
+ this(new WriteOutContentHandler(stream));
}
/**
@@ -79,7 +79,7 @@
* using the {@link #toString()} method.
*/
public BodyContentHandler() {
- this(new XHTMLToTextContentHandler(new WriteOutContentHandler()));
+ this(new WriteOutContentHandler());
}
}
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java?rev=734844&r1=734843&r2=734844&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java Thu Jan 15 14:43:56 2009
@@ -16,6 +16,9 @@
*/
package org.apache.tika.sax;
+import java.util.HashSet;
+import java.util.Set;
+
import org.apache.tika.metadata.Metadata;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
@@ -34,6 +37,58 @@
public static final String XHTML = "http://www.w3.org/1999/xhtml";
/**
+ * The newline character that gets inserted after block elements.
+ */
+ private static final char[] NL = new char[] { '\n' };
+
+ /**
+ * The tab character that gets inserted before table cells and list items.
+ */
+ private static final char[] TAB = new char[] { '\t' };
+
+ /**
+ * The elements that get prepended with the {@link #TAB} character.
+ */
+ private static final Set<String> INDENT = new HashSet<String>() {{
+ add("li");
+ add("dd");
+ add("dt");
+ add("td");
+ add("th");
+ }};
+
+ /**
+ * The elements that get appended with the {@link #NL} character.
+ */
+ private static final Set<String> ENDLINE = new HashSet<String>() {{
+ add("p");
+ add("h1");
+ add("h2");
+ add("h3");
+ add("h4");
+ add("h5");
+ add("h6");
+ add("div");
+ add("ul");
+ add("ol");
+ add("dl");
+ add("pre");
+ add("hr");
+ add("blockquote");
+ add("address");
+ add("fieldset");
+ add("table");
+ add("form");
+ add("noscript");
+ add("li");
+ add("dt");
+ add("dd");
+ add("noframes");
+ add("br");
+ add("tr");
+ }};
+
+ /**
* Metadata associated with the document. Used to fill in the
* <head/> section.
*/
@@ -103,14 +158,36 @@
super.endDocument();
}
+ /**
+ * Starts the given element. Table cells and list items are automatically
+ * indented by emitting a tab character as ignorable whitespace.
+ */
@Override
public void startElement(
String uri, String local, String name, Attributes attributes)
throws SAXException {
lazyStartDocument();
+ if (XHTML.equals(uri) && INDENT.contains(local)) {
+ ignorableWhitespace(TAB, 0, TAB.length);
+ }
super.startElement(uri, local, name, attributes);
}
+ /**
+ * Ends the given element. Block elements are automatically followed
+ * by a newline character.
+ */
+ @Override
+ public void endElement(String uri, String local, String name)
+ throws SAXException {
+ super.endElement(uri, local, name);
+ if (XHTML.equals(uri) && ENDLINE.contains(local)) {
+ ignorableWhitespace(NL, 0, NL.length);
+ }
+ }
+
+ //------------------------------------------< public convenience methods >
+
public void startElement(String name) throws SAXException {
startElement(XHTML, name, name, new AttributesImpl());
}
Added: lucene/tika/trunk/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java?rev=734844&view=auto
==============================================================================
--- lucene/tika/trunk/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java (added)
+++ lucene/tika/trunk/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java Thu Jan 15 14:43:56 2009
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit tests for the {@link XHTMLContentHandler} class.
+ */
+public class XHTMLContentHandlerTest extends TestCase {
+
+ private ContentHandler output;
+
+ private XHTMLContentHandler xhtml;
+
+ protected void setUp() {
+ output = new BodyContentHandler();
+ xhtml = new XHTMLContentHandler(output, new Metadata());
+ }
+
+ /**
+ * Test that content in block elements is properly separated in text
+ * output.
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-188">TIKA-188</a>
+ */
+ public void testExtraWhitespace() throws SAXException {
+ xhtml.startDocument();
+
+ xhtml.element("p", "foo");
+ xhtml.startElement("p");
+ xhtml.characters("b");
+ xhtml.element("b", "a"); // inlines should not cause extra whitespace
+ xhtml.characters("r");
+ xhtml.endElement("p");
+
+ xhtml.startElement("table");
+ xhtml.startElement("tr");
+ xhtml.element("th", "x");
+ xhtml.element("th", "y");
+ xhtml.endElement("tr");
+ xhtml.startElement("tr");
+ xhtml.element("td", "a");
+ xhtml.element("td", "b");
+ xhtml.endElement("tr");
+ xhtml.endElement("table");
+ xhtml.endDocument();
+
+ String[] words = output.toString().split("\\s+");
+ assertEquals(6, words.length);
+ assertEquals("foo", words[0]);
+ assertEquals("bar", words[1]);
+ assertEquals("x", words[2]);
+ assertEquals("y", words[3]);
+ assertEquals("a", words[4]);
+ assertEquals("b", words[5]);
+ }
+
+}