You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2013/01/07 00:49:01 UTC
svn commit: r1429615 - in /tika/trunk: ./
tika-core/src/main/java/org/apache/tika/sax/
tika-parsers/src/main/java/org/apache/tika/parser/xml/
tika-parsers/src/test/java/org/apache/tika/parser/
tika-parsers/src/test/java/org/apache/tika/parser/xml/ tika...
Author: mikemccand
Date: Sun Jan 6 23:49:01 2013
New Revision: 1429615
URL: http://svn.apache.org/viewvc?rev=1429615&view=rev
Log:
TIKA-1048: add space before each extracted XML element
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testXML2.xml (with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/TextContentHandler.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1429615&r1=1429614&r2=1429615&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sun Jan 6 23:49:01 2013
@@ -60,6 +60,9 @@ Release 1.3 - Current Development
archive, because it failed to create the parent directories first
(TIKA-1031).
+ * XML: a space character is now added before each element
+ (TIKA-1048)
+
Release 1.2 - 07/10/2012
---------------------------------
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/TextContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/TextContentHandler.java?rev=1429615&r1=1429614&r2=1429615&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/TextContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/TextContentHandler.java Sun Jan 6 23:49:01 2013
@@ -16,6 +16,7 @@
*/
package org.apache.tika.sax;
+import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
@@ -29,10 +30,18 @@ import org.xml.sax.helpers.DefaultHandle
*/
public class TextContentHandler extends DefaultHandler {
+ private static final char[] SPACE = new char[] {' '};
+
private final ContentHandler delegate;
+ private final boolean addSpaceBetweenElements;
public TextContentHandler(ContentHandler delegate) {
+ this(delegate, false);
+ }
+
+ public TextContentHandler(ContentHandler delegate, boolean addSpaceBetweenElements) {
this.delegate = delegate;
+ this.addSpaceBetweenElements = addSpaceBetweenElements;
}
@Override
@@ -48,6 +57,14 @@ public class TextContentHandler extends
}
@Override
+ public void startElement(String uri, String localName, String qName, Attributes attributes)
+ throws SAXException {
+ if (addSpaceBetweenElements) {
+ delegate.characters(SPACE, 0, SPACE.length);
+ }
+ }
+
+ @Override
public void startDocument() throws SAXException {
delegate.startDocument();
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLParser.java?rev=1429615&r1=1429614&r2=1429615&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLParser.java Sun Jan 6 23:49:01 2013
@@ -84,6 +84,6 @@ public class XMLParser extends AbstractP
protected ContentHandler getContentHandler(
ContentHandler handler, Metadata metadata, ParseContext context) {
- return new TextContentHandler(handler);
+ return new TextContentHandler(handler, true);
}
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java?rev=1429615&r1=1429614&r2=1429615&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java Sun Jan 6 23:49:01 2013
@@ -53,11 +53,13 @@ public class ParsingReaderTest extends T
String data = "<p>test <span>content</span></p>";
InputStream stream = new ByteArrayInputStream(data.getBytes("UTF-8"));
Reader reader = new ParsingReader(stream, "test.xml");
+ assertEquals(' ', (char) reader.read());
assertEquals('t', (char) reader.read());
assertEquals('e', (char) reader.read());
assertEquals('s', (char) reader.read());
assertEquals('t', (char) reader.read());
assertEquals(' ', (char) reader.read());
+ assertEquals(' ', (char) reader.read());
assertEquals('c', (char) reader.read());
assertEquals('o', (char) reader.read());
assertEquals('n', (char) reader.read());
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java?rev=1429615&r1=1429614&r2=1429615&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java Sun Jan 6 23:49:01 2013
@@ -18,15 +18,14 @@ package org.apache.tika.parser.xml;
import java.io.InputStream;
-import junit.framework.TestCase;
-
+import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.DefaultHandler;
-public class DcXMLParserTest extends TestCase {
+public class DcXMLParserTest extends TikaTest {
public void testXMLParserAsciiChars() throws Exception {
InputStream input = DcXMLParserTest.class.getResourceAsStream(
@@ -92,4 +91,9 @@ public class DcXMLParserTest extends Tes
}
}
+ // TIKA-1048
+ public void testNoSpaces() throws Exception {
+ String text = getXML("testXML2.xml").xml;
+ assertFalse(text.contains("testSubject"));
+ }
}
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testXML2.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testXML2.xml?rev=1429615&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testXML2.xml (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testXML2.xml Sun Jan 6 23:49:01 2013
@@ -0,0 +1 @@
+<?xml version="1.0" encoding="utf-8" ?><searchml xmlns="http://www.foobar.com"><document type="Microsoft Word 2003/2004"><doc_properties><doc_property type="title">Title test</doc_property><doc_property type="subject">Subject test</doc_property></doc_properties></document></searchml>
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testXML2.xml
------------------------------------------------------------------------------
svn:eol-style = native