You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2013/01/07 00:49:01 UTC

svn commit: r1429615 - in /tika/trunk: ./ tika-core/src/main/java/org/apache/tika/sax/ tika-parsers/src/main/java/org/apache/tika/parser/xml/ tika-parsers/src/test/java/org/apache/tika/parser/ tika-parsers/src/test/java/org/apache/tika/parser/xml/ tika...

Author: mikemccand
Date: Sun Jan  6 23:49:01 2013
New Revision: 1429615

URL: http://svn.apache.org/viewvc?rev=1429615&view=rev
Log:
TIKA-1048: add space before each extracted XML element

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testXML2.xml   (with props)
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-core/src/main/java/org/apache/tika/sax/TextContentHandler.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1429615&r1=1429614&r2=1429615&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sun Jan  6 23:49:01 2013
@@ -60,6 +60,9 @@ Release 1.3 - Current Development
     archive, because it failed to create the parent directories first
     (TIKA-1031).
 
+  * XML: a space character is now added before each element
+    (TIKA-1048)
+
 Release 1.2 - 07/10/2012
 ---------------------------------
 

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/TextContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/TextContentHandler.java?rev=1429615&r1=1429614&r2=1429615&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/TextContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/TextContentHandler.java Sun Jan  6 23:49:01 2013
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.sax;
 
+import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
@@ -29,10 +30,18 @@ import org.xml.sax.helpers.DefaultHandle
  */
 public class TextContentHandler extends DefaultHandler {
 
+    private static final char[] SPACE = new char[] {' '};
+
     private final ContentHandler delegate;
+    private final boolean addSpaceBetweenElements;
 
     public TextContentHandler(ContentHandler delegate) {
+        this(delegate, false);
+    }
+
+    public TextContentHandler(ContentHandler delegate, boolean addSpaceBetweenElements) {
         this.delegate = delegate;
+        this.addSpaceBetweenElements = addSpaceBetweenElements;
     }
 
     @Override
@@ -48,6 +57,14 @@ public class TextContentHandler extends 
     }
 
     @Override
+    public void startElement(String uri, String localName, String qName, Attributes attributes)
+             throws SAXException {
+        if (addSpaceBetweenElements) {
+            delegate.characters(SPACE, 0, SPACE.length);
+        }
+    }
+
+    @Override
     public void startDocument() throws SAXException {
         delegate.startDocument();
     }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLParser.java?rev=1429615&r1=1429614&r2=1429615&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLParser.java Sun Jan  6 23:49:01 2013
@@ -84,6 +84,6 @@ public class XMLParser extends AbstractP
 
     protected ContentHandler getContentHandler(
             ContentHandler handler, Metadata metadata, ParseContext context) {
-        return new TextContentHandler(handler);
+        return new TextContentHandler(handler, true);
     }
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java?rev=1429615&r1=1429614&r2=1429615&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java Sun Jan  6 23:49:01 2013
@@ -53,11 +53,13 @@ public class ParsingReaderTest extends T
         String data = "<p>test <span>content</span></p>";
         InputStream stream = new ByteArrayInputStream(data.getBytes("UTF-8"));
         Reader reader = new ParsingReader(stream, "test.xml");
+        assertEquals(' ', (char) reader.read());
         assertEquals('t', (char) reader.read());
         assertEquals('e', (char) reader.read());
         assertEquals('s', (char) reader.read());
         assertEquals('t', (char) reader.read());
         assertEquals(' ', (char) reader.read());
+        assertEquals(' ', (char) reader.read());
         assertEquals('c', (char) reader.read());
         assertEquals('o', (char) reader.read());
         assertEquals('n', (char) reader.read());

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java?rev=1429615&r1=1429614&r2=1429615&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java Sun Jan  6 23:49:01 2013
@@ -18,15 +18,14 @@ package org.apache.tika.parser.xml;
 
 import java.io.InputStream;
 
-import junit.framework.TestCase;
-
+import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.sax.BodyContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.helpers.DefaultHandler;
 
-public class DcXMLParserTest extends TestCase {
+public class DcXMLParserTest extends TikaTest {
 
     public void testXMLParserAsciiChars() throws Exception {
         InputStream input = DcXMLParserTest.class.getResourceAsStream(
@@ -92,4 +91,9 @@ public class DcXMLParserTest extends Tes
         }
     }
 
+    // TIKA-1048
+    public void testNoSpaces() throws Exception {
+      String text = getXML("testXML2.xml").xml;
+      assertFalse(text.contains("testSubject"));
+    }
 }

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testXML2.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testXML2.xml?rev=1429615&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testXML2.xml (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testXML2.xml Sun Jan  6 23:49:01 2013
@@ -0,0 +1 @@
+<?xml version="1.0" encoding="utf-8" ?><searchml xmlns="http://www.foobar.com"><document type="Microsoft Word 2003/2004"><doc_properties><doc_property type="title">Title test</doc_property><doc_property type="subject">Subject test</doc_property></doc_properties></document></searchml>

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testXML2.xml
------------------------------------------------------------------------------
    svn:eol-style = native