You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/11/30 21:30:15 UTC

[tika] branch master updated (a477d73 -> 6b5dd8b)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from a477d73  TIKA-2776 -- improve documentation for -maxFiles
     new 4ae1a10  TIKA-2550 -- fix whitespace
     new 6b5dd8b  TIKA-2550 -- prevent content in style/script elements from being written in ToTextContentHandler

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CHANGES.txt                                        |   3 +
 .../org/apache/tika/sax/ToTextContentHandler.java  |  43 +++++-
 .../tika/parser/code/SourceCodeParserTest.java     | 163 ++++++++++++---------
 3 files changed, 137 insertions(+), 72 deletions(-)


[tika] 02/02: TIKA-2550 -- prevent content in style/script elements from being written in ToTextContentHandler

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 6b5dd8bbe09eb099ec75846ec02391cbd32351c4
Author: TALLISON <ta...@apache.org>
AuthorDate: Fri Nov 30 16:23:36 2018 -0500

    TIKA-2550 -- prevent content in style/script elements from being written in ToTextContentHandler
---
 CHANGES.txt                                        |  3 ++
 .../org/apache/tika/sax/ToTextContentHandler.java  | 43 +++++++++++++++++++++-
 .../tika/parser/code/SourceCodeParserTest.java     | 33 ++++++++++++++---
 3 files changed, 72 insertions(+), 7 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 843b25b..750a5c9 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -7,6 +7,9 @@ Release 2.0.0 - ???
 
 Release 1.20 - ???
 
+   * Prevent content within <style/> and <script/> elements
+     from being written by the ToTextContentHandler (TIKA-2550).
+
    * Switch child to parent communication to a shared memory-mapped
      file in tika-server's -spawnChild mode.
 
diff --git a/tika-core/src/main/java/org/apache/tika/sax/ToTextContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/ToTextContentHandler.java
index 4fdeaf3..530eb90 100755
--- a/tika-core/src/main/java/org/apache/tika/sax/ToTextContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/ToTextContentHandler.java
@@ -23,7 +23,9 @@ import java.io.StringWriter;
 import java.io.UnsupportedEncodingException;
 import java.io.Writer;
 import java.nio.charset.Charset;
+import java.util.Locale;
 
+import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
@@ -31,11 +33,19 @@ import org.xml.sax.helpers.DefaultHandler;
  * SAX event handler that writes all character content out to a character
  * stream. No escaping or other transformations are made on the character
  * content.
- *
+ * <p>
+ * As of Tika 1.20, this handler ignores content within &lt;script&gt; and
+ * &lt;style&gt; tags.
+ *</p>
  * @since Apache Tika 0.10
  */
 public class ToTextContentHandler extends DefaultHandler {
 
+    private static final String STYLE = "STYLE";
+    private static final String SCRIPT = "SCRIPT";
+    private int styleDepth = 0;
+    private int scriptDepth = 0;
+
     /**
      * The character stream.
      */
@@ -89,6 +99,11 @@ public class ToTextContentHandler extends DefaultHandler {
     @Override
     public void characters(char[] ch, int start, int length)
             throws SAXException {
+
+        if (styleDepth+scriptDepth != 0) {
+            return;
+        }
+
         try {
             writer.write(ch, start, length);
         } catch (IOException e) {
@@ -125,6 +140,32 @@ public class ToTextContentHandler extends DefaultHandler {
         }
     }
 
+    @Override
+    public void startElement(
+            String uri, String localName, String qName, Attributes atts)
+            throws SAXException {
+        String uc = (qName == null) ? "" : qName.toUpperCase(Locale.ENGLISH);
+        if (uc.equals(STYLE)) {
+            styleDepth++;
+        }
+        if (uc.equals(SCRIPT)) {
+            scriptDepth++;
+        }
+    }
+
+    @Override
+    public void endElement(
+            String uri, String localName, String qName)
+            throws SAXException {
+        String uc = (qName == null) ? "" : qName.toUpperCase(Locale.ENGLISH);
+        if (uc.equals(STYLE)) {
+            styleDepth--;
+        }
+        if (uc.equals(SCRIPT)) {
+            scriptDepth--;
+        }
+    }
+
     /**
      * Returns the contents of the internal string buffer where
      * all the received characters have been collected. Only works
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
index e6af91d..5c791f4 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
@@ -17,12 +17,16 @@
 package org.apache.tika.parser.code;
 
 import org.apache.tika.TikaTest;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.ToTextContentHandler;
 import org.junit.Test;
+import org.xml.sax.ContentHandler;
 
 import java.io.ByteArrayInputStream;
 import java.util.Set;
@@ -91,11 +95,28 @@ public class SourceCodeParserTest extends TikaTest {
         assertTrue(strContent.indexOf("public class HelloWorld {") > 0);
     }
 
-  private Metadata createMetadata(String mimeType) {
-    Metadata metadata = new Metadata();
-    metadata.add(TikaCoreProperties.RESOURCE_NAME_KEY, "testFile");
-    metadata.add(Metadata.CONTENT_TYPE, mimeType);
-    return metadata;
-  }
+    @Test
+    public void testNoMarkupInToTextHandler() throws Exception {
+
+        Parser p = new AutoDetectParser();
+        ContentHandler contentHandler = new ToTextContentHandler();
+        ParseContext parseContext = new ParseContext();
+        try (TikaInputStream tis = TikaInputStream.get(
+                getResourceAsStream("/test-documents/testJAVA.java"))) {
+            p.parse(tis, contentHandler, createMetadata("text/x-java-source"),
+                    parseContext);
+        }
+        String strContent = contentHandler.toString();
+        assertContains("public class HelloWorld {", strContent);
+        assertNotContained("background-color", strContent);
+    }
+
+
+    private Metadata createMetadata(String mimeType) {
+        Metadata metadata = new Metadata();
+        metadata.add(TikaCoreProperties.RESOURCE_NAME_KEY, "testFile");
+        metadata.add(Metadata.CONTENT_TYPE, mimeType);
+        return metadata;
+    }
 
 }


[tika] 01/02: TIKA-2550 -- fix whitespace

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 4ae1a10ec3f44f5278a1b741f0ea795c3f664cb3
Author: TALLISON <ta...@apache.org>
AuthorDate: Fri Nov 30 15:58:44 2018 -0500

    TIKA-2550 -- fix whitespace
---
 .../tika/parser/code/SourceCodeParserTest.java     | 110 ++++++++++-----------
 1 file changed, 55 insertions(+), 55 deletions(-)

diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
index 554b060..e6af91d 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/code/SourceCodeParserTest.java
@@ -16,14 +16,6 @@
  */
 package org.apache.tika.parser.code;
 
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
-import java.io.ByteArrayInputStream;
-import java.util.Set;
-
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
@@ -32,64 +24,72 @@ import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.junit.Test;
 
+import java.io.ByteArrayInputStream;
+import java.util.Set;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
 public class SourceCodeParserTest extends TikaTest {
 
-  private SourceCodeParser sourceCodeParser = new SourceCodeParser();
+    private SourceCodeParser sourceCodeParser = new SourceCodeParser();
 
-  @Test
-  public void testSupportTypes() throws Exception {
-    Set<MediaType> supportedTypes = sourceCodeParser.getSupportedTypes(new ParseContext());
-    assertTrue(supportedTypes.contains(new MediaType("text", "x-java-source")));
-    assertTrue(supportedTypes.contains(new MediaType("text", "x-groovy")));
-    assertTrue(supportedTypes.contains(new MediaType("text", "x-c++src")));
+    @Test
+    public void testSupportTypes() throws Exception {
+        Set<MediaType> supportedTypes = sourceCodeParser.getSupportedTypes(new ParseContext());
+        assertTrue(supportedTypes.contains(new MediaType("text", "x-java-source")));
+        assertTrue(supportedTypes.contains(new MediaType("text", "x-groovy")));
+        assertTrue(supportedTypes.contains(new MediaType("text", "x-c++src")));
 
-    assertFalse(sourceCodeParser.getSupportedTypes(new ParseContext()).contains(new MediaType("text", "html")));
-  }
+        assertFalse(sourceCodeParser.getSupportedTypes(new ParseContext()).contains(new MediaType("text", "html")));
+    }
 
-  @Test
-  public void testHTMLRenderWithReturnLine() throws Exception {
-    String htmlContent = getXML(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser, createMetadata("text/x-java-source")).xml;
-    
-    assertTrue(htmlContent.indexOf("<html:html lang=\"en\" xml:lang=\"en\"") == 0);
-    assertTrue(htmlContent.indexOf("<html:span class=\"java_keyword\">public</span><html:span class=\"java_plain\">") > 0);
-    assertTrue(htmlContent.indexOf("<html:span class=\"java_keyword\">static</span>") > 0);
-    assertTrue(htmlContent.indexOf("<html:br clear=\"none\" />") > 0);
-  }
-  
-  @Test
-  public void testTextRender() throws Exception {
-    String textContent = getText(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser, createMetadata("text/x-java-source"));
-    
-    assertTrue(textContent.length() > 0);
-    assertTrue(textContent.indexOf("html") < 0);
-    
-    textContent = getText(new ByteArrayInputStream("public class HelloWorld {}".getBytes(UTF_8)), sourceCodeParser, createMetadata("text/x-java-source"));
-    assertTrue(textContent.length() > 0);
-    assertTrue(textContent.indexOf("html") < 0);
-  }
+    @Test
+    public void testHTMLRenderWithReturnLine() throws Exception {
+        String htmlContent = getXML(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser, createMetadata("text/x-java-source")).xml;
 
-  @Test
-  public void testLoC() throws Exception {
-    Metadata metadata = createMetadata("text/x-groovy");
-    getText(getResourceAsStream("/test-documents/testGROOVY.groovy"), sourceCodeParser, metadata);
+        assertTrue(htmlContent.indexOf("<html:html lang=\"en\" xml:lang=\"en\"") == 0);
+        assertTrue(htmlContent.indexOf("<html:span class=\"java_keyword\">public</span><html:span class=\"java_plain\">") > 0);
+        assertTrue(htmlContent.indexOf("<html:span class=\"java_keyword\">static</span>") > 0);
+        assertTrue(htmlContent.indexOf("<html:br clear=\"none\" />") > 0);
+    }
 
-    assertEquals(metadata.get("LoC"), "9");
-  }
+    @Test
+    public void testTextRender() throws Exception {
+        String textContent = getText(getResourceAsStream("/test-documents/testJAVA.java"), sourceCodeParser, createMetadata("text/x-java-source"));
 
-  @Test
-  public void testAuthor() throws Exception {
-    Metadata metadata = createMetadata("text/x-c++src");
-    getText(getResourceAsStream("/test-documents/testCPP.cpp"), sourceCodeParser, metadata);
+        assertTrue(textContent.length() > 0);
+        assertTrue(textContent.indexOf("html") < 0);
 
-    assertEquals("Hong-Thai Nguyen", metadata.get(TikaCoreProperties.CREATOR));
-  }
+        textContent = getText(new ByteArrayInputStream("public class HelloWorld {}".getBytes(UTF_8)), sourceCodeParser, createMetadata("text/x-java-source"));
+        assertTrue(textContent.length() > 0);
+        assertTrue(textContent.indexOf("html") < 0);
+    }
 
-  @Test
-  public void testReturnContentAsIsForTextHandler() throws Exception {
-    String strContent = getXML(getResourceAsStream("/test-documents/testJAVA.java"), new AutoDetectParser(), createMetadata("text/plain")).xml;
+    @Test
+    public void testLoC() throws Exception {
+        Metadata metadata = createMetadata("text/x-groovy");
+        getText(getResourceAsStream("/test-documents/testGROOVY.groovy"), sourceCodeParser, metadata);
 
-    assertTrue(strContent.indexOf("public class HelloWorld {") > 0);
-  }
+        assertEquals(metadata.get("LoC"), "9");
+    }
+
+    @Test
+    public void testAuthor() throws Exception {
+        Metadata metadata = createMetadata("text/x-c++src");
+        getText(getResourceAsStream("/test-documents/testCPP.cpp"), sourceCodeParser, metadata);
+
+        assertEquals("Hong-Thai Nguyen", metadata.get(TikaCoreProperties.CREATOR));
+    }
+
+    @Test
+    public void testReturnContentAsIsForTextHandler() throws Exception {
+        String strContent = getXML(getResourceAsStream("/test-documents/testJAVA.java"), new AutoDetectParser(), createMetadata("text/plain")).xml;
+
+        assertTrue(strContent.indexOf("public class HelloWorld {") > 0);
+    }
 
   private Metadata createMetadata(String mimeType) {
     Metadata metadata = new Metadata();