You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/06/19 20:58:33 UTC

[tika] 02/02: TIKA-3120 -- remove whitelist/blacklist

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit d7480a96f25e161263a25e29b8e3ac4eb15efc7c
Author: tallison <ta...@apache.org>
AuthorDate: Fri Jun 19 16:48:13 2020 -0400

    TIKA-3120 -- remove whitelist/blacklist
    
    # Conflicts:
    #	tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
---
 CHANGES.txt                                        |  2 +-
 .../tika/eval/tools/TopCommonTokenCounter.java     | 10 ++++----
 .../tika/parser/html/BoilerpipeContentHandler.java | 27 ++++++++++++++--------
 .../test/java/org/apache/tika/TestXXEInXML.java    |  2 +-
 .../apache/tika/config/TikaDetectorConfigTest.java |  2 +-
 .../tika/config/TikaEncodingDetectorTest.java      |  4 ++--
 .../apache/tika/config/TikaParserConfigTest.java   |  8 +++----
 ...KA-1558-blacklist.xml => TIKA-1558-exclude.xml} |  0
 ...8-blacklistsub.xml => TIKA-1558-excludesub.xml} |  0
 ...lacklist.xml => TIKA-1702-detector-exclude.xml} |  0
 ...IKA-2273-exclude-encoding-detector-default.xml} |  0
 11 files changed, 32 insertions(+), 23 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 6081b4f..359d7c2 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1093,7 +1093,7 @@ Release 1.8 - 4/13/2015
 
   * MediaTypeRegistry support for returning known child types.
 
-  * Support for excluding (blacklisting) certain Parsers from being
+  * Support for excluding certain Parsers from being
     used by DefaultParser via the Tika Config file, using the new
     parser-exclude tag (TIKA-1558).
 
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tools/TopCommonTokenCounter.java b/tika-eval/src/main/java/org/apache/tika/eval/tools/TopCommonTokenCounter.java
index e430c08..539980e 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/tools/TopCommonTokenCounter.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tools/TopCommonTokenCounter.java
@@ -59,7 +59,7 @@ import org.apache.tika.utils.ProcessUtils;
  * The CommmonTokensAnalyzer intentionally drops tokens shorter than 4 characters,
  * but includes bigrams for cjk.
  *
- * It also has a white list for __email__ and __url__ and a black list
+ * It also has an include list for __email__ and __url__ and a skip list
  * for common html markup terms.
  */
 public class TopCommonTokenCounter {
@@ -85,7 +85,7 @@ public class TopCommonTokenCounter {
     private static int TOP_N = 30000;
     private static int MIN_DOC_FREQ = 10;
     //these should exist in every list
-    static Set<String> WHITE_LIST = new HashSet<>(Arrays.asList(
+    static Set<String> INCLUDE_LIST = new HashSet<>(Arrays.asList(
             new String[] {
                     URLEmailNormalizingFilterFactory.URL,
                     URLEmailNormalizingFilterFactory.EMAIL
@@ -96,7 +96,7 @@ public class TopCommonTokenCounter {
     //these are common 4 letter html markup words that we do
     //not want to count in case of failed markup processing.
     //see: https://issues.apache.org/jira/browse/TIKA-2267?focusedCommentId=15872055&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-15872055
-    static Set<String> BLACK_LIST = new HashSet<>(Arrays.asList(
+    static Set<String> SKIP_LIST = new HashSet<>(Arrays.asList(
             "span",
             "table",
             "href",
@@ -213,7 +213,7 @@ public class TopCommonTokenCounter {
                     if (queue.top() == null || queue.size() < TOP_N ||
                             df >= queue.top().df) {
                         String t = bytesRef.utf8ToString();
-                        if (! BLACK_LIST.contains(t)) {
+                        if (! SKIP_LIST.contains(t)) {
                             queue.insertWithOverflow(new TokenDFTF(t, df, tf));
                         }
 
@@ -260,7 +260,7 @@ public class TopCommonTokenCounter {
         writer.write("#UNIQUE_TERMS\t"+uniqueTerms+"\n");
         writer.write("#TOKEN\tDOCFREQ\tTERMFREQ\n");
         //add these tokens no matter what
-        for (String t : WHITE_LIST) {
+        for (String t : INCLUDE_LIST) {
             writer.write(t);
             writer.newLine();
         }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
index 4d5cc46..eaa3e8d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
@@ -21,7 +21,9 @@ import java.util.ArrayList;
 import java.util.BitSet;
 import java.util.List;
 import java.util.Locale;
+import java.util.Set;
 
+import com.google.common.collect.Sets;
 import de.l3s.boilerpipe.BoilerpipeExtractor;
 import de.l3s.boilerpipe.BoilerpipeProcessingException;
 import de.l3s.boilerpipe.document.TextBlock;
@@ -58,6 +60,7 @@ public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler {
     private int headerCharOffset;
     private List<RecordedElement> elements;
     private TextDocument td;
+    private Set<Character> allowableCharSet = Sets.newHashSet(' ', '\n', '\r');
     /**
      * Creates a new boilerpipe-based content extractor, using the
      * {@link DefaultExtractor} extraction rules and "delegate" as the content handler.
@@ -120,7 +123,7 @@ public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler {
         headerCharOffset = 0;
 
         if (includeMarkup) {
-            elements = new ArrayList<RecordedElement>();
+            elements = new ArrayList<>();
         }
     }
 
@@ -230,18 +233,24 @@ public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler {
                     case CONTINUE:
                         // Now emit characters that are valid. Note that boilerpipe pre-increments the character index, so
                         // we have to follow suit.
-                        for (char[] chars : element.getCharacters()) {
+                        for (int i = 0; i < element.getCharacters().size(); i++) {
+                            char[] chars = element.getCharacters().get(i);
                             curCharsIndex++;
+                            boolean isValidCharacterRun = validCharacterRuns.get(curCharsIndex);
 
-                            if (validCharacterRuns.get(curCharsIndex)) {
+                            // https://issues.apache.org/jira/projects/TIKA/issues/TIKA-2683
+                            // Allow exempted characters to be written
+                            if (isValidCharacterRun ||
+                                    (chars.length == 1 && allowableCharSet.contains(chars[0]))) {
                                 delegate.characters(chars, 0, chars.length);
+                            }
 
-                                // https://issues.apache.org/jira/browse/TIKA-961
-                                if (!Character.isWhitespace(chars[chars.length - 1])) {
-                                    // Only add whitespace for certain elements
-                                    if (XHTMLContentHandler.ENDLINE.contains(element.getLocalName())) {
-                                        delegate.ignorableWhitespace(NL, 0, NL.length);
-                                    }
+                            // https://issues.apache.org/jira/browse/TIKA-961
+                            if (isValidCharacterRun && i == element.getCharacters().size() - 1
+                                    && !Character.isWhitespace(chars[chars.length - 1])) {
+                                // Only add whitespace for certain elements
+                                if (XHTMLContentHandler.ENDLINE.contains(element.getLocalName())) {
+                                    delegate.ignorableWhitespace(NL, 0, NL.length);
                                 }
                             }
                         }
diff --git a/tika-parsers/src/test/java/org/apache/tika/TestXXEInXML.java b/tika-parsers/src/test/java/org/apache/tika/TestXXEInXML.java
index 720ff76..eeac98a 100644
--- a/tika-parsers/src/test/java/org/apache/tika/TestXXEInXML.java
+++ b/tika-parsers/src/test/java/org/apache/tika/TestXXEInXML.java
@@ -228,7 +228,7 @@ public class TestXXEInXML extends XMLTestBase {
         //tests the DOM reader in TikaConfig
         //if the safeguards aren't in place, this throws a FNFE
         try (InputStream is =
-                getResourceAsStream("/org/apache/tika/config/TIKA-1558-blacklist.xml") ) {
+                getResourceAsStream("/org/apache/tika/config/TIKA-1558-exclude.xml") ) {
             ByteArrayOutputStream bos = new ByteArrayOutputStream();
             IOUtils.copy(is, bos);
             byte[] injected = injectXML(bos.toByteArray(), XXE);
diff --git a/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java b/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
index 949107c..364765a 100644
--- a/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
@@ -41,7 +41,7 @@ import org.junit.Test;
 public class TikaDetectorConfigTest extends AbstractTikaConfigTest {
     @Test
     public void testDetectorExcludeFromDefault() throws Exception {
-        TikaConfig config = getConfig("TIKA-1702-detector-blacklist.xml");
+        TikaConfig config = getConfig("TIKA-1702-detector-exclude.xml");
         assertNotNull(config.getParser());
         assertNotNull(config.getDetector());
         CompositeDetector detector = (CompositeDetector)config.getDetector();
diff --git a/tika-parsers/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java b/tika-parsers/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
index 41b66fd..91ce09d 100644
--- a/tika-parsers/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
@@ -58,8 +58,8 @@ public class TikaEncodingDetectorTest extends AbstractTikaConfigTest {
     }
 
     @Test
-    public void testBlackList() throws Exception {
-        TikaConfig config = getConfig("TIKA-2273-blacklist-encoding-detector-default.xml");
+    public void testExcludeList() throws Exception {
+        TikaConfig config = getConfig("TIKA-2273-exclude-encoding-detector-default.xml");
         EncodingDetector detector = config.getEncodingDetector();
         assertTrue(detector instanceof CompositeEncodingDetector);
         List<EncodingDetector> detectors = ((CompositeEncodingDetector) detector).getDetectors();
diff --git a/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java b/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java
index 2acd358..008e0a6 100644
--- a/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java
@@ -43,7 +43,7 @@ import org.junit.Test;
 public class TikaParserConfigTest extends AbstractTikaConfigTest {
     @Test
     public void testMimeExcludeInclude() throws Exception {
-        TikaConfig config = getConfig("TIKA-1558-blacklist.xml");
+        TikaConfig config = getConfig("TIKA-1558-exclude.xml");
         assertNotNull(config.getParser());
         assertNotNull(config.getDetector());
         Parser parser = config.getParser();
@@ -82,7 +82,7 @@ public class TikaParserConfigTest extends AbstractTikaConfigTest {
     
     @Test
     public void testParserExcludeFromDefault() throws Exception {
-        TikaConfig config = getConfig("TIKA-1558-blacklist.xml");
+        TikaConfig config = getConfig("TIKA-1558-exclude.xml");
         assertNotNull(config.getParser());
         assertNotNull(config.getDetector());
         CompositeParser parser = (CompositeParser)config.getParser();
@@ -128,7 +128,7 @@ public class TikaParserConfigTest extends AbstractTikaConfigTest {
      * DefaultParser.
      */
     @Test
-    public void defaultParserBlacklist() throws Exception {
+    public void defaultParserExclude() throws Exception {
         TikaConfig config = new TikaConfig();
         assertNotNull(config.getParser());
         assertNotNull(config.getDetector());
@@ -145,7 +145,7 @@ public class TikaParserConfigTest extends AbstractTikaConfigTest {
         assertTrue("Default config should include an XMLParser.", hasXML);
 
         // This custom TikaConfig should exclude XMLParser and all of its subclasses.
-        config = getConfig("TIKA-1558-blacklistsub.xml");
+        config = getConfig("TIKA-1558-excludesub.xml");
         cp = (CompositeParser) config.getParser();
         parsers = cp.getAllComponentParsers();
 
diff --git a/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1558-blacklist.xml b/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1558-exclude.xml
similarity index 100%
rename from tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1558-blacklist.xml
rename to tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1558-exclude.xml
diff --git a/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1558-blacklistsub.xml b/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1558-excludesub.xml
similarity index 100%
rename from tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1558-blacklistsub.xml
rename to tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1558-excludesub.xml
diff --git a/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1702-detector-blacklist.xml b/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1702-detector-exclude.xml
similarity index 100%
rename from tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1702-detector-blacklist.xml
rename to tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1702-detector-exclude.xml
diff --git a/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-2273-blacklist-encoding-detector-default.xml b/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-2273-exclude-encoding-detector-default.xml
similarity index 100%
rename from tika-parsers/src/test/resources/org/apache/tika/config/TIKA-2273-blacklist-encoding-detector-default.xml
rename to tika-parsers/src/test/resources/org/apache/tika/config/TIKA-2273-exclude-encoding-detector-default.xml