You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/06/19 20:58:33 UTC
[tika] 02/02: TIKA-3120 -- remove whitelist/blacklist
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit d7480a96f25e161263a25e29b8e3ac4eb15efc7c
Author: tallison <ta...@apache.org>
AuthorDate: Fri Jun 19 16:48:13 2020 -0400
TIKA-3120 -- remove whitelist/blacklist
# Conflicts:
# tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
---
CHANGES.txt | 2 +-
.../tika/eval/tools/TopCommonTokenCounter.java | 10 ++++----
.../tika/parser/html/BoilerpipeContentHandler.java | 27 ++++++++++++++--------
.../test/java/org/apache/tika/TestXXEInXML.java | 2 +-
.../apache/tika/config/TikaDetectorConfigTest.java | 2 +-
.../tika/config/TikaEncodingDetectorTest.java | 4 ++--
.../apache/tika/config/TikaParserConfigTest.java | 8 +++----
...KA-1558-blacklist.xml => TIKA-1558-exclude.xml} | 0
...8-blacklistsub.xml => TIKA-1558-excludesub.xml} | 0
...lacklist.xml => TIKA-1702-detector-exclude.xml} | 0
...IKA-2273-exclude-encoding-detector-default.xml} | 0
11 files changed, 32 insertions(+), 23 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 6081b4f..359d7c2 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1093,7 +1093,7 @@ Release 1.8 - 4/13/2015
* MediaTypeRegistry support for returning known child types.
- * Support for excluding (blacklisting) certain Parsers from being
+ * Support for excluding certain Parsers from being
used by DefaultParser via the Tika Config file, using the new
parser-exclude tag (TIKA-1558).
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tools/TopCommonTokenCounter.java b/tika-eval/src/main/java/org/apache/tika/eval/tools/TopCommonTokenCounter.java
index e430c08..539980e 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/tools/TopCommonTokenCounter.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tools/TopCommonTokenCounter.java
@@ -59,7 +59,7 @@ import org.apache.tika.utils.ProcessUtils;
* The CommonTokensAnalyzer intentionally drops tokens shorter than 4 characters,
* but includes bigrams for cjk.
*
- * It also has a white list for __email__ and __url__ and a black list
+ It also has an include list for __email__ and __url__ and a skip list
* for common html markup terms.
*/
public class TopCommonTokenCounter {
@@ -85,7 +85,7 @@ public class TopCommonTokenCounter {
private static int TOP_N = 30000;
private static int MIN_DOC_FREQ = 10;
//these should exist in every list
- static Set<String> WHITE_LIST = new HashSet<>(Arrays.asList(
+ static Set<String> INCLUDE_LIST = new HashSet<>(Arrays.asList(
new String[] {
URLEmailNormalizingFilterFactory.URL,
URLEmailNormalizingFilterFactory.EMAIL
@@ -96,7 +96,7 @@ public class TopCommonTokenCounter {
//these are common 4 letter html markup words that we do
//not want to count in case of failed markup processing.
//see: https://issues.apache.org/jira/browse/TIKA-2267?focusedCommentId=15872055&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-15872055
- static Set<String> BLACK_LIST = new HashSet<>(Arrays.asList(
+ static Set<String> SKIP_LIST = new HashSet<>(Arrays.asList(
"span",
"table",
"href",
@@ -213,7 +213,7 @@ public class TopCommonTokenCounter {
if (queue.top() == null || queue.size() < TOP_N ||
df >= queue.top().df) {
String t = bytesRef.utf8ToString();
- if (! BLACK_LIST.contains(t)) {
+ if (! SKIP_LIST.contains(t)) {
queue.insertWithOverflow(new TokenDFTF(t, df, tf));
}
@@ -260,7 +260,7 @@ public class TopCommonTokenCounter {
writer.write("#UNIQUE_TERMS\t"+uniqueTerms+"\n");
writer.write("#TOKEN\tDOCFREQ\tTERMFREQ\n");
//add these tokens no matter what
- for (String t : WHITE_LIST) {
+ for (String t : INCLUDE_LIST) {
writer.write(t);
writer.newLine();
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
index 4d5cc46..eaa3e8d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
@@ -21,7 +21,9 @@ import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;
import java.util.Locale;
+import java.util.Set;
+import com.google.common.collect.Sets;
import de.l3s.boilerpipe.BoilerpipeExtractor;
import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.document.TextBlock;
@@ -58,6 +60,7 @@ public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler {
private int headerCharOffset;
private List<RecordedElement> elements;
private TextDocument td;
+ private Set<Character> allowableCharSet = Sets.newHashSet(' ', '\n', '\r');
/**
* Creates a new boilerpipe-based content extractor, using the
* {@link DefaultExtractor} extraction rules and "delegate" as the content handler.
@@ -120,7 +123,7 @@ public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler {
headerCharOffset = 0;
if (includeMarkup) {
- elements = new ArrayList<RecordedElement>();
+ elements = new ArrayList<>();
}
}
@@ -230,18 +233,24 @@ public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler {
case CONTINUE:
// Now emit characters that are valid. Note that boilerpipe pre-increments the character index, so
// we have to follow suit.
- for (char[] chars : element.getCharacters()) {
+ for (int i = 0; i < element.getCharacters().size(); i++) {
+ char[] chars = element.getCharacters().get(i);
curCharsIndex++;
+ boolean isValidCharacterRun = validCharacterRuns.get(curCharsIndex);
- if (validCharacterRuns.get(curCharsIndex)) {
+ // https://issues.apache.org/jira/projects/TIKA/issues/TIKA-2683
+ // Allow exempted characters to be written
+ if (isValidCharacterRun ||
+ (chars.length == 1 && allowableCharSet.contains(chars[0]))) {
delegate.characters(chars, 0, chars.length);
+ }
- // https://issues.apache.org/jira/browse/TIKA-961
- if (!Character.isWhitespace(chars[chars.length - 1])) {
- // Only add whitespace for certain elements
- if (XHTMLContentHandler.ENDLINE.contains(element.getLocalName())) {
- delegate.ignorableWhitespace(NL, 0, NL.length);
- }
+ // https://issues.apache.org/jira/browse/TIKA-961
+ if (isValidCharacterRun && i == element.getCharacters().size() - 1
+ && !Character.isWhitespace(chars[chars.length - 1])) {
+ // Only add whitespace for certain elements
+ if (XHTMLContentHandler.ENDLINE.contains(element.getLocalName())) {
+ delegate.ignorableWhitespace(NL, 0, NL.length);
}
}
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/TestXXEInXML.java b/tika-parsers/src/test/java/org/apache/tika/TestXXEInXML.java
index 720ff76..eeac98a 100644
--- a/tika-parsers/src/test/java/org/apache/tika/TestXXEInXML.java
+++ b/tika-parsers/src/test/java/org/apache/tika/TestXXEInXML.java
@@ -228,7 +228,7 @@ public class TestXXEInXML extends XMLTestBase {
//tests the DOM reader in TikaConfig
//if the safeguards aren't in place, this throws a FNFE
try (InputStream is =
- getResourceAsStream("/org/apache/tika/config/TIKA-1558-blacklist.xml") ) {
+ getResourceAsStream("/org/apache/tika/config/TIKA-1558-exclude.xml") ) {
ByteArrayOutputStream bos = new ByteArrayOutputStream();
IOUtils.copy(is, bos);
byte[] injected = injectXML(bos.toByteArray(), XXE);
diff --git a/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java b/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
index 949107c..364765a 100644
--- a/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
@@ -41,7 +41,7 @@ import org.junit.Test;
public class TikaDetectorConfigTest extends AbstractTikaConfigTest {
@Test
public void testDetectorExcludeFromDefault() throws Exception {
- TikaConfig config = getConfig("TIKA-1702-detector-blacklist.xml");
+ TikaConfig config = getConfig("TIKA-1702-detector-exclude.xml");
assertNotNull(config.getParser());
assertNotNull(config.getDetector());
CompositeDetector detector = (CompositeDetector)config.getDetector();
diff --git a/tika-parsers/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java b/tika-parsers/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
index 41b66fd..91ce09d 100644
--- a/tika-parsers/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
@@ -58,8 +58,8 @@ public class TikaEncodingDetectorTest extends AbstractTikaConfigTest {
}
@Test
- public void testBlackList() throws Exception {
- TikaConfig config = getConfig("TIKA-2273-blacklist-encoding-detector-default.xml");
+ public void testExcludeList() throws Exception {
+ TikaConfig config = getConfig("TIKA-2273-exclude-encoding-detector-default.xml");
EncodingDetector detector = config.getEncodingDetector();
assertTrue(detector instanceof CompositeEncodingDetector);
List<EncodingDetector> detectors = ((CompositeEncodingDetector) detector).getDetectors();
diff --git a/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java b/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java
index 2acd358..008e0a6 100644
--- a/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java
@@ -43,7 +43,7 @@ import org.junit.Test;
public class TikaParserConfigTest extends AbstractTikaConfigTest {
@Test
public void testMimeExcludeInclude() throws Exception {
- TikaConfig config = getConfig("TIKA-1558-blacklist.xml");
+ TikaConfig config = getConfig("TIKA-1558-exclude.xml");
assertNotNull(config.getParser());
assertNotNull(config.getDetector());
Parser parser = config.getParser();
@@ -82,7 +82,7 @@ public class TikaParserConfigTest extends AbstractTikaConfigTest {
@Test
public void testParserExcludeFromDefault() throws Exception {
- TikaConfig config = getConfig("TIKA-1558-blacklist.xml");
+ TikaConfig config = getConfig("TIKA-1558-exclude.xml");
assertNotNull(config.getParser());
assertNotNull(config.getDetector());
CompositeParser parser = (CompositeParser)config.getParser();
@@ -128,7 +128,7 @@ public class TikaParserConfigTest extends AbstractTikaConfigTest {
* DefaultParser.
*/
@Test
- public void defaultParserBlacklist() throws Exception {
+ public void defaultParserExclude() throws Exception {
TikaConfig config = new TikaConfig();
assertNotNull(config.getParser());
assertNotNull(config.getDetector());
@@ -145,7 +145,7 @@ public class TikaParserConfigTest extends AbstractTikaConfigTest {
assertTrue("Default config should include an XMLParser.", hasXML);
// This custom TikaConfig should exclude XMLParser and all of its subclasses.
- config = getConfig("TIKA-1558-blacklistsub.xml");
+ config = getConfig("TIKA-1558-excludesub.xml");
cp = (CompositeParser) config.getParser();
parsers = cp.getAllComponentParsers();
diff --git a/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1558-blacklist.xml b/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1558-exclude.xml
similarity index 100%
rename from tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1558-blacklist.xml
rename to tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1558-exclude.xml
diff --git a/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1558-blacklistsub.xml b/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1558-excludesub.xml
similarity index 100%
rename from tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1558-blacklistsub.xml
rename to tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1558-excludesub.xml
diff --git a/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1702-detector-blacklist.xml b/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1702-detector-exclude.xml
similarity index 100%
rename from tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1702-detector-blacklist.xml
rename to tika-parsers/src/test/resources/org/apache/tika/config/TIKA-1702-detector-exclude.xml
diff --git a/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-2273-blacklist-encoding-detector-default.xml b/tika-parsers/src/test/resources/org/apache/tika/config/TIKA-2273-exclude-encoding-detector-default.xml
similarity index 100%
rename from tika-parsers/src/test/resources/org/apache/tika/config/TIKA-2273-blacklist-encoding-detector-default.xml
rename to tika-parsers/src/test/resources/org/apache/tika/config/TIKA-2273-exclude-encoding-detector-default.xml