You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/05/07 19:23:03 UTC
[tika] branch master updated: TIKA-2865 - parameterize
minConfidence for tsv/csv detector
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 4b33026 TIKA-2865 - parameterize minConfidence for tsv/csv detector
4b33026 is described below
commit 4b330263d1759c35886e6243d5edc5426be54029
Author: TALLISON <ta...@apache.org>
AuthorDate: Tue May 7 15:22:43 2019 -0400
TIKA-2865 - parameterize minConfidence for tsv/csv detector
---
.../org/apache/tika/parser/csv/CSVSniffer.java | 9 ++++---
.../apache/tika/parser/csv/TextAndCSVParser.java | 9 ++++++-
.../tika/parser/csv/TextAndCSVParserTest.java | 13 +++++++++-
.../org/apache/tika/parser/csv/tika-config.xml | 30 ++++++++++++++++++++++
4 files changed, 56 insertions(+), 5 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVSniffer.java b/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVSniffer.java
index 2bb0851..83241a2 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVSniffer.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVSniffer.java
@@ -35,6 +35,7 @@ import org.apache.tika.mime.MediaType;
class CSVSniffer {
private static final int DEFAULT_MARK_LIMIT = 10000;
+ private static final double DEFAULT_MIN_CONFIDENCE = 0.50;
private static final int PUSH_BACK = 2;
static final int EOF = -1;
static final int NEW_LINE = '\n';
@@ -43,14 +44,16 @@ class CSVSniffer {
private final char[] delimiters;
private final int markLimit;
+ private final double minConfidence;
CSVSniffer(char[] delimiters) {
- this(DEFAULT_MARK_LIMIT, delimiters);
+ this(DEFAULT_MARK_LIMIT, delimiters, DEFAULT_MIN_CONFIDENCE);
}
- CSVSniffer(int markLimit, char[] delimiters) {
+ CSVSniffer(int markLimit, char[] delimiters, double minConfidence) {
this.markLimit = markLimit;
this.delimiters = delimiters;
+ this.minConfidence = minConfidence;
}
List<CSVResult> sniff(Reader reader) throws IOException {
@@ -86,7 +89,7 @@ class CSVSniffer {
return CSVResult.TEXT;
}
CSVResult bestResult = results.get(0);
- if (bestResult.getConfidence() < 0.10) {
+ if (bestResult.getConfidence() < minConfidence) {
return CSVResult.TEXT;
}
return bestResult;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java
index 4018d1c..36ed122 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java
@@ -116,6 +116,13 @@ public class TextAndCSVParser extends AbstractEncodingDetectorParser {
private int markLimit = DEFAULT_MARK_LIMIT;
+ /**
+ * minimum confidence score that there's enough
+ * evidence to determine csv/tsv vs. txt
+ */
+ @Field
+ private double minConfidence = 0.50;
+
public TextAndCSVParser() {
super();
}
@@ -267,7 +274,7 @@ public class TextAndCSVParser extends AbstractEncodingDetectorParser {
(params.getMediaType() == null ||
isCSVOrTSV(params.getMediaType()))) {
- CSVSniffer sniffer = new CSVSniffer(markLimit, delimiters);
+ CSVSniffer sniffer = new CSVSniffer(markLimit, delimiters, minConfidence);
CSVResult result = sniffer.getBest(reader, metadata);
params.setMediaType(result.getMediaType());
params.setDelimiter(result.getDelimiter());
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
index 63328d9..809f881 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
@@ -28,12 +28,14 @@ import java.util.Map;
import org.apache.commons.io.ByteOrderMark;
import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
+import org.junit.BeforeClass;
import org.junit.Test;
public class TextAndCSVParserTest extends TikaTest {
@@ -67,7 +69,16 @@ public class TextAndCSVParserTest extends TikaTest {
private static String EXPECTED_CSV = EXPECTED_TSV.replaceAll(",+", " ");
- private static Parser PARSER = new AutoDetectParser();
+ private static Parser PARSER;
+
+ @BeforeClass
+ public static void setUp() throws Exception {
+
+ try (InputStream is = Thread.currentThread().getContextClassLoader()
+ .getResourceAsStream("org/apache/tika/parser/csv/tika-config.xml")) {
+ PARSER = new AutoDetectParser(new TikaConfig(is));
+ }
+ }
@Test
public void testCSV_UTF8() throws Exception {
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/csv/tika-config.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/csv/tika-config.xml
new file mode 100644
index 0000000..0b4de32
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/csv/tika-config.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <parser-exclude class="org.apache.tika.parser.csv.TextAndCSVParser"/>
+ </parser>
+
+ <parser class="org.apache.tika.parser.csv.TextAndCSVParser">
+ <params>
+ <param name="minConfidence" type="double">0.2</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>