You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/05/07 19:23:03 UTC

[tika] branch master updated: TIKA-2865 - parameterize minConfidence for tsv/csv detector

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new 4b33026  TIKA-2865 - parameterize minConfidence for tsv/csv detector
4b33026 is described below

commit 4b330263d1759c35886e6243d5edc5426be54029
Author: TALLISON <ta...@apache.org>
AuthorDate: Tue May 7 15:22:43 2019 -0400

    TIKA-2865 - parameterize minConfidence for tsv/csv detector
---
 .../org/apache/tika/parser/csv/CSVSniffer.java     |  9 ++++---
 .../apache/tika/parser/csv/TextAndCSVParser.java   |  9 ++++++-
 .../tika/parser/csv/TextAndCSVParserTest.java      | 13 +++++++++-
 .../org/apache/tika/parser/csv/tika-config.xml     | 30 ++++++++++++++++++++++
 4 files changed, 56 insertions(+), 5 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVSniffer.java b/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVSniffer.java
index 2bb0851..83241a2 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVSniffer.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVSniffer.java
@@ -35,6 +35,7 @@ import org.apache.tika.mime.MediaType;
 
 class CSVSniffer {
     private static final int DEFAULT_MARK_LIMIT = 10000;
+    private static final double DEFAULT_MIN_CONFIDENCE = 0.50;
     private static final int PUSH_BACK = 2;
     static final int EOF = -1;
     static final int NEW_LINE = '\n';
@@ -43,14 +44,16 @@ class CSVSniffer {
 
     private final char[] delimiters;
     private final int markLimit;
+    private final double minConfidence;
 
     CSVSniffer(char[] delimiters) {
-        this(DEFAULT_MARK_LIMIT, delimiters);
+        this(DEFAULT_MARK_LIMIT, delimiters, DEFAULT_MIN_CONFIDENCE);
     }
 
-    CSVSniffer(int markLimit, char[] delimiters) {
+    CSVSniffer(int markLimit, char[] delimiters, double minConfidence) {
         this.markLimit = markLimit;
         this.delimiters = delimiters;
+        this.minConfidence = minConfidence;
     }
 
     List<CSVResult> sniff(Reader reader) throws IOException {
@@ -86,7 +89,7 @@ class CSVSniffer {
             return CSVResult.TEXT;
         }
         CSVResult bestResult = results.get(0);
-        if (bestResult.getConfidence() < 0.10) {
+        if (bestResult.getConfidence() < minConfidence) {
             return CSVResult.TEXT;
         }
         return bestResult;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java
index 4018d1c..36ed122 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java
@@ -116,6 +116,13 @@ public class TextAndCSVParser extends AbstractEncodingDetectorParser {
     private int markLimit = DEFAULT_MARK_LIMIT;
 
 
+    /**
+     * Minimum confidence score the csv/tsv sniffer's best result must reach;
+     * below this threshold the content is treated as plain text (txt).
+     */
+    @Field
+    private double minConfidence = 0.50;
+
     public TextAndCSVParser() {
         super();
     }
@@ -267,7 +274,7 @@ public class TextAndCSVParser extends AbstractEncodingDetectorParser {
                 (params.getMediaType() == null ||
                         isCSVOrTSV(params.getMediaType()))) {
 
-            CSVSniffer sniffer = new CSVSniffer(markLimit, delimiters);
+            CSVSniffer sniffer = new CSVSniffer(markLimit, delimiters, minConfidence);
             CSVResult result = sniffer.getBest(reader, metadata);
             params.setMediaType(result.getMediaType());
             params.setDelimiter(result.getDelimiter());
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
index 63328d9..809f881 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
@@ -28,12 +28,14 @@ import java.util.Map;
 
 import org.apache.commons.io.ByteOrderMark;
 import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.HttpHeaders;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.Parser;
+import org.junit.BeforeClass;
 import org.junit.Test;
 
 public class TextAndCSVParserTest extends TikaTest {
@@ -67,7 +69,16 @@ public class TextAndCSVParserTest extends TikaTest {
 
     private static String EXPECTED_CSV = EXPECTED_TSV.replaceAll(",+", " ");
 
-    private static Parser PARSER = new AutoDetectParser();
+    private static Parser PARSER;
+
+    @BeforeClass
+    public static void setUp() throws Exception {
+
+        try (InputStream is = Thread.currentThread().getContextClassLoader()
+                .getResourceAsStream("org/apache/tika/parser/csv/tika-config.xml")) {
+            PARSER = new AutoDetectParser(new TikaConfig(is));
+        }
+    }
 
     @Test
     public void testCSV_UTF8() throws Exception {
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/csv/tika-config.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/csv/tika-config.xml
new file mode 100644
index 0000000..0b4de32
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/csv/tika-config.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.DefaultParser">
+            <parser-exclude class="org.apache.tika.parser.csv.TextAndCSVParser"/>
+        </parser>
+
+        <parser class="org.apache.tika.parser.csv.TextAndCSVParser">
+            <params>
+                <param name="minConfidence" type="double">0.2</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>