You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/03/07 21:38:44 UTC

[tika] branch branch_1x updated: TIKA-2836 -- csv/txt detection should glob identification

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new 0a86906  TIKA-2836 -- csv/txt detection should glob identification
0a86906 is described below

commit 0a86906cd1cfcb0f7d972265948849342ab72562
Author: TALLISON <ta...@apache.org>
AuthorDate: Thu Mar 7 16:37:55 2019 -0500

    TIKA-2836 -- csv/txt detection should glob identification
---
 .../java/org/apache/tika/parser/csv/CSVResult.java |  23 +-
 .../org/apache/tika/parser/csv/CSVSniffer.java     |  27 +--
 .../apache/tika/parser/csv/TextAndCSVParser.java   | 232 ++++++---------------
 .../org/apache/tika/parser/csv/CSVSnifferTest.java |   1 +
 .../tika/parser/csv/TextAndCSVParserTest.java      |  26 ++-
 5 files changed, 123 insertions(+), 186 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVResult.java b/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVResult.java
index 13dad8e..817d909 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVResult.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVResult.java
@@ -22,13 +22,13 @@ import org.apache.tika.mime.MediaType;
 
 public class CSVResult implements Comparable<CSVResult> {
 
-    static CSVResult TEXT = new CSVResult(1.0, MediaType.TEXT_PLAIN, '\n');
+    static CSVResult TEXT = new CSVResult(1.0, MediaType.TEXT_PLAIN, null);
 
     private final double confidence;
     private final MediaType mediaType;
-    private final char delimiter;
+    private final Character delimiter;
 
-    public CSVResult(double confidence, MediaType mediaType, char delimiter) {
+    public CSVResult(double confidence, MediaType mediaType, Character delimiter) {
         this.confidence = confidence;
         this.mediaType = mediaType;
         this.delimiter = delimiter;
@@ -38,6 +38,10 @@ public class CSVResult implements Comparable<CSVResult> {
         return mediaType;
     }
 
+    /**
+     *
+     * @return the delimiter, or <code>null</code> if the media type is text/plain
+     */
     public Character getDelimiter() {
         return delimiter;
     }
@@ -58,8 +62,8 @@ public class CSVResult implements Comparable<CSVResult> {
         if (o == null || getClass() != o.getClass()) return false;
         CSVResult csvResult = (CSVResult) o;
         return Double.compare(csvResult.confidence, confidence) == 0 &&
-                delimiter == csvResult.delimiter &&
-                mediaType.equals(csvResult.mediaType);
+                mediaType.equals(csvResult.mediaType) &&
+                Objects.equals(delimiter, csvResult.delimiter);
     }
 
     @Override
@@ -67,6 +71,15 @@ public class CSVResult implements Comparable<CSVResult> {
         return Objects.hash(confidence, mediaType, delimiter);
     }
 
+    @Override
+    public String toString() {
+        return "CSVResult{" +
+                "confidence=" + confidence +
+                ", mediaType=" + mediaType +
+                ", delimiter=" + delimiter +
+                '}';
+    }
+
     public double getConfidence() {
         return confidence;
     }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVSniffer.java b/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVSniffer.java
index b463272..2bb0851 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVSniffer.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVSniffer.java
@@ -30,6 +30,7 @@ import java.util.Map;
 
 import org.apache.commons.io.input.ProxyReader;
 import org.apache.commons.lang3.mutable.MutableInt;
+import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 
 class CSVSniffer {
@@ -70,25 +71,25 @@ class CSVSniffer {
         return ret;
     }
 
-    //gets the best result with confidence > 0
-    //otherwise, returns CSVResult
-
     /**
-     *
      * @param reader
-     * @return the best result with confidence > 0; if none exist, it returns {@link CSVResult#TEXT}
+     * @param metadata
+     * @return the best detection result, or {@link CSVResult#TEXT} if no results
+     *         were produced or the best result's confidence is below 0.10.
      * @throws IOException
      */
-    CSVResult getBest(Reader reader) throws IOException {
+    CSVResult getBest(Reader reader, Metadata metadata) throws IOException {
+        //TODO: take into consideration the filename.  Perhaps require
+        //a higher confidence if detection contradicts filename?
         List<CSVResult> results = sniff(reader);
-
-        if (results.size() > 0) {
-            CSVResult result = results.get(0);
-            if (result.getConfidence() > 0.0) {
-                return result;
-            }
+        if (results == null || results.size() == 0) {
+            return CSVResult.TEXT;
+        }
+        CSVResult bestResult = results.get(0);
+        if (bestResult.getConfidence() < 0.10) {
+            return CSVResult.TEXT;
         }
-        return CSVResult.TEXT;
+        return bestResult;
     }
 
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java
index 12db86b..8e560c4 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java
@@ -16,9 +16,7 @@
  */
 package org.apache.tika.parser.csv;
 
-import java.io.BufferedInputStream;
 import java.io.BufferedReader;
-import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
@@ -39,19 +37,34 @@ import org.apache.tika.config.Field;
 import org.apache.tika.detect.AutoDetectReader;
 import org.apache.tika.detect.EncodingDetector;
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.IOUtils;
-import org.apache.tika.metadata.HttpHeaders;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractEncodingDetectorParser;
-import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
+/**
+ * Unless the {@link TikaCoreProperties#CONTENT_TYPE_OVERRIDE} is set,
+ * this parser tries to assess whether the file is a text file, csv or tsv.
+ * If the detector detects regularity in column numbers and/or encapsulated cells,
+ * this parser will apply the {@link org.apache.commons.csv.CSVParser};
+ * otherwise, it will treat the contents as text.
+ * <p>
+ *     If there is a csv parse exception during detection, the parser sets
+ * the {@link Metadata#CONTENT_TYPE} to {@link MediaType#TEXT_PLAIN}
+ * and treats the file as {@link MediaType#TEXT_PLAIN}.
+ * </p>
+ * <p>
+ *     If there is a csv parse exception during the parse, the parser
+ *     writes what's left of the stream as if it were text and then throws
+ *     an exception.  As of this writing, the content that was buffered by the underlying
+ *     {@link org.apache.commons.csv.CSVParser} is lost.
+ * </p>
+ */
 public class TextAndCSVParser extends AbstractEncodingDetectorParser {
 
     private static final String CSV_PREFIX = "csv";
@@ -87,6 +100,22 @@ public class TextAndCSVParser extends AbstractEncodingDetectorParser {
             STRING_TO_CHAR_DELIMITER_MAP.put(e.getValue(), e.getKey());
         }
     }
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    CSV, TSV, MediaType.TEXT_PLAIN)));
+
+    private char[] delimiters = DEFAULT_DELIMITERS;
+
+    /**
+     * This is the mark limit in characters (not bytes) to
+     * read from the stream when classifying the stream as
+     * csv, tsv or txt.
+     */
+    @Field
+    private int markLimit = DEFAULT_MARK_LIMIT;
+
+
     public TextAndCSVParser() {
         super();
     }
@@ -94,14 +123,6 @@ public class TextAndCSVParser extends AbstractEncodingDetectorParser {
     public TextAndCSVParser(EncodingDetector encodingDetector) {
         super(encodingDetector);
     }
-    private char[] delimiters = DEFAULT_DELIMITERS;
-
-    @Field
-    private int markLimit = DEFAULT_MARK_LIMIT;
-
-    private static final Set<MediaType> SUPPORTED_TYPES =
-            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
-                    CSV, TSV, MediaType.TEXT_PLAIN)));
 
     @Override
     public Set<MediaType> getSupportedTypes(ParseContext context) {
@@ -126,17 +147,18 @@ public class TextAndCSVParser extends AbstractEncodingDetectorParser {
             reader = new BufferedReader(new InputStreamReader(stream, params.getCharset()));
             charset = params.getCharset();
         }
+
+        updateMetadata(params, metadata);
+
         //if text or a non-csv/tsv category of text
         //treat this as text and be done
-        //TODO -- if it was detected already as a non-csv subtype of text
+        //TODO -- if it was detected as a non-csv subtype of text
         if (! params.getMediaType().getBaseType().equals(CSV) &&
             ! params.getMediaType().getBaseType().equals(TSV)) {
             handleText(reader, charset, handler, metadata);
             return;
         }
 
-        updateMetadata(params, metadata);
-
         CSVFormat csvFormat = CSVFormat.EXCEL.withDelimiter(params.getDelimiter());
         metadata.set(DELIMITER_PROPERTY, CHAR_TO_STRING_DELIMITER_MAP.get(csvFormat.getDelimiter()));
 
@@ -213,24 +235,6 @@ public class TextAndCSVParser extends AbstractEncodingDetectorParser {
 
     }
 
-    private void updateMetadata(CSVParams params, Metadata metadata) {
-        MediaType mediaType = (params.getDelimiter() == '\t') ? TSV : CSV;
-        Map<String, String> attrs = new HashMap<>();
-        attrs.put(CHARSET, params.getCharset().name());
-        if (params.getDelimiter() != null) {
-            if (CHAR_TO_STRING_DELIMITER_MAP.containsKey(params.getDelimiter())) {
-                attrs.put(DELIMITER, CHAR_TO_STRING_DELIMITER_MAP.get(params.getDelimiter()));
-            } else {
-                attrs.put(DELIMITER, Integer.toString((int)params.getDelimiter().charValue()));
-            }
-        }
-        MediaType type = new MediaType(mediaType, attrs);
-        metadata.set(Metadata.CONTENT_TYPE, type.toString());
-        // deprecated, see TIKA-431
-        metadata.set(Metadata.CONTENT_ENCODING, params.getCharset().name());
-
-    }
-
     private Reader detect(CSVParams params, InputStream stream,
                         Metadata metadata, ParseContext context) throws IOException, TikaException {
         //if the file was already identified as not .txt, .csv or .tsv
@@ -263,12 +267,8 @@ public class TextAndCSVParser extends AbstractEncodingDetectorParser {
                 (params.getMediaType() == null ||
                         isCSVOrTSV(params.getMediaType()))) {
 
-            CSVSniffer sniffer = new CSVSniffer(delimiters);
-            CSVResult result = sniffer.getBest(reader);
-            //we should require a higher confidence if the content-type
-            //is text/plain -- e.g. if the file name ends in .txt or
-            //the parent parser has an indication that this is txt
-            //(as in mail attachment headers)
+            CSVSniffer sniffer = new CSVSniffer(markLimit, delimiters);
+            CSVResult result = sniffer.getBest(reader, metadata);
             params.setMediaType(result.getMediaType());
             params.setDelimiter(result.getDelimiter());
         }
@@ -321,140 +321,38 @@ public class TextAndCSVParser extends AbstractEncodingDetectorParser {
         }
         return false;
     }
-    private byte[] readFirstX(InputStream stream, int markLimit) throws IOException {
-        byte[] bytes = new byte[markLimit];
-
-        try {
-            stream.mark(markLimit);
-            int numRead = IOUtils.read(stream, bytes, 0, bytes.length);
-            if (numRead < markLimit) {
-                byte[] dest = new byte[numRead];
-                System.arraycopy(bytes, 0, dest, 0, numRead);
-                bytes = dest;
-            }
-        } finally {
-            stream.reset();
-        }
-        return bytes;
-    }
-
-    private CSVFormat guessFormat(byte[] bytes, Charset charset, Metadata metadata) throws IOException {
-
-        String mediaTypeString = metadata.get(Metadata.CONTENT_TYPE);
-        char bestDelimiter = (mediaTypeString.endsWith("csv")) ? ',' : '\t';
-        CSVReadTestResult bestResult = null;
-
-        for (char c : DEFAULT_DELIMITERS) {
-
-            try (Reader reader = new InputStreamReader(new ByteArrayInputStream(bytes), charset)) {
-
-                CSVReadTestResult testResult = attemptCSVRead(c, bytes.length, reader);
-                if (bestResult == null || testResult.isBetterThan(bestResult)) {
-                    bestResult = testResult;
-                    bestDelimiter = c;
-                }
-            }
-        }
-        return CSVFormat.EXCEL.withDelimiter(bestDelimiter);
-    }
 
-    private CSVReadTestResult attemptCSVRead(char delimiter, int bytesTotal, Reader reader) throws IOException {
-
-        //maps <rowLength, numberOfRows>
-        Map<Integer, Integer> colCounts = new HashMap<>();
-        long lastCharacterPosition = -1L;
-        int rowCount = 0;
-        boolean illegalStateException = false;
-        try {
-            org.apache.commons.csv.CSVParser p = new org.apache.commons.csv.CSVParser(reader, CSVFormat.EXCEL.withDelimiter(delimiter));
-
-            for (CSVRecord row : p) {
-                int colCount = row.size();
-                lastCharacterPosition = row.getCharacterPosition();
-                Integer cnt = colCounts.get(colCount);
-                if (cnt == null) {
-                    cnt = 1;
-                } else {
-                    cnt++;
-                }
-                colCounts.put(colCount, cnt);
-                rowCount++;
+    private void updateMetadata(CSVParams params, Metadata metadata) {
+        MediaType mediaType = null;
+        if (params.getMediaType().getBaseType().equals(MediaType.TEXT_PLAIN)) {
+            mediaType = MediaType.TEXT_PLAIN;
+        } else if (params.getDelimiter() != null) {
+            if (params.getDelimiter() == '\t') {
+                mediaType = TSV;
+            } else {
+                mediaType = CSV;
             }
-        } catch (IllegalStateException e) {
-            //this could be bad encapsulation -- invalid char between encapsulated token
-            //swallow while guessing
-            illegalStateException = true;
-        }
-
-        int mostCommonColCount = -1;
-        int totalCount = 0;
-        for (Integer count : colCounts.values()) {
-            if (count > mostCommonColCount) {
-                mostCommonColCount = count;
+        } else {
+            if (metadata.get(Metadata.CONTENT_TYPE) != null) {
+                mediaType = MediaType.parse(
+                        metadata.get(Metadata.CONTENT_TYPE));
             }
-            totalCount += count;
-        }
-        double percentMostCommonRowLength = -1.0f;
-        if (totalCount > 0) {
-            percentMostCommonRowLength = (double) mostCommonColCount / (double) totalCount;
         }
-        return new CSVReadTestResult(bytesTotal, lastCharacterPosition, rowCount, percentMostCommonRowLength, illegalStateException);
-
-    }
-
-    private static class CSVReadTestResult {
-        private final int bytesTotal;
-        private final long bytesParsed;
-        private final int rowCount;
-        //the percentage of the rows that have the
-        //the most common row length -- maybe use stdev?
-        private final double percentMostCommonRowLength;
-        private final boolean illegalStateException;
-
-        public CSVReadTestResult(int bytesTotal, long bytesParsed, int rowCount,
-                                 double percentMostCommonRowLength, boolean illegalStateException) {
-            this.bytesTotal = bytesTotal;
-            this.bytesParsed = bytesParsed;
-            this.rowCount = rowCount;
-            this.percentMostCommonRowLength = percentMostCommonRowLength;
-            this.illegalStateException = illegalStateException;
+        Map<String, String> attrs = new HashMap<>();
+        if (params.getCharset() != null) {
+            attrs.put(CHARSET, params.getCharset().name());
+            // deprecated, see TIKA-431
+            metadata.set(Metadata.CONTENT_ENCODING, params.getCharset().name());
         }
-
-        public boolean isBetterThan(CSVReadTestResult bestResult) {
-            if (bestResult == null) {
-                return true;
-            }
-            if (illegalStateException && ! bestResult.illegalStateException) {
-                return false;
-            } else if (! illegalStateException && bestResult.illegalStateException) {
-                return true;
-            }
-            //if there are >= 3 rows in both, select the one with the better
-            //percentMostCommonRowLength
-            if (this.rowCount >= 3 && bestResult.rowCount >= 3) {
-                if (percentMostCommonRowLength > bestResult.percentMostCommonRowLength) {
-                    return true;
-                } else {
-                    return false;
-                }
-            }
-
-            //if there's a big difference between the number of bytes parsed,
-            //pick the one that allowed more parsed bytes
-            if (bytesTotal > 0 && Math.abs((bestResult.bytesParsed - bytesParsed) / bytesTotal) > 0.1f) {
-                if (bytesParsed > bestResult.bytesParsed) {
-                    return true;
-                } else {
-                    return false;
-                }
+        if (!mediaType.equals(MediaType.TEXT_PLAIN) && params.getDelimiter() != null) {
+            if (CHAR_TO_STRING_DELIMITER_MAP.containsKey(params.getDelimiter())) {
+                attrs.put(DELIMITER, CHAR_TO_STRING_DELIMITER_MAP.get(params.getDelimiter()));
+            } else {
+                attrs.put(DELIMITER, Integer.toString((int)params.getDelimiter().charValue()));
             }
-            //add other heuristics as necessary
-
-            //if there's no other information,
-            //default to not better = default
-            return false;
         }
+        MediaType type = new MediaType(mediaType, attrs);
+        metadata.set(Metadata.CONTENT_TYPE, type.toString());
     }
 
-
 }
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/csv/CSVSnifferTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/csv/CSVSnifferTest.java
index 998473c..89b92d3 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/csv/CSVSnifferTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/csv/CSVSnifferTest.java
@@ -89,6 +89,7 @@ public class CSVSnifferTest extends TikaTest {
     @Test
     public void testCSVMidCellQuoteException() throws Exception {
         List<CSVResult> results = sniff(DELIMITERS, CSV_MID_CELL_QUOTE_EXCEPTION, StandardCharsets.UTF_8);
+
         assertEquals(2, results.size());
     }
 
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
index 0e07511..ed05d1a 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
@@ -28,6 +28,7 @@ import java.util.Map;
 
 import org.apache.commons.io.ByteOrderMark;
 import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.HttpHeaders;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
@@ -163,10 +164,25 @@ public class TextAndCSVParserTest extends TikaTest {
         metadata.set(Metadata.RESOURCE_NAME_KEY, "test.csv");
         XMLResult xmlResult = getXML(new ByteArrayInputStream(csv), PARSER, metadata);
         assertNull(xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
-        assertEquals("text/csv; charset=ISO-8859-1", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("text/plain; charset=ISO-8859-1", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
         assertContains("the,quick", xmlResult.xml);
     }
 
+    @Test //TIKA-2836
+    public void testNonCSV() throws Exception {
+
+        byte[] bytes = ("testcsv\n" +
+                "testcsv testcsv;;; testcsv").getBytes(StandardCharsets.UTF_8);
+        Metadata metadata = new Metadata();
+        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv");
+        XMLResult xmlResult = getXML(new ByteArrayInputStream(bytes), PARSER, metadata);
+        assertContains("text/plain", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+
+        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.txt");
+        xmlResult = getXML(new ByteArrayInputStream(bytes), PARSER, metadata);
+        assertContains("text/plain", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+    }
+
     @Test
     public void testLong() throws Exception {
         //test mark/reset worked on the sniffers
@@ -183,6 +199,14 @@ public class TextAndCSVParserTest extends TikaTest {
                 xmlResult.metadata.get(Metadata.CONTENT_TYPE));
     }
 
+    //TIKA-2047
+    @Test
+    public void testSubclassingMimeTypesRemain() throws Exception {
+        XMLResult r = getXML("testVCalendar.vcs");
+        assertEquals("text/x-vcalendar; charset=ISO-8859-1", r.metadata.get(Metadata.CONTENT_TYPE));
+    }
+
+
     private void assertContainsIgnoreWhiteSpaceDiffs(String expected, String xml) {
         assertContains(expected, xml.replaceAll("[\r\n\t ]", " "));
     }