Posted to commits@tika.apache.org by ta...@apache.org on 2019/02/13 01:39:37 UTC

[tika] branch branch_1x updated: TIKA-2828 -- initial CSVParser commit

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new 150d4bc  TIKA-2828 -- initial CSVParser commit
150d4bc is described below

commit 150d4bc2d3af238bd3608309a345e189b3c657b6
Author: TALLISON <ta...@apache.org>
AuthorDate: Tue Feb 12 20:38:55 2019 -0500

    TIKA-2828 -- initial CSVParser commit
---
 CHANGES.txt                                        |   3 +
 .../src/main/java/org/apache/tika/io/IOUtils.java  |  31 ++
 .../java/org/apache/tika/parser/csv/CSVParser.java | 313 +++++++++++++++++++++
 .../services/org.apache.tika.parser.Parser         |   1 +
 .../org/apache/tika/parser/csv/CSVParserTest.java  | 164 +++++++++++
 5 files changed, 512 insertions(+)

diff --git a/CHANGES.txt b/CHANGES.txt
index dfa6bae..cd71cd7 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 1.21 - ????
 
+   * Add a CSVParser.  CSV detection is currently based solely on filename
+     and/or information conveyed via Metadata (TIKA-2826).
+
    * General upgrades: jackcess, opennlp, httpcomponents, zstd-jni, cxf, Lucene (TIKA-2824)
 
    * Bundle xerces2 with tika-parsers (TIKA-2802).
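
A minimal usage sketch (not part of this commit; it mirrors the unit tests
added below). Per TIKA-2826, detection is driven by the file name and/or
metadata, so the caller supplies the hint:

    import java.io.ByteArrayInputStream;
    import java.io.InputStream;
    import java.nio.charset.StandardCharsets;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.metadata.TikaCoreProperties;
    import org.apache.tika.parser.AutoDetectParser;
    import org.apache.tika.parser.csv.CSVParser;
    import org.apache.tika.sax.BodyContentHandler;

    public class CsvUsageSketch {
        public static void main(String[] args) throws Exception {
            byte[] csv = "a,b,c\n1,2,3\n".getBytes(StandardCharsets.UTF_8);
            Metadata metadata = new Metadata();
            //without the name hint, text detection would not route to CSVParser
            metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv");
            BodyContentHandler handler = new BodyContentHandler();
            try (InputStream is = new ByteArrayInputStream(csv)) {
                new AutoDetectParser().parse(is, handler, metadata);
            }
            System.out.println(metadata.get(CSVParser.DELIMITER)); //"comma"
            System.out.println(handler.toString());
        }
    }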
diff --git a/tika-core/src/main/java/org/apache/tika/io/IOUtils.java b/tika-core/src/main/java/org/apache/tika/io/IOUtils.java
index 11d3bd3..a05176f 100644
--- a/tika-core/src/main/java/org/apache/tika/io/IOUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/IOUtils.java
@@ -1183,4 +1183,35 @@ public class IOUtils {
         return (ch2 == -1);
     }
 
+    /**
+     * Reads bytes from an input stream.
+     * This implementation guarantees that it will read as many bytes
+     * as possible before giving up; this may not always be the case for
+     * subclasses of {@link InputStream}.
+     *
+     * @param input where to read input from
+     * @param buffer destination
+     * @param offset initial offset into buffer
+     * @param length length to read, must be &gt;= 0
+     * @return actual length read; may be less than requested if EOF was reached
+     * @throws IOException if a read error occurs
+     */
+    public static int read(final InputStream input, final byte[] buffer, final int offset, final int length)
+            throws IOException {
+        if (length < 0) {
+            throw new IllegalArgumentException("Length must not be negative: " + length);
+        }
+        int remaining = length;
+        while (remaining > 0) {
+            final int location = length - remaining;
+            final int count = input.read(buffer, offset + location, remaining);
+            if (count == -1) { // EOF
+                break;
+            }
+            remaining -= count;
+        }
+        return length - remaining;
+    }
+
 }
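
The helper above exists because a single InputStream.read call may legally
return fewer bytes than requested (network and cipher streams do this
routinely); the loop keeps reading until the request is satisfied or EOF.
A sketch of the contract, with illustrative sizes:

    import java.io.ByteArrayInputStream;
    import java.io.IOException;
    import java.io.InputStream;

    import org.apache.tika.io.IOUtils;

    public class ReadContractSketch {
        public static void main(String[] args) throws IOException {
            byte[] data = new byte[100];
            byte[] buffer = new byte[200];
            try (InputStream is = new ByteArrayInputStream(data)) {
                //asks for 200 bytes but returns 100: EOF is reached first.
                //on a slow stream, a bare is.read(buffer, 0, 200) could also
                //return after a partial read; IOUtils.read loops past that.
                int n = IOUtils.read(is, buffer, 0, buffer.length);
                System.out.println(n); //prints 100
            }
        }
    }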
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVParser.java
new file mode 100644
index 0000000..7fb57f6
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVParser.java
@@ -0,0 +1,313 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.csv;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.csv.CSVFormat;
+import org.apache.commons.csv.CSVRecord;
+import org.apache.tika.config.Field;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
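+/**
+ * Parses CSV and TSV files.  If a content-type override supplies both the
+ * delimiter and the charset, they are used directly; otherwise the parser
+ * buffers up to markLimit bytes (20,000 by default), autodetects the
+ * charset, and guesses the delimiter by test-parsing the buffered bytes
+ * with each candidate delimiter.
+ */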
+public class CSVParser extends AbstractParser {
+    private static final String CSV_PREFIX = "csv";
+    public static final Property DELIMITER = Property.externalText(
+            CSV_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "delimiter");
+
+    private static final String TD = "td";
+    private static final String TR = "tr";
+    private static final String TABLE = "table";
+
+    private static final MediaType CSV = MediaType.text("csv");
+    private static final MediaType TSV = MediaType.text("tsv");
+
+    private static final int DEFAULT_MARK_LIMIT = 20000;
+
+    //TODO: add '|' as a candidate delimiter, or make this list configurable?
+    private static final char[] CANDIDATE_DELIMITERS = new char[]{',', '\t'};
+
+    private static final Map<Character, String> DELIMITERS = new HashMap<>();
+
+    static {
+        DELIMITERS.put(',', "comma");
+        DELIMITERS.put('\t', "tab");
+    }
+
+
+    @Field
+    private int markLimit = DEFAULT_MARK_LIMIT;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    CSV, TSV)));
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+                      Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
+
+        String override = metadata.get(TikaCoreProperties.CONTENT_TYPE_OVERRIDE);
+        Character overrideDelimiter = null;
+        Charset overrideCharset = null;
+        if (override != null) {
+            MediaType mediaType = MediaType.parse(override);
+            String charset = mediaType.getParameters().get("charset");
+            overrideDelimiter = mediaType.getBaseType().toString().endsWith("tsv") ? '\t' : ',';
+            if (charset != null) {
+                try {
+                    overrideCharset = Charset.forName(charset);
+                } catch (UnsupportedCharsetException e) {
+                    //swallow
+                }
+            }
+        }
+        if (overrideDelimiter == null || overrideCharset == null) {
+            if (!stream.markSupported()) {
+                stream = new BufferedInputStream(stream);
+            }
+        }
+        //buffer up to the first markLimit bytes to sniff the delimiter
+        byte[] firstX = null;
+        if (overrideDelimiter == null) {
+            firstX = readFirstX(stream, markLimit);
+        }
+        Charset charset = null;
+        Reader reader = null;
+        org.apache.commons.csv.CSVParser commonsParser = null;
+        try {
+            //detect the charset only if none was supplied via the override
+            if (overrideCharset == null) {
+                reader = new AutoDetectReader(stream);
+                charset = ((AutoDetectReader) reader).getCharset();
+            } else {
+                reader = new BufferedReader(new InputStreamReader(stream, overrideCharset));
+                charset = overrideCharset;
+            }
+            CSVFormat csvFormat = null;
+            if (overrideDelimiter == null) {
+                csvFormat = guessFormat(firstX, charset, metadata);
+            } else {
+                csvFormat = CSVFormat.EXCEL.withDelimiter(overrideDelimiter);
+            }
+            metadata.set(DELIMITER, DELIMITERS.get(csvFormat.getDelimiter()));
+
+            if (overrideCharset == null || overrideDelimiter == null) {
+                MediaType mediaType = (csvFormat.getDelimiter() == '\t') ? TSV : CSV;
+                MediaType type = new MediaType(mediaType, charset);
+                metadata.set(Metadata.CONTENT_TYPE, type.toString());
+                // deprecated, see TIKA-431
+                metadata.set(Metadata.CONTENT_ENCODING, charset.name());
+            }
+
+            XHTMLContentHandler xhtmlContentHandler = new XHTMLContentHandler(handler, metadata);
+            commonsParser = new org.apache.commons.csv.CSVParser(reader, csvFormat);
+            xhtmlContentHandler.startDocument();
+            xhtmlContentHandler.startElement(TABLE);
+            try {
+                for (CSVRecord row : commonsParser) {
+                    xhtmlContentHandler.startElement(TR);
+                    for (String cell : row) {
+                        xhtmlContentHandler.startElement(TD);
+                        xhtmlContentHandler.characters(cell);
+                        xhtmlContentHandler.endElement(TD);
+                    }
+                    xhtmlContentHandler.endElement(TR);
+                }
+            } catch (IllegalStateException e) {
+                throw new TikaException("exception parsing the csv", e);
+            }
+
+            xhtmlContentHandler.endElement(TABLE);
+            xhtmlContentHandler.endDocument();
+        } finally {
+            if (commonsParser != null) {
+                try {
+                    commonsParser.close();
+                } catch (IOException e) {
+                    //swallow
+                }
+            }
+            IOUtils.closeQuietly(reader);
+        }
+    }
+
+    private byte[] readFirstX(InputStream stream, int markLimit) throws IOException {
+        byte[] bytes = new byte[markLimit];
+
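+        //mark so the stream can be rewound after sniffing; parse() guarantees
+        //mark support (BufferedInputStream) before calling this method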
+        try {
+            stream.mark(markLimit);
+            int numRead = IOUtils.read(stream, bytes, 0, bytes.length);
+            if (numRead < markLimit) {
+                byte[] dest = new byte[numRead];
+                System.arraycopy(bytes, 0, dest, 0, numRead);
+                bytes = dest;
+            }
+        } finally {
+            stream.reset();
+        }
+        return bytes;
+    }
+
+    private CSVFormat guessFormat(byte[] bytes, Charset charset, Metadata metadata) throws IOException {
+
+        String mediaTypeString = metadata.get(Metadata.CONTENT_TYPE);
+        //fall back to ',' if no content type was set (e.g., direct parser use)
+        char bestDelimiter = (mediaTypeString == null || mediaTypeString.endsWith("csv")) ? ',' : '\t';
+        CSVReadTestResult bestResult = null;
+
+        for (char c : CANDIDATE_DELIMITERS) {
+
+            try (Reader reader = new InputStreamReader(new ByteArrayInputStream(bytes), charset)) {
+
+                CSVReadTestResult testResult = attemptCSVRead(c, bytes.length, reader);
+                if (bestResult == null || testResult.isBetterThan(bestResult)) {
+                    bestResult = testResult;
+                    bestDelimiter = c;
+                }
+            }
+        }
+        return CSVFormat.EXCEL.withDelimiter(bestDelimiter);
+    }
+
+    private CSVReadTestResult attemptCSVRead(char delimiter, int bytesTotal, Reader reader) throws IOException {
+
+        //maps number of columns in a row -> number of rows with that column count
+        Map<Integer, Integer> colCounts = new HashMap<>();
+        long lastCharacterPosition = -1L;
+        int rowCount = 0;
+        boolean illegalStateException = false;
+        try {
+            org.apache.commons.csv.CSVParser p = new org.apache.commons.csv.CSVParser(reader, CSVFormat.EXCEL.withDelimiter(delimiter));
+
+            for (CSVRecord row : p) {
+                int colCount = row.size();
+                lastCharacterPosition = row.getCharacterPosition();
+                Integer cnt = colCounts.getOrDefault(colCount, 0) + 1;
+                colCounts.put(colCount, cnt);
+                rowCount++;
+            }
+        } catch (IllegalStateException e) {
+            //likely bad encapsulation -- e.g. an invalid char between an encapsulated token and the delimiter
+            //swallow while guessing
+            illegalStateException = true;
+        }
+
+        int maxRowsWithSameColCount = -1;
+        int totalCount = 0;
+        for (Integer count : colCounts.values()) {
+            if (count > maxRowsWithSameColCount) {
+                maxRowsWithSameColCount = count;
+            }
+            totalCount += count;
+        }
+        double percentMostCommonRowLength = -1.0;
+        if (totalCount > 0) {
+            percentMostCommonRowLength = (double) maxRowsWithSameColCount / (double) totalCount;
+        }
+        return new CSVReadTestResult(bytesTotal, lastCharacterPosition, rowCount, percentMostCommonRowLength, illegalStateException);
+
+    }
+
+    private static class CSVReadTestResult {
+        private final int bytesTotal;
+        private final long bytesParsed;
+        private final int rowCount;
+        //the percentage of rows that have the most common
+        //row length -- maybe use stdev instead?
+        private final double percentMostCommonRowLength;
+        private final boolean illegalStateException;
+
+        public CSVReadTestResult(int bytesTotal, long bytesParsed, int rowCount,
+                                 double percentMostCommonRowLength, boolean illegalStateException) {
+            this.bytesTotal = bytesTotal;
+            this.bytesParsed = bytesParsed;
+            this.rowCount = rowCount;
+            this.percentMostCommonRowLength = percentMostCommonRowLength;
+            this.illegalStateException = illegalStateException;
+        }
+
+        public boolean isBetterThan(CSVReadTestResult bestResult) {
+            if (bestResult == null) {
+                return true;
+            }
+            if (illegalStateException && ! bestResult.illegalStateException) {
+                return false;
+            } else if (! illegalStateException && bestResult.illegalStateException) {
+                return true;
+            }
+            //if there are >= 3 rows in both, select the one with the better
+            //percentMostCommonRowLength
+            if (this.rowCount >= 3 && bestResult.rowCount >= 3) {
+                return percentMostCommonRowLength > bestResult.percentMostCommonRowLength;
+            }
+
+            //if there's a big difference (> 10% of the total bytes) in how far
+            //each parse got, pick the one that parsed more bytes; the division
+            //must be done in floating point or the long division truncates to 0
+            if (bytesTotal > 0 &&
+                    Math.abs((double) (bestResult.bytesParsed - bytesParsed) / (double) bytesTotal) > 0.1) {
+                return bytesParsed > bestResult.bytesParsed;
+            }
+            //add other heuristics as necessary
+
+            //with no other distinguishing information, default to "not better"
+            return false;
+        }
+    }
+}
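
For callers that already know the format, the override path above skips both
charset detection and delimiter guessing; a sketch (the input bytes are
illustrative):

    import java.io.ByteArrayInputStream;
    import java.io.InputStream;
    import java.nio.charset.StandardCharsets;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.metadata.TikaCoreProperties;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.csv.CSVParser;
    import org.apache.tika.sax.BodyContentHandler;

    public class TsvOverrideSketch {
        public static void main(String[] args) throws Exception {
            byte[] tsv = "a\tb\tc\n1\t2\t3\n".getBytes(StandardCharsets.UTF_8);
            Metadata metadata = new Metadata();
            //"text/tsv" fixes the delimiter to '\t' and charset=UTF-8 fixes
            //the charset, so neither readFirstX nor AutoDetectReader runs
            metadata.set(TikaCoreProperties.CONTENT_TYPE_OVERRIDE,
                    "text/tsv; charset=UTF-8");
            BodyContentHandler handler = new BodyContentHandler();
            try (InputStream is = new ByteArrayInputStream(tsv)) {
                new CSVParser().parse(is, handler, metadata, new ParseContext());
            }
            System.out.println(metadata.get(CSVParser.DELIMITER)); //"tab"
        }
    }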
diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index a33a578..f33c4d6 100644
--- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -19,6 +19,7 @@ org.apache.tika.parser.audio.AudioParser
 org.apache.tika.parser.audio.MidiParser
 org.apache.tika.parser.crypto.Pkcs7Parser
 org.apache.tika.parser.crypto.TSDParser
+org.apache.tika.parser.csv.CSVParser
 org.apache.tika.parser.dwg.DWGParser
 org.apache.tika.parser.epub.EpubParser
 org.apache.tika.parser.executable.ExecutableParser
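
Registering the class here is what lets DefaultParser (and so
AutoDetectParser and the Tika facade) discover it through the ServiceLoader
mechanism; callers need nothing beyond tika-parsers on the classpath. For
example (the file name is hypothetical):

    import java.io.File;

    import org.apache.tika.Tika;

    public class FacadeSketch {
        public static void main(String[] args) throws Exception {
            //the facade delegates to AutoDetectParser, which now sees CSVParser
            System.out.println(new Tika().parseToString(new File("test.csv")));
        }
    }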
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/csv/CSVParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/csv/CSVParserTest.java
new file mode 100644
index 0000000..de6ec29
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/csv/CSVParserTest.java
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.csv;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+
+import org.apache.commons.io.ByteOrderMark;
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.junit.Test;
+
+public class CSVParserTest extends TikaTest {
+
+    private static final byte[] CSV_UTF8 =
+            ("the,quick,brown\tfox\n" +
+              "jumped \tover,the\tlazy,\tdog\n"+
+              "and then,ran,down\tthe\tstreet").getBytes(StandardCharsets.UTF_8);
+
+    private static final byte[] CSV_UTF_16LE =
+            ("the,quick,brown\tfox\n" +
+                    "jumped \tover,the\tlazy,\tdog\n"+
+                    "and then,ran,down\tthe\tstreet").getBytes(StandardCharsets.UTF_16LE);
+
+
+    private static final byte[] TSV_UTF8 =
+            ("the\tquick\tbrown,fox\n" +
+                    "jumped ,over\tthe,lazy\t,dog\n"+
+                    "and then\tran\tdown,the,street").getBytes(StandardCharsets.UTF_8);
+
+    private static final byte[] TSV_UTF_16LE =
+            ("the\tquick\tbrown,fox\n" +
+                    "jumped ,over\tthe,lazy\t,dog\n"+
+                    "and then\tran\tdown,the,street").getBytes(StandardCharsets.UTF_16LE);
+
+
+    private static final String EXPECTED_TSV = ("<table><tr> <td>the</td> <td>quick</td> <td>brown,fox</td></tr>\n" +
+            "<tr> <td>jumped ,over</td> <td>the,lazy</td> <td>,dog</td></tr>\n" +
+            "<tr> <td>and then</td> <td>ran</td> <td>down,the,street</td></tr>\n" +
+            "</table>").replaceAll("[\r\n\t ]+", " ");
+
+    private static final String EXPECTED_CSV = EXPECTED_TSV.replaceAll(",+", " ");
+
+    private static final Parser PARSER = new AutoDetectParser();
+
+    @Test
+    public void testCSV_UTF8() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv");
+        XMLResult xmlResult = getXML(new ByteArrayInputStream(CSV_UTF8), PARSER, metadata);
+        assertEquals("comma", xmlResult.metadata.get(CSVParser.DELIMITER));
+        assertEquals("text/csv; charset=ISO-8859-1", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+        assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml);
+    }
+
+    @Test
+    public void testCSV_UTF8_TypeOverride() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(TikaCoreProperties.CONTENT_TYPE_OVERRIDE, "text/csv; charset=UTF-8");
+        XMLResult xmlResult = getXML(new ByteArrayInputStream(CSV_UTF8), PARSER, metadata);
+        assertEquals("comma", xmlResult.metadata.get(CSVParser.DELIMITER));
+        assertEquals("text/csv; charset=UTF-8", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+        assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml);
+    }
+
+    @Test
+    public void testCSV_UTF8_Type() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "text/csv");
+        XMLResult xmlResult = getXML(new ByteArrayInputStream(CSV_UTF8), PARSER, metadata);
+        assertEquals("comma", xmlResult.metadata.get(CSVParser.DELIMITER));
+        assertEquals("text/csv; charset=ISO-8859-1", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+        assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml);
+    }
+
+    @Test
+    public void testCSV_UTF16LE() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv");
+        XMLResult xmlResult = getXML(new ByteArrayInputStream(CSV_UTF_16LE), PARSER, metadata);
+        assertEquals("comma", xmlResult.metadata.get(CSVParser.DELIMITER));
+        assertEquals("text/csv; charset=UTF-16LE", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+        assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml);
+    }
+
+    @Test
+    public void testCSV_UTF16LE_BOM() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv");
+        XMLResult xmlResult = getXML(new ByteArrayInputStream(
+                concat(ByteOrderMark.UTF_16LE.getBytes(), CSV_UTF_16LE)), PARSER, metadata);
+        assertEquals("comma", xmlResult.metadata.get(CSVParser.DELIMITER));
+        assertEquals("text/csv; charset=UTF-16LE", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+        assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml);
+    }
+
+    private static byte[] concat(byte[] bytesA, byte[] bytesB) {
+        byte[] ret = new byte[bytesA.length+bytesB.length];
+        System.arraycopy(bytesA, 0, ret, 0, bytesA.length);
+        System.arraycopy(bytesB, 0, ret, bytesA.length, bytesB.length);
+        return ret;
+    }
+
+    @Test
+    public void testTSV_UTF8() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv");
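+        //the resource name hints csv, but delimiter sniffing should detect tab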
+        XMLResult xmlResult = getXML(new ByteArrayInputStream(TSV_UTF8), PARSER, metadata);
+        assertEquals("tab", xmlResult.metadata.get(CSVParser.DELIMITER));
+        assertEquals("text/tsv; charset=ISO-8859-1", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+        assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_TSV, xmlResult.xml);
+    }
+
+    @Test
+    public void testTSV_UTF16LE() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv");
+        XMLResult xmlResult = getXML(new ByteArrayInputStream(TSV_UTF_16LE), PARSER, metadata);
+        assertEquals("tab", xmlResult.metadata.get(CSVParser.DELIMITER));
+        assertEquals("text/tsv; charset=UTF-16LE", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+        assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_TSV, xmlResult.xml);
+    }
+
+    @Test
+    public void testBadCsv() throws Exception {
+        //this causes an IllegalStateException during delimiter detection
+        //when trying to parse with ','; therefore, the parser backs off to '\t'.
+        //this isn't necessarily the best outcome, but we want to make sure
+        //that an IllegalStateException during delimiter guessing doesn't
+        //make the parse fail.
+
+        byte[] csv = ("the,quick\n" +
+                "brown,\"la\"zy\"\n" +
+                "brown,\"dog\n").getBytes(StandardCharsets.UTF_8);
+        Metadata metadata = new Metadata();
+        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv");
+        XMLResult xmlResult = getXML(new ByteArrayInputStream(csv), PARSER, metadata);
+        assertEquals("tab", xmlResult.metadata.get(CSVParser.DELIMITER));
+        assertEquals("text/tsv; charset=ISO-8859-1", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+    }
+
+    private void assertContainsIgnoreWhiteSpaceDiffs(String expected, String xml) {
+        assertContains(expected, xml.replaceAll("[\r\n\t ]", " "));
+    }
+}
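
To run just these tests from the reactor root (standard Maven Surefire
flags; assumes the usual multi-module layout):

    mvn test -pl tika-parsers -Dtest=CSVParserTest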