Posted to commits@tika.apache.org by ta...@apache.org on 2019/02/28 18:29:38 UTC

[tika] branch TIKA-2833 updated (eff7458 -> ff6ea72)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-2833
in repository https://gitbox.apache.org/repos/asf/tika.git.


 discard eff7458  TIKA-2833 -- initial commit with csv detection and swapping out the TXTParser in favor of the CSVParser
 discard d3317f9  TIKA-2833 -- initial commit with csv detection and swapping out the TXTParser in favor of the CSVParser
     new ff6ea72  TIKA-2833 -- initial commit with csv detection and swapping out the TXTParser in favor of the CSVParser

This update added new revisions after undoing existing revisions.
That is to say, some revisions that were in the old version of the
branch are not in the new version.  This situation occurs
when a user --force pushes a change and generates a repository
containing something like this:

 * -- * -- B -- O -- O -- O   (eff7458)
            \
             N -- N -- N   refs/heads/TIKA-2833 (ff6ea72)

You should already have received notification emails for all of the O
revisions, and so the following emails describe only the N revisions
from the common base, B.

Any revisions marked "omit" are not gone; other references still
refer to them.  Any revisions marked "discard" are gone forever.

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:


[tika] 01/01: TIKA-2833 -- initial commit with csv detection and swapping out the TXTParser in favor of the CSVParser

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-2833
in repository https://gitbox.apache.org/repos/asf/tika.git

commit ff6ea7213d800c7e4786023c3f7937a02ac6982f
Author: TALLISON <ta...@apache.org>
AuthorDate: Wed Feb 27 21:52:46 2019 -0500

    TIKA-2833 -- initial commit with csv detection and swapping
    out the TXTParser in favor of the CSVParser
---
 CHANGES.txt                                        |   4 +
 .../tika/extractor/TestEmbeddedDocumentUtil.java   |   8 +-
 .../java/org/apache/tika/parser/csv/CSVParams.java |  72 ++++
 .../java/org/apache/tika/parser/csv/CSVParser.java | 313 --------------
 .../java/org/apache/tika/parser/csv/CSVResult.java |  73 ++++
 .../org/apache/tika/parser/csv/CSVSniffer.java     | 375 +++++++++++++++++
 .../apache/tika/parser/csv/TextAndCSVParser.java   | 460 +++++++++++++++++++++
 .../tika/parser/mail/MailContentHandler.java       |  12 +-
 .../apache/tika/parser/mbox/OutlookPSTParser.java  |  15 +-
 .../parser/microsoft/AbstractPOIFSExtractor.java   |  25 +-
 .../tika/parser/microsoft/OutlookExtractor.java    |   5 +-
 .../services/org.apache.tika.parser.Parser         |   3 +-
 .../tika/config/TikaEncodingDetectorTest.java      |   2 +-
 .../org/apache/tika/parser/csv/CSVSnifferTest.java | 115 ++++++
 ...SVParserTest.java => TextAndCSVParserTest.java} | 103 +++--
 15 files changed, 1217 insertions(+), 368 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 8569191..a283e33 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,9 @@
 Release 1.21 - ????
 
+   * Add CSV detection and replace TXTParser with TextAndCSVParser;
+     users can turn off CSV detection by excluding the TextAndCSVParser
+     and adding back the TXTParser via tika-config (TIKA-2833).
+
    * Add a CSVParser.  CSV detection is currently based solely on filename
      and/or information conveyed via Metadata (TIKA-2826).
 
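For readers tracking this change: switching back to the old behavior would
look roughly like the following tika-config sketch (an assumption based on
the standard parser-exclude mechanism, not part of this commit; the class
names are the ones touched here):

    <?xml version="1.0" encoding="UTF-8"?>
    <properties>
      <parsers>
        <parser class="org.apache.tika.parser.DefaultParser">
          <parser-exclude class="org.apache.tika.parser.csv.TextAndCSVParser"/>
        </parser>
        <parser class="org.apache.tika.parser.txt.TXTParser"/>
      </parsers>
    </properties>
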
diff --git a/tika-app/src/test/java/org/apache/tika/extractor/TestEmbeddedDocumentUtil.java b/tika-app/src/test/java/org/apache/tika/extractor/TestEmbeddedDocumentUtil.java
index 2262998..ac969f2 100644
--- a/tika-app/src/test/java/org/apache/tika/extractor/TestEmbeddedDocumentUtil.java
+++ b/tika-app/src/test/java/org/apache/tika/extractor/TestEmbeddedDocumentUtil.java
@@ -38,9 +38,9 @@ public class TestEmbeddedDocumentUtil {
         Parser p = new AutoDetectParser();
         ParseContext parseContext = new ParseContext();
         parseContext.set(Parser.class, p);
-        Parser txtParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(org.apache.tika.parser.txt.TXTParser.class, parseContext);
+        Parser txtParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(org.apache.tika.parser.csv.TextAndCSVParser.class, parseContext);
         assertNotNull(txtParser);
-        assertEquals(org.apache.tika.parser.txt.TXTParser.class, txtParser.getClass());
+        assertEquals(org.apache.tika.parser.csv.TextAndCSVParser.class, txtParser.getClass());
 
     }
 
@@ -51,8 +51,8 @@ public class TestEmbeddedDocumentUtil {
                 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
         ParseContext parseContext = new ParseContext();
         parseContext.set(Parser.class, wrapper);
-        Parser txtParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(org.apache.tika.parser.txt.TXTParser.class, parseContext);
+        Parser txtParser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(org.apache.tika.parser.csv.TextAndCSVParser.class, parseContext);
         assertNotNull(txtParser);
-        assertEquals(org.apache.tika.parser.txt.TXTParser.class, txtParser.getClass());
+        assertEquals(org.apache.tika.parser.csv.TextAndCSVParser.class, txtParser.getClass());
     }
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVParams.java b/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVParams.java
new file mode 100644
index 0000000..7e0ffb1
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVParams.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.csv;
+
+import java.nio.charset.Charset;
+
+import org.apache.tika.mime.MediaType;
+
+public class CSVParams {
+
+    private MediaType mediaType = null;
+    private Character delimiter = null;
+    private Charset charset = null;
+
+    CSVParams() {}
+    CSVParams(MediaType mediaType, Charset charset) {
+        this.mediaType = mediaType;
+        this.charset = charset;
+    }
+
+    CSVParams(MediaType mediaType, Charset charset, Character delimiter) {
+        this.mediaType = mediaType;
+        this.charset = charset;
+        this.delimiter = delimiter;
+    }
+
+    public boolean isEmpty() {
+        return mediaType == null && delimiter == null && charset == null;
+    }
+
+    public boolean isComplete() {
+        return mediaType != null && delimiter != null && charset != null;
+    }
+
+    public MediaType getMediaType() {
+        return mediaType;
+    }
+
+    public void setMediaType(MediaType mediaType) {
+        this.mediaType = mediaType;
+    }
+
+    public Character getDelimiter() {
+        return delimiter;
+    }
+
+    public void setDelimiter(Character delimiter) {
+        this.delimiter = delimiter;
+    }
+
+    public Charset getCharset() {
+        return charset;
+    }
+
+    public void setCharset(Charset charset) {
+        this.charset = charset;
+    }
+}
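
A quick sketch of how the partial states above are meant to be read
(hypothetical values; the constructors are package-private, so this would
run from within org.apache.tika.parser.csv):

    import java.nio.charset.StandardCharsets;

    import org.apache.tika.mime.MediaType;

    CSVParams params = new CSVParams(MediaType.text("csv"), StandardCharsets.UTF_8);
    System.out.println(params.isEmpty());    // false: mediaType and charset are set
    System.out.println(params.isComplete()); // false: delimiter is still null, so the
                                             // parser will have to sniff it
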
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVParser.java
deleted file mode 100644
index 1e1aa20..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVParser.java
+++ /dev/null
@@ -1,313 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.csv;
-
-import java.io.BufferedInputStream;
-import java.io.BufferedReader;
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.nio.charset.Charset;
-import java.nio.charset.UnsupportedCharsetException;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.commons.csv.CSVFormat;
-import org.apache.commons.csv.CSVRecord;
-import org.apache.tika.Tika;
-import org.apache.tika.config.Field;
-import org.apache.tika.detect.AutoDetectReader;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.IOUtils;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-public class CSVParser extends AbstractParser {
-    private static final String CSV_PREFIX = "csv";
-    public static final Property DELIMITER = Property.externalText(
-            CSV_PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER+"delimiter");
-
-    private static final String TD = "td";
-    private static final String TR = "tr";
-    private static final String TABLE = "table";
-
-    private static final MediaType CSV = MediaType.text("csv");
-    private static final MediaType TSV = MediaType.text("tsv");
-
-    private static final int DEFAULT_MARK_LIMIT = 20000;
-
-    //TODO: add | or make this configurable?
-    private static final char[] CANDIDATE_DELIMITERS = new char[]{',', '\t'};
-
-    private static final Map<Character, String> DELIMITERS = new HashMap<>();
-
-    static {
-        DELIMITERS.put(',', "comma");
-        DELIMITERS.put('\t', "tab");
-    }
-
-
-
-    @Field
-    private int markLimit = DEFAULT_MARK_LIMIT;
-
-    private static final Set<MediaType> SUPPORTED_TYPES =
-            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
-                    CSV, TSV)));
-
-    @Override
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-    @Override
-    public void parse(InputStream stream, ContentHandler handler,
-                      Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
-
-        String override = metadata.get(TikaCoreProperties.CONTENT_TYPE_OVERRIDE);
-        Character overrideDelimiter = null;
-        Charset overrideCharset = null;
-        if (override != null) {
-            MediaType mediaType = MediaType.parse(override);
-            String charset = mediaType.getParameters().get("charset");
-            overrideDelimiter = mediaType.getBaseType().toString().endsWith("tsv") ? '\t' : ',';
-            if (charset != null) {
-                try {
-                    overrideCharset = Charset.forName(charset);
-                } catch (UnsupportedCharsetException e) {
-                    //swallow
-                }
-            }
-        }
-        if (overrideDelimiter == null || overrideCharset == null) {
-            if (!stream.markSupported()) {
-                stream = new BufferedInputStream(stream);
-            }
-        }
-        //buffer the firstx bytes to detect delimiter
-        byte[] firstX = null;
-        if (overrideDelimiter == null) {
-            firstX = readFirstX(stream, markLimit);
-        }
-        Charset charset = null;
-        Reader reader = null;
-        org.apache.commons.csv.CSVParser commonsParser = null;
-        try {
-            //need to detect if nothing has been sent in via override
-            if (overrideCharset == null) {
-                reader = new AutoDetectReader(stream);
-                charset = ((AutoDetectReader) reader).getCharset();
-            } else {
-                reader = new BufferedReader(new InputStreamReader(stream, overrideCharset));
-                charset = overrideCharset;
-            }
-            CSVFormat csvFormat = null;
-            if (overrideDelimiter == null) {
-                csvFormat = guessFormat(firstX, charset, metadata);
-            } else {
-                csvFormat = CSVFormat.EXCEL.withDelimiter(overrideDelimiter);
-            }
-            metadata.set(DELIMITER, DELIMITERS.get(csvFormat.getDelimiter()));
-
-            if (overrideCharset == null || overrideDelimiter == null) {
-                MediaType mediaType = (csvFormat.getDelimiter() == '\t') ? TSV : CSV;
-                MediaType type = new MediaType(mediaType, charset);
-                metadata.set(Metadata.CONTENT_TYPE, type.toString());
-                // deprecated, see TIKA-431
-                metadata.set(Metadata.CONTENT_ENCODING, charset.name());
-            }
-
-            XHTMLContentHandler xhtmlContentHandler = new XHTMLContentHandler(handler, metadata);
-            commonsParser = new org.apache.commons.csv.CSVParser(reader, csvFormat);
-            xhtmlContentHandler.startDocument();
-            xhtmlContentHandler.startElement(TABLE);
-            try {
-                for (CSVRecord row : commonsParser) {
-                    xhtmlContentHandler.startElement(TR);
-                    for (String cell : row) {
-                        xhtmlContentHandler.startElement(TD);
-                        xhtmlContentHandler.characters(cell);
-                        xhtmlContentHandler.endElement(TD);
-                    }
-                    xhtmlContentHandler.endElement(TR);
-                }
-            } catch (IllegalStateException e) {
-                throw new TikaException("exception parsing the csv", e);
-            }
-
-            xhtmlContentHandler.endElement(TABLE);
-            xhtmlContentHandler.endDocument();
-        } finally {
-            if (commonsParser != null) {
-                try {
-                    commonsParser.close();
-                } catch (IOException e) {
-                    //swallow
-                }
-            }
-            IOUtils.closeQuietly(reader);
-        }
-    }
-
-    private byte[] readFirstX(InputStream stream, int markLimit) throws IOException {
-        byte[] bytes = new byte[markLimit];
-
-        try {
-            stream.mark(markLimit);
-            int numRead = IOUtils.read(stream, bytes, 0, bytes.length);
-            if (numRead < markLimit) {
-                byte[] dest = new byte[numRead];
-                System.arraycopy(bytes, 0, dest, 0, numRead);
-                bytes = dest;
-            }
-        } finally {
-            stream.reset();
-        }
-        return bytes;
-    }
-
-    private CSVFormat guessFormat(byte[] bytes, Charset charset, Metadata metadata) throws IOException {
-
-        String mediaTypeString = metadata.get(Metadata.CONTENT_TYPE);
-        char bestDelimiter = (mediaTypeString.endsWith("csv")) ? ',' : '\t';
-        CSVReadTestResult bestResult = null;
-
-        for (char c : CANDIDATE_DELIMITERS) {
-
-            try (Reader reader = new InputStreamReader(new ByteArrayInputStream(bytes), charset)) {
-
-                CSVReadTestResult testResult = attemptCSVRead(c, bytes.length, reader);
-                if (bestResult == null || testResult.isBetterThan(bestResult)) {
-                    bestResult = testResult;
-                    bestDelimiter = c;
-                }
-            }
-        }
-        return CSVFormat.EXCEL.withDelimiter(bestDelimiter);
-    }
-
-    private CSVReadTestResult attemptCSVRead(char delimiter, int bytesTotal, Reader reader) throws IOException {
-
-        //maps <rowLength, numberOfRows>
-        Map<Integer, Integer> colCounts = new HashMap<>();
-        long lastCharacterPosition = -1L;
-        int rowCount = 0;
-        boolean illegalStateException = false;
-        try {
-            org.apache.commons.csv.CSVParser p = new org.apache.commons.csv.CSVParser(reader, CSVFormat.EXCEL.withDelimiter(delimiter));
-
-            for (CSVRecord row : p) {
-                int colCount = row.size();
-                lastCharacterPosition = row.getCharacterPosition();
-                Integer cnt = colCounts.get(colCount);
-                if (cnt == null) {
-                    cnt = 1;
-                } else {
-                    cnt++;
-                }
-                colCounts.put(colCount, cnt);
-                rowCount++;
-            }
-        } catch (IllegalStateException e) {
-            //this could be bad encapsulation -- invalid char between encapsulated token
-            //swallow while guessing
-            illegalStateException = true;
-        }
-
-        int mostCommonColCount = -1;
-        int totalCount = 0;
-        for (Integer count : colCounts.values()) {
-            if (count > mostCommonColCount) {
-                mostCommonColCount = count;
-            }
-            totalCount += count;
-        }
-        double percentMostCommonRowLength = -1.0f;
-        if (totalCount > 0) {
-            percentMostCommonRowLength = (double) mostCommonColCount / (double) totalCount;
-        }
-        return new CSVReadTestResult(bytesTotal, lastCharacterPosition, rowCount, percentMostCommonRowLength, illegalStateException);
-
-    }
-
-    private static class CSVReadTestResult {
-        private final int bytesTotal;
-        private final long bytesParsed;
-        private final int rowCount;
-        //the percentage of the rows that have the
-        //the most common row length -- maybe use stdev?
-        private final double percentMostCommonRowLength;
-        private final boolean illegalStateException;
-
-        public CSVReadTestResult(int bytesTotal, long bytesParsed, int rowCount,
-                                 double percentMostCommonRowLength, boolean illegalStateException) {
-            this.bytesTotal = bytesTotal;
-            this.bytesParsed = bytesParsed;
-            this.rowCount = rowCount;
-            this.percentMostCommonRowLength = percentMostCommonRowLength;
-            this.illegalStateException = illegalStateException;
-        }
-
-        public boolean isBetterThan(CSVReadTestResult bestResult) {
-            if (bestResult == null) {
-                return true;
-            }
-            if (illegalStateException && ! bestResult.illegalStateException) {
-                return false;
-            } else if (! illegalStateException && bestResult.illegalStateException) {
-                return true;
-            }
-            //if there are >= 3 rows in both, select the one with the better
-            //percentMostCommonRowLength
-            if (this.rowCount >= 3 && bestResult.rowCount >= 3) {
-                if (percentMostCommonRowLength > bestResult.percentMostCommonRowLength) {
-                    return true;
-                } else {
-                    return false;
-                }
-            }
-
-            //if there's a big difference between the number of bytes parsed,
-            //pick the one that allowed more parsed bytes
-            if (bytesTotal > 0 && Math.abs((bestResult.bytesParsed - bytesParsed) / bytesTotal) > 0.1f) {
-                if (bytesParsed > bestResult.bytesParsed) {
-                    return true;
-                } else {
-                    return false;
-                }
-            }
-            //add other heuristics as necessary
-
-            //if there's no other information,
-            //default to not better = default
-            return false;
-        }
-    }
-}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVResult.java b/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVResult.java
new file mode 100644
index 0000000..13dad8e
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVResult.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.csv;
+
+import java.util.Objects;
+
+import org.apache.tika.mime.MediaType;
+
+public class CSVResult implements Comparable<CSVResult> {
+
+    static CSVResult TEXT = new CSVResult(1.0, MediaType.TEXT_PLAIN, '\n');
+
+    private final double confidence;
+    private final MediaType mediaType;
+    private final char delimiter;
+
+    public CSVResult(double confidence, MediaType mediaType, char delimiter) {
+        this.confidence = confidence;
+        this.mediaType = mediaType;
+        this.delimiter = delimiter;
+    }
+
+    public MediaType getMediaType() {
+        return mediaType;
+    }
+
+    public Character getDelimiter() {
+        return delimiter;
+    }
+
+    /**
+     * Sorts in descending order of confidence.
+     * @param o the other result to compare against
+     * @return a negative integer if this result has higher confidence than {@code o}
+     */
+    @Override
+    public int compareTo(CSVResult o) {
+        return Double.compare(o.confidence, this.confidence);
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+        CSVResult csvResult = (CSVResult) o;
+        return Double.compare(csvResult.confidence, confidence) == 0 &&
+                delimiter == csvResult.delimiter &&
+                mediaType.equals(csvResult.mediaType);
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(confidence, mediaType, delimiter);
+    }
+
+    public double getConfidence() {
+        return confidence;
+    }
+}
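
A small sketch of the sort order (hypothetical confidences):

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.Collections;
    import java.util.List;

    import org.apache.tika.mime.MediaType;

    List<CSVResult> results = new ArrayList<>(Arrays.asList(
            new CSVResult(0.3, MediaType.text("csv"), ','),
            new CSVResult(0.8, MediaType.text("tsv"), '\t')));
    Collections.sort(results);
    //compareTo sorts descending by confidence, so results.get(0)
    //is the 0.8 text/tsv result
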
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVSniffer.java b/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVSniffer.java
new file mode 100644
index 0000000..b463272
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVSniffer.java
@@ -0,0 +1,375 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.csv;
+
+import java.io.BufferedReader;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.PushbackReader;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.io.input.ProxyReader;
+import org.apache.commons.lang3.mutable.MutableInt;
+import org.apache.tika.mime.MediaType;
+
+class CSVSniffer {
+    private static final int DEFAULT_MARK_LIMIT = 10000;
+    private static final int PUSH_BACK = 2;
+    static final int EOF = -1;
+    static final int NEW_LINE = '\n';
+    static final int CARRIAGE_RETURN = '\r';
+    private static final int SPACE = ' ';
+
+    private final char[] delimiters;
+    private final int markLimit;
+
+    CSVSniffer(char[] delimiters) {
+        this(DEFAULT_MARK_LIMIT, delimiters);
+    }
+
+    CSVSniffer(int markLimit, char[] delimiters) {
+        this.markLimit = markLimit;
+        this.delimiters = delimiters;
+    }
+
+    List<CSVResult> sniff(Reader reader) throws IOException {
+        if (! reader.markSupported()) {
+            reader = new BufferedReader(reader);
+        }
+        List<CSVResult> ret = new ArrayList<>();
+        for (char delimiter : delimiters) {
+            reader.mark(markLimit);
+            try {
+                CSVResult result = new Snifflet(delimiter).sniff(reader);
+                ret.add(result);
+            } finally {
+                reader.reset();
+            }
+        }
+        Collections.sort(ret);
+        return ret;
+    }
+
+    /**
+     * Gets the best result with confidence > 0.
+     *
+     * @param reader reader to sniff
+     * @return the best result with confidence > 0; if none exists, returns {@link CSVResult#TEXT}
+     * @throws IOException on failure to read
+     */
+    CSVResult getBest(Reader reader) throws IOException {
+        List<CSVResult> results = sniff(reader);
+
+        if (results.size() > 0) {
+            CSVResult result = results.get(0);
+            if (result.getConfidence() > 0.0) {
+                return result;
+            }
+        }
+        return CSVResult.TEXT;
+    }
+
+
+    //inner class that tests a single hypothesis/combination
+    //of parameters for delimiter and quote character
+    //this will throw an EOF before reading beyond the
+    //markLimit number of characters (not bytes!)
+    private class Snifflet {
+
+        private final char delimiter;
+
+        //hardcode this for now
+        private final char quoteCharacter = '"';
+
+        Map<Integer, MutableInt> rowLengthCounts = new HashMap<>();
+        int charsRead = 0;
+        int colCount = 0;
+        int encapsulated = 0; //number of cells that are encapsulated in dquotes (for now)
+        boolean parseException = false;
+
+        public Snifflet(char delimiter) {
+            this.delimiter = delimiter;
+        }
+
+        CSVResult sniff(Reader r) throws IOException {
+            boolean eof = false;
+            boolean hitMarkLimit = false;
+            int lastC = -1;
+            StringBuilder unquoted = new StringBuilder();
+            try (PushbackReader reader = new PushbackReader(new CloseShieldReader(r), PUSH_BACK)) {
+                int c = read(reader);
+                while (c != EOF) {
+                    if (c == quoteCharacter) {
+                        handleUnquoted(unquoted);
+                        //test to make sure there isn't an unencapsulated quote character in the middle of a cell
+                        if (lastC > -1 && lastC != delimiter && lastC != NEW_LINE && lastC != CARRIAGE_RETURN) {
+                            parseException = true;
+                            return calcResult();
+                        }
+                        //TODO: test to make sure the cell doesn't start with an escaped quote, e.g. ""the quick brown cat"
+                        boolean correctlyEncapsulated = consumeQuoted(reader, quoteCharacter);
+                        if (! correctlyEncapsulated) {
+                            parseException = true;
+                            return calcResult();
+                        }
+                    } else if (c == delimiter) {
+                        handleUnquoted(unquoted);
+                        endColumn();
+                        consumeSpaceCharacters(reader);
+                    } else if (c == NEW_LINE || c == CARRIAGE_RETURN) {
+                        if (unquoted.length() > 0) {
+                            endColumn();
+                        }
+                        handleUnquoted(unquoted);
+                        endRow();
+                        consumeNewLines(reader);
+                    } else {
+                        unquoted.append((char) c);
+                    }
+                    lastC = c;
+                    c = read(reader);
+                }
+            } catch (HitMarkLimitException e) {
+                hitMarkLimit = true;
+            } catch (UnsurprisingEOF e) {
+                //totally ignore
+            } catch (EOFException e) {
+                //the consume* methods throw this to avoid
+                //having to check for -1 on every read and
+                //to avoid relying on potentially wonky
+                //input streams that don't consistently return -1
+                //on reads after the first EOF.
+                //Yes. That's a thing.
+                eof = true;
+            } finally {
+                r.reset();
+            }
+            //if you've hit the marklimit or an eof on a truncated file
+            //don't add the last row's info
+            if (!hitMarkLimit && !eof && lastC != NEW_LINE && lastC != CARRIAGE_RETURN) {
+                handleUnquoted(unquoted);
+                endColumn();
+                endRow();
+            }
+            return calcResult();
+        }
+
+        private CSVResult calcResult() {
+            double confidence = getConfidence();
+            MediaType mediaType = TextAndCSVParser.CSV;
+            if (delimiter == '\t') {
+                mediaType = TextAndCSVParser.TSV;
+            }
+            return new CSVResult(confidence, mediaType, delimiter);
+        }
+
+        private void handleUnquoted(StringBuilder unquoted) {
+            if (unquoted.length() > 0) {
+                unquoted(unquoted.toString());
+                unquoted.setLength(0);
+            }
+        }
+
+        void consumeSpaceCharacters(PushbackReader reader) throws IOException {
+            int c = read(reader);
+            while (c == SPACE) {
+                c = read(reader);
+            }
+            if (c == EOF) {
+                throw new UnsurprisingEOF();
+            }
+            unread(reader, c);
+        }
+
+
+        /**
+         *
+         * @param reader reader to consume from
+         * @param quoteCharacter the encapsulation/quote character
+         * @return whether or not this was a correctly encapsulated cell
+         * @throws UnsurprisingEOF if the file ended immediately after the close quote
+         * @throws EOFException if the file ended in the middle of the encapsulated section
+         * @throws IOException on other IOExceptions
+         */
+        boolean consumeQuoted(PushbackReader reader, int quoteCharacter) throws IOException {
+            //this currently assumes excel "escaping" of double quotes:
+            //'the " quick' -> "the "" quick"
+            //we can make this more interesting later with other
+            //escaping options
+            int c = read(reader);
+            while (c != -1) {
+                if (c == quoteCharacter) {
+                    int nextC = read(reader);
+                    if (nextC == EOF) {
+                        encapsulated++;
+                        endColumn();
+                        throw new UnsurprisingEOF();
+                    } else if (nextC != quoteCharacter) {
+                        encapsulated++;
+                        endColumn();
+                        unread(reader, nextC);
+                        consumeSpaceCharacters(reader);
+                        //now make sure that the next character is eof, \r\n
+                        //or a delimiter
+                        nextC = read(reader);
+                        if (nextC == EOF) {
+                            throw new UnsurprisingEOF();
+                        } else if (nextC == NEW_LINE || nextC == CARRIAGE_RETURN) {
+                            unread(reader, nextC);
+                            return true;
+                        } else if (nextC != delimiter) {
+                            unread(reader, nextC);
+                            return false;
+                        }
+                        unread(reader, nextC);
+                        return true;
+                    }
+                }
+                c = read(reader);
+            }
+            throw new EOFException();
+        }
+
+        private int read(PushbackReader reader) throws IOException {
+            if (charsRead >= markLimit - 1) {
+                throw new HitMarkLimitException();
+            }
+            int c = reader.read();
+            if (c == EOF) {
+                return EOF;
+            }
+            charsRead++;
+            return c;
+        }
+
+        private void unread(PushbackReader reader, int c) throws IOException {
+            if (c != EOF) {
+                reader.unread(c);
+                charsRead--;
+            }
+        }
+        //consume all consecutive '\r\n' in any order
+        void consumeNewLines(PushbackReader reader) throws IOException {
+            int c = read(reader);
+            while (c == NEW_LINE || c == CARRIAGE_RETURN) {
+                c = read(reader);
+            }
+            if (c == EOF) {
+                throw new EOFException();
+            }
+            unread(reader, c);
+            return;
+        }
+
+
+        void endColumn() {
+            colCount++;
+        }
+
+        void endRow() {
+            MutableInt cnt = rowLengthCounts.get(colCount);
+            if (cnt == null) {
+                cnt = new MutableInt(1);
+                rowLengthCounts.put(colCount, cnt);
+            } else {
+                cnt.increment();
+            }
+            colCount = 0;
+        }
+
+        void unquoted(String string) {
+            //TODO -- do some analysis to make sure you don't have
+            //large tokens like 2,3,2,3,2,3,
+        }
+
+        double getConfidence() {
+            double confidence = 0.0f;
+
+            if (parseException) {
+                return -1.0f;
+            }
+            //TODO -- add tests for long tokens containing
+            //other delimiters, e.g. the,quick,brown,fox as a token
+            //when testing '\t'
+            double colCountConsistencyConf = calculateColumnCountConsistency();
+            if (colCountConsistencyConf > -1.0) {
+                confidence = colCountConsistencyConf;
+            }
+            //the idea is that if there are a bunch of encapsulated
+            //cells, that should outweigh column-length inconsistency.
+            //this particular formula offers a small initial increase
+            //that asymptotically approaches 1.0
+            double encapsulatedBonus = 0;
+            if (encapsulated > 0) {
+                encapsulatedBonus = 1.0-(1.0d/Math.pow(encapsulated, 0.2));
+            }
+            return Math.min(confidence+encapsulatedBonus, 1.0);
+        }
+
+        private double calculateColumnCountConsistency() {
+            int max = -1;
+            int totalRows = 0;
+            //find the most common row
+            for (Map.Entry<Integer, MutableInt> e : rowLengthCounts.entrySet()) {
+                int numCols = e.getKey();
+                int count = e.getValue().intValue();
+                //require that numCols > 1 so that you had at least
+                //one delimiter in that row
+                if (numCols > 1 && count > max) {
+                    max = count;
+                }
+                totalRows += count;
+            }
+            //if there's not enough info
+            if (max < 0 || totalRows < 3) {
+                return 0.0;
+            }
+
+            //TODO: replace this vague heuristic step function with something continuous
+            double consistency = (double)max/(double)totalRows;
+            return ((1d-(1d/Math.pow(totalRows,0.3)))*consistency);
+        }
+
+    }
+
+    private static class UnsurprisingEOF extends EOFException {
+
+    }
+
+    private static class HitMarkLimitException extends EOFException {
+
+    }
+
+    private class CloseShieldReader extends ProxyReader {
+        public CloseShieldReader(Reader r) {
+            super(r);
+        }
+
+        @Override
+        public void close() throws IOException {
+            //do nothing
+        }
+    }
+}
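
A minimal sketch of the sniffer in action (hypothetical input; the class is
package-private, so this would run from within org.apache.tika.parser.csv):

    import java.io.BufferedReader;
    import java.io.Reader;
    import java.io.StringReader;

    CSVSniffer sniffer = new CSVSniffer(new char[]{',', '\t'});
    Reader reader = new BufferedReader(
            new StringReader("a,b,c\n1,2,3\n4,5,6\n7,8,9\n"));
    CSVResult best = sniffer.getBest(reader);
    //four consistent comma-delimited rows: best.getMediaType() should be
    //text/csv with best.getDelimiter() == ',' and confidence > 0
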
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java
new file mode 100644
index 0000000..12db86b
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java
@@ -0,0 +1,460 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.csv;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.csv.CSVFormat;
+import org.apache.commons.csv.CSVRecord;
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.config.Field;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.HttpHeaders;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractEncodingDetectorParser;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class TextAndCSVParser extends AbstractEncodingDetectorParser {
+
+    private static final String CSV_PREFIX = "csv";
+    private static final String CHARSET = "charset";
+    private static final String DELIMITER = "delimiter";
+    public static final Property DELIMITER_PROPERTY = Property.externalText(
+            CSV_PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER+DELIMITER);
+
+    private static final String TD = "td";
+    private static final String TR = "tr";
+    private static final String TABLE = "table";
+
+    static final MediaType CSV = MediaType.text("csv");
+    static final MediaType TSV = MediaType.text("tsv");
+
+    private static final int DEFAULT_MARK_LIMIT = 20000;
+
+    private static final char[] DEFAULT_DELIMITERS = new char[]{',', '\t'};
+
+    private static final Map<Character, String> CHAR_TO_STRING_DELIMITER_MAP = new HashMap<>();
+    private static final Map<String, Character> STRING_TO_CHAR_DELIMITER_MAP = new HashMap<>();
+
+    static {
+        CHAR_TO_STRING_DELIMITER_MAP.put(',', "comma");
+        CHAR_TO_STRING_DELIMITER_MAP.put('\t', "tab");
+        CHAR_TO_STRING_DELIMITER_MAP.put('|', "pipe");
+        CHAR_TO_STRING_DELIMITER_MAP.put(';', "semicolon");
+        CHAR_TO_STRING_DELIMITER_MAP.put(':', "colon");
+    }
+
+    static {
+        for (Map.Entry<Character, String> e : CHAR_TO_STRING_DELIMITER_MAP.entrySet()) {
+            STRING_TO_CHAR_DELIMITER_MAP.put(e.getValue(), e.getKey());
+        }
+    }
+    public TextAndCSVParser() {
+        super();
+    }
+
+    public TextAndCSVParser(EncodingDetector encodingDetector) {
+        super(encodingDetector);
+    }
+    private char[] delimiters = DEFAULT_DELIMITERS;
+
+    @Field
+    private int markLimit = DEFAULT_MARK_LIMIT;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    CSV, TSV, MediaType.TEXT_PLAIN)));
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+                      Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
+
+        CSVParams params = getOverride(metadata);
+        Reader reader = null;
+        Charset charset = null;
+        if (! params.isComplete()) {
+            reader = detect(params, stream, metadata, context);
+            if (params.getCharset() != null) {
+                charset = params.getCharset();
+            } else {
+                charset = ((AutoDetectReader) reader).getCharset();
+            }
+        } else {
+            reader = new BufferedReader(new InputStreamReader(stream, params.getCharset()));
+            charset = params.getCharset();
+        }
+        //if this is text or a non-csv/tsv subtype of text,
+        //treat it as plain text and be done
+        //TODO -- handle the case where it was already detected
+        //as a non-csv subtype of text
+        if (! params.getMediaType().getBaseType().equals(CSV) &&
+            ! params.getMediaType().getBaseType().equals(TSV)) {
+            handleText(reader, charset, handler, metadata);
+            return;
+        }
+
+        updateMetadata(params, metadata);
+
+        CSVFormat csvFormat = CSVFormat.EXCEL.withDelimiter(params.getDelimiter());
+        metadata.set(DELIMITER_PROPERTY, CHAR_TO_STRING_DELIMITER_MAP.get(csvFormat.getDelimiter()));
+
+        XHTMLContentHandler xhtmlContentHandler = new XHTMLContentHandler(handler, metadata);
+        try (org.apache.commons.csv.CSVParser commonsParser =
+                     new org.apache.commons.csv.CSVParser(reader, csvFormat)) {
+            xhtmlContentHandler.startDocument();
+            xhtmlContentHandler.startElement(TABLE);
+            try {
+                for (CSVRecord row : commonsParser) {
+                    xhtmlContentHandler.startElement(TR);
+                    for (String cell : row) {
+                        xhtmlContentHandler.startElement(TD);
+                        xhtmlContentHandler.characters(cell);
+                        xhtmlContentHandler.endElement(TD);
+                    }
+                    xhtmlContentHandler.endElement(TR);
+                }
+            } catch (IllegalStateException e) {
+                //if there's a parse exception
+                //try to get the rest of the content...treat it as text for now
+                //There will be some content lost because of buffering.
+                //TODO -- figure out how to improve this
+                xhtmlContentHandler.endElement(TABLE);
+                xhtmlContentHandler.startElement("div", "name", "after exception");
+                handleText(reader, xhtmlContentHandler);
+                xhtmlContentHandler.endElement("div");
+                xhtmlContentHandler.endDocument();
+                //TODO -- consider dumping what's left in the reader as text
+                throw new TikaException("exception parsing the csv", e);
+            }
+
+            xhtmlContentHandler.endElement(TABLE);
+            xhtmlContentHandler.endDocument();
+        }
+    }
+
+    private void handleText(Reader reader, Charset charset,
+                            ContentHandler handler, Metadata metadata)
+            throws SAXException, IOException, TikaException {
+        //try to use the already-detected content type; it could be a
+        //subtype of text/plain such as vcal, etc.
+        String incomingMime = metadata.get(Metadata.CONTENT_TYPE);
+        MediaType mediaType = MediaType.TEXT_PLAIN;
+        if (incomingMime != null) {
+            MediaType tmpMediaType = MediaType.parse(incomingMime);
+            if (tmpMediaType != null) {
+                mediaType = tmpMediaType;
+            }
+        }
+        MediaType type = new MediaType(mediaType, charset);
+        metadata.set(Metadata.CONTENT_TYPE, type.toString());
+        // deprecated, see TIKA-431
+        metadata.set(Metadata.CONTENT_ENCODING, charset.name());
+
+        XHTMLContentHandler xhtml =
+                new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        handleText(reader, xhtml);
+        xhtml.endDocument();
+    }
+
+    private static void handleText(Reader reader, XHTMLContentHandler xhtml)
+            throws SAXException, IOException {
+        xhtml.startElement("p");
+        char[] buffer = new char[4096];
+        int n = reader.read(buffer);
+        while (n != -1) {
+            xhtml.characters(buffer, 0, n);
+            n = reader.read(buffer);
+        }
+        xhtml.endElement("p");
+
+    }
+
+    private void updateMetadata(CSVParams params, Metadata metadata) {
+        MediaType mediaType = (params.getDelimiter() == '\t') ? TSV : CSV;
+        Map<String, String> attrs = new HashMap<>();
+        attrs.put(CHARSET, params.getCharset().name());
+        if (params.getDelimiter() != null) {
+            if (CHAR_TO_STRING_DELIMITER_MAP.containsKey(params.getDelimiter())) {
+                attrs.put(DELIMITER, CHAR_TO_STRING_DELIMITER_MAP.get(params.getDelimiter()));
+            } else {
+                attrs.put(DELIMITER, Integer.toString((int)params.getDelimiter().charValue()));
+            }
+        }
+        MediaType type = new MediaType(mediaType, attrs);
+        metadata.set(Metadata.CONTENT_TYPE, type.toString());
+        // deprecated, see TIKA-431
+        metadata.set(Metadata.CONTENT_ENCODING, params.getCharset().name());
+
+    }
+
+    private Reader detect(CSVParams params, InputStream stream,
+                        Metadata metadata, ParseContext context) throws IOException, TikaException {
+        //if the file was already identified as something other than
+        //txt/csv/tsv, don't even try to detect whether it is csv
+        String mediaString = metadata.get(Metadata.CONTENT_TYPE);
+        if (mediaString != null) {
+            MediaType mediaType = MediaType.parse(mediaString);
+            if (! SUPPORTED_TYPES.contains(mediaType.getBaseType())) {
+                params.setMediaType(mediaType);
+                return new AutoDetectReader(
+                        new CloseShieldInputStream(stream),
+                        metadata, getEncodingDetector(context));
+            }
+        }
+        Reader reader = null;
+        if (params.getCharset() == null) {
+            reader = new AutoDetectReader(
+                    new CloseShieldInputStream(stream),
+                    metadata, getEncodingDetector(context));
+            params.setCharset(((AutoDetectReader)reader).getCharset());
+            if (params.isComplete()) {
+                return reader;
+            }
+        } else {
+            reader = new BufferedReader(new InputStreamReader(
+                    new CloseShieldInputStream(stream), params.getCharset()));
+        }
+
+        if (params.getDelimiter() == null &&
+                (params.getMediaType() == null ||
+                        isCSVOrTSV(params.getMediaType()))) {
+
+            CSVSniffer sniffer = new CSVSniffer(delimiters);
+            CSVResult result = sniffer.getBest(reader);
+            //we should require a higher confidence if the content-type
+            //is text/plain -- e.g. if the file name ends in .txt or
+            //the parent parser has an indication that this is txt
+            //(as in mail attachment headers)
+            params.setMediaType(result.getMediaType());
+            params.setDelimiter(result.getDelimiter());
+        }
+        return reader;
+    }
+
+    private CSVParams getOverride(Metadata metadata) {
+        String override = metadata.get(TikaCoreProperties.CONTENT_TYPE_OVERRIDE);
+        if (override == null) {
+            return new CSVParams();
+        }
+        MediaType mediaType = MediaType.parse(override);
+        if (mediaType == null) {
+            return new CSVParams();
+        }
+        String charsetString = mediaType.getParameters().get(CHARSET);
+        Charset charset = null;
+        if (charsetString != null) {
+            try {
+                charset = Charset.forName(charsetString);
+            } catch (UnsupportedCharsetException e) {
+
+            }
+        }
+        if (! isCSVOrTSV(mediaType)) {
+            return new CSVParams(mediaType, charset);
+        }
+
+        String delimiterString = mediaType.getParameters().get(DELIMITER);
+        if (delimiterString == null) {
+            return new CSVParams(mediaType, charset);
+        }
+        if (STRING_TO_CHAR_DELIMITER_MAP.containsKey(delimiterString)) {
+            return new CSVParams(mediaType, charset, (char) STRING_TO_CHAR_DELIMITER_MAP.get(delimiterString));
+        }
+        if (delimiterString.length() == 1) {
+            return new CSVParams(mediaType, charset, delimiterString.charAt(0));
+        }
+        //TODO: log bad/unrecognized delimiter string
+        return new CSVParams(mediaType, charset);
+    }
+
+    static boolean isCSVOrTSV(MediaType mediaType) {
+        if (mediaType == null) {
+            return false;
+        }
+        if (mediaType.getBaseType().equals(TSV) ||
+                mediaType.getBaseType().equals(CSV)) {
+            return true;
+        }
+        return false;
+    }
+    private byte[] readFirstX(InputStream stream, int markLimit) throws IOException {
+        byte[] bytes = new byte[markLimit];
+
+        try {
+            stream.mark(markLimit);
+            int numRead = IOUtils.read(stream, bytes, 0, bytes.length);
+            if (numRead < markLimit) {
+                byte[] dest = new byte[numRead];
+                System.arraycopy(bytes, 0, dest, 0, numRead);
+                bytes = dest;
+            }
+        } finally {
+            stream.reset();
+        }
+        return bytes;
+    }
+
+    private CSVFormat guessFormat(byte[] bytes, Charset charset, Metadata metadata) throws IOException {
+
+        String mediaTypeString = metadata.get(Metadata.CONTENT_TYPE);
+        char bestDelimiter = (mediaTypeString.endsWith("csv")) ? ',' : '\t';
+        CSVReadTestResult bestResult = null;
+
+        for (char c : DEFAULT_DELIMITERS) {
+
+            try (Reader reader = new InputStreamReader(new ByteArrayInputStream(bytes), charset)) {
+
+                CSVReadTestResult testResult = attemptCSVRead(c, bytes.length, reader);
+                if (bestResult == null || testResult.isBetterThan(bestResult)) {
+                    bestResult = testResult;
+                    bestDelimiter = c;
+                }
+            }
+        }
+        return CSVFormat.EXCEL.withDelimiter(bestDelimiter);
+    }
+
+    private CSVReadTestResult attemptCSVRead(char delimiter, int bytesTotal, Reader reader) throws IOException {
+
+        //maps <rowLength, numberOfRows>
+        Map<Integer, Integer> colCounts = new HashMap<>();
+        long lastCharacterPosition = -1L;
+        int rowCount = 0;
+        boolean illegalStateException = false;
+        try {
+            org.apache.commons.csv.CSVParser p = new org.apache.commons.csv.CSVParser(reader, CSVFormat.EXCEL.withDelimiter(delimiter));
+
+            for (CSVRecord row : p) {
+                int colCount = row.size();
+                lastCharacterPosition = row.getCharacterPosition();
+                Integer cnt = colCounts.get(colCount);
+                if (cnt == null) {
+                    cnt = 1;
+                } else {
+                    cnt++;
+                }
+                colCounts.put(colCount, cnt);
+                rowCount++;
+            }
+        } catch (IllegalStateException e) {
+            //this could be bad encapsulation -- invalid char between encapsulated token
+            //swallow while guessing
+            illegalStateException = true;
+        }
+
+        int mostCommonColCount = -1;
+        int totalCount = 0;
+        for (Integer count : colCounts.values()) {
+            if (count > mostCommonColCount) {
+                mostCommonColCount = count;
+            }
+            totalCount += count;
+        }
+        double percentMostCommonRowLength = -1.0f;
+        if (totalCount > 0) {
+            percentMostCommonRowLength = (double) mostCommonColCount / (double) totalCount;
+        }
+        return new CSVReadTestResult(bytesTotal, lastCharacterPosition, rowCount, percentMostCommonRowLength, illegalStateException);
+
+    }
+
+    private static class CSVReadTestResult {
+        private final int bytesTotal;
+        private final long bytesParsed;
+        private final int rowCount;
+        //the percentage of rows that have
+        //the most common row length -- maybe use stdev?
+        private final double percentMostCommonRowLength;
+        private final boolean illegalStateException;
+
+        public CSVReadTestResult(int bytesTotal, long bytesParsed, int rowCount,
+                                 double percentMostCommonRowLength, boolean illegalStateException) {
+            this.bytesTotal = bytesTotal;
+            this.bytesParsed = bytesParsed;
+            this.rowCount = rowCount;
+            this.percentMostCommonRowLength = percentMostCommonRowLength;
+            this.illegalStateException = illegalStateException;
+        }
+
+        public boolean isBetterThan(CSVReadTestResult bestResult) {
+            if (bestResult == null) {
+                return true;
+            }
+            if (illegalStateException && ! bestResult.illegalStateException) {
+                return false;
+            } else if (! illegalStateException && bestResult.illegalStateException) {
+                return true;
+            }
+            //if there are >= 3 rows in both, select the one with the better
+            //percentMostCommonRowLength
+            if (this.rowCount >= 3 && bestResult.rowCount >= 3) {
+                if (percentMostCommonRowLength > bestResult.percentMostCommonRowLength) {
+                    return true;
+                } else {
+                    return false;
+                }
+            }
+
+            //if there's a big difference between the numbers of bytes parsed,
+            //pick the one that parsed more bytes
+            //note: cast to double so the division isn't integer division
+            if (bytesTotal > 0 && Math.abs((double) (bestResult.bytesParsed - bytesParsed) / bytesTotal) > 0.1) {
+                if (bytesParsed > bestResult.bytesParsed) {
+                    return true;
+                } else {
+                    return false;
+                }
+            }
+            //add other heuristics as necessary
+
+            //if there's no other distinguishing information,
+            //default to "not better"
+            return false;
+        }
+    }
+
+
+}
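
For context, the heuristic above might be exercised roughly as follows.
This is a minimal editorial sketch, not the committed code: "readTest" is a
hypothetical name standing in for the method whose body appears above, and
"markLimit" is assumed to be the parser's existing mark/reset bound.

    CSVReadTestResult best = null;
    char bestDelimiter = ',';
    for (char candidate : new char[]{',', '\t'}) {
        reader.mark(markLimit);                        //reader must support mark/reset
        CSVReadTestResult result = readTest(reader, candidate);
        reader.reset();
        if (result.isBetterThan(best)) {               //heuristic comparison above
            best = result;
            bestDelimiter = candidate;
        }
    }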
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index 80884eb..6bee05c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -45,6 +45,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.csv.TextAndCSVParser;
 import org.apache.tika.parser.html.HtmlParser;
 import org.apache.tika.parser.rtf.RTFParser;
 import org.apache.tika.parser.txt.TXTParser;
@@ -584,6 +585,7 @@ class MailContentHandler implements ContentHandler {
     private void handleInlineBodyPart(BodyContents part) throws MimeException, IOException {
         String contentType = part.metadata.get(Metadata.CONTENT_TYPE);
         Parser parser = null;
+        boolean inlineText = false;
         if (MediaType.TEXT_HTML.toString().equalsIgnoreCase(contentType)) {
             parser =
                     EmbeddedDocumentUtil.tryToFindExistingLeafParser(HtmlParser.class, parseContext);
@@ -593,6 +595,10 @@ class MailContentHandler implements ContentHandler {
         } else if (MediaType.TEXT_PLAIN.toString().equalsIgnoreCase(contentType)) {
             parser =
                     EmbeddedDocumentUtil.tryToFindExistingLeafParser(TXTParser.class, parseContext);
+            if (parser == null) {
+                parser = EmbeddedDocumentUtil.tryToFindExistingLeafParser(TextAndCSVParser.class, parseContext);
+                inlineText = true;
+            }
         }
 
 
@@ -605,10 +611,14 @@ class MailContentHandler implements ContentHandler {
 
             //parse inline
             try {
+                Metadata inlineMetadata = new Metadata();
+                if (inlineText) {
+                    inlineMetadata.set(TikaCoreProperties.CONTENT_TYPE_OVERRIDE, MediaType.TEXT_PLAIN.toString());
+                }
                 parser.parse(
                         new ByteArrayInputStream(part.bytes),
                         new EmbeddedContentHandler(new BodyContentHandler(handler)),
-                        new Metadata(), parseContext
+                        inlineMetadata, parseContext
                 );
             } catch (SAXException | TikaException e) {
                 throw new MimeException(e);
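
The pattern in this hunk -- seeding the embedded Metadata with
TikaCoreProperties.CONTENT_TYPE_OVERRIDE -- is how callers keep the new
TextAndCSVParser from re-running CSV detection on content already known to
be plain text. A minimal sketch of the hint, using only identifiers that
appear in this diff:

    Metadata inlineMetadata = new Metadata();
    inlineMetadata.set(TikaCoreProperties.CONTENT_TYPE_OVERRIDE,
            MediaType.TEXT_PLAIN.toString());
    parser.parse(new ByteArrayInputStream(part.bytes),
            new EmbeddedContentHandler(new BodyContentHandler(handler)),
            inlineMetadata, parseContext);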
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
index 61d7bac..7c2c3f3 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
@@ -21,7 +21,6 @@ import static java.nio.charset.StandardCharsets.UTF_8;
 import static java.util.Collections.singleton;
 
 import java.io.ByteArrayInputStream;
-import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Set;
@@ -32,12 +31,9 @@ import com.pff.PSTFile;
 import com.pff.PSTFolder;
 import com.pff.PSTMessage;
 import com.pff.PSTRecipient;
-import org.apache.poi.ss.formula.functions.T;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
-import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Message;
 import org.apache.tika.metadata.Metadata;
@@ -46,12 +42,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.microsoft.OutlookExtractor;
-import org.apache.tika.parser.rtf.RTFParser;
-import org.apache.tika.parser.txt.TXTParser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -219,8 +210,10 @@ public class OutlookPSTParser extends AbstractParser {
         //the underlying bytes from the pstMail object...
 
         byte[] mailContent = pstMail.getBody().getBytes(UTF_8);
-        mailMetadata.set(TikaCoreProperties.CONTENT_TYPE_OVERRIDE, MediaType.TEXT_PLAIN.toString());
-        embeddedExtractor.parseEmbedded(new ByteArrayInputStream(mailContent), handler, mailMetadata, true);
+        mailMetadata.set(TikaCoreProperties.CONTENT_TYPE_OVERRIDE,
+                MediaType.TEXT_PLAIN.toString());
+        embeddedExtractor.parseEmbedded(new ByteArrayInputStream(mailContent),
+                handler, mailMetadata, true);
     }
 
     private void parseMailAttachments(XHTMLContentHandler xhtml, PSTMessage email,
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index 35e9ff6..c9c409d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -104,24 +104,33 @@ abstract class AbstractPOIFSExtractor {
                                           String relationshipID, ClassID storageClassID, String mediaType, XHTMLContentHandler xhtml,
                                           boolean outputHtml)
             throws IOException, SAXException, TikaException {
+        handleEmbeddedResource(resource, new Metadata(), filename,
+                relationshipID, storageClassID, mediaType, xhtml, outputHtml);
+    }
+
+    protected void handleEmbeddedResource(TikaInputStream resource, Metadata embeddedMetadata, String filename,
+                String relationshipID, ClassID storageClassID, String mediaType, XHTMLContentHandler xhtml,
+        boolean outputHtml)
+            throws IOException, SAXException, TikaException {
+
         try {
-            Metadata metadata = new Metadata();
+
             if (filename != null) {
-                metadata.set(Metadata.TIKA_MIME_FILE, filename);
-                metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
+                embeddedMetadata.set(Metadata.TIKA_MIME_FILE, filename);
+                embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, filename);
             }
             if (relationshipID != null) {
-                metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, relationshipID);
+                embeddedMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, relationshipID);
             }
             if (storageClassID != null) {
-                metadata.set(Metadata.EMBEDDED_STORAGE_CLASS_ID, storageClassID.toString());
+                embeddedMetadata.set(Metadata.EMBEDDED_STORAGE_CLASS_ID, storageClassID.toString());
             }
             if (mediaType != null) {
-                metadata.set(Metadata.CONTENT_TYPE, mediaType);
+                embeddedMetadata.set(Metadata.CONTENT_TYPE, mediaType);
             }
 
-            if (embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
-                embeddedDocumentUtil.parseEmbedded(resource, xhtml, metadata, outputHtml);
+            if (embeddedDocumentUtil.shouldParseEmbedded(embeddedMetadata)) {
+                embeddedDocumentUtil.parseEmbedded(resource, xhtml, embeddedMetadata, outputHtml);
             }
         } finally {
             resource.close();
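
With this overload, a caller can pre-populate the embedded Metadata before
delegating; the OutlookExtractor change below does exactly this. A minimal
sketch of the new overload in use (identifiers taken from this diff):

    Metadata chunkMetadata = new Metadata();
    chunkMetadata.set(TikaCoreProperties.CONTENT_TYPE_OVERRIDE,
            MediaType.TEXT_PLAIN.toString());
    handleEmbeddedResource(TikaInputStream.get(data), chunkMetadata,
            null, "text-body", null,
            MediaType.TEXT_PLAIN.toString(), xhtml, true);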
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 5d13351..c2e27d6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -376,8 +376,11 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
         if (textChunk != null) {
             byte[] data = getValue(textChunk);
             if (data != null) {
+                Metadata chunkMetadata = new Metadata();
+                chunkMetadata.set(TikaCoreProperties.CONTENT_TYPE_OVERRIDE,
+                        MediaType.TEXT_PLAIN.toString());
                 handleEmbeddedResource(
-                        TikaInputStream.get(data),
+                        TikaInputStream.get(data), chunkMetadata, null,
                         "text-body", null,
                         MediaType.TEXT_PLAIN.toString(), xhtml, true
                 );
diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index f33c4d6..f90fe8f 100644
--- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -19,7 +19,7 @@ org.apache.tika.parser.audio.AudioParser
 org.apache.tika.parser.audio.MidiParser
 org.apache.tika.parser.crypto.Pkcs7Parser
 org.apache.tika.parser.crypto.TSDParser
-org.apache.tika.parser.csv.CSVParser
+org.apache.tika.parser.csv.TextAndCSVParser
 org.apache.tika.parser.dwg.DWGParser
 org.apache.tika.parser.epub.EpubParser
 org.apache.tika.parser.executable.ExecutableParser
@@ -60,7 +60,6 @@ org.apache.tika.parser.pkg.PackageParser
 org.apache.tika.parser.pkg.RarParser
 org.apache.tika.parser.rtf.RTFParser
 org.apache.tika.parser.sas.SAS7BDATParser
-org.apache.tika.parser.txt.TXTParser
 org.apache.tika.parser.video.FLVParser
 org.apache.tika.parser.wordperfect.QuattroProParser
 org.apache.tika.parser.wordperfect.WordPerfectParser
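
With the service file swapped, a default AutoDetectParser routes plain text
and CSV to TextAndCSVParser. A quick, hedged way to check the registration
(CompositeParser.getParsers() is existing Tika API; the expected class is an
expectation, not asserted by this diff):

    AutoDetectParser parser = new AutoDetectParser();
    Parser p = parser.getParsers().get(MediaType.text("csv"));
    //expectation: p is an org.apache.tika.parser.csv.TextAndCSVParser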
diff --git a/tika-parsers/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java b/tika-parsers/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
index 0f56b87..819ed02 100644
--- a/tika-parsers/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
@@ -158,7 +158,7 @@ public class TikaEncodingDetectorTest extends AbstractTikaConfigTest {
         List<Parser> parsers = new ArrayList<>();
         findEncodingDetectionParsers(p, parsers);
 
-        assertEquals(3, parsers.size());
+        assertEquals(4, parsers.size());
 
         for (Parser encodingDetectingParser : parsers) {
             EncodingDetector encodingDetector = ((AbstractEncodingDetectorParser) encodingDetectingParser).getEncodingDetector();
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/csv/CSVSnifferTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/csv/CSVSnifferTest.java
new file mode 100644
index 0000000..998473c
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/csv/CSVSnifferTest.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.csv;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.commons.io.ByteOrderMark;
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.junit.Test;
+
+public class CSVSnifferTest extends TikaTest {
+
+    private static final char[] DELIMITERS = new char[]{',', '\t'};
+
+    private static byte[] CSV_BASIC =
+            ("the,quick,brown\tfox\n" +
+              "jumped \tover,the\tlazy,\tdog\n"+
+              "and then,ran,down\tthe\tstreet").getBytes(StandardCharsets.UTF_8);
+
+    private static byte[] TSV_BASIC =
+            ("the\tquick\tbrown,fox\n" +
+                    "jumped ,over\tthe,lazy\t,dog\n"+
+                    "and then\tran\tdown,the,street").getBytes(StandardCharsets.UTF_8);
+
+    private static byte[] CSV_MID_CELL_QUOTE_EXCEPTION =
+            ("the,quick,brown\"fox\n" +
+                    "jumped over,the lazy,dog\n"+
+                    "and then,ran,down the street").getBytes(StandardCharsets.UTF_8);
+
+
+    private static byte[] ALLOW_SPACES_BEFORE_QUOTE =
+            ("the,quick,         \"brown\"\"fox\"\n" +
+                    "jumped over,the lazy,dog\n"+
+                    "and then,ran,down the street").getBytes(StandardCharsets.UTF_8);
+
+    private static byte[] ALLOW_SPACES_AFTER_QUOTE =
+            ("the,\"quick\"     ,brown  fox\n" +
+                    "jumped over,the lazy,dog\n"+
+                    "and then,ran,down the street").getBytes(StandardCharsets.UTF_8);
+
+    @Test
+    public void testCSVBasic() throws Exception {
+        List<CSVResult> results = sniff(DELIMITERS, CSV_BASIC, StandardCharsets.UTF_8);
+        assertEquals(2, results.size());
+        assertEquals(Character.valueOf(','), results.get(0).getDelimiter());
+
+        results = sniff(DELIMITERS, TSV_BASIC, StandardCharsets.UTF_8);
+        assertEquals(2, results.size());
+        assertEquals(Character.valueOf('\t'), results.get(0).getDelimiter());
+    }
+
+    private static List<CSVResult> sniff(char[] delimiters, byte[] bytes, Charset charset) throws IOException {
+        CSVSniffer sniffer = new CSVSniffer(delimiters);
+        try (BufferedReader reader = new BufferedReader(
+                new InputStreamReader(new ByteArrayInputStream(bytes), charset))) {
+            return sniffer.sniff(reader);
+        }
+    }
+
+    @Test
+    public void testCSVMidCellQuoteException() throws Exception {
+        List<CSVResult> results = sniff(DELIMITERS, CSV_MID_CELL_QUOTE_EXCEPTION, StandardCharsets.UTF_8);
+        assertEquals(2, results.size());
+    }
+
+    @Test
+    public void testAllowWhiteSpacesAroundAQuote() throws Exception {
+        List<CSVResult> results = sniff(DELIMITERS,
+                ALLOW_SPACES_BEFORE_QUOTE, StandardCharsets.UTF_8);
+        assertEquals(2, results.size());
+        assertEquals(Character.valueOf(','), results.get(0).getDelimiter());
+
+        results = sniff(DELIMITERS, ALLOW_SPACES_AFTER_QUOTE, StandardCharsets.UTF_8);
+        assertEquals(2, results.size());
+        assertEquals(Character.valueOf(','), results.get(0).getDelimiter());
+    }
+
+    @Test
+    public void testSort() {
+        List<CSVResult> list = new ArrayList<>();
+        list.add(new CSVResult(0.1, MediaType.TEXT_HTML, '-'));
+        list.add(new CSVResult(0.2, MediaType.TEXT_PLAIN, ','));
+        Collections.sort(list);
+        assertEquals(0.2, list.get(0).getConfidence(), 0.00001);
+    }
+}
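
Outside the tests, the sniffer is driven the same way: hand it a
mark-supporting Reader and rank the candidates it returns. A minimal sketch
built only from calls exercised above ("in" is an assumed InputStream):

    CSVSniffer sniffer = new CSVSniffer(new char[]{',', '\t'});
    List<CSVResult> results;
    try (BufferedReader reader = new BufferedReader(
            new InputStreamReader(in, StandardCharsets.UTF_8))) {
        results = sniffer.sniff(reader);
    }
    CSVResult best = results.get(0);            //sorted best-first (see testSort)
    Character delimiter = best.getDelimiter();
    double confidence = best.getConfidence();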
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/csv/CSVParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
similarity index 64%
rename from tika-parsers/src/test/java/org/apache/tika/parser/csv/CSVParserTest.java
rename to tika-parsers/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
index c20c000..0e07511 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/csv/CSVParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
@@ -17,19 +17,25 @@
 package org.apache.tika.parser.csv;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.fail;
 
 import java.io.ByteArrayInputStream;
+import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.Map;
 
 import org.apache.commons.io.ByteOrderMark;
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.Parser;
 import org.junit.Test;
 
-public class CSVParserTest extends TikaTest {
+public class TextAndCSVParserTest extends TikaTest {
 
     private static byte[] CSV_UTF8 =
             ("the,quick,brown\tfox\n" +
@@ -65,10 +71,11 @@ public class CSVParserTest extends TikaTest {
     @Test
     public void testCSV_UTF8() throws Exception {
         Metadata metadata = new Metadata();
-        metadata.set(Metadata.RESOURCE_NAME_KEY, "test.csv");
+        //note: no resource name is set -- detection must rely on content alone
         XMLResult xmlResult = getXML(new ByteArrayInputStream(CSV_UTF8), PARSER, metadata);
-        assertEquals("comma", xmlResult.metadata.get(CSVParser.DELIMITER));
-        assertEquals("text/csv; charset=ISO-8859-1", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("comma", xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
+        assertMediaTypeEquals("csv", "ISO-8859-1","comma",
+                xmlResult.metadata.get(Metadata.CONTENT_TYPE));
         assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml);
     }
 
@@ -77,8 +84,10 @@ public class CSVParserTest extends TikaTest {
         Metadata metadata = new Metadata();
         metadata.set(TikaCoreProperties.CONTENT_TYPE_OVERRIDE, "text/csv; charset=UTF-8");
         XMLResult xmlResult = getXML(new ByteArrayInputStream(CSV_UTF8), PARSER, metadata);
-        assertEquals("comma", xmlResult.metadata.get(CSVParser.DELIMITER));
-        assertEquals("text/csv; charset=UTF-8", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("comma", xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
+        assertMediaTypeEquals("csv", "UTF-8","comma",
+                xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+
         assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml);
     }
 
@@ -87,8 +96,9 @@ public class CSVParserTest extends TikaTest {
         Metadata metadata = new Metadata();
         metadata.set(Metadata.CONTENT_TYPE, "text/csv");
         XMLResult xmlResult = getXML(new ByteArrayInputStream(CSV_UTF8), PARSER, metadata);
-        assertEquals("comma", xmlResult.metadata.get(CSVParser.DELIMITER));
-        assertEquals("text/csv; charset=ISO-8859-1", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("comma", xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
+        assertMediaTypeEquals("csv", "ISO-8859-1","comma",
+                xmlResult.metadata.get(Metadata.CONTENT_TYPE));
         assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml);
     }
 
@@ -97,8 +107,9 @@ public class CSVParserTest extends TikaTest {
         Metadata metadata = new Metadata();
         metadata.set(Metadata.RESOURCE_NAME_KEY, "test.csv");
         XMLResult xmlResult = getXML(new ByteArrayInputStream(CSV_UTF_16LE), PARSER, metadata);
-        assertEquals("comma", xmlResult.metadata.get(CSVParser.DELIMITER));
-        assertEquals("text/csv; charset=UTF-16LE", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("comma", xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
+        assertMediaTypeEquals("csv", "UTF-16LE","comma",
+                xmlResult.metadata.get(Metadata.CONTENT_TYPE));
         assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml);
     }
 
@@ -108,25 +119,20 @@ public class CSVParserTest extends TikaTest {
         metadata.set(Metadata.RESOURCE_NAME_KEY, "test.csv");
         XMLResult xmlResult = getXML(new ByteArrayInputStream(
                 concat(ByteOrderMark.UTF_16LE.getBytes(), CSV_UTF_16LE)), PARSER, metadata);
-        assertEquals("comma", xmlResult.metadata.get(CSVParser.DELIMITER));
-        assertEquals("text/csv; charset=UTF-16LE", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("comma", xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
+        assertMediaTypeEquals("csv", "UTF-16LE","comma",
+                xmlResult.metadata.get(Metadata.CONTENT_TYPE));
         assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml);
     }
 
-    private static byte[] concat(byte[] bytesA, byte[] bytesB) {
-        byte[] ret = new byte[bytesA.length+bytesB.length];
-        System.arraycopy(bytesA, 0, ret, 0, bytesA.length);
-        System.arraycopy(bytesB, 0, ret, bytesA.length, bytesB.length);
-        return ret;
-    }
-
     @Test
     public void testTSV_UTF8() throws Exception {
         Metadata metadata = new Metadata();
         metadata.set(Metadata.RESOURCE_NAME_KEY, "test.csv");
         XMLResult xmlResult = getXML(new ByteArrayInputStream(TSV_UTF8), PARSER, metadata);
-        assertEquals("tab", xmlResult.metadata.get(CSVParser.DELIMITER));
-        assertEquals("text/tsv; charset=ISO-8859-1", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("tab", xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
+        assertMediaTypeEquals("tsv", "ISO-8859-1","tab",
+                xmlResult.metadata.get(Metadata.CONTENT_TYPE));
         assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_TSV, xmlResult.xml);
     }
 
@@ -135,16 +141,18 @@ public class CSVParserTest extends TikaTest {
         Metadata metadata = new Metadata();
         metadata.set(Metadata.RESOURCE_NAME_KEY, "test.csv");
         XMLResult xmlResult = getXML(new ByteArrayInputStream(TSV_UTF_16LE), PARSER, metadata);
-        assertEquals("tab", xmlResult.metadata.get(CSVParser.DELIMITER));
-        assertEquals("text/tsv; charset=UTF-16LE", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("tab", xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
+        assertMediaTypeEquals("tsv", "UTF-16LE","tab",
+                xmlResult.metadata.get(Metadata.CONTENT_TYPE));
         assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_TSV, xmlResult.xml);
     }
 
     @Test
     public void testBadCsv() throws Exception {
         //this causes an IllegalStateException during delimiter detection
-        //when trying to parse with ','; therefore, the parser backs off to '\t'.
-        //this isn't necessarily the best outcome, but we want to make sure
+        //when trying to parse with ','; therefore, the parser backs off to
+        //treating this as straight text.
+        //This isn't necessarily the best outcome, but we want to make sure
         //that an IllegalStateException during delimiter guessing doesn't
         //make the parse fail.
 
@@ -154,11 +162,52 @@ public class CSVParserTest extends TikaTest {
         Metadata metadata = new Metadata();
         metadata.set(Metadata.RESOURCE_NAME_KEY, "test.csv");
         XMLResult xmlResult = getXML(new ByteArrayInputStream(csv), PARSER, metadata);
-        assertEquals("tab", xmlResult.metadata.get(CSVParser.DELIMITER));
-        assertEquals("text/tsv; charset=ISO-8859-1", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+        assertNull(xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY));
+        assertEquals("text/csv; charset=ISO-8859-1", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+        assertContains("the,quick", xmlResult.xml);
+    }
+
+    @Test
+    public void testLong() throws Exception {
+        //test mark/reset worked on the sniffers
+        StringBuilder sb = new StringBuilder();
+        for (int rows = 0; rows < 1000; rows++) {
+            for (int cols = 0; cols < 10; cols++) {
+                sb.append("2").append(",");
+            }
+            sb.append("\n");
+        }
+        Metadata metadata = new Metadata();
+        XMLResult xmlResult = getXML(new ByteArrayInputStream(sb.toString().getBytes(StandardCharsets.UTF_8)), PARSER, metadata);
+        assertMediaTypeEquals("csv", "ISO-8859-1","comma",
+                xmlResult.metadata.get(Metadata.CONTENT_TYPE));
     }
 
     private void assertContainsIgnoreWhiteSpaceDiffs(String expected, String xml) {
         assertContains(expected, xml.replaceAll("[\r\n\t ]", " "));
     }
+
+    private static void assertMediaTypeEquals(String subtype, String charset, String delimiter, String mediaTypeString) {
+        if (mediaTypeString == null) {
+            fail("media type string must not be null");
+        }
+        MediaType expected = mediaType(subtype, charset, delimiter);
+        MediaType observed = MediaType.parse(mediaTypeString);
+        assertEquals(expected, observed);
+    }
+
+    private static MediaType mediaType(String subtype, String charset, String delimiter) {
+        Map<String, String> attrs = new HashMap<>();
+        attrs.put("charset", charset);
+        attrs.put("delimiter", delimiter);
+        return new MediaType(MediaType.text(subtype), attrs);
+    }
+
+    private static byte[] concat(byte[] bytesA, byte[] bytesB) {
+        byte[] ret = new byte[bytesA.length+bytesB.length];
+        System.arraycopy(bytesA, 0, ret, 0, bytesA.length);
+        System.arraycopy(bytesB, 0, ret, bytesA.length, bytesB.length);
+        return ret;
+    }
+
 }
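
The media-type helper above mirrors the parser's new output convention: the
detected delimiter rides along as a parameter on the content type, alongside
the charset. A short illustration using existing MediaType API (the example
string matches what the tests above assert):

    MediaType mt = MediaType.parse("text/csv; charset=ISO-8859-1; delimiter=comma");
    String delimiter = mt.getParameters().get("delimiter");   // "comma"
    String charset = mt.getParameters().get("charset");       // "ISO-8859-1"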