Posted to commits@tika.apache.org by ta...@apache.org on 2019/02/13 01:39:10 UTC
[tika] branch master updated: TIKA-2828 -- initial CSVParser commit
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new d9229df TIKA-2828 -- initial CSVParser commit
d9229df is described below
commit d9229df3daa527cc09b9e231f923097f0577bb1f
Author: TALLISON <ta...@apache.org>
AuthorDate: Tue Feb 12 20:38:55 2019 -0500
TIKA-2828 -- initial CSVParser commit
---
CHANGES.txt | 3 +
.../src/main/java/org/apache/tika/io/IOUtils.java | 31 ++
.../java/org/apache/tika/parser/csv/CSVParser.java | 313 +++++++++++++++++++++
.../services/org.apache.tika.parser.Parser | 1 +
.../org/apache/tika/parser/csv/CSVParserTest.java | 164 +++++++++++
5 files changed, 512 insertions(+)
diff --git a/CHANGES.txt b/CHANGES.txt
index d6d1647..9269715 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -8,6 +8,9 @@ Release 2.0.0 - ???
Release 1.21 - ????
+ * Add a CSVParser. CSV detection is currently based solely on filename
+ and/or information conveyed via Metadata (TIKA-2826).
+
* General upgrades: jackcess, opennlp, httpcomponents, zstd-jni, cxf, Lucene (TIKA-2824)
* Bundle xerces2 with tika-parsers (TIKA-2802).
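Because detection keys off the file name or caller-supplied Metadata rather than the bytes themselves (TIKA-2826), a caller has to pass that hint in. A minimal usage sketch, not part of this commit: the file name and sample bytes are made up, and BodyContentHandler is only used here to collect the extracted text.

import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.csv.CSVParser;
import org.apache.tika.sax.BodyContentHandler;

public class CsvParseSketch {
    public static void main(String[] args) throws Exception {
        byte[] csv = "a,b,c\n1,2,3\n".getBytes(StandardCharsets.UTF_8);

        Metadata metadata = new Metadata();
        //without this hint (or a CONTENT_TYPE_OVERRIDE), the bytes alone
        //will not be routed to the new CSVParser (TIKA-2826)
        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "example.csv");

        BodyContentHandler handler = new BodyContentHandler();
        new AutoDetectParser().parse(new ByteArrayInputStream(csv), handler, metadata, new ParseContext());

        System.out.println(metadata.get(Metadata.CONTENT_TYPE));  //e.g. text/csv; charset=ISO-8859-1
        System.out.println(metadata.get(CSVParser.DELIMITER));    //"comma" or "tab"
        System.out.println(handler.toString());
    }
}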
diff --git a/tika-core/src/main/java/org/apache/tika/io/IOUtils.java b/tika-core/src/main/java/org/apache/tika/io/IOUtils.java
index 11d3bd3..a05176f 100644
--- a/tika-core/src/main/java/org/apache/tika/io/IOUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/IOUtils.java
@@ -1183,4 +1183,35 @@ public class IOUtils {
return (ch2 == -1);
}
+ /**
+ * Reads bytes from an input stream.
+ * This implementation guarantees that it will read as many bytes
+ * as possible before giving up; this may not always be the case for
+ * subclasses of {@link InputStream}.
+ *
+ * @param input where to read input from
+ * @param buffer destination
+ * @param offset initial offset into buffer
+ * @param length length to read, must be >= 0
+ * @return actual length read; may be less than requested if EOF was reached
+ * @throws IOException if a read error occurs
+ * @since 2.2
+ */
+ public static int read(final InputStream input, final byte[] buffer, final int offset, final int length)
+ throws IOException {
+ if (length < 0) {
+ throw new IllegalArgumentException("Length must not be negative: " + length);
+ }
+ int remaining = length;
+ while (remaining > 0) {
+ final int location = length - remaining;
+ final int count = input.read(buffer, offset + location, remaining);
+ if (count == -1) { // EOF
+ break;
+ }
+ remaining -= count;
+ }
+ return length - remaining;
+ }
+
}
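The new helper matters because a single InputStream.read(byte[], int, int) call may legally return fewer bytes than requested even when more are available; the loop above keeps reading until the requested length is filled or EOF is reached. A small sketch, not part of this commit: ChunkedStream is a made-up stream that short-reads on purpose to show the difference.

import java.io.ByteArrayInputStream;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.tika.io.IOUtils;

public class ReadFullySketch {

    //hypothetical stream that returns at most 10 bytes per call --
    //the kind of short read IOUtils.read() is meant to absorb
    static class ChunkedStream extends FilterInputStream {
        ChunkedStream(InputStream in) { super(in); }
        @Override
        public int read(byte[] b, int off, int len) throws IOException {
            return super.read(b, off, Math.min(len, 10));
        }
    }

    public static void main(String[] args) throws Exception {
        byte[] data = new byte[100];
        byte[] buffer = new byte[100];

        int single = new ChunkedStream(new ByteArrayInputStream(data))
                .read(buffer, 0, buffer.length);                         //10: one short read
        int full = IOUtils.read(new ChunkedStream(new ByteArrayInputStream(data)),
                buffer, 0, buffer.length);                               //100: loops until full or EOF

        System.out.println(single + " vs " + full);
    }
}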
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVParser.java
new file mode 100644
index 0000000..7fb57f6
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/csv/CSVParser.java
@@ -0,0 +1,313 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.csv;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.csv.CSVFormat;
+import org.apache.commons.csv.CSVRecord;
+import org.apache.tika.Tika;
+import org.apache.tika.config.Field;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class CSVParser extends AbstractParser {
+ private static final String CSV_PREFIX = "csv";
+ public static final Property DELIMITER = Property.externalText(
+ CSV_PREFIX+ TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER+"delimiter");
+
+ private static final String TD = "td";
+ private static final String TR = "tr";
+ private static final String TABLE = "table";
+
+ private static final MediaType CSV = MediaType.text("csv");
+ private static final MediaType TSV = MediaType.text("tsv");
+
+ private static final int DEFAULT_MARK_LIMIT = 20000;
+
+ //TODO: add | or make this configurable?
+ private static final char[] CANDIDATE_DELIMITERS = new char[]{',', '\t'};
+
+ private static final Map<Character, String> DELIMITERS = new HashMap<>();
+
+ static {
+ DELIMITERS.put(',', "comma");
+ DELIMITERS.put('\t', "tab");
+ }
+
+
+
+ @Field
+ private int markLimit = DEFAULT_MARK_LIMIT;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ CSV, TSV)));
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
+
+ String override = metadata.get(TikaCoreProperties.CONTENT_TYPE_OVERRIDE);
+ Character overrideDelimiter = null;
+ Charset overrideCharset = null;
+ if (override != null) {
+ MediaType mediaType = MediaType.parse(override);
+ String charset = mediaType.getParameters().get("charset");
+ overrideDelimiter = mediaType.getBaseType().toString().endsWith("tsv") ? '\t' : ',';
+ if (charset != null) {
+ try {
+ overrideCharset = Charset.forName(charset);
+ } catch (UnsupportedCharsetException e) {
+ //swallow
+ }
+ }
+ }
+ if (overrideDelimiter == null || overrideCharset == null) {
+ if (!stream.markSupported()) {
+ stream = new BufferedInputStream(stream);
+ }
+ }
+ //buffer the first x bytes to detect the delimiter
+ byte[] firstX = null;
+ if (overrideDelimiter == null) {
+ firstX = readFirstX(stream, markLimit);
+ }
+ Charset charset = null;
+ Reader reader = null;
+ org.apache.commons.csv.CSVParser commonsParser = null;
+ try {
+ //need to detect the charset if nothing has been sent in via the override
+ if (overrideCharset == null) {
+ reader = new AutoDetectReader(stream);
+ charset = ((AutoDetectReader) reader).getCharset();
+ } else {
+ reader = new BufferedReader(new InputStreamReader(stream, overrideCharset));
+ charset = overrideCharset;
+ }
+ CSVFormat csvFormat = null;
+ if (overrideDelimiter == null) {
+ csvFormat = guessFormat(firstX, charset, metadata);
+ } else {
+ csvFormat = CSVFormat.EXCEL.withDelimiter(overrideDelimiter);
+ }
+ metadata.set(DELIMITER, DELIMITERS.get(csvFormat.getDelimiter()));
+
+ if (overrideCharset == null || overrideDelimiter == null) {
+ MediaType mediaType = (csvFormat.getDelimiter() == '\t') ? TSV : CSV;
+ MediaType type = new MediaType(mediaType, charset);
+ metadata.set(Metadata.CONTENT_TYPE, type.toString());
+ // deprecated, see TIKA-431
+ metadata.set(Metadata.CONTENT_ENCODING, charset.name());
+ }
+
+ XHTMLContentHandler xhtmlContentHandler = new XHTMLContentHandler(handler, metadata);
+ commonsParser = new org.apache.commons.csv.CSVParser(reader, csvFormat);
+ xhtmlContentHandler.startDocument();
+ xhtmlContentHandler.startElement(TABLE);
+ try {
+ for (CSVRecord row : commonsParser) {
+ xhtmlContentHandler.startElement(TR);
+ for (String cell : row) {
+ xhtmlContentHandler.startElement(TD);
+ xhtmlContentHandler.characters(cell);
+ xhtmlContentHandler.endElement(TD);
+ }
+ xhtmlContentHandler.endElement(TR);
+ }
+ } catch (IllegalStateException e) {
+ throw new TikaException("exception parsing the csv", e);
+ }
+
+ xhtmlContentHandler.endElement(TABLE);
+ xhtmlContentHandler.endDocument();
+ } finally {
+ if (commonsParser != null) {
+ try {
+ commonsParser.close();
+ } catch (IOException e) {
+ //swallow
+ }
+ }
+ IOUtils.closeQuietly(reader);
+ }
+ }
+
+ private byte[] readFirstX(InputStream stream, int markLimit) throws IOException {
+ byte[] bytes = new byte[markLimit];
+
+ try {
+ stream.mark(markLimit);
+ int numRead = IOUtils.read(stream, bytes, 0, bytes.length);
+ if (numRead < markLimit) {
+ byte[] dest = new byte[numRead];
+ System.arraycopy(bytes, 0, dest, 0, numRead);
+ bytes = dest;
+ }
+ } finally {
+ stream.reset();
+ }
+ return bytes;
+ }
+
+ private CSVFormat guessFormat(byte[] bytes, Charset charset, Metadata metadata) throws IOException {
+
+ String mediaTypeString = metadata.get(Metadata.CONTENT_TYPE);
+ //default to ',' if no content type has been set by the caller
+ char bestDelimiter = (mediaTypeString == null || mediaTypeString.endsWith("csv")) ? ',' : '\t';
+ CSVReadTestResult bestResult = null;
+
+ for (char c : CANDIDATE_DELIMITERS) {
+
+ try (Reader reader = new InputStreamReader(new ByteArrayInputStream(bytes), charset)) {
+
+ CSVReadTestResult testResult = attemptCSVRead(c, bytes.length, reader);
+ if (bestResult == null || testResult.isBetterThan(bestResult)) {
+ bestResult = testResult;
+ bestDelimiter = c;
+ }
+ }
+ }
+ return CSVFormat.EXCEL.withDelimiter(bestDelimiter);
+ }
+
+ private CSVReadTestResult attemptCSVRead(char delimiter, int bytesTotal, Reader reader) throws IOException {
+
+ //maps <rowLength (number of columns), numberOfRowsWithThatLength>
+ Map<Integer, Integer> colCounts = new HashMap<>();
+ long lastCharacterPosition = -1L;
+ int rowCount = 0;
+ boolean illegalStateException = false;
+ try {
+ org.apache.commons.csv.CSVParser p = new org.apache.commons.csv.CSVParser(reader, CSVFormat.EXCEL.withDelimiter(delimiter));
+
+ for (CSVRecord row : p) {
+ int colCount = row.size();
+ lastCharacterPosition = row.getCharacterPosition();
+ Integer cnt = colCounts.get(colCount);
+ if (cnt == null) {
+ cnt = 1;
+ } else {
+ cnt++;
+ }
+ colCounts.put(colCount, cnt);
+ rowCount++;
+ }
+ } catch (IllegalStateException e) {
+ //this could be bad encapsulation -- e.g. an invalid char between an encapsulated token and a delimiter
+ //swallow while guessing
+ illegalStateException = true;
+ }
+
+ int mostCommonColCount = -1;
+ int totalCount = 0;
+ for (Integer count : colCounts.values()) {
+ if (count > mostCommonColCount) {
+ mostCommonColCount = count;
+ }
+ totalCount += count;
+ }
+ double percentMostCommonRowLength = -1.0f;
+ if (totalCount > 0) {
+ percentMostCommonRowLength = (double) mostCommonColCount / (double) totalCount;
+ }
+ return new CSVReadTestResult(bytesTotal, lastCharacterPosition, rowCount, percentMostCommonRowLength, illegalStateException);
+
+ }
+
+ private static class CSVReadTestResult {
+ private final int bytesTotal;
+ private final long bytesParsed;
+ private final int rowCount;
+ //the percentage of rows that have the most common
+ //row length -- maybe use stdev?
+ private final double percentMostCommonRowLength;
+ private final boolean illegalStateException;
+
+ public CSVReadTestResult(int bytesTotal, long bytesParsed, int rowCount,
+ double percentMostCommonRowLength, boolean illegalStateException) {
+ this.bytesTotal = bytesTotal;
+ this.bytesParsed = bytesParsed;
+ this.rowCount = rowCount;
+ this.percentMostCommonRowLength = percentMostCommonRowLength;
+ this.illegalStateException = illegalStateException;
+ }
+
+ public boolean isBetterThan(CSVReadTestResult bestResult) {
+ if (bestResult == null) {
+ return true;
+ }
+ if (illegalStateException && ! bestResult.illegalStateException) {
+ return false;
+ } else if (! illegalStateException && bestResult.illegalStateException) {
+ return true;
+ }
+ //if there are >= 3 rows in both, select the one with the better
+ //percentMostCommonRowLength
+ if (this.rowCount >= 3 && bestResult.rowCount >= 3) {
+ if (percentMostCommonRowLength > bestResult.percentMostCommonRowLength) {
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ //if there's a big difference between the number of bytes parsed,
+ //pick the one that allowed more parsed bytes
+ if (bytesTotal > 0 && Math.abs((bestResult.bytesParsed - bytesParsed) / (double) bytesTotal) > 0.1) {
+ if (bytesParsed > bestResult.bytesParsed) {
+ return true;
+ } else {
+ return false;
+ }
+ }
+ //add other heuristics as necessary
+
+ //if there's no other information,
+ //default to not better = default
+ return false;
+ }
+ }
+}
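To make the guessing heuristic above concrete: for each candidate delimiter the parser measures how consistent the column counts are across the buffered rows, and isBetterThan() prefers the delimiter that yields the more consistent (and exception-free) parse. A stripped-down sketch of that scoring, not part of this commit; it collapses attemptCSVRead() into a single fraction and uses the same sample data as the tests below.

import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

public class DelimiterGuessSketch {

    //fraction of rows that share the most common column count
    //when the sample is parsed with the given delimiter
    static double consistency(String sample, char delimiter) throws Exception {
        Map<Integer, Integer> colCounts = new HashMap<>();
        int rows = 0;
        try (Reader reader = new StringReader(sample);
             CSVParser p = new CSVParser(reader, CSVFormat.EXCEL.withDelimiter(delimiter))) {
            for (CSVRecord row : p) {
                colCounts.merge(row.size(), 1, Integer::sum);
                rows++;
            }
        } catch (IllegalStateException e) {
            return 0.0; //bad encapsulation for this delimiter; treat as a poor fit
        }
        int mostCommon = colCounts.values().stream().max(Integer::compare).orElse(0);
        return rows == 0 ? 0.0 : (double) mostCommon / rows;
    }

    public static void main(String[] args) throws Exception {
        String sample = "the,quick,brown\tfox\n" +
                        "jumped \tover,the\tlazy,\tdog\n" +
                        "and then,ran,down\tthe\tstreet\n";
        System.out.println("','  -> " + consistency(sample, ','));   //1.0: every row has 3 columns
        System.out.println("'\\t' -> " + consistency(sample, '\t')); //lower: column counts vary (2, 4, 3)
    }
}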
diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index ac6ce39..cfabd0f 100644
--- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -19,6 +19,7 @@ org.apache.tika.parser.audio.AudioParser
org.apache.tika.parser.audio.MidiParser
org.apache.tika.parser.crypto.Pkcs7Parser
org.apache.tika.parser.crypto.TSDParser
+org.apache.tika.parser.csv.CSVParser
org.apache.tika.parser.dwg.DWGParser
org.apache.tika.parser.envi.EnviHeaderParser
org.apache.tika.parser.epub.EpubParser
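Registering the class in this service file is what lets AutoDetectParser (via the SPI-driven DefaultParser) pick up the new parser without any explicit configuration. A quick way to confirm the mapping, not part of this commit and assuming the default TikaConfig:

import org.apache.tika.config.TikaConfig;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.Parser;

public class ServiceLookupSketch {
    public static void main(String[] args) throws Exception {
        //the default config builds its parser list from the service files on the classpath
        CompositeParser composite = (CompositeParser) TikaConfig.getDefaultConfig().getParser();
        Parser p = composite.getParsers().get(MediaType.text("csv"));
        System.out.println(p); //expected to be org.apache.tika.parser.csv.CSVParser
    }
}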
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/csv/CSVParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/csv/CSVParserTest.java
new file mode 100644
index 0000000..de6ec29
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/csv/CSVParserTest.java
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.csv;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+
+import org.apache.commons.io.ByteOrderMark;
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.junit.Test;
+
+public class CSVParserTest extends TikaTest {
+
+ private static byte[] CSV_UTF8 =
+ ("the,quick,brown\tfox\n" +
+ "jumped \tover,the\tlazy,\tdog\n"+
+ "and then,ran,down\tthe\tstreet").getBytes(StandardCharsets.UTF_8);
+
+ private static byte[] CSV_UTF_16LE =
+ ("the,quick,brown\tfox\n" +
+ "jumped \tover,the\tlazy,\tdog\n"+
+ "and then,ran,down\tthe\tstreet").getBytes(StandardCharsets.UTF_16LE);
+
+
+ private static byte[] TSV_UTF8 =
+ ("the\tquick\tbrown,fox\n" +
+ "jumped ,over\tthe,lazy\t,dog\n"+
+ "and then\tran\tdown,the,street").getBytes(StandardCharsets.UTF_8);
+
+ private static byte[] TSV_UTF_16LE =
+ ("the\tquick\tbrown,fox\n" +
+ "jumped ,over\tthe,lazy\t,dog\n"+
+ "and then\tran\tdown,the,street").getBytes(StandardCharsets.UTF_16LE);
+
+
+ private static String EXPECTED_TSV = ("<table><tr> <td>the</td> <td>quick</td> <td>brown,fox</td></tr>\n" +
+ "<tr> <td>jumped ,over</td> <td>the,lazy</td> <td>,dog</td></tr>\n" +
+ "<tr> <td>and then</td> <td>ran</td> <td>down,the,street</td></tr>\n" +
+ "</table>").replaceAll("[\r\n\t ]+", " ");
+
+ private static String EXPECTED_CSV = EXPECTED_TSV.replaceAll(",+", " ");
+
+ private static Parser PARSER = new AutoDetectParser();
+
+ @Test
+ public void testCSV_UTF8() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv");
+ XMLResult xmlResult = getXML(new ByteArrayInputStream(CSV_UTF8), PARSER, metadata);
+ assertEquals("comma", xmlResult.metadata.get(CSVParser.DELIMITER));
+ assertEquals("text/csv; charset=ISO-8859-1", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+ assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml);
+ }
+
+ @Test
+ public void testCSV_UTF8_TypeOverride() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(TikaCoreProperties.CONTENT_TYPE_OVERRIDE, "text/csv; charset=UTF-8");
+ XMLResult xmlResult = getXML(new ByteArrayInputStream(CSV_UTF8), PARSER, metadata);
+ assertEquals("comma", xmlResult.metadata.get(CSVParser.DELIMITER));
+ assertEquals("text/csv; charset=UTF-8", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+ assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml);
+ }
+
+ @Test
+ public void testCSV_UTF8_Type() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "text/csv");
+ XMLResult xmlResult = getXML(new ByteArrayInputStream(CSV_UTF8), PARSER, metadata);
+ assertEquals("comma", xmlResult.metadata.get(CSVParser.DELIMITER));
+ assertEquals("text/csv; charset=ISO-8859-1", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+ assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml);
+ }
+
+ @Test
+ public void testCSV_UTF16LE() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv");
+ XMLResult xmlResult = getXML(new ByteArrayInputStream(CSV_UTF_16LE), PARSER, metadata);
+ assertEquals("comma", xmlResult.metadata.get(CSVParser.DELIMITER));
+ assertEquals("text/csv; charset=UTF-16LE", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+ assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml);
+ }
+
+ @Test
+ public void testCSV_UTF16LE_BOM() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv");
+ XMLResult xmlResult = getXML(new ByteArrayInputStream(
+ concat(ByteOrderMark.UTF_16LE.getBytes(), CSV_UTF_16LE)), PARSER, metadata);
+ assertEquals("comma", xmlResult.metadata.get(CSVParser.DELIMITER));
+ assertEquals("text/csv; charset=UTF-16LE", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+ assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml);
+ }
+
+ private static byte[] concat(byte[] bytesA, byte[] bytesB) {
+ byte[] ret = new byte[bytesA.length+bytesB.length];
+ System.arraycopy(bytesA, 0, ret, 0, bytesA.length);
+ System.arraycopy(bytesB, 0, ret, bytesA.length, bytesB.length);
+ return ret;
+ }
+
+ @Test
+ public void testTSV_UTF8() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv");
+ XMLResult xmlResult = getXML(new ByteArrayInputStream(TSV_UTF8), PARSER, metadata);
+ assertEquals("tab", xmlResult.metadata.get(CSVParser.DELIMITER));
+ assertEquals("text/tsv; charset=ISO-8859-1", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+ assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_TSV, xmlResult.xml);
+ }
+
+ @Test
+ public void testTSV_UTF16LE() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv");
+ XMLResult xmlResult = getXML(new ByteArrayInputStream(TSV_UTF_16LE), PARSER, metadata);
+ assertEquals("tab", xmlResult.metadata.get(CSVParser.DELIMITER));
+ assertEquals("text/tsv; charset=UTF-16LE", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+ assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_TSV, xmlResult.xml);
+ }
+
+ @Test
+ public void testBadCsv() throws Exception {
+ //this causes an IllegalStateException during delimiter detection
+ //when trying to parse with ','; therefore, the parser backs off to '\t'.
+ //this isn't necessarily the best outcome, but we want to make sure
+ //that an IllegalStateException during delimiter guessing doesn't
+ //make the parse fail.
+
+ byte[] csv = ("the,quick\n" +
+ "brown,\"la\"zy\"\n" +
+ "brown,\"dog\n").getBytes(StandardCharsets.UTF_8);
+ Metadata metadata = new Metadata();
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv");
+ XMLResult xmlResult = getXML(new ByteArrayInputStream(csv), PARSER, metadata);
+ assertEquals("tab", xmlResult.metadata.get(CSVParser.DELIMITER));
+ assertEquals("text/tsv; charset=ISO-8859-1", xmlResult.metadata.get(Metadata.CONTENT_TYPE));
+ }
+
+ private void assertContainsIgnoreWhiteSpaceDiffs(String expected, String xml) {
+ assertContains(expected, xml.replaceAll("[\r\n\t ]", " "));
+ }
+}