You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/11/28 16:05:50 UTC
[tika] branch main updated: TIKA-3938 -- extract column numbers and row numbers from csvs.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 0cf1daa1f TIKA-3938 -- extract column numbers and row numbers from csvs.
0cf1daa1f is described below
commit 0cf1daa1f90e0adc817373c870f8ef0a93116a65
Author: tallison <ta...@apache.org>
AuthorDate: Mon Nov 28 11:05:40 2022 -0500
TIKA-3938 -- extract column numbers and row numbers from csvs.
---
.../apache/tika/parser/csv/TextAndCSVParser.java | 24 ++++++++++++++++++++++
.../tika/parser/csv/TextAndCSVParserTest.java | 2 ++
2 files changed, 26 insertions(+)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java
index e0a79337f..d0f81379b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java
@@ -75,6 +75,20 @@ public class TextAndCSVParser extends AbstractEncodingDetectorParser {
private static final String DELIMITER = "delimiter";
public static final Property DELIMITER_PROPERTY = Property.externalText(
CSV_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + DELIMITER);
+
+ /**
+ * If the file is detected as a csv/tsv, this is the number of columns in the first row.
+ */
+ public static final Property NUM_COLUMNS = Property.externalInteger(
+ CSV_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "num_columns");
+
+ /**
+ * If the file is detected as a csv/tsv, this is the total number of rows. It is only set
+ * if the file is read successfully (e.g. no encapsulation exceptions).
+ */
+ public static final Property NUM_ROWS = Property.externalInteger(
+ CSV_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "num_rows");
+
private static final String TD = "td";
private static final String TR = "tr";
private static final String TABLE = "table";
@@ -190,20 +204,30 @@ public class TextAndCSVParser extends AbstractEncodingDetectorParser {
CHAR_TO_STRING_DELIMITER_MAP.get(csvFormat.getDelimiter()));
XHTMLContentHandler xhtmlContentHandler = new XHTMLContentHandler(handler, metadata);
+ int totalRows = 0;
try (org.apache.commons.csv.CSVParser commonsParser = new org.apache.commons.csv.CSVParser(
reader, csvFormat)) {
xhtmlContentHandler.startDocument();
xhtmlContentHandler.startElement(TABLE);
+ int firstRowColCount = 0;
try {
for (CSVRecord row : commonsParser) {
xhtmlContentHandler.startElement(TR);
for (String cell : row) {
+ if (totalRows == 0) {
+ firstRowColCount++;
+ }
xhtmlContentHandler.startElement(TD);
xhtmlContentHandler.characters(cell);
xhtmlContentHandler.endElement(TD);
}
xhtmlContentHandler.endElement(TR);
+ if (totalRows == 0) {
+ metadata.set(NUM_COLUMNS, firstRowColCount);
+ }
+ totalRows++;
}
+ metadata.set(NUM_ROWS, totalRows);
} catch (IllegalStateException e) {
//if there's a parse exception
//try to get the rest of the content...treat it as text for now
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
index ea1b919bb..0e6117277 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
@@ -108,6 +108,8 @@ public class TextAndCSVParserTest extends TikaTest {
assertMediaTypeEquals("csv", "ISO-8859-1", "comma",
xmlResult.metadata.get(Metadata.CONTENT_TYPE));
assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml);
+ assertEquals(3, metadata.getInt(TextAndCSVParser.NUM_COLUMNS));
+ assertEquals(3, metadata.getInt(TextAndCSVParser.NUM_ROWS));
}
@Test