You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/11/28 16:05:50 UTC

[tika] branch main updated: TIKA-3938 -- extract column numbers and row numbers from csvs.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 0cf1daa1f TIKA-3938 -- extract column numbers and row numbers from csvs.
0cf1daa1f is described below

commit 0cf1daa1f90e0adc817373c870f8ef0a93116a65
Author: tallison <ta...@apache.org>
AuthorDate: Mon Nov 28 11:05:40 2022 -0500

    TIKA-3938 -- extract column numbers and row numbers from csvs.
---
 .../apache/tika/parser/csv/TextAndCSVParser.java   | 24 ++++++++++++++++++++++
 .../tika/parser/csv/TextAndCSVParserTest.java      |  2 ++
 2 files changed, 26 insertions(+)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java
index e0a79337f..d0f81379b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java
@@ -75,6 +75,20 @@ public class TextAndCSVParser extends AbstractEncodingDetectorParser {
     private static final String DELIMITER = "delimiter";
     public static final Property DELIMITER_PROPERTY = Property.externalText(
             CSV_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + DELIMITER);
+
+    /**
+     * If the file is detected as a csv/tsv, this is the number of columns in the first row.
+     */
+    public static final Property NUM_COLUMNS = Property.externalInteger(
+            CSV_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "num_columns");
+
+    /**
+     * If the file is detected as a csv/tsv, this is the total number of rows. It is only
+     * set if the file is read successfully (e.g. no encapsulation exceptions).
+     */
+    public static final Property NUM_ROWS = Property.externalInteger(
+            CSV_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "num_rows");
+
     private static final String TD = "td";
     private static final String TR = "tr";
     private static final String TABLE = "table";
@@ -190,20 +204,30 @@ public class TextAndCSVParser extends AbstractEncodingDetectorParser {
                 CHAR_TO_STRING_DELIMITER_MAP.get(csvFormat.getDelimiter()));
 
         XHTMLContentHandler xhtmlContentHandler = new XHTMLContentHandler(handler, metadata);
+        int totalRows = 0;
         try (org.apache.commons.csv.CSVParser commonsParser = new org.apache.commons.csv.CSVParser(
                 reader, csvFormat)) {
             xhtmlContentHandler.startDocument();
             xhtmlContentHandler.startElement(TABLE);
+            int firstRowColCount = 0;
             try {
                 for (CSVRecord row : commonsParser) {
                     xhtmlContentHandler.startElement(TR);
                     for (String cell : row) {
+                        if (totalRows == 0) {
+                            firstRowColCount++;
+                        }
                         xhtmlContentHandler.startElement(TD);
                         xhtmlContentHandler.characters(cell);
                         xhtmlContentHandler.endElement(TD);
                     }
                     xhtmlContentHandler.endElement(TR);
+                    if (totalRows == 0) {
+                        metadata.set(NUM_COLUMNS, firstRowColCount);
+                    }
+                    totalRows++;
                 }
+                metadata.set(NUM_ROWS, totalRows);
             } catch (IllegalStateException e) {
                 //if there's a parse exception
                 //try to get the rest of the content...treat it as text for now
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
index ea1b919bb..0e6117277 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java
@@ -108,6 +108,8 @@ public class TextAndCSVParserTest extends TikaTest {
         assertMediaTypeEquals("csv", "ISO-8859-1", "comma",
                 xmlResult.metadata.get(Metadata.CONTENT_TYPE));
         assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml);
+        assertEquals(3, metadata.getInt(TextAndCSVParser.NUM_COLUMNS));
+        assertEquals(3, metadata.getInt(TextAndCSVParser.NUM_ROWS));
     }
 
     @Test