You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/07/27 14:16:49 UTC
[tika] 15/30: Check header contents, check data rows count, add XLSX test

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 65af2d99be50c00fedd6261e11df9f60bd05d7ad
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Thu May 10 15:13:43 2018 +0100

    Check header contents, check data rows count, add XLSX test
---
 .../org/apache/tika/parser/TabularFormatsTest.java | 77 +++++++++++++++++-----
 1 file changed, 61 insertions(+), 16 deletions(-)

diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java
index 8574d37..023f49d 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java
@@ -31,7 +31,7 @@ import org.junit.Test;
  */
 public class TabularFormatsTest extends TikaTest {
     protected static final String[] columnNames = new String[] {
-         "recnum","square","desc","pctdone","pctinc",
+         "recnum","square","desc","pctdone","pctincr",
          "date","datetime","time"
     };
     protected static final String[] columnLabels = new String[] {
@@ -49,8 +49,9 @@ public class TabularFormatsTest extends TikaTest {
              "0","1","2","3","4","5","6","7","8","9","10"
         },
         new String[] {
-             "0","1","4" // etc
+             "0","1","4","9","16","25","36","49","64","81","100"
         },
+/*        
         new String[] {  // etc
                 "01-01-1960"
         },
@@ -59,37 +60,72 @@ public class TabularFormatsTest extends TikaTest {
         new String[] {
                 ""
         }
+*/
     };
-
-    protected void assertHeaders(String xml, boolean isTH, boolean hasLabel, boolean hasName) {
-        // Find the first row
-        int splitAt = xml.indexOf("</tr>");
-        String hRow = xml.substring(0, splitAt);
-        splitAt = xml.indexOf("<tr>");
-        hRow = hRow.substring(splitAt+4);
-
+    
+    protected static String[] toCells(String row, boolean isTH) {
         // Split into cells, ignoring stuff before first cell
         String[] cells;
         if (isTH) {
-            cells = hRow.split("<th");
+            cells = row.split("<th");
         } else {
-            cells = hRow.split("<td");
+            cells = row.split("<td");
         }
         cells = Arrays.copyOfRange(cells, 1, cells.length);
         for (int i=0; i<cells.length; i++) {
-            splitAt = cells[i].lastIndexOf("</");
+            int splitAt = cells[i].lastIndexOf("</");
             cells[i] = cells[i].substring(0, splitAt).trim();
         }
+        return cells;
+    }
+
+    protected void assertHeaders(String xml, boolean isTH, boolean hasLabel, boolean hasName) {
+        // Find the first row
+        int splitAt = xml.indexOf("</tr>");
+        String hRow = xml.substring(0, splitAt);
+        splitAt = xml.indexOf("<tr>");
+        hRow = hRow.substring(splitAt+4);
+
+        // Split into cells, ignoring stuff before first cell
+        String[] cells = toCells(hRow, isTH);
 
         // Check we got the right number
         assertEquals("Wrong number of cells in header row " + hRow,
                      columnLabels.length, cells.length);
 
         // Check we got the right stuff
-        // TODO
+        for (int i=0; i<cells.length; i++) {
+            if (hasLabel && hasName) {
+                assertContains("title=\"" + columnNames[i] + "\"", cells[i]); 
+                assertContains(">" + columnLabels[i], cells[i]); 
+            } else if (hasName) {
+                assertContains(">" + columnNames[i], cells[i]); 
+            } else {
+                assertContains(">" + columnLabels[i], cells[i]); 
+            }
+        }
     }
     protected void assertContents(String xml, boolean hasHeader) {
-        // TODO Check the rows
+        // Ignore anything before the first <tr>
+        // Ignore the header row if there is one
+        int ignores = 1;
+        if (hasHeader) ignores++;
+
+        // Split into rows, and discard the row closing (and anything after)
+        String[] rows = xml.split("<tr>");
+        rows = Arrays.copyOfRange(rows, ignores, rows.length);
+        for (int i=0; i<rows.length; i++) {
+            rows[i] = rows[i].split("</tr>")[0].trim();
+        }
+
+        // Check we got the right number of rows
+        for (int cn=0; cn<table.length; cn++) {
+            assertEquals("Wrong number of rows found compared to column " + (cn+1),
+                         table[cn].length, rows.length);
+        }
+
+        // Check each row's values
+        // TODO
     }
 
     @Test
@@ -106,7 +142,16 @@ public class TabularFormatsTest extends TikaTest {
         assertHeaders(xml, false, true, false);
         assertContents(xml, true);
     }
-    // TODO Other formats
+    @Test
+    public void testXLSX() throws Exception {
+        XMLResult result = getXML("test-columnar.xlsx");
+        String xml = result.xml;
+        assertHeaders(xml, false, true, false);
+        assertContents(xml, true);
+    }
+    // TODO Test ODS
+    
+    // TODO Test other formats, e.g. Database formats
 
     /**
      * Note - we don't have a dedicated CSV parser