You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/07/27 14:16:49 UTC
[tika] 15/30: Check header contents, check data rows count, add XLSX test
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 65af2d99be50c00fedd6261e11df9f60bd05d7ad
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Thu May 10 15:13:43 2018 +0100
Check header contents, check data rows count, add XLSX test
---
.../org/apache/tika/parser/TabularFormatsTest.java | 77 +++++++++++++++++-----
1 file changed, 61 insertions(+), 16 deletions(-)
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java
index 8574d37..023f49d 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java
@@ -31,7 +31,7 @@ import org.junit.Test;
*/
public class TabularFormatsTest extends TikaTest {
protected static final String[] columnNames = new String[] {
- "recnum","square","desc","pctdone","pctinc",
+ "recnum","square","desc","pctdone","pctincr",
"date","datetime","time"
};
protected static final String[] columnLabels = new String[] {
@@ -49,8 +49,9 @@ public class TabularFormatsTest extends TikaTest {
"0","1","2","3","4","5","6","7","8","9","10"
},
new String[] {
- "0","1","4" // etc
+ "0","1","4","9","16","25","36","49","64","81","100"
},
+/*
new String[] { // etc
"01-01-1960"
},
@@ -59,37 +60,72 @@ public class TabularFormatsTest extends TikaTest {
new String[] {
""
}
+*/
};
-
- protected void assertHeaders(String xml, boolean isTH, boolean hasLabel, boolean hasName) {
- // Find the first row
- int splitAt = xml.indexOf("</tr>");
- String hRow = xml.substring(0, splitAt);
- splitAt = xml.indexOf("<tr>");
- hRow = hRow.substring(splitAt+4);
-
+
+ protected static String[] toCells(String row, boolean isTH) {
// Split into cells, ignoring stuff before first cell
String[] cells;
if (isTH) {
- cells = hRow.split("<th");
+ cells = row.split("<th");
} else {
- cells = hRow.split("<td");
+ cells = row.split("<td");
}
cells = Arrays.copyOfRange(cells, 1, cells.length);
for (int i=0; i<cells.length; i++) {
- splitAt = cells[i].lastIndexOf("</");
+ int splitAt = cells[i].lastIndexOf("</");
cells[i] = cells[i].substring(0, splitAt).trim();
}
+ return cells;
+ }
+
+ protected void assertHeaders(String xml, boolean isTH, boolean hasLabel, boolean hasName) {
+ // Find the first row
+ int splitAt = xml.indexOf("</tr>");
+ String hRow = xml.substring(0, splitAt);
+ splitAt = xml.indexOf("<tr>");
+ hRow = hRow.substring(splitAt+4);
+
+ // Split into cells, ignoring stuff before first cell
+ String[] cells = toCells(hRow, isTH);
// Check we got the right number
assertEquals("Wrong number of cells in header row " + hRow,
columnLabels.length, cells.length);
// Check we got the right stuff
- // TODO
+ for (int i=0; i<cells.length; i++) {
+ if (hasLabel && hasName) {
+ assertContains("title=\"" + columnNames[i] + "\"", cells[i]);
+ assertContains(">" + columnLabels[i], cells[i]);
+ } else if (hasName) {
+ assertContains(">" + columnNames[i], cells[i]);
+ } else {
+ assertContains(">" + columnLabels[i], cells[i]);
+ }
+ }
}
protected void assertContents(String xml, boolean hasHeader) {
- // TODO Check the rows
+ // Ignore anything before the first <tr>
+ // Ignore the header row if there is one
+ int ignores = 1;
+ if (hasHeader) ignores++;
+
+ // Split into rows, and discard the row closing (and anything after)
+ String[] rows = xml.split("<tr>");
+ rows = Arrays.copyOfRange(rows, ignores, rows.length);
+ for (int i=0; i<rows.length; i++) {
+ rows[i] = rows[i].split("</tr>")[0].trim();
+ }
+
+ // Check we got the right number of rows
+ for (int cn=0; cn<table.length; cn++) {
+ assertEquals("Wrong number of rows found compared to column " + (cn+1),
+ table[cn].length, rows.length);
+ }
+
+ // Check each row's values
+ // TODO
}
@Test
@@ -106,7 +142,16 @@ public class TabularFormatsTest extends TikaTest {
assertHeaders(xml, false, true, false);
assertContents(xml, true);
}
- // TODO Other formats
+ @Test
+ public void testXLSX() throws Exception {
+ XMLResult result = getXML("test-columnar.xlsx");
+ String xml = result.xml;
+ assertHeaders(xml, false, true, false);
+ assertContents(xml, true);
+ }
+ // TODO Test ODS
+
+ // TODO Test other formats, eg Database formats
/**
* Note - we don't have a dedicated CSV parser