You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/07/27 14:16:52 UTC
[tika] 18/30: Not all formats know about %s, dates not completely consistent either...
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 507f59ff2df6e3bcded201700c284d37a3b4cc62
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Thu May 10 16:33:45 2018 +0100
Not all formats know about %s, dates not completely consistent either...
---
.../org/apache/tika/parser/TabularFormatsTest.java | 33 ++++++++++++++++++----
1 file changed, 27 insertions(+), 6 deletions(-)
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java
index 7330f6a..80a7f56 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java
@@ -20,6 +20,8 @@ package org.apache.tika.parser;
import static org.junit.Assert.assertEquals;
import java.util.Arrays;
+import java.util.List;
+import java.util.Locale;
import org.apache.tika.TikaTest;
import org.junit.Test;
@@ -56,7 +58,7 @@ public class TabularFormatsTest extends TikaTest {
"60%","70%","80%","90%","100%"
},
new String[] {
- "M","0.0%","50.0%","66.7%",
+ "","0.0%","50.0%","66.7%",
"75.0%","80.0%","83.3%","85.7%",
"87.5%","88.9%","90.0%"
},
@@ -100,6 +102,15 @@ public class TabularFormatsTest extends TikaTest {
table[2][i] = "This is row " + i + " of 10";
}
}
+ // Which columns hold percentages? Not all parsers
+ // correctly format these...
+ protected static final List<Integer> percentageColumns =
+ Arrays.asList(new Integer[] { 3, 4 });
+ // Which columns hold dates? Some parsers output
+ // bits of the month in lower case, some all upper, eg JAN vs Jan
+ protected static final List<Integer> dateColumns =
+ Arrays.asList(new Integer[] { 5, 6 });
+ // TODO Handle 60 vs 1960
protected static String[] toCells(String row, boolean isTH) {
// Split into cells, ignoring stuff before first cell
@@ -152,7 +163,7 @@ public class TabularFormatsTest extends TikaTest {
}
}
}
- protected void assertContents(String xml, boolean hasHeader) {
+ protected void assertContents(String xml, boolean hasHeader, boolean doesPercents) {
// Ignore anything before the first <tr>
// Ignore the header row if there is one
int ignores = 1;
@@ -178,8 +189,14 @@ public class TabularFormatsTest extends TikaTest {
table.length, cells.length);
for (int cn=0; cn<table.length; cn++) {
+ String val = cells[cn];
+
+ // If the parser doesn't know about % formats,
+ // skip the cell if the column is a % one
+ if (!doesPercents && percentageColumns.contains(cn)) continue;
+ if (dateColumns.contains(cn)) val = val.toUpperCase(Locale.ROOT);
+
// Ignore cell attributes
- String val = cells.length > (cn-1) ? cells[cn] : "";
if (! val.isEmpty()) val = val.split(">")[1];
// Check
assertEquals("Wrong text in row " + (rn+1) + " and column " + (cn+1),
@@ -193,21 +210,25 @@ public class TabularFormatsTest extends TikaTest {
XMLResult result = getXML("test-columnar.sas7bdat");
String xml = result.xml;
assertHeaders(xml, true, true, true);
- //assertContents(xml, true);
+ // TODO Wait for https://github.com/epam/parso/issues/28 to be fixed
+ // then check the % formats again
+// assertContents(xml, true, false);
}
@Test
public void testXLS() throws Exception {
XMLResult result = getXML("test-columnar.xls");
String xml = result.xml;
assertHeaders(xml, false, true, false);
- //assertContents(xml, true);
+ // TODO Correctly handle empty cells then test
+ //assertContents(xml, true, false);
}
@Test
public void testXLSX() throws Exception {
XMLResult result = getXML("test-columnar.xlsx");
String xml = result.xml;
assertHeaders(xml, false, true, false);
- //assertContents(xml, true);
+ // TODO Correctly handle empty cells then test
+ //assertContents(xml, true, false);
}
// TODO Test ODS