You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/07/27 14:16:53 UTC
[tika] 19/30: Use patterns to handle the date format variations
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 81caa71785d3e76df7bee93e71627c4f90a29323
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Thu May 10 16:59:09 2018 +0100
Use patterns to handle the date format variations
---
.../org/apache/tika/parser/TabularFormatsTest.java | 101 ++++++++++++---------
1 file changed, 56 insertions(+), 45 deletions(-)
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java
index 80a7f56..119c9cd 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/TabularFormatsTest.java
@@ -18,10 +18,11 @@ package org.apache.tika.parser;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
import java.util.Arrays;
import java.util.List;
-import java.util.Locale;
+import java.util.regex.Pattern;
import org.apache.tika.TikaTest;
import org.junit.Test;
@@ -45,14 +46,14 @@ public class TabularFormatsTest extends TikaTest {
/**
* Expected values, by <em>column</em>
*/
- protected static final String[][] table = new String[][] {
+ protected static final Object[][] table = new Object[][] {
new String[] {
"0","1","2","3","4","5","6","7","8","9","10"
},
new String[] {
"0","1","4","9","16","25","36","49","64","81","100"
},
- new String[] {}, // Done later
+ new String[] {}, // Generated later
new String[] {
"0%","10%","20%","30%","40%","50%",
"60%","70%","80%","90%","100%"
@@ -62,37 +63,44 @@ public class TabularFormatsTest extends TikaTest {
"75.0%","80.0%","83.3%","85.7%",
"87.5%","88.9%","90.0%"
},
- new String[] {
- "01-01-1960", "02-01-1960", "17-01-1960",
- "22-03-1960", "13-09-1960", "17-09-1961",
- "20-07-1963", "29-07-1966", "20-03-1971",
- "18-12-1977", "19-05-1987"
+ new Pattern[] {
+ Pattern.compile("01-(01|JAN|Jan)-(60|1960)"),
+ Pattern.compile("02-01-1960"),
+ Pattern.compile("17-01-1960"),
+ Pattern.compile("22-03-1960"),
+ Pattern.compile("13-09-1960"),
+ Pattern.compile("17-09-1961"),
+ Pattern.compile("20-07-1963"),
+ Pattern.compile("29-07-1966"),
+ Pattern.compile("20-03-1971"),
+ Pattern.compile("18-12-1977"),
+ Pattern.compile("19-05-1987"),
},
- new String[] {
- "01JAN60:00:00:01",
- "01JAN60:00:00:10",
- "01JAN60:00:01:40",
- "01JAN60:00:16:40",
- "01JAN60:02:46:40",
- "02JAN60:03:46:40",
- "12JAN60:13:46:40",
- "25APR60:17:46:40",
- "03MAR63:09:46:40",
- "09SEP91:01:46:40",
- "19NOV76:17:46:40"
+ new Pattern[] {
+ Pattern.compile("01(JAN|Jan)(60|1960):00:00:01(.00)?"),
+ Pattern.compile("01(JAN|Jan)(60|1960):00:00:10(.00)?"),
+ Pattern.compile("01(JAN|Jan)(60|1960):00:01:40(.00)?"),
+ Pattern.compile("01(JAN|Jan)(60|1960):00:16:40(.00)?"),
+ Pattern.compile("01(JAN|Jan)(60|1960):02:46:40(.00)?"),
+ Pattern.compile("02(JAN|Jan)(60|1960):03:46:40(.00)?"),
+ Pattern.compile("12(JAN|Jan)(60|1960):13:46:40(.00)?"),
+ Pattern.compile("25(APR|Apr)(60|1960):17:46:40(.00)?"),
+ Pattern.compile("03(MAR|Mar)(63|1963):09:46:40(.00)?"),
+ Pattern.compile("09(SEP|Sep)(91|1991):01:46:40(.00)?"),
+ Pattern.compile("19(NOV|Nov)(76|2276):17:46:40(.00)?")
},
- new String[] {
- "0:00:01",
- "0:00:03",
- "0:00:09",
- "0:00:27",
- "0:01:21",
- "0:04:03",
- "0:12:09",
- "0:36:27",
- "1:49:21",
- "5:28:03",
- "16:24:09"
+ new Pattern[] {
+ Pattern.compile("0?0:00:01(.\\d\\d)?"),
+ Pattern.compile("0?0:00:03(.\\d\\d)?"),
+ Pattern.compile("0?0:00:09(.\\d\\d)?"),
+ Pattern.compile("0?0:00:27(.\\d\\d)?"),
+ Pattern.compile("0?0:01:21(.\\d\\d)?"),
+ Pattern.compile("0?0:04:03(.\\d\\d)?"),
+ Pattern.compile("0?0:12:09(.\\d\\d)?"),
+ Pattern.compile("0?0:36:27(.\\d\\d)?"),
+ Pattern.compile("0?1:49:21(.\\d\\d)?"),
+ Pattern.compile("0?5:28:03(.\\d\\d)?"),
+ Pattern.compile("16:24:09(.\\d\\d)?")
}
};
static {
@@ -106,11 +114,6 @@ public class TabularFormatsTest extends TikaTest {
// correctly format these...
protected static final List<Integer> percentageColumns =
Arrays.asList(new Integer[] { 3, 4 });
- // Which columns hold dates? Some parsers output
- // bits of the month in lower case, some all upper, eg JAN vs Jan
- protected static final List<Integer> dateColumns =
- Arrays.asList(new Integer[] { 5, 6 });
- // TODO Handle 60 vs 1960
protected static String[] toCells(String row, boolean isTH) {
// Split into cells, ignoring stuff before first cell
@@ -194,13 +197,17 @@ public class TabularFormatsTest extends TikaTest {
// If the parser doesn't know about % formats,
// skip the cell if the column in a % one
if (!doesPercents && percentageColumns.contains(cn)) continue;
- if (dateColumns.contains(cn)) val = val.toUpperCase(Locale.ROOT);
// Ignore cell attributes
if (! val.isEmpty()) val = val.split(">")[1];
// Check
- assertEquals("Wrong text in row " + (rn+1) + " and column " + (cn+1),
- table[cn][rn], val);
+ String error = "Wrong text in row " + (rn+1) + " and column " +
+ (cn+1) + " - " + table[cn][rn] + " vs " + val;
+ if (table[cn][rn] instanceof String) {
+ assertEquals(error, table[cn][rn], val);
+ } else {
+ assertTrue(error, ((Pattern)table[cn][rn]).matcher(val).matches());
+ }
}
}
}
@@ -212,7 +219,7 @@ public class TabularFormatsTest extends TikaTest {
assertHeaders(xml, true, true, true);
// TODO Wait for https://github.com/epam/parso/issues/28 to be fixed
// then check the % formats again
-// assertContents(xml, true, false);
+ assertContents(xml, true, false);
}
@Test
public void testXLS() throws Exception {
@@ -230,7 +237,7 @@ public class TabularFormatsTest extends TikaTest {
// TODO Correctly handle empty cells then test
//assertContents(xml, true, false);
}
- // TODO Test ODS
+ // TODO Test OpenDocument ODS
// TODO Test other formats, eg Database formats
@@ -249,9 +256,13 @@ public class TabularFormatsTest extends TikaTest {
for (String label : columnLabels) {
assertContains(label, xml);
}
- for (String[] vals : table) {
- for (String val : vals) {
- assertContains(val, xml);
+ for (Object[] vals : table) {
+ for (Object val : vals) {
+ if (val instanceof String)
+ assertContains((String)val, xml);
+ else if (val instanceof Pattern)
+ assertTrue("Not matched: " + val,
+ ((Pattern)val).matcher(xml).find());
}
}
}