You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by tallison@apache.org on 2018/03/28 19:12:14 UTC
[tika] branch master updated: TIKA-2618 -- avoid overwriting labels
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 7a9b17f TIKA-2618 -- avoid overwriting labels
7a9b17f is described below
commit 7a9b17f478c867c7df5516b4ebb2ce3bf8b0aa36
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Mar 28 15:12:02 2018 -0400
TIKA-2618 -- avoid overwriting labels
---
.../apache/tika/parser/microsoft/ExcelExtractor.java | 17 ++++++++++++++++-
.../apache/tika/parser/microsoft/ExcelParserTest.java | 7 +++++++
.../test-documents/testEXCEL_labels-govdocs-515858.xls | Bin 0 -> 57856 bytes
3 files changed, 23 insertions(+), 1 deletion(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
index 4ea8068..0dc33ee 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
@@ -541,7 +541,16 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
CellValueRecordInterface value =
(CellValueRecordInterface) record;
Point point = new Point(value.getColumn(), value.getRow());
- currentSheet.put(point, cell);
+ if (currentSheet.containsKey(point)) {
+ //avoid overwriting content
+ //for now, add to extraTextCells
+ //TODO: consider allowing multiple text pieces
+ //per x,y to keep the text together
+ extraTextCells.add(cell);
+ } else {
+ currentSheet.put(point, cell);
+ }
+
} else {
// Cell outside the worksheets
extraTextCells.add(cell);
@@ -650,6 +659,12 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
}
@Override
+ public void processRecord(Record record) {
+// System.out.println(record.getClass() + " : "+record.toString());
+ super.processRecord(record);
+ }
+
+ @Override
public String formatNumberDateCell(CellValueRecordInterface cell) {
String formatString = this.getFormatString(cell);
if (formatString != null && ! formatString.equals("General")) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 7d2b1e2..732c11c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -20,6 +20,7 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
+import java.io.File;
import java.io.InputStream;
import java.text.DecimalFormatSymbols;
import java.util.List;
@@ -540,4 +541,10 @@ public class ExcelParserTest extends TikaTest {
getXML("testEXCEL_phonetic.xls", parser).xml);
}
+
+ @Test
+ public void testLabelsAreExtracted() throws Exception {
+ String xml = getXML("testEXCEL_labels-govdocs-515858.xls").xml;
+ assertContains("Morocco", xml);
+ }
}
diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_labels-govdocs-515858.xls b/tika-parsers/src/test/resources/test-documents/testEXCEL_labels-govdocs-515858.xls
new file mode 100644
index 0000000..fd29a76
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_labels-govdocs-515858.xls differ
--
To stop receiving notification emails like this one, please contact
tallison@apache.org.