You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/03/28 19:12:14 UTC

[tika] branch master updated: TIKA-2618 -- avoid overwriting labels

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new 7a9b17f  TIKA-2618 -- avoid overwriting labels
7a9b17f is described below

commit 7a9b17f478c867c7df5516b4ebb2ce3bf8b0aa36
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Mar 28 15:12:02 2018 -0400

    TIKA-2618 -- avoid overwriting labels
---
 .../apache/tika/parser/microsoft/ExcelExtractor.java   |  17 ++++++++++++++++-
 .../apache/tika/parser/microsoft/ExcelParserTest.java  |   7 +++++++
 .../test-documents/testEXCEL_labels-govdocs-515858.xls | Bin 0 -> 57856 bytes
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
index 4ea8068..0dc33ee 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
@@ -541,7 +541,16 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
                 CellValueRecordInterface value =
                         (CellValueRecordInterface) record;
                 Point point = new Point(value.getColumn(), value.getRow());
-                currentSheet.put(point, cell);
+                if (currentSheet.containsKey(point)) {
+                    //avoid overwriting content
+                    //for now, add to extraTextCells
+                    //TODO: consider allowing multiple text pieces
+                    //per x,y to keep the text together
+                    extraTextCells.add(cell);
+                } else {
+                    currentSheet.put(point, cell);
+                }
+
             } else {
                 // Cell outside the worksheets
                 extraTextCells.add(cell);
@@ -650,6 +659,12 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
             }
 
             @Override
+            public void processRecord(Record record) {
+//                System.out.println(record.getClass() + " : "+record.toString());
+                super.processRecord(record);
+            }
+
+            @Override
             public String formatNumberDateCell(CellValueRecordInterface cell) {
                 String formatString = this.getFormatString(cell);
                 if (formatString != null && ! formatString.equals("General")) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 7d2b1e2..732c11c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -20,6 +20,7 @@ import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
+import java.io.File;
 import java.io.InputStream;
 import java.text.DecimalFormatSymbols;
 import java.util.List;
@@ -540,4 +541,10 @@ public class ExcelParserTest extends TikaTest {
                 getXML("testEXCEL_phonetic.xls", parser).xml);
 
     }
+
+    @Test
+    public void testLabelsAreExtracted() throws Exception {
+        String xml = getXML("testEXCEL_labels-govdocs-515858.xls").xml;
+        assertContains("Morocco", xml);
+    }
 }
diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_labels-govdocs-515858.xls b/tika-parsers/src/test/resources/test-documents/testEXCEL_labels-govdocs-515858.xls
new file mode 100644
index 0000000..fd29a76
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_labels-govdocs-515858.xls differ

-- 
To stop receiving notification emails like this one, please contact
tallison@apache.org.