You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/03/28 19:13:40 UTC

[tika] branch branch_1x updated (c5cf55f -> ca9c2f5)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from c5cf55f  TIKA-2617 -- handle new IOOBE on streams now parsed as npoifs in ppt embedded streams as any other IOException on an embedded stream
     new e44a38d  Update forbiddenapis to version 2.5 and remove commons-io hack from pom.xml
     new ca9c2f5  TIKA-2618 -- avoid overwriting labels

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 tika-parent/pom.xml                                    |   8 ++------
 .../apache/tika/parser/microsoft/ExcelExtractor.java   |  17 ++++++++++++++++-
 .../apache/tika/parser/microsoft/ExcelParserTest.java  |   7 +++++++
 .../test-documents/testEXCEL_labels-govdocs-515858.xls | Bin 0 -> 57856 bytes
 4 files changed, 25 insertions(+), 7 deletions(-)
 create mode 100644 tika-parsers/src/test/resources/test-documents/testEXCEL_labels-govdocs-515858.xls

-- 
To stop receiving notification emails like this one, please contact
tallison@apache.org.

[tika] 01/02: Update forbiddenapis to version 2.5 and remove commons-io hack from pom.xml

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit e44a38d4b9d8a12ae20e357b5ad0bee2030dfc0c
Author: Uwe Schindler <us...@apache.org>
AuthorDate: Wed Mar 28 20:27:48 2018 +0200

    Update forbiddenapis to version 2.5 and remove commons-io hack from pom.xml
---
 tika-parent/pom.xml | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 03c8ea0..e464c1a 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -327,7 +327,7 @@
         <groupId>de.thetaphi</groupId>
         <artifactId>forbiddenapis</artifactId>
         <!-- if this version contains commons-io 2.6, remove hard-coded commons-io version below -->
-        <version>2.4.1</version>
+        <version>2.5</version>
         <configuration>
           <targetVersion>${maven.compiler.target}</targetVersion>
           <failOnUnresolvableSignatures>false</failOnUnresolvableSignatures>
@@ -337,11 +337,7 @@
             <bundledSignature>jdk-deprecated</bundledSignature>
             <bundledSignature>jdk-non-portable</bundledSignature>
             <bundledSignature>jdk-internal</bundledSignature>
-            <!--2.6 is the same as 2.5
-              TODO: change back to the following when we upgrade forbidden apis
-              <bundledSignature>commons-io-unsafe-${commons.io.version}</bundledSignature>
-            -->
-            <bundledSignature>commons-io-unsafe-2.5</bundledSignature>
+            <bundledSignature>commons-io-unsafe-${commons.io.version}</bundledSignature>
           </bundledSignatures>
         </configuration>
         <executions>

-- 
To stop receiving notification emails like this one, please contact
tallison@apache.org.

[tika] 02/02: TIKA-2618 -- avoid overwriting labels

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit ca9c2f53048e84a6c483165ba7779f8cb6393ec7
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Mar 28 15:12:02 2018 -0400

    TIKA-2618 -- avoid overwriting labels
---
 .../apache/tika/parser/microsoft/ExcelExtractor.java   |  17 ++++++++++++++++-
 .../apache/tika/parser/microsoft/ExcelParserTest.java  |   7 +++++++
 .../test-documents/testEXCEL_labels-govdocs-515858.xls | Bin 0 -> 57856 bytes
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
index 4ea8068..0dc33ee 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
@@ -541,7 +541,16 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
                 CellValueRecordInterface value =
                         (CellValueRecordInterface) record;
                 Point point = new Point(value.getColumn(), value.getRow());
-                currentSheet.put(point, cell);
+                if (currentSheet.containsKey(point)) {
+                    //avoid overwriting content
+                    //for now, add to extraTextCells
+                    //TODO: consider allowing multiple text pieces
+                    //per x,y to keep the text together
+                    extraTextCells.add(cell);
+                } else {
+                    currentSheet.put(point, cell);
+                }
+
             } else {
                 // Cell outside the worksheets
                 extraTextCells.add(cell);
@@ -650,6 +659,12 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
             }
 
             @Override
+            public void processRecord(Record record) {
+//                System.out.println(record.getClass() + " : "+record.toString());
+                super.processRecord(record);
+            }
+
+            @Override
             public String formatNumberDateCell(CellValueRecordInterface cell) {
                 String formatString = this.getFormatString(cell);
                 if (formatString != null && ! formatString.equals("General")) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 98f9259..75c972b 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -20,6 +20,7 @@ import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
+import java.io.File;
 import java.io.InputStream;
 import java.text.DecimalFormatSymbols;
 import java.util.List;
@@ -544,4 +545,10 @@ public class ExcelParserTest extends TikaTest {
                 getXML("testEXCEL_phonetic.xls", parser).xml);
 
     }
+
+    @Test
+    public void testLabelsAreExtracted() throws Exception {
+        String xml = getXML("testEXCEL_labels-govdocs-515858.xls").xml;
+        assertContains("Morocco", xml);
+    }
 }
diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_labels-govdocs-515858.xls b/tika-parsers/src/test/resources/test-documents/testEXCEL_labels-govdocs-515858.xls
new file mode 100644
index 0000000..fd29a76
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_labels-govdocs-515858.xls differ

-- 
To stop receiving notification emails like this one, please contact
tallison@apache.org.