You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/09/18 13:01:59 UTC

[tika] branch master updated: TIKA-2429 -- upgrade to POI 3.17, last version of POI that runs on Java < 1.8

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new f16bd0e  TIKA-2429 -- upgrade to POI 3.17, last version of POI that runs on Java < 1.8
f16bd0e is described below

commit f16bd0e2dc8b86fc3cc58b9a80dc7544d549651d
Author: tballison <ta...@mitre.org>
AuthorDate: Mon Sep 18 09:01:43 2017 -0400

    TIKA-2429 -- upgrade to POI 3.17, last version of POI that runs on Java < 1.8
---
 CHANGES.txt                                        |  2 ++
 tika-eval/pom.xml                                  |  2 +-
 tika-parsers/pom.xml                               |  2 +-
 .../apache/tika/parser/microsoft/WMFParser.java    |  4 ++--
 .../microsoft/ooxml/AbstractOOXMLExtractor.java    | 24 ++++++++--------------
 .../ooxml/SXWPFWordExtractorDecorator.java         |  3 ++-
 .../ooxml/XSSFBExcelExtractorDecorator.java        |  3 ++-
 .../ooxml/XSSFExcelExtractorDecorator.java         |  4 +++-
 .../ooxml/XWPFWordExtractorDecorator.java          |  9 ++++++--
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |  1 -
 10 files changed, 29 insertions(+), 25 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 26ad26e..7ad0f99 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 1.17 - ???
 
+  * Upgrade to POI 3.17 (TIKA-2429).
+
   * Enabling extraction of standard references from text (TIKA-2449).
 
   * Load external custom mimetypes XML from system property 
diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml
index e92ae8a..2f72371 100644
--- a/tika-eval/pom.xml
+++ b/tika-eval/pom.xml
@@ -37,7 +37,7 @@
         <cli.version>1.4</cli.version> <!--sync version with tika-server or move to parent? -->
         <!-- upgrade to 6.x or something more modern once Tika requires Java 1.8 -->
         <lucene.version>5.5.4</lucene.version>
-        <poi.version>3.17-beta1</poi.version>
+        <poi.version>3.17</poi.version>
     </properties>
 
     <dependencies>
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 34c3669..2f3e2a9 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -35,7 +35,7 @@
   <url>http://tika.apache.org/</url>
 
   <properties>
-    <poi.version>3.17-beta1</poi.version>
+    <poi.version>3.17</poi.version>
     <!-- NOTE: sync codec version with POI -->
     <codec.version>1.10</codec.version>
     <!-- NOTE: sync tukaani version with commons-compress in tika-parent-->
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java
index aef09f9..e0a2507 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java
@@ -71,9 +71,9 @@ public class WMFParser extends AbstractParser {
                 //This fix should be done within POI
                 if (record.getRecordType().equals(HwmfRecordType.createFontIndirect)) {
                     HwmfFont font = ((HwmfText.WmfCreateFontIndirect) record).getFont();
-                    charset = (font.getCharSet() == null || font.getCharSet().getCharset() == null)
+                    charset = (font.getCharset() == null || font.getCharset().getCharset() == null)
                             ? LocaleUtil.CHARSET_1252 :
-                            font.getCharSet().getCharset();
+                            font.getCharset().getCharset();
                 }
                 if (record.getRecordType().equals(HwmfRecordType.extTextOut)) {
                     HwmfText.WmfExtTextOut textOut = (HwmfText.WmfExtTextOut) record;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 0b3bbd6..4e1bfd6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -44,6 +44,7 @@ import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.Ole10Native;
 import org.apache.poi.poifs.filesystem.Ole10NativeException;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.xssf.usermodel.XSSFRelation;
 import org.apache.poi.xwpf.usermodel.XWPFRelation;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
@@ -77,23 +78,16 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
 
 
     static final String RELATION_AUDIO = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/audio";
-    static final String RELATION_IMAGE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image";
-    static final String RELATION_OLE_OBJECT = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/oleObject";
-    static final String RELATION_PACKAGE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/package";
-    static final String RELATION_MACRO = "http://schemas.microsoft.com/office/2006/relationships/vbaProject";
-    static final String RELATION_OFFICE_DOCUMENT = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
     static final String RELATION_DIAGRAM_DATA = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/diagramData";
-    //once we add this to XWPFRelation, we should swap that out and remove this
-    static final String RELATION_CHART = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/chart";
 
     private static final String TYPE_OLE_OBJECT =
             "application/vnd.openxmlformats-officedocument.oleObject";
 
     protected final static String[] EMBEDDED_RELATIONSHIPS = new String[]{
             RELATION_AUDIO,
-            RELATION_IMAGE,
-            RELATION_PACKAGE,
-            RELATION_OFFICE_DOCUMENT,
+            PackageRelationshipTypes.IMAGE_PART,
+            POIXMLDocument.PACK_OBJECT_REL_TYPE,
+            PackageRelationshipTypes.CORE_DOCUMENT,
             RELATION_DIAGRAM_DATA
     };
 
@@ -250,15 +244,15 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
             }
 
             String type = rel.getRelationshipType();
-            if (RELATION_OLE_OBJECT.equals(type)
+            if (POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type)
                     && TYPE_OLE_OBJECT.equals(target.getContentType())) {
                 handleEmbeddedOLE(target, handler, sourceDesc + rel.getId(), parentMetadata);
             } else if (RELATION_AUDIO.equals(type)
-                    || RELATION_IMAGE.equals(type)
-                    || RELATION_PACKAGE.equals(type)
-                    || RELATION_OLE_OBJECT.equals(type)) {
+                    || PackageRelationshipTypes.IMAGE_PART.equals(type)
+                    || POIXMLDocument.PACK_OBJECT_REL_TYPE.equals(type)
+                    || POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type)) {
                 handleEmbeddedFile(target, handler, sourceDesc + rel.getId());
-            } else if (RELATION_MACRO.equals(type)) {
+            } else if (XSSFRelation.VBA_MACROS.getRelation().equals(type)) {
                 handleMacros(target, handler);
             }
         }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index 5c7352e..287a129 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -30,6 +30,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.openxml4j.opc.PackagePart;
 import org.apache.poi.openxml4j.opc.PackageRelationship;
 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.xssf.usermodel.XSSFRelation;
 import org.apache.poi.xwpf.usermodel.XWPFNumbering;
 import org.apache.poi.xwpf.usermodel.XWPFRelation;
 import org.apache.tika.exception.TikaException;
@@ -156,7 +157,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
         //for now, just dump other components at end
         for (String rel : new String[]{
                 AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA,
-                AbstractOOXMLExtractor.RELATION_CHART,
+                XSSFRelation.CHART.getRelation(),
                 XWPFRelation.FOOTNOTE.getRelation(),
                 XWPFRelation.COMMENT.getRelation(),
                 XWPFRelation.FOOTER.getRelation(),
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
index a7516ac..0a511c2 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
@@ -30,6 +30,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.openxml4j.opc.PackagePart;
 import org.apache.poi.openxml4j.opc.PackagePartName;
 import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
 import org.apache.poi.openxml4j.opc.PackagingURIHelper;
 import org.apache.poi.openxml4j.opc.TargetMode;
 import org.apache.poi.xssf.binary.XSSFBCommentsTable;
@@ -307,7 +308,7 @@ public class XSSFBExcelExtractorDecorator extends XSSFExcelExtractorDecorator {
         //add main document so that macros can be extracted
         //by AbstractOOXMLExtractor
         for (PackagePart part : extractor.getPackage().
-                getPartsByRelationshipType(RELATION_OFFICE_DOCUMENT)) {
+                getPartsByRelationshipType(PackageRelationshipTypes.CORE_DOCUMENT)) {
             parts.add(part);
         }
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index c12059e..c3b7285 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -25,6 +25,7 @@ import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 
+import org.apache.poi.POIXMLDocument;
 import org.apache.poi.POIXMLTextExtractor;
 import org.apache.poi.hssf.extractor.ExcelExtractor;
 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
@@ -33,6 +34,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.openxml4j.opc.PackagePart;
 import org.apache.poi.openxml4j.opc.PackagePartName;
 import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
 import org.apache.poi.openxml4j.opc.PackagingURIHelper;
 import org.apache.poi.openxml4j.opc.TargetMode;
 import org.apache.poi.ss.usermodel.DataFormatter;
@@ -383,7 +385,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
         //add main document so that macros can be extracted
         //by AbstractOOXMLExtractor
         for (PackagePart part : extractor.getPackage().
-                getPartsByRelationshipType(RELATION_OFFICE_DOCUMENT)) {
+                getPartsByRelationshipType(PackageRelationshipTypes.CORE_DOCUMENT)) {
             parts.add(part);
         }
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
index 181f777..55a38fd 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
@@ -25,6 +25,7 @@ import java.util.List;
 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
 import org.apache.poi.openxml4j.opc.PackagePart;
 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.xssf.usermodel.XSSFRelation;
 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
 import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
 import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
@@ -132,7 +133,7 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
         );
         //handle chart data
         handleGeneralTextContainingPart(
-                AbstractOOXMLExtractor.RELATION_CHART,
+                XSSFRelation.CHART.getRelation(),
                 "chart",
                 document.getPackagePart(),
                 metadata,
@@ -394,7 +395,11 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
             tfmtg.setItalic(run.isItalic());
         }
 
-        xhtml.characters(run.toString());
+        if (config.getConcatenatePhoneticRuns()) {
+            xhtml.characters(run.toString());
+        } else {
+            xhtml.characters(run.text());
+        }
 
         // If we have any pictures, output them
         for (XWPFPicture picture : run.getEmbeddedPictures()) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 55946b2..b8b3dd1 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1738,7 +1738,6 @@ public class OOXMLParserTest extends TikaTest {
     }
 
     @Test
-    @Ignore("to be fixed in > POI 3.17")
     public void testDOCXPhoneticStrings() throws Exception {
 
         assertContains("\u6771\u4EAC (\u3068\u3046\u304D\u3087\u3046)",

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.