You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/09/18 13:01:59 UTC
[tika] branch master updated: TIKA-2429 -- upgrade to POI 3.17,
last version of POI that runs on Java < 1.8
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new f16bd0e TIKA-2429 -- upgrade to POI 3.17, last version of POI that runs on Java < 1.8
f16bd0e is described below
commit f16bd0e2dc8b86fc3cc58b9a80dc7544d549651d
Author: tballison <ta...@mitre.org>
AuthorDate: Mon Sep 18 09:01:43 2017 -0400
TIKA-2429 -- upgrade to POI 3.17, last version of POI that runs on Java < 1.8
---
CHANGES.txt | 2 ++
tika-eval/pom.xml | 2 +-
tika-parsers/pom.xml | 2 +-
.../apache/tika/parser/microsoft/WMFParser.java | 4 ++--
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 24 ++++++++--------------
.../ooxml/SXWPFWordExtractorDecorator.java | 3 ++-
.../ooxml/XSSFBExcelExtractorDecorator.java | 3 ++-
.../ooxml/XSSFExcelExtractorDecorator.java | 4 +++-
.../ooxml/XWPFWordExtractorDecorator.java | 9 ++++++--
.../parser/microsoft/ooxml/OOXMLParserTest.java | 1 -
10 files changed, 29 insertions(+), 25 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 26ad26e..7ad0f99 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
Release 1.17 - ???
+ * Upgrade to POI 3.17 (TIKA-2429).
+
* Enabling extraction of standard references from text (TIKA-2449).
* Load external custom mimetypes XML from system property
diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml
index e92ae8a..2f72371 100644
--- a/tika-eval/pom.xml
+++ b/tika-eval/pom.xml
@@ -37,7 +37,7 @@
<cli.version>1.4</cli.version> <!--sync version with tika-server or move to parent? -->
<!-- upgrade to 6.x or something more modern once Tika requires Java 1.8 -->
<lucene.version>5.5.4</lucene.version>
- <poi.version>3.17-beta1</poi.version>
+ <poi.version>3.17</poi.version>
</properties>
<dependencies>
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 34c3669..2f3e2a9 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -35,7 +35,7 @@
<url>http://tika.apache.org/</url>
<properties>
- <poi.version>3.17-beta1</poi.version>
+ <poi.version>3.17</poi.version>
<!-- NOTE: sync codec version with POI -->
<codec.version>1.10</codec.version>
<!-- NOTE: sync tukaani version with commons-compress in tika-parent-->
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java
index aef09f9..e0a2507 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java
@@ -71,9 +71,9 @@ public class WMFParser extends AbstractParser {
//This fix should be done within POI
if (record.getRecordType().equals(HwmfRecordType.createFontIndirect)) {
HwmfFont font = ((HwmfText.WmfCreateFontIndirect) record).getFont();
- charset = (font.getCharSet() == null || font.getCharSet().getCharset() == null)
+ charset = (font.getCharset() == null || font.getCharset().getCharset() == null)
? LocaleUtil.CHARSET_1252 :
- font.getCharSet().getCharset();
+ font.getCharset().getCharset();
}
if (record.getRecordType().equals(HwmfRecordType.extTextOut)) {
HwmfText.WmfExtTextOut textOut = (HwmfText.WmfExtTextOut) record;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 0b3bbd6..4e1bfd6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -44,6 +44,7 @@ import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
@@ -77,23 +78,16 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
static final String RELATION_AUDIO = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/audio";
- static final String RELATION_IMAGE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image";
- static final String RELATION_OLE_OBJECT = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/oleObject";
- static final String RELATION_PACKAGE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/package";
- static final String RELATION_MACRO = "http://schemas.microsoft.com/office/2006/relationships/vbaProject";
- static final String RELATION_OFFICE_DOCUMENT = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
static final String RELATION_DIAGRAM_DATA = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/diagramData";
- //once we add this to XWPFRelation, we should swap that out and remove this
- static final String RELATION_CHART = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/chart";
private static final String TYPE_OLE_OBJECT =
"application/vnd.openxmlformats-officedocument.oleObject";
protected final static String[] EMBEDDED_RELATIONSHIPS = new String[]{
RELATION_AUDIO,
- RELATION_IMAGE,
- RELATION_PACKAGE,
- RELATION_OFFICE_DOCUMENT,
+ PackageRelationshipTypes.IMAGE_PART,
+ POIXMLDocument.PACK_OBJECT_REL_TYPE,
+ PackageRelationshipTypes.CORE_DOCUMENT,
RELATION_DIAGRAM_DATA
};
@@ -250,15 +244,15 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
}
String type = rel.getRelationshipType();
- if (RELATION_OLE_OBJECT.equals(type)
+ if (POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type)
&& TYPE_OLE_OBJECT.equals(target.getContentType())) {
handleEmbeddedOLE(target, handler, sourceDesc + rel.getId(), parentMetadata);
} else if (RELATION_AUDIO.equals(type)
- || RELATION_IMAGE.equals(type)
- || RELATION_PACKAGE.equals(type)
- || RELATION_OLE_OBJECT.equals(type)) {
+ || PackageRelationshipTypes.IMAGE_PART.equals(type)
+ || POIXMLDocument.PACK_OBJECT_REL_TYPE.equals(type)
+ || POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type)) {
handleEmbeddedFile(target, handler, sourceDesc + rel.getId());
- } else if (RELATION_MACRO.equals(type)) {
+ } else if (XSSFRelation.VBA_MACROS.getRelation().equals(type)) {
handleMacros(target, handler);
}
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index 5c7352e..287a129 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -30,6 +30,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xwpf.usermodel.XWPFNumbering;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.apache.tika.exception.TikaException;
@@ -156,7 +157,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
//for now, just dump other components at end
for (String rel : new String[]{
AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA,
- AbstractOOXMLExtractor.RELATION_CHART,
+ XSSFRelation.CHART.getRelation(),
XWPFRelation.FOOTNOTE.getRelation(),
XWPFRelation.COMMENT.getRelation(),
XWPFRelation.FOOTER.getRelation(),
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
index a7516ac..0a511c2 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
@@ -30,6 +30,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackagePartName;
import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.openxml4j.opc.TargetMode;
import org.apache.poi.xssf.binary.XSSFBCommentsTable;
@@ -307,7 +308,7 @@ public class XSSFBExcelExtractorDecorator extends XSSFExcelExtractorDecorator {
//add main document so that macros can be extracted
//by AbstractOOXMLExtractor
for (PackagePart part : extractor.getPackage().
- getPartsByRelationshipType(RELATION_OFFICE_DOCUMENT)) {
+ getPartsByRelationshipType(PackageRelationshipTypes.CORE_DOCUMENT)) {
parts.add(part);
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index c12059e..c3b7285 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -25,6 +25,7 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
+import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
@@ -33,6 +34,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackagePartName;
import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.openxml4j.opc.TargetMode;
import org.apache.poi.ss.usermodel.DataFormatter;
@@ -383,7 +385,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
//add main document so that macros can be extracted
//by AbstractOOXMLExtractor
for (PackagePart part : extractor.getPackage().
- getPartsByRelationshipType(RELATION_OFFICE_DOCUMENT)) {
+ getPartsByRelationshipType(PackageRelationshipTypes.CORE_DOCUMENT)) {
parts.add(part);
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
index 181f777..55a38fd 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
@@ -25,6 +25,7 @@ import java.util.List;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
@@ -132,7 +133,7 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
);
//handle chart data
handleGeneralTextContainingPart(
- AbstractOOXMLExtractor.RELATION_CHART,
+ XSSFRelation.CHART.getRelation(),
"chart",
document.getPackagePart(),
metadata,
@@ -394,7 +395,11 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
tfmtg.setItalic(run.isItalic());
}
- xhtml.characters(run.toString());
+ if (config.getConcatenatePhoneticRuns()) {
+ xhtml.characters(run.toString());
+ } else {
+ xhtml.characters(run.text());
+ }
// If we have any pictures, output them
for (XWPFPicture picture : run.getEmbeddedPictures()) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 55946b2..b8b3dd1 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1738,7 +1738,6 @@ public class OOXMLParserTest extends TikaTest {
}
@Test
- @Ignore("to be fixed in > POI 3.17")
public void testDOCXPhoneticStrings() throws Exception {
assertContains("\u6771\u4EAC (\u3068\u3046\u304D\u3087\u3046)",
--
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.