You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/07/08 18:16:49 UTC
[1/2] tika git commit: TIKA-2030 - add processing for <text:s/>
element in odt, thanks to David Pilato for identifying this.
Repository: tika
Updated Branches:
refs/heads/master 636060eb6 -> 8d29f7a62
TIKA-2030 - add processing for <text:s/> element in odt, thanks to David Pilato for identifying this.
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/c0320f14
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/c0320f14
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/c0320f14
Branch: refs/heads/master
Commit: c0320f14194608d31b9ffaae9250f28c46017b75
Parents: 95b2cd1
Author: tballison <ta...@mitre.org>
Authored: Fri Jul 8 14:15:50 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Fri Jul 8 14:15:50 2016 -0400
----------------------------------------------------------------------
.../parser/odf/OpenDocumentContentParser.java | 3 +++
.../test-documents/testOpenOffice2.odt | Bin 26448 -> 27554 bytes
2 files changed, 3 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/c0320f14/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
index a32d406..b40ed27 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
@@ -68,6 +68,7 @@ public class OpenDocumentContentParser extends AbstractParser {
private static final class OpenDocumentElementMappingContentHandler extends
ElementMappingContentHandler {
+ private static final char[] SPACE = new char[]{ ' '};
private final ContentHandler handler;
private final BitSet textNodeStack = new BitSet();
private int nodeDepth = 0;
@@ -283,6 +284,8 @@ public class OpenDocumentContentParser extends AbstractParser {
startList(attrs.getValue(TEXT_NS, "style-name"));
} else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
startSpan(attrs.getValue(TEXT_NS, "style-name"));
+ } else if (TEXT_NS.equals(namespaceURI) && "s".equals(localName)) {
+ handler.characters(SPACE, 0, 1);
} else {
super.startElement(namespaceURI, localName, qName, attrs);
}
http://git-wip-us.apache.org/repos/asf/tika/blob/c0320f14/tika-parsers/src/test/resources/test-documents/testOpenOffice2.odt
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testOpenOffice2.odt b/tika-parsers/src/test/resources/test-documents/testOpenOffice2.odt
index bc31925..f6c72b6 100644
Binary files a/tika-parsers/src/test/resources/test-documents/testOpenOffice2.odt and b/tika-parsers/src/test/resources/test-documents/testOpenOffice2.odt differ
[2/2] tika git commit: Merge remote-tracking branch 'origin/master'
Posted by ta...@apache.org.
Merge remote-tracking branch 'origin/master'
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/8d29f7a6
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/8d29f7a6
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/8d29f7a6
Branch: refs/heads/master
Commit: 8d29f7a625e932fe31d0611e894f4f1c485457c4
Parents: c0320f1 636060e
Author: tballison <ta...@mitre.org>
Authored: Fri Jul 8 14:16:24 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Fri Jul 8 14:16:24 2016 -0400
----------------------------------------------------------------------
CHANGES.txt | 3 +
tika-bundle/pom.xml | 2 +-
tika-parsers/pom.xml | 17 +-
.../tika/parser/ocr/TesseractOCRConfig.java | 181 ++++++++++++++++++-
.../tika/parser/ocr/TesseractOCRParser.java | 113 +++++++++++-
.../parser/ocr/TesseractOCRConfig.properties | 13 +-
.../org/apache/tika/parser/ocr/rotation.py | 72 ++++++++
.../tika/parser/ocr/TesseractOCRConfigTest.java | 61 ++++++-
.../tika/parser/ocr/TesseractOCRParserTest.java | 18 +-
.../TesseractOCRConfig-full.properties | 6 +
.../TesseractOCRConfig-partial.properties | 8 +-
11 files changed, 483 insertions(+), 11 deletions(-)
----------------------------------------------------------------------