You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/07/08 18:16:49 UTC

[1/2] tika git commit: TIKA-2030 - add processing for <text:s/> element in odt, thanks to David Pilato for identifying this.

Repository: tika
Updated Branches:
  refs/heads/master 636060eb6 -> 8d29f7a62


TIKA-2030 - add processing for <text:s/> element in odt, thanks to David Pilato for identifying this.


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/c0320f14
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/c0320f14
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/c0320f14

Branch: refs/heads/master
Commit: c0320f14194608d31b9ffaae9250f28c46017b75
Parents: 95b2cd1
Author: tballison <ta...@mitre.org>
Authored: Fri Jul 8 14:15:50 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Fri Jul 8 14:15:50 2016 -0400

----------------------------------------------------------------------
 .../parser/odf/OpenDocumentContentParser.java   |   3 +++
 .../test-documents/testOpenOffice2.odt          | Bin 26448 -> 27554 bytes
 2 files changed, 3 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/c0320f14/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
index a32d406..b40ed27 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
@@ -68,6 +68,7 @@ public class OpenDocumentContentParser extends AbstractParser {
 
     private static final class OpenDocumentElementMappingContentHandler extends
             ElementMappingContentHandler {
+        private static final char[] SPACE = new char[]{ ' '};
         private final ContentHandler handler;
         private final BitSet textNodeStack = new BitSet();
         private int nodeDepth = 0;
@@ -283,6 +284,8 @@ public class OpenDocumentContentParser extends AbstractParser {
                     startList(attrs.getValue(TEXT_NS, "style-name"));
                 } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
                     startSpan(attrs.getValue(TEXT_NS, "style-name"));
+                } else if (TEXT_NS.equals(namespaceURI) && "s".equals(localName)) {
+                    handler.characters(SPACE, 0, 1);
                 } else {
                     super.startElement(namespaceURI, localName, qName, attrs);
                 }

http://git-wip-us.apache.org/repos/asf/tika/blob/c0320f14/tika-parsers/src/test/resources/test-documents/testOpenOffice2.odt
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testOpenOffice2.odt b/tika-parsers/src/test/resources/test-documents/testOpenOffice2.odt
index bc31925..f6c72b6 100644
Binary files a/tika-parsers/src/test/resources/test-documents/testOpenOffice2.odt and b/tika-parsers/src/test/resources/test-documents/testOpenOffice2.odt differ


[2/2] tika git commit: Merge remote-tracking branch 'origin/master'

Posted by ta...@apache.org.
Merge remote-tracking branch 'origin/master'


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/8d29f7a6
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/8d29f7a6
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/8d29f7a6

Branch: refs/heads/master
Commit: 8d29f7a625e932fe31d0611e894f4f1c485457c4
Parents: c0320f1 636060e
Author: tballison <ta...@mitre.org>
Authored: Fri Jul 8 14:16:24 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Fri Jul 8 14:16:24 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                     |   3 +
 tika-bundle/pom.xml                             |   2 +-
 tika-parsers/pom.xml                            |  17 +-
 .../tika/parser/ocr/TesseractOCRConfig.java     | 181 ++++++++++++++++++-
 .../tika/parser/ocr/TesseractOCRParser.java     | 113 +++++++++++-
 .../parser/ocr/TesseractOCRConfig.properties    |  13 +-
 .../org/apache/tika/parser/ocr/rotation.py      |  72 ++++++++
 .../tika/parser/ocr/TesseractOCRConfigTest.java |  61 ++++++-
 .../tika/parser/ocr/TesseractOCRParserTest.java |  18 +-
 .../TesseractOCRConfig-full.properties          |   6 +
 .../TesseractOCRConfig-partial.properties       |   8 +-
 11 files changed, 483 insertions(+), 11 deletions(-)
----------------------------------------------------------------------