You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/05/04 01:29:19 UTC

[tika] branch 2.x updated: TIKA-2352 -- bug fix for WordPerfect parser via Pascal Essiembre. Pull request 176. Split to different change list...argh.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/tika.git

The following commit(s) were added to refs/heads/2.x by this push:
       new  fe3971a   TIKA-2352 -- bug fix for WordPerfect parser via Pascal Essiembre. Pull request 176. Split to different change list...argh.
fe3971a is described below

commit fe3971a69e203f38214071f6df65430d835592a0
Author: tballison <ta...@mitre.org>
AuthorDate: Wed May 3 21:29:11 2017 -0400

    TIKA-2352 -- bug fix for WordPerfect parser via Pascal Essiembre. Pull request 176.
    Split to different change list...argh.
---
 .../wordperfect/WP5DocumentAreaExtractor.java      | 36 +++++++++++++++++++---
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/wordperfect/WP5DocumentAreaExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/wordperfect/WP5DocumentAreaExtractor.java
index bd59725..01e5a0b 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/wordperfect/WP5DocumentAreaExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/wordperfect/WP5DocumentAreaExtractor.java
@@ -17,7 +17,10 @@
 package org.apache.tika.parser.wordperfect;
 
 import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
 
+import org.apache.commons.collections4.MapUtils;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
 
@@ -28,8 +31,32 @@ import org.xml.sax.SAXException;
  */
 class WP5DocumentAreaExtractor extends WPDocumentAreaExtractor {
     
-    protected void extract(int c, WPInputStream in, StringBuilder out, XHTMLContentHandler xhtml)
-            throws IOException, SAXException {
+    /* 192-207 characters represent fixed-length multi-byte functions.  
+     * Those that are not handled explicitly in the code below should be
+     * skipped according to their size (minus the first char if already read).
+     */
+    private static final Map<Integer, Integer> FIXED_LENGTH_FUNCTION_SIZES = 
+            MapUtils.putAll(new HashMap<Integer, Integer>(), new Integer[] {
+        192, 4,  // Extended character
+        193, 9,  // Center/Align/ Tab/Left Margin Release
+        194, 11, // Indent
+        195, 3,  // Attribute ON
+        196, 3,  // Attribute OFF
+        197, 5,  // Block Protect
+        198, 6,  // End of Indent
+        199, 7,  // Different Display Character when Hyphenated
+        200, 4,  // (Reserved)
+        201, 5,  // (Reserved)
+        202, 6,  // (Reserved)
+        203, 6,  // (Reserved)
+        204, 8,  // (Reserved)
+        205, 10, // (Reserved)
+        206, 10, // (Reserved)
+        207, 12, // (Reserved)
+    });
+    
+    protected void extract(int c, WPInputStream in, StringBuilder out, 
+            XHTMLContentHandler xhtml) throws IOException, SAXException {
 
         // 0-31: control characters
         if (c == 10) {
@@ -65,8 +92,9 @@ class WP5DocumentAreaExtractor extends WPDocumentAreaExtractor {
             in.readWP(); // closing character
             WP5Charsets.append(out, charset, charval);
         } else if (c >= 193 && c <= 207) {
-            skipUntilChar(in, c); // opening/closing chars are same
-
+            // removing 1 from function length since first char already read
+            in.skipWPByte(FIXED_LENGTH_FUNCTION_SIZES.get(c) - 1);
+            
         // 208-255: variable-length multi-byte function
         } else if (c >= 208 && c <= 255) {
             // Variable-Length Multi-Byte Functions

-- 
To stop receiving notification emails like this one, please contact
['"commits@tika.apache.org" <co...@tika.apache.org>'].