You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/05/03 20:24:35 UTC

[tika] branch 2.x updated: TIKA-2352 -- bug fix for WordPerfect parser via Pascal Essiembre. Pull request 176.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/tika.git

The following commit(s) were added to refs/heads/2.x by this push:
       new  babb253   TIKA-2352 -- bug fix for WordPerfect parser via Pascal Essiembre. Pull request 176.
babb253 is described below

commit babb2534e163b182b3c55f5e02188302b5c4d07e
Author: tballison <ta...@mitre.org>
AuthorDate: Wed May 3 16:24:21 2017 -0400

    TIKA-2352 -- bug fix for WordPerfect parser via Pascal Essiembre. Pull request 176.
---
 .../wordperfect/WP6DocumentAreaExtractor.java      | 36 ++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/wordperfect/WP6DocumentAreaExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/wordperfect/WP6DocumentAreaExtractor.java
index 862c858..5083711 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/wordperfect/WP6DocumentAreaExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/wordperfect/WP6DocumentAreaExtractor.java
@@ -17,7 +17,10 @@
 package org.apache.tika.parser.wordperfect;
 
 import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
 
+import org.apache.commons.collections4.MapUtils;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
 
@@ -27,6 +30,29 @@ import org.xml.sax.SAXException;
  * @author Pascal Essiembre
  */
 class WP6DocumentAreaExtractor extends WPDocumentAreaExtractor {
+
+    /* 240-254 characters represent fixed-length multi-byte functions.  
+     * Those that are not handled explicitly in the code below should be
+     * skipped according to their size (minus the first char if already read).
+     */
+    private static final Map<Integer, Integer> FIXED_LENGTH_FUNCTION_SIZES = 
+            MapUtils.putAll(new HashMap<Integer, Integer>(), new Integer[] {
+        240, 4,  // Extended Character
+        241, 5,  // Undo
+        242, 3,  // Attribute On
+        243, 3,  // Attribute Off
+        244, 3,  // (Reserved)
+        245, 3,  // (Reserved)
+        246, 4,  // (Reserved)
+        247, 4,  // (Reserved)
+        248, 4,  // (Reserved)
+        249, 5,  // (Reserved)
+        250, 5,  // (Reserved)
+        251, 6,  // (Reserved)
+        252, 6,  // (Reserved)
+        253, 8,  // (Reserved)
+        254, 8,  // (Reserved)
+    });    
     
     protected void extract(int c, WPInputStream in, StringBuilder out, XHTMLContentHandler xhtml)
             throws IOException, SAXException {
@@ -50,8 +76,9 @@ class WP6DocumentAreaExtractor extends WPDocumentAreaExtractor {
             out.append('\t');
         } else if (c >= 180 && c <= 207) {
             endParagraph(out, xhtml);
+            
+        // 208-239: variable-length multi-byte function
         } else if (c >= 208 && c <= 239) {
-            // Variable-Length Multi-Byte Functions
             int subgroup = in.readWP();
             int functionSize = in.readWPShort();
             for (int i = 0; i < functionSize - 4; i++) {
@@ -85,9 +112,14 @@ class WP6DocumentAreaExtractor extends WPDocumentAreaExtractor {
             int charset = in.readWP();
             in.readWP(); // closing character
             WP6Charsets.append(out, charset, charval);
+            
+        // 241-254: fixed-length multi-byte function
         } else if (c >= 241 && c <= 254) {
-            skipUntilChar(in, c);
+            // removing 1 from function length since first char already read
+            in.skipWPByte(FIXED_LENGTH_FUNCTION_SIZES.get(c) - 1);            
         } else if (c == 255) {
+            // Character 255 should not be used, so this branch should not be reached.
+            // We still keep this code in case a future version uses it.
             skipUntilChar(in, c);
         }
         

-- 
To stop receiving notification emails like this one, please contact
['"commits@tika.apache.org" <co...@tika.apache.org>'].