You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/05/03 20:24:35 UTC
[tika] branch 2.x updated: TIKA-2352 -- bug fix for WordPerfect
parser via Pascal Essiembre. Pull request 176.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/2.x by this push:
new babb253 TIKA-2352 -- bug fix for WordPerfect parser via Pascal Essiembre. Pull request 176.
babb253 is described below
commit babb2534e163b182b3c55f5e02188302b5c4d07e
Author: tballison <ta...@mitre.org>
AuthorDate: Wed May 3 16:24:21 2017 -0400
TIKA-2352 -- bug fix for WordPerfect parser via Pascal Essiembre. Pull request 176.
---
.../wordperfect/WP6DocumentAreaExtractor.java | 36 ++++++++++++++++++++--
1 file changed, 34 insertions(+), 2 deletions(-)
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/wordperfect/WP6DocumentAreaExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/wordperfect/WP6DocumentAreaExtractor.java
index 862c858..5083711 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/wordperfect/WP6DocumentAreaExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/wordperfect/WP6DocumentAreaExtractor.java
@@ -17,7 +17,10 @@
package org.apache.tika.parser.wordperfect;
import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.commons.collections4.MapUtils;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
@@ -27,6 +30,29 @@ import org.xml.sax.SAXException;
* @author Pascal Essiembre
*/
class WP6DocumentAreaExtractor extends WPDocumentAreaExtractor {
+
+ /* 240-254 characters represent fixed-length multi-byte functions.
+ * Those that are not handled explicitly in the code below should be
+ * skipped according to their size (minus the first char if already read).
+ */
+ private static final Map<Integer, Integer> FIXED_LENGTH_FUNCTION_SIZES =
+ MapUtils.putAll(new HashMap<Integer, Integer>(), new Integer[] {
+ 240, 4, // Extended Character
+ 241, 5, // Undo
+ 242, 3, // Attribute On
+ 243, 3, // Attribute Off
+ 244, 3, // (Reserved)
+ 245, 3, // (Reserved)
+ 246, 4, // (Reserved)
+ 247, 4, // (Reserved)
+ 248, 4, // (Reserved)
+ 249, 5, // (Reserved)
+ 250, 5, // (Reserved)
+ 251, 6, // (Reserved)
+ 252, 6, // (Reserved)
+ 253, 8, // (Reserved)
+ 254, 8, // (Reserved)
+ });
protected void extract(int c, WPInputStream in, StringBuilder out, XHTMLContentHandler xhtml)
throws IOException, SAXException {
@@ -50,8 +76,9 @@ class WP6DocumentAreaExtractor extends WPDocumentAreaExtractor {
out.append('\t');
} else if (c >= 180 && c <= 207) {
endParagraph(out, xhtml);
+
+ // 208-239: variable-length multi-byte function
} else if (c >= 208 && c <= 239) {
- // Variable-Length Multi-Byte Functions
int subgroup = in.readWP();
int functionSize = in.readWPShort();
for (int i = 0; i < functionSize - 4; i++) {
@@ -85,9 +112,14 @@ class WP6DocumentAreaExtractor extends WPDocumentAreaExtractor {
int charset = in.readWP();
in.readWP(); // closing character
WP6Charsets.append(out, charset, charval);
+
+ // 241-254: fixed-length multi-byte function
} else if (c >= 241 && c <= 254) {
- skipUntilChar(in, c);
+ // removing 1 from function length since first char already read
+ in.skipWPByte(FIXED_LENGTH_FUNCTION_SIZES.get(c) - 1);
} else if (c == 255) {
+ // Should not be used so this line should not be called.
+ // We still have this code in case a future version uses it.
skipUntilChar(in, c);
}
--
To stop receiving notification emails like this one, please contact
['"commits@tika.apache.org" <co...@tika.apache.org>'].