You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2011/09/15 18:42:07 UTC
svn commit: r1171171 [1/3] - in /tika/trunk: ./
tika-core/src/main/java/org/apache/tika/sax/
tika-core/src/main/java/org/apache/tika/utils/
tika-parsers/src/main/java/org/apache/tika/parser/rtf/
tika-parsers/src/test/java/org/apache/tika/ tika-parsers/...
Author: mikemccand
Date: Thu Sep 15 16:42:06 2011
New Revision: 1171171
URL: http://svn.apache.org/viewvc?rev=1171171&view=rev
Log:
TIKA-683: new RTF parser that performs its own direct shallow parse (instead of using RTFEditorKit from javax.swing)
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java (with props)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java (with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFBoldItalic.rtf (with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFControls.rtf (with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFInvalidUnicode.rtf (with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFUmlautSpaces2.rtf (with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFUnicodeUCNControlWordCharacterDoubling.rtf (with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFVarious.rtf (with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java
tika/trunk/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1171171&r1=1171170&r2=1171171&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Thu Sep 15 16:42:06 2011
@@ -6,6 +6,14 @@ The most notable changes in Tika 1.0 ove
* A parser for CHM help files was added. (TIKA-245)
+ * The RTF parser was rewritten to perform its own direct shallow
+ parse of the RTF content, instead of using RTFEditorKit from
+ javax.swing. This fixes several issues in the old parser,
+ including doubling of Unicode characters in certain cases
+ (TIKA-683), exceptions on malformed RTF docs (TIKA-666), and
+ missing text from some elements (header/footer, hyperlinks,
+ footnotes, text inside pictures).
+
Release 0.9 - 02/13/2011
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java?rev=1171171&r1=1171170&r2=1171171&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java Thu Sep 15 16:42:06 2011
@@ -16,6 +16,11 @@
*/
package org.apache.tika.sax;
+/*
+import java.util.ArrayList;
+import java.util.List;
+*/
+
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -200,12 +205,48 @@ public class SafeContentHandler extends
output.write(REPLACEMENT, 0, REPLACEMENT.length);
}
+
+ /*
+ private final List<String> elements = new ArrayList<String>();
+
+ // Called only from assert
+ private boolean verifyStartElement(String name) {
+ // TODO: we could strengthen this to do full
+ // XTHML validation, eg you shouldn't start p inside
+ // another p (but ODF parser, at least, seems to
+ // violate this):
+ //if (name.equals("p")) {
+ //assert elements.size() == 0 || !elements.get(elements.size()-1).equals("p");
+ //}
+ elements.add(name);
+ return true;
+ }
+
+ // Called only from assert
+ private boolean verifyEndElement(String name) {
+ assert elements.size() > 0: "end tag=" + name + " with no startElement";
+ final String currentElement = elements.get(elements.size()-1);
+ assert currentElement.equals(name): "mismatched elements open=" + currentElement + " close=" + name;
+ elements.remove(elements.size()-1);
+ return true;
+ }
+
+ // Called only from assert
+ private boolean verifyEndDocument() {
+ assert elements.size() == 0;
+ return true;
+ }
+ */
+
//------------------------------------------------------< ContentHandler >
@Override
public void startElement(
String uri, String localName, String name, Attributes atts)
throws SAXException {
+ // TODO: enable this, but some parsers currently
+ // trip it
+ //assert verifyStartElement(name);
// Look for any invalid characters in attribute values.
for (int i = 0; i < atts.getLength(); i++) {
if (isInvalid(atts.getValue(i))) {
@@ -231,6 +272,23 @@ public class SafeContentHandler extends
}
@Override
+ public void endElement(String uri, String localName, String name)
+ throws SAXException {
+ // TODO: enable this, but some parsers currently
+ // trip it
+ //assert verifyEndElement(name);
+ super.endElement(uri, localName, name);
+ }
+
+ @Override
+ public void endDocument() throws SAXException {
+ // TODO: enable this, but some parsers currently
+ // trip it
+ //assert verifyEndDocument();
+ super.endDocument();
+ }
+
+ @Override
public void characters(char[] ch, int start, int length)
throws SAXException {
filter(ch, start, length, charactersOutput);
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java?rev=1171171&r1=1171170&r2=1171171&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java Thu Sep 15 16:42:06 2011
@@ -16,6 +16,8 @@
*/
package org.apache.tika.utils;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.util.HashMap;
@@ -51,6 +53,9 @@ public class CharsetUtils {
*/
public static boolean isSupported(String charsetName) {
try {
+ if (isSupportedICU != null && ((Boolean) isSupportedICU.invoke(null, charsetName)).booleanValue()) {
+ return true;
+ }
return Charset.isSupported(charsetName);
} catch (IllegalCharsetNameException e) {
return false;
@@ -103,11 +108,54 @@ public class CharsetUtils {
}
try {
- Charset cs = Charset.forName(result);
+ Charset cs = forName(result);
return cs.name();
} catch (Exception e) {
return null;
}
}
+ private static Method getCharsetICU;
+ private static Method isSupportedICU;
+
+ static {
+ // See if we can load the icu4j CharsetICU class
+ Class icuCharset = null;
+ try {
+ icuCharset = CharsetUtils.class.getClassLoader().loadClass("com.ibm.icu.charset.CharsetICU");
+ }
+ catch (ClassNotFoundException e) {
+ }
+ if (icuCharset != null) {
+ try {
+ getCharsetICU = icuCharset.getMethod("forNameICU", String.class);
+ } catch (Throwable t) {
+ throw new RuntimeException(t);
+ }
+ try {
+ isSupportedICU = icuCharset.getMethod("isSupported", String.class);
+ } catch (Throwable t) {
+ }
+ // TODO: would be nice to somehow log that we
+ // successfully found ICU
+ }
+ }
+
+ /** Returns Charset impl, if one exists. This method
+ * optionally uses ICU4J's CharsetICU.forNameICU,
+ * if it is found on the classpath, else only uses
+ * JDK's builtin Charset.forName. */
+ public static Charset forName(String name) {
+ if (getCharsetICU != null) {
+ try {
+ Charset cs = (Charset) getCharsetICU.invoke(null, name);
+ if (cs != null) {
+ return cs;
+ }
+ } catch (InvocationTargetException ite) {
+ } catch (IllegalAccessException iae) {
+ }
+ }
+ return Charset.forName(name);
+ }
}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java?rev=1171171&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java Thu Sep 15 16:42:06 2011
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.rtf;
+
+/* Holds all state associated with current RTF group, ie {
+ * ... }. */
+
+class GroupState {
+ public int depth;
+ public boolean bold;
+ public boolean italic;
+ // True if we are skipping all text in current group,
+ // eg if group leads with a \*:
+ public boolean ignore;
+ // Default is 1 if no uc control has been seen yet:
+ public int ucSkip = 1;
+ public String fontCharset;
+
+ // Create default (root) GroupState
+ public GroupState() {
+ }
+
+ // Create new GroupState, inheriting all properties from current one, adding 1 to the depth
+ public GroupState(GroupState other) {
+ bold = other.bold;
+ italic = other.italic;
+ ignore = other.ignore;
+ ucSkip = other.ucSkip;
+ fontCharset = other.fontCharset;
+ depth = 1+other.depth;
+ }
+}
Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java?rev=1171171&r1=1171170&r2=1171171&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java Thu Sep 15 16:42:06 2011
@@ -16,33 +16,13 @@
*/
package org.apache.tika.parser.rtf;
-import java.io.BufferedOutputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.UnsupportedEncodingException;
import java.util.Collections;
-import java.util.HashMap;
-import java.util.LinkedList;
-import java.util.Map;
import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import javax.swing.text.AttributeSet;
-import javax.swing.text.BadLocationException;
-import javax.swing.text.DefaultStyledDocument;
-import javax.swing.text.Document;
-import javax.swing.text.StyleContext;
-import javax.swing.text.rtf.RTFEditorKit;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TaggedInputStream;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
@@ -59,60 +39,6 @@ public class RTFParser extends AbstractP
private static final Set<MediaType> SUPPORTED_TYPES = Collections
.singleton(MediaType.application("rtf"));
- private static final Pattern F_PATTERN = Pattern.compile("\\\\a?f([0-9]+)");
-
- private static final Pattern FCHARSET_PATTERN = Pattern
- .compile("\\\\fcharset[0-9]+");
-
- private static final Pattern ANSICPG_PATTERN = Pattern
- .compile("\\\\ansicpg[0-9]+");
-
- private static final Pattern DEFAULT_FONT_PATTERN = Pattern.compile("\\\\deff(0-9)+");
-
- private static final Pattern FONT_FAMILY_PATTERN = Pattern.compile("\\\\f(nil|roman|swiss|modern|script|decor|tech|bidi)");
-
- private static Map<Integer, String> FONTSET_MAP = new HashMap<Integer, String>();
- static {
- FONTSET_MAP.put(0, "windows-1251"); // ANSI
- // charset 1 is Default
- // charset 2 is Symbol
-
- FONTSET_MAP.put(77, "MacRoman"); // Mac Roman
- FONTSET_MAP.put(78, "Shift_JIS"); // Mac Shift Jis
- FONTSET_MAP.put(79, "ms949"); // Mac Hangul
- FONTSET_MAP.put(80, "GB2312"); // Mac GB2312
- FONTSET_MAP.put(81, "Big5"); // Mac Big5
- FONTSET_MAP.put(82, "johab"); // Mac Johab (old)
- FONTSET_MAP.put(83, "MacHebrew"); // Mac Hebrew
- FONTSET_MAP.put(84, "MacArabic"); // Mac Arabic
- FONTSET_MAP.put(85, "MacGreek"); // Mac Greek
- FONTSET_MAP.put(86, "MacTurkish"); // Mac Turkish
- FONTSET_MAP.put(87, "MacThai"); // Mac Thai
- FONTSET_MAP.put(88, "cp1250"); // Mac East Europe
- FONTSET_MAP.put(89, "cp1251"); // Mac Russian
-
- FONTSET_MAP.put(128, "MS932"); // Shift JIS
- FONTSET_MAP.put(129, "ms949"); // Hangul
- FONTSET_MAP.put(130, "ms1361"); // Johab
- FONTSET_MAP.put(134, "ms936"); // GB2312
- FONTSET_MAP.put(136, "ms950"); // Big5
- FONTSET_MAP.put(161, "cp1253"); // Greek
- FONTSET_MAP.put(162, "cp1254"); // Turkish
- FONTSET_MAP.put(163, "cp1258"); // Vietnamese
- FONTSET_MAP.put(177, "cp1255"); // Hebrew
- FONTSET_MAP.put(178, "cp1256"); // Arabic
- // FONTSET_MAP.put( 179, "" ); // Arabic Traditional
- // FONTSET_MAP.put( 180, "" ); // Arabic user
- // FONTSET_MAP.put( 181, "" ); // Hebrew user
- FONTSET_MAP.put(186, "cp1257"); // Baltic
-
- FONTSET_MAP.put(204, "cp1251"); // Russian
- FONTSET_MAP.put(222, "ms874"); // Thai
- FONTSET_MAP.put(238, "cp1250"); // Eastern European
- FONTSET_MAP.put(254, "cp437"); // PC 437
- FONTSET_MAP.put(255, "cp850"); // OEM
- }
-
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
@@ -120,294 +46,15 @@ public class RTFParser extends AbstractP
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
+ throws IOException, SAXException, TikaException {
TaggedInputStream tagged = new TaggedInputStream(stream);
- TemporaryResources tmp = new TemporaryResources();
try {
- File tempFile = tmp.createTemporaryFile();
- createUnicodeRtfTempFile(tempFile, stream);
-
- InputStream in = TikaInputStream.get(tempFile);
- try {
- Document sd = new CustomStyledDocument();
- new RTFEditorKit().read(in, sd, 0);
-
- XHTMLContentHandler xhtml =
- new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
- xhtml.element("p", sd.getText(0, sd.getLength()));
- xhtml.endDocument();
-
- // TODO Extract some of the metadata
- metadata.add(Metadata.CONTENT_TYPE, "application/rtf");
- } finally {
- in.close();
- }
+ final TextExtractor ert = new TextExtractor(new XHTMLContentHandler(handler, metadata), metadata);
+ ert.extract(stream);
+ metadata.add(Metadata.CONTENT_TYPE, "application/rtf");
} catch (IOException e) {
tagged.throwIfCauseOf(e);
throw new TikaException("Error parsing an RTF document", e);
- } catch (BadLocationException e) {
- throw new TikaException("Error parsing an RTF document", e);
- } catch (NullPointerException e) {
- // TIKA-621: RTF parsing fails with Java 7 early access
- // on 64bit platforms
- throw new TikaException("Error parsing an RTF document", e);
- } finally {
- tmp.dispose();
- }
- }
-
- private String escapeByUnicode(String data, String enc) {
- StringBuilder dataBuf = new StringBuilder(data.length() + 16);
- StringBuilder keywordBuf = new StringBuilder(4);
- StringBuilder origDataBuf = new StringBuilder();
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- for (int i = 0; i < data.length(); i++) {
- char c1 = data.charAt(i);
- keywordBuf.append(c1);
- if (c1 == '\\' && data.length()>i+1) {
- i++;
- char c2 = data.charAt(i);
- keywordBuf.append(c2);
- if (c2 == '\'') {
- i++;
- char c3 = data.charAt(i);
- keywordBuf.append(c3);
- if ((c3 >= '0' && c3 <= '9') || (c3 >= 'a' && c3 <= 'f')
- || (c3 >= 'A' && c3 <= 'F')) {
- i++;
- char c4 = data.charAt(i);
- keywordBuf.append(c4);
- if ((c4 >= '0' && c4 <= '9')
- || (c4 >= 'a' && c4 <= 'f')
- || (c4 >= 'A' && c4 <= 'F')) {
- int value = Integer.parseInt(
- String.valueOf(new char[] { c3, c4 }), 16);
- baos.write(value);
- origDataBuf.append(keywordBuf.toString());
- keywordBuf.delete(0, 4);
- continue;
- }
- }
- }
- }
- if (baos.size() != 0) {
- try {
- appendUnicodeStr(dataBuf, new String(baos.toByteArray(),
- enc));
- } catch (UnsupportedEncodingException e) {
- dataBuf.append(origDataBuf.toString());
- }
- origDataBuf.delete(0, origDataBuf.length());
- baos.reset();
- }
- dataBuf.append(keywordBuf.toString());
- keywordBuf.delete(0, 4);
- }
-
- if (baos.size() != 0) {
- try {
- appendUnicodeStr(dataBuf, new String(baos.toByteArray(), enc));
- } catch (UnsupportedEncodingException e) {
- dataBuf.append(origDataBuf.toString());
- }
- }
-
- return dataBuf.toString();
- }
-
- private void appendUnicodeStr(StringBuilder dataBuf, String value) {
- for (int j = 0; j < value.length(); j++) {
- char ch = value.charAt(j);
- if (ch >= 20 && ch < 80) {
- dataBuf.append(ch);
- } else {
- dataBuf.append("{\\u");
- dataBuf.append((int) ch);
- dataBuf.append('}');
- }
- }
- }
-
- private void createUnicodeRtfTempFile(File tempFile, InputStream in)
- throws IOException {
- OutputStream out =
- new BufferedOutputStream(new FileOutputStream(tempFile));
- try {
- String defaultCharset = "windows-1251"; // ansi
- String defaultFont = "0";
- Map<String, String> fontTableMap = new HashMap<String, String>();
- StringBuilder dataBuf = new StringBuilder(255);
- int ch;
- LinkedList<String> charsetQueue = new LinkedList<String>();
- int depth = 0;
- String prevFt = null;
- int prevCh = -1;
- while ((ch = in.read()) != -1) {
- if ( ((ch == '{' || ch == '}') && prevCh!='\\') || ( ch == ' ' && (! FONT_FAMILY_PATTERN.matcher(dataBuf.toString()).find())) ) {
- if (charsetQueue.size() > depth + 1) {
- charsetQueue.removeLast();
- }
-
- String data = dataBuf.toString();
- data = data.replace("\\cell","\\u0020\\cell");
-
- if(data.indexOf("\\colortbl")!=-1){
- // End of font table, clear last/previous font encountered.
- prevFt = null;
- }
-
- if (depth == 1) {
- // check control words for a default charset
- String cset = loadAnsiCpg(data);
- if (cset != null) {
- defaultCharset = cset;
- }
- Matcher matcher = DEFAULT_FONT_PATTERN.matcher(data);
- if(matcher.find()){
- defaultFont = matcher.group(1);
- }
- }
-
- String ft = loadFontTable(data);
- String charset = loadCharset(data);
- if (ft != null && charset != null) {
- fontTableMap.put(ft, charset);
- }
-
- if (ft == null && prevCh == ' ') {
- ft = prevFt;
- } else if (ft != null) {
- prevFt = ft;
- }
- if(ft==null){
- ft = defaultFont;
- }
-
- // set a current charset
- if (charset == null && ft != null) {
- charset = fontTableMap.get(ft);
- }
- if (charset == null && charsetQueue.size() > 0) {
- charset = charsetQueue.getLast();
- }
- if (charset == null) {
- charset = defaultCharset;
- }
-
- // add the current charset to a queue
- if (charsetQueue.size() < depth + 1) {
- charsetQueue.add(charset);
- }
-
- String escapedStr = "windows-1251".equals(charset) ? data
- : escapeByUnicode(data, charset);
- out.write(escapedStr.getBytes("UTF-8"));
- out.write(ch);
- dataBuf.delete(0, dataBuf.length());
-
- prevCh = ch;
-
- // update a depth
- if (ch == '{') {
- depth++;
- } else if (ch == '}') {
- depth--;
- }
- } else {
- dataBuf.append((char) ch);
- }
- }
- } finally {
- out.close();
- }
- }
-
- private String loadFontTable(String line) {
- Matcher m = F_PATTERN.matcher(line);
- String font = null;
- while((m.find())) {
- font = m.group(1);
- }
- return font;
- }
-
- private String loadAnsiCpg(String line) {
- Matcher m = ANSICPG_PATTERN.matcher(line);
- String charset = null;
- if (m.find()) {
- int encVal;
- try {
- encVal = Integer.parseInt(m.group().substring(8));
- charset = FONTSET_MAP.get(encVal);
- } catch (NumberFormatException e) {
- // ignore
- }
- }
-
- return charset;
- }
-
- private String loadCharset(String line) {
- Matcher m = FCHARSET_PATTERN.matcher(line);
- String charset = null;
- if (m.find()) {
- int encVal;
- try {
- encVal = Integer.parseInt(m.group().substring(9));
- } catch (NumberFormatException e) {
- encVal = 0;
- }
- charset = FONTSET_MAP.get(encVal);
}
-
- return charset;
- }
-
- /**
- * Customized version of {@link DefaultStyledDocument}. Adds whitespace
- * to places where words otherwise could have run together (see
- * <a href="https://issues.apache.org/jira/browse/TIKA-392">TIKA-392</a>),
- * and works around the problem of Swing expecting a GUI environment (see
- * <a href="https://issues.apache.org/jira/browse/TIKA-282">TIKA-282</a>).
- */
- private static class CustomStyledDocument extends DefaultStyledDocument {
- private boolean isPrevUnicode = false;
-
- public CustomStyledDocument() {
- super(new NoReclaimStyleContext());
- }
-
- @Override
- public void insertString(int offs, String str, AttributeSet a)
- throws BadLocationException {
- boolean isUnicode = str.length() == 1 && str.charAt(0) > 127;
-
- if (offs > 0 && offs == getLength() && !isPrevUnicode && !isUnicode) {
- super.insertString(offs, " ", a);
- super.insertString(getLength(), str, a);
- } else {
- super.insertString(offs, str, a);
- }
-
- isPrevUnicode = isUnicode;
- }
-
- }
-
- /**
- * A workaround to
- * <a href="https://issues.apache.org/jira/browse/TIKA-282">TIKA-282</a>:
- * RTF parser expects a GUI environment. This class simply disables the
- * troublesome SwingUtilities.isEventDispatchThread() call that's made in
- * the {@link StyleContext#reclaim(AttributeSet)} method.
- */
- private static class NoReclaimStyleContext extends StyleContext {
-
- /** Ignored. */
- public void reclaim(AttributeSet a) {
- }
-
}
-
}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java?rev=1171171&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java Thu Sep 15 16:42:06 2011
@@ -0,0 +1,1027 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.rtf;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.CodingErrorAction;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Map;
+
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.CharsetUtils;
+import org.xml.sax.SAXException;
+
+/* Tokenizes and performs a "shallow" parse of the RTF
+ * document, just enough to properly decode the text.
+ *
+ * TODO: we should cutover to a "real" tokenizer (eg JFlex);
+ * it should give better perf, by replacing the excessive
+ * "else if" string compares with FSA traversal. */
+
+final class TextExtractor {
+
+ // Hold pending bytes (encoded in the current charset)
+ // for text output:
+ private byte[] pendingBytes = new byte[16];
+ private int pendingByteCount;
+ private ByteBuffer pendingByteBuffer = ByteBuffer.wrap(pendingBytes);
+
+ // Holds pending chars for text output
+ private char[] pendingChars = new char[10];
+ private int pendingCharCount;
+
+ // Holds chars for a still-being-tokenized control word
+ private byte[] pendingControl = new byte[10];
+ private int pendingControlCount;
+
+ // Used when we decode bytes -> chars using CharsetDecoder:
+ private final char[] outputArray = new char[128];
+ private final CharBuffer outputBuffer = CharBuffer.wrap(outputArray);
+
+ // Reused when possible:
+ private CharsetDecoder decoder;
+ private String lastCharset;
+
+ private String globalCharset = "windows-1252";
+ private int globalDefaultFont = -1;
+ private int curFontID = -1;
+
+ // Holds the font table from this RTF doc, mapping
+ // the font number (from \fN control word) to the
+ // corresponding charset:
+ private final Map<Integer,String> fontToCharset = new HashMap<Integer,String>();
+
+ // Group stack: when we open a new group, we push
+ // the previous group state onto the stack; when we
+ // close the group, we restore it
+ private final LinkedList<GroupState> groupStates = new LinkedList<GroupState>();
+
+ // Current group state; in theory this initial
+ // GroupState is unused because the RTF doc should
+ // immediately open the top group (start with {):
+ private GroupState groupState = new GroupState();
+
+ private boolean inHeader = true;
+ private int fontTableState;
+ private int fontTableDepth;
+
+ // Non null if we are processing metadata (title,
+ // keywords, etc.) inside the info group:
+ private String nextMetaData;
+ private boolean inParagraph;
+
+ private final StringBuilder headerBuffer = new StringBuilder();
+
+ // Used to process the sub-groups inside the upr
+ // group:
+ private int uprState = -1;
+
+ private final XHTMLContentHandler out;
+ private final Metadata metadata;
+
+ // How many next ansi chars we should skip; this
+ // is 0 except when we are still in the "ansi
+ // shadow" after seeing a unicode escape, at which
+ // point it's set to the last ucN skip we had seen:
+ int ansiSkip = 0;
+
+ // The RTF doc has a "font table" that assigns ords
+ // (f0, f1, f2, etc.) to fonts and charsets, using the
+ // \fcharsetN control word. This mapping maps from the
+ // N to corresponding Java charset:
+ private static final Map<Integer, String> FCHARSET_MAP = new HashMap<Integer, String>();
+ static {
+ FCHARSET_MAP.put(0, "windows-1252"); // ANSI
+ // charset 1 is Default
+ // charset 2 is Symbol
+
+ FCHARSET_MAP.put(77, "MacRoman"); // Mac Roman
+ FCHARSET_MAP.put(78, "Shift_JIS"); // Mac Shift Jis
+ FCHARSET_MAP.put(79, "ms949"); // Mac Hangul
+ FCHARSET_MAP.put(80, "GB2312"); // Mac GB2312
+ FCHARSET_MAP.put(81, "Big5"); // Mac Big5
+ FCHARSET_MAP.put(82, "johab"); // Mac Johab (old)
+ FCHARSET_MAP.put(83, "MacHebrew"); // Mac Hebrew
+ FCHARSET_MAP.put(84, "MacArabic"); // Mac Arabic
+ FCHARSET_MAP.put(85, "MacGreek"); // Mac Greek
+ FCHARSET_MAP.put(86, "MacTurkish"); // Mac Turkish
+ FCHARSET_MAP.put(87, "MacThai"); // Mac Thai
+ FCHARSET_MAP.put(88, "cp1250"); // Mac East Europe
+ FCHARSET_MAP.put(89, "cp1251"); // Mac Russian
+
+ FCHARSET_MAP.put(128, "MS932"); // Shift JIS
+ FCHARSET_MAP.put(129, "ms949"); // Hangul
+ FCHARSET_MAP.put(130, "ms1361"); // Johab
+ FCHARSET_MAP.put(134, "ms936"); // GB2312
+ FCHARSET_MAP.put(136, "ms950"); // Big5
+ FCHARSET_MAP.put(161, "cp1253"); // Greek
+ FCHARSET_MAP.put(162, "cp1254"); // Turkish
+ FCHARSET_MAP.put(163, "cp1258"); // Vietnamese
+ FCHARSET_MAP.put(177, "cp1255"); // Hebrew
+ FCHARSET_MAP.put(178, "cp1256"); // Arabic
+ // FCHARSET_MAP.put( 179, "" ); // Arabic Traditional
+ // FCHARSET_MAP.put( 180, "" ); // Arabic user
+ // FCHARSET_MAP.put( 181, "" ); // Hebrew user
+ FCHARSET_MAP.put(186, "cp1257"); // Baltic
+
+ FCHARSET_MAP.put(204, "cp1251"); // Russian
+ FCHARSET_MAP.put(222, "ms874"); // Thai
+ FCHARSET_MAP.put(238, "cp1250"); // Eastern European
+ FCHARSET_MAP.put(254, "cp437"); // PC 437
+ FCHARSET_MAP.put(255, "cp850"); // OEM
+ }
+
+ // The RTF may specify the \ansicpgN charset in the
+ // header; this maps the N to the corresponding Java
+ // character set:
+
+ private static final Map<Integer, String> ANSICPG_MAP = new HashMap<Integer, String>();
+ static {
+ ANSICPG_MAP.put(437, "CP437"); // US IBM
+ ANSICPG_MAP.put(708, "ISO-8859-6"); // Arabic (ASMO 708)
+
+ ANSICPG_MAP.put(709, "windows-709"); // Arabic (ASMO 449+, BCON V4)
+ ANSICPG_MAP.put(710, "windows-710"); // Arabic (transparent Arabic)
+ ANSICPG_MAP.put(710, "windows-711"); // Arabic (Nafitha Enhanced)
+ ANSICPG_MAP.put(710, "windows-720"); // Arabic (transparent ASMO)
+ ANSICPG_MAP.put(819, "CP819"); // Windows 3.1 (US & Western Europe)
+ ANSICPG_MAP.put(819, "CP819"); // Windows 3.1 (US & Western Europe)
+
+ ANSICPG_MAP.put(819, "CP819"); // Windows 3.1 (US & Western Europe)
+ ANSICPG_MAP.put(850, "CP850"); // IBM Multilingual
+ ANSICPG_MAP.put(852, "CP852"); // Eastern European
+ ANSICPG_MAP.put(860, "CP860"); // Portuguese
+ ANSICPG_MAP.put(862, "CP862"); // Hebrew
+ ANSICPG_MAP.put(863, "CP863"); // French Canadian
+ ANSICPG_MAP.put(864, "CP864"); // Arabic
+ ANSICPG_MAP.put(865, "CP865"); // Norwegian
+ ANSICPG_MAP.put(866, "CP866"); // Soviet Union
+ ANSICPG_MAP.put(874, "MS874"); // Thai
+ ANSICPG_MAP.put(932, "MS932"); // Japanese
+ ANSICPG_MAP.put(936, "MS936"); // Simplified Chinese
+ ANSICPG_MAP.put(949, "CP949"); // Korean
+ ANSICPG_MAP.put(950, "CP950"); // Traditional Chinese
+ ANSICPG_MAP.put(1250, "CP1250"); // Eastern European
+ ANSICPG_MAP.put(1251, "CP1251"); // Cyrillic
+ ANSICPG_MAP.put(1252, "CP1252"); // Western European
+ ANSICPG_MAP.put(1253, "CP1253"); // Greek
+ ANSICPG_MAP.put(1254, "CP1254"); // Turkish
+ ANSICPG_MAP.put(1255, "CP1255"); // Hebrew
+ ANSICPG_MAP.put(1256, "CP1256"); // Arabic
+ ANSICPG_MAP.put(1257, "CP1257"); // Baltic
+ ANSICPG_MAP.put(1258, "CP1258"); // Vietnamese
+ ANSICPG_MAP.put(1361, "x-Johab"); // Johab
+ ANSICPG_MAP.put(10000, "MacRoman"); // Mac Roman
+ ANSICPG_MAP.put(10001, "Shift_JIS"); // Mac Japan
+ ANSICPG_MAP.put(10004, "MacArabic"); // Mac Arabic
+ ANSICPG_MAP.put(10005, "MacHebrew"); // Mac Hebrew
+ ANSICPG_MAP.put(10006, "MacGreek"); // Mac Greek
+ ANSICPG_MAP.put(10007, "MacCyrillic"); // Mac Cyrillic
+ ANSICPG_MAP.put(10029, "x-MacCentralEurope"); // MAC Latin2
+ ANSICPG_MAP.put(10081, "MacTurkish"); // Mac Turkish
+ ANSICPG_MAP.put(57002, "x-ISCII91"); // Devanagari
+
+ // TODO: in theory these other charsets are simple
+ // shifts off of Devanagari, so we could impl that
+ // here:
+ ANSICPG_MAP.put(57003, "windows-57003"); // Bengali
+ ANSICPG_MAP.put(57004, "windows-57004"); // Tamil
+ ANSICPG_MAP.put(57005, "windows-57005"); // Telugu
+ ANSICPG_MAP.put(57006, "windows-57006"); // Assamese
+ ANSICPG_MAP.put(57007, "windows-57007"); // Oriya
+ ANSICPG_MAP.put(57008, "windows-57008"); // Kannada
+ ANSICPG_MAP.put(57009, "windows-57009"); // Malayalam
+ ANSICPG_MAP.put(57010, "windows-57010"); // Gujarati
+ ANSICPG_MAP.put(57011, "windows-57011"); // Punjabi
+ }
+
+ // Creates an extractor that writes XHTML events to out and
+ // document info (author, title, ...) fields to metadata.
+ public TextExtractor(XHTMLContentHandler out, Metadata metadata) {
+ this.metadata = metadata;
+ this.out = out;
+ }
+
+ // True if ch is an ASCII hex digit: 0-9, a-f or A-F.
+ private static boolean isHexChar(char ch) {
+ return (ch >= '0' && ch <= '9') ||
+ (ch >= 'a' && ch <= 'f') ||
+ (ch >= 'A' && ch <= 'F');
+ }
+
+ // True if ch is an ASCII letter (a-z or A-Z); RTF control
+ // word names consist only of such letters.
+ private static boolean isAlpha(char ch) {
+ return (ch >= 'a' && ch <= 'z') ||
+ (ch >= 'A' && ch <= 'Z');
+ }
+
+ // True if ch is an ASCII decimal digit (0-9).
+ private static boolean isDigit(char ch) {
+ return ch >= '0' && ch <= '9';
+ }
+
+ // Returns the numeric value (0-15) of an ASCII hex digit.
+ // Callers must only pass chars for which isHexChar is true;
+ // the ranges here are restricted to a-f/A-F (the original
+ // tested a-z/A-Z) so that a non-hex letter can never be
+ // silently mapped to a value >= 16.
+ private static int hexValue(char ch) {
+ if (ch >= '0' && ch <= '9') {
+ return ch - '0';
+ } else if (ch >= 'a' && ch <= 'f') {
+ return 10 + (ch - 'a');
+ } else {
+ assert ch >= 'A' && ch <= 'F';
+ return 10 + (ch - 'A');
+ }
+ }
+
+ // Push pending bytes or pending chars. At most one of the
+ // two buffers is ever non-empty (addOutputByte/addOutputChar
+ // each flush the other buffer first), hence the assert:
+ private void pushText() throws IOException, SAXException, TikaException {
+ if (pendingByteCount != 0) {
+ assert pendingCharCount == 0;
+ pushBytes();
+ } else {
+ pushChars();
+ }
+ }
+
+ // Buffers the byte (unit in the current charset) for
+ // output. Flushes any pending UTF16 chars first so output
+ // order is preserved.
+ private void addOutputByte(byte b) throws IOException, SAXException, TikaException {
+
+ if (pendingCharCount != 0) {
+ pushChars();
+ }
+
+ // Save the byte in pending buffer:
+ if (pendingByteCount == pendingBytes.length) {
+ // Gradual but exponential growth; Math.max guarantees
+ // at least one new slot even when 1.25x truncates to
+ // the same length (possible for very small arrays,
+ // which would otherwise overflow below):
+ final byte[] newArray = new byte[Math.max(pendingBytes.length+1, (int) (pendingBytes.length*1.25))];
+ System.arraycopy(pendingBytes, 0, newArray, 0, pendingBytes.length);
+ pendingBytes = newArray;
+ // Keep the ByteBuffer view in sync with the new array:
+ pendingByteBuffer = ByteBuffer.wrap(pendingBytes);
+ }
+ pendingBytes[pendingByteCount++] = b;
+ }
+
+ // Buffers a byte as part of a control word (name chars
+ // only; parameter digits are accumulated separately):
+ private void addControl(byte b) {
+ assert isAlpha((char) b);
+ // Save the byte in pending buffer:
+ if (pendingControlCount == pendingControl.length) {
+ // Gradual but exponential growth; Math.max guarantees
+ // at least one new slot even when 1.25x truncates to
+ // the same length (possible for very small arrays):
+ final byte[] newArray = new byte[Math.max(pendingControl.length+1, (int) (pendingControl.length*1.25))];
+ System.arraycopy(pendingControl, 0, newArray, 0, pendingControl.length);
+ pendingControl = newArray;
+ }
+ pendingControl[pendingControlCount++] = b;
+ }
+
+ // Buffers a UTF16 code unit for output. Flushes any pending
+ // charset bytes first so output order is preserved. While in
+ // the header, chars go to headerBuffer (for metadata fields)
+ // instead of the pending output buffer.
+ private void addOutputChar(char ch) throws IOException, SAXException, TikaException {
+ if (pendingByteCount != 0) {
+ pushBytes();
+ }
+
+ if (inHeader) {
+ headerBuffer.append(ch);
+ } else {
+ if (pendingCharCount == pendingChars.length) {
+ // Gradual but exponential growth; Math.max guarantees
+ // at least one new slot even when 1.25x truncates to
+ // the same length (possible for very small arrays):
+ final char[] newArray = new char[Math.max(pendingChars.length+1, (int) (pendingChars.length*1.25))];
+ System.arraycopy(pendingChars, 0, newArray, 0, pendingChars.length);
+ pendingChars = newArray;
+ }
+ pendingChars[pendingCharCount++] = ch;
+ }
+ }
+
+ // Shallow parses the entire doc, writing output to
+ // this.out and this.metadata.
+ //
+ // Hand-rolled finite state machine over the raw byte
+ // stream. States: 0 = normal text; 1 = saw '\'; 2 = saw
+ // \'; 3 = saw \'x (first hex digit consumed); 4 = inside a
+ // control word name; 5 = inside a control word's numeric
+ // parameter.
+ public void extract(InputStream in) throws IOException, SAXException, TikaException {
+ out.startDocument();
+
+ int state = 0;
+ // One byte of pushback; -2 means "empty" (EOF is -1, so
+ // -1 cannot serve as the sentinel):
+ int pushBack = -2;
+ boolean negParam = false;
+ char hex1 = 0;
+ long param = 0;
+
+ while (true) {
+ final int b;
+ if (pushBack != -2) {
+ b = pushBack;
+ pushBack = -2;
+ } else {
+ b = in.read();
+ }
+ if (b == -1) {
+ break;
+ }
+
+ // NOTE: this is always a 8bit clean byte (ie
+ // < 128), but we use a char for
+ // convenience in the testing below:
+ final char ch = (char) b;
+
+ switch (state) {
+
+ case 0:
+ if (ch == '\\') {
+ state = 1;
+ } else if (ch == '{') {
+ pushText();
+ processGroupStart();
+ } else if (ch == '}') {
+ pushText();
+ processGroupEnd();
+ } else if (ch != '\r' && ch != '\n' && (!groupState.ignore || nextMetaData != null)) {
+ // Linefeed and carriage return are not
+ // significant
+ if (ansiSkip != 0) {
+ // Still inside the "unicode shadow" of a
+ // \uN escape: drop this ANSI fallback char
+ ansiSkip--;
+ } else {
+ addOutputByte((byte) ch);
+ }
+ }
+ break;
+
+ // saw \
+ case 1:
+ if (ch == '\'') {
+ // escaped hex char
+ state = 2;
+ } else if (isAlpha(ch)) {
+ // control word
+ //pushText();
+ addControl((byte) ch);
+ state = 4;
+ } else if (ch == '{' || ch == '}' || ch == '\\' || ch == '\r' || ch == '\n') {
+ // escaped char
+ addOutputByte((byte) ch);
+ state = 0;
+ } else {
+ // control symbol, eg \* or \~
+ //pushText();
+ processControlSymbol(ch);
+ state = 0;
+ }
+ break;
+
+ // saw \'
+ case 2:
+ if (isHexChar(ch)) {
+ hex1 = ch;
+ state = 3;
+ } else {
+ // DOC ERROR (malformed hex escape): ignore
+ state = 0;
+ }
+ break;
+
+ // saw \'x
+ case 3:
+ if (isHexChar(ch)) {
+ if (ansiSkip != 0) {
+ // Skip this ansi char since we are
+ // still in the shadow of a unicode
+ // escape:
+ ansiSkip--;
+ } else {
+ // Unescape:
+ addOutputByte((byte) (16*hexValue(hex1) + hexValue(ch)));
+ }
+ state = 0;
+ } else {
+ // TODO: log a warning here, somehow?
+ // DOC ERROR (malformed hex escape):
+ // ignore
+ state = 0;
+ }
+ break;
+
+ // inside control word
+ case 4:
+ if (isAlpha(ch)) {
+ // still in control word
+ addControl((byte) ch);
+ } else if (ch == '-') {
+ // end of control word, start of negative parameter
+ negParam = true;
+ param = 0;
+ state = 5;
+ } else if (isDigit(ch)) {
+ // end of control word, start of positive parameter
+ negParam = false;
+ param = (long) (ch - '0');
+ state = 5;
+ } else if (ch == ' ') {
+ // space is consumed as part of the
+ // control word, but is not added to the
+ // control word
+ processControlWord();
+ pendingControlCount = 0;
+ state = 0;
+ } else {
+ processControlWord();
+ pendingControlCount = 0;
+ // eps transition back to start state
+ pushBack = ch;
+ state = 0;
+ }
+ break;
+
+ // inside control word's numeric param
+ case 5:
+ if (isDigit(ch)) {
+ param = (10*param) + (long) (ch - '0');
+ } else {
+ if (negParam) {
+ param = -param;
+ }
+ processControlWord(param);
+ pendingControlCount = 0;
+ if (ch != ' ') {
+ // space is consumed as part of the
+ // control word
+ pushBack = ch;
+ }
+ state = 0;
+ }
+ break;
+
+ default:
+ throw new RuntimeException("invalid state");
+ }
+ }
+
+ // Flush any trailing text and close the last paragraph:
+ endParagraph(false);
+ out.endDocument();
+ }
+
+ // Opens a <p> element lazily, only once real content is
+ // about to be emitted. Any open <b>/<i> are closed before
+ // the <p> and reopened inside it, so nesting is always the
+ // canonical <p><b><i>.
+ private void lazyStartParagraph() throws IOException, SAXException, TikaException {
+ if (!inParagraph) {
+ // Ensure </i></b> order
+ if (groupState.italic) {
+ end("i");
+ }
+ if (groupState.bold) {
+ end("b");
+ }
+ out.startElement("p");
+ // Ensure <b><i> order
+ if (groupState.bold) {
+ start("b");
+ }
+ if (groupState.italic) {
+ start("i");
+ }
+ inParagraph = true;
+ }
+ }
+
+ // Flushes pending text and closes the current paragraph, if
+ // any. When preserveStyles is true (e.g. on \par), an active
+ // bold/italic style carries over: a new <p> is opened
+ // immediately with the styles re-applied. When false (end of
+ // document), styles are dropped.
+ private void endParagraph(boolean preserveStyles) throws IOException, SAXException, TikaException {
+ pushText();
+ if (inParagraph) {
+ if (groupState.italic) {
+ end("i");
+ groupState.italic = preserveStyles;
+ }
+ if (groupState.bold) {
+ end("b");
+ groupState.bold = preserveStyles;
+ }
+ out.endElement("p");
+ if (preserveStyles && (groupState.bold || groupState.italic)) {
+ start("p");
+ if (groupState.bold) {
+ start("b");
+ }
+ if (groupState.italic) {
+ start("i");
+ }
+ inParagraph = true;
+ } else {
+ inParagraph = false;
+ }
+ }
+ }
+
+ // Push pending UTF16 units to out ContentHandler, opening
+ // the enclosing paragraph first if needed.
+ private void pushChars() throws IOException, SAXException, TikaException {
+ if (pendingCharCount != 0) {
+ lazyStartParagraph();
+ out.characters(pendingChars, 0, pendingCharCount);
+ pendingCharCount = 0;
+ }
+ }
+
+ // Decodes the buffered bytes in pendingBytes
+ // into UTF16 code units, and sends the characters
+ // to the out ContentHandler, if we are in the body,
+ // else appends the characters to the headerBuffer.
+ // Bytes are dropped (count reset, nothing emitted) while in
+ // an ignored group with no pending metadata destination.
+ private void pushBytes() throws IOException, SAXException, TikaException {
+ if (pendingByteCount > 0 && (!groupState.ignore || nextMetaData != null)) {
+
+ final CharsetDecoder decoder = getDecoder();
+ pendingByteBuffer.limit(pendingByteCount);
+ assert pendingByteBuffer.position() == 0;
+ assert outputBuffer.position() == 0;
+
+ while (true) {
+ // We pass true for endOfInput because, when
+ // we are called, we should have seen a
+ // complete sequence of characters for this
+ // charset:
+ final CoderResult result = decoder.decode(pendingByteBuffer, outputBuffer, true);
+
+ final int pos = outputBuffer.position();
+ if (pos > 0) {
+ if (inHeader) {
+ headerBuffer.append(outputArray, 0, pos);
+ } else {
+ lazyStartParagraph();
+ out.characters(outputArray, 0, pos);
+ }
+ outputBuffer.position(0);
+ }
+
+ if (result == CoderResult.UNDERFLOW) {
+ break;
+ }
+ }
+
+ // Drain any state still buffered inside the decoder
+ // (same emit logic as above, but for flush()):
+ while (true) {
+ final CoderResult result = decoder.flush(outputBuffer);
+
+ final int pos = outputBuffer.position();
+ if (pos > 0) {
+ if (inHeader) {
+ headerBuffer.append(outputArray, 0, pos);
+ } else {
+ lazyStartParagraph();
+ out.characters(outputArray, 0, pos);
+ }
+ outputBuffer.position(0);
+ }
+
+ if (result == CoderResult.UNDERFLOW) {
+ break;
+ }
+ }
+
+ // Reset for next decode
+ decoder.reset();
+ pendingByteBuffer.position(0);
+ }
+
+ pendingByteCount = 0;
+ }
+
+ // NOTE: s must be ascii alpha only.
+ // Compares the pending control word bytes to s.
+ // CAUTION: this OVERLOADS (does not override)
+ // Object.equals(Object); it is only meant for these
+ // internal control-word comparisons.
+ private boolean equals(String s) {
+ if (pendingControlCount != s.length()) {
+ return false;
+ }
+ for(int idx=0;idx<pendingControlCount;idx++) {
+ assert isAlpha(s.charAt(idx));
+ if (((byte) s.charAt(idx)) != pendingControl[idx]) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ // Handles a one-char control symbol (backslash followed by a
+ // non-alpha char, eg \~ or \*). Unknown symbols are silently
+ // ignored (default branch).
+ private void processControlSymbol(char ch) throws IOException, SAXException, TikaException {
+ switch(ch) {
+ case '~':
+ // Non-breaking space -> unicode NON-BREAKING SPACE
+ addOutputChar('\u00a0');
+ break;
+ case '*':
+ // Ignorable destination (control words defined
+ // after the 1987 RTF spec). Note that
+ // sometimes we un-ignore within this group, eg
+ // when handling upr escape.
+ groupState.ignore = true;
+ break;
+ case '-':
+ // Optional hyphen -> unicode SOFT HYPHEN
+ addOutputChar('\u00ad');
+ break;
+ case '_':
+ // Non-breaking hyphen -> unicode NON-BREAKING HYPHEN
+ addOutputChar('\u2011');
+ break;
+ default:
+ break;
+ }
+ }
+
+ // Returns a CharsetDecoder for the current charset. In the
+ // common case the charset is the same as last time and the
+ // cached decoder is reused; otherwise a new decoder is built
+ // (with REPLACE error handling, so bad bytes become U+FFFD
+ // rather than aborting the parse).
+ private CharsetDecoder getDecoder() throws TikaException {
+ final String charset = getCharset();
+
+ // Common case: charset is same as last time, so
+ // just reuse it:
+ if (lastCharset == null || !charset.equals(lastCharset)) {
+ decoder = CharsetUtils.forName(charset).newDecoder();
+ if (decoder == null) {
+ throw new TikaException("cannot find decoder for charset=" + charset);
+ }
+ decoder.onMalformedInput(CodingErrorAction.REPLACE);
+ decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
+ lastCharset = charset;
+ }
+
+ return decoder;
+ }
+
+ // Return current charset in-use, in priority order: the
+ // current group's font charset (from \fN), else the default
+ // font's charset (from \deffN, body only), else the global
+ // charset (from \ansi, \ansicpgN, etc.).
+ private String getCharset() throws TikaException {
+ // If a specific font (fN) was set, use its charset
+ if (groupState.fontCharset != null) {
+ return groupState.fontCharset;
+ }
+
+ // Else, if global default font (defN) was set, use
+ // that
+ if (globalDefaultFont != -1 && !inHeader) {
+ final String cs = fontToCharset.get(globalDefaultFont);
+ if (cs != null) {
+ return cs;
+ }
+ }
+
+ // Else, use the global charset
+ if (globalCharset == null) {
+ throw new TikaException("unable to determine charset");
+ }
+
+ return globalCharset;
+ }
+
+ // Handle control word that takes a parameter:
+ // Param is long because spec says max value is 1+ Integer.MAX_VALUE!
+ private void processControlWord(long param) throws IOException, SAXException, TikaException {
+
+ // TODO: afN? (associated font number)
+
+ // TODO: do these alter text output...?
+ /*
+ } else if (equals("stshfdbch")) {
+ // font to be used by default in
+ // style sheet for East Asian chars
+ // arg N is font table entry
+ } else if (equals("stshfloch")) {
+ // font to be used by default in
+ // style sheet for ASCII chars
+ // arg N is font table entry
+ } else if (equals("stshfhich")) {
+ // font to be used by default in
+ // style sheet for High Ansi chars
+ // arg N is font table entry
+ } else if (equals("stshfbi")) {
+ // style sheet for Complex Scripts (BIDI) chars
+ // arg N is font table entry
+ */
+
+ // TODO: inefficient that we check equals N times;
+ // we'd get better perf w/ real lexer (eg
+ // JFlex), which uses single-pass FSM to do cmp:
+ if (inHeader) {
+ if (equals("ansicpg")) {
+ // ANSI codepage
+ final String cs = ANSICPG_MAP.get((int) param);
+ if (cs != null) {
+ globalCharset = cs;
+ }
+ } else if (equals("deff")) {
+ // Default font
+ globalDefaultFont = (int) param;
+ }
+
+ if (fontTableState == 1) {
+ // Still inside font table -- record the
+ // mappings of fN to the fcharset:
+ if (groupState.depth < fontTableDepth) {
+ // We left the font table group:
+ fontTableState = 2;
+ } else {
+ if (equals("f")) {
+ // Start new font definition
+ curFontID = (int) param;
+ } else if (equals("fcharset")) {
+ final String cs = FCHARSET_MAP.get((int) param);
+ if (cs != null) {
+ fontToCharset.put(curFontID, cs);
+ }
+ }
+ }
+ }
+ } else {
+ // In document
+ if (equals("b")) {
+ // b0 (any param means bold off; spec uses 0)
+ assert param == 0;
+ if (groupState.bold) {
+ pushText();
+ // Close in </i></b> order, then reopen <i>,
+ // to keep canonical <b><i> nesting:
+ if (groupState.italic) {
+ end("i");
+ }
+ end("b");
+ if (groupState.italic) {
+ start("i");
+ }
+ groupState.bold = false;
+ }
+ } else if (equals("i")) {
+ // i0
+ assert param == 0;
+ if (groupState.italic) {
+ pushText();
+ end("i");
+ groupState.italic = false;
+ }
+ } else if (equals("f")) {
+ // Change current font
+ final String fontCharset = fontToCharset.get((int) param);
+ if (fontCharset != null) {
+ groupState.fontCharset = fontCharset;
+ } else {
+ // DOC ERROR: font change referenced a
+ // non-table'd font number
+ // TODO: log a warning? Throw an exc?
+ groupState.fontCharset = null;
+ }
+ }
+ }
+
+ // Process unicode escape. This can appear in doc
+ // or in header, since the metadata (info) fields
+ // in the header can be unicode escaped as well:
+ if (pendingControl[0] == 'u') {
+ if (pendingControlCount == 1) {
+ // Unicode escape (\uN): N is a signed 16-bit
+ // value, masked down to a single UTF16 code unit
+ if (!groupState.ignore) {
+ final char utf16CodeUnit = (char) (((int) param) & 0xffff);
+ addOutputChar(utf16CodeUnit);
+ }
+
+ // After seeing a unicode escape we must
+ // skip the next ucSkip ansi chars (the
+ // "unicode shadow")
+ ansiSkip = groupState.ucSkip;
+ } else if (pendingControlCount == 2 && pendingControl[1] == 'c') {
+ // \ucN: change unicode shadow length
+ groupState.ucSkip = (int) param;
+ }
+ }
+ }
+
+ // Emits an end-element event for tag.
+ private void end(String tag) throws IOException, SAXException, TikaException {
+ out.endElement(tag);
+ }
+
+ // Emits a start-element event for tag.
+ private void start(String tag) throws IOException, SAXException, TikaException {
+ out.startElement(tag);
+ }
+
+ // Handle non-parameter control word. While in the header
+ // this tracks charsets, the font table, metadata (info)
+ // destinations and the upr/ud unicode dance; in the body it
+ // handles styles (b/i), paragraph/section breaks, and the
+ // spec's special output characters.
+ private void processControlWord() throws IOException, SAXException, TikaException {
+ if (inHeader) {
+ if (equals("ansi")) {
+ globalCharset = "cp1252";
+ } else if (equals("pca")) {
+ globalCharset = "cp850";
+ } else if (equals("pc")) {
+ globalCharset = "cp437";
+ } else if (equals("mac")) {
+ globalCharset = "MacRoman";
+ }
+
+ if (equals("colortbl") || equals("stylesheet") || equals("fonttbl")) {
+ // These destinations contain no document text:
+ groupState.ignore = true;
+ }
+
+ if (uprState == -1) {
+ // TODO: we can also parse \creatim, \revtim,
+ // \printim, \version, \nofpages, \nofwords,
+ // \nofchars, etc.
+ if (equals("author")) {
+ nextMetaData = Metadata.AUTHOR;
+ } else if (equals("title")) {
+ nextMetaData = Metadata.TITLE;
+ } else if (equals("subject")) {
+ nextMetaData = Metadata.SUBJECT;
+ } else if (equals("keywords")) {
+ nextMetaData = Metadata.KEYWORDS;
+ } else if (equals("category")) {
+ nextMetaData = Metadata.CATEGORY;
+ } else if (equals("comment")) {
+ nextMetaData = Metadata.COMMENT;
+ } else if (equals("company")) {
+ nextMetaData = Metadata.COMPANY;
+ } else if (equals("manager")) {
+ nextMetaData = Metadata.MANAGER;
+ } else if (equals("template")) {
+ nextMetaData = Metadata.TEMPLATE;
+ }
+ }
+
+ if (fontTableState == 0) {
+ // Didn't see font table yet
+ if (equals("fonttbl")) {
+ fontTableState = 1;
+ fontTableDepth = groupState.depth;
+ }
+ } else if (fontTableState == 1) {
+ // Inside font table
+ if (groupState.depth < fontTableDepth) {
+ fontTableState = 2;
+ }
+ }
+
+ // Any of these marks the transition from header to body:
+ if (!groupState.ignore && (equals("par") || equals("pard") || equals("sect") || equals("sectd") || equals("plain") || equals("ltrch") || equals("rtlch"))) {
+ inHeader = false;
+ }
+ } else {
+ if (equals("b")) {
+ if (!groupState.bold) {
+ pushText();
+ lazyStartParagraph();
+ if (groupState.italic) {
+ // Make sure nesting is always <b><i>
+ end("i");
+ }
+ groupState.bold = true;
+ start("b");
+ if (groupState.italic) {
+ start("i");
+ }
+ }
+ } else if (equals("i")) {
+ if (!groupState.italic) {
+ pushText();
+ lazyStartParagraph();
+ groupState.italic = true;
+ start("i");
+ }
+ }
+ }
+
+ // The words below are processed whether in header or body:
+ if (equals("pard")) {
+ // Reset styles
+ pushText();
+ if (groupState.italic) {
+ end("i");
+ groupState.italic = false;
+ }
+ if (groupState.bold) {
+ end("b");
+ groupState.bold = false;
+ }
+ } else if (equals("par")) {
+ endParagraph(true);
+ } else if (equals("shptxt")) {
+ pushText();
+ // Text inside a shape
+ groupState.ignore = false;
+ } else if (equals("cell")) {
+ // TODO: we should produce a table output here?
+ //addOutputChar(' ');
+ endParagraph(true);
+ } else if (equals("pict")) {
+ pushText();
+ // TODO: create img tag? but can that support
+ // embedded image data?
+ groupState.ignore = true;
+ } else if (equals("line")) {
+ addOutputChar('\n');
+ } else if (equals("column")) {
+ addOutputChar(' ');
+ } else if (equals("page")) {
+ addOutputChar('\n');
+ } else if (equals("softline")) {
+ addOutputChar('\n');
+ } else if (equals("softcolumn")) {
+ addOutputChar(' ');
+ } else if (equals("softpage")) {
+ addOutputChar('\n');
+ } else if (equals("tab")) {
+ addOutputChar('\t');
+ } else if (equals("upr")) {
+ uprState = 0;
+ } else if (equals("ud") && uprState == 1) {
+ uprState = -1;
+ // 2nd group inside the upr destination, which
+ // contains the unicode encoding of the text, so
+ // we want to keep that:
+ groupState.ignore = false;
+ } else if (equals("bullet")) {
+ // unicode BULLET
+ addOutputChar('\u2022');
+ } else if (equals("endash")) {
+ // unicode EN DASH
+ addOutputChar('\u2013');
+ } else if (equals("emdash")) {
+ // unicode EM DASH
+ addOutputChar('\u2014');
+ } else if (equals("enspace")) {
+ // unicode EN SPACE
+ addOutputChar('\u2002');
+ } else if (equals("qmspace")) {
+ // quarter em space -> unicode FOUR-PER-EM SPACE
+ addOutputChar('\u2005');
+ } else if (equals("emspace")) {
+ // unicode EM SPACE
+ addOutputChar('\u2003');
+ } else if (equals("lquote")) {
+ // unicode LEFT SINGLE QUOTATION MARK
+ addOutputChar('\u2018');
+ } else if (equals("rquote")) {
+ // unicode RIGHT SINGLE QUOTATION MARK
+ addOutputChar('\u2019');
+ } else if (equals("ldblquote")) {
+ // unicode LEFT DOUBLE QUOTATION MARK
+ addOutputChar('\u201C');
+ } else if (equals("rdblquote")) {
+ // unicode RIGHT DOUBLE QUOTATION MARK
+ addOutputChar('\u201D');
+ }
+ }
+
+ // Push new GroupState on '{': the new state inherits from
+ // the current one; any pending unicode shadow is cancelled.
+ private void processGroupStart() throws IOException {
+ ansiSkip = 0;
+ // Push current groupState onto the stack
+ groupStates.add(groupState);
+
+ // Make new GroupState
+ groupState = new GroupState(groupState);
+ assert groupStates.size() == groupState.depth: "size=" + groupStates.size() + " depth=" + groupState.depth;
+
+ if (uprState == 0) {
+ // First group inside \upr holds the legacy (ANSI)
+ // text, which we must skip in favor of the \ud group:
+ uprState = 1;
+ groupState.ignore = true;
+ }
+ }
+
+ // Pop current GroupState on '}': commits any pending header
+ // metadata field, then restores the outer group's state,
+ // emitting the </i>/</b>/<b>/<i> events needed to realign
+ // the open tags with the outer group's bold/italic flags.
+ private void processGroupEnd() throws IOException, SAXException, TikaException {
+
+ if (inHeader) {
+ if (nextMetaData != null) {
+ metadata.add(nextMetaData, headerBuffer.toString());
+ nextMetaData = null;
+ }
+ headerBuffer.setLength(0);
+ }
+
+ assert groupState.depth > 0;
+ ansiSkip = 0;
+
+ // Restore group state:
+ final GroupState outerGroupState = groupStates.removeLast();
+
+ // Close italic, if outer does not have italic or
+ // bold changed:
+ if (groupState.italic) {
+ if (!outerGroupState.italic ||
+ groupState.bold != outerGroupState.bold) {
+ end("i");
+ groupState.italic = false;
+ }
+ }
+
+ // Close bold
+ if (groupState.bold && !outerGroupState.bold) {
+ end("b");
+ }
+
+ // Open bold
+ if (!groupState.bold && outerGroupState.bold) {
+ start("b");
+ }
+
+ // Open italic
+ if (!groupState.italic && outerGroupState.italic) {
+ start("i");
+ }
+ groupState = outerGroupState;
+ assert groupStates.size() == groupState.depth;
+ }
+}
Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java?rev=1171171&r1=1171170&r2=1171171&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java Thu Sep 15 16:42:06 2011
@@ -55,59 +55,6 @@ public class TestParsers extends TikaTes
assertEquals(s1, s2);
}
- public void testRTFExtraction() throws Exception {
- File file = getResourceAsFile("/test-documents/testRTF.rtf");
- String s1 = ParseUtils.getStringContent(file, tc);
- String s2 = ParseUtils.getStringContent(file, tc, "application/rtf");
- assertEquals(s1, s2);
- }
-
- public void testRTFms932Extraction() throws Exception {
- File file = getResourceAsFile("/test-documents/testRTF-ms932.rtf");
- String s1 = ParseUtils.getStringContent(file, tc);
- String s2 = ParseUtils.getStringContent(file, tc, "application/rtf");
- assertEquals(s1, s2);
- // Hello in Japanese
- assertTrue(s1.contains("\u3053\u3093\u306b\u3061\u306f"));
- }
-
- public void testRTFUmlautSpacesExtraction() throws Exception {
- File file = getResourceAsFile("/test-documents/testRTFUmlautSpaces.rtf");
- String s1 = ParseUtils.getStringContent(file, tc);
- String s2 = ParseUtils.getStringContent(file, tc, "application/rtf");
- assertEquals(s1, s2);
- assertTrue(s1.contains("\u00DCbersicht"));
- }
-
- public void testRTFWordPadCzechCharactersExtraction() throws Exception {
- File file = getResourceAsFile("/test-documents/testRTFWordPadCzechCharacters.rtf");
- String s1 = ParseUtils.getStringContent(file, tc);
- String s2 = ParseUtils.getStringContent(file, tc, "application/rtf");
- assertEquals(s1, s2);
- assertTrue(s1.contains("\u010Cl\u00E1nek t\u00FDdne"));
- assertTrue(s1.contains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty"));
- }
-
- public void testRTFWord2010CzechCharactersExtraction() throws Exception {
- File file = getResourceAsFile("/test-documents/testRTFWord2010CzechCharacters.rtf");
- String s1 = ParseUtils.getStringContent(file, tc);
- String s2 = ParseUtils.getStringContent(file, tc, "application/rtf");
- assertEquals(s1, s2);
- assertTrue(s1.contains("\u010Cl\u00E1nek t\u00FDdne"));
- assertTrue(s1.contains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty"));
- }
-
- public void testRTFTableCellSeparation() throws Exception {
- File file = getResourceAsFile("/test-documents/testRTFTableCellSeparation.rtf");
- String s1 = ParseUtils.getStringContent(file, tc);
- String s2 = ParseUtils.getStringContent(file, tc, "application/rtf");
- assertEquals(s1, s2);
- String content = s1;
- content = content.replaceAll("\ufffd", " ");
- content = content.replaceAll("\\s+"," ");
- assertTrue(content.contains("a b c d \u00E4 \u00EB \u00F6 \u00FC"));
- }
-
public void testXMLExtraction() throws Exception {
File file = getResourceAsFile("/test-documents/testXML.xml");
String s1 = ParseUtils.getStringContent(file, tc);
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=1171171&r1=1171170&r2=1171171&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java Thu Sep 15 16:42:06 2011
@@ -74,6 +74,7 @@ public class PowerPointParserTest extend
assertContains("Here is a citation:", content);
assertContains("Figure 1 This is a caption for Figure 1", content);
assertContains("(Kramer)", content);
+ assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," "));
assertContains("This is a hyperlink", content);
assertContains("Here is a list:", content);
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1171171&r1=1171170&r2=1171171&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Thu Sep 15 16:42:06 2011
@@ -212,6 +212,7 @@ public class WordParserTest extends Tika
assertContains("Here is a citation:", content);
assertContains("Figure 1 This is a caption for Figure 1", content);
assertContains("(Kramer)", content);
+ assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," "));
assertContains("This is a hyperlink", content);
assertContains("Here is a list:", content);
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1171171&r1=1171170&r2=1171171&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Thu Sep 15 16:42:06 2011
@@ -482,6 +482,7 @@ public class OOXMLParserTest extends Tik
assertContains("Here is a citation:", content);
assertContains("Figure 1 This is a caption for Figure 1", content);
assertContains("(Kramer)", content);
+ assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," "));
assertContains("This is a hyperlink", content);
assertContains("Here is a list:", content);
@@ -551,6 +552,7 @@ public class OOXMLParserTest extends Tik
assertContains("Here is a citation:", content);
assertContains("Figure 1 This is a caption for Figure 1", content);
assertContains("(Kramer)", content);
+ assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," "));
assertContains("This is a hyperlink", content);
assertContains("Here is a list:", content);
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1171171&r1=1171170&r2=1171171&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Thu Sep 15 16:42:06 2011
@@ -171,6 +171,7 @@ public class PDFParserTest extends TikaT
assertContains("Here is a citation:", content);
assertContains("Figure 1 This is a caption for Figure 1", content);
assertContains("(Kramer)", content);
+ assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+"," "));
assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+"," "));
assertContains("This is a hyperlink", content);
assertContains("Here is a list:", content);