You are viewing a plain text version of this content. The canonical link for it was a hyperlink that is not preserved in this plain-text version.
Posted to commits@tika.apache.org by mi...@apache.org on 2011/11/09 00:21:38 UTC
svn commit: r1199524 - in /tika/trunk: ./
tika-parsers/src/main/java/org/apache/tika/parser/rtf/
tika-parsers/src/test/java/org/apache/tika/parser/rtf/
tika-parsers/src/test/resources/test-documents/
Author: mikemccand
Date: Tue Nov 8 23:21:38 2011
New Revision: 1199524
URL: http://svn.apache.org/viewvc?rev=1199524&view=rev
Log:
TIKA-777: process buffered bytes/text on font change
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testFontAfterBufferedText.rtf (with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1199524&r1=1199523&r2=1199524&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Tue Nov 8 23:21:38 2011
@@ -9,6 +9,10 @@ Release 1.1 - Current Development
speedups to text extraction and may workaround cases where
non-duplicated characters were incorrectly removed. (TIKA-767)
+ * RTF: Fixed case where a font change would result in processing
+ bytes in the wrong font's charset, producing bogus text output
+ (TIKA-777)
+
Release 1.0 - 11/4/2011
---------------------------------
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java?rev=1199524&r1=1199523&r2=1199524&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java Tue Nov 8 23:21:38 2011
@@ -772,6 +772,11 @@ final class TextExtractor {
} else if (equals("f")) {
// Change current font
final String fontCharset = fontToCharset.get((int) param);
+
+ // Push any buffered text before changing
+ // font:
+ pushText();
+
if (fontCharset != null) {
groupState.fontCharset = fontCharset;
} else {
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java?rev=1199524&r1=1199523&r2=1199524&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java Tue Nov 8 23:21:38 2011
@@ -274,6 +274,11 @@ public class RTFParserTest extends TikaT
assertContains("<p>The quick brown fox jumps over the lazy dog</p>", getXML("testRTFIgnoredControlWord.rtf").xml);
}
+ public void testFontAfterBufferedText() throws Exception {
+ assertContains("\u0423\u0432\u0430\u0436\u0430\u0435\u043c\u044b\u0439 \u043a\u043b\u0438\u0435\u043d\u0442!",
+ getXML("testFontAfterBufferedText.rtf").xml);
+ }
+
private Result getResult(String filename) throws Exception {
File file = getResourceAsFile("/test-documents/" + filename);
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testFontAfterBufferedText.rtf
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testFontAfterBufferedText.rtf?rev=1199524&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testFontAfterBufferedText.rtf (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testFontAfterBufferedText.rtf Tue Nov 8 23:21:38 2011
@@ -0,0 +1,7 @@
+{\rtf1\ansi\ansicpg1252\fromtext \fbidis \deff0
+{\fonttbl
+
+{\f0\fswiss\fcharset0 Arial;} {\f1\fswiss\fcharset204 Arial;}
+}
+\par{\f1\fs20 \'d3\'e2\'e0\'e6\'e0\'e5\'ec\'fb\'e9 \'ea\'eb\'e8\'e5\'ed\'f2!\f0}\par
+}
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testFontAfterBufferedText.rtf
------------------------------------------------------------------------------
svn:eol-style = native