You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2011/11/09 00:21:38 UTC

svn commit: r1199524 - in /tika/trunk: ./ tika-parsers/src/main/java/org/apache/tika/parser/rtf/ tika-parsers/src/test/java/org/apache/tika/parser/rtf/ tika-parsers/src/test/resources/test-documents/

Author: mikemccand
Date: Tue Nov  8 23:21:38 2011
New Revision: 1199524

URL: http://svn.apache.org/viewvc?rev=1199524&view=rev
Log:
TIKA-777: process buffered bytes/text on font change

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testFontAfterBufferedText.rtf   (with props)
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1199524&r1=1199523&r2=1199524&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Tue Nov  8 23:21:38 2011
@@ -9,6 +9,10 @@ Release 1.1 - Current Development
    speedups to text extraction and may workaround cases where
    non-duplicated characters were incorrectly removed.  (TIKA-767)
 
+ * RTF: Fixed case where a font change would result in processing
+   bytes in the wrong font's charset, producing bogus text output
+   (TIKA-777)
+
 Release 1.0 - 11/4/2011
 ---------------------------------
 

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java?rev=1199524&r1=1199523&r2=1199524&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java Tue Nov  8 23:21:38 2011
@@ -772,6 +772,11 @@ final class TextExtractor {
             } else if (equals("f")) {
                 // Change current font
                 final String fontCharset = fontToCharset.get((int) param);
+
+                // Push any buffered text before changing
+                // font:
+                pushText();
+
                 if (fontCharset != null) {
                     groupState.fontCharset = fontCharset;
                 } else {

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java?rev=1199524&r1=1199523&r2=1199524&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java Tue Nov  8 23:21:38 2011
@@ -274,6 +274,11 @@ public class RTFParserTest extends TikaT
         assertContains("<p>The quick brown fox jumps over the lazy dog</p>", getXML("testRTFIgnoredControlWord.rtf").xml);
     }
 
+    public void testFontAfterBufferedText() throws Exception {
+        assertContains("\u0423\u0432\u0430\u0436\u0430\u0435\u043c\u044b\u0439 \u043a\u043b\u0438\u0435\u043d\u0442!",
+                       getXML("testFontAfterBufferedText.rtf").xml);
+    }
+
     private Result getResult(String filename) throws Exception {
         File file = getResourceAsFile("/test-documents/" + filename);
        

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testFontAfterBufferedText.rtf
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testFontAfterBufferedText.rtf?rev=1199524&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testFontAfterBufferedText.rtf (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testFontAfterBufferedText.rtf Tue Nov  8 23:21:38 2011
@@ -0,0 +1,7 @@
+{\rtf1\ansi\ansicpg1252\fromtext \fbidis \deff0
+{\fonttbl
+
+{\f0\fswiss\fcharset0 Arial;} {\f1\fswiss\fcharset204 Arial;}
+}
+\par{\f1\fs20 \'d3\'e2\'e0\'e6\'e0\'e5\'ec\'fb\'e9 \'ea\'eb\'e8\'e5\'ed\'f2!\f0}\par
+}

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testFontAfterBufferedText.rtf
------------------------------------------------------------------------------
    svn:eol-style = native