You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2013/05/30 17:16:16 UTC

svn commit: r1487894 - in /tika/trunk: ./ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ tika-parsers/src/test/resources/test-documents/

Author: mikemccand
Date: Thu May 30 15:16:16 2013
New Revision: 1487894

URL: http://svn.apache.org/r1487894
Log:
TIKA-1128: replace line tabulation with line break

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_tabular_symbol.doc   (with props)
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1487894&r1=1487893&r2=1487894&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Thu May 30 15:16:16 2013
@@ -28,6 +28,9 @@ Release 1.4 Current Development
     embedded documents in TikaCLI are now logged instead
     of aborting extraction (TIKA-1074)
 
+  * MS Word: the line tabulation character (vertical tab, U+000B) is
+    now replaced with a newline (TIKA-1128)
+
 Release 1.3 - 01/19/2013
 
   * Mimetype definitions added for more common programming languages,

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1487894&r1=1487893&r2=1487894&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Thu May 30 15:16:16 2013
@@ -309,6 +309,9 @@ public class WordExtractor extends Abstr
 
        // Copied from POI's org/apache/poi/hwpf/converter/AbstractWordConverter.processCharacters:
 
+       // Replace line tabulation (vertical tab, 0x0B) with a line break
+       text = text.replace((char)0x000b,'\n');
+
        // Non-breaking hyphens are returned as char 30
        text = text.replace((char) 30, UNICODECHAR_NONBREAKING_HYPHEN);
 

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1487894&r1=1487893&r2=1487894&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Thu May 30 15:16:16 2013
@@ -330,4 +330,8 @@ public class WordParserTest extends Tika
         Logger.getRootLogger().setLevel(logLevelStart);
       }
     }
+
+    public void testTabularSymbol() throws Exception {
+        assertContains("one\ntwo", getXML("testWORD_tabular_symbol.doc").xml);
+    }
 }

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_tabular_symbol.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_tabular_symbol.doc?rev=1487894&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_tabular_symbol.doc
------------------------------------------------------------------------------
    svn:mime-type = application/msword