You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2013/05/30 17:16:16 UTC
svn commit: r1487894 - in /tika/trunk: ./
tika-parsers/src/main/java/org/apache/tika/parser/microsoft/
tika-parsers/src/test/java/org/apache/tika/parser/microsoft/
tika-parsers/src/test/resources/test-documents/
Author: mikemccand
Date: Thu May 30 15:16:16 2013
New Revision: 1487894
URL: http://svn.apache.org/r1487894
Log:
TIKA-1128: replace line tabulation with line break
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_tabular_symbol.doc (with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1487894&r1=1487893&r2=1487894&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Thu May 30 15:16:16 2013
@@ -28,6 +28,9 @@ Release 1.4 Current Development
embedded documents in TikaCLI are now logged instead
of aborting extraction (TIKA-1074)
+ * MS Word: the line tabulation character (vertical tab, 0x0B) is now replaced with a newline
+ (TIKA-1128)
+
Release 1.3 - 01/19/2013
* Mimetype definitions added for more common programming languages,
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1487894&r1=1487893&r2=1487894&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Thu May 30 15:16:16 2013
@@ -309,6 +309,9 @@ public class WordExtractor extends Abstr
// Copied from POI's org/apache/poi/hwpf/converter/AbstractWordConverter.processCharacters:
+ // Treat the line tabulation character (vertical tab, 0x0B) as a line break
+ text = text.replace((char)0x000b,'\n');
+
// Non-breaking hyphens are returned as char 30
text = text.replace((char) 30, UNICODECHAR_NONBREAKING_HYPHEN);
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1487894&r1=1487893&r2=1487894&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Thu May 30 15:16:16 2013
@@ -330,4 +330,8 @@ public class WordParserTest extends Tika
Logger.getRootLogger().setLevel(logLevelStart);
}
}
+
+ public void testTabularSymbol() throws Exception {
+ assertContains("one\ntwo", getXML("testWORD_tabular_symbol.doc").xml);
+ }
}
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_tabular_symbol.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_tabular_symbol.doc?rev=1487894&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_tabular_symbol.doc
------------------------------------------------------------------------------
svn:mime-type = application/msword