You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/07/15 16:53:21 UTC
svn commit: r1147172 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/rtf/RTFParser.java
test/java/org/apache/tika/parser/rtf/
test/java/org/apache/tika/parser/rtf/RTFParserTest.java
Author: nick
Date: Fri Jul 15 14:53:20 2011
New Revision: 1147172
URL: http://svn.apache.org/viewvc?rev=1147172&view=rev
Log:
TIKA-683 Create a dedicated RTF parser test, based on the existing checks in TestParsers
Added:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java?rev=1147172&r1=1147171&r2=1147172&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java Fri Jul 15 14:53:20 2011
@@ -121,6 +121,9 @@ public class RTFParser extends AbstractP
xhtml.startDocument();
xhtml.element("p", sd.getText(0, sd.getLength()));
xhtml.endDocument();
+
+ // TODO Extract some of the metadata
+ metadata.add(Metadata.CONTENT_TYPE, "application/rtf");
} finally {
in.close();
}
Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java?rev=1147172&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java Fri Jul 15 14:53:20 2011
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.rtf;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.StringWriter;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.WriteOutContentHandler;
+
+/**
+ * JUnit test class for the Tika {@link RTFParser}. Every test parses a sample
+ * from /test-documents/ and checks the extracted text for expected strings.
+ */
+public class RTFParserTest extends TikaTest {
+    // Assigned in setUp() but not read by any test in this class.
+    private TikaConfig tc;
+    private RTFParser parser;
+
+    /** Creates a fresh parser; presumably run by the test runner before each test. */
+    public void setUp() throws Exception {
+        tc = TikaConfig.getDefaultConfig();
+        parser = new RTFParser();
+    }
+
+    /** Checks the reported content type and two text fragments of testRTF.rtf. */
+    public void testBasicExtraction() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = getText("testRTF.rtf", metadata);
+        assertEquals("application/rtf", metadata.get(Metadata.CONTENT_TYPE));
+        assertContains("Test", content);
+        assertContains("indexation Word", content);
+    }
+
+    /** Checks that a Japanese greeting is found in the text of testRTF-ms932.rtf. */
+    public void testRTFms932Extraction() throws Exception {
+        String content = getText("testRTF-ms932.rtf");
+        // Hello in Japanese
+        assertContains("\u3053\u3093\u306b\u3061\u306f", content);
+    }
+
+    /** Checks that a word starting with a capital U with diaeresis is extracted intact. */
+    public void testRTFUmlautSpacesExtraction() throws Exception {
+        String content = getText("testRTFUmlautSpaces.rtf");
+        assertContains("\u00DCbersicht", content);
+    }
+
+    /** Checks two Czech phrases in the text of testRTFWordPadCzechCharacters.rtf. */
+    public void testRTFWordPadCzechCharactersExtraction() throws Exception {
+        String content = getText("testRTFWordPadCzechCharacters.rtf");
+        assertContains("\u010Cl\u00E1nek t\u00FDdne", content);
+        assertContains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty", content);
+    }
+
+    /** Checks the same two Czech phrases in testRTFWord2010CzechCharacters.rtf. */
+    public void testRTFWord2010CzechCharactersExtraction() throws Exception {
+        String content = getText("testRTFWord2010CzechCharacters.rtf");
+        assertContains("\u010Cl\u00E1nek t\u00FDdne", content);
+        assertContains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty", content);
+    }
+
+    /** Checks that the expected letters come out separated by whitespace. */
+    public void testRTFTableCellSeparation() throws Exception {
+        String content = getText("testRTFTableCellSeparation.rtf");
+        // Collapse each whitespace run to one space so any separator matches.
+        content = content.replaceAll("\\s+"," ");
+        assertContains("a b c d \u00E4 \u00EB \u00F6 \u00FC", content);
+    }
+
+    /** Returns the text extracted from the named file under /test-documents/. */
+    private String getText(String filename) throws Exception {
+        return getText(filename, new Metadata());
+    }
+
+    /** Parses the named test document into metadata and returns its text; closes the stream. */
+    private String getText(String filename, Metadata metadata) throws Exception {
+        File file = getResourceAsFile("/test-documents/" + filename);
+        StringWriter writer = new StringWriter();
+        FileInputStream stream = new FileInputStream(file);
+        try {
+            parser.parse(stream, new WriteOutContentHandler(writer), metadata, new ParseContext());
+        } finally {
+            stream.close();
+        }
+        return writer.toString();
+    }
+}