You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/07/15 16:53:21 UTC

svn commit: r1147172 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/rtf/RTFParser.java test/java/org/apache/tika/parser/rtf/ test/java/org/apache/tika/parser/rtf/RTFParserTest.java

Author: nick
Date: Fri Jul 15 14:53:20 2011
New Revision: 1147172

URL: http://svn.apache.org/viewvc?rev=1147172&view=rev
Log:
TIKA-683 Create a dedicated RTF parser test, based on the existing checks in TestParsers

Added:
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java?rev=1147172&r1=1147171&r2=1147172&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java Fri Jul 15 14:53:20 2011
@@ -121,6 +121,9 @@ public class RTFParser extends AbstractP
                 xhtml.startDocument();
                 xhtml.element("p", sd.getText(0, sd.getLength()));
                 xhtml.endDocument();
+                
+                // TODO Extract some of the metadata
+                metadata.add(Metadata.CONTENT_TYPE, "application/rtf");
             } finally {
                 in.close();
             }

Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java?rev=1147172&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java Fri Jul 15 14:53:20 2011
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.rtf;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.StringWriter;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.WriteOutContentHandler;
+
+/**
+ * JUnit test class for the Tika {@link RTFParser}.
+ *
+ * Every test parses a file from the "/test-documents/" test resources and
+ * checks the text written out by the parser.
+ */
+public class RTFParserTest extends TikaTest {
+    private RTFParser parser;
+
+    /** Creates a fresh parser before each test. */
+    public void setUp() throws Exception {
+        parser = new RTFParser();
+    }
+
+    /** Checks the reported content type and some plain text of a simple document. */
+    public void testBasicExtraction() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = getText("testRTF.rtf", metadata);
+
+        assertEquals("application/rtf", metadata.get(Metadata.CONTENT_TYPE));
+        assertContains("Test", content);
+        assertContains("indexation Word", content);
+    }
+
+    public void testRTFms932Extraction() throws Exception {
+        String content = getText("testRTF-ms932.rtf");
+
+        // Hello in Japanese
+        assertContains("\u3053\u3093\u306b\u3061\u306f", content);
+    }
+
+    public void testRTFUmlautSpacesExtraction() throws Exception {
+        String content = getText("testRTFUmlautSpaces.rtf");
+
+        assertContains("\u00DCbersicht", content);
+    }
+
+    public void testRTFWordPadCzechCharactersExtraction() throws Exception {
+        String content = getText("testRTFWordPadCzechCharacters.rtf");
+
+        assertContains("\u010Cl\u00E1nek t\u00FDdne", content);
+        assertContains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty", content);
+    }
+
+    public void testRTFWord2010CzechCharactersExtraction() throws Exception {
+        String content = getText("testRTFWord2010CzechCharacters.rtf");
+
+        assertContains("\u010Cl\u00E1nek t\u00FDdne", content);
+        assertContains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty", content);
+    }
+
+    public void testRTFTableCellSeparation() throws Exception {
+        String content = getText("testRTFTableCellSeparation.rtf");
+
+        // Collapse whitespace runs so the check is independent of how cells are separated
+        content = content.replaceAll("\\s+"," ");
+        assertContains("a b c d \u00E4 \u00EB \u00F6 \u00FC", content);
+    }
+
+    /** Extracts the text of a test document, discarding its metadata. */
+    private String getText(String filename) throws Exception {
+        return getText(filename, new Metadata());
+    }
+
+    /** Parses /test-documents/filename into metadata and returns the written-out text. */
+    private String getText(String filename, Metadata metadata) throws Exception {
+        File file = getResourceAsFile("/test-documents/" + filename);
+        StringWriter writer = new StringWriter();
+        FileInputStream stream = new FileInputStream(file);
+        try {
+            parser.parse(stream, new WriteOutContentHandler(writer), metadata, new ParseContext());
+        } finally {
+            stream.close(); // always release the file handle, even if parsing fails
+        }
+        return writer.toString();
+    }
+}