You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/16 19:23:06 UTC

svn commit: r1725014 [28/28] - in /tika/branches/2.x: tika-parser-bundles/tika-multimedia-bundle/ tika-parser-modules/ tika-parser-modules/tika-advanced-module/ tika-parser-modules/tika-advanced-parser-module/ tika-parser-modules/tika-advanced-parser-m...

Added: tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,111 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Tests {@link DcXMLParser} against the XML documents under
+ * {@code /test-documents}.
+ */
+public class DcXMLParserTest extends TikaTest {
+
+    /** Expected {@code dc:subject} values of {@code testXML.xml}, in file order. */
+    private static final String[] SUBJECTS = {"Java", "XML", "XSLT", "JDOM", "Indexation"};
+
+    /**
+     * Parses {@code testXML.xml} and checks the content type, the extracted
+     * metadata values and the body text.
+     */
+    @Test
+    public void testXMLParserAsciiChars() throws Exception {
+        try (InputStream input = DcXMLParserTest.class.getResourceAsStream(
+                "/test-documents/testXML.xml")) {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new DcXMLParser().parse(input, handler, metadata, new ParseContext());
+
+            assertEquals(
+                    "application/xml",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals("Tika test document", metadata.get(TikaCoreProperties.TITLE));
+            assertEquals("Rida Benjelloun", metadata.get(TikaCoreProperties.CREATOR));
+
+            // The file contains 5 dc:subject tags, which come through as
+            //  a multi-valued Tika Metadata entry in file order
+            assertTrue(metadata.isMultiValued(TikaCoreProperties.KEYWORDS));
+            assertArrayEquals(SUBJECTS, metadata.getValues(TikaCoreProperties.KEYWORDS));
+            assertTrue(metadata.isMultiValued(Metadata.SUBJECT));
+            assertArrayEquals(SUBJECTS, metadata.getValues(Metadata.SUBJECT));
+
+            assertEquals(
+                    "Framework d\'indexation des documents XML, HTML, PDF etc..",
+                    metadata.get(TikaCoreProperties.DESCRIPTION));
+            assertEquals(
+                    "http://www.apache.org",
+                    metadata.get(TikaCoreProperties.IDENTIFIER));
+            assertEquals("test", metadata.get(TikaCoreProperties.TYPE));
+            assertEquals("application/msword", metadata.get(TikaCoreProperties.FORMAT));
+            assertEquals("Fr", metadata.get(TikaCoreProperties.LANGUAGE));
+            assertTrue(metadata.get(TikaCoreProperties.RIGHTS).contains("testing chars"));
+
+            String content = handler.toString();
+            assertContains("Tika test document", content);
+
+            assertEquals("2000-12-01T00:00:00.000Z", metadata.get(TikaCoreProperties.CREATED));
+        }
+    }
+
+    /**
+     * Checks that the non-ASCII characters in the rights statement of
+     * {@code testXML.xml} are preserved in the extracted metadata.
+     */
+    @Test
+    public void testXMLParserNonAsciiChars() throws Exception {
+        try (InputStream input = DcXMLParserTest.class.getResourceAsStream("/test-documents/testXML.xml")) {
+            Metadata metadata = new Metadata();
+            new DcXMLParser().parse(input, new DefaultHandler(), metadata, new ParseContext());
+
+            final String expected = "Archim\u00E8de et Lius \u00E0 Ch\u00E2teauneuf testing chars en \u00E9t\u00E9";
+            assertEquals(expected, metadata.get(TikaCoreProperties.RIGHTS));
+        }
+    }
+
+    /**
+     * TIKA-1048: the XML produced for {@code testXML2.xml} must not contain
+     * the text {@code testSubject}.
+     */
+    @Test
+    public void testNoSpaces() throws Exception {
+        String text = getXML("testXML2.xml").xml;
+        assertFalse(text.contains("testSubject"));
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,146 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.TeeContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Tests how {@link ElementMetadataHandler} treats empty and duplicate XML
+ * elements, using {@code testXML3.xml} and two {@link XMLParser} subclasses
+ * that map custom {@code FirstName}/{@code LastName} elements to metadata.
+ */
+public class EmptyAndDuplicateElementsXMLParserTest extends TikaTest {
+
+    private static final Property FIRST_NAME = Property.internalTextBag(
+            "custom" + Metadata.NAMESPACE_PREFIX_DELIMITER + "FirstName");
+    private static final Property LAST_NAME = Property.internalTextBag(
+            "custom" + Metadata.NAMESPACE_PREFIX_DELIMITER + "LastName");
+
+    /**
+     * With the four-argument {@link ElementMetadataHandler} constructor, the
+     * empty and the repeated last name are not recorded.
+     */
+    @Test
+    public void testDefaultBehavior() throws Exception {
+        try (InputStream input = EmptyAndDuplicateElementsXMLParserTest.class.getResourceAsStream(
+                "/test-documents/testXML3.xml")) {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new DefaultCustomXMLTestParser().parse(input, handler, metadata, new ParseContext());
+
+            String[] firstNames = metadata.getValues(FIRST_NAME);
+            String[] lastNames = metadata.getValues(LAST_NAME);
+            assertEquals(4, firstNames.length);
+            assertEquals(2, lastNames.length);
+
+            assertEquals("John", firstNames[0]);
+            assertEquals("Smith", lastNames[0]);
+
+            assertEquals("Jane", firstNames[1]);
+            assertEquals("Doe", lastNames[1]);
+
+            // Bob's last name is empty, so no entry is recorded for it at all
+            assertEquals("Bob", firstNames[2]);
+
+            // Kate's last name is not recorded because it duplicates an earlier value
+            assertEquals("Kate", firstNames[3]);
+        }
+    }
+
+    /**
+     * With both extra boolean constructor flags set to {@code true}, the empty
+     * and the repeated last name are recorded as well.
+     */
+    @Test
+    public void testEmptiesAndRepeats() throws Exception {
+        try (InputStream input = EmptyAndDuplicateElementsXMLParserTest.class.getResourceAsStream(
+                "/test-documents/testXML3.xml")) {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new AllowEmptiesAndDuplicatesCustomXMLTestParser().parse(input, handler, metadata, new ParseContext());
+
+            String[] firstNames = metadata.getValues(FIRST_NAME);
+            String[] lastNames = metadata.getValues(LAST_NAME);
+            assertEquals(4, firstNames.length);
+            assertEquals(4, lastNames.length);
+
+            assertEquals("John", firstNames[0]);
+            assertEquals("Smith", lastNames[0]);
+
+            assertEquals("Jane", firstNames[1]);
+            assertEquals("Doe", lastNames[1]);
+
+            assertEquals("Bob", firstNames[2]);
+            assertEquals("", lastNames[2]);
+
+            assertEquals("Kate", firstNames[3]);
+            assertEquals("Smith", lastNames[3]);
+        }
+    }
+
+    /** XML parser that additionally feeds the custom name elements into the metadata. */
+    private static class DefaultCustomXMLTestParser extends XMLParser {
+
+        private static final long serialVersionUID = 2458579047014545931L;
+
+        /** Creates the handler for one custom element; overridden by subclasses to change its options. */
+        protected ElementMetadataHandler getCustomElementHandler(Metadata metadata, Property tikaProperty, String localPart) {
+            return new ElementMetadataHandler(
+                    "http://custom",
+                    localPart,
+                    metadata,
+                    tikaProperty);
+        }
+
+        @Override
+        protected ContentHandler getContentHandler(
+                ContentHandler handler, Metadata metadata, ParseContext context) {
+            return new TeeContentHandler(
+                    super.getContentHandler(handler, metadata, context),
+                    getCustomElementHandler(metadata, FIRST_NAME, "FirstName"),
+                    getCustomElementHandler(metadata, LAST_NAME, "LastName"));
+        }
+    }
+
+    /** Variant that passes {@code true} for both optional {@link ElementMetadataHandler} flags. */
+    private static class AllowEmptiesAndDuplicatesCustomXMLTestParser extends DefaultCustomXMLTestParser {
+
+        private static final long serialVersionUID = 3735646809954466229L;
+
+        @Override
+        protected ElementMetadataHandler getCustomElementHandler(Metadata metadata, Property tikaProperty, String localPart) {
+            return new ElementMetadataHandler(
+                    "http://custom",
+                    localPart,
+                    metadata,
+                    tikaProperty,
+                    true,
+                    true);
+        }
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest.TrackingHandler;
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Tests {@link FictionBookParser} text extraction and embedded-resource
+ * extraction using {@code test.fb2}.
+ */
+public class FictionBookParserTest {
+
+    /** Parses {@code test.fb2} and checks that the body text contains "1812". */
+    @Test
+    public void testFB2() throws Exception {
+        try (InputStream input = FictionBookParserTest.class.getResourceAsStream("/test-documents/test.fb2")) {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new FictionBookParser().parse(input, handler, metadata, new ParseContext());
+            String content = handler.toString();
+
+            assertContains("1812", content);
+        }
+    }
+
+    /**
+     * Runs a {@link ParserContainerExtractor} over {@code test.fb2} and checks
+     * that the tracking handler records two filenames.
+     */
+    @Test
+    public void testEmbedded() throws Exception {
+        // Close the TikaInputStream itself (not just the raw stream) so that any
+        // temporary resources it allocated are released.
+        try (InputStream input = FictionBookParserTest.class.getResourceAsStream("/test-documents/test.fb2");
+             TikaInputStream stream = TikaInputStream.get(input)) {
+            ContainerExtractor extractor = new ParserContainerExtractor();
+
+            assertTrue(extractor.isSupported(stream));
+
+            // Process it
+            TrackingHandler handler = new TrackingHandler();
+            extractor.extract(stream, null, handler);
+
+            assertEquals(2, handler.filenames.size());
+        }
+    }
+}

Modified: tika/branches/2.x/tika-parser-modules/tika-web-parser-module/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-parser-module/pom.xml?rev=1725014&r1=1725011&r2=1725014&view=diff
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-parser-module/pom.xml (original)
+++ tika/branches/2.x/tika-parser-modules/tika-web-parser-module/pom.xml Sat Jan 16 18:23:01 2016
@@ -19,8 +19,8 @@
     <version>2.0-SNAPSHOT</version>
   </parent>
 
-  <artifactId>tika-web-module</artifactId>
-  <name>Apache Tika Web Module</name>
+  <artifactId>tika-web-parser-module</artifactId>
+  <name>Apache Tika Web Parser Module</name>
   <url>http://tika.apache.org/</url>
   
   <properties>
@@ -73,13 +73,13 @@
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-text-module</artifactId>
+      <artifactId>tika-text-parser-module</artifactId>
       <version>${project.version}</version>
       <scope>test</scope>
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-package-module</artifactId>
+      <artifactId>tika-package-parser-module</artifactId>
       <version>${project.version}</version>
       <scope>test</scope>
     </dependency>

Modified: tika/branches/2.x/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parsers/pom.xml?rev=1725014&r1=1725013&r2=1725014&view=diff
==============================================================================
--- tika/branches/2.x/tika-parsers/pom.xml (original)
+++ tika/branches/2.x/tika-parsers/pom.xml Sat Jan 16 18:23:01 2016
@@ -147,67 +147,67 @@
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-multimedia-module</artifactId>
+      <artifactId>tika-multimedia-parser-module</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-advanced-module</artifactId>
+      <artifactId>tika-advanced-parser-module</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-cad-module</artifactId>
+      <artifactId>tika-cad-parser-module</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-code-module</artifactId>
+      <artifactId>tika-code-parser-module</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-database-module</artifactId>
+      <artifactId>tika-database-parser-module</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-ebook-module</artifactId>
+      <artifactId>tika-ebook-parser-module</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-journal-module</artifactId>
+      <artifactId>tika-journal-parser-module</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-office-module</artifactId>
+      <artifactId>tika-office-parser-module</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-package-module</artifactId>
+      <artifactId>tika-package-parser-module</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-pdf-module</artifactId>
+      <artifactId>tika-pdf-parser-module</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-scientific-module</artifactId>
+      <artifactId>tika-scientific-parser-module</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-text-module</artifactId>
+      <artifactId>tika-text-parser-module</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
-      <artifactId>tika-web-module</artifactId>
+      <artifactId>tika-web-parser-module</artifactId>
       <version>${project.version}</version>
     </dependency>
     <dependency>
@@ -322,19 +322,19 @@
               </createDependencyReducedPom>
               <artifactSet>
                 <includes>
-                  <include>org.apache.tika:tika-multimedia-module</include>
-                  <include>org.apache.tika:tika-advanced-module</include>
-                  <include>org.apache.tika:tika-cad-module</include>
-                  <include>org.apache.tika:tika-code-module</include>
-                  <include>org.apache.tika:tika-database-module</include>
-                  <include>org.apache.tika:tika-ebook-module</include>
-                  <include>org.apache.tika:tika-journal-module</include>
-                  <include>org.apache.tika:tika-office-module</include>
-                  <include>org.apache.tika:tika-package-module</include>
-                  <include>org.apache.tika:tika-pdf-module</include>
-                  <include>org.apache.tika:tika-scientific-module</include>
-                  <include>org.apache.tika:tika-text-module</include>
-                  <include>org.apache.tika:tika-web-module</include>
+                  <include>org.apache.tika:tika-multimedia-parser-module</include>
+                  <include>org.apache.tika:tika-advanced-parser-module</include>
+                  <include>org.apache.tika:tika-cad-parser-module</include>
+                  <include>org.apache.tika:tika-code-parser-module</include>
+                  <include>org.apache.tika:tika-database-parser-module</include>
+                  <include>org.apache.tika:tika-ebook-parser-module</include>
+                  <include>org.apache.tika:tika-journal-parser-module</include>
+                  <include>org.apache.tika:tika-office-parser-module</include>
+                  <include>org.apache.tika:tika-package-parser-module</include>
+                  <include>org.apache.tika:tika-pdf-parser-module</include>
+                  <include>org.apache.tika:tika-scientific-parser-module</include>
+                  <include>org.apache.tika:tika-text-parser-module</include>
+                  <include>org.apache.tika:tika-web-parser-module</include>
                 </includes>
               </artifactSet>
               <transformers>