You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/16 19:23:06 UTC
svn commit: r1725014 [28/28] - in /tika/branches/2.x:
tika-parser-bundles/tika-multimedia-bundle/ tika-parser-modules/
tika-parser-modules/tika-advanced-module/
tika-parser-modules/tika-advanced-parser-module/
tika-parser-modules/tika-advanced-parser-m...
Added: tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,101 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class DcXMLParserTest extends TikaTest {
+
+ @Test
+ public void testXMLParserAsciiChars() throws Exception {
+ try (InputStream input = DcXMLParserTest.class.getResourceAsStream(
+ "/test-documents/testXML.xml")) {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new DcXMLParser().parse(input, handler, metadata);
+
+ assertEquals(
+ "application/xml",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Tika test document", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Rida Benjelloun", metadata.get(TikaCoreProperties.CREATOR));
+
+ // The file contains 5 dc:subject tags, which come through as
+ // a multi-valued Tika Metadata entry in file order
+ assertEquals(true, metadata.isMultiValued(TikaCoreProperties.KEYWORDS));
+ assertEquals(5, metadata.getValues(TikaCoreProperties.KEYWORDS).length);
+ assertEquals("Java", metadata.getValues(TikaCoreProperties.KEYWORDS)[0]);
+ assertEquals("XML", metadata.getValues(TikaCoreProperties.KEYWORDS)[1]);
+ assertEquals("XSLT", metadata.getValues(TikaCoreProperties.KEYWORDS)[2]);
+ assertEquals("JDOM", metadata.getValues(TikaCoreProperties.KEYWORDS)[3]);
+ assertEquals("Indexation", metadata.getValues(TikaCoreProperties.KEYWORDS)[4]);
+ assertEquals(true, metadata.isMultiValued(Metadata.SUBJECT));
+ assertEquals(5, metadata.getValues(Metadata.SUBJECT).length);
+ assertEquals("Java", metadata.getValues(Metadata.SUBJECT)[0]);
+ assertEquals("XML", metadata.getValues(Metadata.SUBJECT)[1]);
+ assertEquals("XSLT", metadata.getValues(Metadata.SUBJECT)[2]);
+ assertEquals("JDOM", metadata.getValues(Metadata.SUBJECT)[3]);
+ assertEquals("Indexation", metadata.getValues(Metadata.SUBJECT)[4]);
+
+ assertEquals(
+ "Framework d\'indexation des documents XML, HTML, PDF etc..",
+ metadata.get(TikaCoreProperties.DESCRIPTION));
+ assertEquals(
+ "http://www.apache.org",
+ metadata.get(TikaCoreProperties.IDENTIFIER));
+ assertEquals("test", metadata.get(TikaCoreProperties.TYPE));
+ assertEquals("application/msword", metadata.get(TikaCoreProperties.FORMAT));
+ assertEquals("Fr", metadata.get(TikaCoreProperties.LANGUAGE));
+ assertTrue(metadata.get(TikaCoreProperties.RIGHTS).contains("testing chars"));
+
+ String content = handler.toString();
+ assertContains("Tika test document", content);
+
+ assertEquals("2000-12-01T00:00:00.000Z", metadata.get(TikaCoreProperties.CREATED));
+ }
+ }
+
+ @Test
+ public void testXMLParserNonAsciiChars() throws Exception {
+ try (InputStream input = DcXMLParserTest.class.getResourceAsStream("/test-documents/testXML.xml")) {
+ Metadata metadata = new Metadata();
+ new DcXMLParser().parse(input, new DefaultHandler(), metadata);
+
+ final String expected = "Archim\u00E8de et Lius \u00E0 Ch\u00E2teauneuf testing chars en \u00E9t\u00E9";
+ assertEquals(expected, metadata.get(TikaCoreProperties.RIGHTS));
+ }
+ }
+
+ // TIKA-1048
+ @Test
+ public void testNoSpaces() throws Exception {
+ String text = getXML("testXML2.xml").xml;
+ assertFalse(text.contains("testSubject"));
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.TeeContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/** Checks which element values the custom XML parsers record when values are empty or repeated. */
+public class EmptyAndDuplicateElementsXMLParserTest extends TikaTest {
+
+    // Constants shared with the static nested parsers below, hence static final.
+    private static final Property FIRST_NAME = Property.internalTextBag(
+            "custom" + Metadata.NAMESPACE_PREFIX_DELIMITER + "FirstName");
+    private static final Property LAST_NAME = Property.internalTextBag(
+            "custom" + Metadata.NAMESPACE_PREFIX_DELIMITER + "LastName");
+
+    /** With the default handlers, testXML3.xml yields four first names but only two last names. */
+    @Test
+    public void testDefaultBehavior() throws Exception {
+        try (InputStream input = EmptyAndDuplicateElementsXMLParserTest.class.getResourceAsStream(
+                "/test-documents/testXML3.xml")) {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new DefaultCustomXMLTestParser().parse(input, handler, metadata, new ParseContext());
+
+            assertEquals(4, metadata.getValues(FIRST_NAME).length);
+            assertEquals(2, metadata.getValues(LAST_NAME).length);
+
+            assertEquals("John", metadata.getValues(FIRST_NAME)[0]);
+            assertEquals("Smith", metadata.getValues(LAST_NAME)[0]);
+
+            assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]);
+            assertEquals("Doe", metadata.getValues(LAST_NAME)[1]);
+
+            // Bob's last name is empty, so no LAST_NAME entry is recorded for him at all
+            assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]);
+
+            // Kate's last name is not recorded because it duplicates an earlier value
+            assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]);
+        }
+    }
+
+    /** With both boolean flags enabled, every person keeps a last-name entry, empty or repeated. */
+    @Test
+    public void testEmptiesAndRepeats() throws Exception {
+        try (InputStream input = EmptyAndDuplicateElementsXMLParserTest.class.getResourceAsStream(
+                "/test-documents/testXML3.xml")) {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new AllowEmptiesAndDuplicatesCustomXMLTestParser().parse(input, handler, metadata, new ParseContext());
+
+            assertEquals(4, metadata.getValues(FIRST_NAME).length);
+            assertEquals(4, metadata.getValues(LAST_NAME).length);
+
+            assertEquals("John", metadata.getValues(FIRST_NAME)[0]);
+            assertEquals("Smith", metadata.getValues(LAST_NAME)[0]);
+
+            assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]);
+            assertEquals("Doe", metadata.getValues(LAST_NAME)[1]);
+
+            assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]);
+            assertEquals("", metadata.getValues(LAST_NAME)[2]);
+
+            assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]);
+            assertEquals("Smith", metadata.getValues(LAST_NAME)[3]);
+        }
+    }
+
+    /** XML parser that additionally feeds FirstName and LastName elements into the metadata. */
+    private static class DefaultCustomXMLTestParser extends XMLParser {
+        private static final long serialVersionUID = 2458579047014545931L;
+
+        /** Creates the handler for one "http://custom" element using the four-argument constructor. */
+        protected ElementMetadataHandler getCustomElementHandler(
+                Metadata metadata, Property tikaProperty, String localPart) {
+            return new ElementMetadataHandler("http://custom", localPart, metadata, tikaProperty);
+        }
+
+        /** Tees the parent's content handler with one element handler per name property. */
+        @Override
+        protected ContentHandler getContentHandler(
+                ContentHandler handler, Metadata metadata, ParseContext context) {
+            return new TeeContentHandler(
+                    super.getContentHandler(handler, metadata, context),
+                    getCustomElementHandler(metadata, FIRST_NAME, "FirstName"),
+                    getCustomElementHandler(metadata, LAST_NAME, "LastName"));
+        }
+    }
+
+    /** Same parser, but its element handlers are built with both trailing boolean flags set. */
+    private static class AllowEmptiesAndDuplicatesCustomXMLTestParser extends DefaultCustomXMLTestParser {
+        private static final long serialVersionUID = 3735646809954466229L;
+
+        /** NOTE(review): the flags presumably allow duplicates/empties, per the class name - confirm. */
+        @Override
+        protected ElementMetadataHandler getCustomElementHandler(
+                Metadata metadata, Property tikaProperty, String localPart) {
+            return new ElementMetadataHandler(
+                    "http://custom", localPart, metadata, tikaProperty, true, true);
+        }
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-parser-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest.TrackingHandler;
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class FictionBookParserTest {
+
+ @Test
+ public void testFB2() throws Exception {
+ try (InputStream input = FictionBookParserTest.class.getResourceAsStream("/test-documents/test.fb2")) {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new FictionBookParser().parse(input, handler, metadata, new ParseContext());
+ String content = handler.toString();
+
+ assertContains("1812", content);
+ }
+ }
+
+ @Test
+ public void testEmbedded() throws Exception {
+ try (InputStream input = FictionBookParserTest.class.getResourceAsStream("/test-documents/test.fb2")) {
+ ContainerExtractor extractor = new ParserContainerExtractor();
+ TikaInputStream stream = TikaInputStream.get(input);
+
+ assertEquals(true, extractor.isSupported(stream));
+
+ // Process it
+ TrackingHandler handler = new TrackingHandler();
+ extractor.extract(stream, null, handler);
+
+ assertEquals(2, handler.filenames.size());
+ }
+ }
+}
Modified: tika/branches/2.x/tika-parser-modules/tika-web-parser-module/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-parser-module/pom.xml?rev=1725014&r1=1725011&r2=1725014&view=diff
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-parser-module/pom.xml (original)
+++ tika/branches/2.x/tika-parser-modules/tika-web-parser-module/pom.xml Sat Jan 16 18:23:01 2016
@@ -19,8 +19,8 @@
<version>2.0-SNAPSHOT</version>
</parent>
- <artifactId>tika-web-module</artifactId>
- <name>Apache Tika Web Module</name>
+ <artifactId>tika-web-parser-module</artifactId>
+ <name>Apache Tika Web Parser Module</name>
<url>http://tika.apache.org/</url>
<properties>
@@ -73,13 +73,13 @@
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
- <artifactId>tika-text-module</artifactId>
+ <artifactId>tika-text-parser-module</artifactId>
<version>${project.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
- <artifactId>tika-package-module</artifactId>
+ <artifactId>tika-package-parser-module</artifactId>
<version>${project.version}</version>
<scope>test</scope>
</dependency>
Modified: tika/branches/2.x/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parsers/pom.xml?rev=1725014&r1=1725013&r2=1725014&view=diff
==============================================================================
--- tika/branches/2.x/tika-parsers/pom.xml (original)
+++ tika/branches/2.x/tika-parsers/pom.xml Sat Jan 16 18:23:01 2016
@@ -147,67 +147,67 @@
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
- <artifactId>tika-multimedia-module</artifactId>
+ <artifactId>tika-multimedia-parser-module</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
- <artifactId>tika-advanced-module</artifactId>
+ <artifactId>tika-advanced-parser-module</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
- <artifactId>tika-cad-module</artifactId>
+ <artifactId>tika-cad-parser-module</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
- <artifactId>tika-code-module</artifactId>
+ <artifactId>tika-code-parser-module</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
- <artifactId>tika-database-module</artifactId>
+ <artifactId>tika-database-parser-module</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
- <artifactId>tika-ebook-module</artifactId>
+ <artifactId>tika-ebook-parser-module</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
- <artifactId>tika-journal-module</artifactId>
+ <artifactId>tika-journal-parser-module</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
- <artifactId>tika-office-module</artifactId>
+ <artifactId>tika-office-parser-module</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
- <artifactId>tika-package-module</artifactId>
+ <artifactId>tika-package-parser-module</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
- <artifactId>tika-pdf-module</artifactId>
+ <artifactId>tika-pdf-parser-module</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
- <artifactId>tika-scientific-module</artifactId>
+ <artifactId>tika-scientific-parser-module</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
- <artifactId>tika-text-module</artifactId>
+ <artifactId>tika-text-parser-module</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
- <artifactId>tika-web-module</artifactId>
+ <artifactId>tika-web-parser-module</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
@@ -322,19 +322,19 @@
</createDependencyReducedPom>
<artifactSet>
<includes>
- <include>org.apache.tika:tika-multimedia-module</include>
- <include>org.apache.tika:tika-advanced-module</include>
- <include>org.apache.tika:tika-cad-module</include>
- <include>org.apache.tika:tika-code-module</include>
- <include>org.apache.tika:tika-database-module</include>
- <include>org.apache.tika:tika-ebook-module</include>
- <include>org.apache.tika:tika-journal-module</include>
- <include>org.apache.tika:tika-office-module</include>
- <include>org.apache.tika:tika-package-module</include>
- <include>org.apache.tika:tika-pdf-module</include>
- <include>org.apache.tika:tika-scientific-module</include>
- <include>org.apache.tika:tika-text-module</include>
- <include>org.apache.tika:tika-web-module</include>
+ <include>org.apache.tika:tika-multimedia-parser-module</include>
+ <include>org.apache.tika:tika-advanced-parser-module</include>
+ <include>org.apache.tika:tika-cad-parser-module</include>
+ <include>org.apache.tika:tika-code-parser-module</include>
+ <include>org.apache.tika:tika-database-parser-module</include>
+ <include>org.apache.tika:tika-ebook-parser-module</include>
+ <include>org.apache.tika:tika-journal-parser-module</include>
+ <include>org.apache.tika:tika-office-parser-module</include>
+ <include>org.apache.tika:tika-package-parser-module</include>
+ <include>org.apache.tika:tika-pdf-parser-module</include>
+ <include>org.apache.tika:tika-scientific-parser-module</include>
+ <include>org.apache.tika:tika-text-parser-module</include>
+ <include>org.apache.tika:tika-web-parser-module</include>
</includes>
</artifactSet>
<transformers>