You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jspwiki.apache.org by ju...@apache.org on 2019/05/02 17:36:42 UTC

[jspwiki] 01/05: JSPWIKI-469: new TikaSearchProvider, contributed by Ulf Dittmer - thanks!

This is an automated email from the ASF dual-hosted git repository.

juanpablo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/jspwiki.git

commit f33ed8822b141545fef4e9998702c8dab327b8f6
Author: juanpablo <ju...@apache.org>
AuthorDate: Thu May 2 19:32:18 2019 +0200

    JSPWIKI-469: new TikaSearchProvider, contributed by Ulf Dittmer - thanks!
---
 jspwiki-tika-searchprovider/pom.xml                |  84 ++++++++++++++
 .../wiki/search/tika/TikaSearchProvider.java       | 126 +++++++++++++++++++++
 .../wiki/search/tika/TikaSearchProviderTest.java   |  83 ++++++++++++++
 .../src/test/resources/aaa-diagram.pdf             | Bin 0 -> 37465 bytes
 .../src/test/resources/favicon.png                 | Bin 0 -> 631 bytes
 .../src/test/resources/jspwiki-custom.properties   |  86 ++++++++++++++
 pom.xml                                            |   8 ++
 7 files changed, 387 insertions(+)

diff --git a/jspwiki-tika-searchprovider/pom.xml b/jspwiki-tika-searchprovider/pom.xml
new file mode 100644
index 0000000..dae835d
--- /dev/null
+++ b/jspwiki-tika-searchprovider/pom.xml
@@ -0,0 +1,84 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+  <parent>
+    <groupId>org.apache.jspwiki</groupId>
+    <artifactId>jspwiki-builder</artifactId>
+    <version>2.11.0.M4-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>jspwiki-tika-searchprovider</artifactId>
+  <modelVersion>4.0.0</modelVersion>
+  <name>Apache JSPWiki Tika Search provider</name>
+  <description>Apache JSPWiki Tika Search provider</description>
+
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>jspwiki-main</artifactId>
+      <version>${project.version}</version>
+      <scope>provided</scope>
+    </dependency>
+    
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>jspwiki-main</artifactId>
+      <type>test-jar</type>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika-parsers</artifactId>
+    </dependency>
+
+    <dependency>
+      <groupId>org.junit.jupiter</groupId>
+      <artifactId>junit-jupiter-api</artifactId>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.junit.jupiter</groupId>
+      <artifactId>junit-jupiter-params</artifactId>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.junit.jupiter</groupId>
+      <artifactId>junit-jupiter-engine</artifactId>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>javax.servlet</groupId>
+      <artifactId>javax.servlet-api</artifactId>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>net.sourceforge.stripes</groupId>
+      <artifactId>stripes</artifactId>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+</project>
\ No newline at end of file
diff --git a/jspwiki-tika-searchprovider/src/main/java/org/apache/wiki/search/tika/TikaSearchProvider.java b/jspwiki-tika-searchprovider/src/main/java/org/apache/wiki/search/tika/TikaSearchProvider.java
new file mode 100644
index 0000000..f0eb367
--- /dev/null
+++ b/jspwiki-tika-searchprovider/src/main/java/org/apache/wiki/search/tika/TikaSearchProvider.java
@@ -0,0 +1,126 @@
+/*
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.    
+ */
+package org.apache.wiki.search.tika;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.metadata.ClimateForcast;
+import org.apache.tika.metadata.CreativeCommons;
+import org.apache.tika.metadata.Database;
+import org.apache.tika.metadata.HTML;
+import org.apache.tika.metadata.HttpHeaders;
+import org.apache.tika.metadata.IPTC;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.PDF;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.wiki.search.LuceneSearchProvider;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.log4j.Logger;
+import org.apache.wiki.api.exceptions.ProviderException;
+import org.apache.wiki.attachment.Attachment;
+import org.apache.wiki.attachment.AttachmentManager;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.sax.BodyContentHandler;
+
+/**
+ * Search provider that extends {@link LuceneSearchProvider}, using Apache Tika to index attachment content.
+ *
+ * @since 2.11.0
+ * @see <a href="https://issues.apache.org/jira/browse/JSPWIKI-469">JSPWIKI-469</a>
+ */
+public class TikaSearchProvider extends LuceneSearchProvider {
+
+    private static final Logger LOG = Logger.getLogger( TikaSearchProvider.class );
+	AutoDetectParser parser;
+	Set< String > textualMetadataFields;
+
+	public TikaSearchProvider() {
+		parser = new AutoDetectParser();
+
+		// metadata fields that also are indexed
+		textualMetadataFields = new HashSet<>();
+		textualMetadataFields.add( TikaCoreProperties.TITLE.getName() );
+		textualMetadataFields.add( TikaCoreProperties.COMMENTS.getName() );
+		textualMetadataFields.add( TikaCoreProperties.KEYWORDS.getName() );
+		textualMetadataFields.add( TikaCoreProperties.DESCRIPTION.getName() );
+		textualMetadataFields.add( TikaCoreProperties.TYPE.getName() );
+		textualMetadataFields.add( TikaMetadataKeys.RESOURCE_NAME_KEY );
+		textualMetadataFields.add( PDF.DOC_INFO_TITLE.getName() );
+		textualMetadataFields.add( PDF.DOC_INFO_KEY_WORDS.getName() );
+		textualMetadataFields.add( PDF.DOC_INFO_SUBJECT.getName() );
+		textualMetadataFields.add( OfficeOpenXMLCore.SUBJECT.getName() );
+		textualMetadataFields.add( Office.KEYWORDS.getName() );
+		textualMetadataFields.add( TikaCoreProperties.TYPE.getName() );
+		textualMetadataFields.add( HttpHeaders.CONTENT_TYPE );
+		textualMetadataFields.add( IPTC.HEADLINE.getName() );
+		textualMetadataFields.add( Database.COLUMN_NAME.getName() );
+		textualMetadataFields.add( Database.TABLE_NAME.getName() );
+		textualMetadataFields.add( CreativeCommons.WORK_TYPE );
+		textualMetadataFields.add( ClimateForcast.COMMENT );
+		textualMetadataFields.add( ClimateForcast.HISTORY );
+		textualMetadataFields.add( ClimateForcast.INSTITUTION );
+	}
+
+    /**
+	 * {@inheritDoc}
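+     * @param att Attachment to get content for. Its file name is handed to Tika as a resource-name hint; Tika's auto-detecting parser determines the type.
+     * @return the extracted body text followed by the values of the indexed metadata fields; if loading or parsing fails, the error is logged and the text gathered so far (possibly empty) is returned.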
+     */
+    @Override
+    protected String getAttachmentContent( final Attachment att ) {
+		// LOG.debug("indexing "+att.getFileName());
+        final AttachmentManager mgr = getEngine().getAttachmentManager();
+		final StringBuilder out = new StringBuilder();
+
+		try( final InputStream attStream = mgr.getAttachmentStream( att ) ) {
+			final Metadata metadata = new Metadata();
+			metadata.set( TikaMetadataKeys.RESOURCE_NAME_KEY, att.getFileName() );
+
+			final ContentHandler handler = new BodyContentHandler(-1 );
+			// the -1 above disables the character limit; by default only the first 100,000 characters would be indexed
+
+			parser.parse( attStream, handler, metadata );
+			out.append( handler.toString() );
+
+			final String[] names = metadata.names();
+			for( int j = 0; j < names.length; j++ ) {
+				if( textualMetadataFields.contains( names[ j ] ) ) {
+					out.append( " " ).append( metadata.get( names[ j ] ) );
+				}
+			}
+		} catch( TikaException | SAXException e ) {
+			LOG.error( "Attachment cannot be parsed", e );
+		} catch( ProviderException | IOException e ) {
+			LOG.error( "Attachment cannot be loaded", e );
+		}
+
+        return out.toString();
+    }
+
+}
\ No newline at end of file
diff --git a/jspwiki-tika-searchprovider/src/test/java/org/apache/wiki/search/tika/TikaSearchProviderTest.java b/jspwiki-tika-searchprovider/src/test/java/org/apache/wiki/search/tika/TikaSearchProviderTest.java
new file mode 100644
index 0000000..dbd7031
--- /dev/null
+++ b/jspwiki-tika-searchprovider/src/test/java/org/apache/wiki/search/tika/TikaSearchProviderTest.java
@@ -0,0 +1,83 @@
+package org.apache.wiki.search.tika;
+
+import net.sf.ehcache.CacheManager;
+import net.sourceforge.stripes.mock.MockHttpServletRequest;
+import org.apache.wiki.TestEngine;
+import org.apache.wiki.WikiContext;
+import org.apache.wiki.search.SearchResult;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.Collection;
+import java.util.Properties;
+
+
+public class TikaSearchProviderTest {
+
+    private static final long SLEEP_TIME = 2_000L;
+    private static final int SLEEP_COUNT = 50;
+    TestEngine engine;
+    Properties props;
+
+    @BeforeEach
+    void setUp() throws Exception {
+        props = TestEngine.getTestProperties();
+        TestEngine.emptyWorkDir( props );
+        CacheManager.getInstance().removeAllCaches();
+
+        engine = new TestEngine( props );
+    }
+
+    @Test
+    void testGetAttachmentContent() throws Exception {
+        engine.saveText( "test-tika", "blablablabla" );
+        byte[] filePng = Files.readAllBytes( Paths.get( TikaSearchProviderTest.class.getClassLoader().getResource( "favicon.png" ).toURI() ) );
+        byte[] filePdf = Files.readAllBytes( Paths.get( TikaSearchProviderTest.class.getClassLoader().getResource( "aaa-diagram.pdf" ).toURI() ) );
+        engine.addAttachment( "test-tika", "aaa-diagram.pdf", filePdf );
+        engine.addAttachment( "test-tika", "favicon.png", filePng );
+
+        engine.getSearchManager().getSearchEngine().reindexPage( engine.getPage( "test-tika" ) );
+        Collection< SearchResult > res = waitForIndex( "favicon.png" , "testGetAttachmentContent" );
+        Assertions.assertNotNull( res );
+        Assertions.assertEquals( 1, res.size(), debugSearchResults( res ) );
+
+        res = waitForIndex( "application\\/pdf" , "testGetAttachmentContent" );
+        Assertions.assertNotNull( res );
+        Assertions.assertEquals( 1, res.size(), debugSearchResults( res ) );
+    }
+
+    String debugSearchResults( Collection< SearchResult > res ) {
+        StringBuilder sb = new StringBuilder();
+        for( SearchResult next : res ) {
+            sb.append( System.lineSeparator() + "* page: " + next.getPage() );
+            for( String s : next.getContexts() ) {
+                sb.append( System.lineSeparator() + "** snippet: " + s );
+            }
+        }
+        return sb.toString();
+    }
+
+    /**
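+     * Polls the search index until it returns results or the retries run out; the total wait should cover both the indexing time and the initial delay.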
+     */
+    Collection<SearchResult> waitForIndex( String text, String testName ) throws Exception {
+        Collection< SearchResult > res = null;
+        for( long l = 0; l < SLEEP_COUNT; l++ ) {
+            if( res == null || res.isEmpty() ) {
+                Thread.sleep( SLEEP_TIME );
+            } else {
+                break;
+            }
+            MockHttpServletRequest request = engine.newHttpRequest();
+            WikiContext ctx = engine.createContext( request, WikiContext.EDIT );
+
+            res = engine.getSearchManager().findPages( text, ctx );
+
+        }
+        return res;
+    }
+
+}
\ No newline at end of file
diff --git a/jspwiki-tika-searchprovider/src/test/resources/aaa-diagram.pdf b/jspwiki-tika-searchprovider/src/test/resources/aaa-diagram.pdf
new file mode 100644
index 0000000..9bb37c7
Binary files /dev/null and b/jspwiki-tika-searchprovider/src/test/resources/aaa-diagram.pdf differ
diff --git a/jspwiki-tika-searchprovider/src/test/resources/favicon.png b/jspwiki-tika-searchprovider/src/test/resources/favicon.png
new file mode 100644
index 0000000..bb6f654
Binary files /dev/null and b/jspwiki-tika-searchprovider/src/test/resources/favicon.png differ
diff --git a/jspwiki-tika-searchprovider/src/test/resources/jspwiki-custom.properties b/jspwiki-tika-searchprovider/src/test/resources/jspwiki-custom.properties
new file mode 100644
index 0000000..ebc68be
--- /dev/null
+++ b/jspwiki-tika-searchprovider/src/test/resources/jspwiki-custom.properties
@@ -0,0 +1,86 @@
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+#
+# Custom configuration file used by most JUnit tests overriding
+# certain default values in src/main/resources/ini/jspwiki.properties
+#
+jspwiki.fileSystemProvider.pageDir = target/test-classes/testrepository
+jspwiki.workDir = target/test-classes/testworkdir
+jspwiki.searchProvider = org.apache.wiki.search.tika.TikaSearchProvider
+jspwiki.lucene.initialdelay = 1
+
+jspwiki.translatorReader.camelCaseLinks = true
+jspwiki.breakTitleWithSpaces = true
+jspwiki.translatorReader.useOutlinkImage = false
+jspwiki.basicAttachmentProvider.storageDir = target/test-classes/testrepository
+jspwiki.encoding = ISO-8859-1
+jspwiki.filterConfig = /filters.xml
+jspwiki.referenceStyle = relative
+jspwiki.authorizer=org.apache.wiki.auth.TestAuthorizer
+
+# log file under ./target
+log4j.appender.FileLog.File=./target/logs/jspwiki.log
+
+# RSS under ./target
+jspwiki.rss.fileName=./target/rss.rdf
+
+#
+# Security: use standard providers for user/group auth, user management
+# and ACLs. Use a test userdatabase for storing users.
+#
+jspwiki.xmlGroupDatabaseFile = target/test-classes/groupdatabase.xml
+jspwiki.xmlUserDatabaseFile = target/test-classes/userdatabase.xml
+
+log4j.logger.SecurityLog=INFO, SecurityAppender
+log4j.appender.SecurityAppender = org.apache.log4j.RollingFileAppender
+log4j.appender.SecurityAppender.File = ./target/logs/security.log
+log4j.appender.SecurityAppender.layout = org.apache.log4j.PatternLayout
+log4j.appender.SecurityAppender.layout.ConversionPattern=%d %p - %m%n
+
+# Used by CommandResolverTest
+jspwiki.specialPage.RecentChanges = RecentChanges.jsp
+jspwiki.specialPage.Search = Search.jsp
+
+# Used by JSPWikiMarkupParserTest
+jspwiki.translatorReader.inlinePattern.1 = *.jpg
+jspwiki.translatorReader.inlinePattern.2 = *.png
+jspwiki.translatorReader.inlinePattern.3 = http://images.com/*
+
+# Used by WorkflowManagerTest
+jspwiki.approver.workflow.saveWikiPage=
+jspwiki.approver.workflow.foo=janne
+jspwiki.approver.workflow.bar=Admin
+
+# Fields needed in order to run MailUtilTest
+#mail.smtp.host = 127.0.0.1
+#mail.smtp.port = 25
+#mail.from = JSPWiki <JS...@localhost>
+#mail.smtp.account =
+#mail.smtp.password =
+
+# for JDBC tests
+server.port=9321
+server.database.0=file:target/jspwiki.hsqldb
+server.dbname.0=jspwiki
+
+jdbc.admin.id=SA
+jdbc.admin.password=
+jdbc.driver.class=org.hsqldb.jdbc.JDBCDriver
+jdbc.driver.id=hsql
+jdbc.driver.url=jdbc\:hsqldb\:hsql\://localhost:9321/jspwiki
+jdbc.user.id=jspwiki
+jdbc.user.password=password
diff --git a/pom.xml b/pom.xml
index 1acb987..e8ae528 100644
--- a/pom.xml
+++ b/pom.xml
@@ -69,6 +69,7 @@
     <selenide.version>5.2.2</selenide.version>
     <slf4j.version>1.7.26</slf4j.version>
     <stripes.version>1.7.0-async-beta</stripes.version>
+    <tika.version>1.20</tika.version>
     <tomcat.version>8.5.40</tomcat.version>
     <wro4j.version>1.8.0</wro4j.version>
     <xmlrpc.version>2.0.1</xmlrpc.version>
@@ -119,6 +120,7 @@
     <module>jspwiki-util</module>
     <module>jspwiki-main</module>
     <module>jspwiki-markdown</module>
+    <module>jspwiki-tika-searchprovider</module>
     <module>jspwiki-war</module>
     <module>jspwiki-portable</module>
     <module>jspwiki-it-tests</module><!-- IT tests are launched only if -Pintegration-tests is given -->
@@ -242,6 +244,12 @@
       </dependency>
 
       <dependency>
+        <groupId>org.apache.tika</groupId>
+        <artifactId>tika-parsers</artifactId>
+        <version>${tika.version}</version>
+      </dependency>
+
+      <dependency>
         <groupId>org.freshcookies</groupId>
         <artifactId>freshcookies-security</artifactId>
         <version>${freshcookies-security.version}</version>