You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jspwiki.apache.org by ju...@apache.org on 2019/05/02 17:36:42 UTC
[jspwiki] 01/05: JSPWIKI-469: new TikaSearchProvider,
contributed by Ulf Dittmer - thanks!
This is an automated email from the ASF dual-hosted git repository.
juanpablo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/jspwiki.git
commit f33ed8822b141545fef4e9998702c8dab327b8f6
Author: juanpablo <ju...@apache.org>
AuthorDate: Thu May 2 19:32:18 2019 +0200
JSPWIKI-469: new TikaSearchProvider, contributed by Ulf Dittmer - thanks!
---
jspwiki-tika-searchprovider/pom.xml | 84 ++++++++++++++
.../wiki/search/tika/TikaSearchProvider.java | 126 +++++++++++++++++++++
.../wiki/search/tika/TikaSearchProviderTest.java | 83 ++++++++++++++
.../src/test/resources/aaa-diagram.pdf | Bin 0 -> 37465 bytes
.../src/test/resources/favicon.png | Bin 0 -> 631 bytes
.../src/test/resources/jspwiki-custom.properties | 86 ++++++++++++++
pom.xml | 8 ++
7 files changed, 387 insertions(+)
diff --git a/jspwiki-tika-searchprovider/pom.xml b/jspwiki-tika-searchprovider/pom.xml
new file mode 100644
index 0000000..dae835d
--- /dev/null
+++ b/jspwiki-tika-searchprovider/pom.xml
@@ -0,0 +1,84 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+ <parent>
+ <groupId>org.apache.jspwiki</groupId>
+ <artifactId>jspwiki-builder</artifactId>
+ <version>2.11.0.M4-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>jspwiki-tika-searchprovider</artifactId>
+ <modelVersion>4.0.0</modelVersion>
+ <name>Apache JSPWiki Tika Search provider</name>
+ <description>Apache JSPWiki Tika Search provider</description>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>jspwiki-main</artifactId>
+ <version>${project.version}</version>
+ <scope>provided</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>jspwiki-main</artifactId>
+ <type>test-jar</type>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parsers</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>org.junit.jupiter</groupId>
+ <artifactId>junit-jupiter-api</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.junit.jupiter</groupId>
+ <artifactId>junit-jupiter-params</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.junit.jupiter</groupId>
+ <artifactId>junit-jupiter-engine</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>javax.servlet</groupId>
+ <artifactId>javax.servlet-api</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>net.sourceforge.stripes</groupId>
+ <artifactId>stripes</artifactId>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+</project>
\ No newline at end of file
diff --git a/jspwiki-tika-searchprovider/src/main/java/org/apache/wiki/search/tika/TikaSearchProvider.java b/jspwiki-tika-searchprovider/src/main/java/org/apache/wiki/search/tika/TikaSearchProvider.java
new file mode 100644
index 0000000..f0eb367
--- /dev/null
+++ b/jspwiki-tika-searchprovider/src/main/java/org/apache/wiki/search/tika/TikaSearchProvider.java
@@ -0,0 +1,126 @@
+/*
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+ */
+package org.apache.wiki.search.tika;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.metadata.ClimateForcast;
+import org.apache.tika.metadata.CreativeCommons;
+import org.apache.tika.metadata.Database;
+import org.apache.tika.metadata.HTML;
+import org.apache.tika.metadata.HttpHeaders;
+import org.apache.tika.metadata.IPTC;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.PDF;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.wiki.search.LuceneSearchProvider;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.log4j.Logger;
+import org.apache.wiki.api.exceptions.ProviderException;
+import org.apache.wiki.attachment.Attachment;
+import org.apache.wiki.attachment.AttachmentManager;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.sax.BodyContentHandler;
+
+/**
+ * Search provider that extends {@link LuceneSearchProvider} using Apache Tika for indexing attachment content.
+ *
+ * @since 2.11.0
+ * @see <a href="https://issues.apache.org/jira/browse/JSPWIKI-469">JSPWIKI-469</a>
+ */
+public class TikaSearchProvider extends LuceneSearchProvider {
+
+    private static final Logger LOG = Logger.getLogger( TikaSearchProvider.class );
+    AutoDetectParser parser; // Tika parser used for every attachment
+    Set< String > textualMetadataFields; // names of the metadata fields whose values are indexed too
+
+    public TikaSearchProvider() {
+        parser = new AutoDetectParser();
+
+        // metadata fields whose values are indexed in addition to the body text
+        textualMetadataFields = new HashSet<>();
+        textualMetadataFields.add( TikaCoreProperties.TITLE.getName() );
+        textualMetadataFields.add( TikaCoreProperties.COMMENTS.getName() );
+        textualMetadataFields.add( TikaCoreProperties.KEYWORDS.getName() );
+        textualMetadataFields.add( TikaCoreProperties.DESCRIPTION.getName() );
+        textualMetadataFields.add( TikaCoreProperties.TYPE.getName() );
+        textualMetadataFields.add( TikaMetadataKeys.RESOURCE_NAME_KEY );
+        textualMetadataFields.add( PDF.DOC_INFO_TITLE.getName() );
+        textualMetadataFields.add( PDF.DOC_INFO_KEY_WORDS.getName() );
+        textualMetadataFields.add( PDF.DOC_INFO_SUBJECT.getName() );
+        textualMetadataFields.add( OfficeOpenXMLCore.SUBJECT.getName() );
+        textualMetadataFields.add( Office.KEYWORDS.getName() );
+        textualMetadataFields.add( HttpHeaders.CONTENT_TYPE );
+        textualMetadataFields.add( IPTC.HEADLINE.getName() );
+        textualMetadataFields.add( Database.COLUMN_NAME.getName() );
+        textualMetadataFields.add( Database.TABLE_NAME.getName() );
+        textualMetadataFields.add( CreativeCommons.WORK_TYPE );
+        textualMetadataFields.add( ClimateForcast.COMMENT );
+        textualMetadataFields.add( ClimateForcast.HISTORY );
+        textualMetadataFields.add( ClimateForcast.INSTITUTION );
+    }
+
+    /**
+     * {@inheritDoc}
+     *
+     * @param att Attachment to get content for. Filename extension is used to determine the type of the attachment.
+     * @return body text parsed from the attachment, then every value of the indexed metadata fields; empty if loading or parsing fails.
+     */
+    @Override
+    protected String getAttachmentContent( final Attachment att ) {
+        final AttachmentManager mgr = getEngine().getAttachmentManager();
+        final StringBuilder out = new StringBuilder();
+
+        try( final InputStream attStream = mgr.getAttachmentStream( att ) ) {
+            final Metadata metadata = new Metadata();
+            metadata.set( TikaMetadataKeys.RESOURCE_NAME_KEY, att.getFileName() );
+
+            // -1 disables the character size limit; otherwise only the first 100,000 characters are indexed
+            final ContentHandler handler = new BodyContentHandler( -1 );
+            parser.parse( attStream, handler, metadata );
+            out.append( handler.toString() );
+
+            for( final String name : metadata.names() ) {
+                if( textualMetadataFields.contains( name ) ) {
+                    // a field such as keywords may hold several values: index all of them, not only the first
+                    for( final String value : metadata.getValues( name ) ) {
+                        out.append( " " ).append( value );
+                    }
+                }
+            }
+        } catch( final TikaException | SAXException e ) {
+            LOG.error( "Attachment cannot be parsed", e );
+        } catch( final ProviderException | IOException e ) {
+            LOG.error( "Attachment cannot be loaded", e );
+        }
+
+        return out.toString();
+    }
+
+}
\ No newline at end of file
diff --git a/jspwiki-tika-searchprovider/src/test/java/org/apache/wiki/search/tika/TikaSearchProviderTest.java b/jspwiki-tika-searchprovider/src/test/java/org/apache/wiki/search/tika/TikaSearchProviderTest.java
new file mode 100644
index 0000000..dbd7031
--- /dev/null
+++ b/jspwiki-tika-searchprovider/src/test/java/org/apache/wiki/search/tika/TikaSearchProviderTest.java
@@ -0,0 +1,83 @@
+package org.apache.wiki.search.tika;
+
+import net.sf.ehcache.CacheManager;
+import net.sourceforge.stripes.mock.MockHttpServletRequest;
+import org.apache.wiki.TestEngine;
+import org.apache.wiki.WikiContext;
+import org.apache.wiki.search.SearchResult;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.Collection;
+import java.util.Properties;
+
+
+public class TikaSearchProviderTest {
+
+    private static final long SLEEP_TIME = 2_000L;
+    private static final int SLEEP_COUNT = 50;
+    TestEngine engine;
+    Properties props;
+
+    /** Empties the work directory, drops all caches and creates a fresh engine before each test. */
+    @BeforeEach
+    void setUp() throws Exception {
+        props = TestEngine.getTestProperties();
+        TestEngine.emptyWorkDir( props );
+        CacheManager.getInstance().removeAllCaches();
+        engine = new TestEngine( props );
+    }
+
+    /** Attaches a PDF and a PNG to a page; searching the PNG's file name and the PDF's content type must each give one result. */
+    @Test
+    void testGetAttachmentContent() throws Exception {
+        engine.saveText( "test-tika", "blablablabla" );
+        final byte[] pngBytes = resourceBytes( "favicon.png" );
+        final byte[] pdfBytes = resourceBytes( "aaa-diagram.pdf" );
+        engine.addAttachment( "test-tika", "aaa-diagram.pdf", pdfBytes );
+        engine.addAttachment( "test-tika", "favicon.png", pngBytes );
+        engine.getSearchManager().getSearchEngine().reindexPage( engine.getPage( "test-tika" ) );
+
+        final Collection< SearchResult > byFileName = waitForIndex( "favicon.png", "testGetAttachmentContent" );
+        Assertions.assertNotNull( byFileName );
+        Assertions.assertEquals( 1, byFileName.size(), debugSearchResults( byFileName ) );
+
+        final Collection< SearchResult > byContentType = waitForIndex( "application\\/pdf", "testGetAttachmentContent" );
+        Assertions.assertNotNull( byContentType );
+        Assertions.assertEquals( 1, byContentType.size(), debugSearchResults( byContentType ) );
+    }
+
+    /** Reads the named classpath resource fully into a byte array. */
+    private static byte[] resourceBytes( final String name ) throws Exception {
+        return Files.readAllBytes( Paths.get( TikaSearchProviderTest.class.getClassLoader().getResource( name ).toURI() ) );
+    }
+
+    /** Lists each result's page and context snippets, one per line, for use as an assertion failure message. */
+    String debugSearchResults( Collection< SearchResult > res ) {
+        final String eol = System.lineSeparator();
+        final StringBuilder text = new StringBuilder();
+        for( final SearchResult hit : res ) {
+            text.append( eol ).append( "* page: " ).append( hit.getPage() );
+            for( final String snippet : hit.getContexts() ) {
+                text.append( eol ).append( "** snippet: " ).append( snippet );
+            }
+        }
+        return text.toString();
+    }
+
+    /** Sleeps SLEEP_TIME before each of up to SLEEP_COUNT searches until one has results; covers index and initial delay. testName is unused. */
+    Collection<SearchResult> waitForIndex( String text, String testName ) throws Exception {
+        Collection< SearchResult > found = null;
+        for( int attempt = 0; attempt < SLEEP_COUNT && ( found == null || found.isEmpty() ); attempt++ ) {
+            Thread.sleep( SLEEP_TIME );
+            final MockHttpServletRequest request = engine.newHttpRequest();
+            final WikiContext ctx = engine.createContext( request, WikiContext.EDIT );
+            found = engine.getSearchManager().findPages( text, ctx );
+        }
+        return found;
+    }
+
+}
\ No newline at end of file
diff --git a/jspwiki-tika-searchprovider/src/test/resources/aaa-diagram.pdf b/jspwiki-tika-searchprovider/src/test/resources/aaa-diagram.pdf
new file mode 100644
index 0000000..9bb37c7
Binary files /dev/null and b/jspwiki-tika-searchprovider/src/test/resources/aaa-diagram.pdf differ
diff --git a/jspwiki-tika-searchprovider/src/test/resources/favicon.png b/jspwiki-tika-searchprovider/src/test/resources/favicon.png
new file mode 100644
index 0000000..bb6f654
Binary files /dev/null and b/jspwiki-tika-searchprovider/src/test/resources/favicon.png differ
diff --git a/jspwiki-tika-searchprovider/src/test/resources/jspwiki-custom.properties b/jspwiki-tika-searchprovider/src/test/resources/jspwiki-custom.properties
new file mode 100644
index 0000000..ebc68be
--- /dev/null
+++ b/jspwiki-tika-searchprovider/src/test/resources/jspwiki-custom.properties
@@ -0,0 +1,86 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Custom configuration file used by most JUnit tests overriding
+# certain default values in src/main/resources/ini/jspwiki.properties
+#
+jspwiki.fileSystemProvider.pageDir = target/test-classes/testrepository
+jspwiki.workDir = target/test-classes/testworkdir
+jspwiki.searchProvider = org.apache.wiki.search.tika.TikaSearchProvider
+jspwiki.lucene.initialdelay = 1
+
+jspwiki.translatorReader.camelCaseLinks = true
+jspwiki.breakTitleWithSpaces = true
+jspwiki.translatorReader.useOutlinkImage = false
+jspwiki.basicAttachmentProvider.storageDir = target/test-classes/testrepository
+jspwiki.encoding = ISO-8859-1
+jspwiki.filterConfig = /filters.xml
+jspwiki.referenceStyle = relative
+jspwiki.authorizer=org.apache.wiki.auth.TestAuthorizer
+
+# log file under ./target
+log4j.appender.FileLog.File=./target/logs/jspwiki.log
+
+# RSS under ./target
+jspwiki.rss.fileName=./target/rss.rdf
+
+#
+# Security: use standard providers for user/group auth, user management
+# and ACLs. Use a test userdatabase for storing users.
+#
+jspwiki.xmlGroupDatabaseFile = target/test-classes/groupdatabase.xml
+jspwiki.xmlUserDatabaseFile = target/test-classes/userdatabase.xml
+
+log4j.logger.SecurityLog=INFO, SecurityAppender
+log4j.appender.SecurityAppender = org.apache.log4j.RollingFileAppender
+log4j.appender.SecurityAppender.File = ./target/logs/security.log
+log4j.appender.SecurityAppender.layout = org.apache.log4j.PatternLayout
+log4j.appender.SecurityAppender.layout.ConversionPattern=%d %p - %m%n
+
+# Used by CommandResolverTest
+jspwiki.specialPage.RecentChanges = RecentChanges.jsp
+jspwiki.specialPage.Search = Search.jsp
+
+# Used by JSPWikiMarkupParserTest
+jspwiki.translatorReader.inlinePattern.1 = *.jpg
+jspwiki.translatorReader.inlinePattern.2 = *.png
+jspwiki.translatorReader.inlinePattern.3 = http://images.com/*
+
+# Used by WorkflowManagerTest
+jspwiki.approver.workflow.saveWikiPage=
+jspwiki.approver.workflow.foo=janne
+jspwiki.approver.workflow.bar=Admin
+
+# Fields needed in order to run MailUtilTest
+#mail.smtp.host = 127.0.0.1
+#mail.smtp.port = 25
+#mail.from = JSPWiki <JS...@localhost>
+#mail.smtp.account =
+#mail.smtp.password =
+
+# for JDBC tests
+server.port=9321
+server.database.0=file:target/jspwiki.hsqldb
+server.dbname.0=jspwiki
+
+jdbc.admin.id=SA
+jdbc.admin.password=
+jdbc.driver.class=org.hsqldb.jdbc.JDBCDriver
+jdbc.driver.id=hsql
+jdbc.driver.url=jdbc\:hsqldb\:hsql\://localhost:9321/jspwiki
+jdbc.user.id=jspwiki
+jdbc.user.password=password
diff --git a/pom.xml b/pom.xml
index 1acb987..e8ae528 100644
--- a/pom.xml
+++ b/pom.xml
@@ -69,6 +69,7 @@
<selenide.version>5.2.2</selenide.version>
<slf4j.version>1.7.26</slf4j.version>
<stripes.version>1.7.0-async-beta</stripes.version>
+ <tika.version>1.20</tika.version>
<tomcat.version>8.5.40</tomcat.version>
<wro4j.version>1.8.0</wro4j.version>
<xmlrpc.version>2.0.1</xmlrpc.version>
@@ -119,6 +120,7 @@
<module>jspwiki-util</module>
<module>jspwiki-main</module>
<module>jspwiki-markdown</module>
+ <module>jspwiki-tika-searchprovider</module>
<module>jspwiki-war</module>
<module>jspwiki-portable</module>
<module>jspwiki-it-tests</module><!-- IT tests are launched only if -Pintegration-tests is given -->
@@ -242,6 +244,12 @@
</dependency>
<dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parsers</artifactId>
+ <version>${tika.version}</version>
+ </dependency>
+
+ <dependency>
<groupId>org.freshcookies</groupId>
<artifactId>freshcookies-security</artifactId>
<version>${freshcookies-security.version}</version>