You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jspwiki.apache.org by ju...@apache.org on 2019/08/08 22:30:54 UTC

[jspwiki] 05/07: JSPWIKI-427: Keywords for Lucene Index

This is an automated email from the ASF dual-hosted git repository.

juanpablo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/jspwiki.git

commit 1213cd78313329bd6282c8f700856d4743d9d803
Author: juanpablo <ju...@apache.org>
AuthorDate: Fri Aug 9 00:27:33 2019 +0200

    JSPWIKI-427: Keywords for Lucene Index
---
 .../apache/wiki/search/LuceneSearchProvider.java   | 174 ++++++++-------------
 .../org/apache/wiki/search/SearchManagerTest.java  |  35 +++--
 2 files changed, 86 insertions(+), 123 deletions(-)

diff --git a/jspwiki-main/src/main/java/org/apache/wiki/search/LuceneSearchProvider.java b/jspwiki-main/src/main/java/org/apache/wiki/search/LuceneSearchProvider.java
index d8ed170..9874d63 100644
--- a/jspwiki-main/src/main/java/org/apache/wiki/search/LuceneSearchProvider.java
+++ b/jspwiki-main/src/main/java/org/apache/wiki/search/LuceneSearchProvider.java
@@ -26,7 +26,6 @@ import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
-import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
@@ -46,7 +45,6 @@ import org.apache.lucene.search.highlight.QueryScorer;
 import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
 import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
 import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.LockObtainFailedException;
 import org.apache.lucene.store.SimpleFSDirectory;
 import org.apache.wiki.InternalWikiException;
 import org.apache.wiki.WatchDog;
@@ -78,7 +76,6 @@ import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Date;
-import java.util.Iterator;
 import java.util.List;
 import java.util.Properties;
 
@@ -116,6 +113,7 @@ public class LuceneSearchProvider implements SearchProvider {
     protected static final String LUCENE_AUTHOR        = "author";
     protected static final String LUCENE_ATTACHMENTS   = "attachment";
     protected static final String LUCENE_PAGE_NAME     = "name";
+    protected static final String LUCENE_PAGE_KEYWORDS = "keywords";
 
     private String           m_luceneDirectory;
     protected List<Object[]> m_updates = Collections.synchronizedList( new ArrayList<>() ); 
@@ -329,19 +327,18 @@ public class LuceneSearchProvider implements SearchProvider {
      *  @param page The WikiPage to check
      *  @param text The page text to index.
      */
-    protected synchronized void updateLuceneIndex( WikiPage page, String text )
-    {
+    protected synchronized void updateLuceneIndex( final WikiPage page, final String text ) {
         log.debug("Updating Lucene index for page '" + page.getName() + "'...");
         pageRemoved( page );
 
         // Now add back the new version.
-        try( Directory luceneDir = new SimpleFSDirectory( new File( m_luceneDirectory ).toPath() );
-             IndexWriter writer = getIndexWriter( luceneDir ); ) {
+        try( final Directory luceneDir = new SimpleFSDirectory( new File( m_luceneDirectory ).toPath() );
+             final IndexWriter writer = getIndexWriter( luceneDir ) ) {
             luceneIndexPage( page, text, writer );
-        } catch ( IOException e ) {
+        } catch( final IOException e ) {
             log.error("Unable to update page '" + page.getName() + "' from Lucene index", e);
             // reindexPage( page );
-        } catch( Exception e ) {
+        } catch( final Exception e ) {
             log.error("Unexpected Lucene exception - please check configuration!",e);
             // reindexPage( page );
         }
@@ -376,15 +373,17 @@ public class LuceneSearchProvider implements SearchProvider {
      *  @return the created index Document
      *  @throws IOException If there's an indexing problem
      */
-    protected Document luceneIndexPage( WikiPage page, String text, IndexWriter writer )
-        throws IOException
-    {
-        if( log.isDebugEnabled() ) log.debug( "Indexing "+page.getName()+"..." );
+    protected Document luceneIndexPage( final WikiPage page, final String text, final IndexWriter writer ) throws IOException {
+        if( log.isDebugEnabled() ) {
+            log.debug( "Indexing "+page.getName()+"..." );
+        }
         
         // make a new, empty document
-        Document doc = new Document();
+        final Document doc = new Document();
 
-        if( text == null ) return doc;
+        if( text == null ) {
+            return doc;
+        }
 
         // Raw name is the keyword we'll use to refer to this document for updates.
         Field field = new Field( LUCENE_ID, page.getName(), StringField.TYPE_STORED );
@@ -395,9 +394,9 @@ public class LuceneSearchProvider implements SearchProvider {
         doc.add( field );
 
         // Allow searching by page name. Both beautified and raw
-        String unTokenizedTitle = StringUtils.replaceChars( page.getName(),
-                                                            MarkupParser.PUNCTUATION_CHARS_ALLOWED,
-                                                            c_punctuationSpaces );
+        final String unTokenizedTitle = StringUtils.replaceChars( page.getName(),
+                                                                  MarkupParser.PUNCTUATION_CHARS_ALLOWED,
+                                                                  c_punctuationSpaces );
 
         field = new Field( LUCENE_PAGE_NAME,
                            TextUtil.beautifyString( page.getName() ) + " " + unTokenizedTitle,
@@ -405,32 +404,31 @@ public class LuceneSearchProvider implements SearchProvider {
         doc.add( field );
 
         // Allow searching by authorname
-
-        if( page.getAuthor() != null )
-        {
+        if( page.getAuthor() != null ) {
             field = new Field( LUCENE_AUTHOR, page.getAuthor(), TextField.TYPE_STORED );
             doc.add( field );
         }
 
         // Now add the names of the attachments of this page
-        try
-        {
-            List< Attachment > attachments = m_engine.getAttachmentManager().listAttachments(page);
+        try {
+            final List< Attachment > attachments = m_engine.getAttachmentManager().listAttachments(page);
             String attachmentNames = "";
 
-            for( Iterator< Attachment > it = attachments.iterator(); it.hasNext(); )
-            {
-                Attachment att = it.next();
+            for( final Attachment att : attachments ) {
                 attachmentNames += att.getName() + ";";
             }
             field = new Field( LUCENE_ATTACHMENTS, attachmentNames, TextField.TYPE_STORED );
             doc.add( field );
 
-        }
-        catch(ProviderException e)
-        {
+        } catch( final ProviderException e ) {
             // Unable to read attachments
-            log.error("Failed to get attachments for page", e);
+            log.error( "Failed to get attachments for page", e );
+        }
+
+        // also index page keywords, if available
+        if( page.getAttribute( "keywords" ) != null ) {
+            field = new Field( LUCENE_PAGE_KEYWORDS, page.getAttribute( "keywords" ).toString(), TextField.TYPE_STORED );
+            doc.add( field );
         }
         writer.addDocument(doc);
 
@@ -441,19 +439,17 @@ public class LuceneSearchProvider implements SearchProvider {
      *  {@inheritDoc}
      */
     @Override
-    public void pageRemoved( WikiPage page ) {
-        try( Directory luceneDir = new SimpleFSDirectory( new File( m_luceneDirectory ).toPath() );
-             IndexWriter writer = getIndexWriter( luceneDir ); ) {
-            Query query = new TermQuery( new Term( LUCENE_ID, page.getName() ) );
+    public void pageRemoved( final WikiPage page ) {
+        try( final Directory luceneDir = new SimpleFSDirectory( new File( m_luceneDirectory ).toPath() );
+             final IndexWriter writer = getIndexWriter( luceneDir ); ) {
+            final Query query = new TermQuery( new Term( LUCENE_ID, page.getName() ) );
             writer.deleteDocuments( query );
-        } catch ( Exception e ) {
+        } catch ( final Exception e ) {
             log.error("Unable to remove page '" + page.getName() + "' from Lucene index", e);
         }
     }
     
-    IndexWriter getIndexWriter( Directory luceneDir ) throws CorruptIndexException, 
-            LockObtainFailedException, IOException, ProviderException 
-    {
+    IndexWriter getIndexWriter( Directory luceneDir ) throws IOException, ProviderException {
         IndexWriterConfig writerConfig = new IndexWriterConfig( getLuceneAnalyzer() );
         writerConfig.setOpenMode( OpenMode.CREATE_OR_APPEND );
         IndexWriter writer = new IndexWriter( luceneDir, writerConfig );
@@ -513,116 +509,68 @@ public class LuceneSearchProvider implements SearchProvider {
      *  @return A Collection of SearchResult instances
      *  @throws ProviderException if there is a problem with the backend
      */
-    public Collection< SearchResult > findPages( String query, int flags, WikiContext wikiContext )
-        throws ProviderException
-    {
-        IndexSearcher  searcher = null;
+    public Collection< SearchResult > findPages( final String query, final int flags, final WikiContext wikiContext ) throws ProviderException {
         ArrayList<SearchResult> list = null;
         Highlighter highlighter = null;
 
-        try
-        {
-            String[] queryfields = { LUCENE_PAGE_CONTENTS, LUCENE_PAGE_NAME, LUCENE_AUTHOR, LUCENE_ATTACHMENTS };
-            QueryParser qp = new MultiFieldQueryParser( queryfields, getLuceneAnalyzer() );
-
-            //QueryParser qp = new QueryParser( LUCENE_PAGE_CONTENTS, getLuceneAnalyzer() );
-            Query luceneQuery = qp.parse( query );
+        try( final Directory luceneDir = new SimpleFSDirectory( new File( m_luceneDirectory ).toPath() );
+             final IndexReader reader = DirectoryReader.open( luceneDir ) ) {
+            final String[] queryfields = { LUCENE_PAGE_CONTENTS, LUCENE_PAGE_NAME, LUCENE_AUTHOR, LUCENE_ATTACHMENTS, LUCENE_PAGE_KEYWORDS };
+            final QueryParser qp = new MultiFieldQueryParser( queryfields, getLuceneAnalyzer() );
+            final Query luceneQuery = qp.parse( query );
+            final IndexSearcher searcher = new IndexSearcher( reader );
 
-            if( (flags & FLAG_CONTEXTS) != 0 )
-            {
+            if( (flags & FLAG_CONTEXTS) != 0 ) {
                 highlighter = new Highlighter(new SimpleHTMLFormatter("<span class=\"searchmatch\">", "</span>"),
                                               new SimpleHTMLEncoder(),
                                               new QueryScorer(luceneQuery));
             }
 
-            try
-            {
-                File dir = new File(m_luceneDirectory);
-                Directory luceneDir = new SimpleFSDirectory( dir.toPath() );
-                IndexReader reader = DirectoryReader.open(luceneDir);
-                searcher = new IndexSearcher(reader);
-            }
-            catch( Exception ex )
-            {
-                log.info("Lucene not yet ready; indexing not started",ex);
-                return null;
-            }
-
-            ScoreDoc[] hits = searcher.search(luceneQuery, MAX_SEARCH_HITS).scoreDocs;
-
-            AuthorizationManager mgr = m_engine.getAuthorizationManager();
+            final ScoreDoc[] hits = searcher.search(luceneQuery, MAX_SEARCH_HITS).scoreDocs;
+            final AuthorizationManager mgr = m_engine.getAuthorizationManager();
 
             list = new ArrayList<>(hits.length);
-            for ( int curr = 0; curr < hits.length; curr++ )
-            {
+            for ( int curr = 0; curr < hits.length; curr++ ) {
                 int docID = hits[curr].doc;
                 Document doc = searcher.doc( docID );
                 String pageName = doc.get(LUCENE_ID);
                 WikiPage page = m_engine.getPage(pageName, WikiPageProvider.LATEST_VERSION);
 
-                if(page != null)
-                {
-                    if(page instanceof Attachment)
-                    {
+                if( page != null ) {
+                    if( page instanceof Attachment ) {
                         // Currently attachments don't look nice on the search-results page
                         // When the search-results are cleaned up this can be enabled again.
                     }
 
-                    PagePermission pp = new PagePermission( page, PagePermission.VIEW_ACTION );
+                    final PagePermission pp = new PagePermission( page, PagePermission.VIEW_ACTION );
 	                if( mgr.checkPermission( wikiContext.getWikiSession(), pp ) ) {
-	
-	                    int score = (int)(hits[curr].score * 100);
-	
-	
+                        final int score = (int)(hits[curr].score * 100);
+
 	                    // Get highlighted search contexts
-	                    String text = doc.get(LUCENE_PAGE_CONTENTS);
+	                    final String text = doc.get(LUCENE_PAGE_CONTENTS);
 	
 	                    String[] fragments = new String[0];
 	                    if( text != null && highlighter != null ) {
-	                        TokenStream tokenStream = getLuceneAnalyzer()
-	                        .tokenStream(LUCENE_PAGE_CONTENTS, new StringReader(text));
+	                        TokenStream tokenStream = getLuceneAnalyzer().tokenStream(LUCENE_PAGE_CONTENTS, new StringReader(text));
 	                        fragments = highlighter.getBestFragments(tokenStream, text, MAX_FRAGMENTS);
 	                    }
-	
-	                    SearchResult result = new SearchResultImpl( page, score, fragments );     
+
+                        final SearchResult result = new SearchResultImpl( page, score, fragments );
 	                    list.add(result);
 	                }
-                }
-                else
-                {
+                } else {
                     log.error("Lucene found a result page '" + pageName + "' that could not be loaded, removing from Lucene cache");
                     pageRemoved(new WikiPage( m_engine, pageName ));
                 }
             }
-        }
-        catch( IOException e )
-        {
+        } catch( final IOException e ) {
             log.error("Failed during lucene search",e);
-        }
-        catch( ParseException e )
-        {
-            log.info("Broken query; cannot parse query ",e);
-
-            throw new ProviderException("You have entered a query Lucene cannot process: "+e.getMessage());
-        }
-        catch( InvalidTokenOffsetsException e )
-        {
+        } catch( final ParseException e ) {
+            log.info("Broken query; cannot parse query: " + query, e);
+            throw new ProviderException( "You have entered a query Lucene cannot process [" + query + "]: " + e.getMessage() );
+        } catch( final InvalidTokenOffsetsException e ) {
             log.error("Tokens are incompatible with provided text ",e);
         }
-        finally
-        {
-            if( searcher != null )
-            {
-                try
-                {
-                    searcher.getIndexReader().close();
-                }
-                catch( IOException e )
-                {
-                    log.error( e );
-                }
-            }
-        }
 
         return list;
     }
diff --git a/jspwiki-main/src/test/java/org/apache/wiki/search/SearchManagerTest.java b/jspwiki-main/src/test/java/org/apache/wiki/search/SearchManagerTest.java
index 4aa37c9..22773cb 100644
--- a/jspwiki-main/src/test/java/org/apache/wiki/search/SearchManagerTest.java
+++ b/jspwiki-main/src/test/java/org/apache/wiki/search/SearchManagerTest.java
@@ -18,10 +18,8 @@
  */
 package org.apache.wiki.search;
 
-import java.util.Collection;
-import java.util.Iterator;
-import java.util.Properties;
-
+import net.sf.ehcache.CacheManager;
+import net.sourceforge.stripes.mock.MockHttpServletRequest;
 import org.apache.wiki.TestEngine;
 import org.apache.wiki.WikiContext;
 import org.junit.jupiter.api.AfterEach;
@@ -29,8 +27,9 @@ import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 
-import net.sf.ehcache.CacheManager;
-import net.sourceforge.stripes.mock.MockHttpServletRequest;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.Properties;
 
 public class SearchManagerTest {
 
@@ -42,9 +41,9 @@ public class SearchManagerTest {
 
     @BeforeEach
     public void setUp() throws Exception {
-        Properties props = TestEngine.getTestProperties();
-        String workDir = props.getProperty( "jspwiki.workDir" );
-        String workRepo = props.getProperty( "jspwiki.fileSystemProvider.pageDir" );
+        final Properties props = TestEngine.getTestProperties();
+        final String workDir = props.getProperty( "jspwiki.workDir" );
+        final String workRepo = props.getProperty( "jspwiki.fileSystemProvider.pageDir" );
 
         props.setProperty( SearchManager.PROP_SEARCHPROVIDER, "LuceneSearchProvider" );
         props.setProperty( "jspwiki.lucene.initialdelay", "1" );
@@ -57,7 +56,7 @@ public class SearchManagerTest {
     }
 
     @AfterEach
-    public void tearDown() throws Exception {
+    public void tearDown() {
     	TestEngine.emptyWorkDir( props );
     }
 
@@ -220,4 +219,20 @@ public class SearchManagerTest {
         m_engine.deleteTestPage("TestPage");
     }
 
+    @Test
+    public void testKeywordsSearch() throws Exception {
+        String txt = "[{SET keywords=perry,mason,attorney,law}] Nonsensical content that should not match";
+
+        m_engine.saveText("TestPage", txt);
+
+        Thread.yield();
+        Collection< SearchResult > res = waitForIndex( "perry" , "testKeywordsSearch" );
+
+        Assertions.assertNotNull( res, "null result" );
+        Assertions.assertEquals( 1, res.size(), "no pages" );
+
+        Assertions.assertEquals( "TestPage", res.iterator().next().getPage().getName(), "page" );
+        m_engine.deleteTestPage("TestPage");
+    }
+
 }
\ No newline at end of file