You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by an...@apache.org on 2013/06/23 18:26:33 UTC
svn commit: r1495837 - in /jena/trunk/jena-text/src/main/java/org/apache/jena/query/text: QueryPF.java TextIndex.java TextIndexLucene.java assembler/TextIndexLuceneAssembler.java

Author: andy
Date: Sun Jun 23 16:26:33 2013
New Revision: 1495837

URL: http://svn.apache.org/r1495837
Log:
Functionally correct but potentially slow implementation for checking existence of URI against a text query.  
This is not the common use case.

Modified:
    jena/trunk/jena-text/src/main/java/org/apache/jena/query/text/QueryPF.java
    jena/trunk/jena-text/src/main/java/org/apache/jena/query/text/TextIndex.java
    jena/trunk/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLucene.java
    jena/trunk/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneAssembler.java

Modified: jena/trunk/jena-text/src/main/java/org/apache/jena/query/text/QueryPF.java
URL: http://svn.apache.org/viewvc/jena/trunk/jena-text/src/main/java/org/apache/jena/query/text/QueryPF.java?rev=1495837&r1=1495836&r2=1495837&view=diff
==============================================================================
--- jena/trunk/jena-text/src/main/java/org/apache/jena/query/text/QueryPF.java (original)
+++ jena/trunk/jena-text/src/main/java/org/apache/jena/query/text/QueryPF.java Sun Jun 23 16:26:33 2013
@@ -16,14 +16,16 @@
  * limitations under the License.
  */
 
-package org.apache.jena.query.text;
+package org.apache.jena.query.text ;
 
 import java.util.List ;
-import java.util.Map ;
 
 import org.apache.jena.atlas.iterator.Iter ;
 import org.apache.jena.atlas.lib.InternalErrorException ;
 import org.apache.jena.atlas.logging.Log ;
+import org.apache.lucene.queryparser.classic.QueryParser ;
+import org.slf4j.Logger ;
+import org.slf4j.LoggerFactory ;
 
 import com.hp.hpl.jena.datatypes.RDFDatatype ;
 import com.hp.hpl.jena.datatypes.xsd.XSDDatatype ;
@@ -39,82 +41,52 @@ import com.hp.hpl.jena.sparql.engine.ite
 import com.hp.hpl.jena.sparql.pfunction.PropFuncArg ;
 import com.hp.hpl.jena.sparql.pfunction.PropertyFunctionBase ;
 import com.hp.hpl.jena.sparql.util.IterLib ;
-import com.hp.hpl.jena.sparql.util.NodeFactoryExtra;
+import com.hp.hpl.jena.sparql.util.NodeFactoryExtra ;
 
-/** property function that accesses a Solr server */ 
-public class QueryPF extends PropertyFunctionBase
-{
-    private TextIndex server = null ;  
-    private boolean warningIssued = false ;
-    
-    public QueryPF() { } 
+/** property function that accesses a Solr server */
+public class QueryPF extends PropertyFunctionBase {
+    private static Logger log           = LoggerFactory.getLogger(QueryPF.class) ;
+    /*
+     * ?uri :queryPF (property? "string" limit? score?) -- the score? argument is not implemented
+     */
+
+    private TextIndex     server        = null ;
+    private boolean       warningIssued = false ;
+
+    public QueryPF() {}
 
     @Override
-    public void build(PropFuncArg argSubject, Node predicate, PropFuncArg argObject, ExecutionContext execCxt)
-    {
+    public void build(PropFuncArg argSubject, Node predicate, PropFuncArg argObject, ExecutionContext execCxt) {
         super.build(argSubject, predicate, argObject, execCxt) ;
-        
+
         DatasetGraph dsg = execCxt.getDataset() ;
         server = chooseTextIndex(dsg) ;
-        
-        if ( ! argSubject.isNode() )
-            throw new QueryBuildException("Subject is not a single node: "+argSubject) ;
-        
-        if ( argObject.isList() )
-        {
+
+        if (!argSubject.isNode())
+            throw new QueryBuildException("Subject is not a single node: " + argSubject) ;
+
+        if (argObject.isList()) {
             List<Node> list = argObject.getArgList() ;
-            if ( list.size() == 0 )
+            if (list.size() == 0)
                 throw new QueryBuildException("Zero-length argument list") ;
 
-            if ( list.size() > 4 )
-                throw new QueryBuildException("Too many arguments in list : "+list) ;
+            if (list.size() > 4)
+                throw new QueryBuildException("Too many arguments in list : " + list) ;
         }
     }
 
-    /*
-     * ?uri :queryPF (property? "string" limit? score?)
-     * score? not implemented
-     */
-    
-    // score limit - float : new IteratorTruncate<SolrDocument>(...., iter) ; 
-    
-    static class StrMatch
-    {
-        private final Node property ;
-        private final String queryString ;
-        private final int limit ;
-        private final float scoreLimit ;
-
-        public StrMatch(Node property, String queryString, int limit, float scoreLimit)
-        {
-            super() ;
-            this.property = property ;
-            this.queryString = queryString ;
-            this.limit = limit ;
-            this.scoreLimit = scoreLimit ;
-        }
-
-        public Node getProperty()           { return property ; }
-
-        public String getQueryString()      { return queryString ; }
-
-        public int getLimit()               { return limit ; }
-
-        public float getScoreLimit()        { return scoreLimit ; }
-    }
-    
-    private static TextIndex chooseTextIndex(DatasetGraph dsg)
-    {
+    private static TextIndex chooseTextIndex(DatasetGraph dsg) {
         Object obj = dsg.getContext().get(TextQuery.textIndex) ;
 
-        if ( obj != null )
-        {
-            try { return (TextIndex)obj ; } 
-            catch (ClassCastException ex) { Log.warn(QueryPF.class, "Context setting '"+TextQuery.textIndex+"'is not a TextIndex") ; }
+        if (obj != null) {
+            try {
+                return (TextIndex)obj ;
+            } catch (ClassCastException ex) {
+                Log.warn(QueryPF.class, "Context setting '" + TextQuery.textIndex + "'is not a TextIndex") ;
+            }
         }
 
-        if ( dsg instanceof DatasetGraphText )
-        {
+        if (dsg instanceof DatasetGraphText) {
             DatasetGraphText x = (DatasetGraphText)dsg ;
             return x.getTextIndex() ;
         }
@@ -123,134 +95,191 @@ public class QueryPF extends PropertyFun
     }
 
     @Override
-    public QueryIterator exec(Binding binding, PropFuncArg argSubject, Node predicate, PropFuncArg argObject, ExecutionContext execCxt)
-    {
-        if ( server == null )
-        {
-            if ( ! warningIssued )
-            {
+    public QueryIterator exec(Binding binding, PropFuncArg argSubject, Node predicate, PropFuncArg argObject,
+                              ExecutionContext execCxt) {
+        if (server == null) {
+            if (!warningIssued) {
                 Log.warn(getClass(), "No text index - no text search performed") ;
                 warningIssued = true ;
             }
             // Not a text dataset - no-op
             return IterLib.result(binding, execCxt) ;
         }
-     
+
         DatasetGraph dsg = execCxt.getDataset() ;
-        
-        if ( ! argSubject.isNode() )
+
+        if (!argSubject.isNode())
             throw new InternalErrorException("Subject is not a node (it was earlier!)") ;
-            
+
         Node s = argSubject.getArg() ;
-        
-        if ( s.isLiteral() )
+
+        if (s.isLiteral())
             // Does not match
             return IterLib.noResults(execCxt) ;
-        
+
         StrMatch match = objectToStruct(argObject) ;
+        if (match == null) {
+            // can't match
+            return IterLib.noResults(execCxt) ;
+        }
 
         // ----
-        
-        QueryIterator qIter =  ( Var.isVar(s) ) 
-            ? variableSubject(binding, s, match, execCxt)
-            : concreteSubject(binding, s, match, execCxt) ;
-        
-        if ( match.getLimit() >= 0 )
+
+        QueryIterator qIter = (Var.isVar(s)) ? variableSubject(binding, s, match, execCxt) : concreteSubject(binding,
+                                                                                                             s, match,
+                                                                                                             execCxt) ;
+
+        if (match.getLimit() >= 0)
             qIter = new QueryIterSlice(qIter, 0, match.getLimit(), execCxt) ;
         return qIter ;
     }
 
-    private QueryIterator variableSubject(Binding binding, Node s, StrMatch match, ExecutionContext execCxt )
-    {
+    private QueryIterator variableSubject(Binding binding, Node s, StrMatch match, ExecutionContext execCxt) {
         Var v = Var.alloc(s) ;
-        List<Node> r = server.query(match.getQueryString(), match.getLimit()) ;
-        // Make distinct.  Note interaction with limit is imperfect
+        List<Node> r = query(match.getQueryString(), match.getLimit()) ;
+        // Make distinct. Note interaction with limit is imperfect
         r = Iter.iter(r).distinct().toList() ;
         QueryIterator qIter = new QueryIterExtendByVar(binding, v, r.iterator(), execCxt) ;
         return qIter ;
     }
 
-    private QueryIterator concreteSubject(Binding binding, Node s, StrMatch match, ExecutionContext execCxt )
-    {
-        if ( ! s.isURI() )
-        {
-            Log.warn(this, "Subject not a URI: "+s) ;
-            return IterLib.noResults(execCxt) ; 
+    private QueryIterator concreteSubject(Binding binding, Node s, StrMatch match, ExecutionContext execCxt) {
+        if (!s.isURI()) {
+            log.warn("Subject not a URI: " + s) ;
+            return IterLib.noResults(execCxt) ;
         }
-        
+
         String uri = s.getURI() ;
-        Map<String, Node> x = server.get(uri) ;
-        if ( x == null || x.isEmpty() )
+        
+        // Restrict to documents that match the query and whose entity field is this URI.
+        String qs = match.getQueryString() ;
+        if ( false ) {
+            // This should work but it doesn't
+            String escaped = QueryParser.escape(uri) ;
+            String qs2 = server.getDocDef().getEntityField() + ":" + escaped ;
+            qs = qs2 + " AND " + qs ;
+            List<Node> x = query(qs, 1) ;
+            if (x == null || x.isEmpty())
+                return IterLib.noResults(execCxt) ;
+            else
+                return IterLib.result(binding, execCxt) ;
+        }
+        // Crude.
+        List<Node> x = query(qs, -1) ;
+        if ( x == null || ! x.contains(s) )
             return IterLib.noResults(execCxt) ;
         else
             return IterLib.result(binding, execCxt) ;
     }
 
-    /** Deconstruct the node or list object argument and make a StrMatch */ 
-    private StrMatch objectToStruct(PropFuncArg argObject)
-    {
-        
-        EntityDefinition docDef = server.getDocDef()  ;
-        if ( argObject.isNode() )
-        {
+    private List<Node> query(String queryString, int limit) {
+        // Explain
+        if ( log.isInfoEnabled())
+            log.info("Text query: {} ({})", queryString,limit) ;
+        return server.query(queryString, limit) ;
+    }
+    
+    /** Deconstruct the node or list object argument and make a StrMatch */
+    private StrMatch objectToStruct(PropFuncArg argObject) {
+        EntityDefinition docDef = server.getDocDef() ;
+        if (argObject.isNode()) {
             Node o = argObject.getArg() ;
-            
-            if ( ! o.isLiteral() )
-            { System.err.println("Bad/4") ; }
-            
+
+            if (!o.isLiteral()) {
+                log.warn("Object to text query is not a literal") ;
+                return null ;
+            }
+
             RDFDatatype dt = o.getLiteralDatatype() ;
-            if ( dt != null && dt != XSDDatatype.XSDstring )
-            { System.err.println("Bad") ; }
-                
+            if (dt != null && dt != XSDDatatype.XSDstring) {
+                log.warn("Object to text query is not a string") ;
+                return null ;
+            }
+
             String qs = o.getLiteralLexicalForm() ;
-            return new StrMatch(docDef.getPrimaryPredicate(), qs, -1, 0) ; 
+            return new StrMatch(docDef.getPrimaryPredicate(), qs, -1, 0) ;
         }
-         
+
         List<Node> list = argObject.getArgList() ;
-        if ( list.size() == 0 || list.size() > 3 )
-            throw new TextIndexException("Change in object list size") ; 
+        if (list.size() == 0 || list.size() > 3)
+            throw new TextIndexException("Change in object list size") ;
 
         Node p = docDef.getPrimaryPredicate() ;
         String field = docDef.getPrimaryField() ;
         int idx = 0 ;
         Node x = list.get(0) ;
         // Property?
-        if ( x.isURI() )
-        {
+        if (x.isURI()) {
             p = x ;
             idx++ ;
-            if ( idx >= list.size() )
-                throw new TextIndexException("Property specificied but no query string : "+list) ;
+            if (idx >= list.size())
+                throw new TextIndexException("Property specificied but no query string : " + list) ;
             x = list.get(idx) ;
-            field = docDef.getField(p) ; 
+            field = docDef.getField(p) ;
+            if (field == null) {
+                log.warn("Predicate not indexed: " + p) ;
+                return null ;
+            }
         }
-        
+
         // String!
-        if ( ! x.isLiteral() )
-            throw new TextIndexException("Query isn't a literal string : "+list) ;
-        if ( x.getLiteralDatatype() != null && ! x.getLiteralDatatype().equals(XSDDatatype.XSDstring) )
-            throw new TextIndexException("Query isn't a string : "+list) ;
-        String queryString = x.getLiteralLexicalForm() ;  
+        if (!x.isLiteral()) {
+            log.warn("Text query string is not a literal " + list) ;
+            return null ;
+        }
+        if (x.getLiteralDatatype() != null && !x.getLiteralDatatype().equals(XSDDatatype.XSDstring)) {
+            log.warn("Text query is not a string " + list) ;
+            return null ;
+        }
+        String queryString = x.getLiteralLexicalForm() ;
         idx++ ;
-        
+
         int limit = -1 ;
         float score = 0 ;
-        
-        if ( idx < list.size() )
-        {        
+
+        if (idx < list.size()) {
             // Limit?
             x = list.get(idx) ;
             idx++ ;
             int v = NodeFactoryExtra.nodeToInt(x) ;
-            limit = ( v < 0 ) ? -1 : v ; 
+            limit = (v < 0) ? -1 : v ;
         }
 
         String qs = queryString ;
-        if ( field != null )
-            qs = field+":"+qs ;
-        
+        if (field != null)
+            qs = field + ":" + qs ;
+
         return new StrMatch(p, qs, limit, score) ;
     }
-}
 
+    class StrMatch {
+        private final Node   property ;
+        private final String queryString ;
+        private final int    limit ;
+        private final float  scoreLimit ;
 
+        public StrMatch(Node property, String queryString, int limit, float scoreLimit) {
+            super() ;
+            this.property = property ;
+            this.queryString = queryString ;
+            this.limit = limit ;
+            this.scoreLimit = scoreLimit ;
+        }
+
+        public Node getProperty() {
+            return property ;
+        }
+
+        public String getQueryString() {
+            return queryString ;
+        }
+
+        public int getLimit() {
+            return limit ;
+        }
+
+        public float getScoreLimit() {
+            return scoreLimit ;
+        }
+    }
+}

Modified: jena/trunk/jena-text/src/main/java/org/apache/jena/query/text/TextIndex.java
URL: http://svn.apache.org/viewvc/jena/trunk/jena-text/src/main/java/org/apache/jena/query/text/TextIndex.java?rev=1495837&r1=1495836&r2=1495837&view=diff
==============================================================================
--- jena/trunk/jena-text/src/main/java/org/apache/jena/query/text/TextIndex.java (original)
+++ jena/trunk/jena-text/src/main/java/org/apache/jena/query/text/TextIndex.java Sun Jun 23 16:26:33 2013
@@ -25,13 +25,17 @@ import org.apache.jena.atlas.lib.Closeab
 
 import com.hp.hpl.jena.graph.Node ;
 
+/** TextIndex abstraction */ 
 public interface TextIndex extends Closeable //, Transactional 
 {
+    // Update operations
     public abstract void startIndexing() ;
     public abstract void addEntity(Entity entity) ;
     public abstract void finishIndexing() ;
     public abstract void abortIndexing() ;
     
+    // read operations
+    /** Get all entries for uri */
     public abstract Map<String, Node> get(String uri) ;
 
     /** Access the index - limit if -1 for as many as possible */ 

Modified: jena/trunk/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLucene.java
URL: http://svn.apache.org/viewvc/jena/trunk/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLucene.java?rev=1495837&r1=1495836&r2=1495837&view=diff
==============================================================================
--- jena/trunk/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLucene.java (original)
+++ jena/trunk/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLucene.java Sun Jun 23 16:26:33 2013
@@ -18,45 +18,52 @@
 
 package org.apache.jena.query.text;
 
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Map.Entry;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.FieldType;
-import org.apache.lucene.document.TextField;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.queryparser.classic.ParseException;
-import org.apache.lucene.queryparser.classic.QueryParser;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.Version;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import java.io.IOException ;
+import java.util.* ;
+import java.util.Map.Entry ;
+
+import org.apache.lucene.analysis.Analyzer ;
+import org.apache.lucene.analysis.standard.StandardAnalyzer ;
+import org.apache.lucene.document.Document ;
+import org.apache.lucene.document.Field ;
+import org.apache.lucene.document.FieldType ;
+import org.apache.lucene.document.TextField ;
+import org.apache.lucene.index.DirectoryReader ;
+import org.apache.lucene.index.IndexReader ;
+import org.apache.lucene.index.IndexWriter ;
+import org.apache.lucene.index.IndexWriterConfig ;
+import org.apache.lucene.queryparser.classic.ParseException ;
+import org.apache.lucene.queryparser.classic.QueryParser ;
+import org.apache.lucene.search.IndexSearcher ;
+import org.apache.lucene.search.Query ;
+import org.apache.lucene.search.ScoreDoc ;
+import org.apache.lucene.store.Directory ;
+import org.apache.lucene.util.Version ;
+import org.slf4j.Logger ;
+import org.slf4j.LoggerFactory ;
 
-import com.hp.hpl.jena.graph.Node;
+import com.hp.hpl.jena.graph.Node ;
 import com.hp.hpl.jena.graph.NodeFactory ;
-import com.hp.hpl.jena.sparql.util.NodeFactoryExtra;
+import com.hp.hpl.jena.sparql.util.NodeFactoryExtra ;
 
 public class TextIndexLucene implements TextIndex
 {
     private static Logger log = LoggerFactory.getLogger(TextIndexLucene.class) ;
     
     private static int MAX_N = 10000 ;
-    private static final Version VER = Version.LUCENE_41 ;
-    static FieldType ftIndexedStored = TextField.TYPE_STORED ; 
-    static FieldType ftIndexed = TextField.TYPE_NOT_STORED ;
+    public static final Version VER = Version.LUCENE_41 ;
+    
+    public static final FieldType ftIRI ;
+    static {
+        ftIRI = new FieldType() ;
+        ftIRI.setTokenized(false) ;
+        ftIRI.setStored(true) ;
+        ftIRI.setIndexed(true) ;
+        ftIRI.freeze() ;
+    }
+    //public static final FieldType ftText = TextField.TYPE_NOT_STORED ;
+    // Bigger index, easier to debug!
+    public static final FieldType ftText = TextField.TYPE_STORED ;
     
     private final EntityDefinition docDef ;
     private final Directory directory ;
@@ -70,11 +77,14 @@ public class TextIndexLucene implements 
         this.docDef = def ;
         
         // force creation of the index if it don't exist
-        // othewise if we get a search before data is written we get an exception
+        // otherwise if we get a search before data is written we get an exception
         startIndexing();
         finishIndexing();
     }
     
+    public Directory getDirectory()     { return directory ; }
+    public Analyzer getAnalyzer()       { return analyzer ; }
+    
     @Override
     public void startIndexing()
     { 
@@ -119,12 +129,12 @@ public class TextIndexLucene implements 
     private Document doc(Entity entity)
     {
         Document doc = new Document() ;
-        Field entField = new Field(docDef.getEntityField(), entity.getId(), ftIndexedStored) ;
+        Field entField = new Field(docDef.getEntityField(), entity.getId(), ftIRI) ;
         doc.add(entField) ;
         
         for ( Entry<String, Object> e : entity.getMap().entrySet() )
         {
-            Field field = new Field(e.getKey(), (String)e.getValue(), ftIndexed) ;
+            Field field = new Field(e.getKey(), (String)e.getValue(), ftText) ;
             doc.add(field) ;
         }
         return doc ;
@@ -144,13 +154,13 @@ public class TextIndexLucene implements 
         } catch (Exception ex) { exception(ex) ; return null ; } 
     }
     
-    private List<Map<String, Node>> get$(IndexReader indexReader , String uri)  throws ParseException, IOException {
+    private List<Map<String, Node>> get$(IndexReader indexReader, String uri)  throws ParseException, IOException {
         String escaped = QueryParser.escape(uri);
         String qs = docDef.getEntityField()+":"+escaped ;
         QueryParser queryParser = new QueryParser(VER, docDef.getPrimaryField(), analyzer);
         Query query = queryParser.parse(qs);
         IndexSearcher indexSearcher = new IndexSearcher(indexReader);
-        ScoreDoc[] sDocs = indexSearcher.search(query, 1).scoreDocs ;   // Only need one hit.    
+        ScoreDoc[] sDocs = indexSearcher.search(query, 1).scoreDocs ;    
         List<Map<String, Node>> records = new ArrayList<Map<String, Node>>() ;
 
         // Align and DRY with Solr.
@@ -195,7 +205,7 @@ public class TextIndexLucene implements 
         } catch (Exception ex) { exception(ex) ; return null ; } 
     }
         
-    public List<Node> query$(IndexReader indexReader , String qs, int limit) throws ParseException, IOException {
+    private List<Node> query$(IndexReader indexReader , String qs, int limit) throws ParseException, IOException {
         IndexSearcher indexSearcher = new IndexSearcher(indexReader);
         QueryParser queryParser = new QueryParser(VER, docDef.getPrimaryField(), analyzer);
         Query query = queryParser.parse(qs);

Modified: jena/trunk/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneAssembler.java
URL: http://svn.apache.org/viewvc/jena/trunk/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneAssembler.java?rev=1495837&r1=1495836&r2=1495837&view=diff
==============================================================================
--- jena/trunk/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneAssembler.java (original)
+++ jena/trunk/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneAssembler.java Sun Jun 23 16:26:33 2013
@@ -65,7 +65,7 @@ public class TextIndexLuceneAssembler ex
             if ( n.isLiteral() )
             {
                 if ( ! "mem".equals(n.asLiteral().getLexicalForm()) )
-                    throw new TextIndexException("No 'text:directory' property on "+root+ " is a liteal and not \"mem\"") ;
+                    throw new TextIndexException("No 'text:directory' property on "+root+ " is a literal and not \"mem\"") ;
                  directory = new RAMDirectory() ;
             }
             else