You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by an...@apache.org on 2017/06/28 16:57:12 UTC

[03/18] jena git commit: implement GenericAnalyzerAssembler. TO DO: Tests

implement GenericAnalyzerAssembler. TO DO: Tests

Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/27ea30b7
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/27ea30b7
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/27ea30b7

Branch: refs/heads/master
Commit: 27ea30b73855d7a3cf0cd9561d2089295ec03353
Parents: 8b3757b
Author: Chris Tomlinson <ct...@moonvine.org>
Authored: Thu Apr 20 15:37:00 2017 -0500
Committer: Chris Tomlinson <ct...@moonvine.org>
Committed: Thu Apr 20 15:37:00 2017 -0500

----------------------------------------------------------------------
 .../assembler/GenericAnalyzerAssembler.java     | 332 +++++++++++++++++--
 .../query/text/assembler/TextAssembler.java     |   2 +-
 .../jena/query/text/assembler/TextVocab.java    |   8 +-
 3 files changed, 318 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/jena/blob/27ea30b7/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericAnalyzerAssembler.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericAnalyzerAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericAnalyzerAssembler.java
index db707d2..7fb04cc 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericAnalyzerAssembler.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericAnalyzerAssembler.java
@@ -18,10 +18,24 @@
 
 package org.apache.jena.query.text.assembler;
 
+import java.io.Reader;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
+import java.util.ArrayList;
+import java.util.List;
+
 import org.apache.jena.assembler.Assembler;
 import org.apache.jena.assembler.Mode;
 import org.apache.jena.assembler.assemblers.AssemblerBase;
+import org.apache.jena.atlas.logging.Log ;
+import org.apache.jena.query.text.TextIndexException;
+import org.apache.jena.rdf.model.Literal;
+import org.apache.jena.rdf.model.RDFNode;
 import org.apache.jena.rdf.model.Resource;
+import org.apache.jena.rdf.model.Statement;
+import org.apache.jena.vocabulary.RDF;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
 
 /**
  * Creates generic analyzers given a fully qualified Class name and a list
@@ -64,10 +78,29 @@ import org.apache.jena.rdf.model.Resource;
  * <li>a text:paramValue which is an xsd:string, xsd:boolean or xsd:int.</li>
  * </ul>
  * <p>
- * A parameter of type <code>set</code> <i>may have</i> zero or more <code>text:paramValue</code>s.
+ * A parameter of type <code>set</code> <i>must have</i> a list of zero or more <code>String</code>s.
  * <p>
  * A parameter of type <code>string</code>, <code>file</code>, <code>boolean</code>, or 
- * <code>int</code> <i>must have</i> a single <code>text:paramValue</code>
+ * <code>int</code> <i>must have</i> a single <code>text:paramValue</code> of the appropriate type.
+ * <p>
+ * Example:
+ * <pre>
+    text:map (
+         [ text:field "text" ; 
+           text:predicate rdfs:label;
+           text:analyzer [
+               a text:GenericAnalyzer ;
+               text:class "org.apache.lucene.analysis.en.EnglishAnalyzer" ;
+               text:params (
+                    [ text:paramName "stopwords" ;
+                      text:paramType "set" ;
+                      text:paramValue ("the" "a" "an") ]
+                    [ text:paramName "stemExclusionSet" ;
+                      text:paramType "set" ;
+                      text:paramValue ("ing" "ed") ]
+                    )
+           ] .
+ * </pre>
  */
 public class GenericAnalyzerAssembler extends AssemblerBase {
     /*
@@ -77,29 +110,284 @@ public class GenericAnalyzerAssembler extends AssemblerBase {
            text:analyzer [
                a text:GenericAnalyzer ;
                text:class "org.apache.lucene.analysis.en.EnglishAnalyzer" ;
-               text:params [
-                    a rdf:seq ;
-                    rdf:_1 [
-                        text:paramName "stopwords" ;
-                        text:paramType "set" ;
-                        text:paramValue "the", "a", "an" ] ;
-                    rdf:_2 [
-                        text:paramName "stemExclusionSet" ;
-                        text:paramType "set" ;
-                        text:paramValue "ing", "ed" ]
-                    ]
-                ]
-          ] .
+               text:params (
+                    [ text:paramName "stopwords" ;
+                      text:paramType "set" ;
+                      text:paramValue ("the" "a" "an") ]
+                    [ text:paramName "stemExclusionSet" ;
+                      text:paramType "set" ;
+                      text:paramValue ("ing" "ed") ]
+                    )
+           ] .
      */
 
-	public GenericAnalyzerAssembler() {
-		// TODO Auto-generated constructor stub
-	}
-
 	@Override
-	public Object open(Assembler a, Resource root, Mode mode) {
-		// TODO Auto-generated method stub
-		return null;
+	public Analyzer open(Assembler a, Resource root, Mode mode) {
+	    // Guard first: without text:class there is no analyzer to instantiate.
+	    if (!root.hasProperty(TextVocab.pClass)) {
+	        throw new TextIndexException("text:class property is required by GenericAnalyzer");
+	    }
+
+	    // text:class is expected to be a string literal naming the analyzer class
+	    final String analyzerName = root.getProperty(TextVocab.pClass).getString();
+
+	    // The class must be loadable; if it is not, log the problem and give up with null
+	    final Class<?> analyzerClass;
+	    try {
+	        analyzerClass = Class.forName(analyzerName);
+	    } catch (ClassNotFoundException notFound) {
+	        Log.error(this, "Analyzer class " + analyzerName + " not found. " + notFound.getMessage(), notFound);
+	        return null;
+	    }
+
+	    // ... and it must be an Analyzer; otherwise log and give up with null as well
+	    if (!Analyzer.class.isAssignableFrom(analyzerClass)) {
+	        Log.error(this, analyzerClass.getName() + " has to be a subclass of " + Analyzer.class.getName());
+	        return null;
+	    }
+
+	    // No text:params: use the nullary Analyzer constructor
+	    if (!root.hasProperty(TextVocab.pParams)) {
+	        return newAnalyzer(analyzerClass, new Class<?>[0], new Object[0]);
+	    }
+
+	    // text:params must point at a resource (the head of the parameter list)
+	    RDFNode paramsNode = root.getProperty(TextVocab.pParams).getObject();
+	    if (!paramsNode.isResource()) {
+	        throw new TextIndexException("text:params must be a list of parameter resources: " + paramsNode);
+	    }
+
+	    // Split the param specs into parallel type / value arrays for constructor lookup
+	    final List<ParamSpec> specs = getParamSpecs((Resource) paramsNode);
+	    final Class<?>[] argTypes = new Class<?>[specs.size()];
+	    final Object[] argValues = new Object[specs.size()];
+	    int slot = 0;
+	    for (ParamSpec spec : specs) {
+	        argTypes[slot] = spec.getValueClass();
+	        argValues[slot] = spec.getValue();
+	        slot++;
+	    }
+
+	    return newAnalyzer(analyzerClass, argTypes, argValues);
 	}
 
+    /**
+     * Reflectively creates an instance of the Lucene Analyzer <code>clazz</code> via the
+     * declared constructor whose parameter types are exactly <code>paramClasses</code>.
+     *
+     * @param clazz The analyzer class
+     * @param paramClasses The constructor parameter classes, in order
+     * @param paramValues The constructor arguments, in the same order
+     * @return The lucene analyzer, or <code>null</code> when a failure was caught and logged
+     */
+    private Analyzer newAnalyzer(Class<?> clazz, Class<?>[] paramClasses, Object[] paramValues) {
+        final String name = clazz.getName();
+        Analyzer created = null;
+
+        try {
+            Constructor<?> ctor = clazz.getDeclaredConstructor(paramClasses);
+            created = (Analyzer) ctor.newInstance(paramValues);
+        } catch (NoSuchMethodException missing) {
+            Log.error(this, "Could not find matching analyzer class constructor for " + name + " " + missing.getMessage(), missing);
+        } catch (IllegalArgumentException | IllegalAccessException | InstantiationException | InvocationTargetException | SecurityException failure) {
+            Log.error(this, "Exception while instantiating analyzer class " + name + ". " + failure.getMessage(), failure);
+        }
+
+        // still null if either catch block ran
+        return created;
+    }
+    
+    private List<ParamSpec> getParamSpecs(Resource list) {
+        // Walk the rdf:first / rdf:rest cells by hand, converting every member
+        // into a ParamSpec in list order; the walk stops at rdf:nil.
+        final List<ParamSpec> specs = new ArrayList<>();
+
+        for (Resource cell = list; cell != null && !cell.equals(RDF.nil); ) {
+            final Statement head = cell.getProperty(RDF.first);
+            if (head == null) {
+                throw new TextIndexException("parameter list not well formed: " + cell);
+            }
+
+            final RDFNode member = head.getObject();
+            if (!member.isResource()) {
+                throw new TextIndexException("parameter specification must be an anon resource : " + member);
+            }
+            specs.add(getParamSpec((Resource) member));
+
+            final Statement tail = cell.getProperty(RDF.rest);
+            if (tail == null) {
+                throw new TextIndexException("parameter list not terminated by rdf:nil");
+            }
+
+            final RDFNode next = tail.getObject();
+            if (!next.isResource()) {
+                throw new TextIndexException("parameter list node is not a resource : " + next);
+            }
+
+            cell = (Resource) next;
+        }
+
+        return specs;
+    }
+    
+    /**
+     * Builds the constructor-parameter spec described by one parameter resource.
+     * Supported <code>text:paramType</code> values are <code>string</code>,
+     * <code>file</code>, <code>set</code>, <code>int</code> and <code>boolean</code>.
+     *
+     * @param node resource carrying text:paramName, text:paramType and text:paramValue
+     * @return the parameter spec, never <code>null</code>
+     * @throws TextIndexException if the type is missing/unknown or the value is missing or unusable
+     */
+    private ParamSpec getParamSpec(Resource node) {
+        Statement nameStmt = node.getProperty(TextVocab.pParamName);
+        Statement typeStmt = node.getProperty(TextVocab.pParamType);
+        Statement valueStmt = node.getProperty(TextVocab.pParamValue);
+
+        String name = getStringValue(nameStmt);
+        String type = getStringValue(typeStmt);
+        String value = getStringValue(valueStmt);
+
+        // A switch on a null String throws NullPointerException, so reject it explicitly.
+        if (type == null) {
+            throw new TextIndexException("A param spec must have a literal text:paramType: " + node);
+        }
+
+        switch (type) {
+
+        case "string": {
+            if (value == null) {
+                throw new TextIndexException("Value for string param: " + name + " must not be empty!");
+            }
+            return new ParamSpec(name, value, String.class);
+        }
+
+        case "file": {
+            if (value == null) {
+                throw new TextIndexException("Value for file param must exist and must contain a file name.");
+            }
+            try {
+                // The analyzer is responsible for closing the file
+                Reader fileReader = new java.io.FileReader(value);
+                return new ParamSpec(name, fileReader, Reader.class);
+            } catch (java.io.FileNotFoundException ex) {
+                // keep the cause so the underlying I/O reason is not lost
+                throw new TextIndexException("File " + value + " for param " + name + " not found!", ex);
+            }
+        }
+
+        case "set": {
+            if (valueStmt == null) {
+                throw new TextIndexException("A set param spec must have a text:paramValue:" + node);
+            }
+            RDFNode valueNode = valueStmt.getObject();
+            if (!valueNode.isResource()) {
+                throw new TextIndexException("A set param spec text:paramValue must be a list of strings: " + valueNode);
+            }
+            List<String> values = toStrings((Resource) valueNode);
+            return new ParamSpec(name, new CharArraySet(values, false), CharArraySet.class);
+        }
+
+        case "int": {
+            if (value == null) {
+                throw new TextIndexException("Value for int param: " + name + " must not be empty!");
+            }
+            int n = ((Literal) valueStmt.getObject()).getInt();
+            return new ParamSpec(name, n, int.class);
+        }
+
+        case "boolean": {
+            if (value == null) {
+                throw new TextIndexException("Value for boolean param: " + name + " must not be empty!");
+            }
+            boolean b = ((Literal) valueStmt.getObject()).getBoolean();
+            return new ParamSpec(name, b, boolean.class);
+        }
+
+        default:
+            // returning null here only deferred the failure to a NullPointerException in open()
+            throw new TextIndexException("Unknown parameter type: " + type + " for param: " + name + " with value: " + value);
+        }
+    }
+    
+    /**
+     * Reads the object of <code>stmt</code> as text.
+     *
+     * @return the literal's lexical form; null if stmt is null or its object is not a literal
+     */
+    private String getStringValue(Statement stmt) {
+        if (stmt == null) {
+            return null;
+        }
+        final RDFNode object = stmt.getObject();
+        return object.isLiteral() ? ((Literal) object).getLexicalForm() : null;
+    }
+
+    private List<String> toStrings(Resource list) {
+        // Walk the rdf:first / rdf:rest cells by hand, collecting the lexical form
+        // of every literal member in list order; the walk stops at rdf:nil.
+        final List<String> items = new ArrayList<>();
+
+        for (Resource cell = list; cell != null && !cell.equals(RDF.nil); ) {
+            final Statement head = cell.getProperty(RDF.first);
+            if (head == null) {
+                throw new TextIndexException("param spec of type set not well formed");
+            }
+
+            final RDFNode member = head.getObject();
+            if (!member.isLiteral()) {
+                throw new TextIndexException("param spec of type set item is not a literal: " + member);
+            }
+            items.add(((Literal) member).getLexicalForm());
+
+            final Statement tail = cell.getProperty(RDF.rest);
+            if (tail == null) {
+                throw new TextIndexException("param spec of type set not terminated by rdf:nil");
+            }
+
+            final RDFNode next = tail.getObject();
+            if (!next.isResource()) {
+                throw new TextIndexException("param spec of type set rest is not a resource: " + next);
+            }
+
+            cell = (Resource) next;
+        }
+
+        return items;
+    }
+
+    /**
+     * <code>ParamSpec</code> bundles the <code>name</code>, <code>value</code> and declared
+     * <code>Class</code> of a single constructor argument; all three are fixed at construction.
+     */
+    private static final class ParamSpec {
+
+        private final String key;
+        private final Class<?> type;
+        private final Object val;
+
+        public ParamSpec(String key, Object value, Class<?> clazz) {
+            this.key = key;
+            this.val = value;
+            this.type = clazz;
+        }
+
+        @SuppressWarnings("unused")
+        public ParamSpec(String key, Object value) {
+            this(key, value, value.getClass());
+        }
+
+        @SuppressWarnings("unused")
+        public String getKey() {
+            return key;
+        }
+
+        public Object getValue() {
+            return val;
+        }
+
+        public Class<?> getValueClass() {
+            return type;
+        }
+    }
 }

http://git-wip-us.apache.org/repos/asf/jena/blob/27ea30b7/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java
index 636c6bc..45f5cee 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java
@@ -37,7 +37,7 @@ public class TextAssembler
         Assembler.general.implementWith(TextVocab.lowerCaseKeywordAnalyzer, new LowerCaseKeywordAnalyzerAssembler()) ;
         Assembler.general.implementWith(TextVocab.localizedAnalyzer, new LocalizedAnalyzerAssembler()) ;
         Assembler.general.implementWith(TextVocab.configurableAnalyzer, new ConfigurableAnalyzerAssembler()) ;
-        Assembler.general.implementWith(TextVocab.genericAnalyzer, new GenericAnalyzerAssembler()) ;
+        Assembler.general.implementWith(TextVocab.genericAnalyzer,  new GenericAnalyzerAssembler()) ;
 
     }
 }

http://git-wip-us.apache.org/repos/asf/jena/blob/27ea30b7/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java
index bc49d10..cd1844d 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java
@@ -66,7 +66,6 @@ public class TextVocab
     public static final Resource lowerCaseKeywordAnalyzer    = Vocab.resource(NS, "LowerCaseKeywordAnalyzer");
     public static final Resource localizedAnalyzer    = Vocab.resource(NS, "LocalizedAnalyzer");
     public static final Resource configurableAnalyzer = Vocab.resource(NS, "ConfigurableAnalyzer");
-    public static final Resource genericAnalyzer   = Vocab.resource(NS, "GenericAnalyzer");
     
     // Tokenizers
     public static final Resource standardTokenizer  = Vocab.resource(NS, "StandardTokenizer");
@@ -87,5 +86,12 @@ public class TextVocab
     public static final Property pReplicas          = Vocab.property(NS, "replicas");
     public static final Property pIndexName          = Vocab.property(NS, "indexName");
 
+    //GenericAnalyzer
+    public static final Resource genericAnalyzer    = Vocab.resource(NS, "GenericAnalyzer");
+    public static final Property pClass             = Vocab.property(NS, "class");
+    public static final Property pParams            = Vocab.property(NS, "params");
+    public static final Property pParamName         = Vocab.property(NS, "paramName");
+    public static final Property pParamType         = Vocab.property(NS, "paramType");
+    public static final Property pParamValue        = Vocab.property(NS, "paramValue");
 }