You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by an...@apache.org on 2017/06/28 16:57:12 UTC
[03/18] jena git commit: implement GenericAnalyzerAssembler. TO DO:
Tests
implement GenericAnalyzerAssembler. TO DO: Tests
Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/27ea30b7
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/27ea30b7
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/27ea30b7
Branch: refs/heads/master
Commit: 27ea30b73855d7a3cf0cd9561d2089295ec03353
Parents: 8b3757b
Author: Chris Tomlinson <ct...@moonvine.org>
Authored: Thu Apr 20 15:37:00 2017 -0500
Committer: Chris Tomlinson <ct...@moonvine.org>
Committed: Thu Apr 20 15:37:00 2017 -0500
----------------------------------------------------------------------
.../assembler/GenericAnalyzerAssembler.java | 332 +++++++++++++++++--
.../query/text/assembler/TextAssembler.java | 2 +-
.../jena/query/text/assembler/TextVocab.java | 8 +-
3 files changed, 318 insertions(+), 24 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/jena/blob/27ea30b7/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericAnalyzerAssembler.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericAnalyzerAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericAnalyzerAssembler.java
index db707d2..7fb04cc 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericAnalyzerAssembler.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericAnalyzerAssembler.java
@@ -18,10 +18,24 @@
package org.apache.jena.query.text.assembler;
+import java.io.Reader;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
+import java.util.ArrayList;
+import java.util.List;
+
import org.apache.jena.assembler.Assembler;
import org.apache.jena.assembler.Mode;
import org.apache.jena.assembler.assemblers.AssemblerBase;
+import org.apache.jena.atlas.logging.Log ;
+import org.apache.jena.query.text.TextIndexException;
+import org.apache.jena.rdf.model.Literal;
+import org.apache.jena.rdf.model.RDFNode;
import org.apache.jena.rdf.model.Resource;
+import org.apache.jena.rdf.model.Statement;
+import org.apache.jena.vocabulary.RDF;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
/**
* Creates generic analyzers given a fully qualified Class name and a list
@@ -64,10 +78,29 @@ import org.apache.jena.rdf.model.Resource;
* <li>a text:paramValue which is an xsd:string, xsd:boolean or xsd:int.</li>
* </ul>
* <p>
- * A parameter of type <code>set</code> <i>may have</i> zero or more <code>text:paramValue</code>s.
+ * A parameter of type <code>set</code> <i>must have</i> a list of zero or more <code>String</code>s.
* <p>
* A parameter of type <code>string</code>, <code>file</code>, <code>boolean</code>, or
- * <code>int</code> <i>must have</i> a single <code>text:paramValue</code>
+ * <code>int</code> <i>must have</i> a single <code>text:paramValue</code> of the appropriate type.
+ * <p>
+ * Example:
+ * <pre>
+ text:map (
+ [ text:field "text" ;
+ text:predicate rdfs:label;
+ text:analyzer [
+ a text:GenericAnalyzer ;
+ text:class "org.apache.lucene.analysis.en.EnglishAnalyzer" ;
+ text:params (
+ [ text:paramName "stopwords" ;
+ text:paramType "set" ;
+ text:paramValue ("the" "a" "an") ]
+ [ text:paramName "stemExclusionSet" ;
+ text:paramType "set" ;
+ text:paramValue ("ing" "ed") ]
+ )
+ ]
+ ]
+ ) .
+ * </pre>
*/
public class GenericAnalyzerAssembler extends AssemblerBase {
/*
@@ -77,29 +110,284 @@ public class GenericAnalyzerAssembler extends AssemblerBase {
text:analyzer [
a text:GenericAnalyzer ;
text:class "org.apache.lucene.analysis.en.EnglishAnalyzer" ;
- text:params [
- a rdf:seq ;
- rdf:_1 [
- text:paramName "stopwords" ;
- text:paramType "set" ;
- text:paramValue "the", "a", "an" ] ;
- rdf:_2 [
- text:paramName "stemExclusionSet" ;
- text:paramType "set" ;
- text:paramValue "ing", "ed" ]
- ]
- ]
- ] .
+ text:params (
+ [ text:paramName "stopwords" ;
+ text:paramType "set" ;
+ text:paramValue ("the" "a" "an") ]
+ [ text:paramName "stemExclusionSet" ;
+ text:paramType "set" ;
+ text:paramValue ("ing" "ed") ]
+ )
+ ] .
*/
- public GenericAnalyzerAssembler() {
- // TODO Auto-generated constructor stub
- }
-
@Override
- public Object open(Assembler a, Resource root, Mode mode) {
- // TODO Auto-generated method stub
- return null;
+    public Analyzer open(Assembler a, Resource root, Mode mode) {
+        // Builds the Lucene Analyzer described by root:
+        //  - text:class names the Analyzer class to instantiate (required);
+        //  - text:params, if present, lists the constructor arguments.
+        // Returns null (after logging) when the class cannot be loaded or is
+        // not an Analyzer; throws TextIndexException for a malformed description.
+        if (!root.hasProperty(TextVocab.pClass)) {
+            throw new TextIndexException("text:class property is required by GenericAnalyzer");
+        }
+
+        // text:class is expected to be a string literal
+        final String className = root.getProperty(TextVocab.pClass).getString();
+
+        // Resolve the class by name; a failure is logged and reported as null.
+        final Class<?> analyzerClass;
+        try {
+            analyzerClass = Class.forName(className);
+        } catch (ClassNotFoundException e) {
+            Log.error(this, "Analyzer class " + className + " not found. " + e.getMessage(), e);
+            return null;
+        }
+
+        // Anything that is not an Analyzer is rejected before instantiation.
+        if (!Analyzer.class.isAssignableFrom(analyzerClass)) {
+            Log.error(this, analyzerClass.getName() + " has to be a subclass of " + Analyzer.class.getName());
+            return null;
+        }
+
+        // Without text:params the nullary Analyzer constructor is used.
+        if (!root.hasProperty(TextVocab.pParams)) {
+            return newAnalyzer(analyzerClass, new Class<?>[0], new Object[0]);
+        }
+
+        final RDFNode paramsNode = root.getProperty(TextVocab.pParams).getObject();
+        if (!paramsNode.isResource()) {
+            throw new TextIndexException("text:params must be a list of parameter resources: " + paramsNode);
+        }
+
+        final List<ParamSpec> specs = getParamSpecs((Resource) paramsNode);
+
+        // Split the specs into parallel arrays: one drives the constructor
+        // lookup, the other supplies the actual arguments.
+        final int count = specs.size();
+        final Class<?>[] paramClasses = new Class<?>[count];
+        final Object[] paramValues = new Object[count];
+        int idx = 0;
+        for (ParamSpec spec : specs) {
+            paramClasses[idx] = spec.getValueClass();
+            paramValues[idx] = spec.getValue();
+            idx++;
+        }
+
+        return newAnalyzer(analyzerClass, paramClasses, paramValues);
     }
+    /**
+     * Instantiates <code>clazz</code> reflectively through the declared constructor
+     * whose parameter types are exactly <code>paramClasses</code>.
+     * <p>
+     * Reflection failures are logged and reported to the caller as <code>null</code>
+     * instead of being propagated.
+     *
+     * @param clazz The analyzer class
+     * @param paramClasses The constructor parameter types, in declaration order
+     * @param paramValues The constructor arguments, parallel to <code>paramClasses</code>
+     * @return The new analyzer, or <code>null</code> if no matching constructor
+     *         exists or instantiation failed
+     */
+    private Analyzer newAnalyzer(Class<?> clazz, Class<?>[] paramClasses, Object[] paramValues) {
+        final String analyzerName = clazz.getName();
+        Analyzer analyzer = null;
+
+        try {
+            final Constructor<?> ctor = clazz.getDeclaredConstructor(paramClasses);
+            analyzer = (Analyzer) ctor.newInstance(paramValues);
+        } catch (NoSuchMethodException ex) {
+            Log.error(this, "Could not find matching analyzer class constructor for " + analyzerName + " " + ex.getMessage(), ex);
+        } catch (IllegalArgumentException | IllegalAccessException | InstantiationException | InvocationTargetException | SecurityException e) {
+            Log.error(this, "Exception while instantiating analyzer class " + analyzerName + ". " + e.getMessage(), e);
+        }
+
+        // still null here if either catch branch ran
+        return analyzer;
+    }
+
+    /**
+     * Walks the <code>rdf:first</code>/<code>rdf:rest</code> chain starting at
+     * <code>list</code> and converts every member into a <code>ParamSpec</code>.
+     *
+     * @param list head of the parameter list; <code>rdf:nil</code> yields an empty result
+     * @return the parameter specs in list order
+     * @throws TextIndexException if a cell lacks <code>rdf:first</code> or
+     *         <code>rdf:rest</code>, or if a member or rest node is not a resource
+     */
+    private List<ParamSpec> getParamSpecs(Resource list) {
+        final List<ParamSpec> specs = new ArrayList<>();
+
+        Resource cell = list;
+        while (!(cell == null || cell.equals(RDF.nil))) {
+            // head of this cell: one parameter description
+            final Statement head = cell.getProperty(RDF.first);
+            if (head == null) {
+                throw new TextIndexException("parameter list not well formed: " + cell);
+            }
+
+            final RDFNode member = head.getObject();
+            if (!member.isResource()) {
+                throw new TextIndexException("parameter specification must be an anon resource : " + member);
+            }
+
+            specs.add(getParamSpec((Resource) member));
+
+            // tail of this cell: the next cell or rdf:nil
+            final Statement tail = cell.getProperty(RDF.rest);
+            if (tail == null) {
+                throw new TextIndexException("parameter list not terminated by rdf:nil");
+            }
+
+            final RDFNode next = tail.getObject();
+            if (!next.isResource()) {
+                throw new TextIndexException("parameter list node is not a resource : " + next);
+            }
+
+            cell = (Resource) next;
+        }
+
+        return specs;
+    }
+
+    /**
+     * Builds a <code>ParamSpec</code> from a single parameter description resource.
+     * <p>
+     * The resource is read for <code>text:paramName</code>, <code>text:paramType</code>
+     * and <code>text:paramValue</code>. Supported types and the constructor
+     * parameter class each one maps to:
+     * <ul>
+     * <li><code>string</code> - <code>String</code></li>
+     * <li><code>file</code> - <code>java.io.Reader</code> opened on the named file</li>
+     * <li><code>set</code> - <code>CharArraySet</code> built from a list of literals</li>
+     * <li><code>int</code> - <code>int</code></li>
+     * <li><code>boolean</code> - <code>boolean</code></li>
+     * </ul>
+     *
+     * @param node the parameter description
+     * @return the parameter spec; never <code>null</code>
+     * @throws TextIndexException if the type is missing, not a literal or unknown,
+     *         or if the value is missing or unusable for the declared type
+     */
+    private ParamSpec getParamSpec(Resource node) {
+        Statement nameStmt = node.getProperty(TextVocab.pParamName);
+        Statement typeStmt = node.getProperty(TextVocab.pParamType);
+        Statement valueStmt = node.getProperty(TextVocab.pParamValue);
+
+        String name = getStringValue(nameStmt);
+        String type = getStringValue(typeStmt);
+        String value = getStringValue(valueStmt);
+
+        // A switch on a null String throws NullPointerException; report a
+        // missing or non-literal text:paramType as a configuration error instead.
+        if (type == null) {
+            throw new TextIndexException("text:paramType must be a string literal for param: " + name + " in: " + node);
+        }
+
+        switch (type) {
+
+        // java.lang.String
+        case "string": {
+            if (value == null) {
+                throw new TextIndexException("Value for string param: " + name + " must not be empty!");
+            }
+
+            return new ParamSpec(name, value, String.class);
+        }
+
+        // java.io.Reader backed by a java.io.FileReader
+        case "file": {
+
+            if (value == null) {
+                throw new TextIndexException("Value for file param must exist and must contain a file name.");
+            }
+
+            try {
+                // The analyzer is responsible for closing the file
+                Reader fileReader = new java.io.FileReader(value);
+                return new ParamSpec(name, fileReader, Reader.class);
+
+            } catch (java.io.FileNotFoundException ex) {
+                throw new TextIndexException("File " + value + " for param " + name + " not found!");
+            }
+        }
+
+        // org.apache.lucene.analysis.CharArraySet
+        case "set": {
+            if (valueStmt == null) {
+                throw new TextIndexException("A set param spec must have a text:paramValue:" + node);
+            }
+
+            RDFNode valueNode = valueStmt.getObject();
+            if (!valueNode.isResource()) {
+                throw new TextIndexException("A set param spec text:paramValue must be a list of strings: " + valueNode);
+            }
+
+            List<String> values = toStrings((Resource) valueNode);
+
+            return new ParamSpec(name, new CharArraySet(values, false), CharArraySet.class);
+        }
+
+        // primitive int
+        case "int": {
+            // value is non-null only when the object is a literal, so the cast below is safe
+            if (value == null) {
+                throw new TextIndexException("Value for int param: " + name + " must not be empty!");
+            }
+
+            int n = ((Literal) valueStmt.getObject()).getInt();
+            return new ParamSpec(name, n, int.class);
+        }
+
+        // primitive boolean
+        case "boolean": {
+            // value is non-null only when the object is a literal, so the cast below is safe
+            if (value == null) {
+                throw new TextIndexException("Value for boolean param: " + name + " must not be empty!");
+            }
+
+            boolean b = ((Literal) valueStmt.getObject()).getBoolean();
+            return new ParamSpec(name, b, boolean.class);
+        }
+
+        default:
+            // Returning null here would only surface later as a NullPointerException
+            // when the caller dereferences the spec, so fail with the actual cause.
+            throw new TextIndexException("Unknown parameter type: " + type + " for param: " + name + " with value: " + value);
+        }
+    }
+
+    /**
+     * Returns the lexical form of the statement's object when that object is a literal.
+     *
+     * @param stmt the statement to read; may be <code>null</code>
+     * @return the lexical form, or <code>null</code> if <code>stmt</code> is
+     *         <code>null</code> or its object is not a literal
+     */
+    private String getStringValue(Statement stmt) {
+        String lexical = null;
+
+        if (stmt != null) {
+            final RDFNode object = stmt.getObject();
+            if (object.isLiteral()) {
+                lexical = ((Literal) object).getLexicalForm();
+            }
+        }
+
+        return lexical;
+    }
+
+    /**
+     * Collects the lexical forms of the literals in the RDF list starting at
+     * <code>list</code>, following <code>rdf:first</code>/<code>rdf:rest</code>.
+     *
+     * @param list head of the list; <code>rdf:nil</code> yields an empty result
+     * @return the lexical forms in list order
+     * @throws TextIndexException if a cell lacks <code>rdf:first</code> or
+     *         <code>rdf:rest</code>, a member is not a literal, or a rest node
+     *         is not a resource
+     */
+    private List<String> toStrings(Resource list) {
+        final List<String> items = new ArrayList<>();
+
+        Resource cell = list;
+        while (!(cell == null || cell.equals(RDF.nil))) {
+            // head of this cell: one set member
+            final Statement head = cell.getProperty(RDF.first);
+            if (head == null) {
+                throw new TextIndexException("param spec of type set not well formed");
+            }
+
+            final RDFNode member = head.getObject();
+            if (!member.isLiteral()) {
+                throw new TextIndexException("param spec of type set item is not a literal: " + member);
+            }
+
+            items.add(((Literal) member).getLexicalForm());
+
+            // tail of this cell: the next cell or rdf:nil
+            final Statement tail = cell.getProperty(RDF.rest);
+            if (tail == null) {
+                throw new TextIndexException("param spec of type set not terminated by rdf:nil");
+            }
+
+            final RDFNode next = tail.getObject();
+            if (!next.isResource()) {
+                throw new TextIndexException("param spec of type set rest is not a resource: " + next);
+            }
+
+            cell = (Resource) next;
+        }
+
+        return items;
+    }
+
+    /**
+     * Immutable triple of parameter <code>name</code>, <code>value</code> and the
+     * <code>Class</code> under which the value is offered to a constructor
+     * (or to any other method).
+     * <p>
+     * The class is carried separately from the value so that primitive
+     * parameter types such as <code>int.class</code> can be expressed even
+     * though the stored value is boxed.
+     */
+    private static final class ParamSpec {
+
+        private final String paramName;
+        private final Object paramValue;
+        private final Class<?> paramClass;
+
+        /** Creates a spec with an explicitly stated parameter class. */
+        public ParamSpec(String key, Object value, Class<?> clazz) {
+            paramName = key;
+            paramValue = value;
+            paramClass = clazz;
+        }
+
+        /** Creates a spec whose parameter class is the runtime class of <code>value</code>. */
+        @SuppressWarnings("unused")
+        public ParamSpec(String key, Object value) {
+            this(key, value, value.getClass());
+        }
+
+        /** @return the class used for constructor lookup */
+        public Class<?> getValueClass() {
+            return paramClass;
+        }
+
+        /** @return the argument value */
+        public Object getValue() {
+            return paramValue;
+        }
+
+        /** @return the parameter name */
+        @SuppressWarnings("unused")
+        public String getKey() {
+            return paramName;
+        }
+    }
}
http://git-wip-us.apache.org/repos/asf/jena/blob/27ea30b7/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java
index 636c6bc..45f5cee 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java
@@ -37,7 +37,7 @@ public class TextAssembler
Assembler.general.implementWith(TextVocab.lowerCaseKeywordAnalyzer, new LowerCaseKeywordAnalyzerAssembler()) ;
Assembler.general.implementWith(TextVocab.localizedAnalyzer, new LocalizedAnalyzerAssembler()) ;
Assembler.general.implementWith(TextVocab.configurableAnalyzer, new ConfigurableAnalyzerAssembler()) ;
- Assembler.general.implementWith(TextVocab.genericAnalyzer, new GenericAnalyzerAssembler()) ;
+ Assembler.general.implementWith(TextVocab.genericAnalyzer, new GenericAnalyzerAssembler()) ;
}
}
http://git-wip-us.apache.org/repos/asf/jena/blob/27ea30b7/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java
index bc49d10..cd1844d 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java
@@ -66,7 +66,6 @@ public class TextVocab
public static final Resource lowerCaseKeywordAnalyzer = Vocab.resource(NS, "LowerCaseKeywordAnalyzer");
public static final Resource localizedAnalyzer = Vocab.resource(NS, "LocalizedAnalyzer");
public static final Resource configurableAnalyzer = Vocab.resource(NS, "ConfigurableAnalyzer");
- public static final Resource genericAnalyzer = Vocab.resource(NS, "GenericAnalyzer");
// Tokenizers
public static final Resource standardTokenizer = Vocab.resource(NS, "StandardTokenizer");
@@ -87,5 +86,12 @@ public class TextVocab
public static final Property pReplicas = Vocab.property(NS, "replicas");
public static final Property pIndexName = Vocab.property(NS, "indexName");
+ //GenericAnalyzer
+ public static final Resource genericAnalyzer = Vocab.resource(NS, "GenericAnalyzer");
+ public static final Property pClass = Vocab.property(NS, "class");
+ public static final Property pParams = Vocab.property(NS, "params");
+ public static final Property pParamName = Vocab.property(NS, "paramName");
+ public static final Property pParamType = Vocab.property(NS, "paramType");
+ public static final Property pParamValue = Vocab.property(NS, "paramValue");
}