You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by co...@apache.org on 2018/03/21 14:44:35 UTC
[1/4] jena git commit: Merged JENA-1506-definedFilters
Repository: jena
Updated Branches:
refs/heads/master 50b46f0f0 -> 2e3d1fa27
Merged JENA-1506-definedFilters
Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/795b9eb7
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/795b9eb7
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/795b9eb7
Branch: refs/heads/master
Commit: 795b9eb7cb45999c1d884bcd84c83896a498ed87
Parents: 6e20282
Author: Chris Tomlinson <ct...@moonvine.org>
Authored: Thu Mar 15 14:12:53 2018 -0500
Committer: Chris Tomlinson <ct...@moonvine.org>
Committed: Thu Mar 15 14:12:53 2018 -0500
----------------------------------------------------------------------
.../text/analyzer/ConfigurableAnalyzer.java | 133 +++++--
.../ConfigurableAnalyzerAssembler.java | 4 +-
.../assembler/DefineAnalyzersAssembler.java | 3 +-
.../text/assembler/DefineFiltersAssembler.java | 103 ++++++
.../assembler/DefineTokenizersAssembler.java | 100 +++++
.../assembler/GenericAnalyzerAssembler.java | 228 +-----------
.../text/assembler/GenericFilterAssembler.java | 199 ++++++++++
.../assembler/GenericTokenizerAssembler.java | 198 ++++++++++
.../jena/query/text/assembler/Params.java | 362 +++++++++++++++++++
.../query/text/assembler/TextAssembler.java | 2 +
.../assembler/TextIndexLuceneAssembler.java | 52 +--
.../jena/query/text/assembler/TextVocab.java | 24 +-
.../org/apache/jena/query/text/TS_Text.java | 1 +
.../query/text/TestTextDefineAnalyzers.java | 182 ++++++++++
14 files changed, 1308 insertions(+), 283 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/main/java/org/apache/jena/query/text/analyzer/ConfigurableAnalyzer.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/analyzer/ConfigurableAnalyzer.java b/jena-text/src/main/java/org/apache/jena/query/text/analyzer/ConfigurableAnalyzer.java
index 2008445..8d54d2c 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/analyzer/ConfigurableAnalyzer.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/analyzer/ConfigurableAnalyzer.java
@@ -18,9 +18,16 @@
package org.apache.jena.query.text.analyzer ;
+import java.lang.reflect.Constructor;
+import java.util.Hashtable;
import java.util.List ;
+import java.lang.reflect.InvocationTargetException;
+import org.apache.jena.atlas.logging.Log;
import org.apache.jena.query.text.TextIndexException;
+import org.apache.jena.query.text.assembler.GenericFilterAssembler.FilterSpec;
+import org.apache.jena.query.text.assembler.GenericTokenizerAssembler.TokenizerSpec;
+import org.apache.jena.query.text.assembler.TextVocab;
import org.apache.lucene.analysis.Analyzer ;
import org.apache.lucene.analysis.TokenFilter ;
import org.apache.lucene.analysis.Tokenizer ;
@@ -43,32 +50,114 @@ public class ConfigurableAnalyzer extends Analyzer {
private final String tokenizer;
private final List<String> filters;
+ // Registries of the filter and tokenizer specs that can be looked up by id.
+ // The built-in entries registered below are keyed by TextVocab.NS + the simple class name.
+ private static Hashtable<String, FilterSpec> filterSpecs = new Hashtable<>();
+ private static Hashtable<String, TokenizerSpec> tokenizerSpecs = new Hashtable<>();
+
+ static{
+ // the built-in tokenizers are constructed with no arguments
+ Class<?>[] paramClasses = new Class<?>[0];
+ Object[] paramValues = new Object[0];
+
+ tokenizerSpecs.put(TextVocab.NS+"KeywordTokenizer", new TokenizerSpec(KeywordTokenizer.class, paramClasses, paramValues));
+ tokenizerSpecs.put(TextVocab.NS+"LetterTokenizer", new TokenizerSpec(LetterTokenizer.class, paramClasses, paramValues));
+ tokenizerSpecs.put(TextVocab.NS+"StandardTokenizer", new TokenizerSpec(StandardTokenizer.class, paramClasses, paramValues));
+ tokenizerSpecs.put(TextVocab.NS+"WhitespaceTokenizer", new TokenizerSpec(WhitespaceTokenizer.class, paramClasses, paramValues));
+
+ // the built-in filters take a single TokenStream argument; the null value is only a
+ // placeholder, getTokenFilter supplies the actual source stream for that slot.
+ // Note that the three specs below are all given the same paramValues array instance.
+ paramClasses = new Class<?>[] {TokenStream.class};
+ paramValues = new Object[]{ null };
+
+ filterSpecs.put(TextVocab.NS+"ASCIIFoldingFilter", new FilterSpec(ASCIIFoldingFilter.class, paramClasses, paramValues));
+ filterSpecs.put(TextVocab.NS+"LowerCaseFilter", new FilterSpec(LowerCaseFilter.class, paramClasses, paramValues));
+ filterSpecs.put(TextVocab.NS+"StandardFilter", new FilterSpec(StandardFilter.class, paramClasses, paramValues));
+ }
+
+ /**
+ * Registers a filter spec under <code>id</code> in the static filter registry,
+ * replacing any spec previously stored under the same id.
+ *
+ * @param id The key the spec is looked up by; the built-in filters use TextVocab.NS + class name
+ * @param spec The filter class and constructor parameters to register
+ */
+ public static void defineFilter(String id, FilterSpec spec) {
+ filterSpecs.put(id, spec);
+ }
+
+ /**
+ * Registers a tokenizer spec under <code>id</code> in the static tokenizer registry,
+ * replacing any spec previously stored under the same id.
+ *
+ * @param id The key the spec is looked up by; the built-in tokenizers use TextVocab.NS + class name
+ * @param spec The tokenizer class and constructor parameters to register
+ */
+ public static void defineTokenizer(String id, TokenizerSpec spec) {
+ tokenizerSpecs.put(id, spec);
+ }
+
+ /**
+ * Create instance of a Lucene Tokenizer, <code>clazz</code>, with provided parameters.
+ * The constructor is looked up reflectively from <code>paramClasses</code>.
+ *
+ * @param clazz The tokenizer class
+ * @param paramClasses The parameter classes of the constructor to use
+ * @param paramValues The parameter values passed to that constructor
+ * @return The lucene tokenizer, or <code>null</code> if the constructor could not be
+ * found or invoked (the failure is logged, not thrown)
+ */
+ private Tokenizer newTokenizer(Class<?> clazz, Class<?>[] paramClasses, Object[] paramValues) {
+
+ String className = clazz.getName();
+
+ try {
+ final Constructor<?> cstr = clazz.getDeclaredConstructor(paramClasses);
+
+ return (Tokenizer) cstr.newInstance(paramValues);
+
+ } catch (IllegalArgumentException | IllegalAccessException | InstantiationException | InvocationTargetException | SecurityException e) {
+ Log.error(this, "Exception while instantiating tokenizer class " + className + ". " + e.getMessage(), e);
+ } catch (NoSuchMethodException ex) {
+ Log.error(this, "Could not find matching tokenizer class constructor for " + className + " " + ex.getMessage(), ex);
+ }
+
+ // reached only after one of the failures above has been logged
+ return null;
+ }
+
+ /**
+ * Create instance of a Lucene TokenFilter, <code>clazz</code>, with provided parameters.
+ * The constructor is looked up reflectively from <code>paramClasses</code>.
+ *
+ * @param clazz The filter class
+ * @param paramClasses The parameter classes of the constructor to use
+ * @param paramValues The parameter values passed to that constructor
+ * @return The lucene filter, or <code>null</code> if the constructor could not be
+ * found or invoked (the failure is logged, not thrown)
+ */
+ private TokenFilter newFilter(Class<?> clazz, Class<?>[] paramClasses, Object[] paramValues) {
+
+ String className = clazz.getName();
+
+ try {
+ final Constructor<?> cstr = clazz.getDeclaredConstructor(paramClasses);
+
+ return (TokenFilter) cstr.newInstance(paramValues);
+
+ } catch (IllegalArgumentException | IllegalAccessException | InstantiationException | InvocationTargetException | SecurityException e) {
+ // the messages name the filter, not an analyzer, so a bad filter spec is easy to trace
+ Log.error(this, "Exception while instantiating filter class " + className + ". " + e.getMessage(), e);
+ } catch (NoSuchMethodException ex) {
+ Log.error(this, "Could not find matching filter class constructor for " + className + " " + ex.getMessage(), ex);
+ }
+
+ // reached only after one of the failures above has been logged
+ return null;
+ }
+
+ /**
+ * Creates the tokenizer registered under <code>tokenizerName</code>.
+ *
+ * @param tokenizerName The id the tokenizer spec was registered under
+ * @return The tokenizer built by newTokenizer, which is <code>null</code> if instantiation failed
+ * @throws TextIndexException if no spec is registered under <code>tokenizerName</code>
+ */
private Tokenizer getTokenizer(String tokenizerName) {
- switch(tokenizerName) {
- case "KeywordTokenizer":
- return new KeywordTokenizer();
- case "LetterTokenizer":
- return new LetterTokenizer();
- case "StandardTokenizer":
- return new StandardTokenizer();
- case "WhitespaceTokenizer":
- return new WhitespaceTokenizer();
- default:
- throw new TextIndexException("Unknown tokenizer : " + tokenizerName);
- }
+ TokenizerSpec spec = tokenizerSpecs.get(tokenizerName);
+ if (spec == null) {
+ throw new TextIndexException("Unknown tokenizer : " + tokenizerName);
+ }
+
+ Class<?> clazz = spec.clazz;
+ Class<?>[] paramClasses = spec.paramClasses;
+ Object[] paramValues = spec.paramValues;
+
+ return newTokenizer(clazz, paramClasses, paramValues);
}
private TokenFilter getTokenFilter(String filterName, TokenStream source) {
- switch(filterName) {
- case "ASCIIFoldingFilter":
- return new ASCIIFoldingFilter(source);
- case "LowerCaseFilter":
- return new LowerCaseFilter(source);
- case "StandardFilter":
- return new StandardFilter(source);
- default:
- throw new TextIndexException("Unknown filter : " + filterName);
- }
+ FilterSpec spec = filterSpecs.get(filterName);
+
+ if (spec == null) {
+ throw new TextIndexException("Unknown filter : " + filterName);
+ }
+
+ Class<?> clazz = spec.clazz;
+ Class<?>[] paramClasses = spec.paramClasses;
+ Object[] paramValues = spec.paramValues;
+
+ // the source should always be the first parameter
+ paramValues[0] = source;
+
+ return newFilter(clazz, paramClasses, paramValues);
}
public ConfigurableAnalyzer(String tokenizer, List<String> filters) {
http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/main/java/org/apache/jena/query/text/assembler/ConfigurableAnalyzerAssembler.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/ConfigurableAnalyzerAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/ConfigurableAnalyzerAssembler.java
index 5ec96eb..bf38508 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/ConfigurableAnalyzerAssembler.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/ConfigurableAnalyzerAssembler.java
@@ -55,7 +55,7 @@ public class ConfigurableAnalyzerAssembler extends AssemblerBase {
public Analyzer open(Assembler a, Resource root, Mode mode) {
if (root.hasProperty(TextVocab.pTokenizer)) {
Resource tokenizerResource = root.getPropertyResourceValue(TextVocab.pTokenizer);
- String tokenizer = tokenizerResource.getLocalName();
+ String tokenizer = tokenizerResource.getURI();
List<String> filters;
if (root.hasProperty(TextVocab.pFilters)) {
Resource filtersResource = root.getPropertyResourceValue(TextVocab.pFilters);
@@ -82,7 +82,7 @@ public class ConfigurableAnalyzerAssembler extends AssemblerBase {
throw new TextIndexException("filter is not a resource : " + node);
}
- result.add(node.asResource().getLocalName());
+ result.add(node.asResource().getURI());
stmt = current.getProperty(RDF.rest);
if (stmt == null) {
throw new TextIndexException("filter list not terminated by rdf:nil");
http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineAnalyzersAssembler.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineAnalyzersAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineAnalyzersAssembler.java
index 11270e2..6326128 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineAnalyzersAssembler.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineAnalyzersAssembler.java
@@ -64,6 +64,7 @@ public class DefineAnalyzersAssembler {
throw new TextIndexException("addAnalyzers text:analyzer must be an analyzer spec resource: " + analyzerNode);
}
+ // calls GenericAnalyzerAssembler
Analyzer analyzer = (Analyzer) a.open((Resource) analyzerNode);
if (adding.hasProperty(TextVocab.pAddLang)) {
@@ -83,8 +84,6 @@ public class DefineAnalyzersAssembler {
throw new TextIndexException("addAnalyzers text:defineAnalyzer property must be a non-blank resource: " + adding);
}
}
- } else {
- throw new TextIndexException("text:analyzer property is required when adding an analyzer: " + adding);
}
Statement restStmt = current.getProperty(RDF.rest);
http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineFiltersAssembler.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineFiltersAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineFiltersAssembler.java
new file mode 100644
index 0000000..b7d1c63
--- /dev/null
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineFiltersAssembler.java
@@ -0,0 +1,103 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.query.text.assembler;
+
+import org.apache.jena.assembler.Assembler;
+import org.apache.jena.atlas.logging.Log;
+import org.apache.jena.query.text.TextIndexException;
+import org.apache.jena.query.text.analyzer.ConfigurableAnalyzer;
+import org.apache.jena.rdf.model.RDFNode;
+import org.apache.jena.rdf.model.Resource;
+import org.apache.jena.rdf.model.Statement;
+import org.apache.jena.vocabulary.RDF;
+import org.apache.jena.query.text.assembler.GenericFilterAssembler.FilterSpec;
+
+public class DefineFiltersAssembler {
+ /*
+ <#indexLucene> a text:TextIndexLucene ;
+ text:directory <file:Lucene> ;
+ text:entityMap <#entMap> ;
+ text:defineAnalyzers (
+ [text:addLang "sa-x-iast" ;
+ text:analyzer [ . . . ]]
+ [text:defineAnalyzer <#foo> ;
+ text:analyzer [ . . . ]]
+ [text:defineFilter <#bar> ;
+ text:filter [ . . . ]]
+ [text:defineTokenizer <#baz> ;
+ text:tokenizer [ . . . ]]
+ )
+ */
+
+ public static boolean open(Assembler a, Resource list) {
+ Resource current = list;
+
+ while (current != null && ! current.equals(RDF.nil)){
+ Statement firstStmt = current.getProperty(RDF.first);
+ if (firstStmt == null) {
+ throw new TextIndexException("parameter list not well formed: " + current);
+ }
+
+ RDFNode first = firstStmt.getObject();
+ if (! first.isResource()) {
+ throw new TextIndexException("parameter specification must be an anon resource : " + first);
+ }
+
+ // process the current list element to add an analyzer
+ Resource adding = (Resource) first;
+ if (adding.hasProperty(TextVocab.pFilter)) {
+ Statement filterStmt = adding.getProperty(TextVocab.pFilter);
+ RDFNode filterNode = filterStmt.getObject();
+ if (!filterNode.isResource()) {
+ throw new TextIndexException("addFilters text:filter must be a filter spec resource: " + filterNode);
+ }
+
+ // calls GenericFilterAssembler
+ FilterSpec filterSpec = (FilterSpec) a.open((Resource) filterNode);
+
+ if (adding.hasProperty(TextVocab.pDefFilter)) {
+ Statement defStmt = adding.getProperty(TextVocab.pDefFilter);
+ Resource id = defStmt.getResource();
+
+ if (id.getURI() != null) {
+ ConfigurableAnalyzer.defineFilter(id.getURI(), filterSpec);
+ } else {
+ throw new TextIndexException("text:defineFilters text:defineAnalyzer property must be a non-blank resource: " + adding);
+ }
+ } else {
+ Log.warn("DefineFiltersAssembler", "Filter specified but no text:defineFilter so filter is not accessible!");
+ }
+ }
+
+ Statement restStmt = current.getProperty(RDF.rest);
+ if (restStmt == null) {
+ throw new TextIndexException("parameter list not terminated by rdf:nil");
+ }
+
+ RDFNode rest = restStmt.getObject();
+ if (! rest.isResource()) {
+ throw new TextIndexException("parameter list node is not a resource : " + rest);
+ }
+
+ current = (Resource) rest;
+ }
+
+ return true;
+ }
+}
http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineTokenizersAssembler.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineTokenizersAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineTokenizersAssembler.java
new file mode 100644
index 0000000..504e975
--- /dev/null
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineTokenizersAssembler.java
@@ -0,0 +1,100 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.query.text.assembler;
+
+import org.apache.jena.assembler.Assembler;
+import org.apache.jena.query.text.TextIndexException;
+import org.apache.jena.query.text.analyzer.ConfigurableAnalyzer;
+import org.apache.jena.query.text.assembler.GenericTokenizerAssembler.TokenizerSpec;
+import org.apache.jena.rdf.model.RDFNode;
+import org.apache.jena.rdf.model.Resource;
+import org.apache.jena.rdf.model.Statement;
+import org.apache.jena.vocabulary.RDF;
+
+/**
+ * Processes the elements of a <code>text:defineAnalyzers</code> list that carry a
+ * tokenizer specification and registers each resulting tokenizer spec with
+ * ConfigurableAnalyzer.defineTokenizer.
+ */
+public class DefineTokenizersAssembler {
+ /*
+ <#indexLucene> a text:TextIndexLucene ;
+ text:directory <file:Lucene> ;
+ text:entityMap <#entMap> ;
+ text:defineAnalyzers (
+ [text:addLang "sa-x-iast" ;
+ text:analyzer [ . . . ]]
+ [text:defineAnalyzer <#foo> ;
+ text:analyzer [ . . . ]]
+ [text:defineFilter <#bar> ;
+ text:filter [ . . . ]]
+ [text:defineTokenizer <#baz> ;
+ text:tokenizer [ . . . ]]
+ )
+ */
+
+ /**
+ * Walks the RDF list starting at <code>list</code>. For every element that has the
+ * TextVocab.pTokenizer property, the tokenizer resource is opened with <code>a</code>
+ * and, when the element also has TextVocab.pDefTokenizer, the resulting spec is
+ * registered under the URI of that resource. Elements without TextVocab.pTokenizer
+ * are skipped; an element with a tokenizer but no TextVocab.pDefTokenizer is opened
+ * but not registered.
+ *
+ * @param a The assembler used to open each tokenizer spec resource
+ * @param list The head of the RDF list to process
+ * @return always <code>false</code>: the local flag returned here is never reassigned
+ * @throws TextIndexException if the list is not well formed, a tokenizer value is not a
+ * resource, or the TextVocab.pDefTokenizer resource has no URI
+ */
+ public static boolean open(Assembler a, Resource list) {
+ Resource current = list;
+ boolean isMultilingualSupport = false;
+
+ while (current != null && ! current.equals(RDF.nil)){
+ Statement firstStmt = current.getProperty(RDF.first);
+ if (firstStmt == null) {
+ throw new TextIndexException("parameter list not well formed: " + current);
+ }
+
+ RDFNode first = firstStmt.getObject();
+ if (! first.isResource()) {
+ throw new TextIndexException("parameter specification must be an anon resource : " + first);
+ }
+
+ // process the current list element to define a tokenizer
+ Resource adding = (Resource) first;
+ if (adding.hasProperty(TextVocab.pTokenizer)) {
+ Statement tokenizerStmt = adding.getProperty(TextVocab.pTokenizer);
+ RDFNode tokenizerNode = tokenizerStmt.getObject();
+ if (!tokenizerNode.isResource()) {
+ throw new TextIndexException("addTokenizers text:tokenizer must be an tokenizer spec resource: " + tokenizerNode);
+ }
+
+ // NOTE(review): presumably dispatched to GenericTokenizerAssembler - confirm against the assembler registration
+ TokenizerSpec spec = (TokenizerSpec) a.open((Resource) tokenizerNode);
+
+ if (adding.hasProperty(TextVocab.pDefTokenizer)) {
+ Statement defStmt = adding.getProperty(TextVocab.pDefTokenizer);
+ Resource id = defStmt.getResource();
+
+ // the URI is the registry key, so a blank node cannot identify a tokenizer
+ if (id.getURI() != null) {
+ ConfigurableAnalyzer.defineTokenizer(id.getURI(), spec);
+ } else {
+ throw new TextIndexException("addTokenizers text:defineTokenizer property must be a non-blank resource: " + adding);
+ }
+ }
+ }
+
+ Statement restStmt = current.getProperty(RDF.rest);
+ if (restStmt == null) {
+ throw new TextIndexException("parameter list not terminated by rdf:nil");
+ }
+
+ RDFNode rest = restStmt.getObject();
+ if (! rest.isResource()) {
+ throw new TextIndexException("parameter list node is not a resource : " + rest);
+ }
+
+ current = (Resource) rest;
+ }
+
+ return isMultilingualSupport;
+ }
+}
http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericAnalyzerAssembler.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericAnalyzerAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericAnalyzerAssembler.java
index 21f4be1..3cf2004 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericAnalyzerAssembler.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericAnalyzerAssembler.java
@@ -18,10 +18,8 @@
package org.apache.jena.query.text.assembler;
-import java.io.Reader;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
-import java.util.ArrayList;
import java.util.List;
import org.apache.jena.assembler.Assembler;
@@ -29,13 +27,9 @@ import org.apache.jena.assembler.Mode;
import org.apache.jena.assembler.assemblers.AssemblerBase;
import org.apache.jena.atlas.logging.Log ;
import org.apache.jena.query.text.TextIndexException;
-import org.apache.jena.rdf.model.Literal;
import org.apache.jena.rdf.model.RDFNode;
import org.apache.jena.rdf.model.Resource;
-import org.apache.jena.rdf.model.Statement;
-import org.apache.jena.vocabulary.RDF;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CharArraySet;
/**
* Creates generic analyzers given a fully qualified Class name and a list
@@ -142,13 +136,6 @@ public class GenericAnalyzerAssembler extends AssemblerBase {
] .
*/
- public static final String TYPE_ANALYZER = "TypeAnalyzer";
- public static final String TYPE_BOOL = "TypeBoolean";
- public static final String TYPE_FILE = "TypeFile";
- public static final String TYPE_INT = "TypeInt";
- public static final String TYPE_SET = "TypeSet";
- public static final String TYPE_STRING = "TypeString";
-
@Override
public Analyzer open(Assembler a, Resource root, Mode mode) {
if (root.hasProperty(TextVocab.pClass)) {
@@ -176,13 +163,13 @@ public class GenericAnalyzerAssembler extends AssemblerBase {
throw new TextIndexException("text:params must be a list of parameter resources: " + node);
}
- List<ParamSpec> specs = getParamSpecs((Resource) node);
+ List<Params.ParamSpec> specs = Params.getParamSpecs((Resource) node);
// split the param specs into classes and values for constructor lookup
final Class<?> paramClasses[] = new Class<?>[specs.size()];
final Object paramValues[] = new Object[specs.size()];
for (int i = 0; i < specs.size(); i++) {
- ParamSpec spec = specs.get(i);
+ Params.ParamSpec spec = specs.get(i);
paramClasses[i] = spec.getValueClass();
paramValues[i] = spec.getValue();
}
@@ -224,215 +211,4 @@ public class GenericAnalyzerAssembler extends AssemblerBase {
return null;
}
-
- private List<ParamSpec> getParamSpecs(Resource list) {
- List<ParamSpec> result = new ArrayList<>();
- Resource current = list;
-
- while (current != null && ! current.equals(RDF.nil)){
- Statement firstStmt = current.getProperty(RDF.first);
- if (firstStmt == null) {
- throw new TextIndexException("parameter list not well formed: " + current);
- }
-
- RDFNode first = firstStmt.getObject();
- if (! first.isResource()) {
- throw new TextIndexException("parameter specification must be an anon resource : " + first);
- }
-
- result.add(getParamSpec((Resource) first));
-
- Statement restStmt = current.getProperty(RDF.rest);
- if (restStmt == null) {
- throw new TextIndexException("parameter list not terminated by rdf:nil");
- }
-
- RDFNode rest = restStmt.getObject();
- if (! rest.isResource()) {
- throw new TextIndexException("parameter list node is not a resource : " + rest);
- }
-
- current = (Resource) rest;
- }
-
- return result;
- }
-
- private ParamSpec getParamSpec(Resource node) {
- Statement nameStmt = node.getProperty(TextVocab.pParamName);
- Statement typeStmt = node.getProperty(TextVocab.pParamType);
- Statement valueStmt = node.getProperty(TextVocab.pParamValue);
-
- if (typeStmt == null) {
- throw new TextIndexException("Parameter specification must have a text:paramType: " + node);
- }
- Resource typeRes = typeStmt.getResource();
- String type = typeRes.getLocalName();
-
- String name = getStringValue(nameStmt);
- String value = getStringValue(valueStmt);
-
- switch (type) {
-
- // String
- case TYPE_STRING: {
- if (value == null) {
- throw new TextIndexException("Value for string param: " + name + " must not be empty!");
- }
-
- return new ParamSpec(name, value, String.class);
- }
-
- // java.io.FileReader
- case TYPE_FILE: {
-
- if (value == null) {
- throw new TextIndexException("Value for file param must exist and must contain a file name.");
- }
-
- try {
- // The analyzer is responsible for closing the file
- Reader fileReader = new java.io.FileReader(value);
- return new ParamSpec(name, fileReader, Reader.class);
-
- } catch (java.io.FileNotFoundException ex) {
- throw new TextIndexException("File " + value + " for param " + name + " not found!");
- }
- }
-
- // org.apache.lucene.analysis.util.CharArraySet
- case TYPE_SET: {
- if (valueStmt == null) {
- throw new TextIndexException("A set param spec must have a text:paramValue:" + node);
- }
-
- RDFNode valueNode = valueStmt.getObject();
- if (!valueNode.isResource()) {
- throw new TextIndexException("A set param spec text:paramValue must be a list of strings: " + valueNode);
- }
-
- List<String> values = toStrings((Resource) valueNode);
-
- return new ParamSpec(name, new CharArraySet(values, false), CharArraySet.class);
- }
-
- // int
- case TYPE_INT:
- if (value == null) {
- throw new TextIndexException("Value for int param: " + name + " must not be empty!");
- }
-
- int n = ((Literal) valueStmt.getObject()).getInt();
- return new ParamSpec(name, n, int.class);
-
- // boolean
- case TYPE_BOOL:
- if (value == null) {
- throw new TextIndexException("Value for boolean param: " + name + " must not be empty!");
- }
-
- boolean b = ((Literal) valueStmt.getObject()).getBoolean();
- return new ParamSpec(name, b, boolean.class);
-
- // org.apache.lucene.analysis.Analyzer
- case TYPE_ANALYZER:
- if (valueStmt == null) {
- throw new TextIndexException("Analyzer param spec must have a text:paramValue:" + node);
- }
-
- RDFNode valueNode = valueStmt.getObject();
- if (!valueNode.isResource()) {
- throw new TextIndexException("Analyzer param spec text:paramValue must be an analyzer spec resource: " + valueNode);
- }
-
- Analyzer analyzer = (Analyzer) Assembler.general.open((Resource) valueNode);
- return new ParamSpec(name, analyzer, Analyzer.class);
-
- default:
- // there was no match
- Log.error(this, "Unknown parameter type: " + type + " for param: " + name + " with value: " + value);
- break;
- }
-
- return null;
- }
-
- private String getStringValue(Statement stmt) {
- if (stmt == null) {
- return null;
- } else {
- RDFNode node = stmt.getObject();
- if (node.isLiteral()) {
- return ((Literal) node).getLexicalForm();
- } else {
- return null;
- }
- }
- }
-
- private List<String> toStrings(Resource list) {
- List<String> result = new ArrayList<>();
- Resource current = list;
-
- while (current != null && ! current.equals(RDF.nil)){
- Statement firstStmt = current.getProperty(RDF.first);
- if (firstStmt == null) {
- throw new TextIndexException("param spec of type set not well formed");
- }
-
- RDFNode first = firstStmt.getObject();
- if (! first.isLiteral()) {
- throw new TextIndexException("param spec of type set item is not a literal: " + first);
- }
-
- result.add(((Literal)first).getLexicalForm());
-
- Statement restStmt = current.getProperty(RDF.rest);
- if (restStmt == null) {
- throw new TextIndexException("param spec of type set not terminated by rdf:nil");
- }
-
- RDFNode rest = restStmt.getObject();
- if (! rest.isResource()) {
- throw new TextIndexException("param spec of type set rest is not a resource: " + rest);
- }
-
- current = (Resource) rest;
- }
-
- return result;
- }
-
- /**
- * <code>ParamSpec</code> contains the <code>name</code>, <code>Class</code>, and
- * <code>value</code> of a parameter for a constructor (or really any method in general)
- */
- private static final class ParamSpec {
-
- private final String name;
- private final Object value;
- private final Class<?> clazz;
-
- public ParamSpec(String key, Object value) {
- this(key, value, value.getClass());
- }
-
- public ParamSpec(String key, Object value, Class<?> clazz) {
- this.name = key;
- this.value = value;
- this.clazz = clazz;
- }
-
- public String getKey() {
- return name;
- }
-
- public Object getValue() {
- return value;
- }
-
- public Class<?> getValueClass() {
- return clazz;
- }
- }
}
http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericFilterAssembler.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericFilterAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericFilterAssembler.java
new file mode 100644
index 0000000..245f3f9
--- /dev/null
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericFilterAssembler.java
@@ -0,0 +1,199 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.query.text.assembler;
+
+import java.util.List;
+
+import org.apache.jena.assembler.Assembler;
+import org.apache.jena.assembler.Mode;
+import org.apache.jena.assembler.assemblers.AssemblerBase;
+import org.apache.jena.atlas.logging.Log ;
+import org.apache.jena.query.text.TextIndexException;
+import org.apache.jena.query.text.assembler.Params.ParamSpec;
+import org.apache.jena.rdf.model.RDFNode;
+import org.apache.jena.rdf.model.Resource;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Creates generic filters given a fully qualified Class name and a list
+ * of parameters for a constructor of the Class.
+ * <p>
+ * The parameters may be of the following types:
+ * <pre>
+ * text:TypeString String
+ * text:TypeSet org.apache.lucene.analysis.CharArraySet
+ * text:TypeFile java.io.FileReader
+ * text:TypeInt int
+ * text:TypeBoolean boolean
+ * text:TypeTokenStream TokenStream
+ * text:TypeAnalyzer org.apache.lucene.analysis.Analyzer
+ * </pre>
+ *
+ * Although the list of types is not exhaustive it is a simple matter
+ * to create a wrapper Analyzer that reads a file with information that can
+ * be used to initialize any sort of parameters that may be needed for
+ * a given Analyzer. The provided types cover the vast majority of cases.
+ * <p>
+ * For example, <code>org.apache.lucene.analysis.ja.JapaneseAnalyzer</code>
+ * has a constructor with 4 parameters: a <code>UserDict</code>,
+ * a <code>CharArraySet</code>, a <code>JapaneseTokenizer.Mode</code>, and a
+ * <code>Set&lt;String&gt;</code>. So a simple wrapper can extract the values
+ * needed for the various parameters with types not available in this
+ * extension, construct the required instances, and instantiate the
+ * <code>JapaneseAnalyzer</code>.
+ * <p>
+ * Adding custom Analyzers such as the above wrapper analyzer is a simple
+ * matter of adding the Analyzer class and any associated filters and tokenizer
+ * and so on to the classpath for Jena - usually in a jar. Of course, all of
+ * the Analyzers that are included in the Lucene distribution bundled with Jena
+ * are available as generic Analyzers as well.
+ * <p>
+ * Each parameter object is specified with:
+ * <ul>
+ * <li>an optional <code>text:paramName</code> that may be used to document which
+ * parameter is represented</li>
+ * <li>a <code>text:paramType</code> which is one of: <code>text:TypeString</code>,
+ * <code>text:TypeSet</code>, <code>text:TypeFile</code>, <code>text:TypeInt</code>,
+ * <code>text:TypeBoolean</code>, <code>text:TypeAnalyzer</code>.</li>
+ * <li>a text:paramValue which is an xsd:string, xsd:boolean or xsd:int or resource.</li>
+ * </ul>
+ * <p>
+ * A parameter of type <code>text:TypeSet</code> <i>must have</i> a list of zero or
+ * more <code>String</code>s.
+ * <p>
+ * A parameter of type <code>text:TypeString</code>, <code>text:TypeFile</code>,
+ * <code>text:TypeBoolean</code>, <code>text:TypeInt</code> or <code>text:TypeAnalyzer</code>
+ * <i>must have</i> a single <code>text:paramValue</code> of the appropriate type.
+ * <p>
+ * Examples:
+ * <pre>
+ <#indexLucene> a text:TextIndexLucene ;
+ text:directory <file:Lucene> ;
+ text:entityMap <#entMap> ;
+ text:defineAnalyzers (
+ [text:addLang "sa-x-iast" ;
+ text:analyzer [ . . . ]]
+ [text:defineAnalyzer <#foo> ;
+ text:analyzer [ . . . ]]
+ [text:defineFilter <#bar> ;
+ text:filter [
+ a text:GenericFilter ;
+ text:class "org.apache.jena.query.text.filter.SelectiveFoldingFilter" ;
+ text:params (
+ [ text:paramName "whitelisted" ;
+ text:paramType text:TypeSet ;
+ text:paramValue ("ç") ]
+ )
+ ]
+ ]
+ )
+ * </pre>
+ */
+public class GenericFilterAssembler extends AssemblerBase {
+ /*
+ <#indexLucene> a text:TextIndexLucene ;
+ text:directory <file:Lucene> ;
+ text:entityMap <#entMap> ;
+ text:defineAnalyzers (
+ [text:addLang "sa-x-iast" ;
+ text:analyzer [ . . . ]]
+ [text:defineAnalyzer <#foo> ;
+ text:analyzer [ . . . ]]
+ [text:defineFilter <#bar> ;
+ text:filter [
+ a text:GenericFilter ;
+ text:class "org.apache.jena.query.text.filter.SelectiveFoldingFilter" ;
+ text:params (
+ [ text:paramName "whitelisted" ;
+ text:paramType text:TypeSet ;
+ text:paramValue ("ç") ]
+ )
+ ]
+ ]
+ )
+ */
+
+ @Override
+ public FilterSpec open(Assembler a, Resource root, Mode mode) {
+ if (root.hasProperty(TextVocab.pClass)) {
+ // text:class is expected to be a string literal
+ String className = root.getProperty(TextVocab.pClass).getString();
+
+ // is the class accessible?
+ Class<?> clazz = null;
+ try {
+ clazz = Class.forName(className);
+ } catch (ClassNotFoundException e) {
+ Log.error(this, "Filter class " + className + " not found. " + e.getMessage(), e);
+ return null;
+ }
+
+ // Is the class a TokenFilter?
+ if (!TokenFilter.class.isAssignableFrom(clazz)) {
+ Log.error(this, clazz.getName() + " has to be a subclass of " + TokenFilter.class.getName());
+ return null;
+ }
+
+ if (root.hasProperty(TextVocab.pParams)) {
+ RDFNode node = root.getProperty(TextVocab.pParams).getObject();
+ if (! node.isResource()) {
+ throw new TextIndexException("text:params must be a list of parameter resources: " + node);
+ }
+
+ List<ParamSpec> specs = Params.getParamSpecs((Resource) node);
+
+ // split the param specs into classes and values for constructor lookup
+ // add an initial param for the TokenStream source. The source value is
+ // set to null and the actual value supplied in ConfigurableAnalyzer when
+ // used.
+ final Class<?> paramClasses[] = new Class<?>[specs.size()+1];
+ paramClasses[0] = TokenStream.class;
+ final Object paramValues[] = new Object[specs.size()+1];
+ paramValues[0] = null;
+ for (int i = 0; i < specs.size(); i++) {
+ ParamSpec spec = specs.get(i);
+ paramClasses[i+1] = spec.getValueClass();
+ paramValues[i+1] = spec.getValue();
+ }
+
+ // Create spec for new filter
+ return new FilterSpec(clazz, paramClasses, paramValues);
+
+ } else {
+ // use the TokenStream constructor for the new filter
+ return new FilterSpec(clazz, new Class<?>[] { TokenStream.class }, new Object[] { null });
+ }
+ } else {
+ throw new TextIndexException("text:class property is required by GenericFilter: " + root);
+ }
+ }
+
+ public static class FilterSpec {
+ public Class<?> clazz;
+ public Class<?>[] paramClasses;
+ public Object[] paramValues;
+
+ public FilterSpec(Class<?> clazz, Class<?>[] paramClasses, Object[] paramValues) {
+ this.clazz = clazz;
+ this.paramClasses = paramClasses;
+ this.paramValues = paramValues;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericTokenizerAssembler.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericTokenizerAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericTokenizerAssembler.java
new file mode 100644
index 0000000..2e2b39b
--- /dev/null
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericTokenizerAssembler.java
@@ -0,0 +1,198 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.query.text.assembler;
+
+import java.util.List;
+
+import org.apache.jena.assembler.Assembler;
+import org.apache.jena.assembler.Mode;
+import org.apache.jena.assembler.assemblers.AssemblerBase;
+import org.apache.jena.atlas.logging.Log ;
+import org.apache.jena.query.text.TextIndexException;
+import org.apache.jena.query.text.assembler.Params.ParamSpec;
+import org.apache.jena.rdf.model.RDFNode;
+import org.apache.jena.rdf.model.Resource;
+import org.apache.lucene.analysis.Tokenizer;
+
+/**
+ * Creates generic tokenizers given a fully qualified Class name and a list
+ * of parameters for a constructor of the Class.
+ * <p>
+ * The parameters may be of the following types:
+ * <pre>
+ * text:TypeString String
+ * text:TypeSet org.apache.lucene.analysis.CharArraySet
+ * text:TypeFile java.io.FileReader
+ * text:TypeInt int
+ * text:TypeBoolean boolean
+ * text:TypeAnalyzer org.apache.lucene.analysis.Analyzer
+ * </pre>
+ *
+ * Although the list of types is not exhaustive it is a simple matter
+ * to create a wrapper Analyzer that reads a file with information that can
+ * be used to initialize any sort of parameters that may be needed for
+ * a given Analyzer. The provided types cover the vast majority of cases.
+ * <p>
+ * For example, <code>org.apache.lucene.analysis.ja.JapaneseAnalyzer</code>
+ * has a constructor with 4 parameters: a <code>UserDict</code>,
+ * a <code>CharArraySet</code>, a <code>JapaneseTokenizer.Mode</code>, and a
+ * <code>Set&lt;String&gt;</code>. So a simple wrapper can extract the values
+ * needed for the various parameters with types not available in this
+ * extension, construct the required instances, and instantiate the
+ * <code>JapaneseAnalyzer</code>.
+ * <p>
+ * Adding custom Analyzers such as the above wrapper analyzer is a simple
+ * matter of adding the Analyzer class and any associated filters and tokenizer
+ * and so on to the classpath for Jena - usually in a jar. Of course, all of
+ * the Analyzers that are included in the Lucene distribution bundled with Jena
+ * are available as generic Analyzers as well.
+ * <p>
+ * Each parameter object is specified with:
+ * <ul>
+ * <li>an optional <code>text:paramName</code> that may be used to document which
+ * parameter is represented</li>
+ * <li>a <code>text:paramType</code> which is one of: <code>text:TypeString</code>,
+ * <code>text:TypeSet</code>, <code>text:TypeFile</code>, <code>text:TypeInt</code>,
+ * <code>text:TypeBoolean</code>, <code>text:TypeAnalyzer</code>.</li>
+ * <li>a text:paramValue which is an xsd:string, xsd:boolean or xsd:int or resource.</li>
+ * </ul>
+ * <p>
+ * A parameter of type <code>text:TypeSet</code> <i>must have</i> a list of zero or
+ * more <code>String</code>s.
+ * <p>
+ * A parameter of type <code>text:TypeString</code>, <code>text:TypeFile</code>,
+ * <code>text:TypeBoolean</code>, <code>text:TypeInt</code> or <code>text:TypeAnalyzer</code>
+ * <i>must have</i> a single <code>text:paramValue</code> of the appropriate type.
+ * <p>
+ * Examples:
+ * <pre>
+ <#indexLucene> a text:TextIndexLucene ;
+ text:directory <file:Lucene> ;
+ text:entityMap <#entMap> ;
+ text:defineAnalyzers (
+ [text:addLang "sa-x-iast" ;
+ text:analyzer [ . . . ]]
+ [text:defineAnalyzer <#foo> ;
+ text:analyzer [ . . . ]]
+ [text:defineTokenizer <#bar> ;
+ text:tokenizer [
+ a text:GenericTokenizer ;
+ text:class "org.apache.lucene.analysis.ngram.NGramTokenizer" ;
+ text:params (
+ [ text:paramName "minGram" ;
+ text:paramType text:TypeInt ;
+ text:paramValue 3 ]
+ [ text:paramName "maxGram" ;
+ text:paramType text:TypeInt ;
+ text:paramValue 7 ]
+ )
+ ]
+ ]
+ )
+ * </pre>
+ */
+public class GenericTokenizerAssembler extends AssemblerBase {
+ /*
+ <#indexLucene> a text:TextIndexLucene ;
+ text:directory <file:Lucene> ;
+ text:entityMap <#entMap> ;
+ text:defineAnalyzers (
+ [text:addLang "sa-x-iast" ;
+ text:analyzer [ . . . ]]
+ [text:defineAnalyzer <#foo> ;
+ text:analyzer [ . . . ]]
+ [text:defineTokenizer <#bar> ;
+ text:tokenizer [
+ a text:GenericTokenizer ;
+ text:class "org.apache.lucene.analysis.ngram.NGramTokenizer" ;
+ text:params (
+ [ text:paramName "minGram" ;
+ text:paramType text:TypeInt ;
+ text:paramValue 3 ]
+ [ text:paramName "maxGram" ;
+ text:paramType text:TypeInt ;
+ text:paramValue 7 ]
+ )
+ ]
+ ]
+ )
+ */
+
+ @Override
+ public TokenizerSpec open(Assembler a, Resource root, Mode mode) {
+ if (root.hasProperty(TextVocab.pClass)) {
+ // text:class is expected to be a string literal
+ String className = root.getProperty(TextVocab.pClass).getString();
+
+ // is the class accessible?
+ Class<?> clazz = null;
+ try {
+ clazz = Class.forName(className);
+ } catch (ClassNotFoundException e) {
+ Log.error(this, "Tokenizer class " + className + " not found. " + e.getMessage(), e);
+ return null;
+ }
+
+ // Is the class a Tokenizer?
+ if (!Tokenizer.class.isAssignableFrom(clazz)) {
+ Log.error(this, clazz.getName() + " has to be a subclass of " + Tokenizer.class.getName());
+ return null;
+ }
+
+ if (root.hasProperty(TextVocab.pParams)) {
+ RDFNode node = root.getProperty(TextVocab.pParams).getObject();
+ if (! node.isResource()) {
+ throw new TextIndexException("text:params must be a list of parameter resources: " + node);
+ }
+
+ List<ParamSpec> specs = Params.getParamSpecs((Resource) node);
+
+ // split the param specs into classes and values for constructor lookup
+ final Class<?> paramClasses[] = new Class<?>[specs.size()];
+ final Object paramValues[] = new Object[specs.size()];
+ for (int i = 0; i < specs.size(); i++) {
+ ParamSpec spec = specs.get(i);
+ paramClasses[i] = spec.getValueClass();
+ paramValues[i] = spec.getValue();
+ }
+
+ // Create spec for new tokenizer
+ return new TokenizerSpec(clazz, paramClasses, paramValues);
+
+ } else {
+ // use the nullary Tokenizer constructor
+ return new TokenizerSpec(clazz, new Class<?>[0], new Object[0]);
+ }
+ } else {
+ throw new TextIndexException("text:class property is required by GenericTokenizer: " + root);
+ }
+ }
+
+ public static class TokenizerSpec {
+ public Class<?> clazz;
+ public Class<?>[] paramClasses;
+ public Object[] paramValues;
+
+ public TokenizerSpec(Class<?> clazz, Class<?>[] paramClasses, Object[] paramValues) {
+ this.clazz = clazz;
+ this.paramClasses = paramClasses;
+ this.paramValues = paramValues;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/main/java/org/apache/jena/query/text/assembler/Params.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/Params.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/Params.java
new file mode 100644
index 0000000..7b0cd18
--- /dev/null
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/Params.java
@@ -0,0 +1,362 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.query.text.assembler;
+
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.jena.assembler.Assembler;
+import org.apache.jena.atlas.logging.Log ;
+import org.apache.jena.query.text.TextIndexException;
+import org.apache.jena.rdf.model.Literal;
+import org.apache.jena.rdf.model.RDFNode;
+import org.apache.jena.rdf.model.Resource;
+import org.apache.jena.rdf.model.Statement;
+import org.apache.jena.vocabulary.RDF;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+
+/**
+ * Parses assembler parameter definitions for <code>GenericAnalyzer</code>,
+ * <code>GenericFilter</code>, and <code>GenericTokenizer</code>.
+ * <p>
+ * The parameters may be of the following types:
+ * <pre>
+ * text:TypeString String
+ * text:TypeSet org.apache.lucene.analysis.CharArraySet
+ * text:TypeFile java.io.FileReader
+ * text:TypeInt int
+ * text:TypeBoolean boolean
+ * text:TypeAnalyzer org.apache.lucene.analysis.Analyzer
+ * text:TypeTokenStream org.apache.lucene.analysis.TokenStream
+ * </pre>
+ *
+ * Although the list of types is not exhaustive it is a simple matter
+ * to create a wrapper Analyzer, Filter, Tokenizer that reads a file with information
+ * that can be used to initialize any sort of parameters that may be needed.
+ * The provided types cover the vast majority of cases.
+ * <p>
+ * For example, <code>org.apache.lucene.analysis.ja.JapaneseAnalyzer</code>
+ * has a constructor with 4 parameters: a <code>UserDict</code>,
+ * a <code>CharArraySet</code>, a <code>JapaneseTokenizer.Mode</code>, and a
+ * <code>Set&lt;String&gt;</code>. So a simple wrapper can extract the values
+ * needed for the various parameters with types not available in this
+ * extension, construct the required instances, and instantiate the
+ * <code>JapaneseAnalyzer</code>.
+ * <p>
+ * Adding custom Analyzers, etc., such as the above wrapper analyzer is a simple
+ * matter of adding the Analyzer class and any associated filters and tokenizer
+ * and so on to the classpath for Jena - usually in a jar. Of course, all of
+ * the Analyzers, Filters, and Tokenizers that are included in the Lucene distribution
+ * bundled with Jena are available as generics as well.
+ * <p>
+ * Each parameter object is specified with:
+ * <ul>
+ * <li>an optional <code>text:paramName</code> that may be used to document which
+ * parameter is represented</li>
+ * <li>a <code>text:paramType</code> which is one of: <code>text:TypeString</code>,
+ * <code>text:TypeSet</code>, <code>text:TypeFile</code>, <code>text:TypeInt</code>,
+ * <code>text:TypeBoolean</code>, <code>text:TypeAnalyzer</code>.</li>
+ * <li>a text:paramValue which is an xsd:string, xsd:boolean or xsd:int or resource.</li>
+ * </ul>
+ * <p>
+ * A parameter of type <code>text:TypeSet</code> <i>must have</i> a list of zero or
+ * more <code>String</code>s.
+ * <p>
+ * A parameter of type <code>text:TypeString</code>, <code>text:TypeFile</code>,
+ * <code>text:TypeBoolean</code>, <code>text:TypeInt</code> or <code>text:TypeAnalyzer</code>
+ * <i>must have</i> a single <code>text:paramValue</code> of the appropriate type.
+ * <p>
+ * A parameter of type <code>text:TypeTokenStream</code> does not have <code>text:paramValue</code>.
+ * It is used to mark the occurrence of the <code>TokenStream</code> parameter for a <code>Filter</code>.
+ * <p>
+ * Examples:
+ * <pre>
+ text:map (
+ [ text:field "text" ;
+ text:predicate rdfs:label;
+ text:analyzer [
+ a text:GenericAnalyzer ;
+ text:class "org.apache.lucene.analysis.en.EnglishAnalyzer" ;
+ text:params (
+ [ text:paramName "stopwords" ;
+ text:paramType text:TypeSet ;
+ text:paramValue ("the" "a" "an") ]
+ [ text:paramName "stemExclusionSet" ;
+ text:paramType text:TypeSet ;
+ text:paramValue ("ing" "ed") ]
+ )
+ ] .
+ * </pre>
+ * <pre>
+ [] a text:TextIndexLucene ;
+ text:defineFilters (
+ text:filter [
+ a text:GenericFilter ;
+ text:class "fi.finto.FoldingFilter" ;
+ text:params (
+ [ text:paramName "source" ;
+ text:paramType text:TypeTokenStream ]
+ [ text:paramName "whitelisted" ;
+ text:paramType text:TypeSet ;
+ text:paramValue ("ç") ]
+ )
+ ]
+ )
+ * </pre>
+ */
+public class Params {
+ /*
+ text:map (
+ [ text:field "text" ;
+ text:predicate rdfs:label;
+ text:analyzer [
+ a text:GenericAnalyzer ;
+ text:class "org.apache.lucene.analysis.en.EnglishAnalyzer" ;
+ text:params (
+ [ text:paramName "stopwords" ;
+ text:paramType text:TypeSet ;
+ text:paramValue ("the" "a" "an") ]
+ [ text:paramName "stemExclusionSet" ;
+ text:paramType text:TypeSet ;
+ text:paramValue ("ing" "ed") ]
+ )
+ ] .
+ */
+
+ public static final String TYPE_ANALYZER = "TypeAnalyzer";
+ public static final String TYPE_BOOL = "TypeBoolean";
+ public static final String TYPE_FILE = "TypeFile";
+ public static final String TYPE_INT = "TypeInt";
+ public static final String TYPE_SET = "TypeSet";
+ public static final String TYPE_STRING = "TypeString";
+ public static final String TYPE_TOKENSTREAM = "TypeTokenStream";
+
+ protected static List<ParamSpec> getParamSpecs(Resource list) {
+ List<ParamSpec> result = new ArrayList<>();
+ Resource current = list;
+
+ while (current != null && ! current.equals(RDF.nil)){
+ Statement firstStmt = current.getProperty(RDF.first);
+ if (firstStmt == null) {
+ throw new TextIndexException("parameter list not well formed: " + current);
+ }
+
+ RDFNode first = firstStmt.getObject();
+ if (! first.isResource()) {
+ throw new TextIndexException("parameter specification must be an anon resource : " + first);
+ }
+
+ result.add(getParamSpec((Resource) first));
+
+ Statement restStmt = current.getProperty(RDF.rest);
+ if (restStmt == null) {
+ throw new TextIndexException("parameter list not terminated by rdf:nil");
+ }
+
+ RDFNode rest = restStmt.getObject();
+ if (! rest.isResource()) {
+ throw new TextIndexException("parameter list node is not a resource : " + rest);
+ }
+
+ current = (Resource) rest;
+ }
+
+ return result;
+ }
+
+ protected static ParamSpec getParamSpec(Resource node) {
+ Statement nameStmt = node.getProperty(TextVocab.pParamName);
+ Statement typeStmt = node.getProperty(TextVocab.pParamType);
+ Statement valueStmt = node.getProperty(TextVocab.pParamValue);
+
+ if (typeStmt == null) {
+ throw new TextIndexException("Parameter specification must have a text:paramType: " + node);
+ }
+ Resource typeRes = typeStmt.getResource();
+ String type = typeRes.getLocalName();
+
+ String name = getStringValue(nameStmt);
+ String value = getStringValue(valueStmt);
+
+ switch (type) {
+
+ // String
+ case TYPE_STRING: {
+ if (value == null) {
+ throw new TextIndexException("Value for string param: " + name + " must not be empty!");
+ }
+
+ return new ParamSpec(name, value, String.class);
+ }
+
+ // java.io.FileReader
+ case TYPE_FILE: {
+
+ if (value == null) {
+ throw new TextIndexException("Value for file param must exist and must contain a file name.");
+ }
+
+ try {
+ // The analyzer is responsible for closing the file
+ Reader fileReader = new java.io.FileReader(value);
+ return new ParamSpec(name, fileReader, Reader.class);
+
+ } catch (java.io.FileNotFoundException ex) {
+ throw new TextIndexException("File " + value + " for param " + name + " not found!");
+ }
+ }
+
+ // org.apache.lucene.analysis.CharArraySet
+ case TYPE_SET: {
+ if (valueStmt == null) {
+ throw new TextIndexException("A set param spec must have a text:paramValue:" + node);
+ }
+
+ RDFNode valueNode = valueStmt.getObject();
+ if (!valueNode.isResource()) {
+ throw new TextIndexException("A set param spec text:paramValue must be a list of strings: " + valueNode);
+ }
+
+ List<String> values = toStrings((Resource) valueNode);
+
+ return new ParamSpec(name, new CharArraySet(values, false), CharArraySet.class);
+ }
+
+ // int
+ case TYPE_INT:
+ if (value == null) {
+ throw new TextIndexException("Value for int param: " + name + " must not be empty!");
+ }
+
+ int n = ((Literal) valueStmt.getObject()).getInt();
+ return new ParamSpec(name, n, int.class);
+
+ // boolean
+ case TYPE_BOOL:
+ if (value == null) {
+ throw new TextIndexException("Value for boolean param: " + name + " must not be empty!");
+ }
+
+ boolean b = ((Literal) valueStmt.getObject()).getBoolean();
+ return new ParamSpec(name, b, boolean.class);
+
+ // org.apache.lucene.analysis.Analyzer
+ case TYPE_ANALYZER:
+ if (valueStmt == null) {
+ throw new TextIndexException("Analyzer param spec must have a text:paramValue:" + node);
+ }
+
+ RDFNode valueNode = valueStmt.getObject();
+ if (!valueNode.isResource()) {
+ throw new TextIndexException("Analyzer param spec text:paramValue must be an analyzer spec resource: " + valueNode);
+ }
+
+ Analyzer analyzer = (Analyzer) Assembler.general.open((Resource) valueNode);
+ return new ParamSpec(name, analyzer, Analyzer.class);
+
+ default:
+ // there was no match
+ Log.error("org.apache.jena.query.text.assembler.Params", "Unknown parameter type: " + type + " for param: " + name + " with value: " + value);
+ break;
+ }
+
+ return null;
+ }
+
+ private static String getStringValue(Statement stmt) {
+ if (stmt == null) {
+ return null;
+ } else {
+ RDFNode node = stmt.getObject();
+ if (node.isLiteral()) {
+ return ((Literal) node).getLexicalForm();
+ } else {
+ return null;
+ }
+ }
+ }
+
+ protected static List<String> toStrings(Resource list) {
+ List<String> result = new ArrayList<>();
+ Resource current = list;
+
+ while (current != null && ! current.equals(RDF.nil)){
+ Statement firstStmt = current.getProperty(RDF.first);
+ if (firstStmt == null) {
+ throw new TextIndexException("param spec of type set not well formed");
+ }
+
+ RDFNode first = firstStmt.getObject();
+ if (! first.isLiteral()) {
+ throw new TextIndexException("param spec of type set item is not a literal: " + first);
+ }
+
+ result.add(((Literal)first).getLexicalForm());
+
+ Statement restStmt = current.getProperty(RDF.rest);
+ if (restStmt == null) {
+ throw new TextIndexException("param spec of type set not terminated by rdf:nil");
+ }
+
+ RDFNode rest = restStmt.getObject();
+ if (! rest.isResource()) {
+ throw new TextIndexException("param spec of type set rest is not a resource: " + rest);
+ }
+
+ current = (Resource) rest;
+ }
+
+ return result;
+ }
+
+ /**
+ * <code>ParamSpec</code> contains the <code>name</code>, <code>Class</code>, and
+ * <code>value</code> of a parameter for a constructor (or really any method in general)
+ */
+ protected static final class ParamSpec {
+
+ private final String name;
+ private final Object value;
+ private final Class<?> clazz;
+
+ public ParamSpec(String key, Object value) {
+ this(key, value, value.getClass());
+ }
+
+ public ParamSpec(String key, Object value, Class<?> clazz) {
+ this.name = key;
+ this.value = value;
+ this.clazz = clazz;
+ }
+
+ public String getKey() {
+ return name;
+ }
+
+ public Object getValue() {
+ return value;
+ }
+
+ public Class<?> getValueClass() {
+ return clazz;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java
index 2a7b52e..013c20f 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java
@@ -37,6 +37,8 @@ public class TextAssembler
Assembler.general.implementWith(TextVocab.localizedAnalyzer, new LocalizedAnalyzerAssembler()) ;
Assembler.general.implementWith(TextVocab.configurableAnalyzer, new ConfigurableAnalyzerAssembler()) ;
Assembler.general.implementWith(TextVocab.genericAnalyzer, new GenericAnalyzerAssembler()) ;
+ Assembler.general.implementWith(TextVocab.genericFilter, new GenericFilterAssembler()) ;
+ Assembler.general.implementWith(TextVocab.genericTokenizer, new GenericTokenizerAssembler()) ;
Assembler.general.implementWith(TextVocab.definedAnalyzer, new DefinedAnalyzerAssembler()) ;
}
http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneAssembler.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneAssembler.java
index ebaca4e..6b17603 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneAssembler.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneAssembler.java
@@ -73,28 +73,6 @@ public class TextIndexLuceneAssembler extends AssemblerBase {
File dir = new File(path) ;
directory = FSDirectory.open(dir.toPath()) ;
}
-
- Analyzer analyzer = null;
- Statement analyzerStatement = root.getProperty(pAnalyzer);
- if (null != analyzerStatement) {
- RDFNode aNode = analyzerStatement.getObject();
- if (! aNode.isResource()) {
- throw new TextIndexException("Text analyzer property is not a resource : " + aNode);
- }
- Resource analyzerResource = (Resource) aNode;
- analyzer = (Analyzer) a.open(analyzerResource);
- }
-
- Analyzer queryAnalyzer = null;
- Statement queryAnalyzerStatement = root.getProperty(pQueryAnalyzer);
- if (null != queryAnalyzerStatement) {
- RDFNode qaNode = queryAnalyzerStatement.getObject();
- if (! qaNode.isResource()) {
- throw new TextIndexException("Text query analyzer property is not a resource : " + qaNode);
- }
- Resource analyzerResource = (Resource) qaNode;
- queryAnalyzer = (Analyzer) a.open(analyzerResource);
- }
String queryParser = null;
Statement queryParserStatement = root.getProperty(pQueryParser);
@@ -117,12 +95,18 @@ public class TextIndexLuceneAssembler extends AssemblerBase {
isMultilingualSupport = mlsNode.asLiteral().getBoolean();
}
+ //define any filters and tokenizers first so they can be referenced in analyzer definitions if need be
Statement defAnalyzersStatement = root.getProperty(pDefAnalyzers);
if (null != defAnalyzersStatement) {
RDFNode aNode = defAnalyzersStatement.getObject();
if (! aNode.isResource()) {
throw new TextIndexException("text:defineAnalyzers property is not a resource (list) : " + aNode);
}
+
+ DefineFiltersAssembler.open(a, (Resource) aNode);
+
+ DefineTokenizersAssembler.open(a, (Resource) aNode);
+
boolean addedLangs = DefineAnalyzersAssembler.open(a, (Resource) aNode);
// if the text:defineAnalyzers added any analyzers to lang tags then ensure that
// multilingual support is enabled
@@ -134,6 +118,30 @@ public class TextIndexLuceneAssembler extends AssemblerBase {
}
}
+ // initialize default analyzer and query analyzer after processing all analyzer definitions
+ // so they can be referred to
+ Analyzer analyzer = null;
+ Statement analyzerStatement = root.getProperty(pAnalyzer);
+ if (null != analyzerStatement) {
+ RDFNode aNode = analyzerStatement.getObject();
+ if (! aNode.isResource()) {
+ throw new TextIndexException("Text analyzer property is not a resource : " + aNode);
+ }
+ Resource analyzerResource = (Resource) aNode;
+ analyzer = (Analyzer) a.open(analyzerResource);
+ }
+
+ Analyzer queryAnalyzer = null;
+ Statement queryAnalyzerStatement = root.getProperty(pQueryAnalyzer);
+ if (null != queryAnalyzerStatement) {
+ RDFNode qaNode = queryAnalyzerStatement.getObject();
+ if (! qaNode.isResource()) {
+ throw new TextIndexException("Text query analyzer property is not a resource : " + qaNode);
+ }
+ Resource analyzerResource = (Resource) qaNode;
+ queryAnalyzer = (Analyzer) a.open(analyzerResource);
+ }
+
boolean storeValues = false;
Statement storeValuesStatement = root.getProperty(pStoreValues);
if (null != storeValuesStatement) {
http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java
index f41d0cc..187715a4 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java
@@ -43,6 +43,7 @@ public class TextVocab
public static final Property pQueryParser = Vocab.property(NS, "queryParser") ;
public static final Property pEntityMap = Vocab.property(NS, "entityMap") ;
public static final Property pTokenizer = Vocab.property(NS, "tokenizer") ;
+ public static final Property pFilter = Vocab.property(NS, "filter") ;
public static final Property pFilters = Vocab.property(NS, "filters") ;
// Entity definition
@@ -78,21 +79,24 @@ public class TextVocab
public static final Resource lowerCaseFilter = Vocab.resource(NS, "LowerCaseFilter");
public static final Resource asciiFoldingFilter = Vocab.resource(NS, "ASCIIFoldingFilter");
+ // ElasticSearch
public static final Property pServerList = Vocab.property(NS, "serverList");
public static final Property pClusterName = Vocab.property(NS, "clusterName");
public static final Property pShards = Vocab.property(NS, "shards");
public static final Property pReplicas = Vocab.property(NS, "replicas");
- public static final Property pIndexName = Vocab.property(NS, "indexName");
+ public static final Property pIndexName = Vocab.property(NS, "indexName");
- //GenericAnalyzer
- public static final Resource genericAnalyzer = Vocab.resource(NS, "GenericAnalyzer");
+ //GenericAnalyzer, DefinedFilter, DefinedTokenizer
public static final Resource definedAnalyzer = Vocab.resource(NS, "DefinedAnalyzer");
- public static final Resource typeAnalyzer = Vocab.resource(NS, GenericAnalyzerAssembler.TYPE_ANALYZER);
- public static final Resource typeBoolean = Vocab.resource(NS, GenericAnalyzerAssembler.TYPE_BOOL);
- public static final Resource typeFile = Vocab.resource(NS, GenericAnalyzerAssembler.TYPE_FILE);
- public static final Resource typeInt = Vocab.resource(NS, GenericAnalyzerAssembler.TYPE_INT);
- public static final Resource typeSet = Vocab.resource(NS, GenericAnalyzerAssembler.TYPE_SET);
- public static final Resource typeString = Vocab.resource(NS, GenericAnalyzerAssembler.TYPE_STRING);
+ public static final Resource genericAnalyzer = Vocab.resource(NS, "GenericAnalyzer");
+ public static final Resource genericFilter = Vocab.resource(NS, "GenericFilter");
+ public static final Resource genericTokenizer = Vocab.resource(NS, "GenericTokenizer");
+ public static final Resource typeAnalyzer = Vocab.resource(NS, Params.TYPE_ANALYZER);
+ public static final Resource typeBoolean = Vocab.resource(NS, Params.TYPE_BOOL);
+ public static final Resource typeFile = Vocab.resource(NS, Params.TYPE_FILE);
+ public static final Resource typeInt = Vocab.resource(NS, Params.TYPE_INT);
+ public static final Resource typeSet = Vocab.resource(NS, Params.TYPE_SET);
+ public static final Resource typeString = Vocab.resource(NS, Params.TYPE_STRING);
public static final Property pClass = Vocab.property(NS, "class");
public static final Property pParams = Vocab.property(NS, "params");
public static final Property pParamName = Vocab.property(NS, "paramName");
@@ -100,6 +104,8 @@ public class TextVocab
public static final Property pParamValue = Vocab.property(NS, "paramValue");
public static final Property pDefAnalyzers = Vocab.property(NS, "defineAnalyzers");
public static final Property pDefAnalyzer = Vocab.property(NS, "defineAnalyzer");
+ public static final Property pDefFilter = Vocab.property(NS, "defineFilter");
+ public static final Property pDefTokenizer = Vocab.property(NS, "defineTokenizer");
public static final Property pAddLang = Vocab.property(NS, "addLang");
public static final Property pUseAnalyzer = Vocab.property(NS, "useAnalyzer");
http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java
----------------------------------------------------------------------
diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java b/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java
index 8fce7fd..0034632 100644
--- a/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java
+++ b/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java
@@ -52,6 +52,7 @@ import org.junit.runners.Suite.SuiteClasses;
, TestTextGraphIndexExtra.class
, TestTextGraphIndexExtra2.class
, TestTextHighlighting.class
+ , TestTextDefineAnalyzers.class
})
public class TS_Text
http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java
----------------------------------------------------------------------
diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java b/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java
new file mode 100644
index 0000000..18328f7
--- /dev/null
+++ b/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.query.text;
+
+import static org.junit.Assert.assertTrue;
+
+import java.io.Reader ;
+import java.io.StringReader ;
+
+import org.apache.jena.assembler.Assembler ;
+import org.apache.jena.atlas.lib.StrUtils ;
+import org.apache.jena.query.Dataset ;
+import org.apache.jena.query.ReadWrite ;
+import org.apache.jena.query.text.assembler.TextAssembler ;
+import org.apache.jena.rdf.model.Model ;
+import org.apache.jena.rdf.model.ModelFactory ;
+import org.apache.jena.rdf.model.Resource ;
+import org.junit.After ;
+import org.junit.Before ;
+import org.junit.Test ;
+
+/**
+ * Smoke test for the assembler handling of text:defineAnalyzers entries that use
+ * text:defineAnalyzer, text:defineTokenizer and text:defineFilter.
+ * A dataset is assembled from the Turtle description in SPEC and some labelled
+ * resources are loaded into it; the test passes when this completes without error.
+ */
+public class TestTextDefineAnalyzers extends AbstractTestDatasetWithTextIndexBase {
+
+ // base URI used for the ":" prefix of the assembler description
+ private static final String SPEC_BASE = "http://example.org/spec#";
+ // local name and full URI of the text:TextDataset resource opened in before()
+ private static final String SPEC_ROOT_LOCAL = "lucene_text_dataset";
+ private static final String SPEC_ROOT_URI = SPEC_BASE + SPEC_ROOT_LOCAL;
+ // Turtle assembler description: an in-memory TDB dataset plus a Lucene index whose
+ // default analyzer is a text:DefinedAnalyzer referring to :configuredAnalyzer, which is
+ // a text:ConfigurableAnalyzer built from the defined tokenizer :ngram and filter :asciiff
+ private static final String SPEC;
+ static {
+ SPEC = StrUtils.strjoinNL(
+ "prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> ",
+ "prefix ja: <http://jena.hpl.hp.com/2005/11/Assembler#> ",
+ "prefix tdb: <http://jena.hpl.hp.com/2008/tdb#>",
+ "prefix text: <http://jena.apache.org/text#>",
+ "prefix : <" + SPEC_BASE + ">",
+ "",
+ "[] ja:loadClass \"org.apache.jena.query.text.TextQuery\" .",
+ "text:TextDataset rdfs:subClassOf ja:RDFDataset .",
+ "text:TextIndexLucene rdfs:subClassOf text:TextIndex .",
+
+ ":" + SPEC_ROOT_LOCAL,
+ " a text:TextDataset ;",
+ " text:dataset :dataset ;",
+ " text:index :indexLucene ;",
+ " .",
+ "",
+ ":dataset",
+ " a tdb:DatasetTDB ;",
+ " tdb:location \"--mem--\" ;",
+ " tdb:unionDefaultGraph true ;",
+ ".",
+ "",
+ ":indexLucene",
+ " a text:TextIndexLucene ;",
+ " text:directory \"mem\" ;",
+ " text:storeValues true ;",
+ " text:analyzer [",
+ " a text:DefinedAnalyzer ;",
+ " text:useAnalyzer :configuredAnalyzer ] ;",
+ " text:defineAnalyzers (",
+ " [ text:defineAnalyzer :configuredAnalyzer ;",
+ " text:analyzer [",
+ " a text:ConfigurableAnalyzer ;",
+ " text:tokenizer :ngram ;",
+ " text:filters ( :asciiff text:LowerCaseFilter ) ] ]",
+ " [ text:defineTokenizer :ngram ;",
+ " text:tokenizer [",
+ " a text:GenericTokenizer ;",
+ " text:class \"org.apache.lucene.analysis.ngram.NGramTokenizer\" ;",
+ " text:params (",
+ " [ text:paramName \"minGram\" ;",
+ " text:paramType text:TypeInt ;",
+ " text:paramValue 3 ]",
+ " [ text:paramName \"maxGram\" ;",
+ " text:paramType text:TypeInt ;",
+ " text:paramValue 7 ]",
+ " ) ] ]",
+ " [ text:defineFilter :asciiff ;",
+ " text:filter [",
+ " a text:GenericFilter ;",
+ " text:class \"org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter\" ;",
+ " text:params (",
+ " [ text:paramName \"preserveOriginal\" ;",
+ " text:paramType text:TypeBoolean ;",
+ " text:paramValue true ]",
+ " ) ] ]",
+ " ) ;",
+ " text:entityMap :entMap ;",
+ " .",
+ "",
+ ":entMap",
+ " a text:EntityMap ;",
+ " text:entityField \"uri\" ;",
+ " text:defaultField \"label\" ;",
+ " text:langField \"lang\" ;",
+ " text:graphField \"graph\" ;",
+ " text:map (",
+ " [ text:field \"label\" ; text:predicate rdfs:label ]",
+ " [ text:field \"comment\" ; text:predicate rdfs:comment ]",
+ " ) ."
+ );
+ }
+
+ /**
+ * Builds the dataset under test: prints SPEC to standard output, parses it as Turtle
+ * into a fresh model, calls TextAssembler.init() and opens the SPEC_ROOT_URI resource
+ * with Assembler.general, assigning the result to {@code dataset}.
+ * Any exception thrown while opening the resource is caught and only its stack trace
+ * is printed, so this method returns normally even when assembly fails.
+ * NOTE(review): {@code dataset} is not declared in this class, presumably it is
+ * inherited from the base class; after a failed assembly it is not assigned here.
+ */
+ @Before
+ public void before() {
+ Reader reader = new StringReader(SPEC);
+ System.out.println(">>>>");
+ System.out.println(SPEC);
+ System.out.println("<<<<");
+ Model specModel = ModelFactory.createDefaultModel();
+ specModel.read(reader, "", "TURTLE");
+ TextAssembler.init();
+ Resource root = specModel.getResource(SPEC_ROOT_URI);
+ try {
+ dataset = (Dataset) Assembler.general.open(root);
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+
+ /**
+ * Closes the dataset opened in before().
+ */
+ @After
+ public void after() {
+ dataset.close();
+ }
+
+ /**
+ * Reads Turtle text into a model of the dataset inside a write transaction.
+ * The transaction is committed after a successful read and always ended in the
+ * finally block.
+ *
+ * @param turtle the Turtle document to load
+ * @param modelName name of the target named model, or null for the default model
+ */
+ private void putTurtleInModel(String turtle, String modelName) {
+ Model model = modelName != null ? dataset.getNamedModel(modelName) : dataset.getDefaultModel() ;
+ Reader reader = new StringReader(turtle) ;
+ dataset.begin(ReadWrite.WRITE) ;
+ try {
+ model.read(reader, "", "TURTLE") ;
+ dataset.commit() ;
+ }
+ finally {
+ dataset.end();
+ }
+ }
+
+ /**
+ * Loads labelled resources into two named models of the assembled dataset.
+ * The only assertion is the trivially true one at the end, so the test checks just
+ * that the loads complete without throwing.
+ * NOTE(review): TURTLE_PROLOG and RESOURCE_BASE are not declared in this class,
+ * presumably they come from the base class.
+ */
+ @Test
+ public void testTextQueryDefAnalyzers1() {
+ final String turtleA = StrUtils.strjoinNL(
+ TURTLE_PROLOG,
+ "<" + RESOURCE_BASE + "testResultOneInModelA>",
+ " rdfs:label 'bar testResultOne barfoo foo'",
+ ".",
+ "<" + RESOURCE_BASE + "testResultTwoInModelA>",
+ " rdfs:label 'bar testResultTwo barfoo foo'",
+ ".",
+ "<" + RESOURCE_BASE + "testResultThreeInModelA>",
+ " rdfs:label 'bar testResultThree barfoo foo'",
+ "."
+ );
+ putTurtleInModel(turtleA, "http://example.org/modelA") ;
+ final String turtleB = StrUtils.strjoinNL(
+ TURTLE_PROLOG,
+ "<" + RESOURCE_BASE + "testResultOneInModelB>",
+ " rdfs:label 'bar testResultOne barfoo foo'",
+ "."
+ );
+ putTurtleInModel(turtleB, "http://example.org/modelB") ;
+
+ // execution reaches here in the event that the assembler machinery
+ // has executed without errors and generated a usable dataset
+ // usage of the runtime machinery is tested elsewhere
+ assertTrue(true);
+ }
+}
[3/4] jena git commit: derive parameter type in the cases: int,
boolean, String.
Posted by co...@apache.org.
derive parameter type in the cases: int, boolean, String.
Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/71c2f66d
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/71c2f66d
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/71c2f66d
Branch: refs/heads/master
Commit: 71c2f66dbdad7834b7560f27b16eee08aad37abf
Parents: 58ff28e
Author: Chris Tomlinson <ct...@moonvine.org>
Authored: Sun Mar 18 09:48:57 2018 -0500
Committer: Chris Tomlinson <ct...@moonvine.org>
Committed: Sun Mar 18 09:48:57 2018 -0500
----------------------------------------------------------------------
.../jena/query/text/assembler/Params.java | 40 ++++++++++++++++----
.../query/text/TestTextDefineAnalyzers.java | 23 ++++++++++-
2 files changed, 55 insertions(+), 8 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/jena/blob/71c2f66d/jena-text/src/main/java/org/apache/jena/query/text/assembler/Params.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/Params.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/Params.java
index 7b0cd18..b1a3f33 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/Params.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/Params.java
@@ -24,6 +24,7 @@ import java.util.List;
import org.apache.jena.assembler.Assembler;
import org.apache.jena.atlas.logging.Log ;
+import org.apache.jena.datatypes.RDFDatatype;
import org.apache.jena.query.text.TextIndexException;
import org.apache.jena.rdf.model.Literal;
import org.apache.jena.rdf.model.RDFNode;
@@ -184,16 +185,10 @@ public class Params {
protected static ParamSpec getParamSpec(Resource node) {
Statement nameStmt = node.getProperty(TextVocab.pParamName);
- Statement typeStmt = node.getProperty(TextVocab.pParamType);
Statement valueStmt = node.getProperty(TextVocab.pParamValue);
- if (typeStmt == null) {
- throw new TextIndexException("Parameter specification must have a text:paramType: " + node);
- }
- Resource typeRes = typeStmt.getResource();
- String type = typeRes.getLocalName();
-
String name = getStringValue(nameStmt);
+ String type = getType(node);
String value = getStringValue(valueStmt);
switch (type) {
@@ -280,6 +275,37 @@ public class Params {
return null;
}
+
+    /**
+     * Determines the parameter type name for a parameter specification.
+     * <p>
+     * When the node has a text:paramType, the local name of that resource is the
+     * type. Otherwise the type is derived from the text:paramValue literal: a
+     * datatype whose Java class is Boolean, BigInteger or String yields
+     * TYPE_BOOL, TYPE_INT or TYPE_STRING respectively.
+     *
+     * @param node the parameter specification resource
+     * @return the type name, never null
+     * @throws TextIndexException if the declared type has no local name, or if
+     *         there is no text:paramType and the text:paramValue is missing,
+     *         is not a literal, or has a datatype no type can be derived from
+     */
+    private static String getType(Resource node) {
+        Statement typeStmt = node.getProperty(TextVocab.pParamType);
+
+        if (typeStmt != null) {
+            // an explicit type always wins over derivation from the value
+            String declared = typeStmt.getResource().getLocalName();
+            if (declared == null) {
+                throw new TextIndexException("Parameter specification has a text:paramType without a local name: " + node);
+            }
+            return declared;
+        }
+
+        Statement valueStmt = node.getProperty(TextVocab.pParamValue);
+        if (valueStmt == null) {
+            throw new TextIndexException("Parameter specification must have a text:paramValue: " + node);
+        }
+
+        RDFNode obj = valueStmt.getObject();
+        if (! obj.isLiteral()) {
+            throw new TextIndexException("Parameter value must be a literal to derive its type: " + node);
+        }
+
+        Literal lit = obj.asLiteral();
+        RDFDatatype rdfType = lit.getDatatype();
+        Class<?> clazz = rdfType != null ? rdfType.getJavaClass() : null;
+
+        if (clazz == java.lang.Boolean.class) {
+            return TYPE_BOOL;
+        } else if (clazz == java.math.BigInteger.class) {
+            return TYPE_INT;
+        } else if (clazz == java.lang.String.class) {
+            return TYPE_STRING;
+        }
+
+        // fail here with a descriptive error: returning null would surface as a
+        // NullPointerException in the caller's switch on the type name
+        throw new TextIndexException("Cannot derive parameter type from text:paramValue " + lit + " : " + node);
+    }
private static String getStringValue(Statement stmt) {
if (stmt == null) {
http://git-wip-us.apache.org/repos/asf/jena/blob/71c2f66d/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java
----------------------------------------------------------------------
diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java b/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java
index b0c114e..5ffa1db 100644
--- a/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java
+++ b/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java
@@ -78,13 +78,18 @@ public class TestTextDefineAnalyzers extends AbstractTestDatasetWithTextIndexBas
" a text:ConfigurableAnalyzer ;",
" text:tokenizer :ngram ;",
" text:filters ( :asciiff text:LowerCaseFilter ) ] ]",
+ " [ text:defineAnalyzer :configuredAnalyzer2 ;",
+ " text:analyzer [",
+ " a text:ConfigurableAnalyzer ;",
+ " text:tokenizer :ngram2 ;",
+ " text:filters ( :asciiff2 text:LowerCaseFilter ) ] ]",
" [ text:defineTokenizer :ngram ;",
" text:tokenizer [",
" a text:GenericTokenizer ;",
" text:class \"org.apache.lucene.analysis.ngram.NGramTokenizer\" ;",
" text:params (",
" [ text:paramName \"minGram\" ;",
- " text:paramType text:TypeInt ;",
+ " text:paramType text:TypeInt ;",
" text:paramValue 3 ]",
" [ text:paramName \"maxGram\" ;",
" text:paramType text:TypeInt ;",
@@ -99,6 +104,22 @@ public class TestTextDefineAnalyzers extends AbstractTestDatasetWithTextIndexBas
" text:paramType text:TypeBoolean ;",
" text:paramValue true ]",
" ) ] ]",
+ " [ text:defineTokenizer :ngram2 ;",
+ " text:tokenizer [",
+ " a text:GenericTokenizer ;",
+ " text:class \"org.apache.lucene.analysis.ngram.NGramTokenizer\" ;",
+ " text:params (",
+ " [ text:paramValue 3 ]",
+ " [ text:paramValue 7 ]",
+ " ) ] ]",
+ " [ text:defineFilter :asciiff2 ;",
+ " text:filter [",
+ " a text:GenericFilter ;",
+ " text:class \"org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter\" ;",
+ " text:params (",
+ " [ text:paramName \"preserveOriginal\" ;",
+ " text:paramValue true ]",
+ " ) ] ]",
" ) ;",
" text:entityMap :entMap ;",
" .",
[2/4] jena git commit: rm debug code
Posted by co...@apache.org.
rm debug code
Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/58ff28e9
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/58ff28e9
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/58ff28e9
Branch: refs/heads/master
Commit: 58ff28e9e4f26e063b10d9bebe706eba31f241ba
Parents: 795b9eb
Author: Chris Tomlinson <ct...@moonvine.org>
Authored: Thu Mar 15 14:54:23 2018 -0500
Committer: Chris Tomlinson <ct...@moonvine.org>
Committed: Thu Mar 15 14:54:23 2018 -0500
----------------------------------------------------------------------
.../org/apache/jena/query/text/TestTextDefineAnalyzers.java | 9 +--------
1 file changed, 1 insertion(+), 8 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/jena/blob/58ff28e9/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java
----------------------------------------------------------------------
diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java b/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java
index 18328f7..b0c114e 100644
--- a/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java
+++ b/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java
@@ -119,18 +119,11 @@ public class TestTextDefineAnalyzers extends AbstractTestDatasetWithTextIndexBas
@Before
public void before() {
Reader reader = new StringReader(SPEC);
- System.out.println(">>>>");
- System.out.println(SPEC);
- System.out.println("<<<<");
Model specModel = ModelFactory.createDefaultModel();
specModel.read(reader, "", "TURTLE");
TextAssembler.init();
Resource root = specModel.getResource(SPEC_ROOT_URI);
- try {
- dataset = (Dataset) Assembler.general.open(root);
- } catch (Exception e) {
- e.printStackTrace();
- }
+ dataset = (Dataset) Assembler.general.open(root);
}
@After
[4/4] jena git commit: This closes #385 - Merge branch 'JENA-1506-PR'
of https://github.com/BuddhistDigitalResourceCenter/jena
Posted by co...@apache.org.
This closes #385 - Merge branch 'JENA-1506-PR' of https://github.com/BuddhistDigitalResourceCenter/jena
Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/2e3d1fa2
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/2e3d1fa2
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/2e3d1fa2
Branch: refs/heads/master
Commit: 2e3d1fa27d9f7d3d56fd0b83f7a3cd56a71a474d
Parents: 50b46f0 71c2f66
Author: Chris Tomlinson <ct...@moonvine.org>
Authored: Wed Mar 21 09:43:19 2018 -0500
Committer: Chris Tomlinson <ct...@moonvine.org>
Committed: Wed Mar 21 09:43:19 2018 -0500
----------------------------------------------------------------------
.../text/analyzer/ConfigurableAnalyzer.java | 133 +++++--
.../ConfigurableAnalyzerAssembler.java | 4 +-
.../assembler/DefineAnalyzersAssembler.java | 3 +-
.../text/assembler/DefineFiltersAssembler.java | 103 +++++
.../assembler/DefineTokenizersAssembler.java | 100 +++++
.../assembler/GenericAnalyzerAssembler.java | 228 +----------
.../text/assembler/GenericFilterAssembler.java | 199 ++++++++++
.../assembler/GenericTokenizerAssembler.java | 198 ++++++++++
.../jena/query/text/assembler/Params.java | 388 +++++++++++++++++++
.../query/text/assembler/TextAssembler.java | 2 +
.../assembler/TextIndexLuceneAssembler.java | 52 +--
.../jena/query/text/assembler/TextVocab.java | 24 +-
.../org/apache/jena/query/text/TS_Text.java | 1 +
.../query/text/TestTextDefineAnalyzers.java | 196 ++++++++++
14 files changed, 1348 insertions(+), 283 deletions(-)
----------------------------------------------------------------------