You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by co...@apache.org on 2018/03/21 14:44:35 UTC

[1/4] jena git commit: Merged JENA-1506-definedFilters

Repository: jena
Updated Branches:
  refs/heads/master 50b46f0f0 -> 2e3d1fa27


Merged JENA-1506-definedFilters


Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/795b9eb7
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/795b9eb7
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/795b9eb7

Branch: refs/heads/master
Commit: 795b9eb7cb45999c1d884bcd84c83896a498ed87
Parents: 6e20282
Author: Chris Tomlinson <ct...@moonvine.org>
Authored: Thu Mar 15 14:12:53 2018 -0500
Committer: Chris Tomlinson <ct...@moonvine.org>
Committed: Thu Mar 15 14:12:53 2018 -0500

----------------------------------------------------------------------
 .../text/analyzer/ConfigurableAnalyzer.java     | 133 +++++--
 .../ConfigurableAnalyzerAssembler.java          |   4 +-
 .../assembler/DefineAnalyzersAssembler.java     |   3 +-
 .../text/assembler/DefineFiltersAssembler.java  | 103 ++++++
 .../assembler/DefineTokenizersAssembler.java    | 100 +++++
 .../assembler/GenericAnalyzerAssembler.java     | 228 +-----------
 .../text/assembler/GenericFilterAssembler.java  | 199 ++++++++++
 .../assembler/GenericTokenizerAssembler.java    | 198 ++++++++++
 .../jena/query/text/assembler/Params.java       | 362 +++++++++++++++++++
 .../query/text/assembler/TextAssembler.java     |   2 +
 .../assembler/TextIndexLuceneAssembler.java     |  52 +--
 .../jena/query/text/assembler/TextVocab.java    |  24 +-
 .../org/apache/jena/query/text/TS_Text.java     |   1 +
 .../query/text/TestTextDefineAnalyzers.java     | 182 ++++++++++
 14 files changed, 1308 insertions(+), 283 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/main/java/org/apache/jena/query/text/analyzer/ConfigurableAnalyzer.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/analyzer/ConfigurableAnalyzer.java b/jena-text/src/main/java/org/apache/jena/query/text/analyzer/ConfigurableAnalyzer.java
index 2008445..8d54d2c 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/analyzer/ConfigurableAnalyzer.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/analyzer/ConfigurableAnalyzer.java
@@ -18,9 +18,16 @@
 
 package org.apache.jena.query.text.analyzer ;
 
+import java.lang.reflect.Constructor;
+import java.util.Hashtable;
 import java.util.List ;
+import java.lang.reflect.InvocationTargetException;
 
+import org.apache.jena.atlas.logging.Log;
 import org.apache.jena.query.text.TextIndexException;
+import org.apache.jena.query.text.assembler.GenericFilterAssembler.FilterSpec;
+import org.apache.jena.query.text.assembler.GenericTokenizerAssembler.TokenizerSpec;
+import org.apache.jena.query.text.assembler.TextVocab;
 import org.apache.lucene.analysis.Analyzer ;
 import org.apache.lucene.analysis.TokenFilter ;
 import org.apache.lucene.analysis.Tokenizer ;
@@ -43,32 +50,114 @@ public class ConfigurableAnalyzer extends Analyzer {
         private final String tokenizer;
         private final List<String> filters;
         
+        private static Hashtable<String, FilterSpec>    filterSpecs = new Hashtable<>();
+        private static Hashtable<String, TokenizerSpec> tokenizerSpecs = new Hashtable<>();
+        
+        static{
+            Class<?>[] paramClasses = new Class<?>[0];
+            Object[] paramValues = new Object[0];
+            // built-in tokenizers are registered with their no-argument constructors
+            tokenizerSpecs.put(TextVocab.NS+"KeywordTokenizer", new TokenizerSpec(KeywordTokenizer.class, paramClasses, paramValues));
+            tokenizerSpecs.put(TextVocab.NS+"LetterTokenizer", new TokenizerSpec(LetterTokenizer.class, paramClasses, paramValues));
+            tokenizerSpecs.put(TextVocab.NS+"StandardTokenizer", new TokenizerSpec(StandardTokenizer.class, paramClasses, paramValues));
+            tokenizerSpecs.put(TextVocab.NS+"WhitespaceTokenizer", new TokenizerSpec(WhitespaceTokenizer.class, paramClasses, paramValues));
+            
+            paramClasses = new Class<?>[] {TokenStream.class};
+            paramValues = new Object[]{ null };
+            // built-in filters take one TokenStream; the null is a placeholder for the source stream supplied when a filter is created
+            filterSpecs.put(TextVocab.NS+"ASCIIFoldingFilter", new FilterSpec(ASCIIFoldingFilter.class, paramClasses, paramValues));
+            filterSpecs.put(TextVocab.NS+"LowerCaseFilter", new FilterSpec(LowerCaseFilter.class, paramClasses, paramValues));
+            filterSpecs.put(TextVocab.NS+"StandardFilter", new FilterSpec(StandardFilter.class, paramClasses, paramValues));
+        }
+        
+        public static void defineFilter(String id, FilterSpec spec) {
+            filterSpecs.put(id, spec); // registers spec under id, replacing any spec already stored for that id
+        }
+        
+        public static void defineTokenizer(String id, TokenizerSpec spec) {
+            tokenizerSpecs.put(id, spec); // registers spec under id, replacing any spec already stored for that id
+        }
+        
+        /**
+         * Reflectively creates an instance of the Lucene Tokenizer <code>clazz</code>.
+         *
+         * @param clazz The tokenizer class
+         * @param paramClasses The constructor parameter classes
+         * @param paramValues The constructor argument values
+         * @return The tokenizer, or null if it could not be instantiated (the error is logged)
+         */
+        private Tokenizer newTokenizer(Class<?> clazz, Class<?>[] paramClasses, Object[] paramValues) {
+  
+            String className = clazz.getName();
+  
+            try {
+                final Constructor<?> cstr = clazz.getDeclaredConstructor(paramClasses);
+  
+                return (Tokenizer) cstr.newInstance(paramValues);
+  
+            } catch (IllegalArgumentException | IllegalAccessException | InstantiationException | InvocationTargetException | SecurityException e) {
+                Log.error(this, "Exception while instantiating tokenizer class " + className + ". " + e.getMessage(), e);
+            } catch (NoSuchMethodException ex) {
+                Log.error(this, "Could not find matching tokenizer class constructor for " + className + " " + ex.getMessage(), ex);
+            }
+  
+            return null;
+        }
+
+        /**
+         * Reflectively creates an instance of the Lucene TokenFilter <code>clazz</code>.
+         *
+         * @param clazz The filter class
+         * @param paramClasses The constructor parameter classes
+         * @param paramValues The constructor argument values
+         * @return The filter, or null if it could not be instantiated (the error is logged)
+         */
+        private TokenFilter newFilter(Class<?> clazz, Class<?>[] paramClasses, Object[] paramValues) {
+  
+            String className = clazz.getName();
+  
+            try {
+                final Constructor<?> cstr = clazz.getDeclaredConstructor(paramClasses);
+  
+                return (TokenFilter) cstr.newInstance(paramValues);
+  
+            } catch (IllegalArgumentException | IllegalAccessException | InstantiationException | InvocationTargetException | SecurityException e) {
+                Log.error(this, "Exception while instantiating filter class " + className + ". " + e.getMessage(), e);
+            } catch (NoSuchMethodException ex) {
+                Log.error(this, "Could not find matching filter class constructor for " + className + " " + ex.getMessage(), ex);
+            }
+  
+            return null;
+        }
+        
         private Tokenizer getTokenizer(String tokenizerName) {
-                switch(tokenizerName) {
-                        case "KeywordTokenizer":
-                                return new KeywordTokenizer();
-                        case "LetterTokenizer":
-                                return new LetterTokenizer();
-                        case "StandardTokenizer":
-                                return new StandardTokenizer();
-                        case "WhitespaceTokenizer":
-                                return new WhitespaceTokenizer();
-                        default:
-                                throw new TextIndexException("Unknown tokenizer : " + tokenizerName);
-                }
+            TokenizerSpec spec = tokenizerSpecs.get(tokenizerName);
+            if (spec == null) {
+                throw new TextIndexException("Unknown tokenizer : " + tokenizerName);
+            }
+
+            Class<?> clazz = spec.clazz;
+            Class<?>[] paramClasses = spec.paramClasses;
+            Object[] paramValues = spec.paramValues;
+            // newTokenizer logs and returns null when the class cannot be instantiated
+            return newTokenizer(clazz, paramClasses, paramValues);
         }
         
         private TokenFilter getTokenFilter(String filterName, TokenStream source) {
-                switch(filterName) {
-                        case "ASCIIFoldingFilter":
-                                return new ASCIIFoldingFilter(source);
-                        case "LowerCaseFilter":
-                                return new LowerCaseFilter(source);
-                        case "StandardFilter":
-                                return new StandardFilter(source);
-                        default:
-                                throw new TextIndexException("Unknown filter : " + filterName);
-                }
+            FilterSpec spec = filterSpecs.get(filterName);
+            if (spec == null) {
+                throw new TextIndexException("Unknown filter : " + filterName);
+            }
+
+            Class<?> clazz = spec.clazz;
+            Class<?>[] paramClasses = spec.paramClasses;
+            // work on a copy: the spec is static and shared, so writing the source into its array would leak it to other callers
+            Object[] paramValues = spec.paramValues.clone();
+            
+            // the source should always be the first parameter
+            paramValues[0] = source;
+            
+            return newFilter(clazz, paramClasses, paramValues);
         }
         
         public ConfigurableAnalyzer(String tokenizer, List<String> filters) {

http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/main/java/org/apache/jena/query/text/assembler/ConfigurableAnalyzerAssembler.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/ConfigurableAnalyzerAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/ConfigurableAnalyzerAssembler.java
index 5ec96eb..bf38508 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/ConfigurableAnalyzerAssembler.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/ConfigurableAnalyzerAssembler.java
@@ -55,7 +55,7 @@ public class ConfigurableAnalyzerAssembler extends AssemblerBase {
     public Analyzer open(Assembler a, Resource root, Mode mode) {
         if (root.hasProperty(TextVocab.pTokenizer)) {
             Resource tokenizerResource = root.getPropertyResourceValue(TextVocab.pTokenizer);
-            String tokenizer = tokenizerResource.getLocalName();
+            String tokenizer = tokenizerResource.getURI();
             List<String> filters;
             if (root.hasProperty(TextVocab.pFilters)) {
                 Resource filtersResource = root.getPropertyResourceValue(TextVocab.pFilters);
@@ -82,7 +82,7 @@ public class ConfigurableAnalyzerAssembler extends AssemblerBase {
                 throw new TextIndexException("filter is not a resource : " + node);
             }
             
-            result.add(node.asResource().getLocalName());
+            result.add(node.asResource().getURI());
             stmt = current.getProperty(RDF.rest);
             if (stmt == null) {
                 throw new TextIndexException("filter list not terminated by rdf:nil");

http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineAnalyzersAssembler.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineAnalyzersAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineAnalyzersAssembler.java
index 11270e2..6326128 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineAnalyzersAssembler.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineAnalyzersAssembler.java
@@ -64,6 +64,7 @@ public class DefineAnalyzersAssembler {
                     throw new TextIndexException("addAnalyzers text:analyzer must be an analyzer spec resource: " + analyzerNode);
                 }
                 
+                // calls GenericAnalyzerAssembler
                 Analyzer analyzer = (Analyzer) a.open((Resource) analyzerNode);
                 
                 if (adding.hasProperty(TextVocab.pAddLang)) {
@@ -83,8 +84,6 @@ public class DefineAnalyzersAssembler {
                         throw new TextIndexException("addAnalyzers text:defineAnalyzer property must be a non-blank resource: " + adding);
                     }
                 }
-            } else {
-                throw new TextIndexException("text:analyzer property is required when adding an analyzer: " + adding);
             }
             
             Statement restStmt = current.getProperty(RDF.rest);

http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineFiltersAssembler.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineFiltersAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineFiltersAssembler.java
new file mode 100644
index 0000000..b7d1c63
--- /dev/null
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineFiltersAssembler.java
@@ -0,0 +1,103 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.query.text.assembler;
+
+import org.apache.jena.assembler.Assembler;
+import org.apache.jena.atlas.logging.Log;
+import org.apache.jena.query.text.TextIndexException;
+import org.apache.jena.query.text.analyzer.ConfigurableAnalyzer;
+import org.apache.jena.rdf.model.RDFNode;
+import org.apache.jena.rdf.model.Resource;
+import org.apache.jena.rdf.model.Statement;
+import org.apache.jena.vocabulary.RDF;
+import org.apache.jena.query.text.assembler.GenericFilterAssembler.FilterSpec;
+
+public class DefineFiltersAssembler {
+    /*
+    <#indexLucene> a text:TextIndexLucene ;
+        text:directory <file:Lucene> ;
+        text:entityMap <#entMap> ;
+        text:defineAnalyzers (
+            [text:addLang "sa-x-iast" ;
+             text:analyzer [ . . . ]]
+            [text:defineAnalyzer <#foo> ;
+             text:analyzer [ . . . ]]
+            [text:defineFilter <#bar> ;
+             text:filter [ . . . ]]
+            [text:defineTokenizer <#baz> ;
+             text:tokenizer [ . . . ]]
+        )
+    */
+    /** Walks the RDF list, registering each text:defineFilter spec with ConfigurableAnalyzer; returns true, throws TextIndexException on a malformed list. */
+    public static boolean open(Assembler a, Resource list) {
+        Resource current = list;
+        
+        while (current != null && ! current.equals(RDF.nil)){
+            Statement firstStmt = current.getProperty(RDF.first);
+            if (firstStmt == null) {
+                throw new TextIndexException("parameter list not well formed: " + current);
+            }
+            
+            RDFNode first = firstStmt.getObject();
+            if (! first.isResource()) {
+                throw new TextIndexException("parameter specification must be an anon resource : " + first);
+            }
+
+            // process the current list element to define a filter; elements without text:filter are skipped
+            Resource adding = (Resource) first;
+            if (adding.hasProperty(TextVocab.pFilter)) {
+                Statement filterStmt = adding.getProperty(TextVocab.pFilter);
+                RDFNode filterNode = filterStmt.getObject();
+                if (!filterNode.isResource()) {
+                    throw new TextIndexException("addFilters text:filter must be a filter spec resource: " + filterNode);
+                }
+                
+                // calls GenericFilterAssembler
+                FilterSpec filterSpec = (FilterSpec) a.open((Resource) filterNode);
+                
+                if (adding.hasProperty(TextVocab.pDefFilter)) {
+                    Statement defStmt = adding.getProperty(TextVocab.pDefFilter);
+                    Resource id = defStmt.getResource();
+                    
+                    if (id.getURI() != null) {
+                        ConfigurableAnalyzer.defineFilter(id.getURI(), filterSpec);
+                    } else {
+                        throw new TextIndexException("addFilters text:defineFilter property must be a non-blank resource: " + adding);
+                    }
+                } else {
+                    Log.warn("DefineFiltersAssembler", "Filter specified but no text:defineFilter so filter is not accessible!");
+                }
+            }
+            
+            Statement restStmt = current.getProperty(RDF.rest);
+            if (restStmt == null) {
+                throw new TextIndexException("parameter list not terminated by rdf:nil");
+            }
+            
+            RDFNode rest = restStmt.getObject();
+            if (! rest.isResource()) {
+                throw new TextIndexException("parameter list node is not a resource : " + rest);
+            }
+            
+            current = (Resource) rest;
+        }
+        
+        return true;
+    }
+}

http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineTokenizersAssembler.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineTokenizersAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineTokenizersAssembler.java
new file mode 100644
index 0000000..504e975
--- /dev/null
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineTokenizersAssembler.java
@@ -0,0 +1,100 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.query.text.assembler;
+
+import org.apache.jena.assembler.Assembler;
+import org.apache.jena.query.text.TextIndexException;
+import org.apache.jena.query.text.analyzer.ConfigurableAnalyzer;
+import org.apache.jena.query.text.assembler.GenericTokenizerAssembler.TokenizerSpec;
+import org.apache.jena.rdf.model.RDFNode;
+import org.apache.jena.rdf.model.Resource;
+import org.apache.jena.rdf.model.Statement;
+import org.apache.jena.vocabulary.RDF;
+
+public class DefineTokenizersAssembler {
+    /*
+    <#indexLucene> a text:TextIndexLucene ;
+        text:directory <file:Lucene> ;
+        text:entityMap <#entMap> ;
+        text:defineAnalyzers (
+            [text:addLang "sa-x-iast" ;
+             text:analyzer [ . . . ]]
+            [text:defineAnalyzer <#foo> ;
+             text:analyzer [ . . . ]]
+            [text:defineFilter <#bar> ;
+             text:filter [ . . . ]]
+            [text:defineTokenizer <#baz> ;
+             text:tokenizer [ . . . ]]
+        )
+    */
+    /** Walks the RDF list, registering each text:defineTokenizer spec with ConfigurableAnalyzer; returns false, throws TextIndexException on a malformed list. */
+    public static boolean open(Assembler a, Resource list) {
+        Resource current = list;
+        
+        while (current != null && ! current.equals(RDF.nil)){
+            Statement firstStmt = current.getProperty(RDF.first);
+            if (firstStmt == null) {
+                throw new TextIndexException("parameter list not well formed: " + current);
+            }
+            
+            RDFNode first = firstStmt.getObject();
+            if (! first.isResource()) {
+                throw new TextIndexException("parameter specification must be an anon resource : " + first);
+            }
+
+            // process the current list element to define a tokenizer; elements without text:tokenizer are skipped
+            Resource adding = (Resource) first;
+            if (adding.hasProperty(TextVocab.pTokenizer)) {
+                Statement tokenizerStmt = adding.getProperty(TextVocab.pTokenizer);
+                RDFNode tokenizerNode = tokenizerStmt.getObject();
+                if (!tokenizerNode.isResource()) {
+                    throw new TextIndexException("addTokenizers text:tokenizer must be an tokenizer spec resource: " + tokenizerNode);
+                }
+                
+                TokenizerSpec spec = (TokenizerSpec) a.open((Resource) tokenizerNode);
+                
+                if (adding.hasProperty(TextVocab.pDefTokenizer)) {
+                    Statement defStmt = adding.getProperty(TextVocab.pDefTokenizer);
+                    Resource id = defStmt.getResource();
+                    
+                    if (id.getURI() != null) {
+                        ConfigurableAnalyzer.defineTokenizer(id.getURI(), spec);
+                    } else {
+                        throw new TextIndexException("addTokenizers text:defineTokenizer property must be a non-blank resource: " + adding);
+                    }
+                }
+            }
+            
+            Statement restStmt = current.getProperty(RDF.rest);
+            if (restStmt == null) {
+                throw new TextIndexException("parameter list not terminated by rdf:nil");
+            }
+            
+            RDFNode rest = restStmt.getObject();
+            if (! rest.isResource()) {
+                throw new TextIndexException("parameter list node is not a resource : " + rest);
+            }
+            
+            current = (Resource) rest;
+        }
+        
+        // the result was a local that was never set after initialisation, so it is a constant false
+        return false;
+    }
+}

http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericAnalyzerAssembler.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericAnalyzerAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericAnalyzerAssembler.java
index 21f4be1..3cf2004 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericAnalyzerAssembler.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericAnalyzerAssembler.java
@@ -18,10 +18,8 @@
 
 package org.apache.jena.query.text.assembler;
 
-import java.io.Reader;
 import java.lang.reflect.Constructor;
 import java.lang.reflect.InvocationTargetException;
-import java.util.ArrayList;
 import java.util.List;
 
 import org.apache.jena.assembler.Assembler;
@@ -29,13 +27,9 @@ import org.apache.jena.assembler.Mode;
 import org.apache.jena.assembler.assemblers.AssemblerBase;
 import org.apache.jena.atlas.logging.Log ;
 import org.apache.jena.query.text.TextIndexException;
-import org.apache.jena.rdf.model.Literal;
 import org.apache.jena.rdf.model.RDFNode;
 import org.apache.jena.rdf.model.Resource;
-import org.apache.jena.rdf.model.Statement;
-import org.apache.jena.vocabulary.RDF;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CharArraySet;
 
 /**
  * Creates generic analyzers given a fully qualified Class name and a list
@@ -142,13 +136,6 @@ public class GenericAnalyzerAssembler extends AssemblerBase {
            ] .
      */
 
-    public static final String TYPE_ANALYZER   = "TypeAnalyzer";
-    public static final String TYPE_BOOL       = "TypeBoolean";
-    public static final String TYPE_FILE       = "TypeFile";
-    public static final String TYPE_INT        = "TypeInt";
-    public static final String TYPE_SET        = "TypeSet";
-    public static final String TYPE_STRING     = "TypeString";
-
     @Override
     public Analyzer open(Assembler a, Resource root, Mode mode) {
         if (root.hasProperty(TextVocab.pClass)) {
@@ -176,13 +163,13 @@ public class GenericAnalyzerAssembler extends AssemblerBase {
                     throw new TextIndexException("text:params must be a list of parameter resources: " + node);
                 }
 
-                List<ParamSpec> specs = getParamSpecs((Resource) node);
+                List<Params.ParamSpec> specs = Params.getParamSpecs((Resource) node);
 
                 // split the param specs into classes and values for constructor lookup
                 final Class<?> paramClasses[] = new Class<?>[specs.size()];
                 final Object paramValues[] = new Object[specs.size()];
                 for (int i = 0; i < specs.size(); i++) {
-                    ParamSpec spec = specs.get(i);
+                    Params.ParamSpec spec = specs.get(i);
                     paramClasses[i] = spec.getValueClass();
                     paramValues[i] = spec.getValue();
                 }
@@ -224,215 +211,4 @@ public class GenericAnalyzerAssembler extends AssemblerBase {
 
         return null;
     }
-
-    private List<ParamSpec> getParamSpecs(Resource list) {
-        List<ParamSpec> result = new ArrayList<>();
-        Resource current = list;
-
-        while (current != null && ! current.equals(RDF.nil)){
-            Statement firstStmt = current.getProperty(RDF.first);
-            if (firstStmt == null) {
-                throw new TextIndexException("parameter list not well formed: " + current);
-            }
-
-            RDFNode first = firstStmt.getObject();
-            if (! first.isResource()) {
-                throw new TextIndexException("parameter specification must be an anon resource : " + first);
-            }
-
-            result.add(getParamSpec((Resource) first));
-
-            Statement restStmt = current.getProperty(RDF.rest);
-            if (restStmt == null) {
-                throw new TextIndexException("parameter list not terminated by rdf:nil");
-            }
-
-            RDFNode rest = restStmt.getObject();
-            if (! rest.isResource()) {
-                throw new TextIndexException("parameter list node is not a resource : " + rest);
-            }
-
-            current = (Resource) rest;
-        }
-
-        return result;
-    }
-
-    private ParamSpec getParamSpec(Resource node) {
-        Statement nameStmt = node.getProperty(TextVocab.pParamName);
-        Statement typeStmt = node.getProperty(TextVocab.pParamType);
-        Statement valueStmt = node.getProperty(TextVocab.pParamValue);
-        
-        if (typeStmt == null) {
-            throw new TextIndexException("Parameter specification must have a text:paramType: " + node);
-        }        
-        Resource typeRes = typeStmt.getResource();
-        String type = typeRes.getLocalName();
-
-        String name = getStringValue(nameStmt);
-        String value = getStringValue(valueStmt);
-
-        switch (type) {
-
-        // String
-        case TYPE_STRING: {
-            if (value == null) {
-                throw new TextIndexException("Value for string param: " + name + " must not be empty!");
-            }
-
-            return new ParamSpec(name, value, String.class);
-        }
-
-        // java.io.FileReader
-        case TYPE_FILE: {
-
-            if (value == null) {
-                throw new TextIndexException("Value for file param must exist and must contain a file name.");
-            }
-
-            try {
-                // The analyzer is responsible for closing the file
-                Reader fileReader = new java.io.FileReader(value);
-                return new ParamSpec(name, fileReader, Reader.class);
-
-            } catch (java.io.FileNotFoundException ex) {
-                throw new TextIndexException("File " + value + " for param " + name + " not found!");
-            }
-        }
-
-        // org.apache.lucene.analysis.util.CharArraySet
-        case TYPE_SET: {
-            if (valueStmt == null) {
-                throw new TextIndexException("A set param spec must have a text:paramValue:" + node);
-            }
-
-            RDFNode valueNode = valueStmt.getObject();
-            if (!valueNode.isResource()) {
-                throw new TextIndexException("A set param spec text:paramValue must be a list of strings: " + valueNode);
-            }
-
-            List<String> values = toStrings((Resource) valueNode);
-
-            return new ParamSpec(name, new CharArraySet(values, false), CharArraySet.class);
-        }
-
-        // int
-        case TYPE_INT:
-            if (value == null) {
-                throw new TextIndexException("Value for int param: " + name + " must not be empty!");
-            }
-
-            int n = ((Literal) valueStmt.getObject()).getInt();
-            return new ParamSpec(name, n, int.class);
-
-            // boolean
-        case TYPE_BOOL:
-            if (value == null) {
-                throw new TextIndexException("Value for boolean param: " + name + " must not be empty!");
-            }
-
-            boolean b = ((Literal) valueStmt.getObject()).getBoolean();
-            return new ParamSpec(name, b, boolean.class);
-
-            // org.apache.lucene.analysis.Analyzer
-        case TYPE_ANALYZER:
-            if (valueStmt == null) {
-                throw new TextIndexException("Analyzer param spec must have a text:paramValue:" + node);
-            }
-
-            RDFNode valueNode = valueStmt.getObject();
-            if (!valueNode.isResource()) {
-                throw new TextIndexException("Analyzer param spec text:paramValue must be an analyzer spec resource: " + valueNode);
-            }
-
-            Analyzer analyzer = (Analyzer) Assembler.general.open((Resource) valueNode);
-            return new ParamSpec(name, analyzer, Analyzer.class);
-
-        default:
-            // there was no match
-            Log.error(this, "Unknown parameter type: " + type + " for param: " + name + " with value: " + value);
-            break;
-        }
-
-        return null;
-    }
-
-    private String getStringValue(Statement stmt) {
-        if (stmt == null) {
-            return null;
-        } else {
-            RDFNode node = stmt.getObject();
-            if (node.isLiteral()) {
-                return ((Literal) node).getLexicalForm();
-            } else {
-                return null;
-            }
-        }
-    }
-
-    private List<String> toStrings(Resource list) {
-        List<String> result = new ArrayList<>();
-        Resource current = list;
-
-        while (current != null && ! current.equals(RDF.nil)){
-            Statement firstStmt = current.getProperty(RDF.first);
-            if (firstStmt == null) {
-                throw new TextIndexException("param spec of type set not well formed");
-            }
-
-            RDFNode first = firstStmt.getObject();
-            if (! first.isLiteral()) {
-                throw new TextIndexException("param spec of type set item is not a literal: " + first);
-            }
-
-            result.add(((Literal)first).getLexicalForm());
-
-            Statement restStmt = current.getProperty(RDF.rest);
-            if (restStmt == null) {
-                throw new TextIndexException("param spec of type set not terminated by rdf:nil");
-            }
-
-            RDFNode rest = restStmt.getObject();
-            if (! rest.isResource()) {
-                throw new TextIndexException("param spec of type set rest is not a resource: " + rest);
-            }
-
-            current = (Resource) rest;
-        }
-
-        return result;
-    }
-
-    /**
-     * <code>ParamSpec</code> contains the <code>name</code>, <code>Class</code>, and 
-     * <code>value</code> of a parameter for a constructor (or really any method in general)
-     */
-    private static final class ParamSpec {
-
-        private final String name;
-        private final Object value;
-        private final Class<?> clazz;
-
-        public ParamSpec(String key, Object value) {
-            this(key, value, value.getClass());
-        }
-
-        public ParamSpec(String key, Object value, Class<?> clazz) {
-            this.name = key;
-            this.value = value;
-            this.clazz = clazz;
-        }
-
-        public String getKey() {
-            return name;
-        }
-
-        public Object getValue() {
-            return value;
-        }
-
-        public Class<?> getValueClass() {
-            return clazz;
-        }
-    }
 }

http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericFilterAssembler.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericFilterAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericFilterAssembler.java
new file mode 100644
index 0000000..245f3f9
--- /dev/null
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericFilterAssembler.java
@@ -0,0 +1,199 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.query.text.assembler;
+
+import java.util.List;
+
+import org.apache.jena.assembler.Assembler;
+import org.apache.jena.assembler.Mode;
+import org.apache.jena.assembler.assemblers.AssemblerBase;
+import org.apache.jena.atlas.logging.Log ;
+import org.apache.jena.query.text.TextIndexException;
+import org.apache.jena.query.text.assembler.Params.ParamSpec;
+import org.apache.jena.rdf.model.RDFNode;
+import org.apache.jena.rdf.model.Resource;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Creates generic filters given a fully qualified Class name and a list
+ * of parameters for a constructor of the Class.
+ * <p>
+ * The parameters may be of the following types:
+ * <pre>
+ *     text:TypeString        String
+ *     text:TypeSet           org.apache.lucene.analysis.CharArraySet
+ *     text:TypeFile          java.io.FileReader
+ *     text:TypeInt           int
+ *     text:TypeBoolean       boolean
+ *     text:TypeTokenStream   TokenStream
+ *     text:TypeAnalyzer      org.apache.lucene.analysis.Analyzer
+ * </pre>
+ * 
+ * Although the list of types is not exhaustive it is a simple matter
+ * to create a wrapper Analyzer that reads a file with information that can
+ * be used to initialize any sort of parameters that may be needed for
+ * a given Analyzer. The provided types cover the vast majority of cases.
+ * <p>
+ * For example, <code>org.apache.lucene.analysis.ja.JapaneseAnalyzer</code>
+ * has a constructor with 4 parameters: a <code>UserDict</code>,
+ * a <code>CharArraySet</code>, a <code>JapaneseTokenizer.Mode</code>, and a 
+ * <code>Set&lt;String></code>. So a simple wrapper can extract the values
+ * needed for the various parameters with types not available in this
+ * extension, construct the required instances, and instantiate the
+ * <code>JapaneseAnalyzer</code>.
+ * <p>
+ * Adding custom Analyzers such as the above wrapper analyzer is a simple
+ * matter of adding the Analyzer class and any associated filters and tokenizer
+ * and so on to the classpath for Jena - usually in a jar. Of course, all of 
+ * the Analyzers that are included in the Lucene distribution bundled with Jena
+ * are available as generic Analyzers as well.
+ * <p>
+ * Each parameter object is specified with:
+ * <ul>
+ * <li>an optional <code>text:paramName</code> that may be used to document which 
+ * parameter is represented</li>
+ * <li>a <code>text:paramType</code> which is one of: <code>text:TypeString</code>, 
+ * <code>text:TypeSet</code>, <code>text:TypeFile</code>, <code>text:TypeInt</code>, 
+ * <code>text:TypeBoolean</code>, <code>text:TypeAnalyzer</code>.</li>
+ * <li>a text:paramValue which is an xsd:string, xsd:boolean or xsd:int or resource.</li>
+ * </ul>
+ * <p>
+ * A parameter of type <code>text:TypeSet</code> <i>must have</i> a list of zero or 
+ * more <code>String</code>s.
+ * <p>
+ * A parameter of type <code>text:TypeString</code>, <code>text:TypeFile</code>, 
+ * <code>text:TypeBoolean</code>, <code>text:TypeInt</code> or <code>text:TypeAnalyzer</code> 
+ * <i>must have</i> a single <code>text:paramValue</code> of the appropriate type.
+ * <p>
+ * Examples:
+ * <pre>
+    <#indexLucene> a text:TextIndexLucene ;
+        text:directory <file:Lucene> ;
+        text:entityMap <#entMap> ;
+        text:defineAnalyzers (
+            [text:addLang "sa-x-iast" ;
+             text:analyzer [ . . . ]]
+            [text:defineAnalyzer <#foo> ;
+             text:analyzer [ . . . ]]
+            [text:defineFilter <#bar> ;
+             text:filter [
+               a text:GenericFilter ;
+               text:class "org.apache.jena.query.text.filter.SelectiveFoldingFilter" ;
+               text:params (
+                    [ text:paramName "whitelisted" ;
+                      text:paramType text:TypeSet ;
+                      text:paramValue ("ç") ]
+                    )
+              ]
+            ]
+        )
+ * </pre>
+ */
+public class GenericFilterAssembler extends AssemblerBase {
+    /*
+    <#indexLucene> a text:TextIndexLucene ;
+        text:directory <file:Lucene> ;
+        text:entityMap <#entMap> ;
+        text:defineAnalyzers (
+            [text:addLang "sa-x-iast" ;
+             text:analyzer [ . . . ]]
+            [text:defineAnalyzer <#foo> ;
+             text:analyzer [ . . . ]]
+            [text:defineFilter <#bar> ;
+             text:filter [
+               a text:GenericFilter ;
+               text:class "org.apache.jena.query.text.filter.SelectiveFoldingFilter" ;
+               text:params (
+                    [ text:paramName "whitelisted" ;
+                      text:paramType text:TypeSet ;
+                      text:paramValue ("ç") ]
+                    )
+              ]
+            ]
+        )
+     */
+
+    @Override
+    public FilterSpec open(Assembler a, Resource root, Mode mode) {
+        if (root.hasProperty(TextVocab.pClass)) {
+            // text:class is expected to be a string literal
+            String className = root.getProperty(TextVocab.pClass).getString();
+
+            // is the class accessible?
+            Class<?> clazz = null;
+            try {
+                clazz = Class.forName(className);
+            } catch (ClassNotFoundException e) {
+                Log.error(this, "Filter class " + className + " not found. " + e.getMessage(), e);
+                return null;
+            }
+
+            // Is the class a TokenFilter?
+            if (!TokenFilter.class.isAssignableFrom(clazz)) {
+                Log.error(this, clazz.getName() + " has to be a subclass of " + TokenFilter.class.getName());
+                return null;
+            }
+
+            if (root.hasProperty(TextVocab.pParams)) {
+                RDFNode node = root.getProperty(TextVocab.pParams).getObject();
+                if (! node.isResource()) {
+                    throw new TextIndexException("text:params must be a list of parameter resources: " + node);
+                }
+
+                List<ParamSpec> specs = Params.getParamSpecs((Resource) node);
+
+                // split the param specs into classes and values for constructor lookup
+                // add an initial param for the TokenStream source. The source value is
+                // set to null and the actual value supplied in ConfigurableAnalyzer when
+                // used.
+                final Class<?> paramClasses[] = new Class<?>[specs.size()+1];
+                paramClasses[0] = TokenStream.class;
+                final Object paramValues[] = new Object[specs.size()+1];
+                paramValues[0] = null;
+                for (int i = 0; i < specs.size(); i++) {
+                    ParamSpec spec = specs.get(i);
+                    paramClasses[i+1] = spec.getValueClass();
+                    paramValues[i+1] = spec.getValue();
+                }
+
+                // Create spec for new filter
+                return new FilterSpec(clazz, paramClasses, paramValues);
+
+            } else {
+                // use the TokenStream constructor for the new filter
+                return new FilterSpec(clazz, new Class<?>[] { TokenStream.class }, new Object[] { null });
+            }
+        } else {
+            throw new TextIndexException("text:class property is required by GenericFilter: " + root);
+        }
+    }
+    
+    public static class FilterSpec {
+        public Class<?> clazz;
+        public Class<?>[] paramClasses;
+        public Object[] paramValues;
+        
+        public FilterSpec(Class<?> clazz, Class<?>[] paramClasses, Object[] paramValues) {
+            this.clazz = clazz;
+            this.paramClasses = paramClasses;
+            this.paramValues = paramValues;
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericTokenizerAssembler.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericTokenizerAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericTokenizerAssembler.java
new file mode 100644
index 0000000..2e2b39b
--- /dev/null
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/GenericTokenizerAssembler.java
@@ -0,0 +1,198 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.query.text.assembler;
+
+import java.util.List;
+
+import org.apache.jena.assembler.Assembler;
+import org.apache.jena.assembler.Mode;
+import org.apache.jena.assembler.assemblers.AssemblerBase;
+import org.apache.jena.atlas.logging.Log ;
+import org.apache.jena.query.text.TextIndexException;
+import org.apache.jena.query.text.assembler.Params.ParamSpec;
+import org.apache.jena.rdf.model.RDFNode;
+import org.apache.jena.rdf.model.Resource;
+import org.apache.lucene.analysis.Tokenizer;
+
+/**
+ * Creates generic tokenizers given a fully qualified Class name and a list
+ * of parameters for a constructor of the Class.
+ * <p>
+ * The parameters may be of the following types:
+ * <pre>
+ *     text:TypeString    String
+ *     text:TypeSet       org.apache.lucene.analysis.CharArraySet
+ *     text:TypeFile      java.io.FileReader
+ *     text:TypeInt       int
+ *     text:TypeBoolean   boolean
+ *     text:TypeAnalyzer  org.apache.lucene.analysis.Analyzer
+ * </pre>
+ * 
+ * Although the list of types is not exhaustive it is a simple matter
+ * to create a wrapper Analyzer that reads a file with information that can
+ * be used to initialize any sort of parameters that may be needed for
+ * a given Analyzer. The provided types cover the vast majority of cases.
+ * <p>
+ * For example, <code>org.apache.lucene.analysis.ja.JapaneseAnalyzer</code>
+ * has a constructor with 4 parameters: a <code>UserDict</code>,
+ * a <code>CharArraySet</code>, a <code>JapaneseTokenizer.Mode</code>, and a 
+ * <code>Set&lt;String></code>. So a simple wrapper can extract the values
+ * needed for the various parameters with types not available in this
+ * extension, construct the required instances, and instantiate the
+ * <code>JapaneseAnalyzer</code>.
+ * <p>
+ * Adding custom Analyzers such as the above wrapper analyzer is a simple
+ * matter of adding the Analyzer class and any associated filters and tokenizer
+ * and so on to the classpath for Jena - usually in a jar. Of course, all of 
+ * the Analyzers that are included in the Lucene distribution bundled with Jena
+ * are available as generic Analyzers as well.
+ * <p>
+ * Each parameter object is specified with:
+ * <ul>
+ * <li>an optional <code>text:paramName</code> that may be used to document which 
+ * parameter is represented</li>
+ * <li>a <code>text:paramType</code> which is one of: <code>text:TypeString</code>, 
+ * <code>text:TypeSet</code>, <code>text:TypeFile</code>, <code>text:TypeInt</code>, 
+ * <code>text:TypeBoolean</code>, <code>text:TypeAnalyzer</code>.</li>
+ * <li>a text:paramValue which is an xsd:string, xsd:boolean or xsd:int or resource.</li>
+ * </ul>
+ * <p>
+ * A parameter of type <code>text:TypeSet</code> <i>must have</i> a list of zero or 
+ * more <code>String</code>s.
+ * <p>
+ * A parameter of type <code>text:TypeString</code>, <code>text:TypeFile</code>, 
+ * <code>text:TypeBoolean</code>, <code>text:TypeInt</code> or <code>text:TypeAnalyzer</code> 
+ * <i>must have</i> a single <code>text:paramValue</code> of the appropriate type.
+ * <p>
+ * Examples:
+ * <pre>
+    <#indexLucene> a text:TextIndexLucene ;
+        text:directory <file:Lucene> ;
+        text:entityMap <#entMap> ;
+        text:defineAnalyzers (
+            [text:addLang "sa-x-iast" ;
+             text:analyzer [ . . . ]]
+            [text:defineAnalyzer <#foo> ;
+             text:analyzer [ . . . ]]
+            [text:defineTokenizer <#bar> ;
+             text:tokenizer [
+               a text:GenericTokenizer ;
+               text:class "org.apache.lucene.analysis.ngram.NGramTokenizer" ;
+               text:params (
+                    [ text:paramName "minGram" ;
+                      text:paramType text:TypeInt ;
+                      text:paramValue 3 ]
+                    [ text:paramName "maxGram" ;
+                      text:paramType text:TypeInt ;
+                      text:paramValue 7 ]
+                    )
+              ]
+            ]
+        )
+ * </pre>
+ */
+public class GenericTokenizerAssembler extends AssemblerBase {
+    /*
+    <#indexLucene> a text:TextIndexLucene ;
+        text:directory <file:Lucene> ;
+        text:entityMap <#entMap> ;
+        text:defineAnalyzers (
+            [text:addLang "sa-x-iast" ;
+             text:analyzer [ . . . ]]
+            [text:defineAnalyzer <#foo> ;
+             text:analyzer [ . . . ]]
+            [text:defineTokenizer <#bar> ;
+             text:tokenizer [
+               a text:GenericTokenizer ;
+               text:class "org.apache.lucene.analysis.ngram.NGramTokenizer" ;
+               text:params (
+                    [ text:paramName "minGram" ;
+                      text:paramType text:TypeInt ;
+                      text:paramValue 3 ]
+                    [ text:paramName "maxGram" ;
+                      text:paramType text:TypeInt ;
+                      text:paramValue 7 ]
+                    )
+              ]
+            ]
+        )
+     */
+
+    @Override
+    public TokenizerSpec open(Assembler a, Resource root, Mode mode) {
+        if (root.hasProperty(TextVocab.pClass)) {
+            // text:class is expected to be a string literal
+            String className = root.getProperty(TextVocab.pClass).getString();
+
+            // is the class accessible?
+            Class<?> clazz = null;
+            try {
+                clazz = Class.forName(className);
+            } catch (ClassNotFoundException e) {
+                Log.error(this, "Tokenizer class " + className + " not found. " + e.getMessage(), e);
+                return null;
+            }
+
+            // Is the class a Tokenizer?
+            if (!Tokenizer.class.isAssignableFrom(clazz)) {
+                Log.error(this, clazz.getName() + " has to be a subclass of " + Tokenizer.class.getName());
+                return null;
+            }
+
+            if (root.hasProperty(TextVocab.pParams)) {
+                RDFNode node = root.getProperty(TextVocab.pParams).getObject();
+                if (! node.isResource()) {
+                    throw new TextIndexException("text:params must be a list of parameter resources: " + node);
+                }
+
+                List<ParamSpec> specs = Params.getParamSpecs((Resource) node);
+
+                // split the param specs into classes and values for constructor lookup
+                final Class<?> paramClasses[] = new Class<?>[specs.size()];
+                final Object paramValues[] = new Object[specs.size()];
+                for (int i = 0; i < specs.size(); i++) {
+                    ParamSpec spec = specs.get(i);
+                    paramClasses[i] = spec.getValueClass();
+                    paramValues[i] = spec.getValue();
+                }
+
+                // Create spec for new tokenizer
+                return new TokenizerSpec(clazz, paramClasses, paramValues);
+
+            } else {
+                // use the nullary Tokenizer constructor
+                return new TokenizerSpec(clazz, new Class<?>[0], new Object[0]);
+            }
+        } else {
+            throw new TextIndexException("text:class property is required by GenericTokenizer: " + root);
+        }
+    }
+    
+    public static class TokenizerSpec {
+        public Class<?> clazz;
+        public Class<?>[] paramClasses;
+        public Object[] paramValues;
+        
+        public TokenizerSpec(Class<?> clazz, Class<?>[] paramClasses, Object[] paramValues) {
+            this.clazz = clazz;
+            this.paramClasses = paramClasses;
+            this.paramValues = paramValues;
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/main/java/org/apache/jena/query/text/assembler/Params.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/Params.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/Params.java
new file mode 100644
index 0000000..7b0cd18
--- /dev/null
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/Params.java
@@ -0,0 +1,362 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.query.text.assembler;
+
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.jena.assembler.Assembler;
+import org.apache.jena.atlas.logging.Log ;
+import org.apache.jena.query.text.TextIndexException;
+import org.apache.jena.rdf.model.Literal;
+import org.apache.jena.rdf.model.RDFNode;
+import org.apache.jena.rdf.model.Resource;
+import org.apache.jena.rdf.model.Statement;
+import org.apache.jena.vocabulary.RDF;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+
+/**
+ * Parses assembler parameter definitions for <code>GenericAnalyzer</code>, 
+ * <code>GenericFilter</code>, and <code>GenericTokenizer</code>.
+ * <p>
+ * The parameters may be of the following types:
+ * <pre>
+ *     text:TypeString        String
+ *     text:TypeSet           org.apache.lucene.analysis.CharArraySet
+ *     text:TypeFile          java.io.FileReader
+ *     text:TypeInt           int
+ *     text:TypeBoolean       boolean
+ *     text:TypeAnalyzer      org.apache.lucene.analysis.Analyzer
+ *     text:TypeTokenStream   org.apache.lucene.analysis.TokenStream
+ * </pre>
+ * 
+ * Although the list of types is not exhaustive it is a simple matter
+ * to create a wrapper Analyzer, Filter, or Tokenizer that reads a file with information
+ * that can be used to initialize any sort of parameters that may be needed. 
+ * The provided types cover the vast majority of cases.
+ * <p>
+ * For example, <code>org.apache.lucene.analysis.ja.JapaneseAnalyzer</code>
+ * has a constructor with 4 parameters: a <code>UserDict</code>,
+ * a <code>CharArraySet</code>, a <code>JapaneseTokenizer.Mode</code>, and a 
+ * <code>Set&lt;String></code>. So a simple wrapper can extract the values
+ * needed for the various parameters with types not available in this
+ * extension, construct the required instances, and instantiate the
+ * <code>JapaneseAnalyzer</code>.
+ * <p>
+ * Adding custom Analyzers, etc., such as the above wrapper analyzer is a simple
+ * matter of adding the Analyzer class and any associated filters and tokenizer
+ * and so on to the classpath for Jena - usually in a jar. Of course, all of 
+ * the Analyzers, Filters, and Tokenizers that are included in the Lucene distribution 
+ * bundled with Jena are available as generics as well.
+ * <p>
+ * Each parameter object is specified with:
+ * <ul>
+ * <li>an optional <code>text:paramName</code> that may be used to document which 
+ * parameter is represented</li>
+ * <li>a <code>text:paramType</code> which is one of: <code>text:TypeString</code>, 
+ * <code>text:TypeSet</code>, <code>text:TypeFile</code>, <code>text:TypeInt</code>, 
+ * <code>text:TypeBoolean</code>, <code>text:TypeAnalyzer</code>.</li>
+ * <li>a text:paramValue which is an xsd:string, xsd:boolean or xsd:int or resource.</li>
+ * </ul>
+ * <p>
+ * A parameter of type <code>text:TypeSet</code> <i>must have</i> a list of zero or 
+ * more <code>String</code>s.
+ * <p>
+ * A parameter of type <code>text:TypeString</code>, <code>text:TypeFile</code>, 
+ * <code>text:TypeBoolean</code>, <code>text:TypeInt</code> or <code>text:TypeAnalyzer</code> 
+ * <i>must have</i> a single <code>text:paramValue</code> of the appropriate type.
+ * <p>
+ * A parameter of type <code>text:TypeTokenStream</code> does not have <code>text:paramValue</code>.
+ * It is used to mark the occurrence of the <code>TokenStream</code> parameter for a <code>Filter</code>.
+ * <p>
+ * Examples:
+ * <pre>
+    text:map (
+         [ text:field "text" ; 
+           text:predicate rdfs:label;
+           text:analyzer [
+               a text:GenericAnalyzer ;
+               text:class "org.apache.lucene.analysis.en.EnglishAnalyzer" ;
+               text:params (
+                    [ text:paramName "stopwords" ;
+                      text:paramType text:TypeSet ;
+                      text:paramValue ("the" "a" "an") ]
+                    [ text:paramName "stemExclusionSet" ;
+                      text:paramType text:TypeSet ;
+                      text:paramValue ("ing" "ed") ]
+                    )
+           ] .
+ * </pre>
+ * <pre>
+    [] a text:TextIndexLucene ;
+       text:defineFilters (
+           text:filter [
+               a text:GenericFilter ;
+               text:class "fi.finto.FoldingFilter" ;
+               text:params (
+                    [ text:paramName "source" ;
+                      text:paramType text:TypeTokenStream ]
+                    [ text:paramName "whitelisted" ;
+                      text:paramType text:TypeSet ;
+                      text:paramValue ("ç") ]
+                    )
+           ]
+        )
+ * </pre>
+ */
+public class Params {
+    /*
+    text:map (
+         [ text:field "text" ; 
+           text:predicate rdfs:label;
+           text:analyzer [
+               a text:GenericAnalyzer ;
+               text:class "org.apache.lucene.analysis.en.EnglishAnalyzer" ;
+               text:params (
+                    [ text:paramName "stopwords" ;
+                      text:paramType text:TypeSet ;
+                      text:paramValue ("the" "a" "an") ]
+                    [ text:paramName "stemExclusionSet" ;
+                      text:paramType text:TypeSet ;
+                      text:paramValue ("ing" "ed") ]
+                    )
+           ] .
+     */
+
+    public static final String TYPE_ANALYZER    = "TypeAnalyzer";
+    public static final String TYPE_BOOL        = "TypeBoolean";
+    public static final String TYPE_FILE        = "TypeFile";
+    public static final String TYPE_INT         = "TypeInt";
+    public static final String TYPE_SET         = "TypeSet";
+    public static final String TYPE_STRING      = "TypeString";
+    public static final String TYPE_TOKENSTREAM = "TypeTokenStream";
+
+    protected static List<ParamSpec> getParamSpecs(Resource list) {
+        List<ParamSpec> result = new ArrayList<>();
+        Resource current = list;
+
+        while (current != null && ! current.equals(RDF.nil)){
+            Statement firstStmt = current.getProperty(RDF.first);
+            if (firstStmt == null) {
+                throw new TextIndexException("parameter list not well formed: " + current);
+            }
+
+            RDFNode first = firstStmt.getObject();
+            if (! first.isResource()) {
+                throw new TextIndexException("parameter specification must be an anon resource : " + first);
+            }
+
+            result.add(getParamSpec((Resource) first));
+
+            Statement restStmt = current.getProperty(RDF.rest);
+            if (restStmt == null) {
+                throw new TextIndexException("parameter list not terminated by rdf:nil");
+            }
+
+            RDFNode rest = restStmt.getObject();
+            if (! rest.isResource()) {
+                throw new TextIndexException("parameter list node is not a resource : " + rest);
+            }
+
+            current = (Resource) rest;
+        }
+
+        return result;
+    }
+
+    protected static ParamSpec getParamSpec(Resource node) {
+        Statement nameStmt = node.getProperty(TextVocab.pParamName);
+        Statement typeStmt = node.getProperty(TextVocab.pParamType);
+        Statement valueStmt = node.getProperty(TextVocab.pParamValue);
+        
+        if (typeStmt == null) {
+            throw new TextIndexException("Parameter specification must have a text:paramType: " + node);
+        }        
+        Resource typeRes = typeStmt.getResource();
+        String type = typeRes.getLocalName();
+
+        String name = getStringValue(nameStmt);
+        String value = getStringValue(valueStmt);
+
+        switch (type) {
+
+        // String
+        case TYPE_STRING: {
+            if (value == null) {
+                throw new TextIndexException("Value for string param: " + name + " must not be empty!");
+            }
+
+            return new ParamSpec(name, value, String.class);
+        }
+
+        // java.io.FileReader
+        case TYPE_FILE: {
+
+            if (value == null) {
+                throw new TextIndexException("Value for file param must exist and must contain a file name.");
+            }
+
+            try {
+                // The analyzer is responsible for closing the file
+                Reader fileReader = new java.io.FileReader(value);
+                return new ParamSpec(name, fileReader, Reader.class);
+
+            } catch (java.io.FileNotFoundException ex) {
+                throw new TextIndexException("File " + value + " for param " + name + " not found!");
+            }
+        }
+
+        // org.apache.lucene.analysis.CharArraySet
+        case TYPE_SET: {
+            if (valueStmt == null) {
+                throw new TextIndexException("A set param spec must have a text:paramValue:" + node);
+            }
+
+            RDFNode valueNode = valueStmt.getObject();
+            if (!valueNode.isResource()) {
+                throw new TextIndexException("A set param spec text:paramValue must be a list of strings: " + valueNode);
+            }
+
+            List<String> values = toStrings((Resource) valueNode);
+
+            return new ParamSpec(name, new CharArraySet(values, false), CharArraySet.class);
+        }
+
+        // int
+        case TYPE_INT:
+            if (value == null) {
+                throw new TextIndexException("Value for int param: " + name + " must not be empty!");
+            }
+
+            int n = ((Literal) valueStmt.getObject()).getInt();
+            return new ParamSpec(name, n, int.class);
+
+            // boolean
+        case TYPE_BOOL:
+            if (value == null) {
+                throw new TextIndexException("Value for boolean param: " + name + " must not be empty!");
+            }
+
+            boolean b = ((Literal) valueStmt.getObject()).getBoolean();
+            return new ParamSpec(name, b, boolean.class);
+
+            // org.apache.lucene.analysis.Analyzer
+        case TYPE_ANALYZER:
+            if (valueStmt == null) {
+                throw new TextIndexException("Analyzer param spec must have a text:paramValue:" + node);
+            }
+
+            RDFNode valueNode = valueStmt.getObject();
+            if (!valueNode.isResource()) {
+                throw new TextIndexException("Analyzer param spec text:paramValue must be an analyzer spec resource: " + valueNode);
+            }
+
+            Analyzer analyzer = (Analyzer) Assembler.general.open((Resource) valueNode);
+            return new ParamSpec(name, analyzer, Analyzer.class);
+
+        default:
+            // there was no match
+            Log.error("org.apache.jena.query.text.assembler.Params", "Unknown parameter type: " + type + " for param: " + name + " with value: " + value);
+            break;
+        }
+
+        return null;
+    }
+
+    private static String getStringValue(Statement stmt) {
+        if (stmt == null) {
+            return null;
+        } else {
+            RDFNode node = stmt.getObject();
+            if (node.isLiteral()) {
+                return ((Literal) node).getLexicalForm();
+            } else {
+                return null;
+            }
+        }
+    }
+
+    protected static List<String> toStrings(Resource list) {
+        List<String> result = new ArrayList<>();
+        Resource current = list;
+
+        while (current != null && ! current.equals(RDF.nil)){
+            Statement firstStmt = current.getProperty(RDF.first);
+            if (firstStmt == null) {
+                throw new TextIndexException("param spec of type set not well formed");
+            }
+
+            RDFNode first = firstStmt.getObject();
+            if (! first.isLiteral()) {
+                throw new TextIndexException("param spec of type set item is not a literal: " + first);
+            }
+
+            result.add(((Literal)first).getLexicalForm());
+
+            Statement restStmt = current.getProperty(RDF.rest);
+            if (restStmt == null) {
+                throw new TextIndexException("param spec of type set not terminated by rdf:nil");
+            }
+
+            RDFNode rest = restStmt.getObject();
+            if (! rest.isResource()) {
+                throw new TextIndexException("param spec of type set rest is not a resource: " + rest);
+            }
+
+            current = (Resource) rest;
+        }
+
+        return result;
+    }
+
+    /**
+     * <code>ParamSpec</code> contains the <code>name</code>, <code>Class</code>, and 
+     * <code>value</code> of a parameter for a constructor (or really any method in general)
+     */
+    protected static final class ParamSpec {
+
+        private final String name;
+        private final Object value;
+        private final Class<?> clazz;
+
+        public ParamSpec(String key, Object value) {
+            this(key, value, value.getClass());
+        }
+
+        public ParamSpec(String key, Object value, Class<?> clazz) {
+            this.name = key;
+            this.value = value;
+            this.clazz = clazz;
+        }
+
+        public String getKey() {
+            return name;
+        }
+
+        public Object getValue() {
+            return value;
+        }
+
+        public Class<?> getValueClass() {
+            return clazz;
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java
index 2a7b52e..013c20f 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java
@@ -37,6 +37,8 @@ public class TextAssembler
         Assembler.general.implementWith(TextVocab.localizedAnalyzer, new LocalizedAnalyzerAssembler()) ;
         Assembler.general.implementWith(TextVocab.configurableAnalyzer, new ConfigurableAnalyzerAssembler()) ;
         Assembler.general.implementWith(TextVocab.genericAnalyzer,  new GenericAnalyzerAssembler()) ;
+        Assembler.general.implementWith(TextVocab.genericFilter,    new GenericFilterAssembler()) ;
+        Assembler.general.implementWith(TextVocab.genericTokenizer,  new GenericTokenizerAssembler()) ;
         Assembler.general.implementWith(TextVocab.definedAnalyzer,  new DefinedAnalyzerAssembler()) ;
 
     }

http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneAssembler.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneAssembler.java
index ebaca4e..6b17603 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneAssembler.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexLuceneAssembler.java
@@ -73,28 +73,6 @@ public class TextIndexLuceneAssembler extends AssemblerBase {
                 File dir = new File(path) ;
                 directory = FSDirectory.open(dir.toPath()) ;
             }
-
-            Analyzer analyzer = null;
-            Statement analyzerStatement = root.getProperty(pAnalyzer);
-            if (null != analyzerStatement) {
-                RDFNode aNode = analyzerStatement.getObject();
-                if (! aNode.isResource()) {
-                    throw new TextIndexException("Text analyzer property is not a resource : " + aNode);
-                }
-                Resource analyzerResource = (Resource) aNode;
-                analyzer = (Analyzer) a.open(analyzerResource);
-            }
-
-            Analyzer queryAnalyzer = null;
-            Statement queryAnalyzerStatement = root.getProperty(pQueryAnalyzer);
-            if (null != queryAnalyzerStatement) {
-                RDFNode qaNode = queryAnalyzerStatement.getObject();
-                if (! qaNode.isResource()) {
-                    throw new TextIndexException("Text query analyzer property is not a resource : " + qaNode);
-                }
-                Resource analyzerResource = (Resource) qaNode;
-                queryAnalyzer = (Analyzer) a.open(analyzerResource);
-            }
             
             String queryParser = null;
             Statement queryParserStatement = root.getProperty(pQueryParser);
@@ -117,12 +95,18 @@ public class TextIndexLuceneAssembler extends AssemblerBase {
                 isMultilingualSupport = mlsNode.asLiteral().getBoolean();
             }
             
+            //define any filters and tokenizers first so they can be referenced in analyzer definitions if need be
             Statement defAnalyzersStatement = root.getProperty(pDefAnalyzers);
             if (null != defAnalyzersStatement) {
                 RDFNode aNode = defAnalyzersStatement.getObject();
                 if (! aNode.isResource()) {
                     throw new TextIndexException("text:defineAnalyzers property is not a resource (list) : " + aNode);
                 }
+                
+                DefineFiltersAssembler.open(a, (Resource) aNode);
+
+                DefineTokenizersAssembler.open(a, (Resource) aNode);
+
                 boolean addedLangs = DefineAnalyzersAssembler.open(a, (Resource) aNode);
                 // if the text:defineAnalyzers added any analyzers to lang tags then ensure that
                 // multilingual support is enabled
@@ -134,6 +118,30 @@ public class TextIndexLuceneAssembler extends AssemblerBase {
                 }
             }
 
+            // initialize default analyzer and query analyzer after processing all analyzer definitions
+            // so they can be referred to
+            Analyzer analyzer = null;
+            Statement analyzerStatement = root.getProperty(pAnalyzer);
+            if (null != analyzerStatement) {
+                RDFNode aNode = analyzerStatement.getObject();
+                if (! aNode.isResource()) {
+                    throw new TextIndexException("Text analyzer property is not a resource : " + aNode);
+                }
+                Resource analyzerResource = (Resource) aNode;
+                analyzer = (Analyzer) a.open(analyzerResource);
+            }
+
+            Analyzer queryAnalyzer = null;
+            Statement queryAnalyzerStatement = root.getProperty(pQueryAnalyzer);
+            if (null != queryAnalyzerStatement) {
+                RDFNode qaNode = queryAnalyzerStatement.getObject();
+                if (! qaNode.isResource()) {
+                    throw new TextIndexException("Text query analyzer property is not a resource : " + qaNode);
+                }
+                Resource analyzerResource = (Resource) qaNode;
+                queryAnalyzer = (Analyzer) a.open(analyzerResource);
+            }
+
             boolean storeValues = false;
             Statement storeValuesStatement = root.getProperty(pStoreValues);
             if (null != storeValuesStatement) {

http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java
index f41d0cc..187715a4 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java
@@ -43,6 +43,7 @@ public class TextVocab
     public static final Property pQueryParser       = Vocab.property(NS, "queryParser") ;
     public static final Property pEntityMap         = Vocab.property(NS, "entityMap") ;
     public static final Property pTokenizer         = Vocab.property(NS, "tokenizer") ;
+    public static final Property pFilter            = Vocab.property(NS, "filter") ;
     public static final Property pFilters           = Vocab.property(NS, "filters") ;
     
     // Entity definition
@@ -78,21 +79,24 @@ public class TextVocab
     public static final Resource lowerCaseFilter    = Vocab.resource(NS, "LowerCaseFilter");
     public static final Resource asciiFoldingFilter = Vocab.resource(NS, "ASCIIFoldingFilter");
 
+    // ElasticSearch
     public static final Property pServerList        = Vocab.property(NS, "serverList");
     public static final Property pClusterName       = Vocab.property(NS, "clusterName");
     public static final Property pShards            = Vocab.property(NS, "shards");
     public static final Property pReplicas          = Vocab.property(NS, "replicas");
-    public static final Property pIndexName          = Vocab.property(NS, "indexName");
+    public static final Property pIndexName         = Vocab.property(NS, "indexName");
 
-    //GenericAnalyzer
-    public static final Resource genericAnalyzer    = Vocab.resource(NS, "GenericAnalyzer");
+    //GenericAnalyzer, DefinedFilter, DefinedTokenizer
     public static final Resource definedAnalyzer    = Vocab.resource(NS, "DefinedAnalyzer");
-    public static final Resource typeAnalyzer       = Vocab.resource(NS, GenericAnalyzerAssembler.TYPE_ANALYZER);
-    public static final Resource typeBoolean        = Vocab.resource(NS, GenericAnalyzerAssembler.TYPE_BOOL);
-    public static final Resource typeFile           = Vocab.resource(NS, GenericAnalyzerAssembler.TYPE_FILE);
-    public static final Resource typeInt            = Vocab.resource(NS, GenericAnalyzerAssembler.TYPE_INT);
-    public static final Resource typeSet            = Vocab.resource(NS, GenericAnalyzerAssembler.TYPE_SET);
-    public static final Resource typeString         = Vocab.resource(NS, GenericAnalyzerAssembler.TYPE_STRING);
+    public static final Resource genericAnalyzer    = Vocab.resource(NS, "GenericAnalyzer");
+    public static final Resource genericFilter      = Vocab.resource(NS, "GenericFilter");
+    public static final Resource genericTokenizer   = Vocab.resource(NS, "GenericTokenizer");
+    public static final Resource typeAnalyzer       = Vocab.resource(NS, Params.TYPE_ANALYZER);
+    public static final Resource typeBoolean        = Vocab.resource(NS, Params.TYPE_BOOL);
+    public static final Resource typeFile           = Vocab.resource(NS, Params.TYPE_FILE);
+    public static final Resource typeInt            = Vocab.resource(NS, Params.TYPE_INT);
+    public static final Resource typeSet            = Vocab.resource(NS, Params.TYPE_SET);
+    public static final Resource typeString         = Vocab.resource(NS, Params.TYPE_STRING);
     public static final Property pClass             = Vocab.property(NS, "class");
     public static final Property pParams            = Vocab.property(NS, "params");
     public static final Property pParamName         = Vocab.property(NS, "paramName");
@@ -100,6 +104,8 @@ public class TextVocab
     public static final Property pParamValue        = Vocab.property(NS, "paramValue");
     public static final Property pDefAnalyzers      = Vocab.property(NS, "defineAnalyzers");
     public static final Property pDefAnalyzer       = Vocab.property(NS, "defineAnalyzer");
+    public static final Property pDefFilter         = Vocab.property(NS, "defineFilter");
+    public static final Property pDefTokenizer      = Vocab.property(NS, "defineTokenizer");
     public static final Property pAddLang           = Vocab.property(NS, "addLang");
     public static final Property pUseAnalyzer       = Vocab.property(NS, "useAnalyzer");
     

http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java
----------------------------------------------------------------------
diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java b/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java
index 8fce7fd..0034632 100644
--- a/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java
+++ b/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java
@@ -52,6 +52,7 @@ import org.junit.runners.Suite.SuiteClasses;
     , TestTextGraphIndexExtra.class
     , TestTextGraphIndexExtra2.class
     , TestTextHighlighting.class
+    , TestTextDefineAnalyzers.class
 })
 
 public class TS_Text

http://git-wip-us.apache.org/repos/asf/jena/blob/795b9eb7/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java
----------------------------------------------------------------------
diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java b/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java
new file mode 100644
index 0000000..18328f7
--- /dev/null
+++ b/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.query.text;
+
+import static org.junit.Assert.assertTrue;
+
+import java.io.Reader ;
+import java.io.StringReader ;
+
+import org.apache.jena.assembler.Assembler ;
+import org.apache.jena.atlas.lib.StrUtils ;
+import org.apache.jena.query.Dataset ;
+import org.apache.jena.query.ReadWrite ;
+import org.apache.jena.query.text.assembler.TextAssembler ;
+import org.apache.jena.rdf.model.Model ;
+import org.apache.jena.rdf.model.ModelFactory ;
+import org.apache.jena.rdf.model.Resource ;
+import org.junit.After ;
+import org.junit.Before ;
+import org.junit.Test ;
+
+public class TestTextDefineAnalyzers extends AbstractTestDatasetWithTextIndexBase { // smoke test: assembles a text dataset from SPEC, which uses text:defineAnalyzers, and loads data into it
+
+    private static final String SPEC_BASE = "http://example.org/spec#"; // namespace bound to the ":" prefix inside SPEC
+    private static final String SPEC_ROOT_LOCAL = "lucene_text_dataset";
+    private static final String SPEC_ROOT_URI = SPEC_BASE + SPEC_ROOT_LOCAL; // URI of the text:TextDataset resource that before() opens
+    private static final String SPEC; // Turtle assembler description; assigned in the static initializer below
+    static {
+        SPEC = StrUtils.strjoinNL(
+                    "prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> ",
+                    "prefix ja:   <http://jena.hpl.hp.com/2005/11/Assembler#> ",
+                    "prefix tdb:  <http://jena.hpl.hp.com/2008/tdb#>",
+                    "prefix text: <http://jena.apache.org/text#>",
+                    "prefix :     <" + SPEC_BASE + ">",
+                    "",
+                    "[] ja:loadClass    \"org.apache.jena.query.text.TextQuery\" .",
+                    "text:TextDataset      rdfs:subClassOf   ja:RDFDataset .",
+                    "text:TextIndexLucene  rdfs:subClassOf   text:TextIndex .",
+
+                    ":" + SPEC_ROOT_LOCAL,
+                    "    a              text:TextDataset ;",
+                    "    text:dataset   :dataset ;",
+                    "    text:index     :indexLucene ;",
+                    "    .",
+                    "",
+                    ":dataset",
+                    "    a                     tdb:DatasetTDB ;",
+                    "    tdb:location          \"--mem--\" ;",
+                    "    tdb:unionDefaultGraph true ;",
+                    ".",
+                    "",
+                    ":indexLucene",
+                    "    a text:TextIndexLucene ;",
+                    "    text:directory \"mem\" ;",
+                    "    text:storeValues true ;",
+                    "    text:analyzer [",
+                    "         a text:DefinedAnalyzer ;",
+                    "         text:useAnalyzer :configuredAnalyzer ] ;",
+                    "    text:defineAnalyzers (",
+                    "         [ text:defineAnalyzer :configuredAnalyzer ;",
+                    "           text:analyzer [",
+                    "                a text:ConfigurableAnalyzer ;",
+                    "                text:tokenizer :ngram ;",
+                    "                text:filters ( :asciiff text:LowerCaseFilter ) ] ]",
+                    "         [ text:defineTokenizer :ngram ;",
+                    "           text:tokenizer [",
+                    "                a text:GenericTokenizer ;",
+                    "                text:class \"org.apache.lucene.analysis.ngram.NGramTokenizer\" ;",
+                    "                text:params (",
+                    "                     [ text:paramName \"minGram\" ;",
+                    "                       text:paramType text:TypeInt ;",
+                    "                       text:paramValue 3 ]",
+                    "                     [ text:paramName \"maxGram\" ;",
+                    "                       text:paramType text:TypeInt ;",
+                    "                       text:paramValue 7 ]",
+                    "                     ) ] ]",
+                    "         [ text:defineFilter :asciiff ;",
+                    "           text:filter [",
+                    "                a text:GenericFilter ;",
+                    "                text:class \"org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter\" ;",
+                    "                text:params (",
+                    "                     [ text:paramName \"preserveOriginal\" ;",
+                    "                       text:paramType text:TypeBoolean ;",
+                    "                       text:paramValue true ]",
+                    "                     ) ] ]",
+                    "         ) ;",
+                    "    text:entityMap :entMap ;",
+                    "    .",
+                    "",
+                    ":entMap",
+                    "    a text:EntityMap ;",
+                    "    text:entityField      \"uri\" ;",
+                    "    text:defaultField     \"label\" ;",
+                    "    text:langField        \"lang\" ;",
+                    "    text:graphField       \"graph\" ;",
+                    "    text:map (",
+                    "         [ text:field \"label\" ; text:predicate rdfs:label ]",
+                    "         [ text:field \"comment\" ; text:predicate rdfs:comment ]",
+                    "         ) ."
+                    );
+    }
+
+    @Before
+    public void before() {
+        Reader reader = new StringReader(SPEC);
+        System.out.println(">>>>");        
+        System.out.println(SPEC);        
+        System.out.println("<<<<");
+        Model specModel = ModelFactory.createDefaultModel();
+        specModel.read(reader, "", "TURTLE");
+        TextAssembler.init();
+        Resource root = specModel.getResource(SPEC_ROOT_URI);
+        try {
+            dataset = (Dataset) Assembler.general.open(root);
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+    }
+
+    @After
+    public void after() {
+        dataset.close(); // NOTE(review): before() only prints a failed Assembler open, so dataset may not have been assigned there - confirm close() is safe in that case
+    }
+
+    private void putTurtleInModel(String turtle, String modelName) { // reads turtle into the named model, or into the default model when modelName is null
+        Model model = modelName != null ? dataset.getNamedModel(modelName) : dataset.getDefaultModel() ;
+        Reader reader = new StringReader(turtle) ;
+        dataset.begin(ReadWrite.WRITE) ; // the read below runs inside a write transaction
+        try {
+            model.read(reader, "", "TURTLE") ;
+            dataset.commit() ;
+        }
+        finally {
+            dataset.end(); // runs whether or not commit() was reached
+        }
+    }
+
+    @Test
+    public void testTextQueryDefAnalyzers1() { // loads three labelled resources into modelA and one into modelB; issues no query
+        final String turtleA = StrUtils.strjoinNL(
+                TURTLE_PROLOG,
+                "<" + RESOURCE_BASE + "testResultOneInModelA>",
+                "  rdfs:label 'bar testResultOne barfoo foo'",
+                ".",
+                "<" + RESOURCE_BASE + "testResultTwoInModelA>",
+                "  rdfs:label 'bar testResultTwo barfoo foo'",
+                ".",
+                "<" + RESOURCE_BASE + "testResultThreeInModelA>",
+                "  rdfs:label 'bar testResultThree barfoo foo'",
+                "."
+                );
+        putTurtleInModel(turtleA, "http://example.org/modelA") ;
+        final String turtleB = StrUtils.strjoinNL(
+                TURTLE_PROLOG,
+                "<" + RESOURCE_BASE + "testResultOneInModelB>",
+                "  rdfs:label 'bar testResultOne barfoo foo'",
+                "."
+                );
+        putTurtleInModel(turtleB, "http://example.org/modelB") ;
+        
+        // execution reaches here in the event that the assembler machinery
+        // has executed without errors and generated a usable dataset
+        // usage of the runtime machinery is tested elsewhere
+        assertTrue(true);
+    }
+}


[3/4] jena git commit: derive parameter type in the cases: int, boolean, String.

Posted by co...@apache.org.
derive parameter type in the cases: int, boolean, String.

Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/71c2f66d
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/71c2f66d
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/71c2f66d

Branch: refs/heads/master
Commit: 71c2f66dbdad7834b7560f27b16eee08aad37abf
Parents: 58ff28e
Author: Chris Tomlinson <ct...@moonvine.org>
Authored: Sun Mar 18 09:48:57 2018 -0500
Committer: Chris Tomlinson <ct...@moonvine.org>
Committed: Sun Mar 18 09:48:57 2018 -0500

----------------------------------------------------------------------
 .../jena/query/text/assembler/Params.java       | 40 ++++++++++++++++----
 .../query/text/TestTextDefineAnalyzers.java     | 23 ++++++++++-
 2 files changed, 55 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/jena/blob/71c2f66d/jena-text/src/main/java/org/apache/jena/query/text/assembler/Params.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/Params.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/Params.java
index 7b0cd18..b1a3f33 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/Params.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/Params.java
@@ -24,6 +24,7 @@ import java.util.List;
 
 import org.apache.jena.assembler.Assembler;
 import org.apache.jena.atlas.logging.Log ;
+import org.apache.jena.datatypes.RDFDatatype;
 import org.apache.jena.query.text.TextIndexException;
 import org.apache.jena.rdf.model.Literal;
 import org.apache.jena.rdf.model.RDFNode;
@@ -184,16 +185,10 @@ public class Params {
 
     protected static ParamSpec getParamSpec(Resource node) {
         Statement nameStmt = node.getProperty(TextVocab.pParamName);
-        Statement typeStmt = node.getProperty(TextVocab.pParamType);
         Statement valueStmt = node.getProperty(TextVocab.pParamValue);
         
-        if (typeStmt == null) {
-            throw new TextIndexException("Parameter specification must have a text:paramType: " + node);
-        }        
-        Resource typeRes = typeStmt.getResource();
-        String type = typeRes.getLocalName();
-
         String name = getStringValue(nameStmt);
+        String type = getType(node);
         String value = getStringValue(valueStmt);
 
         switch (type) {
@@ -280,6 +275,37 @@ public class Params {
 
         return null;
     }
+    
+    /**
+     * Returns the local name of the explicit text:paramType if there is one; otherwise
+     * derives TYPE_BOOL, TYPE_INT or TYPE_STRING from the datatype of the
+     * text:paramValue literal.
+     *
+     * @throws TextIndexException if there is neither a type nor a value, or if no
+     *         type can be derived from the value
+     */
+    private static String getType(Resource node) {
+        Statement typeStmt = node.getProperty(TextVocab.pParamType);
+        if (typeStmt != null) {
+            return typeStmt.getResource().getLocalName();
+        }
+        Statement valueStmt = node.getProperty(TextVocab.pParamValue);
+        if (valueStmt == null) {
+            throw new TextIndexException("Parameter specification must have a text:paramValue: " + node);
+        }
+        RDFNode obj = valueStmt.getObject();
+        RDFDatatype rdfType = obj.isLiteral() ? obj.asLiteral().getDatatype() : null;
+        Class<?> clazz = rdfType != null ? rdfType.getJavaClass() : null;
+        if (clazz == java.lang.Boolean.class) {
+            return TYPE_BOOL;
+        } else if (clazz == java.math.BigInteger.class) {
+            return TYPE_INT;
+        } else if (clazz == java.lang.String.class) {
+            return TYPE_STRING;
+        }
+        // returning null here would only surface later as a NullPointerException in the caller's switch
+        throw new TextIndexException("Cannot derive text:paramType from text:paramValue: " + obj);
+    }
 
     private static String getStringValue(Statement stmt) {
         if (stmt == null) {

http://git-wip-us.apache.org/repos/asf/jena/blob/71c2f66d/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java
----------------------------------------------------------------------
diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java b/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java
index b0c114e..5ffa1db 100644
--- a/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java
+++ b/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java
@@ -78,13 +78,18 @@ public class TestTextDefineAnalyzers extends AbstractTestDatasetWithTextIndexBas
                     "                a text:ConfigurableAnalyzer ;",
                     "                text:tokenizer :ngram ;",
                     "                text:filters ( :asciiff text:LowerCaseFilter ) ] ]",
+                    "         [ text:defineAnalyzer :configuredAnalyzer2 ;",
+                    "           text:analyzer [",
+                    "                a text:ConfigurableAnalyzer ;",
+                    "                text:tokenizer :ngram2 ;",
+                    "                text:filters ( :asciiff2 text:LowerCaseFilter ) ] ]",
                     "         [ text:defineTokenizer :ngram ;",
                     "           text:tokenizer [",
                     "                a text:GenericTokenizer ;",
                     "                text:class \"org.apache.lucene.analysis.ngram.NGramTokenizer\" ;",
                     "                text:params (",
                     "                     [ text:paramName \"minGram\" ;",
-                    "                       text:paramType text:TypeInt ;",
+                    "                      text:paramType text:TypeInt ;",
                     "                       text:paramValue 3 ]",
                     "                     [ text:paramName \"maxGram\" ;",
                     "                       text:paramType text:TypeInt ;",
@@ -99,6 +104,22 @@ public class TestTextDefineAnalyzers extends AbstractTestDatasetWithTextIndexBas
                     "                       text:paramType text:TypeBoolean ;",
                     "                       text:paramValue true ]",
                     "                     ) ] ]",
+                    "         [ text:defineTokenizer :ngram2 ;",
+                    "           text:tokenizer [",
+                    "                a text:GenericTokenizer ;",
+                    "                text:class \"org.apache.lucene.analysis.ngram.NGramTokenizer\" ;",
+                    "                text:params (",
+                    "                     [ text:paramValue 3 ]",
+                    "                     [ text:paramValue 7 ]",
+                    "                     ) ] ]",
+                    "         [ text:defineFilter :asciiff2 ;",
+                    "           text:filter [",
+                    "                a text:GenericFilter ;",
+                    "                text:class \"org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter\" ;",
+                    "                text:params (",
+                    "                     [ text:paramName \"preserveOriginal\" ;",
+                    "                       text:paramValue true ]",
+                    "                     ) ] ]",
                     "         ) ;",
                     "    text:entityMap :entMap ;",
                     "    .",


[2/4] jena git commit: rm debug code

Posted by co...@apache.org.
rm debug code

Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/58ff28e9
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/58ff28e9
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/58ff28e9

Branch: refs/heads/master
Commit: 58ff28e9e4f26e063b10d9bebe706eba31f241ba
Parents: 795b9eb
Author: Chris Tomlinson <ct...@moonvine.org>
Authored: Thu Mar 15 14:54:23 2018 -0500
Committer: Chris Tomlinson <ct...@moonvine.org>
Committed: Thu Mar 15 14:54:23 2018 -0500

----------------------------------------------------------------------
 .../org/apache/jena/query/text/TestTextDefineAnalyzers.java | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/jena/blob/58ff28e9/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java
----------------------------------------------------------------------
diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java b/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java
index 18328f7..b0c114e 100644
--- a/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java
+++ b/jena-text/src/test/java/org/apache/jena/query/text/TestTextDefineAnalyzers.java
@@ -119,18 +119,11 @@ public class TestTextDefineAnalyzers extends AbstractTestDatasetWithTextIndexBas
     @Before
     public void before() {
         Reader reader = new StringReader(SPEC);
-        System.out.println(">>>>");        
-        System.out.println(SPEC);        
-        System.out.println("<<<<");
         Model specModel = ModelFactory.createDefaultModel();
         specModel.read(reader, "", "TURTLE");
         TextAssembler.init();
         Resource root = specModel.getResource(SPEC_ROOT_URI);
-        try {
-            dataset = (Dataset) Assembler.general.open(root);
-        } catch (Exception e) {
-            e.printStackTrace();
-        }
+        dataset = (Dataset) Assembler.general.open(root);
     }
 
     @After


[4/4] jena git commit: This closes #385 - Merge branch 'JENA-1506-PR' of https://github.com/BuddhistDigitalResourceCenter/jena

Posted by co...@apache.org.
This closes #385 - Merge branch 'JENA-1506-PR' of https://github.com/BuddhistDigitalResourceCenter/jena


Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/2e3d1fa2
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/2e3d1fa2
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/2e3d1fa2

Branch: refs/heads/master
Commit: 2e3d1fa27d9f7d3d56fd0b83f7a3cd56a71a474d
Parents: 50b46f0 71c2f66
Author: Chris Tomlinson <ct...@moonvine.org>
Authored: Wed Mar 21 09:43:19 2018 -0500
Committer: Chris Tomlinson <ct...@moonvine.org>
Committed: Wed Mar 21 09:43:19 2018 -0500

----------------------------------------------------------------------
 .../text/analyzer/ConfigurableAnalyzer.java     | 133 +++++--
 .../ConfigurableAnalyzerAssembler.java          |   4 +-
 .../assembler/DefineAnalyzersAssembler.java     |   3 +-
 .../text/assembler/DefineFiltersAssembler.java  | 103 +++++
 .../assembler/DefineTokenizersAssembler.java    | 100 +++++
 .../assembler/GenericAnalyzerAssembler.java     | 228 +----------
 .../text/assembler/GenericFilterAssembler.java  | 199 ++++++++++
 .../assembler/GenericTokenizerAssembler.java    | 198 ++++++++++
 .../jena/query/text/assembler/Params.java       | 388 +++++++++++++++++++
 .../query/text/assembler/TextAssembler.java     |   2 +
 .../assembler/TextIndexLuceneAssembler.java     |  52 +--
 .../jena/query/text/assembler/TextVocab.java    |  24 +-
 .../org/apache/jena/query/text/TS_Text.java     |   1 +
 .../query/text/TestTextDefineAnalyzers.java     | 196 ++++++++++
 14 files changed, 1348 insertions(+), 283 deletions(-)
----------------------------------------------------------------------