You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by os...@apache.org on 2015/11/17 11:02:05 UTC
jena git commit: JENA-1062: configurable Lucene analyzer for jena-text
Repository: jena
Updated Branches:
refs/heads/master 2099295b1 -> 9c35b6806
JENA-1062: configurable Lucene analyzer for jena-text
Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/9c35b680
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/9c35b680
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/9c35b680
Branch: refs/heads/master
Commit: 9c35b680626f164578a8b1c2a3ea9c5cd85e0868
Parents: 2099295
Author: Osma Suominen <os...@aalto.fi>
Authored: Wed Nov 4 20:32:03 2015 +0200
Committer: Osma Suominen <os...@apache.org>
Committed: Tue Nov 17 11:58:12 2015 +0200
----------------------------------------------------------------------
.../text/analyzer/ConfigurableAnalyzer.java | 93 +++++++++++++++++
.../ConfigurableAnalyzerAssembler.java | 100 +++++++++++++++++++
.../query/text/assembler/TextAssembler.java | 1 +
.../jena/query/text/assembler/TextVocab.java | 14 +++
.../org/apache/jena/query/text/TS_Text.java | 1 +
.../TestDatasetWithConfigurableAnalyzer.java | 61 +++++++++++
.../text/assembler/TestEntityMapAssembler.java | 26 +++++
7 files changed, 296 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/jena/blob/9c35b680/jena-text/src/main/java/org/apache/jena/query/text/analyzer/ConfigurableAnalyzer.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/analyzer/ConfigurableAnalyzer.java b/jena-text/src/main/java/org/apache/jena/query/text/analyzer/ConfigurableAnalyzer.java
new file mode 100644
index 0000000..ada3361
--- /dev/null
+++ b/jena-text/src/main/java/org/apache/jena/query/text/analyzer/ConfigurableAnalyzer.java
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.query.text.analyzer ;
+
+import java.io.Reader ;
+import java.util.ArrayList ;
+import java.util.Collections ;
+import java.util.List ;
+
+import org.apache.jena.query.text.TextIndexException;
+import org.apache.lucene.analysis.Analyzer ;
+import org.apache.lucene.analysis.TokenFilter ;
+import org.apache.lucene.analysis.Tokenizer ;
+import org.apache.lucene.analysis.TokenStream ;
+import org.apache.lucene.analysis.core.KeywordTokenizer ;
+import org.apache.lucene.analysis.core.LetterTokenizer;
+import org.apache.lucene.analysis.core.LowerCaseFilter ;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer ;
+import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version ;
+
+
+/**
+ * Lucene Analyzer implementation that can be configured with different
+ * Tokenizer and (optionally) TokenFilter implementations.
+ * <p>
+ * Components are selected by simple class name. Recognised tokenizers are
+ * KeywordTokenizer, LetterTokenizer, StandardTokenizer and WhitespaceTokenizer;
+ * recognised filters are ASCIIFoldingFilter, LowerCaseFilter and StandardFilter.
+ * Names are resolved each time {@link #createComponents(String, Reader)} runs,
+ * so an unrecognised (or null) name is reported as a TextIndexException at
+ * that point rather than at construction time.
+ */
+public class ConfigurableAnalyzer extends Analyzer {
+    private final Version version;
+    private final String tokenizer;
+    private final List<String> filters;
+
+    /**
+     * Create an analyzer from component names.
+     *
+     * @param ver       Lucene version handed to the version-aware components
+     * @param tokenizer simple class name of the tokenizer to use
+     * @param filters   simple class names of the filters to apply, in order;
+     *                  null is treated as "no filters"
+     */
+    public ConfigurableAnalyzer(Version ver, String tokenizer, List<String> filters) {
+        this.version = ver;
+        this.tokenizer = tokenizer;
+        // Take a private snapshot: the analyzer must not change behaviour (or fail
+        // while iterating) if the caller modifies its list after construction.
+        if (filters == null) {
+            this.filters = Collections.<String>emptyList();
+        } else {
+            this.filters = Collections.unmodifiableList(new ArrayList<>(filters));
+        }
+    }
+
+    /**
+     * Instantiate the tokenizer with the given simple class name.
+     *
+     * @throws TextIndexException if the name is null or not a supported tokenizer
+     */
+    private Tokenizer getTokenizer(String tokenizerName, Reader reader) {
+        // switch on a null String throws NullPointerException; report it like any other bad name
+        if (tokenizerName == null) {
+            throw new TextIndexException("Unknown tokenizer : " + tokenizerName);
+        }
+        switch(tokenizerName) {
+            case "KeywordTokenizer":
+                return new KeywordTokenizer(reader);
+            case "LetterTokenizer":
+                return new LetterTokenizer(version, reader);
+            case "StandardTokenizer":
+                return new StandardTokenizer(version, reader);
+            case "WhitespaceTokenizer":
+                return new WhitespaceTokenizer(version, reader);
+            default:
+                throw new TextIndexException("Unknown tokenizer : " + tokenizerName);
+        }
+    }
+
+    /**
+     * Wrap {@code source} in the filter with the given simple class name.
+     *
+     * @throws TextIndexException if the name is null or not a supported filter
+     */
+    private TokenFilter getTokenFilter(String filterName, TokenStream source) {
+        // same null guard as for tokenizers
+        if (filterName == null) {
+            throw new TextIndexException("Unknown filter : " + filterName);
+        }
+        switch(filterName) {
+            case "ASCIIFoldingFilter":
+                return new ASCIIFoldingFilter(source);
+            case "LowerCaseFilter":
+                return new LowerCaseFilter(version, source);
+            case "StandardFilter":
+                return new StandardFilter(version, source);
+            default:
+                throw new TextIndexException("Unknown filter : " + filterName);
+        }
+    }
+
+    /**
+     * Build the analysis chain: the configured tokenizer reads from
+     * {@code reader} and each configured filter wraps the previous stage, in
+     * list order. The field name is not used; every field gets the same chain.
+     *
+     * @throws TextIndexException if the tokenizer or any filter name is unknown
+     */
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = getTokenizer(this.tokenizer, reader);
+        TokenStream stream = source;
+        for (String filter : this.filters) {
+            stream = getTokenFilter(filter, stream);
+        }
+        return new TokenStreamComponents(source, stream);
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/jena/blob/9c35b680/jena-text/src/main/java/org/apache/jena/query/text/assembler/ConfigurableAnalyzerAssembler.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/ConfigurableAnalyzerAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/ConfigurableAnalyzerAssembler.java
new file mode 100644
index 0000000..d336ed8
--- /dev/null
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/ConfigurableAnalyzerAssembler.java
@@ -0,0 +1,100 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.query.text.assembler;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.jena.assembler.Assembler;
+import org.apache.jena.assembler.Mode;
+import org.apache.jena.assembler.assemblers.AssemblerBase;
+import org.apache.jena.query.text.TextIndexException;
+import org.apache.jena.query.text.TextIndexLucene;
+import org.apache.jena.query.text.analyzer.ConfigurableAnalyzer;
+import org.apache.jena.rdf.model.RDFNode;
+import org.apache.jena.rdf.model.Resource;
+import org.apache.jena.rdf.model.Statement ;
+import org.apache.jena.vocabulary.RDF ;
+import org.apache.lucene.analysis.Analyzer;
+
+
+/**
+ * Assembler to create a configurable analyzer.
+ * <p>
+ * The analyzer description must carry a text:tokenizer property and may
+ * carry a text:filters property holding an RDF list. The local names of
+ * the referenced resources are passed to {@link ConfigurableAnalyzer} as
+ * the tokenizer and filter names.
+ */
+public class ConfigurableAnalyzerAssembler extends AssemblerBase {
+    /*
+    text:map (
+         [ text:field "text" ;
+           text:predicate rdfs:label;
+           text:analyzer [
+               a text:ConfigurableAnalyzer ;
+               text:tokenizer text:LetterTokenizer ;
+               text:filters (text:LowerCaseFilter)
+           ]
+         ]
+    ) .
+    */
+
+
+    /**
+     * Build a ConfigurableAnalyzer from the description rooted at {@code root}.
+     *
+     * @throws TextIndexException if text:tokenizer is missing or is not a URI
+     *         resource, or if text:filters is present but is not a well-formed
+     *         RDF list of URI resources
+     */
+    @Override
+    public Analyzer open(Assembler a, Resource root, Mode mode) {
+        if (! root.hasProperty(TextVocab.pTokenizer)) {
+            throw new TextIndexException("text:tokenizer setting is required by ConfigurableAnalyzer");
+        }
+        // getPropertyResourceValue yields null when the value is a literal, and a
+        // blank node has no local name: reject both instead of failing later with
+        // a NullPointerException.
+        Resource tokenizerResource = root.getPropertyResourceValue(TextVocab.pTokenizer);
+        if (tokenizerResource == null || ! tokenizerResource.isURIResource()) {
+            throw new TextIndexException("text:tokenizer is not a URI resource : "
+                                         + root.getProperty(TextVocab.pTokenizer).getObject());
+        }
+        String tokenizer = tokenizerResource.getLocalName();
+
+        List<String> filters;
+        if (root.hasProperty(TextVocab.pFilters)) {
+            Resource filtersResource = root.getPropertyResourceValue(TextVocab.pFilters);
+            if (filtersResource == null) {
+                // a literal here used to be silently treated as "no filters"
+                throw new TextIndexException("text:filters is not a list : "
+                                             + root.getProperty(TextVocab.pFilters).getObject());
+            }
+            filters = toFilterList(filtersResource);
+        } else {
+            filters = new ArrayList<>();
+        }
+        return new ConfigurableAnalyzer(TextIndexLucene.VER, tokenizer, filters);
+    }
+
+    /**
+     * Walk an RDF list (rdf:first / rdf:rest, terminated by rdf:nil) and
+     * collect the local name of every member, in list order.
+     *
+     * @throws TextIndexException if the list is malformed or a member is not
+     *         a URI resource
+     */
+    private List<String> toFilterList(Resource list) {
+        List<String> result = new ArrayList<>();
+        Resource current = list;
+        while (current != null && ! current.equals(RDF.nil)){
+            Statement stmt = current.getProperty(RDF.first);
+            if (stmt == null) {
+                throw new TextIndexException("filter list not well formed");
+            }
+            RDFNode node = stmt.getObject();
+            // only URI resources have a usable local name
+            if (! node.isURIResource()) {
+                throw new TextIndexException("filter is not a resource : " + node);
+            }
+
+            result.add(node.asResource().getLocalName());
+            stmt = current.getProperty(RDF.rest);
+            if (stmt == null) {
+                throw new TextIndexException("filter list not terminated by rdf:nil");
+            }
+            node = stmt.getObject();
+            if (! node.isResource()) {
+                throw new TextIndexException("filter list node is not a resource : " + node);
+            }
+            current = node.asResource();
+        }
+        return result;
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/jena/blob/9c35b680/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java
index 021c003..5f7ca4d 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java
@@ -35,6 +35,7 @@ public class TextAssembler
Assembler.general.implementWith(TextVocab.keywordAnalyzer, new KeywordAnalyzerAssembler()) ;
Assembler.general.implementWith(TextVocab.lowerCaseKeywordAnalyzer, new LowerCaseKeywordAnalyzerAssembler()) ;
Assembler.general.implementWith(TextVocab.localizedAnalyzer, new LocalizedAnalyzerAssembler()) ;
+ Assembler.general.implementWith(TextVocab.configurableAnalyzer, new ConfigurableAnalyzerAssembler()) ;
}
}
http://git-wip-us.apache.org/repos/asf/jena/blob/9c35b680/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java
index fb14505..705b565 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java
@@ -43,6 +43,8 @@ public class TextVocab
public static final Property pStoreValues = Vocab.property(NS, "storeValues") ;
public static final Property pQueryAnalyzer = Vocab.property(NS, "queryAnalyzer") ;
public static final Property pEntityMap = Vocab.property(NS, "entityMap") ;
+ public static final Property pTokenizer = Vocab.property(NS, "tokenizer") ;
+ public static final Property pFilters = Vocab.property(NS, "filters") ;
// Entity definition
public static final Resource entityMap = Vocab.resource(NS, "EntityMap") ;
@@ -64,6 +66,18 @@ public class TextVocab
public static final Resource keywordAnalyzer = Vocab.resource(NS, "KeywordAnalyzer");
public static final Resource lowerCaseKeywordAnalyzer = Vocab.resource(NS, "LowerCaseKeywordAnalyzer");
public static final Resource localizedAnalyzer = Vocab.resource(NS, "LocalizedAnalyzer");
+ public static final Resource configurableAnalyzer = Vocab.resource(NS, "ConfigurableAnalyzer");
+
+ // Tokenizers
+ public static final Resource standardTokenizer = Vocab.resource(NS, "StandardTokenizer");
+ public static final Resource letterTokenizer = Vocab.resource(NS, "LetterTokenizer");
+ public static final Resource keywordTokenizer = Vocab.resource(NS, "KeywordTokenizer");
+ public static final Resource whitespaceTokenizer = Vocab.resource(NS, "WhitespaceTokenizer");
+
+ // Filters
+ public static final Resource standardFilter = Vocab.resource(NS, "StandardFilter");
+ public static final Resource lowerCaseFilter = Vocab.resource(NS, "LowerCaseFilter");
+ public static final Resource asciiFoldingFilter = Vocab.resource(NS, "ASCIIFoldingFilter");
}
http://git-wip-us.apache.org/repos/asf/jena/blob/9c35b680/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java
----------------------------------------------------------------------
diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java b/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java
index 3459e43..6e0be2c 100644
--- a/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java
+++ b/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java
@@ -50,6 +50,7 @@ import org.junit.runners.Suite.SuiteClasses ;
, TestDatasetWithLowerCaseKeywordAnalyzer.class
, TestLuceneWithMultipleThreads.class
, TestDatasetWithLocalizedAnalyzer.class
+ , TestDatasetWithConfigurableAnalyzer.class
})
public class TS_Text
http://git-wip-us.apache.org/repos/asf/jena/blob/9c35b680/jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithConfigurableAnalyzer.java
----------------------------------------------------------------------
diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithConfigurableAnalyzer.java b/jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithConfigurableAnalyzer.java
new file mode 100644
index 0000000..ad3c417
--- /dev/null
+++ b/jena-text/src/test/java/org/apache/jena/query/text/TestDatasetWithConfigurableAnalyzer.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.query.text;
+
+import java.util.Set ;
+
+import org.apache.jena.atlas.lib.StrUtils ;
+import org.apache.jena.ext.com.google.common.collect.Sets ;
+import org.junit.Before ;
+import org.junit.Test ;
+
+/**
+ * Extends TestDatasetWithLowerCaseKeywordAnalyzer, replacing its setup so that
+ * the dataset is initialised with a text:ConfigurableAnalyzer made of a
+ * KeywordTokenizer followed by ASCIIFoldingFilter and LowerCaseFilter, and
+ * adds a search test that mixes case and accents.
+ */
+public class TestDatasetWithConfigurableAnalyzer extends TestDatasetWithLowerCaseKeywordAnalyzer {
+    /** Initialise the dataset with the configurable analyzer description. */
+    @Override
+    @Before
+    public void before() {
+        final String analyzerSpec = StrUtils.strjoinNL(
+            "text:ConfigurableAnalyzer ;",
+            "text:tokenizer text:KeywordTokenizer ;",
+            "text:filters (text:ASCIIFoldingFilter text:LowerCaseFilter)"
+        );
+        init(analyzerSpec);
+    }
+
+    /**
+     * Load one labelled resource and expect a query that differs from the
+     * label in case and accents to find exactly that resource.
+     */
+    @Test
+    public void testConfigurableAnalyzerIsCaseAndAccentInsensitive() {
+        final String subjectURI = RESOURCE_BASE + "testConfigurableAnalyzerIsCaseAndAccentInsensitive";
+
+        final String data = StrUtils.strjoinNL(
+            TURTLE_PROLOG,
+            "<" + subjectURI + ">",
+            " rdfs:label 'Feeling a déjà vu'",
+            "."
+        );
+        final String sparql = StrUtils.strjoinNL(
+            QUERY_PROLOG,
+            "SELECT ?s",
+            "WHERE {",
+            " ?s text:query ( rdfs:label '\"feeling ä déja\"*' 10 ) .",
+            "}"
+        );
+
+        Set<String> expected = Sets.newHashSet(subjectURI);
+        doTestSearch(data, sparql, expected);
+    }
+}
http://git-wip-us.apache.org/repos/asf/jena/blob/9c35b680/jena-text/src/test/java/org/apache/jena/query/text/assembler/TestEntityMapAssembler.java
----------------------------------------------------------------------
diff --git a/jena-text/src/test/java/org/apache/jena/query/text/assembler/TestEntityMapAssembler.java b/jena-text/src/test/java/org/apache/jena/query/text/assembler/TestEntityMapAssembler.java
index ab3ed29..e4c823d 100644
--- a/jena-text/src/test/java/org/apache/jena/query/text/assembler/TestEntityMapAssembler.java
+++ b/jena-text/src/test/java/org/apache/jena/query/text/assembler/TestEntityMapAssembler.java
@@ -29,6 +29,7 @@ import org.apache.jena.atlas.logging.LogCtl ;
import org.apache.jena.graph.Node ;
import org.apache.jena.query.text.EntityDefinition ;
import org.apache.jena.query.text.TextIndexException ;
+import org.apache.jena.query.text.analyzer.ConfigurableAnalyzer ;
import org.apache.jena.query.text.analyzer.LowerCaseKeywordAnalyzer ;
import org.apache.jena.rdf.model.* ;
import org.apache.jena.vocabulary.RDF ;
@@ -56,6 +57,7 @@ public class TestEntityMapAssembler {
private static final Resource spec4;
private static final Resource spec5;
private static final Resource spec6;
+ private static final Resource spec7;
private static final Resource specNoEntityField;
private static final Resource specNoDefaultField;
private static final Resource specNoMapProperty;
@@ -119,6 +121,12 @@ public class TestEntityMapAssembler {
assertEquals(LowerCaseKeywordAnalyzer.class, entityDef.getAnalyzer(SPEC1_DEFAULT_FIELD).getClass());
}
+    /** The default field of spec7 must be assembled with a ConfigurableAnalyzer. */
+    @Test public void EntityHasMapEntryWithConfigurableAnalyzer() {
+        EntityDefinition assembled = new EntityDefinitionAssembler().open(Assembler.general, spec7, null);
+        Class<?> analyzerClass = assembled.getAnalyzer(SPEC1_DEFAULT_FIELD).getClass();
+        assertEquals(ConfigurableAnalyzer.class, analyzerClass);
+    }
+
@Test(expected=TextIndexException.class) public void errorOnNoEntityField() {
EntityDefinitionAssembler entDefAssem = new EntityDefinitionAssembler();
entDefAssem.open(null, specNoEntityField, null);
@@ -254,6 +262,24 @@ public class TestEntityMapAssembler {
.addProperty(RDF.type, TextVocab.lowerCaseKeywordAnalyzer))
}));
+
+ // create an entity map specification using a configurable analyzer
+
+ spec7 = model.createResource(TESTBASE + "spec7")
+ .addProperty(TextVocab.pEntityField, SPEC1_ENTITY_FIELD)
+ .addProperty(TextVocab.pDefaultField, SPEC1_DEFAULT_FIELD)
+ .addProperty(TextVocab.pMap,
+ model.createList(
+ new RDFNode[] {
+ model.createResource()
+ .addProperty(TextVocab.pField, SPEC1_DEFAULT_FIELD)
+ .addProperty(TextVocab.pPredicate, SPEC1_PREDICATE)
+ .addProperty(TextVocab.pAnalyzer,
+ model.createResource()
+ .addProperty(RDF.type, TextVocab.configurableAnalyzer)
+ .addProperty(TextVocab.pTokenizer, TextVocab.standardTokenizer))
+ }));
+
// bad assembler spec
specNoEntityField =