You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by bu...@apache.org on 2017/06/29 06:47:31 UTC

svn commit: r1014689 [2/2] - in /websites/staging/jena/trunk/content: ./ documentation/query/text-query-new.html

Added: websites/staging/jena/trunk/content/documentation/query/text-query-new.html
==============================================================================
--- websites/staging/jena/trunk/content/documentation/query/text-query-new.html (added)
+++ websites/staging/jena/trunk/content/documentation/query/text-query-new.html Thu Jun 29 06:47:30 2017
@@ -0,0 +1,1164 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<!--
+
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+-->
+
+  <title>Apache Jena - Jena Full Text Search</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+
+  <link href="/css/bootstrap.min.css" rel="stylesheet" media="screen">
+  <link href="/css/bootstrap-extension.css" rel="stylesheet" type="text/css">
+  <link href="/css/jena.css" rel="stylesheet" type="text/css">
+  <link rel="shortcut icon" href="/images/favicon.ico" />
+  
+  <script src="https://code.jquery.com/jquery-2.0.3.min.js"></script>
+  <script src="/js/jena-navigation.js" type="text/javascript"></script>
+  <script src="/js/bootstrap.min.js" type="text/javascript"></script>
+  <script src="/js/breadcrumbs.js" type="text/javascript"></script>
+
+  <script src="/js/improve.js" type="text/javascript"></script>
+
+  
+  <!-- Uncomment to enable code coloring <link href="/css/codehilite.css" rel="stylesheet" type="text/css"> -->
+
+</head>
+
+<body>
+
+
+
+<nav class="navbar navbar-default" role="navigation">
+<div class="container">
+  <div class="navbar-header">
+  
+    <button type="button" class="navbar-toggle" data-toggle="collapse" data-target=".navbar-ex1-collapse">
+      <span class="icon-bar"></span>
+      <span class="icon-bar"></span>
+      <span class="icon-bar"></span>
+    </button>
+    <a class="navbar-brand" href="/index.html">
+    <img class="logo-menu" src="/images/jena-logo/jena-logo-notext-small.png" alt="jena logo">Apache Jena</a>
+  </div>
+ 
+  <div class="collapse navbar-collapse navbar-ex1-collapse">
+    <ul class="nav navbar-nav">
+              <li id="homepage"><a href="/index.html"><span class="glyphicon glyphicon-home"></span> Home</a></li>
+              <li id="download"><a href="/download/index.cgi"><span class="glyphicon glyphicon-download-alt"></span> Download</a></li>
+              <li class="dropdown">
+                <a href="#" class="dropdown-toggle" data-toggle="dropdown"><span class="glyphicon glyphicon-book"></span> Learn <b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li class="dropdown-header">Tutorials</li>
+                  <li><a href="/tutorials/index.html">Overview</a></li>
+                  <li><a href="/tutorials/rdf_api.html">RDF core API tutorial</a></li>
+                  <li><a href="/tutorials/sparql.html">SPARQL tutorial</a></li>
+                  <li><a href="/documentation/query/manipulating_sparql_using_arq.html">Manipulating SPARQL using ARQ</a></li>
+                  <li><a href="/tutorials/using_jena_with_eclipse.html">Using Jena with Eclipse</a></li>
+                  <li><a href="/documentation/notes/index.html">How-To's</a></li>
+                  <li class="divider"></li>
+                  <li class="dropdown-header">References</li>
+                  <li><a href="/documentation/index.html">Overview</a></li>
+                  <li><a href="/documentation/javadoc/">Javadoc</a></li>
+                  <li><a href="/documentation/rdf/index.html">RDF API</a></li>
+                  <li><a href="/documentation/io/">RDF I/O</a></li>
+                  <li><a href="/documentation/query/index.html">ARQ (SPARQL)</a></li>
+                  <li><a href="/documentation/rdfconnection/">RDF Connection - SPARQL API</a></li>
+                  <li><a href="/documentation/hadoop/index.html">Elephas - tools for RDF on Hadoop</a></li>
+                  <li><a href="/documentation/query/text-query.html">Text Search</a></li>
+                  <li><a href="/documentation/tdb/index.html">TDB</a></li>
+                  <li><a href="/documentation/sdb/index.html">SDB</a></li>
+                  <li><a href="/documentation/jdbc/index.html">SPARQL over JDBC</a></li>
+                  <li><a href="/documentation/fuseki2/index.html">Fuseki</a></li>
+                  <li><a href="/documentation/permissions/index.html">Permissions</a></li>
+                  <li><a href="/documentation/assembler/index.html">Assembler</a></li>
+                  <li><a href="/documentation/ontology/">Ontology API</a></li>
+                  <li><a href="/documentation/inference/index.html">Inference API</a></li>
+                  <li><a href="/documentation/tools/index.html">Command-line tools</a></li>
+                  <li><a href="/documentation/extras/index.html">Extras</a></li>
+                </ul>
+              </li>
+
+              <li class="dropdown">
+                <a href="#" class="dropdown-toggle" data-toggle="dropdown"><span class="glyphicon glyphicon-book"></span> Javadoc <b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/documentation/javadoc/jena/">Jena Core</a></li>
+                  <li><a href="/documentation/javadoc/arq/">ARQ</a></li>
+                  <li><a href="/documentation/javadoc/tdb/">TDB</a></li>
+                  <li><a href="/documentation/javadoc/elephas/">Elephas</a></li>
+                  <li><a href="/documentation/javadoc/text/">Text Search</a></li>
+                  <li><a href="/documentation/javadoc/spatial/">Spatial Search</a></li>
+                  <li><a href="/documentation/javadoc/permissions/">Permissions</a></li>
+                  <li><a href="/documentation/javadoc/jdbc/">JDBC</a></li>
+                  <li><a href="/documentation/javadoc/">All Javadoc</a></li>
+                </ul>
+              </li>
+
+              <li id="ask"><a href="/help_and_support/index.html"><span class="glyphicon glyphicon-question-sign"></span> Ask</a></li>
+              
+              <li class="dropdown">
+                <a href="#" class="dropdown-toggle" data-toggle="dropdown"><span class="glyphicon glyphicon-bullhorn"></span> Get involved <b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/getting_involved/index.html">Contribute</a></li>
+                  <li><a href="/help_and_support/bugs_and_suggestions.html">Report a bug</a></li>
+                  <li class="divider"></li>
+                  <li class="dropdown-header">Project</li>
+                  <li><a href="/about_jena/about.html">About Jena</a></li>
+                  <li><a href="/about_jena/roadmap.html">Roadmap</a></li>
+                  <li><a href="/about_jena/architecture.html">Architecture</a></li>
+                  <li><a href="/about_jena/team.html">Project team</a></li>
+                  <li><a href="/about_jena/contributions.html">Related projects</a></li>
+                  <li class="divider"></li>
+                  <li class="dropdown-header">ASF</li>
+                  <li><a href="http://www.apache.org/">Apache Software Foundation</a></li>
+                  <li><a href="http://www.apache.org/licenses/LICENSE-2.0">License</a></li>
+                  <li><a href="http://www.apache.org/foundation/thanks.html">Thanks</a></li>
+                  <li><a href="http://www.apache.org/foundation/sponsorship.html">Become a Sponsor</a></li>
+                  <li><a href="http://www.apache.org/security/">Security</a></li>
+                </ul>
+              </li>
+
+              <li id="edit"><a href="javascript:improveThisPage(location.href);" title="Improve this Page (Use username anonymous and empty password)"><span class="glyphicon glyphicon-pencil"></span> Improve this Page</a></li>   
+    </ul>
+  </div>
+</div>
+</nav>
+
+
+<div class="container">
+    <div class="row">
+    <div class="col-md-12">
+    <div id="breadcrumbs"></div>
+    <h1 class="title">Jena Full Text Search</h1>
+  <style type="text/css">
+/* The following code is added by mdx_elementid.py
+   It was originally lifted from http://subversion.apache.org/style/site.css */
+/*
+ * Hide class="elementid-permalink", except when an enclosing heading
+ * has the :hover property.
+ */
+.headerlink, .elementid-permalink {
+  visibility: hidden;
+}
+h2:hover > .headerlink, h3:hover > .headerlink, h1:hover > .headerlink, h6:hover > .headerlink, h4:hover > .headerlink, h5:hover > .headerlink, dt:hover > .elementid-permalink { visibility: visible }</style>
+<p>This extension to ARQ combines SPARQL and full text search via <a href="https://lucene.apache.org">Lucene</a> 6.4.1 or 
+<a href="https://www.elastic.co">ElasticSearch</a> 5.2.1 (which is built on Lucene). It gives applications the ability 
+to perform indexed full text searches within SPARQL queries.</p>
+<p>Recall that SPARQL allows the use of <a href="https://www.w3.org/TR/2013/REC-sparql11-query-20130321/#func-regex">regex</a> 
+in <code>FILTER</code>s; however, such use <em>is not indexed</em>. For example, if you're searching for occurrences of <code>"printer"</code> in
+the <code>rdfs:label</code> of a bunch of products:</p>
+<div class="codehilite"><pre><span class="n">PREFIX</span>   <span class="n">ex</span><span class="p">:</span> <span class="o">&lt;</span><span class="n">http</span><span class="p">:</span><span class="o">//</span><span class="n">www</span><span class="p">.</span><span class="n">example</span><span class="p">.</span><span class="n">org</span><span class="o">/</span><span class="n">resources</span>#<span class="o">&gt;</span>
+<span class="n">PREFIX</span> <span class="n">rdfs</span><span class="p">:</span> <span class="o">&lt;</span><span class="n">http</span><span class="p">:</span><span class="o">//</span><span class="n">www</span><span class="p">.</span><span class="n">w3</span><span class="p">.</span><span class="n">org</span><span class="o">/</span>2000<span class="o">/</span>01<span class="o">/</span><span class="n">rdf</span><span class="o">-</span><span class="n">schema</span>#<span class="o">&gt;</span>
+
+<span class="n">SELECT</span> ?<span class="n">s</span> ?<span class="n">lbl</span>
+<span class="n">WHERE</span> <span class="p">{</span> 
+    ?<span class="n">s</span> <span class="n">a</span> <span class="n">ex</span><span class="p">:</span><span class="n">Product</span> <span class="p">;</span>
+       <span class="n">rdfs</span><span class="p">:</span><span class="n">label</span> ?<span class="n">lbl</span>
+    <span class="n">FILTER</span> <span class="n">regex</span><span class="p">(</span>?<span class="n">lbl</span><span class="p">,</span> &quot;<span class="n">printer</span>&quot;<span class="p">,</span> &quot;<span class="nb">i</span>&quot;<span class="p">)</span>
+<span class="p">}</span>
+</pre></div>
+
+
+<p>then the search will need to examine <em>all</em> selected <code>rdfs:label</code> statements and apply the regular expression 
+to each label in turn. If there are many such statements and many such uses of <code>regex</code>, then it may be appropriate 
+to consider using this extension to take advantage of the performance potential of full text indexing.</p>
+<p>Text indexes provide additional information for accessing the RDF graph by allowing the application to have <em>indexed 
+access</em> to the internal structure of string literals rather than treating such literals as opaque items. 
+Assuming appropriate <a href="#configuration">configuration</a>, the above query can use full text search via the 
+<a href="https://jena.apache.org/documentation/query/extension.html#property-functions">ARQ property function extension</a>, 
+<code>text:query</code>:</p>
+<div class="codehilite"><pre><span class="n">PREFIX</span>   <span class="n">ex</span><span class="p">:</span> <span class="o">&lt;</span><span class="n">http</span><span class="p">:</span><span class="o">//</span><span class="n">www</span><span class="p">.</span><span class="n">example</span><span class="p">.</span><span class="n">org</span><span class="o">/</span><span class="n">resources</span>#<span class="o">&gt;</span>
+<span class="n">PREFIX</span> <span class="n">rdfs</span><span class="p">:</span> <span class="o">&lt;</span><span class="n">http</span><span class="p">:</span><span class="o">//</span><span class="n">www</span><span class="p">.</span><span class="n">w3</span><span class="p">.</span><span class="n">org</span><span class="o">/</span>2000<span class="o">/</span>01<span class="o">/</span><span class="n">rdf</span><span class="o">-</span><span class="n">schema</span>#<span class="o">&gt;</span>
+<span class="n">PREFIX</span> <span class="n">text</span><span class="p">:</span> <span class="o">&lt;</span><span class="n">http</span><span class="p">:</span><span class="o">//</span><span class="n">jena</span><span class="p">.</span><span class="n">apache</span><span class="p">.</span><span class="n">org</span><span class="o">/</span><span class="n">text</span>#<span class="o">&gt;</span>
+
+<span class="n">SELECT</span> ?<span class="n">s</span> ?<span class="n">lbl</span>
+<span class="n">WHERE</span> <span class="p">{</span> 
+    ?<span class="n">s</span> <span class="n">a</span> <span class="n">ex</span><span class="p">:</span><span class="n">Product</span> <span class="p">;</span>
+       <span class="n">text</span><span class="p">:</span><span class="n">query</span> <span class="p">(</span><span class="n">rdfs</span><span class="p">:</span><span class="n">label</span> <span class="s">&#39;printer&#39;</span><span class="p">)</span> <span class="p">;</span>
+       <span class="n">rdfs</span><span class="p">:</span><span class="n">label</span> ?<span class="n">lbl</span>
+<span class="p">}</span>
+</pre></div>
+
+
+<p>This query makes a text query for <code>'printer'</code> on the <code>rdfs:label</code> property; and then looks in the RDF data and retrieves 
+the complete label for each match.</p>
+<p>The full text engine can be either <a href="http://lucene.apache.org/core">Apache Lucene</a> hosted with Jena on
+a single machine, or <a href="https://www.elastic.co/">Elasticsearch</a> for a large scale enterprise search application
+where the full text engine is potentially distributed across separate machines.</p>
+<p>This <a href="https://github.com/apache/jena/tree/master/jena-text/src/main/java/examples/">example code</a> illustrates
+creating an in-memory dataset with a Lucene index.</p>
+<p>This module was first released with Jena 2.11.0.</p>
+<p>This module is not compatible with the much older LARQ module.</p>
+<h2 id="table-of-contents">Table of Contents<a class="headerlink" href="#table-of-contents" title="Permanent link">&para;</a></h2>
+<ul>
+<li><a href="#architecture">Architecture</a></li>
+<li><a href="#query-with-sparql">Query with SPARQL</a></li>
+<li><a href="#configuration">Configuration</a><ul>
+<li><a href="#text-dataset-assembler">Text Dataset Assembler</a></li>
+<li><a href="#configuring-an-analyzer">Configuring an analyzer</a></li>
+<li><a href="#configuration-by-code">Configuration by Code</a></li>
+<li><a href="#graph-specific-indexing">Graph-specific Indexing</a></li>
+<li><a href="#linguistic-support-with-lucene-index">Linguistic Support with Lucene Index</a></li>
+<li><a href="#generic-and-defined-analyzer-support">Generic and Defined Analyzer Support</a></li>
+<li><a href="#storing-literal-values">Storing Literal Values</a></li>
+</ul>
+</li>
+<li><a href="#working-with-fuseki">Working with Fuseki</a></li>
+<li><a href="#building-a-text-index">Building a Text Index</a></li>
+<li><a href="#configuring-alternative-textdocproducers">Configuring Alternative TextDocProducers</a></li>
+<li><a href="#maven-dependency">Maven Dependency</a></li>
+</ul>
+<h2 id="architecture">Architecture<a class="headerlink" href="#architecture" title="Permanent link">&para;</a></h2>
+<p>In general, a text index engine (Lucene or Elasticsearch) indexes <em>documents</em> where each document is
+a collection of <em>fields</em>, the values of which are indexed so that searches matching contents of specified 
+fields can return a reference to the document containing the fields with matching values.</p>
+<p>The basic idea of the Jena text extension is to associate a triple with a document and the <em>property</em> 
+of the triple with a <em>field</em> of a document and the <em>object</em> of the triple (which must be a literal) with 
+the value of the field in the document. The <em>subject</em> of the triple then becomes another field of the 
+document that is returned as the result of a search match to identify what was matched. (NB, the
+particular triple that matched is not identified; only its subject is.)</p>
+<p>In this manner, the text index provides an inverted index that maps query string matches to subject URIs.</p>
+<p>A text-indexed dataset is configured with a description of which properties are to be indexed. When triples 
+are added, any properties matching the description cause a document to be added to the index 
+by analyzing the literal value of the triple object and mapping to the subject URI. On the other hand, it is
+necessary to specifically configure the text-indexed dataset to <a href="#entity-map-definition">delete index entries</a>
+when the corresponding triples are dropped from the RDF store.</p>
+<p>The text index uses the native query language of the index:
+<a href="http://lucene.apache.org/core/6_4_1/queryparser/org/apache/lucene/queryparser/classic/package-summary.html#package_description">Lucene query language</a>
+or
+<a href="https://www.elastic.co/guide/en/elasticsearch/reference/5.2/query-dsl.html">Elasticsearch query language</a>.</p>
+<h3 id="external-content">External content<a class="headerlink" href="#external-content" title="Permanent link">&para;</a></h3>
+<p>It is also possible that the indexed text is content external to the RDF store with only additional triples 
+(about the indexed text) in the RDF store. The subject URI returned as a search result may then be considered 
+to refer via the indexed property to the external content.</p>
+<p>There is no requirement that the text data indexed is present in the RDF
+data.  As long as the index contains the index text documents to match the
+index description, then text search can be performed.</p>
+<p>For example, if the content of a collection of documents is indexed and the
+URI naming the document is the result of the text search, then an RDF
+dataset with the document metadata can be combined with accessing the
+content by URI.</p>
+<p>The maintenance of the index is external to the RDF data store.</p>
+<h3 id="external-applications">External applications<a class="headerlink" href="#external-applications" title="Permanent link">&para;</a></h3>
+<p>By using Elasticsearch, other applications can share the text index with SPARQL search.</p>
+<h2 id="query-with-sparql">Query with SPARQL<a class="headerlink" href="#query-with-sparql" title="Permanent link">&para;</a></h2>
+<p>The URI of the text extension property function is <code>http://jena.apache.org/text#query</code>, more
+conveniently written:</p>
+<div class="codehilite"><pre><span class="n">PREFIX</span> <span class="n">text</span><span class="p">:</span> <span class="o">&lt;</span><span class="n">http</span><span class="p">:</span><span class="o">//</span><span class="n">jena</span><span class="p">.</span><span class="n">apache</span><span class="p">.</span><span class="n">org</span><span class="o">/</span><span class="n">text</span>#<span class="o">&gt;</span>
+
+<span class="p">...</span>   <span class="n">text</span><span class="p">:</span><span class="n">query</span> <span class="p">...</span>
+</pre></div>
+
+
+<table class="table">
+<thead>
+<tr>
+<th>&nbsp;Argument&nbsp;</th>
+<th>&nbsp; Definition&nbsp;</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>property</td>
+<td>(optional) URI (including prefix name form)</td>
+</tr>
+<tr>
+<td>query string</td>
+<td>The native query string</td>
+</tr>
+<tr>
+<td>limit</td>
+<td>(optional) <code>int</code> limit on the number of results</td>
+</tr>
+</tbody>
+</table>
+<p>The following forms are all legal:</p>
+<div class="codehilite"><pre>?<span class="n">s</span> <span class="n">text</span><span class="p">:</span><span class="n">query</span> <span class="s">&#39;word&#39;</span>                   # <span class="n">query</span>
+?<span class="n">s</span> <span class="n">text</span><span class="p">:</span><span class="n">query</span> <span class="p">(</span><span class="n">rdfs</span><span class="p">:</span><span class="n">label</span> <span class="s">&#39;word&#39;</span><span class="p">)</span>      # <span class="n">query</span> <span class="n">specific</span> <span class="n">property</span> <span class="k">if</span> <span class="n">multiple</span>
+?<span class="n">s</span> <span class="n">text</span><span class="p">:</span><span class="n">query</span> <span class="p">(</span><span class="s">&#39;word&#39;</span> 10<span class="p">)</span>              # <span class="n">with</span> <span class="n">limit</span> <span class="n">on</span> <span class="n">results</span>
+<span class="p">(</span>?<span class="n">s</span> ?<span class="n">score</span><span class="p">)</span> <span class="n">text</span><span class="p">:</span><span class="n">query</span> <span class="s">&#39;word&#39;</span>          # <span class="n">query</span> <span class="n">capturing</span> <span class="n">also</span> <span class="n">the</span> <span class="n">score</span>
+<span class="p">(</span>?<span class="n">s</span> ?<span class="n">score</span> ?<span class="n">literal</span><span class="p">)</span> <span class="n">text</span><span class="p">:</span><span class="n">query</span> <span class="s">&#39;word&#39;</span> # <span class="p">...</span> <span class="n">and</span> <span class="n">original</span> <span class="n">literal</span> <span class="n">value</span>
+</pre></div>
+
+
+<p>The most general form is:</p>
+<div class="codehilite"><pre> <span class="p">(</span>?<span class="n">s</span> ?<span class="n">score</span> ?<span class="n">literal</span><span class="p">)</span> <span class="n">text</span><span class="p">:</span><span class="n">query</span> <span class="p">(</span><span class="n">property</span> <span class="s">&#39;query string&#39;</span> <span class="n">limit</span><span class="p">)</span>
+</pre></div>
+
+
+<p>Only the query string is required, and if it is the only argument the
+surrounding <code>( )</code> can be omitted.</p>
+<p>The <code>property</code> URI is only necessary if multiple properties have been indexed and the property 
+being searched over is not the <a href="#entity-map-definition">default field of the index</a>.
+Also the <code>property</code> URI <strong>must not</strong> be used when the <code>query string</code> refers explicitly to one or more 
+fields.</p>
+<p>The results include the subject URI, <code>?s</code>; the <code>?score</code> assigned by the text search engine;
+and the entire matched <code>?literal</code> 
+(if the index has been <a href="#text-dataset-assembler">configured to store literal values</a>).</p>
+<p>If the <code>query string</code> refers to more than one field, e.g.,</p>
+<div class="codehilite"><pre>&quot;<span class="n">label</span><span class="p">:</span> <span class="n">printer</span> <span class="n">AND</span> <span class="n">description</span><span class="p">:</span> <span class="o">\</span>&quot;<span class="n">large</span> <span class="n">capacity</span> <span class="n">cartridge</span><span class="o">\</span>&quot;&quot;
+</pre></div>
+
+
+<p>then the <code>?literal</code> in the results will not be bound since there is no single field that contains
+the match &ndash; the match is separated over two fields.</p>
+<h3 id="good-practice">Good practice<a class="headerlink" href="#good-practice" title="Permanent link">&para;</a></h3>
+<p>The query engine does not have information about the selectivity of the text index and so effective
+query plans cannot be determined programmatically.  It is helpful to be aware of the following two
+general query patterns.</p>
+<h4 id="query-pattern-1-find-in-the-text-index-and-refine-results">Query pattern 1 &ndash; Find in the text index and refine results<a class="headerlink" href="#query-pattern-1-find-in-the-text-index-and-refine-results" title="Permanent link">&para;</a></h4>
+<p>Access to the text index is first in the query and used to find a number of
+items of interest; further information is obtained about these items from
+the RDF data.</p>
+<div class="codehilite"><pre><span class="n">SELECT</span> ?<span class="n">s</span>
+<span class="p">{</span> ?<span class="n">s</span> <span class="n">text</span><span class="p">:</span><span class="n">query</span> <span class="p">(</span><span class="n">rdfs</span><span class="p">:</span><span class="n">label</span> <span class="s">&#39;word&#39;</span> 10<span class="p">)</span> <span class="p">;</span> 
+     <span class="n">rdfs</span><span class="p">:</span><span class="n">label</span> ?<span class="n">label</span> <span class="p">;</span>
+     <span class="n">rdf</span><span class="p">:</span><span class="n">type</span>   ?<span class="n">type</span> 
+<span class="p">}</span>
+</pre></div>
+
+
+<p>The <code>text:query</code> limit argument is useful when working with large indexes to limit results to the
+higher scoring results &ndash; results are returned in the order of scoring by the text search engine.</p>
+<h4 id="query-pattern-2-filter-results-via-the-text-index">Query pattern 2 &ndash; Filter results via the text index<a class="headerlink" href="#query-pattern-2-filter-results-via-the-text-index" title="Permanent link">&para;</a></h4>
+<p>By finding items of interest first in the RDF data, the text search can be
+used to restrict the items found still further.</p>
+<div class="codehilite"><pre><span class="n">SELECT</span> ?<span class="n">s</span>
+<span class="p">{</span> ?<span class="n">s</span> <span class="n">rdf</span><span class="p">:</span><span class="n">type</span>     <span class="p">:</span><span class="n">book</span> <span class="p">;</span>
+     <span class="n">dc</span><span class="p">:</span><span class="n">creator</span>  &quot;<span class="n">John</span>&quot; <span class="p">.</span>
+  ?<span class="n">s</span> <span class="n">text</span><span class="p">:</span><span class="n">query</span>   <span class="p">(</span><span class="n">dc</span><span class="p">:</span><span class="n">title</span> <span class="s">&#39;word&#39;</span><span class="p">)</span> <span class="p">;</span> 
+<span class="p">}</span>
+</pre></div>
+
+
+<h2 id="configuration">Configuration<a class="headerlink" href="#configuration" title="Permanent link">&para;</a></h2>
+<p>The usual way to describe a text index is with a 
+<a href="../assembler/index.html">Jena assembler description</a>.  Configurations can
+also be built with code. The assembler describes a 'text
+dataset' which has an underlying RDF dataset and a text index. The text
+index describes the text index technology (Lucene or Elasticsearch) and the details
+needed for each.</p>
+<p>A text index has an "entity map" which defines the properties to
+index, the name of the Lucene/Elasticsearch field, and the field used for storing the URI
+itself.</p>
+<p>For simple RDF use, there will be one field, mapping a property to a text
+index field. More complex setups, with multiple properties per entity
+(URI) are possible.</p>
+<p>Once configured, any data added to the text dataset is automatically
+indexed as well.</p>
+<h3 id="text-dataset-assembler">Text Dataset Assembler<a class="headerlink" href="#text-dataset-assembler" title="Permanent link">&para;</a></h3>
+<p>The following is an example of a TDB dataset with a text index.</p>
+<div class="codehilite"><pre><span class="p">@</span><span class="n">prefix</span> <span class="p">:</span>        <span class="o">&lt;</span><span class="n">http</span><span class="p">:</span><span class="o">//</span><span class="n">localhost</span><span class="o">/</span><span class="n">jena_example</span><span class="o">/</span>#<span class="o">&gt;</span> <span class="p">.</span>
+<span class="p">@</span><span class="n">prefix</span> <span class="n">rdf</span><span class="p">:</span>     <span class="o">&lt;</span><span class="n">http</span><span class="p">:</span><span class="o">//</span><span class="n">www</span><span class="p">.</span><span class="n">w3</span><span class="p">.</span><span class="n">org</span><span class="o">/</span>1999<span class="o">/</span>02<span class="o">/</span>22<span class="o">-</span><span class="n">rdf</span><span class="o">-</span><span class="n">syntax</span><span class="o">-</span><span class="n">ns</span>#<span class="o">&gt;</span> <span class="p">.</span>
+<span class="p">@</span><span class="n">prefix</span> <span class="n">rdfs</span><span class="p">:</span>    <span class="o">&lt;</span><span class="n">http</span><span class="p">:</span><span class="o">//</span><span class="n">www</span><span class="p">.</span><span class="n">w3</span><span class="p">.</span><span class="n">org</span><span class="o">/</span>2000<span class="o">/</span>01<span class="o">/</span><span class="n">rdf</span><span class="o">-</span><span class="n">schema</span>#<span class="o">&gt;</span> <span class="p">.</span>
+<span class="p">@</span><span class="n">prefix</span> <span class="n">tdb</span><span class="p">:</span>     <span class="o">&lt;</span><span class="n">http</span><span class="p">:</span><span class="o">//</span><span class="n">jena</span><span class="p">.</span><span class="n">hpl</span><span class="p">.</span><span class="n">hp</span><span class="p">.</span><span class="n">com</span><span class="o">/</span>2008<span class="o">/</span><span class="n">tdb</span>#<span class="o">&gt;</span> <span class="p">.</span>
+<span class="p">@</span><span class="n">prefix</span> <span class="n">ja</span><span class="p">:</span>      <span class="o">&lt;</span><span class="n">http</span><span class="p">:</span><span class="o">//</span><span class="n">jena</span><span class="p">.</span><span class="n">hpl</span><span class="p">.</span><span class="n">hp</span><span class="p">.</span><span class="n">com</span><span class="o">/</span>2005<span class="o">/</span>11<span class="o">/</span><span class="n">Assembler</span>#<span class="o">&gt;</span> <span class="p">.</span>
+<span class="p">@</span><span class="n">prefix</span> <span class="n">text</span><span class="p">:</span>    <span class="o">&lt;</span><span class="n">http</span><span class="p">:</span><span class="o">//</span><span class="n">jena</span><span class="p">.</span><span class="n">apache</span><span class="p">.</span><span class="n">org</span><span class="o">/</span><span class="n">text</span>#<span class="o">&gt;</span> <span class="p">.</span>
+
+## <span class="n">Example</span> <span class="n">of</span> <span class="n">a</span> <span class="n">TDB</span> <span class="n">dataset</span> <span class="n">and</span> <span class="n">text</span> <span class="n">index</span>
+## <span class="n">Initialize</span> <span class="n">TDB</span>
+<span class="p">[]</span> <span class="n">ja</span><span class="p">:</span><span class="n">loadClass</span> &quot;<span class="n">org</span><span class="p">.</span><span class="n">apache</span><span class="p">.</span><span class="n">jena</span><span class="p">.</span><span class="n">tdb</span><span class="p">.</span><span class="n">TDB</span>&quot; <span class="p">.</span>
+<span class="n">tdb</span><span class="p">:</span><span class="n">DatasetTDB</span>  <span class="n">rdfs</span><span class="p">:</span><span class="n">subClassOf</span>  <span class="n">ja</span><span class="p">:</span><span class="n">RDFDataset</span> <span class="p">.</span>
+<span class="n">tdb</span><span class="p">:</span><span class="n">GraphTDB</span>    <span class="n">rdfs</span><span class="p">:</span><span class="n">subClassOf</span>  <span class="n">ja</span><span class="p">:</span><span class="n">Model</span> <span class="p">.</span>
+
+## <span class="n">Initialize</span> <span class="n">text</span> <span class="n">query</span>
+<span class="p">[]</span> <span class="n">ja</span><span class="p">:</span><span class="n">loadClass</span>       &quot;<span class="n">org</span><span class="p">.</span><span class="n">apache</span><span class="p">.</span><span class="n">jena</span><span class="p">.</span><span class="n">query</span><span class="p">.</span><span class="n">text</span><span class="p">.</span><span class="n">TextQuery</span>&quot; <span class="p">.</span>
+# <span class="n">A</span> <span class="n">TextDataset</span> <span class="n">is</span> <span class="n">a</span> <span class="n">regular</span> <span class="n">dataset</span> <span class="n">with</span> <span class="n">a</span> <span class="n">text</span> <span class="n">index</span><span class="p">.</span>
+<span class="n">text</span><span class="p">:</span><span class="n">TextDataset</span>      <span class="n">rdfs</span><span class="p">:</span><span class="n">subClassOf</span>   <span class="n">ja</span><span class="p">:</span><span class="n">RDFDataset</span> <span class="p">.</span>
+# <span class="n">Lucene</span> <span class="n">index</span>
+<span class="n">text</span><span class="p">:</span><span class="n">TextIndexLucene</span>  <span class="n">rdfs</span><span class="p">:</span><span class="n">subClassOf</span>   <span class="n">text</span><span class="p">:</span><span class="n">TextIndex</span> <span class="p">.</span>
+# <span class="n">Elasticsearch</span> <span class="n">index</span>
+<span class="n">text</span><span class="p">:</span><span class="n">TextIndexES</span>    <span class="n">rdfs</span><span class="p">:</span><span class="n">subClassOf</span>   <span class="n">text</span><span class="p">:</span><span class="n">TextIndex</span> <span class="p">.</span>
+
+## <span class="o">---------------------------------------------------------------</span>
+## <span class="n">This</span> <span class="n">URI</span> <span class="n">must</span> <span class="n">be</span> <span class="n">fixed</span> <span class="o">-</span> <span class="n">it</span><span class="o">&#39;</span><span class="n">s</span> <span class="n">used</span> <span class="n">to</span> <span class="n">assemble</span> <span class="n">the</span> <span class="n">text</span> <span class="n">dataset</span><span class="p">.</span>
+
+<span class="p">:</span><span class="n">text_dataset</span> <span class="n">rdf</span><span class="p">:</span><span class="n">type</span>     <span class="n">text</span><span class="p">:</span><span class="n">TextDataset</span> <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">dataset</span>   <span class="o">&lt;</span>#<span class="n">dataset</span><span class="o">&gt;</span> <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">index</span>     <span class="o">&lt;</span>#<span class="n">indexLucene</span><span class="o">&gt;</span> <span class="p">;</span>
+    <span class="p">.</span>
+
+# <span class="n">A</span> <span class="n">TDB</span> <span class="n">dataset</span> <span class="n">used</span> <span class="k">for</span> <span class="n">RDF</span> <span class="n">storage</span>
+<span class="o">&lt;</span>#<span class="n">dataset</span><span class="o">&gt;</span> <span class="n">rdf</span><span class="p">:</span><span class="n">type</span>      <span class="n">tdb</span><span class="p">:</span><span class="n">DatasetTDB</span> <span class="p">;</span>
+    <span class="n">tdb</span><span class="p">:</span><span class="n">location</span> &quot;<span class="n">DB</span>&quot; <span class="p">;</span>
+    <span class="n">tdb</span><span class="p">:</span><span class="n">unionDefaultGraph</span> <span class="n">true</span> <span class="p">;</span> # <span class="n">Optional</span>
+    <span class="p">.</span>
+
+# <span class="n">Text</span> <span class="n">index</span> <span class="n">description</span>
+<span class="o">&lt;</span>#<span class="n">indexLucene</span><span class="o">&gt;</span> <span class="n">a</span> <span class="n">text</span><span class="p">:</span><span class="n">TextIndexLucene</span> <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">directory</span> <span class="o">&lt;</span><span class="n">file</span><span class="p">:</span><span class="o">/</span><span class="n">some</span><span class="o">/</span><span class="n">path</span><span class="o">/</span><span class="n">lucene</span><span class="o">-</span><span class="n">index</span><span class="o">&gt;</span> <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">entityMap</span> <span class="o">&lt;</span>#<span class="n">entMap</span><span class="o">&gt;</span> <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">storeValues</span> <span class="n">true</span> <span class="p">;</span> 
+    <span class="n">text</span><span class="p">:</span><span class="n">analyzer</span> <span class="p">[</span> <span class="n">a</span> <span class="n">text</span><span class="p">:</span><span class="n">StandardAnalyzer</span> <span class="p">]</span> <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">queryAnalyzer</span> <span class="p">[</span> <span class="n">a</span> <span class="n">text</span><span class="p">:</span><span class="n">KeywordAnalyzer</span> <span class="p">]</span> <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">queryParser</span> <span class="n">text</span><span class="p">:</span><span class="n">AnalyzingQueryParser</span> <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">multilingualSupport</span> <span class="n">true</span> <span class="p">;</span>
+ <span class="p">.</span>
+</pre></div>
+
+
+<p>The <code>text:TextDataset</code> has two properties:</p>
+<ul>
+<li>
+<p>a <code>text:dataset</code>, e.g., a <code>tdb:DatasetTDB</code>, to contain 
+the RDF triples; and</p>
+</li>
+<li>
+<p>an index configured to use either <code>text:TextIndexLucene</code> or <code>text:TextIndexES</code>.</p>
+</li>
+</ul>
+<p>The <code>&lt;#indexLucene&gt;</code> instance of <code>text:TextIndexLucene</code>, above, has two required properties: </p>
+<ul>
+<li>
+<p>the <code>text:directory</code> 
+file URI which specifies the directory that will contain the Lucene index files &ndash; if this has the 
+value <code>"mem"</code> then the index resides in memory;</p>
+</li>
+<li>
+<p>the <code>text:entityMap</code>, <code>&lt;#entMap&gt;</code> that will define 
+what properties are to be indexed and other features of the index;</p>
+</li>
+</ul>
+<p>and several optional properties:</p>
+<ul>
+<li>
+<p><code>text:storeValues</code> controls the <a href="#storing-literal-values">storing of literal values</a>.
+It indicates whether values are stored or not &ndash; values must be stored for the 
+<a href="#query-with-sparql"><code>?literal</code> return value</a> to be available in <code>text:query</code> in SPARQL.</p>
+</li>
+<li>
+<p><code>text:analyzer</code> specifies the default <a href="#configuring-an-analyzer">analyzer configuration</a> to be used 
+during indexing and querying. If omitted, it defaults to Lucene's <code>StandardAnalyzer</code>.</p>
+</li>
+<li>
+<p><code>text:queryAnalyzer</code> specifies an optional <a href="#analyzer-for-query">analyzer for query</a> that will be
+used to analyze the query string. If not set the analyzer used to index a given field is used.</p>
+</li>
+<li>
+<p><code>text:queryParser</code> is optional and specifies an <a href="#alternative-query-parsers">alternative query parser</a>.</p>
+</li>
+<li>
+<p><code>text:multilingualSupport</code> enables <a href="#multilingual-support">Multilingual Support</a>.</p>
+</li>
+</ul>
+<p>If using Elasticsearch then an index would be configured as follows:</p>
+<div class="codehilite"><pre><span class="o">&lt;</span>#<span class="n">indexES</span><span class="o">&gt;</span> <span class="n">a</span> <span class="n">text</span><span class="p">:</span><span class="n">TextIndexES</span> <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">serverList</span> &quot;127<span class="p">.</span>0<span class="p">.</span>0<span class="p">.</span>1<span class="p">:</span>9300&quot; <span class="p">;</span> # <span class="n">A</span> <span class="n">comma</span><span class="o">-</span><span class="n">separated</span> <span class="n">list</span> <span class="n">of</span> <span class="n">Host</span><span class="p">:</span><span class="n">Port</span> <span class="n">values</span> <span class="n">of</span> <span class="n">the</span> <span class="n">ElasticSearch</span> <span class="n">Cluster</span> <span class="n">nodes</span><span class="p">.</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">clusterName</span> &quot;<span class="n">elasticsearch</span>&quot; <span class="p">;</span> # <span class="n">Name</span> <span class="n">of</span> <span class="n">the</span> <span class="n">ElasticSearch</span> <span class="n">Cluster</span><span class="p">.</span> <span class="n">If</span> <span class="n">not</span> <span class="n">specified</span> <span class="n">defaults</span> <span class="n">to</span> <span class="s">&#39;elasticsearch&#39;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">shards</span> &quot;1&quot; <span class="p">;</span>                  # <span class="n">The</span> <span class="n">number</span> <span class="n">of</span> <span class="n">shards</span> <span class="k">for</span> <span class="n">the</span> <span class="n">index</span><span class="p">.</span> <span class="n">Defaults</span> <span class="n">to</span> 1
+    <span class="n">text</span><span class="p">:</span><span class="n">replicas</span> &quot;1&quot; <span class="p">;</span>                # <span class="n">The</span> <span class="n">number</span> <span class="n">of</span> <span class="n">replicas</span> <span class="k">for</span> <span class="n">the</span> <span class="n">index</span><span class="p">.</span> <span class="n">Defaults</span> <span class="n">to</span> 1
+    <span class="n">text</span><span class="p">:</span><span class="n">indexName</span> &quot;<span class="n">jena</span><span class="o">-</span><span class="n">text</span>&quot; <span class="p">;</span>       # <span class="n">Name</span> <span class="n">of</span> <span class="n">the</span> <span class="n">Index</span><span class="p">.</span> <span class="n">defaults</span> <span class="n">to</span> <span class="n">jena</span><span class="o">-</span><span class="n">text</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">entityMap</span> <span class="o">&lt;</span>#<span class="n">entMap</span><span class="o">&gt;</span> <span class="p">;</span>
+    <span class="p">.</span>
+</pre></div>
+
+
+<p>and <code>text:index  &lt;#indexES&gt; ;</code> would be used in the configuration of <code>:text_dataset</code>.</p>
+<p>To use a text index assembler configuration in Java code it is necessary to identify the dataset URI to 
+be assembled, such as in:</p>
+<div class="codehilite"><pre><span class="n">Dataset</span> <span class="n">ds</span> <span class="p">=</span> <span class="n">DatasetFactory</span><span class="p">.</span><span class="n">assemble</span><span class="p">(</span>
+    &quot;<span class="n">text</span><span class="o">-</span><span class="n">config</span><span class="p">.</span><span class="n">ttl</span>&quot;<span class="p">,</span> 
+    &quot;<span class="n">http</span><span class="p">:</span><span class="o">//</span><span class="n">localhost</span><span class="o">/</span><span class="n">jena_example</span><span class="o">/</span>#<span class="n">text_dataset</span>&quot;<span class="p">)</span> <span class="p">;</span>
+</pre></div>
+
+
+<p>since the assembler contains two dataset definitions, one for
+the text dataset, one for the base data.  Therefore, the application
+needs to identify the text dataset by its URI
+<code>http://localhost/jena_example/#text_dataset</code>.</p>
+<h3 id="entity-map-definition">Entity Map definition<a class="headerlink" href="#entity-map-definition" title="Permanent link">&para;</a></h3>
+<p>A <code>text:EntityMap</code> has several properties that condition what is indexed, what information is stored, and 
+what analyzers are used.</p>
+<div class="codehilite"><pre><span class="o">&lt;</span>#<span class="n">entMap</span><span class="o">&gt;</span> <span class="n">a</span> <span class="n">text</span><span class="p">:</span><span class="n">EntityMap</span> <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">defaultField</span>     &quot;<span class="n">label</span>&quot; <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">entityField</span>      &quot;<span class="n">uri</span>&quot; <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">uidField</span>         &quot;<span class="n">uid</span>&quot; <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">langField</span>        &quot;<span class="n">lang</span>&quot; <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">graphField</span>       &quot;<span class="n">graph</span>&quot; <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">map</span> <span class="p">(</span>
+         <span class="p">[</span> <span class="n">text</span><span class="p">:</span><span class="n">field</span> &quot;<span class="n">label</span>&quot; <span class="p">;</span> 
+           <span class="n">text</span><span class="p">:</span><span class="n">predicate</span> <span class="n">rdfs</span><span class="p">:</span><span class="n">label</span> <span class="p">]</span>
+         <span class="p">)</span> <span class="p">.</span>
+</pre></div>
+
+
+<h4 id="default-text-field">Default text field<a class="headerlink" href="#default-text-field" title="Permanent link">&para;</a></h4>
+<p>The <code>text:defaultField</code> specifies the default field name that Lucene will use in a query that does
+not otherwise specify a field. For example,</p>
+<div class="codehilite"><pre>?<span class="n">s</span> <span class="n">text</span><span class="p">:</span><span class="n">query</span> &quot;<span class="o">\</span>&quot;<span class="n">bread</span> <span class="n">and</span> <span class="n">butter</span><span class="o">\</span>&quot;&quot;
+</pre></div>
+
+
+<p>will perform a search in the <code>label</code> field for the phrase <code>"bread and butter"</code>.</p>
+<h4 id="entity-field">Entity field<a class="headerlink" href="#entity-field" title="Permanent link">&para;</a></h4>
+<p>The <code>text:entityField</code> specifies the field name of the field that will contain the subject URI that
+is returned on a match. The value of the property is arbitrary so long as it is unique among the
+defined names.</p>
+<h4 id="automatic-document-deletion">Automatic document deletion<a class="headerlink" href="#automatic-document-deletion" title="Permanent link">&para;</a></h4>
+<p>When the <code>text:uidField</code> is defined in the <code>EntityMap</code> then dropping a triple will result in the 
+corresponding document, if any, being deleted from the text index. The value, <code>"uid"</code>, is arbitrary 
+and defines the name of a stored field in Lucene that holds a unique ID that represents the triple.</p>
+<p>If you configure the index via Java code, you need to set this parameter on the 
+EntityDefinition instance, e.g.</p>
+<div class="codehilite"><pre><span class="n">EntityDefinition</span> <span class="n">docDef</span> <span class="p">=</span> <span class="n">new</span> <span class="n">EntityDefinition</span><span class="p">(</span><span class="n">entityField</span><span class="p">,</span> <span class="n">defaultField</span><span class="p">);</span>
+<span class="n">docDef</span><span class="p">.</span><span class="n">setUidField</span><span class="p">(</span>&quot;<span class="n">uid</span>&quot;<span class="p">);</span>
+</pre></div>
+
+
+<p><strong>Note</strong>: If you migrate from an index without deletion support to an index with automatic deletion, 
+you will need to rebuild the index to ensure that the uid information is stored.</p>
+<h4 id="language-field">Language Field<a class="headerlink" href="#language-field" title="Permanent link">&para;</a></h4>
+<p>The <code>text:langField</code> is the name of the field that will store the language attribute of the literal
+in the case of an <code>rdf:langString</code>. This Entity Map property is a key element of the 
+<a href="#linguistic-support-with-lucene-index">Linguistic support with Lucene index</a>.</p>
+<h4 id="graph-field">Graph Field<a class="headerlink" href="#graph-field" title="Permanent link">&para;</a></h4>
+<p>Setting the <code>text:graphField</code> allows <a href="#graph-specific-indexing">graph-specific indexing</a> of the text 
+index to limit searching to a specified graph when a SPARQL query targets a single named graph. The 
+field value is arbitrary and serves to store the graph ID that a triple belongs to when the index is 
+updated.</p>
+<h4 id="the-analyzer-map">The Analyzer Map<a class="headerlink" href="#the-analyzer-map" title="Permanent link">&para;</a></h4>
+<p>The <code>text:map</code> is a list of <a href="#configuring-an-analyzer">analyzer specifications</a> as described below.</p>
+<h3 id="configuring-an-analyzer">Configuring an Analyzer<a class="headerlink" href="#configuring-an-analyzer" title="Permanent link">&para;</a></h3>
+<p>Text to be indexed is passed through a text analyzer that divides it into tokens 
+and may perform other transformations such as eliminating stop words. If a Lucene
+or Elasticsearch text index is used, then by default the Lucene <code>StandardAnalyzer</code> is used.</p>
+<p>In case of a <code>TextIndexLucene</code> the default analyzer can be replaced by another analyzer with 
+the <code>text:analyzer</code> property on the <code>text:TextIndexLucene</code> resource in the 
+<a href="#text-dataset-assembler">text dataset assembler</a>,  for example with a <code>SimpleAnalyzer</code>:   </p>
+<div class="codehilite"><pre><span class="o">&lt;</span>#<span class="n">indexLucene</span><span class="o">&gt;</span> <span class="n">a</span> <span class="n">text</span><span class="p">:</span><span class="n">TextIndexLucene</span> <span class="p">;</span>
+        <span class="n">text</span><span class="p">:</span><span class="n">directory</span> <span class="o">&lt;</span><span class="n">file</span><span class="p">:</span><span class="n">Lucene</span><span class="o">&gt;</span> <span class="p">;</span>
+        <span class="n">text</span><span class="p">:</span><span class="n">analyzer</span> <span class="p">[</span>
+            <span class="n">a</span> <span class="n">text</span><span class="p">:</span><span class="n">SimpleAnalyzer</span>
+        <span class="p">]</span>
+        <span class="p">.</span>
+</pre></div>
+
+
+<p>It is possible to configure an alternative analyzer for each field indexed in a
+Lucene index.  For example:</p>
+<div class="codehilite"><pre><span class="o">&lt;</span>#<span class="n">entMap</span><span class="o">&gt;</span> <span class="n">a</span> <span class="n">text</span><span class="p">:</span><span class="n">EntityMap</span> <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">entityField</span>      &quot;<span class="n">uri</span>&quot; <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">defaultField</span>     &quot;<span class="n">text</span>&quot; <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">map</span> <span class="p">(</span>
+         <span class="p">[</span> <span class="n">text</span><span class="p">:</span><span class="n">field</span> &quot;<span class="n">text</span>&quot; <span class="p">;</span> 
+           <span class="n">text</span><span class="p">:</span><span class="n">predicate</span> <span class="n">rdfs</span><span class="p">:</span><span class="n">label</span> <span class="p">;</span>
+           <span class="n">text</span><span class="p">:</span><span class="n">analyzer</span> <span class="p">[</span>
+               <span class="n">a</span> <span class="n">text</span><span class="p">:</span><span class="n">StandardAnalyzer</span> <span class="p">;</span>
+               <span class="n">text</span><span class="p">:</span><span class="n">stopWords</span> <span class="p">(</span>&quot;<span class="n">a</span>&quot; &quot;<span class="n">an</span>&quot; &quot;<span class="n">and</span>&quot; &quot;<span class="n">but</span>&quot;<span class="p">)</span>
+           <span class="p">]</span>
+         <span class="p">]</span>
+         <span class="p">)</span> <span class="p">.</span>
+</pre></div>
+
+
+<p>will configure the index to analyze values of the 'text' field
+using a <code>StandardAnalyzer</code> with the given list of stop words.</p>
+<p>Other analyzer types that may be specified are <code>SimpleAnalyzer</code> and
+<code>KeywordAnalyzer</code>, neither of which has any configuration parameters. See
+the Lucene documentation for details of what these analyzers do. Jena also
+provides <code>LowerCaseKeywordAnalyzer</code>, which is a case-insensitive version of
+<code>KeywordAnalyzer</code>, and <a href="#configurableanalyzer"><code>ConfigurableAnalyzer</code></a>.</p>
+<p>Support for the new <code>LocalizedAnalyzer</code> has been introduced in Jena 3.0.0 to
+deal with Lucene language specific analyzers. See the <a href="#linguistic-support-with-lucene-index">Linguistic Support with
+Lucene Index</a> section for details.</p>
+<p>Support for <code>GenericAnalyzer</code>s has been introduced in Jena 3.4.0 to allow
+the use of Analyzers that do not have built-in support, e.g., <code>BrazilianAnalyzer</code>; 
+require constructor parameters not otherwise supported, e.g., a stop words <code>FileReader</code> or
+a <code>stemExclusionSet</code>; and finally use of Analyzers not included in the bundled
+Lucene distribution, e.g., a <code>SanskritIASTAnalyzer</code>. See <a href="#generic-and-defined-analyzer-support">Generic and Defined
+Analyzer Support</a>.</p>
+<h4 id="configurableanalyzer">ConfigurableAnalyzer<a class="headerlink" href="#configurableanalyzer" title="Permanent link">&para;</a></h4>
+<p><code>ConfigurableAnalyzer</code> was introduced in Jena 3.0.1. It allows more detailed
+configuration of text analysis parameters by independently selecting a
+<code>Tokenizer</code> and zero or more <code>TokenFilter</code>s which are applied in order after
+tokenization. See the Lucene documentation for details on what each
+tokenizer and token filter does.</p>
+<p>The available <code>Tokenizer</code> implementations are:</p>
+<ul>
+<li><code>StandardTokenizer</code></li>
+<li><code>KeywordTokenizer</code></li>
+<li><code>WhitespaceTokenizer</code></li>
+<li><code>LetterTokenizer</code></li>
+</ul>
+<p>The available <code>TokenFilter</code> implementations are:</p>
+<ul>
+<li><code>StandardFilter</code></li>
+<li><code>LowerCaseFilter</code></li>
+<li><code>ASCIIFoldingFilter</code></li>
+</ul>
+<p>Configuration is done using Jena assembler like this:</p>
+<div class="codehilite"><pre><span class="n">text</span><span class="o">:</span><span class="n">analyzer</span> <span class="o">[</span>
+  <span class="n">a</span> <span class="n">text</span><span class="o">:</span><span class="n">ConfigurableAnalyzer</span> <span class="o">;</span>
+  <span class="n">text</span><span class="o">:</span><span class="n">tokenizer</span> <span class="n">text</span><span class="o">:</span><span class="n">KeywordTokenizer</span> <span class="o">;</span>
+  <span class="n">text</span><span class="o">:</span><span class="n">filters</span> <span class="o">(</span><span class="n">text</span><span class="o">:</span><span class="n">ASCIIFoldingFilter</span> <span class="n">text</span><span class="o">:</span><span class="n">LowerCaseFilter</span><span class="o">)</span>
+<span class="o">]</span>
+</pre></div>
+
+
+<p>Here, <code>text:tokenizer</code> must be one of the four tokenizers listed above and
+the optional <code>text:filters</code> property specifies a list of token filters.</p>
+<h4 id="analyzer-for-query">Analyzer for Query<a class="headerlink" href="#analyzer-for-query" title="Permanent link">&para;</a></h4>
+<p>New in Jena 2.13.0.</p>
+<p>There is an ability to specify an analyzer to be used for the
+query string itself.  It will find terms in the query text.  If not set, then the
+analyzer used for the document will be used.  The query analyzer is specified on
+the <code>TextIndexLucene</code> resource:</p>
+<div class="codehilite"><pre><span class="o">&lt;</span>#<span class="n">indexLucene</span><span class="o">&gt;</span> <span class="n">a</span> <span class="n">text</span><span class="p">:</span><span class="n">TextIndexLucene</span> <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">directory</span> <span class="o">&lt;</span><span class="n">file</span><span class="p">:</span><span class="n">Lucene</span><span class="o">&gt;</span> <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">entityMap</span> <span class="o">&lt;</span>#<span class="n">entMap</span><span class="o">&gt;</span> <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">queryAnalyzer</span> <span class="p">[</span>
+        <span class="n">a</span> <span class="n">text</span><span class="p">:</span><span class="n">KeywordAnalyzer</span>
+    <span class="p">]</span>
+    <span class="p">.</span>
+</pre></div>
+
+
+<h4 id="alternative-query-parsers">Alternative Query Parsers<a class="headerlink" href="#alternative-query-parsers" title="Permanent link">&para;</a></h4>
+<p>New in Jena 3.1.0.</p>
+<p>It is possible to select a query parser other than the default QueryParser.</p>
+<p>The available <code>QueryParser</code> implementations are:</p>
+<ul>
+<li><code>AnalyzingQueryParser</code>: Performs analysis for wildcard queries. This is useful in combination
+with accent-insensitive wildcard queries.</li>
+<li><code>ComplexPhraseQueryParser</code>: Permits complex phrase query syntax. E.g.: "(john jon jonathan~) peters*".
+This is useful for performing wildcard or fuzzy queries on individual terms in a phrase.</li>
+</ul>
+<p>The query parser is specified on
+the <code>TextIndexLucene</code> resource:</p>
+<div class="codehilite"><pre><span class="o">&lt;</span>#<span class="n">indexLucene</span><span class="o">&gt;</span> <span class="n">a</span> <span class="n">text</span><span class="p">:</span><span class="n">TextIndexLucene</span> <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">directory</span> <span class="o">&lt;</span><span class="n">file</span><span class="p">:</span><span class="n">Lucene</span><span class="o">&gt;</span> <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">entityMap</span> <span class="o">&lt;</span>#<span class="n">entMap</span><span class="o">&gt;</span> <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">queryParser</span> <span class="n">text</span><span class="p">:</span><span class="n">AnalyzingQueryParser</span> <span class="p">.</span>
+</pre></div>
+
+
+<p>Elasticsearch currently doesn't support Analyzers beyond Standard Analyzer. </p>
+<h3 id="configuration-by-code">Configuration by Code<a class="headerlink" href="#configuration-by-code" title="Permanent link">&para;</a></h3>
+<p>A text dataset can also be constructed in code as might be done for a
+purely in-memory setup:</p>
+<div class="codehilite"><pre>    <span class="o">//</span> <span class="n">Example</span> <span class="n">of</span> <span class="n">building</span> <span class="n">a</span> <span class="n">text</span> <span class="n">dataset</span> <span class="n">with</span> <span class="n">code</span><span class="p">.</span>
+    <span class="o">//</span> <span class="n">Example</span> <span class="n">is</span> <span class="n">in</span><span class="o">-</span><span class="n">memory</span><span class="p">.</span>
+    <span class="o">//</span> <span class="n">Base</span> <span class="n">dataset</span>
+    <span class="n">Dataset</span> <span class="n">ds1</span> <span class="p">=</span> <span class="n">DatasetFactory</span><span class="p">.</span><span class="n">createMem</span><span class="p">()</span> <span class="p">;</span>
+
+    <span class="n">EntityDefinition</span> <span class="n">entDef</span> <span class="p">=</span> <span class="n">new</span> <span class="n">EntityDefinition</span><span class="p">(</span>&quot;<span class="n">uri</span>&quot;<span class="p">,</span> &quot;<span class="n">text</span>&quot;<span class="p">,</span> <span class="n">RDFS</span><span class="p">.</span><span class="n">label</span><span class="p">)</span> <span class="p">;</span>
+
+    <span class="o">//</span> <span class="n">Lucene</span><span class="p">,</span> <span class="n">in</span> <span class="n">memory</span><span class="p">.</span>
+    <span class="n">Directory</span> <span class="n">dir</span> <span class="p">=</span>  <span class="n">new</span> <span class="n">RAMDirectory</span><span class="p">();</span>
+
+    <span class="o">//</span> <span class="n">Join</span> <span class="n">together</span> <span class="n">into</span> <span class="n">a</span> <span class="n">dataset</span>
+    <span class="n">Dataset</span> <span class="n">ds</span> <span class="p">=</span> <span class="n">TextDatasetFactory</span><span class="p">.</span><span class="n">createLucene</span><span class="p">(</span><span class="n">ds1</span><span class="p">,</span> <span class="n">dir</span><span class="p">,</span> <span class="n">entDef</span><span class="p">)</span> <span class="p">;</span>
+</pre></div>
+
+
+<h3 id="graph-specific-indexing">Graph-specific Indexing<a class="headerlink" href="#graph-specific-indexing" title="Permanent link">&para;</a></h3>
+<p>Starting with version 1.0.1, jena-text supports
+storing information about the source graph into the text index. This allows
+for more efficient text queries when the query targets only a single named
+graph. Without graph-specific indexing, text queries do not distinguish named
+graphs and will always return results from all graphs.</p>
+<p>Support for graph-specific indexing is enabled by defining the name of the
+index field to use for storing the graph identifier.</p>
+<p>If you use an assembler configuration, set the graph field using the
+text:graphField property on the EntityMap, e.g.</p>
+<div class="codehilite"><pre><span class="c"># Mapping in the index</span>
+<span class="c"># URI stored in field &quot;uri&quot;</span>
+<span class="c"># Graph stored in field &quot;graph&quot;</span>
+<span class="c"># rdfs:label is mapped to field &quot;text&quot;</span>
+<span class="o">&lt;</span><span class="c">#entMap&gt; a text:EntityMap ;</span>
+    <span class="nb">text</span><span class="p">:</span><span class="n">entityField</span>      <span class="s">&quot;uri&quot;</span> <span class="p">;</span>
+    <span class="nb">text</span><span class="p">:</span><span class="n">graphField</span>       <span class="s">&quot;graph&quot;</span> <span class="p">;</span>
+    <span class="nb">text</span><span class="p">:</span><span class="n">defaultField</span>     <span class="s">&quot;text&quot;</span> <span class="p">;</span>
+    <span class="nb">text</span><span class="p">:</span><span class="n">map</span> <span class="p">(</span>
+         <span class="p">[</span> <span class="nb">text</span><span class="p">:</span><span class="n">field</span> <span class="s">&quot;text&quot;</span> <span class="p">;</span> <span class="nb">text</span><span class="p">:</span><span class="n">predicate</span> <span class="n">rdfs</span><span class="p">:</span><span class="n">label</span> <span class="p">]</span>
+         <span class="p">)</span> <span class="p">.</span>
+</pre></div>
+
+
+<p>If you configure the index in Java code, you need to use one of the
+EntityDefinition constructors that support the graphField parameter, e.g.</p>
+<div class="codehilite"><pre>    <span class="n">EntityDefinition</span> <span class="n">entDef</span> <span class="p">=</span> <span class="n">new</span> <span class="n">EntityDefinition</span><span class="p">(</span>&quot;<span class="n">uri</span>&quot;<span class="p">,</span> &quot;<span class="n">text</span>&quot;<span class="p">,</span> &quot;<span class="n">graph</span>&quot;<span class="p">,</span> <span class="n">RDFS</span><span class="p">.</span><span class="n">label</span><span class="p">.</span><span class="n">asNode</span><span class="p">())</span> <span class="p">;</span>
+</pre></div>
+
+
+<p><strong>Note:</strong> If you migrate from a global (non-graph-aware) index to a graph-aware index,
+you need to rebuild the index to ensure that the graph information is stored.</p>
+<h3 id="linguistic-support-with-lucene-index">Linguistic support with Lucene index<a class="headerlink" href="#linguistic-support-with-lucene-index" title="Permanent link">&para;</a></h3>
+<p>It is now possible to take advantage of languages of triple literals to enhance 
+index and queries. Sub-sections below detail different settings with the index, 
+and use cases with SPARQL queries.</p>
+<h4 id="explicit-language-field-in-the-index">Explicit Language Field in the Index<a class="headerlink" href="#explicit-language-field-in-the-index" title="Permanent link">&para;</a></h4>
+<p>The language tags of triple literals can be stored in the index (when the triples are added)
+to extend query capabilities.
+To enable this, the new <code>text:langField</code> property must be set in the EntityMap assembler:</p>
+<div class="codehilite"><pre><span class="o">&lt;</span>#<span class="n">entMap</span><span class="o">&gt;</span> <span class="n">a</span> <span class="n">text</span><span class="p">:</span><span class="n">EntityMap</span> <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">entityField</span>      &quot;<span class="n">uri</span>&quot; <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">defaultField</span>     &quot;<span class="n">text</span>&quot; <span class="p">;</span>        
+    <span class="n">text</span><span class="p">:</span><span class="n">langField</span>        &quot;<span class="n">lang</span>&quot; <span class="p">;</span>       
+    <span class="p">.</span>
+</pre></div>
+
+
+<p>If you configure the index via Java code, you need to set this parameter to the 
+EntityDefinition instance, e.g.</p>
+<div class="codehilite"><pre><span class="n">EntityDefinition</span> <span class="n">docDef</span> <span class="p">=</span> <span class="n">new</span> <span class="n">EntityDefinition</span><span class="p">(</span><span class="n">entityField</span><span class="p">,</span> <span class="n">defaultField</span><span class="p">);</span>
+<span class="n">docDef</span><span class="p">.</span><span class="n">setLangField</span><span class="p">(</span>&quot;<span class="n">lang</span>&quot;<span class="p">);</span>
+</pre></div>
+
+
+<h4 id="sparql-linguistic-clause-forms">SPARQL Linguistic Clause Forms<a class="headerlink" href="#sparql-linguistic-clause-forms" title="Permanent link">&para;</a></h4>
+<p>Once the <code>langField</code> is set, you can use it directly inside SPARQL queries: the <code>'lang:xx'</code>
+argument allows you to target specific localized values. For example:</p>
+<div class="codehilite"><pre><span class="c1">//target english literals</span>
+<span class="o">?</span><span class="n">s</span> <span class="nl">text:</span><span class="n">query</span> <span class="p">(</span><span class="nl">rdfs:</span><span class="n">label</span> <span class="p">&#39;</span><span class="n">word</span><span class="sc">&#39; &#39;</span><span class="nl">lang:</span><span class="n">en</span><span class="p">&#39;</span> <span class="p">)</span>
+
+<span class="c1">//target unlocalized literals</span>
+<span class="o">?</span><span class="n">s</span> <span class="nl">text:</span><span class="n">query</span> <span class="p">(</span><span class="nl">rdfs:</span><span class="n">label</span> <span class="p">&#39;</span><span class="n">word</span><span class="sc">&#39; &#39;</span><span class="nl">lang:</span><span class="n">none</span><span class="p">&#39;)</span>
+
+<span class="c1">//ignore language field</span>
+<span class="o">?</span><span class="n">s</span> <span class="nl">text:</span><span class="n">query</span> <span class="p">(</span><span class="nl">rdfs:</span><span class="n">label</span> <span class="p">&#39;</span><span class="n">word</span><span class="p">&#39;)</span>
+</pre></div>
+
+
+<h4 id="localizedanalyzer">LocalizedAnalyzer<a class="headerlink" href="#localizedanalyzer" title="Permanent link">&para;</a></h4>
+<p>You can specify a LocalizedAnalyzer in order to benefit from Lucene's
+language-specific analyzers (stemming, stop words, ...). Like any other analyzer, it can
+be configured for default text indexing, for each individual field, or for queries.</p>
+<p>With an assembler configuration, the <code>text:language</code> property needs to be provided, e.g.:</p>
+<div class="codehilite"><pre><span class="o">&lt;</span>#<span class="n">indexLucene</span><span class="o">&gt;</span> <span class="n">a</span> <span class="n">text</span><span class="p">:</span><span class="n">TextIndexLucene</span> <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">directory</span> <span class="o">&lt;</span><span class="n">file</span><span class="p">:</span><span class="n">Lucene</span><span class="o">&gt;</span> <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">entityMap</span> <span class="o">&lt;</span>#<span class="n">entMap</span><span class="o">&gt;</span> <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">analyzer</span> <span class="p">[</span>
+        <span class="n">a</span> <span class="n">text</span><span class="p">:</span><span class="n">LocalizedAnalyzer</span> <span class="p">;</span>
+        <span class="n">text</span><span class="p">:</span><span class="n">language</span> &quot;<span class="n">fr</span>&quot;
+    <span class="p">]</span>
+    <span class="p">.</span>
+</pre></div>
+
+
+<p>will configure the index to analyze values of the 'text' field using a FrenchAnalyzer.</p>
+<p>To configure the same example via Java code, you need to provide the analyzer to the
+index configuration object:</p>
+<div class="codehilite"><pre>    <span class="n">TextIndexConfig</span> <span class="n">config</span> <span class="p">=</span> <span class="n">new</span> <span class="n">TextIndexConfig</span><span class="p">(</span><span class="n">def</span><span class="p">);</span>
+    <span class="n">Analyzer</span> <span class="n">analyzer</span> <span class="p">=</span> <span class="n">Util</span><span class="p">.</span><span class="n">getLocalizedAnalyzer</span><span class="p">(</span>&quot;<span class="n">fr</span>&quot;<span class="p">);</span>
+    <span class="n">config</span><span class="p">.</span><span class="n">setAnalyzer</span><span class="p">(</span><span class="n">analyzer</span><span class="p">);</span>
+    <span class="n">Dataset</span> <span class="n">ds</span> <span class="p">=</span> <span class="n">TextDatasetFactory</span><span class="p">.</span><span class="n">createLucene</span><span class="p">(</span><span class="n">ds1</span><span class="p">,</span> <span class="n">dir</span><span class="p">,</span> <span class="n">config</span><span class="p">)</span> <span class="p">;</span>
+</pre></div>
+
+
+<p>Where <code>def</code>, <code>ds1</code> and <code>dir</code> are instances of <code>EntityDefinition</code>, <code>Dataset</code> and 
+<code>Directory</code> classes.</p>
+<p><strong>Note</strong>: You do not have to set the <code>text:langField</code> property with a single 
+localized analyzer.</p>
+<h4 id="multilingual-support">Multilingual Support<a class="headerlink" href="#multilingual-support" title="Permanent link">&para;</a></h4>
+<p>Let us suppose that we have many triples with many localized literals in many different 
+languages. It is possible to take all these languages into account for future mixed localized queries.
+Just set the <code>text:multilingualSupport</code> property to <code>true</code> to automatically enable the localized
+indexing (and also the localized analyzer for queries):</p>
+<div class="codehilite"><pre><span class="o">&lt;</span>#<span class="n">indexLucene</span><span class="o">&gt;</span> <span class="n">a</span> <span class="n">text</span><span class="p">:</span><span class="n">TextIndexLucene</span> <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">directory</span> &quot;<span class="n">mem</span>&quot; <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">multilingualSupport</span> <span class="n">true</span><span class="p">;</span>     
+    <span class="p">.</span>
+</pre></div>
+
+
+<p>Via Java code, set the multilingual support flag:</p>
+<div class="codehilite"><pre>    <span class="n">TextIndexConfig</span> <span class="n">config</span> <span class="p">=</span> <span class="n">new</span> <span class="n">TextIndexConfig</span><span class="p">(</span><span class="n">def</span><span class="p">);</span>
+    <span class="n">config</span><span class="p">.</span><span class="n">setMultilingualSupport</span><span class="p">(</span><span class="n">true</span><span class="p">);</span>
+    <span class="n">Dataset</span> <span class="n">ds</span> <span class="p">=</span> <span class="n">TextDatasetFactory</span><span class="p">.</span><span class="n">createLucene</span><span class="p">(</span><span class="n">ds1</span><span class="p">,</span> <span class="n">dir</span><span class="p">,</span> <span class="n">config</span><span class="p">)</span> <span class="p">;</span>
+</pre></div>
+
+
+<p>Thus, this multilingual index dynamically combines all localized analyzers of existing languages with
+the storage of langField properties.</p>
+<p>For example, it is possible to refer to different languages in the same text search query :</p>
+<div class="codehilite"><pre><span class="n">SELECT</span> ?<span class="n">s</span>
+<span class="n">WHERE</span> <span class="p">{</span>
+    <span class="p">{</span> ?<span class="n">s</span> <span class="n">text</span><span class="p">:</span><span class="n">query</span> <span class="p">(</span> <span class="n">rdfs</span><span class="p">:</span><span class="n">label</span> <span class="s">&#39;institut&#39;</span> <span class="s">&#39;lang:fr&#39;</span> <span class="p">)</span> <span class="p">}</span>
+    <span class="n">UNION</span>
+    <span class="p">{</span> ?<span class="n">s</span> <span class="n">text</span><span class="p">:</span><span class="n">query</span> <span class="p">(</span> <span class="n">rdfs</span><span class="p">:</span><span class="n">label</span> <span class="s">&#39;institute&#39;</span> <span class="s">&#39;lang:en&#39;</span> <span class="p">)</span> <span class="p">}</span>
+<span class="p">}</span>
+</pre></div>
+
+
+<p>Hence, the result set of the query will contain "institute" related subjects 
+(institution, institutional,...) in French and in English.</p>
+<p><strong>Note</strong>: If the <code>text:langField</code> property is not set, it will default to "lang".</p>
+<h3 id="generic-and-defined-analyzer-support">Generic and Defined Analyzer Support<a class="headerlink" href="#generic-and-defined-analyzer-support" title="Permanent link">&para;</a></h3>
+<p>There are many Analyzers that do not have built-in support, e.g., <code>BrazilianAnalyzer</code>;
+that require constructor parameters not otherwise supported, e.g., a stop words <code>FileReader</code> or
+a <code>stemExclusionSet</code>; or that are not included in the bundled
+Lucene distribution, e.g., a <code>SanskritIASTAnalyzer</code>. Two features have been added to enhance
+the utility of jena-text: 1) <code>text:GenericAnalyzer</code>; and 2) <code>text:DefinedAnalyzer</code>.</p>
+<h4 id="generic-analyzer">Generic Analyzer<a class="headerlink" href="#generic-analyzer" title="Permanent link">&para;</a></h4>
+<p>A <code>text:GenericAnalyzer</code> includes a <code>text:class</code> which is the fully qualified class name of an
+Analyzer that is accessible on the jena classpath. This is trivial for Analyzer classes that are
+included in the bundled Lucene distribution and for other custom Analyzers a simple matter of
+including a jar containing the custom Analyzer and any associated Tokenizer and Filters on
+the classpath.</p>
+<p>In addition to the <code>text:class</code> it is generally useful to include an ordered list of <code>text:params</code>
+that will be used to select an appropriate constructor of the Analyzer class. If there are no
+<code>text:params</code> in the analyzer specification or if the <code>text:params</code> is an empty list then the 
+nullary constructor is used to instantiate the analyzer. Each element of the list of <code>text:params</code> 
+includes:</p>
+<ul>
+<li>an optional <code>text:paramName</code> of type <code>Literal</code> that is useful to identify the purpose of a 
+parameter in the assembler configuration</li>
+<li>a required <code>text:paramType</code> which is one of:</li>
+</ul>
+<table class="table">
+<thead>
+<tr>
+<th>&nbsp;Type&nbsp;</th>
+<th>&nbsp; Description&nbsp;</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><code>text:TypeAnalyzer</code></td>
+<td>a subclass of <code>org.apache.lucene.analysis.Analyzer</code></td>
+</tr>
+<tr>
+<td><code>text:TypeBoolean</code></td>
+<td>a java <code>boolean</code></td>
+</tr>
+<tr>
+<td><code>text:TypeFile</code></td>
+<td>the <code>String</code> path to a file materialized as a <code>java.io.FileReader</code></td>
+</tr>
+<tr>
+<td><code>text:TypeInt</code></td>
+<td>a java <code>int</code></td>
+</tr>
+<tr>
+<td><code>text:TypeString</code></td>
+<td>a java <code>String</code></td>
+</tr>
+<tr>
+<td><code>text:TypeSet</code></td>
+<td>an <code>org.apache.lucene.analysis.CharArraySet</code></td>
+</tr>
+</tbody>
+</table>
+<ul>
+<li>a required <code>text:paramValue</code> with an object of the type corresponding to <code>text:paramType</code></li>
+</ul>
+<p>In the case of an <code>analyzer</code> parameter the <code>text:paramValue</code> is any <code>text:analyzer</code> resource as 
+described throughout this document.</p>
+<p>An example of the use of <code>text:GenericAnalyzer</code> to configure an <code>EnglishAnalyzer</code> with stop 
+words and stem exclusions is:</p>
+<div class="codehilite"><pre><span class="n">text</span><span class="o">:</span><span class="n">map</span> <span class="o">(</span>
+     <span class="o">[</span> <span class="n">text</span><span class="o">:</span><span class="n">field</span> <span class="s2">&quot;text&quot;</span> <span class="o">;</span> 
+       <span class="n">text</span><span class="o">:</span><span class="n">predicate</span> <span class="n">rdfs</span><span class="o">:</span><span class="n">label</span><span class="o">;</span>
+       <span class="n">text</span><span class="o">:</span><span class="n">analyzer</span> <span class="o">[</span>
+           <span class="n">a</span> <span class="n">text</span><span class="o">:</span><span class="n">GenericAnalyzer</span> <span class="o">;</span>
+           <span class="n">text</span><span class="o">:</span><span class="kd">class</span> <span class="s2">&quot;org.apache.lucene.analysis.en.EnglishAnalyzer&quot;</span> <span class="o">;</span>
+           <span class="n">text</span><span class="o">:</span><span class="n">params</span> <span class="o">(</span>
+                <span class="o">[</span> <span class="n">text</span><span class="o">:</span><span class="n">paramName</span> <span class="s2">&quot;stopwords&quot;</span> <span class="o">;</span>
+                  <span class="n">text</span><span class="o">:</span><span class="n">paramType</span> <span class="n">text</span><span class="o">:</span><span class="n">TypeSet</span> <span class="o">;</span>
+                  <span class="n">text</span><span class="o">:</span><span class="n">paramValue</span> <span class="o">(</span><span class="s2">&quot;the&quot;</span> <span class="s2">&quot;a&quot;</span> <span class="s2">&quot;an&quot;</span><span class="o">)</span> <span class="o">]</span>
+                <span class="o">[</span> <span class="n">text</span><span class="o">:</span><span class="n">paramName</span> <span class="s2">&quot;stemExclusionSet&quot;</span> <span class="o">;</span>
+                  <span class="n">text</span><span class="o">:</span><span class="n">paramType</span> <span class="n">text</span><span class="o">:</span><span class="n">TypeSet</span> <span class="o">;</span>
+                  <span class="n">text</span><span class="o">:</span><span class="n">paramValue</span> <span class="o">(</span><span class="s2">&quot;ing&quot;</span> <span class="s2">&quot;ed&quot;</span><span class="o">)</span> <span class="o">]</span>
+                <span class="o">)</span>
+       <span class="o">]</span> <span class="o">.</span>
+</pre></div>
+
+
+<p>Here is an example of defining an instance of <code>ShingleAnalyzerWrapper</code>:</p>
+<div class="codehilite"><pre><span class="n">text</span><span class="o">:</span><span class="n">map</span> <span class="o">(</span>
+     <span class="o">[</span> <span class="n">text</span><span class="o">:</span><span class="n">field</span> <span class="s2">&quot;text&quot;</span> <span class="o">;</span> 
+       <span class="n">text</span><span class="o">:</span><span class="n">predicate</span> <span class="n">rdfs</span><span class="o">:</span><span class="n">label</span><span class="o">;</span>
+       <span class="n">text</span><span class="o">:</span><span class="n">analyzer</span> <span class="o">[</span>
+           <span class="n">a</span> <span class="n">text</span><span class="o">:</span><span class="n">GenericAnalyzer</span> <span class="o">;</span>
+           <span class="n">text</span><span class="o">:</span><span class="kd">class</span> <span class="s2">&quot;org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper&quot;</span> <span class="o">;</span>
+           <span class="n">text</span><span class="o">:</span><span class="n">params</span> <span class="o">(</span>
+                <span class="o">[</span> <span class="n">text</span><span class="o">:</span><span class="n">paramName</span> <span class="s2">&quot;defaultAnalyzer&quot;</span> <span class="o">;</span>
+                  <span class="n">text</span><span class="o">:</span><span class="n">paramType</span> <span class="n">text</span><span class="o">:</span><span class="n">TypeAnalyzer</span> <span class="o">;</span>
+                  <span class="n">text</span><span class="o">:</span><span class="n">paramValue</span> <span class="o">[</span> <span class="n">a</span> <span class="n">text</span><span class="o">:</span><span class="n">SimpleAnalyzer</span> <span class="o">]</span> <span class="o">]</span>
+                <span class="o">[</span> <span class="n">text</span><span class="o">:</span><span class="n">paramName</span> <span class="s2">&quot;maxShingleSize&quot;</span> <span class="o">;</span>
+                  <span class="n">text</span><span class="o">:</span><span class="n">paramType</span> <span class="n">text</span><span class="o">:</span><span class="n">TypeInt</span> <span class="o">;</span>
+                  <span class="n">text</span><span class="o">:</span><span class="n">paramValue</span> <span class="mi">3</span> <span class="o">]</span>
+                <span class="o">)</span>
+       <span class="o">]</span> <span class="o">.</span>
+</pre></div>
+
+
+<p>If you need to use an analyzer with constructor parameter types not included here then 
+one approach is to define an <code>AnalyzerWrapper</code> that uses available parameter types, such as 
+<code>file</code>, to collect the information needed to instantiate the desired analyzer. An example of
+such an analyzer is the Kuromoji morphological analyzer for Japanese text that uses constructor 
+parameters of types: <code>UserDictionary</code>, <code>JapaneseTokenizer.Mode</code>, <code>CharArraySet</code> and <code>Set&lt;String&gt;</code>.</p>
+<h4 id="defined-analyzers">Defined Analyzers<a class="headerlink" href="#defined-analyzers" title="Permanent link">&para;</a></h4>
+<p>The <code>text:defineAnalyzers</code> feature allows extending the <a href="#multilingual-support">Multilingual Support</a>
+defined above. Further, this feature can also be used to name analyzers defined via <code>text:GenericAnalyzer</code>
+so that a single (perhaps complex) analyzer configuration can be used in several places.</p>
+<p>The <code>text:defineAnalyzers</code> is used with <code>text:TextIndexLucene</code> to provide a list of analyzer
+definitions:</p>
+<div class="codehilite"><pre><span class="o">&lt;</span>#<span class="n">indexLucene</span><span class="o">&gt;</span> <span class="n">a</span> <span class="n">text</span><span class="p">:</span><span class="n">TextIndexLucene</span> <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">directory</span> <span class="o">&lt;</span><span class="n">file</span><span class="p">:</span><span class="n">Lucene</span><span class="o">&gt;</span> <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">entityMap</span> <span class="o">&lt;</span>#<span class="n">entMap</span><span class="o">&gt;</span> <span class="p">;</span>
+    <span class="n">text</span><span class="p">:</span><span class="n">defineAnalyzers</span> <span class="p">(</span>
+        <span class="p">[</span> <span class="n">text</span><span class="p">:</span><span class="n">addLang</span> &quot;<span class="n">sa</span><span class="o">-</span><span class="n">x</span><span class="o">-</span><span class="n">iast</span>&quot; <span class="p">;</span>
+          <span class="n">text</span><span class="p">:</span><span class="n">analyzer</span> <span class="p">[</span> <span class="p">.</span> <span class="p">.</span> <span class="p">.</span> <span class="p">]</span> <span class="p">]</span>
+        <span class="p">[</span> <span class="n">text</span><span class="p">:</span><span class="n">defineAnalyzer</span> <span class="o">&lt;</span>#<span class="n">foo</span><span class="o">&gt;</span> <span class="p">;</span>
+          <span class="n">text</span><span class="p">:</span><span class="n">analyzer</span> <span class="p">[</span> <span class="p">.</span> <span class="p">.</span> <span class="p">.</span> <span class="p">]</span> <span class="p">]</span>
+    <span class="p">)</span>
+    <span class="p">.</span>
+</pre></div>
+
+
+<p>References to a defined analyzer may be made in the entity map like:</p>
+<div class="codehilite"><pre><span class="n">text</span><span class="o">:</span><span class="n">analyzer</span> <span class="o">[</span>
+    <span class="n">a</span> <span class="n">text</span><span class="o">:</span><span class="n">DefinedAnalyzer</span> <span class="o">;</span>
+    <span class="n">text</span><span class="o">:</span><span class="n">useAnalyzer</span> <span class="o">&lt;</span><span class="err">#</span><span class="n">foo</span><span class="o">&gt;</span> <span class="o">]</span>
+</pre></div>
+
+
+<h5 id="extending-multilingual-support">Extending multilingual support<a class="headerlink" href="#extending-multilingual-support" title="Permanent link">&para;</a></h5>
+<p>The <a href="#multilingual-support">Multilingual Support</a> described above allows for a limited set of 
+ISO 2-letter codes to be used to select from among built-in analyzers using the nullary constructor 
+associated with each analyzer. So if one wants to use:</p>
+<ul>
+<li>a language not included, e.g., Brazilian; or </li>

[... 227 lines stripped ...]