You are viewing a plain text version of this content. The canonical link for it is here.
Posted to users@jena.apache.org by Mikael Pesonen <mi...@lingsoft.fi> on 2020/09/29 14:13:23 UTC

java.lang.StringIndexOutOfBoundsException with Jena Text

Hi

I'm building a new text index with the following command and getting a Java error.

/usr/bin/java -cp ./fuseki-server.jar jena.textindexer 
--desc=fuseki_config.ttl

After the command I get 4 files in /home/text/tools/jena_text_index/

_0.fdt
_0.fdx
segments_1
write.lock

Any idea what could cause this?


Error is:

java.lang.StringIndexOutOfBoundsException: String index out of range: 59
         at java.base/java.lang.StringLatin1.charAt(StringLatin1.java:48)
         at java.base/java.lang.String.charAt(String.java:711)
         at org.apache.jena.atlas.lib.StrUtils.decodeHex(StrUtils.java:212)
         at 
org.apache.jena.tdb.store.nodetable.NodecSSE.decode(NodecSSE.java:121)
         at org.apache.jena.tdb.lib.NodeLib.decode(NodeLib.java:120)
         at org.apache.jena.tdb.lib.NodeLib.fetchDecode(NodeLib.java:97)
         at 
org.apache.jena.tdb.store.nodetable.NodeTableNative.readNodeFromTable(NodeTableNative.java:182)
         at 
org.apache.jena.tdb.store.nodetable.NodeTableNative._retrieveNodeByNodeId(NodeTableNative.java:108)
         at 
org.apache.jena.tdb.store.nodetable.NodeTableNative.getNodeForNodeId(NodeTableNative.java:67)
         at 
org.apache.jena.tdb.store.nodetable.NodeTableCache._retrieveNodeByNodeId(NodeTableCache.java:128)
         at 
org.apache.jena.tdb.store.nodetable.NodeTableCache.getNodeForNodeId(NodeTableCache.java:82)
         at 
org.apache.jena.tdb.store.nodetable.NodeTableWrapper.getNodeForNodeId(NodeTableWrapper.java:50)
         at 
org.apache.jena.tdb.store.nodetable.NodeTableInline.getNodeForNodeId(NodeTableInline.java:67)
         at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:126)
         at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:120)
         at 
org.apache.jena.tdb.lib.TupleLib.lambda$convertToQuads$3(TupleLib.java:59)
         at org.apache.jena.atlas.iterator.Iter$2.next(Iter.java:352)
         at 
org.apache.jena.atlas.iterator.IteratorCons.next(IteratorCons.java:104)
         at jena.textindexer.exec(textindexer.java:130)
         at jena.cmd.CmdMain.mainMethod(CmdMain.java:93)
         at jena.cmd.CmdMain.mainRun(CmdMain.java:58)
         at jena.cmd.CmdMain.mainRun(CmdMain.java:45)
         at jena.textindexer.main(textindexer.java:52)
mikael@insight-dev:/home/text/tools/apache-jena-fuseki-3.14.0$ 
/usr/bin/java -cp ./fuseki-server.jar jena.textindexer 
--desc=fuseki_config.ttl
java.lang.StringIndexOutOfBoundsException: String index out of range: 59
         at java.base/java.lang.StringLatin1.charAt(StringLatin1.java:48)
         at java.base/java.lang.String.charAt(String.java:711)
         at org.apache.jena.atlas.lib.StrUtils.decodeHex(StrUtils.java:212)
         at 
org.apache.jena.tdb.store.nodetable.NodecSSE.decode(NodecSSE.java:121)
         at org.apache.jena.tdb.lib.NodeLib.decode(NodeLib.java:120)
         at org.apache.jena.tdb.lib.NodeLib.fetchDecode(NodeLib.java:97)
         at 
org.apache.jena.tdb.store.nodetable.NodeTableNative.readNodeFromTable(NodeTableNative.java:182)
         at 
org.apache.jena.tdb.store.nodetable.NodeTableNative._retrieveNodeByNodeId(NodeTableNative.java:108)
         at 
org.apache.jena.tdb.store.nodetable.NodeTableNative.getNodeForNodeId(NodeTableNative.java:67)
         at 
org.apache.jena.tdb.store.nodetable.NodeTableCache._retrieveNodeByNodeId(NodeTableCache.java:128)
         at 
org.apache.jena.tdb.store.nodetable.NodeTableCache.getNodeForNodeId(NodeTableCache.java:82)
         at 
org.apache.jena.tdb.store.nodetable.NodeTableWrapper.getNodeForNodeId(NodeTableWrapper.java:50)
         at 
org.apache.jena.tdb.store.nodetable.NodeTableInline.getNodeForNodeId(NodeTableInline.java:67)
         at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:126)
         at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:120)
         at 
org.apache.jena.tdb.lib.TupleLib.lambda$convertToQuads$3(TupleLib.java:59)
         at org.apache.jena.atlas.iterator.Iter$2.next(Iter.java:352)
         at 
org.apache.jena.atlas.iterator.IteratorCons.next(IteratorCons.java:104)
         at jena.textindexer.exec(textindexer.java:130)
         at jena.cmd.CmdMain.mainMethod(CmdMain.java:93)
         at jena.cmd.CmdMain.mainRun(CmdMain.java:58)
         at jena.cmd.CmdMain.mainRun(CmdMain.java:45)
         at jena.textindexer.main(textindexer.java:52)


config:

@prefix :<http://localhost/jena_example/#>  .
@prefix rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>  .
@prefix rdfs:<http://www.w3.org/2000/01/rdf-schema#>  .
@prefix tdb:<http://jena.hpl.hp.com/2008/tdb#>  .
@prefix ja:<http://jena.hpl.hp.com/2005/11/Assembler#>  .
@prefix text:<http://jena.apache.org/text#>  .
@prefix skos:<http://www.w3.org/2004/02/skos/core#> .
@prefix fuseki:<http://jena.apache.org/fuseki#>  .
@prefix vcard:<http://www.w3.org/2006/vcard/ns#> .

## Example of a TDB dataset and text index
## Initialize TDB
[] ja:loadClass "org.apache.jena.tdb.TDB" .
tdb:DatasetTDB  rdfs:subClassOf  ja:RDFDataset .
tdb:GraphTDB    rdfs:subClassOf  ja:Model .

## Initialize text query
[] ja:loadClass       "org.apache.jena.query.text.TextQuery" .
# A TextDataset is a regular dataset with a text index.
text:TextDataset      rdfs:subClassOf   ja:RDFDataset .
# Lucene index
text:TextIndexLucene  rdfs:subClassOf   text:TextIndex .


## ---------------------------------------------------------------
# build: java -cp ./fuseki-server.jar jena.textindexer --desc=fuseki_config.ttl

:text_dataset rdf:type     text:TextDataset ;
      text:dataset   :my_dataset ;
      text:index     <#indexLucene> ;
      .

# A TDB dataset used for RDF storage
:my_dataset rdf:type      tdb:DatasetTDB ;
      tdb:location "/home/text/tools/jena_data/" ;
#    tdb:unionDefaultGraph true ; # Optional
      .

# Text index description
<#indexLucene> a text:TextIndexLucene ;
      text:directory <file:/home/text/tools/jena_text_index/> ;
      text:entityMap <#entMap> ;
      text:storeValues true ;
      text:analyzer [ a text:StandardAnalyzer ] ;
      text:queryAnalyzer [ a text:KeywordAnalyzer ] ;
      text:queryParser text:AnalyzingQueryParser ;
      text:multilingualSupport true ;
   .

<#entMap> a text:EntityMap ;
      text:defaultField     "vcard_fn" ;
      text:entityField      "uri" ;
      text:uidField         "uid" ;
      text:langField        "lang" ;
      text:graphField       "graph" ;
      text:map (
           [ text:field "vcard_fn" ; text:predicate vcard:fn ]
           [ text:field "altLabel"  ; text:predicate skos:altLabel ]
           ) .

<#service> rdf:type fuseki:Service ;
      fuseki:name                     "/ds" ;   # http://host:port/ds-ro
      fuseki:serviceQuery             "query" ;    # SPARQL query service
      fuseki:serviceQuery             "sparql" ;   # SPARQL query service
      fuseki:serviceUpdate            "update" ;   # SPARQL update service
      fuseki:serviceUpload            "upload" ;   # Non-SPARQL upload service
      fuseki:serviceReadWriteGraphStore "data" ;     # SPARQL Graph store protocol (read and write)
      fuseki:dataset           :text_dataset ;
      .


Re: java.lang.StringIndexOutOfBoundsException with Jena Text

Posted by Mikael Pesonen <mi...@lingsoft.fi>.
Added some new fields to index and rebuilt it and now everything works. 
Not sure what happened...

On 1.10.2020 0:32, Andy Seaborne wrote:
>
>>> On startup Jena now says
>>>
>>> 2020-09-30 16:47:48,396 main ERROR Reconfiguration failed: No 
>>> configuration found for '5bc2b487' at 'null' in 'null'
>>>
>>> if that is somehow related.
>>
>> looks likely.
>
> That message is not coming from Jena (the string "Reconfiguration 
> failed" isn't in the codebase)
>
>     Andy


Re: java.lang.StringIndexOutOfBoundsException with Jena Text

Posted by Mikael Pesonen <mi...@lingsoft.fi>.
Newly inserted data is found; only old data from before the index was created 
cannot be found.

On 1.10.2020 0:32, Andy Seaborne wrote:
>
>>> On startup Jena now says
>>>
>>> 2020-09-30 16:47:48,396 main ERROR Reconfiguration failed: No 
>>> configuration found for '5bc2b487' at 'null' in 'null'
>>>
>>> if that is somehow related.
>>
>> looks likely.
>
> That message is not coming from Jena (the string "Reconfiguration 
> failed" isn't in the codebase)
>
>     Andy


Re: java.lang.StringIndexOutOfBoundsException with Jena Text

Posted by Andy Seaborne <an...@apache.org>.
>> On startup Jena now says
>>
>> 2020-09-30 16:47:48,396 main ERROR Reconfiguration failed: No 
>> configuration found for '5bc2b487' at 'null' in 'null'
>>
>> if that is somehow related.
> 
> looks likely.

That message is not coming from Jena (the string "Reconfiguration 
failed" isn't in the codebase)

     Andy

Re: java.lang.StringIndexOutOfBoundsException with Jena Text

Posted by Mikael Pesonen <mi...@lingsoft.fi>.

On 30.9.2020 22:53, Andy Seaborne wrote:
>
> On 30/09/2020 15:12, Mikael Pesonen wrote:
>>
>> Okay got the index done:
>>
>> /usr/bin/java -cp ./fuseki-server.jar jena.textindexer 
>> --desc=fuseki_config.ttl
>> 16:51:57 INFO  textindexer     :: 159657 (15965 per second)properties 
>> indexed (15965 per second overall)
>> 16:52:07 INFO  textindexer     :: 349257 (18960 per second)properties 
>> indexed (17462 per second overall)
>> 16:52:17 INFO  textindexer     :: 539238 (18998 per second)properties 
>> indexed (17974 per second overall)
>> 16:52:27 INFO  textindexer     :: 708454 (16921 per second)properties 
>> indexed (17711 per second overall)
>> 16:52:37 INFO  textindexer     :: 888469 (18001 per second)properties 
>> indexed (17769 per second overall)
>> 16:52:46 INFO  textindexer     :: 928952 (15744 per second) 
>> properties indexed
>>
>> but I'm getting no results. Tried (with data that should return matches)
>>
>> (?s ?score ?content) text:query (vcard:fn "Some Person"  )
>> and
>> ?s text:query "something"  .
>>
>>
>> On startup Jena now says
>>
>> 2020-09-30 16:47:48,396 main ERROR Reconfiguration failed: No 
>> configuration found for '5bc2b487' at 'null' in 'null'
>>
>> if that is somehow related.
>
> looks likely.
>
> Earlier you showed:
>
> select * where
> {
>  graph ?g {
>     ?s ?p ?o filter(regex(str(?s), "[\x00-\x7F]"))
>   }
> }
>
> so it may also be a named graph

Entire query is:

SELECT * WHERE
{
     GRAPH ?g
     {
          (?s ?score ?content) text:query (vcard:fn "Some Person"  )

         # or
     # ?s text:query "something"  .
     }
}



The error occurs like this:

  /usr/bin/java 
-Dlog4j.configuration=file:/home/text/tools/apache-jena-fuseki-3.16.0/log4j.properties 
-Xmx3000M -jar fuseki-server.jar --update --port 3030 --config 
/home/text/tools/apache-jena-fuseki-3.16.0/fuseki_config.ttl
2020-10-01 11:57:48,819 main ERROR Reconfiguration failed: No 
configuration found for '5bc2b487' at 'null' in 'null'
11:57:49 INFO  Server          :: Apache Jena Fuseki 3.16.0
11:57:49 INFO  Config          :: 
FUSEKI_HOME=/home/text/tools/apache-jena-fuseki-3.16.0/.
11:57:49 INFO  Config          :: 
FUSEKI_BASE=/home/text/tools/apache-jena-fuseki-3.16.0/run
11:57:49 INFO  Config          :: Shiro file: 
file:///home/text/tools/apache-jena-fuseki-3.16.0/run/shiro.ini
11:57:49 INFO  Config          :: Configuration file: 
/home/text/tools/apache-jena-fuseki-3.16.0/fuseki_config.ttl
11:57:50 INFO  Server          :: Path = /ds
11:57:50 INFO  Server          :: System
11:57:50 INFO  Server          ::   Memory: 2.9 GiB
11:57:50 INFO  Server          ::   Java:   14.0.1
11:57:50 INFO  Server          ::   OS:     Linux 4.4.0-174-generic amd64
11:57:50 INFO  Server          ::   PID:    12616
11:57:50 INFO  Server          :: Started 2020/10/01 11:57:50 EEST on 
port 3030


> Andy
>
>>
>>
>>
>> On 30.9.2020 15:18, Andy Seaborne wrote:
>>> https://issues.apache.org/jira/browse/JENA-1890 and 1892
>>>
>>> are fixed in 3.16.0
>>>
>>> It's a decode error - the TDB database is intact.
>>>
>>> On 30/09/2020 12:31, Mikael Pesonen wrote:
>>>>
>>>> I figured out the regexp. Seems that we have external data having 
>>>> non Ascii URLs that can't be altered. Is there any workaround, for 
>>>> example adding text index to selected graphs only?
>>>>
>>>> On 30.9.2020 13:57, Mikael Pesonen wrote:
>>>>>
>>>>> Ah, thanks. Is it possible to find such URIs with a SPARQL query? 
>>>>> SPARQL seems not to support \x -notation
>>>>>
>>>>> select * where
>>>>> {
>>>>>  graph ?g {
>>>>>     ?s ?p ?o filter(regex(str(?s), "[\x00-\x7F]"))
>>>>>   }
>>>>> }
>>>>>
>>>>>
>>>>>
>>>>> On 30.9.2020 13:29, Andy Seaborne wrote:
>>>>>> In the data (probably in a URI) - it's reading the database.
>>>>>>
>>>>>> On 30/09/2020 10:36, Mikael Pesonen wrote:
>>>>>>>
>>>>>>> I couldn't find any non Ascii characters in the config file 
>>>>>>> ([^\x00-\x7F]+)...
>>>>>>>
>>>>>>> On 30.9.2020 0:48, Andy Seaborne wrote:
>>>>>>>> Looks like
>>>>>>>>
>>>>>>>> https://issues.apache.org/jira/browse/JENA-1892 , 1890
>>>>>>>>
>>>>>>>>     Andy
>>>>>>>>
>>>>>>>> On 29/09/2020 15:13, Mikael Pesonen wrote:
>>>>>>>>>
>>>>>>>>> Hi
>>>>>>>>>
>>>>>>>>> I'm building a new text index with following command and 
>>>>>>>>> getting java error.
>>>>>>>>>
>>>>>>>>> /usr/bin/java -cp ./fuseki-server.jar jena.textindexer 
>>>>>>>>> --desc=fuseki_config.ttl
>>>>>>>>>
>>>>>>>>> After the command I get 4 files in 
>>>>>>>>> /home/text/tools/jena_text_index/
>>>>>>>>>
>>>>>>>>> _0.fdt
>>>>>>>>> _0.fdx
>>>>>>>>> segments_1
>>>>>>>>> write.lock
>>>>>>>>>
>>>>>>>>> Any idea what could cause this?
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Error is:
>>>>>>>>>
>>>>>>>>> java.lang.StringIndexOutOfBoundsException: String index out of 
>>>>>>>>> range: 59
>>>>>>>>>          at 
>>>>>>>>> java.base/java.lang.StringLatin1.charAt(StringLatin1.java:48)
>>>>>>>>>          at java.base/java.lang.String.charAt(String.java:711)
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.atlas.lib.StrUtils.decodeHex(StrUtils.java:212)
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.tdb.store.nodetable.NodecSSE.decode(NodecSSE.java:121) 
>>>>>>>>>
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.tdb.lib.NodeLib.decode(NodeLib.java:120)
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.tdb.lib.NodeLib.fetchDecode(NodeLib.java:97)
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.readNodeFromTable(NodeTableNative.java:182) 
>>>>>>>>>
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative._retrieveNodeByNodeId(NodeTableNative.java:108) 
>>>>>>>>>
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.getNodeForNodeId(NodeTableNative.java:67) 
>>>>>>>>>
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache._retrieveNodeByNodeId(NodeTableCache.java:128) 
>>>>>>>>>
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache.getNodeForNodeId(NodeTableCache.java:82) 
>>>>>>>>>
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableWrapper.getNodeForNodeId(NodeTableWrapper.java:50) 
>>>>>>>>>
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableInline.getNodeForNodeId(NodeTableInline.java:67) 
>>>>>>>>>
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:126)
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:120)
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.tdb.lib.TupleLib.lambda$convertToQuads$3(TupleLib.java:59) 
>>>>>>>>>
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.atlas.iterator.Iter$2.next(Iter.java:352)
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.atlas.iterator.IteratorCons.next(IteratorCons.java:104) 
>>>>>>>>>
>>>>>>>>>          at jena.textindexer.exec(textindexer.java:130)
>>>>>>>>>          at jena.cmd.CmdMain.mainMethod(CmdMain.java:93)
>>>>>>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:58)
>>>>>>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:45)
>>>>>>>>>          at jena.textindexer.main(textindexer.java:52)
>>>>>>>>> mikael@insight-dev:/home/text/tools/apache-jena-fuseki-3.14.0$ 
>>>>>>>>> /usr/bin/java -cp ./fuseki-server.jar jena.textindexer 
>>>>>>>>> --desc=fuseki_config.ttl
>>>>>>>>> java.lang.StringIndexOutOfBoundsException: String index out of 
>>>>>>>>> range: 59
>>>>>>>>>          at 
>>>>>>>>> java.base/java.lang.StringLatin1.charAt(StringLatin1.java:48)
>>>>>>>>>          at java.base/java.lang.String.charAt(String.java:711)
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.atlas.lib.StrUtils.decodeHex(StrUtils.java:212)
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.tdb.store.nodetable.NodecSSE.decode(NodecSSE.java:121) 
>>>>>>>>>
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.tdb.lib.NodeLib.decode(NodeLib.java:120)
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.tdb.lib.NodeLib.fetchDecode(NodeLib.java:97)
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.readNodeFromTable(NodeTableNative.java:182) 
>>>>>>>>>
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative._retrieveNodeByNodeId(NodeTableNative.java:108) 
>>>>>>>>>
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.getNodeForNodeId(NodeTableNative.java:67) 
>>>>>>>>>
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache._retrieveNodeByNodeId(NodeTableCache.java:128) 
>>>>>>>>>
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache.getNodeForNodeId(NodeTableCache.java:82) 
>>>>>>>>>
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableWrapper.getNodeForNodeId(NodeTableWrapper.java:50) 
>>>>>>>>>
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableInline.getNodeForNodeId(NodeTableInline.java:67) 
>>>>>>>>>
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:126)
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:120)
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.tdb.lib.TupleLib.lambda$convertToQuads$3(TupleLib.java:59) 
>>>>>>>>>
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.atlas.iterator.Iter$2.next(Iter.java:352)
>>>>>>>>>          at 
>>>>>>>>> org.apache.jena.atlas.iterator.IteratorCons.next(IteratorCons.java:104) 
>>>>>>>>>
>>>>>>>>>          at jena.textindexer.exec(textindexer.java:130)
>>>>>>>>>          at jena.cmd.CmdMain.mainMethod(CmdMain.java:93)
>>>>>>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:58)
>>>>>>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:45)
>>>>>>>>>          at jena.textindexer.main(textindexer.java:52)
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> config:
>>>>>>>>>
>>>>>>>>> @prefix :<http://localhost/jena_example/#> .
>>>>>>>>> @prefix rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
>>>>>>>>> @prefix rdfs:<http://www.w3.org/2000/01/rdf-schema#> .
>>>>>>>>> @prefix tdb:<http://jena.hpl.hp.com/2008/tdb#> .
>>>>>>>>> @prefix ja:<http://jena.hpl.hp.com/2005/11/Assembler#> .
>>>>>>>>> @prefix text:<http://jena.apache.org/text#> .
>>>>>>>>> @prefix skos:<http://www.w3.org/2004/02/skos/core#> .
>>>>>>>>> @prefix fuseki:<http://jena.apache.org/fuseki#> .
>>>>>>>>> @prefix vcard:<http://www.w3.org/2006/vcard/ns#> .
>>>>>>>>>
>>>>>>>>> ## Example of a TDB dataset and text index
>>>>>>>>> ## Initialize TDB
>>>>>>>>> [] ja:loadClass "org.apache.jena.tdb.TDB" .
>>>>>>>>> tdb:DatasetTDB  rdfs:subClassOf  ja:RDFDataset .
>>>>>>>>> tdb:GraphTDB    rdfs:subClassOf  ja:Model .
>>>>>>>>>
>>>>>>>>> ## Initialize text query
>>>>>>>>> [] ja:loadClass "org.apache.jena.query.text.TextQuery" .
>>>>>>>>> # A TextDataset is a regular dataset with a text index.
>>>>>>>>> text:TextDataset      rdfs:subClassOf ja:RDFDataset .
>>>>>>>>> # Lucene index
>>>>>>>>> text:TextIndexLucene  rdfs:subClassOf text:TextIndex .
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> ## 
>>>>>>>>> ---------------------------------------------------------------
>>>>>>>>> # build: java -cp ./fuseki-server.jar jena.textindexer 
>>>>>>>>> --desc=fuseki_config.ttl
>>>>>>>>>
>>>>>>>>> :text_dataset rdf:type     text:TextDataset ;
>>>>>>>>>       text:dataset   :my_dataset ;
>>>>>>>>>       text:index     <#indexLucene> ;
>>>>>>>>>       .
>>>>>>>>>
>>>>>>>>> # A TDB dataset used for RDF storage
>>>>>>>>> :my_dataset rdf:type      tdb:DatasetTDB ;
>>>>>>>>>       tdb:location "/home/text/tools/jena_data/" ;
>>>>>>>>> #    tdb:unionDefaultGraph true ; # Optional
>>>>>>>>>       .
>>>>>>>>>
>>>>>>>>> # Text index description
>>>>>>>>> <#indexLucene> a text:TextIndexLucene ;
>>>>>>>>>       text:directory <file:/home/text/tools/jena_text_index/> ;
>>>>>>>>>       text:entityMap <#entMap> ;
>>>>>>>>>       text:storeValues true ;
>>>>>>>>>       text:analyzer [ a text:StandardAnalyzer ] ;
>>>>>>>>>       text:queryAnalyzer [ a text:KeywordAnalyzer ] ;
>>>>>>>>>       text:queryParser text:AnalyzingQueryParser ;
>>>>>>>>>       text:multilingualSupport true ;
>>>>>>>>>    .
>>>>>>>>>
>>>>>>>>> <#entMap> a text:EntityMap ;
>>>>>>>>>       text:defaultField     "vcard_fn" ;
>>>>>>>>>       text:entityField      "uri" ;
>>>>>>>>>       text:uidField         "uid" ;
>>>>>>>>>       text:langField        "lang" ;
>>>>>>>>>       text:graphField       "graph" ;
>>>>>>>>>       text:map (
>>>>>>>>>            [ text:field "vcard_fn" ; text:predicate vcard:fn ]
>>>>>>>>>            [ text:field "altLabel"  ; text:predicate 
>>>>>>>>> skos:altLabel ]
>>>>>>>>>            ) .
>>>>>>>>>
>>>>>>>>> <#service> rdf:type fuseki:Service ;
>>>>>>>>>       fuseki:name                     "/ds" ;   # 
>>>>>>>>> http://host:port/ds-ro
>>>>>>>>>       fuseki:serviceQuery             "query" ; # SPARQL query 
>>>>>>>>> service
>>>>>>>>>       fuseki:serviceQuery             "sparql" ; # SPARQL 
>>>>>>>>> query service
>>>>>>>>>       fuseki:serviceUpdate            "update" ; # SPARQL 
>>>>>>>>> update service
>>>>>>>>>       fuseki:serviceUpload            "upload" ; # Non-SPARQL 
>>>>>>>>> upload service
>>>>>>>>>       fuseki:serviceReadWriteGraphStore "data" ;     # SPARQL 
>>>>>>>>> Graph store protocol (read and write)
>>>>>>>>>       fuseki:dataset           :text_dataset ;
>>>>>>>>>       .
>>>>>>>>>
>>>>>>>
>>>>>
>>>>
>>


Re: java.lang.StringIndexOutOfBoundsException with Jena Text

Posted by Andy Seaborne <an...@apache.org>.
On 30/09/2020 15:12, Mikael Pesonen wrote:
> 
> Okay got the index done:
> 
> /usr/bin/java -cp ./fuseki-server.jar jena.textindexer 
> --desc=fuseki_config.ttl
> 16:51:57 INFO  textindexer     :: 159657 (15965 per second)properties 
> indexed (15965 per second overall)
> 16:52:07 INFO  textindexer     :: 349257 (18960 per second)properties 
> indexed (17462 per second overall)
> 16:52:17 INFO  textindexer     :: 539238 (18998 per second)properties 
> indexed (17974 per second overall)
> 16:52:27 INFO  textindexer     :: 708454 (16921 per second)properties 
> indexed (17711 per second overall)
> 16:52:37 INFO  textindexer     :: 888469 (18001 per second)properties 
> indexed (17769 per second overall)
> 16:52:46 INFO  textindexer     :: 928952 (15744 per second) properties 
> indexed
> 
> but I'm getting no results. Tried (with data that should return matches)
> 
> (?s ?score ?content) text:query (vcard:fn "Some Person"  )
> and
> ?s text:query "something"  .
> 
> 
> On startup Jena now says
> 
> 2020-09-30 16:47:48,396 main ERROR Reconfiguration failed: No 
> configuration found for '5bc2b487' at 'null' in 'null'
> 
> if that is somehow related.

looks likely.

Earlier you showed:

select * where
{
  graph ?g {
     ?s ?p ?o filter(regex(str(?s), "[\x00-\x7F]"))
   }
}

so it may also be a named graph
     Andy

> 
> 
> 
> On 30.9.2020 15:18, Andy Seaborne wrote:
>> https://issues.apache.org/jira/browse/JENA-1890 and 1892
>>
>> are fixed in 3.16.0
>>
>> It's a decode error - the TDB database is intact.
>>
>> On 30/09/2020 12:31, Mikael Pesonen wrote:
>>>
>>> I figured out the regexp. Seems that we have external data having non 
>>> Ascii URLs that can't be altered. Is there any workaround, for 
>>> example adding text index to selected graphs only?
>>>
>>> On 30.9.2020 13:57, Mikael Pesonen wrote:
>>>>
>>>> Ah, thanks. Is it possible to find such URis with SPARQL query? 
>>>> SPARQL seems not to support \x -notation
>>>>
>>>> select * where
>>>> {
>>>>  graph ?g {
>>>>     ?s ?p ?o filter(regex(str(?s), "[\x00-\x7F]"))
>>>>   }
>>>> }
>>>>
>>>>
>>>>
>>>> On 30.9.2020 13:29, Andy Seaborne wrote:
>>>>> In the data (probably in a URI) - it's reading the database.
>>>>>
>>>>> On 30/09/2020 10:36, Mikael Pesonen wrote:
>>>>>>
>>>>>> I couldn't find any non Ascii characters in the config file 
>>>>>> ([^\x00-\x7F]+)...
>>>>>>
>>>>>> On 30.9.2020 0:48, Andy Seaborne wrote:
>>>>>>> Looks like
>>>>>>>
>>>>>>> https://issues.apache.org/jira/browse/JENA-1892 , 1890
>>>>>>>
>>>>>>>     Andy
>>>>>>>
>>>>>>> On 29/09/2020 15:13, Mikael Pesonen wrote:
>>>>>>>>
>>>>>>>> Hi
>>>>>>>>
>>>>>>>> I'm building a new text index with following command and getting 
>>>>>>>> java error.
>>>>>>>>
>>>>>>>> /usr/bin/java -cp ./fuseki-server.jar jena.textindexer 
>>>>>>>> --desc=fuseki_config.ttl
>>>>>>>>
>>>>>>>> After the command I get 4 files in 
>>>>>>>> /home/text/tools/jena_text_index/
>>>>>>>>
>>>>>>>> _0.fdt
>>>>>>>> _0.fdx
>>>>>>>> segments_1
>>>>>>>> write.lock
>>>>>>>>
>>>>>>>> Any idea what could cause this?
>>>>>>>>
>>>>>>>>
>>>>>>>> Error is:
>>>>>>>>
>>>>>>>> java.lang.StringIndexOutOfBoundsException: String index out of 
>>>>>>>> range: 59
>>>>>>>>          at 
>>>>>>>> java.base/java.lang.StringLatin1.charAt(StringLatin1.java:48)
>>>>>>>>          at java.base/java.lang.String.charAt(String.java:711)
>>>>>>>>          at 
>>>>>>>> org.apache.jena.atlas.lib.StrUtils.decodeHex(StrUtils.java:212)
>>>>>>>>          at 
>>>>>>>> org.apache.jena.tdb.store.nodetable.NodecSSE.decode(NodecSSE.java:121) 
>>>>>>>>
>>>>>>>>          at 
>>>>>>>> org.apache.jena.tdb.lib.NodeLib.decode(NodeLib.java:120)
>>>>>>>>          at 
>>>>>>>> org.apache.jena.tdb.lib.NodeLib.fetchDecode(NodeLib.java:97)
>>>>>>>>          at 
>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.readNodeFromTable(NodeTableNative.java:182) 
>>>>>>>>
>>>>>>>>          at 
>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative._retrieveNodeByNodeId(NodeTableNative.java:108) 
>>>>>>>>
>>>>>>>>          at 
>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.getNodeForNodeId(NodeTableNative.java:67) 
>>>>>>>>
>>>>>>>>          at 
>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache._retrieveNodeByNodeId(NodeTableCache.java:128) 
>>>>>>>>
>>>>>>>>          at 
>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache.getNodeForNodeId(NodeTableCache.java:82) 
>>>>>>>>
>>>>>>>>          at 
>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableWrapper.getNodeForNodeId(NodeTableWrapper.java:50) 
>>>>>>>>
>>>>>>>>          at 
>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableInline.getNodeForNodeId(NodeTableInline.java:67) 
>>>>>>>>
>>>>>>>>          at 
>>>>>>>> org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:126)
>>>>>>>>          at 
>>>>>>>> org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:120)
>>>>>>>>          at 
>>>>>>>> org.apache.jena.tdb.lib.TupleLib.lambda$convertToQuads$3(TupleLib.java:59) 
>>>>>>>>
>>>>>>>>          at 
>>>>>>>> org.apache.jena.atlas.iterator.Iter$2.next(Iter.java:352)
>>>>>>>>          at 
>>>>>>>> org.apache.jena.atlas.iterator.IteratorCons.next(IteratorCons.java:104) 
>>>>>>>>
>>>>>>>>          at jena.textindexer.exec(textindexer.java:130)
>>>>>>>>          at jena.cmd.CmdMain.mainMethod(CmdMain.java:93)
>>>>>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:58)
>>>>>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:45)
>>>>>>>>          at jena.textindexer.main(textindexer.java:52)
>>>>>>>> mikael@insight-dev:/home/text/tools/apache-jena-fuseki-3.14.0$ 
>>>>>>>> /usr/bin/java -cp ./fuseki-server.jar jena.textindexer 
>>>>>>>> --desc=fuseki_config.ttl
>>>>>>>> java.lang.StringIndexOutOfBoundsException: String index out of 
>>>>>>>> range: 59
>>>>>>>>          at 
>>>>>>>> java.base/java.lang.StringLatin1.charAt(StringLatin1.java:48)
>>>>>>>>          at java.base/java.lang.String.charAt(String.java:711)
>>>>>>>>          at 
>>>>>>>> org.apache.jena.atlas.lib.StrUtils.decodeHex(StrUtils.java:212)
>>>>>>>>          at 
>>>>>>>> org.apache.jena.tdb.store.nodetable.NodecSSE.decode(NodecSSE.java:121) 
>>>>>>>>
>>>>>>>>          at 
>>>>>>>> org.apache.jena.tdb.lib.NodeLib.decode(NodeLib.java:120)
>>>>>>>>          at 
>>>>>>>> org.apache.jena.tdb.lib.NodeLib.fetchDecode(NodeLib.java:97)
>>>>>>>>          at 
>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.readNodeFromTable(NodeTableNative.java:182) 
>>>>>>>>
>>>>>>>>          at 
>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative._retrieveNodeByNodeId(NodeTableNative.java:108) 
>>>>>>>>
>>>>>>>>          at 
>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.getNodeForNodeId(NodeTableNative.java:67) 
>>>>>>>>
>>>>>>>>          at 
>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache._retrieveNodeByNodeId(NodeTableCache.java:128) 
>>>>>>>>
>>>>>>>>          at 
>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache.getNodeForNodeId(NodeTableCache.java:82) 
>>>>>>>>
>>>>>>>>          at 
>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableWrapper.getNodeForNodeId(NodeTableWrapper.java:50) 
>>>>>>>>
>>>>>>>>          at 
>>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableInline.getNodeForNodeId(NodeTableInline.java:67) 
>>>>>>>>
>>>>>>>>          at 
>>>>>>>> org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:126)
>>>>>>>>          at 
>>>>>>>> org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:120)
>>>>>>>>          at 
>>>>>>>> org.apache.jena.tdb.lib.TupleLib.lambda$convertToQuads$3(TupleLib.java:59) 
>>>>>>>>
>>>>>>>>          at 
>>>>>>>> org.apache.jena.atlas.iterator.Iter$2.next(Iter.java:352)
>>>>>>>>          at 
>>>>>>>> org.apache.jena.atlas.iterator.IteratorCons.next(IteratorCons.java:104) 
>>>>>>>>
>>>>>>>>          at jena.textindexer.exec(textindexer.java:130)
>>>>>>>>          at jena.cmd.CmdMain.mainMethod(CmdMain.java:93)
>>>>>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:58)
>>>>>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:45)
>>>>>>>>          at jena.textindexer.main(textindexer.java:52)
>>>>>>>>
>>>>>>>>
>>>>>>>> config:
>>>>>>>>
>>>>>>>> @prefix :<http://localhost/jena_example/#>  .
>>>>>>>> @prefix rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
>>>>>>>> @prefix rdfs:<http://www.w3.org/2000/01/rdf-schema#> .
>>>>>>>> @prefix tdb:<http://jena.hpl.hp.com/2008/tdb#> .
>>>>>>>> @prefix ja:<http://jena.hpl.hp.com/2005/11/Assembler#> .
>>>>>>>> @prefix text:<http://jena.apache.org/text#>  .
>>>>>>>> @prefix skos:<http://www.w3.org/2004/02/skos/core#> .
>>>>>>>> @prefix fuseki:<http://jena.apache.org/fuseki#> .
>>>>>>>> @prefix vcard:<http://www.w3.org/2006/vcard/ns#> .
>>>>>>>>
>>>>>>>> ## Example of a TDB dataset and text index
>>>>>>>> ## Initialize TDB
>>>>>>>> [] ja:loadClass "org.apache.jena.tdb.TDB" .
>>>>>>>> tdb:DatasetTDB  rdfs:subClassOf  ja:RDFDataset .
>>>>>>>> tdb:GraphTDB    rdfs:subClassOf  ja:Model .
>>>>>>>>
>>>>>>>> ## Initialize text query
>>>>>>>> [] ja:loadClass "org.apache.jena.query.text.TextQuery" .
>>>>>>>> # A TextDataset is a regular dataset with a text index.
>>>>>>>> text:TextDataset      rdfs:subClassOf   ja:RDFDataset .
>>>>>>>> # Lucene index
>>>>>>>> text:TextIndexLucene  rdfs:subClassOf   text:TextIndex .
>>>>>>>>
>>>>>>>>
>>>>>>>> ## ---------------------------------------------------------------
>>>>>>>> # build: java -cp ./fuseki-server.jar jena.textindexer 
>>>>>>>> --desc=fuseki_config.ttl
>>>>>>>>
>>>>>>>> :text_dataset rdf:type     text:TextDataset ;
>>>>>>>>       text:dataset   :my_dataset ;
>>>>>>>>       text:index     <#indexLucene> ;
>>>>>>>>       .
>>>>>>>>
>>>>>>>> # A TDB dataset used for RDF storage
>>>>>>>> :my_dataset rdf:type      tdb:DatasetTDB ;
>>>>>>>>       tdb:location "/home/text/tools/jena_data/" ;
>>>>>>>> #    tdb:unionDefaultGraph true ; # Optional
>>>>>>>>       .
>>>>>>>>
>>>>>>>> # Text index description
>>>>>>>> <#indexLucene> a text:TextIndexLucene ;
>>>>>>>>       text:directory <file:/home/text/tools/jena_text_index/> ;
>>>>>>>>       text:entityMap <#entMap> ;
>>>>>>>>       text:storeValues true ;
>>>>>>>>       text:analyzer [ a text:StandardAnalyzer ] ;
>>>>>>>>       text:queryAnalyzer [ a text:KeywordAnalyzer ] ;
>>>>>>>>       text:queryParser text:AnalyzingQueryParser ;
>>>>>>>>       text:multilingualSupport true ;
>>>>>>>>    .
>>>>>>>>
>>>>>>>> <#entMap> a text:EntityMap ;
>>>>>>>>       text:defaultField     "vcard_fn" ;
>>>>>>>>       text:entityField      "uri" ;
>>>>>>>>       text:uidField         "uid" ;
>>>>>>>>       text:langField        "lang" ;
>>>>>>>>       text:graphField       "graph" ;
>>>>>>>>       text:map (
>>>>>>>>            [ text:field "vcard_fn" ; text:predicate vcard:fn ]
>>>>>>>>            [ text:field "altLabel"  ; text:predicate 
>>>>>>>> skos:altLabel ]
>>>>>>>>            ) .
>>>>>>>>
>>>>>>>> <#service> rdf:type fuseki:Service ;
>>>>>>>>       fuseki:name                     "/ds" ;   # 
>>>>>>>> http://host:port/ds-ro
>>>>>>>>       fuseki:serviceQuery             "query" ;    # SPARQL 
>>>>>>>> query service
>>>>>>>>       fuseki:serviceQuery             "sparql" ;   # SPARQL 
>>>>>>>> query service
>>>>>>>>       fuseki:serviceUpdate            "update" ;   # SPARQL 
>>>>>>>> update service
>>>>>>>>       fuseki:serviceUpload            "upload" ;   # Non-SPARQL 
>>>>>>>> upload service
>>>>>>>>       fuseki:serviceReadWriteGraphStore "data" ;     # SPARQL 
>>>>>>>> Graph store protocol (read and write)
>>>>>>>>       fuseki:dataset           :text_dataset ;
>>>>>>>>       .
>>>>>>>>
>>>>>>
>>>>
>>>
> 

Re: java.lang.StringIndexOutOfBoundsException with Jena Text

Posted by Mikael Pesonen <mi...@lingsoft.fi>.
Okay got the index done:

/usr/bin/java -cp ./fuseki-server.jar jena.textindexer 
--desc=fuseki_config.ttl
16:51:57 INFO  textindexer     :: 159657 (15965 per second)properties 
indexed (15965 per second overall)
16:52:07 INFO  textindexer     :: 349257 (18960 per second)properties 
indexed (17462 per second overall)
16:52:17 INFO  textindexer     :: 539238 (18998 per second)properties 
indexed (17974 per second overall)
16:52:27 INFO  textindexer     :: 708454 (16921 per second)properties 
indexed (17711 per second overall)
16:52:37 INFO  textindexer     :: 888469 (18001 per second)properties 
indexed (17769 per second overall)
16:52:46 INFO  textindexer     :: 928952 (15744 per second) properties 
indexed

but I'm getting no results. Tried (with data that should return matches)

(?s ?score ?content) text:query (vcard:fn "Some Person"  )
and
?s text:query "something"  .


On startup Jena now says

2020-09-30 16:47:48,396 main ERROR Reconfiguration failed: No 
configuration found for '5bc2b487' at 'null' in 'null'

if that is somehow related.



On 30.9.2020 15:18, Andy Seaborne wrote:
> https://issues.apache.org/jira/browse/JENA-1890 and 1892
>
> are fixed in 3.16.0
>
> It's a decode error - the TDB database is intact.
>
> On 30/09/2020 12:31, Mikael Pesonen wrote:
>>
>> I figured out the regexp. Seems that we have external data having non 
>> Ascii URLs that can't be altered. Is there any workaround, for 
>> example adding text index to selected graphs only?
>>
>> On 30.9.2020 13:57, Mikael Pesonen wrote:
>>>
>>> Ah, thanks. Is it possible to find such URis with SPARQL query? 
>>> SPARQL seems not to support \x -notation
>>>
>>> select * where
>>> {
>>>  graph ?g {
>>>     ?s ?p ?o filter(regex(str(?s), "[\x00-\x7F]"))
>>>   }
>>> }
>>>
>>>
>>>
>>> On 30.9.2020 13:29, Andy Seaborne wrote:
>>>> In the data (probably in a URI) - it's reading the database.
>>>>
>>>> On 30/09/2020 10:36, Mikael Pesonen wrote:
>>>>>
>>>>> I couldn't find any non Ascii characters in the config file 
>>>>> ([^\x00-\x7F]+)...
>>>>>
>>>>> On 30.9.2020 0:48, Andy Seaborne wrote:
>>>>>> Looks like
>>>>>>
>>>>>> https://issues.apache.org/jira/browse/JENA-1892 , 1890
>>>>>>
>>>>>>     Andy
>>>>>>
>>>>>> On 29/09/2020 15:13, Mikael Pesonen wrote:
>>>>>>>
>>>>>>> Hi
>>>>>>>
>>>>>>> I'm building a new text index with following command and getting 
>>>>>>> java error.
>>>>>>>
>>>>>>> /usr/bin/java -cp ./fuseki-server.jar jena.textindexer 
>>>>>>> --desc=fuseki_config.ttl
>>>>>>>
>>>>>>> After the command I get 4 files in 
>>>>>>> /home/text/tools/jena_text_index/
>>>>>>>
>>>>>>> _0.fdt
>>>>>>> _0.fdx
>>>>>>> segments_1
>>>>>>> write.lock
>>>>>>>
>>>>>>> Any idea what could cause this?
>>>>>>>
>>>>>>>
>>>>>>> Error is:
>>>>>>>
>>>>>>> java.lang.StringIndexOutOfBoundsException: String index out of 
>>>>>>> range: 59
>>>>>>>          at 
>>>>>>> java.base/java.lang.StringLatin1.charAt(StringLatin1.java:48)
>>>>>>>          at java.base/java.lang.String.charAt(String.java:711)
>>>>>>>          at 
>>>>>>> org.apache.jena.atlas.lib.StrUtils.decodeHex(StrUtils.java:212)
>>>>>>>          at 
>>>>>>> org.apache.jena.tdb.store.nodetable.NodecSSE.decode(NodecSSE.java:121) 
>>>>>>>
>>>>>>>          at 
>>>>>>> org.apache.jena.tdb.lib.NodeLib.decode(NodeLib.java:120)
>>>>>>>          at 
>>>>>>> org.apache.jena.tdb.lib.NodeLib.fetchDecode(NodeLib.java:97)
>>>>>>>          at 
>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.readNodeFromTable(NodeTableNative.java:182) 
>>>>>>>
>>>>>>>          at 
>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative._retrieveNodeByNodeId(NodeTableNative.java:108) 
>>>>>>>
>>>>>>>          at 
>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.getNodeForNodeId(NodeTableNative.java:67) 
>>>>>>>
>>>>>>>          at 
>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache._retrieveNodeByNodeId(NodeTableCache.java:128) 
>>>>>>>
>>>>>>>          at 
>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache.getNodeForNodeId(NodeTableCache.java:82) 
>>>>>>>
>>>>>>>          at 
>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableWrapper.getNodeForNodeId(NodeTableWrapper.java:50) 
>>>>>>>
>>>>>>>          at 
>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableInline.getNodeForNodeId(NodeTableInline.java:67) 
>>>>>>>
>>>>>>>          at 
>>>>>>> org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:126)
>>>>>>>          at 
>>>>>>> org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:120)
>>>>>>>          at 
>>>>>>> org.apache.jena.tdb.lib.TupleLib.lambda$convertToQuads$3(TupleLib.java:59) 
>>>>>>>
>>>>>>>          at 
>>>>>>> org.apache.jena.atlas.iterator.Iter$2.next(Iter.java:352)
>>>>>>>          at 
>>>>>>> org.apache.jena.atlas.iterator.IteratorCons.next(IteratorCons.java:104) 
>>>>>>>
>>>>>>>          at jena.textindexer.exec(textindexer.java:130)
>>>>>>>          at jena.cmd.CmdMain.mainMethod(CmdMain.java:93)
>>>>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:58)
>>>>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:45)
>>>>>>>          at jena.textindexer.main(textindexer.java:52)
>>>>>>> mikael@insight-dev:/home/text/tools/apache-jena-fuseki-3.14.0$ 
>>>>>>> /usr/bin/java -cp ./fuseki-server.jar jena.textindexer 
>>>>>>> --desc=fuseki_config.ttl
>>>>>>> java.lang.StringIndexOutOfBoundsException: String index out of 
>>>>>>> range: 59
>>>>>>>          at 
>>>>>>> java.base/java.lang.StringLatin1.charAt(StringLatin1.java:48)
>>>>>>>          at java.base/java.lang.String.charAt(String.java:711)
>>>>>>>          at 
>>>>>>> org.apache.jena.atlas.lib.StrUtils.decodeHex(StrUtils.java:212)
>>>>>>>          at 
>>>>>>> org.apache.jena.tdb.store.nodetable.NodecSSE.decode(NodecSSE.java:121) 
>>>>>>>
>>>>>>>          at 
>>>>>>> org.apache.jena.tdb.lib.NodeLib.decode(NodeLib.java:120)
>>>>>>>          at 
>>>>>>> org.apache.jena.tdb.lib.NodeLib.fetchDecode(NodeLib.java:97)
>>>>>>>          at 
>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.readNodeFromTable(NodeTableNative.java:182) 
>>>>>>>
>>>>>>>          at 
>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative._retrieveNodeByNodeId(NodeTableNative.java:108) 
>>>>>>>
>>>>>>>          at 
>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.getNodeForNodeId(NodeTableNative.java:67) 
>>>>>>>
>>>>>>>          at 
>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache._retrieveNodeByNodeId(NodeTableCache.java:128) 
>>>>>>>
>>>>>>>          at 
>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache.getNodeForNodeId(NodeTableCache.java:82) 
>>>>>>>
>>>>>>>          at 
>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableWrapper.getNodeForNodeId(NodeTableWrapper.java:50) 
>>>>>>>
>>>>>>>          at 
>>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableInline.getNodeForNodeId(NodeTableInline.java:67) 
>>>>>>>
>>>>>>>          at 
>>>>>>> org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:126)
>>>>>>>          at 
>>>>>>> org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:120)
>>>>>>>          at 
>>>>>>> org.apache.jena.tdb.lib.TupleLib.lambda$convertToQuads$3(TupleLib.java:59) 
>>>>>>>
>>>>>>>          at 
>>>>>>> org.apache.jena.atlas.iterator.Iter$2.next(Iter.java:352)
>>>>>>>          at 
>>>>>>> org.apache.jena.atlas.iterator.IteratorCons.next(IteratorCons.java:104) 
>>>>>>>
>>>>>>>          at jena.textindexer.exec(textindexer.java:130)
>>>>>>>          at jena.cmd.CmdMain.mainMethod(CmdMain.java:93)
>>>>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:58)
>>>>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:45)
>>>>>>>          at jena.textindexer.main(textindexer.java:52)
>>>>>>>
>>>>>>>
>>>>>>> config:
>>>>>>>
>>>>>>> @prefix :<http://localhost/jena_example/#>  .
>>>>>>> @prefix rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
>>>>>>> @prefix rdfs:<http://www.w3.org/2000/01/rdf-schema#> .
>>>>>>> @prefix tdb:<http://jena.hpl.hp.com/2008/tdb#> .
>>>>>>> @prefix ja:<http://jena.hpl.hp.com/2005/11/Assembler#> .
>>>>>>> @prefix text:<http://jena.apache.org/text#>  .
>>>>>>> @prefix skos:<http://www.w3.org/2004/02/skos/core#> .
>>>>>>> @prefix fuseki:<http://jena.apache.org/fuseki#> .
>>>>>>> @prefix vcard:<http://www.w3.org/2006/vcard/ns#> .
>>>>>>>
>>>>>>> ## Example of a TDB dataset and text index
>>>>>>> ## Initialize TDB
>>>>>>> [] ja:loadClass "org.apache.jena.tdb.TDB" .
>>>>>>> tdb:DatasetTDB  rdfs:subClassOf  ja:RDFDataset .
>>>>>>> tdb:GraphTDB    rdfs:subClassOf  ja:Model .
>>>>>>>
>>>>>>> ## Initialize text query
>>>>>>> [] ja:loadClass "org.apache.jena.query.text.TextQuery" .
>>>>>>> # A TextDataset is a regular dataset with a text index.
>>>>>>> text:TextDataset      rdfs:subClassOf   ja:RDFDataset .
>>>>>>> # Lucene index
>>>>>>> text:TextIndexLucene  rdfs:subClassOf   text:TextIndex .
>>>>>>>
>>>>>>>
>>>>>>> ## ---------------------------------------------------------------
>>>>>>> # build: java -cp ./fuseki-server.jar jena.textindexer 
>>>>>>> --desc=fuseki_config.ttl
>>>>>>>
>>>>>>> :text_dataset rdf:type     text:TextDataset ;
>>>>>>>       text:dataset   :my_dataset ;
>>>>>>>       text:index     <#indexLucene> ;
>>>>>>>       .
>>>>>>>
>>>>>>> # A TDB dataset used for RDF storage
>>>>>>> :my_dataset rdf:type      tdb:DatasetTDB ;
>>>>>>>       tdb:location "/home/text/tools/jena_data/" ;
>>>>>>> #    tdb:unionDefaultGraph true ; # Optional
>>>>>>>       .
>>>>>>>
>>>>>>> # Text index description
>>>>>>> <#indexLucene> a text:TextIndexLucene ;
>>>>>>>       text:directory <file:/home/text/tools/jena_text_index/> ;
>>>>>>>       text:entityMap <#entMap> ;
>>>>>>>       text:storeValues true ;
>>>>>>>       text:analyzer [ a text:StandardAnalyzer ] ;
>>>>>>>       text:queryAnalyzer [ a text:KeywordAnalyzer ] ;
>>>>>>>       text:queryParser text:AnalyzingQueryParser ;
>>>>>>>       text:multilingualSupport true ;
>>>>>>>    .
>>>>>>>
>>>>>>> <#entMap> a text:EntityMap ;
>>>>>>>       text:defaultField     "vcard_fn" ;
>>>>>>>       text:entityField      "uri" ;
>>>>>>>       text:uidField         "uid" ;
>>>>>>>       text:langField        "lang" ;
>>>>>>>       text:graphField       "graph" ;
>>>>>>>       text:map (
>>>>>>>            [ text:field "vcard_fn" ; text:predicate vcard:fn ]
>>>>>>>            [ text:field "altLabel"  ; text:predicate 
>>>>>>> skos:altLabel ]
>>>>>>>            ) .
>>>>>>>
>>>>>>> <#service> rdf:type fuseki:Service ;
>>>>>>>       fuseki:name                     "/ds" ;   # 
>>>>>>> http://host:port/ds-ro
>>>>>>>       fuseki:serviceQuery             "query" ;    # SPARQL 
>>>>>>> query service
>>>>>>>       fuseki:serviceQuery             "sparql" ;   # SPARQL 
>>>>>>> query service
>>>>>>>       fuseki:serviceUpdate            "update" ;   # SPARQL 
>>>>>>> update service
>>>>>>>       fuseki:serviceUpload            "upload" ;   # Non-SPARQL 
>>>>>>> upload service
>>>>>>>       fuseki:serviceReadWriteGraphStore "data" ;     # SPARQL 
>>>>>>> Graph store protocol (read and write)
>>>>>>>       fuseki:dataset           :text_dataset ;
>>>>>>>       .
>>>>>>>
>>>>>
>>>
>>


Re: java.lang.StringIndexOutOfBoundsException with Jena Text

Posted by Andy Seaborne <an...@apache.org>.
https://issues.apache.org/jira/browse/JENA-1890 and 1892

are fixed in 3.16.0

It's a decode error - the TDB database is intact.

On 30/09/2020 12:31, Mikael Pesonen wrote:
> 
> I figured out the regexp. Seems that we have external data having non 
> Ascii URLs that can't be altered. Is there any workaround, for example 
> adding text index to selected graphs only?
> 
> On 30.9.2020 13:57, Mikael Pesonen wrote:
>>
>> Ah, thanks. Is it possible to find such URis with SPARQL query? SPARQL 
>> seems not to support \x -notation
>>
>> select * where
>> {
>>  graph ?g {
>>     ?s ?p ?o filter(regex(str(?s), "[\x00-\x7F]"))
>>   }
>> }
>>
>>
>>
>> On 30.9.2020 13:29, Andy Seaborne wrote:
>>> In the data (probably in a URI) - it's reading the database.
>>>
>>> On 30/09/2020 10:36, Mikael Pesonen wrote:
>>>>
>>>> I couldn't find any non Ascii characters in the config file 
>>>> ([^\x00-\x7F]+)...
>>>>
>>>> On 30.9.2020 0:48, Andy Seaborne wrote:
>>>>> Looks like
>>>>>
>>>>> https://issues.apache.org/jira/browse/JENA-1892 , 1890
>>>>>
>>>>>     Andy
>>>>>
>>>>> On 29/09/2020 15:13, Mikael Pesonen wrote:
>>>>>>
>>>>>> Hi
>>>>>>
>>>>>> I'm building a new text index with following command and getting 
>>>>>> java error.
>>>>>>
>>>>>> /usr/bin/java -cp ./fuseki-server.jar jena.textindexer 
>>>>>> --desc=fuseki_config.ttl
>>>>>>
>>>>>> After the command I get 4 files in /home/text/tools/jena_text_index/
>>>>>>
>>>>>> _0.fdt
>>>>>> _0.fdx
>>>>>> segments_1
>>>>>> write.lock
>>>>>>
>>>>>> Any idea what could cause this?
>>>>>>
>>>>>>
>>>>>> Error is:
>>>>>>
>>>>>> java.lang.StringIndexOutOfBoundsException: String index out of 
>>>>>> range: 59
>>>>>>          at 
>>>>>> java.base/java.lang.StringLatin1.charAt(StringLatin1.java:48)
>>>>>>          at java.base/java.lang.String.charAt(String.java:711)
>>>>>>          at 
>>>>>> org.apache.jena.atlas.lib.StrUtils.decodeHex(StrUtils.java:212)
>>>>>>          at 
>>>>>> org.apache.jena.tdb.store.nodetable.NodecSSE.decode(NodecSSE.java:121) 
>>>>>>
>>>>>>          at org.apache.jena.tdb.lib.NodeLib.decode(NodeLib.java:120)
>>>>>>          at 
>>>>>> org.apache.jena.tdb.lib.NodeLib.fetchDecode(NodeLib.java:97)
>>>>>>          at 
>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.readNodeFromTable(NodeTableNative.java:182) 
>>>>>>
>>>>>>          at 
>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative._retrieveNodeByNodeId(NodeTableNative.java:108) 
>>>>>>
>>>>>>          at 
>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.getNodeForNodeId(NodeTableNative.java:67) 
>>>>>>
>>>>>>          at 
>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache._retrieveNodeByNodeId(NodeTableCache.java:128) 
>>>>>>
>>>>>>          at 
>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache.getNodeForNodeId(NodeTableCache.java:82) 
>>>>>>
>>>>>>          at 
>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableWrapper.getNodeForNodeId(NodeTableWrapper.java:50) 
>>>>>>
>>>>>>          at 
>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableInline.getNodeForNodeId(NodeTableInline.java:67) 
>>>>>>
>>>>>>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:126)
>>>>>>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:120)
>>>>>>          at 
>>>>>> org.apache.jena.tdb.lib.TupleLib.lambda$convertToQuads$3(TupleLib.java:59) 
>>>>>>
>>>>>>          at org.apache.jena.atlas.iterator.Iter$2.next(Iter.java:352)
>>>>>>          at 
>>>>>> org.apache.jena.atlas.iterator.IteratorCons.next(IteratorCons.java:104) 
>>>>>>
>>>>>>          at jena.textindexer.exec(textindexer.java:130)
>>>>>>          at jena.cmd.CmdMain.mainMethod(CmdMain.java:93)
>>>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:58)
>>>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:45)
>>>>>>          at jena.textindexer.main(textindexer.java:52)
>>>>>> mikael@insight-dev:/home/text/tools/apache-jena-fuseki-3.14.0$ 
>>>>>> /usr/bin/java -cp ./fuseki-server.jar jena.textindexer 
>>>>>> --desc=fuseki_config.ttl
>>>>>> java.lang.StringIndexOutOfBoundsException: String index out of 
>>>>>> range: 59
>>>>>>          at 
>>>>>> java.base/java.lang.StringLatin1.charAt(StringLatin1.java:48)
>>>>>>          at java.base/java.lang.String.charAt(String.java:711)
>>>>>>          at 
>>>>>> org.apache.jena.atlas.lib.StrUtils.decodeHex(StrUtils.java:212)
>>>>>>          at 
>>>>>> org.apache.jena.tdb.store.nodetable.NodecSSE.decode(NodecSSE.java:121) 
>>>>>>
>>>>>>          at org.apache.jena.tdb.lib.NodeLib.decode(NodeLib.java:120)
>>>>>>          at 
>>>>>> org.apache.jena.tdb.lib.NodeLib.fetchDecode(NodeLib.java:97)
>>>>>>          at 
>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.readNodeFromTable(NodeTableNative.java:182) 
>>>>>>
>>>>>>          at 
>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative._retrieveNodeByNodeId(NodeTableNative.java:108) 
>>>>>>
>>>>>>          at 
>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.getNodeForNodeId(NodeTableNative.java:67) 
>>>>>>
>>>>>>          at 
>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache._retrieveNodeByNodeId(NodeTableCache.java:128) 
>>>>>>
>>>>>>          at 
>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache.getNodeForNodeId(NodeTableCache.java:82) 
>>>>>>
>>>>>>          at 
>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableWrapper.getNodeForNodeId(NodeTableWrapper.java:50) 
>>>>>>
>>>>>>          at 
>>>>>> org.apache.jena.tdb.store.nodetable.NodeTableInline.getNodeForNodeId(NodeTableInline.java:67) 
>>>>>>
>>>>>>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:126)
>>>>>>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:120)
>>>>>>          at 
>>>>>> org.apache.jena.tdb.lib.TupleLib.lambda$convertToQuads$3(TupleLib.java:59) 
>>>>>>
>>>>>>          at org.apache.jena.atlas.iterator.Iter$2.next(Iter.java:352)
>>>>>>          at 
>>>>>> org.apache.jena.atlas.iterator.IteratorCons.next(IteratorCons.java:104) 
>>>>>>
>>>>>>          at jena.textindexer.exec(textindexer.java:130)
>>>>>>          at jena.cmd.CmdMain.mainMethod(CmdMain.java:93)
>>>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:58)
>>>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:45)
>>>>>>          at jena.textindexer.main(textindexer.java:52)
>>>>>>
>>>>>>
>>>>>> config:
>>>>>>
>>>>>> @prefix :<http://localhost/jena_example/#>  .
>>>>>> @prefix rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
>>>>>> @prefix rdfs:<http://www.w3.org/2000/01/rdf-schema#> .
>>>>>> @prefix tdb:<http://jena.hpl.hp.com/2008/tdb#>  .
>>>>>> @prefix ja:<http://jena.hpl.hp.com/2005/11/Assembler#>  .
>>>>>> @prefix text:<http://jena.apache.org/text#>  .
>>>>>> @prefix skos:<http://www.w3.org/2004/02/skos/core#> .
>>>>>> @prefix fuseki:<http://jena.apache.org/fuseki#>  .
>>>>>> @prefix vcard:<http://www.w3.org/2006/vcard/ns#> .
>>>>>>
>>>>>> ## Example of a TDB dataset and text index
>>>>>> ## Initialize TDB
>>>>>> [] ja:loadClass "org.apache.jena.tdb.TDB" .
>>>>>> tdb:DatasetTDB  rdfs:subClassOf  ja:RDFDataset .
>>>>>> tdb:GraphTDB    rdfs:subClassOf  ja:Model .
>>>>>>
>>>>>> ## Initialize text query
>>>>>> [] ja:loadClass "org.apache.jena.query.text.TextQuery" .
>>>>>> # A TextDataset is a regular dataset with a text index.
>>>>>> text:TextDataset      rdfs:subClassOf   ja:RDFDataset .
>>>>>> # Lucene index
>>>>>> text:TextIndexLucene  rdfs:subClassOf   text:TextIndex .
>>>>>>
>>>>>>
>>>>>> ## ---------------------------------------------------------------
>>>>>> # build: java -cp ./fuseki-server.jar jena.textindexer 
>>>>>> --desc=fuseki_config.ttl
>>>>>>
>>>>>> :text_dataset rdf:type     text:TextDataset ;
>>>>>>       text:dataset   :my_dataset ;
>>>>>>       text:index     <#indexLucene> ;
>>>>>>       .
>>>>>>
>>>>>> # A TDB dataset used for RDF storage
>>>>>> :my_dataset rdf:type      tdb:DatasetTDB ;
>>>>>>       tdb:location "/home/text/tools/jena_data/" ;
>>>>>> #    tdb:unionDefaultGraph true ; # Optional
>>>>>>       .
>>>>>>
>>>>>> # Text index description
>>>>>> <#indexLucene> a text:TextIndexLucene ;
>>>>>>       text:directory <file:/home/text/tools/jena_text_index/> ;
>>>>>>       text:entityMap <#entMap> ;
>>>>>>       text:storeValues true ;
>>>>>>       text:analyzer [ a text:StandardAnalyzer ] ;
>>>>>>       text:queryAnalyzer [ a text:KeywordAnalyzer ] ;
>>>>>>       text:queryParser text:AnalyzingQueryParser ;
>>>>>>       text:multilingualSupport true ;
>>>>>>    .
>>>>>>
>>>>>> <#entMap> a text:EntityMap ;
>>>>>>       text:defaultField     "vcard_fn" ;
>>>>>>       text:entityField      "uri" ;
>>>>>>       text:uidField         "uid" ;
>>>>>>       text:langField        "lang" ;
>>>>>>       text:graphField       "graph" ;
>>>>>>       text:map (
>>>>>>            [ text:field "vcard_fn" ; text:predicate vcard:fn ]
>>>>>>            [ text:field "altLabel"  ; text:predicate skos:altLabel ]
>>>>>>            ) .
>>>>>>
>>>>>> <#service> rdf:type fuseki:Service ;
>>>>>>       fuseki:name                     "/ds" ;   # 
>>>>>> http://host:port/ds-ro
>>>>>>       fuseki:serviceQuery             "query" ;    # SPARQL query 
>>>>>> service
>>>>>>       fuseki:serviceQuery             "sparql" ;   # SPARQL query 
>>>>>> service
>>>>>>       fuseki:serviceUpdate            "update" ;   # SPARQL update 
>>>>>> service
>>>>>>       fuseki:serviceUpload            "upload" ;   # Non-SPARQL 
>>>>>> upload service
>>>>>>       fuseki:serviceReadWriteGraphStore "data" ;     # SPARQL 
>>>>>> Graph store protocol (read and write)
>>>>>>       fuseki:dataset           :text_dataset ;
>>>>>>       .
>>>>>>
>>>>
>>
> 

Re: java.lang.StringIndexOutOfBoundsException with Jena Text

Posted by Mikael Pesonen <mi...@lingsoft.fi>.
I figured out the regexp. It seems that we have external data with non-ASCII 
URLs that can't be altered. Is there any workaround, for example 
adding a text index to selected graphs only?

On 30.9.2020 13:57, Mikael Pesonen wrote:
>
> Ah, thanks. Is it possible to find such URis with SPARQL query? SPARQL 
> seems not to support \x -notation
>
> select * where
> {
>  graph ?g {
>     ?s ?p ?o filter(regex(str(?s), "[\x00-\x7F]"))
>   }
> }
>
>
>
> On 30.9.2020 13:29, Andy Seaborne wrote:
>> In the data (probably in a URI) - it's reading the database.
>>
>> On 30/09/2020 10:36, Mikael Pesonen wrote:
>>>
>>> I couldn't find any non Ascii characters in the config file 
>>> ([^\x00-\x7F]+)...
>>>
>>> On 30.9.2020 0:48, Andy Seaborne wrote:
>>>> Looks like
>>>>
>>>> https://issues.apache.org/jira/browse/JENA-1892 , 1890
>>>>
>>>>     Andy
>>>>
>>>> On 29/09/2020 15:13, Mikael Pesonen wrote:
>>>>>
>>>>> Hi
>>>>>
>>>>> I'm building a new text index with following command and getting 
>>>>> java error.
>>>>>
>>>>> /usr/bin/java -cp ./fuseki-server.jar jena.textindexer 
>>>>> --desc=fuseki_config.ttl
>>>>>
>>>>> After the command I get 4 files in /home/text/tools/jena_text_index/
>>>>>
>>>>> _0.fdt
>>>>> _0.fdx
>>>>> segments_1
>>>>> write.lock
>>>>>
>>>>> Any idea what could cause this?
>>>>>
>>>>>
>>>>> Error is:
>>>>>
>>>>> java.lang.StringIndexOutOfBoundsException: String index out of 
>>>>> range: 59
>>>>>          at 
>>>>> java.base/java.lang.StringLatin1.charAt(StringLatin1.java:48)
>>>>>          at java.base/java.lang.String.charAt(String.java:711)
>>>>>          at 
>>>>> org.apache.jena.atlas.lib.StrUtils.decodeHex(StrUtils.java:212)
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodecSSE.decode(NodecSSE.java:121) 
>>>>>
>>>>>          at org.apache.jena.tdb.lib.NodeLib.decode(NodeLib.java:120)
>>>>>          at 
>>>>> org.apache.jena.tdb.lib.NodeLib.fetchDecode(NodeLib.java:97)
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.readNodeFromTable(NodeTableNative.java:182) 
>>>>>
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative._retrieveNodeByNodeId(NodeTableNative.java:108) 
>>>>>
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.getNodeForNodeId(NodeTableNative.java:67) 
>>>>>
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache._retrieveNodeByNodeId(NodeTableCache.java:128) 
>>>>>
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache.getNodeForNodeId(NodeTableCache.java:82) 
>>>>>
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableWrapper.getNodeForNodeId(NodeTableWrapper.java:50) 
>>>>>
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableInline.getNodeForNodeId(NodeTableInline.java:67) 
>>>>>
>>>>>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:126)
>>>>>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:120)
>>>>>          at 
>>>>> org.apache.jena.tdb.lib.TupleLib.lambda$convertToQuads$3(TupleLib.java:59) 
>>>>>
>>>>>          at org.apache.jena.atlas.iterator.Iter$2.next(Iter.java:352)
>>>>>          at 
>>>>> org.apache.jena.atlas.iterator.IteratorCons.next(IteratorCons.java:104) 
>>>>>
>>>>>          at jena.textindexer.exec(textindexer.java:130)
>>>>>          at jena.cmd.CmdMain.mainMethod(CmdMain.java:93)
>>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:58)
>>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:45)
>>>>>          at jena.textindexer.main(textindexer.java:52)
>>>>> mikael@insight-dev:/home/text/tools/apache-jena-fuseki-3.14.0$ 
>>>>> /usr/bin/java -cp ./fuseki-server.jar jena.textindexer 
>>>>> --desc=fuseki_config.ttl
>>>>> java.lang.StringIndexOutOfBoundsException: String index out of 
>>>>> range: 59
>>>>>          at 
>>>>> java.base/java.lang.StringLatin1.charAt(StringLatin1.java:48)
>>>>>          at java.base/java.lang.String.charAt(String.java:711)
>>>>>          at 
>>>>> org.apache.jena.atlas.lib.StrUtils.decodeHex(StrUtils.java:212)
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodecSSE.decode(NodecSSE.java:121) 
>>>>>
>>>>>          at org.apache.jena.tdb.lib.NodeLib.decode(NodeLib.java:120)
>>>>>          at 
>>>>> org.apache.jena.tdb.lib.NodeLib.fetchDecode(NodeLib.java:97)
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.readNodeFromTable(NodeTableNative.java:182) 
>>>>>
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative._retrieveNodeByNodeId(NodeTableNative.java:108) 
>>>>>
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.getNodeForNodeId(NodeTableNative.java:67) 
>>>>>
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache._retrieveNodeByNodeId(NodeTableCache.java:128) 
>>>>>
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache.getNodeForNodeId(NodeTableCache.java:82) 
>>>>>
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableWrapper.getNodeForNodeId(NodeTableWrapper.java:50) 
>>>>>
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableInline.getNodeForNodeId(NodeTableInline.java:67) 
>>>>>
>>>>>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:126)
>>>>>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:120)
>>>>>          at 
>>>>> org.apache.jena.tdb.lib.TupleLib.lambda$convertToQuads$3(TupleLib.java:59) 
>>>>>
>>>>>          at org.apache.jena.atlas.iterator.Iter$2.next(Iter.java:352)
>>>>>          at 
>>>>> org.apache.jena.atlas.iterator.IteratorCons.next(IteratorCons.java:104) 
>>>>>
>>>>>          at jena.textindexer.exec(textindexer.java:130)
>>>>>          at jena.cmd.CmdMain.mainMethod(CmdMain.java:93)
>>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:58)
>>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:45)
>>>>>          at jena.textindexer.main(textindexer.java:52)
>>>>>
>>>>>
>>>>> config:
>>>>>
>>>>> @prefix :<http://localhost/jena_example/#>  .
>>>>> @prefix rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
>>>>> @prefix rdfs:<http://www.w3.org/2000/01/rdf-schema#> .
>>>>> @prefix tdb:<http://jena.hpl.hp.com/2008/tdb#>  .
>>>>> @prefix ja:<http://jena.hpl.hp.com/2005/11/Assembler#>  .
>>>>> @prefix text:<http://jena.apache.org/text#>  .
>>>>> @prefix skos:<http://www.w3.org/2004/02/skos/core#> .
>>>>> @prefix fuseki:<http://jena.apache.org/fuseki#>  .
>>>>> @prefix vcard:<http://www.w3.org/2006/vcard/ns#> .
>>>>>
>>>>> ## Example of a TDB dataset and text index
>>>>> ## Initialize TDB
>>>>> [] ja:loadClass "org.apache.jena.tdb.TDB" .
>>>>> tdb:DatasetTDB  rdfs:subClassOf  ja:RDFDataset .
>>>>> tdb:GraphTDB    rdfs:subClassOf  ja:Model .
>>>>>
>>>>> ## Initialize text query
>>>>> [] ja:loadClass "org.apache.jena.query.text.TextQuery" .
>>>>> # A TextDataset is a regular dataset with a text index.
>>>>> text:TextDataset      rdfs:subClassOf   ja:RDFDataset .
>>>>> # Lucene index
>>>>> text:TextIndexLucene  rdfs:subClassOf   text:TextIndex .
>>>>>
>>>>>
>>>>> ## ---------------------------------------------------------------
>>>>> # build: java -cp ./fuseki-server.jar jena.textindexer 
>>>>> --desc=fuseki_config.ttl
>>>>>
>>>>> :text_dataset rdf:type     text:TextDataset ;
>>>>>       text:dataset   :my_dataset ;
>>>>>       text:index     <#indexLucene> ;
>>>>>       .
>>>>>
>>>>> # A TDB dataset used for RDF storage
>>>>> :my_dataset rdf:type      tdb:DatasetTDB ;
>>>>>       tdb:location "/home/text/tools/jena_data/" ;
>>>>> #    tdb:unionDefaultGraph true ; # Optional
>>>>>       .
>>>>>
>>>>> # Text index description
>>>>> <#indexLucene> a text:TextIndexLucene ;
>>>>>       text:directory <file:/home/text/tools/jena_text_index/> ;
>>>>>       text:entityMap <#entMap> ;
>>>>>       text:storeValues true ;
>>>>>       text:analyzer [ a text:StandardAnalyzer ] ;
>>>>>       text:queryAnalyzer [ a text:KeywordAnalyzer ] ;
>>>>>       text:queryParser text:AnalyzingQueryParser ;
>>>>>       text:multilingualSupport true ;
>>>>>    .
>>>>>
>>>>> <#entMap> a text:EntityMap ;
>>>>>       text:defaultField     "vcard_fn" ;
>>>>>       text:entityField      "uri" ;
>>>>>       text:uidField         "uid" ;
>>>>>       text:langField        "lang" ;
>>>>>       text:graphField       "graph" ;
>>>>>       text:map (
>>>>>            [ text:field "vcard_fn" ; text:predicate vcard:fn ]
>>>>>            [ text:field "altLabel"  ; text:predicate skos:altLabel ]
>>>>>            ) .
>>>>>
>>>>> <#service> rdf:type fuseki:Service ;
>>>>>       fuseki:name                     "/ds" ;   # 
>>>>> http://host:port/ds-ro
>>>>>       fuseki:serviceQuery             "query" ;    # SPARQL query 
>>>>> service
>>>>>       fuseki:serviceQuery             "sparql" ;   # SPARQL query 
>>>>> service
>>>>>       fuseki:serviceUpdate            "update" ;   # SPARQL update 
>>>>> service
>>>>>       fuseki:serviceUpload            "upload" ;   # Non-SPARQL 
>>>>> upload service
>>>>>       fuseki:serviceReadWriteGraphStore "data" ;     # SPARQL 
>>>>> Graph store protocol (read and write)
>>>>>       fuseki:dataset           :text_dataset ;
>>>>>       .
>>>>>
>>>
>


Re: java.lang.StringIndexOutOfBoundsException with Jena Text

Posted by Andy Seaborne <an...@apache.org>.
https://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html

\\ not \ because it is inside a string.

And the filter needs to look for characters outside 0-255

select * where
{
     ?s ?p ?o filter( regex(str(?s), "[\\u0100-\\uFFFF]"))
}

     Andy

On 30/09/2020 11:57, Mikael Pesonen wrote:
> 
> Ah, thanks. Is it possible to find such URIs with a SPARQL query? SPARQL 
> seems not to support the \x notation
> 
> select * where
> {
>   graph ?g {
>      ?s ?p ?o filter(regex(str(?s), "[\x00-\x7F]"))
>    }
> }
> 
> 
> 
> On 30.9.2020 13:29, Andy Seaborne wrote:
>> In the data (probably in a URI) - it's reading the database.
>>
>> On 30/09/2020 10:36, Mikael Pesonen wrote:
>>>
>>> I couldn't find any non Ascii characters in the config file 
>>> ([^\x00-\x7F]+)...
>>>
>>> On 30.9.2020 0:48, Andy Seaborne wrote:
>>>> Looks like
>>>>
>>>> https://issues.apache.org/jira/browse/JENA-1892 , 1890
>>>>
>>>>     Andy
>>>>
>>>> On 29/09/2020 15:13, Mikael Pesonen wrote:
>>>>>
>>>>> Hi
>>>>>
>>>>> I'm building a new text index with following command and getting 
>>>>> java error.
>>>>>
>>>>> /usr/bin/java -cp ./fuseki-server.jar jena.textindexer 
>>>>> --desc=fuseki_config.ttl
>>>>>
>>>>> After the command I get 4 files in /home/text/tools/jena_text_index/
>>>>>
>>>>> _0.fdt
>>>>> _0.fdx
>>>>> segments_1
>>>>> write.lock
>>>>>
>>>>> Any idea what could cause this?
>>>>>
>>>>>
>>>>> Error is:
>>>>>
>>>>> java.lang.StringIndexOutOfBoundsException: String index out of 
>>>>> range: 59
>>>>>          at 
>>>>> java.base/java.lang.StringLatin1.charAt(StringLatin1.java:48)
>>>>>          at java.base/java.lang.String.charAt(String.java:711)
>>>>>          at 
>>>>> org.apache.jena.atlas.lib.StrUtils.decodeHex(StrUtils.java:212)
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodecSSE.decode(NodecSSE.java:121)
>>>>>          at org.apache.jena.tdb.lib.NodeLib.decode(NodeLib.java:120)
>>>>>          at 
>>>>> org.apache.jena.tdb.lib.NodeLib.fetchDecode(NodeLib.java:97)
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.readNodeFromTable(NodeTableNative.java:182) 
>>>>>
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative._retrieveNodeByNodeId(NodeTableNative.java:108) 
>>>>>
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.getNodeForNodeId(NodeTableNative.java:67) 
>>>>>
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache._retrieveNodeByNodeId(NodeTableCache.java:128) 
>>>>>
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache.getNodeForNodeId(NodeTableCache.java:82) 
>>>>>
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableWrapper.getNodeForNodeId(NodeTableWrapper.java:50) 
>>>>>
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableInline.getNodeForNodeId(NodeTableInline.java:67) 
>>>>>
>>>>>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:126)
>>>>>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:120)
>>>>>          at 
>>>>> org.apache.jena.tdb.lib.TupleLib.lambda$convertToQuads$3(TupleLib.java:59) 
>>>>>
>>>>>          at org.apache.jena.atlas.iterator.Iter$2.next(Iter.java:352)
>>>>>          at 
>>>>> org.apache.jena.atlas.iterator.IteratorCons.next(IteratorCons.java:104) 
>>>>>
>>>>>          at jena.textindexer.exec(textindexer.java:130)
>>>>>          at jena.cmd.CmdMain.mainMethod(CmdMain.java:93)
>>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:58)
>>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:45)
>>>>>          at jena.textindexer.main(textindexer.java:52)
>>>>> mikael@insight-dev:/home/text/tools/apache-jena-fuseki-3.14.0$ 
>>>>> /usr/bin/java -cp ./fuseki-server.jar jena.textindexer 
>>>>> --desc=fuseki_config.ttl
>>>>> java.lang.StringIndexOutOfBoundsException: String index out of 
>>>>> range: 59
>>>>>          at 
>>>>> java.base/java.lang.StringLatin1.charAt(StringLatin1.java:48)
>>>>>          at java.base/java.lang.String.charAt(String.java:711)
>>>>>          at 
>>>>> org.apache.jena.atlas.lib.StrUtils.decodeHex(StrUtils.java:212)
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodecSSE.decode(NodecSSE.java:121)
>>>>>          at org.apache.jena.tdb.lib.NodeLib.decode(NodeLib.java:120)
>>>>>          at 
>>>>> org.apache.jena.tdb.lib.NodeLib.fetchDecode(NodeLib.java:97)
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.readNodeFromTable(NodeTableNative.java:182) 
>>>>>
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative._retrieveNodeByNodeId(NodeTableNative.java:108) 
>>>>>
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.getNodeForNodeId(NodeTableNative.java:67) 
>>>>>
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache._retrieveNodeByNodeId(NodeTableCache.java:128) 
>>>>>
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache.getNodeForNodeId(NodeTableCache.java:82) 
>>>>>
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableWrapper.getNodeForNodeId(NodeTableWrapper.java:50) 
>>>>>
>>>>>          at 
>>>>> org.apache.jena.tdb.store.nodetable.NodeTableInline.getNodeForNodeId(NodeTableInline.java:67) 
>>>>>
>>>>>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:126)
>>>>>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:120)
>>>>>          at 
>>>>> org.apache.jena.tdb.lib.TupleLib.lambda$convertToQuads$3(TupleLib.java:59) 
>>>>>
>>>>>          at org.apache.jena.atlas.iterator.Iter$2.next(Iter.java:352)
>>>>>          at 
>>>>> org.apache.jena.atlas.iterator.IteratorCons.next(IteratorCons.java:104) 
>>>>>
>>>>>          at jena.textindexer.exec(textindexer.java:130)
>>>>>          at jena.cmd.CmdMain.mainMethod(CmdMain.java:93)
>>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:58)
>>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:45)
>>>>>          at jena.textindexer.main(textindexer.java:52)
>>>>>
>>>>>
>>>>> config:
>>>>>
>>>>> @prefix :<http://localhost/jena_example/#>  .
>>>>> @prefix rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
>>>>> @prefix rdfs:<http://www.w3.org/2000/01/rdf-schema#> .
>>>>> @prefix tdb:<http://jena.hpl.hp.com/2008/tdb#>  .
>>>>> @prefix ja:<http://jena.hpl.hp.com/2005/11/Assembler#>  .
>>>>> @prefix text:<http://jena.apache.org/text#>  .
>>>>> @prefix skos:<http://www.w3.org/2004/02/skos/core#> .
>>>>> @prefix fuseki:<http://jena.apache.org/fuseki#>  .
>>>>> @prefix vcard:<http://www.w3.org/2006/vcard/ns#> .
>>>>>
>>>>> ## Example of a TDB dataset and text index
>>>>> ## Initialize TDB
>>>>> [] ja:loadClass "org.apache.jena.tdb.TDB" .
>>>>> tdb:DatasetTDB  rdfs:subClassOf  ja:RDFDataset .
>>>>> tdb:GraphTDB    rdfs:subClassOf  ja:Model .
>>>>>
>>>>> ## Initialize text query
>>>>> [] ja:loadClass       "org.apache.jena.query.text.TextQuery" .
>>>>> # A TextDataset is a regular dataset with a text index.
>>>>> text:TextDataset      rdfs:subClassOf   ja:RDFDataset .
>>>>> # Lucene index
>>>>> text:TextIndexLucene  rdfs:subClassOf   text:TextIndex .
>>>>>
>>>>>
>>>>> ## ---------------------------------------------------------------
>>>>> # build: java -cp ./fuseki-server.jar jena.textindexer 
>>>>> --desc=fuseki_config.ttl
>>>>>
>>>>> :text_dataset rdf:type     text:TextDataset ;
>>>>>       text:dataset   :my_dataset ;
>>>>>       text:index     <#indexLucene> ;
>>>>>       .
>>>>>
>>>>> # A TDB dataset used for RDF storage
>>>>> :my_dataset rdf:type      tdb:DatasetTDB ;
>>>>>       tdb:location "/home/text/tools/jena_data/" ;
>>>>> #    tdb:unionDefaultGraph true ; # Optional
>>>>>       .
>>>>>
>>>>> # Text index description
>>>>> <#indexLucene> a text:TextIndexLucene ;
>>>>>       text:directory <file:/home/text/tools/jena_text_index/> ;
>>>>>       text:entityMap <#entMap> ;
>>>>>       text:storeValues true ;
>>>>>       text:analyzer [ a text:StandardAnalyzer ] ;
>>>>>       text:queryAnalyzer [ a text:KeywordAnalyzer ] ;
>>>>>       text:queryParser text:AnalyzingQueryParser ;
>>>>>       text:multilingualSupport true ;
>>>>>    .
>>>>>
>>>>> <#entMap> a text:EntityMap ;
>>>>>       text:defaultField     "vcard_fn" ;
>>>>>       text:entityField      "uri" ;
>>>>>       text:uidField         "uid" ;
>>>>>       text:langField        "lang" ;
>>>>>       text:graphField       "graph" ;
>>>>>       text:map (
>>>>>            [ text:field "vcard_fn" ; text:predicate vcard:fn ]
>>>>>            [ text:field "altLabel"  ; text:predicate skos:altLabel ]
>>>>>            ) .
>>>>>
>>>>> <#service> rdf:type fuseki:Service ;
>>>>>       fuseki:name                     "/ds" ;   # 
>>>>> http://host:port/ds-ro
>>>>>       fuseki:serviceQuery             "query" ;    # SPARQL query 
>>>>> service
>>>>>       fuseki:serviceQuery             "sparql" ;   # SPARQL query 
>>>>> service
>>>>>       fuseki:serviceUpdate            "update" ;   # SPARQL update 
>>>>> service
>>>>>       fuseki:serviceUpload            "upload" ;   # Non-SPARQL 
>>>>> upload service
>>>>>       fuseki:serviceReadWriteGraphStore "data" ;     # SPARQL Graph 
>>>>> store protocol (read and write)
>>>>>       fuseki:dataset           :text_dataset ;
>>>>>       .
>>>>>
>>>
> 

Re: java.lang.StringIndexOutOfBoundsException with Jena Text

Posted by Mikael Pesonen <mi...@lingsoft.fi>.
Ah, thanks. Is it possible to find such URIs with a SPARQL query? SPARQL 
seems not to support the \x notation

select * where
{
  graph ?g {
     ?s ?p ?o filter(regex(str(?s), "[\x00-\x7F]"))
   }
}



On 30.9.2020 13:29, Andy Seaborne wrote:
> In the data (probably in a URI) - it's reading the database.
>
> On 30/09/2020 10:36, Mikael Pesonen wrote:
>>
>> I couldn't find any non Ascii characters in the config file 
>> ([^\x00-\x7F]+)...
>>
>> On 30.9.2020 0:48, Andy Seaborne wrote:
>>> Looks like
>>>
>>> https://issues.apache.org/jira/browse/JENA-1892 , 1890
>>>
>>>     Andy
>>>
>>> On 29/09/2020 15:13, Mikael Pesonen wrote:
>>>>
>>>> Hi
>>>>
>>>> I'm building a new text index with following command and getting 
>>>> java error.
>>>>
>>>> /usr/bin/java -cp ./fuseki-server.jar jena.textindexer 
>>>> --desc=fuseki_config.ttl
>>>>
>>>> After the command I get 4 files in /home/text/tools/jena_text_index/
>>>>
>>>> _0.fdt
>>>> _0.fdx
>>>> segments_1
>>>> write.lock
>>>>
>>>> Any idea what could cause this?
>>>>
>>>>
>>>> Error is:
>>>>
>>>> java.lang.StringIndexOutOfBoundsException: String index out of 
>>>> range: 59
>>>>          at 
>>>> java.base/java.lang.StringLatin1.charAt(StringLatin1.java:48)
>>>>          at java.base/java.lang.String.charAt(String.java:711)
>>>>          at 
>>>> org.apache.jena.atlas.lib.StrUtils.decodeHex(StrUtils.java:212)
>>>>          at 
>>>> org.apache.jena.tdb.store.nodetable.NodecSSE.decode(NodecSSE.java:121)
>>>>          at org.apache.jena.tdb.lib.NodeLib.decode(NodeLib.java:120)
>>>>          at 
>>>> org.apache.jena.tdb.lib.NodeLib.fetchDecode(NodeLib.java:97)
>>>>          at 
>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.readNodeFromTable(NodeTableNative.java:182) 
>>>>
>>>>          at 
>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative._retrieveNodeByNodeId(NodeTableNative.java:108) 
>>>>
>>>>          at 
>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.getNodeForNodeId(NodeTableNative.java:67) 
>>>>
>>>>          at 
>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache._retrieveNodeByNodeId(NodeTableCache.java:128) 
>>>>
>>>>          at 
>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache.getNodeForNodeId(NodeTableCache.java:82) 
>>>>
>>>>          at 
>>>> org.apache.jena.tdb.store.nodetable.NodeTableWrapper.getNodeForNodeId(NodeTableWrapper.java:50) 
>>>>
>>>>          at 
>>>> org.apache.jena.tdb.store.nodetable.NodeTableInline.getNodeForNodeId(NodeTableInline.java:67) 
>>>>
>>>>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:126)
>>>>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:120)
>>>>          at 
>>>> org.apache.jena.tdb.lib.TupleLib.lambda$convertToQuads$3(TupleLib.java:59) 
>>>>
>>>>          at org.apache.jena.atlas.iterator.Iter$2.next(Iter.java:352)
>>>>          at 
>>>> org.apache.jena.atlas.iterator.IteratorCons.next(IteratorCons.java:104) 
>>>>
>>>>          at jena.textindexer.exec(textindexer.java:130)
>>>>          at jena.cmd.CmdMain.mainMethod(CmdMain.java:93)
>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:58)
>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:45)
>>>>          at jena.textindexer.main(textindexer.java:52)
>>>> mikael@insight-dev:/home/text/tools/apache-jena-fuseki-3.14.0$ 
>>>> /usr/bin/java -cp ./fuseki-server.jar jena.textindexer 
>>>> --desc=fuseki_config.ttl
>>>> java.lang.StringIndexOutOfBoundsException: String index out of 
>>>> range: 59
>>>>          at 
>>>> java.base/java.lang.StringLatin1.charAt(StringLatin1.java:48)
>>>>          at java.base/java.lang.String.charAt(String.java:711)
>>>>          at 
>>>> org.apache.jena.atlas.lib.StrUtils.decodeHex(StrUtils.java:212)
>>>>          at 
>>>> org.apache.jena.tdb.store.nodetable.NodecSSE.decode(NodecSSE.java:121)
>>>>          at org.apache.jena.tdb.lib.NodeLib.decode(NodeLib.java:120)
>>>>          at 
>>>> org.apache.jena.tdb.lib.NodeLib.fetchDecode(NodeLib.java:97)
>>>>          at 
>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.readNodeFromTable(NodeTableNative.java:182) 
>>>>
>>>>          at 
>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative._retrieveNodeByNodeId(NodeTableNative.java:108) 
>>>>
>>>>          at 
>>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.getNodeForNodeId(NodeTableNative.java:67) 
>>>>
>>>>          at 
>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache._retrieveNodeByNodeId(NodeTableCache.java:128) 
>>>>
>>>>          at 
>>>> org.apache.jena.tdb.store.nodetable.NodeTableCache.getNodeForNodeId(NodeTableCache.java:82) 
>>>>
>>>>          at 
>>>> org.apache.jena.tdb.store.nodetable.NodeTableWrapper.getNodeForNodeId(NodeTableWrapper.java:50) 
>>>>
>>>>          at 
>>>> org.apache.jena.tdb.store.nodetable.NodeTableInline.getNodeForNodeId(NodeTableInline.java:67) 
>>>>
>>>>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:126)
>>>>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:120)
>>>>          at 
>>>> org.apache.jena.tdb.lib.TupleLib.lambda$convertToQuads$3(TupleLib.java:59) 
>>>>
>>>>          at org.apache.jena.atlas.iterator.Iter$2.next(Iter.java:352)
>>>>          at 
>>>> org.apache.jena.atlas.iterator.IteratorCons.next(IteratorCons.java:104) 
>>>>
>>>>          at jena.textindexer.exec(textindexer.java:130)
>>>>          at jena.cmd.CmdMain.mainMethod(CmdMain.java:93)
>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:58)
>>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:45)
>>>>          at jena.textindexer.main(textindexer.java:52)
>>>>
>>>>
>>>> config:
>>>>
>>>> @prefix :<http://localhost/jena_example/#>  .
>>>> @prefix rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
>>>> @prefix rdfs:<http://www.w3.org/2000/01/rdf-schema#> .
>>>> @prefix tdb:<http://jena.hpl.hp.com/2008/tdb#>  .
>>>> @prefix ja:<http://jena.hpl.hp.com/2005/11/Assembler#>  .
>>>> @prefix text:<http://jena.apache.org/text#>  .
>>>> @prefix skos:<http://www.w3.org/2004/02/skos/core#> .
>>>> @prefix fuseki:<http://jena.apache.org/fuseki#>  .
>>>> @prefix vcard:<http://www.w3.org/2006/vcard/ns#> .
>>>>
>>>> ## Example of a TDB dataset and text index
>>>> ## Initialize TDB
>>>> [] ja:loadClass "org.apache.jena.tdb.TDB" .
>>>> tdb:DatasetTDB  rdfs:subClassOf  ja:RDFDataset .
>>>> tdb:GraphTDB    rdfs:subClassOf  ja:Model .
>>>>
>>>> ## Initialize text query
>>>> [] ja:loadClass       "org.apache.jena.query.text.TextQuery" .
>>>> # A TextDataset is a regular dataset with a text index.
>>>> text:TextDataset      rdfs:subClassOf   ja:RDFDataset .
>>>> # Lucene index
>>>> text:TextIndexLucene  rdfs:subClassOf   text:TextIndex .
>>>>
>>>>
>>>> ## ---------------------------------------------------------------
>>>> # build: java -cp ./fuseki-server.jar jena.textindexer 
>>>> --desc=fuseki_config.ttl
>>>>
>>>> :text_dataset rdf:type     text:TextDataset ;
>>>>       text:dataset   :my_dataset ;
>>>>       text:index     <#indexLucene> ;
>>>>       .
>>>>
>>>> # A TDB dataset used for RDF storage
>>>> :my_dataset rdf:type      tdb:DatasetTDB ;
>>>>       tdb:location "/home/text/tools/jena_data/" ;
>>>> #    tdb:unionDefaultGraph true ; # Optional
>>>>       .
>>>>
>>>> # Text index description
>>>> <#indexLucene> a text:TextIndexLucene ;
>>>>       text:directory <file:/home/text/tools/jena_text_index/> ;
>>>>       text:entityMap <#entMap> ;
>>>>       text:storeValues true ;
>>>>       text:analyzer [ a text:StandardAnalyzer ] ;
>>>>       text:queryAnalyzer [ a text:KeywordAnalyzer ] ;
>>>>       text:queryParser text:AnalyzingQueryParser ;
>>>>       text:multilingualSupport true ;
>>>>    .
>>>>
>>>> <#entMap> a text:EntityMap ;
>>>>       text:defaultField     "vcard_fn" ;
>>>>       text:entityField      "uri" ;
>>>>       text:uidField         "uid" ;
>>>>       text:langField        "lang" ;
>>>>       text:graphField       "graph" ;
>>>>       text:map (
>>>>            [ text:field "vcard_fn" ; text:predicate vcard:fn ]
>>>>            [ text:field "altLabel"  ; text:predicate skos:altLabel ]
>>>>            ) .
>>>>
>>>> <#service> rdf:type fuseki:Service ;
>>>>       fuseki:name                     "/ds" ;   # 
>>>> http://host:port/ds-ro
>>>>       fuseki:serviceQuery             "query" ;    # SPARQL query 
>>>> service
>>>>       fuseki:serviceQuery             "sparql" ;   # SPARQL query 
>>>> service
>>>>       fuseki:serviceUpdate            "update" ;   # SPARQL update 
>>>> service
>>>>       fuseki:serviceUpload            "upload" ;   # Non-SPARQL 
>>>> upload service
>>>>       fuseki:serviceReadWriteGraphStore "data" ;     # SPARQL Graph 
>>>> store protocol (read and write)
>>>>       fuseki:dataset           :text_dataset ;
>>>>       .
>>>>
>>


Re: java.lang.StringIndexOutOfBoundsException with Jena Text

Posted by Andy Seaborne <an...@apache.org>.
In the data (probably in a URI) - it's reading the database.

On 30/09/2020 10:36, Mikael Pesonen wrote:
> 
> I couldn't find any non Ascii characters in the config file 
> ([^\x00-\x7F]+)...
> 
> On 30.9.2020 0:48, Andy Seaborne wrote:
>> Looks like
>>
>> https://issues.apache.org/jira/browse/JENA-1892 , 1890
>>
>>     Andy
>>
>> On 29/09/2020 15:13, Mikael Pesonen wrote:
>>>
>>> Hi
>>>
>>> I'm building a new text index with following command and getting java 
>>> error.
>>>
>>> /usr/bin/java -cp ./fuseki-server.jar jena.textindexer 
>>> --desc=fuseki_config.ttl
>>>
>>> After the command I get 4 files in /home/text/tools/jena_text_index/
>>>
>>> _0.fdt
>>> _0.fdx
>>> segments_1
>>> write.lock
>>>
>>> Any idea what could cause this?
>>>
>>>
>>> Error is:
>>>
>>> java.lang.StringIndexOutOfBoundsException: String index out of range: 59
>>>          at 
>>> java.base/java.lang.StringLatin1.charAt(StringLatin1.java:48)
>>>          at java.base/java.lang.String.charAt(String.java:711)
>>>          at 
>>> org.apache.jena.atlas.lib.StrUtils.decodeHex(StrUtils.java:212)
>>>          at 
>>> org.apache.jena.tdb.store.nodetable.NodecSSE.decode(NodecSSE.java:121)
>>>          at org.apache.jena.tdb.lib.NodeLib.decode(NodeLib.java:120)
>>>          at org.apache.jena.tdb.lib.NodeLib.fetchDecode(NodeLib.java:97)
>>>          at 
>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.readNodeFromTable(NodeTableNative.java:182) 
>>>
>>>          at 
>>> org.apache.jena.tdb.store.nodetable.NodeTableNative._retrieveNodeByNodeId(NodeTableNative.java:108) 
>>>
>>>          at 
>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.getNodeForNodeId(NodeTableNative.java:67) 
>>>
>>>          at 
>>> org.apache.jena.tdb.store.nodetable.NodeTableCache._retrieveNodeByNodeId(NodeTableCache.java:128) 
>>>
>>>          at 
>>> org.apache.jena.tdb.store.nodetable.NodeTableCache.getNodeForNodeId(NodeTableCache.java:82) 
>>>
>>>          at 
>>> org.apache.jena.tdb.store.nodetable.NodeTableWrapper.getNodeForNodeId(NodeTableWrapper.java:50) 
>>>
>>>          at 
>>> org.apache.jena.tdb.store.nodetable.NodeTableInline.getNodeForNodeId(NodeTableInline.java:67) 
>>>
>>>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:126)
>>>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:120)
>>>          at 
>>> org.apache.jena.tdb.lib.TupleLib.lambda$convertToQuads$3(TupleLib.java:59) 
>>>
>>>          at org.apache.jena.atlas.iterator.Iter$2.next(Iter.java:352)
>>>          at 
>>> org.apache.jena.atlas.iterator.IteratorCons.next(IteratorCons.java:104)
>>>          at jena.textindexer.exec(textindexer.java:130)
>>>          at jena.cmd.CmdMain.mainMethod(CmdMain.java:93)
>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:58)
>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:45)
>>>          at jena.textindexer.main(textindexer.java:52)
>>> mikael@insight-dev:/home/text/tools/apache-jena-fuseki-3.14.0$ 
>>> /usr/bin/java -cp ./fuseki-server.jar jena.textindexer 
>>> --desc=fuseki_config.ttl
>>> java.lang.StringIndexOutOfBoundsException: String index out of range: 59
>>>          at 
>>> java.base/java.lang.StringLatin1.charAt(StringLatin1.java:48)
>>>          at java.base/java.lang.String.charAt(String.java:711)
>>>          at 
>>> org.apache.jena.atlas.lib.StrUtils.decodeHex(StrUtils.java:212)
>>>          at 
>>> org.apache.jena.tdb.store.nodetable.NodecSSE.decode(NodecSSE.java:121)
>>>          at org.apache.jena.tdb.lib.NodeLib.decode(NodeLib.java:120)
>>>          at org.apache.jena.tdb.lib.NodeLib.fetchDecode(NodeLib.java:97)
>>>          at 
>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.readNodeFromTable(NodeTableNative.java:182) 
>>>
>>>          at 
>>> org.apache.jena.tdb.store.nodetable.NodeTableNative._retrieveNodeByNodeId(NodeTableNative.java:108) 
>>>
>>>          at 
>>> org.apache.jena.tdb.store.nodetable.NodeTableNative.getNodeForNodeId(NodeTableNative.java:67) 
>>>
>>>          at 
>>> org.apache.jena.tdb.store.nodetable.NodeTableCache._retrieveNodeByNodeId(NodeTableCache.java:128) 
>>>
>>>          at 
>>> org.apache.jena.tdb.store.nodetable.NodeTableCache.getNodeForNodeId(NodeTableCache.java:82) 
>>>
>>>          at 
>>> org.apache.jena.tdb.store.nodetable.NodeTableWrapper.getNodeForNodeId(NodeTableWrapper.java:50) 
>>>
>>>          at 
>>> org.apache.jena.tdb.store.nodetable.NodeTableInline.getNodeForNodeId(NodeTableInline.java:67) 
>>>
>>>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:126)
>>>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:120)
>>>          at 
>>> org.apache.jena.tdb.lib.TupleLib.lambda$convertToQuads$3(TupleLib.java:59) 
>>>
>>>          at org.apache.jena.atlas.iterator.Iter$2.next(Iter.java:352)
>>>          at 
>>> org.apache.jena.atlas.iterator.IteratorCons.next(IteratorCons.java:104)
>>>          at jena.textindexer.exec(textindexer.java:130)
>>>          at jena.cmd.CmdMain.mainMethod(CmdMain.java:93)
>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:58)
>>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:45)
>>>          at jena.textindexer.main(textindexer.java:52)
>>>
>>>
>>> config:
>>>
>>> @prefix :<http://localhost/jena_example/#>  .
>>> @prefix rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
>>> @prefix rdfs:<http://www.w3.org/2000/01/rdf-schema#>  .
>>> @prefix tdb:<http://jena.hpl.hp.com/2008/tdb#>  .
>>> @prefix ja:<http://jena.hpl.hp.com/2005/11/Assembler#>  .
>>> @prefix text:<http://jena.apache.org/text#>  .
>>> @prefix skos:<http://www.w3.org/2004/02/skos/core#> .
>>> @prefix fuseki:<http://jena.apache.org/fuseki#>  .
>>> @prefix vcard:<http://www.w3.org/2006/vcard/ns#> .
>>>
>>> ## Example of a TDB dataset and text index
>>> ## Initialize TDB
>>> [] ja:loadClass "org.apache.jena.tdb.TDB" .
>>> tdb:DatasetTDB  rdfs:subClassOf  ja:RDFDataset .
>>> tdb:GraphTDB    rdfs:subClassOf  ja:Model .
>>>
>>> ## Initialize text query
>>> [] ja:loadClass       "org.apache.jena.query.text.TextQuery" .
>>> # A TextDataset is a regular dataset with a text index.
>>> text:TextDataset      rdfs:subClassOf   ja:RDFDataset .
>>> # Lucene index
>>> text:TextIndexLucene  rdfs:subClassOf   text:TextIndex .
>>>
>>>
>>> ## ---------------------------------------------------------------
>>> # build: java -cp ./fuseki-server.jar jena.textindexer 
>>> --desc=fuseki_config.ttl
>>>
>>> :text_dataset rdf:type     text:TextDataset ;
>>>       text:dataset   :my_dataset ;
>>>       text:index     <#indexLucene> ;
>>>       .
>>>
>>> # A TDB dataset used for RDF storage
>>> :my_dataset rdf:type      tdb:DatasetTDB ;
>>>       tdb:location "/home/text/tools/jena_data/" ;
>>> #    tdb:unionDefaultGraph true ; # Optional
>>>       .
>>>
>>> # Text index description
>>> <#indexLucene> a text:TextIndexLucene ;
>>>       text:directory <file:/home/text/tools/jena_text_index/> ;
>>>       text:entityMap <#entMap> ;
>>>       text:storeValues true ;
>>>       text:analyzer [ a text:StandardAnalyzer ] ;
>>>       text:queryAnalyzer [ a text:KeywordAnalyzer ] ;
>>>       text:queryParser text:AnalyzingQueryParser ;
>>>       text:multilingualSupport true ;
>>>    .
>>>
>>> <#entMap> a text:EntityMap ;
>>>       text:defaultField     "vcard_fn" ;
>>>       text:entityField      "uri" ;
>>>       text:uidField         "uid" ;
>>>       text:langField        "lang" ;
>>>       text:graphField       "graph" ;
>>>       text:map (
>>>            [ text:field "vcard_fn" ; text:predicate vcard:fn ]
>>>            [ text:field "altLabel"  ; text:predicate skos:altLabel ]
>>>            ) .
>>>
>>> <#service> rdf:type fuseki:Service ;
>>>       fuseki:name                     "/ds" ;   # http://host:port/ds-ro
>>>       fuseki:serviceQuery             "query" ;    # SPARQL query 
>>> service
>>>       fuseki:serviceQuery             "sparql" ;   # SPARQL query 
>>> service
>>>       fuseki:serviceUpdate            "update" ;   # SPARQL update 
>>> service
>>>       fuseki:serviceUpload            "upload" ;   # Non-SPARQL 
>>> upload service
>>>       fuseki:serviceReadWriteGraphStore "data" ;     # SPARQL Graph 
>>> store protocol (read and write)
>>>       fuseki:dataset           :text_dataset ;
>>>       .
>>>
> 

Re: java.lang.StringIndexOutOfBoundsException with Jena Text

Posted by Mikael Pesonen <mi...@lingsoft.fi>.
I couldn't find any non-ASCII characters in the config file 
([^\x00-\x7F]+)...

On 30.9.2020 0:48, Andy Seaborne wrote:
> Looks like
>
> https://issues.apache.org/jira/browse/JENA-1892 , 1890
>
>     Andy
>
> On 29/09/2020 15:13, Mikael Pesonen wrote:
>>
>> Hi
>>
>> I'm building a new text index with following command and getting java 
>> error.
>>
>> /usr/bin/java -cp ./fuseki-server.jar jena.textindexer 
>> --desc=fuseki_config.ttl
>>
>> After the command I get 4 files in /home/text/tools/jena_text_index/
>>
>> _0.fdt
>> _0.fdx
>> segments_1
>> write.lock
>>
>> Any idea what could cause this?
>>
>>
>> Error is:
>>
>> java.lang.StringIndexOutOfBoundsException: String index out of range: 59
>>          at 
>> java.base/java.lang.StringLatin1.charAt(StringLatin1.java:48)
>>          at java.base/java.lang.String.charAt(String.java:711)
>>          at 
>> org.apache.jena.atlas.lib.StrUtils.decodeHex(StrUtils.java:212)
>>          at 
>> org.apache.jena.tdb.store.nodetable.NodecSSE.decode(NodecSSE.java:121)
>>          at org.apache.jena.tdb.lib.NodeLib.decode(NodeLib.java:120)
>>          at org.apache.jena.tdb.lib.NodeLib.fetchDecode(NodeLib.java:97)
>>          at 
>> org.apache.jena.tdb.store.nodetable.NodeTableNative.readNodeFromTable(NodeTableNative.java:182) 
>>
>>          at 
>> org.apache.jena.tdb.store.nodetable.NodeTableNative._retrieveNodeByNodeId(NodeTableNative.java:108) 
>>
>>          at 
>> org.apache.jena.tdb.store.nodetable.NodeTableNative.getNodeForNodeId(NodeTableNative.java:67) 
>>
>>          at 
>> org.apache.jena.tdb.store.nodetable.NodeTableCache._retrieveNodeByNodeId(NodeTableCache.java:128) 
>>
>>          at 
>> org.apache.jena.tdb.store.nodetable.NodeTableCache.getNodeForNodeId(NodeTableCache.java:82) 
>>
>>          at 
>> org.apache.jena.tdb.store.nodetable.NodeTableWrapper.getNodeForNodeId(NodeTableWrapper.java:50) 
>>
>>          at 
>> org.apache.jena.tdb.store.nodetable.NodeTableInline.getNodeForNodeId(NodeTableInline.java:67) 
>>
>>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:126)
>>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:120)
>>          at 
>> org.apache.jena.tdb.lib.TupleLib.lambda$convertToQuads$3(TupleLib.java:59)
>>          at org.apache.jena.atlas.iterator.Iter$2.next(Iter.java:352)
>>          at 
>> org.apache.jena.atlas.iterator.IteratorCons.next(IteratorCons.java:104)
>>          at jena.textindexer.exec(textindexer.java:130)
>>          at jena.cmd.CmdMain.mainMethod(CmdMain.java:93)
>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:58)
>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:45)
>>          at jena.textindexer.main(textindexer.java:52)
>> mikael@insight-dev:/home/text/tools/apache-jena-fuseki-3.14.0$ 
>> /usr/bin/java -cp ./fuseki-server.jar jena.textindexer 
>> --desc=fuseki_config.ttl
>> java.lang.StringIndexOutOfBoundsException: String index out of range: 59
>>          at 
>> java.base/java.lang.StringLatin1.charAt(StringLatin1.java:48)
>>          at java.base/java.lang.String.charAt(String.java:711)
>>          at 
>> org.apache.jena.atlas.lib.StrUtils.decodeHex(StrUtils.java:212)
>>          at 
>> org.apache.jena.tdb.store.nodetable.NodecSSE.decode(NodecSSE.java:121)
>>          at org.apache.jena.tdb.lib.NodeLib.decode(NodeLib.java:120)
>>          at org.apache.jena.tdb.lib.NodeLib.fetchDecode(NodeLib.java:97)
>>          at 
>> org.apache.jena.tdb.store.nodetable.NodeTableNative.readNodeFromTable(NodeTableNative.java:182) 
>>
>>          at 
>> org.apache.jena.tdb.store.nodetable.NodeTableNative._retrieveNodeByNodeId(NodeTableNative.java:108) 
>>
>>          at 
>> org.apache.jena.tdb.store.nodetable.NodeTableNative.getNodeForNodeId(NodeTableNative.java:67) 
>>
>>          at 
>> org.apache.jena.tdb.store.nodetable.NodeTableCache._retrieveNodeByNodeId(NodeTableCache.java:128) 
>>
>>          at 
>> org.apache.jena.tdb.store.nodetable.NodeTableCache.getNodeForNodeId(NodeTableCache.java:82) 
>>
>>          at 
>> org.apache.jena.tdb.store.nodetable.NodeTableWrapper.getNodeForNodeId(NodeTableWrapper.java:50) 
>>
>>          at 
>> org.apache.jena.tdb.store.nodetable.NodeTableInline.getNodeForNodeId(NodeTableInline.java:67) 
>>
>>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:126)
>>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:120)
>>          at 
>> org.apache.jena.tdb.lib.TupleLib.lambda$convertToQuads$3(TupleLib.java:59)
>>          at org.apache.jena.atlas.iterator.Iter$2.next(Iter.java:352)
>>          at 
>> org.apache.jena.atlas.iterator.IteratorCons.next(IteratorCons.java:104)
>>          at jena.textindexer.exec(textindexer.java:130)
>>          at jena.cmd.CmdMain.mainMethod(CmdMain.java:93)
>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:58)
>>          at jena.cmd.CmdMain.mainRun(CmdMain.java:45)
>>          at jena.textindexer.main(textindexer.java:52)
>>
>>
>> config:
>>
>> @prefix :<http://localhost/jena_example/#>  .
>> @prefix rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
>> @prefix rdfs:<http://www.w3.org/2000/01/rdf-schema#>  .
>> @prefix tdb:<http://jena.hpl.hp.com/2008/tdb#>  .
>> @prefix ja:<http://jena.hpl.hp.com/2005/11/Assembler#>  .
>> @prefix text:<http://jena.apache.org/text#>  .
>> @prefix skos:<http://www.w3.org/2004/02/skos/core#> .
>> @prefix fuseki:<http://jena.apache.org/fuseki#>  .
>> @prefix vcard:<http://www.w3.org/2006/vcard/ns#> .
>>
>> ## Example of a TDB dataset and text index
>> ## Initialize TDB
>> [] ja:loadClass "org.apache.jena.tdb.TDB" .
>> tdb:DatasetTDB  rdfs:subClassOf  ja:RDFDataset .
>> tdb:GraphTDB    rdfs:subClassOf  ja:Model .
>>
>> ## Initialize text query
>> [] ja:loadClass       "org.apache.jena.query.text.TextQuery" .
>> # A TextDataset is a regular dataset with a text index.
>> text:TextDataset      rdfs:subClassOf   ja:RDFDataset .
>> # Lucene index
>> text:TextIndexLucene  rdfs:subClassOf   text:TextIndex .
>>
>>
>> ## ---------------------------------------------------------------
>> # build: java -cp ./fuseki-server.jar jena.textindexer --desc=fuseki_config.ttl
>>
>> :text_dataset rdf:type     text:TextDataset ;
>>       text:dataset   :my_dataset ;
>>       text:index     <#indexLucene> ;
>>       .
>>
>> # A TDB dataset used for RDF storage
>> :my_dataset rdf:type      tdb:DatasetTDB ;
>>       tdb:location "/home/text/tools/jena_data/" ;
>> #    tdb:unionDefaultGraph true ; # Optional
>>       .
>>
>> # Text index description
>> <#indexLucene> a text:TextIndexLucene ;
>>       text:directory <file:/home/text/tools/jena_text_index/> ;
>>       text:entityMap <#entMap> ;
>>       text:storeValues true ;
>>       text:analyzer [ a text:StandardAnalyzer ] ;
>>       text:queryAnalyzer [ a text:KeywordAnalyzer ] ;
>>       text:queryParser text:AnalyzingQueryParser ;
>>       text:multilingualSupport true ;
>>    .
>>
>> <#entMap> a text:EntityMap ;
>>       text:defaultField     "vcard_fn" ;
>>       text:entityField      "uri" ;
>>       text:uidField         "uid" ;
>>       text:langField        "lang" ;
>>       text:graphField       "graph" ;
>>       text:map (
>>            [ text:field "vcard_fn" ; text:predicate vcard:fn ]
>>            [ text:field "altLabel"  ; text:predicate skos:altLabel ]
>>            ) .
>>
>> <#service> rdf:type fuseki:Service ;
>>       fuseki:name                     "/ds" ;   # http://host:port/ds-ro
>>       fuseki:serviceQuery             "query" ;    # SPARQL query service
>>       fuseki:serviceQuery             "sparql" ;   # SPARQL query service
>>       fuseki:serviceUpdate            "update" ;   # SPARQL update service
>>       fuseki:serviceUpload            "upload" ;   # Non-SPARQL upload service
>>       fuseki:serviceReadWriteGraphStore "data" ;     # SPARQL Graph store protocol (read and write)
>>       fuseki:dataset           :text_dataset ;
>>       .
>>


Re: java.lang.StringIndexOutOfBoundsException with Jena Text

Posted by Andy Seaborne <an...@apache.org>.
Looks like these known issues:

https://issues.apache.org/jira/browse/JENA-1892
https://issues.apache.org/jira/browse/JENA-1890

     Andy

On 29/09/2020 15:13, Mikael Pesonen wrote:
> 
> Hi
> 
> I'm building a new text index with the following command and getting a Java 
> error.
> 
> /usr/bin/java -cp ./fuseki-server.jar jena.textindexer 
> --desc=fuseki_config.ttl
> 
> After the command I get 4 files in /home/text/tools/jena_text_index/
> 
> _0.fdt
> _0.fdx
> segments_1
> write.lock
> 
> Any idea what could cause this?
> 
> 
> Error is:
> 
> java.lang.StringIndexOutOfBoundsException: String index out of range: 59
>          at java.base/java.lang.StringLatin1.charAt(StringLatin1.java:48)
>          at java.base/java.lang.String.charAt(String.java:711)
>          at org.apache.jena.atlas.lib.StrUtils.decodeHex(StrUtils.java:212)
>          at 
> org.apache.jena.tdb.store.nodetable.NodecSSE.decode(NodecSSE.java:121)
>          at org.apache.jena.tdb.lib.NodeLib.decode(NodeLib.java:120)
>          at org.apache.jena.tdb.lib.NodeLib.fetchDecode(NodeLib.java:97)
>          at 
> org.apache.jena.tdb.store.nodetable.NodeTableNative.readNodeFromTable(NodeTableNative.java:182) 
> 
>          at 
> org.apache.jena.tdb.store.nodetable.NodeTableNative._retrieveNodeByNodeId(NodeTableNative.java:108) 
> 
>          at 
> org.apache.jena.tdb.store.nodetable.NodeTableNative.getNodeForNodeId(NodeTableNative.java:67) 
> 
>          at 
> org.apache.jena.tdb.store.nodetable.NodeTableCache._retrieveNodeByNodeId(NodeTableCache.java:128) 
> 
>          at 
> org.apache.jena.tdb.store.nodetable.NodeTableCache.getNodeForNodeId(NodeTableCache.java:82) 
> 
>          at 
> org.apache.jena.tdb.store.nodetable.NodeTableWrapper.getNodeForNodeId(NodeTableWrapper.java:50) 
> 
>          at 
> org.apache.jena.tdb.store.nodetable.NodeTableInline.getNodeForNodeId(NodeTableInline.java:67) 
> 
>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:126)
>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:120)
>          at 
> org.apache.jena.tdb.lib.TupleLib.lambda$convertToQuads$3(TupleLib.java:59)
>          at org.apache.jena.atlas.iterator.Iter$2.next(Iter.java:352)
>          at 
> org.apache.jena.atlas.iterator.IteratorCons.next(IteratorCons.java:104)
>          at jena.textindexer.exec(textindexer.java:130)
>          at jena.cmd.CmdMain.mainMethod(CmdMain.java:93)
>          at jena.cmd.CmdMain.mainRun(CmdMain.java:58)
>          at jena.cmd.CmdMain.mainRun(CmdMain.java:45)
>          at jena.textindexer.main(textindexer.java:52)
> mikael@insight-dev:/home/text/tools/apache-jena-fuseki-3.14.0$ 
> /usr/bin/java -cp ./fuseki-server.jar jena.textindexer 
> --desc=fuseki_config.ttl
> java.lang.StringIndexOutOfBoundsException: String index out of range: 59
>          at java.base/java.lang.StringLatin1.charAt(StringLatin1.java:48)
>          at java.base/java.lang.String.charAt(String.java:711)
>          at org.apache.jena.atlas.lib.StrUtils.decodeHex(StrUtils.java:212)
>          at 
> org.apache.jena.tdb.store.nodetable.NodecSSE.decode(NodecSSE.java:121)
>          at org.apache.jena.tdb.lib.NodeLib.decode(NodeLib.java:120)
>          at org.apache.jena.tdb.lib.NodeLib.fetchDecode(NodeLib.java:97)
>          at 
> org.apache.jena.tdb.store.nodetable.NodeTableNative.readNodeFromTable(NodeTableNative.java:182) 
> 
>          at 
> org.apache.jena.tdb.store.nodetable.NodeTableNative._retrieveNodeByNodeId(NodeTableNative.java:108) 
> 
>          at 
> org.apache.jena.tdb.store.nodetable.NodeTableNative.getNodeForNodeId(NodeTableNative.java:67) 
> 
>          at 
> org.apache.jena.tdb.store.nodetable.NodeTableCache._retrieveNodeByNodeId(NodeTableCache.java:128) 
> 
>          at 
> org.apache.jena.tdb.store.nodetable.NodeTableCache.getNodeForNodeId(NodeTableCache.java:82) 
> 
>          at 
> org.apache.jena.tdb.store.nodetable.NodeTableWrapper.getNodeForNodeId(NodeTableWrapper.java:50) 
> 
>          at 
> org.apache.jena.tdb.store.nodetable.NodeTableInline.getNodeForNodeId(NodeTableInline.java:67) 
> 
>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:126)
>          at org.apache.jena.tdb.lib.TupleLib.quad(TupleLib.java:120)
>          at 
> org.apache.jena.tdb.lib.TupleLib.lambda$convertToQuads$3(TupleLib.java:59)
>          at org.apache.jena.atlas.iterator.Iter$2.next(Iter.java:352)
>          at 
> org.apache.jena.atlas.iterator.IteratorCons.next(IteratorCons.java:104)
>          at jena.textindexer.exec(textindexer.java:130)
>          at jena.cmd.CmdMain.mainMethod(CmdMain.java:93)
>          at jena.cmd.CmdMain.mainRun(CmdMain.java:58)
>          at jena.cmd.CmdMain.mainRun(CmdMain.java:45)
>          at jena.textindexer.main(textindexer.java:52)
> 
> 
> config:
> 
> @prefix :<http://localhost/jena_example/#>  .
> @prefix rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>  .
> @prefix rdfs:<http://www.w3.org/2000/01/rdf-schema#>  .
> @prefix tdb:<http://jena.hpl.hp.com/2008/tdb#>  .
> @prefix ja:<http://jena.hpl.hp.com/2005/11/Assembler#>  .
> @prefix text:<http://jena.apache.org/text#>  .
> @prefix skos:<http://www.w3.org/2004/02/skos/core#> .
> @prefix fuseki:<http://jena.apache.org/fuseki#>  .
> @prefix vcard:<http://www.w3.org/2006/vcard/ns#> .
> 
> ## Example of a TDB dataset and text index
> ## Initialize TDB
> [] ja:loadClass "org.apache.jena.tdb.TDB" .
> tdb:DatasetTDB  rdfs:subClassOf  ja:RDFDataset .
> tdb:GraphTDB    rdfs:subClassOf  ja:Model .
> 
> ## Initialize text query
> [] ja:loadClass       "org.apache.jena.query.text.TextQuery" .
> # A TextDataset is a regular dataset with a text index.
> text:TextDataset      rdfs:subClassOf   ja:RDFDataset .
> # Lucene index
> text:TextIndexLucene  rdfs:subClassOf   text:TextIndex .
> 
> 
> ## ---------------------------------------------------------------
> # build: java -cp ./fuseki-server.jar jena.textindexer --desc=fuseki_config.ttl
> 
> :text_dataset rdf:type     text:TextDataset ;
>       text:dataset   :my_dataset ;
>       text:index     <#indexLucene> ;
>       .
> 
> # A TDB dataset used for RDF storage
> :my_dataset rdf:type      tdb:DatasetTDB ;
>       tdb:location "/home/text/tools/jena_data/" ;
> #    tdb:unionDefaultGraph true ; # Optional
>       .
> 
> # Text index description
> <#indexLucene> a text:TextIndexLucene ;
>       text:directory <file:/home/text/tools/jena_text_index/> ;
>       text:entityMap <#entMap> ;
>       text:storeValues true ;
>       text:analyzer [ a text:StandardAnalyzer ] ;
>       text:queryAnalyzer [ a text:KeywordAnalyzer ] ;
>       text:queryParser text:AnalyzingQueryParser ;
>       text:multilingualSupport true ;
>    .
> 
> <#entMap> a text:EntityMap ;
>       text:defaultField     "vcard_fn" ;
>       text:entityField      "uri" ;
>       text:uidField         "uid" ;
>       text:langField        "lang" ;
>       text:graphField       "graph" ;
>       text:map (
>            [ text:field "vcard_fn" ; text:predicate vcard:fn ]
>            [ text:field "altLabel"  ; text:predicate skos:altLabel ]
>            ) .
> 
> <#service> rdf:type fuseki:Service ;
>       fuseki:name                     "/ds" ;   # http://host:port/ds-ro
>       fuseki:serviceQuery             "query" ;    # SPARQL query service
>       fuseki:serviceQuery             "sparql" ;   # SPARQL query service
>       fuseki:serviceUpdate            "update" ;   # SPARQL update service
>       fuseki:serviceUpload            "upload" ;   # Non-SPARQL upload service
>       fuseki:serviceReadWriteGraphStore "data" ;     # SPARQL Graph store protocol (read and write)
>       fuseki:dataset           :text_dataset ;
>       .
>