You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by co...@apache.org on 2018/11/01 17:02:37 UTC
jena git commit: Merged hilightdebug. This closes #485
Repository: jena
Updated Branches:
refs/heads/master c3027d69a -> 289a9a905
Merged hilightdebug. This closes #485
Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/289a9a90
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/289a9a90
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/289a9a90
Branch: refs/heads/master
Commit: 289a9a9050cffc5d89f314f50eb4cc5cd21a805d
Parents: c3027d6
Author: Chris Tomlinson <ct...@moonvine.org>
Authored: Mon Oct 29 14:04:50 2018 -0500
Committer: Chris Tomlinson <ct...@moonvine.org>
Committed: Thu Nov 1 11:37:25 2018 -0500
----------------------------------------------------------------------
.../apache/jena/query/text/TextIndexLucene.java | 34 ++++++++++--------
.../apache/jena/query/text/analyzer/Util.java | 38 ++++++++++++++++++++
.../assembler/DefineAnalyzersAssembler.java | 2 +-
3 files changed, 59 insertions(+), 15 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/jena/blob/289a9a90/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLucene.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLucene.java b/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLucene.java
index 120e83f..64018e1 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLucene.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLucene.java
@@ -24,6 +24,7 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry ;
+import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.apache.jena.datatypes.RDFDatatype ;
@@ -484,6 +485,7 @@ public class TextIndexLucene implements TextIndex {
String start = RIGHT_ARROW;
String end = LEFT_ARROW;
String fragSep = DIVIDES;
+ String patternExpr = null;
boolean joinHi = true;
boolean joinFrags = true;
@@ -517,23 +519,26 @@ public class TextIndexLucene implements TextIndex {
}
}
}
+ patternExpr = end+Z_MORE_SEPS+start;
}
}
- private String frags2string(TextFragment[] frags, HighlightOpts opts) {
- String sep = "";
- String rez = "";
-
- for (TextFragment f : frags) {
- String s = opts.joinHi ? f.toString().replaceAll(opts.end+Z_MORE_SEPS+opts.start, "$1") : f.toString();
- rez += sep + s;
+ private String frags2string(final TextFragment[] frags, final HighlightOpts opts) { // concatenates the fragments' text into one string, separated by opts.fragSep
+ final StringBuilder sb = new StringBuilder();
+ String sep = ""; // empty before the first fragment, opts.fragSep before every later one
+
+ for (final TextFragment f : frags) {
+ final String fragStr = f.toString();
+ log.trace("found fragment {}", f);
+ sb.append(sep);
+ sb.append(opts.joinHi ? fragStr.replaceAll(opts.patternExpr, "$1") : fragStr); // with joinHi, each match of patternExpr (end + Z_MORE_SEPS + start) is replaced by its capture group 1; NOTE(review): the regex is recompiled per fragment and start/end are not regex-quoted here - confirm they never contain metacharacters
sep = opts.fragSep;
}
-
- return rez;
+
+ return sb.toString();
}
- private List<TextHit> highlightResults(ScoreDoc[] sDocs, IndexSearcher indexSearcher, Query query, String field, String highlight, boolean useDocLang)
+ private List<TextHit> highlightResults(ScoreDoc[] sDocs, IndexSearcher indexSearcher, Query query, String field, String highlight, boolean useDocLang, String queryLang)
throws IOException, InvalidTokenOffsetsException {
List<TextHit> results = new ArrayList<>() ;
@@ -550,13 +555,14 @@ public class TextIndexLucene implements TextIndex {
Node literal = null;
String lexical = doc.get(field) ;
String docLang = doc.get(docDef.getLangField()) ;
- String effectiveField = useDocLang ? field + "_" + docLang : field;
+ String effectiveField = useDocLang ? field + "_" + Util.getEffectiveLang(docLang, queryLang) : field;
log.trace("highlightResults[{}]: {}, field: {}, lexical: {}, docLang: {}, effectiveField: {}", sd.doc, doc, field, lexical, docLang, effectiveField) ;
if (lexical != null) {
- TokenStream tokenStream = queryAnalyzer.tokenStream(effectiveField, lexical);
+ TokenStream tokenStream = indexAnalyzer.tokenStream(effectiveField, lexical);
+ log.trace("tokenStream: {}", tokenStream.toString());
TextFragment[] frags = highlighter.getBestTextFragments(tokenStream, lexical, opts.joinFrags, opts.maxFrags);
String rez = frags2string(frags, opts);
-
+ log.trace("result: {}, #frags: {}", rez, frags.length) ;
literal = NodeFactory.createLiteral(rez, docLang);
}
@@ -639,7 +645,7 @@ public class TextIndexLucene implements TextIndex {
ScoreDoc[] sDocs = indexSearcher.search(query, limit).scoreDocs ;
if (highlight != null) {
- return highlightResults(sDocs, indexSearcher, query, textField, highlight, usingSearchFor);
+ return highlightResults(sDocs, indexSearcher, query, textField, highlight, usingSearchFor, lang);
} else {
return simpleResults(sDocs, indexSearcher, query, textField);
}
http://git-wip-us.apache.org/repos/asf/jena/blob/289a9a90/jena-text/src/main/java/org/apache/jena/query/text/analyzer/Util.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/analyzer/Util.java b/jena-text/src/main/java/org/apache/jena/query/text/analyzer/Util.java
index 1e7b85d..aff431d 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/analyzer/Util.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/analyzer/Util.java
@@ -21,13 +21,19 @@ package org.apache.jena.query.text.analyzer;
import org.apache.commons.lang3.StringUtils;
import org.apache.jena.rdf.model.Resource;
import org.apache.lucene.analysis.Analyzer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;
+import java.util.Map.Entry;
public class Util {
+ private static Logger log = LoggerFactory.getLogger(Util.class) ;
+
private static Hashtable<String, Class<?>> analyzersClasses; //mapping between BCP-47 language tags and lucene analyzersClasses
private static Hashtable<String, Analyzer> cache = new Hashtable<>(); //to avoid unnecessary multiple analyzer instantiations
@@ -42,6 +48,9 @@ public class Util {
// map of auxiliary index info
private static Hashtable<String, List<String>> auxIndexes = new Hashtable<>();
+
+ // cache of effective fields
+ private static Hashtable<String, Hashtable<String,String>> effectiveFields = new Hashtable<>();
static {
initAnalyzerDefs();
@@ -115,6 +124,35 @@ public class Util {
auxIndexes.put(tag, tags);
}
+ public static void finishCaching() { // fills effectiveFields: document tag -> (searchFor tag -> auxiliary index tag), derived from auxIndexes and searchForTags
+ log.trace("call finishCaching()");
+ for (final Entry<String,List<String>> auxIndexesE : auxIndexes.entrySet()) {
+ final String tag = auxIndexesE.getKey(); // ex: zh-hans
+ final List<String> auxIndexesL = auxIndexesE.getValue();
+ log.trace("finishCaching: tag: {}", tag);
+ for (final String auxIndexTag : auxIndexesL) { // ex: auxIndexTag: zh-aux-han2pinyin
+ log.trace("finishCaching: auxIndexTag: {}", auxIndexTag);
+ for (final String searchForTag : searchForTags.get(auxIndexTag)) { // ex: zh-latn-pinyin; NOTE(review): assumes searchForTags has an entry for every auxiliary index tag - a null result would throw here; confirm
+ final Hashtable<String,String> res = effectiveFields.computeIfAbsent(tag, x -> new Hashtable<String,String>()); // inner map is created only once a searchFor tag exists for this document tag
+ log.trace("add effectiveField mapping: d:{} + q:{} = e:{}", tag, searchForTag, auxIndexTag);
+ res.put(searchForTag, auxIndexTag); // a later auxiliary index sharing the same searchFor tag replaces the earlier mapping
+ }
+ }
+ }
+ }
+
+ public static String getEffectiveLang(final String docLang, final String queryLang) { // returns the tag cached in effectiveFields for (docLang, queryLang), or docLang itself when no mapping applies
+ final Hashtable<String,String> m = docLang == null ? null : effectiveFields.get(docLang); // Hashtable.get throws NullPointerException on a null key, so a document without a language skips the lookup
+ if (m == null)
+ return docLang; // no mappings for this document language (or no language at all): keep it unchanged
+ final String tag = queryLang == null ? null : m.get(queryLang); // same null-key guard for a query that carries no language
+ if (tag == null) {
+ log.info("getEffectiveFields got map for {} but couldn't find effective tag for {}", docLang, queryLang);
+ return docLang;
+ }
+ return tag;
+ }
+
private static void initAnalyzerDefs() {
analyzersClasses = new Hashtable<>();
analyzersClasses.put("ar", org.apache.lucene.analysis.ar.ArabicAnalyzer.class);
http://git-wip-us.apache.org/repos/asf/jena/blob/289a9a90/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineAnalyzersAssembler.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineAnalyzersAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineAnalyzersAssembler.java
index 876ca74..c9289aa 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineAnalyzersAssembler.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/DefineAnalyzersAssembler.java
@@ -165,7 +165,7 @@ public class DefineAnalyzersAssembler {
current = (Resource) rest;
}
-
+ Util.finishCaching();
return isMultilingualSupport;
}
}