You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by an...@apache.org on 2015/05/25 14:10:53 UTC

[2/6] jena git commit: Remove multilingual aspects from TextIndexLucene + use only ISO 639-1 language codes

Remove multilingual aspects from TextIndexLucene
+ use only ISO 639-1 language codes


Project: http://git-wip-us.apache.org/repos/asf/jena/repo
Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/7ab59ed6
Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/7ab59ed6
Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/7ab59ed6

Branch: refs/heads/master
Commit: 7ab59ed6d914496c9a1492376745fe9cee840f67
Parents: 9553c6b
Author: Alexis Miara <al...@hotmail.com>
Authored: Thu May 14 09:17:40 2015 -0400
Committer: Alexis Miara <al...@hotmail.com>
Committed: Thu May 14 09:17:40 2015 -0400

----------------------------------------------------------------------
 .../org/apache/jena/query/text/LuceneUtil.java  | 55 ---------------
 .../org/apache/jena/query/text/TextIndex.java   |  3 -
 .../apache/jena/query/text/TextIndexLucene.java | 74 ++++++++------------
 .../query/text/TextIndexLuceneMultilingual.java | 47 ++++++++++++-
 .../apache/jena/query/text/TextIndexSolr.java   |  5 --
 .../org/apache/jena/query/text/TextQueryPF.java |  8 +--
 6 files changed, 78 insertions(+), 114 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/jena/blob/7ab59ed6/jena-text/src/main/java/org/apache/jena/query/text/LuceneUtil.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/LuceneUtil.java b/jena-text/src/main/java/org/apache/jena/query/text/LuceneUtil.java
index 7fafc4c..050b6f3 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/LuceneUtil.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/LuceneUtil.java
@@ -37,7 +37,6 @@ public class LuceneUtil {
     }
 
     public static Analyzer getLocalizedAnalyzer(String lang, Version ver) {
-        lang = getISO2Language(lang);
         if (lang == null)
             return null;
 
@@ -58,60 +57,6 @@ public class LuceneUtil {
         }
     }
 
-    public static String getISO2Language(String lang) {
-        if (lang != null) {
-            lang = lang.split("-")[0].toLowerCase();
-            if (lang.length() == 2)
-                return lang;
-            else {
-                if ("ara".equals(lang)) return "ar";
-                if ("bul".equals(lang)) return "bg";
-                if ("cat".equals(lang)) return "ca";
-                if ("ces".equals(lang)) return "cs";
-                if ("cze".equals(lang)) return "cs";
-                if ("dan".equals(lang)) return "da";
-                if ("deu".equals(lang)) return "de";
-                if ("ger".equals(lang)) return "de";
-                if ("ell".equals(lang)) return "el";
-                if ("gre".equals(lang)) return "el";
-                if ("eng".equals(lang)) return "en";
-                if ("spa".equals(lang)) return "es";
-                if ("eus".equals(lang)) return "eu";
-                if ("baq".equals(lang)) return "eu";
-                if ("fas".equals(lang)) return "fa";
-                if ("per".equals(lang)) return "fa";
-                if ("fin".equals(lang)) return "fi";
-                if ("fra".equals(lang)) return "fr";
-                if ("fre".equals(lang)) return "fr";
-                if ("gle".equals(lang)) return "ga";
-                if ("glg".equals(lang)) return "gl";
-                if ("hin".equals(lang)) return "hi";
-                if ("hun".equals(lang)) return "hu";
-                if ("hye".equals(lang)) return "hy";
-                if ("arm".equals(lang)) return "hy";
-                if ("ind".equals(lang)) return "id";
-                if ("ita".equals(lang)) return "it";
-                if ("jpn".equals(lang)) return "jp";
-                if ("kor".equals(lang)) return "ko";
-                if ("lav".equals(lang)) return "lv";
-                if ("nld".equals(lang)) return "nl";
-                if ("dut".equals(lang)) return "nl";
-                if ("nor".equals(lang)) return "no";
-                if ("por".equals(lang)) return "pt";
-                if ("ron".equals(lang)) return "ro";
-                if ("rum".equals(lang)) return "ro";
-                if ("rus".equals(lang)) return "ru";
-                if ("swe".equals(lang)) return "sv";
-                if ("tha".equals(lang)) return "th";
-                if ("tur".equals(lang)) return "tr";
-                if ("zho".equals(lang)) return "zh";
-                if ("chi".equals(lang)) return "zh";
-            }
-        }
-
-        return null;
-    }
-
     private static void initAnalyzerDefs() {
         analyzersClasses = new Hashtable<>();
         analyzersClasses.put("ar", org.apache.lucene.analysis.ar.ArabicAnalyzer.class);

http://git-wip-us.apache.org/repos/asf/jena/blob/7ab59ed6/jena-text/src/main/java/org/apache/jena/query/text/TextIndex.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/TextIndex.java b/jena-text/src/main/java/org/apache/jena/query/text/TextIndex.java
index ffe92e7..69efb31 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/TextIndex.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/TextIndex.java
@@ -51,7 +51,4 @@ public interface TextIndex extends Closeable //, Transactional
     List<Node> query(String qs) ;
 
     EntityDefinition getDocDef() ;
-
-    //localization
-    boolean isMultilingual() ;
 }

http://git-wip-us.apache.org/repos/asf/jena/blob/7ab59ed6/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLucene.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLucene.java b/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLucene.java
index 004c242..abb9466 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLucene.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLucene.java
@@ -206,42 +206,36 @@ public class TextIndexLucene implements TextIndex {
         if ( log.isDebugEnabled() )
             log.debug("Update entity: " + entity) ;
         try {
-            Document doc = doc(entity);
-            Analyzer analyzer = null;
-            if (isMultilingual())
-                analyzer = LuceneUtil.getLocalizedAnalyzer(entity.getLanguage());
-            Term term = new Term(docDef.getEntityField(), entity.getId());
-
-            if (analyzer != null)
-                indexWriter.updateDocument(term, doc, analyzer) ;
-            else //use the default one
-                indexWriter.updateDocument(term, doc);
+            updateDocument(entity);
         } catch (IOException e) {
             throw new TextIndexException(e) ;
         }
     }
 
+    protected void updateDocument(Entity entity) throws IOException {
+        Document doc = doc(entity);
+        Term term = new Term(docDef.getEntityField(), entity.getId());
+        indexWriter.updateDocument(term, doc);
+    }
+
     @Override
     public void addEntity(Entity entity) {
         if ( log.isDebugEnabled() )
             log.debug("Add entity: " + entity) ;
         try {
-            Document doc = doc(entity) ;
-            Analyzer analyzer = null;
-            if (isMultilingual())
-                analyzer = LuceneUtil.getLocalizedAnalyzer(entity.getLanguage());
-
-            if (analyzer != null)
-                indexWriter.addDocument(doc, analyzer) ;
-            else //use the default one
-                indexWriter.addDocument(doc) ;
+            addDocument(entity);
         }
         catch (IOException e) {
             throw new TextIndexException(e) ;
         }
     }
 
-    private Document doc(Entity entity) {
+    protected void addDocument(Entity entity) throws IOException {
+        Document doc = doc(entity) ;
+        indexWriter.addDocument(doc) ;
+    }
+
+    protected Document doc(Entity entity) {
         Document doc = new Document() ;
         Field entField = new Field(docDef.getEntityField(), entity.getId(), ftIRI) ;
         doc.add(entField) ;
@@ -252,18 +246,18 @@ public class TextIndexLucene implements TextIndex {
             doc.add(gField) ;
         }
 
+        for ( Field field : buildContentFields(entity) )
+            doc.add(field);
+
+        return doc ;
+    }
+
+    protected List<Field> buildContentFields(Entity entity) {
+        List<Field> list = new ArrayList<>();
         for ( Entry<String, Object> e : entity.getMap().entrySet() ) {
-            Field field = new Field(e.getKey(), (String)e.getValue(), ftText) ;
-            doc.add(field) ;
-            if (isMultilingual()) {
-                String lang =  entity.getLanguage();
-                if (lang == null || "".equals(lang))
-                    lang = "undef";
-                field = new Field("lang", lang, StringField.TYPE_STORED ) ;
-                doc.add(field) ;
-            }
+            list.add( new Field(e.getKey(), (String) e.getValue(), ftText) );
         }
-        return doc ;
+        return list;
     }
 
     @Override
@@ -289,10 +283,14 @@ public class TextIndexLucene implements TextIndex {
         return query ;
     }
 
+    protected Query preParseQuery(String queryString, String primaryField, Analyzer analyzer) throws ParseException {
+        return parseQuery(queryString, primaryField, analyzer);
+    }
+
     private List<Map<String, Node>> get$(IndexReader indexReader, String uri) throws ParseException, IOException {
         String escaped = QueryParserBase.escape(uri) ;
         String qs = docDef.getEntityField() + ":" + escaped ;
-        Query query = parseQuery(qs, docDef.getPrimaryField(), queryAnalyzer) ;
+        Query query = preParseQuery(qs, docDef.getPrimaryField(), queryAnalyzer) ;
         IndexSearcher indexSearcher = new IndexSearcher(indexReader) ;
         ScoreDoc[] sDocs = indexSearcher.search(query, 1).scoreDocs ;
         List<Map<String, Node>> records = new ArrayList<Map<String, Node>>() ;
@@ -339,14 +337,7 @@ public class TextIndexLucene implements TextIndex {
 
     private List<Node> query$(IndexReader indexReader, String qs, int limit) throws ParseException, IOException {
         IndexSearcher indexSearcher = new IndexSearcher(indexReader) ;
-        Analyzer qAnalyzer = queryAnalyzer;
-        if (isMultilingual()) {//index and query analyzer must be the same
-            String lang = qs.substring( qs.lastIndexOf(":") + 1);
-            if (!"undef".equals(lang))
-                qAnalyzer = LuceneUtil.getLocalizedAnalyzer(lang);
-        }
-
-        Query query = parseQuery(qs, docDef.getPrimaryField(), qAnalyzer) ;
+        Query query = preParseQuery(qs, docDef.getPrimaryField(), queryAnalyzer) ;
         if ( limit <= 0 )
             limit = MAX_N ;
         ScoreDoc[] sDocs = indexSearcher.search(query, limit).scoreDocs ;
@@ -370,11 +361,6 @@ public class TextIndexLucene implements TextIndex {
         return docDef ;
     }
 
-    @Override
-    public boolean isMultilingual() {
-        return false;
-    }
-
     private Node entryToNode(String v) {
         // TEMP
         return NodeFactoryExtra.createLiteralNode(v, null, null) ;

http://git-wip-us.apache.org/repos/asf/jena/blob/7ab59ed6/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLuceneMultilingual.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLuceneMultilingual.java b/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLuceneMultilingual.java
index 86b34e6..cdf7876 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLuceneMultilingual.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/TextIndexLuceneMultilingual.java
@@ -18,8 +18,18 @@
 
 package org.apache.jena.query.text;
 
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.search.Query;
 import org.apache.lucene.store.Directory;
 
+import java.io.IOException;
+import java.util.List;
+
 public class TextIndexLuceneMultilingual extends TextIndexLucene {
 
     public TextIndexLuceneMultilingual(Directory directory, EntityDefinition def) {
@@ -27,7 +37,40 @@ public class TextIndexLuceneMultilingual extends TextIndexLucene {
     }
 
     @Override
-    public boolean isMultilingual() {
-        return true;
+    protected void updateDocument(Entity entity) throws IOException {
+        Document doc = doc(entity);
+        Term term = new Term(getDocDef().getEntityField(), entity.getId());
+        Analyzer analyzer = LuceneUtil.getLocalizedAnalyzer(entity.getLanguage());
+        if (analyzer == null)
+            analyzer = getAnalyzer();
+        getIndexWriter().updateDocument(term, doc, analyzer) ;
+    }
+
+    @Override
+    protected void addDocument(Entity entity) throws IOException {
+        Document doc = doc(entity) ;
+        Analyzer analyzer = LuceneUtil.getLocalizedAnalyzer(entity.getLanguage());
+        if (analyzer == null)
+            analyzer = getAnalyzer();
+        getIndexWriter().addDocument(doc, analyzer) ;
+    }
+
+    @Override
+    protected List<Field> buildContentFields(Entity entity) {
+        List<Field> list = super.buildContentFields(entity);
+        String lang =  entity.getLanguage();
+        if (lang == null || "".equals(lang))
+            lang = "undef";
+        list.add( new Field("lang", lang, StringField.TYPE_STORED ) );
+        return list;
+    }
+
+    @Override
+    protected Query preParseQuery(String queryString, String primaryField, Analyzer analyzer) throws ParseException {
+        String lang = queryString.substring( queryString.lastIndexOf(":") + 1);
+        if (!"undef".equals(lang))
+            analyzer = LuceneUtil.getLocalizedAnalyzer(lang);
+
+        return super.preParseQuery(queryString, primaryField, analyzer);
     }
 }

http://git-wip-us.apache.org/repos/asf/jena/blob/7ab59ed6/jena-text/src/main/java/org/apache/jena/query/text/TextIndexSolr.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/TextIndexSolr.java b/jena-text/src/main/java/org/apache/jena/query/text/TextIndexSolr.java
index 5be898c..54a3263 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/TextIndexSolr.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/TextIndexSolr.java
@@ -231,11 +231,6 @@ public class TextIndexSolr implements TextIndex
         return docDef ;
     }
 
-    @Override
-    public boolean isMultilingual() {
-        return false;
-    }
-
     private Node entryToNode(String v)
     {
         // TEMP

http://git-wip-us.apache.org/repos/asf/jena/blob/7ab59ed6/jena-text/src/main/java/org/apache/jena/query/text/TextQueryPF.java
----------------------------------------------------------------------
diff --git a/jena-text/src/main/java/org/apache/jena/query/text/TextQueryPF.java b/jena-text/src/main/java/org/apache/jena/query/text/TextQueryPF.java
index cc7e4f6..4fac00b 100644
--- a/jena-text/src/main/java/org/apache/jena/query/text/TextQueryPF.java
+++ b/jena-text/src/main/java/org/apache/jena/query/text/TextQueryPF.java
@@ -74,11 +74,9 @@ public class TextQueryPF extends PropertyFunctionBase {
         if (argObject.isList()) {
             //extract of extra lang arg if present and if is usable (multilingual index).
             //arg is removed from the list to avoid conflict with order and args length
-            if (server.isMultilingual()) {
-                langArg = extractArg("lang", argObject);
-                if (langArg == null)
-                    langArg = "undef";
-            }
+            langArg = extractArg("lang", argObject);
+            if (langArg == null && server instanceof TextIndexLuceneMultilingual)
+                langArg = "undef";
 
             List<Node> list = argObject.getArgList() ;
             if (list.size() == 0)