You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2013/12/20 13:18:52 UTC
svn commit: r1552610 -
/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java
Author: markg
Date: Fri Dec 20 12:18:52 2013
New Revision: 1552610
URL: http://svn.apache.org/r1552610
Log:
OPENNLP-626
Integrated Arabic, Russian, Thai, and Farsi analyzer usage to GazateerIndexer. Still need to add support for query time analyzer usage via a language code overload or language detector...
Modified:
opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java
Modified: opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java
URL: http://svn.apache.org/viewvc/opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java?rev=1552610&r1=1552609&r2=1552610&view=diff
==============================================================================
--- opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java (original)
+++ opennlp/sandbox/apache-opennlp-addons/src/main/java/org/apache/opennlp/addons/tools/entitylinker/geoentitylinker/GazateerIndexer.java Fri Dec 20 12:18:52 2013
@@ -19,10 +19,16 @@ import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.ar.ArabicAnalyzer;
+import org.apache.lucene.analysis.fa.PersianAnalyzer;
+import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.th.ThaiAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
@@ -38,6 +44,11 @@ import org.apache.lucene.util.Version;
*/
public class GazateerIndexer {
+ public GazateerIndexer() {
+ loadAnalyzerMap();
+ }
+ Map<String, Analyzer> languageAnalyzerMap = new HashMap<>();
+
public static interface Separable {
String getSeparator();
@@ -92,14 +103,19 @@ public class GazateerIndexer {
BufferedReader reader = new BufferedReader(new FileReader(gazateerInputData));
List<String> fields = new ArrayList<String>();
int counter = 0;
+ int langCodeIndex = 0;
System.out.println("reading gazateer data from file...........");
while (reader.read() != -1) {
String line = reader.readLine();
String[] values = line.split(type.getSeparator());
if (counter == 0) {
// build fields
- for (String columnName : values) {
+ for (int i = 0; i < values.length; i++) {
+ String columnName = values[i];
fields.add(columnName.replace("»¿", "").trim());
+ if (columnName.toLowerCase().equals("lc")) {
+ langCodeIndex = i;
+ }
}
@@ -108,7 +124,25 @@ public class GazateerIndexer {
for (int i = 0; i < fields.size() - 1; i++) {
doc.add(new TextField(fields.get(i), values[i], Field.Store.YES));
}
- w.addDocument(doc);
+ if (type == GazType.GEONAMES) {
+ /**
+ * see if the map contains a language specific analyzer
+ */
+ if (languageAnalyzerMap.containsKey(values[langCodeIndex])) {
+ /*
+ * if so retrieve it from the map
+ */
+ Analyzer analyzer = languageAnalyzerMap.get(values[langCodeIndex]);
+ /**
+ * index the doc using the specified analyzer
+ */
+ w.addDocument(doc, analyzer);
+ } else {
+ w.addDocument(doc);
+ }
+ } else {
+ w.addDocument(doc);
+ }
}
counter++;
if (counter % 10000 == 0) {
@@ -120,4 +154,14 @@ public class GazateerIndexer {
w.commit();
System.out.println("Completed indexing gaz! index name is: " + type.toString());
}
+/**
+ * TODO: make these analyzers configurable
+ */
+ private void loadAnalyzerMap() {
+ languageAnalyzerMap.put("ara", new ArabicAnalyzer(Version.LUCENE_45));
+ languageAnalyzerMap.put("tha", new ThaiAnalyzer(Version.LUCENE_45));
+ languageAnalyzerMap.put("rus", new RussianAnalyzer(Version.LUCENE_45));
+ languageAnalyzerMap.put("fas", new PersianAnalyzer(Version.LUCENE_45));
+
+ }
}