You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2013/10/18 11:44:27 UTC
svn commit: r1533371 - in
/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src:
java/org/apache/lucene/analysis/ko/dic/
java/org/apache/lucene/analysis/ko/tagging/
resources/org/apache/lucene/analysis/ko/dic/
Author: uschindler
Date: Fri Oct 18 09:44:27 2013
New Revision: 1533371
URL: http://svn.apache.org/r1533371
Log:
LUCENE-4956: Quick fix to remove the Trie for the Tagger. The file is very slow and a TreeMap is perfectly fine. Can still be improved, but the primary concern is to remove Trie.java
Modified:
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java
lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/tagger.dic
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java?rev=1533371&r1=1533370&r2=1533371&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/dic/DictionaryResources.java Fri Oct 18 09:44:27 2013
@@ -76,26 +76,20 @@ public class DictionaryResources {
/**
* Get the contents of a <code>Reader</code> as a list of Strings,
- * one entry per line.
- * <p>
- * This method buffers the input internally, so there is no need to use a
- * <code>BufferedReader</code>.
- *
+ * one entry per line, removing comment lines starting with '!'.
* @param input the <code>Reader</code> to read from, not null
* @return the list of Strings, never null
- * @throws NullPointerException if the input is null
* @throws IOException if an I/O error occurs
- * @since Commons IO 1.1
*/
private static List<String> readLines(Reader input) throws IOException {
BufferedReader reader = new BufferedReader(input);
List<String> list = new ArrayList<String>();
- String line = reader.readLine();
- while (line != null) {
- if ( ! (line.startsWith("!") || line.startsWith("\uFEFF!"))) { // Skip comment lines starting with '!'
- list.add(line);
+ String line;
+ while ((line = reader.readLine()) != null) {
+ if (line.startsWith("!") || line.startsWith("\uFEFF!")) { // Skip comment lines starting with '!'
+ continue;
}
- line = reader.readLine();
+ list.add(line);
}
return list;
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java?rev=1533371&r1=1533370&r2=1533371&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/java/org/apache/lucene/analysis/ko/tagging/Tagger.java Fri Oct 18 09:44:27 2013
@@ -19,14 +19,16 @@ package org.apache.lucene.analysis.ko.ta
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
+import java.util.NavigableMap;
+import java.util.TreeMap;
import org.apache.lucene.analysis.ko.dic.DictionaryResources;
import org.apache.lucene.analysis.ko.morph.AnalysisOutput;
import org.apache.lucene.analysis.ko.morph.PatternConstants;
import org.apache.lucene.analysis.ko.utils.ConstraintUtil;
-import org.apache.lucene.analysis.ko.utils.Trie;
/**
* 여러개의 형태소분석 결과 중에 최적의 것을 선택한다.
@@ -34,28 +36,27 @@ import org.apache.lucene.analysis.ko.uti
*/
public class Tagger {
- private static final Trie<String, String[]> occurrences = new Trie<String, String[]>(true);;
+ private static final NavigableMap<String, String[]> occurrences = new TreeMap<String, String[]>();;
static {
try {
List<String> strs = DictionaryResources.readLines(DictionaryResources.FILE_TAG_DIC);
-
for(String str : strs) {
- if(str==null) continue;
- str = str.trim();
+ str=str.trim();
+ if(str.isEmpty()) continue;
String[] syls = str.split("[:]+");
- if(syls.length!=4) continue;
+ if(syls.length!=4)
+ throw new IOException("Invalid file format: "+Arrays.toString(syls));
- String key = null;
+ final String key;
if("F".equals(syls[0])) key = syls[2].substring(0,syls[2].lastIndexOf("/")+1) + syls[1].substring(0,syls[1].lastIndexOf("/"));
else key = syls[1].substring(0,syls[1].lastIndexOf("/")+1) + syls[2].substring(0,syls[2].lastIndexOf("/"));
final String joined = syls[1] + "/" + syls[2] + "/" + syls[3];
String[] patns = joined.split("[/]+");
- occurrences.add(syls[0]+key, patns);
-
- }
-
+ occurrences.put(syls[0]+key, patns);
+ }
+ System.out.println(occurrences);
} catch (IOException ioe) {
throw new Error("Fail to read the tagger dictionary.", ioe);
}
@@ -202,8 +203,6 @@ public class Tagger {
private boolean checkGrammer(String[] values, String psource, String rsource, AnalysisOutput pmorph, AnalysisOutput rmorph, boolean depFront) {
- boolean ok = true;
-
String pend = pmorph.getJosa();
if(pend==null) pend = pmorph.getEomi();
@@ -284,9 +283,8 @@ public class Tagger {
return false;
}
- @SuppressWarnings("unchecked")
public static Iterator<String[]> getGR(String prefix) {
- return occurrences.getPrefixedBy(prefix);
+ return occurrences.subMap(prefix, prefix + "\uFFFF").values().iterator();
}
}
Modified: lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/tagger.dic
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/tagger.dic?rev=1533371&r1=1533370&r2=1533371&view=diff
==============================================================================
--- lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/tagger.dic (original)
+++ lucene/dev/branches/lucene4956/lucene/analysis/arirang/src/resources/org/apache/lucene/analysis/ko/dic/tagger.dic Fri Oct 18 09:44:27 2013
@@ -13,7 +13,6 @@
! See the License for the specific language governing permissions and
! limitations under the License.
!
-//#####################
F:NILL/에/0:대하^S/NILL/11:0
F:NILL/에/0:관하^S/NILL/11:0
F:NILL/에/0:따르^S/NILL/11:0