You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@lucene.apache.org by rm...@apache.org on 2014/03/04 18:51:58 UTC

svn commit: r1574159 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/analysis/ lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java

Author: rmuir
Date: Tue Mar  4 17:51:57 2014
New Revision: 1574159

URL: http://svn.apache.org/r1574159
Log:
SOLR-2934: increase buffer size for recent dictionaries with large amounts of AF/AM lines before charset

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/   (props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java?rev=1574159&r1=1574158&r2=1574159&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java Tue Mar  4 17:51:57 2014
@@ -154,9 +154,11 @@ public class Dictionary {
     this.ignoreCase = ignoreCase;
     this.needsInputCleaning = ignoreCase;
     this.needsOutputCleaning = false; // set if we have an OCONV
-    // hungarian has thousands of AF before the SET, so a 32k buffer is needed 
-    BufferedInputStream buffered = new BufferedInputStream(affix, 32768);
-    buffered.mark(32768);
+    // TODO: we really need to probably buffer this on disk since so many newer dictionaries
+    // (en_GB, hu_HU, etc) now have tons of AM lines (morph metadata) etc before they finally declare 
+    // their encoding... but for now this large buffer is a workaround
+    BufferedInputStream buffered = new BufferedInputStream(affix, 65536);
+    buffered.mark(65536);
     String encoding = getDictionaryEncoding(buffered);
     buffered.reset();
     CharsetDecoder decoder = getJavaEncoding(encoding);