You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2011/02/11 16:37:30 UTC
svn commit: r1069842 [4/4] - in /lucene/dev/branches/bulkpostings: ./
lucene/ lucene/src/java/org/apache/lucene/analysis/
lucene/src/java/org/apache/lucene/document/
lucene/src/java/org/apache/lucene/index/
lucene/src/java/org/apache/lucene/index/codec...
Modified: lucene/dev/branches/bulkpostings/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex?rev=1069842&r1=1069841&r2=1069842&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex (original)
+++ lucene/dev/branches/bulkpostings/modules/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.jflex Fri Feb 11 15:37:28 2011
@@ -77,6 +77,8 @@ ComplexContext = ([\p{LB:Complex_Context
Han = ([\p{Script:Han}] | {HanSupp})
Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
+// Script=Hangul & Aletter
+HangulEx = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})*
// UAX#29 WB4. X (Extend | Format)* --> X
//
ALetterEx = {ALetter} ({Format} | {Extend})*
@@ -168,16 +170,16 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
%{
/** Alphanumeric sequences */
- public static final String WORD_TYPE = "<ALPHANUM>";
+ public static final String WORD_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
/** Numbers */
- public static final String NUMERIC_TYPE = "<NUM>";
+ public static final String NUMERIC_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];
/** URLs with scheme: HTTP(S), FTP, or FILE; no-scheme URLs match HTTP syntax */
public static final String URL_TYPE = "<URL>";
/** E-mail addresses */
- public static final String EMAIL_TYPE = "<EMAIL";
+ public static final String EMAIL_TYPE = "<EMAIL>";
/**
* Chars in class \p{Line_Break = Complex_Context} are from South East Asian
@@ -187,12 +189,16 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
* <p>
* See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
*/
- public static final String SOUTH_EAST_ASIAN_TYPE = "<SOUTHEAST_ASIAN>";
+ public static final String SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.SOUTHEAST_ASIAN];
- public static final String IDEOGRAPHIC_TYPE = "<IDEOGRAPHIC>";
+ public static final String IDEOGRAPHIC_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
- public static final String HIRAGANA_TYPE = "<HIRAGANA>";
+ public static final String HIRAGANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
+ public static final String KATAKANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
+
+ public static final String HANGUL_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];
+
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt
@@ -316,6 +322,12 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
{ExtendNumLetEx}*
{ if (populateAttributes(NUMERIC_TYPE)) return true; }
+// subset of the below for typing purposes only!
+{HangulEx}+
+ { if (populateAttributes(HANGUL_TYPE)) return true; }
+
+{KatakanaEx}+
+ { if (populateAttributes(KATAKANA_TYPE)) return true; }
// UAX#29 WB5. ALetter × ALetter
// WB6. ALetter × (MidLetter | MidNumLet) ALetter
Modified: lucene/dev/branches/bulkpostings/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java?rev=1069842&r1=1069841&r2=1069842&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java (original)
+++ lucene/dev/branches/bulkpostings/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java Fri Feb 11 15:37:28 2011
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 10/3/10 9:07 AM */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 2/9/11 11:45 AM */
package org.apache.lucene.analysis.wikipedia;
@@ -25,8 +25,8 @@ import org.apache.lucene.analysis.tokena
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
- * on 10/3/10 9:07 AM from the specification file
- * <tt>C:/Users/rmuir/workspace/lucene-clean/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
+ * on 2/9/11 11:45 AM from the specification file
+ * <tt>C:/Users/rmuir/workspace/lucene-2911/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
*/
class WikipediaTokenizerImpl {
@@ -757,6 +757,12 @@ final int setText(StringBuilder buffer){
zzState = ZZ_LEXSTATE[zzLexicalState];
+ // set up zzAction for empty match case:
+ int zzAttributes = zzAttrL[zzState];
+ if ( (zzAttributes & 1) == 1 ) {
+ zzAction = zzState;
+ }
+
zzForAction: {
while (true) {
@@ -789,7 +795,7 @@ final int setText(StringBuilder buffer){
if (zzNext == -1) break zzForAction;
zzState = zzNext;
- int zzAttributes = zzAttrL[zzState];
+ zzAttributes = zzAttrL[zzState];
if ( (zzAttributes & 1) == 1 ) {
zzAction = zzState;
zzMarkedPosL = zzCurrentPosL;
Modified: lucene/dev/branches/bulkpostings/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java?rev=1069842&r1=1069841&r2=1069842&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java (original)
+++ lucene/dev/branches/bulkpostings/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java Fri Feb 11 15:37:28 2011
@@ -207,4 +207,16 @@ public class TestStandardAnalyzer extend
new String[] {"𩬅", "艱", "鍟", "䇹", "愯", "瀛"},
new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
}
+
+ public void testKorean() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "훈민정음",
+ new String[] { "훈민정음" },
+ new String[] { "<HANGUL>" });
+ }
+
+ public void testJapanese() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "仮名遣い カタカナ",
+ new String[] { "仮", "名", "遣", "い", "カタカナ" },
+ new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
+ }
}
Modified: lucene/dev/branches/bulkpostings/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java?rev=1069842&r1=1069841&r2=1069842&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java (original)
+++ lucene/dev/branches/bulkpostings/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java Fri Feb 11 15:37:28 2011
@@ -406,4 +406,16 @@ public class TestUAX29URLEmailTokenizer
new String[] {"𩬅", "艱", "鍟", "䇹", "愯", "瀛"},
new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
}
+
+ public void testKorean() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "훈민정음",
+ new String[] { "훈민정음" },
+ new String[] { "<HANGUL>" });
+ }
+
+ public void testJapanese() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "仮名遣い カタカナ",
+ new String[] { "仮", "名", "遣", "い", "カタカナ" },
+ new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
+ }
}
Modified: lucene/dev/branches/bulkpostings/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java?rev=1069842&r1=1069841&r2=1069842&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java (original)
+++ lucene/dev/branches/bulkpostings/modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java Fri Feb 11 15:37:28 2011
@@ -20,6 +20,8 @@ package org.apache.lucene.analysis.icu.s
import java.io.IOException;
import java.io.InputStream;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
@@ -44,20 +46,24 @@ import com.ibm.icu.util.ULocale;
*/
public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
/** Token type for words containing ideographic characters */
- public static final String WORD_IDEO = "<IDEOGRAPHIC>";
- /** Token type for words containing Japanese kana */
- public static final String WORD_KANA = "<KANA>";
+ public static final String WORD_IDEO = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
+ /** Token type for words containing Japanese hiragana */
+ public static final String WORD_HIRAGANA = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
+ /** Token type for words containing Japanese katakana */
+ public static final String WORD_KATAKANA = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
+ /** Token type for words containing Korean hangul */
+ public static final String WORD_HANGUL = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];
/** Token type for words that contain letters */
- public static final String WORD_LETTER = "<ALPHANUM>";
+ public static final String WORD_LETTER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
/** Token type for words that appear to be numbers */
- public static final String WORD_NUMBER = "<NUM>";
+ public static final String WORD_NUMBER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];
/*
* the default breakiterators in use. these can be expensive to
* instantiate, cheap to clone.
*/
private static final BreakIterator rootBreakIterator =
- BreakIterator.getWordInstance(ULocale.ROOT);
+ readBreakIterator("Default.brk");
private static final BreakIterator thaiBreakIterator =
BreakIterator.getWordInstance(new ULocale("th_TH"));
private static final BreakIterator hebrewBreakIterator =
@@ -87,9 +93,9 @@ public class DefaultICUTokenizerConfig e
case RuleBasedBreakIterator.WORD_IDEO:
return WORD_IDEO;
case RuleBasedBreakIterator.WORD_KANA:
- return WORD_KANA;
+ return script == UScript.HIRAGANA ? WORD_HIRAGANA : WORD_KATAKANA;
case RuleBasedBreakIterator.WORD_LETTER:
- return WORD_LETTER;
+ return script == UScript.HANGUL ? WORD_HANGUL : WORD_LETTER;
case RuleBasedBreakIterator.WORD_NUMBER:
return WORD_NUMBER;
default: /* some other custom code */
Modified: lucene/dev/branches/bulkpostings/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java?rev=1069842&r1=1069841&r2=1069842&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java (original)
+++ lucene/dev/branches/bulkpostings/modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java Fri Feb 11 15:37:28 2011
@@ -128,11 +128,10 @@ public class TestICUTokenizer extends Ba
/*
* For chinese, tokenize as char (these can later form bigrams or whatever)
- * TODO: why do full-width numerics have no word-break prop?
*/
public void testChinese() throws Exception {
assertAnalyzesTo(a, "我是中国人。 １２３４ Ｔｅｓｔｓ ",
- new String[] { "我", "是", "中", "国", "人", "tests"});
+ new String[] { "我", "是", "中", "国", "人", "1234", "tests"});
}
public void testEmpty() throws Exception {
@@ -221,4 +220,16 @@ public class TestICUTokenizer extends Ba
new String[] {"david", "has", "5000", "bones"},
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
}
+
+ public void testKorean() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "훈민정음",
+ new String[] { "훈민정음" },
+ new String[] { "<HANGUL>" });
+ }
+
+ public void testJapanese() throws Exception {
+ BaseTokenStreamTestCase.assertAnalyzesTo(a, "仮名遣い カタカナ",
+ new String[] { "仮", "名", "遣", "い", "カタカナ" },
+ new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
+ }
}
Modified: lucene/dev/branches/bulkpostings/solr/example/solr/conf/velocity/tabs.vm
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/solr/example/solr/conf/velocity/tabs.vm?rev=1069842&r1=1069841&r2=1069842&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/solr/example/solr/conf/velocity/tabs.vm (original)
+++ lucene/dev/branches/bulkpostings/solr/example/solr/conf/velocity/tabs.vm Fri Feb 11 15:37:28 2011
@@ -1,6 +1,6 @@
##TODO: Make some nice tabs here
#set($queryOpts = $params.get("queryOpts"))
-<span #annTitle("Click the link to demonstrate various Solr capabilities")><span>Examples: </span><span class="tab">#if($queryOpts && $queryOpts != "")<a href="#url_for_home">Simple</a>#{else}Simple#end</span>
-<span class="tab">#if($queryOpts == "spatial")Spatial#else<a href="#url_for_home?&queryOpts=spatial">Spatial</a>#end</span>
-<span class="tab">#if($queryOpts == "group")Group By#else<a href="#url_for_home?&queryOpts=group&group=true&group.field=manu_exact">Group By</a>#end</span></span>
+<span #annTitle("Click the link to demonstrate various Solr capabilities")><span>Examples: </span><span class="tab">#if($queryOpts && $queryOpts != "")<a href="#url_for_home/?#debug#annotate">Simple</a>#{else}Simple#end</span>
+<span class="tab">#if($queryOpts == "spatial")Spatial#else<a href="#url_for_home?&queryOpts=spatial#debug#annotate">Spatial</a>#end</span>
+<span class="tab">#if($queryOpts == "group")Group By#else<a href="#url_for_home?#debug#annotate&queryOpts=group&group=true&group.field=manu_exact">Group By</a>#end</span></span>
<hr/>
\ No newline at end of file
Modified: lucene/dev/branches/bulkpostings/solr/src/java/org/apache/solr/core/SolrResourceLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/solr/src/java/org/apache/solr/core/SolrResourceLoader.java?rev=1069842&r1=1069841&r2=1069842&view=diff
==============================================================================
--- lucene/dev/branches/bulkpostings/solr/src/java/org/apache/solr/core/SolrResourceLoader.java (original)
+++ lucene/dev/branches/bulkpostings/solr/src/java/org/apache/solr/core/SolrResourceLoader.java Fri Feb 11 15:37:28 2011
@@ -131,7 +131,12 @@ public class SolrResourceLoader implemen
*/
void addToClassLoader(final String baseDir, final FileFilter filter) {
File base = FileUtils.resolvePath(new File(getInstanceDir()), baseDir);
- this.classLoader = replaceClassLoader(classLoader, base, filter);
+ if(base != null && base.canRead() && base.isDirectory()){
+ this.classLoader = replaceClassLoader(classLoader, base, filter);
+ }
+ else{
+ log.error("Can't find (or read) file to add to classloader: " + base);
+ }
}
/**