You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2014/07/07 01:44:31 UTC
svn commit: r1608313 [5/5] - in /lucene/dev/branches/branch_4x: ./ lucene/
lucene/analysis/ lucene/analysis/common/
lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/
lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ ...
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex?rev=1608313&r1=1608312&r2=1608313&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex Sun Jul 6 23:44:30 2014
@@ -52,40 +52,22 @@ import org.apache.lucene.analysis.tokena
%xstate AVOID_BAD_URL
%buffer 4096
-%include SUPPLEMENTARY.jflex-macro
-ALetter = (\p{WB:ALetter} | {ALetterSupp})
-Format = (\p{WB:Format} | {FormatSupp})
-Numeric = ([\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] | {NumericSupp})
-Extend = (\p{WB:Extend} | {ExtendSupp})
-Katakana = (\p{WB:Katakana} | {KatakanaSupp})
-MidLetter = (\p{WB:MidLetter} | {MidLetterSupp})
-MidNum = (\p{WB:MidNum} | {MidNumSupp})
-MidNumLet = (\p{WB:MidNumLet} | {MidNumLetSupp})
-ExtendNumLet = (\p{WB:ExtendNumLet} | {ExtendNumLetSupp})
-ComplexContext = (\p{LB:Complex_Context} | {ComplexContextSupp})
-Han = (\p{Script:Han} | {HanSupp})
-Hiragana = (\p{Script:Hiragana} | {HiraganaSupp})
-SingleQuote = (\p{WB:Single_Quote} | {SingleQuoteSupp})
-DoubleQuote = (\p{WB:Double_Quote} | {DoubleQuoteSupp})
-HebrewLetter = (\p{WB:Hebrew_Letter} | {HebrewLetterSupp})
-RegionalIndicator = (\p{WB:Regional_Indicator} | {RegionalIndicatorSupp})
-HebrewOrALetter = ({HebrewLetter} | {ALetter})
-
// UAX#29 WB4. X (Extend | Format)* --> X
//
-HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] ({Format} | {Extend})*
-HebrewOrALetterEx = {HebrewOrALetter} ({Format} | {Extend})*
-NumericEx = {Numeric} ({Format} | {Extend})*
-KatakanaEx = {Katakana} ({Format} | {Extend})*
-MidLetterEx = ({MidLetter} | {MidNumLet} | {SingleQuote}) ({Format} | {Extend})*
-MidNumericEx = ({MidNum} | {MidNumLet} | {SingleQuote}) ({Format} | {Extend})*
-ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
-HanEx = {Han} ({Format} | {Extend})*
-HiraganaEx = {Hiragana} ({Format} | {Extend})*
-SingleQuoteEx = {SingleQuote} ({Format} | {Extend})*
-DoubleQuoteEx = {DoubleQuote} ({Format} | {Extend})*
-HebrewLetterEx = {HebrewLetter} ({Format} | {Extend})*
-RegionalIndicatorEx = {RegionalIndicator} ({Format} | {Extend})*
+HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] [\p{WB:Format}\p{WB:Extend}]*
+HebrewOrALetterEx = [\p{WB:HebrewLetter}\p{WB:ALetter}] [\p{WB:Format}\p{WB:Extend}]*
+NumericEx = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] [\p{WB:Format}\p{WB:Extend}]*
+KatakanaEx = \p{WB:Katakana} [\p{WB:Format}\p{WB:Extend}]*
+MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
+MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
+ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
+HanEx = \p{Script:Han} [\p{WB:Format}\p{WB:Extend}]*
+HiraganaEx = \p{Script:Hiragana} [\p{WB:Format}\p{WB:Extend}]*
+SingleQuoteEx = \p{WB:Single_Quote} [\p{WB:Format}\p{WB:Extend}]*
+DoubleQuoteEx = \p{WB:Double_Quote} [\p{WB:Format}\p{WB:Extend}]*
+HebrewLetterEx = \p{WB:Hebrew_Letter} [\p{WB:Format}\p{WB:Extend}]*
+RegionalIndicatorEx = \p{WB:RegionalIndicator} [\p{WB:Format}\p{WB:Extend}]*
+ComplexContextEx = \p{LB:Complex_Context} [\p{WB:Format}\p{WB:Extend}]*
// URL and E-mail syntax specifications:
//
@@ -304,7 +286,7 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
//
// http://www.unicode.org/reports/tr14/#SA
//
- {ComplexContext}+ { yybegin(YYINITIAL); return SOUTH_EAST_ASIAN_TYPE; }
+ {ComplexContextEx}+ { yybegin(YYINITIAL); return SOUTH_EAST_ASIAN_TYPE; }
// UAX#29 WB14. Any ÷ Any
//
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java?rev=1608313&r1=1608312&r2=1608313&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java Sun Jul 6 23:44:30 2014
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.1 */
+/* The following code was generated by JFlex 1.6.0 */
package org.apache.lucene.analysis.wikipedia;
@@ -71,7 +71,7 @@ class WikipediaTokenizerImpl {
"\167\15\11\17\166\15\12\17\166\15\12\17\166\15\12\17\340\15\12\17"+
"\166\15\12\17\u0166\15\12\17\266\15\u0100\15\u0e00\15\u1040\0\u0150\21\140\0"+
"\20\21\u0100\0\200\21\200\0\u19c0\21\100\0\u5200\21\u0c00\0\u2bb0\20\u2150\0"+
- "\u0200\21\u0465\0\73\21\75\15\43\0";
+ "\u0200\21\u0465\0\73\21\75\15\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\63\0";
/**
* Translates characters to character classes
@@ -427,6 +427,14 @@ class WikipediaTokenizerImpl {
/** denotes if the user-EOF-code has already been executed */
private boolean zzEOFDone;
+
+ /**
+ * The number of occupied positions in zzBuffer beyond zzEndRead.
+ * When a lead/high surrogate has been read from the input stream
+ * into the final zzBuffer position, this will have a value of 1;
+ * otherwise, it will have a value of 0.
+ */
+ private int zzFinalHighSurrogate = 0;
/* user code: */
@@ -519,10 +527,10 @@ final void reset() {
* @return the unpacked character translation table
*/
private static char [] zzUnpackCMap(String packed) {
- char [] map = new char[0x10000];
+ char [] map = new char[0x110000];
int i = 0; /* index in packed string */
int j = 0; /* index in unpacked array */
- while (i < 230) {
+ while (i < 262) {
int count = packed.charAt(i++);
char value = packed.charAt(i++);
do map[j++] = value; while (--count > 0);
@@ -542,6 +550,8 @@ final void reset() {
/* first: make room (if you can) */
if (zzStartRead > 0) {
+ zzEndRead += zzFinalHighSurrogate;
+ zzFinalHighSurrogate = 0;
System.arraycopy(zzBuffer, zzStartRead,
zzBuffer, 0,
zzEndRead-zzStartRead);
@@ -554,33 +564,38 @@ final void reset() {
}
/* is the buffer big enough? */
- if (zzCurrentPos >= zzBuffer.length) {
+ if (zzCurrentPos >= zzBuffer.length - zzFinalHighSurrogate) {
/* if not: blow it up */
- char newBuffer[] = new char[zzCurrentPos*2];
+ char newBuffer[] = new char[zzBuffer.length*2];
System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
zzBuffer = newBuffer;
+ zzEndRead += zzFinalHighSurrogate;
+ zzFinalHighSurrogate = 0;
}
- /* finally: fill the buffer with new input */
- int numRead = zzReader.read(zzBuffer, zzEndRead,
- zzBuffer.length-zzEndRead);
+ /* fill the buffer with new input */
+ int requested = zzBuffer.length - zzEndRead;
+ int totalRead = 0;
+ while (totalRead < requested) {
+ int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
+ if (numRead == -1) {
+ break;
+ }
+ totalRead += numRead;
+ }
- if (numRead > 0) {
- zzEndRead+= numRead;
+ if (totalRead > 0) {
+ zzEndRead += totalRead;
+ if (totalRead == requested) { /* possibly more input available */
+ if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
+ --zzEndRead;
+ zzFinalHighSurrogate = 1;
+ }
+ }
return false;
}
- // unlikely but not impossible: read 0 characters, but not at end of stream
- if (numRead == 0) {
- int c = zzReader.read();
- if (c == -1) {
- return true;
- } else {
- zzBuffer[zzEndRead++] = (char) c;
- return false;
- }
- }
- // numRead < 0
+ // totalRead = 0: End of stream
return true;
}
@@ -616,6 +631,7 @@ final void reset() {
zzEOFDone = false;
zzEndRead = zzStartRead = 0;
zzCurrentPos = zzMarkedPos = 0;
+ zzFinalHighSurrogate = 0;
yyline = yychar = yycolumn = 0;
zzLexicalState = YYINITIAL;
if (zzBuffer.length > ZZ_BUFFERSIZE)
@@ -759,8 +775,10 @@ final void reset() {
zzForAction: {
while (true) {
- if (zzCurrentPosL < zzEndReadL)
- zzInput = zzBufferL[zzCurrentPosL++];
+ if (zzCurrentPosL < zzEndReadL) {
+ zzInput = Character.codePointAt(zzBufferL, zzCurrentPosL, zzEndReadL);
+ zzCurrentPosL += Character.charCount(zzInput);
+ }
else if (zzAtEOF) {
zzInput = YYEOF;
break zzForAction;
@@ -780,7 +798,8 @@ final void reset() {
break zzForAction;
}
else {
- zzInput = zzBufferL[zzCurrentPosL++];
+ zzInput = Character.codePointAt(zzBufferL, zzCurrentPosL, zzEndReadL);
+ zzCurrentPosL += Character.charCount(zzInput);
}
}
int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
Modified: lucene/dev/branches/branch_4x/lucene/analysis/icu/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/icu/build.xml?rev=1608313&r1=1608312&r2=1608313&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/icu/build.xml (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/icu/build.xml Sun Jul 6 23:44:30 2014
@@ -104,43 +104,6 @@ are part of the ICU4C package. See http:
<arg value="${rbbi.dst.dir}"/>
</java>
</target>
-
- <property name="uax29.supp.macros.output.file"
- location="../common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro"/>
-
- <target name="gen-uax29-supp-macros" depends="compile-tools">
- <java
- classname="org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros"
- dir="."
- fork="true"
- failonerror="true"
- output="${uax29.supp.macros.output.file}">
- <classpath>
- <path refid="icujar"/>
- <pathelement location="${build.dir}/classes/tools"/>
- </classpath>
- <assertions>
- <enable package="org.apache.lucene"/>
- </assertions>
- </java>
- </target>
-
- <property name="html.strip.charfilter.supp.macros.output.file"
- location="../common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro"/>
-
- <target name="gen-html-strip-charfilter-supp-macros" depends="compile-tools">
- <java
- classname="org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros"
- dir="."
- fork="true"
- failonerror="true"
- output="${html.strip.charfilter.supp.macros.output.file}">
- <classpath>
- <path refid="icujar"/>
- <pathelement location="${build.dir}/classes/tools"/>
- </classpath>
- </java>
- </target>
<target name="compile-tools" depends="init,common.compile-tools">
<compile
@@ -150,6 +113,6 @@ are part of the ICU4C package. See http:
</compile>
</target>
- <target name="regenerate" depends="gen-html-strip-charfilter-supp-macros,gen-uax29-supp-macros,gen-utr30-data-files,gennorm2,genrbbi"/>
+ <target name="regenerate" depends="gen-utr30-data-files,gennorm2,genrbbi"/>
</project>
Modified: lucene/dev/branches/branch_4x/lucene/common-build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/common-build.xml?rev=1608313&r1=1608312&r2=1608313&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/common-build.xml (original)
+++ lucene/dev/branches/branch_4x/lucene/common-build.xml Sun Jul 6 23:44:30 2014
@@ -2196,7 +2196,7 @@ ${ant.project.name}.test.dependencies=${
<!-- JFlex task -->
<target name="-install-jflex" unless="jflex.loaded" depends="ivy-availability-check,ivy-configure">
- <ivy:cachepath organisation="de.jflex" module="jflex" revision="1.5.1"
+ <ivy:cachepath organisation="de.jflex" module="jflex" revision="1.6.0"
inline="true" conf="default" transitive="true" pathid="jflex.classpath"/>
<taskdef name="jflex" classname="jflex.anttask.JFlexTask" classpathref="jflex.classpath"/>
<property name="jflex.loaded" value="true"/>