You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2014/07/07 01:44:31 UTC

svn commit: r1608313 [5/5] - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/analysis/ lucene/analysis/common/ lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/ lucene/analysis/common/src/java/org/apache/lucene/analysis/standa...

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex?rev=1608313&r1=1608312&r2=1608313&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex Sun Jul  6 23:44:30 2014
@@ -52,40 +52,22 @@ import org.apache.lucene.analysis.tokena
 %xstate AVOID_BAD_URL
 %buffer 4096
 
-%include SUPPLEMENTARY.jflex-macro
-ALetter           = (\p{WB:ALetter}                                     | {ALetterSupp})
-Format            = (\p{WB:Format}                                      | {FormatSupp})
-Numeric           = ([\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] | {NumericSupp})
-Extend            = (\p{WB:Extend}                                      | {ExtendSupp})
-Katakana          = (\p{WB:Katakana}                                    | {KatakanaSupp})
-MidLetter         = (\p{WB:MidLetter}                                   | {MidLetterSupp})
-MidNum            = (\p{WB:MidNum}                                      | {MidNumSupp})
-MidNumLet         = (\p{WB:MidNumLet}                                   | {MidNumLetSupp})
-ExtendNumLet      = (\p{WB:ExtendNumLet}                                | {ExtendNumLetSupp})
-ComplexContext    = (\p{LB:Complex_Context}                             | {ComplexContextSupp})
-Han               = (\p{Script:Han}                                     | {HanSupp})
-Hiragana          = (\p{Script:Hiragana}                                | {HiraganaSupp})
-SingleQuote       = (\p{WB:Single_Quote}                                | {SingleQuoteSupp})
-DoubleQuote       = (\p{WB:Double_Quote}                                | {DoubleQuoteSupp})
-HebrewLetter      = (\p{WB:Hebrew_Letter}                               | {HebrewLetterSupp})
-RegionalIndicator = (\p{WB:Regional_Indicator}                          | {RegionalIndicatorSupp})
-HebrewOrALetter   = ({HebrewLetter} | {ALetter})
-
 // UAX#29 WB4. X (Extend | Format)* --> X
 //
-HangulEx            = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] ({Format} | {Extend})*
-HebrewOrALetterEx   = {HebrewOrALetter}                                         ({Format} | {Extend})*
-NumericEx           = {Numeric}                                                 ({Format} | {Extend})*
-KatakanaEx          = {Katakana}                                                ({Format} | {Extend})* 
-MidLetterEx         = ({MidLetter} | {MidNumLet} | {SingleQuote})               ({Format} | {Extend})* 
-MidNumericEx        = ({MidNum} | {MidNumLet} | {SingleQuote})                  ({Format} | {Extend})*
-ExtendNumLetEx      = {ExtendNumLet}                                            ({Format} | {Extend})*
-HanEx               = {Han}                                                     ({Format} | {Extend})*
-HiraganaEx          = {Hiragana}                                                ({Format} | {Extend})*
-SingleQuoteEx       = {SingleQuote}                                             ({Format} | {Extend})*                                            
-DoubleQuoteEx       = {DoubleQuote}                                             ({Format} | {Extend})*
-HebrewLetterEx      = {HebrewLetter}                                            ({Format} | {Extend})*
-RegionalIndicatorEx = {RegionalIndicator}                                       ({Format} | {Extend})*
+HangulEx            = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] [\p{WB:Format}\p{WB:Extend}]*
+HebrewOrALetterEx   = [\p{WB:HebrewLetter}\p{WB:ALetter}]                       [\p{WB:Format}\p{WB:Extend}]*
+NumericEx           = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]]        [\p{WB:Format}\p{WB:Extend}]*
+KatakanaEx          = \p{WB:Katakana}                                           [\p{WB:Format}\p{WB:Extend}]* 
+MidLetterEx         = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}]      [\p{WB:Format}\p{WB:Extend}]* 
+MidNumericEx        = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}]         [\p{WB:Format}\p{WB:Extend}]*
+ExtendNumLetEx      = \p{WB:ExtendNumLet}                                       [\p{WB:Format}\p{WB:Extend}]*
+HanEx               = \p{Script:Han}                                            [\p{WB:Format}\p{WB:Extend}]*
+HiraganaEx          = \p{Script:Hiragana}                                       [\p{WB:Format}\p{WB:Extend}]*
+SingleQuoteEx       = \p{WB:Single_Quote}                                       [\p{WB:Format}\p{WB:Extend}]*
+DoubleQuoteEx       = \p{WB:Double_Quote}                                       [\p{WB:Format}\p{WB:Extend}]*
+HebrewLetterEx      = \p{WB:Hebrew_Letter}                                      [\p{WB:Format}\p{WB:Extend}]*
+RegionalIndicatorEx = \p{WB:RegionalIndicator}                                  [\p{WB:Format}\p{WB:Extend}]*
+ComplexContextEx    = \p{LB:Complex_Context}                                    [\p{WB:Format}\p{WB:Extend}]*
 
 // URL and E-mail syntax specifications:
 //
@@ -304,7 +286,7 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
   //
   //    http://www.unicode.org/reports/tr14/#SA
   //
-  {ComplexContext}+ { yybegin(YYINITIAL); return SOUTH_EAST_ASIAN_TYPE; }
+  {ComplexContextEx}+ { yybegin(YYINITIAL); return SOUTH_EAST_ASIAN_TYPE; }
 
   // UAX#29 WB14.  Any ÷ Any
   //

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java?rev=1608313&r1=1608312&r2=1608313&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java Sun Jul  6 23:44:30 2014
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.5.1 */
+/* The following code was generated by JFlex 1.6.0 */
 
 package org.apache.lucene.analysis.wikipedia;
 
@@ -71,7 +71,7 @@ class WikipediaTokenizerImpl {
     "\167\15\11\17\166\15\12\17\166\15\12\17\166\15\12\17\340\15\12\17"+
     "\166\15\12\17\u0166\15\12\17\266\15\u0100\15\u0e00\15\u1040\0\u0150\21\140\0"+
     "\20\21\u0100\0\200\21\200\0\u19c0\21\100\0\u5200\21\u0c00\0\u2bb0\20\u2150\0"+
-    "\u0200\21\u0465\0\73\21\75\15\43\0";
+    "\u0200\21\u0465\0\73\21\75\15\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\63\0";
 
   /** 
    * Translates characters to character classes
@@ -427,6 +427,14 @@ class WikipediaTokenizerImpl {
 
   /** denotes if the user-EOF-code has already been executed */
   private boolean zzEOFDone;
+  
+  /** 
+   * The number of occupied positions in zzBuffer beyond zzEndRead.
+   * When a lead/high surrogate has been read from the input stream
+   * into the final zzBuffer position, this will have a value of 1;
+   * otherwise, it will have a value of 0.
+   */
+  private int zzFinalHighSurrogate = 0;
 
   /* user code: */
 
@@ -519,10 +527,10 @@ final void reset() {
    * @return         the unpacked character translation table
    */
   private static char [] zzUnpackCMap(String packed) {
-    char [] map = new char[0x10000];
+    char [] map = new char[0x110000];
     int i = 0;  /* index in packed string  */
     int j = 0;  /* index in unpacked array */
-    while (i < 230) {
+    while (i < 262) {
       int  count = packed.charAt(i++);
       char value = packed.charAt(i++);
       do map[j++] = value; while (--count > 0);
@@ -542,6 +550,8 @@ final void reset() {
 
     /* first: make room (if you can) */
     if (zzStartRead > 0) {
+      zzEndRead += zzFinalHighSurrogate;
+      zzFinalHighSurrogate = 0;
       System.arraycopy(zzBuffer, zzStartRead,
                        zzBuffer, 0,
                        zzEndRead-zzStartRead);
@@ -554,33 +564,38 @@ final void reset() {
     }
 
     /* is the buffer big enough? */
-    if (zzCurrentPos >= zzBuffer.length) {
+    if (zzCurrentPos >= zzBuffer.length - zzFinalHighSurrogate) {
       /* if not: blow it up */
-      char newBuffer[] = new char[zzCurrentPos*2];
+      char newBuffer[] = new char[zzBuffer.length*2];
       System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
       zzBuffer = newBuffer;
+      zzEndRead += zzFinalHighSurrogate;
+      zzFinalHighSurrogate = 0;
     }
 
-    /* finally: fill the buffer with new input */
-    int numRead = zzReader.read(zzBuffer, zzEndRead,
-                                            zzBuffer.length-zzEndRead);
+    /* fill the buffer with new input */
+    int requested = zzBuffer.length - zzEndRead;           
+    int totalRead = 0;
+    while (totalRead < requested) {
+      int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
+      if (numRead == -1) {
+        break;
+      }
+      totalRead += numRead;
+    }
 
-    if (numRead > 0) {
-      zzEndRead+= numRead;
+    if (totalRead > 0) {
+      zzEndRead += totalRead;
+      if (totalRead == requested) { /* possibly more input available */
+        if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
+          --zzEndRead;
+          zzFinalHighSurrogate = 1;
+        }
+      }
       return false;
     }
-    // unlikely but not impossible: read 0 characters, but not at end of stream    
-    if (numRead == 0) {
-      int c = zzReader.read();
-      if (c == -1) {
-        return true;
-      } else {
-        zzBuffer[zzEndRead++] = (char) c;
-        return false;
-      }     
-    }
 
-    // numRead < 0
+    // totalRead = 0: End of stream
     return true;
   }
 
@@ -616,6 +631,7 @@ final void reset() {
     zzEOFDone = false;
     zzEndRead = zzStartRead = 0;
     zzCurrentPos = zzMarkedPos = 0;
+    zzFinalHighSurrogate = 0;
     yyline = yychar = yycolumn = 0;
     zzLexicalState = YYINITIAL;
     if (zzBuffer.length > ZZ_BUFFERSIZE)
@@ -759,8 +775,10 @@ final void reset() {
       zzForAction: {
         while (true) {
     
-          if (zzCurrentPosL < zzEndReadL)
-            zzInput = zzBufferL[zzCurrentPosL++];
+          if (zzCurrentPosL < zzEndReadL) {
+            zzInput = Character.codePointAt(zzBufferL, zzCurrentPosL, zzEndReadL);
+            zzCurrentPosL += Character.charCount(zzInput);
+          }
           else if (zzAtEOF) {
             zzInput = YYEOF;
             break zzForAction;
@@ -780,7 +798,8 @@ final void reset() {
               break zzForAction;
             }
             else {
-              zzInput = zzBufferL[zzCurrentPosL++];
+              zzInput = Character.codePointAt(zzBufferL, zzCurrentPosL, zzEndReadL);
+              zzCurrentPosL += Character.charCount(zzInput);
             }
           }
           int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];

Modified: lucene/dev/branches/branch_4x/lucene/analysis/icu/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/icu/build.xml?rev=1608313&r1=1608312&r2=1608313&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/icu/build.xml (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/icu/build.xml Sun Jul  6 23:44:30 2014
@@ -104,43 +104,6 @@ are part of the ICU4C package. See http:
       <arg value="${rbbi.dst.dir}"/>
     </java>
   </target>
-			
-  <property name="uax29.supp.macros.output.file" 
-            location="../common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro"/>
-
-  <target name="gen-uax29-supp-macros" depends="compile-tools">
-    <java
-      classname="org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros"
-      dir="."
-      fork="true"
-      failonerror="true"
-      output="${uax29.supp.macros.output.file}">
-      <classpath>
-      	<path refid="icujar"/>
-      	<pathelement location="${build.dir}/classes/tools"/>
-      </classpath>
-      <assertions>
-        <enable package="org.apache.lucene"/>
-      </assertions>
-    </java>
-  </target>
-
-  <property name="html.strip.charfilter.supp.macros.output.file"
-            location="../common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro"/>
-
-  <target name="gen-html-strip-charfilter-supp-macros" depends="compile-tools">
-    <java
-        classname="org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros"
-        dir="."
-        fork="true"
-        failonerror="true"
-        output="${html.strip.charfilter.supp.macros.output.file}">
-      <classpath>
-        <path refid="icujar"/>
-        <pathelement location="${build.dir}/classes/tools"/>
-      </classpath>
-    </java>
-  </target>
 
   <target name="compile-tools" depends="init,common.compile-tools">
     <compile
@@ -150,6 +113,6 @@ are part of the ICU4C package. See http:
     </compile>
   </target>
 
-  <target name="regenerate" depends="gen-html-strip-charfilter-supp-macros,gen-uax29-supp-macros,gen-utr30-data-files,gennorm2,genrbbi"/>
+  <target name="regenerate" depends="gen-utr30-data-files,gennorm2,genrbbi"/>
 
 </project>

Modified: lucene/dev/branches/branch_4x/lucene/common-build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/common-build.xml?rev=1608313&r1=1608312&r2=1608313&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/common-build.xml (original)
+++ lucene/dev/branches/branch_4x/lucene/common-build.xml Sun Jul  6 23:44:30 2014
@@ -2196,7 +2196,7 @@ ${ant.project.name}.test.dependencies=${
 
   <!-- JFlex task -->
   <target name="-install-jflex" unless="jflex.loaded" depends="ivy-availability-check,ivy-configure">
-    <ivy:cachepath organisation="de.jflex" module="jflex" revision="1.5.1"
+    <ivy:cachepath organisation="de.jflex" module="jflex" revision="1.6.0"
                    inline="true" conf="default" transitive="true" pathid="jflex.classpath"/>
     <taskdef name="jflex" classname="jflex.anttask.JFlexTask" classpathref="jflex.classpath"/>
     <property name="jflex.loaded" value="true"/>