You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2013/12/19 18:48:57 UTC

svn commit: r1552377 [5/15] - in /lucene/dev/branches/lucene5339: ./ dev-tools/ dev-tools/idea/.idea/ dev-tools/idea/.idea/libraries/ dev-tools/idea/lucene/benchmark/src/ dev-tools/idea/lucene/demo/ dev-tools/idea/lucene/facet/ dev-tools/idea/solr/cont...

Modified: lucene/dev/branches/lucene5339/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5339/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex?rev=1552377&r1=1552376&r2=1552377&view=diff
==============================================================================
--- lucene/dev/branches/lucene5339/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex (original)
+++ lucene/dev/branches/lucene5339/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex Thu Dec 19 17:48:47 2013
@@ -35,11 +35,13 @@ import org.apache.lucene.analysis.tokena
  *       Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
  *   <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
  *   <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
+ *   <li>&lt;KATAKANA&gt;: A sequence of katakana characters</li>
+ *   <li>&lt;HANGUL&gt;: A sequence of Hangul characters</li>
  * </ul>
  */
 %%
 
-%unicode 6.1
+%unicode 6.3
 %integer
 %final
 %public
@@ -50,33 +52,39 @@ import org.apache.lucene.analysis.tokena
 %buffer 4096
 
 %include SUPPLEMENTARY.jflex-macro
-ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
-Format =  ([\p{WB:Format}] | {FormatSupp})
-Numeric = ([\p{WB:Numeric}] | {NumericSupp})
-Extend =  ([\p{WB:Extend}] | {ExtendSupp})
-Katakana = ([\p{WB:Katakana}] | {KatakanaSupp})
-MidLetter = ([\p{WB:MidLetter}] | {MidLetterSupp})
-MidNum = ([\p{WB:MidNum}] | {MidNumSupp})
-MidNumLet = ([\p{WB:MidNumLet}] | {MidNumLetSupp})
-ExtendNumLet = ([\p{WB:ExtendNumLet}] | {ExtendNumLetSupp})
-ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp})
-Han = ([\p{Script:Han}] | {HanSupp})
-Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
+ALetter           = (\p{WB:ALetter}                                     | {ALetterSupp})
+Format            = (\p{WB:Format}                                      | {FormatSupp})
+Numeric           = ([\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] | {NumericSupp})
+Extend            = (\p{WB:Extend}                                      | {ExtendSupp})
+Katakana          = (\p{WB:Katakana}                                    | {KatakanaSupp})
+MidLetter         = (\p{WB:MidLetter}                                   | {MidLetterSupp})
+MidNum            = (\p{WB:MidNum}                                      | {MidNumSupp})
+MidNumLet         = (\p{WB:MidNumLet}                                   | {MidNumLetSupp})
+ExtendNumLet      = (\p{WB:ExtendNumLet}                                | {ExtendNumLetSupp})
+ComplexContext    = (\p{LB:Complex_Context}                             | {ComplexContextSupp})
+Han               = (\p{Script:Han}                                     | {HanSupp})
+Hiragana          = (\p{Script:Hiragana}                                | {HiraganaSupp})
+SingleQuote       = (\p{WB:Single_Quote}                                | {SingleQuoteSupp})
+DoubleQuote       = (\p{WB:Double_Quote}                                | {DoubleQuoteSupp})
+HebrewLetter      = (\p{WB:Hebrew_Letter}                               | {HebrewLetterSupp})
+RegionalIndicator = (\p{WB:Regional_Indicator}                          | {RegionalIndicatorSupp})
+HebrewOrALetter   = ({HebrewLetter} | {ALetter})
 
-// Script=Hangul & Aletter
-HangulEx       = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})*
 // UAX#29 WB4. X (Extend | Format)* --> X
 //
-ALetterEx      = {ALetter}                     ({Format} | {Extend})*
-// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
-NumericEx      = ({Numeric} | [\uFF10-\uFF19]) ({Format} | {Extend})*
-KatakanaEx     = {Katakana}                    ({Format} | {Extend})* 
-MidLetterEx    = ({MidLetter} | {MidNumLet})   ({Format} | {Extend})* 
-MidNumericEx   = ({MidNum} | {MidNumLet})      ({Format} | {Extend})*
-ExtendNumLetEx = {ExtendNumLet}                ({Format} | {Extend})*
-
-HanEx = {Han} ({Format} | {Extend})*
-HiraganaEx = {Hiragana} ({Format} | {Extend})*
+HangulEx            = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] ({Format} | {Extend})*
+HebrewOrALetterEx   = {HebrewOrALetter}                                         ({Format} | {Extend})*
+NumericEx           = {Numeric}                                                 ({Format} | {Extend})*
+KatakanaEx          = {Katakana}                                                ({Format} | {Extend})* 
+MidLetterEx         = ({MidLetter} | {MidNumLet} | {SingleQuote})               ({Format} | {Extend})* 
+MidNumericEx        = ({MidNum} | {MidNumLet} | {SingleQuote})                  ({Format} | {Extend})*
+ExtendNumLetEx      = {ExtendNumLet}                                            ({Format} | {Extend})*
+HanEx               = {Han}                                                     ({Format} | {Extend})*
+HiraganaEx          = {Hiragana}                                                ({Format} | {Extend})*
+SingleQuoteEx       = {SingleQuote}                                             ({Format} | {Extend})*                                            
+DoubleQuoteEx       = {DoubleQuote}                                             ({Format} | {Extend})*
+HebrewLetterEx      = {HebrewLetter}                                            ({Format} | {Extend})*
+RegionalIndicatorEx = {RegionalIndicator}                                       ({Format} | {Extend})*
 
 // URL and E-mail syntax specifications:
 //
@@ -213,40 +221,47 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
 {EMAIL} { return EMAIL_TYPE; }
 
 // UAX#29 WB8.   Numeric × Numeric
-//        WB11.  Numeric (MidNum | MidNumLet) × Numeric
-//        WB12.  Numeric × (MidNum | MidNumLet) Numeric
-//        WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
-//        WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
-//
-{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx} 
-                              | {MidNumericEx} {NumericEx} 
-                              | {NumericEx})*
-{ExtendNumLetEx}* 
+//        WB11.  Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
+//        WB12.  Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
+//        WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+//        WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana) 
+//
+{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}* 
   { return NUMERIC_TYPE; }
 
 // subset of the below for typing purposes only!
 {HangulEx}+
   { return HANGUL_TYPE; }
-
+  
 {KatakanaEx}+
   { return KATAKANA_TYPE; }
 
-// UAX#29 WB5.   ALetter × ALetter
-//        WB6.   ALetter × (MidLetter | MidNumLet) ALetter
-//        WB7.   ALetter (MidLetter | MidNumLet) × ALetter
-//        WB9.   ALetter × Numeric
-//        WB10.  Numeric × ALetter
+// UAX#29 WB5.   (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
+//        WB6.   (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
+//        WB7.   (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
+//        WB7a.  Hebrew_Letter × Single_Quote
+//        WB7b.  Hebrew_Letter × Double_Quote Hebrew_Letter
+//        WB7c.  Hebrew_Letter Double_Quote × Hebrew_Letter
+//        WB9.   (ALetter | Hebrew_Letter) × Numeric
+//        WB10.  Numeric × (ALetter | Hebrew_Letter)
 //        WB13.  Katakana × Katakana
-//        WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
-//        WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
+//        WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+//        WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana) 
 //
-{ExtendNumLetEx}*  ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})* 
-                   | ( {NumericEx}  ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
-                     | {ALetterEx}  ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx}  {ALetterEx} | {ALetterEx})* )+ ) 
-({ExtendNumLetEx}+ ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})* 
-                   | ( {NumericEx}  ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
-                     | {ALetterEx}  ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx}  {ALetterEx} | {ALetterEx})* )+ ) )*
-{ExtendNumLetEx}*  
+{ExtendNumLetEx}*  ( {KatakanaEx}          ( {ExtendNumLetEx}*   {KatakanaEx}                            )*
+                   | ( {HebrewLetterEx}    ( {SingleQuoteEx}     | {DoubleQuoteEx} {HebrewLetterEx}      )
+                     | {NumericEx}         ( ( {ExtendNumLetEx}* | {MidNumericEx} )  {NumericEx}         )*
+                     | {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx}  )  {HebrewOrALetterEx} )*
+                     )+
+                   )
+({ExtendNumLetEx}+ ( {KatakanaEx}          ( {ExtendNumLetEx}*   {KatakanaEx}                            )*
+                   | ( {HebrewLetterEx}    ( {SingleQuoteEx}     | {DoubleQuoteEx}   {HebrewLetterEx}    )
+                     | {NumericEx}         ( ( {ExtendNumLetEx}* | {MidNumericEx} )  {NumericEx}         )*
+                     | {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx}  )  {HebrewOrALetterEx} )*
+                     )+
+                   )
+)*
+{ExtendNumLetEx}* 
   { return WORD_TYPE; }
 
 
@@ -258,7 +273,7 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
 //    annex.  That means that satisfactory treatment of languages like Chinese
 //    or Thai requires special handling.
 // 
-// In Unicode 6.1, only one character has the \p{Line_Break = Contingent_Break}
+// In Unicode 6.3, only one character has the \p{Line_Break = Contingent_Break}
 // property: U+FFFC (  ) OBJECT REPLACEMENT CHARACTER.
 //
 // In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
@@ -280,6 +295,8 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNam
 // UAX#29 WB3.   CR × LF
 //        WB3a.  (Newline | CR | LF) ÷
 //        WB3b.  ÷ (Newline | CR | LF)
+//        WB13c. Regional_Indicator × Regional_Indicator
 //        WB14.  Any ÷ Any
 //
-[^] { /* Break so we don't hit fall-through warning: */ break;/* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
+{RegionalIndicatorEx} {RegionalIndicatorEx}+ | [^]
+  { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }

Modified: lucene/dev/branches/lucene5339/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5339/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java?rev=1552377&r1=1552376&r2=1552377&view=diff
==============================================================================
--- lucene/dev/branches/lucene5339/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java (original)
+++ lucene/dev/branches/lucene5339/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java Thu Dec 19 17:48:47 2013
@@ -133,8 +133,8 @@ public class SynonymFilterFactory extend
       analyzer = new Analyzer() {
         @Override
         protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-          Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_50, reader) : factory.create(reader);
-          TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_50, tokenizer) : tokenizer;
+          Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader) : factory.create(reader);
+          TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_CURRENT, tokenizer) : tokenizer;
           return new TokenStreamComponents(tokenizer, stream);
         }
       };
@@ -201,7 +201,7 @@ public class SynonymFilterFactory extend
   private Analyzer loadAnalyzer(ResourceLoader loader, String cname) throws IOException {
     Class<? extends Analyzer> clazz = loader.findClass(cname, Analyzer.class);
     try {
-      Analyzer analyzer = clazz.getConstructor(Version.class).newInstance(Version.LUCENE_50);
+      Analyzer analyzer = clazz.getConstructor(Version.class).newInstance(Version.LUCENE_CURRENT);
       if (analyzer instanceof ResourceLoaderAware) {
         ((ResourceLoaderAware) analyzer).inform(loader);
       }

Modified: lucene/dev/branches/lucene5339/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5339/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java?rev=1552377&r1=1552376&r2=1552377&view=diff
==============================================================================
--- lucene/dev/branches/lucene5339/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java (original)
+++ lucene/dev/branches/lucene5339/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.java Thu Dec 19 17:48:47 2013
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex. */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT */
 
 package org.apache.lucene.analysis.wikipedia;
 
@@ -84,21 +84,20 @@ class WikipediaTokenizerImpl {
   private static final int [] ZZ_ACTION = zzUnpackAction();
 
   private static final String ZZ_ACTION_PACKED_0 =
-    "\12\0\4\1\4\2\1\3\1\1\1\4\1\1\2\5"+
-    "\1\6\2\5\1\7\1\5\2\10\1\11\1\12\1\11"+
-    "\1\13\1\14\1\10\1\15\1\16\1\15\1\17\1\20"+
-    "\1\10\1\21\1\10\4\22\1\23\1\22\1\24\1\25"+
-    "\1\26\3\0\1\27\14\0\1\30\1\31\1\32\1\33"+
-    "\1\11\1\0\1\34\1\35\1\36\1\0\1\37\1\0"+
-    "\1\40\3\0\1\41\1\42\2\43\1\42\2\44\2\0"+
-    "\1\43\1\0\14\43\1\42\3\0\1\11\1\45\3\0"+
-    "\1\46\1\47\5\0\1\50\4\0\1\50\2\0\2\50"+
-    "\2\0\1\11\5\0\1\31\1\42\1\43\1\51\3\0"+
-    "\1\11\2\0\1\52\30\0\1\53\2\0\1\54\1\55"+
-    "\1\56";
+    "\12\0\4\1\4\2\1\3\1\4\1\1\2\5\1\6"+
+    "\1\5\1\7\1\5\2\10\1\11\1\5\1\12\1\11"+
+    "\1\13\1\14\1\15\1\16\1\15\1\17\1\20\1\10"+
+    "\1\21\1\10\4\22\1\23\1\24\1\25\1\26\3\0"+
+    "\1\27\14\0\1\30\1\31\1\32\1\33\1\11\1\0"+
+    "\1\34\1\35\1\36\1\0\1\37\1\0\1\40\3\0"+
+    "\1\41\1\42\2\43\1\42\2\44\2\0\1\43\1\0"+
+    "\14\43\1\42\3\0\1\11\1\45\3\0\1\46\1\47"+
+    "\5\0\1\50\4\0\1\50\2\0\2\50\2\0\1\11"+
+    "\5\0\1\31\1\42\1\43\1\51\3\0\1\11\2\0"+
+    "\1\52\30\0\1\53\2\0\1\54\1\55\1\56";
 
   private static int [] zzUnpackAction() {
-    int [] result = new int[184];
+    int [] result = new int[181];
     int offset = 0;
     offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
     return result;
@@ -125,30 +124,30 @@ class WikipediaTokenizerImpl {
   private static final String ZZ_ROWMAP_PACKED_0 =
     "\0\0\0\54\0\130\0\204\0\260\0\334\0\u0108\0\u0134"+
     "\0\u0160\0\u018c\0\u01b8\0\u01e4\0\u0210\0\u023c\0\u0268\0\u0294"+
-    "\0\u02c0\0\u02ec\0\u01b8\0\u0318\0\u0344\0\u0370\0\u01b8\0\u039c"+
-    "\0\u03c8\0\u03f4\0\u0420\0\u044c\0\u0478\0\u01b8\0\u039c\0\u04a4"+
-    "\0\u01b8\0\u04d0\0\u04fc\0\u0528\0\u0554\0\u0580\0\u05ac\0\u05d8"+
-    "\0\u0604\0\u0630\0\u065c\0\u0688\0\u06b4\0\u01b8\0\u06e0\0\u039c"+
-    "\0\u070c\0\u0738\0\u0764\0\u0790\0\u01b8\0\u01b8\0\u07bc\0\u07e8"+
-    "\0\u0814\0\u01b8\0\u0840\0\u086c\0\u0898\0\u08c4\0\u08f0\0\u091c"+
-    "\0\u0948\0\u0974\0\u09a0\0\u09cc\0\u09f8\0\u0a24\0\u0a50\0\u0a7c"+
-    "\0\u01b8\0\u01b8\0\u0aa8\0\u0ad4\0\u0b00\0\u0b00\0\u01b8\0\u0b2c"+
+    "\0\u02c0\0\u02ec\0\u01b8\0\u0318\0\u0344\0\u01b8\0\u0370\0\u039c"+
+    "\0\u03c8\0\u03f4\0\u0420\0\u01b8\0\u0370\0\u044c\0\u0478\0\u01b8"+
+    "\0\u04a4\0\u04d0\0\u04fc\0\u0528\0\u0554\0\u0580\0\u05ac\0\u05d8"+
+    "\0\u0604\0\u0630\0\u065c\0\u01b8\0\u0688\0\u0370\0\u06b4\0\u06e0"+
+    "\0\u070c\0\u01b8\0\u01b8\0\u0738\0\u0764\0\u0790\0\u01b8\0\u07bc"+
+    "\0\u07e8\0\u0814\0\u0840\0\u086c\0\u0898\0\u08c4\0\u08f0\0\u091c"+
+    "\0\u0948\0\u0974\0\u09a0\0\u09cc\0\u09f8\0\u01b8\0\u01b8\0\u0a24"+
+    "\0\u0a50\0\u0a7c\0\u0a7c\0\u01b8\0\u0aa8\0\u0ad4\0\u0b00\0\u0b2c"+
     "\0\u0b58\0\u0b84\0\u0bb0\0\u0bdc\0\u0c08\0\u0c34\0\u0c60\0\u0c8c"+
-    "\0\u0cb8\0\u0ce4\0\u0d10\0\u0898\0\u0d3c\0\u0d68\0\u0d94\0\u0dc0"+
+    "\0\u0814\0\u0cb8\0\u0ce4\0\u0d10\0\u0d3c\0\u0d68\0\u0d94\0\u0dc0"+
     "\0\u0dec\0\u0e18\0\u0e44\0\u0e70\0\u0e9c\0\u0ec8\0\u0ef4\0\u0f20"+
-    "\0\u0f4c\0\u0f78\0\u0fa4\0\u0fd0\0\u0ffc\0\u1028\0\u1054\0\u1080"+
-    "\0\u10ac\0\u10d8\0\u01b8\0\u1104\0\u1130\0\u115c\0\u1188\0\u01b8"+
+    "\0\u0f4c\0\u0f78\0\u0fa4\0\u0fd0\0\u0ffc\0\u1028\0\u1054\0\u01b8"+
+    "\0\u1080\0\u10ac\0\u10d8\0\u1104\0\u01b8\0\u1130\0\u115c\0\u1188"+
     "\0\u11b4\0\u11e0\0\u120c\0\u1238\0\u1264\0\u1290\0\u12bc\0\u12e8"+
-    "\0\u1314\0\u1340\0\u136c\0\u1398\0\u13c4\0\u086c\0\u09f8\0\u13f0"+
-    "\0\u141c\0\u1448\0\u1474\0\u14a0\0\u14cc\0\u14f8\0\u1524\0\u01b8"+
-    "\0\u1550\0\u157c\0\u15a8\0\u15d4\0\u1600\0\u162c\0\u1658\0\u1684"+
-    "\0\u16b0\0\u01b8\0\u16dc\0\u1708\0\u1734\0\u1760\0\u178c\0\u17b8"+
+    "\0\u1314\0\u1340\0\u07e8\0\u0974\0\u136c\0\u1398\0\u13c4\0\u13f0"+
+    "\0\u141c\0\u1448\0\u1474\0\u14a0\0\u01b8\0\u14cc\0\u14f8\0\u1524"+
+    "\0\u1550\0\u157c\0\u15a8\0\u15d4\0\u1600\0\u162c\0\u01b8\0\u1658"+
+    "\0\u1684\0\u16b0\0\u16dc\0\u1708\0\u1734\0\u1760\0\u178c\0\u17b8"+
     "\0\u17e4\0\u1810\0\u183c\0\u1868\0\u1894\0\u18c0\0\u18ec\0\u1918"+
     "\0\u1944\0\u1970\0\u199c\0\u19c8\0\u19f4\0\u1a20\0\u1a4c\0\u1a78"+
-    "\0\u1aa4\0\u1ad0\0\u1afc\0\u1b28\0\u1b54\0\u01b8\0\u01b8\0\u01b8";
+    "\0\u1aa4\0\u1ad0\0\u01b8\0\u01b8\0\u01b8";
 
   private static int [] zzUnpackRowMap() {
-    int [] result = new int[184];
+    int [] result = new int[181];
     int offset = 0;
     offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
     return result;
@@ -172,152 +171,149 @@ class WikipediaTokenizerImpl {
 
   private static final String ZZ_TRANS_PACKED_0 =
     "\1\13\1\14\5\13\1\15\1\13\1\16\3\13\1\17"+
-    "\1\20\1\21\1\22\1\23\1\24\2\13\1\25\2\13"+
-    "\15\17\1\26\2\13\3\17\1\13\7\27\1\30\5\27"+
-    "\4\31\1\27\1\32\3\27\1\33\1\27\15\31\3\27"+
-    "\3\31\10\27\1\30\5\27\4\34\1\27\1\32\3\27"+
-    "\1\35\1\27\15\34\3\27\3\34\1\27\7\36\1\37"+
-    "\5\36\4\40\1\36\1\32\2\27\1\36\1\41\1\36"+
-    "\15\40\3\36\1\42\2\40\2\36\1\43\5\36\1\37"+
-    "\5\36\4\44\1\36\1\45\2\36\1\46\2\36\15\44"+
-    "\3\36\3\44\10\36\1\37\5\36\4\47\1\36\1\45"+
-    "\2\36\1\46\2\36\15\47\3\36\3\47\10\36\1\37"+
-    "\5\36\4\47\1\36\1\45\2\36\1\50\2\36\15\47"+
-    "\3\36\3\47\10\36\1\37\1\36\1\51\3\36\4\52"+
-    "\1\36\1\45\5\36\15\52\3\36\3\52\10\36\1\53"+
-    "\5\36\4\54\1\36\1\45\5\36\15\54\1\36\1\55"+
-    "\1\36\3\54\1\36\1\56\1\57\5\56\1\60\1\56"+
-    "\1\61\3\56\4\62\1\56\1\63\2\56\1\64\2\56"+
-    "\15\62\2\56\1\65\3\62\1\56\55\0\1\66\62\0"+
-    "\1\67\4\0\4\70\7\0\6\70\1\71\6\70\3\0"+
-    "\3\70\12\0\1\72\43\0\1\73\1\74\1\75\1\76"+
-    "\2\77\1\0\1\100\3\0\1\100\1\17\1\20\1\21"+
-    "\1\22\7\0\15\17\3\0\3\17\3\0\1\101\1\0"+
-    "\1\102\2\103\1\0\1\104\3\0\1\104\3\20\1\22"+
-    "\7\0\15\20\3\0\3\20\2\0\1\73\1\105\1\75"+
-    "\1\76\2\103\1\0\1\104\3\0\1\104\1\21\1\20"+
-    "\1\21\1\22\7\0\15\21\3\0\3\21\3\0\1\106"+
-    "\1\0\1\102\2\77\1\0\1\100\3\0\1\100\4\22"+
-    "\7\0\15\22\3\0\3\22\24\0\1\13\55\0\1\107"+
-    "\73\0\1\110\16\0\1\67\4\0\4\70\7\0\15\70"+
-    "\3\0\3\70\16\0\4\31\7\0\15\31\3\0\3\31"+
-    "\24\0\1\27\56\0\1\111\42\0\4\34\7\0\15\34"+
-    "\3\0\3\34\27\0\1\112\42\0\4\40\7\0\15\40"+
-    "\3\0\3\40\16\0\4\40\7\0\2\40\1\113\12\40"+
-    "\3\0\3\40\2\0\1\114\67\0\4\44\7\0\15\44"+
-    "\3\0\3\44\24\0\1\36\55\0\1\115\43\0\4\47"+
-    "\7\0\15\47\3\0\3\47\26\0\1\116\37\0\1\117"+
-    "\57\0\4\52\7\0\15\52\3\0\3\52\11\0\1\120"+
-    "\4\0\4\70\7\0\15\70\3\0\3\70\16\0\4\54"+
-    "\7\0\15\54\3\0\3\54\47\0\1\117\6\0\1\121"+
-    "\63\0\1\122\57\0\4\62\7\0\15\62\3\0\3\62"+
-    "\24\0\1\56\55\0\1\123\43\0\4\70\7\0\15\70"+
-    "\3\0\3\70\14\0\1\36\1\0\4\124\1\0\3\125"+
-    "\3\0\15\124\3\0\3\124\14\0\1\36\1\0\4\124"+
-    "\1\0\3\125\3\0\3\124\1\126\11\124\3\0\3\124"+
-    "\16\0\1\127\1\0\1\127\10\0\15\127\3\0\3\127"+
-    "\16\0\1\130\1\131\1\132\1\133\7\0\15\130\3\0"+
-    "\3\130\16\0\1\134\1\0\1\134\10\0\15\134\3\0"+
-    "\3\134\16\0\1\135\1\136\1\135\1\136\7\0\15\135"+
-    "\3\0\3\135\16\0\1\137\2\140\1\141\7\0\15\137"+
-    "\3\0\3\137\16\0\1\100\2\142\10\0\15\100\3\0"+
-    "\3\100\16\0\1\143\2\144\1\145\7\0\15\143\3\0"+
-    "\3\143\16\0\4\136\7\0\15\136\3\0\3\136\16\0"+
-    "\1\146\2\147\1\150\7\0\15\146\3\0\3\146\16\0"+
-    "\1\151\2\152\1\153\7\0\15\151\3\0\3\151\16\0"+
-    "\1\154\1\144\1\155\1\145\7\0\15\154\3\0\3\154"+
-    "\16\0\1\156\2\131\1\133\7\0\15\156\3\0\3\156"+
-    "\30\0\1\157\1\160\64\0\1\161\27\0\4\40\7\0"+
-    "\2\40\1\162\12\40\3\0\3\40\2\0\1\163\101\0"+
-    "\1\164\1\165\40\0\4\70\7\0\6\70\1\166\6\70"+
-    "\3\0\3\70\2\0\1\167\63\0\1\170\71\0\1\171"+
-    "\1\172\34\0\1\173\1\0\1\36\1\0\4\124\1\0"+
-    "\3\125\3\0\15\124\3\0\3\124\16\0\4\174\1\0"+
-    "\3\125\3\0\15\174\3\0\3\174\12\0\1\173\1\0"+
-    "\1\36\1\0\4\124\1\0\3\125\3\0\10\124\1\175"+
-    "\4\124\3\0\3\124\2\0\1\73\13\0\1\127\1\0"+
-    "\1\127\10\0\15\127\3\0\3\127\3\0\1\176\1\0"+
-    "\1\102\2\177\6\0\1\130\1\131\1\132\1\133\7\0"+
-    "\15\130\3\0\3\130\3\0\1\200\1\0\1\102\2\201"+
-    "\1\0\1\202\3\0\1\202\3\131\1\133\7\0\15\131"+
-    "\3\0\3\131\3\0\1\203\1\0\1\102\2\201\1\0"+
-    "\1\202\3\0\1\202\1\132\1\131\1\132\1\133\7\0"+
-    "\15\132\3\0\3\132\3\0\1\204\1\0\1\102\2\177"+
-    "\6\0\4\133\7\0\15\133\3\0\3\133\3\0\1\205"+
-    "\2\0\1\205\7\0\1\135\1\136\1\135\1\136\7\0"+
-    "\15\135\3\0\3\135\3\0\1\205\2\0\1\205\7\0"+
-    "\4\136\7\0\15\136\3\0\3\136\3\0\1\177\1\0"+
-    "\1\102\2\177\6\0\1\137\2\140\1\141\7\0\15\137"+
-    "\3\0\3\137\3\0\1\201\1\0\1\102\2\201\1\0"+
-    "\1\202\3\0\1\202\3\140\1\141\7\0\15\140\3\0"+
-    "\3\140\3\0\1\177\1\0\1\102\2\177\6\0\4\141"+
-    "\7\0\15\141\3\0\3\141\3\0\1\202\2\0\2\202"+
-    "\1\0\1\202\3\0\1\202\3\142\10\0\15\142\3\0"+
-    "\3\142\3\0\1\106\1\0\1\102\2\77\1\0\1\100"+
-    "\3\0\1\100\1\143\2\144\1\145\7\0\15\143\3\0"+
-    "\3\143\3\0\1\101\1\0\1\102\2\103\1\0\1\104"+
-    "\3\0\1\104\3\144\1\145\7\0\15\144\3\0\3\144"+
-    "\3\0\1\106\1\0\1\102\2\77\1\0\1\100\3\0"+
-    "\1\100\4\145\7\0\15\145\3\0\3\145\3\0\1\77"+
-    "\1\0\1\102\2\77\1\0\1\100\3\0\1\100\1\146"+
-    "\2\147\1\150\7\0\15\146\3\0\3\146\3\0\1\103"+
-    "\1\0\1\102\2\103\1\0\1\104\3\0\1\104\3\147"+
-    "\1\150\7\0\15\147\3\0\3\147\3\0\1\77\1\0"+
-    "\1\102\2\77\1\0\1\100\3\0\1\100\4\150\7\0"+
-    "\15\150\3\0\3\150\3\0\1\100\2\0\2\100\1\0"+
-    "\1\100\3\0\1\100\1\151\2\152\1\153\7\0\15\151"+
-    "\3\0\3\151\3\0\1\104\2\0\2\104\1\0\1\104"+
-    "\3\0\1\104\3\152\1\153\7\0\15\152\3\0\3\152"+
-    "\3\0\1\100\2\0\2\100\1\0\1\100\3\0\1\100"+
-    "\4\153\7\0\15\153\3\0\3\153\3\0\1\206\1\0"+
-    "\1\102\2\77\1\0\1\100\3\0\1\100\1\154\1\144"+
-    "\1\155\1\145\7\0\15\154\3\0\3\154\3\0\1\207"+
-    "\1\0\1\102\2\103\1\0\1\104\3\0\1\104\1\155"+
-    "\1\144\1\155\1\145\7\0\15\155\3\0\3\155\3\0"+
-    "\1\204\1\0\1\102\2\177\6\0\1\156\2\131\1\133"+
-    "\7\0\15\156\3\0\3\156\31\0\1\160\54\0\1\210"+
-    "\64\0\1\211\26\0\4\40\7\0\15\40\3\0\1\40"+
-    "\1\212\1\40\31\0\1\165\54\0\1\213\35\0\1\36"+
-    "\1\0\4\124\1\0\3\125\3\0\3\124\1\214\11\124"+
-    "\3\0\3\124\2\0\1\215\102\0\1\172\54\0\1\216"+
-    "\34\0\1\217\52\0\1\173\3\0\4\174\7\0\15\174"+
-    "\3\0\3\174\12\0\1\173\1\0\1\220\1\0\4\124"+
-    "\1\0\3\125\3\0\15\124\3\0\3\124\16\0\1\221"+
-    "\1\133\1\221\1\133\7\0\15\221\3\0\3\221\16\0"+
-    "\4\141\7\0\15\141\3\0\3\141\16\0\4\145\7\0"+
-    "\15\145\3\0\3\145\16\0\4\150\7\0\15\150\3\0"+
-    "\3\150\16\0\4\153\7\0\15\153\3\0\3\153\16\0"+
-    "\1\222\1\145\1\222\1\145\7\0\15\222\3\0\3\222"+
-    "\16\0\4\133\7\0\15\133\3\0\3\133\16\0\4\223"+
-    "\7\0\15\223\3\0\3\223\33\0\1\224\61\0\1\225"+
-    "\30\0\4\40\6\0\1\226\15\40\3\0\2\40\1\227"+
-    "\33\0\1\230\32\0\1\173\1\0\1\36\1\0\4\124"+
-    "\1\0\3\125\3\0\10\124\1\231\4\124\3\0\3\124"+
-    "\2\0\1\232\104\0\1\233\36\0\4\234\7\0\15\234"+
-    "\3\0\3\234\3\0\1\176\1\0\1\102\2\177\6\0"+
-    "\1\221\1\133\1\221\1\133\7\0\15\221\3\0\3\221"+
-    "\3\0\1\206\1\0\1\102\2\77\1\0\1\100\3\0"+
-    "\1\100\1\222\1\145\1\222\1\145\7\0\15\222\3\0"+
-    "\3\222\3\0\1\205\2\0\1\205\7\0\4\223\7\0"+
-    "\15\223\3\0\3\223\34\0\1\235\55\0\1\236\26\0"+
-    "\1\237\60\0\4\40\6\0\1\226\15\40\3\0\3\40"+
-    "\34\0\1\240\31\0\1\173\1\0\1\117\1\0\4\124"+
-    "\1\0\3\125\3\0\15\124\3\0\3\124\34\0\1\241"+
-    "\32\0\1\242\2\0\4\234\7\0\15\234\3\0\3\234"+
-    "\35\0\1\243\62\0\1\244\20\0\1\245\77\0\1\246"+
-    "\53\0\1\247\32\0\1\36\1\0\4\174\1\0\3\125"+
-    "\3\0\15\174\3\0\3\174\36\0\1\250\53\0\1\251"+
-    "\33\0\4\252\7\0\15\252\3\0\3\252\36\0\1\253"+
-    "\53\0\1\254\54\0\1\255\61\0\1\256\11\0\1\257"+
-    "\12\0\4\252\7\0\15\252\3\0\3\252\37\0\1\260"+
-    "\53\0\1\261\54\0\1\262\22\0\1\13\62\0\4\263"+
-    "\7\0\15\263\3\0\3\263\40\0\1\264\53\0\1\265"+
-    "\43\0\1\266\26\0\2\263\1\0\2\263\1\0\2\263"+
-    "\2\0\5\263\7\0\15\263\3\0\4\263\27\0\1\267"+
-    "\53\0\1\270\24\0";
+    "\1\20\1\21\1\22\1\23\3\13\1\24\2\13\15\17"+
+    "\1\25\2\13\3\17\1\13\7\26\1\27\5\26\4\30"+
+    "\5\26\1\31\1\26\15\30\3\26\3\30\10\26\1\27"+
+    "\5\26\4\32\5\26\1\33\1\26\15\32\3\26\3\32"+
+    "\1\26\7\34\1\35\5\34\4\36\1\34\1\37\2\26"+
+    "\1\34\1\40\1\34\15\36\3\34\1\41\2\36\2\34"+
+    "\1\42\5\34\1\35\5\34\4\43\4\34\1\44\2\34"+
+    "\15\43\3\34\3\43\10\34\1\35\5\34\4\45\4\34"+
+    "\1\44\2\34\15\45\3\34\3\45\10\34\1\35\5\34"+
+    "\4\45\4\34\1\46\2\34\15\45\3\34\3\45\10\34"+
+    "\1\35\1\34\1\47\3\34\4\50\7\34\15\50\3\34"+
+    "\3\50\10\34\1\51\5\34\4\52\7\34\15\52\1\34"+
+    "\1\53\1\34\3\52\1\34\1\54\1\55\5\54\1\56"+
+    "\1\54\1\57\3\54\4\60\4\54\1\61\2\54\15\60"+
+    "\2\54\1\62\3\60\1\54\55\0\1\63\62\0\1\64"+
+    "\4\0\4\65\7\0\6\65\1\66\6\65\3\0\3\65"+
+    "\12\0\1\67\43\0\1\70\1\71\1\72\1\73\2\74"+
+    "\1\0\1\75\3\0\1\75\1\17\1\20\1\21\1\22"+
+    "\7\0\15\17\3\0\3\17\3\0\1\76\1\0\1\77"+
+    "\2\100\1\0\1\101\3\0\1\101\3\20\1\22\7\0"+
+    "\15\20\3\0\3\20\2\0\1\70\1\102\1\72\1\73"+
+    "\2\100\1\0\1\101\3\0\1\101\1\21\1\20\1\21"+
+    "\1\22\7\0\15\21\3\0\3\21\3\0\1\103\1\0"+
+    "\1\77\2\74\1\0\1\75\3\0\1\75\4\22\7\0"+
+    "\15\22\3\0\3\22\26\0\1\104\73\0\1\105\16\0"+
+    "\1\64\4\0\4\65\7\0\15\65\3\0\3\65\16\0"+
+    "\4\30\7\0\15\30\3\0\3\30\27\0\1\106\42\0"+
+    "\4\32\7\0\15\32\3\0\3\32\27\0\1\107\42\0"+
+    "\4\36\7\0\15\36\3\0\3\36\24\0\1\26\45\0"+
+    "\4\36\7\0\2\36\1\110\12\36\3\0\3\36\2\0"+
+    "\1\111\67\0\4\43\7\0\15\43\3\0\3\43\26\0"+
+    "\1\112\43\0\4\45\7\0\15\45\3\0\3\45\26\0"+
+    "\1\113\37\0\1\114\57\0\4\50\7\0\15\50\3\0"+
+    "\3\50\11\0\1\115\4\0\4\65\7\0\15\65\3\0"+
+    "\3\65\16\0\4\52\7\0\15\52\3\0\3\52\47\0"+
+    "\1\114\6\0\1\116\63\0\1\117\57\0\4\60\7\0"+
+    "\15\60\3\0\3\60\26\0\1\120\43\0\4\65\7\0"+
+    "\15\65\3\0\3\65\14\0\1\34\1\0\4\121\1\0"+
+    "\3\122\3\0\15\121\3\0\3\121\14\0\1\34\1\0"+
+    "\4\121\1\0\3\122\3\0\3\121\1\123\11\121\3\0"+
+    "\3\121\16\0\1\124\1\0\1\124\10\0\15\124\3\0"+
+    "\3\124\16\0\1\125\1\126\1\127\1\130\7\0\15\125"+
+    "\3\0\3\125\16\0\1\131\1\0\1\131\10\0\15\131"+
+    "\3\0\3\131\16\0\1\132\1\133\1\132\1\133\7\0"+
+    "\15\132\3\0\3\132\16\0\1\134\2\135\1\136\7\0"+
+    "\15\134\3\0\3\134\16\0\1\75\2\137\10\0\15\75"+
+    "\3\0\3\75\16\0\1\140\2\141\1\142\7\0\15\140"+
+    "\3\0\3\140\16\0\4\133\7\0\15\133\3\0\3\133"+
+    "\16\0\1\143\2\144\1\145\7\0\15\143\3\0\3\143"+
+    "\16\0\1\146\2\147\1\150\7\0\15\146\3\0\3\146"+
+    "\16\0\1\151\1\141\1\152\1\142\7\0\15\151\3\0"+
+    "\3\151\16\0\1\153\2\126\1\130\7\0\15\153\3\0"+
+    "\3\153\30\0\1\154\1\155\64\0\1\156\27\0\4\36"+
+    "\7\0\2\36\1\157\12\36\3\0\3\36\2\0\1\160"+
+    "\101\0\1\161\1\162\40\0\4\65\7\0\6\65\1\163"+
+    "\6\65\3\0\3\65\2\0\1\164\63\0\1\165\71\0"+
+    "\1\166\1\167\34\0\1\170\1\0\1\34\1\0\4\121"+
+    "\1\0\3\122\3\0\15\121\3\0\3\121\16\0\4\171"+
+    "\1\0\3\122\3\0\15\171\3\0\3\171\12\0\1\170"+
+    "\1\0\1\34\1\0\4\121\1\0\3\122\3\0\10\121"+
+    "\1\172\4\121\3\0\3\121\2\0\1\70\13\0\1\124"+
+    "\1\0\1\124\10\0\15\124\3\0\3\124\3\0\1\173"+
+    "\1\0\1\77\2\174\6\0\1\125\1\126\1\127\1\130"+
+    "\7\0\15\125\3\0\3\125\3\0\1\175\1\0\1\77"+
+    "\2\176\1\0\1\177\3\0\1\177\3\126\1\130\7\0"+
+    "\15\126\3\0\3\126\3\0\1\200\1\0\1\77\2\176"+
+    "\1\0\1\177\3\0\1\177\1\127\1\126\1\127\1\130"+
+    "\7\0\15\127\3\0\3\127\3\0\1\201\1\0\1\77"+
+    "\2\174\6\0\4\130\7\0\15\130\3\0\3\130\3\0"+
+    "\1\202\2\0\1\202\7\0\1\132\1\133\1\132\1\133"+
+    "\7\0\15\132\3\0\3\132\3\0\1\202\2\0\1\202"+
+    "\7\0\4\133\7\0\15\133\3\0\3\133\3\0\1\174"+
+    "\1\0\1\77\2\174\6\0\1\134\2\135\1\136\7\0"+
+    "\15\134\3\0\3\134\3\0\1\176\1\0\1\77\2\176"+
+    "\1\0\1\177\3\0\1\177\3\135\1\136\7\0\15\135"+
+    "\3\0\3\135\3\0\1\174\1\0\1\77\2\174\6\0"+
+    "\4\136\7\0\15\136\3\0\3\136\3\0\1\177\2\0"+
+    "\2\177\1\0\1\177\3\0\1\177\3\137\10\0\15\137"+
+    "\3\0\3\137\3\0\1\103\1\0\1\77\2\74\1\0"+
+    "\1\75\3\0\1\75\1\140\2\141\1\142\7\0\15\140"+
+    "\3\0\3\140\3\0\1\76\1\0\1\77\2\100\1\0"+
+    "\1\101\3\0\1\101\3\141\1\142\7\0\15\141\3\0"+
+    "\3\141\3\0\1\103\1\0\1\77\2\74\1\0\1\75"+
+    "\3\0\1\75\4\142\7\0\15\142\3\0\3\142\3\0"+
+    "\1\74\1\0\1\77\2\74\1\0\1\75\3\0\1\75"+
+    "\1\143\2\144\1\145\7\0\15\143\3\0\3\143\3\0"+
+    "\1\100\1\0\1\77\2\100\1\0\1\101\3\0\1\101"+
+    "\3\144\1\145\7\0\15\144\3\0\3\144\3\0\1\74"+
+    "\1\0\1\77\2\74\1\0\1\75\3\0\1\75\4\145"+
+    "\7\0\15\145\3\0\3\145\3\0\1\75\2\0\2\75"+
+    "\1\0\1\75\3\0\1\75\1\146\2\147\1\150\7\0"+
+    "\15\146\3\0\3\146\3\0\1\101\2\0\2\101\1\0"+
+    "\1\101\3\0\1\101\3\147\1\150\7\0\15\147\3\0"+
+    "\3\147\3\0\1\75\2\0\2\75\1\0\1\75\3\0"+
+    "\1\75\4\150\7\0\15\150\3\0\3\150\3\0\1\203"+
+    "\1\0\1\77\2\74\1\0\1\75\3\0\1\75\1\151"+
+    "\1\141\1\152\1\142\7\0\15\151\3\0\3\151\3\0"+
+    "\1\204\1\0\1\77\2\100\1\0\1\101\3\0\1\101"+
+    "\1\152\1\141\1\152\1\142\7\0\15\152\3\0\3\152"+
+    "\3\0\1\201\1\0\1\77\2\174\6\0\1\153\2\126"+
+    "\1\130\7\0\15\153\3\0\3\153\31\0\1\155\54\0"+
+    "\1\205\64\0\1\206\26\0\4\36\7\0\15\36\3\0"+
+    "\1\36\1\207\1\36\31\0\1\162\54\0\1\210\35\0"+
+    "\1\34\1\0\4\121\1\0\3\122\3\0\3\121\1\211"+
+    "\11\121\3\0\3\121\2\0\1\212\102\0\1\167\54\0"+
+    "\1\213\34\0\1\214\52\0\1\170\3\0\4\171\7\0"+
+    "\15\171\3\0\3\171\12\0\1\170\1\0\1\215\1\0"+
+    "\4\121\1\0\3\122\3\0\15\121\3\0\3\121\16\0"+
+    "\1\216\1\130\1\216\1\130\7\0\15\216\3\0\3\216"+
+    "\16\0\4\136\7\0\15\136\3\0\3\136\16\0\4\142"+
+    "\7\0\15\142\3\0\3\142\16\0\4\145\7\0\15\145"+
+    "\3\0\3\145\16\0\4\150\7\0\15\150\3\0\3\150"+
+    "\16\0\1\217\1\142\1\217\1\142\7\0\15\217\3\0"+
+    "\3\217\16\0\4\130\7\0\15\130\3\0\3\130\16\0"+
+    "\4\220\7\0\15\220\3\0\3\220\33\0\1\221\61\0"+
+    "\1\222\30\0\4\36\6\0\1\223\15\36\3\0\2\36"+
+    "\1\224\33\0\1\225\32\0\1\170\1\0\1\34\1\0"+
+    "\4\121\1\0\3\122\3\0\10\121\1\226\4\121\3\0"+
+    "\3\121\2\0\1\227\104\0\1\230\36\0\4\231\7\0"+
+    "\15\231\3\0\3\231\3\0\1\173\1\0\1\77\2\174"+
+    "\6\0\1\216\1\130\1\216\1\130\7\0\15\216\3\0"+
+    "\3\216\3\0\1\203\1\0\1\77\2\74\1\0\1\75"+
+    "\3\0\1\75\1\217\1\142\1\217\1\142\7\0\15\217"+
+    "\3\0\3\217\3\0\1\202\2\0\1\202\7\0\4\220"+
+    "\7\0\15\220\3\0\3\220\34\0\1\232\55\0\1\233"+
+    "\26\0\1\234\60\0\4\36\6\0\1\223\15\36\3\0"+
+    "\3\36\34\0\1\235\31\0\1\170\1\0\1\114\1\0"+
+    "\4\121\1\0\3\122\3\0\15\121\3\0\3\121\34\0"+
+    "\1\236\32\0\1\237\2\0\4\231\7\0\15\231\3\0"+
+    "\3\231\35\0\1\240\62\0\1\241\20\0\1\242\77\0"+
+    "\1\243\53\0\1\244\32\0\1\34\1\0\4\171\1\0"+
+    "\3\122\3\0\15\171\3\0\3\171\36\0\1\245\53\0"+
+    "\1\246\33\0\4\247\7\0\15\247\3\0\3\247\36\0"+
+    "\1\250\53\0\1\251\54\0\1\252\61\0\1\253\11\0"+
+    "\1\254\12\0\4\247\7\0\15\247\3\0\3\247\37\0"+
+    "\1\255\53\0\1\256\54\0\1\257\22\0\1\13\62\0"+
+    "\4\260\7\0\15\260\3\0\3\260\40\0\1\261\53\0"+
+    "\1\262\43\0\1\263\26\0\2\260\1\0\2\260\1\0"+
+    "\2\260\2\0\5\260\7\0\15\260\3\0\4\260\27\0"+
+    "\1\264\53\0\1\265\24\0";
 
   private static int [] zzUnpackTrans() {
-    int [] result = new int[7040];
+    int [] result = new int[6908];
     int offset = 0;
     offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
     return result;
@@ -355,8 +351,8 @@ class WikipediaTokenizerImpl {
   private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
 
   private static final String ZZ_ATTRIBUTE_PACKED_0 =
-    "\12\0\1\11\7\1\1\11\3\1\1\11\6\1\1\11"+
-    "\2\1\1\11\14\1\1\11\6\1\2\11\3\0\1\11"+
+    "\12\0\1\11\7\1\1\11\2\1\1\11\5\1\1\11"+
+    "\3\1\1\11\13\1\1\11\5\1\2\11\3\0\1\11"+
     "\14\0\2\1\2\11\1\1\1\0\2\1\1\11\1\0"+
     "\1\1\1\0\1\1\3\0\7\1\2\0\1\1\1\0"+
     "\15\1\3\0\1\1\1\11\3\0\1\1\1\11\5\0"+
@@ -365,7 +361,7 @@ class WikipediaTokenizerImpl {
     "\2\0\3\11";
 
   private static int [] zzUnpackAttribute() {
-    int [] result = new int[184];
+    int [] result = new int[181];
     int offset = 0;
     offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
     return result;
@@ -508,7 +504,6 @@ final void reset() {
 
   /**
    * Creates a new scanner
-   * There is also a java.io.InputStream version of this constructor.
    *
    * @param   in  the java.io.Reader to read input from.
    */
@@ -516,7 +511,6 @@ final void reset() {
     this.zzReader = in;
   }
 
-  
 
   /** 
    * Unpacks the compressed character translation table.

Modified: lucene/dev/branches/lucene5339/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5339/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex?rev=1552377&r1=1552376&r2=1552377&view=diff
==============================================================================
--- lucene/dev/branches/lucene5339/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex (original)
+++ lucene/dev/branches/lucene5339/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex Thu Dec 19 17:48:47 2013
@@ -212,7 +212,7 @@ DOUBLE_EQUALS = "="{2}
   {DOUBLE_BRACE} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;}
   {CITATION} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;}
 //ignore
-  . | {WHITESPACE} |{INFOBOX}                                               {numWikiTokensSeen = 0;  positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
+  [^] |{INFOBOX}                                               {numWikiTokensSeen = 0;  positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
 }
 
 <INTERNAL_LINK_STATE>{
@@ -221,7 +221,7 @@ DOUBLE_EQUALS = "="{2}
   {ALPHANUM} {yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;}
   {DOUBLE_BRACKET_CLOSE} {numLinkToks = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
   //ignore
-  . | {WHITESPACE}                                               { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
+  [^]                                               { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
 }
 
 <EXTERNAL_LINK_STATE>{
@@ -236,7 +236,7 @@ DOUBLE_EQUALS = "="{2}
   {ALPHANUM} {yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;}
   {DOUBLE_BRACKET_CLOSE} {yybegin(YYINITIAL);/* Break so we don't hit fall-through warning: */ break;}
   //ignore
-  . | {WHITESPACE}                                               { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
+  [^]                                               { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
 }
 //italics
 <TWO_SINGLE_QUOTES_STATE>{
@@ -249,7 +249,7 @@ DOUBLE_EQUALS = "="{2}
    {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
 
    //ignore
-  . | {WHITESPACE}                                               { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
+   [^]                                               { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
 }
 //bold
 <THREE_SINGLE_QUOTES_STATE>{
@@ -260,7 +260,7 @@ DOUBLE_EQUALS = "="{2}
    {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
 
    //ignore
-  . | {WHITESPACE}                                               { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
+   [^]                                               { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
 
 }
 //bold italics
@@ -272,7 +272,7 @@ DOUBLE_EQUALS = "="{2}
    {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
 
    //ignore
-  . | {WHITESPACE}                                               { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
+   [^]                                               { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
 }
 
 <DOUBLE_EQUALS_STATE>{
@@ -280,15 +280,15 @@ DOUBLE_EQUALS = "="{2}
  {ALPHANUM} {currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;}
  {DOUBLE_EQUALS} {yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
   //ignore
-  . | {WHITESPACE}                                               { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
+  [^]                                               { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
 }
 
 <DOUBLE_BRACE_STATE>{
   {ALPHANUM} {yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;}
   {DOUBLE_BRACE_CLOSE} {yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
   {CITATION_CLOSE} {yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
-   //ignore
-  . | {WHITESPACE}                                               { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
+  //ignore
+  [^]                                               { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
 }
 
 <STRING> {
@@ -305,7 +305,7 @@ DOUBLE_EQUALS = "="{2}
 
   {PIPE} {yybegin(STRING); return currentTokType;/*pipe*/}
 
-  .|{WHITESPACE}                                              { /* Break so we don't hit fall-through warning: */ break;/* ignore STRING */ }
+  [^]                                              { /* Break so we don't hit fall-through warning: */ break;/* ignore STRING */ }
 }
 
 
@@ -327,7 +327,7 @@ DOUBLE_EQUALS = "="{2}
 //end wikipedia
 
 /** Ignore the rest */
-. | {WHITESPACE}|{TAGS}                                                { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
+[^] | {TAGS}                                          { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
 
 
 //INTERNAL_LINK = "["{2}({ALPHANUM}+{WHITESPACE}*)+"]"{2}

Modified: lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java?rev=1552377&r1=1552376&r2=1552377&view=diff
==============================================================================
--- lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java (original)
+++ lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStandardAnalyzer.java Thu Dec 19 17:48:47 2013
@@ -202,7 +202,7 @@ public class TestStandardAnalyzer extend
   }
   
   public void testUnicodeWordBreaks() throws Exception {
-    WordBreakTestUnicode_6_1_0 wordBreakTest = new WordBreakTestUnicode_6_1_0();
+    WordBreakTestUnicode_6_3_0 wordBreakTest = new WordBreakTestUnicode_6_3_0();
     wordBreakTest.test(a);
   }
   
@@ -230,6 +230,8 @@ public class TestStandardAnalyzer extend
     checkOneTerm(a, "壹゙", "壹゙"); // ideographic
     checkOneTerm(a, "아゙",  "아゙"); // hangul
   }
+  
+  
 
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {

Modified: lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopAnalyzer.java?rev=1552377&r1=1552376&r2=1552377&view=diff
==============================================================================
--- lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopAnalyzer.java (original)
+++ lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopAnalyzer.java Thu Dec 19 17:48:47 2013
@@ -60,7 +60,7 @@ public class TestStopAnalyzer extends Ba
 
   public void testStopList() throws IOException {
     CharArraySet stopWordsSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("good", "test", "analyzer"), false);
-    StopAnalyzer newStop = new StopAnalyzer(Version.LUCENE_40, stopWordsSet);
+    StopAnalyzer newStop = new StopAnalyzer(TEST_VERSION_CURRENT, stopWordsSet);
     try (TokenStream stream = newStop.tokenStream("test", "This is a good test of the english stop analyzer")) {
       assertNotNull(stream);
       CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);

Modified: lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java?rev=1552377&r1=1552376&r2=1552377&view=diff
==============================================================================
--- lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java (original)
+++ lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilter.java Thu Dec 19 17:48:47 2013
@@ -94,7 +94,7 @@ public class TestStopFilter extends Base
   // LUCENE-3849: make sure after .end() we see the "ending" posInc
   public void testEndStopword() throws Exception {
     CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "of");
-    StopFilter stpf = new StopFilter(Version.LUCENE_40, new MockTokenizer(new StringReader("test of"), MockTokenizer.WHITESPACE, false), stopSet);
+    StopFilter stpf = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("test of"), MockTokenizer.WHITESPACE, false), stopSet);
     assertTokenStreamContents(stpf, new String[] { "test" },
                               new int[] {0},
                               new int[] {4},

Modified: lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java?rev=1552377&r1=1552376&r2=1552377&view=diff
==============================================================================
--- lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java (original)
+++ lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUAX29URLEmailTokenizer.java Thu Dec 19 17:48:47 2013
@@ -424,7 +424,7 @@ public class TestUAX29URLEmailTokenizer 
   }
 
   public void testUnicodeWordBreaks() throws Exception {
-    WordBreakTestUnicode_6_1_0 wordBreakTest = new WordBreakTestUnicode_6_1_0();
+    WordBreakTestUnicode_6_3_0 wordBreakTest = new WordBreakTestUnicode_6_3_0();
     wordBreakTest.test(a);
   }
   

Modified: lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/email.addresses.from.random.text.with.email.addresses.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/email.addresses.from.random.text.with.email.addresses.txt?rev=1552377&r1=1552376&r2=1552377&view=diff
==============================================================================
--- lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/email.addresses.from.random.text.with.email.addresses.txt (original)
+++ lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/email.addresses.from.random.text.with.email.addresses.txt Thu Dec 19 17:48:47 2013
@@ -78,13 +78,13 @@ LTLNFsgB@[191.56.104.113]
 iT0LOq.jtPW=G06~cETxl2ge@Ah0.4hn72v.tQ.LU
 VGLn@z3E2.3an2.MM
 TWmfsxn@[112.192.017.029]
-2tP07A@2twe6u0d6uw6o.sed7n.109mx.XN--KGBECHTV
+2tP07A@2twe6u0d6uw6o.sed7n.109mx.XN--KPRW13D
 CjaPC63@['\RDrwk]
 Ayydpdoa@tdgypppmen.wf
 "gfKP9"@jo3-r0.mz
-aTMgDW4@t5gax.XN--0ZWM56D
+aTMgDW4@t5gax.XN--3E0B707E
 mcDrMO3FQ@nwc21.y5qd45lesryrp.IL
-NZqj@v50egeveepk.z290kk.Bc3.xn--jxalpdlp
+NZqj@v50egeveepk.z290kk.Bc3.xn--kprw13d
 XtAhFnq@[218.214.251.103]
 x0S8uos@[109.82.126.233]
 ALB4KFavj16pODdd@i206d6s.MM

Modified: lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/generateJavaUnicodeWordBreakTest.pl
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/generateJavaUnicodeWordBreakTest.pl?rev=1552377&r1=1552376&r2=1552377&view=diff
==============================================================================
--- lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/generateJavaUnicodeWordBreakTest.pl (original)
+++ lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/generateJavaUnicodeWordBreakTest.pl Thu Dec 19 17:48:47 2013
@@ -78,9 +78,10 @@ import org.junit.Ignore;
  *    \\p{Script = Hiragana}
  *    \\p{LineBreak = Complex_Context} (From $line_break_url)
  *    \\p{WordBreak = ALetter}         (From $word_break_url)
+ *    \\p{WordBreak = Hebrew_Letter}
  *    \\p{WordBreak = Katakana}
  *    \\p{WordBreak = Numeric}         (Excludes full-width Arabic digits)
- *    [\\uFF10-\\uFF19]                 (Full-width Arabic digits)
+ *    [\\uFF10-\\uFF19]                (Full-width Arabic digits)
  */
 \@Ignore
 public class ${class_name} extends BaseTokenStreamTestCase {
@@ -97,7 +98,7 @@ parse_Unicode_data_file($line_break_url,
 parse_Unicode_data_file($scripts_url, $codepoints, 
                         {'han' => 1, 'hiragana' => 1});
 parse_Unicode_data_file($word_break_url, $codepoints,
-                        {'aletter' => 1, 'katakana' => 1, 'numeric' => 1});
+                        {'aletter' => 1, 'hebrew_letter' => 1, 'katakana' => 1, 'numeric' => 1});
 my @tests = split /\r?\n/, get_URL_content($word_break_test_url);
 
 my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
@@ -109,25 +110,33 @@ print STDERR "Writing '$output_path'..."
 print OUT $header;
 
 for my $line (@tests) {
-  next if ($line =~ /^\s*\#/);
-  # ÷ 0001 × 0300 ÷  #  ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING GRAVE ACCENT (Extend_FE) ÷ [0.3]
+  next if ($line =~ /^\s*(?:|\#.*)$/); # Skip blank or comment-only lines
+  # Example line: ÷ 0001 × 0300 ÷  #  ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING GRAVE ACCENT (Extend_FE) ÷ [0.3]
   my ($sequence) = $line =~ /^(.*?)\s*\#/;
+  $line =~ s/\t/  /g; # Convert tabs to two spaces (no tabs allowed in Lucene source)
   print OUT "    // $line\n";
   $sequence =~ s/\s*÷\s*$//; # Trim trailing break character
   my $test_string = $sequence;
   $test_string =~ s/\s*÷\s*/\\u/g;
   $test_string =~ s/\s*×\s*/\\u/g;
+  $test_string =~ s/\\u([0-9A-F]{5,})/join('', map { "\\u$_" } above_BMP_char_to_surrogates($1))/ge;
   $test_string =~ s/\\u000A/\\n/g;
   $test_string =~ s/\\u000D/\\r/g;
+  $test_string =~ s/\\u0022/\\\"/g;
   $sequence =~ s/^\s*÷\s*//; # Trim leading break character
   my @tokens = ();
   for my $candidate (split /\s*÷\s*/, $sequence) {
     my @chars = ();
     my $has_wanted_char = 0;
     while ($candidate =~ /([0-9A-F]+)/gi) {
-      push @chars, $1;
+      my $hexchar = $1;
+      if (4 == length($hexchar)) {
+        push @chars, $hexchar;
+      } else {
+        push @chars, above_BMP_char_to_surrogates($hexchar);
+      }
       unless ($has_wanted_char) {
-        $has_wanted_char = 1 if (defined($codepoints->[hex($1)]));
+        $has_wanted_char = 1 if (defined($codepoints->[hex($hexchar)]));
       }
     }
     if ($has_wanted_char) {
@@ -144,6 +153,21 @@ close OUT;
 print STDERR "done.\n";
 
 
+# sub above_BMP_char_to_surrogates
+#
+# Converts the hex representation of a char above the BMP (i.e., greater than
+# 0xFFFF) to its UTF-16 surrogate pair, as two 4-digit uppercase hex strings
+#
+# Assumption: input string is a sequence of more than four hex digits
+#
+sub above_BMP_char_to_surrogates {
+  my $ch = hex(shift); # Code point as an integer
+  my $high_surrogate = 0xD800 + (($ch - 0x10000) >> 10); # Upper bits of the offset from 0x10000
+  my $low_surrogate  = 0xDC00 + ($ch & 0x3FF); # Low 10 bits (0x10000 is a multiple of 0x400, so no subtraction needed)
+  return map { sprintf("%04X", $_) } ($high_surrogate, $low_surrogate); # High surrogate first
+}
+
+
 # sub parse_Unicode_data_file
 #
 # Downloads and parses the specified Unicode data file, parses it, and

Modified: lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/random.text.with.email.addresses.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/random.text.with.email.addresses.txt?rev=1552377&r1=1552376&r2=1552377&view=diff
==============================================================================
--- lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/random.text.with.email.addresses.txt (original)
+++ lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/random.text.with.email.addresses.txt Thu Dec 19 17:48:47 2013
@@ -121,14 +121,14 @@ Bzzzzzzzz! Bzzzzzzzzzzzzzzz! Tell them "
 of LTLNFsgB@[191.56.104.113] all, until it has read it is
 iT0LOq.jtPW=G06~cETxl2ge@Ah0.4hn72v.tQ.LU there. <VG...@z3E2.3an2.MM> Once
 TWmfsxn@[112.192.017.029] Spiros under the place
-2tP07A@2twe6u0d6uw6o.sed7n.109mx.XN--KGBECHTV as were not a house of the
+2tP07A@2twe6u0d6uw6o.sed7n.109mx.XN--KPRW13D as were not a house of the
 rosebushes and the whateverend, feel her waist. She changes everything. We had
 decided to do you know CjaPC63@['\RDrwk] this, is what did leave, pray; let us
 come to, <Ay...@tdgypppmen.wf> what history as died. Strange, Spiros with
 delight: That night "gfKP9"@jo3-r0.mz and gold case
-<aT...@t5gax.XN--0ZWM56D> is spring: the aeon arising, wherein he returned,
+<aT...@t5gax.XN--3E0B707E> is spring: the aeon arising, wherein he returned,
 retraversing the mcDrMO3FQ@nwc21.y5qd45lesryrp.IL gates, first
-<NZ...@v50egeveepk.z290kk.Bc3.xn--jxalpdlp> to reach session. Initiating first
+<NZ...@v50egeveepk.z290kk.Bc3.xn--kprw13d> to reach session. Initiating first
 part of the main hall toward his own spurs. Hes an <XtAhFnq@[218.214.251.103]>
 Irifix And older ones who wins? ADAM: x0S8uos@[109.82.126.233] The violin and
 reality. The hidden set up to come. ROSE WAKINS: No answer. The

Modified: lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/random.text.with.urls.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/random.text.with.urls.txt?rev=1552377&r1=1552376&r2=1552377&view=diff
==============================================================================
--- lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/random.text.with.urls.txt (original)
+++ lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/random.text.with.urls.txt Thu Dec 19 17:48:47 2013
@@ -24,7 +24,7 @@ and Joe recited this iron bars with thei
 almost drove me towards evening. At
 HTTP://173.202.175.16/Md7tF6lj7r/oioJ9TpL8/x%03PjXgMMBC7C3%BDWzoVMzH the
 sergeant and then on the raw
-<Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m> afternoon towards
+<Https://yu7v33rbt.vC6U3.XN--KPRW13D/y%4fMSzkGFlm/wbDF4m> afternoon towards
 the terror, merely wished him as biled
 M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb -- a conciliatory air on in
 <ftp://evzed8zvv.l2xkky.Dq85qcl1.eu:1184/07eY0/3X1OB7gPUk/J8la5OPUY3/y1oTItIs1HFPPp/5Q02N0cPyDH87hSy/jheYGF8s%F3P/%86PmYhi/ViKHoxsHqM8J>
@@ -47,7 +47,7 @@ to live. You didn't know nothing could a
 behind the answer those aids, I saw him in the same appearance of the convict's
 file:///%C5=.%8by/uuFXEaW8.%7E4/DRM%33Kh2xb8u%7FHizfLn/aoF06#7srWW%2EKoFf
 confession, and bring you see? '
-HTTP://yA2O3F.XN--0ZWM56D/qPDTt/MwMXGQq2S7JT/TJ2iCND said my limbs. Joe in an
+HTTP://yA2O3F.XN--3E0B707E/qPDTt/MwMXGQq2S7JT/TJ2iCND said my limbs. Joe in an
 accusatory manner as well known that Joe Gargery marry her cup. `I wonder and
 there was publicly made it was,
 <file:///Gdx5CDZYW%6cnzMJ/7HJ/J%63BSZDXtS/yfWXqq6#> as lookers on; me, I
@@ -63,7 +63,7 @@ again
 FTP://Hi144dz6hctql2n3uom.GE/%1A4OBV%63h/DoA4hpXFmqldOw-MB/PNYoaSDJB2F1k5/Nx%BBEDhrHhcMB
 towards evening. At last, and kneaded, and a dead man taking any. There was
 publicly made out there?' said I,
-ftp://w0yaysrl.XN--9T4B11YI5A/y4FFU%c4F0B/Dh9%D1dGK3bN/EqxueQEsX2p5/xgf4Jxr%D9q/2ubmieRM
+ftp://w0yaysrl.XN--CLCHC0EA0B2G2A9GCD/y4FFU%c4F0B/Dh9%D1dGK3bN/EqxueQEsX2p5/xgf4Jxr%D9q/2ubmieRM
 glancing http://t9wa4.rjcahbc06qmyk9jkhu3f.ZA/vIwW3sc3Pg/Bwmeo6KAjkRY at the
 N54l6e.vu/1m2%8bMFjv/oBdy%36.eL;33/N%d21Qvm/ river wound, twenty miles of the
 number called, hears the awful it lights; here and trimmings of Caesar. This
@@ -155,7 +155,7 @@ ftp://E1cdf-p.XN--MGBERP4A5D4AR:60510/qM
 at me, and that her walking z3ymb.KM/DdnrqoBz=YtxSB away so much of the
 grievous circumstances foreshadowed. After receiving the way, that I thought,
 if she should go to?' `Good again!' cried the
-FTP://7kgip3z.XN--HGBK6AJ7F53BBA:15983/OYEQzIA0 society of a savoury pork pie,
+FTP://7kgip3z.XN--KPRY57D:15983/OYEQzIA0 society of a savoury pork pie,
 and nezt6awdc.lSZDSU14B1OH.4n6nkmjyyj.cc they challenged, hears nothin' all my
 hands in herself, and bring him by hand. `This,' ftp://085.062.055.011/bopfVV/
 said he wore ftp://Mbbn8n.6ge03fiivyc7of.PS/mvb/X8VNt/5WrMZpw/flC6Rs a dog of
@@ -191,7 +191,7 @@ and tingling, and that I had won of the 
 from Richard the furthest end of
 http://ch43n.51rkj.rze.mq/pJjrSAiuSv/3x/EK%59ReZM9w both imp and stung by the
 bright fire, another look
-zQFC1SPO96J.Jy20d8.xn--0zwm56d:863/0OWpT4dpkMURAGe/nFg/LQBUr%3E/af7dO1 over her
+zQFC1SPO96J.Jy20d8.xn--3e0b707e:863/0OWpT4dpkMURAGe/nFg/LQBUr%3E/af7dO1 over her
 best use asking questions, and feet,
 <ftp://Xctk9iigg.cat/u3cX1d/Sx6m3dql/d%46;type=d#0i%3cT1yMkZQ> hanging to try
 back was the poker. `It was not warmly. `Seems
@@ -204,7 +204,7 @@ kitchen wall,
 Ftp://2gifamku.jqv10es.MX/yJ0rhtMYX/Y1Wq%F90RYO1F/NT0%aeAG3/r3Act1 he ate the
 house, end with the Ghost in order): Forty-three pence?' To five hundred
 Gargerys.' `I say, Pip; stay
-7WO6F.XN--11B5BS3A9AJ6G/1L%f9G0NEu/L2lD/mQGNS9UhgCEb out with
+7WO6F.XN--45BRJ9C/1L%f9G0NEu/L2lD/mQGNS9UhgCEb out with
 ftp://mIMU.t4d24n4lyx39.zURN708MCNGK-TJ42GLLBQRJHVENGPO.bw:59930/KmBYQKHfcjNRe/rK3fUjg%0Ad/.zHeVoCaC5/w%A2%F7up9o7J0Eq/ySBVhB
 his shot, and reposing no help to my seat. It was in the kitchen wall, because
 I calculated the sounds by giving me by the name for a rush of Joe's forge
@@ -299,7 +299,7 @@ She drew the kitchen, carrying file:///Y
 wooden hut
 ftp://7vl2w.jp/b%a5fBYyDR/ZN%62LG9aYpjSwn0yWg/nG97gndK%69XZ#fet%55XXZhslTNrq5T
 where it seemed to give Pirrip as
-<79wvzk3.24dyfkxg0f4z-hsqgqqzj2p9n59el0a.XN--DEBA0AD/:8epfLrewivg%488s/2ORX8M3/B0KpeeB/2rbuCnnBF/4P6%1cU6fTGNj/o%3aZMIHdO>
+<79wvzk3.24dyfkxg0f4z-hsqgqqzj2p9n59el0a.XN--FIQS8S/:8epfLrewivg%488s/2ORX8M3/B0KpeeB/2rbuCnnBF/4P6%1cU6fTGNj/o%3aZMIHdO>
 to say, on the guiltily coarse his head, he tried to the
 Uow9.sF.GP/sF3FCFSbCRWGNJY%aaU/DVXA5nIOWmjc6S/FQXdiBw/Y7~cVmpypgft/vU1%D4z
 remark. `There's one sprinkled all I was possible she beggared me. All these
@@ -311,7 +311,7 @@ Http://Ed095eimjy.rlb5698d.kp/_l5uoOO/aA
 he shook her veil so thick nor my milk and would impart all had returned, with
 soap-suds, I had FILE:///#F9Bgl just like thin snow. `Enough of his right side
 of thenceforth sitting
-jyia054.l814D9SNHRRA5RJCCW.kvxga.XN--0ZWM56D/sBbx24%f2Tw2/Sd0Lul0Vg1bbIqW~/lveEw
+jyia054.l814D9SNHRRA5RJCCW.kvxga.XN--3E0B707E/sBbx24%f2Tw2/Sd0Lul0Vg1bbIqW~/lveEw
 in File:///KKfIe63z/BETB.T%C6sG/RcYgnOycg my soul. I sat down on it, I have
 been a spoon that the pie, blacksmith?' asked Estella of it made a mouth wide
 open, and so
@@ -324,7 +324,7 @@ FTP://7qf.hlj.TN/IXOeaf/t%c52Jxwy#YkcAy2
 pointed to Ftp://Gbu5t.HT/xad4fgjaN#GLpU3XQd6%7F(cHIz himself. No glimpse of
 file:///A1omJiPzafgAm/addqzG%dc%62/Lw1mamTg herself, I saw that he would have
 been there, I was too far and uncomfortable by it.
-http://89qw34ksf0qf6iq264of-1nya4ds7qvpixw8c951aw8wcm3.qxk7usa.N8j1frzfgnkbi9y2.XN--9T4B11YI5A/Unwn3/%97gnj0/GQgJC~OFxsdE8ubC7/IWy450/8%7CQVgdI8/soi0BviZt/Zjs%10i5Xh?qi8t9=rBbPok,Si&*Xl=Q+fT&Hx4%D70=84+8W%18+sV2BU6xCDP%47M&Usbms=
+http://89qw34ksf0qf6iq264of-1nya4ds7qvpixw8c951aw8wcm3.qxk7usa.N8j1frzfgnkbi9y2.XN--CLCHC0EA0B2G2A9GCD/Unwn3/%97gnj0/GQgJC~OFxsdE8ubC7/IWy450/8%7CQVgdI8/soi0BviZt/Zjs%10i5Xh?qi8t9=rBbPok,Si&*Xl=Q+fT&Hx4%D70=84+8W%18+sV2BU6xCDP%47M&Usbms=
 Under the Above,' I rather to become transfixed -- he gave me out of the
 kitchen empty-handed, to keep him, I had made a
 Z7tid0uh.eZMOI-M1.umlsyksuzovqdw6wozbd.BW/m%e684OhC/ErAhpGiG subject, if he had
@@ -468,7 +468,7 @@ hard twist upon his -- `Well, boy,' Uncl
 had heard it had hesitated as little window, violently plunging and she had
 committed, and had all about the present calling, which the fingers of tea on
 Saturdays than this country, gentlemen, but I could see those,
-https://nWC9-RIA00RPVL4SSWRICWWX3NH5SMQIA7IPMCK174T30VQBL-M6.XN--0ZWM56D/CwE%e2rWaYZmE?X_coOVl=kqGQ&Pli=MjKg-+wO6Eh+lbbcN&x3M=3kQh99m92mRdf&iiO2wXgQ=qyWVG9G
+https://nWC9-RIA00RPVL4SSWRICWWX3NH5SMQIA7IPMCK174T30VQBL-M6.XN--3E0B707E/CwE%e2rWaYZmE?X_coOVl=kqGQ&Pli=MjKg-+wO6Eh+lbbcN&x3M=3kQh99m92mRdf&iiO2wXgQ=qyWVG9G
 too, if you remember what stock she told me again. `But I know what
 file:///enqvF%EFLOBsZhl8h2z wittles is?' `Yes, ma'am.' `Estella, take me again
 and ftp://133.4.130.192/p%b1LgcONfo%bc&kmH/Ibh6Lq%DCJhnswT%1A refractory
@@ -493,7 +493,7 @@ right-side
 ftp://zxmv98m49669kfvf24o12w3u93wbovfp-1smo6y90e27n133okplcjqrmv-a.CD/JM5RAAY/sJdBntYWuEY4uB7hz/ozRSmFJD/#Xv22:Xvg
 flaxen curls and tables, and a foot of the blacksmith's.' `Halloa!' said Joe,
 staring at that it had withered like a infunt, and took another look about the
-rum <6S8.Crwllo5e3.jmtz.XN--G6W251D/6InlQn/hnhu2f%ac8tX/apq%0D6o/> out at once.
+rum <6S8.Crwllo5e3.jmtz.XN--GECRJ9C/6InlQn/hnhu2f%ac8tX/apq%0D6o/> out at once.
 Three Jolly Bargemen to think she seemed to tell you were. When we saw the file
 coming at my slice. I have mentioned it with the wooden hut where we had got up
 trying to file:///gVW/nnRNxPfMXKb%72Aq%4A hand. If ever grateful for. If a
@@ -662,7 +662,7 @@ open,' he
 https://227.086.128.010:64985/MDKuFInA86qto5/_cK=4S%49Ic/SPp76/TlV%0Arlwfx/
 wiped the liquor. He was the bad; and some one
 Ftp://171.160.94.43/ALTgS46I4VM/55PbbK/5N%faTSE another
-Ftp://3zd7z.etw.XN--JXALPDLP/4UztCuTbW2z/LL%2cDI/dTYSi9 turned to put straws
+Ftp://3zd7z.etw.XN--KPRW13D/4UztCuTbW2z/LL%2cDI/dTYSi9 turned to put straws
 down by a most powerfully down
 t6xfr.wxjz5p2t5.zl8m4.MN/2cbpjk/gsdm/5Mvc-j3rc/16Wb65&c7x to me, and all that
 know the window,
@@ -993,7 +993,7 @@ upon a door, which was gobbling mincemea
 that Joe's blue file:///EYS2nDf%9671qsm34OZeB%e5lUA/rYBDn0DKs0/ eyes, had an
 hour longer than at me, and dismal, and gloves, and that's further than I
 mpuwl0.BA/MkvAvc?j%11K4=9gE%613&qOOEP0t=g7EXs looked on. `Now, boy!
-g6tylc0.daeczh.4q.XN--9T4B11YI5A/1SbCR9cX1%3D/YfP8CpLKn5KzTL8/Kj11z%B7OuqJU;qM4P
+g6tylc0.daeczh.4q.XN--CLCHC0EA0B2G2A9GCD/1SbCR9cX1%3D/YfP8CpLKn5KzTL8/Kj11z%B7OuqJU;qM4P
 Why, here's a ridiculous old chap. And looked up by hand. `Why don't like
 `sulks.' Therefore, I was in such game?' Everybody, myself drifting down his
 chest and he had made me worse by-and-by. I was a
@@ -1035,7 +1035,7 @@ in every word out again. `You are prison
 <HTTPS://bF2RA.kw/1TA9pTTBg/nM/VSRo%85Kt?%62mxNfo=HDowgwkM3&9oPOLH2=yKOxIe+YNtt>
 for us heavy. `I Bolted, myself, 5.Piba4ac.JE/55M1H/AZXdj and thread, and we
 after him, or to inspire confidence. This was brought you spoke all the act, he
-couldn't m-k6-ej7x.XN--HLCJ6AYA9ESC7A/suVrNQSIj9/TmRhHbe/o&0dbqR/ keep the fire
+couldn't m-k6-ej7x.XN--J6W193G/suVrNQSIj9/TmRhHbe/o&0dbqR/ keep the fire
 between the forge was <ftp://242.228.138.8/o%CC_QjILS%17aYH/%caw8CcVZyPRZ/>
 busy in it. Until
 hGE9YH3D6.SD/m%1EpDJrzO/Tf2Xxqq8L/YJT7BTEY%661PvcMgOr/29ZbuJuWl6q/ she jammed
@@ -1329,7 +1329,7 @@ sort Http://w9ys35.wb55p6l.hxl.rs/Y97%58
 FILE://155.24.106.255/3VEZIT7 if it was to him, I might not do not afraid of
 report, and looking rather to make nothing of a confidential voice,
 d1y8zvhwq40bi3tom.hPCZ.gJ-286X.TG/ayWKrgAvF6tn/L4SgquZT6C/1DmNe/CI69rJ/%f6QrzZGkSQ
-as lda5l5wc.XN--HGBK6AJ7F53BBA/pr80SSZ/eNM1%D50lp/Rc%8EimOET if he would be
+as lda5l5wc.XN--KPRY57D/pr80SSZ/eNM1%D50lp/Rc%8EimOET if he would be
 supposed,' said the wind and so we were read the conversation consisted of it
 had so that we saw some bread, some
 l13t2t.sk/O%2BmRkw/@0AgGL@NX/wgt&aggDcp#0IYe'C brandy out: no black velvet

Modified: lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/urls.from.random.text.with.urls.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/urls.from.random.text.with.urls.txt?rev=1552377&r1=1552376&r2=1552377&view=diff
==============================================================================
--- lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/urls.from.random.text.with.urls.txt (original)
+++ lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/urls.from.random.text.with.urls.txt Thu Dec 19 17:48:47 2013
@@ -10,7 +10,7 @@ http://Rcbu6/Oxc%C0IkGSZ8rO9IUpd/BEvkvw3
 file:///2CdsP/U2GCLT
 Http://Pzw978uzb.ai/yB;mt/o8hVKG/%231Y/Xb1%bb6v1fhjfdkfkBvxed?8mq~=OvF&STpJJk=ws0ZO&0DRA=
 HTTP://173.202.175.16/Md7tF6lj7r/oioJ9TpL8/x%03PjXgMMBC7C3%BDWzoVMzH
-Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m
+Https://yu7v33rbt.vC6U3.XN--KPRW13D/y%4fMSzkGFlm/wbDF4m
 M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb
 ftp://evzed8zvv.l2xkky.Dq85qcl1.eu:1184/07eY0/3X1OB7gPUk/J8la5OPUY3/y1oTItIs1HFPPp/5Q02N0cPyDH87hSy/jheYGF8s%F3P/%86PmYhi/ViKHoxsHqM8J
 ftp://213.7.210.47/%e5pFkj6e6Jczc/ypJGG/z%663jYR/37IxLQBPr/Ciq50EUIdueyj
@@ -23,13 +23,13 @@ Ftp://Xmswrxn8d-1s.pe.gm/dB6C3xTk%D3x/EK
 FILE:///rKnQkS0MAF#tM%53_2%03%d6ZICH
 ftp://R5ecjkf1yx4wpskfh.tv0y3m90ak.0R605.se:51297/zpWcRRcG/1woSqw7ZUko/
 file:///%C5=.%8by/uuFXEaW8.%7E4/DRM%33Kh2xb8u%7FHizfLn/aoF06#7srWW%2EKoFf
-HTTP://yA2O3F.XN--0ZWM56D/qPDTt/MwMXGQq2S7JT/TJ2iCND
+HTTP://yA2O3F.XN--3E0B707E/qPDTt/MwMXGQq2S7JT/TJ2iCND
 file:///Gdx5CDZYW%6cnzMJ/7HJ/J%63BSZDXtS/yfWXqq6#
 http://1qvgjd1.TP/7oq5gWW/Gwqf8fxBXR4/?Br,q=ayMz0&1IO%370N7=;Sl1czc2L+5bRISfD+w&ygP3FhV%E1w36=2Rx
 ftp://5SCC6BUYP.Knf1cvlc22z9.1dc3rixt5ugyq4/5OnYTSN/QpCdo/t3zqkI/pn5skT/oJgrGy7
 http://2dkbeuwsto3i3e8jaxi6su9wjlmwygtpdp7g65611z-2bbr82uhjqkdv2jrh7.KZ/FiSvI/aaB&dPQ%42kLdM
 FTP://Hi144dz6hctql2n3uom.GE/%1A4OBV%63h/DoA4hpXFmqldOw-MB/PNYoaSDJB2F1k5/Nx%BBEDhrHhcMB
-ftp://w0yaysrl.XN--9T4B11YI5A/y4FFU%c4F0B/Dh9%D1dGK3bN/EqxueQEsX2p5/xgf4Jxr%D9q/2ubmieRM
+ftp://w0yaysrl.XN--CLCHC0EA0B2G2A9GCD/y4FFU%c4F0B/Dh9%D1dGK3bN/EqxueQEsX2p5/xgf4Jxr%D9q/2ubmieRM
 http://t9wa4.rjcahbc06qmyk9jkhu3f.ZA/vIwW3sc3Pg/Bwmeo6KAjkRY
 N54l6e.vu/1m2%8bMFjv/oBdy%36.eL;33/N%d21Qvm/
 http://ah-2d4.ASIA/qmp
@@ -75,7 +75,7 @@ http://4u3o/BKdhwRyzG
 file:///LdsHfPABFz1vRD1OB6Yl/RS6&1Gmz/mfYul/
 ftp://E1cdf-p.XN--MGBERP4A5D4AR:60510/qMaw4kSSgYM/7jgIuL/gSVW6O91/2bhnsj/kl7R5sgn6&X5EiZdZ0WhTX3T/fa%f3Azz
 z3ymb.KM/DdnrqoBz=YtxSB
-FTP://7kgip3z.XN--HGBK6AJ7F53BBA:15983/OYEQzIA0
+FTP://7kgip3z.XN--KPRY57D:15983/OYEQzIA0
 nezt6awdc.lSZDSU14B1OH.4n6nkmjyyj.cc
 ftp://085.062.055.011/bopfVV/
 ftp://Mbbn8n.6ge03fiivyc7of.PS/mvb/X8VNt/5WrMZpw/flC6Rs
@@ -93,12 +93,12 @@ https://[3790:ad57:0B63::e5f7:f6ac:164C]
 bl60k0jqkc9.oow84o1.BF/Xly5cTna/BzoQuHi3r8e/o5BDNrvT/=6HRdBjH/Mrp5%02/p%e9pT2Ae
 ftp://Bs3ceuxd8ii66gt.X8wwdpt.BB:27095/3BfkvfzcmTS/FTffh&S/gIWvJ5Kd/AlOQ%3EnO
 http://ch43n.51rkj.rze.mq/pJjrSAiuSv/3x/EK%59ReZM9w
-zQFC1SPO96J.Jy20d8.xn--0zwm56d:863/0OWpT4dpkMURAGe/nFg/LQBUr%3E/af7dO1
+zQFC1SPO96J.Jy20d8.xn--3e0b707e:863/0OWpT4dpkMURAGe/nFg/LQBUr%3E/af7dO1
 ftp://Xctk9iigg.cat/u3cX1d/Sx6m3dql/d%46;type=d#0i%3cT1yMkZQ
 HTTPS://56aderic0knmip9lkqdqag14.uk:45885/lELiK:/vF%4C5Enwqy/P5NGJ2b/dD6sg1yMV
 ftp://vlt.3g45k63viz2.tcnm3.UA:60664/AJ9iqYk%c1/uKbohn2/K%D1kequ4z8rxFpJ
 Ftp://2gifamku.jqv10es.MX/yJ0rhtMYX/Y1Wq%F90RYO1F/NT0%aeAG3/r3Act1
-7WO6F.XN--11B5BS3A9AJ6G/1L%f9G0NEu/L2lD/mQGNS9UhgCEb
+7WO6F.XN--45BRJ9C/1L%f9G0NEu/L2lD/mQGNS9UhgCEb
 ftp://mIMU.t4d24n4lyx39.zURN708MCNGK-TJ42GLLBQRJHVENGPO.bw:59930/KmBYQKHfcjNRe/rK3fUjg%0Ad/.zHeVoCaC5/w%A2%F7up9o7J0Eq/ySBVhB
 ftp://lv56pdepzu0b0fo-04qtxv5tt2jc0nsaukrhtz5-e3u1vcb517y3b135zl.e0r1hson.dk/3TVoqjp6%1FCFSkt/006VZfho/gxrWxgDawM3Uk
 Ftp://7n977.Niyt.2fgkzfhj.q7-DJ.Ow7a.it/5zfRi3PO8/1zfKT9%421tP/?SazEijJq%710COQKWeLE/TdUc%b2u/2AxBw9%4BUN6Zp4Z/KfUZd1MTdPv/L4m1tI3/WJvcK1
@@ -147,20 +147,20 @@ ftp://Lq.es/%B1ZPdTZgB2mNFW/qre92rM
 file:///IZ47ESCtX%aatQab1/V553gjR?Me/#9%68qPw
 file:///Y?GG/BBqMPBJ/nsxX3qP/8P24WdqBxH
 ftp://7vl2w.jp/b%a5fBYyDR/ZN%62LG9aYpjSwn0yWg/nG97gndK%69XZ#fet%55XXZhslTNrq5T
-79wvzk3.24dyfkxg0f4z-hsqgqqzj2p9n59el0a.XN--DEBA0AD/:8epfLrewivg%488s/2ORX8M3/B0KpeeB/2rbuCnnBF/4P6%1cU6fTGNj/o%3aZMIHdO
+79wvzk3.24dyfkxg0f4z-hsqgqqzj2p9n59el0a.XN--FIQS8S/:8epfLrewivg%488s/2ORX8M3/B0KpeeB/2rbuCnnBF/4P6%1cU6fTGNj/o%3aZMIHdO
 Uow9.sF.GP/sF3FCFSbCRWGNJY%aaU/DVXA5nIOWmjc6S/FQXdiBw/Y7~cVmpypgft/vU1%D4z
 ftp://[fd77:4982:C37F:a0a1:7651:E09C:117.093.145.017]/2l91g/s%79lJmUiZ/%A5R2qsJ
 [62c0::]/d1lmSzoB/5OBVnzn/kOXW%D23
 Http://Ed095eimjy.rlb5698d.kp/_l5uoOO/aA494s?3nSxdIpE=y%79qu+2un1hGR&J%76=8&L%bed=uY5hO+s+IKk1S&Q=HHXEC+Gof86QIRHy&35QY5=
 FILE:///#F9Bgl
-jyia054.l814D9SNHRRA5RJCCW.kvxga.XN--0ZWM56D/sBbx24%f2Tw2/Sd0Lul0Vg1bbIqW~/lveEw
+jyia054.l814D9SNHRRA5RJCCW.kvxga.XN--3E0B707E/sBbx24%f2Tw2/Sd0Lul0Vg1bbIqW~/lveEw
 File:///KKfIe63z/BETB.T%C6sG/RcYgnOycg
 ftp://892f7.oel50j.32.9qj1p-g7lgw.MR:48021/XNKbk2PZQXSvOuGnOAnATDt3/XfHyJtvoC/PW7YrSgf#LmGWJgPw
 http://sisas.ua/4CU60ZLK4VgY8AR89
 FTP://7qf.hlj.TN/IXOeaf/t%c52Jxwy#YkcAy2
 Ftp://Gbu5t.HT/xad4fgjaN#GLpU3XQd6%7F(cHIz
 file:///A1omJiPzafgAm/addqzG%dc%62/Lw1mamTg
-http://89qw34ksf0qf6iq264of-1nya4ds7qvpixw8c951aw8wcm3.qxk7usa.N8j1frzfgnkbi9y2.XN--9T4B11YI5A/Unwn3/%97gnj0/GQgJC~OFxsdE8ubC7/IWy450/8%7CQVgdI8/soi0BviZt/Zjs%10i5Xh?qi8t9=rBbPok,Si&*Xl=Q+fT&Hx4%D70=84+8W%18+sV2BU6xCDP%47M&Usbms=
+http://89qw34ksf0qf6iq264of-1nya4ds7qvpixw8c951aw8wcm3.qxk7usa.N8j1frzfgnkbi9y2.XN--CLCHC0EA0B2G2A9GCD/Unwn3/%97gnj0/GQgJC~OFxsdE8ubC7/IWy450/8%7CQVgdI8/soi0BviZt/Zjs%10i5Xh?qi8t9=rBbPok,Si&*Xl=Q+fT&Hx4%D70=84+8W%18+sV2BU6xCDP%47M&Usbms=
 Z7tid0uh.eZMOI-M1.umlsyksuzovqdw6wozbd.BW/m%e684OhC/ErAhpGiG
 ftp://tw7d-6yu.im:2055/%66qbqzss/OmPGW;type=d
 FTP://zst.tn/QcUpaA/VKvJ2/JN6AKew/iXYIiHm7mfPFmD%21E5/yTQpoiqdbaaS1/LnzOX#VqsobH
@@ -228,7 +228,7 @@ file:///UIIGOxv6jvF2%c0/%A8J3%677Gmq8im1
 http://Qhk9z.zm/cOGBen/mBsDycEI5V7L1s%84WUj7863/p%5f~okuRD51b0M?b%F2d%67ujGr=oh8PWUtK&j6uX7baX=&sg3RUocA9W=m5IaF&JWH9G=fyiOtnC3+7RJA+ippw96rvu+BxtGg&F6f1=jmPS&3PE0xX5=TGV%5c5J&%fc@NSEynhuvb=&MkRIt33=
 Http://[98cc:433d:2C25:62dd:54ba:d10b:63d3:4C40]/YlbNrJod/fdjuN/qYqSdqr5/KAbXYHO%F0m7Ws9
 file:///ywFY5HK/XAv@v%66o/M2O4Wlny50hypf5%02A8
-https://nWC9-RIA00RPVL4SSWRICWWX3NH5SMQIA7IPMCK174T30VQBL-M6.XN--0ZWM56D/CwE%e2rWaYZmE?X_coOVl=kqGQ&Pli=MjKg-+wO6Eh+lbbcN&x3M=3kQh99m92mRdf&iiO2wXgQ=qyWVG9G
+https://nWC9-RIA00RPVL4SSWRICWWX3NH5SMQIA7IPMCK174T30VQBL-M6.XN--3E0B707E/CwE%e2rWaYZmE?X_coOVl=kqGQ&Pli=MjKg-+wO6Eh+lbbcN&x3M=3kQh99m92mRdf&iiO2wXgQ=qyWVG9G
 file:///enqvF%EFLOBsZhl8h2z
 ftp://133.4.130.192/p%b1LgcONfo%bc&kmH/Ibh6Lq%DCJhnswT%1A
 ftp://1xf.ipl4f0y6c4.VA/LHuq~/p2nPbE/0YGGNJB%DEje2psef_B/aKOuMl1Q9
@@ -240,7 +240,7 @@ http://nEN5ZN.EG/%0efsf4v30L
 file:///19%9947/ksd3Sq7W78%27/2K_Ylzcu2q
 r8sht9qzsc1e2wp.ci/8SbPwlW%5ac/qKEqFi0Q
 ftp://zxmv98m49669kfvf24o12w3u93wbovfp-1smo6y90e27n133okplcjqrmv-a.CD/JM5RAAY/sJdBntYWuEY4uB7hz/ozRSmFJD/#Xv22:Xvg
-6S8.Crwllo5e3.jmtz.XN--G6W251D/6InlQn/hnhu2f%ac8tX/apq%0D6o/
+6S8.Crwllo5e3.jmtz.XN--GECRJ9C/6InlQn/hnhu2f%ac8tX/apq%0D6o/
 file:///gVW/nnRNxPfMXKb%72Aq%4A
 file:///Fzza388TQ
 file:///
@@ -314,7 +314,7 @@ file:///3%aexrb7UdZ5GpR4ZIfoxwL/vQV%4a2z
 f5ms.jp/%A1FpERWwTd%BFG/ExC8V5aqx5l2CLJr0mJb5u/DgMvEzAr2U/py9Vg/igr9PzANtw/FFiN1E7
 https://227.086.128.010:64985/MDKuFInA86qto5/_cK=4S%49Ic/SPp76/TlV%0Arlwfx/
 Ftp://171.160.94.43/ALTgS46I4VM/55PbbK/5N%faTSE
-Ftp://3zd7z.etw.XN--JXALPDLP/4UztCuTbW2z/LL%2cDI/dTYSi9
+Ftp://3zd7z.etw.XN--KPRW13D/4UztCuTbW2z/LL%2cDI/dTYSi9
 t6xfr.wxjz5p2t5.zl8m4.MN/2cbpjk/gsdm/5Mvc-j3rc/16Wb65&c7x
 ftp://D02-auxxaeqnv9ve-jlmo3.l10vqu.12jl.2mvjwrsqm.BA/r71QLLNu6oGJjG/HbxrX1Grq8/QR%2agZv4hR
 file:///XoCg%EDVf/A3ibJYjU
@@ -476,7 +476,7 @@ ftp://53.151.134.240/uZqGXLUIu-J/=%0C2pO
 FILE:///Kywof5D5q/0TRS/zayrkrnENB
 file:///EYS2nDf%9671qsm34OZeB%e5lUA/rYBDn0DKs0/
 mpuwl0.BA/MkvAvc?j%11K4=9gE%613&qOOEP0t=g7EXs
-g6tylc0.daeczh.4q.XN--9T4B11YI5A/1SbCR9cX1%3D/YfP8CpLKn5KzTL8/Kj11z%B7OuqJU;qM4P
+g6tylc0.daeczh.4q.XN--CLCHC0EA0B2G2A9GCD/1SbCR9cX1%3D/YfP8CpLKn5KzTL8/Kj11z%B7OuqJU;qM4P
 file:///TJa%86AczeCmM5QMhi/Wox~Ajl/WxUF%5eSA:y%0fD%E21/x%cca%d3Qgx/8iWJ5-h%26/fCK%01nQNrK8#ygTTB
 file:///~%303cUUVYTEaQU5%5DXbogiPKb/favR2rETEh/9TXM%15u/nYCOZpZgL
 file:///mJM%a1/jv5%53QDqE/bFMu0CBp
@@ -496,7 +496,7 @@ http://gpu16lz.LS/9e%daJrwQfHEpFvsZ3jx/c
 file://ij9anjtok86ro.uN-BGDQ855IB.sDXAQR.5kr8kz.3J3M8XRM.18r3s0g-6.4rjsmwue0lwao0og17d-5-1.F1h3qgkul29yw2t4p4se5clomncxhmoy.g6c9tbz7.pa/5LMtmbl/1tfIF/pBOV7Hc
 HTTPS://bF2RA.kw/1TA9pTTBg/nM/VSRo%85Kt?%62mxNfo=HDowgwkM3&9oPOLH2=yKOxIe+YNtt
 5.Piba4ac.JE/55M1H/AZXdj
-m-k6-ej7x.XN--HLCJ6AYA9ESC7A/suVrNQSIj9/TmRhHbe/o&0dbqR/
+m-k6-ej7x.XN--J6W193G/suVrNQSIj9/TmRhHbe/o&0dbqR/
 ftp://242.228.138.8/o%CC_QjILS%17aYH/%caw8CcVZyPRZ/
 hGE9YH3D6.SD/m%1EpDJrzO/Tf2Xxqq8L/YJT7BTEY%661PvcMgOr/29ZbuJuWl6q/
 Ftp://mez27g2tpmk.MC/%B8AHk%95etDns%46/gXbsCn%6C-/s8_Jmy/DhmfT~Di6KD
@@ -633,7 +633,7 @@ http://047.014.184.200/Z_QdOwjzfBue4Nt/a
 Http://w9ys35.wb55p6l.hxl.rs/Y97%58Lp8JjLZw/5L
 FILE://155.24.106.255/3VEZIT7
 d1y8zvhwq40bi3tom.hPCZ.gJ-286X.TG/ayWKrgAvF6tn/L4SgquZT6C/1DmNe/CI69rJ/%f6QrzZGkSQ
-lda5l5wc.XN--HGBK6AJ7F53BBA/pr80SSZ/eNM1%D50lp/Rc%8EimOET
+lda5l5wc.XN--KPRY57D/pr80SSZ/eNM1%D50lp/Rc%8EimOET
 l13t2t.sk/O%2BmRkw/@0AgGL@NX/wgt&aggDcp#0IYe'C
 FILE://a6ys9a4.xj.BY/%99BGXp/F=yJtxc71/gvXuHuB9k
 212.072.006.032/6kV8ce%2e/%e7lzm-HB%4artP/zg6tWMW7RIG?U7=HAXw$D3sM%7DyDJ&Gt=

Modified: lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizerFactory.java?rev=1552377&r1=1552376&r2=1552377&view=diff
==============================================================================
--- lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizerFactory.java (original)
+++ lucene/dev/branches/lucene5339/lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizerFactory.java Thu Dec 19 17:48:47 2013
@@ -75,7 +75,7 @@ public class TestUAX29URLEmailTokenizerF
         + " samba Halta gamba "
         + "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R\n"
         + "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb\n"
-        + "Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m"
+        + "Https://yu7v33rbt.vC6U3.XN--KPRW13D/y%4fMSzkGFlm/wbDF4m"
         + " inter Locutio "
         + "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/\n"
         + "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7"
@@ -91,7 +91,7 @@ public class TestUAX29URLEmailTokenizerF
           "samba", "Halta", "gamba",
           "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R",
           "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb",
-          "Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m",
+          "Https://yu7v33rbt.vC6U3.XN--KPRW13D/y%4fMSzkGFlm/wbDF4m",
           "inter", "Locutio",
           "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/",
           "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7",

Modified: lucene/dev/branches/lucene5339/lucene/analysis/common/src/tools/java/org/apache/lucene/analysis/standard/GenerateJflexTLDMacros.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5339/lucene/analysis/common/src/tools/java/org/apache/lucene/analysis/standard/GenerateJflexTLDMacros.java?rev=1552377&r1=1552376&r2=1552377&view=diff
==============================================================================
--- lucene/dev/branches/lucene5339/lucene/analysis/common/src/tools/java/org/apache/lucene/analysis/standard/GenerateJflexTLDMacros.java (original)
+++ lucene/dev/branches/lucene5339/lucene/analysis/common/src/tools/java/org/apache/lucene/analysis/standard/GenerateJflexTLDMacros.java Thu Dec 19 17:48:47 2013
@@ -60,20 +60,21 @@ public class GenerateJflexTLDMacros {
   
   private static final String APACHE_LICENSE 
     = "/*" + NL
-      + " * Copyright 2001-2005 The Apache Software Foundation." + NL
-      + " *" + NL
-      + " * Licensed under the Apache License, Version 2.0 (the \"License\");" + NL
-      + " * you may not use this file except in compliance with the License." + NL
-      + " * You may obtain a copy of the License at" + NL
-      + " *" + NL
-      + " *      http://www.apache.org/licenses/LICENSE-2.0" + NL
-      + " *" + NL
-      + " * Unless required by applicable law or agreed to in writing, software" + NL
-      + " * distributed under the License is distributed on an \"AS IS\" BASIS," + NL
-      + " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL
-      + " * See the License for the specific language governing permissions and" + NL
-      + " * limitations under the License." + NL
-      + " */" + NL + NL;
+    + " * Licensed to the Apache Software Foundation (ASF) under one or more" + NL
+    + " * contributor license agreements.  See the NOTICE file distributed with" + NL
+    + " * this work for additional information regarding copyright ownership." + NL
+    + " * The ASF licenses this file to You under the Apache License, Version 2.0" + NL
+    + " * (the \"License\"); you may not use this file except in compliance with" + NL
+    + " * the License.  You may obtain a copy of the License at" + NL
+    + " *" + NL
+    + " *     http://www.apache.org/licenses/LICENSE-2.0" + NL
+    + " *" + NL
+    + " * Unless required by applicable law or agreed to in writing, software" + NL
+    + " * distributed under the License is distributed on an \"AS IS\" BASIS," + NL
+    + " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL
+    + " * See the License for the specific language governing permissions and" + NL
+    + " * limitations under the License." + NL
+    + " */" + NL;
     
   private static final Pattern TLD_PATTERN_1 
     = Pattern.compile("([-A-Za-z0-9]+)\\.\\s+NS\\s+.*");

Modified: lucene/dev/branches/lucene5339/lucene/analysis/icu/src/data/uax29/Default.rbbi
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5339/lucene/analysis/icu/src/data/uax29/Default.rbbi?rev=1552377&r1=1552376&r2=1552377&view=diff
==============================================================================
--- lucene/dev/branches/lucene5339/lucene/analysis/icu/src/data/uax29/Default.rbbi (original)
+++ lucene/dev/branches/lucene5339/lucene/analysis/icu/src/data/uax29/Default.rbbi Thu Dec 19 17:48:47 2013
@@ -14,27 +14,52 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-# Default RBBI rules, based on UAX#29.
+# This file is from ICU (with some small modifications, to avoid CJK dictionary break)
 #
+# Copyright (C) 2002-2013, International Business Machines Corporation 
+# and others. All Rights Reserved.
+#
+# file:  word.txt
+#
+# ICU Word Break Rules
+#      See Unicode Standard Annex #29.
+#      These rules are based on UAX #29 Revision 22 for Unicode Version 6.3
+#
+# Note:  Updates to word.txt will usually need to be merged into
+#        word_POSIX.txt also.
+
+##############################################################################
+#
+#  Character class definitions from TR 29
+#
+##############################################################################
 
 !!chain;
 
+
 #
 #  Character Class Definitions.
 #
 
-$CR           = [\p{Word_Break = CR}];
-$LF           = [\p{Word_Break = LF}];
-$Newline      = [\p{Word_Break = Newline}];
-$Extend       = [\p{Word_Break = Extend}];
-$Format       = [\p{Word_Break = Format}];
-$Katakana     = [\p{Word_Break = Katakana}];
-$ALetter      = [\p{Word_Break = ALetter}];
-$MidNumLet    = [\p{Word_Break = MidNumLet}];
-$MidLetter    = [\p{Word_Break = MidLetter}];
-$MidNum       = [\p{Word_Break = MidNum}];
-$Numeric      = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
-$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
+$CR                 = [\p{Word_Break = CR}];
+$LF                 = [\p{Word_Break = LF}];
+$Newline            = [\p{Word_Break = Newline}];
+$Extend             = [\p{Word_Break = Extend}];
+$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
+$Format             = [\p{Word_Break = Format}];
+$Katakana           = [\p{Word_Break = Katakana}];
+$Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
+$ALetter            = [\p{Word_Break = ALetter}];
+$Single_Quote       = [\p{Word_Break = Single_Quote}];
+$Double_Quote       = [\p{Word_Break = Double_Quote}];
+$MidNumLet          = [\p{Word_Break = MidNumLet}];
+$MidLetter          = [\p{Word_Break = MidLetter}];
+$MidNum             = [\p{Word_Break = MidNum}];
+$Numeric            = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
+$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
+
+$Han                = [:Han:];
+$Hiragana           = [:Hiragana:];
 
 
 #   Dictionary character set, for triggering language-based break engines. Currently
@@ -42,24 +67,34 @@ $ExtendNumLet = [\p{Word_Break = ExtendN
 #   5.0 or later as the definition of Complex_Context was corrected to include all
 #   characters requiring dictionary break.
 
-$dictionary   = [:LineBreak = Complex_Context:];
 $Control        = [\p{Grapheme_Cluster_Break = Control}]; 
-$ALetterPlus  = [$ALetter [$dictionary-$Extend-$Control]];   # Note:  default ALetter does not
-                                                             #  include the dictionary characters.
+$HangulSyllable = [\uac00-\ud7a3];
+$ComplexContext = [:LineBreak = Complex_Context:];
+$KanaKanji      = [$Han $Hiragana $Katakana];
+$dictionaryCJK  = [$Han $Hiragana $HangulSyllable];
+$dictionary     = [$ComplexContext];
+
+# leave CJK scripts out of ALetterPlus
+$ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
+
 
 #
 #  Rules 4    Ignore Format and Extend characters, 
 #             except when they appear at the beginning of a region of text.
 #
-$KatakanaEx     = $Katakana     ($Extend |  $Format)*;
-$ALetterEx      = $ALetterPlus  ($Extend |  $Format)*;
-$MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
-$MidLetterEx    = $MidLetter    ($Extend |  $Format)*;
-$MidNumEx       = $MidNum       ($Extend |  $Format)*;
-$NumericEx      = $Numeric      ($Extend |  $Format)*;
-$ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;
+# TODO: check if handling of katakana in dictionary makes rules incorrect/void
+$KatakanaEx           = $Katakana           ($Extend |  $Format)*;
+$Hebrew_LetterEx      = $Hebrew_Letter      ($Extend |  $Format)*;
+$ALetterEx            = $ALetterPlus        ($Extend |  $Format)*;
+$Single_QuoteEx       = $Single_Quote       ($Extend |  $Format)*;
+$Double_QuoteEx       = $Double_Quote       ($Extend |  $Format)*;
+$MidNumLetEx          = $MidNumLet          ($Extend |  $Format)*;
+$MidLetterEx          = $MidLetter          ($Extend |  $Format)*;
+$MidNumEx             = $MidNum             ($Extend |  $Format)*;
+$NumericEx            = $Numeric            ($Extend |  $Format)*;
+$ExtendNumLetEx       = $ExtendNumLet       ($Extend |  $Format)*;
+$Regional_IndicatorEx = $Regional_Indicator ($Extend |  $Format)*;
 
-$Hiragana       = [\p{script=Hiragana}];
 $Ideographic    = [\p{Ideographic}];
 $HiraganaEx     = $Hiragana     ($Extend |  $Format)*;
 $IdeographicEx  = $Ideographic  ($Extend |  $Format)*;
@@ -77,23 +112,31 @@ $CR $LF;
 #          of a region of Text.   The rule here comes into play when the start of text
 #          begins with a group of Format chars, or with a "word" consisting of a single
 #          char that is not in any of the listed word break categories followed by
-#          format char(s).
+#          format char(s), or is not a CJK dictionary character.
 [^$CR $LF $Newline]? ($Extend |  $Format)+;
 
 $NumericEx {100};
 $ALetterEx {200};
+$HangulSyllable {200};
+$Hebrew_LetterEx{200};
 $KatakanaEx {300};       # note:  these status values override those from rule 5
-$HiraganaEx {300};       #        by virtual of being numerically larger.
+$HiraganaEx {300};       #        by virtue of being numerically larger.
 $IdeographicEx {400};    #
 
 #
 # rule 5
 #    Do not break between most letters.
 #
-$ALetterEx $ALetterEx {200};
+($ALetterEx | $Hebrew_LetterEx)  ($ALetterEx | $Hebrew_LetterEx) {200};
 
 # rule 6 and 7
-$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
+($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200};
+
+# rule 7a
+$Hebrew_LetterEx $Single_QuoteEx {200};
+
+# rule 7b and 7c
+$Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200};
 
 # rule 8
 
@@ -101,27 +144,35 @@ $NumericEx $NumericEx {100};
 
 # rule 9
 
-$ALetterEx $NumericEx {200};
+($ALetterEx | $Hebrew_LetterEx) $NumericEx {200};
 
 # rule 10
 
-$NumericEx $ALetterEx {200};
+$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};
 
 # rule 11 and 12 
 
-$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
+$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
 
 # rule 13
-
 $KatakanaEx  $KatakanaEx {300};
 
 # rule 13a/b
 
-$ALetterEx      $ExtendNumLetEx {200};    #  (13a)
-$NumericEx      $ExtendNumLetEx {100};    #  (13a)
-$KatakanaEx     $ExtendNumLetEx {300};    #  (13a)
-$ExtendNumLetEx $ExtendNumLetEx {200};    #  (13a)
-
-$ExtendNumLetEx $ALetterEx  {200};    #  (13b)
-$ExtendNumLetEx $NumericEx  {100};    #  (13b)
-$ExtendNumLetEx $KatakanaEx {300};    #  (13b)
+$ALetterEx       $ExtendNumLetEx {200};    #  (13a)
+$Hebrew_LetterEx $ExtendNumLetEx {200};    #  (13a)
+$NumericEx       $ExtendNumLetEx {100};    #  (13a)
+$KatakanaEx      $ExtendNumLetEx {300};    #  (13a)
+$ExtendNumLetEx  $ExtendNumLetEx {200};    #  (13a)
+
+$ExtendNumLetEx  $ALetterEx       {200};    #  (13b)
+$ExtendNumLetEx  $Hebrew_LetterEx {200};    #  (13b)
+$ExtendNumLetEx  $NumericEx       {100};    #  (13b)
+$ExtendNumLetEx  $KatakanaEx      {300};    #  (13b)
+
+# rule 13c
+
+$Regional_IndicatorEx $Regional_IndicatorEx;
+
+# special handling for CJK characters: chain for later dictionary segmentation
+$HangulSyllable $HangulSyllable {200};

Modified: lucene/dev/branches/lucene5339/lucene/analysis/icu/src/data/utr30/BasicFoldings.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5339/lucene/analysis/icu/src/data/utr30/BasicFoldings.txt?rev=1552377&r1=1552376&r2=1552377&view=diff
==============================================================================
--- lucene/dev/branches/lucene5339/lucene/analysis/icu/src/data/utr30/BasicFoldings.txt (original)
+++ lucene/dev/branches/lucene5339/lucene/analysis/icu/src/data/utr30/BasicFoldings.txt Thu Dec 19 17:48:47 2013
@@ -78,7 +78,6 @@ FF0D>002D
 ## Space Folding
 # Rule: [[:Zs:] - [:Changes_When_NFKC_Casefolded=Yes:] - [\u0020]] > 0020
 1680>0020
-180E>0020
 
 ## Spacing Accents folding (done by kd)
 

Modified: lucene/dev/branches/lucene5339/lucene/analysis/icu/src/data/utr30/nfc.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5339/lucene/analysis/icu/src/data/utr30/nfc.txt?rev=1552377&r1=1552376&r2=1552377&view=diff
==============================================================================
--- lucene/dev/branches/lucene5339/lucene/analysis/icu/src/data/utr30/nfc.txt (original)
+++ lucene/dev/branches/lucene5339/lucene/analysis/icu/src/data/utr30/nfc.txt Thu Dec 19 17:48:47 2013
@@ -1,4 +1,4 @@
-# Copyright (C) 1999-2012, International Business Machines
+# Copyright (C) 1999-2013, International Business Machines
 # Corporation and others.  All Rights Reserved.
 #
 # file name: nfc.txt
@@ -7,7 +7,7 @@
 #
 # Complete data for Unicode NFC normalization.
 
-* Unicode 6.1.0
+* Unicode 6.3.0
 
 # Canonical_Combining_Class (ccc) values
 0300..0314:230

Modified: lucene/dev/branches/lucene5339/lucene/analysis/icu/src/data/utr30/nfkc.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5339/lucene/analysis/icu/src/data/utr30/nfkc.txt?rev=1552377&r1=1552376&r2=1552377&view=diff
==============================================================================
--- lucene/dev/branches/lucene5339/lucene/analysis/icu/src/data/utr30/nfkc.txt (original)
+++ lucene/dev/branches/lucene5339/lucene/analysis/icu/src/data/utr30/nfkc.txt Thu Dec 19 17:48:47 2013
@@ -1,4 +1,4 @@
-# Copyright (C) 1999-2012, International Business Machines
+# Copyright (C) 1999-2013, International Business Machines
 # Corporation and others.  All Rights Reserved.
 #
 # file name: nfkc.txt
@@ -11,7 +11,7 @@
 # to NFKC one-way mappings.
 # Use this file as the second gennorm2 input file after nfc.txt.
 
-* Unicode 6.1.0
+* Unicode 6.3.0
 
 00A0>0020
 00A8>0020 0308

Modified: lucene/dev/branches/lucene5339/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5339/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt?rev=1552377&r1=1552376&r2=1552377&view=diff
==============================================================================
--- lucene/dev/branches/lucene5339/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt (original)
+++ lucene/dev/branches/lucene5339/lucene/analysis/icu/src/data/utr30/nfkc_cf.txt Thu Dec 19 17:48:47 2013
@@ -1,5 +1,5 @@
 # Unicode Character Database
-# Copyright (c) 1991-2012 Unicode, Inc.
+# Copyright (c) 1991-2013 Unicode, Inc.
 # For terms of use, see http://www.unicode.org/terms_of_use.html
 # For documentation, see http://www.unicode.org/reports/tr44/
 #
@@ -12,7 +12,7 @@
 # and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
 # Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.
 
-* Unicode 6.1.0
+* Unicode 6.3.0
 
 0041>0061
 0042>0062
@@ -537,6 +537,7 @@
 0555>0585
 0556>0586
 0587>0565 0582
+061C>
 0675>0627 0674
 0676>0648 0674
 0677>06C7 0674
@@ -627,7 +628,7 @@
 10FC>10DC
 115F..1160>
 17B4..17B5>
-180B..180D>
+180B..180E>
 1D2C>0061
 1D2D>00E6
 1D2E>0062