You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2012/01/24 16:51:57 UTC

svn commit: r1235308 [1/5] - in /lucene/dev/branches/branch_3x: lucene/ lucene/contrib/analyzers/common/ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/ lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis...

Author: sarowe
Date: Tue Jan 24 15:51:55 2012
New Revision: 1235308

URL: http://svn.apache.org/viewvc?rev=1235308&view=rev
Log:
LUCENE-3690: Re-implemented HTMLStripCharFilter as a JFlex-generated scanner, and moved from Solr to Lucene Common Analyzers contrib.  Fixes LUCENE-2208, SOLR-882, and SOLR-42.

Added:
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro   (with props)
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java   (with props)
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
      - copied, changed from r1234452, lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/MS-Word 14 generated.htm
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/charfilter/htmlStripReaderTest.html
      - copied unchanged from r1235263, lucene/dev/branches/branch_3x/solr/core/src/test-files/htmlStripReaderTest.html
    lucene/dev/branches/branch_3x/lucene/contrib/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateHTMLStripCharFilterSupplementaryMacros.java   (with props)
    lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilter.java
      - copied, changed from r1234452, lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/HTMLStripCharFilter.java
    lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilterFactory.java
      - copied, changed from r1234452, lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/HTMLStripCharFilterFactory.java
    lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/LegacyHTMLStripCharFilterTest.java
      - copied, changed from r1234452, lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java
    lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestHTMLStripCharFilterFactory.java   (with props)
Removed:
    lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/HTMLStripCharFilter.java
    lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java
Modified:
    lucene/dev/branches/branch_3x/lucene/CHANGES.txt
    lucene/dev/branches/branch_3x/lucene/common-build.xml
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/build.xml
    lucene/dev/branches/branch_3x/lucene/contrib/icu/build.xml
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/analysis/BaseCharFilter.java
    lucene/dev/branches/branch_3x/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java
    lucene/dev/branches/branch_3x/solr/CHANGES.txt
    lucene/dev/branches/branch_3x/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java
    lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/HTMLStripCharFilterFactory.java
    lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java

Modified: lucene/dev/branches/branch_3x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/CHANGES.txt?rev=1235308&r1=1235307&r2=1235308&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/CHANGES.txt Tue Jan 24 15:51:55 2012
@@ -110,6 +110,9 @@ New Features
   
 * LUCENE-3671: Add TypeTokenFilter that filters tokens based on
   their TypeAttribute.  (Tommaso Teofili via Uwe Schindler)
+
+* LUCENE-3690: Added HTMLStripCharFilter, a CharFilter that strips HTML
+  markup. (Steve Rowe)
   
 Bug fixes
 

Modified: lucene/dev/branches/branch_3x/lucene/common-build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/common-build.xml?rev=1235308&r1=1235307&r2=1235308&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/common-build.xml (original)
+++ lucene/dev/branches/branch_3x/lucene/common-build.xml Tue Jan 24 15:51:55 2012
@@ -184,7 +184,9 @@
   -->
   <property name="svnversion.exe" value="svnversion" />
   <property name="svn.exe" value="svn" />
-  
+
+  <property name="python.exe" value="python" />
+
   <property name="gpg.exe" value="gpg" />
   <property name="gpg.key" value="CODE SIGNING KEY" />
 

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/build.xml?rev=1235308&r1=1235307&r2=1235308&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/build.xml (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/build.xml Tue Jan 24 15:51:55 2012
@@ -31,7 +31,30 @@
 
   <target name="compile-core" depends="jflex-notice, common.compile-core"/>
 
-  <target name="jflex" depends="jflex-check,clean-jflex,jflex-wiki-tokenizer"/>
+  <target name="jflex" depends="jflex-check,clean-jflex,jflex-wiki-tokenizer,jflex-HTMLStripCharFilter"/>
+
+  <target name="jflex-HTMLStripCharFilter"
+          depends="init,jflex-check,generate-jflex-html-char-entities"
+          if="jflex.present"> <!-- Regenerates HTMLStripCharFilter.java from its .jflex grammar; the target body runs only when the jflex.present property is set -->
+    <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
+      <classpath refid="jflex.classpath"/>
+    </taskdef>
+    <jflex file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex"
+           outdir="src/java/org/apache/lucene/analysis/charfilter"
+           nobak="on"/>
+    <!-- Remove the inappropriate JFlex-generated constructors: with flag "s" the dot also matches newlines, so the greedy .* deletes everything from the first "Creates a new scanner" Javadoc through the closing brace of the InputStream-based constructor -->
+    <replaceregexp file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java"
+                   match="/\*\*\s*\*\s*Creates a new scanner.*this\(new java\.io\.InputStreamReader\(in\)\);\s*\}"
+                   replace="" flags="sg"/>
+  </target>
+
+  <target name="generate-jflex-html-char-entities"> <!-- Regenerates HTMLCharacterEntities.jflex by running htmlentity.py with the interpreter named by python.exe -->
+    <exec dir="src/java/org/apache/lucene/analysis/charfilter"
+          output="src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex"
+          executable="${python.exe}" failonerror="true" logerror="true"> <!-- the script's stdout is written to the output file; logerror keeps stderr out of that file and in the Ant log; a non-zero exit code fails the build -->
+      <arg value="htmlentity.py"/>
+    </exec>
+  </target>
 
   <target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
     <taskdef classname="jflex.anttask.JFlexTask" name="jflex">

Added: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex?rev=1235308&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex (added)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex Tue Jan 24 15:51:55 2012
@@ -0,0 +1,153 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+CharacterEntities = ( "AElig" | "Aacute" | "Acirc" | "Agrave" | "Alpha"
+                    | "Aring" | "Atilde" | "Auml" | "Beta" | "Ccedil" | "Chi"
+                    | "Dagger" | "Delta" | "ETH" | "Eacute" | "Ecirc"
+                    | "Egrave" | "Epsilon" | "Eta" | "Euml" | "Gamma"
+                    | "Iacute" | "Icirc" | "Igrave" | "Iota" | "Iuml" | "Kappa"
+                    | "Lambda" | "Mu" | "Ntilde" | "Nu" | "OElig" | "Oacute"
+                    | "Ocirc" | "Ograve" | "Omega" | "Omicron" | "Oslash"
+                    | "Otilde" | "Ouml" | "Phi" | "Pi" | "Prime" | "Psi"
+                    | "Rho" | "Scaron" | "Sigma" | "THORN" | "Tau" | "Theta"
+                    | "Uacute" | "Ucirc" | "Ugrave" | "Upsilon" | "Uuml" | "Xi"
+                    | "Yacute" | "Yuml" | "Zeta" | "aacute" | "acirc" | "acute"
+                    | "aelig" | "agrave" | "alefsym" | "alpha" | "amp" | "AMP"
+                    | "and" | "ang" | "apos" | "aring" | "asymp" | "atilde"
+                    | "auml" | "bdquo" | "beta" | "brvbar" | "bull" | "cap"
+                    | "ccedil" | "cedil" | "cent" | "chi" | "circ" | "clubs"
+                    | "cong" | "copy" | "COPY" | "crarr" | "cup" | "curren"
+                    | "dArr" | "dagger" | "darr" | "deg" | "delta" | "diams"
+                    | "divide" | "eacute" | "ecirc" | "egrave" | "empty"
+                    | "emsp" | "ensp" | "epsilon" | "equiv" | "eta" | "eth"
+                    | "euml" | "euro" | "exist" | "fnof" | "forall" | "frac12"
+                    | "frac14" | "frac34" | "frasl" | "gamma" | "ge" | "gt"
+                    | "GT" | "hArr" | "harr" | "hearts" | "hellip" | "iacute"
+                    | "icirc" | "iexcl" | "igrave" | "image" | "infin" | "int"
+                    | "iota" | "iquest" | "isin" | "iuml" | "kappa" | "lArr"
+                    | "lambda" | "lang" | "laquo" | "larr" | "lceil" | "ldquo"
+                    | "le" | "lfloor" | "lowast" | "loz" | "lrm" | "lsaquo"
+                    | "lsquo" | "lt" | "LT" | "macr" | "mdash" | "micro"
+                    | "middot" | "minus" | "mu" | "nabla" | "nbsp" | "ndash"
+                    | "ne" | "ni" | "not" | "notin" | "nsub" | "ntilde" | "nu"
+                    | "oacute" | "ocirc" | "oelig" | "ograve" | "oline"
+                    | "omega" | "omicron" | "oplus" | "or" | "ordf" | "ordm"
+                    | "oslash" | "otilde" | "otimes" | "ouml" | "para" | "part"
+                    | "permil" | "perp" | "phi" | "pi" | "piv" | "plusmn"
+                    | "pound" | "prime" | "prod" | "prop" | "psi" | "quot"
+                    | "QUOT" | "rArr" | "radic" | "rang" | "raquo" | "rarr"
+                    | "rceil" | "rdquo" | "real" | "reg" | "REG" | "rfloor"
+                    | "rho" | "rlm" | "rsaquo" | "rsquo" | "sbquo" | "scaron"
+                    | "sdot" | "sect" | "shy" | "sigma" | "sigmaf" | "sim"
+                    | "spades" | "sub" | "sube" | "sum" | "sup" | "sup1"
+                    | "sup2" | "sup3" | "supe" | "szlig" | "tau" | "there4"
+                    | "theta" | "thetasym" | "thinsp" | "thorn" | "tilde"
+                    | "times" | "trade" | "uArr" | "uacute" | "uarr" | "ucirc"
+                    | "ugrave" | "uml" | "upsih" | "upsilon" | "uuml"
+                    | "weierp" | "xi" | "yacute" | "yen" | "yuml" | "zeta"
+                    | "zwj" | "zwnj" )
+%{
+  private static final Set<String> upperCaseVariantsAccepted
+      = new HashSet<String>(Arrays.asList("quot","copy","gt","lt","reg","amp")); // entity names that are additionally registered in upper-cased form
+  private static final CharArrayMap<Character> entityValues
+      = new CharArrayMap<Character>(Version.LUCENE_36, 253, false); // entity name -> the single character it stands for
+  static { // fills entityValues from the flat name/value list below
+    String[] entities = { // alternating elements: entity name, then a one-character string holding its value
+      "AElig", "\u00C6", "Aacute", "\u00C1", "Acirc", "\u00C2",
+      "Agrave", "\u00C0", "Alpha", "\u0391", "Aring", "\u00C5",
+      "Atilde", "\u00C3", "Auml", "\u00C4", "Beta", "\u0392",
+      "Ccedil", "\u00C7", "Chi", "\u03A7", "Dagger", "\u2021",
+      "Delta", "\u0394", "ETH", "\u00D0", "Eacute", "\u00C9",
+      "Ecirc", "\u00CA", "Egrave", "\u00C8", "Epsilon", "\u0395",
+      "Eta", "\u0397", "Euml", "\u00CB", "Gamma", "\u0393", "Iacute", "\u00CD",
+      "Icirc", "\u00CE", "Igrave", "\u00CC", "Iota", "\u0399",
+      "Iuml", "\u00CF", "Kappa", "\u039A", "Lambda", "\u039B", "Mu", "\u039C",
+      "Ntilde", "\u00D1", "Nu", "\u039D", "OElig", "\u0152",
+      "Oacute", "\u00D3", "Ocirc", "\u00D4", "Ograve", "\u00D2",
+      "Omega", "\u03A9", "Omicron", "\u039F", "Oslash", "\u00D8",
+      "Otilde", "\u00D5", "Ouml", "\u00D6", "Phi", "\u03A6", "Pi", "\u03A0",
+      "Prime", "\u2033", "Psi", "\u03A8", "Rho", "\u03A1", "Scaron", "\u0160",
+      "Sigma", "\u03A3", "THORN", "\u00DE", "Tau", "\u03A4", "Theta", "\u0398",
+      "Uacute", "\u00DA", "Ucirc", "\u00DB", "Ugrave", "\u00D9",
+      "Upsilon", "\u03A5", "Uuml", "\u00DC", "Xi", "\u039E",
+      "Yacute", "\u00DD", "Yuml", "\u0178", "Zeta", "\u0396",
+      "aacute", "\u00E1", "acirc", "\u00E2", "acute", "\u00B4",
+      "aelig", "\u00E6", "agrave", "\u00E0", "alefsym", "\u2135",
+      "alpha", "\u03B1", "amp", "\u0026", "and", "\u2227", "ang", "\u2220",
+      "apos", "\u0027", "aring", "\u00E5", "asymp", "\u2248",
+      "atilde", "\u00E3", "auml", "\u00E4", "bdquo", "\u201E",
+      "beta", "\u03B2", "brvbar", "\u00A6", "bull", "\u2022", "cap", "\u2229",
+      "ccedil", "\u00E7", "cedil", "\u00B8", "cent", "\u00A2", "chi", "\u03C7",
+      "circ", "\u02C6", "clubs", "\u2663", "cong", "\u2245", "copy", "\u00A9",
+      "crarr", "\u21B5", "cup", "\u222A", "curren", "\u00A4", "dArr", "\u21D3",
+      "dagger", "\u2020", "darr", "\u2193", "deg", "\u00B0", "delta", "\u03B4",
+      "diams", "\u2666", "divide", "\u00F7", "eacute", "\u00E9",
+      "ecirc", "\u00EA", "egrave", "\u00E8", "empty", "\u2205",
+      "emsp", "\u2003", "ensp", "\u2002", "epsilon", "\u03B5",
+      "equiv", "\u2261", "eta", "\u03B7", "eth", "\u00F0", "euml", "\u00EB",
+      "euro", "\u20AC", "exist", "\u2203", "fnof", "\u0192",
+      "forall", "\u2200", "frac12", "\u00BD", "frac14", "\u00BC",
+      "frac34", "\u00BE", "frasl", "\u2044", "gamma", "\u03B3", "ge", "\u2265",
+      "gt", "\u003E", "hArr", "\u21D4", "harr", "\u2194", "hearts", "\u2665",
+      "hellip", "\u2026", "iacute", "\u00ED", "icirc", "\u00EE",
+      "iexcl", "\u00A1", "igrave", "\u00EC", "image", "\u2111",
+      "infin", "\u221E", "int", "\u222B", "iota", "\u03B9", "iquest", "\u00BF",
+      "isin", "\u2208", "iuml", "\u00EF", "kappa", "\u03BA", "lArr", "\u21D0",
+      "lambda", "\u03BB", "lang", "\u2329", "laquo", "\u00AB",
+      "larr", "\u2190", "lceil", "\u2308", "ldquo", "\u201C", "le", "\u2264",
+      "lfloor", "\u230A", "lowast", "\u2217", "loz", "\u25CA", "lrm", "\u200E",
+      "lsaquo", "\u2039", "lsquo", "\u2018", "lt", "\u003C", "macr", "\u00AF",
+      "mdash", "\u2014", "micro", "\u00B5", "middot", "\u00B7",
+      "minus", "\u2212", "mu", "\u03BC", "nabla", "\u2207", "nbsp", " ",
+      "ndash", "\u2013", "ne", "\u2260", "ni", "\u220B", "not", "\u00AC",
+      "notin", "\u2209", "nsub", "\u2284", "ntilde", "\u00F1", "nu", "\u03BD",
+      "oacute", "\u00F3", "ocirc", "\u00F4", "oelig", "\u0153",
+      "ograve", "\u00F2", "oline", "\u203E", "omega", "\u03C9",
+      "omicron", "\u03BF", "oplus", "\u2295", "or", "\u2228", "ordf", "\u00AA",
+      "ordm", "\u00BA", "oslash", "\u00F8", "otilde", "\u00F5",
+      "otimes", "\u2297", "ouml", "\u00F6", "para", "\u00B6", "part", "\u2202",
+      "permil", "\u2030", "perp", "\u22A5", "phi", "\u03C6", "pi", "\u03C0",
+      "piv", "\u03D6", "plusmn", "\u00B1", "pound", "\u00A3",
+      "prime", "\u2032", "prod", "\u220F", "prop", "\u221D", "psi", "\u03C8",
+      "quot", "\"", "rArr", "\u21D2", "radic", "\u221A", "rang", "\u232A",
+      "raquo", "\u00BB", "rarr", "\u2192", "rceil", "\u2309",
+      "rdquo", "\u201D", "real", "\u211C", "reg", "\u00AE", "rfloor", "\u230B",
+      "rho", "\u03C1", "rlm", "\u200F", "rsaquo", "\u203A", "rsquo", "\u2019",
+      "sbquo", "\u201A", "scaron", "\u0161", "sdot", "\u22C5",
+      "sect", "\u00A7", "shy", "\u00AD", "sigma", "\u03C3", "sigmaf", "\u03C2",
+      "sim", "\u223C", "spades", "\u2660", "sub", "\u2282", "sube", "\u2286",
+      "sum", "\u2211", "sup", "\u2283", "sup1", "\u00B9", "sup2", "\u00B2",
+      "sup3", "\u00B3", "supe", "\u2287", "szlig", "\u00DF", "tau", "\u03C4",
+      "there4", "\u2234", "theta", "\u03B8", "thetasym", "\u03D1",
+      "thinsp", "\u2009", "thorn", "\u00FE", "tilde", "\u02DC",
+      "times", "\u00D7", "trade", "\u2122", "uArr", "\u21D1",
+      "uacute", "\u00FA", "uarr", "\u2191", "ucirc", "\u00FB",
+      "ugrave", "\u00F9", "uml", "\u00A8", "upsih", "\u03D2",
+      "upsilon", "\u03C5", "uuml", "\u00FC", "weierp", "\u2118",
+      "xi", "\u03BE", "yacute", "\u00FD", "yen", "\u00A5", "yuml", "\u00FF",
+      "zeta", "\u03B6", "zwj", "\u200D", "zwnj", "\u200C"
+    };
+    for (int i = 0 ; i < entities.length ; i += 2) { // step by 2: [i] is the entity name, [i + 1] its value string
+      Character value = entities[i + 1].charAt(0); // only the first char of the value string is kept
+      entityValues.put(entities[i], value);
+      if (upperCaseVariantsAccepted.contains(entities[i])) {
+        entityValues.put(entities[i].toUpperCase(), value); // NOTE(review): default-locale upper-casing; harmless today since none of the six accepted names contains 'i', but an explicit Locale would be safer
+      }
+    }
+  }
+%}

Added: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro?rev=1235308&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro (added)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro Tue Jan 24 15:51:55 2012
@@ -0,0 +1,58 @@
+/*
+ * Copyright 2010 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Generated using ICU4J 4.8.1.1 on Tuesday, January 24, 2012 3:01:15 PM UTC
+// by org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros
+
+
+ID_Start_Supp = (
+	  [\uD808][\uDC00-\uDF6E]
+	| [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
+	| [\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
+	| [\uD80D][\uDC00-\uDC2E]
+	| [\uD809][\uDC00-\uDC62]
+	| [\uD81A][\uDC00-\uDE38]
+	| [\uD87E][\uDC00-\uDE1D]
+	| [\uD82C][\uDC00\uDC01]
+	| [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
+	| [\uD803][\uDC00-\uDC48]
+	| [\uD804][\uDC03-\uDC37\uDC83-\uDCAF]
+	| [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
+	| [\uD86E][\uDC00-\uDC1D]
+	| [\uD801][\uDC00-\uDC9D]
+	| [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB]
+	| [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
+)
+ID_Continue_Supp = (
+	  [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB\uDFCE-\uDFFF]
+	| [\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00-\uDE03\uDE05\uDE06\uDE0C-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE38-\uDE3A\uDE3F\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
+	| [\uD808][\uDC00-\uDF6E]
+	| [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
+	| [\uD80D][\uDC00-\uDC2E]
+	| [\uD809][\uDC00-\uDC62]
+	| [\uDB40][\uDD00-\uDDEF]
+	| [\uD81A][\uDC00-\uDE38]
+	| [\uD804][\uDC00-\uDC46\uDC66-\uDC6F\uDC80-\uDCBA]
+	| [\uD87E][\uDC00-\uDE1D]
+	| [\uD82C][\uDC00\uDC01]
+	| [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
+	| [\uD803][\uDC00-\uDC48]
+	| [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
+	| [\uD86E][\uDC00-\uDC1D]
+	| [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDDFD\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
+	| [\uD801][\uDC00-\uDC9D\uDCA0-\uDCA9]
+	| [\uD834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44]
+)