You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/03/26 21:36:16 UTC
svn commit: r1305512 [2/6] - in /lucene/dev/branches/branch_3x: ./
dev-tools/eclipse/dot.settings/
dev-tools/idea/lucene/contrib/analyzers/kuromoji/
dev-tools/idea/lucene/contrib/analyzers/phonetic/
dev-tools/idea/lucene/contrib/facet/ dev-tools/idea/l...
Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py?rev=1305512&r1=1305511&r2=1305512&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py Mon Mar 26 19:36:09 2012
@@ -1,530 +1,530 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import re
-
-# A simple python script to generate an HTML entity map and a regex alternation
-# for inclusion in HTMLStripCharFilter.jflex.
-
-def main():
- print get_apache_license()
- codes = {}
- regex = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')
- for line in get_entity_text().split('\n'):
- match = regex.match(line)
- if match:
- key = match.group(1)
- if key == 'quot': codes[key] = r'\"'
- elif key == 'nbsp': codes[key] = ' ';
- else : codes[key] = r'\u%04X' % int(match.group(2))
-
- keys = sorted(codes)
-
- first_entry = True
- output_line = 'CharacterEntities = ( '
- for key in keys:
- new_entry = ('"%s"' if first_entry else ' | "%s"') % key
- first_entry = False
- if len(output_line) + len(new_entry) >= 80:
- print output_line
- output_line = ' '
- output_line += new_entry
- if key in ('quot','copy','gt','lt','reg','amp'):
- new_entry = ' | "%s"' % key.upper()
- if len(output_line) + len(new_entry) >= 80:
- print output_line
- output_line = ' '
- output_line += new_entry
- print output_line, ')'
-
- print '%{'
- print ' private static final Set<String> upperCaseVariantsAccepted'
- print ' = new HashSet<String>(Arrays.asList("quot","copy","gt","lt","reg","amp"));'
- print ' private static final CharArrayMap<Character> entityValues'
- print ' = new CharArrayMap<Character>(Version.LUCENE_36, %i, false);' % len(keys)
- print ' static {'
- print ' String[] entities = {'
- output_line = ' '
- for key in keys:
- new_entry = ' "%s", "%s",' % (key, codes[key])
- if len(output_line) + len(new_entry) >= 80:
- print output_line
- output_line = ' '
- output_line += new_entry
- print output_line[:-1]
- print ' };'
- print ' for (int i = 0 ; i < entities.length ; i += 2) {'
- print ' Character value = entities[i + 1].charAt(0);'
- print ' entityValues.put(entities[i], value);'
- print ' if (upperCaseVariantsAccepted.contains(entities[i])) {'
- print ' entityValues.put(entities[i].toUpperCase(), value);'
- print ' }'
- print ' }'
- print " }"
- print "%}"
-
-def get_entity_text():
-# The text below is taken verbatim from
-# <http://www.w3.org/TR/REC-html40/sgml/entities.html>:
- text = r"""
-F.1. XHTML Character Entities
-
-XHTML DTDs make available a standard collection of named character entities. Those entities are defined in this section.
-F.1.1. XHTML Latin 1 Character Entities
-
-You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-lat1.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent.
-
-<!-- ...................................................................... -->
-<!-- XML-compatible ISO Latin 1 Character Entity Set for XHTML ............ -->
-<!-- file: xhtml-lat1.ent
-
- Typical invocation:
-
- <!ENTITY % xhtml-lat1
- PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
- "xhtml-lat1.ent" >
- %xhtml-lat1;
-
- This DTD module is identified by the PUBLIC and SYSTEM identifiers:
-
- PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
- SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent"
-
- Revision: $Id: xhtml-lat1.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
-
- Portions (C) International Organization for Standardization 1986:
- Permission to copy in any form is granted for use with conforming
- SGML systems and applications as defined in ISO 8879, provided
- this notice is included in all copies.
--->
-
-<!ENTITY nbsp "&#160;" ><!-- no-break space = non-breaking space, U+00A0 ISOnum -->
-<!ENTITY iexcl "¡" ><!-- inverted exclamation mark, U+00A1 ISOnum -->
-<!ENTITY cent "¢" ><!-- cent sign, U+00A2 ISOnum -->
-<!ENTITY pound "£" ><!-- pound sign, U+00A3 ISOnum -->
-<!ENTITY curren "¤" ><!-- currency sign, U+00A4 ISOnum -->
-<!ENTITY yen "¥" ><!-- yen sign = yuan sign, U+00A5 ISOnum -->
-<!ENTITY brvbar "¦" ><!-- broken bar = broken vertical bar, U+00A6 ISOnum -->
-<!ENTITY sect "§" ><!-- section sign, U+00A7 ISOnum -->
-<!ENTITY uml "¨" ><!-- diaeresis = spacing diaeresis, U+00A8 ISOdia -->
-<!ENTITY copy "©" ><!-- copyright sign, U+00A9 ISOnum -->
-<!ENTITY ordf "ª" ><!-- feminine ordinal indicator, U+00AA ISOnum -->
-<!ENTITY laquo "«" ><!-- left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum -->
-<!ENTITY not "¬" ><!-- not sign, U+00AC ISOnum -->
-<!ENTITY shy "&#173;" ><!-- soft hyphen = discretionary hyphen, U+00AD ISOnum -->
-<!ENTITY reg "®" ><!-- registered sign = registered trade mark sign, U+00AE ISOnum -->
-<!ENTITY macr "¯" ><!-- macron = spacing macron = overline = APL overbar, U+00AF ISOdia -->
-<!ENTITY deg "°" ><!-- degree sign, U+00B0 ISOnum -->
-<!ENTITY plusmn "±" ><!-- plus-minus sign = plus-or-minus sign, U+00B1 ISOnum -->
-<!ENTITY sup2 "²" ><!-- superscript two = superscript digit two = squared, U+00B2 ISOnum -->
-<!ENTITY sup3 "³" ><!-- superscript three = superscript digit three = cubed, U+00B3 ISOnum -->
-<!ENTITY acute "´" ><!-- acute accent = spacing acute, U+00B4 ISOdia -->
-<!ENTITY micro "µ" ><!-- micro sign, U+00B5 ISOnum -->
-<!ENTITY para "¶" ><!-- pilcrow sign = paragraph sign, U+00B6 ISOnum -->
-<!ENTITY middot "·" ><!-- middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum -->
-<!ENTITY cedil "¸" ><!-- cedilla = spacing cedilla, U+00B8 ISOdia -->
-<!ENTITY sup1 "¹" ><!-- superscript one = superscript digit one, U+00B9 ISOnum -->
-<!ENTITY ordm "º" ><!-- masculine ordinal indicator, U+00BA ISOnum -->
-<!ENTITY raquo "»" ><!-- right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum -->
-<!ENTITY frac14 "¼" ><!-- vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum -->
-<!ENTITY frac12 "½" ><!-- vulgar fraction one half = fraction one half, U+00BD ISOnum -->
-<!ENTITY frac34 "¾" ><!-- vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum -->
-<!ENTITY iquest "¿" ><!-- inverted question mark = turned question mark, U+00BF ISOnum -->
-<!ENTITY Agrave "À" ><!-- latin capital A with grave = latin capital A grave, U+00C0 ISOlat1 -->
-<!ENTITY Aacute "Á" ><!-- latin capital A with acute, U+00C1 ISOlat1 -->
-<!ENTITY Acirc "Â" ><!-- latin capital A with circumflex, U+00C2 ISOlat1 -->
-<!ENTITY Atilde "Ã" ><!-- latin capital A with tilde, U+00C3 ISOlat1 -->
-<!ENTITY Auml "Ä" ><!-- latin capital A with diaeresis, U+00C4 ISOlat1 -->
-<!ENTITY Aring "Å" ><!-- latin capital A with ring above = latin capital A ring, U+00C5 ISOlat1 -->
-<!ENTITY AElig "Æ" ><!-- latin capital AE = latin capital ligature AE, U+00C6 ISOlat1 -->
-<!ENTITY Ccedil "Ç" ><!-- latin capital C with cedilla, U+00C7 ISOlat1 -->
-<!ENTITY Egrave "È" ><!-- latin capital E with grave, U+00C8 ISOlat1 -->
-<!ENTITY Eacute "É" ><!-- latin capital E with acute, U+00C9 ISOlat1 -->
-<!ENTITY Ecirc "Ê" ><!-- latin capital E with circumflex, U+00CA ISOlat1 -->
-<!ENTITY Euml "Ë" ><!-- latin capital E with diaeresis, U+00CB ISOlat1 -->
-<!ENTITY Igrave "Ì" ><!-- latin capital I with grave, U+00CC ISOlat1 -->
-<!ENTITY Iacute "Í" ><!-- latin capital I with acute, U+00CD ISOlat1 -->
-<!ENTITY Icirc "Î" ><!-- latin capital I with circumflex, U+00CE ISOlat1 -->
-<!ENTITY Iuml "Ï" ><!-- latin capital I with diaeresis, U+00CF ISOlat1 -->
-<!ENTITY ETH "Ð" ><!-- latin capital ETH, U+00D0 ISOlat1 -->
-<!ENTITY Ntilde "Ñ" ><!-- latin capital N with tilde, U+00D1 ISOlat1 -->
-<!ENTITY Ograve "Ò" ><!-- latin capital O with grave, U+00D2 ISOlat1 -->
-<!ENTITY Oacute "Ó" ><!-- latin capital O with acute, U+00D3 ISOlat1 -->
-<!ENTITY Ocirc "Ô" ><!-- latin capital O with circumflex, U+00D4 ISOlat1 -->
-<!ENTITY Otilde "Õ" ><!-- latin capital O with tilde, U+00D5 ISOlat1 -->
-<!ENTITY Ouml "Ö" ><!-- latin capital O with diaeresis, U+00D6 ISOlat1 -->
-<!ENTITY times "×" ><!-- multiplication sign, U+00D7 ISOnum -->
-<!ENTITY Oslash "Ø" ><!-- latin capital O with stroke = latin capital O slash, U+00D8 ISOlat1 -->
-<!ENTITY Ugrave "Ù" ><!-- latin capital U with grave, U+00D9 ISOlat1 -->
-<!ENTITY Uacute "Ú" ><!-- latin capital U with acute, U+00DA ISOlat1 -->
-<!ENTITY Ucirc "Û" ><!-- latin capital U with circumflex, U+00DB ISOlat1 -->
-<!ENTITY Uuml "Ü" ><!-- latin capital U with diaeresis, U+00DC ISOlat1 -->
-<!ENTITY Yacute "Ý" ><!-- latin capital Y with acute, U+00DD ISOlat1 -->
-<!ENTITY THORN "Þ" ><!-- latin capital THORN, U+00DE ISOlat1 -->
-<!ENTITY szlig "ß" ><!-- latin small sharp s = ess-zed, U+00DF ISOlat1 -->
-<!ENTITY agrave "à" ><!-- latin small a with grave = latin small a grave, U+00E0 ISOlat1 -->
-<!ENTITY aacute "á" ><!-- latin small a with acute, U+00E1 ISOlat1 -->
-<!ENTITY acirc "â" ><!-- latin small a with circumflex, U+00E2 ISOlat1 -->
-<!ENTITY atilde "ã" ><!-- latin small a with tilde, U+00E3 ISOlat1 -->
-<!ENTITY auml "ä" ><!-- latin small a with diaeresis, U+00E4 ISOlat1 -->
-<!ENTITY aring "å" ><!-- latin small a with ring above = latin small a ring, U+00E5 ISOlat1 -->
-<!ENTITY aelig "æ" ><!-- latin small ae = latin small ligature ae, U+00E6 ISOlat1 -->
-<!ENTITY ccedil "ç" ><!-- latin small c with cedilla, U+00E7 ISOlat1 -->
-<!ENTITY egrave "è" ><!-- latin small e with grave, U+00E8 ISOlat1 -->
-<!ENTITY eacute "é" ><!-- latin small e with acute, U+00E9 ISOlat1 -->
-<!ENTITY ecirc "ê" ><!-- latin small e with circumflex, U+00EA ISOlat1 -->
-<!ENTITY euml "ë" ><!-- latin small e with diaeresis, U+00EB ISOlat1 -->
-<!ENTITY igrave "ì" ><!-- latin small i with grave, U+00EC ISOlat1 -->
-<!ENTITY iacute "í" ><!-- latin small i with acute, U+00ED ISOlat1 -->
-<!ENTITY icirc "î" ><!-- latin small i with circumflex, U+00EE ISOlat1 -->
-<!ENTITY iuml "ï" ><!-- latin small i with diaeresis, U+00EF ISOlat1 -->
-<!ENTITY eth "ð" ><!-- latin small eth, U+00F0 ISOlat1 -->
-<!ENTITY ntilde "ñ" ><!-- latin small n with tilde, U+00F1 ISOlat1 -->
-<!ENTITY ograve "ò" ><!-- latin small o with grave, U+00F2 ISOlat1 -->
-<!ENTITY oacute "ó" ><!-- latin small o with acute, U+00F3 ISOlat1 -->
-<!ENTITY ocirc "ô" ><!-- latin small o with circumflex, U+00F4 ISOlat1 -->
-<!ENTITY otilde "õ" ><!-- latin small o with tilde, U+00F5 ISOlat1 -->
-<!ENTITY ouml "ö" ><!-- latin small o with diaeresis, U+00F6 ISOlat1 -->
-<!ENTITY divide "÷" ><!-- division sign, U+00F7 ISOnum -->
-<!ENTITY oslash "ø" ><!-- latin small o with stroke, = latin small o slash, U+00F8 ISOlat1 -->
-<!ENTITY ugrave "ù" ><!-- latin small u with grave, U+00F9 ISOlat1 -->
-<!ENTITY uacute "ú" ><!-- latin small u with acute, U+00FA ISOlat1 -->
-<!ENTITY ucirc "û" ><!-- latin small u with circumflex, U+00FB ISOlat1 -->
-<!ENTITY uuml "ü" ><!-- latin small u with diaeresis, U+00FC ISOlat1 -->
-<!ENTITY yacute "ý" ><!-- latin small y with acute, U+00FD ISOlat1 -->
-<!ENTITY thorn "þ" ><!-- latin small thorn with, U+00FE ISOlat1 -->
-<!ENTITY yuml "ÿ" ><!-- latin small y with diaeresis, U+00FF ISOlat1 -->
-<!-- end of xhtml-lat1.ent -->
-
-F.1.2. XHTML Special Characters
-
-You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-special.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-special.ent.
-
-<!-- ...................................................................... -->
-<!-- XML-compatible ISO Special Character Entity Set for XHTML ............ -->
-<!-- file: xhtml-special.ent
-
- Typical invocation:
-
- <!ENTITY % xhtml-special
- PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
- "xhtml-special.ent" >
- %xhtml-special;
-
- This DTD module is identified by the PUBLIC and SYSTEM identifiers:
-
- PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
- SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-special.ent"
-
- Revision: $Id: xhtml-special.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
-
- Portions (C) International Organization for Standardization 1986:
- Permission to copy in any form is granted for use with conforming
- SGML systems and applications as defined in ISO 8879, provided
- this notice is included in all copies.
-
- Revisions:
-2000-10-28: added &apos; and altered XML Predefined Entities for compatibility
--->
-
-<!-- Relevant ISO entity set is given unless names are newly introduced.
- New names (i.e., not in ISO 8879 [SGML] list) do not clash with
- any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
- numbers are given for each character, in hex. Entity values are
- decimal conversions of the ISO 10646 values and refer to the
- document character set. Names are Unicode [UNICODE] names.
--->
-
-<!-- C0 Controls and Basic Latin -->
-<!ENTITY lt "&#38;#60;" ><!-- less-than sign, U+003C ISOnum -->
-<!ENTITY gt "&#62;" ><!-- greater-than sign, U+003E ISOnum -->
-<!ENTITY amp "&#38;#38;" ><!-- ampersand, U+0026 ISOnum -->
-<!ENTITY apos "&#39;" ><!-- The Apostrophe (Apostrophe Quote, APL Quote), U+0027 ISOnum -->
-<!ENTITY quot "&#34;" ><!-- quotation mark (Quote Double), U+0022 ISOnum -->
-
-<!-- Latin Extended-A -->
-<!ENTITY OElig "Œ" ><!-- latin capital ligature OE, U+0152 ISOlat2 -->
-<!ENTITY oelig "œ" ><!-- latin small ligature oe, U+0153 ISOlat2 -->
-
-<!-- ligature is a misnomer, this is a separate character in some languages -->
-<!ENTITY Scaron "Š" ><!-- latin capital letter S with caron, U+0160 ISOlat2 -->
-<!ENTITY scaron "š" ><!-- latin small letter s with caron, U+0161 ISOlat2 -->
-<!ENTITY Yuml "Ÿ" ><!-- latin capital letter Y with diaeresis, U+0178 ISOlat2 -->
-
-<!-- Spacing Modifier Letters -->
-<!ENTITY circ "ˆ" ><!-- modifier letter circumflex accent, U+02C6 ISOpub -->
-<!ENTITY tilde "˜" ><!-- small tilde, U+02DC ISOdia -->
-
-<!-- General Punctuation -->
-<!ENTITY ensp "&#8194;" ><!-- en space, U+2002 ISOpub -->
-<!ENTITY emsp "&#8195;" ><!-- em space, U+2003 ISOpub -->
-<!ENTITY thinsp "&#8201;" ><!-- thin space, U+2009 ISOpub -->
-<!ENTITY zwnj "&#8204;" ><!-- zero width non-joiner, U+200C NEW RFC 2070 -->
-<!ENTITY zwj "&#8205;" ><!-- zero width joiner, U+200D NEW RFC 2070 -->
-<!ENTITY lrm "&#8206;" ><!-- left-to-right mark, U+200E NEW RFC 2070 -->
-<!ENTITY rlm "&#8207;" ><!-- right-to-left mark, U+200F NEW RFC 2070 -->
-<!ENTITY ndash "–" ><!-- en dash, U+2013 ISOpub -->
-<!ENTITY mdash "—" ><!-- em dash, U+2014 ISOpub -->
-<!ENTITY lsquo "‘" ><!-- left single quotation mark, U+2018 ISOnum -->
-<!ENTITY rsquo "’" ><!-- right single quotation mark, U+2019 ISOnum -->
-<!ENTITY sbquo "‚" ><!-- single low-9 quotation mark, U+201A NEW -->
-<!ENTITY ldquo "“" ><!-- left double quotation mark, U+201C ISOnum -->
-<!ENTITY rdquo "”" ><!-- right double quotation mark, U+201D ISOnum -->
-<!ENTITY bdquo "„" ><!-- double low-9 quotation mark, U+201E NEW -->
-<!ENTITY dagger "†" ><!-- dagger, U+2020 ISOpub -->
-<!ENTITY Dagger "‡" ><!-- double dagger, U+2021 ISOpub -->
-<!ENTITY permil "‰" ><!-- per mille sign, U+2030 ISOtech -->
-
-<!-- lsaquo is proposed but not yet ISO standardized -->
-<!ENTITY lsaquo "‹" ><!-- single left-pointing angle quotation mark, U+2039 ISO proposed -->
-<!-- rsaquo is proposed but not yet ISO standardized -->
-<!ENTITY rsaquo "›" ><!-- single right-pointing angle quotation mark, U+203A ISO proposed -->
-<!ENTITY euro "€" ><!-- euro sign, U+20AC NEW -->
-
-<!-- end of xhtml-special.ent -->
-
-F.1.3. XHTML Mathematical, Greek, and Symbolic Characters
-
-You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-symbol.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent.
-
-<!-- ...................................................................... -->
-<!-- ISO Math, Greek and Symbolic Character Entity Set for XHTML .......... -->
-<!-- file: xhtml-symbol.ent
-
- Typical invocation:
-
- <!ENTITY % xhtml-symbol
- PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
- "xhtml-symbol.ent" >
- %xhtml-symbol;
-
- This DTD module is identified by the PUBLIC and SYSTEM identifiers:
-
- PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
- SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent"
-
- Revision: $Id: xhtml-symbol.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
-
- Portions (C) International Organization for Standardization 1986:
- Permission to copy in any form is granted for use with conforming
- SGML systems and applications as defined in ISO 8879, provided
- this notice is included in all copies.
--->
-
-<!-- Relevant ISO entity set is given unless names are newly introduced.
- New names (i.e., not in ISO 8879 [SGML] list) do not clash with
- any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
- numbers are given for each character, in hex. Entity values are
- decimal conversions of the ISO 10646 values and refer to the
- document character set. Names are Unicode [UNICODE] names.
--->
-
-<!-- Latin Extended-B -->
-<!ENTITY fnof "ƒ" ><!-- latin small f with hook = function
- = florin, U+0192 ISOtech -->
-
-<!-- Greek -->
-<!ENTITY Alpha "Α" ><!-- greek capital letter alpha, U+0391 -->
-<!ENTITY Beta "Β" ><!-- greek capital letter beta, U+0392 -->
-<!ENTITY Gamma "Γ" ><!-- greek capital letter gamma, U+0393 ISOgrk3 -->
-<!ENTITY Delta "Δ" ><!-- greek capital letter delta, U+0394 ISOgrk3 -->
-<!ENTITY Epsilon "Ε" ><!-- greek capital letter epsilon, U+0395 -->
-<!ENTITY Zeta "Ζ" ><!-- greek capital letter zeta, U+0396 -->
-<!ENTITY Eta "Η" ><!-- greek capital letter eta, U+0397 -->
-<!ENTITY Theta "Θ" ><!-- greek capital letter theta, U+0398 ISOgrk3 -->
-<!ENTITY Iota "Ι" ><!-- greek capital letter iota, U+0399 -->
-<!ENTITY Kappa "Κ" ><!-- greek capital letter kappa, U+039A -->
-<!ENTITY Lambda "Λ" ><!-- greek capital letter lambda, U+039B ISOgrk3 -->
-<!ENTITY Mu "Μ" ><!-- greek capital letter mu, U+039C -->
-<!ENTITY Nu "Ν" ><!-- greek capital letter nu, U+039D -->
-<!ENTITY Xi "Ξ" ><!-- greek capital letter xi, U+039E ISOgrk3 -->
-<!ENTITY Omicron "Ο" ><!-- greek capital letter omicron, U+039F -->
-<!ENTITY Pi "Π" ><!-- greek capital letter pi, U+03A0 ISOgrk3 -->
-<!ENTITY Rho "Ρ" ><!-- greek capital letter rho, U+03A1 -->
-<!-- there is no Sigmaf, and no U+03A2 character either -->
-<!ENTITY Sigma "Σ" ><!-- greek capital letter sigma, U+03A3 ISOgrk3 -->
-<!ENTITY Tau "Τ" ><!-- greek capital letter tau, U+03A4 -->
-<!ENTITY Upsilon "Υ" ><!-- greek capital letter upsilon,
- U+03A5 ISOgrk3 -->
-<!ENTITY Phi "Φ" ><!-- greek capital letter phi, U+03A6 ISOgrk3 -->
-<!ENTITY Chi "Χ" ><!-- greek capital letter chi, U+03A7 -->
-<!ENTITY Psi "Ψ" ><!-- greek capital letter psi, U+03A8 ISOgrk3 -->
-<!ENTITY Omega "Ω" ><!-- greek capital letter omega, U+03A9 ISOgrk3 -->
-<!ENTITY alpha "α" ><!-- greek small letter alpha, U+03B1 ISOgrk3 -->
-<!ENTITY beta "β" ><!-- greek small letter beta, U+03B2 ISOgrk3 -->
-<!ENTITY gamma "γ" ><!-- greek small letter gamma, U+03B3 ISOgrk3 -->
-<!ENTITY delta "δ" ><!-- greek small letter delta, U+03B4 ISOgrk3 -->
-<!ENTITY epsilon "ε" ><!-- greek small letter epsilon, U+03B5 ISOgrk3 -->
-<!ENTITY zeta "ζ" ><!-- greek small letter zeta, U+03B6 ISOgrk3 -->
-<!ENTITY eta "η" ><!-- greek small letter eta, U+03B7 ISOgrk3 -->
-<!ENTITY theta "θ" ><!-- greek small letter theta, U+03B8 ISOgrk3 -->
-<!ENTITY iota "ι" ><!-- greek small letter iota, U+03B9 ISOgrk3 -->
-<!ENTITY kappa "κ" ><!-- greek small letter kappa, U+03BA ISOgrk3 -->
-<!ENTITY lambda "λ" ><!-- greek small letter lambda, U+03BB ISOgrk3 -->
-<!ENTITY mu "μ" ><!-- greek small letter mu, U+03BC ISOgrk3 -->
-<!ENTITY nu "ν" ><!-- greek small letter nu, U+03BD ISOgrk3 -->
-<!ENTITY xi "ξ" ><!-- greek small letter xi, U+03BE ISOgrk3 -->
-<!ENTITY omicron "ο" ><!-- greek small letter omicron, U+03BF NEW -->
-<!ENTITY pi "π" ><!-- greek small letter pi, U+03C0 ISOgrk3 -->
-<!ENTITY rho "ρ" ><!-- greek small letter rho, U+03C1 ISOgrk3 -->
-<!ENTITY sigmaf "ς" ><!-- greek small letter final sigma, U+03C2 ISOgrk3 -->
-<!ENTITY sigma "σ" ><!-- greek small letter sigma, U+03C3 ISOgrk3 -->
-<!ENTITY tau "τ" ><!-- greek small letter tau, U+03C4 ISOgrk3 -->
-<!ENTITY upsilon "υ" ><!-- greek small letter upsilon, U+03C5 ISOgrk3 -->
-<!ENTITY phi "φ" ><!-- greek small letter phi, U+03C6 ISOgrk3 -->
-<!ENTITY chi "χ" ><!-- greek small letter chi, U+03C7 ISOgrk3 -->
-<!ENTITY psi "ψ" ><!-- greek small letter psi, U+03C8 ISOgrk3 -->
-<!ENTITY omega "ω" ><!-- greek small letter omega, U+03C9 ISOgrk3 -->
-<!ENTITY thetasym "ϑ" ><!-- greek small letter theta symbol, U+03D1 NEW -->
-<!ENTITY upsih "ϒ" ><!-- greek upsilon with hook symbol, U+03D2 NEW -->
-<!ENTITY piv "ϖ" ><!-- greek pi symbol, U+03D6 ISOgrk3 -->
-
-<!-- General Punctuation -->
-<!ENTITY bull "•" ><!-- bullet = black small circle, U+2022 ISOpub -->
-<!-- bullet is NOT the same as bullet operator, U+2219 -->
-<!ENTITY hellip "…" ><!-- horizontal ellipsis = three dot leader, U+2026 ISOpub -->
-<!ENTITY prime "′" ><!-- prime = minutes = feet, U+2032 ISOtech -->
-<!ENTITY Prime "″" ><!-- double prime = seconds = inches, U+2033 ISOtech -->
-<!ENTITY oline "‾" ><!-- overline = spacing overscore, U+203E NEW -->
-<!ENTITY frasl "⁄" ><!-- fraction slash, U+2044 NEW -->
-
-<!-- Letterlike Symbols -->
-<!ENTITY weierp "℘" ><!-- script capital P = power set = Weierstrass p, U+2118 ISOamso -->
-<!ENTITY image "ℑ" ><!-- blackletter capital I = imaginary part, U+2111 ISOamso -->
-<!ENTITY real "ℜ" ><!-- blackletter capital R = real part symbol, U+211C ISOamso -->
-<!ENTITY trade "™" ><!-- trade mark sign, U+2122 ISOnum -->
-<!ENTITY alefsym "ℵ" ><!-- alef symbol = first transfinite cardinal, U+2135 NEW -->
-<!-- alef symbol is NOT the same as hebrew letter alef, U+05D0 although
- the same glyph could be used to depict both characters -->
-
-<!-- Arrows -->
-<!ENTITY larr "←" ><!-- leftwards arrow, U+2190 ISOnum -->
-<!ENTITY uarr "↑" ><!-- upwards arrow, U+2191 ISOnum-->
-<!ENTITY rarr "→" ><!-- rightwards arrow, U+2192 ISOnum -->
-<!ENTITY darr "↓" ><!-- downwards arrow, U+2193 ISOnum -->
-<!ENTITY harr "↔" ><!-- left right arrow, U+2194 ISOamsa -->
-<!ENTITY crarr "↵" ><!-- downwards arrow with corner leftwards
- = carriage return, U+21B5 NEW -->
-<!ENTITY lArr "⇐" ><!-- leftwards double arrow, U+21D0 ISOtech -->
-<!-- Unicode does not say that lArr is the same as the 'is implied by' arrow
- but also does not have any other character for that function. So ? lArr can
- be used for 'is implied by' as ISOtech suggests -->
-<!ENTITY uArr "⇑" ><!-- upwards double arrow, U+21D1 ISOamsa -->
-<!ENTITY rArr "⇒" ><!-- rightwards double arrow, U+21D2 ISOtech -->
-<!-- Unicode does not say this is the 'implies' character but does not have
- another character with this function so ?
- rArr can be used for 'implies' as ISOtech suggests -->
-<!ENTITY dArr "⇓" ><!-- downwards double arrow, U+21D3 ISOamsa -->
-<!ENTITY hArr "⇔" ><!-- left right double arrow, U+21D4 ISOamsa -->
-
-<!-- Mathematical Operators -->
-<!ENTITY forall "∀" ><!-- for all, U+2200 ISOtech -->
-<!ENTITY part "∂" ><!-- partial differential, U+2202 ISOtech -->
-<!ENTITY exist "∃" ><!-- there exists, U+2203 ISOtech -->
-<!ENTITY empty "∅" ><!-- empty set = null set, U+2205 ISOamso -->
-<!ENTITY nabla "∇" ><!-- nabla = backward difference, U+2207 ISOtech -->
-<!ENTITY isin "∈" ><!-- element of, U+2208 ISOtech -->
-<!ENTITY notin "∉" ><!-- not an element of, U+2209 ISOtech -->
-<!ENTITY ni "∋" ><!-- contains as member, U+220B ISOtech -->
-<!-- should there be a more memorable name than 'ni'? -->
-<!ENTITY prod "∏" ><!-- n-ary product = product sign, U+220F ISOamsb -->
-<!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though
- the same glyph might be used for both -->
-<!ENTITY sum "∑" ><!-- n-ary sumation, U+2211 ISOamsb -->
-<!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
- though the same glyph might be used for both -->
-<!ENTITY minus "−" ><!-- minus sign, U+2212 ISOtech -->
-<!ENTITY lowast "∗" ><!-- asterisk operator, U+2217 ISOtech -->
-<!ENTITY radic "√" ><!-- square root = radical sign, U+221A ISOtech -->
-<!ENTITY prop "∝" ><!-- proportional to, U+221D ISOtech -->
-<!ENTITY infin "∞" ><!-- infinity, U+221E ISOtech -->
-<!ENTITY ang "∠" ><!-- angle, U+2220 ISOamso -->
-<!ENTITY and "∧" ><!-- logical and = wedge, U+2227 ISOtech -->
-<!ENTITY or "∨" ><!-- logical or = vee, U+2228 ISOtech -->
-<!ENTITY cap "∩" ><!-- intersection = cap, U+2229 ISOtech -->
-<!ENTITY cup "∪" ><!-- union = cup, U+222A ISOtech -->
-<!ENTITY int "∫" ><!-- integral, U+222B ISOtech -->
-<!ENTITY there4 "∴" ><!-- therefore, U+2234 ISOtech -->
-<!ENTITY sim "∼" ><!-- tilde operator = varies with = similar to, U+223C ISOtech -->
-<!-- tilde operator is NOT the same character as the tilde, U+007E,
- although the same glyph might be used to represent both -->
-<!ENTITY cong "≅" ><!-- approximately equal to, U+2245 ISOtech -->
-<!ENTITY asymp "≈" ><!-- almost equal to = asymptotic to, U+2248 ISOamsr -->
-<!ENTITY ne "≠" ><!-- not equal to, U+2260 ISOtech -->
-<!ENTITY equiv "≡" ><!-- identical to, U+2261 ISOtech -->
-<!ENTITY le "≤" ><!-- less-than or equal to, U+2264 ISOtech -->
-<!ENTITY ge "≥" ><!-- greater-than or equal to, U+2265 ISOtech -->
-<!ENTITY sub "⊂" ><!-- subset of, U+2282 ISOtech -->
-<!ENTITY sup "⊃" ><!-- superset of, U+2283 ISOtech -->
-<!-- note that nsup, 'not a superset of, U+2283' is not covered by the Symbol
- font encoding and is not included. Should it be, for symmetry?
- It is in ISOamsn -->
-<!ENTITY nsub "⊄" ><!-- not a subset of, U+2284 ISOamsn -->
-<!ENTITY sube "⊆" ><!-- subset of or equal to, U+2286 ISOtech -->
-<!ENTITY supe "⊇" ><!-- superset of or equal to, U+2287 ISOtech -->
-<!ENTITY oplus "⊕" ><!-- circled plus = direct sum, U+2295 ISOamsb -->
-<!ENTITY otimes "⊗" ><!-- circled times = vector product, U+2297 ISOamsb -->
-<!ENTITY perp "⊥" ><!-- up tack = orthogonal to = perpendicular, U+22A5 ISOtech -->
-<!ENTITY sdot "⋅" ><!-- dot operator, U+22C5 ISOamsb -->
-<!-- dot operator is NOT the same character as U+00B7 middle dot -->
-
-<!-- Miscellaneous Technical -->
-<!ENTITY lceil "⌈" ><!-- left ceiling = apl upstile, U+2308 ISOamsc -->
-<!ENTITY rceil "⌉" ><!-- right ceiling, U+2309 ISOamsc -->
-<!ENTITY lfloor "⌊" ><!-- left floor = apl downstile, U+230A ISOamsc -->
-<!ENTITY rfloor "⌋" ><!-- right floor, U+230B ISOamsc -->
-<!ENTITY lang "〈" ><!-- left-pointing angle bracket = bra, U+2329 ISOtech -->
-<!-- lang is NOT the same character as U+003C 'less than'
- or U+2039 'single left-pointing angle quotation mark' -->
-<!ENTITY rang "〉" ><!-- right-pointing angle bracket = ket, U+232A ISOtech -->
-<!-- rang is NOT the same character as U+003E 'greater than'
- or U+203A 'single right-pointing angle quotation mark' -->
-
-<!-- Geometric Shapes -->
-<!ENTITY loz "◊" ><!-- lozenge, U+25CA ISOpub -->
-
-<!-- Miscellaneous Symbols -->
-<!ENTITY spades "♠" ><!-- black spade suit, U+2660 ISOpub -->
-<!-- black here seems to mean filled as opposed to hollow -->
-<!ENTITY clubs "♣" ><!-- black club suit = shamrock, U+2663 ISOpub -->
-<!ENTITY hearts "♥" ><!-- black heart suit = valentine, U+2665 ISOpub -->
-<!ENTITY diams "♦" ><!-- black diamond suit, U+2666 ISOpub -->
-
-<!-- end of xhtml-symbol.ent -->
-"""
- return text
-
-def get_apache_license():
- license = r"""/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-"""
- return license
-
-main()
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+
+# A simple python script to generate an HTML entity map and a regex alternation
+# for inclusion in HTMLStripCharFilter.jflex.
+
+def _append_wrapped(output_line, new_entry, indent=' ', width=80):
+    """Append new_entry to output_line, starting a new line when it gets long.
+
+    If output_line plus new_entry would reach `width` characters or more, the
+    line built so far is printed and a fresh line beginning with `indent` is
+    started. Returns the line that is still under construction.
+    """
+    if len(output_line) + len(new_entry) >= width:
+        print(output_line)
+        output_line = indent
+    return output_line + new_entry
+
+def main():
+    """Print the generated JFlex fragment to standard output.
+
+    Emits, in order: the Apache license header, a ``CharacterEntities``
+    alternation listing every entity name found in get_entity_text(), and a
+    ``%{ ... %}`` Java block that loads the name/value pairs into
+    ``entityValues``.
+    """
+    # Entity names whose all-upper-case spelling is accepted too. Kept in one
+    # place so the alternation and the emitted Java set cannot drift apart.
+    upper_case_variants = ('quot', 'copy', 'gt', 'lt', 'reg', 'amp')
+
+    print(get_apache_license())
+
+    # Maps entity name -> text that is placed inside a Java string literal.
+    codes = {}
+    regex = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')
+    for line in get_entity_text().split('\n'):
+        match = regex.match(line)
+        if not match:
+            continue
+        key = match.group(1)
+        if key == 'quot':
+            # A bare double quote would terminate the emitted Java literal.
+            codes[key] = r'\"'
+        elif key == 'nbsp':
+            # nbsp is emitted as ' ' instead of a \u escape.
+            codes[key] = ' '
+        else:
+            # Java unicode escape built from the decimal code point.
+            codes[key] = r'\u%04X' % int(match.group(2))
+
+    keys = sorted(codes)
+
+    # --- regex alternation of all entity names ---
+    output_line = 'CharacterEntities = ( '
+    for index, key in enumerate(keys):
+        # Only the very first alternative is written without a leading ' | '.
+        new_entry = ('"%s"' if index == 0 else ' | "%s"') % key
+        output_line = _append_wrapped(output_line, new_entry)
+        if key in upper_case_variants:
+            output_line = _append_wrapped(output_line, ' | "%s"' % key.upper())
+    print(output_line + ' )')
+
+    # --- Java code block holding the entity map ---
+    print('%{')
+    print(' private static final Set<String> upperCaseVariantsAccepted')
+    print(' = new HashSet<String>(Arrays.asList(%s));'
+          % ','.join('"%s"' % name for name in upper_case_variants))
+    print(' private static final CharArrayMap<Character> entityValues')
+    print(' = new CharArrayMap<Character>(Version.LUCENE_36, %i, false);' % len(keys))
+    print(' static {')
+    print(' String[] entities = {')
+    output_line = ' '
+    for key in keys:
+        output_line = _append_wrapped(output_line,
+                                      ' "%s", "%s",' % (key, codes[key]))
+    # Drop the trailing comma left by the last pair.
+    print(output_line[:-1])
+    print(' };')
+    print(' for (int i = 0 ; i < entities.length ; i += 2) {')
+    print(' Character value = entities[i + 1].charAt(0);')
+    print(' entityValues.put(entities[i], value);')
+    print(' if (upperCaseVariantsAccepted.contains(entities[i])) {')
+    print(' entityValues.put(entities[i].toUpperCase(), value);')
+    print(' }')
+    print(' }')
+    print(" }")
+    print("%}")
+
+def get_entity_text():
+    """Return the W3C entity declarations that main() parses.
+
+    The embedded text is headed "XHTML Character Entities" and contains the
+    Latin 1, Special and Mathematical/Greek/Symbolic entity sets. (The
+    original comment here attributed the text to
+    <http://www.w3.org/TR/REC-html40/sgml/entities.html>.)
+
+    Every entity value must stay a numeric character reference ("&#NNN;", or
+    "&#38;#NN;" for lt and amp): main() extracts the decimal code point from
+    that form, so a copy in which the references have been decoded to literal
+    characters yields almost no entities.
+    """
+    text = r"""
+F.1. XHTML Character Entities
+
+XHTML DTDs make available a standard collection of named character entities. Those entities are defined in this section.
+F.1.1. XHTML Latin 1 Character Entities
+
+You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-lat1.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent.
+
+<!-- ...................................................................... -->
+<!-- XML-compatible ISO Latin 1 Character Entity Set for XHTML ............ -->
+<!-- file: xhtml-lat1.ent
+
+ Typical invocation:
+
+ <!ENTITY % xhtml-lat1
+ PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
+ "xhtml-lat1.ent" >
+ %xhtml-lat1;
+
+ This DTD module is identified by the PUBLIC and SYSTEM identifiers:
+
+ PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
+ SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent"
+
+ Revision: $Id: xhtml-lat1.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
+
+ Portions (C) International Organization for Standardization 1986:
+ Permission to copy in any form is granted for use with conforming
+ SGML systems and applications as defined in ISO 8879, provided
+ this notice is included in all copies.
+-->
+
+<!ENTITY nbsp "&#160;" ><!-- no-break space = non-breaking space, U+00A0 ISOnum -->
+<!ENTITY iexcl "&#161;" ><!-- inverted exclamation mark, U+00A1 ISOnum -->
+<!ENTITY cent "&#162;" ><!-- cent sign, U+00A2 ISOnum -->
+<!ENTITY pound "&#163;" ><!-- pound sign, U+00A3 ISOnum -->
+<!ENTITY curren "&#164;" ><!-- currency sign, U+00A4 ISOnum -->
+<!ENTITY yen "&#165;" ><!-- yen sign = yuan sign, U+00A5 ISOnum -->
+<!ENTITY brvbar "&#166;" ><!-- broken bar = broken vertical bar, U+00A6 ISOnum -->
+<!ENTITY sect "&#167;" ><!-- section sign, U+00A7 ISOnum -->
+<!ENTITY uml "&#168;" ><!-- diaeresis = spacing diaeresis, U+00A8 ISOdia -->
+<!ENTITY copy "&#169;" ><!-- copyright sign, U+00A9 ISOnum -->
+<!ENTITY ordf "&#170;" ><!-- feminine ordinal indicator, U+00AA ISOnum -->
+<!ENTITY laquo "&#171;" ><!-- left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum -->
+<!ENTITY not "&#172;" ><!-- not sign, U+00AC ISOnum -->
+<!ENTITY shy "&#173;" ><!-- soft hyphen = discretionary hyphen, U+00AD ISOnum -->
+<!ENTITY reg "&#174;" ><!-- registered sign = registered trade mark sign, U+00AE ISOnum -->
+<!ENTITY macr "&#175;" ><!-- macron = spacing macron = overline = APL overbar, U+00AF ISOdia -->
+<!ENTITY deg "&#176;" ><!-- degree sign, U+00B0 ISOnum -->
+<!ENTITY plusmn "&#177;" ><!-- plus-minus sign = plus-or-minus sign, U+00B1 ISOnum -->
+<!ENTITY sup2 "&#178;" ><!-- superscript two = superscript digit two = squared, U+00B2 ISOnum -->
+<!ENTITY sup3 "&#179;" ><!-- superscript three = superscript digit three = cubed, U+00B3 ISOnum -->
+<!ENTITY acute "&#180;" ><!-- acute accent = spacing acute, U+00B4 ISOdia -->
+<!ENTITY micro "&#181;" ><!-- micro sign, U+00B5 ISOnum -->
+<!ENTITY para "&#182;" ><!-- pilcrow sign = paragraph sign, U+00B6 ISOnum -->
+<!ENTITY middot "&#183;" ><!-- middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum -->
+<!ENTITY cedil "&#184;" ><!-- cedilla = spacing cedilla, U+00B8 ISOdia -->
+<!ENTITY sup1 "&#185;" ><!-- superscript one = superscript digit one, U+00B9 ISOnum -->
+<!ENTITY ordm "&#186;" ><!-- masculine ordinal indicator, U+00BA ISOnum -->
+<!ENTITY raquo "&#187;" ><!-- right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum -->
+<!ENTITY frac14 "&#188;" ><!-- vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum -->
+<!ENTITY frac12 "&#189;" ><!-- vulgar fraction one half = fraction one half, U+00BD ISOnum -->
+<!ENTITY frac34 "&#190;" ><!-- vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum -->
+<!ENTITY iquest "&#191;" ><!-- inverted question mark = turned question mark, U+00BF ISOnum -->
+<!ENTITY Agrave "&#192;" ><!-- latin capital A with grave = latin capital A grave, U+00C0 ISOlat1 -->
+<!ENTITY Aacute "&#193;" ><!-- latin capital A with acute, U+00C1 ISOlat1 -->
+<!ENTITY Acirc "&#194;" ><!-- latin capital A with circumflex, U+00C2 ISOlat1 -->
+<!ENTITY Atilde "&#195;" ><!-- latin capital A with tilde, U+00C3 ISOlat1 -->
+<!ENTITY Auml "&#196;" ><!-- latin capital A with diaeresis, U+00C4 ISOlat1 -->
+<!ENTITY Aring "&#197;" ><!-- latin capital A with ring above = latin capital A ring, U+00C5 ISOlat1 -->
+<!ENTITY AElig "&#198;" ><!-- latin capital AE = latin capital ligature AE, U+00C6 ISOlat1 -->
+<!ENTITY Ccedil "&#199;" ><!-- latin capital C with cedilla, U+00C7 ISOlat1 -->
+<!ENTITY Egrave "&#200;" ><!-- latin capital E with grave, U+00C8 ISOlat1 -->
+<!ENTITY Eacute "&#201;" ><!-- latin capital E with acute, U+00C9 ISOlat1 -->
+<!ENTITY Ecirc "&#202;" ><!-- latin capital E with circumflex, U+00CA ISOlat1 -->
+<!ENTITY Euml "&#203;" ><!-- latin capital E with diaeresis, U+00CB ISOlat1 -->
+<!ENTITY Igrave "&#204;" ><!-- latin capital I with grave, U+00CC ISOlat1 -->
+<!ENTITY Iacute "&#205;" ><!-- latin capital I with acute, U+00CD ISOlat1 -->
+<!ENTITY Icirc "&#206;" ><!-- latin capital I with circumflex, U+00CE ISOlat1 -->
+<!ENTITY Iuml "&#207;" ><!-- latin capital I with diaeresis, U+00CF ISOlat1 -->
+<!ENTITY ETH "&#208;" ><!-- latin capital ETH, U+00D0 ISOlat1 -->
+<!ENTITY Ntilde "&#209;" ><!-- latin capital N with tilde, U+00D1 ISOlat1 -->
+<!ENTITY Ograve "&#210;" ><!-- latin capital O with grave, U+00D2 ISOlat1 -->
+<!ENTITY Oacute "&#211;" ><!-- latin capital O with acute, U+00D3 ISOlat1 -->
+<!ENTITY Ocirc "&#212;" ><!-- latin capital O with circumflex, U+00D4 ISOlat1 -->
+<!ENTITY Otilde "&#213;" ><!-- latin capital O with tilde, U+00D5 ISOlat1 -->
+<!ENTITY Ouml "&#214;" ><!-- latin capital O with diaeresis, U+00D6 ISOlat1 -->
+<!ENTITY times "&#215;" ><!-- multiplication sign, U+00D7 ISOnum -->
+<!ENTITY Oslash "&#216;" ><!-- latin capital O with stroke = latin capital O slash, U+00D8 ISOlat1 -->
+<!ENTITY Ugrave "&#217;" ><!-- latin capital U with grave, U+00D9 ISOlat1 -->
+<!ENTITY Uacute "&#218;" ><!-- latin capital U with acute, U+00DA ISOlat1 -->
+<!ENTITY Ucirc "&#219;" ><!-- latin capital U with circumflex, U+00DB ISOlat1 -->
+<!ENTITY Uuml "&#220;" ><!-- latin capital U with diaeresis, U+00DC ISOlat1 -->
+<!ENTITY Yacute "&#221;" ><!-- latin capital Y with acute, U+00DD ISOlat1 -->
+<!ENTITY THORN "&#222;" ><!-- latin capital THORN, U+00DE ISOlat1 -->
+<!ENTITY szlig "&#223;" ><!-- latin small sharp s = ess-zed, U+00DF ISOlat1 -->
+<!ENTITY agrave "&#224;" ><!-- latin small a with grave = latin small a grave, U+00E0 ISOlat1 -->
+<!ENTITY aacute "&#225;" ><!-- latin small a with acute, U+00E1 ISOlat1 -->
+<!ENTITY acirc "&#226;" ><!-- latin small a with circumflex, U+00E2 ISOlat1 -->
+<!ENTITY atilde "&#227;" ><!-- latin small a with tilde, U+00E3 ISOlat1 -->
+<!ENTITY auml "&#228;" ><!-- latin small a with diaeresis, U+00E4 ISOlat1 -->
+<!ENTITY aring "&#229;" ><!-- latin small a with ring above = latin small a ring, U+00E5 ISOlat1 -->
+<!ENTITY aelig "&#230;" ><!-- latin small ae = latin small ligature ae, U+00E6 ISOlat1 -->
+<!ENTITY ccedil "&#231;" ><!-- latin small c with cedilla, U+00E7 ISOlat1 -->
+<!ENTITY egrave "&#232;" ><!-- latin small e with grave, U+00E8 ISOlat1 -->
+<!ENTITY eacute "&#233;" ><!-- latin small e with acute, U+00E9 ISOlat1 -->
+<!ENTITY ecirc "&#234;" ><!-- latin small e with circumflex, U+00EA ISOlat1 -->
+<!ENTITY euml "&#235;" ><!-- latin small e with diaeresis, U+00EB ISOlat1 -->
+<!ENTITY igrave "&#236;" ><!-- latin small i with grave, U+00EC ISOlat1 -->
+<!ENTITY iacute "&#237;" ><!-- latin small i with acute, U+00ED ISOlat1 -->
+<!ENTITY icirc "&#238;" ><!-- latin small i with circumflex, U+00EE ISOlat1 -->
+<!ENTITY iuml "&#239;" ><!-- latin small i with diaeresis, U+00EF ISOlat1 -->
+<!ENTITY eth "&#240;" ><!-- latin small eth, U+00F0 ISOlat1 -->
+<!ENTITY ntilde "&#241;" ><!-- latin small n with tilde, U+00F1 ISOlat1 -->
+<!ENTITY ograve "&#242;" ><!-- latin small o with grave, U+00F2 ISOlat1 -->
+<!ENTITY oacute "&#243;" ><!-- latin small o with acute, U+00F3 ISOlat1 -->
+<!ENTITY ocirc "&#244;" ><!-- latin small o with circumflex, U+00F4 ISOlat1 -->
+<!ENTITY otilde "&#245;" ><!-- latin small o with tilde, U+00F5 ISOlat1 -->
+<!ENTITY ouml "&#246;" ><!-- latin small o with diaeresis, U+00F6 ISOlat1 -->
+<!ENTITY divide "&#247;" ><!-- division sign, U+00F7 ISOnum -->
+<!ENTITY oslash "&#248;" ><!-- latin small o with stroke, = latin small o slash, U+00F8 ISOlat1 -->
+<!ENTITY ugrave "&#249;" ><!-- latin small u with grave, U+00F9 ISOlat1 -->
+<!ENTITY uacute "&#250;" ><!-- latin small u with acute, U+00FA ISOlat1 -->
+<!ENTITY ucirc "&#251;" ><!-- latin small u with circumflex, U+00FB ISOlat1 -->
+<!ENTITY uuml "&#252;" ><!-- latin small u with diaeresis, U+00FC ISOlat1 -->
+<!ENTITY yacute "&#253;" ><!-- latin small y with acute, U+00FD ISOlat1 -->
+<!ENTITY thorn "&#254;" ><!-- latin small thorn with, U+00FE ISOlat1 -->
+<!ENTITY yuml "&#255;" ><!-- latin small y with diaeresis, U+00FF ISOlat1 -->
+<!-- end of xhtml-lat1.ent -->
+
+F.1.2. XHTML Special Characters
+
+You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-special.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-special.ent.
+
+<!-- ...................................................................... -->
+<!-- XML-compatible ISO Special Character Entity Set for XHTML ............ -->
+<!-- file: xhtml-special.ent
+
+ Typical invocation:
+
+ <!ENTITY % xhtml-special
+ PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
+ "xhtml-special.ent" >
+ %xhtml-special;
+
+ This DTD module is identified by the PUBLIC and SYSTEM identifiers:
+
+ PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
+ SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-special.ent"
+
+ Revision: $Id: xhtml-special.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
+
+ Portions (C) International Organization for Standardization 1986:
+ Permission to copy in any form is granted for use with conforming
+ SGML systems and applications as defined in ISO 8879, provided
+ this notice is included in all copies.
+
+ Revisions:
+2000-10-28: added &apos; and altered XML Predefined Entities for compatibility
+-->
+
+<!-- Relevant ISO entity set is given unless names are newly introduced.
+ New names (i.e., not in ISO 8879 [SGML] list) do not clash with
+ any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
+ numbers are given for each character, in hex. Entity values are
+ decimal conversions of the ISO 10646 values and refer to the
+ document character set. Names are Unicode [UNICODE] names.
+-->
+
+<!-- C0 Controls and Basic Latin -->
+<!ENTITY lt "&#38;#60;" ><!-- less-than sign, U+003C ISOnum -->
+<!ENTITY gt "&#62;" ><!-- greater-than sign, U+003E ISOnum -->
+<!ENTITY amp "&#38;#38;" ><!-- ampersand, U+0026 ISOnum -->
+<!ENTITY apos "&#39;" ><!-- The Apostrophe (Apostrophe Quote, APL Quote), U+0027 ISOnum -->
+<!ENTITY quot "&#34;" ><!-- quotation mark (Quote Double), U+0022 ISOnum -->
+
+<!-- Latin Extended-A -->
+<!ENTITY OElig "&#338;" ><!-- latin capital ligature OE, U+0152 ISOlat2 -->
+<!ENTITY oelig "&#339;" ><!-- latin small ligature oe, U+0153 ISOlat2 -->
+
+<!-- ligature is a misnomer, this is a separate character in some languages -->
+<!ENTITY Scaron "&#352;" ><!-- latin capital letter S with caron, U+0160 ISOlat2 -->
+<!ENTITY scaron "&#353;" ><!-- latin small letter s with caron, U+0161 ISOlat2 -->
+<!ENTITY Yuml "&#376;" ><!-- latin capital letter Y with diaeresis, U+0178 ISOlat2 -->
+
+<!-- Spacing Modifier Letters -->
+<!ENTITY circ "&#710;" ><!-- modifier letter circumflex accent, U+02C6 ISOpub -->
+<!ENTITY tilde "&#732;" ><!-- small tilde, U+02DC ISOdia -->
+
+<!-- General Punctuation -->
+<!ENTITY ensp "&#8194;" ><!-- en space, U+2002 ISOpub -->
+<!ENTITY emsp "&#8195;" ><!-- em space, U+2003 ISOpub -->
+<!ENTITY thinsp "&#8201;" ><!-- thin space, U+2009 ISOpub -->
+<!ENTITY zwnj "&#8204;" ><!-- zero width non-joiner, U+200C NEW RFC 2070 -->
+<!ENTITY zwj "&#8205;" ><!-- zero width joiner, U+200D NEW RFC 2070 -->
+<!ENTITY lrm "&#8206;" ><!-- left-to-right mark, U+200E NEW RFC 2070 -->
+<!ENTITY rlm "&#8207;" ><!-- right-to-left mark, U+200F NEW RFC 2070 -->
+<!ENTITY ndash "&#8211;" ><!-- en dash, U+2013 ISOpub -->
+<!ENTITY mdash "&#8212;" ><!-- em dash, U+2014 ISOpub -->
+<!ENTITY lsquo "&#8216;" ><!-- left single quotation mark, U+2018 ISOnum -->
+<!ENTITY rsquo "&#8217;" ><!-- right single quotation mark, U+2019 ISOnum -->
+<!ENTITY sbquo "&#8218;" ><!-- single low-9 quotation mark, U+201A NEW -->
+<!ENTITY ldquo "&#8220;" ><!-- left double quotation mark, U+201C ISOnum -->
+<!ENTITY rdquo "&#8221;" ><!-- right double quotation mark, U+201D ISOnum -->
+<!ENTITY bdquo "&#8222;" ><!-- double low-9 quotation mark, U+201E NEW -->
+<!ENTITY dagger "&#8224;" ><!-- dagger, U+2020 ISOpub -->
+<!ENTITY Dagger "&#8225;" ><!-- double dagger, U+2021 ISOpub -->
+<!ENTITY permil "&#8240;" ><!-- per mille sign, U+2030 ISOtech -->
+
+<!-- lsaquo is proposed but not yet ISO standardized -->
+<!ENTITY lsaquo "&#8249;" ><!-- single left-pointing angle quotation mark, U+2039 ISO proposed -->
+<!-- rsaquo is proposed but not yet ISO standardized -->
+<!ENTITY rsaquo "&#8250;" ><!-- single right-pointing angle quotation mark, U+203A ISO proposed -->
+<!ENTITY euro "&#8364;" ><!-- euro sign, U+20AC NEW -->
+
+<!-- end of xhtml-special.ent -->
+
+F.1.3. XHTML Mathematical, Greek, and Symbolic Characters
+
+You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-symbol.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent.
+
+<!-- ...................................................................... -->
+<!-- ISO Math, Greek and Symbolic Character Entity Set for XHTML .......... -->
+<!-- file: xhtml-symbol.ent
+
+ Typical invocation:
+
+ <!ENTITY % xhtml-symbol
+ PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
+ "xhtml-symbol.ent" >
+ %xhtml-symbol;
+
+ This DTD module is identified by the PUBLIC and SYSTEM identifiers:
+
+ PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
+ SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent"
+
+ Revision: $Id: xhtml-symbol.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
+
+ Portions (C) International Organization for Standardization 1986:
+ Permission to copy in any form is granted for use with conforming
+ SGML systems and applications as defined in ISO 8879, provided
+ this notice is included in all copies.
+-->
+
+<!-- Relevant ISO entity set is given unless names are newly introduced.
+ New names (i.e., not in ISO 8879 [SGML] list) do not clash with
+ any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
+ numbers are given for each character, in hex. Entity values are
+ decimal conversions of the ISO 10646 values and refer to the
+ document character set. Names are Unicode [UNICODE] names.
+-->
+
+<!-- Latin Extended-B -->
+<!ENTITY fnof "&#402;" ><!-- latin small f with hook = function
+ = florin, U+0192 ISOtech -->
+
+<!-- Greek -->
+<!ENTITY Alpha "&#913;" ><!-- greek capital letter alpha, U+0391 -->
+<!ENTITY Beta "&#914;" ><!-- greek capital letter beta, U+0392 -->
+<!ENTITY Gamma "&#915;" ><!-- greek capital letter gamma, U+0393 ISOgrk3 -->
+<!ENTITY Delta "&#916;" ><!-- greek capital letter delta, U+0394 ISOgrk3 -->
+<!ENTITY Epsilon "&#917;" ><!-- greek capital letter epsilon, U+0395 -->
+<!ENTITY Zeta "&#918;" ><!-- greek capital letter zeta, U+0396 -->
+<!ENTITY Eta "&#919;" ><!-- greek capital letter eta, U+0397 -->
+<!ENTITY Theta "&#920;" ><!-- greek capital letter theta, U+0398 ISOgrk3 -->
+<!ENTITY Iota "&#921;" ><!-- greek capital letter iota, U+0399 -->
+<!ENTITY Kappa "&#922;" ><!-- greek capital letter kappa, U+039A -->
+<!ENTITY Lambda "&#923;" ><!-- greek capital letter lambda, U+039B ISOgrk3 -->
+<!ENTITY Mu "&#924;" ><!-- greek capital letter mu, U+039C -->
+<!ENTITY Nu "&#925;" ><!-- greek capital letter nu, U+039D -->
+<!ENTITY Xi "&#926;" ><!-- greek capital letter xi, U+039E ISOgrk3 -->
+<!ENTITY Omicron "&#927;" ><!-- greek capital letter omicron, U+039F -->
+<!ENTITY Pi "&#928;" ><!-- greek capital letter pi, U+03A0 ISOgrk3 -->
+<!ENTITY Rho "&#929;" ><!-- greek capital letter rho, U+03A1 -->
+<!-- there is no Sigmaf, and no U+03A2 character either -->
+<!ENTITY Sigma "&#931;" ><!-- greek capital letter sigma, U+03A3 ISOgrk3 -->
+<!ENTITY Tau "&#932;" ><!-- greek capital letter tau, U+03A4 -->
+<!ENTITY Upsilon "&#933;" ><!-- greek capital letter upsilon,
+ U+03A5 ISOgrk3 -->
+<!ENTITY Phi "&#934;" ><!-- greek capital letter phi, U+03A6 ISOgrk3 -->
+<!ENTITY Chi "&#935;" ><!-- greek capital letter chi, U+03A7 -->
+<!ENTITY Psi "&#936;" ><!-- greek capital letter psi, U+03A8 ISOgrk3 -->
+<!ENTITY Omega "&#937;" ><!-- greek capital letter omega, U+03A9 ISOgrk3 -->
+<!ENTITY alpha "&#945;" ><!-- greek small letter alpha, U+03B1 ISOgrk3 -->
+<!ENTITY beta "&#946;" ><!-- greek small letter beta, U+03B2 ISOgrk3 -->
+<!ENTITY gamma "&#947;" ><!-- greek small letter gamma, U+03B3 ISOgrk3 -->
+<!ENTITY delta "&#948;" ><!-- greek small letter delta, U+03B4 ISOgrk3 -->
+<!ENTITY epsilon "&#949;" ><!-- greek small letter epsilon, U+03B5 ISOgrk3 -->
+<!ENTITY zeta "&#950;" ><!-- greek small letter zeta, U+03B6 ISOgrk3 -->
+<!ENTITY eta "&#951;" ><!-- greek small letter eta, U+03B7 ISOgrk3 -->
+<!ENTITY theta "&#952;" ><!-- greek small letter theta, U+03B8 ISOgrk3 -->
+<!ENTITY iota "&#953;" ><!-- greek small letter iota, U+03B9 ISOgrk3 -->
+<!ENTITY kappa "&#954;" ><!-- greek small letter kappa, U+03BA ISOgrk3 -->
+<!ENTITY lambda "&#955;" ><!-- greek small letter lambda, U+03BB ISOgrk3 -->
+<!ENTITY mu "&#956;" ><!-- greek small letter mu, U+03BC ISOgrk3 -->
+<!ENTITY nu "&#957;" ><!-- greek small letter nu, U+03BD ISOgrk3 -->
+<!ENTITY xi "&#958;" ><!-- greek small letter xi, U+03BE ISOgrk3 -->
+<!ENTITY omicron "&#959;" ><!-- greek small letter omicron, U+03BF NEW -->
+<!ENTITY pi "&#960;" ><!-- greek small letter pi, U+03C0 ISOgrk3 -->
+<!ENTITY rho "&#961;" ><!-- greek small letter rho, U+03C1 ISOgrk3 -->
+<!ENTITY sigmaf "&#962;" ><!-- greek small letter final sigma, U+03C2 ISOgrk3 -->
+<!ENTITY sigma "&#963;" ><!-- greek small letter sigma, U+03C3 ISOgrk3 -->
+<!ENTITY tau "&#964;" ><!-- greek small letter tau, U+03C4 ISOgrk3 -->
+<!ENTITY upsilon "&#965;" ><!-- greek small letter upsilon, U+03C5 ISOgrk3 -->
+<!ENTITY phi "&#966;" ><!-- greek small letter phi, U+03C6 ISOgrk3 -->
+<!ENTITY chi "&#967;" ><!-- greek small letter chi, U+03C7 ISOgrk3 -->
+<!ENTITY psi "&#968;" ><!-- greek small letter psi, U+03C8 ISOgrk3 -->
+<!ENTITY omega "&#969;" ><!-- greek small letter omega, U+03C9 ISOgrk3 -->
+<!ENTITY thetasym "&#977;" ><!-- greek small letter theta symbol, U+03D1 NEW -->
+<!ENTITY upsih "&#978;" ><!-- greek upsilon with hook symbol, U+03D2 NEW -->
+<!ENTITY piv "&#982;" ><!-- greek pi symbol, U+03D6 ISOgrk3 -->
+
+<!-- General Punctuation -->
+<!ENTITY bull "&#8226;" ><!-- bullet = black small circle, U+2022 ISOpub -->
+<!-- bullet is NOT the same as bullet operator, U+2219 -->
+<!ENTITY hellip "&#8230;" ><!-- horizontal ellipsis = three dot leader, U+2026 ISOpub -->
+<!ENTITY prime "&#8242;" ><!-- prime = minutes = feet, U+2032 ISOtech -->
+<!ENTITY Prime "&#8243;" ><!-- double prime = seconds = inches, U+2033 ISOtech -->
+<!ENTITY oline "&#8254;" ><!-- overline = spacing overscore, U+203E NEW -->
+<!ENTITY frasl "&#8260;" ><!-- fraction slash, U+2044 NEW -->
+
+<!-- Letterlike Symbols -->
+<!ENTITY weierp "&#8472;" ><!-- script capital P = power set = Weierstrass p, U+2118 ISOamso -->
+<!ENTITY image "&#8465;" ><!-- blackletter capital I = imaginary part, U+2111 ISOamso -->
+<!ENTITY real "&#8476;" ><!-- blackletter capital R = real part symbol, U+211C ISOamso -->
+<!ENTITY trade "&#8482;" ><!-- trade mark sign, U+2122 ISOnum -->
+<!ENTITY alefsym "&#8501;" ><!-- alef symbol = first transfinite cardinal, U+2135 NEW -->
+<!-- alef symbol is NOT the same as hebrew letter alef, U+05D0 although
+ the same glyph could be used to depict both characters -->
+
+<!-- Arrows -->
+<!ENTITY larr "&#8592;" ><!-- leftwards arrow, U+2190 ISOnum -->
+<!ENTITY uarr "&#8593;" ><!-- upwards arrow, U+2191 ISOnum-->
+<!ENTITY rarr "&#8594;" ><!-- rightwards arrow, U+2192 ISOnum -->
+<!ENTITY darr "&#8595;" ><!-- downwards arrow, U+2193 ISOnum -->
+<!ENTITY harr "&#8596;" ><!-- left right arrow, U+2194 ISOamsa -->
+<!ENTITY crarr "&#8629;" ><!-- downwards arrow with corner leftwards
+ = carriage return, U+21B5 NEW -->
+<!ENTITY lArr "&#8656;" ><!-- leftwards double arrow, U+21D0 ISOtech -->
+<!-- Unicode does not say that lArr is the same as the 'is implied by' arrow
+ but also does not have any other character for that function. So ? lArr can
+ be used for 'is implied by' as ISOtech suggests -->
+<!ENTITY uArr "&#8657;" ><!-- upwards double arrow, U+21D1 ISOamsa -->
+<!ENTITY rArr "&#8658;" ><!-- rightwards double arrow, U+21D2 ISOtech -->
+<!-- Unicode does not say this is the 'implies' character but does not have
+ another character with this function so ?
+ rArr can be used for 'implies' as ISOtech suggests -->
+<!ENTITY dArr "&#8659;" ><!-- downwards double arrow, U+21D3 ISOamsa -->
+<!ENTITY hArr "&#8660;" ><!-- left right double arrow, U+21D4 ISOamsa -->
+
+<!-- Mathematical Operators -->
+<!ENTITY forall "&#8704;" ><!-- for all, U+2200 ISOtech -->
+<!ENTITY part "&#8706;" ><!-- partial differential, U+2202 ISOtech -->
+<!ENTITY exist "&#8707;" ><!-- there exists, U+2203 ISOtech -->
+<!ENTITY empty "&#8709;" ><!-- empty set = null set, U+2205 ISOamso -->
+<!ENTITY nabla "&#8711;" ><!-- nabla = backward difference, U+2207 ISOtech -->
+<!ENTITY isin "&#8712;" ><!-- element of, U+2208 ISOtech -->
+<!ENTITY notin "&#8713;" ><!-- not an element of, U+2209 ISOtech -->
+<!ENTITY ni "&#8715;" ><!-- contains as member, U+220B ISOtech -->
+<!-- should there be a more memorable name than 'ni'? -->
+<!ENTITY prod "&#8719;" ><!-- n-ary product = product sign, U+220F ISOamsb -->
+<!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though
+ the same glyph might be used for both -->
+<!ENTITY sum "&#8721;" ><!-- n-ary sumation, U+2211 ISOamsb -->
+<!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
+ though the same glyph might be used for both -->
+<!ENTITY minus "&#8722;" ><!-- minus sign, U+2212 ISOtech -->
+<!ENTITY lowast "&#8727;" ><!-- asterisk operator, U+2217 ISOtech -->
+<!ENTITY radic "&#8730;" ><!-- square root = radical sign, U+221A ISOtech -->
+<!ENTITY prop "&#8733;" ><!-- proportional to, U+221D ISOtech -->
+<!ENTITY infin "&#8734;" ><!-- infinity, U+221E ISOtech -->
+<!ENTITY ang "&#8736;" ><!-- angle, U+2220 ISOamso -->
+<!ENTITY and "&#8743;" ><!-- logical and = wedge, U+2227 ISOtech -->
+<!ENTITY or "&#8744;" ><!-- logical or = vee, U+2228 ISOtech -->
+<!ENTITY cap "&#8745;" ><!-- intersection = cap, U+2229 ISOtech -->
+<!ENTITY cup "&#8746;" ><!-- union = cup, U+222A ISOtech -->
+<!ENTITY int "&#8747;" ><!-- integral, U+222B ISOtech -->
+<!ENTITY there4 "&#8756;" ><!-- therefore, U+2234 ISOtech -->
+<!ENTITY sim "&#8764;" ><!-- tilde operator = varies with = similar to, U+223C ISOtech -->
+<!-- tilde operator is NOT the same character as the tilde, U+007E,
+ although the same glyph might be used to represent both -->
+<!ENTITY cong "&#8773;" ><!-- approximately equal to, U+2245 ISOtech -->
+<!ENTITY asymp "&#8776;" ><!-- almost equal to = asymptotic to, U+2248 ISOamsr -->
+<!ENTITY ne "&#8800;" ><!-- not equal to, U+2260 ISOtech -->
+<!ENTITY equiv "&#8801;" ><!-- identical to, U+2261 ISOtech -->
+<!ENTITY le "&#8804;" ><!-- less-than or equal to, U+2264 ISOtech -->
+<!ENTITY ge "&#8805;" ><!-- greater-than or equal to, U+2265 ISOtech -->
+<!ENTITY sub "&#8834;" ><!-- subset of, U+2282 ISOtech -->
+<!ENTITY sup "&#8835;" ><!-- superset of, U+2283 ISOtech -->
+<!-- note that nsup, 'not a superset of, U+2283' is not covered by the Symbol
+ font encoding and is not included. Should it be, for symmetry?
+ It is in ISOamsn -->
+<!ENTITY nsub "&#8836;" ><!-- not a subset of, U+2284 ISOamsn -->
+<!ENTITY sube "&#8838;" ><!-- subset of or equal to, U+2286 ISOtech -->
+<!ENTITY supe "&#8839;" ><!-- superset of or equal to, U+2287 ISOtech -->
+<!ENTITY oplus "&#8853;" ><!-- circled plus = direct sum, U+2295 ISOamsb -->
+<!ENTITY otimes "&#8855;" ><!-- circled times = vector product, U+2297 ISOamsb -->
+<!ENTITY perp "&#8869;" ><!-- up tack = orthogonal to = perpendicular, U+22A5 ISOtech -->
+<!ENTITY sdot "&#8901;" ><!-- dot operator, U+22C5 ISOamsb -->
+<!-- dot operator is NOT the same character as U+00B7 middle dot -->
+
+<!-- Miscellaneous Technical -->
+<!ENTITY lceil "&#8968;" ><!-- left ceiling = apl upstile, U+2308 ISOamsc -->
+<!ENTITY rceil "&#8969;" ><!-- right ceiling, U+2309 ISOamsc -->
+<!ENTITY lfloor "&#8970;" ><!-- left floor = apl downstile, U+230A ISOamsc -->
+<!ENTITY rfloor "&#8971;" ><!-- right floor, U+230B ISOamsc -->
+<!ENTITY lang "&#9001;" ><!-- left-pointing angle bracket = bra, U+2329 ISOtech -->
+<!-- lang is NOT the same character as U+003C 'less than'
+ or U+2039 'single left-pointing angle quotation mark' -->
+<!ENTITY rang "&#9002;" ><!-- right-pointing angle bracket = ket, U+232A ISOtech -->
+<!-- rang is NOT the same character as U+003E 'greater than'
+ or U+203A 'single right-pointing angle quotation mark' -->
+
+<!-- Geometric Shapes -->
+<!ENTITY loz "&#9674;" ><!-- lozenge, U+25CA ISOpub -->
+
+<!-- Miscellaneous Symbols -->
+<!ENTITY spades "&#9824;" ><!-- black spade suit, U+2660 ISOpub -->
+<!-- black here seems to mean filled as opposed to hollow -->
+<!ENTITY clubs "&#9827;" ><!-- black club suit = shamrock, U+2663 ISOpub -->
+<!ENTITY hearts "&#9829;" ><!-- black heart suit = valentine, U+2665 ISOpub -->
+<!ENTITY diams "&#9830;" ><!-- black diamond suit, U+2666 ISOpub -->
+
+<!-- end of xhtml-symbol.ent -->
+"""
+    return text
+
+def get_apache_license():
+    """Return the Apache License 2.0 notice as a Java-style block comment.
+
+    The returned text opens with ``/**``, closes with `` */`` and is followed
+    by one empty line, so it can be printed directly ahead of generated code.
+    """
+    return r"""/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+"""
+
+# Entry point: the generator runs as soon as this file is executed. There is
+# no __name__ guard, so importing the module also triggers it.
+main()