You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2012/01/24 16:51:57 UTC
svn commit: r1235308 [3/5] - in /lucene/dev/branches/branch_3x: lucene/
lucene/contrib/analyzers/common/
lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/
lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis...
Added: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex?rev=1235308&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex (added)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex Tue Jan 24 15:51:55 2012
@@ -0,0 +1,875 @@
+package org.apache.lucene.analysis.charfilter;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.util.Version;
+import org.apache.lucene.analysis.BaseCharFilter;
+import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.CharArrayMap;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.util.OpenStringBuilder;
+
+
+/**
+ * A CharFilter that wraps another Reader and attempts to strip out HTML constructs.
+ */
+@SuppressWarnings("fallthrough")
+%%
+
+%unicode 6.0
+%apiprivate
+%type int
+%final
+%public
+%char
+%function nextChar
+%class HTMLStripCharFilter
+%extends BaseCharFilter
+%xstate AMPERSAND, NUMERIC_CHARACTER, CHARACTER_REFERENCE_TAIL
+%xstate LEFT_ANGLE_BRACKET, BANG, COMMENT, SCRIPT, SCRIPT_COMMENT
+%xstate LEFT_ANGLE_BRACKET_SLASH, LEFT_ANGLE_BRACKET_SPACE, CDATA
+%xstate SERVER_SIDE_INCLUDE, SINGLE_QUOTED_STRING, DOUBLE_QUOTED_STRING
+%xstate END_TAG_TAIL_INCLUDE, END_TAG_TAIL_EXCLUDE, END_TAG_TAIL_SUBSTITUTE
+%xstate START_TAG_TAIL_INCLUDE, START_TAG_TAIL_EXCLUDE, START_TAG_TAIL_SUBSTITUTE
+%xstate STYLE, STYLE_COMMENT
+
+// From XML 1.0 <http://www.w3.org/TR/xml/>:
+//
+// [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [...]
+// [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | [...]
+// [5] Name ::= NameStartChar (NameChar)*
+//
+// From UAX #31: Unicode Identifier and Pattern Syntax
+// <http://unicode.org/reports/tr31/>:
+//
+// D1. Default Identifier Syntax
+//
+// <identifier> := <ID_Start> <ID_Continue>*
+//
+// Name: one start character followed by any number of continue characters.
+// ':' and '_' are accepted as start characters; '-', '.', ':' and '_' as
+// continue characters. {ID_Start_Supp} and {ID_Continue_Supp} are not defined
+// in this file; presumably they come from the SUPPLEMENTARY macro file
+// included further down - verify.
+Name = ( ( [:_\p{ID_Start}] | {ID_Start_Supp} ) ( [-.:_\p{ID_Continue}] | {ID_Continue_Supp} )* )
+
+// From Apache httpd mod_include documentation
+// <http://httpd.apache.org/docs/current/mod/mod_include.html>:
+//
+// Basic Elements
+//
+// The document is parsed as an HTML document, with special commands
+// embedded as SGML comments. A command has the syntax:
+//
+// <!--#element attribute=value attribute=value ... -->
+//
+// The value will often be enclosed in double quotes, but single quotes (')
+// and backticks (`) are also possible. Many commands only allow a single
+// attribute-value pair. Note that the comment terminator (-->) should be
+// preceded by whitespace to ensure that it isn't considered part of an SSI
+// token. Note that the leading <!--# is one token and may not contain any
+// whitespaces.
+//
+
+// Case-insensitive event handler attribute names without their leading "on"
+// (the "on" prefix is matched by the EventAttribute macro).
+EventAttributeSuffixes = ( [aA][bB][oO][rR][tT] |
+ [bB][lL][uU][rR] |
+ [cC][hH][aA][nN][gG][eE] |
+ [cC][lL][iI][cC][kK] |
+ [dD][bB][lL][cC][lL][iI][cC][kK] |
+ [eE][rR][rR][oO][rR] |
+ [fF][oO][cC][uU][sS] |
+ [kK][eE][yY][dD][oO][wW][nN] |
+ [kK][eE][yY][pP][rR][eE][sS][sS] |
+ [kK][eE][yY][uU][pP] |
+ [lL][oO][aA][dD] |
+ [mM][oO][uU][sS][eE][dD][oO][wW][nN] |
+ [mM][oO][uU][sS][eE][mM][oO][vV][eE] |
+ [mM][oO][uU][sS][eE][oO][uU][tT] |
+ [mM][oO][uU][sS][eE][oO][vV][eE][rR] |
+ [mM][oO][uU][sS][eE][uU][pP] |
+ [rR][eE][sS][eE][tT] |
+ [sS][eE][lL][eE][cC][tT] |
+ [sS][uU][bB][mM][iI][tT] |
+ [uU][nN][lL][oO][aA][dD] )
+
+// A quoted string, delimiters included; a backslash-escaped quote is accepted
+// inside the string.
+SingleQuoted = ( "'" ( "\\'" | [^']* )* "'" )
+DoubleQuoted = ( "\"" ( "\\\"" | [^\"]* )* "\"" )
+// "<!--#" ... "-->", where quoted strings are consumed as a unit.
+ServerSideInclude = ( "<!--#" ( [^'\"] | {SingleQuoted} | {DoubleQuoted} )* "-->" )
+// on<event> = <quoted value>, with optional whitespace around the '='.
+EventAttribute = [oO][nN] {EventAttributeSuffixes} \s* "=" \s* ( {SingleQuoted} | {DoubleQuoted} )
+// What may follow a tag name inside a start tag: any characters other than
+// angle brackets, plus whole event attributes and server-side includes
+// (whose quoted values are allowed to contain angle brackets).
+OpenTagContent = ( {EventAttribute} | [^<>] | {ServerSideInclude} )*
+
+// Case-insensitive element names. The rules that match this macro remove the
+// tag without emitting a replacement character (the *_TAIL_EXCLUDE states),
+// whereas other {Name} matches are replaced by a newline (the
+// *_TAIL_SUBSTITUTE states).
+// NOTE(review): the macro name is misspelled ("Elment"); renaming it requires
+// updating every rule that references it.
+InlineElment = ( [aAbBiIqQsSuU] |
+ [aA][bB][bB][rR] |
+ [aA][cC][rR][oO][nN][yY][mM] |
+ [bB][aA][sS][eE][fF][oO][nN][tT] |
+ [bB][dD][oO] |
+ [bB][iI][gG] |
+ [cC][iI][tT][eE] |
+ [cC][oO][dD][eE] |
+ [dD][fF][nN] |
+ [eE][mM] |
+ [fF][oO][nN][tT] |
+ [iI][mM][gG] |
+ [iI][nN][pP][uU][tT] |
+ [kK][bB][dD] |
+ [lL][aA][bB][eE][lL] |
+ [sS][aA][mM][pP] |
+ [sS][eE][lL][eE][cC][tT] |
+ [sS][mM][aA][lL][lL] |
+ [sS][pP][aA][nN] |
+ [sS][tT][rR][iI][kK][eE] |
+ [sS][tT][rR][oO][nN][gG] |
+ [sS][uU][bB] |
+ [sS][uU][pP] |
+ [tT][eE][xX][tT][aA][rR][eE][aA] |
+ [tT][tT] |
+ [vV][aA][rR] )
+
+
+%include src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex
+
+%include src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
+
+%{
+ // Initial capacity of the inputSegment buffer.
+ private static final int INITIAL_INPUT_SEGMENT_SIZE = 1024;
+ // Characters returned in place of removed tags and script/style elements.
+ private static final char BLOCK_LEVEL_START_TAG_REPLACEMENT = '\n';
+ private static final char BLOCK_LEVEL_END_TAG_REPLACEMENT = '\n';
+ private static final char BR_START_TAG_REPLACEMENT = '\n';
+ private static final char BR_END_TAG_REPLACEMENT = '\n';
+ private static final char SCRIPT_REPLACEMENT = '\n';
+ private static final char STYLE_REPLACEMENT = '\n';
+ // Output for numeric character references that fall in the surrogate range.
+ private static final char REPLACEMENT_CHARACTER = '\uFFFD';
+
+ // Tags to pass through unfiltered, other than BR/SCRIPT/STYLE which have
+ // their own flags below; stays null until the first such tag is added.
+ private CharArraySet escapedTags = null;
+ // Value of yychar when the current '&' or '<' construct started.
+ private int inputStart;
+ // Running correction value handed to addOffCorrectMap().
+ private int cumulativeDiff;
+ private boolean escapeBR = false;
+ private boolean escapeSCRIPT = false;
+ private boolean escapeSTYLE = false;
+ // Lexical state to return to when a nested construct (server-side include,
+ // quoted string) ends; previousRestoreState saves one outer level.
+ private int restoreState;
+ private int previousRestoreState;
+ // Incremented by read() for each value it returns after consulting the
+ // scanner or the output segment.
+ private int outputCharCount;
+ // Set by the %eof code and returned by the %eofval code.
+ private int eofReturnValue;
+ // Buffers the raw input of the construct currently being scanned.
+ private TextSegment inputSegment
+ = new TextSegment(INITIAL_INPUT_SEGMENT_SIZE);
+ // Segment that read() drains before asking the scanner for more.
+ private TextSegment outputSegment = inputSegment;
+ // Holds the decoded character(s) of a character reference.
+ private TextSegment entitySegment = new TextSegment(2);
+
+ /**
+  * Creates a filter that escapes no tags.
+  *
+  * @param source the character stream to strip HTML constructs from
+  */
+ public HTMLStripCharFilter(CharStream source) {
+   // A null tag set leaves every escape flag at its default.
+   this(source, null);
+ }
+
+ /**
+  * Creates a filter that leaves the given tags in the output.
+  *
+  * @param source the character stream to strip HTML constructs from
+  * @param escapedTags Tags in this set (both start and end tags)
+  *  will not be filtered out. Matching is case-insensitive; may be null.
+  */
+ public HTMLStripCharFilter(CharStream source, Set<String> escapedTags) {
+   super(source);
+   this.zzReader = source;
+   if (null == escapedTags) {
+     return;
+   }
+   for (String tag : escapedTags) {
+     // BR, SCRIPT and STYLE are handled by dedicated rules, so they are
+     // tracked with flags instead of being stored in the set.
+     if (tag.equalsIgnoreCase("BR")) {
+       escapeBR = true;
+       continue;
+     }
+     if (tag.equalsIgnoreCase("SCRIPT")) {
+       escapeSCRIPT = true;
+       continue;
+     }
+     if (tag.equalsIgnoreCase("STYLE")) {
+       escapeSTYLE = true;
+       continue;
+     }
+     // Allocate the set lazily, on the first ordinary tag.
+     if (null == this.escapedTags) {
+       this.escapedTags = new CharArraySet(Version.LUCENE_36, 16, true);
+     }
+     this.escapedTags.add(tag);
+   }
+ }
+
+ /**
+  * Returns the next filtered character: pending output is drained first,
+  * then the scanner is asked for more.
+  *
+  * @return the next character, or -1 once the scanner is at end of input
+  */
+ @Override
+ public int read() throws IOException {
+   final int ch;
+   if (! outputSegment.isRead()) {
+     ch = outputSegment.nextChar();
+   } else if (zzAtEOF) {
+     return -1;
+   } else {
+     ch = nextChar();
+   }
+   ++outputCharCount;
+   return ch;
+ }
+
+ /**
+  * Fills cbuf[off .. off+len-1] one character at a time via {@link #read()}.
+  *
+  * @return the number of characters stored; 0 if len is 0; -1 if no
+  *  character could be read
+  */
+ @Override
+ public int read(char cbuf[], int off, int len) throws IOException {
+   int count = 0;
+   while (count < len) {
+     int ch = read();
+     if (ch == -1) {
+       break;
+     }
+     cbuf[off + count] = (char)ch;
+     ++count;
+   }
+   if (count > 0) {
+     return count;
+   }
+   return len == 0 ? 0 : -1;
+ }
+
+ /** Closes this filter by delegating to the scanner's yyclose(). */
+ @Override
+ public void close() throws IOException {
+ yyclose();
+ }
+
+ /** Returns the scanner's ZZ_BUFFERSIZE constant. */
+ static int getInitialBufferSize() { // Package private, for testing purposes
+ return ZZ_BUFFERSIZE;
+ }
+
+ /**
+  * An OpenStringBuilder with a read cursor, used to buffer input and to
+  * hand buffered characters out one at a time.
+  *
+  * Declared static because it never touches the enclosing filter; a
+  * non-static inner class would carry a hidden reference to it.
+  */
+ private static class TextSegment extends OpenStringBuilder {
+   /** The position from which the next char will be read. */
+   int pos = 0;
+
+   /** Wraps the given buffer and sets this.len to the given length. */
+   TextSegment(char[] buffer, int length) {
+     super(buffer, length);
+   }
+
+   /** Allocates an internal buffer of the given size. */
+   TextSegment(int size) {
+     super(size);
+   }
+
+   /** Sets len = 0 and pos = 0. */
+   void clear() {
+     reset();
+     restart();
+   }
+
+   /** Sets pos = 0 */
+   void restart() {
+     pos = 0;
+   }
+
+   /** Returns the next char in the segment and advances the read position. */
+   int nextChar() {
+     assert (! isRead()): "Attempting to read past the end of a segment.";
+     return buf[pos++];
+   }
+
+   /** Returns true when all characters in the text segment have been read */
+   boolean isRead() {
+     return pos >= len;
+   }
+ }
+%}
+
+%eofval{
+ // Value nextChar() returns at end of input; chosen by the %eof code below.
+ return eofReturnValue;
+%eofval}
+%eof{
+ // Decide what to do with a construct that was still open when the input
+ // ended, based on the lexical state the scanner stopped in.
+ switch (zzLexicalState) {
+ case SCRIPT:
+ case COMMENT:
+ case SCRIPT_COMMENT:
+ case STYLE:
+ case STYLE_COMMENT:
+ case SINGLE_QUOTED_STRING:
+ case DOUBLE_QUOTED_STRING:
+ case END_TAG_TAIL_EXCLUDE:
+ case END_TAG_TAIL_SUBSTITUTE:
+ case START_TAG_TAIL_EXCLUDE:
+ case SERVER_SIDE_INCLUDE:
+ case START_TAG_TAIL_SUBSTITUTE: { // Exclude
+ // Drop everything scanned since the construct started.
+ cumulativeDiff += yychar - inputStart;
+ addOffCorrectMap(outputCharCount, cumulativeDiff);
+ outputSegment.clear();
+ eofReturnValue = -1;
+ break;
+ }
+ case CHARACTER_REFERENCE_TAIL: { // Substitute
+ // At end of file, allow char refs without semicolons
+ cumulativeDiff += inputSegment.length() - outputSegment.length();
+ addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
+ eofReturnValue = outputSegment.nextChar();
+ break;
+ }
+ case BANG:
+ case CDATA:
+ case AMPERSAND:
+ case NUMERIC_CHARACTER:
+ case END_TAG_TAIL_INCLUDE:
+ case START_TAG_TAIL_INCLUDE:
+ case LEFT_ANGLE_BRACKET:
+ case LEFT_ANGLE_BRACKET_SLASH:
+ case LEFT_ANGLE_BRACKET_SPACE: { // Include
+ // Emit the buffered input unchanged, starting with its next char.
+ outputSegment = inputSegment;
+ eofReturnValue = outputSegment.nextChar();
+ break;
+ }
+ default: {
+ eofReturnValue = -1;
+ }
+ }
+%eof}
+
+%%
+
+// '&' may start a character reference: remember where it began and buffer
+// it until the reference is either recognized or rejected.
+"&" {
+ inputStart = yychar;
+ inputSegment.clear();
+ inputSegment.append('&');
+ yybegin(AMPERSAND);
+}
+
+// '<' may start a tag, comment, CDATA section, etc.; buffered the same way.
+"<" {
+ inputStart = yychar;
+ inputSegment.clear();
+ inputSegment.append('<');
+ yybegin(LEFT_ANGLE_BRACKET);
+}
+
+<AMPERSAND> {
+ // Named character entity: buffer its name, stage the mapped character as
+ // the pending output, then look for the tail of the reference.
+ // entityValues is not defined in this file; presumably it comes from the
+ // included HTMLCharacterEntities.jflex - verify.
+ {CharacterEntities} {
+ int length = yylength();
+ inputSegment.write(zzBuffer, zzStartRead, length);
+ entitySegment.clear();
+ char ch = entityValues.get(zzBuffer, zzStartRead, length).charValue();
+ entitySegment.append(ch);
+ outputSegment = entitySegment;
+ yybegin(CHARACTER_REFERENCE_TAIL);
+ }
+ // '#' starts a numeric character reference.
+ "#" { inputSegment.append('#'); yybegin(NUMERIC_CHARACTER); }
+
+// Hex high surrogate followed by hex low surrogate (the leading '&' was
+// consumed before entering this state).
+// Offsets into yytext(): [0]='#' [1]=x [2..5]=high surrogate hex digits
+// [6..8]=";&#" [9]=x [10..13]=low surrogate hex digits [14]=';'
+ "#" [xX][dD][89aAbB][0-9a-fA-F]{2} ";&#" [xX][dD][c-fC-F][0-9a-fA-F]{2} ";" {
+ // Handle paired UTF-16 surrogates.
+ outputSegment = entitySegment;
+ outputSegment.clear();
+ String surrogatePair = yytext();
+ char highSurrogate = '\u0000';
+ try {
+ highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
+ } catch(Exception e) { // should never happen
+ assert false: "Exception parsing high surrogate '"
+ + surrogatePair.substring(2, 6) + "'";
+ }
+ try {
+ // The low surrogate is queued as pending output; the high surrogate is
+ // returned directly below.
+ outputSegment.unsafeWrite
+ ((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
+ } catch(Exception e) { // should never happen
+ assert false: "Exception parsing low surrogate '"
+ + surrogatePair.substring(10, 14) + "'";
+ }
+ // The buffered input plus this match is replaced by two output chars.
+ cumulativeDiff += inputSegment.length() + yylength() - 2;
+ addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
+ inputSegment.clear();
+ yybegin(YYINITIAL);
+ return highSurrogate;
+ }
+
+// Decimal high surrogate followed by hex low surrogate (the leading '&' was
+// consumed before entering this state).
+// Offsets into yytext(): [0]='#' [1..5]=high surrogate decimal digits
+// [6..8]=";&#" [9]=x [10..13]=low surrogate hex digits [14]=';'
+ "#5" [56] \d{3} ";&#" [xX][dD][c-fC-F][0-9a-fA-F]{2} ";" {
+ // Handle paired UTF-16 surrogates.
+ String surrogatePair = yytext();
+ char highSurrogate = '\u0000';
+ try { // High surrogates are in decimal range [55296, 56319]
+ highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
+ } catch(Exception e) { // should never happen
+ assert false: "Exception parsing high surrogate '"
+ + surrogatePair.substring(1, 6) + "'";
+ }
+ // The pattern admits 55000-56999, so the range must be checked explicitly.
+ if (Character.isHighSurrogate(highSurrogate)) {
+ outputSegment = entitySegment;
+ outputSegment.clear();
+ try {
+ outputSegment.unsafeWrite
+ ((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
+ } catch(Exception e) { // should never happen
+ assert false: "Exception parsing low surrogate '"
+ + surrogatePair.substring(10, 14) + "'";
+ }
+ cumulativeDiff += inputSegment.length() + yylength() - 2;
+ addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
+ inputSegment.clear();
+ yybegin(YYINITIAL);
+ return highSurrogate;
+ }
+ // Not a surrogate pair after all: rescan as an ordinary numeric reference.
+ yypushback(surrogatePair.length() - 1); // Consume only '#'
+ inputSegment.append('#');
+ yybegin(NUMERIC_CHARACTER);
+ }
+
+// Hex high surrogate followed by decimal low surrogate, e.g.
+// "&#xD83D;&#56832;" (the leading '&' was consumed before entering this
+// state).
+// Offsets into yytext(): [0]='#' [1]=x [2..5]=high surrogate hex digits
+// [6..8]=";&#" [9..13]=low surrogate decimal digits [14]=';'
+// The separator must be ";&#5": the substring(9, 14) below expects the five
+// decimal digits of the low surrogate to start at offset 9.
+ "#" [xX][dD][89aAbB][0-9a-fA-F]{2} ";&#5" [67] \d{3} ";" {
+   // Handle paired UTF-16 surrogates.
+   String surrogatePair = yytext();
+   char highSurrogate = '\u0000';
+   char lowSurrogate = '\u0000';
+   try {
+     highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
+   } catch(Exception e) { // should never happen
+     assert false: "Exception parsing high surrogate '"
+                 + surrogatePair.substring(2, 6) + "'";
+   }
+   try { // Low surrogates are in decimal range [56320, 57343]
+     lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
+   } catch(Exception e) { // should never happen
+     assert false: "Exception parsing low surrogate '"
+                 + surrogatePair.substring(9, 14) + "'";
+   }
+   // The pattern admits 56000-57999, so the range must be checked explicitly.
+   if (Character.isLowSurrogate(lowSurrogate)) {
+     // Queue the low surrogate as pending output and return the high one.
+     outputSegment = entitySegment;
+     outputSegment.clear();
+     outputSegment.unsafeWrite(lowSurrogate);
+     // The buffered input plus this match is replaced by two output chars.
+     cumulativeDiff += inputSegment.length() + yylength() - 2;
+     addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
+     inputSegment.clear();
+     yybegin(YYINITIAL);
+     return highSurrogate;
+   }
+   // Not a surrogate pair after all: rescan as an ordinary numeric reference.
+   yypushback(surrogatePair.length() - 1); // Consume only '#'
+   inputSegment.append('#');
+   yybegin(NUMERIC_CHARACTER);
+ }
+
+// Decimal high surrogate followed by decimal low surrogate, e.g.
+// "&#55357;&#56832;" (the leading '&' was consumed before entering this
+// state).
+// Offsets into yytext(): [0]='#' [1..5]=high surrogate decimal digits
+// [6..8]=";&#" [9..13]=low surrogate decimal digits [14]=';'
+// The separator must be ";&#5": the substring(9, 14) below expects the five
+// decimal digits of the low surrogate to start at offset 9.
+ "#5" [56] \d{3} ";&#5" [67] \d{3} ";" {
+   // Handle paired UTF-16 surrogates.
+   String surrogatePair = yytext();
+   char highSurrogate = '\u0000';
+   try { // High surrogates are in decimal range [55296, 56319]
+     highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
+   } catch(Exception e) { // should never happen
+     assert false: "Exception parsing high surrogate '"
+                 + surrogatePair.substring(1, 6) + "'";
+   }
+   // Both patterns admit values outside the surrogate ranges, so each half
+   // is range-checked before the pair is accepted.
+   if (Character.isHighSurrogate(highSurrogate)) {
+     char lowSurrogate = '\u0000';
+     try { // Low surrogates are in decimal range [56320, 57343]
+       lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
+     } catch(Exception e) { // should never happen
+       assert false: "Exception parsing low surrogate '"
+                   + surrogatePair.substring(9, 14) + "'";
+     }
+     if (Character.isLowSurrogate(lowSurrogate)) {
+       // Queue the low surrogate as pending output and return the high one.
+       outputSegment = entitySegment;
+       outputSegment.clear();
+       outputSegment.unsafeWrite(lowSurrogate);
+       // The buffered input plus this match is replaced by two output chars.
+       cumulativeDiff += inputSegment.length() + yylength() - 2;
+       addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
+       inputSegment.clear();
+       yybegin(YYINITIAL);
+       return highSurrogate;
+     }
+   }
+   // Not a surrogate pair after all: rescan as an ordinary numeric reference.
+   yypushback(surrogatePair.length() - 1); // Consume only '#'
+   inputSegment.append('#');
+   yybegin(NUMERIC_CHARACTER);
+ }
+}
+
+<NUMERIC_CHARACTER> {
+ // Hexadecimal numeric character reference: 'x' followed by hex digits.
+ [xX] [0-9A-Fa-f]+ {
+   int matchLength = yylength();
+   inputSegment.write(zzBuffer, zzStartRead, matchLength);
+   // The match includes the leading 'x', so the largest code point, 10FFFF,
+   // needs 1 + 6 = 7 chars.
+   if (matchLength <= 7) {
+     String hexCharRef
+         = new String(zzBuffer, zzStartRead + 1, matchLength - 1);
+     int codePoint = 0;
+     try {
+       codePoint = Integer.parseInt(hexCharRef, 16);
+     } catch(Exception e) {
+       assert false: "Exception parsing hex code point '" + hexCharRef + "'";
+     }
+     if (codePoint <= 0x10FFFF) {
+       outputSegment = entitySegment;
+       outputSegment.clear();
+       if (codePoint >= Character.MIN_SURROGATE
+           && codePoint <= Character.MAX_SURROGATE) {
+         // A lone surrogate is not a valid character: substitute U+FFFD.
+         outputSegment.unsafeWrite(REPLACEMENT_CHARACTER);
+       } else {
+         // toChars() writes one or two chars and returns how many.
+         outputSegment.setLength
+             (Character.toChars(codePoint, outputSegment.getArray(), 0));
+       }
+       yybegin(CHARACTER_REFERENCE_TAIL);
+     } else {
+       // Beyond the Unicode range: emit the buffered input unchanged.
+       outputSegment = inputSegment;
+       yybegin(YYINITIAL);
+       return outputSegment.nextChar();
+     }
+   } else {
+     // Too many digits to be a code point: emit the buffered input unchanged.
+     outputSegment = inputSegment;
+     yybegin(YYINITIAL);
+     return outputSegment.nextChar();
+   }
+ }
+ // Decimal numeric character reference.
+ [0-9]+ {
+ int matchLength = yylength();
+ inputSegment.write(zzBuffer, zzStartRead, matchLength);
+ if (matchLength <= 7) { // 0x10FFFF = 1114111: max 7 decimal chars
+ String decimalCharRef = yytext();
+ int codePoint = 0;
+ try {
+ codePoint = Integer.parseInt(decimalCharRef);
+ } catch(Exception e) {
+ assert false: "Exception parsing code point '" + decimalCharRef + "'";
+ }
+ if (codePoint <= 0x10FFFF) {
+ outputSegment = entitySegment;
+ outputSegment.clear();
+ if (codePoint >= Character.MIN_SURROGATE
+ && codePoint <= Character.MAX_SURROGATE) {
+ // A lone surrogate is not a valid character: substitute U+FFFD.
+ outputSegment.unsafeWrite(REPLACEMENT_CHARACTER);
+ } else {
+ // toChars() writes one or two chars and returns how many.
+ outputSegment.setLength
+ (Character.toChars(codePoint, outputSegment.getArray(), 0));
+ }
+ yybegin(CHARACTER_REFERENCE_TAIL);
+ } else {
+ // Beyond the Unicode range: emit the buffered input unchanged.
+ outputSegment = inputSegment;
+ yybegin(YYINITIAL);
+ return outputSegment.nextChar();
+ }
+ } else {
+ // Too many digits to be a code point: emit the buffered input unchanged.
+ outputSegment = inputSegment;
+ yybegin(YYINITIAL);
+ return outputSegment.nextChar();
+ }
+ }
+}
+
+// The terminating ';' of a character reference: record the offset correction
+// for replacing the buffered reference with the decoded character(s), then
+// start emitting them.
+<CHARACTER_REFERENCE_TAIL> {
+ ";" {
+ cumulativeDiff
+ += inputSegment.length() + yylength() - outputSegment.length();
+ addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
+ yybegin(YYINITIAL);
+ return outputSegment.nextChar();
+ }
+}
+
+<LEFT_ANGLE_BRACKET_SLASH> {
+ \s+ { inputSegment.write(zzBuffer, zzStartRead, yylength()); }
+ // </br>: passed through when BR is escaped, otherwise replaced by a single
+ // BR_END_TAG_REPLACEMENT character.
+ [bB][rR] \s* ">" {
+   yybegin(YYINITIAL);
+   if (escapeBR) {
+     inputSegment.write(zzBuffer, zzStartRead, yylength());
+     outputSegment = inputSegment;
+     return outputSegment.nextChar();
+   } else {
+     // Exactly one character is emitted for the whole tag, so the correction
+     // must not depend on outputSegment, which still holds whatever was
+     // output previously. This matches END_TAG_TAIL_SUBSTITUTE.
+     cumulativeDiff += inputSegment.length() + yylength() - 1;
+     addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
+     inputSegment.reset();
+     return BR_END_TAG_REPLACEMENT;
+   }
+ }
+ // End tag of a listed inline element: kept if escaped, otherwise removed
+ // without a replacement character.
+ {InlineElment} {
+ inputSegment.write(zzBuffer, zzStartRead, yylength());
+ if (null != escapedTags
+ && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
+ yybegin(END_TAG_TAIL_INCLUDE);
+ } else {
+ yybegin(END_TAG_TAIL_EXCLUDE);
+ }
+ }
+ // End tag of any other element: kept if escaped, otherwise replaced by
+ // BLOCK_LEVEL_END_TAG_REPLACEMENT.
+ {Name} {
+ inputSegment.write(zzBuffer, zzStartRead, yylength());
+ if (null != escapedTags
+ && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
+ yybegin(END_TAG_TAIL_INCLUDE);
+ } else {
+ yybegin(END_TAG_TAIL_SUBSTITUTE);
+ }
+ }
+}
+
+// End of an escaped end tag: emit the whole buffered tag unchanged.
+<END_TAG_TAIL_INCLUDE> {
+ \s* ">" {
+ inputSegment.write(zzBuffer, zzStartRead, yylength());
+ outputSegment = inputSegment;
+ yybegin(YYINITIAL);
+ return outputSegment.nextChar();
+ }
+}
+
+// End of an end tag that is removed with no output.
+<END_TAG_TAIL_EXCLUDE> {
+ \s* ">" {
+ cumulativeDiff += inputSegment.length() + yylength();
+ addOffCorrectMap(outputCharCount, cumulativeDiff);
+ inputSegment.clear();
+ yybegin(YYINITIAL);
+ }
+}
+
+// End of an end tag that is replaced by a single character.
+<END_TAG_TAIL_SUBSTITUTE> {
+ \s* ">" {
+ cumulativeDiff += inputSegment.length() + yylength() - 1;
+ addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
+ inputSegment.clear();
+ yybegin(YYINITIAL);
+ return BLOCK_LEVEL_END_TAG_REPLACEMENT;
+ }
+}
+
+<LEFT_ANGLE_BRACKET> {
+ // "<!" and "</" keep buffering and switch to the matching state.
+ "!" { inputSegment.append('!'); yybegin(BANG); }
+ "/" { inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH); }
+ \s+ {
+ inputSegment.write(zzBuffer, zzStartRead, yylength());
+ yybegin(LEFT_ANGLE_BRACKET_SPACE);
+ }
+ // "<? ... ?>" or "<? ... />": removed entirely, with no output.
+ "?" [^>]* [/?] ">" {
+ cumulativeDiff += inputSegment.length() + yylength();
+ addOffCorrectMap(outputCharCount, cumulativeDiff);
+ inputSegment.clear();
+ yybegin(YYINITIAL);
+ }
+ // <br>, <br/> or <br attr=...>: passed through when BR is escaped,
+ // otherwise replaced by a single BR_START_TAG_REPLACEMENT character.
+ \s* [bB][rR] ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
+   yybegin(YYINITIAL);
+   if (escapeBR) {
+     inputSegment.write(zzBuffer, zzStartRead, yylength());
+     outputSegment = inputSegment;
+     return outputSegment.nextChar();
+   } else {
+     // Exactly one character is emitted for the whole tag, so the correction
+     // must not depend on outputSegment, which still holds whatever was
+     // output previously. This matches START_TAG_TAIL_SUBSTITUTE.
+     cumulativeDiff += inputSegment.length() + yylength() - 1;
+     addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
+     inputSegment.reset();
+     return BR_START_TAG_REPLACEMENT;
+   }
+ }
+ // <script ...>: switch to SCRIPT. If SCRIPT is escaped, the start tag itself
+ // is emitted and inputStart is moved past it ('<' plus this match), so the
+ // later "yychar - inputStart" correction only covers the element's content.
+ \s* [sS][cC][rR][iI][pP][tT] ( \s+ {OpenTagContent} )? \s* ">" {
+ yybegin(SCRIPT);
+ if (escapeSCRIPT) {
+ inputSegment.write(zzBuffer, zzStartRead, yylength());
+ outputSegment = inputSegment;
+ inputStart += 1 + yylength();
+ return outputSegment.nextChar();
+ }
+ }
+ // <style ...>: handled the same way as <script ...>, using STYLE.
+ \s* [sS][tT][yY][lL][eE] ( \s+ {OpenTagContent} )? \s* ">" {
+ yybegin(STYLE);
+ if (escapeSTYLE) {
+ inputSegment.write(zzBuffer, zzStartRead, yylength());
+ outputSegment = inputSegment;
+ inputStart += 1 + yylength();
+ return outputSegment.nextChar();
+ }
+ }
+}
+
+// Start tag names, directly after '<' or after '<' plus whitespace.
+<LEFT_ANGLE_BRACKET, LEFT_ANGLE_BRACKET_SPACE> {
+ // Listed inline element: kept if escaped, otherwise removed without a
+ // replacement character.
+ {InlineElment} {
+ inputSegment.write(zzBuffer, zzStartRead, yylength());
+ if (null != escapedTags
+ && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
+ yybegin(START_TAG_TAIL_INCLUDE);
+ } else {
+ yybegin(START_TAG_TAIL_EXCLUDE);
+ }
+ }
+ // Any other element: kept if escaped, otherwise replaced by
+ // BLOCK_LEVEL_START_TAG_REPLACEMENT.
+ {Name} {
+ inputSegment.write(zzBuffer, zzStartRead, yylength());
+ if (null != escapedTags
+ && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
+ yybegin(START_TAG_TAIL_INCLUDE);
+ } else {
+ yybegin(START_TAG_TAIL_SUBSTITUTE);
+ }
+ }
+}
+
+// Rest of an escaped start tag: emit the whole buffered tag unchanged.
+<START_TAG_TAIL_INCLUDE> {
+ ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
+ inputSegment.write(zzBuffer, zzStartRead, yylength());
+ outputSegment = inputSegment;
+ yybegin(YYINITIAL);
+ return outputSegment.nextChar();
+ }
+}
+
+// Rest of a start tag that is removed with no output.
+<START_TAG_TAIL_EXCLUDE> {
+ ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
+ cumulativeDiff += inputSegment.length() + yylength();
+ addOffCorrectMap(outputCharCount, cumulativeDiff);
+ inputSegment.clear();
+ outputSegment = inputSegment;
+ yybegin(YYINITIAL);
+ }
+}
+
+// Rest of a start tag that is replaced by a single character.
+<START_TAG_TAIL_SUBSTITUTE> {
+ ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
+ cumulativeDiff += inputSegment.length() + yylength() - 1;
+ addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
+ inputSegment.clear();
+ yybegin(YYINITIAL);
+ return BLOCK_LEVEL_START_TAG_REPLACEMENT;
+ }
+}
+
+// After "<!": a comment, a CDATA section, or some other declaration.
+<BANG> {
+ "--" { yybegin(COMMENT); }
+ // End of a "<!...>" declaration: removed entirely, with no output.
+ ">" {
+ cumulativeDiff += inputSegment.length() + yylength();
+ addOffCorrectMap(outputCharCount, cumulativeDiff);
+ inputSegment.clear();
+ yybegin(YYINITIAL);
+ }
+ // From XML 1.0 <http://www.w3.org/TR/xml/>:
+ //
+ // [18] CDSect ::= CDStart CData CDEnd
+ // [19] CDStart ::= '<![CDATA['
+ // [20] CData ::= (Char* - (Char* ']]>' Char*))
+ // [21] CDEnd ::= ']]>'
+ //
+ // The CDATA start marker itself is removed; its content is passed through
+ // by the CDATA state.
+ "[CDATA[" {
+ cumulativeDiff += inputSegment.length() + yylength();
+ addOffCorrectMap(outputCharCount, cumulativeDiff);
+ inputSegment.clear();
+ yybegin(CDATA);
+ }
+ // Anything else inside the declaration is buffered.
+ [^] {
+ inputSegment.append(zzBuffer[zzStartRead]);
+ }
+}
+
+// Inside a CDATA section: the end marker is removed, every other character
+// is returned unchanged.
+<CDATA> {
+ "]]>" {
+ cumulativeDiff += yylength();
+ addOffCorrectMap(outputCharCount, cumulativeDiff);
+ yybegin(YYINITIAL);
+ }
+ [^] { return zzBuffer[zzStartRead]; }
+}
+
+// Inside "<!-- ... -->": everything is discarded. A nested server-side
+// include is scanned in its own state so that a "-->" inside one of its
+// quoted strings does not end the comment.
+<COMMENT> {
+ "<!--#" { restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
+ "-->" {
+ // Everything from the opening '<' up to and including "-->" is removed.
+ cumulativeDiff += yychar - inputStart + yylength();
+ addOffCorrectMap(outputCharCount, cumulativeDiff);
+ inputSegment.clear();
+ yybegin(YYINITIAL);
+ }
+ [^] { }
+}
+
+// Inside "<!--# ... -->": discarded. Quoted strings get their own state;
+// the state to resume after the include is saved in previousRestoreState
+// while restoreState points back here.
+<SERVER_SIDE_INCLUDE> {
+ "-->" { yybegin(restoreState); }
+ "'" {
+ previousRestoreState = restoreState;
+ restoreState = SERVER_SIDE_INCLUDE;
+ yybegin(SINGLE_QUOTED_STRING);
+ }
+ "\"" {
+ previousRestoreState = restoreState;
+ restoreState = SERVER_SIDE_INCLUDE;
+ yybegin(DOUBLE_QUOTED_STRING);
+ }
+ [^] { }
+}
+
+// Inside "<!-- ... -->" within a script element: everything is discarded;
+// "-->" returns to SCRIPT.
+<SCRIPT_COMMENT> {
+ "<!--#" { restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
+ "'" { restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING); }
+ "\"" { restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING); }
+ "-->" { yybegin(SCRIPT); }
+ [^] { }
+}
+
+// Inside "<!-- ... -->" within a style element: everything is discarded;
+// "-->" returns to STYLE.
+<STYLE_COMMENT> {
+ "<!--#" { restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
+ "'" { restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING); }
+ "\"" { restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING); }
+ "-->" { yybegin(STYLE); }
+ [^] { }
+}
+
+// Inside a quoted string: discarded up to the closing quote. A backslash
+// and the character after it are skipped together, so an escaped quote does
+// not end the string. On the closing quote the scanner resumes restoreState,
+// and restoreState is reset to previousRestoreState.
+<SINGLE_QUOTED_STRING> {
+ "\\" [^] { }
+ "'" { yybegin(restoreState); restoreState = previousRestoreState; }
+ [^] { }
+}
+
+<DOUBLE_QUOTED_STRING> {
+ "\\" [^] { }
+ "\"" { yybegin(restoreState); restoreState = previousRestoreState; }
+ [^] { }
+}
+
+// Inside a script element: content is discarded until the end tag.
+<SCRIPT> {
+ "<!--" { yybegin(SCRIPT_COMMENT); }
+ "</" \s* [sS][cC][rR][iI][pP][tT] \s* ">" {
+ inputSegment.clear();
+ yybegin(YYINITIAL);
+ // Account for everything skipped between inputStart and this end tag.
+ cumulativeDiff += yychar - inputStart;
+ int outputEnd = outputCharCount;
+ int returnValue;
+ if (escapeSCRIPT) {
+ // Escaped: the end tag itself is emitted unchanged.
+ inputSegment.write(zzBuffer, zzStartRead, yylength());
+ outputSegment = inputSegment;
+ returnValue = outputSegment.nextChar();
+ } else {
+ // Not escaped: the end tag is replaced by one character.
+ cumulativeDiff += yylength() - 1;
+ ++outputEnd;
+ returnValue = SCRIPT_REPLACEMENT;
+ }
+ addOffCorrectMap(outputEnd, cumulativeDiff);
+ return returnValue;
+ }
+ [^] { }
+}
+
+// Inside a style element: content is discarded until the end tag.
+<STYLE> {
+ "<!--" { yybegin(STYLE_COMMENT); }
+ "</" \s* [sS][tT][yY][lL][eE] \s* ">" {
+ inputSegment.clear();
+ yybegin(YYINITIAL);
+ // Account for everything skipped between inputStart and this end tag.
+ cumulativeDiff += yychar - inputStart;
+ int outputEnd = outputCharCount;
+ int returnValue;
+ if (escapeSTYLE) {
+ // Escaped: the end tag itself is emitted unchanged.
+ inputSegment.write(zzBuffer, zzStartRead, yylength());
+ outputSegment = inputSegment;
+ returnValue = outputSegment.nextChar();
+ } else {
+ // Not escaped: the end tag is replaced by one character.
+ cumulativeDiff += yylength() - 1;
+ ++outputEnd;
+ returnValue = STYLE_REPLACEMENT;
+ }
+ addOffCorrectMap(outputEnd, cumulativeDiff);
+ return returnValue;
+ }
+ [^] { }
+}
+
+// Fallback for the listed states: a character that no other rule accepts
+// means the buffered construct was not recognized. The character is pushed
+// back to be rescanned in the initial state, and the buffered input is
+// emitted unchanged from its beginning.
+<AMPERSAND,NUMERIC_CHARACTER,CHARACTER_REFERENCE_TAIL,LEFT_ANGLE_BRACKET_SLASH,END_TAG_TAIL_INCLUDE,END_TAG_TAIL_EXCLUDE,END_TAG_TAIL_SUBSTITUTE,LEFT_ANGLE_BRACKET,LEFT_ANGLE_BRACKET_SPACE,START_TAG_TAIL_INCLUDE,START_TAG_TAIL_EXCLUDE,START_TAG_TAIL_SUBSTITUTE,BANG> {
+ [^] {
+ yypushback(1);
+ outputSegment = inputSegment;
+ outputSegment.restart();
+ yybegin(YYINITIAL);
+ return outputSegment.nextChar();
+ }
+}
+
+// In the initial state any other character is returned unchanged.
+[^] { return zzBuffer[zzStartRead]; }
Added: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py?rev=1235308&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py (added)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py Tue Jan 24 15:51:55 2012
@@ -0,0 +1,530 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+
+# A simple python script to generate an HTML entity map and a regex alternation
+# for inclusion in HTMLStripCharFilter.jflex.
+
+def main():
+ # Prints, to standard output, the Apache license header followed by two
+ # fragments generated from the entity declarations in get_entity_text():
+ #   1. a "CharacterEntities = ( ... )" alternation of quoted entity names;
+ #   2. a "%{ ... %}" block of Java source declaring and populating the
+ #      entityValues map.
+ # Uses Python 2 print statements.
+ print get_apache_license()
+ # Entity name -> text to place inside a Java string literal.
+ codes = {}
+ # Captures the entity name (group 1) and its decimal character number
+ # (group 2); the optional "#38;" accepts doubly-escaped values ("&#38;#60;").
+ regex = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')
+ for line in get_entity_text().split('\n'):
+ match = regex.match(line)
+ if match:
+ key = match.group(1)
+ # quot is stored as an escaped double quote because the value is later
+ # printed between double quotes; nbsp is stored as a plain space rather
+ # than its own code point; everything else becomes a \uXXXX escape.
+ if key == 'quot': codes[key] = r'\"'
+ elif key == 'nbsp': codes[key] = ' ';
+ else : codes[key] = r'\u%04X' % int(match.group(2))
+
+ keys = sorted(codes)
+
+ # Part 1: the alternation of entity names. A pending output line is flushed
+ # before an entry that would bring it to 80 characters or more.
+ first_entry = True
+ output_line = 'CharacterEntities = ( '
+ for key in keys:
+ new_entry = ('"%s"' if first_entry else ' | "%s"') % key
+ first_entry = False
+ if len(output_line) + len(new_entry) >= 80:
+ print output_line
+ output_line = ' '
+ output_line += new_entry
+ # These six names are additionally emitted in upper case (e.g. "QUOT").
+ if key in ('quot','copy','gt','lt','reg','amp'):
+ new_entry = ' | "%s"' % key.upper()
+ if len(output_line) + len(new_entry) >= 80:
+ print output_line
+ output_line = ' '
+ output_line += new_entry
+ print output_line, ')'
+
+ # Part 2: the Java code block. The map is sized with the number of parsed
+ # entity names.
+ print '%{'
+ print ' private static final Set<String> upperCaseVariantsAccepted'
+ print ' = new HashSet<String>(Arrays.asList("quot","copy","gt","lt","reg","amp"));'
+ print ' private static final CharArrayMap<Character> entityValues'
+ print ' = new CharArrayMap<Character>(Version.LUCENE_36, %i, false);' % len(keys)
+ print ' static {'
+ print ' String[] entities = {'
+ # Flat array of alternating name / value strings, wrapped like part 1.
+ output_line = ' '
+ for key in keys:
+ new_entry = ' "%s", "%s",' % (key, codes[key])
+ if len(output_line) + len(new_entry) >= 80:
+ print output_line
+ output_line = ' '
+ output_line += new_entry
+ # Drop the trailing comma left by the last entry.
+ print output_line[:-1]
+ print ' };'
+ # The emitted Java loop walks the array in name/value pairs and also
+ # registers the upper-case name for the six accepted variants.
+ print ' for (int i = 0 ; i < entities.length ; i += 2) {'
+ print ' Character value = entities[i + 1].charAt(0);'
+ print ' entityValues.put(entities[i], value);'
+ print ' if (upperCaseVariantsAccepted.contains(entities[i])) {'
+ print ' entityValues.put(entities[i].toUpperCase(), value);'
+ print ' }'
+ print ' }'
+ print " }"
+ print "%}"
+
+def get_entity_text():
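+# The text below is section F.1 ("XHTML Character Entities") of the W3C XHTML
+# Modularization recommendation (REC-xhtml-modularization, 2010), i.e. the
+# xhtml-lat1.ent, xhtml-special.ent and xhtml-symbol.ent entity sets, copied
+# verbatim. The corresponding HTML 4 entity list is at
+# <http://www.w3.org/TR/REC-html40/sgml/entities.html>: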
+ text = r"""
+F.1. XHTML Character Entities
+
+XHTML DTDs make available a standard collection of named character entities. Those entities are defined in this section.
+F.1.1. XHTML Latin 1 Character Entities
+
+You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-lat1.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent.
+
+<!-- ...................................................................... -->
+<!-- XML-compatible ISO Latin 1 Character Entity Set for XHTML ............ -->
+<!-- file: xhtml-lat1.ent
+
+ Typical invocation:
+
+ <!ENTITY % xhtml-lat1
+ PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
+ "xhtml-lat1.ent" >
+ %xhtml-lat1;
+
+ This DTD module is identified by the PUBLIC and SYSTEM identifiers:
+
+ PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
+ SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent"
+
+ Revision: $Id: xhtml-lat1.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
+
+ Portions (C) International Organization for Standardization 1986:
+ Permission to copy in any form is granted for use with conforming
+ SGML systems and applications as defined in ISO 8879, provided
+ this notice is included in all copies.
+-->
+
+<!ENTITY nbsp " " ><!-- no-break space = non-breaking space, U+00A0 ISOnum -->
+<!ENTITY iexcl "¡" ><!-- inverted exclamation mark, U+00A1 ISOnum -->
+<!ENTITY cent "¢" ><!-- cent sign, U+00A2 ISOnum -->
+<!ENTITY pound "£" ><!-- pound sign, U+00A3 ISOnum -->
+<!ENTITY curren "¤" ><!-- currency sign, U+00A4 ISOnum -->
+<!ENTITY yen "¥" ><!-- yen sign = yuan sign, U+00A5 ISOnum -->
+<!ENTITY brvbar "¦" ><!-- broken bar = broken vertical bar, U+00A6 ISOnum -->
+<!ENTITY sect "§" ><!-- section sign, U+00A7 ISOnum -->
+<!ENTITY uml "¨" ><!-- diaeresis = spacing diaeresis, U+00A8 ISOdia -->
+<!ENTITY copy "©" ><!-- copyright sign, U+00A9 ISOnum -->
+<!ENTITY ordf "ª" ><!-- feminine ordinal indicator, U+00AA ISOnum -->
+<!ENTITY laquo "«" ><!-- left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum -->
+<!ENTITY not "¬" ><!-- not sign, U+00AC ISOnum -->
+<!ENTITY shy "­" ><!-- soft hyphen = discretionary hyphen, U+00AD ISOnum -->
+<!ENTITY reg "®" ><!-- registered sign = registered trade mark sign, U+00AE ISOnum -->
+<!ENTITY macr "¯" ><!-- macron = spacing macron = overline = APL overbar, U+00AF ISOdia -->
+<!ENTITY deg "°" ><!-- degree sign, U+00B0 ISOnum -->
+<!ENTITY plusmn "±" ><!-- plus-minus sign = plus-or-minus sign, U+00B1 ISOnum -->
+<!ENTITY sup2 "²" ><!-- superscript two = superscript digit two = squared, U+00B2 ISOnum -->
+<!ENTITY sup3 "³" ><!-- superscript three = superscript digit three = cubed, U+00B3 ISOnum -->
+<!ENTITY acute "´" ><!-- acute accent = spacing acute, U+00B4 ISOdia -->
+<!ENTITY micro "µ" ><!-- micro sign, U+00B5 ISOnum -->
+<!ENTITY para "¶" ><!-- pilcrow sign = paragraph sign, U+00B6 ISOnum -->
+<!ENTITY middot "·" ><!-- middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum -->
+<!ENTITY cedil "¸" ><!-- cedilla = spacing cedilla, U+00B8 ISOdia -->
+<!ENTITY sup1 "¹" ><!-- superscript one = superscript digit one, U+00B9 ISOnum -->
+<!ENTITY ordm "º" ><!-- masculine ordinal indicator, U+00BA ISOnum -->
+<!ENTITY raquo "»" ><!-- right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum -->
+<!ENTITY frac14 "¼" ><!-- vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum -->
+<!ENTITY frac12 "½" ><!-- vulgar fraction one half = fraction one half, U+00BD ISOnum -->
+<!ENTITY frac34 "¾" ><!-- vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum -->
+<!ENTITY iquest "¿" ><!-- inverted question mark = turned question mark, U+00BF ISOnum -->
+<!ENTITY Agrave "À" ><!-- latin capital A with grave = latin capital A grave, U+00C0 ISOlat1 -->
+<!ENTITY Aacute "Á" ><!-- latin capital A with acute, U+00C1 ISOlat1 -->
+<!ENTITY Acirc "Â" ><!-- latin capital A with circumflex, U+00C2 ISOlat1 -->
+<!ENTITY Atilde "Ã" ><!-- latin capital A with tilde, U+00C3 ISOlat1 -->
+<!ENTITY Auml "Ä" ><!-- latin capital A with diaeresis, U+00C4 ISOlat1 -->
+<!ENTITY Aring "Å" ><!-- latin capital A with ring above = latin capital A ring, U+00C5 ISOlat1 -->
+<!ENTITY AElig "Æ" ><!-- latin capital AE = latin capital ligature AE, U+00C6 ISOlat1 -->
+<!ENTITY Ccedil "Ç" ><!-- latin capital C with cedilla, U+00C7 ISOlat1 -->
+<!ENTITY Egrave "È" ><!-- latin capital E with grave, U+00C8 ISOlat1 -->
+<!ENTITY Eacute "É" ><!-- latin capital E with acute, U+00C9 ISOlat1 -->
+<!ENTITY Ecirc "Ê" ><!-- latin capital E with circumflex, U+00CA ISOlat1 -->
+<!ENTITY Euml "Ë" ><!-- latin capital E with diaeresis, U+00CB ISOlat1 -->
+<!ENTITY Igrave "Ì" ><!-- latin capital I with grave, U+00CC ISOlat1 -->
+<!ENTITY Iacute "Í" ><!-- latin capital I with acute, U+00CD ISOlat1 -->
+<!ENTITY Icirc "Î" ><!-- latin capital I with circumflex, U+00CE ISOlat1 -->
+<!ENTITY Iuml "Ï" ><!-- latin capital I with diaeresis, U+00CF ISOlat1 -->
+<!ENTITY ETH "Ð" ><!-- latin capital ETH, U+00D0 ISOlat1 -->
+<!ENTITY Ntilde "Ñ" ><!-- latin capital N with tilde, U+00D1 ISOlat1 -->
+<!ENTITY Ograve "Ò" ><!-- latin capital O with grave, U+00D2 ISOlat1 -->
+<!ENTITY Oacute "Ó" ><!-- latin capital O with acute, U+00D3 ISOlat1 -->
+<!ENTITY Ocirc "Ô" ><!-- latin capital O with circumflex, U+00D4 ISOlat1 -->
+<!ENTITY Otilde "Õ" ><!-- latin capital O with tilde, U+00D5 ISOlat1 -->
+<!ENTITY Ouml "Ö" ><!-- latin capital O with diaeresis, U+00D6 ISOlat1 -->
+<!ENTITY times "×" ><!-- multiplication sign, U+00D7 ISOnum -->
+<!ENTITY Oslash "Ø" ><!-- latin capital O with stroke = latin capital O slash, U+00D8 ISOlat1 -->
+<!ENTITY Ugrave "Ù" ><!-- latin capital U with grave, U+00D9 ISOlat1 -->
+<!ENTITY Uacute "Ú" ><!-- latin capital U with acute, U+00DA ISOlat1 -->
+<!ENTITY Ucirc "Û" ><!-- latin capital U with circumflex, U+00DB ISOlat1 -->
+<!ENTITY Uuml "Ü" ><!-- latin capital U with diaeresis, U+00DC ISOlat1 -->
+<!ENTITY Yacute "Ý" ><!-- latin capital Y with acute, U+00DD ISOlat1 -->
+<!ENTITY THORN "Þ" ><!-- latin capital THORN, U+00DE ISOlat1 -->
+<!ENTITY szlig "ß" ><!-- latin small sharp s = ess-zed, U+00DF ISOlat1 -->
+<!ENTITY agrave "à" ><!-- latin small a with grave = latin small a grave, U+00E0 ISOlat1 -->
+<!ENTITY aacute "á" ><!-- latin small a with acute, U+00E1 ISOlat1 -->
+<!ENTITY acirc "â" ><!-- latin small a with circumflex, U+00E2 ISOlat1 -->
+<!ENTITY atilde "ã" ><!-- latin small a with tilde, U+00E3 ISOlat1 -->
+<!ENTITY auml "ä" ><!-- latin small a with diaeresis, U+00E4 ISOlat1 -->
+<!ENTITY aring "å" ><!-- latin small a with ring above = latin small a ring, U+00E5 ISOlat1 -->
+<!ENTITY aelig "æ" ><!-- latin small ae = latin small ligature ae, U+00E6 ISOlat1 -->
+<!ENTITY ccedil "ç" ><!-- latin small c with cedilla, U+00E7 ISOlat1 -->
+<!ENTITY egrave "è" ><!-- latin small e with grave, U+00E8 ISOlat1 -->
+<!ENTITY eacute "é" ><!-- latin small e with acute, U+00E9 ISOlat1 -->
+<!ENTITY ecirc "ê" ><!-- latin small e with circumflex, U+00EA ISOlat1 -->
+<!ENTITY euml "ë" ><!-- latin small e with diaeresis, U+00EB ISOlat1 -->
+<!ENTITY igrave "ì" ><!-- latin small i with grave, U+00EC ISOlat1 -->
+<!ENTITY iacute "í" ><!-- latin small i with acute, U+00ED ISOlat1 -->
+<!ENTITY icirc "î" ><!-- latin small i with circumflex, U+00EE ISOlat1 -->
+<!ENTITY iuml "ï" ><!-- latin small i with diaeresis, U+00EF ISOlat1 -->
+<!ENTITY eth "ð" ><!-- latin small eth, U+00F0 ISOlat1 -->
+<!ENTITY ntilde "ñ" ><!-- latin small n with tilde, U+00F1 ISOlat1 -->
+<!ENTITY ograve "ò" ><!-- latin small o with grave, U+00F2 ISOlat1 -->
+<!ENTITY oacute "ó" ><!-- latin small o with acute, U+00F3 ISOlat1 -->
+<!ENTITY ocirc "ô" ><!-- latin small o with circumflex, U+00F4 ISOlat1 -->
+<!ENTITY otilde "õ" ><!-- latin small o with tilde, U+00F5 ISOlat1 -->
+<!ENTITY ouml "ö" ><!-- latin small o with diaeresis, U+00F6 ISOlat1 -->
+<!ENTITY divide "÷" ><!-- division sign, U+00F7 ISOnum -->
+<!ENTITY oslash "ø" ><!-- latin small o with stroke, = latin small o slash, U+00F8 ISOlat1 -->
+<!ENTITY ugrave "ù" ><!-- latin small u with grave, U+00F9 ISOlat1 -->
+<!ENTITY uacute "ú" ><!-- latin small u with acute, U+00FA ISOlat1 -->
+<!ENTITY ucirc "û" ><!-- latin small u with circumflex, U+00FB ISOlat1 -->
+<!ENTITY uuml "ü" ><!-- latin small u with diaeresis, U+00FC ISOlat1 -->
+<!ENTITY yacute "ý" ><!-- latin small y with acute, U+00FD ISOlat1 -->
+<!ENTITY thorn "þ" ><!-- latin small thorn with, U+00FE ISOlat1 -->
+<!ENTITY yuml "ÿ" ><!-- latin small y with diaeresis, U+00FF ISOlat1 -->
+<!-- end of xhtml-lat1.ent -->
+
+F.1.2. XHTML Special Characters
+
+You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-special.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-special.ent.
+
+<!-- ...................................................................... -->
+<!-- XML-compatible ISO Special Character Entity Set for XHTML ............ -->
+<!-- file: xhtml-special.ent
+
+ Typical invocation:
+
+ <!ENTITY % xhtml-special
+ PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
+ "xhtml-special.ent" >
+ %xhtml-special;
+
+ This DTD module is identified by the PUBLIC and SYSTEM identifiers:
+
+ PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
+ SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-special.ent"
+
+ Revision: $Id: xhtml-special.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
+
+ Portions (C) International Organization for Standardization 1986:
+ Permission to copy in any form is granted for use with conforming
+ SGML systems and applications as defined in ISO 8879, provided
+ this notice is included in all copies.
+
+ Revisions:
+2000-10-28: added &apos; and altered XML Predefined Entities for compatibility
+-->
+
+<!-- Relevant ISO entity set is given unless names are newly introduced.
+ New names (i.e., not in ISO 8879 [SGML] list) do not clash with
+ any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
+ numbers are given for each character, in hex. Entity values are
+ decimal conversions of the ISO 10646 values and refer to the
+ document character set. Names are Unicode [UNICODE] names.
+-->
+
+<!-- C0 Controls and Basic Latin -->
+<!ENTITY lt "&#38;#60;" ><!-- less-than sign, U+003C ISOnum -->
+<!ENTITY gt "&#62;" ><!-- greater-than sign, U+003E ISOnum -->
+<!ENTITY amp "&#38;#38;" ><!-- ampersand, U+0026 ISOnum -->
+<!ENTITY apos "&#39;" ><!-- The Apostrophe (Apostrophe Quote, APL Quote), U+0027 ISOnum -->
+<!ENTITY quot "&#34;" ><!-- quotation mark (Quote Double), U+0022 ISOnum -->
+
+<!-- Latin Extended-A -->
+<!ENTITY OElig "Œ" ><!-- latin capital ligature OE, U+0152 ISOlat2 -->
+<!ENTITY oelig "œ" ><!-- latin small ligature oe, U+0153 ISOlat2 -->
+
+<!-- ligature is a misnomer, this is a separate character in some languages -->
+<!ENTITY Scaron "Š" ><!-- latin capital letter S with caron, U+0160 ISOlat2 -->
+<!ENTITY scaron "š" ><!-- latin small letter s with caron, U+0161 ISOlat2 -->
+<!ENTITY Yuml "Ÿ" ><!-- latin capital letter Y with diaeresis, U+0178 ISOlat2 -->
+
+<!-- Spacing Modifier Letters -->
+<!ENTITY circ "ˆ" ><!-- modifier letter circumflex accent, U+02C6 ISOpub -->
+<!ENTITY tilde "˜" ><!-- small tilde, U+02DC ISOdia -->
+
+<!-- General Punctuation -->
+<!ENTITY ensp " " ><!-- en space, U+2002 ISOpub -->
+<!ENTITY emsp " " ><!-- em space, U+2003 ISOpub -->
+<!ENTITY thinsp " " ><!-- thin space, U+2009 ISOpub -->
+<!ENTITY zwnj "‌" ><!-- zero width non-joiner, U+200C NEW RFC 2070 -->
+<!ENTITY zwj "‍" ><!-- zero width joiner, U+200D NEW RFC 2070 -->
+<!ENTITY lrm "‎" ><!-- left-to-right mark, U+200E NEW RFC 2070 -->
+<!ENTITY rlm "‏" ><!-- right-to-left mark, U+200F NEW RFC 2070 -->
+<!ENTITY ndash "–" ><!-- en dash, U+2013 ISOpub -->
+<!ENTITY mdash "—" ><!-- em dash, U+2014 ISOpub -->
+<!ENTITY lsquo "‘" ><!-- left single quotation mark, U+2018 ISOnum -->
+<!ENTITY rsquo "’" ><!-- right single quotation mark, U+2019 ISOnum -->
+<!ENTITY sbquo "‚" ><!-- single low-9 quotation mark, U+201A NEW -->
+<!ENTITY ldquo "“" ><!-- left double quotation mark, U+201C ISOnum -->
+<!ENTITY rdquo "”" ><!-- right double quotation mark, U+201D ISOnum -->
+<!ENTITY bdquo "„" ><!-- double low-9 quotation mark, U+201E NEW -->
+<!ENTITY dagger "†" ><!-- dagger, U+2020 ISOpub -->
+<!ENTITY Dagger "‡" ><!-- double dagger, U+2021 ISOpub -->
+<!ENTITY permil "‰" ><!-- per mille sign, U+2030 ISOtech -->
+
+<!-- lsaquo is proposed but not yet ISO standardized -->
+<!ENTITY lsaquo "‹" ><!-- single left-pointing angle quotation mark, U+2039 ISO proposed -->
+<!-- rsaquo is proposed but not yet ISO standardized -->
+<!ENTITY rsaquo "›" ><!-- single right-pointing angle quotation mark, U+203A ISO proposed -->
+<!ENTITY euro "€" ><!-- euro sign, U+20AC NEW -->
+
+<!-- end of xhtml-special.ent -->
+
+F.1.3. XHTML Mathematical, Greek, and Symbolic Characters
+
+You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-symbol.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent.
+
+<!-- ...................................................................... -->
+<!-- ISO Math, Greek and Symbolic Character Entity Set for XHTML .......... -->
+<!-- file: xhtml-symbol.ent
+
+ Typical invocation:
+
+ <!ENTITY % xhtml-symbol
+ PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
+ "xhtml-symbol.ent" >
+ %xhtml-symbol;
+
+ This DTD module is identified by the PUBLIC and SYSTEM identifiers:
+
+ PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
+ SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent"
+
+ Revision: $Id: xhtml-symbol.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
+
+ Portions (C) International Organization for Standardization 1986:
+ Permission to copy in any form is granted for use with conforming
+ SGML systems and applications as defined in ISO 8879, provided
+ this notice is included in all copies.
+-->
+
+<!-- Relevant ISO entity set is given unless names are newly introduced.
+ New names (i.e., not in ISO 8879 [SGML] list) do not clash with
+ any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
+ numbers are given for each character, in hex. Entity values are
+ decimal conversions of the ISO 10646 values and refer to the
+ document character set. Names are Unicode [UNICODE] names.
+-->
+
+<!-- Latin Extended-B -->
+<!ENTITY fnof "ƒ" ><!-- latin small f with hook = function
+ = florin, U+0192 ISOtech -->
+
+<!-- Greek -->
+<!ENTITY Alpha "Α" ><!-- greek capital letter alpha, U+0391 -->
+<!ENTITY Beta "Β" ><!-- greek capital letter beta, U+0392 -->
+<!ENTITY Gamma "Γ" ><!-- greek capital letter gamma, U+0393 ISOgrk3 -->
+<!ENTITY Delta "Δ" ><!-- greek capital letter delta, U+0394 ISOgrk3 -->
+<!ENTITY Epsilon "Ε" ><!-- greek capital letter epsilon, U+0395 -->
+<!ENTITY Zeta "Ζ" ><!-- greek capital letter zeta, U+0396 -->
+<!ENTITY Eta "Η" ><!-- greek capital letter eta, U+0397 -->
+<!ENTITY Theta "Θ" ><!-- greek capital letter theta, U+0398 ISOgrk3 -->
+<!ENTITY Iota "Ι" ><!-- greek capital letter iota, U+0399 -->
+<!ENTITY Kappa "Κ" ><!-- greek capital letter kappa, U+039A -->
+<!ENTITY Lambda "Λ" ><!-- greek capital letter lambda, U+039B ISOgrk3 -->
+<!ENTITY Mu "Μ" ><!-- greek capital letter mu, U+039C -->
+<!ENTITY Nu "Ν" ><!-- greek capital letter nu, U+039D -->
+<!ENTITY Xi "Ξ" ><!-- greek capital letter xi, U+039E ISOgrk3 -->
+<!ENTITY Omicron "Ο" ><!-- greek capital letter omicron, U+039F -->
+<!ENTITY Pi "Π" ><!-- greek capital letter pi, U+03A0 ISOgrk3 -->
+<!ENTITY Rho "Ρ" ><!-- greek capital letter rho, U+03A1 -->
+<!-- there is no Sigmaf, and no U+03A2 character either -->
+<!ENTITY Sigma "Σ" ><!-- greek capital letter sigma, U+03A3 ISOgrk3 -->
+<!ENTITY Tau "Τ" ><!-- greek capital letter tau, U+03A4 -->
+<!ENTITY Upsilon "Υ" ><!-- greek capital letter upsilon,
+ U+03A5 ISOgrk3 -->
+<!ENTITY Phi "Φ" ><!-- greek capital letter phi, U+03A6 ISOgrk3 -->
+<!ENTITY Chi "Χ" ><!-- greek capital letter chi, U+03A7 -->
+<!ENTITY Psi "Ψ" ><!-- greek capital letter psi, U+03A8 ISOgrk3 -->
+<!ENTITY Omega "Ω" ><!-- greek capital letter omega, U+03A9 ISOgrk3 -->
+<!ENTITY alpha "α" ><!-- greek small letter alpha, U+03B1 ISOgrk3 -->
+<!ENTITY beta "β" ><!-- greek small letter beta, U+03B2 ISOgrk3 -->
+<!ENTITY gamma "γ" ><!-- greek small letter gamma, U+03B3 ISOgrk3 -->
+<!ENTITY delta "δ" ><!-- greek small letter delta, U+03B4 ISOgrk3 -->
+<!ENTITY epsilon "ε" ><!-- greek small letter epsilon, U+03B5 ISOgrk3 -->
+<!ENTITY zeta "ζ" ><!-- greek small letter zeta, U+03B6 ISOgrk3 -->
+<!ENTITY eta "η" ><!-- greek small letter eta, U+03B7 ISOgrk3 -->
+<!ENTITY theta "θ" ><!-- greek small letter theta, U+03B8 ISOgrk3 -->
+<!ENTITY iota "ι" ><!-- greek small letter iota, U+03B9 ISOgrk3 -->
+<!ENTITY kappa "κ" ><!-- greek small letter kappa, U+03BA ISOgrk3 -->
+<!ENTITY lambda "λ" ><!-- greek small letter lambda, U+03BB ISOgrk3 -->
+<!ENTITY mu "μ" ><!-- greek small letter mu, U+03BC ISOgrk3 -->
+<!ENTITY nu "ν" ><!-- greek small letter nu, U+03BD ISOgrk3 -->
+<!ENTITY xi "ξ" ><!-- greek small letter xi, U+03BE ISOgrk3 -->
+<!ENTITY omicron "ο" ><!-- greek small letter omicron, U+03BF NEW -->
+<!ENTITY pi "π" ><!-- greek small letter pi, U+03C0 ISOgrk3 -->
+<!ENTITY rho "ρ" ><!-- greek small letter rho, U+03C1 ISOgrk3 -->
+<!ENTITY sigmaf "ς" ><!-- greek small letter final sigma, U+03C2 ISOgrk3 -->
+<!ENTITY sigma "σ" ><!-- greek small letter sigma, U+03C3 ISOgrk3 -->
+<!ENTITY tau "τ" ><!-- greek small letter tau, U+03C4 ISOgrk3 -->
+<!ENTITY upsilon "υ" ><!-- greek small letter upsilon, U+03C5 ISOgrk3 -->
+<!ENTITY phi "φ" ><!-- greek small letter phi, U+03C6 ISOgrk3 -->
+<!ENTITY chi "χ" ><!-- greek small letter chi, U+03C7 ISOgrk3 -->
+<!ENTITY psi "ψ" ><!-- greek small letter psi, U+03C8 ISOgrk3 -->
+<!ENTITY omega "ω" ><!-- greek small letter omega, U+03C9 ISOgrk3 -->
+<!ENTITY thetasym "ϑ" ><!-- greek small letter theta symbol, U+03D1 NEW -->
+<!ENTITY upsih "ϒ" ><!-- greek upsilon with hook symbol, U+03D2 NEW -->
+<!ENTITY piv "ϖ" ><!-- greek pi symbol, U+03D6 ISOgrk3 -->
+
+<!-- General Punctuation -->
+<!ENTITY bull "•" ><!-- bullet = black small circle, U+2022 ISOpub -->
+<!-- bullet is NOT the same as bullet operator, U+2219 -->
+<!ENTITY hellip "…" ><!-- horizontal ellipsis = three dot leader, U+2026 ISOpub -->
+<!ENTITY prime "′" ><!-- prime = minutes = feet, U+2032 ISOtech -->
+<!ENTITY Prime "″" ><!-- double prime = seconds = inches, U+2033 ISOtech -->
+<!ENTITY oline "‾" ><!-- overline = spacing overscore, U+203E NEW -->
+<!ENTITY frasl "⁄" ><!-- fraction slash, U+2044 NEW -->
+
+<!-- Letterlike Symbols -->
+<!ENTITY weierp "℘" ><!-- script capital P = power set = Weierstrass p, U+2118 ISOamso -->
+<!ENTITY image "ℑ" ><!-- blackletter capital I = imaginary part, U+2111 ISOamso -->
+<!ENTITY real "ℜ" ><!-- blackletter capital R = real part symbol, U+211C ISOamso -->
+<!ENTITY trade "™" ><!-- trade mark sign, U+2122 ISOnum -->
+<!ENTITY alefsym "ℵ" ><!-- alef symbol = first transfinite cardinal, U+2135 NEW -->
+<!-- alef symbol is NOT the same as hebrew letter alef, U+05D0 although
+ the same glyph could be used to depict both characters -->
+
+<!-- Arrows -->
+<!ENTITY larr "←" ><!-- leftwards arrow, U+2190 ISOnum -->
+<!ENTITY uarr "↑" ><!-- upwards arrow, U+2191 ISOnum-->
+<!ENTITY rarr "→" ><!-- rightwards arrow, U+2192 ISOnum -->
+<!ENTITY darr "↓" ><!-- downwards arrow, U+2193 ISOnum -->
+<!ENTITY harr "↔" ><!-- left right arrow, U+2194 ISOamsa -->
+<!ENTITY crarr "↵" ><!-- downwards arrow with corner leftwards
+ = carriage return, U+21B5 NEW -->
+<!ENTITY lArr "⇐" ><!-- leftwards double arrow, U+21D0 ISOtech -->
+<!-- Unicode does not say that lArr is the same as the 'is implied by' arrow
+ but also does not have any other character for that function. So ? lArr can
+ be used for 'is implied by' as ISOtech suggests -->
+<!ENTITY uArr "⇑" ><!-- upwards double arrow, U+21D1 ISOamsa -->
+<!ENTITY rArr "⇒" ><!-- rightwards double arrow, U+21D2 ISOtech -->
+<!-- Unicode does not say this is the 'implies' character but does not have
+ another character with this function so ?
+ rArr can be used for 'implies' as ISOtech suggests -->
+<!ENTITY dArr "⇓" ><!-- downwards double arrow, U+21D3 ISOamsa -->
+<!ENTITY hArr "⇔" ><!-- left right double arrow, U+21D4 ISOamsa -->
+
+<!-- Mathematical Operators -->
+<!ENTITY forall "∀" ><!-- for all, U+2200 ISOtech -->
+<!ENTITY part "∂" ><!-- partial differential, U+2202 ISOtech -->
+<!ENTITY exist "∃" ><!-- there exists, U+2203 ISOtech -->
+<!ENTITY empty "∅" ><!-- empty set = null set, U+2205 ISOamso -->
+<!ENTITY nabla "∇" ><!-- nabla = backward difference, U+2207 ISOtech -->
+<!ENTITY isin "∈" ><!-- element of, U+2208 ISOtech -->
+<!ENTITY notin "∉" ><!-- not an element of, U+2209 ISOtech -->
+<!ENTITY ni "∋" ><!-- contains as member, U+220B ISOtech -->
+<!-- should there be a more memorable name than 'ni'? -->
+<!ENTITY prod "∏" ><!-- n-ary product = product sign, U+220F ISOamsb -->
+<!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though
+ the same glyph might be used for both -->
+<!ENTITY sum "∑" ><!-- n-ary sumation, U+2211 ISOamsb -->
+<!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
+ though the same glyph might be used for both -->
+<!ENTITY minus "−" ><!-- minus sign, U+2212 ISOtech -->
+<!ENTITY lowast "∗" ><!-- asterisk operator, U+2217 ISOtech -->
+<!ENTITY radic "√" ><!-- square root = radical sign, U+221A ISOtech -->
+<!ENTITY prop "∝" ><!-- proportional to, U+221D ISOtech -->
+<!ENTITY infin "∞" ><!-- infinity, U+221E ISOtech -->
+<!ENTITY ang "∠" ><!-- angle, U+2220 ISOamso -->
+<!ENTITY and "∧" ><!-- logical and = wedge, U+2227 ISOtech -->
+<!ENTITY or "∨" ><!-- logical or = vee, U+2228 ISOtech -->
+<!ENTITY cap "∩" ><!-- intersection = cap, U+2229 ISOtech -->
+<!ENTITY cup "∪" ><!-- union = cup, U+222A ISOtech -->
+<!ENTITY int "∫" ><!-- integral, U+222B ISOtech -->
+<!ENTITY there4 "∴" ><!-- therefore, U+2234 ISOtech -->
+<!ENTITY sim "∼" ><!-- tilde operator = varies with = similar to, U+223C ISOtech -->
+<!-- tilde operator is NOT the same character as the tilde, U+007E,
+ although the same glyph might be used to represent both -->
+<!ENTITY cong "≅" ><!-- approximately equal to, U+2245 ISOtech -->
+<!ENTITY asymp "≈" ><!-- almost equal to = asymptotic to, U+2248 ISOamsr -->
+<!ENTITY ne "≠" ><!-- not equal to, U+2260 ISOtech -->
+<!ENTITY equiv "≡" ><!-- identical to, U+2261 ISOtech -->
+<!ENTITY le "≤" ><!-- less-than or equal to, U+2264 ISOtech -->
+<!ENTITY ge "≥" ><!-- greater-than or equal to, U+2265 ISOtech -->
+<!ENTITY sub "⊂" ><!-- subset of, U+2282 ISOtech -->
+<!ENTITY sup "⊃" ><!-- superset of, U+2283 ISOtech -->
+<!-- note that nsup, 'not a superset of, U+2283' is not covered by the Symbol
+ font encoding and is not included. Should it be, for symmetry?
+ It is in ISOamsn -->
+<!ENTITY nsub "⊄" ><!-- not a subset of, U+2284 ISOamsn -->
+<!ENTITY sube "⊆" ><!-- subset of or equal to, U+2286 ISOtech -->
+<!ENTITY supe "⊇" ><!-- superset of or equal to, U+2287 ISOtech -->
+<!ENTITY oplus "⊕" ><!-- circled plus = direct sum, U+2295 ISOamsb -->
+<!ENTITY otimes "⊗" ><!-- circled times = vector product, U+2297 ISOamsb -->
+<!ENTITY perp "⊥" ><!-- up tack = orthogonal to = perpendicular, U+22A5 ISOtech -->
+<!ENTITY sdot "⋅" ><!-- dot operator, U+22C5 ISOamsb -->
+<!-- dot operator is NOT the same character as U+00B7 middle dot -->
+
+<!-- Miscellaneous Technical -->
+<!ENTITY lceil "⌈" ><!-- left ceiling = apl upstile, U+2308 ISOamsc -->
+<!ENTITY rceil "⌉" ><!-- right ceiling, U+2309 ISOamsc -->
+<!ENTITY lfloor "⌊" ><!-- left floor = apl downstile, U+230A ISOamsc -->
+<!ENTITY rfloor "⌋" ><!-- right floor, U+230B ISOamsc -->
+<!ENTITY lang "〈" ><!-- left-pointing angle bracket = bra, U+2329 ISOtech -->
+<!-- lang is NOT the same character as U+003C 'less than'
+ or U+2039 'single left-pointing angle quotation mark' -->
+<!ENTITY rang "〉" ><!-- right-pointing angle bracket = ket, U+232A ISOtech -->
+<!-- rang is NOT the same character as U+003E 'greater than'
+ or U+203A 'single right-pointing angle quotation mark' -->
+
+<!-- Geometric Shapes -->
+<!ENTITY loz "◊" ><!-- lozenge, U+25CA ISOpub -->
+
+<!-- Miscellaneous Symbols -->
+<!ENTITY spades "♠" ><!-- black spade suit, U+2660 ISOpub -->
+<!-- black here seems to mean filled as opposed to hollow -->
+<!ENTITY clubs "♣" ><!-- black club suit = shamrock, U+2663 ISOpub -->
+<!ENTITY hearts "♥" ><!-- black heart suit = valentine, U+2665 ISOpub -->
+<!ENTITY diams "♦" ><!-- black diamond suit, U+2666 ISOpub -->
+
+<!-- end of xhtml-symbol.ent -->
+"""
+ return text
+
+def get_apache_license():
+ # Returns the Apache License 2.0 header as a Java-style /** ... */ block
+ # comment followed by one blank line; main() prints it first.
+ license = r"""/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+"""
+ return license
+
+main()