You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2012/01/24 16:51:57 UTC

svn commit: r1235308 [3/5] - in /lucene/dev/branches/branch_3x: lucene/ lucene/contrib/analyzers/common/ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/ lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis...

Added: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex?rev=1235308&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex (added)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex Tue Jan 24 15:51:55 2012
@@ -0,0 +1,875 @@
+package org.apache.lucene.analysis.charfilter;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.util.Version;
+import org.apache.lucene.analysis.BaseCharFilter;
+import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.CharArrayMap;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.util.OpenStringBuilder;
+
+
+/**
+ * A CharFilter that wraps another Reader and attempts to strip out HTML constructs.
+ */
+@SuppressWarnings("fallthrough")
+%%
+
+%unicode 6.0
+%apiprivate
+%type int
+%final
+%public
+%char
+%function nextChar
+%class HTMLStripCharFilter
+%extends BaseCharFilter
+%xstate AMPERSAND, NUMERIC_CHARACTER, CHARACTER_REFERENCE_TAIL
+%xstate LEFT_ANGLE_BRACKET, BANG, COMMENT, SCRIPT, SCRIPT_COMMENT
+%xstate LEFT_ANGLE_BRACKET_SLASH, LEFT_ANGLE_BRACKET_SPACE, CDATA
+%xstate SERVER_SIDE_INCLUDE, SINGLE_QUOTED_STRING, DOUBLE_QUOTED_STRING
+%xstate END_TAG_TAIL_INCLUDE, END_TAG_TAIL_EXCLUDE, END_TAG_TAIL_SUBSTITUTE
+%xstate START_TAG_TAIL_INCLUDE, START_TAG_TAIL_EXCLUDE, START_TAG_TAIL_SUBSTITUTE
+%xstate STYLE, STYLE_COMMENT
+
+// From XML 1.0 <http://www.w3.org/TR/xml/>:
+//
+//    [4]  NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [...]
+//    [4a] NameChar      ::= NameStartChar | "-" | "." | [0-9] | [...]
+//    [5]  Name          ::= NameStartChar (NameChar)*
+//
+// From UAX #31: Unicode Identifier and Pattern Syntax
+// <http://unicode.org/reports/tr31/>:
+//
+//    D1. Default Identifier Syntax
+//
+//        <identifier> := <ID_Start> <ID_Continue>*
+//
+Name = ( ( [:_\p{ID_Start}] | {ID_Start_Supp} ) ( [-.:_\p{ID_Continue}] | {ID_Continue_Supp} )* )
+
+// From Apache httpd mod_include documentation
+// <http://httpd.apache.org/docs/current/mod/mod_include.html>:
+//
+// Basic Elements
+//
+//    The document is parsed as an HTML document, with special commands
+//    embedded as SGML comments. A command has the syntax:
+//
+//       <!--#element attribute=value attribute=value ... -->
+//
+//    The value will often be enclosed in double quotes, but single quotes (')
+//    and backticks (`) are also possible. Many commands only allow a single
+//    attribute-value pair. Note that the comment terminator (-->) should be
+//    preceded by whitespace to ensure that it isn't considered part of an SSI
+//    token. Note that the leading <!--# is one token and may not contain any
+//    whitespaces.
+//
+
+EventAttributeSuffixes = ( [aA][bB][oO][rR][tT]                 |
+                           [bB][lL][uU][rR]                     |
+                           [cC][hH][aA][nN][gG][eE]             |
+                           [cC][lL][iI][cC][kK]                 |
+	                         [dD][bB][lL][cC][lL][iI][cC][kK]     |
+                           [eE][rR][rR][oO][rR]                 |
+                           [fF][oO][cC][uU][sS]                 |
+	                         [kK][eE][yY][dD][oO][wW][nN]         |
+	                         [kK][eE][yY][pP][rR][eE][sS][sS]     |
+	                         [kK][eE][yY][uU][pP]                 |
+                           [lL][oO][aA][dD]                     |
+	                         [mM][oO][uU][sS][eE][dD][oO][wW][nN] |
+	                         [mM][oO][uU][sS][eE][mM][oO][vV][eE] |
+                           [mM][oO][uU][sS][eE][oO][uU][tT]     |
+                           [mM][oO][uU][sS][eE][oO][vV][eE][rR] |
+	                         [mM][oO][uU][sS][eE][uU][pP]         |
+                           [rR][eE][sS][eE][tT]                 |
+                           [sS][eE][lL][eE][cC][tT]             |
+                           [sS][uU][bB][mM][iI][tT]             |
+                           [uU][nN][lL][oO][aA][dD]             )
+
+SingleQuoted = ( "'" ( "\\'" | [^']* )* "'" )
+DoubleQuoted = ( "\"" ( "\\\"" | [^\"]* )* "\"" )
+ServerSideInclude = ( "<!--#" ( [^'\"] | {SingleQuoted} | {DoubleQuoted} )* "-->" )
+EventAttribute = [oO][nN] {EventAttributeSuffixes} \s* "=" \s* ( {SingleQuoted} | {DoubleQuoted} )
+OpenTagContent = ( {EventAttribute} | [^<>] | {ServerSideInclude} )*
+
+InlineElment = ( [aAbBiIqQsSuU]                   |
+                 [aA][bB][bB][rR]                 |
+                 [aA][cC][rR][oO][nN][yY][mM]     |
+                 [bB][aA][sS][eE][fF][oO][nN][tT] |
+                 [bB][dD][oO]                     |
+                 [bB][iI][gG]                     |
+                 [cC][iI][tT][eE]                 |
+                 [cC][oO][dD][eE]                 |
+                 [dD][fF][nN]                     |
+                 [eE][mM]                         |
+                 [fF][oO][nN][tT]                 |
+                 [iI][mM][gG]                     |
+                 [iI][nN][pP][uU][tT]             |
+                 [kK][bB][dD]                     |
+                 [lL][aA][bB][eE][lL]             |
+                 [sS][aA][mM][pP]                 |
+                 [sS][eE][lL][eE][cC][tT]         |
+                 [sS][mM][aA][lL][lL]             |
+                 [sS][pP][aA][nN]                 |
+                 [sS][tT][rR][iI][kK][eE]         |
+                 [sS][tT][rR][oO][nN][gG]         |
+                 [sS][uU][bB]                     |
+                 [sS][uU][pP]                     |
+                 [tT][eE][xX][tT][aA][rR][eE][aA] |
+                 [tT][tT]                         |
+                 [vV][aA][rR]                     )
+
+
+%include src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex
+
+%include src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
+
+%{
+  private static final int INITIAL_INPUT_SEGMENT_SIZE = 1024;
+  private static final char BLOCK_LEVEL_START_TAG_REPLACEMENT = '\n';
+  private static final char BLOCK_LEVEL_END_TAG_REPLACEMENT = '\n';
+  private static final char BR_START_TAG_REPLACEMENT = '\n';
+  private static final char BR_END_TAG_REPLACEMENT = '\n';
+  private static final char SCRIPT_REPLACEMENT = '\n';
+  private static final char STYLE_REPLACEMENT = '\n';
+  private static final char REPLACEMENT_CHARACTER = '\uFFFD';
+
+  private CharArraySet escapedTags = null;
+  private int inputStart;
+  private int cumulativeDiff;
+  private boolean escapeBR = false;
+  private boolean escapeSCRIPT = false;
+  private boolean escapeSTYLE = false;
+  private int restoreState;
+  private int previousRestoreState;
+  private int outputCharCount;
+  private int eofReturnValue;
+  private TextSegment inputSegment
+      = new TextSegment(INITIAL_INPUT_SEGMENT_SIZE);
+  private TextSegment outputSegment = inputSegment;
+  private TextSegment entitySegment = new TextSegment(2);
+
+  /**
+   * @param source
+   */
+  public HTMLStripCharFilter(CharStream source) {
+    super(source);
+    this.zzReader = source;
+  }
+
+  /**
+   * @param source
+   * @param escapedTags Tags in this set (both start and end tags)
+   *  will not be filtered out.
+   */
+  public HTMLStripCharFilter(CharStream source, Set<String> escapedTags) {
+    super(source);
+    this.zzReader = source;
+    if (null != escapedTags) {
+      for (String tag : escapedTags) {
+        if (tag.equalsIgnoreCase("BR")) {
+          escapeBR = true;
+        } else if (tag.equalsIgnoreCase("SCRIPT")) {
+          escapeSCRIPT = true;
+        } else if (tag.equalsIgnoreCase("STYLE")) {
+          escapeSTYLE = true;
+        } else {
+          if (null == this.escapedTags) {
+            this.escapedTags = new CharArraySet(Version.LUCENE_36, 16, true);
+          }
+          this.escapedTags.add(tag);
+        }
+      }
+    }
+  }
+
+  @Override
+  public int read() throws IOException {
+    if (outputSegment.isRead()) {
+      if (zzAtEOF) {
+        return -1;
+      }
+      int ch = nextChar();
+      ++outputCharCount;
+      return ch;
+    }
+    int ch = outputSegment.nextChar();
+    ++outputCharCount;
+    return ch;
+  }
+
+  @Override
+  public int read(char cbuf[], int off, int len) throws IOException {
+    int i = 0;
+    for ( ; i < len ; ++i) {
+      int ch = read();
+      if (ch == -1) break;
+      cbuf[off++] = (char)ch;
+    }
+    return i > 0 ? i : (len == 0 ? 0 : -1);
+  }
+
+  @Override
+  public void close() throws IOException {
+    yyclose();
+  }
+
+  static int getInitialBufferSize() {  // Package private, for testing purposes
+    return ZZ_BUFFERSIZE;
+  }
+
+  private class TextSegment extends OpenStringBuilder {
+    /** The position from which the next char will be read. */
+    int pos = 0;
+
+    /** Wraps the given buffer and sets this.len to the given length. */
+    TextSegment(char[] buffer, int length) {
+      super(buffer, length);
+    }
+
+    /** Allocates an internal buffer of the given size. */
+    TextSegment(int size) {
+      super(size);
+    }
+
+    /** Sets len = 0 and pos = 0. */
+    void clear() {
+      reset();
+      restart();
+    }
+
+    /** Sets pos = 0 */
+    void restart() {
+      pos = 0;
+    }
+
+    /** Returns the next char in the segment. */
+    int nextChar() {
+      assert (! isRead()): "Attempting to read past the end of a segment.";
+      return buf[pos++];
+    }
+
+    /** Returns true when all characters in the text segment have been read */
+    boolean isRead() {
+      return pos >= len;
+    }
+  }
+%}
+
+%eofval{
+  return eofReturnValue;
+%eofval}
+%eof{
+  switch (zzLexicalState) {
+    case SCRIPT:
+    case COMMENT:
+    case SCRIPT_COMMENT:
+    case STYLE:
+    case STYLE_COMMENT:
+    case SINGLE_QUOTED_STRING:
+    case DOUBLE_QUOTED_STRING:
+    case END_TAG_TAIL_EXCLUDE:
+    case END_TAG_TAIL_SUBSTITUTE:
+    case START_TAG_TAIL_EXCLUDE:
+    case SERVER_SIDE_INCLUDE:
+    case START_TAG_TAIL_SUBSTITUTE: { // Exclude
+      cumulativeDiff += yychar - inputStart;
+      addOffCorrectMap(outputCharCount, cumulativeDiff);
+      outputSegment.clear();
+      eofReturnValue = -1;
+      break;
+    }
+    case CHARACTER_REFERENCE_TAIL: {        // Substitute
+      // At end of file, allow char refs without semicolons
+      cumulativeDiff += inputSegment.length() - outputSegment.length();
+      addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
+      eofReturnValue = outputSegment.nextChar();
+      break;
+    }
+    case BANG:
+    case CDATA:
+    case AMPERSAND:
+    case NUMERIC_CHARACTER:
+    case END_TAG_TAIL_INCLUDE:
+    case START_TAG_TAIL_INCLUDE:
+    case LEFT_ANGLE_BRACKET:
+    case LEFT_ANGLE_BRACKET_SLASH:
+    case LEFT_ANGLE_BRACKET_SPACE: {        // Include
+      outputSegment = inputSegment;
+      eofReturnValue = outputSegment.nextChar();
+      break;
+    }
+    default: {
+      eofReturnValue = -1;
+    }
+  }
+%eof}
+
+%%
+
+"&" {
+  inputStart = yychar;
+  inputSegment.clear();
+  inputSegment.append('&');
+  yybegin(AMPERSAND);
+}
+
+"<" {
+  inputStart = yychar;
+  inputSegment.clear();
+  inputSegment.append('<');
+  yybegin(LEFT_ANGLE_BRACKET);
+}
+
+<AMPERSAND> {
+  {CharacterEntities} {
+    int length = yylength();
+    inputSegment.write(zzBuffer, zzStartRead, length);
+    entitySegment.clear();
+    char ch = entityValues.get(zzBuffer, zzStartRead, length).charValue();
+    entitySegment.append(ch);
+    outputSegment = entitySegment;
+    yybegin(CHARACTER_REFERENCE_TAIL);
+  }
+  "#" { inputSegment.append('#'); yybegin(NUMERIC_CHARACTER); }
+
+//                                             1   1       11              11
+// 0  1   2   3       45              678  9   0   1       23              45
+  "#" [xX][dD][89aAbB][0-9a-fA-F]{2} ";&#" [xX][dD][c-fC-F][0-9a-fA-F]{2} ";" {
+    // Handle paired UTF-16 surrogates.
+    outputSegment = entitySegment;
+    outputSegment.clear();
+    String surrogatePair = yytext();
+    char highSurrogate = '\u0000';
+    try {
+      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
+    } catch(Exception e) { // should never happen
+      assert false: "Exception parsing high surrogate '"
+                  + surrogatePair.substring(2, 6) + "'";
+    }
+    try {
+      outputSegment.unsafeWrite
+          ((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
+    } catch(Exception e) { // should never happen
+      assert false: "Exception parsing low surrogate '"
+                  + surrogatePair.substring(10, 14) + "'";
+    }
+    cumulativeDiff += inputSegment.length() + yylength() - 2;
+    addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
+    inputSegment.clear();
+    yybegin(YYINITIAL);
+    return highSurrogate;
+  }
+
+//                          1   1       11              11
+// 01  2    345    678  9   0   1       23              45
+  "#5" [56] \d{3} ";&#" [xX][dD][c-fC-F][0-9a-fA-F]{2} ";" {
+    // Handle paired UTF-16 surrogates.
+    String surrogatePair = yytext();
+    char highSurrogate = '\u0000';
+    try { // High surrogates are in decimal range [55296, 56319]
+      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
+    } catch(Exception e) { // should never happen
+      assert false: "Exception parsing high surrogate '"
+                  + surrogatePair.substring(1, 6) + "'";
+    }
+    if (Character.isHighSurrogate(highSurrogate)) {
+      outputSegment = entitySegment;
+      outputSegment.clear();
+      try {
+        outputSegment.unsafeWrite
+            ((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
+      } catch(Exception e) { // should never happen
+        assert false: "Exception parsing low surrogate '"
+                    + surrogatePair.substring(10, 14) + "'";
+      }
+      cumulativeDiff += inputSegment.length() + yylength() - 2;
+      addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
+      inputSegment.clear();
+      yybegin(YYINITIAL);
+      return highSurrogate;
+    }
+    yypushback(surrogatePair.length() - 1); // Consume only '#'
+    inputSegment.append('#');
+    yybegin(NUMERIC_CHARACTER);
+  }
+
+//                                          1    111     11
+// 0  1   2   3       45              6789  0    123     45
+  "#" [xX][dD][89aAbB][0-9a-fA-F]{2} ";&#5" [67] \d{3}  ";" {
+    // Handle paired UTF-16 surrogates.
+    String surrogatePair = yytext();
+    char highSurrogate = '\u0000';
+    char lowSurrogate = '\u0000';
+    try {
+      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
+    } catch(Exception e) { // should never happen
+      assert false: "Exception parsing high surrogate '"
+                  + surrogatePair.substring(2, 6) + "'";
+    }
+    try { // Low surrogates are in decimal range [56320, 57343]
+      lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
+    } catch(Exception e) { // should never happen
+      assert false: "Exception parsing low surrogate '"
+                  + surrogatePair.substring(9, 14) + "'";
+    }
+    if (Character.isLowSurrogate(lowSurrogate)) {
+      outputSegment = entitySegment;
+      outputSegment.clear();
+      outputSegment.unsafeWrite(lowSurrogate);
+      cumulativeDiff += inputSegment.length() + yylength() - 2;
+      addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
+      inputSegment.clear();
+      yybegin(YYINITIAL);
+      return highSurrogate;
+    }
+    yypushback(surrogatePair.length() - 1); // Consume only '#'
+    inputSegment.append('#');
+    yybegin(NUMERIC_CHARACTER);
+  }
+
+//                       1    111     11
+// 01  2    345    6789  0    123     45
+  "#5" [56] \d{3} ";&#5" [67] \d{3}  ";" {
+    // Handle paired UTF-16 surrogates.
+    String surrogatePair = yytext();
+    char highSurrogate = '\u0000';
+    try { // High surrogates are in decimal range [55296, 56319]
+      highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
+    } catch(Exception e) { // should never happen
+      assert false: "Exception parsing high surrogate '"
+                  + surrogatePair.substring(1, 6) + "'";
+    }
+    if (Character.isHighSurrogate(highSurrogate)) {
+      char lowSurrogate = '\u0000';
+      try { // Low surrogates are in decimal range [56320, 57343]
+        lowSurrogate = (char)Integer.parseInt(surrogatePair.substring(9, 14));
+      } catch(Exception e) { // should never happen
+        assert false: "Exception parsing low surrogate '"
+                    + surrogatePair.substring(9, 14) + "'";
+      }
+      if (Character.isLowSurrogate(lowSurrogate)) {
+        outputSegment = entitySegment;
+        outputSegment.clear();
+        outputSegment.unsafeWrite(lowSurrogate);
+        cumulativeDiff += inputSegment.length() + yylength() - 2;
+        addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
+        inputSegment.clear();
+        yybegin(YYINITIAL);
+        return highSurrogate;
+      }
+    }
+    yypushback(surrogatePair.length() - 1); // Consume only '#'
+    inputSegment.append('#');
+    yybegin(NUMERIC_CHARACTER);
+  }
+}
+
+<NUMERIC_CHARACTER> {
+  [xX] [0-9A-Fa-f]+ {
+    int matchLength = yylength();
+    inputSegment.write(zzBuffer, zzStartRead, matchLength);
+    if (matchLength <= 6) { // 10FFFF: max 6 hex chars
+      String hexCharRef
+          = new String(zzBuffer, zzStartRead + 1, matchLength - 1);
+      int codePoint = 0;
+      try {
+        codePoint = Integer.parseInt(hexCharRef, 16);
+      } catch(Exception e) {
+        assert false: "Exception parsing hex code point '" + hexCharRef + "'";
+      }
+      if (codePoint <= 0x10FFFF) {
+        outputSegment = entitySegment;
+        outputSegment.clear();
+        if (codePoint >= Character.MIN_SURROGATE
+            && codePoint <= Character.MAX_SURROGATE) {
+          outputSegment.unsafeWrite(REPLACEMENT_CHARACTER);
+        } else {
+          outputSegment.setLength
+              (Character.toChars(codePoint, outputSegment.getArray(), 0));
+        }
+        yybegin(CHARACTER_REFERENCE_TAIL);
+      } else {
+        outputSegment = inputSegment;
+        yybegin(YYINITIAL);
+        return outputSegment.nextChar();
+      }
+    } else {
+      outputSegment = inputSegment;
+      yybegin(YYINITIAL);
+      return outputSegment.nextChar();
+    }
+  }
+  [0-9]+ {
+    int matchLength = yylength();
+    inputSegment.write(zzBuffer, zzStartRead, matchLength);
+    if (matchLength <= 7) { // 0x10FFFF = 1114111: max 7 decimal chars
+      String decimalCharRef = yytext();
+      int codePoint = 0;
+      try {
+        codePoint = Integer.parseInt(decimalCharRef);
+      } catch(Exception e) {
+        assert false: "Exception parsing code point '" + decimalCharRef + "'";
+      }
+      if (codePoint <= 0x10FFFF) {
+        outputSegment = entitySegment;
+        outputSegment.clear();
+        if (codePoint >= Character.MIN_SURROGATE
+            && codePoint <= Character.MAX_SURROGATE) {
+          outputSegment.unsafeWrite(REPLACEMENT_CHARACTER);
+        } else {
+          outputSegment.setLength
+              (Character.toChars(codePoint, outputSegment.getArray(), 0));
+        }
+        yybegin(CHARACTER_REFERENCE_TAIL);
+      } else {
+        outputSegment = inputSegment;
+        yybegin(YYINITIAL);
+        return outputSegment.nextChar();
+      }
+    } else {
+      outputSegment = inputSegment;
+      yybegin(YYINITIAL);
+      return outputSegment.nextChar();
+    }
+  }
+}
+
+<CHARACTER_REFERENCE_TAIL> {
+  ";" {
+    cumulativeDiff
+        += inputSegment.length() + yylength() - outputSegment.length();
+    addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
+    yybegin(YYINITIAL);
+    return outputSegment.nextChar();
+  }
+}
+
+<LEFT_ANGLE_BRACKET_SLASH> {
+  \s+ { inputSegment.write(zzBuffer, zzStartRead, yylength()); }
+  [bB][rR] \s* ">" {
+    yybegin(YYINITIAL);
+    if (escapeBR) {
+      inputSegment.write(zzBuffer, zzStartRead, yylength());
+      outputSegment = inputSegment;
+      return outputSegment.nextChar();
+    } else {
+      cumulativeDiff
+          += inputSegment.length() + yylength() - outputSegment.length();
+      addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
+      inputSegment.reset();
+      return BR_END_TAG_REPLACEMENT;
+    }
+  }
+  {InlineElment} {
+    inputSegment.write(zzBuffer, zzStartRead, yylength());
+    if (null != escapedTags
+        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
+      yybegin(END_TAG_TAIL_INCLUDE);
+    } else {
+      yybegin(END_TAG_TAIL_EXCLUDE);
+    }
+  }
+  {Name} {
+    inputSegment.write(zzBuffer, zzStartRead, yylength());
+    if (null != escapedTags
+        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
+      yybegin(END_TAG_TAIL_INCLUDE);
+    } else {
+      yybegin(END_TAG_TAIL_SUBSTITUTE);
+    }
+  }
+}
+
+<END_TAG_TAIL_INCLUDE> {
+   \s* ">" {
+     inputSegment.write(zzBuffer, zzStartRead, yylength());
+     outputSegment = inputSegment;
+     yybegin(YYINITIAL);
+     return outputSegment.nextChar();
+   }
+}
+
+<END_TAG_TAIL_EXCLUDE> {
+  \s* ">" {
+    cumulativeDiff += inputSegment.length() + yylength();
+    addOffCorrectMap(outputCharCount, cumulativeDiff);
+    inputSegment.clear();
+    yybegin(YYINITIAL);
+  }
+}
+
+<END_TAG_TAIL_SUBSTITUTE> {
+  \s* ">" {
+    cumulativeDiff += inputSegment.length() + yylength() - 1;
+    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
+    inputSegment.clear();
+    yybegin(YYINITIAL);
+    return BLOCK_LEVEL_END_TAG_REPLACEMENT;
+  }
+}
+
+<LEFT_ANGLE_BRACKET> {
+  "!" { inputSegment.append('!'); yybegin(BANG); }
+  "/" { inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH); }
+  \s+ {
+    inputSegment.write(zzBuffer, zzStartRead, yylength());
+    yybegin(LEFT_ANGLE_BRACKET_SPACE);
+  }
+  "?" [^>]* [/?] ">" {
+    cumulativeDiff += inputSegment.length() + yylength();
+    addOffCorrectMap(outputCharCount, cumulativeDiff);
+    inputSegment.clear();
+    yybegin(YYINITIAL);
+  }
+  \s* [bB][rR] ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
+    yybegin(YYINITIAL);
+    if (escapeBR) {
+      inputSegment.write(zzBuffer, zzStartRead, yylength());
+      outputSegment = inputSegment;
+      return outputSegment.nextChar();
+    } else {
+      cumulativeDiff
+          += inputSegment.length() + yylength() - outputSegment.length();
+      addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
+      inputSegment.reset();
+      return BR_START_TAG_REPLACEMENT;
+    }
+  }
+  \s* [sS][cC][rR][iI][pP][tT] ( \s+ {OpenTagContent} )? \s*  ">" {
+    yybegin(SCRIPT);
+    if (escapeSCRIPT) {
+      inputSegment.write(zzBuffer, zzStartRead, yylength());
+      outputSegment = inputSegment;
+      inputStart += 1 + yylength();
+      return outputSegment.nextChar();
+    }
+  }
+  \s* [sS][tT][yY][lL][eE] ( \s+ {OpenTagContent} )? \s* ">" {
+    yybegin(STYLE);
+    if (escapeSTYLE) {
+      inputSegment.write(zzBuffer, zzStartRead, yylength());
+      outputSegment = inputSegment;
+      inputStart += 1 + yylength();
+      return outputSegment.nextChar();
+    }
+  }
+}
+
+<LEFT_ANGLE_BRACKET, LEFT_ANGLE_BRACKET_SPACE> {
+  {InlineElment} {
+    inputSegment.write(zzBuffer, zzStartRead, yylength());
+    if (null != escapedTags
+        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
+      yybegin(START_TAG_TAIL_INCLUDE);
+    } else {
+      yybegin(START_TAG_TAIL_EXCLUDE);
+    }
+  }
+  {Name} {
+    inputSegment.write(zzBuffer, zzStartRead, yylength());
+    if (null != escapedTags
+        && escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
+      yybegin(START_TAG_TAIL_INCLUDE);
+    } else {
+      yybegin(START_TAG_TAIL_SUBSTITUTE);
+    }
+  }
+}
+
+<START_TAG_TAIL_INCLUDE> {
+   ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
+     inputSegment.write(zzBuffer, zzStartRead, yylength());
+     outputSegment = inputSegment;
+     yybegin(YYINITIAL);
+     return outputSegment.nextChar();
+   }
+}
+
+<START_TAG_TAIL_EXCLUDE> {
+   ( ( "="\s* | \s+ ) {OpenTagContent} )? \s* "/"? ">" {
+    cumulativeDiff += inputSegment.length() + yylength();
+    addOffCorrectMap(outputCharCount, cumulativeDiff);
+    inputSegment.clear();
+    outputSegment = inputSegment;
+    yybegin(YYINITIAL);
+  }
+}
+
+<START_TAG_TAIL_SUBSTITUTE> {
+  ( ( "="\s* | \s+ ) {OpenTagContent} )? \s*  "/"? ">" {
+    cumulativeDiff += inputSegment.length() + yylength() - 1;
+    addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
+    inputSegment.clear();
+    yybegin(YYINITIAL);
+    return BLOCK_LEVEL_START_TAG_REPLACEMENT;
+  }
+}
+
+<BANG> {
+  "--" { yybegin(COMMENT); }
+  ">" {
+    cumulativeDiff += inputSegment.length() + yylength();
+    addOffCorrectMap(outputCharCount, cumulativeDiff);
+    inputSegment.clear();
+    yybegin(YYINITIAL);
+  }
+  // From XML 1.0 <http://www.w3.org/TR/xml/>:
+  //
+  // [18] CDSect  ::= CDStart CData CDEnd
+  // [19] CDStart ::= '<![CDATA['
+  // [20] CData   ::= (Char* - (Char* ']]>' Char*))
+  // [21] CDEnd   ::= ']]>'
+  //
+  "[CDATA[" {
+    cumulativeDiff += inputSegment.length() + yylength();
+    addOffCorrectMap(outputCharCount, cumulativeDiff);
+    inputSegment.clear();
+    yybegin(CDATA);
+  }
+  [^] {
+    inputSegment.append(zzBuffer[zzStartRead]);
+  }
+}
+
+<CDATA> {
+  "]]>" {
+    cumulativeDiff += yylength();
+    addOffCorrectMap(outputCharCount, cumulativeDiff);
+    yybegin(YYINITIAL);
+  }
+  [^] { return zzBuffer[zzStartRead]; }
+}
+
+<COMMENT> {
+  "<!--#" { restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
+  "-->" {
+    cumulativeDiff += yychar - inputStart + yylength();
+    addOffCorrectMap(outputCharCount, cumulativeDiff);
+    inputSegment.clear();
+    yybegin(YYINITIAL);
+  }
+  [^] { }
+}
+
+<SERVER_SIDE_INCLUDE> {
+  "-->" { yybegin(restoreState); }
+  "'" {
+    previousRestoreState = restoreState;
+    restoreState = SERVER_SIDE_INCLUDE;
+    yybegin(SINGLE_QUOTED_STRING);
+  }
+  "\"" {
+    previousRestoreState = restoreState;
+    restoreState = SERVER_SIDE_INCLUDE;
+    yybegin(DOUBLE_QUOTED_STRING);
+  }
+  [^] { }
+}
+
+<SCRIPT_COMMENT> {
+  "<!--#" { restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
+  "'"     { restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING); }
+  "\""    { restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING); }
+  "-->"   { yybegin(SCRIPT); }
+  [^] { }
+}
+
+<STYLE_COMMENT> {
+  "<!--#" { restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE); }
+  "'"     { restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING); }
+  "\""    { restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING); }
+  "-->"   { yybegin(STYLE); }
+  [^] { }
+}
+
+<SINGLE_QUOTED_STRING> {
+  "\\" [^] { }
+  "'" { yybegin(restoreState); restoreState = previousRestoreState; }
+  [^] { }
+}
+
+<DOUBLE_QUOTED_STRING> {
+  "\\" [^] { }
+  "\"" { yybegin(restoreState); restoreState = previousRestoreState; }
+  [^] { }
+}
+
+<SCRIPT> {
+  "<!--" { yybegin(SCRIPT_COMMENT); }
+  "</" \s* [sS][cC][rR][iI][pP][tT] \s* ">" {
+    inputSegment.clear();
+    yybegin(YYINITIAL);
+    cumulativeDiff += yychar - inputStart;
+    int outputEnd = outputCharCount;
+    int returnValue;
+    if (escapeSCRIPT) {
+      inputSegment.write(zzBuffer, zzStartRead, yylength());
+      outputSegment = inputSegment;
+      returnValue = outputSegment.nextChar();
+    } else {
+      cumulativeDiff += yylength() - 1;
+      ++outputEnd;
+      returnValue = SCRIPT_REPLACEMENT;
+    }
+    addOffCorrectMap(outputEnd, cumulativeDiff);
+    return returnValue;
+  }
+  [^] { }
+}
+
+<STYLE> {
+  "<!--" { yybegin(STYLE_COMMENT); }
+  "</" \s* [sS][tT][yY][lL][eE] \s* ">" {
+    inputSegment.clear();
+    yybegin(YYINITIAL);
+    cumulativeDiff += yychar - inputStart;
+    int outputEnd = outputCharCount;
+    int returnValue;
+    if (escapeSTYLE) {
+      inputSegment.write(zzBuffer, zzStartRead, yylength());
+      outputSegment = inputSegment;
+      returnValue = outputSegment.nextChar();
+    } else {
+      cumulativeDiff += yylength() - 1;
+      ++outputEnd;
+      returnValue = STYLE_REPLACEMENT;
+    }
+    addOffCorrectMap(outputEnd, cumulativeDiff);
+    return returnValue;
+  }
+  [^] { }
+}
+
+<AMPERSAND,NUMERIC_CHARACTER,CHARACTER_REFERENCE_TAIL,LEFT_ANGLE_BRACKET_SLASH,END_TAG_TAIL_INCLUDE,END_TAG_TAIL_EXCLUDE,END_TAG_TAIL_SUBSTITUTE,LEFT_ANGLE_BRACKET,LEFT_ANGLE_BRACKET_SPACE,START_TAG_TAIL_INCLUDE,START_TAG_TAIL_EXCLUDE,START_TAG_TAIL_SUBSTITUTE,BANG> {
+  [^] {
+    yypushback(1);
+    outputSegment = inputSegment;
+    outputSegment.restart();
+    yybegin(YYINITIAL);
+    return outputSegment.nextChar();
+  }
+}
+
+[^] { return zzBuffer[zzStartRead]; }

Added: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py?rev=1235308&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py (added)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py Tue Jan 24 15:51:55 2012
@@ -0,0 +1,530 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+
+# A simple python script to generate an HTML entity map and a regex alternation
+# for inclusion in HTMLStripCharFilter.jflex.
+
+def main():
+  print get_apache_license()
+  codes = {}
+  regex = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')
+  for line in get_entity_text().split('\n'):
+    match = regex.match(line)
+    if match:
+      key = match.group(1)
+      if   key == 'quot': codes[key] = r'\"'
+      elif key == 'nbsp': codes[key] = ' ';
+      else              : codes[key] = r'\u%04X' % int(match.group(2))
+
+  keys = sorted(codes)
+
+  first_entry = True
+  output_line = 'CharacterEntities = ( '
+  for key in keys:
+    new_entry = ('"%s"' if first_entry else ' | "%s"') % key
+    first_entry = False
+    if len(output_line) + len(new_entry) >= 80:
+      print output_line
+      output_line = '                   '
+    output_line += new_entry
+    if key in ('quot','copy','gt','lt','reg','amp'):
+      new_entry = ' | "%s"' % key.upper()
+      if len(output_line) + len(new_entry) >= 80:
+        print output_line
+        output_line = '                   '
+      output_line += new_entry
+  print output_line, ')'
+
+  print '%{'
+  print '  private static final Set<String> upperCaseVariantsAccepted'
+  print '      = new HashSet<String>(Arrays.asList("quot","copy","gt","lt","reg","amp"));'
+  print '  private static final CharArrayMap<Character> entityValues'
+  print '      = new CharArrayMap<Character>(Version.LUCENE_36, %i, false);' % len(keys)
+  print '  static {'
+  print '    String[] entities = {'
+  output_line = '     '
+  for key in keys:
+    new_entry = ' "%s", "%s",' % (key, codes[key])
+    if len(output_line) + len(new_entry) >= 80:
+      print output_line
+      output_line = '     '
+    output_line += new_entry
+  print output_line[:-1]
+  print '    };'
+  print '    for (int i = 0 ; i < entities.length ; i += 2) {'
+  print '      Character value = entities[i + 1].charAt(0);'
+  print '      entityValues.put(entities[i], value);'
+  print '      if (upperCaseVariantsAccepted.contains(entities[i])) {'
+  print '        entityValues.put(entities[i].toUpperCase(), value);'
+  print '      }'
+  print '    }'
+  print "  }"
+  print "%}"
+
+def get_entity_text():
+# The text below is taken verbatim from
+# <http://www.w3.org/TR/REC-html40/sgml/entities.html>:
+  text = r"""
+F.1. XHTML Character Entities
+
+XHTML DTDs make available a standard collection of named character entities. Those entities are defined in this section.
+F.1.1. XHTML Latin 1 Character Entities
+
+You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-lat1.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent.
+
+<!-- ...................................................................... -->
+<!-- XML-compatible ISO Latin 1 Character Entity Set for XHTML ............ -->
+<!-- file: xhtml-lat1.ent
+
+     Typical invocation:
+
+       <!ENTITY % xhtml-lat1
+           PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
+                  "xhtml-lat1.ent" >
+       %xhtml-lat1;
+
+     This DTD module is identified by the PUBLIC and SYSTEM identifiers:
+
+       PUBLIC "-//W3C//ENTITIES Latin 1 for XHTML//EN"
+       SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-lat1.ent"
+
+     Revision:  $Id: xhtml-lat1.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
+
+     Portions (C) International Organization for Standardization 1986:
+     Permission to copy in any form is granted for use with conforming
+     SGML systems and applications as defined in ISO 8879, provided
+     this notice is included in all copies.
+-->
+
+<!ENTITY nbsp   "&#160;" ><!-- no-break space = non-breaking space, U+00A0 ISOnum -->
+<!ENTITY iexcl  "&#161;" ><!-- inverted exclamation mark, U+00A1 ISOnum -->
+<!ENTITY cent   "&#162;" ><!-- cent sign, U+00A2 ISOnum -->
+<!ENTITY pound  "&#163;" ><!-- pound sign, U+00A3 ISOnum -->
+<!ENTITY curren "&#164;" ><!-- currency sign, U+00A4 ISOnum -->
+<!ENTITY yen    "&#165;" ><!-- yen sign = yuan sign, U+00A5 ISOnum -->
+<!ENTITY brvbar "&#166;" ><!-- broken bar = broken vertical bar, U+00A6 ISOnum -->
+<!ENTITY sect   "&#167;" ><!-- section sign, U+00A7 ISOnum -->
+<!ENTITY uml    "&#168;" ><!-- diaeresis = spacing diaeresis, U+00A8 ISOdia -->
+<!ENTITY copy   "&#169;" ><!-- copyright sign, U+00A9 ISOnum -->
+<!ENTITY ordf   "&#170;" ><!-- feminine ordinal indicator, U+00AA ISOnum -->
+<!ENTITY laquo  "&#171;" ><!-- left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum -->
+<!ENTITY not    "&#172;" ><!-- not sign, U+00AC ISOnum -->
+<!ENTITY shy    "&#173;" ><!-- soft hyphen = discretionary hyphen, U+00AD ISOnum -->
+<!ENTITY reg    "&#174;" ><!-- registered sign = registered trade mark sign, U+00AE ISOnum -->
+<!ENTITY macr   "&#175;" ><!-- macron = spacing macron = overline = APL overbar, U+00AF ISOdia -->
+<!ENTITY deg    "&#176;" ><!-- degree sign, U+00B0 ISOnum -->
+<!ENTITY plusmn "&#177;" ><!-- plus-minus sign = plus-or-minus sign, U+00B1 ISOnum -->
+<!ENTITY sup2   "&#178;" ><!-- superscript two = superscript digit two = squared, U+00B2 ISOnum -->
+<!ENTITY sup3   "&#179;" ><!-- superscript three = superscript digit three = cubed, U+00B3 ISOnum -->
+<!ENTITY acute  "&#180;" ><!-- acute accent = spacing acute, U+00B4 ISOdia -->
+<!ENTITY micro  "&#181;" ><!-- micro sign, U+00B5 ISOnum -->
+<!ENTITY para   "&#182;" ><!-- pilcrow sign = paragraph sign, U+00B6 ISOnum -->
+<!ENTITY middot "&#183;" ><!-- middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum -->
+<!ENTITY cedil  "&#184;" ><!-- cedilla = spacing cedilla, U+00B8 ISOdia -->
+<!ENTITY sup1   "&#185;" ><!-- superscript one = superscript digit one, U+00B9 ISOnum -->
+<!ENTITY ordm   "&#186;" ><!-- masculine ordinal indicator, U+00BA ISOnum -->
+<!ENTITY raquo  "&#187;" ><!-- right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum -->
+<!ENTITY frac14 "&#188;" ><!-- vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum -->
+<!ENTITY frac12 "&#189;" ><!-- vulgar fraction one half = fraction one half, U+00BD ISOnum -->
+<!ENTITY frac34 "&#190;" ><!-- vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum -->
+<!ENTITY iquest "&#191;" ><!-- inverted question mark = turned question mark, U+00BF ISOnum -->
+<!ENTITY Agrave "&#192;" ><!-- latin capital A with grave = latin capital A grave, U+00C0 ISOlat1 -->
+<!ENTITY Aacute "&#193;" ><!-- latin capital A with acute, U+00C1 ISOlat1 -->
+<!ENTITY Acirc  "&#194;" ><!-- latin capital A with circumflex, U+00C2 ISOlat1 -->
+<!ENTITY Atilde "&#195;" ><!-- latin capital A with tilde, U+00C3 ISOlat1 -->
+<!ENTITY Auml   "&#196;" ><!-- latin capital A with diaeresis, U+00C4 ISOlat1 -->
+<!ENTITY Aring  "&#197;" ><!-- latin capital A with ring above = latin capital A ring, U+00C5 ISOlat1 -->
+<!ENTITY AElig  "&#198;" ><!-- latin capital AE = latin capital ligature AE, U+00C6 ISOlat1 -->
+<!ENTITY Ccedil "&#199;" ><!-- latin capital C with cedilla, U+00C7 ISOlat1 -->
+<!ENTITY Egrave "&#200;" ><!-- latin capital E with grave, U+00C8 ISOlat1 -->
+<!ENTITY Eacute "&#201;" ><!-- latin capital E with acute, U+00C9 ISOlat1 -->
+<!ENTITY Ecirc  "&#202;" ><!-- latin capital E with circumflex, U+00CA ISOlat1 -->
+<!ENTITY Euml   "&#203;" ><!-- latin capital E with diaeresis, U+00CB ISOlat1 -->
+<!ENTITY Igrave "&#204;" ><!-- latin capital I with grave, U+00CC ISOlat1 -->
+<!ENTITY Iacute "&#205;" ><!-- latin capital I with acute, U+00CD ISOlat1 -->
+<!ENTITY Icirc  "&#206;" ><!-- latin capital I with circumflex, U+00CE ISOlat1 -->
+<!ENTITY Iuml   "&#207;" ><!-- latin capital I with diaeresis, U+00CF ISOlat1 -->
+<!ENTITY ETH    "&#208;" ><!-- latin capital ETH, U+00D0 ISOlat1 -->
+<!ENTITY Ntilde "&#209;" ><!-- latin capital N with tilde, U+00D1 ISOlat1 -->
+<!ENTITY Ograve "&#210;" ><!-- latin capital O with grave, U+00D2 ISOlat1 -->
+<!ENTITY Oacute "&#211;" ><!-- latin capital O with acute, U+00D3 ISOlat1 -->
+<!ENTITY Ocirc  "&#212;" ><!-- latin capital O with circumflex, U+00D4 ISOlat1 -->
+<!ENTITY Otilde "&#213;" ><!-- latin capital O with tilde, U+00D5 ISOlat1 -->
+<!ENTITY Ouml   "&#214;" ><!-- latin capital O with diaeresis, U+00D6 ISOlat1 -->
+<!ENTITY times  "&#215;" ><!-- multiplication sign, U+00D7 ISOnum -->
+<!ENTITY Oslash "&#216;" ><!-- latin capital O with stroke = latin capital O slash, U+00D8 ISOlat1 -->
+<!ENTITY Ugrave "&#217;" ><!-- latin capital U with grave, U+00D9 ISOlat1 -->
+<!ENTITY Uacute "&#218;" ><!-- latin capital U with acute, U+00DA ISOlat1 -->
+<!ENTITY Ucirc  "&#219;" ><!-- latin capital U with circumflex, U+00DB ISOlat1 -->
+<!ENTITY Uuml   "&#220;" ><!-- latin capital U with diaeresis, U+00DC ISOlat1 -->
+<!ENTITY Yacute "&#221;" ><!-- latin capital Y with acute, U+00DD ISOlat1 -->
+<!ENTITY THORN  "&#222;" ><!-- latin capital THORN, U+00DE ISOlat1 -->
+<!ENTITY szlig  "&#223;" ><!-- latin small sharp s = ess-zed, U+00DF ISOlat1 -->
+<!ENTITY agrave "&#224;" ><!-- latin small a with grave = latin small a grave, U+00E0 ISOlat1 -->
+<!ENTITY aacute "&#225;" ><!-- latin small a with acute, U+00E1 ISOlat1 -->
+<!ENTITY acirc  "&#226;" ><!-- latin small a with circumflex, U+00E2 ISOlat1 -->
+<!ENTITY atilde "&#227;" ><!-- latin small a with tilde, U+00E3 ISOlat1 -->
+<!ENTITY auml   "&#228;" ><!-- latin small a with diaeresis, U+00E4 ISOlat1 -->
+<!ENTITY aring  "&#229;" ><!-- latin small a with ring above = latin small a ring, U+00E5 ISOlat1 -->
+<!ENTITY aelig  "&#230;" ><!-- latin small ae = latin small ligature ae, U+00E6 ISOlat1 -->
+<!ENTITY ccedil "&#231;" ><!-- latin small c with cedilla, U+00E7 ISOlat1 -->
+<!ENTITY egrave "&#232;" ><!-- latin small e with grave, U+00E8 ISOlat1 -->
+<!ENTITY eacute "&#233;" ><!-- latin small e with acute, U+00E9 ISOlat1 -->
+<!ENTITY ecirc  "&#234;" ><!-- latin small e with circumflex, U+00EA ISOlat1 -->
+<!ENTITY euml   "&#235;" ><!-- latin small e with diaeresis, U+00EB ISOlat1 -->
+<!ENTITY igrave "&#236;" ><!-- latin small i with grave, U+00EC ISOlat1 -->
+<!ENTITY iacute "&#237;" ><!-- latin small i with acute, U+00ED ISOlat1 -->
+<!ENTITY icirc  "&#238;" ><!-- latin small i with circumflex, U+00EE ISOlat1 -->
+<!ENTITY iuml   "&#239;" ><!-- latin small i with diaeresis, U+00EF ISOlat1 -->
+<!ENTITY eth    "&#240;" ><!-- latin small eth, U+00F0 ISOlat1 -->
+<!ENTITY ntilde "&#241;" ><!-- latin small n with tilde, U+00F1 ISOlat1 -->
+<!ENTITY ograve "&#242;" ><!-- latin small o with grave, U+00F2 ISOlat1 -->
+<!ENTITY oacute "&#243;" ><!-- latin small o with acute, U+00F3 ISOlat1 -->
+<!ENTITY ocirc  "&#244;" ><!-- latin small o with circumflex, U+00F4 ISOlat1 -->
+<!ENTITY otilde "&#245;" ><!-- latin small o with tilde, U+00F5 ISOlat1 -->
+<!ENTITY ouml   "&#246;" ><!-- latin small o with diaeresis, U+00F6 ISOlat1 -->
+<!ENTITY divide "&#247;" ><!-- division sign, U+00F7 ISOnum -->
+<!ENTITY oslash "&#248;" ><!-- latin small o with stroke, = latin small o slash, U+00F8 ISOlat1 -->
+<!ENTITY ugrave "&#249;" ><!-- latin small u with grave, U+00F9 ISOlat1 -->
+<!ENTITY uacute "&#250;" ><!-- latin small u with acute, U+00FA ISOlat1 -->
+<!ENTITY ucirc  "&#251;" ><!-- latin small u with circumflex, U+00FB ISOlat1 -->
+<!ENTITY uuml   "&#252;" ><!-- latin small u with diaeresis, U+00FC ISOlat1 -->
+<!ENTITY yacute "&#253;" ><!-- latin small y with acute, U+00FD ISOlat1 -->
+<!ENTITY thorn  "&#254;" ><!-- latin small thorn with, U+00FE ISOlat1 -->
+<!ENTITY yuml   "&#255;" ><!-- latin small y with diaeresis, U+00FF ISOlat1 -->
+<!-- end of xhtml-lat1.ent -->
+
+F.1.2. XHTML Special Characters
+
+You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-special.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-special.ent.
+
+<!-- ...................................................................... -->
+<!-- XML-compatible ISO Special Character Entity Set for XHTML ............ -->
+<!-- file: xhtml-special.ent
+
+     Typical invocation:
+
+       <!ENTITY % xhtml-special
+           PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
+                  "xhtml-special.ent" >
+       %xhtml-special;
+
+     This DTD module is identified by the PUBLIC and SYSTEM identifiers:
+
+       PUBLIC "-//W3C//ENTITIES Special for XHTML//EN"
+       SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-special.ent"
+
+     Revision:  $Id: xhtml-special.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
+
+     Portions (C) International Organization for Standardization 1986:
+     Permission to copy in any form is granted for use with conforming
+     SGML systems and applications as defined in ISO 8879, provided
+     this notice is included in all copies.
+
+     Revisions:
+2000-10-28: added &apos; and altered XML Predefined Entities for compatibility
+-->
+
+<!-- Relevant ISO entity set is given unless names are newly introduced.
+     New names (i.e., not in ISO 8879 [SGML] list) do not clash with
+     any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
+     numbers are given for each character, in hex. Entity values are
+     decimal conversions of the ISO 10646 values and refer to the
+     document character set. Names are Unicode [UNICODE] names.
+-->
+
+<!-- C0 Controls and Basic Latin -->
+<!ENTITY lt      "&#38;#60;" ><!-- less-than sign, U+003C ISOnum -->
+<!ENTITY gt      "&#62;" ><!-- greater-than sign, U+003E ISOnum -->
+<!ENTITY amp     "&#38;#38;" ><!-- ampersand, U+0026 ISOnum -->
+<!ENTITY apos    "&#39;" ><!-- The Apostrophe (Apostrophe Quote, APL Quote), U+0027 ISOnum -->
+<!ENTITY quot    "&#34;" ><!-- quotation mark (Quote Double), U+0022 ISOnum -->
+
+<!-- Latin Extended-A -->
+<!ENTITY OElig   "&#338;" ><!-- latin capital ligature OE, U+0152 ISOlat2 -->
+<!ENTITY oelig   "&#339;" ><!-- latin small ligature oe, U+0153 ISOlat2 -->
+
+<!-- ligature is a misnomer, this is a separate character in some languages -->
+<!ENTITY Scaron  "&#352;" ><!-- latin capital letter S with caron, U+0160 ISOlat2 -->
+<!ENTITY scaron  "&#353;" ><!-- latin small letter s with caron, U+0161 ISOlat2 -->
+<!ENTITY Yuml    "&#376;" ><!-- latin capital letter Y with diaeresis, U+0178 ISOlat2 -->
+
+<!-- Spacing Modifier Letters -->
+<!ENTITY circ    "&#710;" ><!-- modifier letter circumflex accent, U+02C6 ISOpub -->
+<!ENTITY tilde   "&#732;" ><!-- small tilde, U+02DC ISOdia -->
+
+<!-- General Punctuation -->
+<!ENTITY ensp    "&#8194;" ><!-- en space, U+2002 ISOpub -->
+<!ENTITY emsp    "&#8195;" ><!-- em space, U+2003 ISOpub -->
+<!ENTITY thinsp  "&#8201;" ><!-- thin space, U+2009 ISOpub -->
+<!ENTITY zwnj    "&#8204;" ><!-- zero width non-joiner, U+200C NEW RFC 2070 -->
+<!ENTITY zwj     "&#8205;" ><!-- zero width joiner, U+200D NEW RFC 2070 -->
+<!ENTITY lrm     "&#8206;" ><!-- left-to-right mark, U+200E NEW RFC 2070 -->
+<!ENTITY rlm     "&#8207;" ><!-- right-to-left mark, U+200F NEW RFC 2070 -->
+<!ENTITY ndash   "&#8211;" ><!-- en dash, U+2013 ISOpub -->
+<!ENTITY mdash   "&#8212;" ><!-- em dash, U+2014 ISOpub -->
+<!ENTITY lsquo   "&#8216;" ><!-- left single quotation mark, U+2018 ISOnum -->
+<!ENTITY rsquo   "&#8217;" ><!-- right single quotation mark, U+2019 ISOnum -->
+<!ENTITY sbquo   "&#8218;" ><!-- single low-9 quotation mark, U+201A NEW -->
+<!ENTITY ldquo   "&#8220;" ><!-- left double quotation mark, U+201C ISOnum -->
+<!ENTITY rdquo   "&#8221;" ><!-- right double quotation mark, U+201D ISOnum -->
+<!ENTITY bdquo   "&#8222;" ><!-- double low-9 quotation mark, U+201E NEW -->
+<!ENTITY dagger  "&#8224;" ><!-- dagger, U+2020 ISOpub -->
+<!ENTITY Dagger  "&#8225;" ><!-- double dagger, U+2021 ISOpub -->
+<!ENTITY permil  "&#8240;" ><!-- per mille sign, U+2030 ISOtech -->
+
+<!-- lsaquo is proposed but not yet ISO standardized -->
+<!ENTITY lsaquo  "&#8249;" ><!-- single left-pointing angle quotation mark, U+2039 ISO proposed -->
+<!-- rsaquo is proposed but not yet ISO standardized -->
+<!ENTITY rsaquo  "&#8250;" ><!-- single right-pointing angle quotation mark, U+203A ISO proposed -->
+<!ENTITY euro    "&#8364;" ><!-- euro sign, U+20AC NEW -->
+
+<!-- end of xhtml-special.ent -->
+
+F.1.3. XHTML Mathematical, Greek, and Symbolic Characters
+
+You can download this version of this file from http://www.w3.org/TR/2010/REC-xhtml-modularization/DTD/xhtml-symbol.ent. The latest version is available at http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent.
+
+<!-- ...................................................................... -->
+<!-- ISO Math, Greek and Symbolic Character Entity Set for XHTML .......... -->
+<!-- file: xhtml-symbol.ent
+
+     Typical invocation:
+
+       <!ENTITY % xhtml-symbol
+           PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
+                  "xhtml-symbol.ent" >
+       %xhtml-symbol;
+
+     This DTD module is identified by the PUBLIC and SYSTEM identifiers:
+
+       PUBLIC "-//W3C//ENTITIES Symbols for XHTML//EN"
+       SYSTEM "http://www.w3.org/MarkUp/DTD/xhtml-symbol.ent"
+
+     Revision:  $Id: xhtml-symbol.ent,v 4.1 2001/04/10 09:34:14 altheim Exp $ SMI
+
+     Portions (C) International Organization for Standardization 1986:
+     Permission to copy in any form is granted for use with conforming
+     SGML systems and applications as defined in ISO 8879, provided
+     this notice is included in all copies.
+-->
+
+<!-- Relevant ISO entity set is given unless names are newly introduced.
+     New names (i.e., not in ISO 8879 [SGML] list) do not clash with
+     any existing ISO 8879 entity names. ISO 10646 [ISO10646] character
+     numbers are given for each character, in hex. Entity values are
+     decimal conversions of the ISO 10646 values and refer to the
+     document character set. Names are Unicode [UNICODE] names.
+-->
+
+<!-- Latin Extended-B -->
+<!ENTITY fnof     "&#402;" ><!-- latin small f with hook = function
+                              = florin, U+0192 ISOtech -->
+
+<!-- Greek -->
+<!ENTITY Alpha    "&#913;" ><!-- greek capital letter alpha, U+0391 -->
+<!ENTITY Beta     "&#914;" ><!-- greek capital letter beta, U+0392 -->
+<!ENTITY Gamma    "&#915;" ><!-- greek capital letter gamma, U+0393 ISOgrk3 -->
+<!ENTITY Delta    "&#916;" ><!-- greek capital letter delta, U+0394 ISOgrk3 -->
+<!ENTITY Epsilon  "&#917;" ><!-- greek capital letter epsilon, U+0395 -->
+<!ENTITY Zeta     "&#918;" ><!-- greek capital letter zeta, U+0396 -->
+<!ENTITY Eta      "&#919;" ><!-- greek capital letter eta, U+0397 -->
+<!ENTITY Theta    "&#920;" ><!-- greek capital letter theta, U+0398 ISOgrk3 -->
+<!ENTITY Iota     "&#921;" ><!-- greek capital letter iota, U+0399 -->
+<!ENTITY Kappa    "&#922;" ><!-- greek capital letter kappa, U+039A -->
+<!ENTITY Lambda   "&#923;" ><!-- greek capital letter lambda, U+039B ISOgrk3 -->
+<!ENTITY Mu       "&#924;" ><!-- greek capital letter mu, U+039C -->
+<!ENTITY Nu       "&#925;" ><!-- greek capital letter nu, U+039D -->
+<!ENTITY Xi       "&#926;" ><!-- greek capital letter xi, U+039E ISOgrk3 -->
+<!ENTITY Omicron  "&#927;" ><!-- greek capital letter omicron, U+039F -->
+<!ENTITY Pi       "&#928;" ><!-- greek capital letter pi, U+03A0 ISOgrk3 -->
+<!ENTITY Rho      "&#929;" ><!-- greek capital letter rho, U+03A1 -->
+<!-- there is no Sigmaf, and no U+03A2 character either -->
+<!ENTITY Sigma    "&#931;" ><!-- greek capital letter sigma, U+03A3 ISOgrk3 -->
+<!ENTITY Tau      "&#932;" ><!-- greek capital letter tau, U+03A4 -->
+<!ENTITY Upsilon  "&#933;" ><!-- greek capital letter upsilon,
+                              U+03A5 ISOgrk3 -->
+<!ENTITY Phi      "&#934;" ><!-- greek capital letter phi, U+03A6 ISOgrk3 -->
+<!ENTITY Chi      "&#935;" ><!-- greek capital letter chi, U+03A7 -->
+<!ENTITY Psi      "&#936;" ><!-- greek capital letter psi, U+03A8 ISOgrk3 -->
+<!ENTITY Omega    "&#937;" ><!-- greek capital letter omega, U+03A9 ISOgrk3 -->
+<!ENTITY alpha    "&#945;" ><!-- greek small letter alpha, U+03B1 ISOgrk3 -->
+<!ENTITY beta     "&#946;" ><!-- greek small letter beta, U+03B2 ISOgrk3 -->
+<!ENTITY gamma    "&#947;" ><!-- greek small letter gamma, U+03B3 ISOgrk3 -->
+<!ENTITY delta    "&#948;" ><!-- greek small letter delta, U+03B4 ISOgrk3 -->
+<!ENTITY epsilon  "&#949;" ><!-- greek small letter epsilon, U+03B5 ISOgrk3 -->
+<!ENTITY zeta     "&#950;" ><!-- greek small letter zeta, U+03B6 ISOgrk3 -->
+<!ENTITY eta      "&#951;" ><!-- greek small letter eta, U+03B7 ISOgrk3 -->
+<!ENTITY theta    "&#952;" ><!-- greek small letter theta, U+03B8 ISOgrk3 -->
+<!ENTITY iota     "&#953;" ><!-- greek small letter iota, U+03B9 ISOgrk3 -->
+<!ENTITY kappa    "&#954;" ><!-- greek small letter kappa, U+03BA ISOgrk3 -->
+<!ENTITY lambda   "&#955;" ><!-- greek small letter lambda, U+03BB ISOgrk3 -->
+<!ENTITY mu       "&#956;" ><!-- greek small letter mu, U+03BC ISOgrk3 -->
+<!ENTITY nu       "&#957;" ><!-- greek small letter nu, U+03BD ISOgrk3 -->
+<!ENTITY xi       "&#958;" ><!-- greek small letter xi, U+03BE ISOgrk3 -->
+<!ENTITY omicron  "&#959;" ><!-- greek small letter omicron, U+03BF NEW -->
+<!ENTITY pi       "&#960;" ><!-- greek small letter pi, U+03C0 ISOgrk3 -->
+<!ENTITY rho      "&#961;" ><!-- greek small letter rho, U+03C1 ISOgrk3 -->
+<!ENTITY sigmaf   "&#962;" ><!-- greek small letter final sigma, U+03C2 ISOgrk3 -->
+<!ENTITY sigma    "&#963;" ><!-- greek small letter sigma, U+03C3 ISOgrk3 -->
+<!ENTITY tau      "&#964;" ><!-- greek small letter tau, U+03C4 ISOgrk3 -->
+<!ENTITY upsilon  "&#965;" ><!-- greek small letter upsilon, U+03C5 ISOgrk3 -->
+<!ENTITY phi      "&#966;" ><!-- greek small letter phi, U+03C6 ISOgrk3 -->
+<!ENTITY chi      "&#967;" ><!-- greek small letter chi, U+03C7 ISOgrk3 -->
+<!ENTITY psi      "&#968;" ><!-- greek small letter psi, U+03C8 ISOgrk3 -->
+<!ENTITY omega    "&#969;" ><!-- greek small letter omega, U+03C9 ISOgrk3 -->
+<!ENTITY thetasym "&#977;" ><!-- greek small letter theta symbol, U+03D1 NEW -->
+<!ENTITY upsih    "&#978;" ><!-- greek upsilon with hook symbol, U+03D2 NEW -->
+<!ENTITY piv      "&#982;" ><!-- greek pi symbol, U+03D6 ISOgrk3 -->
+
+<!-- General Punctuation -->
+<!ENTITY bull     "&#8226;" ><!-- bullet = black small circle, U+2022 ISOpub  -->
+<!-- bullet is NOT the same as bullet operator, U+2219 -->
+<!ENTITY hellip   "&#8230;" ><!-- horizontal ellipsis = three dot leader, U+2026 ISOpub  -->
+<!ENTITY prime    "&#8242;" ><!-- prime = minutes = feet, U+2032 ISOtech -->
+<!ENTITY Prime    "&#8243;" ><!-- double prime = seconds = inches, U+2033 ISOtech -->
+<!ENTITY oline    "&#8254;" ><!-- overline = spacing overscore, U+203E NEW -->
+<!ENTITY frasl    "&#8260;" ><!-- fraction slash, U+2044 NEW -->
+
+<!-- Letterlike Symbols -->
+<!ENTITY weierp   "&#8472;" ><!-- script capital P = power set = Weierstrass p, U+2118 ISOamso -->
+<!ENTITY image    "&#8465;" ><!-- blackletter capital I = imaginary part, U+2111 ISOamso -->
+<!ENTITY real     "&#8476;" ><!-- blackletter capital R = real part symbol, U+211C ISOamso -->
+<!ENTITY trade    "&#8482;" ><!-- trade mark sign, U+2122 ISOnum -->
+<!ENTITY alefsym  "&#8501;" ><!-- alef symbol = first transfinite cardinal, U+2135 NEW -->
+<!-- alef symbol is NOT the same as hebrew letter alef, U+05D0 although
+     the same glyph could be used to depict both characters -->
+
+<!-- Arrows -->
+<!ENTITY larr     "&#8592;" ><!-- leftwards arrow, U+2190 ISOnum -->
+<!ENTITY uarr     "&#8593;" ><!-- upwards arrow, U+2191 ISOnum-->
+<!ENTITY rarr     "&#8594;" ><!-- rightwards arrow, U+2192 ISOnum -->
+<!ENTITY darr     "&#8595;" ><!-- downwards arrow, U+2193 ISOnum -->
+<!ENTITY harr     "&#8596;" ><!-- left right arrow, U+2194 ISOamsa -->
+<!ENTITY crarr    "&#8629;" ><!-- downwards arrow with corner leftwards
+                               = carriage return, U+21B5 NEW -->
+<!ENTITY lArr     "&#8656;" ><!-- leftwards double arrow, U+21D0 ISOtech -->
+<!-- Unicode does not say that lArr is the same as the 'is implied by' arrow
+    but also does not have any other character for that function. So ? lArr can
+    be used for 'is implied by' as ISOtech suggests -->
+<!ENTITY uArr     "&#8657;" ><!-- upwards double arrow, U+21D1 ISOamsa -->
+<!ENTITY rArr     "&#8658;" ><!-- rightwards double arrow, U+21D2 ISOtech -->
+<!-- Unicode does not say this is the 'implies' character but does not have
+     another character with this function so ?
+     rArr can be used for 'implies' as ISOtech suggests -->
+<!ENTITY dArr     "&#8659;" ><!-- downwards double arrow, U+21D3 ISOamsa -->
+<!ENTITY hArr     "&#8660;" ><!-- left right double arrow, U+21D4 ISOamsa -->
+
+<!-- Mathematical Operators -->
+<!ENTITY forall   "&#8704;" ><!-- for all, U+2200 ISOtech -->
+<!ENTITY part     "&#8706;" ><!-- partial differential, U+2202 ISOtech  -->
+<!ENTITY exist    "&#8707;" ><!-- there exists, U+2203 ISOtech -->
+<!ENTITY empty    "&#8709;" ><!-- empty set = null set, U+2205 ISOamso -->
+<!ENTITY nabla    "&#8711;" ><!-- nabla = backward difference, U+2207 ISOtech -->
+<!ENTITY isin     "&#8712;" ><!-- element of, U+2208 ISOtech -->
+<!ENTITY notin    "&#8713;" ><!-- not an element of, U+2209 ISOtech -->
+<!ENTITY ni       "&#8715;" ><!-- contains as member, U+220B ISOtech -->
+<!-- should there be a more memorable name than 'ni'? -->
+<!ENTITY prod     "&#8719;" ><!-- n-ary product = product sign, U+220F ISOamsb -->
+<!-- prod is NOT the same character as U+03A0 'greek capital letter pi' though
+     the same glyph might be used for both -->
+<!ENTITY sum      "&#8721;" ><!-- n-ary sumation, U+2211 ISOamsb -->
+<!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
+     though the same glyph might be used for both -->
+<!ENTITY minus    "&#8722;" ><!-- minus sign, U+2212 ISOtech -->
+<!ENTITY lowast   "&#8727;" ><!-- asterisk operator, U+2217 ISOtech -->
+<!ENTITY radic    "&#8730;" ><!-- square root = radical sign, U+221A ISOtech -->
+<!ENTITY prop     "&#8733;" ><!-- proportional to, U+221D ISOtech -->
+<!ENTITY infin    "&#8734;" ><!-- infinity, U+221E ISOtech -->
+<!ENTITY ang      "&#8736;" ><!-- angle, U+2220 ISOamso -->
+<!ENTITY and      "&#8743;" ><!-- logical and = wedge, U+2227 ISOtech -->
+<!ENTITY or       "&#8744;" ><!-- logical or = vee, U+2228 ISOtech -->
+<!ENTITY cap      "&#8745;" ><!-- intersection = cap, U+2229 ISOtech -->
+<!ENTITY cup      "&#8746;" ><!-- union = cup, U+222A ISOtech -->
+<!ENTITY int      "&#8747;" ><!-- integral, U+222B ISOtech -->
+<!ENTITY there4   "&#8756;" ><!-- therefore, U+2234 ISOtech -->
+<!ENTITY sim      "&#8764;" ><!-- tilde operator = varies with = similar to, U+223C ISOtech -->
+<!-- tilde operator is NOT the same character as the tilde, U+007E,
+     although the same glyph might be used to represent both  -->
+<!ENTITY cong     "&#8773;" ><!-- approximately equal to, U+2245 ISOtech -->
+<!ENTITY asymp    "&#8776;" ><!-- almost equal to = asymptotic to, U+2248 ISOamsr -->
+<!ENTITY ne       "&#8800;" ><!-- not equal to, U+2260 ISOtech -->
+<!ENTITY equiv    "&#8801;" ><!-- identical to, U+2261 ISOtech -->
+<!ENTITY le       "&#8804;" ><!-- less-than or equal to, U+2264 ISOtech -->
+<!ENTITY ge       "&#8805;" ><!-- greater-than or equal to, U+2265 ISOtech -->
+<!ENTITY sub      "&#8834;" ><!-- subset of, U+2282 ISOtech -->
+<!ENTITY sup      "&#8835;" ><!-- superset of, U+2283 ISOtech -->
+<!-- note that nsup, 'not a superset of, U+2283' is not covered by the Symbol
+     font encoding and is not included. Should it be, for symmetry?
+     It is in ISOamsn  -->
+<!ENTITY nsub     "&#8836;" ><!-- not a subset of, U+2284 ISOamsn -->
+<!ENTITY sube     "&#8838;" ><!-- subset of or equal to, U+2286 ISOtech -->
+<!ENTITY supe     "&#8839;" ><!-- superset of or equal to, U+2287 ISOtech -->
+<!ENTITY oplus    "&#8853;" ><!-- circled plus = direct sum, U+2295 ISOamsb -->
+<!ENTITY otimes   "&#8855;" ><!-- circled times = vector product, U+2297 ISOamsb -->
+<!ENTITY perp     "&#8869;" ><!-- up tack = orthogonal to = perpendicular, U+22A5 ISOtech -->
+<!ENTITY sdot     "&#8901;" ><!-- dot operator, U+22C5 ISOamsb -->
+<!-- dot operator is NOT the same character as U+00B7 middle dot -->
+
+<!-- Miscellaneous Technical -->
+<!ENTITY lceil    "&#8968;" ><!-- left ceiling = apl upstile, U+2308 ISOamsc  -->
+<!ENTITY rceil    "&#8969;" ><!-- right ceiling, U+2309 ISOamsc  -->
+<!ENTITY lfloor   "&#8970;" ><!-- left floor = apl downstile, U+230A ISOamsc  -->
+<!ENTITY rfloor   "&#8971;" ><!-- right floor, U+230B ISOamsc  -->
+<!ENTITY lang     "&#9001;" ><!-- left-pointing angle bracket = bra, U+2329 ISOtech -->
+<!-- lang is NOT the same character as U+003C 'less than'
+     or U+2039 'single left-pointing angle quotation mark' -->
+<!ENTITY rang     "&#9002;" ><!-- right-pointing angle bracket = ket, U+232A ISOtech -->
+<!-- rang is NOT the same character as U+003E 'greater than'
+     or U+203A 'single right-pointing angle quotation mark' -->
+
+<!-- Geometric Shapes -->
+<!ENTITY loz      "&#9674;" ><!-- lozenge, U+25CA ISOpub -->
+
+<!-- Miscellaneous Symbols -->
+<!ENTITY spades   "&#9824;" ><!-- black spade suit, U+2660 ISOpub -->
+<!-- black here seems to mean filled as opposed to hollow -->
+<!ENTITY clubs    "&#9827;" ><!-- black club suit = shamrock, U+2663 ISOpub -->
+<!ENTITY hearts   "&#9829;" ><!-- black heart suit = valentine, U+2665 ISOpub -->
+<!ENTITY diams    "&#9830;" ><!-- black diamond suit, U+2666 ISOpub -->
+
+<!-- end of xhtml-symbol.ent -->
+"""
+  return text
+
+def get_apache_license():
+  license = r"""/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+"""
+  return license
+
+main()