You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2012/01/22 13:25:24 UTC

svn commit: r1234501 [1/3] - in /lucene/dev/branches/lucene2858: ./ dev-tools/idea/lucene/contrib/ lucene/ lucene/contrib/ lucene/contrib/sandbox/src/test/org/apache/lucene/sandbox/queries/regex/ lucene/src/java/org/apache/lucene/codecs/lucene3x/ lucen...

Author: uschindler
Date: Sun Jan 22 12:25:22 2012
New Revision: 1234501

URL: http://svn.apache.org/viewvc?rev=1234501&view=rev
Log:
LUCENE-2858: Reverse merged revision(s) 1-0 from lucene/dev/trunk

Added:
    lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex
      - copied unchanged from r1234500, lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex
    lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
      - copied unchanged from r1234500, lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
    lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
      - copied unchanged from r1234500, lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
    lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py
      - copied unchanged from r1234500, lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py
    lucene/dev/branches/lucene2858/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/MS-Word 14 generated.htm
      - copied unchanged from r1234500, lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/MS-Word 14 generated.htm
    lucene/dev/branches/lucene2858/modules/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateHTMLStripCharFilterSupplementaryMacros.java
      - copied unchanged from r1234500, lucene/dev/trunk/modules/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateHTMLStripCharFilterSupplementaryMacros.java
    lucene/dev/branches/lucene2858/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilter.java
      - copied unchanged from r1234500, lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilter.java
    lucene/dev/branches/lucene2858/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilterFactory.java
      - copied unchanged from r1234500, lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilterFactory.java
    lucene/dev/branches/lucene2858/solr/core/src/test/org/apache/solr/analysis/LegacyHTMLStripCharFilterTest.java
      - copied unchanged from r1234500, lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/LegacyHTMLStripCharFilterTest.java
    lucene/dev/branches/lucene2858/solr/core/src/test/org/apache/solr/analysis/TestHTMLStripCharFilterFactory.java
      - copied unchanged from r1234500, lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestHTMLStripCharFilterFactory.java
    lucene/dev/branches/lucene2858/solr/core/src/test/org/apache/solr/analysis/htmlStripReaderTest.html
      - copied unchanged from r1234500, lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/htmlStripReaderTest.html
Modified:
    lucene/dev/branches/lucene2858/   (props changed)
    lucene/dev/branches/lucene2858/dev-tools/idea/lucene/contrib/   (props changed)
    lucene/dev/branches/lucene2858/lucene/   (props changed)
    lucene/dev/branches/lucene2858/lucene/CHANGES.txt
    lucene/dev/branches/lucene2858/lucene/contrib/CHANGES.txt   (props changed)
    lucene/dev/branches/lucene2858/lucene/contrib/sandbox/src/test/org/apache/lucene/sandbox/queries/regex/TestSpanRegexQuery.java   (props changed)
    lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene3x/TermInfosReaderIndex.java   (props changed)
    lucene/dev/branches/lucene2858/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java
    lucene/dev/branches/lucene2858/modules/analysis/common/build.xml
    lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java
    lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
    lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/package.html
    lucene/dev/branches/lucene2858/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
    lucene/dev/branches/lucene2858/modules/analysis/icu/build.xml
    lucene/dev/branches/lucene2858/modules/analysis/kuromoji/   (props changed)
    lucene/dev/branches/lucene2858/modules/benchmark/   (props changed)
    lucene/dev/branches/lucene2858/modules/facet/   (props changed)
    lucene/dev/branches/lucene2858/modules/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java
    lucene/dev/branches/lucene2858/modules/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyReader.java
    lucene/dev/branches/lucene2858/modules/queryparser/src/test/org/apache/lucene/queryparser/xml/builders/TestNumericRangeFilterBuilder.java   (props changed)
    lucene/dev/branches/lucene2858/solr/   (props changed)
    lucene/dev/branches/lucene2858/solr/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/lucene2858/solr/LICENSE.txt   (props changed)
    lucene/dev/branches/lucene2858/solr/NOTICE.txt   (props changed)
    lucene/dev/branches/lucene2858/solr/README.txt   (props changed)
    lucene/dev/branches/lucene2858/solr/build.xml   (props changed)
    lucene/dev/branches/lucene2858/solr/client/   (props changed)
    lucene/dev/branches/lucene2858/solr/common-build.xml   (props changed)
    lucene/dev/branches/lucene2858/solr/contrib/   (props changed)
    lucene/dev/branches/lucene2858/solr/contrib/clustering/src/test-files/   (props changed)
    lucene/dev/branches/lucene2858/solr/contrib/dataimporthandler-extras/src/java/   (props changed)
    lucene/dev/branches/lucene2858/solr/contrib/dataimporthandler/src/java/   (props changed)
    lucene/dev/branches/lucene2858/solr/contrib/dataimporthandler/src/test-files/   (props changed)
    lucene/dev/branches/lucene2858/solr/contrib/dataimporthandler/src/test/org/   (props changed)
    lucene/dev/branches/lucene2858/solr/contrib/uima/src/java/   (props changed)
    lucene/dev/branches/lucene2858/solr/contrib/uima/src/test-files/   (props changed)
    lucene/dev/branches/lucene2858/solr/core/   (props changed)
    lucene/dev/branches/lucene2858/solr/core/src/java/   (props changed)
    lucene/dev/branches/lucene2858/solr/core/src/java/org/apache/solr/analysis/HTMLStripCharFilterFactory.java
    lucene/dev/branches/lucene2858/solr/core/src/test/   (props changed)
    lucene/dev/branches/lucene2858/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
    lucene/dev/branches/lucene2858/solr/dev-tools/   (props changed)
    lucene/dev/branches/lucene2858/solr/example/   (props changed)
    lucene/dev/branches/lucene2858/solr/lib/   (props changed)
    lucene/dev/branches/lucene2858/solr/scripts/   (props changed)
    lucene/dev/branches/lucene2858/solr/site/   (props changed)
    lucene/dev/branches/lucene2858/solr/site-src/   (props changed)
    lucene/dev/branches/lucene2858/solr/solrj/   (props changed)
    lucene/dev/branches/lucene2858/solr/solrj/src/java/   (props changed)
    lucene/dev/branches/lucene2858/solr/solrj/src/test/org/apache/solr/client/   (props changed)
    lucene/dev/branches/lucene2858/solr/solrj/src/test/org/apache/solr/client/solrj/   (props changed)
    lucene/dev/branches/lucene2858/solr/solrj/src/test/org/apache/solr/common/   (props changed)
    lucene/dev/branches/lucene2858/solr/test-framework/   (props changed)
    lucene/dev/branches/lucene2858/solr/testlogging.properties   (props changed)
    lucene/dev/branches/lucene2858/solr/webapp/   (props changed)

Modified: lucene/dev/branches/lucene2858/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2858/lucene/CHANGES.txt?rev=1234501&r1=1234500&r2=1234501&view=diff
==============================================================================
--- lucene/dev/branches/lucene2858/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/lucene2858/lucene/CHANGES.txt Sun Jan 22 12:25:22 2012
@@ -792,6 +792,9 @@ New Features
   
 * LUCENE-3121: Add TypeTokenFilter that filters tokens based on
   their TypeAttribute.  (Tommaso Teofili via Uwe Schindler)
+
+* LUCENE-3690: Added HTMLStripCharFilter, a CharFilter that strips HTML
+  markup. (Steve Rowe)
   
 Bug fixes
 

Modified: lucene/dev/branches/lucene2858/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2858/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java?rev=1234501&r1=1234500&r2=1234501&view=diff
==============================================================================
--- lucene/dev/branches/lucene2858/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java (original)
+++ lucene/dev/branches/lucene2858/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java Sun Jan 22 12:25:22 2012
@@ -249,7 +249,42 @@ public class _TestUtil {
     }
   }
   
-  // TODO: make this more evil
+  private static final String[] HTML_CHAR_ENTITIES = {
+      "AElig", "Aacute", "Acirc", "Agrave", "Alpha", "AMP", "Aring", "Atilde",
+      "Auml", "Beta", "COPY", "Ccedil", "Chi", "Dagger", "Delta", "ETH",
+      "Eacute", "Ecirc", "Egrave", "Epsilon", "Eta", "Euml", "Gamma", "GT",
+      "Iacute", "Icirc", "Igrave", "Iota", "Iuml", "Kappa", "Lambda", "LT",
+      "Mu", "Ntilde", "Nu", "OElig", "Oacute", "Ocirc", "Ograve", "Omega",
+      "Omicron", "Oslash", "Otilde", "Ouml", "Phi", "Pi", "Prime", "Psi",
+      "QUOT", "REG", "Rho", "Scaron", "Sigma", "THORN", "Tau", "Theta",
+      "Uacute", "Ucirc", "Ugrave", "Upsilon", "Uuml", "Xi", "Yacute", "Yuml",
+      "Zeta", "aacute", "acirc", "acute", "aelig", "agrave", "alefsym",
+      "alpha", "amp", "and", "ang", "apos", "aring", "asymp", "atilde",
+      "auml", "bdquo", "beta", "brvbar", "bull", "cap", "ccedil", "cedil",
+      "cent", "chi", "circ", "clubs", "cong", "copy", "crarr", "cup",
+      "curren", "dArr", "dagger", "darr", "deg", "delta", "diams", "divide",
+      "eacute", "ecirc", "egrave", "empty", "emsp", "ensp", "epsilon",
+      "equiv", "eta", "eth", "euml", "euro", "exist", "fnof", "forall",
+      "frac12", "frac14", "frac34", "frasl", "gamma", "ge", "gt", "hArr",
+      "harr", "hearts", "hellip", "iacute", "icirc", "iexcl", "igrave",
+      "image", "infin", "int", "iota", "iquest", "isin", "iuml", "kappa",
+      "lArr", "lambda", "lang", "laquo", "larr", "lceil", "ldquo", "le",
+      "lfloor", "lowast", "loz", "lrm", "lsaquo", "lsquo", "lt", "macr",
+      "mdash", "micro", "middot", "minus", "mu", "nabla", "nbsp", "ndash",
+      "ne", "ni", "not", "notin", "nsub", "ntilde", "nu", "oacute", "ocirc",
+      "oelig", "ograve", "oline", "omega", "omicron", "oplus", "or", "ordf",
+      "ordm", "oslash", "otilde", "otimes", "ouml", "para", "part", "permil",
+      "perp", "phi", "pi", "piv", "plusmn", "pound", "prime", "prod", "prop",
+      "psi", "quot", "rArr", "radic", "rang", "raquo", "rarr", "rceil",
+      "rdquo", "real", "reg", "rfloor", "rho", "rlm", "rsaquo", "rsquo",
+      "sbquo", "scaron", "sdot", "sect", "shy", "sigma", "sigmaf", "sim",
+      "spades", "sub", "sube", "sum", "sup", "sup1", "sup2", "sup3", "supe",
+      "szlig", "tau", "there4", "theta", "thetasym", "thinsp", "thorn",
+      "tilde", "times", "trade", "uArr", "uacute", "uarr", "ucirc", "ugrave",
+      "uml", "upsih", "upsilon", "uuml", "weierp", "xi", "yacute", "yen",
+      "yuml", "zeta", "zwj", "zwnj"
+  };
+  
   public static String randomHtmlishString(Random random, int numElements) {
     final int end = random.nextInt(numElements);
     if (end == 0) {
@@ -258,17 +293,80 @@ public class _TestUtil {
     }
     StringBuilder sb = new StringBuilder();
     for (int i = 0; i < end; i++) {
-      int val = random.nextInt(10);
+      int val = random.nextInt(25);
       switch(val) {
         case 0: sb.append("<p>"); break;
-        case 1: sb.append("</p>"); break;
-        case 2: sb.append("<!--"); break;
-        case 3: sb.append("-->"); break;
-        case 4: sb.append("&#"); break;
-        case 5: sb.append(";"); break;
-        case 6: sb.append((char)_TestUtil.nextInt(random, '0', '9')); break;
-        default:
-          sb.append((char)_TestUtil.nextInt(random, 'a', 'z'));
+        case 1: {
+          sb.append("<");
+          sb.append("    ".substring(nextInt(random, 0, 4)));
+          sb.append(randomSimpleString(random));
+          for (int j = 0 ; j < nextInt(random, 0, 10) ; ++j) {
+            sb.append(' ');
+            sb.append(randomSimpleString(random));
+            sb.append(" ".substring(nextInt(random, 0, 1)));
+            sb.append('=');
+            sb.append(" ".substring(nextInt(random, 0, 1)));
+            sb.append("\"".substring(nextInt(random, 0, 1)));
+            sb.append(randomSimpleString(random));
+            sb.append("\"".substring(nextInt(random, 0, 1)));
+          }
+          sb.append("    ".substring(nextInt(random, 0, 4)));
+          sb.append("/".substring(nextInt(random, 0, 1)));
+          sb.append(">".substring(nextInt(random, 0, 1)));
+          break;
+        }
+        case 2: {
+          sb.append("</");
+          sb.append("    ".substring(nextInt(random, 0, 4)));
+          sb.append(randomSimpleString(random));
+          sb.append("    ".substring(nextInt(random, 0, 4)));
+          sb.append(">".substring(nextInt(random, 0, 1)));
+          break;
+        }
+        case 3: sb.append(">"); break;
+        case 4: sb.append("</p>"); break;
+        case 5: sb.append("<!--"); break;
+        case 6: sb.append("<!--#"); break;
+        case 7: sb.append("<script><!-- f('"); break;
+        case 8: sb.append("</script>"); break;
+        case 9: sb.append("<?"); break;
+        case 10: sb.append("?>"); break;
+        case 11: sb.append("\""); break;
+        case 12: sb.append("\\\""); break;
+        case 13: sb.append("'"); break;
+        case 14: sb.append("\\'"); break;
+        case 15: sb.append("-->"); break;
+        case 16: {
+          sb.append("&");
+          switch(nextInt(random, 0, 2)) {
+            case 0: sb.append(randomSimpleString(random)); break;
+            case 1: sb.append(HTML_CHAR_ENTITIES[random.nextInt(HTML_CHAR_ENTITIES.length)]); break;
+          }
+          sb.append(";".substring(nextInt(random, 0, 1)));
+          break;
+        }
+        case 17: {
+          sb.append("&#");
+          if (0 == nextInt(random, 0, 1)) {
+            sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1));
+            sb.append(";".substring(nextInt(random, 0, 1)));
+          }
+          break;
+        } 
+        case 18: {
+          sb.append("&#x");
+          if (0 == nextInt(random, 0, 1)) {
+            sb.append(Integer.toString(nextInt(random, 0, Integer.MAX_VALUE - 1), 16));
+            sb.append(";".substring(nextInt(random, 0, 1)));
+          }
+          break;
+        }
+          
+        case 19: sb.append(";"); break;
+        case 20: sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1)); break;
+        case 21: sb.append("\n"); break; // newline only; must not fall through into case 22
+        case 22: sb.append("          ".substring(nextInt(random, 0, 10))); break; // run of spaces only
+        default: sb.append(randomSimpleString(random)); // remaining values (23, 24): plain random text
       }
     }
     return sb.toString();

Modified: lucene/dev/branches/lucene2858/modules/analysis/common/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2858/modules/analysis/common/build.xml?rev=1234501&r1=1234500&r2=1234501&view=diff
==============================================================================
--- lucene/dev/branches/lucene2858/modules/analysis/common/build.xml (original)
+++ lucene/dev/branches/lucene2858/modules/analysis/common/build.xml Sun Jan 22 12:25:22 2012
@@ -31,14 +31,38 @@
   <target name="compile-core" depends="jflex-notice, common.compile-core"/>
 
   <target name="jflex" depends="jflex-check,clean-jflex,gen-uax29-supp-macros,
-                                jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,jflex-wiki-tokenizer"/>
+                                jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,
+                                jflex-wiki-tokenizer,jflex-HTMLStripCharFilter"/>
 
   <target name="gen-uax29-supp-macros">
     <subant target="gen-uax29-supp-macros">
        <fileset dir="../icu" includes="build.xml"/>
     </subant>
   </target>
-  
+
+  <target name="jflex-HTMLStripCharFilter"
+          depends="init,jflex-check,generate-jflex-html-char-entities"
+          if="jflex.present">
+    <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
+      <classpath refid="jflex.classpath"/>
+    </taskdef>
+    <jflex file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex"
+           outdir="src/java/org/apache/lucene/analysis/charfilter"
+           nobak="on"/>
+    <!-- Remove the inappropriate JFlex-generated constructors -->
+    <replaceregexp file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java"
+                   match="/\*\*\s*\*\s*Creates a new scanner.*this\(new java\.io\.InputStreamReader\(in\)\);\s*\}"
+                   replace="" flags="sg"/>
+  </target>
+
+  <target name="generate-jflex-html-char-entities">
+    <exec dir="src/java/org/apache/lucene/analysis/charfilter"
+          output="src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex"
+          executable="${python.exe}" failonerror="true" logerror="true">
+      <arg value="htmlentity.py"/>
+    </exec>
+  </target>
+
   <target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
     <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
       <classpath refid="jflex.classpath"/>

Modified: lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java?rev=1234501&r1=1234500&r2=1234501&view=diff
==============================================================================
--- lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java (original)
+++ lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java Sun Jan 22 12:25:22 2012
@@ -20,6 +20,8 @@ package org.apache.lucene.analysis.charf
 import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.util.ArrayUtil;
 
+import java.util.Arrays;
+
 /**
  * Base utility class for implementing a {@link CharFilter}.
  * You subclass this, and then record mappings by calling
@@ -71,6 +73,19 @@ public abstract class BaseCharFilter ext
       0 : diffs[size-1];
   }
 
+  /**
+   * <p>
+   *   Adds an offset correction mapping at the given output stream offset.
+   * </p>
+   * <p>
+   *   Assumption: the offset given with each successive call to this method
+   *   will not be smaller than the offset given at the previous invocation.
+   * </p>
+   *
+   * @param off The output stream offset at which to apply the correction
+   * @param cumulativeDiff The input offset is given by adding this
+   *                       to the output offset
+   */
   protected void addOffCorrectMap(int off, int cumulativeDiff) {
     if (offsets == null) {
       offsets = new int[64];
@@ -80,7 +95,15 @@ public abstract class BaseCharFilter ext
       diffs = ArrayUtil.grow(diffs);
     }
     
-    offsets[size] = off;
-    diffs[size++] = cumulativeDiff; 
+    assert (size == 0 || off >= offsets[size - 1]) // last recorded entry lives at size - 1
+        : "Offset #" + size + "(" + off + ") is less than the last recorded offset "
+          + offsets[size - 1] + "\n" + Arrays.toString(offsets) + "\n" + Arrays.toString(diffs);
+    // A new offset appends an entry; a repeated offset only replaces the previous diff.
+    if (size == 0 || off != offsets[size - 1]) {
+      offsets[size] = off;
+      diffs[size++] = cumulativeDiff;
+    } else { // Overwrite the diff at the last recorded offset
+      diffs[size - 1] = cumulativeDiff;
+    }
   }
 }