You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2012/01/22 13:25:24 UTC
svn commit: r1234501 [1/3] - in /lucene/dev/branches/lucene2858: ./
dev-tools/idea/lucene/contrib/ lucene/ lucene/contrib/
lucene/contrib/sandbox/src/test/org/apache/lucene/sandbox/queries/regex/
lucene/src/java/org/apache/lucene/codecs/lucene3x/ lucen...
Author: uschindler
Date: Sun Jan 22 12:25:22 2012
New Revision: 1234501
URL: http://svn.apache.org/viewvc?rev=1234501&view=rev
Log:
LUCENE-2858: Reverse merged revision(s) 1-0 from lucene/dev/trunk
Added:
lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex
- copied unchanged from r1234500, lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex
lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
- copied unchanged from r1234500, lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
- copied unchanged from r1234500, lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py
- copied unchanged from r1234500, lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py
lucene/dev/branches/lucene2858/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/MS-Word 14 generated.htm
- copied unchanged from r1234500, lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/MS-Word 14 generated.htm
lucene/dev/branches/lucene2858/modules/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateHTMLStripCharFilterSupplementaryMacros.java
- copied unchanged from r1234500, lucene/dev/trunk/modules/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateHTMLStripCharFilterSupplementaryMacros.java
lucene/dev/branches/lucene2858/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilter.java
- copied unchanged from r1234500, lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilter.java
lucene/dev/branches/lucene2858/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilterFactory.java
- copied unchanged from r1234500, lucene/dev/trunk/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilterFactory.java
lucene/dev/branches/lucene2858/solr/core/src/test/org/apache/solr/analysis/LegacyHTMLStripCharFilterTest.java
- copied unchanged from r1234500, lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/LegacyHTMLStripCharFilterTest.java
lucene/dev/branches/lucene2858/solr/core/src/test/org/apache/solr/analysis/TestHTMLStripCharFilterFactory.java
- copied unchanged from r1234500, lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/TestHTMLStripCharFilterFactory.java
lucene/dev/branches/lucene2858/solr/core/src/test/org/apache/solr/analysis/htmlStripReaderTest.html
- copied unchanged from r1234500, lucene/dev/trunk/solr/core/src/test/org/apache/solr/analysis/htmlStripReaderTest.html
Modified:
lucene/dev/branches/lucene2858/ (props changed)
lucene/dev/branches/lucene2858/dev-tools/idea/lucene/contrib/ (props changed)
lucene/dev/branches/lucene2858/lucene/ (props changed)
lucene/dev/branches/lucene2858/lucene/CHANGES.txt
lucene/dev/branches/lucene2858/lucene/contrib/CHANGES.txt (props changed)
lucene/dev/branches/lucene2858/lucene/contrib/sandbox/src/test/org/apache/lucene/sandbox/queries/regex/TestSpanRegexQuery.java (props changed)
lucene/dev/branches/lucene2858/lucene/src/java/org/apache/lucene/codecs/lucene3x/TermInfosReaderIndex.java (props changed)
lucene/dev/branches/lucene2858/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java
lucene/dev/branches/lucene2858/modules/analysis/common/build.xml
lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java
lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/package.html
lucene/dev/branches/lucene2858/modules/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
lucene/dev/branches/lucene2858/modules/analysis/icu/build.xml
lucene/dev/branches/lucene2858/modules/analysis/kuromoji/ (props changed)
lucene/dev/branches/lucene2858/modules/benchmark/ (props changed)
lucene/dev/branches/lucene2858/modules/facet/ (props changed)
lucene/dev/branches/lucene2858/modules/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java
lucene/dev/branches/lucene2858/modules/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyReader.java
lucene/dev/branches/lucene2858/modules/queryparser/src/test/org/apache/lucene/queryparser/xml/builders/TestNumericRangeFilterBuilder.java (props changed)
lucene/dev/branches/lucene2858/solr/ (props changed)
lucene/dev/branches/lucene2858/solr/CHANGES.txt (contents, props changed)
lucene/dev/branches/lucene2858/solr/LICENSE.txt (props changed)
lucene/dev/branches/lucene2858/solr/NOTICE.txt (props changed)
lucene/dev/branches/lucene2858/solr/README.txt (props changed)
lucene/dev/branches/lucene2858/solr/build.xml (props changed)
lucene/dev/branches/lucene2858/solr/client/ (props changed)
lucene/dev/branches/lucene2858/solr/common-build.xml (props changed)
lucene/dev/branches/lucene2858/solr/contrib/ (props changed)
lucene/dev/branches/lucene2858/solr/contrib/clustering/src/test-files/ (props changed)
lucene/dev/branches/lucene2858/solr/contrib/dataimporthandler-extras/src/java/ (props changed)
lucene/dev/branches/lucene2858/solr/contrib/dataimporthandler/src/java/ (props changed)
lucene/dev/branches/lucene2858/solr/contrib/dataimporthandler/src/test-files/ (props changed)
lucene/dev/branches/lucene2858/solr/contrib/dataimporthandler/src/test/org/ (props changed)
lucene/dev/branches/lucene2858/solr/contrib/uima/src/java/ (props changed)
lucene/dev/branches/lucene2858/solr/contrib/uima/src/test-files/ (props changed)
lucene/dev/branches/lucene2858/solr/core/ (props changed)
lucene/dev/branches/lucene2858/solr/core/src/java/ (props changed)
lucene/dev/branches/lucene2858/solr/core/src/java/org/apache/solr/analysis/HTMLStripCharFilterFactory.java
lucene/dev/branches/lucene2858/solr/core/src/test/ (props changed)
lucene/dev/branches/lucene2858/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
lucene/dev/branches/lucene2858/solr/dev-tools/ (props changed)
lucene/dev/branches/lucene2858/solr/example/ (props changed)
lucene/dev/branches/lucene2858/solr/lib/ (props changed)
lucene/dev/branches/lucene2858/solr/scripts/ (props changed)
lucene/dev/branches/lucene2858/solr/site/ (props changed)
lucene/dev/branches/lucene2858/solr/site-src/ (props changed)
lucene/dev/branches/lucene2858/solr/solrj/ (props changed)
lucene/dev/branches/lucene2858/solr/solrj/src/java/ (props changed)
lucene/dev/branches/lucene2858/solr/solrj/src/test/org/apache/solr/client/ (props changed)
lucene/dev/branches/lucene2858/solr/solrj/src/test/org/apache/solr/client/solrj/ (props changed)
lucene/dev/branches/lucene2858/solr/solrj/src/test/org/apache/solr/common/ (props changed)
lucene/dev/branches/lucene2858/solr/test-framework/ (props changed)
lucene/dev/branches/lucene2858/solr/testlogging.properties (props changed)
lucene/dev/branches/lucene2858/solr/webapp/ (props changed)
Modified: lucene/dev/branches/lucene2858/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2858/lucene/CHANGES.txt?rev=1234501&r1=1234500&r2=1234501&view=diff
==============================================================================
--- lucene/dev/branches/lucene2858/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/lucene2858/lucene/CHANGES.txt Sun Jan 22 12:25:22 2012
@@ -792,6 +792,9 @@ New Features
* LUCENE-3121: Add TypeTokenFilter that filters tokens based on
their TypeAttribute. (Tommaso Teofili via Uwe Schindler)
+
+* LUCENE-3690: Added HTMLStripCharFilter, a CharFilter that strips HTML
+ markup. (Steve Rowe)
Bug fixes
Modified: lucene/dev/branches/lucene2858/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2858/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java?rev=1234501&r1=1234500&r2=1234501&view=diff
==============================================================================
--- lucene/dev/branches/lucene2858/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java (original)
+++ lucene/dev/branches/lucene2858/lucene/src/test-framework/java/org/apache/lucene/util/_TestUtil.java Sun Jan 22 12:25:22 2012
@@ -249,7 +249,42 @@ public class _TestUtil {
}
}
- // TODO: make this more evil
+ private static final String[] HTML_CHAR_ENTITIES = {
+ "AElig", "Aacute", "Acirc", "Agrave", "Alpha", "AMP", "Aring", "Atilde",
+ "Auml", "Beta", "COPY", "Ccedil", "Chi", "Dagger", "Delta", "ETH",
+ "Eacute", "Ecirc", "Egrave", "Epsilon", "Eta", "Euml", "Gamma", "GT",
+ "Iacute", "Icirc", "Igrave", "Iota", "Iuml", "Kappa", "Lambda", "LT",
+ "Mu", "Ntilde", "Nu", "OElig", "Oacute", "Ocirc", "Ograve", "Omega",
+ "Omicron", "Oslash", "Otilde", "Ouml", "Phi", "Pi", "Prime", "Psi",
+ "QUOT", "REG", "Rho", "Scaron", "Sigma", "THORN", "Tau", "Theta",
+ "Uacute", "Ucirc", "Ugrave", "Upsilon", "Uuml", "Xi", "Yacute", "Yuml",
+ "Zeta", "aacute", "acirc", "acute", "aelig", "agrave", "alefsym",
+ "alpha", "amp", "and", "ang", "apos", "aring", "asymp", "atilde",
+ "auml", "bdquo", "beta", "brvbar", "bull", "cap", "ccedil", "cedil",
+ "cent", "chi", "circ", "clubs", "cong", "copy", "crarr", "cup",
+ "curren", "dArr", "dagger", "darr", "deg", "delta", "diams", "divide",
+ "eacute", "ecirc", "egrave", "empty", "emsp", "ensp", "epsilon",
+ "equiv", "eta", "eth", "euml", "euro", "exist", "fnof", "forall",
+ "frac12", "frac14", "frac34", "frasl", "gamma", "ge", "gt", "hArr",
+ "harr", "hearts", "hellip", "iacute", "icirc", "iexcl", "igrave",
+ "image", "infin", "int", "iota", "iquest", "isin", "iuml", "kappa",
+ "lArr", "lambda", "lang", "laquo", "larr", "lceil", "ldquo", "le",
+ "lfloor", "lowast", "loz", "lrm", "lsaquo", "lsquo", "lt", "macr",
+ "mdash", "micro", "middot", "minus", "mu", "nabla", "nbsp", "ndash",
+ "ne", "ni", "not", "notin", "nsub", "ntilde", "nu", "oacute", "ocirc",
+ "oelig", "ograve", "oline", "omega", "omicron", "oplus", "or", "ordf",
+ "ordm", "oslash", "otilde", "otimes", "ouml", "para", "part", "permil",
+ "perp", "phi", "pi", "piv", "plusmn", "pound", "prime", "prod", "prop",
+ "psi", "quot", "rArr", "radic", "rang", "raquo", "rarr", "rceil",
+ "rdquo", "real", "reg", "rfloor", "rho", "rlm", "rsaquo", "rsquo",
+ "sbquo", "scaron", "sdot", "sect", "shy", "sigma", "sigmaf", "sim",
+ "spades", "sub", "sube", "sum", "sup", "sup1", "sup2", "sup3", "supe",
+ "szlig", "tau", "there4", "theta", "thetasym", "thinsp", "thorn",
+ "tilde", "times", "trade", "uArr", "uacute", "uarr", "ucirc", "ugrave",
+ "uml", "upsih", "upsilon", "uuml", "weierp", "xi", "yacute", "yen",
+ "yuml", "zeta", "zwj", "zwnj"
+ };
+
public static String randomHtmlishString(Random random, int numElements) {
final int end = random.nextInt(numElements);
if (end == 0) {
@@ -258,17 +293,80 @@ public class _TestUtil {
}
StringBuilder sb = new StringBuilder();
for (int i = 0; i < end; i++) {
- int val = random.nextInt(10);
+ int val = random.nextInt(25);
switch(val) {
case 0: sb.append("<p>"); break;
- case 1: sb.append("</p>"); break;
- case 2: sb.append("<!--"); break;
- case 3: sb.append("-->"); break;
- case 4: sb.append("&#"); break;
- case 5: sb.append(";"); break;
- case 6: sb.append((char)_TestUtil.nextInt(random, '0', '9')); break;
- default:
- sb.append((char)_TestUtil.nextInt(random, 'a', 'z'));
+ case 1: {
+ sb.append("<");
+ sb.append(" ".substring(nextInt(random, 0, 4)));
+ sb.append(randomSimpleString(random));
+ for (int j = 0 ; j < nextInt(random, 0, 10) ; ++j) {
+ sb.append(' ');
+ sb.append(randomSimpleString(random));
+ sb.append(" ".substring(nextInt(random, 0, 1)));
+ sb.append('=');
+ sb.append(" ".substring(nextInt(random, 0, 1)));
+ sb.append("\"".substring(nextInt(random, 0, 1)));
+ sb.append(randomSimpleString(random));
+ sb.append("\"".substring(nextInt(random, 0, 1)));
+ }
+ sb.append(" ".substring(nextInt(random, 0, 4)));
+ sb.append("/".substring(nextInt(random, 0, 1)));
+ sb.append(">".substring(nextInt(random, 0, 1)));
+ break;
+ }
+ case 2: {
+ sb.append("</");
+ sb.append(" ".substring(nextInt(random, 0, 4)));
+ sb.append(randomSimpleString(random));
+ sb.append(" ".substring(nextInt(random, 0, 4)));
+ sb.append(">".substring(nextInt(random, 0, 1)));
+ break;
+ }
+ case 3: sb.append(">"); break;
+ case 4: sb.append("</p>"); break;
+ case 5: sb.append("<!--"); break;
+ case 6: sb.append("<!--#"); break;
+ case 7: sb.append("<script><!-- f('"); break;
+ case 8: sb.append("</script>"); break;
+ case 9: sb.append("<?"); break;
+ case 10: sb.append("?>"); break;
+ case 11: sb.append("\""); break;
+ case 12: sb.append("\\\""); break;
+ case 13: sb.append("'"); break;
+ case 14: sb.append("\\'"); break;
+ case 15: sb.append("-->"); break;
+ case 16: {
+ sb.append("&");
+ switch(nextInt(random, 0, 2)) {
+ case 0: sb.append(randomSimpleString(random)); break;
+ case 1: sb.append(HTML_CHAR_ENTITIES[random.nextInt(HTML_CHAR_ENTITIES.length)]); break;
+ }
+ sb.append(";".substring(nextInt(random, 0, 1)));
+ break;
+ }
+ case 17: {
+ sb.append("&#");
+ if (0 == nextInt(random, 0, 1)) {
+ sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1));
+ sb.append(";".substring(nextInt(random, 0, 1)));
+ }
+ break;
+ }
+ case 18: {
+ sb.append("&#x");
+ if (0 == nextInt(random, 0, 1)) {
+ sb.append(Integer.toString(nextInt(random, 0, Integer.MAX_VALUE - 1), 16));
+ sb.append(";".substring(nextInt(random, 0, 1)));
+ }
+ break;
+ }
+
+ case 19: sb.append(";"); break;
+ case 20: sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1)); break;
+ case 21: sb.append("\n");
+ case 22: sb.append(" ".substring(nextInt(random, 0, 10)));
+ default: sb.append(randomSimpleString(random));
}
}
return sb.toString();
Modified: lucene/dev/branches/lucene2858/modules/analysis/common/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2858/modules/analysis/common/build.xml?rev=1234501&r1=1234500&r2=1234501&view=diff
==============================================================================
--- lucene/dev/branches/lucene2858/modules/analysis/common/build.xml (original)
+++ lucene/dev/branches/lucene2858/modules/analysis/common/build.xml Sun Jan 22 12:25:22 2012
@@ -31,14 +31,38 @@
<target name="compile-core" depends="jflex-notice, common.compile-core"/>
<target name="jflex" depends="jflex-check,clean-jflex,gen-uax29-supp-macros,
- jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,jflex-wiki-tokenizer"/>
+ jflex-StandardAnalyzer,jflex-UAX29URLEmailTokenizer,
+ jflex-wiki-tokenizer,jflex-HTMLStripCharFilter"/>
<target name="gen-uax29-supp-macros">
<subant target="gen-uax29-supp-macros">
<fileset dir="../icu" includes="build.xml"/>
</subant>
</target>
-
+
+ <target name="jflex-HTMLStripCharFilter"
+ depends="init,jflex-check,generate-jflex-html-char-entities"
+ if="jflex.present">
+ <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
+ <classpath refid="jflex.classpath"/>
+ </taskdef>
+ <jflex file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex"
+ outdir="src/java/org/apache/lucene/analysis/charfilter"
+ nobak="on"/>
+ <!-- Remove the inappropriate JFlex-generated constructors -->
+ <replaceregexp file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java"
+ match="/\*\*\s*\*\s*Creates a new scanner.*this\(new java\.io\.InputStreamReader\(in\)\);\s*\}"
+ replace="" flags="sg"/>
+ </target>
+
+ <target name="generate-jflex-html-char-entities">
+ <exec dir="src/java/org/apache/lucene/analysis/charfilter"
+ output="src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex"
+ executable="${python.exe}" failonerror="true" logerror="true">
+ <arg value="htmlentity.py"/>
+ </exec>
+ </target>
+
<target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
<classpath refid="jflex.classpath"/>
Modified: lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java?rev=1234501&r1=1234500&r2=1234501&view=diff
==============================================================================
--- lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java (original)
+++ lucene/dev/branches/lucene2858/modules/analysis/common/src/java/org/apache/lucene/analysis/charfilter/BaseCharFilter.java Sun Jan 22 12:25:22 2012
@@ -20,6 +20,8 @@ package org.apache.lucene.analysis.charf
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.util.ArrayUtil;
+import java.util.Arrays;
+
/**
* Base utility class for implementing a {@link CharFilter}.
* You subclass this, and then record mappings by calling
@@ -71,6 +73,19 @@ public abstract class BaseCharFilter ext
0 : diffs[size-1];
}
+ /**
+ * <p>
+ * Adds an offset correction mapping at the given output stream offset.
+ * </p>
+ * <p>
+ * Assumption: the offset given with each successive call to this method
+ * will not be smaller than the offset given at the previous invocation.
+ * </p>
+ *
+ * @param off The output stream offset at which to apply the correction
+ * @param cumulativeDiff The input offset is given by adding this
+ * to the output offset
+ */
protected void addOffCorrectMap(int off, int cumulativeDiff) {
if (offsets == null) {
offsets = new int[64];
@@ -80,7 +95,15 @@ public abstract class BaseCharFilter ext
diffs = ArrayUtil.grow(diffs);
}
- offsets[size] = off;
- diffs[size++] = cumulativeDiff;
+ assert (size == 0 || off >= offsets[size])
+ : "Offset #" + size + "(" + off + ") is less than the last recorded offset "
+ + offsets[size] + "\n" + Arrays.toString(offsets) + "\n" + Arrays.toString(diffs);
+
+ if (size == 0 || off != offsets[size - 1]) {
+ offsets[size] = off;
+ diffs[size++] = cumulativeDiff;
+ } else { // Overwrite the diff at the last recorded offset
+ diffs[size - 1] = cumulativeDiff;
+ }
}
}