You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2015/11/14 20:31:41 UTC
svn commit: r1714355 - in /lucene/dev/branches/branch_5x: ./ lucene/
lucene/analysis/ lucene/analysis/common/
lucene/analysis/common/src/java/org/apache/lucene/analysis/core/
lucene/analysis/common/src/java/org/apache/lucene/analysis/util/
lucene/analy...
Author: uschindler
Date: Sat Nov 14 19:31:41 2015
New Revision: 1714355
URL: http://svn.apache.org/viewvc?rev=1714355&view=rev
Log:
Merged revision(s) 1714354 from lucene/dev/trunk:
LUCENE-6874: Add a new UnicodeWhitespaceTokenizer to analysis/common that uses Unicode character properties extracted from ICU4J to tokenize text on whitespace
Added:
lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UnicodeWhitespaceAnalyzer.java
- copied unchanged from r1714354, lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UnicodeWhitespaceAnalyzer.java
lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UnicodeWhitespaceTokenizer.java
- copied unchanged from r1714354, lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UnicodeWhitespaceTokenizer.java
lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/UnicodeProps.java
- copied unchanged from r1714354, lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/UnicodeProps.java
lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUnicodeWhitespaceTokenizer.java
- copied unchanged from r1714354, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestUnicodeWhitespaceTokenizer.java
lucene/dev/branches/branch_5x/lucene/analysis/common/src/tools/groovy/
- copied from r1714354, lucene/dev/trunk/lucene/analysis/common/src/tools/groovy/
lucene/dev/branches/branch_5x/lucene/benchmark/conf/wstok.alg
- copied unchanged from r1714354, lucene/dev/trunk/lucene/benchmark/conf/wstok.alg
Modified:
lucene/dev/branches/branch_5x/ (props changed)
lucene/dev/branches/branch_5x/lucene/ (props changed)
lucene/dev/branches/branch_5x/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_5x/lucene/analysis/ (props changed)
lucene/dev/branches/branch_5x/lucene/analysis/common/build.xml
lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizer.java
lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizerFactory.java
lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java
lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
lucene/dev/branches/branch_5x/lucene/benchmark/ (props changed)
lucene/dev/branches/branch_5x/lucene/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java
Modified: lucene/dev/branches/branch_5x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/CHANGES.txt?rev=1714355&r1=1714354&r2=1714355&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/lucene/CHANGES.txt Sat Nov 14 19:31:41 2015
@@ -39,6 +39,11 @@ New Features
within a "ring" (beyond a minimum distance and below a maximum
distance) (Nick Knize via Mike McCandless)
+* LUCENE-6874: Add a new UnicodeWhitespaceTokenizer to analysis/common
+ that uses Unicode character properties extracted from ICU4J to tokenize
+ text on whitespace. This tokenizer will split on non-breaking
+ space (NBSP), too. (David Smiley, Uwe Schindler, Steve Rowe)
+
API Changes
* LUCENE-6590: Query.setBoost(), Query.getBoost() and Query.clone() are gone.
Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/build.xml?rev=1714355&r1=1714354&r2=1714355&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/build.xml (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/build.xml Sat Nov 14 19:31:41 2015
@@ -31,6 +31,8 @@
<property name="snowball.programs.dir" location="src/java/org/tartarus/snowball/ext"/>
+ <property name="unicode-props-file" location="src/java/org/apache/lucene/analysis/util/UnicodeProps.java"/>
+
<target name="jflex" depends="-install-jflex,clean-jflex,-jflex-StandardAnalyzer,-jflex-UAX29URLEmailTokenizer,
-jflex-wiki-tokenizer,-jflex-HTMLStripCharFilter,-jflex-legacy"/>
@@ -158,6 +160,18 @@
</delete>
</target>
+ <target xmlns:ivy="antlib:org.apache.ivy.ant" name="-resolve-icu4j" unless="icu4j.resolved" depends="ivy-availability-check,ivy-configure">
+ <loadproperties prefix="ivyversions" srcFile="${common.dir}/ivy-versions.properties"/>
+ <ivy:cachepath organisation="com.ibm.icu" module="icu4j" revision="${ivyversions./com.ibm.icu/icu4j}"
+ inline="true" conf="default" transitive="true" pathid="icu4j.classpath"/>
+ <property name="icu4j.resolved" value="true"/>
+ </target>
+
+ <target name="unicode-data" depends="-resolve-icu4j,resolve-groovy">
+ <groovy classpathref="icu4j.classpath" src="src/tools/groovy/generate-unicode-data.groovy"/>
+ <fixcrlf file="${unicode-props-file}" encoding="UTF-8"/>
+ </target>
+
<property name="tld.zones" value="http://www.internic.net/zones/root.zone"/>
<property name="tld.output" location="src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro"/>
@@ -185,7 +199,7 @@
<target name="javadocs" depends="module-build.javadocs"/>
- <target name="regenerate" depends="jflex"/>
+ <target name="regenerate" depends="jflex,unicode-data"/>
<target name="patch-snowball" description="Patches all snowball programs in '${snowball.programs.dir}' to make them work with MethodHandles">
<fileset id="snowball.programs" dir="${snowball.programs.dir}" includes="*Stemmer.java"/>
Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizer.java?rev=1714355&r1=1714354&r2=1714355&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizer.java Sat Nov 14 19:31:41 2015
@@ -22,8 +22,11 @@ import org.apache.lucene.analysis.util.C
import org.apache.lucene.util.AttributeFactory;
/**
- * A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
+ * A tokenizer that divides text at whitespace characters as defined by
+ * {@link Character#isWhitespace(int)}. Note: That definition explicitly excludes the non-breaking space.
* Adjacent sequences of non-Whitespace characters form tokens.
+ *
+ * @see UnicodeWhitespaceTokenizer
*/
public final class WhitespaceTokenizer extends CharTokenizer {
Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizerFactory.java?rev=1714355&r1=1714354&r2=1714355&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizerFactory.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizerFactory.java Sat Nov 14 19:31:41 2015
@@ -17,32 +17,56 @@ package org.apache.lucene.analysis.core;
* limitations under the License.
*/
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Map;
+
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
-import java.util.Map;
-
/**
* Factory for {@link WhitespaceTokenizer}.
* <pre class="prettyprint">
* &lt;fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
- * &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ * &lt;tokenizer class="solr.WhitespaceTokenizerFactory" rule="unicode"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
+ *
+ * Options:
+ * <ul>
+ * <li>rule: either "java" for {@link WhitespaceTokenizer}
+ * or "unicode" for {@link UnicodeWhitespaceTokenizer}</li>
+ * </ul>
*/
public class WhitespaceTokenizerFactory extends TokenizerFactory {
+ public static final String RULE_JAVA = "java";
+ public static final String RULE_UNICODE = "unicode";
+ private static final Collection<String> RULE_NAMES = Arrays.asList(RULE_JAVA, RULE_UNICODE);
+
+ private final String rule;
/** Creates a new WhitespaceTokenizerFactory */
public WhitespaceTokenizerFactory(Map<String,String> args) {
super(args);
+
+ rule = get(args, "rule", RULE_NAMES, RULE_JAVA);
+
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
@Override
- public WhitespaceTokenizer create(AttributeFactory factory) {
- return new WhitespaceTokenizer(factory);
+ public Tokenizer create(AttributeFactory factory) {
+ switch (rule) {
+ case RULE_JAVA:
+ return new WhitespaceTokenizer(factory);
+ case RULE_UNICODE:
+ return new UnicodeWhitespaceTokenizer(factory);
+ default:
+ throw new AssertionError();
+ }
}
}
Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java?rev=1714355&r1=1714354&r2=1714355&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java Sat Nov 14 19:31:41 2015
@@ -101,7 +101,8 @@ public class TestAllAnalyzersHaveFactori
ReversePathHierarchyTokenizer.class, // this is supported via an option to PathHierarchyTokenizer's factory
SnowballFilter.class, // this is called SnowballPorterFilterFactory
PatternKeywordMarkerFilter.class,
- SetKeywordMarkerFilter.class
+ SetKeywordMarkerFilter.class,
+ UnicodeWhitespaceTokenizer.class // a supported option via WhitespaceTokenizerFactory
);
}
Modified: lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java?rev=1714355&r1=1714354&r2=1714355&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java (original)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java Sat Nov 14 19:31:41 2015
@@ -130,7 +130,7 @@ public class TestAnalyzers extends BaseT
@Override
public TokenStreamComponents createComponents(String fieldName) {
- Tokenizer tokenizer = new WhitespaceTokenizer();
+ Tokenizer tokenizer = random().nextBoolean() ? new WhitespaceTokenizer() : new UnicodeWhitespaceTokenizer();
return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer));
}
@@ -140,7 +140,7 @@ public class TestAnalyzers extends BaseT
@Override
public TokenStreamComponents createComponents(String fieldName) {
- Tokenizer tokenizer = new WhitespaceTokenizer();
+ Tokenizer tokenizer = random().nextBoolean() ? new WhitespaceTokenizer() : new UnicodeWhitespaceTokenizer();
return new TokenStreamComponents(tokenizer, new UpperCaseFilter(tokenizer));
}
@@ -230,7 +230,7 @@ public class TestAnalyzers extends BaseT
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
- Analyzer analyzers[] = new Analyzer[] { new WhitespaceAnalyzer(), new SimpleAnalyzer(), new StopAnalyzer() };
+ Analyzer analyzers[] = new Analyzer[] { new WhitespaceAnalyzer(), new SimpleAnalyzer(), new StopAnalyzer(), new UnicodeWhitespaceAnalyzer() };
for (Analyzer analyzer : analyzers) {
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
}
@@ -239,7 +239,7 @@ public class TestAnalyzers extends BaseT
/** blast some random large strings through the analyzer */
public void testRandomHugeStrings() throws Exception {
- Analyzer analyzers[] = new Analyzer[] { new WhitespaceAnalyzer(), new SimpleAnalyzer(), new StopAnalyzer() };
+ Analyzer analyzers[] = new Analyzer[] { new WhitespaceAnalyzer(), new SimpleAnalyzer(), new StopAnalyzer(), new UnicodeWhitespaceAnalyzer() };
for (Analyzer analyzer : analyzers) {
checkRandomData(random(), analyzer, 100*RANDOM_MULTIPLIER, 8192);
}
Modified: lucene/dev/branches/branch_5x/lucene/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java?rev=1714355&r1=1714354&r2=1714355&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java (original)
+++ lucene/dev/branches/branch_5x/lucene/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java Sat Nov 14 19:31:41 2015
@@ -47,6 +47,7 @@ public class ExtractReuters {
public void extract() throws IOException {
long count = 0;
+ Files.createDirectories(outputDir);
try (DirectoryStream<Path> stream = Files.newDirectoryStream(reutersDir, "*.sgm")) {
for (Path sgmFile : stream) {
extractFile(sgmFile);
@@ -70,7 +71,7 @@ public class ExtractReuters {
* Override if you wish to change what is extracted
*/
protected void extractFile(Path sgmFile) {
- try (BufferedReader reader = Files.newBufferedReader(sgmFile, StandardCharsets.UTF_8)) {
+ try (BufferedReader reader = Files.newBufferedReader(sgmFile, StandardCharsets.ISO_8859_1)) {
StringBuilder buffer = new StringBuilder(1024);
StringBuilder outBuffer = new StringBuilder(1024);