You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2016/06/29 15:05:54 UTC
nutch git commit: fix for NUTCH-2234
Repository: nutch
Updated Branches:
refs/heads/master d29be63bd -> abc01175d
Fix for NUTCH-2234 and NUTCH-2236.
Upgrades the Elasticsearch and Hadoop dependencies, which in turn
requires updating the Guava, Lucene, and Solrj dependencies:
- Elasticsearch 1.4.1 -> 2.3.3
- Lucene 4.10.2 -> 5.5.0
- Solrj 5.4.1 -> 5.5.0
- Guava 16.0.1 -> 18.0
- Hadoop 2.4.0 -> 2.7.2
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/abc01175
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/abc01175
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/abc01175
Branch: refs/heads/master
Commit: abc01175d8a1595db8d8d34a816c5f87f7474565
Parents: d29be63
Author: Joseph Naegele <jn...@grierforensics.com>
Authored: Wed May 25 18:27:31 2016 +0000
Committer: Joseph Naegele <jn...@grierforensics.com>
Committed: Mon Jun 27 21:16:25 2016 +0000
----------------------------------------------------------------------
build.xml | 8 ++-
default.properties | 5 +-
ivy/ivy.xml | 15 +++---
src/plugin/indexer-elastic/ivy.xml | 2 +-
src/plugin/indexer-elastic/plugin.xml | 52 +++++++++++++-------
.../indexwriter/elastic/ElasticIndexWriter.java | 19 ++++---
src/plugin/indexer-solr/ivy.xml | 2 +-
src/plugin/indexer-solr/plugin.xml | 2 +-
src/plugin/parsefilter-naivebayes/ivy.xml | 4 +-
src/plugin/parsefilter-naivebayes/plugin.xml | 4 +-
src/plugin/scoring-similarity/build.xml | 10 +---
src/plugin/scoring-similarity/ivy.xml | 1 +
src/plugin/scoring-similarity/plugin.xml | 3 +-
.../similarity/util/LuceneAnalyzerUtil.java | 4 +-
.../similarity/util/LuceneTokenizer.java | 21 +++++---
15 files changed, 91 insertions(+), 61 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/build.xml
----------------------------------------------------------------------
diff --git a/build.xml b/build.xml
index 5cff1ea..a1c41ed 100644
--- a/build.xml
+++ b/build.xml
@@ -234,8 +234,10 @@
<packageset dir="${plugins.dir}/urlnormalizer-slash/src/java"/>
<link href="${javadoc.link.java}"/>
- <link href="${javadoc.link.lucene}"/>
<link href="${javadoc.link.hadoop}"/>
+ <link href="${javadoc.link.lucene.core}"/>
+ <link href="${javadoc.link.lucene.analyzers-common}"/>
+ <link href="${javadoc.link.solr-solrj}"/>
<classpath refid="classpath"/>
<classpath>
@@ -675,8 +677,10 @@
<packageset dir="${plugins.dir}/urlnormalizer-slash/src/java"/>
<link href="${javadoc.link.java}"/>
- <link href="${javadoc.link.lucene}"/>
<link href="${javadoc.link.hadoop}"/>
+ <link href="${javadoc.link.lucene.core}"/>
+ <link href="${javadoc.link.lucene.analyzers-common}"/>
+ <link href="${javadoc.link.solr-solrj}"/>
<classpath refid="classpath"/>
<classpath>
http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/default.properties
----------------------------------------------------------------------
diff --git a/default.properties b/default.properties
index c8d9212..33390f7 100644
--- a/default.properties
+++ b/default.properties
@@ -44,7 +44,10 @@ test.junit.output.format = plain
javadoc.proxy.host=-J-DproxyHost=
javadoc.proxy.port=-J-DproxyPort=
javadoc.link.java=http://docs.oracle.com/javase/7/docs/api/
-javadoc.link.hadoop=http://hadoop.apache.org/docs/r2.4.0/api/
+javadoc.link.hadoop=http://hadoop.apache.org/docs/r2.7.2/api/
+javadoc.link.lucene.core=https://lucene.apache.org/core/5_5_0/core/
+javadoc.link.lucene.analyzers-common=https://lucene.apache.org/core/5_5_0/analyzers-common/
+javadoc.link.solr-solrj=https://lucene.apache.org/solr/5_5_0/solr-solrj/
javadoc.packages=org.apache.nutch.*
dist.dir=./dist
http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/ivy/ivy.xml
----------------------------------------------------------------------
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index 027f0c1..a4e9481 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -52,7 +52,7 @@
<dependency org="com.tdunning" name="t-digest" rev="3.1" />
<!-- Hadoop Dependencies -->
- <dependency org="org.apache.hadoop" name="hadoop-common" rev="2.4.0" conf="*->default">
+ <dependency org="org.apache.hadoop" name="hadoop-common" rev="2.7.2" conf="*->default">
<exclude org="hsqldb" name="hsqldb" />
<exclude org="net.sf.kosmosfs" name="kfs" />
<exclude org="net.java.dev.jets3t" name="jets3t" />
@@ -60,9 +60,9 @@
<exclude org="org.mortbay.jetty" name="jsp-*" />
<exclude org="ant" name="ant" />
</dependency>
- <dependency org="org.apache.hadoop" name="hadoop-hdfs" rev="2.4.0" conf="*->default"/>
- <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-core" rev="2.4.0" conf="*->default"/>
- <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-jobclient" rev="2.4.0" conf="*->default"/>
+ <dependency org="org.apache.hadoop" name="hadoop-hdfs" rev="2.7.2" conf="*->default"/>
+ <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-core" rev="2.7.2" conf="*->default"/>
+ <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-jobclient" rev="2.7.2" conf="*->default"/>
<!-- End of Hadoop Dependencies -->
<dependency org="org.apache.tika" name="tika-core" rev="1.12" />
@@ -72,7 +72,7 @@
<dependency org="xerces" name="xmlParserAPIs" rev="2.6.2" />
<dependency org="oro" name="oro" rev="2.0.8" />
- <dependency org="com.google.guava" name="guava" rev="16.0.1" />
+ <dependency org="com.google.guava" name="guava" rev="18.0" />
<dependency org="com.github.crawler-commons" name="crawler-commons" rev="0.6" />
@@ -88,7 +88,6 @@
<dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.5.1" conf="*->default"/>
<dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider" rev="2.5.1" conf="*->default"/>
- <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="4.10.2" conf="*->default"></dependency>
<!-- WARC artifacts needed -->
<dependency org="org.netpreserve.commons" name="webarchive-commons" rev="1.1.5" conf="*->default">
<exclude module="hadoop-core"/>
@@ -105,6 +104,10 @@
<dependency org="org.mortbay.jetty" name="jetty-client" rev="6.1.22" conf="test->default" />
<dependency org="org.mortbay.jetty" name="jetty" rev="6.1.22" conf="test->default" />
<dependency org="org.mortbay.jetty" name="jetty-util" rev="6.1.22" conf="test->default" />
+ <dependency org="tomcat" name="jasper-runtime" rev="5.5.23" conf="test->default" />
+ <dependency org="tomcat" name="jasper-compiler" rev="5.5.23" conf="test->default">
+ <exclude org="ant" name="ant" />
+ </dependency>
<!-- end of test artifacts -->
<!-- web app dependencies -->
http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/indexer-elastic/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/indexer-elastic/ivy.xml b/src/plugin/indexer-elastic/ivy.xml
index 6681410..f34075f 100644
--- a/src/plugin/indexer-elastic/ivy.xml
+++ b/src/plugin/indexer-elastic/ivy.xml
@@ -36,7 +36,7 @@
</publications>
<dependencies>
- <dependency org="org.elasticsearch" name="elasticsearch" rev="1.4.1"
+ <dependency org="org.elasticsearch" name="elasticsearch" rev="2.3.3"
conf="*->default"/>
</dependencies>
http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/indexer-elastic/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/indexer-elastic/plugin.xml b/src/plugin/indexer-elastic/plugin.xml
index 02aad85..d99a665 100644
--- a/src/plugin/indexer-elastic/plugin.xml
+++ b/src/plugin/indexer-elastic/plugin.xml
@@ -22,25 +22,39 @@
<library name="indexer-elastic.jar">
<export name="*" />
</library>
-
- <library name="elasticsearch-1.4.1.jar"/>
- <library name="lucene-analyzers-common-4.10.2.jar"/>
- <library name="lucene-codecs-4.10.2.jar"/>
- <library name="lucene-core-4.10.2.jar"/>
- <library name="lucene-grouping-4.10.2.jar"/>
- <library name="lucene-highlighter-4.10.2.jar"/>
- <library name="lucene-join-4.10.2.jar"/>
- <library name="lucene-memory-4.10.2.jar"/>
- <library name="lucene-misc-4.10.2.jar"/>
- <library name="lucene-queries-4.10.2.jar"/>
- <library name="lucene-queryparser-4.10.2.jar"/>
- <library name="lucene-sandbox-4.10.2.jar"/>
- <library name="lucene-spatial-4.10.2.jar"/>
- <library name="lucene-suggest-4.10.2.jar"/>
- <library name="spatial4j-0.4.1.jar"/>
- <library name="antlr-runtime-3.5.jar"/>
- <library name="asm-4.1"/>
- <library name="asm-commons-4.1.jar"/>
+ <library name="elasticsearch-2.3.3.jar"/>
+ <library name="commons-cli-1.3.1.jar"/>
+ <library name="compress-lzf-1.0.2.jar"/>
+ <library name="guava-18.0.jar"/>
+ <library name="HdrHistogram-2.1.6.jar"/>
+ <library name="hppc-0.7.1.jar"/>
+ <library name="indexer-elastic.jar"/>
+ <library name="jackson-core-2.6.6.jar"/>
+ <library name="jackson-dataformat-cbor-2.6.6.jar"/>
+ <library name="jackson-dataformat-smile-2.6.6.jar"/>
+ <library name="jackson-dataformat-yaml-2.6.6.jar"/>
+ <library name="joda-convert-1.2.jar"/>
+ <library name="joda-time-2.8.2.jar"/>
+ <library name="jsr166e-1.1.0.jar"/>
+ <library name="lucene-analyzers-common-5.5.0.jar"/>
+ <library name="lucene-backward-codecs-5.5.0.jar"/>
+ <library name="lucene-core-5.5.0.jar"/>
+ <library name="lucene-grouping-5.5.0.jar"/>
+ <library name="lucene-highlighter-5.5.0.jar"/>
+ <library name="lucene-join-5.5.0.jar"/>
+ <library name="lucene-memory-5.5.0.jar"/>
+ <library name="lucene-misc-5.5.0.jar"/>
+ <library name="lucene-queries-5.5.0.jar"/>
+ <library name="lucene-queryparser-5.5.0.jar"/>
+ <library name="lucene-sandbox-5.5.0.jar"/>
+ <library name="lucene-spatial-5.5.0.jar"/>
+ <library name="lucene-spatial3d-5.5.0.jar"/>
+ <library name="lucene-suggest-5.5.0.jar"/>
+ <library name="netty-3.10.5.Final.jar"/>
+ <library name="securesm-1.0.jar"/>
+ <library name="snakeyaml-1.15.jar"/>
+ <library name="spatial4j-0.5.jar"/>
+ <library name="t-digest-3.0.jar"/>
</runtime>
<requires>
http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
----------------------------------------------------------------------
diff --git a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
index c1827e7..9367e41 100644
--- a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
+++ b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
@@ -21,6 +21,7 @@ import static org.elasticsearch.node.NodeBuilder.nodeBuilder;
import java.io.BufferedReader;
import java.io.IOException;
+import java.net.InetAddress;
import java.util.HashMap;
import java.util.Map;
@@ -38,9 +39,8 @@ import org.elasticsearch.action.delete.DeleteRequestBuilder;
import org.elasticsearch.action.index.IndexRequestBuilder;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.transport.TransportClient;
-import org.elasticsearch.common.settings.ImmutableSettings;
-import org.elasticsearch.common.settings.ImmutableSettings.Builder;
import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.settings.Settings.Builder;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.node.Node;
import org.slf4j.Logger;
@@ -79,8 +79,7 @@ public class ElasticIndexWriter implements IndexWriter {
host = job.get(ElasticConstants.HOST);
port = job.getInt(ElasticConstants.PORT, 9300);
- Builder settingsBuilder = ImmutableSettings.settingsBuilder().classLoader(
- Settings.class.getClassLoader());
+ Builder settingsBuilder = Settings.builder();
BufferedReader reader = new BufferedReader(
job.getConfResourceAsReader("elasticsearch.conf"));
@@ -106,8 +105,10 @@ public class ElasticIndexWriter implements IndexWriter {
// Prefer TransportClient
if (host != null && port > 1) {
- client = new TransportClient(settings)
- .addTransportAddress(new InetSocketTransportAddress(host, port));
+ TransportClient transportClient = TransportClient.builder()
+ .settings(settings).build()
+ .addTransportAddress(new InetSocketTransportAddress(InetAddress.getByName(host), port));
+ client = transportClient;
} else if (clusterName != null) {
node = nodeBuilder().settings(settings).client(true).node();
client = node.client();
@@ -141,8 +142,10 @@ public class ElasticIndexWriter implements IndexWriter {
bulkLength += value.toString().length();
}
} else {
- source.put(fieldName, doc.getFieldValue(fieldName));
- bulkLength += doc.getFieldValue(fieldName).toString().length();
+ if (doc.getFieldValue(fieldName) != null) {
+ source.put(fieldName, doc.getFieldValue(fieldName));
+ bulkLength += doc.getFieldValue(fieldName).toString().length();
+ }
}
}
request.setSource(source);
http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/indexer-solr/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/indexer-solr/ivy.xml b/src/plugin/indexer-solr/ivy.xml
index 566ec78..65e97e7 100644
--- a/src/plugin/indexer-solr/ivy.xml
+++ b/src/plugin/indexer-solr/ivy.xml
@@ -36,7 +36,7 @@
</publications>
<dependencies>
- <dependency org="org.apache.solr" name="solr-solrj" rev="5.4.1"/>
+ <dependency org="org.apache.solr" name="solr-solrj" rev="5.5.0"/>
<dependency org="org.apache.httpcomponents" name="httpcore" rev="4.4.1" conf="*->default"/>
<dependency org="org.apache.httpcomponents" name="httpmime" rev="4.4.1" conf="*->default"/>
</dependencies>
http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/indexer-solr/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/indexer-solr/plugin.xml b/src/plugin/indexer-solr/plugin.xml
index c92d3aa..0e86796 100644
--- a/src/plugin/indexer-solr/plugin.xml
+++ b/src/plugin/indexer-solr/plugin.xml
@@ -28,7 +28,7 @@
<library name="httpmime-4.4.1.jar"/>
<library name="noggit-0.6.jar"/>
<library name="slf4j-api-1.7.7.jar"/>
- <library name="solr-solrj-5.4.1.jar"/>
+ <library name="solr-solrj-5.5.0.jar"/>
<library name="stax2-api-3.1.4.jar"/>
<library name="woodstox-core-asl-4.4.1.jar"/>
<library name="zookeeper-3.4.6.jar"/>
http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/parsefilter-naivebayes/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parsefilter-naivebayes/ivy.xml b/src/plugin/parsefilter-naivebayes/ivy.xml
index eea057f..08cca2c 100644
--- a/src/plugin/parsefilter-naivebayes/ivy.xml
+++ b/src/plugin/parsefilter-naivebayes/ivy.xml
@@ -41,8 +41,8 @@
<dependency org="org.apache.mahout" name="mahout-core" rev="0.9" >
<exclude org="org.apache.mrunit" name="mrunit"/>
</dependency>
- <dependency org="org.apache.lucene" name="lucene-core" rev="4.10.2" />
- <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="4.10.2" />
+ <dependency org="org.apache.lucene" name="lucene-core" rev="5.5.0" />
+ <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="5.5.0" />
</dependencies>
http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/parsefilter-naivebayes/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parsefilter-naivebayes/plugin.xml b/src/plugin/parsefilter-naivebayes/plugin.xml
index b3217a8..ac15041 100644
--- a/src/plugin/parsefilter-naivebayes/plugin.xml
+++ b/src/plugin/parsefilter-naivebayes/plugin.xml
@@ -31,8 +31,8 @@
<library name="guava-14.0.1.jar"/>
<library name="jackson-core-asl-1.9.12.jar"/>
<library name="jackson-mapper-asl-1.9.12.jar"/>
- <library name="lucene-analyzers-common-4.10.2.jar"/>
- <library name="lucene-core-4.10.2.jar"/>
+ <library name="lucene-analyzers-common-5.5.0.jar"/>
+ <library name="lucene-core-5.5.0.jar"/>
<library name="mahout-core-0.9.jar"/>
<library name="mahout-math-0.10.1.jar"/>
<library name="slf4j-api-1.7.12.jar"/>
http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/scoring-similarity/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/build.xml b/src/plugin/scoring-similarity/build.xml
index 98abc70..66ac8f3 100644
--- a/src/plugin/scoring-similarity/build.xml
+++ b/src/plugin/scoring-similarity/build.xml
@@ -18,15 +18,7 @@
<project name="scoring-similarity" default="jar-core">
<import file="../build-plugin.xml"/>
- <target name="deps-jar">
- <ant target="jar" inheritall="false" dir="../indexer-elastic" />
- </target>
- <!-- Add compilation dependencies to classpath -->
- <path id="plugin.deps">
- <fileset dir="${nutch.root}/build">
- <include name="**/indexer-elastic/*.jar" />
- </fileset>
- </path>
+
<!-- Deploy Unit test dependencies -->
<target name="deps-test">
<ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/scoring-similarity/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/ivy.xml b/src/plugin/scoring-similarity/ivy.xml
index 1a86d68..be0a1de 100644
--- a/src/plugin/scoring-similarity/ivy.xml
+++ b/src/plugin/scoring-similarity/ivy.xml
@@ -36,6 +36,7 @@
</publications>
<dependencies>
+ <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="5.5.0" conf="*->default"/>
</dependencies>
</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/scoring-similarity/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/plugin.xml b/src/plugin/scoring-similarity/plugin.xml
index e3a04b2..9639c18 100644
--- a/src/plugin/scoring-similarity/plugin.xml
+++ b/src/plugin/scoring-similarity/plugin.xml
@@ -26,7 +26,8 @@
<library name="scoring-similarity.jar">
<export name="*"/>
</library>
- <library name="lucene-core-4.10.2.jar"/>
+ <library name="lucene-analyzers-common-5.5.0.jar"/>
+ <library name="lucene-core-5.5.0.jar"/>
</runtime>
<requires>
http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
index 78b0fa9..4b519bc 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
@@ -70,8 +70,8 @@ public class LuceneAnalyzerUtil extends Analyzer{
}
@Override
- protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer source = new ClassicTokenizer(reader);
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer source = new ClassicTokenizer();
TokenStream filter = new LowerCaseFilter(source);
if(stopSet != null) {
filter = new StopFilter(filter, stopSet);
http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
index 6f6d4d4..acb987c 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
@@ -19,6 +19,7 @@ package org.apache.nutch.scoring.similarity.util;
import java.io.StringReader;
import java.util.List;
+import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
@@ -113,21 +114,29 @@ public class LuceneTokenizer {
return tokenStream;
}
- private TokenStream generateTokenStreamFromText(String content, TokenizerType tokenizer){
- switch(tokenizer){
+ private TokenStream generateTokenStreamFromText(String content, TokenizerType tokenizerType){
+ Tokenizer tokenizer = null;
+ switch(tokenizerType){
case CLASSIC:
- tokenStream = new ClassicTokenizer(new StringReader(content));
+ tokenizer = new ClassicTokenizer();
break;
case STANDARD:
- tokenStream = new StandardTokenizer(new StringReader(content));
+ default:
+ tokenizer = new StandardTokenizer();
}
+
+ tokenizer.setReader(new StringReader(content));
+
+ tokenStream = tokenizer;
+
return tokenStream;
}
private TokenStream createNGramTokenStream(String content, int mingram, int maxgram) {
- tokenStream = new StandardTokenizer(new StringReader(content));
- tokenStream = new LowerCaseFilter(tokenStream);
+ Tokenizer tokenizer = new StandardTokenizer();
+ tokenizer.setReader(new StringReader(content));
+ tokenStream = new LowerCaseFilter(tokenizer);
tokenStream = applyStemmer(stemFilterType);
ShingleFilter shingleFilter = new ShingleFilter(tokenStream, mingram, maxgram);
shingleFilter.setOutputUnigrams(false);