You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2016/06/29 15:05:54 UTC

nutch git commit: fix for NUTCH-2234

Repository: nutch
Updated Branches:
  refs/heads/master d29be63bd -> abc01175d


fix for NUTCH-2234

and NUTCH-2236.
Upgrades the Elasticsearch and Hadoop dependencies, which in turn
requires updating the Guava, Lucene and Solrj dependencies:

- Elasticsearch 1.4.1 -> 2.3.3
- Lucene 4.10.2 -> 5.5.0
- Solrj 5.4.1 -> 5.5.0
- Guava 16.0.1 -> 18.0
- Hadoop 2.4.0 -> 2.7.2


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/abc01175
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/abc01175
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/abc01175

Branch: refs/heads/master
Commit: abc01175d8a1595db8d8d34a816c5f87f7474565
Parents: d29be63
Author: Joseph Naegele <jn...@grierforensics.com>
Authored: Wed May 25 18:27:31 2016 +0000
Committer: Joseph Naegele <jn...@grierforensics.com>
Committed: Mon Jun 27 21:16:25 2016 +0000

----------------------------------------------------------------------
 build.xml                                       |  8 ++-
 default.properties                              |  5 +-
 ivy/ivy.xml                                     | 15 +++---
 src/plugin/indexer-elastic/ivy.xml              |  2 +-
 src/plugin/indexer-elastic/plugin.xml           | 52 +++++++++++++-------
 .../indexwriter/elastic/ElasticIndexWriter.java | 19 ++++---
 src/plugin/indexer-solr/ivy.xml                 |  2 +-
 src/plugin/indexer-solr/plugin.xml              |  2 +-
 src/plugin/parsefilter-naivebayes/ivy.xml       |  4 +-
 src/plugin/parsefilter-naivebayes/plugin.xml    |  4 +-
 src/plugin/scoring-similarity/build.xml         | 10 +---
 src/plugin/scoring-similarity/ivy.xml           |  1 +
 src/plugin/scoring-similarity/plugin.xml        |  3 +-
 .../similarity/util/LuceneAnalyzerUtil.java     |  4 +-
 .../similarity/util/LuceneTokenizer.java        | 21 +++++---
 15 files changed, 91 insertions(+), 61 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/build.xml
----------------------------------------------------------------------
diff --git a/build.xml b/build.xml
index 5cff1ea..a1c41ed 100644
--- a/build.xml
+++ b/build.xml
@@ -234,8 +234,10 @@
       <packageset dir="${plugins.dir}/urlnormalizer-slash/src/java"/>
 
       <link href="${javadoc.link.java}"/>
-      <link href="${javadoc.link.lucene}"/>
       <link href="${javadoc.link.hadoop}"/>
+      <link href="${javadoc.link.lucene.core}"/>
+      <link href="${javadoc.link.lucene.analyzers-common}"/>
+      <link href="${javadoc.link.solr-solrj}"/>
       
       <classpath refid="classpath"/>
     	<classpath>
@@ -675,8 +677,10 @@
       <packageset dir="${plugins.dir}/urlnormalizer-slash/src/java"/>
 
       <link href="${javadoc.link.java}"/>
-      <link href="${javadoc.link.lucene}"/>
       <link href="${javadoc.link.hadoop}"/>
+      <link href="${javadoc.link.lucene.core}"/>
+      <link href="${javadoc.link.lucene.analyzers-common}"/>
+      <link href="${javadoc.link.solr-solrj}"/>
       
       <classpath refid="classpath"/>
     	<classpath>

http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/default.properties
----------------------------------------------------------------------
diff --git a/default.properties b/default.properties
index c8d9212..33390f7 100644
--- a/default.properties
+++ b/default.properties
@@ -44,7 +44,10 @@ test.junit.output.format = plain
 javadoc.proxy.host=-J-DproxyHost=
 javadoc.proxy.port=-J-DproxyPort=
 javadoc.link.java=http://docs.oracle.com/javase/7/docs/api/
-javadoc.link.hadoop=http://hadoop.apache.org/docs/r2.4.0/api/
+javadoc.link.hadoop=http://hadoop.apache.org/docs/r2.7.2/api/
+javadoc.link.lucene.core=https://lucene.apache.org/core/5_5_0/core/
+javadoc.link.lucene.analyzers-common=https://lucene.apache.org/core/5_5_0/analyzers-common/
+javadoc.link.solr-solrj=https://lucene.apache.org/solr/5_5_0/solr-solrj/
 javadoc.packages=org.apache.nutch.*
 
 dist.dir=./dist

http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/ivy/ivy.xml
----------------------------------------------------------------------
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index 027f0c1..a4e9481 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -52,7 +52,7 @@
         <dependency org="com.tdunning" name="t-digest" rev="3.1" />
             
         <!-- Hadoop Dependencies -->
-		<dependency org="org.apache.hadoop" name="hadoop-common" rev="2.4.0" conf="*->default">
+		<dependency org="org.apache.hadoop" name="hadoop-common" rev="2.7.2" conf="*->default">
 			<exclude org="hsqldb" name="hsqldb" />
 			<exclude org="net.sf.kosmosfs" name="kfs" />
 			<exclude org="net.java.dev.jets3t" name="jets3t" />
@@ -60,9 +60,9 @@
 			<exclude org="org.mortbay.jetty" name="jsp-*" />
 			<exclude org="ant" name="ant" />
 		</dependency>
-        <dependency org="org.apache.hadoop" name="hadoop-hdfs" rev="2.4.0" conf="*->default"/>
-        <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-core" rev="2.4.0" conf="*->default"/>
-        <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-jobclient" rev="2.4.0" conf="*->default"/>
+        <dependency org="org.apache.hadoop" name="hadoop-hdfs" rev="2.7.2" conf="*->default"/>
+        <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-core" rev="2.7.2" conf="*->default"/>
+        <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-jobclient" rev="2.7.2" conf="*->default"/>
         <!-- End of Hadoop Dependencies -->
 
 		<dependency org="org.apache.tika" name="tika-core" rev="1.12" />
@@ -72,7 +72,7 @@
 		<dependency org="xerces" name="xmlParserAPIs" rev="2.6.2" />
 		<dependency org="oro" name="oro" rev="2.0.8" />
 
-		<dependency org="com.google.guava" name="guava" rev="16.0.1" />
+		<dependency org="com.google.guava" name="guava" rev="18.0" />
 
 		<dependency org="com.github.crawler-commons" name="crawler-commons" rev="0.6" />
 
@@ -88,7 +88,6 @@
         <dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.5.1" conf="*->default"/>
         <dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider" rev="2.5.1" conf="*->default"/>	
         
-        <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="4.10.2" conf="*->default"></dependency>
 		<!-- WARC artifacts needed  -->
 		<dependency org="org.netpreserve.commons" name="webarchive-commons" rev="1.1.5" conf="*->default">
 			<exclude module="hadoop-core"/>
@@ -105,6 +104,10 @@
 		<dependency org="org.mortbay.jetty" name="jetty-client" rev="6.1.22" conf="test->default" />
 		<dependency org="org.mortbay.jetty" name="jetty" rev="6.1.22" conf="test->default" />
 		<dependency org="org.mortbay.jetty" name="jetty-util" rev="6.1.22" conf="test->default" />
+		<dependency org="tomcat" name="jasper-runtime" rev="5.5.23" conf="test->default" />
+		<dependency org="tomcat" name="jasper-compiler" rev="5.5.23" conf="test->default">
+			<exclude org="ant" name="ant" />
+		</dependency>
 		<!-- end of test artifacts -->
 
 		<!-- web app dependencies -->

http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/indexer-elastic/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/indexer-elastic/ivy.xml b/src/plugin/indexer-elastic/ivy.xml
index 6681410..f34075f 100644
--- a/src/plugin/indexer-elastic/ivy.xml
+++ b/src/plugin/indexer-elastic/ivy.xml
@@ -36,7 +36,7 @@
   </publications>
 
   <dependencies>
-        <dependency org="org.elasticsearch" name="elasticsearch" rev="1.4.1"
+        <dependency org="org.elasticsearch" name="elasticsearch" rev="2.3.3"
                     conf="*->default"/>
   </dependencies>
   

http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/indexer-elastic/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/indexer-elastic/plugin.xml b/src/plugin/indexer-elastic/plugin.xml
index 02aad85..d99a665 100644
--- a/src/plugin/indexer-elastic/plugin.xml
+++ b/src/plugin/indexer-elastic/plugin.xml
@@ -22,25 +22,39 @@
     <library name="indexer-elastic.jar">
       <export name="*" />
     </library>
-    
-    <library name="elasticsearch-1.4.1.jar"/>
-    <library name="lucene-analyzers-common-4.10.2.jar"/>
-    <library name="lucene-codecs-4.10.2.jar"/>
-    <library name="lucene-core-4.10.2.jar"/>
-    <library name="lucene-grouping-4.10.2.jar"/>
-    <library name="lucene-highlighter-4.10.2.jar"/>
-    <library name="lucene-join-4.10.2.jar"/>
-    <library name="lucene-memory-4.10.2.jar"/>
-    <library name="lucene-misc-4.10.2.jar"/>
-    <library name="lucene-queries-4.10.2.jar"/>
-    <library name="lucene-queryparser-4.10.2.jar"/>
-    <library name="lucene-sandbox-4.10.2.jar"/>
-    <library name="lucene-spatial-4.10.2.jar"/>
-    <library name="lucene-suggest-4.10.2.jar"/>
-    <library name="spatial4j-0.4.1.jar"/>
-    <library name="antlr-runtime-3.5.jar"/>
-    <library name="asm-4.1"/>
-    <library name="asm-commons-4.1.jar"/>
+    <library name="elasticsearch-2.3.3.jar"/>
+    <library name="commons-cli-1.3.1.jar"/>
+    <library name="compress-lzf-1.0.2.jar"/>
+    <library name="guava-18.0.jar"/>
+    <library name="HdrHistogram-2.1.6.jar"/>
+    <library name="hppc-0.7.1.jar"/>
+    <library name="indexer-elastic.jar"/>
+    <library name="jackson-core-2.6.6.jar"/>
+    <library name="jackson-dataformat-cbor-2.6.6.jar"/>
+    <library name="jackson-dataformat-smile-2.6.6.jar"/>
+    <library name="jackson-dataformat-yaml-2.6.6.jar"/>
+    <library name="joda-convert-1.2.jar"/>
+    <library name="joda-time-2.8.2.jar"/>
+    <library name="jsr166e-1.1.0.jar"/>
+    <library name="lucene-analyzers-common-5.5.0.jar"/>
+    <library name="lucene-backward-codecs-5.5.0.jar"/>
+    <library name="lucene-core-5.5.0.jar"/>
+    <library name="lucene-grouping-5.5.0.jar"/>
+    <library name="lucene-highlighter-5.5.0.jar"/>
+    <library name="lucene-join-5.5.0.jar"/>
+    <library name="lucene-memory-5.5.0.jar"/>
+    <library name="lucene-misc-5.5.0.jar"/>
+    <library name="lucene-queries-5.5.0.jar"/>
+    <library name="lucene-queryparser-5.5.0.jar"/>
+    <library name="lucene-sandbox-5.5.0.jar"/>
+    <library name="lucene-spatial-5.5.0.jar"/>
+    <library name="lucene-spatial3d-5.5.0.jar"/>
+    <library name="lucene-suggest-5.5.0.jar"/>
+    <library name="netty-3.10.5.Final.jar"/>
+    <library name="securesm-1.0.jar"/>
+    <library name="snakeyaml-1.15.jar"/>
+    <library name="spatial4j-0.5.jar"/>
+    <library name="t-digest-3.0.jar"/>
   </runtime>
 
   <requires>

http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
----------------------------------------------------------------------
diff --git a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
index c1827e7..9367e41 100644
--- a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
+++ b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
@@ -21,6 +21,7 @@ import static org.elasticsearch.node.NodeBuilder.nodeBuilder;
 
 import java.io.BufferedReader;
 import java.io.IOException;
+import java.net.InetAddress;
 import java.util.HashMap;
 import java.util.Map;
 
@@ -38,9 +39,8 @@ import org.elasticsearch.action.delete.DeleteRequestBuilder;
 import org.elasticsearch.action.index.IndexRequestBuilder;
 import org.elasticsearch.client.Client;
 import org.elasticsearch.client.transport.TransportClient;
-import org.elasticsearch.common.settings.ImmutableSettings;
-import org.elasticsearch.common.settings.ImmutableSettings.Builder;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.settings.Settings.Builder;
 import org.elasticsearch.common.transport.InetSocketTransportAddress;
 import org.elasticsearch.node.Node;
 import org.slf4j.Logger;
@@ -79,8 +79,7 @@ public class ElasticIndexWriter implements IndexWriter {
     host = job.get(ElasticConstants.HOST);
     port = job.getInt(ElasticConstants.PORT, 9300);
 
-    Builder settingsBuilder = ImmutableSettings.settingsBuilder().classLoader(
-        Settings.class.getClassLoader());
+    Builder settingsBuilder = Settings.builder();
 
     BufferedReader reader = new BufferedReader(
         job.getConfResourceAsReader("elasticsearch.conf"));
@@ -106,8 +105,10 @@ public class ElasticIndexWriter implements IndexWriter {
 
     // Prefer TransportClient
     if (host != null && port > 1) {
-      client = new TransportClient(settings)
-          .addTransportAddress(new InetSocketTransportAddress(host, port));
+      TransportClient transportClient = TransportClient.builder()
+          .settings(settings).build()
+          .addTransportAddress(new InetSocketTransportAddress(InetAddress.getByName(host), port));
+      client = transportClient;
     } else if (clusterName != null) {
       node = nodeBuilder().settings(settings).client(true).node();
       client = node.client();
@@ -141,8 +142,10 @@ public class ElasticIndexWriter implements IndexWriter {
           bulkLength += value.toString().length();
         }
       } else {
-        source.put(fieldName, doc.getFieldValue(fieldName));
-        bulkLength += doc.getFieldValue(fieldName).toString().length();
+        if (doc.getFieldValue(fieldName) != null) {
+          source.put(fieldName, doc.getFieldValue(fieldName));
+          bulkLength += doc.getFieldValue(fieldName).toString().length();
+        }
       }
     }
     request.setSource(source);

http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/indexer-solr/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/indexer-solr/ivy.xml b/src/plugin/indexer-solr/ivy.xml
index 566ec78..65e97e7 100644
--- a/src/plugin/indexer-solr/ivy.xml
+++ b/src/plugin/indexer-solr/ivy.xml
@@ -36,7 +36,7 @@
   </publications>
 
   <dependencies>
-    <dependency org="org.apache.solr" name="solr-solrj" rev="5.4.1"/>
+    <dependency org="org.apache.solr" name="solr-solrj" rev="5.5.0"/>
     <dependency org="org.apache.httpcomponents" name="httpcore" rev="4.4.1" conf="*->default"/>
     <dependency org="org.apache.httpcomponents" name="httpmime" rev="4.4.1" conf="*->default"/>
   </dependencies>

http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/indexer-solr/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/indexer-solr/plugin.xml b/src/plugin/indexer-solr/plugin.xml
index c92d3aa..0e86796 100644
--- a/src/plugin/indexer-solr/plugin.xml
+++ b/src/plugin/indexer-solr/plugin.xml
@@ -28,7 +28,7 @@
       <library name="httpmime-4.4.1.jar"/>
       <library name="noggit-0.6.jar"/>
       <library name="slf4j-api-1.7.7.jar"/>
-      <library name="solr-solrj-5.4.1.jar"/>
+      <library name="solr-solrj-5.5.0.jar"/>
       <library name="stax2-api-3.1.4.jar"/>
       <library name="woodstox-core-asl-4.4.1.jar"/>
       <library name="zookeeper-3.4.6.jar"/> 

http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/parsefilter-naivebayes/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parsefilter-naivebayes/ivy.xml b/src/plugin/parsefilter-naivebayes/ivy.xml
index eea057f..08cca2c 100644
--- a/src/plugin/parsefilter-naivebayes/ivy.xml
+++ b/src/plugin/parsefilter-naivebayes/ivy.xml
@@ -41,8 +41,8 @@
     <dependency org="org.apache.mahout" name="mahout-core" rev="0.9" >
       <exclude org="org.apache.mrunit" name="mrunit"/>
     </dependency>
-    <dependency org="org.apache.lucene" name="lucene-core" rev="4.10.2" />
-    <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="4.10.2" />
+    <dependency org="org.apache.lucene" name="lucene-core" rev="5.5.0" />
+    <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="5.5.0" />
 
   </dependencies>
   

http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/parsefilter-naivebayes/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parsefilter-naivebayes/plugin.xml b/src/plugin/parsefilter-naivebayes/plugin.xml
index b3217a8..ac15041 100644
--- a/src/plugin/parsefilter-naivebayes/plugin.xml
+++ b/src/plugin/parsefilter-naivebayes/plugin.xml
@@ -31,8 +31,8 @@
       <library name="guava-14.0.1.jar"/>
       <library name="jackson-core-asl-1.9.12.jar"/>
       <library name="jackson-mapper-asl-1.9.12.jar"/>
-      <library name="lucene-analyzers-common-4.10.2.jar"/>
-      <library name="lucene-core-4.10.2.jar"/>
+      <library name="lucene-analyzers-common-5.5.0.jar"/>
+      <library name="lucene-core-5.5.0.jar"/>
       <library name="mahout-core-0.9.jar"/>
       <library name="mahout-math-0.10.1.jar"/>
       <library name="slf4j-api-1.7.12.jar"/>

http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/scoring-similarity/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/build.xml b/src/plugin/scoring-similarity/build.xml
index 98abc70..66ac8f3 100644
--- a/src/plugin/scoring-similarity/build.xml
+++ b/src/plugin/scoring-similarity/build.xml
@@ -18,15 +18,7 @@
 <project name="scoring-similarity" default="jar-core">
 
   <import file="../build-plugin.xml"/>
-	  <target name="deps-jar">
-	    <ant target="jar" inheritall="false" dir="../indexer-elastic" />
-	  </target>
-	  <!-- Add compilation dependencies to classpath -->
-	  <path id="plugin.deps">
-	    <fileset dir="${nutch.root}/build">
-	      <include name="**/indexer-elastic/*.jar" />
-	    </fileset>
-	  </path>
+
   <!-- Deploy Unit test dependencies -->
   <target name="deps-test">
     <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>

http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/scoring-similarity/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/ivy.xml b/src/plugin/scoring-similarity/ivy.xml
index 1a86d68..be0a1de 100644
--- a/src/plugin/scoring-similarity/ivy.xml
+++ b/src/plugin/scoring-similarity/ivy.xml
@@ -36,6 +36,7 @@
   </publications>
 
   <dependencies>
+    <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="5.5.0" conf="*->default"/>
   </dependencies>
   
 </ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/scoring-similarity/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/plugin.xml b/src/plugin/scoring-similarity/plugin.xml
index e3a04b2..9639c18 100644
--- a/src/plugin/scoring-similarity/plugin.xml
+++ b/src/plugin/scoring-similarity/plugin.xml
@@ -26,7 +26,8 @@
       <library name="scoring-similarity.jar">
          <export name="*"/>
       </library>
-      <library name="lucene-core-4.10.2.jar"/>
+      <library name="lucene-analyzers-common-5.5.0.jar"/>
+      <library name="lucene-core-5.5.0.jar"/>
    </runtime>
 
    <requires>

http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
index 78b0fa9..4b519bc 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
@@ -70,8 +70,8 @@ public class LuceneAnalyzerUtil extends Analyzer{
   }
     
   @Override
-  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-    Tokenizer source = new ClassicTokenizer(reader);
+  protected TokenStreamComponents createComponents(String fieldName) {
+    Tokenizer source = new ClassicTokenizer();
     TokenStream filter = new LowerCaseFilter(source);
     if(stopSet != null) {
       filter = new StopFilter(filter, stopSet);

http://git-wip-us.apache.org/repos/asf/nutch/blob/abc01175/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
index 6f6d4d4..acb987c 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
@@ -19,6 +19,7 @@ package org.apache.nutch.scoring.similarity.util;
 import java.io.StringReader;
 import java.util.List;
 
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
@@ -113,21 +114,29 @@ public class LuceneTokenizer {
     return tokenStream;
   }
 
-  private TokenStream generateTokenStreamFromText(String content, TokenizerType tokenizer){
-    switch(tokenizer){
+  private TokenStream generateTokenStreamFromText(String content, TokenizerType tokenizerType){
+    Tokenizer tokenizer = null;
+    switch(tokenizerType){
     case CLASSIC:
-      tokenStream = new ClassicTokenizer(new StringReader(content));
+      tokenizer = new ClassicTokenizer();
       break;
 
     case STANDARD:
-      tokenStream = new StandardTokenizer(new StringReader(content));
+    default:
+      tokenizer = new StandardTokenizer();
     }
+
+    tokenizer.setReader(new StringReader(content));
+
+    tokenStream = tokenizer;
+
     return tokenStream;
   }
 
   private TokenStream createNGramTokenStream(String content, int mingram, int maxgram) {
-    tokenStream = new StandardTokenizer(new StringReader(content));
-    tokenStream = new LowerCaseFilter(tokenStream);
+    Tokenizer tokenizer = new StandardTokenizer();
+    tokenizer.setReader(new StringReader(content));
+    tokenStream = new LowerCaseFilter(tokenizer);
     tokenStream = applyStemmer(stemFilterType);
     ShingleFilter shingleFilter = new ShingleFilter(tokenStream, mingram, maxgram);
     shingleFilter.setOutputUnigrams(false);