You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2015/06/29 07:14:45 UTC
svn commit: r1688084 - in /nutch/trunk: ./ conf/ ivy/ src/plugin/ src/plugin/parsefilter-naivebayes/ src/plugin/parsefilter-naivebayes/src/ src/plugin/parsefilter-naivebayes/src/java/ src/plugin/parsefilter-naivebayes/src/java/org/ src/plugin/parsefilt...

Author: mattmann
Date: Mon Jun 29 05:14:45 2015
New Revision: 1688084

URL: http://svn.apache.org/r1688084
Log:
fix for NUTCH-2038: Naive Bayes classifier based html Parse filter (for filtering outlinks) contributed by Asitang Mishra <as...@gmail.com> this closes #39

Added:
    nutch/trunk/src/plugin/parsefilter-naivebayes/
    nutch/trunk/src/plugin/parsefilter-naivebayes/build.xml
    nutch/trunk/src/plugin/parsefilter-naivebayes/ivy.xml
    nutch/trunk/src/plugin/parsefilter-naivebayes/plugin.xml
    nutch/trunk/src/plugin/parsefilter-naivebayes/src/
    nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/
    nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/
    nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/
    nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/
    nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/
    nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/
    nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java
    nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
    nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/package-info.java
Modified:
    nutch/trunk/.gitignore
    nutch/trunk/build.xml
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/ivy/ivy.xml
    nutch/trunk/src/plugin/build.xml

Modified: nutch/trunk/.gitignore
URL: http://svn.apache.org/viewvc/nutch/trunk/.gitignore?rev=1688084&r1=1688083&r2=1688084&view=diff
==============================================================================
--- nutch/trunk/.gitignore (original)
+++ nutch/trunk/.gitignore Mon Jun 29 05:14:45 2015
@@ -5,3 +5,4 @@ conf/slaves
 build/
 runtime/
 logs/
+/bin/

Modified: nutch/trunk/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1688084&r1=1688083&r2=1688084&view=diff
==============================================================================
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Mon Jun 29 05:14:45 2015
@@ -212,6 +212,7 @@
       <packageset dir="${plugins.dir}/urlfilter-regex/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-suffix/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-validator/src/java"/>
+      <packageset dir="${plugins.dir}/parsefilter-naivebayes/src/java"/>
       <packageset dir="${plugins.dir}/urlmeta/src/java"/>
       <packageset dir="${plugins.dir}/urlnormalizer-basic/src/java"/>
       <packageset dir="${plugins.dir}/urlnormalizer-host/src/java"/>
@@ -623,6 +624,7 @@
       <packageset dir="${plugins.dir}/urlfilter-regex/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-suffix/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-validator/src/java"/>
+      <packageset dir="${plugins.dir}/parsefilter-naivebayes/src/java"/>
       <packageset dir="${plugins.dir}/urlmeta/src/java"/>
       <packageset dir="${plugins.dir}/urlnormalizer-basic/src/java"/>
       <packageset dir="${plugins.dir}/urlnormalizer-host/src/java"/>
@@ -1040,6 +1042,7 @@
         <source path="${plugins.dir}/urlfilter-suffix/src/test/" />
         <source path="${plugins.dir}/urlfilter-validator/src/java/" />
         <source path="${plugins.dir}/urlfilter-validator/src/test/" />
+        <source path="${plugins.dir}/parsefilter-naivebayes/src/java/" />
         <source path="${plugins.dir}/urlmeta/src/java/" />
         <source path="${plugins.dir}/urlnormalizer-basic/src/java/" />
         <source path="${plugins.dir}/urlnormalizer-basic/src/test/" />

Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1688084&r1=1688083&r2=1688084&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Mon Jun 29 05:14:45 2015
@@ -1109,7 +1109,7 @@
   default Nutch includes crawling just HTML and plain text via HTTP,
   and basic indexing and search plugins. In order to use HTTPS please enable 
   protocol-httpclient, but be aware of possible intermittent problems with the 
-  underlying commons-httpclient library.
+  underlying commons-httpclient library. Add parsefilter-naivebayes to enable a classification-based focused crawler.
   </description>
 </property>
 
@@ -1207,6 +1207,29 @@
   </description>
 </property>
 
+<property>
+  <name>parsefilter.naivebayes.trainfile</name>
+  <value>naivebayes-train.txt</value>
+  <description>Set the name of the file to be used for Naive Bayes training. The format is as follows:
+Each line contains two tab-separated parts.
+The two columns/parts are:
+1. "1" or "0": "1" for a relevant and "0" for an irrelevant document.
+2. Text (the text that will be used for training).
+
+Each row will be considered a new "document" for the classifier.
+CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this classifier.
+
+  </description>
+</property>
+
+<property>
+  <name>parsefilter.naivebayes.wordlist</name>
+  <value>wordlist.txt</value>
+  <description>Put the name of the file you want to be used as a list of 
+  important words to be matched in the url for the model filter. The format should be one word per line.
+  </description>
+</property>
+
 <property>
   <name>parser.timeout</name>
   <value>30</value>

Modified: nutch/trunk/ivy/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1688084&r1=1688083&r2=1688084&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Mon Jun 29 05:14:45 2015
@@ -96,6 +96,10 @@
 		<dependency org="org.mortbay.jetty" name="jetty-util" rev="6.1.22"
 			conf="test->default" />
 
+                <!-- naive bayes parse filter -->
+                <dependency org="org.apache.mahout.commons" name="commons-cli" rev="2.0-mahout"
+                        conf="test->default" />
+
 		<!--global exclusion -->
 		<exclude module="jmxtools" />
 		<exclude module="jms" />

Modified: nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1688084&r1=1688083&r2=1688084&view=diff
==============================================================================
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Mon Jun 29 05:14:45 2015
@@ -72,6 +72,7 @@
      <ant dir="urlfilter-regex" target="deploy"/>
      <ant dir="urlfilter-suffix" target="deploy"/>
      <ant dir="urlfilter-validator" target="deploy"/>
+     <ant dir="parsefilter-naivebayes" target="deploy"/>
      <ant dir="urlmeta" target="deploy"/>
      <ant dir="urlnormalizer-ajax" target="deploy"/>
      <ant dir="urlnormalizer-basic" target="deploy"/>
@@ -176,6 +177,7 @@
     <ant dir="urlfilter-regex" target="clean"/>
     <ant dir="urlfilter-suffix" target="clean"/>
     <ant dir="urlfilter-validator" target="clean"/>
+    <ant dir="parsefilter-naivebayes" target="clean" />
     <ant dir="urlmeta" target="clean"/>
     <ant dir="urlnormalizer-ajax" target="clean"/>
     <ant dir="urlnormalizer-basic" target="clean"/>

Added: nutch/trunk/src/plugin/parsefilter-naivebayes/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/build.xml?rev=1688084&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-naivebayes/build.xml (added)
+++ nutch/trunk/src/plugin/parsefilter-naivebayes/build.xml Mon Jun 29 05:14:45 2015
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parsefilter-naivebayes" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

Added: nutch/trunk/src/plugin/parsefilter-naivebayes/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/ivy.xml?rev=1688084&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-naivebayes/ivy.xml (added)
+++ nutch/trunk/src/plugin/parsefilter-naivebayes/ivy.xml Mon Jun 29 05:14:45 2015
@@ -0,0 +1,47 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <dependency org="org.apache.mahout" name="mahout-math" rev="0.8" />
+                <dependency org="org.apache.mahout" name="mahout-core" rev="0.8" />
+                <dependency org="org.apache.lucene" name="lucene-core" rev="4.3.0" />
+                <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="4.3.0" />
+     <exclude org="com.thoughtworks.xstream"/>
+    <exclude org="org.apache.mrunit"/>           
+  </dependencies>
+  
+</ivy-module>

Added: nutch/trunk/src/plugin/parsefilter-naivebayes/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/plugin.xml?rev=1688084&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-naivebayes/plugin.xml (added)
+++ nutch/trunk/src/plugin/parsefilter-naivebayes/plugin.xml Mon Jun 29 05:14:45 2015
@@ -0,0 +1,45 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="parsefilter-naivebayes"
+   name="Naive Bayes Parse Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="parsefilter-naivebayes.jar">
+         <export name="*"/>
+      </library>
+      <library name="lucene-analyzers-common-4.3.0.jar"/>
+      <library name="mahout-math-0.8.jar"/>
+      <library name="mahout-core-0.8.jar"/>
+      <library name="lucene-core-4.3.0.jar"/>     
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.htmlparsefilter.naivebayes"
+              name="Nutch Parser Filter"
+              point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="NaiveBayesHTMLParseFilter"
+                      class="org.apache.nutch.parsefilter.naivebayes.NaiveBayesParseFilter"/>
+   </extension>
+
+</plugin>

Added: nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java?rev=1688084&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java (added)
+++ nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesClassifier.java Mon Jun 29 05:14:45 2015
@@ -0,0 +1,230 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parsefilter.naivebayes;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.SequenceFile.Writer;
+import org.apache.hadoop.io.Text;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Version;
+import org.apache.mahout.classifier.naivebayes.BayesUtils;
+import org.apache.mahout.classifier.naivebayes.NaiveBayesModel;
+import org.apache.mahout.classifier.naivebayes.StandardNaiveBayesClassifier;
+import org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
+import org.apache.mahout.math.RandomAccessSparseVector;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.Vector.Element;
+import org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles;
+import org.apache.mahout.vectorizer.TFIDF;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.collect.ConcurrentHashMultiset;
+import com.google.common.collect.Multiset;
+
+public class NaiveBayesClassifier {
+
+  private static NaiveBayesModel model = null;
+  private static final Logger LOG = LoggerFactory
+      .getLogger(NaiveBayesClassifier.class);
+
+  /**
+   * Loads a term dictionary from a sequence file into memory.
+   *
+   * @param conf
+   *          Hadoop configuration used to open the sequence file
+   * @param dictionnaryPath
+   *          location of the sequence file holding (Text, IntWritable) pairs
+   * @return map from each key's string form to its integer value
+   */
+  public static Map<String, Integer> readDictionnary(Configuration conf,
+      Path dictionnaryPath) {
+    SequenceFileIterable<Text, IntWritable> entries = new SequenceFileIterable<Text, IntWritable>(
+        dictionnaryPath, true, conf);
+    Map<String, Integer> termIds = new HashMap<String, Integer>();
+    for (Pair<Text, IntWritable> entry : entries) {
+      Text term = entry.getFirst();
+      IntWritable id = entry.getSecond();
+      termIds.put(term.toString(), id.get());
+    }
+    return termIds;
+  }
+
+  /**
+   * Loads document-frequency counts from a sequence file into memory.
+   *
+   * @param conf
+   *          Hadoop configuration used to open the sequence file
+   * @param documentFrequencyPath
+   *          location of the sequence file holding (IntWritable, LongWritable)
+   *          pairs
+   * @return map from each integer key to its long value
+   */
+  public static Map<Integer, Long> readDocumentFrequency(Configuration conf,
+      Path documentFrequencyPath) {
+    SequenceFileIterable<IntWritable, LongWritable> entries = new SequenceFileIterable<IntWritable, LongWritable>(
+        documentFrequencyPath, true, conf);
+    Map<Integer, Long> counts = new HashMap<Integer, Long>();
+    for (Pair<IntWritable, LongWritable> entry : entries) {
+      IntWritable id = entry.getFirst();
+      LongWritable count = entry.getSecond();
+      counts.put(id.get(), count.get());
+    }
+    return counts;
+  }
+
+  /**
+   * Builds the classifier artifacts from a training file in three steps: the
+   * training file is converted to a sequence file under "outseq",
+   * SparseVectorsFromSequenceFiles is run from "outseq" to "vectors", and
+   * TrainNaiveBayesJob is run on "vectors/tfidf-vectors" with "model" as
+   * output and "labelindex" as label index.
+   *
+   * @param inputTrainFilePath
+   *          name of the training file handed to convertToSeq
+   * @throws Exception
+   *           if the conversion or either of the two jobs fails
+   */
+  public static void createModel(String inputTrainFilePath) throws Exception {
+
+    // command line for SparseVectorsFromSequenceFiles
+    String[] vectorizerArgs = { "-i", "outseq", "-o", "vectors" };
+
+    // command line for TrainNaiveBayesJob
+    String[] trainerArgs = { "-i", "vectors/tfidf-vectors", "-el", "-li",
+        "labelindex", "-o", "model", "-ow", "-c" };
+
+    convertToSeq(inputTrainFilePath, "outseq");
+    SparseVectorsFromSequenceFiles.main(vectorizerArgs);
+    TrainNaiveBayesJob.main(trainerArgs);
+  }
+
+  /**
+   * Classifies the text using the artifacts at the fixed relative paths
+   * "model", "labelindex", "vectors/dictionary.file-0" and
+   * "vectors/df-count/part-r-00000".
+   *
+   * @param text
+   *          the text to classify
+   * @return the label returned by the five-argument classify method
+   * @throws IOException
+   *           if the five-argument classify method fails
+   */
+  public static String classify(String text) throws IOException {
+    String modelDir = "model";
+    String labelIndex = "labelindex";
+    String dictionaryFile = "vectors/dictionary.file-0";
+    String documentFrequencyFile = "vectors/df-count/part-r-00000";
+    return classify(text, modelDir, labelIndex, dictionaryFile,
+        documentFrequencyFile);
+  }
+
+  /**
+   * Classifies the text with a Naive Bayes model: the text is tokenized, the
+   * tokens found in the dictionary are weighted with TF-IDF, and the label
+   * with the highest classifier score is returned.
+   *
+   * @param text
+   *          the text to classify
+   * @param modelPath
+   *          path of the trained model; only read while no model is cached in
+   *          the static field
+   * @param labelIndexPath
+   *          path of the label index
+   * @param dictionaryPath
+   *          path of the dictionary sequence file (word to word id)
+   * @param documentFrequencyPath
+   *          path of the document-frequency sequence file (word id to count);
+   *          the entry under key -1 is used as the total document count
+   * @return the label of the best scoring category, or null when the label
+   *         index has no entry for it
+   * @throws IOException
+   *           if tokenizing fails or the document-frequency file has no entry
+   *           for key -1; failures while reading the model or the auxiliary
+   *           files propagate as thrown by the underlying readers
+   */
+  public static String classify(String text, String modelPath,
+      String labelIndexPath, String dictionaryPath, String documentFrequencyPath)
+      throws IOException {
+
+    Configuration configuration = new Configuration();
+
+    // model is a matrix (wordId, labelId) => probability score; it is
+    // materialized on the first call only and kept in the static field
+    if (model == null) {
+      model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
+    }
+    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(
+        model);
+
+    // labels is a map classId => label
+    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration,
+        new Path(labelIndexPath));
+    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(
+        dictionaryPath));
+    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
+        new Path(documentFrequencyPath));
+
+    // fail with a descriptive error instead of a NullPointerException when
+    // the total document count is missing
+    Long totalDocuments = documentFrequency.get(-1);
+    if (totalDocuments == null) {
+      throw new IOException("No document count (key -1) found in "
+          + documentFrequencyPath);
+    }
+    int documentCount = totalDocuments.intValue();
+
+    Multiset<String> words = ConcurrentHashMultiset.create();
+    int wordCount = 0;
+
+    // analyzer used to extract words from text; both the analyzer and the
+    // token stream are closed even when tokenizing throws
+    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
+    try {
+      TokenStream ts = analyzer.tokenStream("text", new StringReader(text));
+      try {
+        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+        ts.reset();
+        while (ts.incrementToken()) {
+          if (termAtt.length() > 0) {
+            String word = termAtt.toString();
+            Integer wordId = dictionary.get(word);
+            // if the word is not in the dictionary, skip it
+            if (wordId != null) {
+              words.add(word);
+              wordCount++;
+            }
+          }
+        }
+        ts.end();
+      } finally {
+        ts.close();
+      }
+    } finally {
+      analyzer.close();
+    }
+
+    // create vector wordId => weight using tfidf
+    Vector vector = new RandomAccessSparseVector(10000);
+    TFIDF tfidf = new TFIDF();
+    for (Multiset.Entry<String> entry : words.entrySet()) {
+      Integer wordId = dictionary.get(entry.getElement());
+      Long freq = documentFrequency.get(wordId);
+      if (freq == null) {
+        // no document frequency recorded for this word id, so no weight can
+        // be computed for it
+        continue;
+      }
+      double tfIdfValue = tfidf.calculate(entry.getCount(), freq.intValue(),
+          wordCount, documentCount);
+      vector.setQuick(wordId, tfIdfValue);
+    }
+
+    // one score for each label; keep the category with the highest score
+    Vector resultVector = classifier.classifyFull(vector);
+    double bestScore = -Double.MAX_VALUE;
+    int bestCategoryId = -1;
+    for (Element element : resultVector.all()) {
+      double score = element.get();
+      if (score > bestScore) {
+        bestScore = score;
+        bestCategoryId = element.index();
+      }
+    }
+
+    return labels.get(bestCategoryId);
+
+  }
+
+  /**
+   * Converts a tab separated training file into a sequence file at
+   * {@code outputDirName + "/chunk-0"}. Each input line of the form
+   * {@code category<TAB>text} becomes one record with key
+   * {@code "/" + category + "/" + lineNumber} and the text as value. Lines
+   * without a tab are skipped.
+   *
+   * @param inputFileName
+   *          name of the training file, looked up as a configuration resource
+   * @param outputDirName
+   *          directory in which the "chunk-0" sequence file is written
+   * @throws IOException
+   *           if the training file cannot be found or reading/writing fails
+   */
+  static void convertToSeq(String inputFileName, String outputDirName)
+      throws IOException {
+    Configuration configuration = new Configuration();
+
+    // open the input first so that no output is created when it is missing
+    java.io.Reader source = configuration
+        .getConfResourceAsReader(inputFileName);
+    if (source == null) {
+      throw new IOException(
+          "Training file not found as a configuration resource: "
+              + inputFileName);
+    }
+
+    BufferedReader reader = new BufferedReader(source);
+    try {
+      FileSystem fs = FileSystem.get(configuration);
+      Writer writer = new SequenceFile.Writer(fs, configuration, new Path(
+          outputDirName + "/chunk-0"), Text.class, Text.class);
+      try {
+        Text key = new Text();
+        Text value = new Text();
+        long uniqueid = 0;
+        String line;
+        while ((line = reader.readLine()) != null) {
+          // the id follows the line number, so skipped lines consume an id too
+          uniqueid++;
+          String[] tokens = line.split("\t", 2);
+          if (tokens.length != 2) {
+            continue;
+          }
+          key.set("/" + tokens[0] + "/" + uniqueid);
+          value.set(tokens[1]);
+          writer.append(key, value);
+        }
+      } finally {
+        writer.close();
+      }
+    } finally {
+      reader.close();
+    }
+
+  }
+
+}

Added: nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java?rev=1688084&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java (added)
+++ nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java Mon Jun 29 05:14:45 2015
@@ -0,0 +1,203 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parsefilter.naivebayes;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.DocumentFragment;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.ParseText;
+import org.apache.nutch.protocol.Content;
+
+import java.io.Reader;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.util.ArrayList;
+
+/**
+ * Html Parse filter that classifies the outlinks from the parseresult as
+ * relevant or irrelevant based on the parseText's relevancy (using a training
+ * file where you can give positive and negative example texts; see the
+ * description of parsefilter.naivebayes.trainfile). If the page is found
+ * irrelevant, each outlink gets a second chance: it is kept if it contains any
+ * of the words from the list given in parsefilter.naivebayes.wordlist.
+ * CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using
+ * this classifier.
+ */
+public class NaiveBayesParseFilter implements HtmlParseFilter {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(NaiveBayesParseFilter.class);
+
+  public static final String TRAINFILE_MODELFILTER = "parsefilter.naivebayes.trainfile";
+  public static final String DICTFILE_MODELFILTER = "parsefilter.naivebayes.wordlist";
+
+  private Configuration conf;
+  private String inputFilePath;
+  private String dictionaryFile;
+  private ArrayList<String> wordlist = new ArrayList<String>();
+
+  /**
+   * Tells whether the text is classified as relevant. An IOException raised
+   * during classification is logged and treated as "not relevant".
+   *
+   * @param text
+   *          the text to classify
+   * @return the result of {@link #classify(String)}, or false if it threw an
+   *         IOException
+   */
+  public boolean filterParse(String text) {
+    boolean relevant = false;
+    try {
+      relevant = classify(text);
+    } catch (IOException e) {
+      LOG.error("Error occured while classifying:: " + text + " ::"
+          + StringUtils.stringifyException(e));
+    }
+    return relevant;
+  }
+
+  /**
+   * Tells whether the url contains any word of the configured word list.
+   *
+   * @param url
+   *          the url to inspect
+   * @return true if at least one word of the word list occurs in the url
+   */
+  public boolean filterUrl(String url) {
+    boolean matched = containsWord(url, wordlist);
+    return matched;
+  }
+
+  /**
+   * Classifies the text with {@link NaiveBayesClassifier#classify(String)}.
+   *
+   * @param text
+   *          the text to classify
+   * @return true only if the classifier returns the label "1" (relevant); a
+   *         null label counts as not relevant
+   * @throws IOException
+   *           if the classifier fails
+   */
+  public boolean classify(String text) throws IOException {
+    // constant-first comparison: the label comes from a map lookup and may be
+    // null, which must not end in a NullPointerException
+    return "1".equals(NaiveBayesClassifier.classify(text));
+  }
+
+  /**
+   * Trains the Naive Bayes model from the configured training file unless a
+   * path named "model" already exists on the configured file system.
+   *
+   * @throws Exception
+   *           if the file system cannot be queried or the training fails
+   */
+  public void train() throws Exception {
+    FileSystem fs = FileSystem.get(conf);
+    boolean modelExists = fs.exists(new Path("model"));
+    if (modelExists) {
+      LOG.info("Model file already exists. Skipping training.");
+      return;
+    }
+    LOG.info("Training the Naive Bayes Model");
+    NaiveBayesClassifier.createModel(inputFilePath);
+  }
+
+  /**
+   * Checks whether the url contains at least one of the given words as a
+   * substring.
+   *
+   * @param url
+   *          the url to inspect
+   * @param wordlist
+   *          the words to look for
+   * @return true as soon as one word is found in the url, false if none is
+   */
+  public boolean containsWord(String url, ArrayList<String> wordlist) {
+    boolean found = false;
+    for (int i = 0; i < wordlist.size() && !found; i++) {
+      found = url.contains(wordlist.get(i));
+    }
+    return found;
+  }
+
+  /**
+   * Stores the configuration, loads the word list and trains the model if
+   * needed.
+   *
+   * The training file and the word list are named by the properties
+   * parsefilter.naivebayes.trainfile and parsefilter.naivebayes.wordlist and
+   * are looked up as configuration resources. Errors while reading the word
+   * list or while training are logged and do not abort the configuration.
+   *
+   * @param conf
+   *          the configuration to use
+   * @throws IllegalArgumentException
+   *           if one of the two properties is unset or blank, or if one of
+   *           the two files cannot be found as a configuration resource
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    inputFilePath = conf.get(TRAINFILE_MODELFILTER);
+    dictionaryFile = conf.get(DICTFILE_MODELFILTER);
+    if (inputFilePath == null || inputFilePath.trim().length() == 0
+        || dictionaryFile == null || dictionaryFile.trim().length() == 0) {
+      String message = "ParseFilter: NaiveBayes: trainfile or wordlist not set in the "
+          + TRAINFILE_MODELFILTER + " or " + DICTFILE_MODELFILTER;
+      LOG.error(message);
+      throw new IllegalArgumentException(message);
+    }
+
+    // Both files are read through getConfResourceAsReader, so their presence
+    // is checked the same way: as configuration resources. A missing file is
+    // the error case.
+    if (conf.getResource(inputFilePath) == null
+        || conf.getResource(dictionaryFile) == null) {
+      String message = "ParseFilter: NaiveBayes: " + inputFilePath + " or "
+          + dictionaryFile + " not found!";
+      LOG.error(message);
+      throw new IllegalArgumentException(message);
+    }
+
+    try {
+      Reader reader = conf.getConfResourceAsReader(dictionaryFile);
+      if (reader == null) {
+        throw new IOException("Unable to open " + dictionaryFile);
+      }
+      BufferedReader br = new BufferedReader(reader);
+      try {
+        String currentLine;
+        while ((currentLine = br.readLine()) != null) {
+          // an empty word would be contained in every url, so blank lines
+          // are ignored
+          String word = currentLine.trim();
+          if (word.length() > 0) {
+            wordlist.add(word);
+          }
+        }
+      } finally {
+        br.close();
+      }
+    } catch (IOException e) {
+      LOG.error(StringUtils.stringifyException(e));
+    }
+
+    try {
+      train();
+    } catch (Exception e) {
+      LOG.error("Error occured while training:: "
+          + StringUtils.stringifyException(e));
+    }
+
+  }
+
+  /**
+   * @return the configuration last passed to setConf, or null if none was set
+   */
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Keeps all outlinks of a page whose text is classified as relevant. For a
+   * page classified as irrelevant, only the outlinks whose target url passes
+   * {@link #filterUrl(String)} are kept; the others are removed from the
+   * parse data.
+   *
+   * @param content
+   *          the fetched content; supplies the url used to look up the parse
+   * @param parseResult
+   *          the parse result holding the parse to inspect
+   * @param metaTags
+   *          not used by this filter
+   * @param doc
+   *          not used by this filter
+   * @return the same parseResult instance, possibly with reduced outlinks
+   */
+  @Override
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+
+    Parse parse = parseResult.get(content.getUrl());
+    String url = content.getBaseUrl();
+    String text = parse.getText();
+
+    if (filterParse(text)) {
+      LOG.info("ParseFilter: NaiveBayes: Page found relevent:: " + url);
+      return parseResult;
+    }
+
+    // second tier: the parent page is irrelevant, so every outlink has to
+    // qualify on its own through the word list
+    LOG.info("ParseFilter: NaiveBayes: Page found irrelevent:: " + url);
+    LOG.info("Checking outlinks");
+
+    ArrayList<Outlink> kept = new ArrayList<Outlink>();
+    for (Outlink candidate : parse.getData().getOutlinks()) {
+      String target = candidate.getToUrl();
+      LOG.info("ParseFilter: NaiveBayes: Outlink to check:: " + target);
+      if (filterUrl(target)) {
+        kept.add(candidate);
+        LOG.info("ParseFilter: NaiveBayes: found relevent");
+
+      } else {
+        LOG.info("ParseFilter: NaiveBayes: found irrelevent");
+      }
+    }
+    parse.getData().setOutlinks(kept.toArray(new Outlink[kept.size()]));
+
+    return parseResult;
+  }
+
+}

Added: nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/package-info.java?rev=1688084&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/package-info.java (added)
+++ nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/package-info.java Mon Jun 29 05:14:45 2015
@@ -0,0 +1,28 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Html Parse filter that classifies the outlinks from the parseresult as
+ * relevant or irrelevant based on the parseText's relevancy (using a training
+ * file where you can give positive and negative example texts; see the
+ * description of parsefilter.naivebayes.trainfile). If the page is found
+ * irrelevant, each outlink gets a second chance: it is kept if it contains
+ * any of the words from the list given in parsefilter.naivebayes.wordlist.
+ * CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when
+ * using this classifier.
+ */
+package org.apache.nutch.parsefilter.naivebayes;
+