You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by as...@apache.org on 2015/10/12 18:28:33 UTC

svn commit: r1708158 - in /nutch/trunk: ./ src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/

Author: asitang
Date: Mon Oct 12 16:28:33 2015
New Revision: 1708158

URL: http://svn.apache.org/viewvc?rev=1708158&view=rev
Log:
NUTCH-2136 Implement a different version of Naive Bayes Parse Filter. This closes #71.

Added:
    nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Classify.java
    nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Train.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1708158&r1=1708157&r2=1708158&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Oct 12 16:28:33 2015
@@ -2,6 +2,8 @@ Nutch Change Log
    
 Nutch Current Development 1.11-SNAPSHOT
 
+* NUTCH-2136 Implement a different version of Naive Bayes Parse Filter (Asitang Mishra)
+
 * NUTCH-2109 Create a brute force click-all-ajax-links utility fucntion for selenium interactive plugin (Asitang Mishra)
 
 * NUTCH-2108 Add a function to the selenium interactive plugin interface to do multiple manipulation of driver and then return the data (Asitang Mishra)

Added: nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Classify.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Classify.java?rev=1708158&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Classify.java (added)
+++ nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Classify.java Mon Oct 12 16:28:33 2015
@@ -0,0 +1,103 @@
+package org.apache.nutch.parsefilter.naivebayes;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.HashMap;
+import java.io.InputStreamReader;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * Naive Bayes text classifier: label "0" is irrelevant, "1" is relevant. The
+ * counts are read once from the "naivebayes-model" file and kept in statics.
+ */
+public class Classify {
+
+  /** Number of distinct words in the model (first line of the model file). */
+  private static int uniquewords_size = 0;
+  /** True once the model file has been read completely. */
+  private static boolean ismodel = false;
+
+  /** Example count, word count and word frequencies of class "0". */
+  private static int numof_ir = 0;
+  private static int numwords_ir = 0;
+  private static HashMap<String, Integer> wordfreq_ir = null;
+
+  /** Example count, word count and word frequencies of class "1". */
+  private static int numof_r = 0;
+  private static int numwords_r = 0;
+  private static HashMap<String, Integer> wordfreq_r = null;
+
+  /**
+   * Parses a "word:count,word:count" line into a map.
+   *
+   * @param line flattened map; null or empty yields an empty map
+   * @return map from word to its count
+   */
+  public static HashMap<String, Integer> unflattenToHashmap(String line) {
+    HashMap<String, Integer> dict = new HashMap<String, Integer>();
+    if (line != null && !line.isEmpty()) { // nothing to parse otherwise
+      for (String field : line.split(",")) {
+        String[] pair = field.split(":"); // split once, not once per use
+        dict.put(pair[0], Integer.valueOf(pair[1]));
+      }
+    }
+    return dict;
+  }
+
+  /** Reads the model file; the cache flag is set only after a full read. */
+  private static void loadModel() throws IOException {
+    FileSystem fs = FileSystem.get(new Configuration());
+    BufferedReader reader = new BufferedReader(new InputStreamReader(
+        fs.open(new Path("naivebayes-model"))));
+    try { // close the stream even when a line fails to parse
+      uniquewords_size = Integer.valueOf(reader.readLine());
+      reader.readLine(); // class label line ("0")
+      numof_ir = Integer.valueOf(reader.readLine());
+      numwords_ir = Integer.valueOf(reader.readLine());
+      wordfreq_ir = unflattenToHashmap(reader.readLine());
+      reader.readLine(); // class label line ("1")
+      numof_r = Integer.valueOf(reader.readLine());
+      numwords_r = Integer.valueOf(reader.readLine());
+      wordfreq_r = unflattenToHashmap(reader.readLine());
+      ismodel = true;
+    } finally {
+      reader.close();
+    }
+  }
+
+  /** Add-one smoothed log likelihood: log((count + 1) / (numwords + vocab)). */
+  private static double logLikelihood(HashMap<String, Integer> freq,
+      String word, int numwords) {
+    Integer count = freq.get(word);
+    return Math.log((count == null ? 0 : count) + 1.0)
+        - Math.log(numwords + uniquewords_size);
+  }
+
+  /**
+   * Classifies a text with the cached Naive Bayes model.
+   *
+   * @param line text to classify; only ASCII letters and spaces are kept
+   * @return "0" if the irrelevant class scores strictly higher, else "1"
+   * @throws IOException if the model file cannot be read
+   */
+  public static String classify(String line) throws IOException {
+    if (!ismodel) {
+      loadModel();
+    }
+    // start from the log class priors
+    double prob_ir = Math.log(numof_ir) - Math.log(numof_ir + numof_r);
+    double prob_r = Math.log(numof_r) - Math.log(numof_ir + numof_r);
+    String[] linearray = line.replaceAll("[^a-zA-Z ]", "").toLowerCase()
+        .split(" ");
+    for (String word : linearray) {
+      if (!word.isEmpty()) { // repeated spaces yield empty tokens, not words
+        prob_ir += logLikelihood(wordfreq_ir, word, numwords_ir);
+        prob_r += logLikelihood(wordfreq_r, word, numwords_r);
+      }
+    }
+    return prob_ir > prob_r ? "0" : "1";
+  }
+}

Modified: nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java?rev=1708158&r1=1708157&r2=1708158&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java (original)
+++ nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java Mon Oct 12 16:28:33 2015
@@ -39,7 +39,7 @@ import java.util.ArrayList;
  * Html Parse filter that classifies the outlinks from the parseresult as
  * relevant or irrelevant based on the parseText's relevancy (using a training
  * file where you can give positive and negative example texts see the
- * description of parsefilter.naivebayes.trainfile) and if found irrelevent it
+ * description of parsefilter.naivebayes.trainfile) and if found irrelevant it
  * gives the link a second chance if it contains any of the words from the list
  * given in parsefilter.naivebayes.wordlist. CAUTION: Set the parser.timeout to
  * -1 or a bigger value than 30, when using this classifier.
@@ -77,17 +77,17 @@ public class NaiveBayesParseFilter imple
 
   public boolean classify(String text) throws IOException {
 
-    // if classified as relevent "1" then return true
-    if (NaiveBayesClassifier.classify(text).equals("1"))
+    // if classified as relevant "1" then return true
+    if (Classify.classify(text).equals("1"))
       return true;
     return false;
   }
 
   public void train() throws Exception {
     // check if the model file exists, if it does then don't train
-    if (!FileSystem.get(conf).exists(new Path("model"))) {
+    if (!FileSystem.get(conf).exists(new Path("naivebayes-model"))) {
       LOG.info("Training the Naive Bayes Model");
-      NaiveBayesClassifier.createModel(inputFilePath);
+      Train.start(inputFilePath);
     } else {
       LOG.info("Model file already exists. Skipping training.");
     }
@@ -165,7 +165,7 @@ public class NaiveBayesParseFilter imple
 
     if (!filterParse(text)) { // kick in the second tier
       // if parent page found
-      // irrelevent
+      // irrelevant
       LOG.info("ParseFilter: NaiveBayes: Page found irrelevant:: " + url);
       LOG.info("Checking outlinks");
 
@@ -175,10 +175,10 @@ public class NaiveBayesParseFilter imple
             + parse.getData().getOutlinks()[i].getToUrl());
         if (filterUrl(parse.getData().getOutlinks()[i].getToUrl())) {
           tempOutlinks.add(parse.getData().getOutlinks()[i]);
-          LOG.info("ParseFilter: NaiveBayes: found relevent");
+          LOG.info("ParseFilter: NaiveBayes: found relevant");
 
         } else {
-          LOG.info("ParseFilter: NaiveBayes: found irrelevent");
+          LOG.info("ParseFilter: NaiveBayes: found irrelevant");
         }
       }
       out = new Outlink[tempOutlinks.size()];
@@ -188,7 +188,7 @@ public class NaiveBayesParseFilter imple
       parse.getData().setOutlinks(out);
 
     } else {
-      LOG.info("ParseFilter: NaiveBayes: Page found relevent:: " + url);
+      LOG.info("ParseFilter: NaiveBayes: Page found relevant:: " + url);
     }
 
     return parseResult;

Added: nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Train.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Train.java?rev=1708158&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Train.java (added)
+++ nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Train.java Mon Oct 12 16:28:33 2015
@@ -0,0 +1,131 @@
+package org.apache.nutch.parsefilter.naivebayes;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.HashMap;
+import java.util.HashSet;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+public class Train {
+
+  public static String replacefirstoccuranceof(String tomatch, String line) {
+
+    int index = line.indexOf(tomatch);
+    if (index == -1) {
+      return line;
+    } else {
+      return line.substring(0, index)
+          + line.substring(index + tomatch.length());
+    }
+
+  }
+
+  public static void updateHashMap(HashMap<String, Integer> dict, String key) {
+    if (!key.equals("")) {
+      if (dict.containsKey(key))
+        dict.put(key, dict.get(key) + 1);
+      else
+        dict.put(key, 1);
+    }
+  }
+
+  public static String flattenHashMap(HashMap<String, Integer> dict) {
+    String result = "";
+
+    for (String key : dict.keySet()) {
+
+      result += key + ":" + dict.get(key) + ",";
+    }
+
+    // remove the last comma
+    result = result.substring(0, result.length() - 1);
+
+    return result;
+  }
+
+  public static void start(String filepath) throws IOException {
+
+    // two classes 0/irrelevant and 1/relevant
+
+    // calculate the total number of instances/examples per class, word count in
+    // each class and for each class a word:frequency map
+
+    int numof_ir = 0;
+    int numof_r = 0;
+    int numwords_ir = 0;
+    int numwords_r = 0;
+    HashSet<String> uniquewords = new HashSet<String>();
+    HashMap<String, Integer> wordfreq_ir = new HashMap<String, Integer>();
+    HashMap<String, Integer> wordfreq_r = new HashMap<String, Integer>();
+
+    String line = "";
+    String target = "";
+    String[] linearray = null;
+
+    // read the line
+    Configuration configuration = new Configuration();
+    FileSystem fs = FileSystem.get(configuration);
+
+    BufferedReader bufferedReader = new BufferedReader(
+        configuration.getConfResourceAsReader(filepath));
+
+    while ((line = bufferedReader.readLine()) != null) {
+
+      target = line.split("\t")[0];
+
+      line = replacefirstoccuranceof(target + "\t", line);
+
+      linearray = line.replaceAll("[^a-zA-Z ]", "").toLowerCase().split(" ");
+
+      // update the data structures
+      if (target.equals("0")) {
+
+        numof_ir += 1;
+        numwords_ir += linearray.length;
+        for (int i = 0; i < linearray.length; i++) {
+          uniquewords.add(linearray[i]);
+          updateHashMap(wordfreq_ir, linearray[i]);
+        }
+      } else {
+
+        numof_r += 1;
+        numwords_r += linearray.length;
+        for (int i = 0; i < linearray.length; i++) {
+          uniquewords.add(linearray[i]);
+          updateHashMap(wordfreq_r, linearray[i]);
+        }
+
+      }
+
+    }
+
+    // write the model file
+
+    Path path = new Path("naivebayes-model");
+
+    Writer writer = new BufferedWriter(new OutputStreamWriter(fs.create(path,
+        true)));
+
+    writer.write(String.valueOf(uniquewords.size()) + "\n");
+    writer.write("0\n");
+    writer.write(String.valueOf(numof_ir) + "\n");
+    writer.write(String.valueOf(numwords_ir) + "\n");
+    writer.write(flattenHashMap(wordfreq_ir) + "\n");
+    writer.write("1\n");
+    writer.write(String.valueOf(numof_r) + "\n");
+    writer.write(String.valueOf(numwords_r) + "\n");
+    writer.write(flattenHashMap(wordfreq_r) + "\n");
+
+    writer.close();
+
+    bufferedReader.close();
+
+  }
+
+}