You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by as...@apache.org on 2015/10/12 18:28:33 UTC
svn commit: r1708158 - in /nutch/trunk: ./
src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/
Author: asitang
Date: Mon Oct 12 16:28:33 2015
New Revision: 1708158
URL: http://svn.apache.org/viewvc?rev=1708158&view=rev
Log:
NUTCH-2136 Implement a different version of Naive Bayes Parse Filter this closes #71
Added:
nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Classify.java
nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Train.java
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1708158&r1=1708157&r2=1708158&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Oct 12 16:28:33 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
+* NUTCH-2136 Implement a different version of Naive Bayes Parse Filter
+
* NUTCH-2109 Create a brute force click-all-ajax-links utility function for selenium interactive plugin (Asitang Mishra)
* NUTCH-2108 Add a function to the selenium interactive plugin interface to do multiple manipulation of driver and then return the data (Asitang Mishra)
Added: nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Classify.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Classify.java?rev=1708158&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Classify.java (added)
+++ nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Classify.java Mon Oct 12 16:28:33 2015
@@ -0,0 +1,103 @@
+package org.apache.nutch.parsefilter.naivebayes;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.HashMap;
+import java.io.InputStreamReader;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * Two-class Naive Bayes text classifier backed by a model file.
+ * <p>
+ * Class "0" means irrelevant and class "1" means relevant. The model is read
+ * once, on the first call to {@link #classify(String)}, from the file
+ * "naivebayes-model" on the FileSystem obtained from a default Hadoop
+ * Configuration, and is then cached in static fields.
+ * <p>
+ * Expected model layout, one value per line: vocabulary size, a label line
+ * (ignored), then for the irrelevant class the number of examples, the number
+ * of words and the flattened word:count map; then a second ignored label line
+ * followed by the same three values for the relevant class.
+ * <p>
+ * NOTE(review): the lazy model load is not synchronized.
+ */
+public class Classify {
+
+  /** Name of the model file, resolved against the default FileSystem. */
+  private static final String MODEL_FILE = "naivebayes-model";
+
+  private static int uniquewords_size = 0;
+
+  private static int numof_ir = 0;
+  private static int numwords_ir = 0;
+  private static HashMap<String, Integer> wordfreq_ir = null;
+
+  private static int numof_r = 0;
+  private static int numwords_r = 0;
+  private static HashMap<String, Integer> wordfreq_r = null;
+  private static boolean ismodel = false;
+
+  /**
+   * Parses a flattened "word:count,word:count,..." line into a map.
+   *
+   * @param line
+   *          the flattened map; null or empty yields an empty map
+   * @return a new map from word to count
+   * @throws IllegalArgumentException
+   *           if a field has no ":" separated count
+   * @throws NumberFormatException
+   *           if a count is not an integer
+   */
+  public static HashMap<String, Integer> unflattenToHashmap(String line) {
+    HashMap<String, Integer> dict = new HashMap<String, Integer>();
+
+    // a class with no words is stored as an empty line
+    if (line == null || line.isEmpty()) {
+      return dict;
+    }
+
+    for (String field : line.split(",")) {
+      String[] keyValue = field.split(":");
+      if (keyValue.length < 2) {
+        throw new IllegalArgumentException("Malformed model entry: '" + field
+            + "'");
+      }
+      dict.put(keyValue[0], Integer.valueOf(keyValue[1]));
+    }
+
+    return dict;
+  }
+
+  /**
+   * Classifies a piece of text as relevant or irrelevant.
+   * <p>
+   * The text is reduced to lower-case words made of the letters a-z, and each
+   * word contributes its add-one (Laplace) smoothed log-likelihood to both
+   * classes. The class log-priors are then added and the larger total wins;
+   * ties go to relevant.
+   *
+   * @param line
+   *          the text to classify
+   * @return "0" if the text is classified as irrelevant, "1" otherwise
+   * @throws IOException
+   *           if the model file cannot be read or is truncated
+   */
+  public static String classify(String line) throws IOException {
+
+    if (!ismodel) {
+      loadModel();
+    }
+
+    String[] linearray = line.replaceAll("[^a-zA-Z ]", "").toLowerCase()
+        .split(" ");
+
+    // the smoothing denominators do not depend on the word, compute them once
+    double logtotal_ir = Math.log((double) numwords_ir + uniquewords_size);
+    double logtotal_r = Math.log((double) numwords_r + uniquewords_size);
+
+    double prob_ir = 0;
+    double prob_r = 0;
+
+    for (String word : linearray) {
+      // consecutive spaces produce empty tokens; they are not words
+      if (word.isEmpty()) {
+        continue;
+      }
+      prob_ir += smoothedLogCount(wordfreq_ir, word) - logtotal_ir;
+      prob_r += smoothedLogCount(wordfreq_r, word) - logtotal_r;
+    }
+
+    // add the class log-priors
+    double logexamples = Math.log((double) numof_ir + numof_r);
+    prob_ir += Math.log(numof_ir) - logexamples;
+    prob_r += Math.log(numof_r) - logexamples;
+
+    return prob_ir > prob_r ? "0" : "1";
+  }
+
+  /**
+   * Returns log(count + 1) for the word, where count is 0 for unseen words.
+   */
+  private static double smoothedLogCount(HashMap<String, Integer> wordfreq,
+      String word) {
+    Integer count = wordfreq.get(word);
+    return Math.log((count == null ? 0 : count.intValue()) + 1.0);
+  }
+
+  /**
+   * Reads the model file into the static fields. The fields are only assigned
+   * after the whole file has been parsed, so a failed load leaves the
+   * classifier unloaded rather than half-initialized.
+   */
+  private static void loadModel() throws IOException {
+    Configuration configuration = new Configuration();
+    FileSystem fs = FileSystem.get(configuration);
+
+    try (BufferedReader bufferedReader = new BufferedReader(
+        new InputStreamReader(fs.open(new Path(MODEL_FILE))))) {
+
+      int uniquewords = Integer.parseInt(requireLine(bufferedReader));
+
+      requireLine(bufferedReader); // label line of the irrelevant class
+      int examples_ir = Integer.parseInt(requireLine(bufferedReader));
+      int words_ir = Integer.parseInt(requireLine(bufferedReader));
+      HashMap<String, Integer> freq_ir = unflattenToHashmap(
+          requireLine(bufferedReader));
+
+      requireLine(bufferedReader); // label line of the relevant class
+      int examples_r = Integer.parseInt(requireLine(bufferedReader));
+      int words_r = Integer.parseInt(requireLine(bufferedReader));
+      HashMap<String, Integer> freq_r = unflattenToHashmap(
+          requireLine(bufferedReader));
+
+      uniquewords_size = uniquewords;
+      numof_ir = examples_ir;
+      numwords_ir = words_ir;
+      wordfreq_ir = freq_ir;
+      numof_r = examples_r;
+      numwords_r = words_r;
+      wordfreq_r = freq_r;
+      ismodel = true;
+    }
+  }
+
+  /**
+   * Reads the next line, failing with an IOException if the file has ended.
+   */
+  private static String requireLine(BufferedReader reader) throws IOException {
+    String line = reader.readLine();
+    if (line == null) {
+      throw new IOException("Truncated model file: " + MODEL_FILE);
+    }
+    return line;
+  }
+
+}
Modified: nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java?rev=1708158&r1=1708157&r2=1708158&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java (original)
+++ nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java Mon Oct 12 16:28:33 2015
@@ -39,7 +39,7 @@ import java.util.ArrayList;
* Html Parse filter that classifies the outlinks from the parseresult as
* relevant or irrelevant based on the parseText's relevancy (using a training
* file where you can give positive and negative example texts see the
- * description of parsefilter.naivebayes.trainfile) and if found irrelevent it
+ * description of parsefilter.naivebayes.trainfile) and if found irrelevant it
* gives the link a second chance if it contains any of the words from the list
* given in parsefilter.naivebayes.wordlist. CAUTION: Set the parser.timeout to
* -1 or a bigger value than 30, when using this classifier.
@@ -77,17 +77,17 @@ public class NaiveBayesParseFilter imple
public boolean classify(String text) throws IOException {
- // if classified as relevent "1" then return true
- if (NaiveBayesClassifier.classify(text).equals("1"))
+ // if classified as relevant "1" then return true
+ if (Classify.classify(text).equals("1"))
return true;
return false;
}
public void train() throws Exception {
// check if the model file exists, if it does then don't train
- if (!FileSystem.get(conf).exists(new Path("model"))) {
+ if (!FileSystem.get(conf).exists(new Path("naivebayes-model"))) {
LOG.info("Training the Naive Bayes Model");
- NaiveBayesClassifier.createModel(inputFilePath);
+ Train.start(inputFilePath);
} else {
LOG.info("Model file already exists. Skipping training.");
}
@@ -165,7 +165,7 @@ public class NaiveBayesParseFilter imple
if (!filterParse(text)) { // kick in the second tier
// if parent page found
- // irrelevent
+ // irrelevant
LOG.info("ParseFilter: NaiveBayes: Page found irrelevant:: " + url);
LOG.info("Checking outlinks");
@@ -175,10 +175,10 @@ public class NaiveBayesParseFilter imple
+ parse.getData().getOutlinks()[i].getToUrl());
if (filterUrl(parse.getData().getOutlinks()[i].getToUrl())) {
tempOutlinks.add(parse.getData().getOutlinks()[i]);
- LOG.info("ParseFilter: NaiveBayes: found relevent");
+ LOG.info("ParseFilter: NaiveBayes: found relevant");
} else {
- LOG.info("ParseFilter: NaiveBayes: found irrelevent");
+ LOG.info("ParseFilter: NaiveBayes: found irrelevant");
}
}
out = new Outlink[tempOutlinks.size()];
@@ -188,7 +188,7 @@ public class NaiveBayesParseFilter imple
parse.getData().setOutlinks(out);
} else {
- LOG.info("ParseFilter: NaiveBayes: Page found relevent:: " + url);
+ LOG.info("ParseFilter: NaiveBayes: Page found relevant:: " + url);
}
return parseResult;
Added: nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Train.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Train.java?rev=1708158&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Train.java (added)
+++ nutch/trunk/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Train.java Mon Oct 12 16:28:33 2015
@@ -0,0 +1,131 @@
+package org.apache.nutch.parsefilter.naivebayes;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.HashMap;
+import java.util.HashSet;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * Builds the Naive Bayes model file from a labelled training file.
+ * <p>
+ * Each training line is "label&lt;TAB&gt;text". The label "0" marks an
+ * irrelevant example; any other label is treated as relevant. The resulting
+ * counts are written to the file "naivebayes-model".
+ */
+public class Train {
+
+  /**
+   * Removes the first occurrence of a substring from a line.
+   *
+   * @param tomatch
+   *          the substring to remove
+   * @param line
+   *          the line to search
+   * @return the line without the first occurrence of tomatch, or the line
+   *         itself if tomatch does not occur
+   */
+  public static String replacefirstoccuranceof(String tomatch, String line) {
+
+    int index = line.indexOf(tomatch);
+    if (index == -1) {
+      return line;
+    }
+    return line.substring(0, index) + line.substring(index + tomatch.length());
+  }
+
+  /**
+   * Increments the count stored for a key, starting at 1 for a new key. Empty
+   * keys are ignored.
+   *
+   * @param dict
+   *          the word to count map to update
+   * @param key
+   *          the word to count
+   */
+  public static void updateHashMap(HashMap<String, Integer> dict, String key) {
+    if (key.equals("")) {
+      return;
+    }
+    Integer count = dict.get(key);
+    dict.put(key, count == null ? 1 : count + 1);
+  }
+
+  /**
+   * Serializes a map as "key:value,key:value,...".
+   *
+   * @param dict
+   *          the map to flatten
+   * @return the flattened map, or an empty string for an empty map
+   */
+  public static String flattenHashMap(HashMap<String, Integer> dict) {
+    StringBuilder result = new StringBuilder();
+
+    for (String key : dict.keySet()) {
+      // separate entries with a comma, so there is no trailing one to strip
+      if (result.length() > 0) {
+        result.append(',');
+      }
+      result.append(key).append(':').append(dict.get(key));
+    }
+
+    return result.toString();
+  }
+
+  /**
+   * Trains the model from the given training file and writes it out,
+   * overwriting any existing model file.
+   * <p>
+   * For each of the two classes this counts the examples, the words, and the
+   * frequency of every word, plus the vocabulary size over both classes.
+   * Words are the lower-cased, space separated runs of the letters a-z left
+   * after all other characters are dropped. Blank lines are skipped.
+   *
+   * @param filepath
+   *          name of the training file, looked up as a configuration resource
+   * @throws IOException
+   *           if the training file is not found or cannot be read, or the
+   *           model cannot be written
+   */
+  public static void start(String filepath) throws IOException {
+
+    // two classes 0/irrelevant and 1/relevant
+
+    int numof_ir = 0;
+    int numof_r = 0;
+    int numwords_ir = 0;
+    int numwords_r = 0;
+    HashSet<String> uniquewords = new HashSet<String>();
+    HashMap<String, Integer> wordfreq_ir = new HashMap<String, Integer>();
+    HashMap<String, Integer> wordfreq_r = new HashMap<String, Integer>();
+
+    Configuration configuration = new Configuration();
+    FileSystem fs = FileSystem.get(configuration);
+
+    java.io.Reader source = configuration.getConfResourceAsReader(filepath);
+    if (source == null) {
+      throw new IOException("Training file not found: " + filepath);
+    }
+
+    try (BufferedReader bufferedReader = new BufferedReader(source)) {
+
+      String line;
+      while ((line = bufferedReader.readLine()) != null) {
+
+        // a blank line carries neither a label nor text
+        if (line.isEmpty()) {
+          continue;
+        }
+
+        // the label is everything before the first tab; a line without a
+        // tab is used as both label and text
+        int tab = line.indexOf('\t');
+        String target = tab == -1 ? line : line.substring(0, tab);
+        String text = tab == -1 ? line : line.substring(tab + 1);
+
+        String[] linearray = text.replaceAll("[^a-zA-Z ]", "").toLowerCase()
+            .split(" ");
+
+        boolean irrelevant = target.equals("0");
+        HashMap<String, Integer> wordfreq = irrelevant ? wordfreq_ir
+            : wordfreq_r;
+
+        // count only real words so the totals agree with the frequency maps
+        int numwords = 0;
+        for (String word : linearray) {
+          if (word.isEmpty()) {
+            continue;
+          }
+          uniquewords.add(word);
+          updateHashMap(wordfreq, word);
+          numwords += 1;
+        }
+
+        if (irrelevant) {
+          numof_ir += 1;
+          numwords_ir += numwords;
+        } else {
+          numof_r += 1;
+          numwords_r += numwords;
+        }
+      }
+    }
+
+    // write the model file
+
+    Path path = new Path("naivebayes-model");
+
+    try (Writer writer = new BufferedWriter(new OutputStreamWriter(fs.create(
+        path, true)))) {
+      writer.write(String.valueOf(uniquewords.size()) + "\n");
+      writer.write("0\n");
+      writer.write(String.valueOf(numof_ir) + "\n");
+      writer.write(String.valueOf(numwords_ir) + "\n");
+      writer.write(flattenHashMap(wordfreq_ir) + "\n");
+      writer.write("1\n");
+      writer.write(String.valueOf(numof_r) + "\n");
+      writer.write(String.valueOf(numwords_r) + "\n");
+      writer.write(flattenHashMap(wordfreq_r) + "\n");
+    }
+
+  }
+
+}