You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2015/07/01 07:59:11 UTC
svn commit: r1688552 - in /nutch/trunk: ./ conf/
src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/
src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/
Author: mattmann
Date: Wed Jul 1 05:59:10 2015
New Revision: 1688552
URL: http://svn.apache.org/r1688552
Log:
Add mattmann for unit test for NUTCH-2038 to pass.
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/naivebayes-wordlist.txt.template
nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java
nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1688552&r1=1688551&r2=1688552&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jul 1 05:59:10 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
+* NUTCH-2052 Enhance index-static to allow configurable delimiters (Peter Ciuffetti via mattmann)
+
* NUTCH-2038 fix for NUTCH-2038: Naive Bayes classifier based html Parse filter (for filtering outlinks)
(Asitang Mishra, snagel via mattmann)
Modified: nutch/trunk/conf/naivebayes-wordlist.txt.template
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/naivebayes-wordlist.txt.template?rev=1688552&r1=1688551&r2=1688552&view=diff
==============================================================================
--- nutch/trunk/conf/naivebayes-wordlist.txt.template (original)
+++ nutch/trunk/conf/naivebayes-wordlist.txt.template Wed Jul 1 05:59:10 2015
@@ -1,4 +1,5 @@
nutch
funny
happy
-search
\ No newline at end of file
+search
+mattmann
\ No newline at end of file
Modified: nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java?rev=1688552&r1=1688551&r2=1688552&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java (original)
+++ nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java Wed Jul 1 05:59:10 2015
@@ -40,6 +40,9 @@ public class StaticFieldIndexer implemen
private Configuration conf;
private HashMap<String, String[]> fields;
private boolean addStaticFields = false;
+ private String fieldSep = ",";
+ private String kevSep = ":";
+ private String valueSep = " ";
/**
* The {@link StaticFieldIndexer} filter object which adds fields as per
@@ -85,10 +88,10 @@ public class StaticFieldIndexer implemen
* The format is very easy, it's a comma-separated list of fields in the
* form <name>:<value>
*/
- for (String field : fieldsString.split(",")) {
- String[] entry = field.split(":");
+ for (String field : fieldsString.split(this.fieldSep)) {
+ String[] entry = field.split(this.kevSep);
if (entry.length == 2)
- fields.put(entry[0].trim(), entry[1].trim().split(" "));
+ fields.put(entry[0].trim(), entry[1].trim().split(this.valueSep));
}
return fields;
@@ -99,6 +102,12 @@ public class StaticFieldIndexer implemen
*/
public void setConf(Configuration conf) {
this.conf = conf;
+
+ // NUTCH-2052: Allow user-defined delimiters in index.static
+ this.fieldSep = conf.get("index.static.fieldsep", ",");
+ this.kevSep = conf.get("index.static.keysep", ":");
+ this.valueSep = conf.get("index.static.valuesep", " ");
+
String fieldsString = conf.get("index.static", null);
if (fieldsString != null) {
this.addStaticFields = true;
Modified: nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java?rev=1688552&r1=1688551&r2=1688552&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java (original)
+++ nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java Wed Jul 1 05:59:10 2015
@@ -115,4 +115,42 @@ public class TestStaticFieldIndexerTest
Assert.assertTrue("test if doc has field4", doc.getField("field4")
.getValues().contains("val4"));
}
+
+ /**
+ * Test for NUTCH-2052: index.static is parsed with custom field (">"), key ("=") and value ("|") separators.
+ * NOTE(review): the separators are handed to String.split as regexes and "|" is a regex metacharacter - confirm it is escaped.
+ * @throws Exception
+ */
+ @Test
+ public void testCustomDelimiters() throws Exception {
+
+ conf.set("index.static.fieldsep", ">"); // separates one name/value entry from the next
+ conf.set("index.static.keysep", "="); // separates a field name from its value list
+ conf.set("index.static.valuesep", "|"); // separates the individual values of one field
+ conf.set("index.static",
+ "field1=val1>field2 = val2|val3 >field3>field4 =val4 > "); // "field3" and the trailing blank carry no "=", so they are expected to be skipped
+ Assert.assertNotNull(filter);
+ filter.setConf(conf); // setConf re-reads the separators before parsing index.static
+
+ NutchDocument doc = new NutchDocument();
+
+ try {
+ filter.filter(doc, parse, url, crawlDatum, inlinks);
+ } catch (Exception e) {
+ e.printStackTrace();
+ Assert.fail(e.getMessage());
+ }
+
+ Assert.assertNotNull(doc);
+ Assert.assertFalse("test if doc is not empty", doc.getFieldNames()
+ .isEmpty());
+ Assert.assertEquals("test if doc has 3 fields", 3, doc.getFieldNames()
+ .size()); // the three fields checked below: field1, field2 and field4
+ Assert.assertTrue("test if doc has field1", doc.getField("field1")
+ .getValues().contains("val1"));
+ Assert.assertTrue("test if doc has field2", doc.getField("field2")
+ .getValues().contains("val2"));
+ Assert.assertTrue("test if doc has field4", doc.getField("field4")
+ .getValues().contains("val4"));
+ }
}