You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2015/07/03 18:24:05 UTC

svn commit: r1689044 - in /nutch/trunk/src/plugin/index-static/src: java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java

Author: mattmann
Date: Fri Jul  3 16:24:05 2015
New Revision: 1689044

URL: http://svn.apache.org/r1689044
Log:
Fix for NUTCH-2052: Enhance index-static to allow configurable delimiters. Contributed by PeterCiuffetti <pc...@astreetpress.com>; this closes #43.

Modified:
    nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java
    nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java

Modified: nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java?rev=1689044&r1=1689043&r2=1689044&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java (original)
+++ nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java Fri Jul  3 16:24:05 2015
@@ -40,6 +40,9 @@ public class StaticFieldIndexer implemen
   private Configuration conf;
   private HashMap<String, String[]> fields;
   private boolean addStaticFields = false;
+  private String fieldSep = ",";
+  private String kevSep = ":";
+  private String valueSep = " ";
 
   /**
    * The {@link StaticFieldIndexer} filter object which adds fields as per
@@ -85,10 +88,10 @@ public class StaticFieldIndexer implemen
      * The format is very easy, it's a comma-separated list of fields in the
      * form <name>:<value>
      */
-    for (String field : fieldsString.split(",")) {
-      String[] entry = field.split(":");
+    for (String field : fieldsString.split(this.fieldSep)) {
+      String[] entry = field.split(this.kevSep);
       if (entry.length == 2)
-        fields.put(entry[0].trim(), entry[1].trim().split(" "));
+        fields.put(entry[0].trim(), entry[1].trim().split(this.valueSep));
     }
 
     return fields;
@@ -99,6 +102,12 @@ public class StaticFieldIndexer implemen
    */
   public void setConf(Configuration conf) {
     this.conf = conf;
+
+    // NUTCH-2052: Allow user-defined delimiters in index.static
+    this.fieldSep = this.regexEscape(conf.get("index.static.fieldsep", ","));
+    this.kevSep = this.regexEscape(conf.get("index.static.keysep", ":"));
+    this.valueSep = this.regexEscape(conf.get("index.static.valuesep", " "));
+
     String fieldsString = conf.get("index.static", null);
     if (fieldsString != null) {
       this.addStaticFields = true;
@@ -112,4 +121,23 @@ public class StaticFieldIndexer implemen
   public Configuration getConf() {
     return this.conf;
   }
+
+  /**
+   * Backslash-escapes each regex metacharacter of {@code in} so the result
+   * matches literally in String.split(); a null {@code in} yields null.
+   */
+  protected String regexEscape(String in) {
+    if (in == null) {
+      return null;
+    }
+    // Local, single-threaded buffer: StringBuilder avoids StringBuffer locking.
+    StringBuilder escaped = new StringBuilder(in.length() * 2);
+    for (char c : in.toCharArray()) {
+      if ("<([{\\^-=$!|]})?*+.>".indexOf(c) >= 0) {
+        escaped.append('\\');
+      }
+      escaped.append(c);
+    }
+    return escaped.toString();
+  }
 }

Modified: nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java?rev=1689044&r1=1689043&r2=1689044&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java (original)
+++ nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java Fri Jul  3 16:24:05 2015
@@ -115,4 +115,80 @@ public class TestStaticFieldIndexerTest
     Assert.assertTrue("test if doc has field4", doc.getField("field4")
         .getValues().contains("val4"));
   }
+
+  /**
+   * NUTCH-2052: single-character custom delimiters ('>', '=', '|') are honoured
+   * when parsing index.static; the entry without a key separator ("field3")
+   * and the blank trailing entry must not produce fields.
+   *
+   * @throws Exception if the filter fails; propagated so JUnit reports the
+   *           full stack trace instead of only the message
+   */
+  @Test
+  public void testCustomDelimiters() throws Exception {
+    conf.set("index.static.fieldsep", ">");
+    conf.set("index.static.keysep", "=");
+    conf.set("index.static.valuesep", "|");
+    conf.set("index.static",
+        "field1=val1>field2    =      val2|val3     >field3>field4 =val4 > ");
+    Assert.assertNotNull(filter);
+    filter.setConf(conf);
+
+    NutchDocument doc = new NutchDocument();
+    // No try/catch: a failure surfaces through "throws Exception" with its
+    // full stack trace rather than a possibly-null message.
+    filter.filter(doc, parse, url, crawlDatum, inlinks);
+
+    Assert.assertNotNull(doc);
+    Assert.assertFalse("test if doc is not empty", doc.getFieldNames()
+        .isEmpty());
+    Assert.assertEquals("test if doc has 3 fields", 3, doc.getFieldNames()
+        .size());
+    Assert.assertTrue("test if doc has field1", doc.getField("field1")
+        .getValues().contains("val1"));
+    Assert.assertTrue("test if doc has field2", doc.getField("field2")
+        .getValues().contains("val2"));
+    Assert.assertTrue("test if field2 has its second value", doc.getField(
+        "field2").getValues().contains("val3"));
+    Assert.assertTrue("test if doc has field4", doc.getField("field4")
+        .getValues().contains("val4"));
+  }
+
+  /**
+   * NUTCH-2052: multi-character custom delimiters (two newlines, two tabs,
+   * "***") are honoured when parsing index.static; the entry without a key
+   * separator ("field3") and the trailing separators must not produce fields.
+   *
+   * @throws Exception if the filter fails; propagated so JUnit reports the
+   *           full stack trace instead of only the message
+   */
+  @Test
+  public void testCustomMulticharacterDelimiters() throws Exception {
+    conf.set("index.static.fieldsep", "\n\n");
+    conf.set("index.static.keysep", "\t\t");
+    conf.set("index.static.valuesep", "***");
+    conf.set("index.static", "field1\t\tval1\n\n" + "field2\t\tval2***val3\n\n"
+        + "field3\n\n" + "field4\t\tval4\n\n\n\n");
+    Assert.assertNotNull(filter);
+    filter.setConf(conf);
+
+    NutchDocument doc = new NutchDocument();
+    // No try/catch: a failure surfaces through "throws Exception" with its
+    // full stack trace rather than a possibly-null message.
+    filter.filter(doc, parse, url, crawlDatum, inlinks);
+
+    Assert.assertNotNull(doc);
+    Assert.assertFalse("test if doc is not empty", doc.getFieldNames()
+        .isEmpty());
+    Assert.assertEquals("test if doc has 3 fields", 3, doc.getFieldNames()
+        .size());
+    Assert.assertTrue("test if doc has field1", doc.getField("field1")
+        .getValues().contains("val1"));
+    Assert.assertTrue("test if doc has field2", doc.getField("field2")
+        .getValues().contains("val2"));
+    Assert.assertTrue("test if field2 has its second value", doc.getField(
+        "field2").getValues().contains("val3"));
+    Assert.assertTrue("test if doc has field4", doc.getField("field4")
+        .getValues().contains("val4"));
+  }
 }