You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ja...@apache.org on 2013/01/31 13:58:41 UTC

svn commit: r1440940 - in /lucene/dev/trunk/solr: ./ core/src/java/org/apache/solr/update/processor/ core/src/test-files/solr/collection1/conf/ core/src/test-files/solr/conf/ core/src/test/org/apache/solr/update/processor/

Author: janhoy
Date: Thu Jan 31 12:58:40 2013
New Revision: 1440940

URL: http://svn.apache.org/viewvc?rev=1440940&view=rev
Log:
SOLR-2827: RegexpBoost Update Processor

Added:
    lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessor.java   (with props)
    lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessorFactory.java   (with props)
    lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/regex-boost-processor-test.txt   (with props)
    lucene/dev/trunk/solr/core/src/test-files/solr/conf/
    lucene/dev/trunk/solr/core/src/test/org/apache/solr/update/processor/RegexBoostProcessorTest.java   (with props)
Modified:
    lucene/dev/trunk/solr/CHANGES.txt

Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1440940&r1=1440939&r2=1440940&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Thu Jan 31 12:58:40 2013
@@ -66,6 +66,8 @@ New Features
 * SOLR-4043: Add ability to get success/failure responses from Collections API.
   (Raintung Li, Mark Miller)
 
+* SOLR-2827: RegexpBoost Update Processor (janhoy)
+
 Bug Fixes
 ----------------------
 

Added: lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessor.java?rev=1440940&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessor.java (added)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessor.java Thu Jan 31 12:58:40 2013
@@ -0,0 +1,211 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.update.processor;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+import org.apache.commons.io.IOUtils;
+
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.update.AddUpdateCommand;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A processor which will match content of "inputField" against regular expressions
+ * found in "boostFilename", and if it matches will return the corresponding boost
+ * value from the file and output this to "boostField" as a double value.
+ * If more than one pattern matches, the boosts from each are multiplied.
+ * If no pattern matches, a boost of 1.0 is written.
+ * <p>
+ * A typical use case may be to match a URL against patterns to boost or deboost
+ * web documents based on the URL itself:
+ * <pre>
+ * # Format of each line: &lt;pattern&gt;&lt;TAB&gt;&lt;boost&gt;
+ * # (any run of whitespace is accepted as the separator)
+ * # Example:
+ * https?://my.domain.com/temp.*  0.2
+ * </pre>
+ * <p>
+ * Parameters:
+ * <ul>
+ *   <li>"boostFilename" is mandatory; when it is missing the processor logs a
+ *       warning and disables itself.</li>
+ *   <li>"inputField" is optional and defaults to "url".</li>
+ *   <li>"boostField" is optional and defaults to "urlboost".</li>
+ *   <li>"enabled" is optional and defaults to true.</li>
+ * </ul>
+ */
+public class RegexpBoostProcessor extends UpdateRequestProcessor {
+
+  protected static final String INPUT_FIELD_PARAM = "inputField";
+  protected static final String BOOST_FIELD_PARAM = "boostField";
+  protected static final String BOOST_FILENAME_PARAM = "boostFilename";
+  private static final String DEFAULT_INPUT_FIELDNAME = "url";
+  private static final String DEFAULT_BOOST_FIELDNAME = "urlboost";
+
+  private static final Logger log = LoggerFactory.getLogger(RegexpBoostProcessor.class);
+
+  private boolean enabled = true;
+  private String inputFieldname = DEFAULT_INPUT_FIELDNAME;
+  private String boostFieldname = DEFAULT_BOOST_FIELDNAME;
+  private String boostFilename;
+  private List<BoostEntry> boostEntries = new ArrayList<BoostEntry>();
+  private static final String BOOST_ENTRIES_CACHE_KEY = "boost-entries";
+
+  /**
+   * Creates the processor and loads the boost rules, either from the shared cache
+   * or, when the cache holds no rules yet, from the configured rules file.
+   *
+   * @param parameters        processor configuration; may be null, in which case no
+   *                          boost filename is known and the processor is disabled
+   * @param request           request whose core's resource loader is used to open the rules file
+   * @param response          not used by this processor
+   * @param nextProcessor     next processor in the chain
+   * @param sharedObjectCache cache in which the parsed rules are stored and looked up;
+   *                          access to it is synchronized on the map itself
+   */
+  RegexpBoostProcessor(SolrParams parameters,
+                       SolrQueryRequest request,
+                       SolrQueryResponse response,
+                       UpdateRequestProcessor nextProcessor,
+                       final Map<Object, Object> sharedObjectCache) {
+    super(nextProcessor);
+    this.initParameters(parameters);
+
+    if (this.boostFilename == null) {
+      log.warn("Null boost filename.  Disabling processor.");
+      setEnabled(false);
+    }
+
+    if (!isEnabled()) {
+      return;
+    }
+
+    try {
+      synchronized (sharedObjectCache) {
+        // Only this class stores a value under BOOST_ENTRIES_CACHE_KEY, and it is
+        // always a List<BoostEntry>, so the cast is safe.
+        @SuppressWarnings("unchecked")
+        List<BoostEntry> cachedBoostEntries =
+            (List<BoostEntry>) sharedObjectCache.get(BOOST_ENTRIES_CACHE_KEY);
+
+        if (cachedBoostEntries == null) {
+          log.debug("No pre-cached boost entry list found, initializing new");
+          InputStream is = request.getCore().getResourceLoader().openResource(boostFilename);
+          // initBoostEntries closes the stream when it is done
+          cachedBoostEntries = initBoostEntries(is);
+          sharedObjectCache.put(BOOST_ENTRIES_CACHE_KEY, cachedBoostEntries);
+        } else {
+          log.debug("Using cached boost entry list with {} elements.", cachedBoostEntries.size());
+        }
+
+        this.boostEntries = cachedBoostEntries;
+      }
+    } catch (IOException ioe) {
+      // The processor stays enabled with an empty rule list; nothing is cached,
+      // so the next instance created will try to read the file again.
+      log.warn("IOException while initializing boost entries from file " + this.boostFilename, ioe);
+    }
+  }
+
+  /**
+   * Reads the "enabled", input field, boost field and boost filename settings.
+   * A null parameter object leaves all fields at their initial values.
+   */
+  private void initParameters(SolrParams parameters) {
+    if (parameters != null) {
+      this.setEnabled(parameters.getBool("enabled", true));
+      this.inputFieldname = parameters.get(INPUT_FIELD_PARAM, DEFAULT_INPUT_FIELDNAME);
+      this.boostFieldname = parameters.get(BOOST_FIELD_PARAM, DEFAULT_BOOST_FIELDNAME);
+      this.boostFilename = parameters.get(BOOST_FILENAME_PARAM);
+    }
+  }
+
+  /**
+   * Parses the rules file. Comments (a '#' at the start of a line or preceded by
+   * whitespace) and blank lines are ignored. Every remaining line must consist of
+   * exactly two whitespace separated fields: a regular expression and a boost.
+   * Lines that do not, or whose regular expression or boost cannot be parsed,
+   * are logged and skipped.
+   *
+   * @param is stream to read the rules from as UTF-8; always closed before returning
+   * @return the parsed entries in file order, possibly empty
+   * @throws IOException if reading from the stream fails
+   */
+  private List<BoostEntry> initBoostEntries(InputStream is) throws IOException {
+    List<BoostEntry> newBoostEntries = new ArrayList<BoostEntry>();
+
+    BufferedReader reader = new BufferedReader(new InputStreamReader(is, Charset.forName("UTF-8")));
+    try {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        // Remove comments
+        line = line.replaceAll("\\s+#.*$", "");
+        line = line.replaceAll("^#.*$", "");
+
+        // Trim before splitting: leading whitespace would otherwise produce an
+        // empty first field and make a valid rule look malformed.
+        line = line.trim();
+
+        // Skip empty lines or comment lines
+        if (line.length() == 0) {
+          continue;
+        }
+
+        String[] fields = line.split("\\s+");
+
+        if (fields.length != 2) {
+          log.warn("Malformed config input line: " + line + " (expected 2 fields, got " + fields.length + " fields).  Skipping entry.");
+          continue;
+        }
+
+        String regexp = fields[0];
+        String boost = fields[1];
+        try {
+          newBoostEntries.add(new BoostEntry(Pattern.compile(regexp), Double.parseDouble(boost)));
+          log.debug("Read regexp {} with boost {}", regexp, boost);
+        } catch (IllegalArgumentException e) {
+          // Covers both PatternSyntaxException and NumberFormatException, so that
+          // one bad rule does not abort loading of the whole file.
+          log.warn("Malformed config input line: " + line + " (invalid regexp or boost value).  Skipping entry.", e);
+        }
+      }
+    } finally {
+      IOUtils.closeQuietly(reader);
+    }
+
+    return newBoostEntries;
+  }
+
+  /**
+   * Applies the boost rules to the document (when the processor is enabled) and
+   * then hands the command to the rest of the chain.
+   */
+  @Override
+  public void processAdd(AddUpdateCommand command) throws IOException {
+    if (isEnabled()) {
+      processBoost(command);
+    }
+    super.processAdd(command);
+  }
+
+  /**
+   * Matches the value of the input field against every rule and stores the product
+   * of the boosts of all fully matching rules in the boost field (1.0 when no rule
+   * matches). Documents without the input field, or with a null value for it, are
+   * left untouched. Values that are not strings are matched via their
+   * {@code toString()} representation.
+   *
+   * @param command the add command whose document is inspected and updated
+   */
+  public void processBoost(AddUpdateCommand command) {
+    SolrInputDocument document = command.getSolrInputDocument();
+    if (!document.containsKey(inputFieldname)) {
+      return;
+    }
+
+    Object rawValue = document.getFieldValue(inputFieldname);
+    if (rawValue == null) {
+      // Nothing to match against
+      return;
+    }
+    String value = rawValue.toString();
+
+    double boost = 1.0;
+    for (BoostEntry boostEntry : boostEntries) {
+      if (boostEntry.getPattern().matcher(value).matches()) {
+        log.debug("Pattern match {} for {}", boostEntry.getPattern().pattern(), value);
+        // Multiply on values scaled by 1000 and scale back afterwards; presumably
+        // this is meant to reduce floating point noise, e.g. so that 0.1 and 1.5
+        // combine to exactly 0.15.
+        boost = (boostEntry.getBoost() * 1000) * (boost * 1000) / 1000000;
+      }
+    }
+    document.setField(boostFieldname, boost);
+
+    log.debug("Value {}, applied to field {}", boost, boostFieldname);
+  }
+
+  /** Returns whether this processor applies boosts at all. */
+  public boolean isEnabled() {
+    return enabled;
+  }
+
+  /** Enables or disables boosting; a disabled processor only forwards to the chain. */
+  public void setEnabled(boolean enabled) {
+    this.enabled = enabled;
+  }
+
+  /** One rule from the rules file: a compiled pattern and the boost it carries. */
+  private static class BoostEntry {
+
+    private final Pattern pattern;
+    private final double boost;
+
+    public BoostEntry(Pattern pattern, double d) {
+      this.pattern = pattern;
+      this.boost = d;
+    }
+
+    public Pattern getPattern() {
+      return pattern;
+    }
+
+    public double getBoost() {
+      return boost;
+    }
+  }
+}

Added: lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessorFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessorFactory.java?rev=1440940&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessorFactory.java (added)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessorFactory.java Thu Jan 31 12:58:40 2013
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.update.processor;
+
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+
+/**
+ * Factory which creates {@link RegexpBoostProcessor} instances.
+ * <p>
+ * The factory owns a shared object cache that is handed to every processor it
+ * creates. The processor stores the parsed rules in that cache, so the rules
+ * file is only parsed when the cache does not yet hold them.
+ */
+public class RegexpBoostProcessorFactory extends UpdateRequestProcessorFactory {
+
+    /** Cache shared by all processors created by this factory. */
+    private final Map<Object, Object> sharedObjectCache = new HashMap<Object, Object>();
+
+    /** Configuration captured in {@link #init}; stays null when no arguments were given. */
+    private SolrParams params;
+
+    /**
+     * Remembers the configured arguments for the processors created later.
+     *
+     * @param args the factory configuration; a null value is ignored
+     */
+    @Override
+    public void init(@SuppressWarnings("rawtypes") final NamedList args) {
+        if (args == null) {
+            return;
+        }
+        params = SolrParams.toSolrParams(args);
+    }
+
+    /**
+     * Creates a new processor bound to this factory's parameters and shared cache.
+     */
+    @Override
+    public UpdateRequestProcessor getInstance(SolrQueryRequest request,
+            SolrQueryResponse response,
+            UpdateRequestProcessor nextProcessor) {
+
+        final RegexpBoostProcessor processor =
+                new RegexpBoostProcessor(params, request, response, nextProcessor, sharedObjectCache);
+        return processor;
+    }
+}

Added: lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/regex-boost-processor-test.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/regex-boost-processor-test.txt?rev=1440940&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/regex-boost-processor-test.txt (added)
+++ lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/regex-boost-processor-test.txt Thu Jan 31 12:58:40 2013
@@ -0,0 +1,10 @@
+# Sample config file for RegexpBoostProcessor
+# This example applies boost on the "url" field to boost or deboost certain urls
+# All rules are evaluated, and if several of them match, the boosts are multiplied.
+# If for example one rule with boost 2.0 and one rule with boost 0.1 match, the resulting urlboost=0.2
+
+https?://[^/]+/old/.* 0.1		#Comments are removed
+https?://[^/]+/.*index\([0-9]\).html$	0.5
+
+# Prioritize certain sites over others
+https?://www.mydomain.no/.*	1.5
\ No newline at end of file

Added: lucene/dev/trunk/solr/core/src/test/org/apache/solr/update/processor/RegexBoostProcessorTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/update/processor/RegexBoostProcessorTest.java?rev=1440940&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/update/processor/RegexBoostProcessorTest.java (added)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/update/processor/RegexBoostProcessorTest.java Thu Jan 31 12:58:40 2013
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.update.processor;
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.params.ModifiableSolrParams;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.servlet.SolrRequestParsers;
+import org.apache.solr.update.AddUpdateCommand;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Tests {@link RegexpBoostProcessor} against the rules in
+ * regex-boost-processor-test.txt, using "url" as input field and "urlboost"
+ * as boost field.
+ */
+public class RegexBoostProcessorTest extends SolrTestCaseJ4 {
+  private static RegexpBoostProcessor reProcessor;
+  protected static SolrRequestParsers _parser;
+  protected static ModifiableSolrParams parameters;
+  private static RegexpBoostProcessorFactory factory;
+  private SolrInputDocument document;
+
+  /**
+   * Starts a core and builds a single processor instance shared by all tests.
+   */
+  @BeforeClass
+  public static void setUpBeforeClass() throws Exception {
+    initCore("solrconfig.xml", "schema12.xml");
+    SolrCore core = h.getCore();
+    _parser = new SolrRequestParsers( null );
+    parameters = new ModifiableSolrParams();
+    parameters.set(RegexpBoostProcessor.BOOST_FILENAME_PARAM, "regex-boost-processor-test.txt");
+    parameters.set(RegexpBoostProcessor.INPUT_FIELD_PARAM, "url");
+    parameters.set(RegexpBoostProcessor.BOOST_FIELD_PARAM, "urlboost");
+    SolrQueryRequest req = _parser.buildRequestFrom(core, new ModifiableSolrParams(), null);
+    try {
+      factory = new RegexpBoostProcessorFactory();
+      factory.init(parameters.toNamedList());
+      // The processor does not use a response, so none is supplied
+      reProcessor = (RegexpBoostProcessor) factory.getInstance(req, null, null);
+    } finally {
+      // The processor only needs the request while it is being constructed
+      req.close();
+    }
+  }
+
+  /** Gives every test a fresh, empty document. */
+  @Override
+  @Before
+  public void setUp() throws Exception {
+    super.setUp();
+    document = new SolrInputDocument();
+  }
+
+  /** A URL matching no rule gets the neutral boost 1.0. */
+  @Test
+  public void testNoBoost() throws Exception {
+    document.addField("id", "doc1");
+    document.addField("url", "http://www.nomatch.no");
+
+    processAdd(document);
+
+    assertEquals(1.0d, document.getFieldValue("urlboost"));
+  }
+
+  /** Each of the two deboost rules is applied on its own. */
+  @Test
+  public void testDeboostOld() throws Exception {
+    document.addField("id", "doc1");
+    document.addField("url", "http://www.somedomain.no/old/test.html");
+
+    processAdd(document);
+
+    assertEquals(0.1d, document.getFieldValue("urlboost"));
+
+    // Test the other deboost rule
+    document = new SolrInputDocument();
+    document.addField("id", "doc1");
+    document.addField("url", "http://www.somedomain.no/foo/index(1).html");
+
+    processAdd(document);
+
+    assertEquals(0.5d, document.getFieldValue("urlboost"));
+  }
+
+  /** A URL on the prioritized site gets the 1.5 boost. */
+  @Test
+  public void testBoostGood() throws Exception {
+    document.addField("id", "doc1");
+    document.addField("url", "http://www.mydomain.no/fifty-percent-boost");
+
+    processAdd(document);
+
+    assertEquals(1.5d, document.getFieldValue("urlboost"));
+  }
+
+  /** When two rules match, their boosts are multiplied: 0.1 * 1.5 = 0.15. */
+  @Test
+  public void testTwoRules() throws Exception {
+    document.addField("id", "doc1");
+    document.addField("url", "http://www.mydomain.no/old/test.html");
+
+    processAdd(document);
+
+    assertEquals(0.15d, document.getFieldValue("urlboost"));
+  }
+
+  /** Runs the document through the shared processor as an add command. */
+  private void processAdd(SolrInputDocument doc) throws Exception {
+    AddUpdateCommand addCommand = new AddUpdateCommand(null);
+    addCommand.solrDoc = doc;
+    reProcessor.processAdd(addCommand);
+  }
+
+}