You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ja...@apache.org on 2013/01/31 13:58:41 UTC
svn commit: r1440940 - in /lucene/dev/trunk/solr: ./
core/src/java/org/apache/solr/update/processor/
core/src/test-files/solr/collection1/conf/ core/src/test-files/solr/conf/
core/src/test/org/apache/solr/update/processor/
Author: janhoy
Date: Thu Jan 31 12:58:40 2013
New Revision: 1440940
URL: http://svn.apache.org/viewvc?rev=1440940&view=rev
Log:
SOLR-2827: RegexpBoost Update Processor
Added:
lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessor.java (with props)
lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessorFactory.java (with props)
lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/regex-boost-processor-test.txt (with props)
lucene/dev/trunk/solr/core/src/test-files/solr/conf/
lucene/dev/trunk/solr/core/src/test/org/apache/solr/update/processor/RegexBoostProcessorTest.java (with props)
Modified:
lucene/dev/trunk/solr/CHANGES.txt
Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1440940&r1=1440939&r2=1440940&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Thu Jan 31 12:58:40 2013
@@ -66,6 +66,8 @@ New Features
* SOLR-4043: Add ability to get success/failure responses from Collections API.
(Raintung Li, Mark Miller)
+* SOLR-2827: RegexpBoost Update Processor (janhoy)
+
Bug Fixes
----------------------
Added: lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessor.java?rev=1440940&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessor.java (added)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessor.java Thu Jan 31 12:58:40 2013
@@ -0,0 +1,211 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.update.processor;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+import org.apache.commons.io.IOUtils;
+
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.update.AddUpdateCommand;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * An update processor that matches the content of "inputField" against the regular
+ * expressions found in "boostFilename" and writes the resulting boost to "boostField"
+ * as a double value. A pattern has to match the whole field value. If more than one
+ * pattern matches, the boosts of all matching patterns are multiplied; if no pattern
+ * matches, the boost is 1.0. Documents without the input field are left untouched.
+ * <p>
+ * A typical use case may be to match a URL against patterns to boost or deboost
+ * web documents based on the URL itself:
+ * <pre>
+ * # Format of each line: &lt;pattern&gt;&lt;whitespace&gt;&lt;boost&gt;
+ * # Example:
+ * https?://my.domain.com/temp.* 0.2
+ * </pre>
+ * <p>
+ * Parameters: "boostFilename" is mandatory (the processor disables itself without it),
+ * "inputField" defaults to "url", "boostField" defaults to "urlboost" and the boolean
+ * "enabled" defaults to true.
+ */
+public class RegexpBoostProcessor extends UpdateRequestProcessor {
+
+  protected static final String INPUT_FIELD_PARAM = "inputField";
+  protected static final String BOOST_FIELD_PARAM = "boostField";
+  protected static final String BOOST_FILENAME_PARAM = "boostFilename";
+  private static final String DEFAULT_INPUT_FIELDNAME = "url";
+  private static final String DEFAULT_BOOST_FIELDNAME = "urlboost";
+  /** Key under which the parsed boost entries are stored in the shared object cache. */
+  private static final String BOOST_ENTRIES_CACHE_KEY = "boost-entries";
+
+  private static final Logger log = LoggerFactory.getLogger(RegexpBoostProcessor.class);
+
+  private boolean enabled = true;
+  private String inputFieldname = DEFAULT_INPUT_FIELDNAME;
+  private String boostFieldname = DEFAULT_BOOST_FIELDNAME;
+  private String boostFilename;
+  private List<BoostEntry> boostEntries = new ArrayList<BoostEntry>();
+
+  /**
+   * Creates the processor and loads the boost entries, either from the shared cache or,
+   * when the cache holds none yet, from the configured boost file.
+   *
+   * @param parameters        processor configuration; may be null, in which case no boost
+   *                          filename is known and the processor disables itself
+   * @param request           request whose core's resource loader is used to open the boost file
+   * @param response          not used by this constructor
+   * @param nextProcessor     next processor in the chain
+   * @param sharedObjectCache map in which the parsed boost entries are cached; every access
+   *                          made here is synchronized on the map itself
+   */
+  RegexpBoostProcessor(SolrParams parameters,
+                       SolrQueryRequest request,
+                       SolrQueryResponse response,
+                       UpdateRequestProcessor nextProcessor,
+                       final Map<Object, Object> sharedObjectCache) {
+    super(nextProcessor);
+    this.initParameters(parameters);
+
+    if (this.boostFilename == null) {
+      log.warn("Null boost filename. Disabling processor.");
+      setEnabled(false);
+    }
+
+    if (!isEnabled()) {
+      return;
+    }
+
+    try {
+      synchronized (sharedObjectCache) {
+        // Only this class stores a value under BOOST_ENTRIES_CACHE_KEY, always as List<BoostEntry>.
+        @SuppressWarnings("unchecked")
+        List<BoostEntry> cachedBoostEntries =
+            (List<BoostEntry>) sharedObjectCache.get(BOOST_ENTRIES_CACHE_KEY);
+
+        if (cachedBoostEntries == null) {
+          log.debug("No pre-cached boost entry list found, initializing new");
+          InputStream is = request.getCore().getResourceLoader().openResource(boostFilename);
+          cachedBoostEntries = initBoostEntries(is);
+          sharedObjectCache.put(BOOST_ENTRIES_CACHE_KEY, cachedBoostEntries);
+        } else {
+          log.debug("Using cached boost entry list with {} elements.", cachedBoostEntries.size());
+        }
+
+        this.boostEntries = cachedBoostEntries;
+      }
+    } catch (IOException ioe) {
+      // Best effort: the processor stays enabled with an empty entry list, so every
+      // document carrying the input field gets the neutral boost 1.0.
+      log.warn("IOException while initializing boost entries from file " + this.boostFilename, ioe);
+    }
+  }
+
+  /** Reads the configuration; a null parameter object leaves all defaults in place. */
+  private void initParameters(SolrParams parameters) {
+    if (parameters != null) {
+      this.setEnabled(parameters.getBool("enabled", true));
+      this.inputFieldname = parameters.get(INPUT_FIELD_PARAM, DEFAULT_INPUT_FIELDNAME);
+      this.boostFieldname = parameters.get(BOOST_FIELD_PARAM, DEFAULT_BOOST_FIELDNAME);
+      this.boostFilename = parameters.get(BOOST_FILENAME_PARAM);
+    }
+  }
+
+  /**
+   * Parses the boost rules from the given stream, which is read as UTF-8 and always closed.
+   * Comments (a line starting with '#', or whitespace followed by '#' up to the end of the
+   * line) and empty lines are ignored. Lines that do not consist of exactly two
+   * whitespace-separated fields, or whose regexp or boost cannot be parsed, are logged
+   * and skipped.
+   *
+   * @param is stream positioned at the start of the rules file
+   * @return the parsed entries in file order, possibly empty
+   * @throws IOException if reading from the stream fails
+   */
+  private List<BoostEntry> initBoostEntries(InputStream is) throws IOException {
+    List<BoostEntry> newBoostEntries = new ArrayList<BoostEntry>();
+
+    BufferedReader reader = new BufferedReader(new InputStreamReader(is, Charset.forName("UTF-8")));
+    try {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        // Remove comments
+        line = line.replaceAll("\\s+#.*$", "");
+        line = line.replaceAll("^#.*$", "");
+
+        // Trim so that leading whitespace does not produce an empty first field below
+        line = line.trim();
+
+        // Skip empty lines or comment lines
+        if (line.length() == 0) {
+          continue;
+        }
+
+        String[] fields = line.split("\\s+");
+        if (fields.length != 2) {
+          log.warn("Malformed config input line: " + line + " (expected 2 fields, got " + fields.length + " fields). Skipping entry.");
+          continue;
+        }
+
+        String regexp = fields[0];
+        String boost = fields[1];
+        try {
+          newBoostEntries.add(new BoostEntry(Pattern.compile(regexp), Double.parseDouble(boost)));
+          log.debug("Read regexp {} with boost {}", regexp, boost);
+        } catch (IllegalArgumentException e) {
+          // Covers both PatternSyntaxException and NumberFormatException: one bad rule
+          // must not prevent the processor from being created.
+          log.warn("Invalid regexp or boost in config input line: " + line + ". Skipping entry.", e);
+        }
+      }
+    } finally {
+      IOUtils.closeQuietly(reader);
+    }
+
+    return newBoostEntries;
+  }
+
+  /**
+   * Applies the boost rules to the document when the processor is enabled and then
+   * hands the command to the superclass implementation.
+   */
+  @Override
+  public void processAdd(AddUpdateCommand command) throws IOException {
+    if (isEnabled()) {
+      processBoost(command);
+    }
+    super.processAdd(command);
+  }
+
+  /**
+   * Computes the boost for the document of the given command and stores it in the boost
+   * field. Nothing is written when the document lacks the input field or the field
+   * value is null. Non-String values are matched on their string form.
+   *
+   * @param command add command whose document is inspected and updated
+   */
+  public void processBoost(AddUpdateCommand command) {
+    SolrInputDocument document = command.getSolrInputDocument();
+    if (!document.containsKey(inputFieldname)) {
+      return;
+    }
+
+    Object rawValue = document.getFieldValue(inputFieldname);
+    if (rawValue == null) {
+      return;
+    }
+    String value = rawValue.toString();
+
+    double boost = 1.0;
+    for (BoostEntry boostEntry : boostEntries) {
+      if (boostEntry.getPattern().matcher(value).matches()) {
+        log.debug("Pattern match {} for {}", boostEntry.getPattern().pattern(), value);
+        // Both factors are scaled by 1000 before multiplying, apparently to avoid
+        // floating-point artifacts in the product of simple decimal boosts
+        // (e.g. to get 0.15 for 0.1 and 1.5). Kept exactly as is.
+        boost = (boostEntry.getBoost() * 1000) * (boost * 1000) / 1000000;
+      }
+    }
+    document.setField(boostFieldname, boost);
+
+    log.debug("Value {}, applied to field {}", boost, boostFieldname);
+  }
+
+  /** Returns whether this processor applies boosts in {@link #processAdd}. */
+  public boolean isEnabled() {
+    return enabled;
+  }
+
+  /** Switches boost processing on or off. */
+  public void setEnabled(boolean enabled) {
+    this.enabled = enabled;
+  }
+
+  /** Immutable pair of a compiled pattern and the boost applied when it matches. */
+  private static class BoostEntry {
+
+    private final Pattern pattern;
+    private final double boost;
+
+    public BoostEntry(Pattern pattern, double d) {
+      this.pattern = pattern;
+      this.boost = d;
+    }
+
+    public Pattern getPattern() {
+      return pattern;
+    }
+
+    public double getBoost() {
+      return boost;
+    }
+  }
+}
Added: lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessorFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessorFactory.java?rev=1440940&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessorFactory.java (added)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/update/processor/RegexpBoostProcessorFactory.java Thu Jan 31 12:58:40 2013
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.update.processor;
+
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+
+/**
+ * Factory which creates {@link RegexpBoostProcessor} instances.
+ * <p>
+ * The factory owns one object cache and hands it to every processor it creates.
+ * The processor stores its parsed boost rules in that cache, so the rules file is
+ * parsed when the first processor is created and reused by later ones.
+ */
+public class RegexpBoostProcessorFactory extends UpdateRequestProcessorFactory {
+
+  /** Cache handed to every processor created by this factory. */
+  private final Map<Object, Object> sharedObjectCache = new HashMap<Object, Object>();
+
+  /** Processor configuration; stays null when {@link #init} received no arguments. */
+  private SolrParams params;
+
+  /**
+   * Stores the configuration for the processors created later.
+   *
+   * @param args configuration arguments; null leaves the configuration unset
+   */
+  @Override
+  public void init(@SuppressWarnings("rawtypes") final NamedList args) {
+    if (args == null) {
+      return;
+    }
+    params = SolrParams.toSolrParams(args);
+  }
+
+  /**
+   * Creates a new processor that uses this factory's configuration and shared cache.
+   */
+  @Override
+  public UpdateRequestProcessor getInstance(SolrQueryRequest request,
+                                            SolrQueryResponse response,
+                                            UpdateRequestProcessor nextProcessor) {
+    final RegexpBoostProcessor processor =
+        new RegexpBoostProcessor(params, request, response, nextProcessor, sharedObjectCache);
+    return processor;
+  }
+}
Added: lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/regex-boost-processor-test.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/regex-boost-processor-test.txt?rev=1440940&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/regex-boost-processor-test.txt (added)
+++ lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/regex-boost-processor-test.txt Thu Jan 31 12:58:40 2013
@@ -0,0 +1,10 @@
+# Sample config file for RegexpBoostProcessor
+# This example applies boost on the "url" field to boost or deboost certain urls
+# All rules are evaluated, and if several of them match, the boosts are multiplied.
+# If for example one rule with boost 2.0 and one rule with boost 0.1 match, the resulting urlboost=0.2
+
+https?://[^/]+/old/.* 0.1 #Comments are removed
+https?://[^/]+/.*index\([0-9]\).html$ 0.5
+
+# Prioritize certain sites over others
+https?://www.mydomain.no/.* 1.5
\ No newline at end of file
Added: lucene/dev/trunk/solr/core/src/test/org/apache/solr/update/processor/RegexBoostProcessorTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/test/org/apache/solr/update/processor/RegexBoostProcessorTest.java?rev=1440940&view=auto
==============================================================================
--- lucene/dev/trunk/solr/core/src/test/org/apache/solr/update/processor/RegexBoostProcessorTest.java (added)
+++ lucene/dev/trunk/solr/core/src/test/org/apache/solr/update/processor/RegexBoostProcessorTest.java Thu Jan 31 12:58:40 2013
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.update.processor;
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.params.ModifiableSolrParams;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.servlet.SolrRequestParsers;
+import org.apache.solr.update.AddUpdateCommand;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Tests {@link RegexpBoostProcessor} against the rules in regex-boost-processor-test.txt.
+ */
+public class RegexBoostProcessorTest extends SolrTestCaseJ4 {
+  private static RegexpBoostProcessor reProcessor;
+  protected static SolrRequestParsers _parser;
+  protected static ModifiableSolrParams parameters;
+  private static RegexpBoostProcessorFactory factory;
+  private SolrInputDocument document;
+
+  /** Starts the test core and builds one processor that is shared by all tests. */
+  @BeforeClass
+  public static void setUpBeforeClass() throws Exception {
+    initCore("solrconfig.xml", "schema12.xml");
+    SolrCore core = h.getCore();
+    _parser = new SolrRequestParsers(null);
+
+    parameters = new ModifiableSolrParams();
+    parameters.set(RegexpBoostProcessor.BOOST_FILENAME_PARAM, "regex-boost-processor-test.txt");
+    parameters.set(RegexpBoostProcessor.INPUT_FIELD_PARAM, "url");
+    parameters.set(RegexpBoostProcessor.BOOST_FIELD_PARAM, "urlboost");
+
+    SolrQueryRequest request = _parser.buildRequestFrom(core, new ModifiableSolrParams(), null);
+    SolrQueryResponse noResponse = null;
+
+    factory = new RegexpBoostProcessorFactory();
+    factory.init(parameters.toNamedList());
+    reProcessor = (RegexpBoostProcessor) factory.getInstance(request, noResponse, null);
+  }
+
+  /** Gives every test a fresh, empty document. */
+  @Override
+  @Before
+  public void setUp() throws Exception {
+    super.setUp();
+    document = new SolrInputDocument();
+  }
+
+  /** A URL matching no rule gets the neutral boost. */
+  @Test
+  public void testNoBoost() throws Exception {
+    assertEquals(1.0d, boostFor("http://www.nomatch.no"));
+  }
+
+  /** Each of the two deboost rules applies on its own. */
+  @Test
+  public void testDeboostOld() throws Exception {
+    assertEquals(0.1d, boostFor("http://www.somedomain.no/old/test.html"));
+
+    // Test the other deboost rule
+    document = new SolrInputDocument();
+    assertEquals(0.5d, boostFor("http://www.somedomain.no/foo/index(1).html"));
+  }
+
+  /** The site rule boosts a matching URL. */
+  @Test
+  public void testBoostGood() throws Exception {
+    assertEquals(1.5d, boostFor("http://www.mydomain.no/fifty-percent-boost"));
+  }
+
+  /** When two rules match, their boosts are multiplied. */
+  @Test
+  public void testTwoRules() throws Exception {
+    assertEquals(0.15d, boostFor("http://www.mydomain.no/old/test.html"));
+  }
+
+  /**
+   * Fills the current document with an id and the given URL, runs it through the
+   * processor and returns the value of the "urlboost" field.
+   */
+  private Object boostFor(String url) throws Exception {
+    document.addField("id", "doc1");
+    document.addField("url", url);
+
+    processAdd(document);
+
+    return document.getFieldValue("urlboost");
+  }
+
+  /** Wraps the document in an add command and passes it to the processor under test. */
+  private void processAdd(SolrInputDocument doc) throws Exception {
+    AddUpdateCommand add = new AddUpdateCommand(null);
+    add.solrDoc = doc;
+    reProcessor.processAdd(add);
+  }
+
+}