You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2009/11/25 21:58:11 UTC

svn commit: r884269 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java src/java/org/apache/nutch/indexer/solr/SolrWriter.java src/java/org/apache/nutch/searcher/SolrSearchBean.java

Author: ab
Date: Wed Nov 25 20:58:10 2009
New Revision: 884269

URL: http://svn.apache.org/viewvc?rev=884269&view=rev
Log:
NUTCH-760 Allow field mapping from nutch to solr index.

Added:
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java   (with props)
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java
    lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SolrSearchBean.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=884269&r1=884268&r2=884269&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Nov 25 20:58:10 2009
@@ -2,6 +2,8 @@
 
 Unreleased Changes
 
+* NUTCH-760 Allow field mapping from Nutch to Solr index (David Stuart, ab)
+
 * NUTCH-761 Avoid cloning CrawlDatum in CrawlDbReducer (Julien Nioche, ab)
 
 * NUTCH-753 Prevent new Fetcher from retrieving the robots twice (Julien Nioche via ab)

Added: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java?rev=884269&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java Wed Nov 25 20:58:10 2009
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.solr;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.util.HashMap;
+import java.util.Map;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.ObjectCache;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+public class SolrMappingReader {
+  public static Log LOG = LogFactory.getLog(SolrMappingReader.class);
+
+  /** The property name of the parse solr index mapping location */
+  private static final String SS_FILE_MAPPING = "solrindex.mapping.file";
+  
+  private Configuration conf;
+  
+  private Map<String, String> keyMap = new HashMap<String, String>();
+  private Map<String, String> copyMap = new HashMap<String, String>();
+  private String uniqueKey = "id";
+  
+  public static synchronized SolrMappingReader getInstance(Configuration conf) {
+    ObjectCache cache = ObjectCache.get(conf);
+    SolrMappingReader instance = (SolrMappingReader)cache.getObject(SolrMappingReader.class.getName());
+    if (instance == null) {
+      instance = new SolrMappingReader(conf);
+      cache.setObject(SolrMappingReader.class.getName(), instance);
+    }
+    return instance;
+  }
+
+  protected SolrMappingReader(Configuration conf) {
+    this.conf = conf;
+    parseMapping();
+  }
+
+  private void parseMapping() {    
+    InputStream ssInputStream = null;
+    ssInputStream = conf.getConfResourceAsInputStream(conf.get(SS_FILE_MAPPING, "solrindex-mapping.xml"));
+    InputSource inputSource = new InputSource(ssInputStream);
+    try {
+      DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+      DocumentBuilder builder = factory.newDocumentBuilder();
+      Document document = builder.parse(inputSource);
+      Element rootElement = document.getDocumentElement();
+      NodeList fieldList = rootElement.getElementsByTagName("field");
+      if (fieldList.getLength() > 0) {
+        for (int i = 0; i < fieldList.getLength(); i++) {
+          Element element = (Element) fieldList.item(i);
+          LOG.info("source: " + element.getAttribute("source") + " dest: " + element.getAttribute("dest"));
+          keyMap.put(element.getAttribute("source"), element.getAttribute("dest"));
+        }
+      }
+      NodeList copyFieldList = rootElement.getElementsByTagName("copyField");
+      if (copyFieldList.getLength() > 0) {
+        for (int i = 0; i < copyFieldList.getLength(); i++) {
+          Element element = (Element) copyFieldList.item(i);
+          LOG.info("source: " + element.getAttribute("source") + " dest: " + element.getAttribute("dest"));
+          copyMap.put(element.getAttribute("source"), element.getAttribute("dest"));
+        }
+      }
+      NodeList uniqueKeyItem = rootElement.getElementsByTagName("uniqueKey");
+      if (uniqueKeyItem.getLength() > 1) {
+        LOG.warn("More than one unique key definitions found in solr index mapping, using default 'id'");
+        uniqueKey = "id";
+      }
+      else if (uniqueKeyItem.getLength() == 0) {
+        LOG.warn("No unique key definition found in solr index mapping using, default 'id'");
+      }
+      else{
+    	  uniqueKey = uniqueKeyItem.item(0).getFirstChild().getNodeValue();
+      }
+    } catch (MalformedURLException e) {
+        LOG.warn(e.toString());
+    } catch (SAXException e) {
+        LOG.warn(e.toString());
+    } catch (IOException e) {
+    	LOG.warn(e.toString());
+    } catch (ParserConfigurationException e) {
+    	LOG.warn(e.toString());
+    } 
+  }
+	  
+  public Map<String, String> getKeyMap() {
+    return keyMap;
+  }
+	  
+  public Map<String, String> getCopyMap() {
+    return copyMap;
+  }
+	  
+  public String getUniqueKey() {
+    return uniqueKey;
+  }
+
+  public String hasCopy(String key) {
+    if (copyMap.containsKey(key)) {
+      key = (String) copyMap.get(key);
+    }
+    return key;
+  }
+
+  public String mapKey(String key) throws IOException {
+    if(keyMap.containsKey(key)) {
+      key = (String) keyMap.get(key);
+    }
+    return key;
+  }
+
+  public String mapCopyKey(String key) throws IOException {
+    if(copyMap.containsKey(key)) {
+      key = (String) copyMap.get(key);
+    }
+    return key;
+  }
+}

Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java?rev=884269&r1=884268&r2=884269&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java Wed Nov 25 20:58:10 2009
@@ -32,23 +32,28 @@
 public class SolrWriter implements NutchIndexWriter {
 
   private SolrServer solr;
+  private SolrMappingReader solrMapping;
 
   private final List<SolrInputDocument> inputDocs =
     new ArrayList<SolrInputDocument>();
 
   private int commitSize;
 
-  public void open(JobConf job, String name)
-  throws IOException {
+  public void open(JobConf job, String name) throws IOException {
     solr = new CommonsHttpSolrServer(job.get(SolrConstants.SERVER_URL));
     commitSize = job.getInt(SolrConstants.COMMIT_SIZE, 1000);
+    solrMapping = SolrMappingReader.getInstance(job);
   }
 
   public void write(NutchDocument doc) throws IOException {
     final SolrInputDocument inputDoc = new SolrInputDocument();
     for(final Entry<String, List<String>> e : doc) {
       for (final String val : e.getValue()) {
-        inputDoc.addField(e.getKey(), val);
+        inputDoc.addField(solrMapping.mapKey(e.getKey()), val);
+        String sCopy = solrMapping.mapCopyKey(e.getKey());
+        if (sCopy != e.getKey()) {
+        	inputDoc.addField(sCopy, val);	
+        }
       }
     }
     inputDoc.setDocumentBoost(doc.getScore());

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SolrSearchBean.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SolrSearchBean.java?rev=884269&r1=884268&r2=884269&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SolrSearchBean.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SolrSearchBean.java Wed Nov 25 20:58:10 2009
@@ -35,6 +35,7 @@
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.util.ToStringUtils;
+import org.apache.nutch.indexer.solr.SolrMappingReader;
 import org.apache.nutch.indexer.solr.SolrWriter;
 import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.SolrServer;
@@ -52,11 +53,15 @@
   private final SolrServer solr;
 
   private final QueryFilters filters;
+  
+  private String searchUID;
 
   public SolrSearchBean(Configuration conf, String solrServer)
   throws IOException {
     solr = new CommonsHttpSolrServer(solrServer);
     filters = new QueryFilters(conf);
+    SolrMappingReader mapping = SolrMappingReader.getInstance(conf);
+    searchUID = mapping.getUniqueKey();
   }
 
   public String getExplanation(Query query, Hit hit) throws IOException {
@@ -76,10 +81,10 @@
     solrQuery.setRows(numHits);
 
     if (sortField == null) {
-      solrQuery.setFields(dedupField, "score", "id");
+      solrQuery.setFields(dedupField, "score", searchUID);
       sortField = "score";
     } else {
-      solrQuery.setFields(dedupField, sortField, "id");
+      solrQuery.setFields(dedupField, sortField, searchUID);
       solrQuery.setSortField(sortField, reverse ? ORDER.asc : ORDER.desc);
     }
 
@@ -113,7 +118,7 @@
 
       final String dedupValue = (String) solrDoc.getFirstValue(dedupField);
 
-      final String uniqueKey = (String )solrDoc.getFirstValue("id");
+      final String uniqueKey = (String )solrDoc.getFirstValue(searchUID);
 
       hitArr[i] = new Hit(uniqueKey, sortValue, dedupValue);
     }
@@ -124,7 +129,7 @@
   public HitDetails getDetails(Hit hit) throws IOException {
     QueryResponse response;
     try {
-      response = solr.query(new SolrQuery("id:\"" + hit.getUniqueKey() + "\""));
+      response = solr.query(new SolrQuery(searchUID + ":\"" + hit.getUniqueKey() + "\""));
     } catch (final SolrServerException e) {
       throw SolrWriter.makeIOException(e);
     }
@@ -141,7 +146,7 @@
     final StringBuilder buf = new StringBuilder();
     buf.append("(");
     for (final Hit hit : hits) {
-      buf.append(" id:\"");
+      buf.append(" " + searchUID + ":\"");
       buf.append(hit.getUniqueKey());
       buf.append("\"");
     }
@@ -169,7 +174,7 @@
       new HashMap<String, HitDetails>(hits.length);
     for (final SolrDocument solrDoc : docList) {
       final HitDetails details = buildDetails(solrDoc);
-      detailsMap.put(details.getValue("id"), details);
+      detailsMap.put(details.getValue(searchUID), details);
     }
 
     final HitDetails[] detailsArr = new HitDetails[hits.length];