You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2009/11/25 21:58:11 UTC
svn commit: r884269 - in /lucene/nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java
src/java/org/apache/nutch/indexer/solr/SolrWriter.java
src/java/org/apache/nutch/searcher/SolrSearchBean.java
Author: ab
Date: Wed Nov 25 20:58:10 2009
New Revision: 884269
URL: http://svn.apache.org/viewvc?rev=884269&view=rev
Log:
NUTCH-760 Allow field mapping from nutch to solr index.
Added:
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java (with props)
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SolrSearchBean.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=884269&r1=884268&r2=884269&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Nov 25 20:58:10 2009
@@ -2,6 +2,8 @@
Unreleased Changes
+* NUTCH-760 Allow field mapping from Nutch to Solr index (David Stuart, ab)
+
* NUTCH-761 Avoid cloning CrawlDatum in CrawlDbReducer (Julien Nioche, ab)
* NUTCH-753 Prevent new Fetcher from retrieving the robots twice (Julien Nioche via ab)
Added: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java?rev=884269&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java Wed Nov 25 20:58:10 2009
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.solr;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.util.HashMap;
+import java.util.Map;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.ObjectCache;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+/**
+ * Reads the Nutch-to-Solr field mapping file and exposes the field renames,
+ * copy-field rules and unique key name it defines.
+ *
+ * <p>The mapping resource name is read from the
+ * <code>solrindex.mapping.file</code> property and defaults to
+ * <code>solrindex-mapping.xml</code>. The file is parsed once, when the
+ * instance is created; parse problems are logged and leave the mappings empty
+ * and the unique key at its default of <code>id</code>.
+ */
+public class SolrMappingReader {
+  public static final Log LOG = LogFactory.getLog(SolrMappingReader.class);
+
+  /** The property name of the solr index mapping file location */
+  private static final String SS_FILE_MAPPING = "solrindex.mapping.file";
+
+  /** Mapping resource used when {@link #SS_FILE_MAPPING} is not set */
+  private static final String DEFAULT_MAPPING_FILE = "solrindex-mapping.xml";
+
+  /** Unique key field used when the mapping does not define exactly one */
+  private static final String DEFAULT_UNIQUE_KEY = "id";
+
+  private final Configuration conf;
+
+  /** source field name -> destination field name, from &lt;field&gt; elements */
+  private final Map<String, String> keyMap = new HashMap<String, String>();
+  /** source field name -> destination field name, from &lt;copyField&gt; elements */
+  private final Map<String, String> copyMap = new HashMap<String, String>();
+  private String uniqueKey = DEFAULT_UNIQUE_KEY;
+
+  /**
+   * Returns the reader stored in the {@link ObjectCache} of the given
+   * configuration, creating and storing it on first use.
+   *
+   * @param conf configuration that names the mapping file
+   * @return the cached (or newly created) reader for <code>conf</code>
+   */
+  public static synchronized SolrMappingReader getInstance(Configuration conf) {
+    ObjectCache cache = ObjectCache.get(conf);
+    String cacheKey = SolrMappingReader.class.getName();
+    SolrMappingReader instance = (SolrMappingReader) cache.getObject(cacheKey);
+    if (instance == null) {
+      instance = new SolrMappingReader(conf);
+      cache.setObject(cacheKey, instance);
+    }
+    return instance;
+  }
+
+  /**
+   * Creates a reader and immediately parses the mapping file.
+   *
+   * @param conf configuration that names the mapping file
+   */
+  protected SolrMappingReader(Configuration conf) {
+    this.conf = conf;
+    parseMapping();
+  }
+
+  /**
+   * Parses the mapping file into {@link #keyMap}, {@link #copyMap} and
+   * {@link #uniqueKey}. Failures are logged as warnings and never propagated,
+   * so a broken or missing mapping leaves the defaults in place.
+   */
+  private void parseMapping() {
+    String mappingFile = conf.get(SS_FILE_MAPPING, DEFAULT_MAPPING_FILE);
+    InputStream ssInputStream = conf.getConfResourceAsInputStream(mappingFile);
+    // NOTE(review): the stream is presumably null when the resource cannot be
+    // located; handing a null stream to the parser only yields an obscure
+    // parser error, so report the real cause here instead.
+    if (ssInputStream == null) {
+      LOG.warn("Solr index mapping file '" + mappingFile
+          + "' not found, no field mapping will be applied");
+      return;
+    }
+    try {
+      DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+      DocumentBuilder builder = factory.newDocumentBuilder();
+      Document document = builder.parse(new InputSource(ssInputStream));
+      Element rootElement = document.getDocumentElement();
+      readMappings(rootElement, "field", keyMap);
+      readMappings(rootElement, "copyField", copyMap);
+      readUniqueKey(rootElement);
+    } catch (SAXException e) {
+      warnParseFailure(mappingFile, e);
+    } catch (IOException e) {
+      // also covers MalformedURLException, which is an IOException
+      warnParseFailure(mappingFile, e);
+    } catch (ParserConfigurationException e) {
+      warnParseFailure(mappingFile, e);
+    } finally {
+      try {
+        ssInputStream.close();
+      } catch (IOException e) {
+        LOG.warn("Could not close solr index mapping file '" + mappingFile + "'", e);
+      }
+    }
+  }
+
+  /** Logs a parse failure together with its stack trace. */
+  private void warnParseFailure(String mappingFile, Exception e) {
+    LOG.warn("Could not parse solr index mapping file '" + mappingFile + "': " + e, e);
+  }
+
+  /**
+   * Stores the source/dest attribute pair of every <code>tagName</code>
+   * element below <code>rootElement</code> into <code>target</code>.
+   */
+  private void readMappings(Element rootElement, String tagName,
+      Map<String, String> target) {
+    NodeList elements = rootElement.getElementsByTagName(tagName);
+    for (int i = 0; i < elements.getLength(); i++) {
+      Element element = (Element) elements.item(i);
+      String source = element.getAttribute("source");
+      String dest = element.getAttribute("dest");
+      LOG.info("source: " + source + " dest: " + dest);
+      target.put(source, dest);
+    }
+  }
+
+  /**
+   * Sets {@link #uniqueKey} from the single <code>uniqueKey</code> element.
+   * Zero, several, or an empty definition all fall back to the default.
+   */
+  private void readUniqueKey(Element rootElement) {
+    NodeList uniqueKeyItem = rootElement.getElementsByTagName("uniqueKey");
+    int definitions = uniqueKeyItem.getLength();
+    if (definitions > 1) {
+      LOG.warn("More than one unique key definitions found in solr index mapping, using default 'id'");
+      uniqueKey = DEFAULT_UNIQUE_KEY;
+      return;
+    }
+    if (definitions == 0) {
+      LOG.warn("No unique key definition found in solr index mapping using, default 'id'");
+      return;
+    }
+    // An empty element (<uniqueKey/>) has no child node; guard against it
+    // rather than failing with a NullPointerException in the constructor.
+    Element keyElement = (Element) uniqueKeyItem.item(0);
+    String value = keyElement.getFirstChild() == null
+        ? null : keyElement.getFirstChild().getNodeValue();
+    if (value == null || value.trim().length() == 0) {
+      LOG.warn("Empty unique key definition found in solr index mapping, using default 'id'");
+      return;
+    }
+    // surrounding whitespace can never be part of a usable field name
+    uniqueKey = value.trim();
+  }
+
+  /** @return the live map of field renames (source name to destination name) */
+  public Map<String, String> getKeyMap() {
+    return keyMap;
+  }
+
+  /** @return the live map of copy-field rules (source name to destination name) */
+  public Map<String, String> getCopyMap() {
+    return copyMap;
+  }
+
+  /** @return the unique key field name, <code>id</code> unless the mapping overrides it */
+  public String getUniqueKey() {
+    return uniqueKey;
+  }
+
+  /**
+   * Returns the copy-field destination for <code>key</code>.
+   *
+   * @param key source field name
+   * @return the mapped destination, or the very same <code>key</code> instance
+   *         when no copy-field rule exists for it
+   */
+  public String hasCopy(String key) {
+    // Return the argument itself when unmapped: SolrWriter detects "no copy"
+    // by comparing the result with the original key by reference.
+    return copyMap.containsKey(key) ? copyMap.get(key) : key;
+  }
+
+  /**
+   * Returns the Solr field name that <code>key</code> is renamed to.
+   *
+   * @param key Nutch field name
+   * @return the mapped name, or the same <code>key</code> instance when no
+   *         rename is defined
+   * @throws IOException never thrown by this implementation; kept in the
+   *         signature for existing callers
+   */
+  public String mapKey(String key) throws IOException {
+    return keyMap.containsKey(key) ? keyMap.get(key) : key;
+  }
+
+  /**
+   * Same lookup as {@link #hasCopy(String)}.
+   *
+   * @param key source field name
+   * @return the copy-field destination, or the same <code>key</code> instance
+   *         when no copy-field rule exists for it
+   * @throws IOException never thrown by this implementation; kept in the
+   *         signature for existing callers
+   */
+  public String mapCopyKey(String key) throws IOException {
+    return hasCopy(key);
+  }
+}
Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrMappingReader.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java?rev=884269&r1=884268&r2=884269&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrWriter.java Wed Nov 25 20:58:10 2009
@@ -32,23 +32,28 @@
public class SolrWriter implements NutchIndexWriter {
private SolrServer solr;
+ private SolrMappingReader solrMapping;
private final List<SolrInputDocument> inputDocs =
new ArrayList<SolrInputDocument>();
private int commitSize;
- public void open(JobConf job, String name)
- throws IOException {
+ public void open(JobConf job, String name) throws IOException {
solr = new CommonsHttpSolrServer(job.get(SolrConstants.SERVER_URL));
commitSize = job.getInt(SolrConstants.COMMIT_SIZE, 1000);
+ solrMapping = SolrMappingReader.getInstance(job);
}
public void write(NutchDocument doc) throws IOException {
final SolrInputDocument inputDoc = new SolrInputDocument();
for(final Entry<String, List<String>> e : doc) {
for (final String val : e.getValue()) {
- inputDoc.addField(e.getKey(), val);
+ inputDoc.addField(solrMapping.mapKey(e.getKey()), val);
+ String sCopy = solrMapping.mapCopyKey(e.getKey());
+ if (sCopy != e.getKey()) {
+ inputDoc.addField(sCopy, val);
+ }
}
}
inputDoc.setDocumentBoost(doc.getScore());
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SolrSearchBean.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SolrSearchBean.java?rev=884269&r1=884268&r2=884269&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SolrSearchBean.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/SolrSearchBean.java Wed Nov 25 20:58:10 2009
@@ -35,6 +35,7 @@
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.ToStringUtils;
+import org.apache.nutch.indexer.solr.SolrMappingReader;
import org.apache.nutch.indexer.solr.SolrWriter;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
@@ -52,11 +53,15 @@
private final SolrServer solr;
private final QueryFilters filters;
+
+ private String searchUID;
public SolrSearchBean(Configuration conf, String solrServer)
throws IOException {
solr = new CommonsHttpSolrServer(solrServer);
filters = new QueryFilters(conf);
+ SolrMappingReader mapping = SolrMappingReader.getInstance(conf);
+ searchUID = mapping.getUniqueKey();
}
public String getExplanation(Query query, Hit hit) throws IOException {
@@ -76,10 +81,10 @@
solrQuery.setRows(numHits);
if (sortField == null) {
- solrQuery.setFields(dedupField, "score", "id");
+ solrQuery.setFields(dedupField, "score", searchUID);
sortField = "score";
} else {
- solrQuery.setFields(dedupField, sortField, "id");
+ solrQuery.setFields(dedupField, sortField, searchUID);
solrQuery.setSortField(sortField, reverse ? ORDER.asc : ORDER.desc);
}
@@ -113,7 +118,7 @@
final String dedupValue = (String) solrDoc.getFirstValue(dedupField);
- final String uniqueKey = (String )solrDoc.getFirstValue("id");
+ final String uniqueKey = (String )solrDoc.getFirstValue(searchUID);
hitArr[i] = new Hit(uniqueKey, sortValue, dedupValue);
}
@@ -124,7 +129,7 @@
public HitDetails getDetails(Hit hit) throws IOException {
QueryResponse response;
try {
- response = solr.query(new SolrQuery("id:\"" + hit.getUniqueKey() + "\""));
+ response = solr.query(new SolrQuery(searchUID + ":\"" + hit.getUniqueKey() + "\""));
} catch (final SolrServerException e) {
throw SolrWriter.makeIOException(e);
}
@@ -141,7 +146,7 @@
final StringBuilder buf = new StringBuilder();
buf.append("(");
for (final Hit hit : hits) {
- buf.append(" id:\"");
+ buf.append(" " + searchUID + ":\"");
buf.append(hit.getUniqueKey());
buf.append("\"");
}
@@ -169,7 +174,7 @@
new HashMap<String, HitDetails>(hits.length);
for (final SolrDocument solrDoc : docList) {
final HitDetails details = buildDetails(solrDoc);
- detailsMap.put(details.getValue("id"), details);
+ detailsMap.put(details.getValue(searchUID), details);
}
final HitDetails[] detailsArr = new HitDetails[hits.length];