You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by gs...@apache.org on 2008/02/20 05:06:49 UTC

svn commit: r629342 - in /lucene/solr/trunk: example/exampleAnalysis/ example/solr/conf/ src/java/org/apache/solr/common/util/ src/java/org/apache/solr/handler/ src/test/org/apache/solr/handler/

Author: gsingers
Date: Tue Feb 19 20:06:33 2008
New Revision: 629342

URL: http://svn.apache.org/viewvc?rev=629342&view=rev
Log:
SOLR-477: Added in AnalysisRequestHandler and tests/sample documents

Added:
    lucene/solr/trunk/example/exampleAnalysis/
    lucene/solr/trunk/example/exampleAnalysis/mem.xml   (with props)
    lucene/solr/trunk/example/exampleAnalysis/post.sh   (with props)
    lucene/solr/trunk/example/exampleAnalysis/small.xml   (with props)
    lucene/solr/trunk/src/java/org/apache/solr/handler/AnalysisRequestHandler.java   (with props)
    lucene/solr/trunk/src/test/org/apache/solr/handler/AnalysisRequestHandlerTest.java   (with props)
Modified:
    lucene/solr/trunk/example/solr/conf/solrconfig.xml
    lucene/solr/trunk/src/java/org/apache/solr/common/util/XML.java

Added: lucene/solr/trunk/example/exampleAnalysis/mem.xml
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/example/exampleAnalysis/mem.xml?rev=629342&view=auto
==============================================================================
--- lucene/solr/trunk/example/exampleAnalysis/mem.xml (added)
+++ lucene/solr/trunk/example/exampleAnalysis/mem.xml Tue Feb 19 20:06:33 2008
@@ -0,0 +1,55 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<docs>
+<doc>
+  <field name="id">TWINX2048-3200PRO</field>
+  <field name="name">CORSAIR  XMS 2GB (2 x 1GB) 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) Dual Channel Kit System Memory - Retail</field>
+  <field name="manu">Corsair Microsystems Inc.</field>
+  <field name="cat">electronics</field>
+  <field name="cat">memory</field>
+  <field name="features">CAS latency 2,	2-3-3-6 timing, 2.75v, unbuffered, heat-spreader</field>
+  <field name="price">185</field>
+  <field name="popularity">5</field>
+  <field name="inStock">true</field>
+</doc>
+
+<doc>
+  <field name="id">VS1GB400C3</field>
+  <field name="name">CORSAIR ValueSelect 1GB 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) System Memory - Retail</field>
+  <field name="manu">Corsair Microsystems Inc.</field>
+  <field name="cat">electronics</field>
+  <field name="cat">memory</field>
+  <field name="price">74.99</field>
+  <field name="popularity">7</field>
+  <field name="inStock">true</field>
+</doc>
+
+<doc>
+  <field name="id">VDBDB1A16</field>
+  <field name="name">A-DATA V-Series 1GB 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) System Memory - OEM</field>
+  <field name="manu">A-DATA Technology Inc.</field>
+  <field name="cat">electronics</field>
+  <field name="cat">memory</field>
+  <field name="features">CAS latency 3,	 2.7v</field>
+  <!-- note: price is missing on this one -->
+  <field name="popularity">5</field>
+  <field name="inStock">true</field>
+
+</doc>
+</docs>
+

Propchange: lucene/solr/trunk/example/exampleAnalysis/mem.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/solr/trunk/example/exampleAnalysis/post.sh
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/example/exampleAnalysis/post.sh?rev=629342&view=auto
==============================================================================
--- lucene/solr/trunk/example/exampleAnalysis/post.sh (added)
+++ lucene/solr/trunk/example/exampleAnalysis/post.sh Tue Feb 19 20:06:33 2008
@@ -0,0 +1,24 @@
+#!/bin/sh
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Loop over "$@" rather than an unquoted $* so file names containing spaces stay intact.
+URL=http://localhost:8983/solr/analysis
+
+for f in "$@"; do
+  echo Posting file "$f" to $URL
+  curl $URL --data-binary @"$f" -H 'Content-type:text/xml; charset=utf-8'
+  echo
+done

Propchange: lucene/solr/trunk/example/exampleAnalysis/post.sh
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/solr/trunk/example/exampleAnalysis/small.xml
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/example/exampleAnalysis/small.xml?rev=629342&view=auto
==============================================================================
--- lucene/solr/trunk/example/exampleAnalysis/small.xml (added)
+++ lucene/solr/trunk/example/exampleAnalysis/small.xml Tue Feb 19 20:06:33 2008
@@ -0,0 +1,24 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<docs>
+<doc>
+  <field name="id">TWINX2048-3200PRO</field>
+  <field name="name">CORSAIR  XMS 2GB (2 x 1GB) 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) Dual Channel Kit System Memory - Retail</field>
+</doc>
+</docs>
+

Propchange: lucene/solr/trunk/example/exampleAnalysis/small.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/solr/trunk/example/solr/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/example/solr/conf/solrconfig.xml?rev=629342&r1=629341&r2=629342&view=diff
==============================================================================
--- lucene/solr/trunk/example/solr/conf/solrconfig.xml (original)
+++ lucene/solr/trunk/example/solr/conf/solrconfig.xml Tue Feb 19 20:06:33 2008
@@ -505,6 +505,16 @@
     -->
   </requestHandler>
 
+  <!--
+   Analysis request handler.  Since Solr 1.3.  Use to return how a document is analyzed.  Useful
+   for debugging and as a token server for other types of applications
+   -->
+  <requestHandler name="/analysis" class="solr.AnalysisRequestHandler" >
+    <!--
+    <str name="update.processor.class">org.apache.solr.handler.UpdateRequestProcessor</str>
+    -->
+  </requestHandler>
+
   <!-- CSV update handler, loaded on demand -->
   <requestHandler name="/update/csv" class="solr.CSVRequestHandler" startup="lazy" />
 

Modified: lucene/solr/trunk/src/java/org/apache/solr/common/util/XML.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/common/util/XML.java?rev=629342&r1=629341&r2=629342&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/common/util/XML.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/common/util/XML.java Tue Feb 19 20:06:33 2008
@@ -82,6 +82,10 @@
     escape(str, out, attribute_escapes);
   }
 
+  public static void escapeAttributeValue(char [] chars, int start, int length, Writer out) throws IOException {
+    escape(chars, start, length, out, attribute_escapes); // char[] range variant; uses the same attribute_escapes table as the String overload
+  }
+
 
   public final static void writeXML(Writer out, String tag, String val) throws IOException {
     out.write('<');
@@ -149,6 +153,19 @@
     }
   }
 
+  private static void escape(char [] chars, int offset, int length, Writer out, String [] escapes) throws IOException{
+     for (int i=offset; i<length; i++) {
+      char ch = chars[i];
+      if (ch<escapes.length) {
+        String replacement = escapes[ch];
+        if (replacement != null) {
+          out.write(replacement);
+          continue;
+        }
+      }
+      out.write(ch);
+    }
+  }
 
   private static void escape(String str, Writer out, String[] escapes) throws IOException {
     for (int i=0; i<str.length(); i++) {

Added: lucene/solr/trunk/src/java/org/apache/solr/handler/AnalysisRequestHandler.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/handler/AnalysisRequestHandler.java?rev=629342&view=auto
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/handler/AnalysisRequestHandler.java (added)
+++ lucene/solr/trunk/src/java/org/apache/solr/handler/AnalysisRequestHandler.java Tue Feb 19 20:06:33 2008
@@ -0,0 +1,223 @@
+package org.apache.solr.handler;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import javanet.staxutils.BaseXMLInputFactory;
+import org.apache.commons.io.IOUtils;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.common.util.ContentStream;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.request.SolrQueryResponse;
+import org.apache.solr.schema.FieldType;
+import org.apache.solr.schema.IndexSchema;
+import org.apache.solr.schema.SchemaField;
+
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamConstants;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.XMLStreamReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.logging.Logger;
+
+/**
+ *
+ *
+ **/
+public class AnalysisRequestHandler extends RequestHandlerBase {
+
+  public static Logger log = Logger.getLogger(AnalysisRequestHandler.class.getName());
+
+  private XMLInputFactory inputFactory;
+
+  @Override
+  public void init(NamedList args) {
+    super.init(args);
+
+    inputFactory = BaseXMLInputFactory.newInstance();
+    try {
+      // The java 1.6 bundled stax parser (sjsxp) does not currently have a thread-safe
+      // XMLInputFactory, as that implementation tries to cache and reuse the
+      // XMLStreamReader.  Setting the parser-specific "reuse-instance" property to false
+      // prevents this.
+      // All other known open-source stax parsers (and the bea ref impl)
+      // have thread-safe factories.
+      inputFactory.setProperty("reuse-instance", Boolean.FALSE);
+    }
+    catch (IllegalArgumentException ex) {
+      // Other implementations will likely throw this exception since "reuse-instance"
+      // isimplementation specific.
+      log.fine("Unable to set the 'reuse-instance' property for the input factory: " + inputFactory);
+    }
+  }
+
+  public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
+    RequestHandlerUtils.addExperimentalFormatWarning(rsp);
+    SolrParams params = req.getParams();
+    Iterable<ContentStream> streams = req.getContentStreams();
+    if (streams != null) {
+      for (ContentStream stream : req.getContentStreams()) {
+        Reader reader = stream.getReader();
+        try {
+          XMLStreamReader parser = inputFactory.createXMLStreamReader(reader);
+          NamedList<Object> result = processContent(parser, req.getSchema());
+          rsp.add("response", result);
+        }
+        finally {
+          IOUtils.closeQuietly(reader);
+        }
+      }
+    }
+  }
+
+  NamedList<Object> processContent(XMLStreamReader parser,
+                                   IndexSchema schema) throws XMLStreamException, IOException {
+    NamedList<Object> result = new NamedList<Object>();
+    while (true) {
+      int event = parser.next();
+      switch (event) {
+        case XMLStreamConstants.END_DOCUMENT: {
+          parser.close();
+          return result;
+        }
+        case XMLStreamConstants.START_ELEMENT: {
+          String currTag = parser.getLocalName();
+          if ("doc".equals(currTag)) {
+            log.finest("Tokenizing doc...");
+
+            SolrInputDocument doc = readDoc(parser);
+            SchemaField uniq = schema.getUniqueKeyField();
+            NamedList<NamedList<NamedList<Object>>> theTokens = new NamedList<NamedList<NamedList<Object>>>();
+            result.add(doc.getFieldValue(uniq.getName()).toString(), theTokens);
+            for (String name : doc.getFieldNames()) {
+              FieldType ft = schema.getFieldType(name);
+              Analyzer analyzer = ft.getAnalyzer();
+              Collection<Object> vals = doc.getFieldValues(name);
+              for (Object val : vals) {
+                Reader reader = new StringReader(val.toString());
+                TokenStream tstream = analyzer.tokenStream(name, reader);
+                NamedList<NamedList<Object>> tokens = getTokens(tstream);
+                theTokens.add(name, tokens);
+              }
+            }
+          }
+          break;
+        }
+      }
+    }
+  }
+
+  static NamedList<NamedList<Object>> getTokens(TokenStream tstream) throws IOException {
+    NamedList<NamedList<Object>> tokens = new NamedList<NamedList<Object>>();
+    Token t = null;
+    while (((t = tstream.next()) != null)) {
+      NamedList<Object> token = new NamedList<Object>();
+      tokens.add("token", token);
+      token.add("value", new String(t.termBuffer(), 0, t.termLength()));
+      token.add("start", t.startOffset());
+      token.add("end", t.endOffset());
+      token.add("posInc", t.getPositionIncrement());
+      token.add("type", t.type());
+      //TODO: handle payloads
+    }
+    return tokens;
+  }
+
+  SolrInputDocument readDoc(XMLStreamReader parser) throws XMLStreamException {
+    SolrInputDocument doc = new SolrInputDocument();
+
+    StringBuilder text = new StringBuilder();
+    String name = null;
+    String attrName = "";
+    float boost = 1.0f;
+    boolean isNull = false;
+    while (true) {
+      int event = parser.next();
+      switch (event) {
+        // Add everything to the text
+        case XMLStreamConstants.SPACE:
+        case XMLStreamConstants.CDATA:
+        case XMLStreamConstants.CHARACTERS:
+          text.append(parser.getText());
+          break;
+
+        case XMLStreamConstants.END_ELEMENT:
+          if ("doc".equals(parser.getLocalName())) {
+            return doc;
+          } else if ("field".equals(parser.getLocalName())) {
+            if (!isNull) {
+              doc.addField(name, text.toString(), boost);
+              boost = 1.0f;
+            }
+          }
+          break;
+
+        case XMLStreamConstants.START_ELEMENT:
+          text.setLength(0);
+          String localName = parser.getLocalName();
+          if (!"field".equals(localName)) {
+            log.warning("unexpected XML tag doc/" + localName);
+            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
+                    "unexpected XML tag doc/" + localName);
+          }
+
+          String attrVal = "";
+          for (int i = 0; i < parser.getAttributeCount(); i++) {
+            attrName = parser.getAttributeLocalName(i);
+            attrVal = parser.getAttributeValue(i);
+            if ("name".equals(attrName)) {
+              name = attrVal;
+            }
+          }
+          break;
+      }
+    }
+  }
+
+
+  //////////////////////// SolrInfoMBeans methods //////////////////////
+  @Override
+  public String getDescription() {
+    return "Provide Analysis of text";
+  }
+
+  @Override
+  public String getVersion() {
+    return "$Revision:$";
+  }
+
+  @Override
+  public String getSourceId() {
+    return "$Id:$";
+  }
+
+  @Override
+  public String getSource() {
+    return "$URL:$";
+  }
+
+}

Propchange: lucene/solr/trunk/src/java/org/apache/solr/handler/AnalysisRequestHandler.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/solr/trunk/src/test/org/apache/solr/handler/AnalysisRequestHandlerTest.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/handler/AnalysisRequestHandlerTest.java?rev=629342&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/handler/AnalysisRequestHandlerTest.java (added)
+++ lucene/solr/trunk/src/test/org/apache/solr/handler/AnalysisRequestHandlerTest.java Tue Feb 19 20:06:33 2008
@@ -0,0 +1,105 @@
+package org.apache.solr.handler;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import javanet.staxutils.BaseXMLInputFactory;
+import org.apache.lucene.analysis.Token;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.util.AbstractSolrTestCase;
+
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamReader;
+import java.io.StringReader;
+import java.util.List;
+
+public class AnalysisRequestHandlerTest extends AbstractSolrTestCase {
+  private XMLInputFactory inputFactory = BaseXMLInputFactory.newInstance();
+
+  @Override
+  public String getSchemaFile() {
+    return "schema.xml";
+  }
+
+  @Override
+  public String getSolrConfigFile() {
+    return "solrconfig.xml";
+  }
+
+
+  /** Returns the per-field token lists stored in the result under the given document id. */
+  private static NamedList<NamedList<NamedList<Object>>> docTokens(NamedList<Object> result, String id) {
+    NamedList<NamedList<NamedList<Object>>> theTokens =
+            (NamedList<NamedList<NamedList<Object>>>) result.get(id);
+    assertTrue("theTokens is null and it shouldn't be", theTokens != null);
+    return theTokens;
+  }
+
+  /** Returns the token list of one field after checking that it holds the expected token count. */
+  private static NamedList<NamedList<Object>> fieldTokens(NamedList<NamedList<NamedList<Object>>> theTokens,
+                                                          String field, int expected) {
+    NamedList<NamedList<Object>> tokens = theTokens.get(field);
+    assertTrue("tokens is null and it shouldn't be", tokens != null);
+    assertTrue("tokens Size: " + tokens.size() + " is not : " + expected, tokens.size() == expected);
+    return tokens;
+  }
+
+  /** Checks that the "value" of the j-th "token" entry equals gold[j], for every j. */
+  private static void assertValues(NamedList<NamedList<Object>> tokens, String... gold) {
+    for (int j = 0; j < gold.length; j++) {
+      String value = (String) tokens.get("token", j).get("value");
+      assertTrue(value + " is not equal to " + gold[j], value.equals(gold[j]) == true);
+    }
+  }
+
+  /**
+   * Feeds two documents to processContent and checks the tokens reported for their
+   * "name" and "text" fields.
+   */
+  public void testReadDoc() throws Exception {
+    String xml =
+            "<docs><doc >" +
+                    "  <field name=\"id\" >12345</field>" +
+                    "  <field name=\"name\">cute little kitten</field>" +
+                    "  <field name=\"text\">the quick red fox jumped over the lazy brown dogs</field>" +
+                    "</doc>" +
+                    "<doc >" +
+                    "  <field name=\"id\" >12346</field>" +
+                    "  <field name=\"name\">big mean dog</field>" +
+                    "  <field name=\"text\">cats like to purr</field>" +
+                    "</doc>" +
+                    "</docs>";
+
+    XMLStreamReader parser =
+            inputFactory.createXMLStreamReader(new StringReader(xml));
+    AnalysisRequestHandler handler = new AnalysisRequestHandler();
+    NamedList<Object> result = handler.processContent(parser, h.getCore().getSchema());
+    assertTrue("result is null and it shouldn't be", result != null);
+
+    // First document: both fields are checked token by token.
+    NamedList<NamedList<NamedList<Object>>> first = docTokens(result, "12345");
+    assertValues(fieldTokens(first, "name", 3), "cute", "little", "kitten");
+    //stopwords are removed
+    assertValues(fieldTokens(first, "text", 8),
+            "quick", "red", "fox", "jump", "over", "lazi", "brown", "dog");
+
+    // Second document: only the token count of the name field is checked.
+    NamedList<NamedList<NamedList<Object>>> second = docTokens(result, "12346");
+    fieldTokens(second, "name", 3);
+    //stopwords are removed
+    assertValues(fieldTokens(second, "text", 3), "cat", "like", "purr");
+  }
+}
\ No newline at end of file

Propchange: lucene/solr/trunk/src/test/org/apache/solr/handler/AnalysisRequestHandlerTest.java
------------------------------------------------------------------------------
    svn:eol-style = native