You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by ko...@apache.org on 2009/08/19 19:01:12 UTC

svn commit: r805880 - in /lucene/solr/trunk: ./ src/java/org/apache/solr/handler/ src/test/org/apache/solr/handler/ src/test/test-files/solr/conf/

Author: koji
Date: Wed Aug 19 17:01:12 2009
New Revision: 805880

URL: http://svn.apache.org/viewvc?rev=805880&view=rev
Log:
SOLR-1370: Show the output of CharFilters in FieldAnalysisRequestHandler

Added:
    lucene/solr/trunk/src/test/test-files/solr/conf/mapping-ISOLatin1Accent.txt   (with props)
Modified:
    lucene/solr/trunk/CHANGES.txt
    lucene/solr/trunk/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java
    lucene/solr/trunk/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
    lucene/solr/trunk/src/test/test-files/solr/conf/schema.xml

Modified: lucene/solr/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=805880&r1=805879&r2=805880&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Wed Aug 19 17:01:12 2009
@@ -271,6 +271,8 @@
 
 69. SOLR-1372: Enhance FieldAnalysisRequestHandler to accept field value from content stream (ehatcher)
 
+70. SOLR-1370: Show the output of CharFilters in FieldAnalysisRequestHandler (koji)
+
 
 Optimizations
 ----------------------

Modified: lucene/solr/trunk/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java?rev=805880&r1=805879&r2=805880&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java Wed Aug 19 17:01:12 2009
@@ -18,10 +18,14 @@
 package org.apache.solr.handler;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharReader;
+import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.analysis.CharFilterFactory;
 import org.apache.solr.analysis.TokenFilterFactory;
 import org.apache.solr.analysis.TokenizerChain;
+import org.apache.solr.analysis.TokenizerFactory;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.SimpleOrderedMap;
 import org.apache.solr.common.SolrException;
@@ -83,17 +87,29 @@
     }
 
     TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
+    CharFilterFactory[] cfiltfacs = tokenizerChain.getCharFilterFactories();
+    TokenizerFactory tfac = tokenizerChain.getTokenizerFactory();
+    TokenFilterFactory[] filtfacs = tokenizerChain.getTokenFilterFactories();
 
     NamedList<List<NamedList>> namedList = new SimpleOrderedMap<List<NamedList>>();
 
-    TokenStream tokenStream = tokenizerChain.getTokenizerFactory().create(new StringReader(value));
+    if( cfiltfacs != null ){
+      String source = value;
+      for(CharFilterFactory cfiltfac : cfiltfacs ){
+        CharStream reader = CharReader.get(new StringReader(source));
+        reader = cfiltfac.create(reader);
+        source = writeCharStream(namedList, reader);
+      }
+    }
+
+    TokenStream tokenStream = tfac.create(tokenizerChain.charStream(new StringReader(value)));
     List<Token> tokens = analyzeTokenStream(tokenStream);
 
     namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokens, context));
 
     ListBasedTokenStream listBasedTokenStream = new ListBasedTokenStream(tokens);
 
-    for (TokenFilterFactory tokenFilterFactory : tokenizerChain.getTokenFilterFactories()) {
+    for (TokenFilterFactory tokenFilterFactory : filtfacs) {
       tokenStream = tokenFilterFactory.create(listBasedTokenStream);
       List<Token> tokenList = analyzeTokenStream(tokenStream);
       namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokenList, context));
@@ -188,6 +204,23 @@
 
     return tokensNamedLists;
   }
+  
+  /** Drains {@code input}, adds its text to {@code out} under the stream's class name, and returns that text. */
+  private String writeCharStream(NamedList out, CharStream input ){
+    final int BUFFER_SIZE = 1024;
+    char[] buf = new char[BUFFER_SIZE];
+    StringBuilder sb = new StringBuilder();
+    try {
+      // read() returns -1 at end of stream and may return a short count before that, so
+      // loop until -1; appending with len == -1 would throw IndexOutOfBoundsException.
+      for( int len; (len = input.read( buf, 0, BUFFER_SIZE )) != -1; ){
+        sb.append(buf, 0, len);
+      }
+    } catch (IOException e) {
+      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
+    }
+    out.add( input.getClass().getName(), sb.toString());
+    return sb.toString();
+  }
 
 
   // ================================================= Inner classes =================================================

Modified: lucene/solr/trunk/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java?rev=805880&r1=805879&r2=805880&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java Wed Aug 19 17:01:12 2009
@@ -293,4 +293,30 @@
 
   }
 
+  /** Checks that field analysis reports each CharFilter's output text alongside the tokenizer's tokens. */
+  public void testCharFilterAnalysis() throws Exception {
+    FieldAnalysisRequest request = new FieldAnalysisRequest();
+    request.addFieldType("charfilthtmlmap");
+    request.setFieldValue("<html><body>whátëvêr</body></html>");
+    request.setShowMatch(false);
+
+    NamedList<NamedList> result = handler.handleAnalysisRequest(request, h.getCore().getSchema());
+    assertNotNull("result is null and it shouldn't be", result);
+
+    NamedList<NamedList> fieldTypes = result.get("field_types");
+    assertNotNull("field_types should never be null", fieldTypes);
+    NamedList<NamedList> textType = fieldTypes.get("charfilthtmlmap");
+    assertNotNull("expecting result for field type 'charfilthtmlmap'", textType);
+
+    NamedList indexPart = textType.get("index");
+    assertNotNull("expecting an index token analysis for field type 'charfilthtmlmap'", indexPart);
+    // Char filter stages are keyed by class name; the expected text has the markup blanked to equal-length spaces.
+    assertEquals("            whátëvêr              ", indexPart.get("org.apache.solr.analysis.HTMLStripCharFilter"));
+    assertEquals("            whatever              ", indexPart.get("org.apache.lucene.analysis.MappingCharFilter"));
+
+    List<NamedList> tokenList = (List<NamedList>)indexPart.get("org.apache.lucene.analysis.WhitespaceTokenizer");
+    assertNotNull("Expecting WhitespaceTokenizer analysis breakdown", tokenList);
+    assertEquals(1, tokenList.size());
+    assertToken(tokenList.get(0), new TokenInfo("whatever", null, "word", 12, 20, 1, null, false));
+  }
 }

Added: lucene/solr/trunk/src/test/test-files/solr/conf/mapping-ISOLatin1Accent.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/test-files/solr/conf/mapping-ISOLatin1Accent.txt?rev=805880&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/test-files/solr/conf/mapping-ISOLatin1Accent.txt (added)
+++ lucene/solr/trunk/src/test/test-files/solr/conf/mapping-ISOLatin1Accent.txt Wed Aug 19 17:01:12 2009
@@ -0,0 +1,246 @@
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Syntax:
+#   "source" => "target"
+#     "source".length() > 0 (source cannot be empty.)
+#     "target".length() >= 0 (target can be empty.)
+
+# example:
+#   "À" => "A"
+#   "\u00C0" => "A"
+#   "\u00C0" => "\u0041"
+#   "ß" => "ss"
+#   "\t" => " "
+#   "\n" => ""
+
+# À => A
+"\u00C0" => "A"
+
+# Á => A
+"\u00C1" => "A"
+
+# Â => A
+"\u00C2" => "A"
+
+# Ã => A
+"\u00C3" => "A"
+
+# Ä => A
+"\u00C4" => "A"
+
+# Å => A
+"\u00C5" => "A"
+
+# Æ => AE
+"\u00C6" => "AE"
+
+# Ç => C
+"\u00C7" => "C"
+
+# È => E
+"\u00C8" => "E"
+
+# É => E
+"\u00C9" => "E"
+
+# Ê => E
+"\u00CA" => "E"
+
+# Ë => E
+"\u00CB" => "E"
+
+# Ì => I
+"\u00CC" => "I"
+
+# Í => I
+"\u00CD" => "I"
+
+# Î => I
+"\u00CE" => "I"
+
+# Ï => I
+"\u00CF" => "I"
+
+# Ĳ => IJ
+"\u0132" => "IJ"
+
+# Ð => D
+"\u00D0" => "D"
+
+# Ñ => N
+"\u00D1" => "N"
+
+# Ò => O
+"\u00D2" => "O"
+
+# Ó => O
+"\u00D3" => "O"
+
+# Ô => O
+"\u00D4" => "O"
+
+# Õ => O
+"\u00D5" => "O"
+
+# Ö => O
+"\u00D6" => "O"
+
+# Ø => O
+"\u00D8" => "O"
+
+# Π=> OE
+"\u0152" => "OE"
+
+# Þ => TH
+"\u00DE" => "TH"
+
+# Ù => U
+"\u00D9" => "U"
+
+# Ú => U
+"\u00DA" => "U"
+
+# Û => U
+"\u00DB" => "U"
+
+# Ü => U
+"\u00DC" => "U"
+
+# Ý => Y
+"\u00DD" => "Y"
+
+# Ÿ => Y
+"\u0178" => "Y"
+
+# à => a
+"\u00E0" => "a"
+
+# á => a
+"\u00E1" => "a"
+
+# â => a
+"\u00E2" => "a"
+
+# ã => a
+"\u00E3" => "a"
+
+# ä => a
+"\u00E4" => "a"
+
+# å => a
+"\u00E5" => "a"
+
+# æ => ae
+"\u00E6" => "ae"
+
+# ç => c
+"\u00E7" => "c"
+
+# è => e
+"\u00E8" => "e"
+
+# é => e
+"\u00E9" => "e"
+
+# ê => e
+"\u00EA" => "e"
+
+# ë => e
+"\u00EB" => "e"
+
+# ì => i
+"\u00EC" => "i"
+
+# í => i
+"\u00ED" => "i"
+
+# î => i
+"\u00EE" => "i"
+
+# ï => i
+"\u00EF" => "i"
+
+# ĳ => ij
+"\u0133" => "ij"
+
+# ð => d
+"\u00F0" => "d"
+
+# ñ => n
+"\u00F1" => "n"
+
+# ò => o
+"\u00F2" => "o"
+
+# ó => o
+"\u00F3" => "o"
+
+# ô => o
+"\u00F4" => "o"
+
+# õ => o
+"\u00F5" => "o"
+
+# ö => o
+"\u00F6" => "o"
+
+# ø => o
+"\u00F8" => "o"
+
+# œ => oe
+"\u0153" => "oe"
+
+# ß => ss
+"\u00DF" => "ss"
+
+# þ => th
+"\u00FE" => "th"
+
+# ù => u
+"\u00F9" => "u"
+
+# ú => u
+"\u00FA" => "u"
+
+# û => u
+"\u00FB" => "u"
+
+# ü => u
+"\u00FC" => "u"
+
+# ý => y
+"\u00FD" => "y"
+
+# ÿ => y
+"\u00FF" => "y"
+
+# ﬀ => ff
+"\uFB00" => "ff"
+
+# ﬁ => fi
+"\uFB01" => "fi"
+
+# ﬂ => fl
+"\uFB02" => "fl"
+
+# ﬃ => ffi
+"\uFB03" => "ffi"
+
+# ﬄ => ffl
+"\uFB04" => "ffl"
+
+# ﬅ => ft
+"\uFB05" => "ft"
+
+# ﬆ => st
+"\uFB06" => "st"

Propchange: lucene/solr/trunk/src/test/test-files/solr/conf/mapping-ISOLatin1Accent.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/solr/trunk/src/test/test-files/solr/conf/mapping-ISOLatin1Accent.txt
------------------------------------------------------------------------------
    svn:keywords = Date Author Id Revision HeadURL

Modified: lucene/solr/trunk/src/test/test-files/solr/conf/schema.xml
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/test-files/solr/conf/schema.xml?rev=805880&r1=805879&r2=805880&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/test-files/solr/conf/schema.xml (original)
+++ lucene/solr/trunk/src/test/test-files/solr/conf/schema.xml Wed Aug 19 17:01:12 2009
@@ -247,6 +247,13 @@
         <filter class="solr.LengthFilterFactory" min="2" max="5"/>
       </analyzer>
     </fieldtype>
+    <fieldType name="charfilthtmlmap" class="solr.TextField">
+      <analyzer>
+        <charFilter class="solr.HTMLStripCharFilterFactory"/>
+        <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+      </analyzer>
+    </fieldType>
 
     <fieldtype name="subword" class="solr.TextField" multiValued="true" positionIncrementGap="100">
       <analyzer type="index">