You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by ko...@apache.org on 2008/11/14 02:56:21 UTC
svn commit: r713902 - in /lucene/solr/trunk: ./ example/solr/conf/
src/java/org/apache/solr/analysis/ src/java/org/apache/solr/core/
src/java/org/apache/solr/schema/ src/test/org/apache/solr/analysis/
src/webapp/web/admin/
Author: koji
Date: Thu Nov 13 17:56:21 2008
New Revision: 713902
URL: http://svn.apache.org/viewvc?rev=713902&view=rev
Log:
SOLR-822: Add CharFilter so that characters can be filtered before Tokenizer/TokenFilters.
Added:
lucene/solr/trunk/example/solr/conf/mapping-ISOLatin1Accent.txt (with props)
lucene/solr/trunk/src/java/org/apache/solr/analysis/BaseCharFilter.java (with props)
lucene/solr/trunk/src/java/org/apache/solr/analysis/BaseCharFilterFactory.java (with props)
lucene/solr/trunk/src/java/org/apache/solr/analysis/CharFilter.java (with props)
lucene/solr/trunk/src/java/org/apache/solr/analysis/CharFilterFactory.java (with props)
lucene/solr/trunk/src/java/org/apache/solr/analysis/CharReader.java (with props)
lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStream.java (with props)
lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareCJKTokenizer.java (with props)
lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareCJKTokenizerFactory.java (with props)
lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareCharTokenizer.java (with props)
lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareWhitespaceTokenizer.java (with props)
lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareWhitespaceTokenizerFactory.java (with props)
lucene/solr/trunk/src/java/org/apache/solr/analysis/MappingCharFilter.java (with props)
lucene/solr/trunk/src/java/org/apache/solr/analysis/MappingCharFilterFactory.java (with props)
lucene/solr/trunk/src/java/org/apache/solr/analysis/NormalizeMap.java (with props)
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestCharFilter.java (with props)
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestMappingCharFilter.java (with props)
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestMappingCharFilterFactory.java (with props)
Modified:
lucene/solr/trunk/CHANGES.txt
lucene/solr/trunk/example/solr/conf/schema.xml
lucene/solr/trunk/src/java/org/apache/solr/analysis/TokenizerChain.java
lucene/solr/trunk/src/java/org/apache/solr/core/SolrResourceLoader.java
lucene/solr/trunk/src/java/org/apache/solr/schema/IndexSchema.java
lucene/solr/trunk/src/test/org/apache/solr/analysis/TestSynonymMap.java (props changed)
lucene/solr/trunk/src/webapp/web/admin/analysis.jsp
Modified: lucene/solr/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=713902&r1=713901&r2=713902&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Thu Nov 13 17:56:21 2008
@@ -82,6 +82,9 @@
DirectoryProvider will use NIOFSDirectory for better concurrency
on non Windows platforms. (Mark Miller, TJ Laurenzo via yonik)
+15. SOLR-822: Add CharFilter so that characters can be filtered (e.g. character normalization)
+ before Tokenizer/TokenFilters. (koji)
+
Optimizations
----------------------
1. SOLR-374: Use IndexReader.reopen to save resources by re-using parts of the
Added: lucene/solr/trunk/example/solr/conf/mapping-ISOLatin1Accent.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/example/solr/conf/mapping-ISOLatin1Accent.txt?rev=713902&view=auto
==============================================================================
--- lucene/solr/trunk/example/solr/conf/mapping-ISOLatin1Accent.txt (added)
+++ lucene/solr/trunk/example/solr/conf/mapping-ISOLatin1Accent.txt Thu Nov 13 17:56:21 2008
@@ -0,0 +1,246 @@
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Syntax:
+# "source" => "target"
+# "source".length() > 0 (source cannot be empty.)
+# "target".length() >= 0 (target can be empty.)
+
+# example:
+# "À" => "A"
+# "\u00C0" => "A"
+# "\u00C0" => "\u0041"
+# "ß" => "ss"
+# "\t" => " "
+# "\n" => ""
+
+# À => A
+"\u00C0" => "A"
+
+# Á => A
+"\u00C1" => "A"
+
+# Â => A
+"\u00C2" => "A"
+
+# Ã => A
+"\u00C3" => "A"
+
+# Ä => A
+"\u00C4" => "A"
+
+# Å => A
+"\u00C5" => "A"
+
+# Æ => AE
+"\u00C6" => "AE"
+
+# Ç => C
+"\u00C7" => "C"
+
+# È => E
+"\u00C8" => "E"
+
+# É => E
+"\u00C9" => "E"
+
+# Ê => E
+"\u00CA" => "E"
+
+# Ë => E
+"\u00CB" => "E"
+
+# Ì => I
+"\u00CC" => "I"
+
+# Í => I
+"\u00CD" => "I"
+
+# Î => I
+"\u00CE" => "I"
+
+# Ï => I
+"\u00CF" => "I"
+
+# IJ => IJ
+"\u0132" => "IJ"
+
+# Ð => D
+"\u00D0" => "D"
+
+# Ñ => N
+"\u00D1" => "N"
+
+# Ò => O
+"\u00D2" => "O"
+
+# Ó => O
+"\u00D3" => "O"
+
+# Ô => O
+"\u00D4" => "O"
+
+# Õ => O
+"\u00D5" => "O"
+
+# Ö => O
+"\u00D6" => "O"
+
+# Ø => O
+"\u00D8" => "O"
+
+# Œ => OE
+"\u0152" => "OE"
+
+# Þ => TH
+"\u00DE" => "TH"
+
+# Ù => U
+"\u00D9" => "U"
+
+# Ú => U
+"\u00DA" => "U"
+
+# Û => U
+"\u00DB" => "U"
+
+# Ü => U
+"\u00DC" => "U"
+
+# Ý => Y
+"\u00DD" => "Y"
+
+# Ÿ => Y
+"\u0178" => "Y"
+
+# à => a
+"\u00E0" => "a"
+
+# á => a
+"\u00E1" => "a"
+
+# â => a
+"\u00E2" => "a"
+
+# ã => a
+"\u00E3" => "a"
+
+# ä => a
+"\u00E4" => "a"
+
+# å => a
+"\u00E5" => "a"
+
+# æ => ae
+"\u00E6" => "ae"
+
+# ç => c
+"\u00E7" => "c"
+
+# è => e
+"\u00E8" => "e"
+
+# é => e
+"\u00E9" => "e"
+
+# ê => e
+"\u00EA" => "e"
+
+# ë => e
+"\u00EB" => "e"
+
+# ì => i
+"\u00EC" => "i"
+
+# í => i
+"\u00ED" => "i"
+
+# î => i
+"\u00EE" => "i"
+
+# ï => i
+"\u00EF" => "i"
+
+# ij => ij
+"\u0133" => "ij"
+
+# ð => d
+"\u00F0" => "d"
+
+# ñ => n
+"\u00F1" => "n"
+
+# ò => o
+"\u00F2" => "o"
+
+# ó => o
+"\u00F3" => "o"
+
+# ô => o
+"\u00F4" => "o"
+
+# õ => o
+"\u00F5" => "o"
+
+# ö => o
+"\u00F6" => "o"
+
+# ø => o
+"\u00F8" => "o"
+
+# œ => oe
+"\u0153" => "oe"
+
+# ß => ss
+"\u00DF" => "ss"
+
+# þ => th
+"\u00FE" => "th"
+
+# ù => u
+"\u00F9" => "u"
+
+# ú => u
+"\u00FA" => "u"
+
+# û => u
+"\u00FB" => "u"
+
+# ü => u
+"\u00FC" => "u"
+
+# ý => y
+"\u00FD" => "y"
+
+# ÿ => y
+"\u00FF" => "y"
+
+# ﬀ => ff
+"\uFB00" => "ff"
+
+# ﬁ => fi
+"\uFB01" => "fi"
+
+# ﬂ => fl
+"\uFB02" => "fl"
+
+# ﬃ => ffi
+"\uFB03" => "ffi"
+
+# ﬄ => ffl
+"\uFB04" => "ffl"
+
+# ﬅ => ft
+"\uFB05" => "ft"
+
+# ﬆ => st
+"\uFB06" => "st"
Propchange: lucene/solr/trunk/example/solr/conf/mapping-ISOLatin1Accent.txt
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/example/solr/conf/mapping-ISOLatin1Accent.txt
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Modified: lucene/solr/trunk/example/solr/conf/schema.xml
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/example/solr/conf/schema.xml?rev=713902&r1=713901&r2=713902&view=diff
==============================================================================
--- lucene/solr/trunk/example/solr/conf/schema.xml (original)
+++ lucene/solr/trunk/example/solr/conf/schema.xml Thu Nov 13 17:56:21 2008
@@ -215,6 +215,16 @@
</analyzer>
</fieldType>
+ <!-- charFilter + "CharStream aware" WhitespaceTokenizer -->
+ <!--
+ <fieldType name="textCharNorm" class="solr.TextField" positionIncrementGap="100" >
+ <analyzer>
+ <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
+ <tokenizer class="solr.CharStreamAwareWhitespaceTokenizerFactory"/>
+ </analyzer>
+ </fieldType>
+ -->
+
<!-- This is an example of using the KeywordTokenizer along
With various TokenFilterFactories to produce a sortable field
that does not include some properties of the source text
Added: lucene/solr/trunk/src/java/org/apache/solr/analysis/BaseCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/BaseCharFilter.java?rev=713902&view=auto
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/BaseCharFilter.java (added)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/BaseCharFilter.java Thu Nov 13 17:56:21 2008
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ *
+ * @version $Id$
+ * @since Solr 1.4
+ *
+ */
+public abstract class BaseCharFilter extends CharFilter {
+
+ protected List<PosCorrectMap> pcmList;
+
+ public BaseCharFilter( CharStream in ){
+ super(in);
+ pcmList = new ArrayList<PosCorrectMap>();
+ }
+
+ protected int correctPosition( int currentPos ){
+ if( pcmList.isEmpty() ) return currentPos;
+ for( int i = pcmList.size() - 1; i >= 0; i-- ){
+ if( currentPos >= pcmList.get( i ).pos )
+ return currentPos + pcmList.get( i ).cumulativeDiff;
+ }
+ return currentPos;
+ }
+
+ protected static class PosCorrectMap {
+
+ protected int pos;
+ protected int cumulativeDiff;
+
+ public PosCorrectMap( int pos, int cumulativeDiff ){
+ this.pos = pos;
+ this.cumulativeDiff = cumulativeDiff;
+ }
+
+ public String toString(){
+ StringBuffer sb = new StringBuffer();
+ sb.append('(');
+ sb.append(pos);
+ sb.append(',');
+ sb.append(cumulativeDiff);
+ sb.append(')');
+ return sb.toString();
+ }
+ }
+}
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/BaseCharFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/BaseCharFilter.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Added: lucene/solr/trunk/src/java/org/apache/solr/analysis/BaseCharFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/BaseCharFilterFactory.java?rev=713902&view=auto
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/BaseCharFilterFactory.java (added)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/BaseCharFilterFactory.java Thu Nov 13 17:56:21 2008
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.util.Map;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+*
+* @version $Id$
+* @since Solr 1.4
+*
+*/
+public abstract class BaseCharFilterFactory implements CharFilterFactory {
+
+ public static final Logger log = LoggerFactory.getLogger(BaseCharFilterFactory.class);
+
+ /** The init args */
+ protected Map<String,String> args;
+
+ public Map<String, String> getArgs() {
+ return args;
+ }
+
+ public void init(Map<String, String> args) {
+ this.args = args;
+ }
+
+}
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/BaseCharFilterFactory.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/BaseCharFilterFactory.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Added: lucene/solr/trunk/src/java/org/apache/solr/analysis/CharFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/CharFilter.java?rev=713902&view=auto
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/CharFilter.java (added)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/CharFilter.java Thu Nov 13 17:56:21 2008
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+
+/**
+ *
+ * Subclasses of CharFilter can be chained to filter CharStream.
+ *
+ * @version $Id$
+ * @since Solr 1.4
+ *
+ */
+public abstract class CharFilter extends CharStream {
+
+ protected CharStream input;
+
+ protected CharFilter( CharStream in ){
+ input = in;
+ }
+
+ /**
+ *
+ * Subclass may want to override to correct the current position.
+ *
+ * @param pos current position
+ * @return corrected position
+ */
+ protected int correctPosition( int pos ){
+ return pos;
+ }
+
+ @Override
+ public final int correctOffset(int currentOff) {
+ return input.correctOffset( correctPosition( currentOff ) );
+ }
+
+ @Override
+ public void close() throws IOException {
+ input.close();
+ }
+
+ @Override
+ public int read(char[] cbuf, int off, int len) throws IOException {
+ return input.read(cbuf, off, len);
+ }
+}
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/CharFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/CharFilter.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Added: lucene/solr/trunk/src/java/org/apache/solr/analysis/CharFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/CharFilterFactory.java?rev=713902&view=auto
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/CharFilterFactory.java (added)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/CharFilterFactory.java Thu Nov 13 17:56:21 2008
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.util.Map;
+
+/**
+*
+* @version $Id$
+* @since Solr 1.4
+*
+*/
+public interface CharFilterFactory {
+ public void init(Map<String,String> args);
+ public Map<String,String> getArgs();
+ public CharStream create(CharStream input);
+}
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/CharFilterFactory.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/CharFilterFactory.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Added: lucene/solr/trunk/src/java/org/apache/solr/analysis/CharReader.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/CharReader.java?rev=713902&view=auto
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/CharReader.java (added)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/CharReader.java Thu Nov 13 17:56:21 2008
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * CharReader is a Reader wrapper. It reads chars from Reader and outputs CharStream.
+ *
+ * @version $Id$
+ * @since Solr 1.4
+ *
+ */
+public final class CharReader extends CharStream {
+
+ protected Reader input;
+
+ public CharReader( Reader in ){
+ input = in;
+ }
+
+ @Override
+ public int correctOffset(int currentOff) {
+ return currentOff;
+ }
+
+ @Override
+ public void close() throws IOException {
+ input.close();
+ }
+
+ @Override
+ public int read(char[] cbuf, int off, int len) throws IOException {
+ return input.read(cbuf, off, len );
+ }
+}
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/CharReader.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/CharReader.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Added: lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStream.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStream.java?rev=713902&view=auto
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStream.java (added)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStream.java Thu Nov 13 17:56:21 2008
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.Reader;
+
+/**
+ * CharStream adds <a href="#correctOffset(int)">correctOffset</a> functionality over Reader.
+ *
+ * @version $Id$
+ * @since Solr 1.4
+ *
+ */
+public abstract class CharStream extends Reader {
+
+ /**
+ * called by CharFilter(s) and Tokenizer to correct token offset.
+ *
+ * @param currentOff current offset
+ * @return corrected token offset
+ */
+ public abstract int correctOffset( int currentOff );
+}
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStream.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStream.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Added: lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareCJKTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareCJKTokenizer.java?rev=713902&view=auto
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareCJKTokenizer.java (added)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareCJKTokenizer.java Thu Nov 13 17:56:21 2008
@@ -0,0 +1,276 @@
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.Tokenizer;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Tokenizer;
+
+import java.io.Reader;
+
+
+/**
+ * CJKTokenizer was modified from StopTokenizer which does a decent job for
+ * most European languages. It performs other token methods for double-byte
+ * Characters: the token will return at each two characters with overlap match.<br>
+ * Example: "java C1C2C3C4" will be segment to: "java" "C1C2" "C2C3" "C3C4" it
+ * also need filter filter zero length token ""<br>
+ * for Digit: digit, '+', '#' will token as letter<br>
+ * for more info on Asia language(Chinese Japanese Korean) text segmentation:
+ * please search <a
+ * href="http://www.google.com/search?q=word+chinese+segment">google</a>
+ *
+ */
+
+/*
+ * LUCENE-973 is applied
+ */
+/**
+ *
+ * @version $Id$
+ * @since Solr 1.4
+ *
+ */
+public final class CharStreamAwareCJKTokenizer extends Tokenizer {
+ //~ Static fields/initializers ---------------------------------------------
+ /** Word token type */
+ static final int WORD_TYPE = 0;
+
+ /** Single byte token type */
+ static final int SINGLE_TOKEN_TYPE = 1;
+
+ /** Double byte token type */
+ static final int DOUBLE_TOKEN_TYPE = 2;
+
+ /** Names for token types */
+ static final String[] TOKEN_TYPE_NAMES = { "word", "single", "double" };
+
+ /** Max word length */
+ private static final int MAX_WORD_LEN = 255;
+
+ /** buffer size: */
+ private static final int IO_BUFFER_SIZE = 256;
+
+ //~ Instance fields --------------------------------------------------------
+
+ /** word offset, used to imply which character(in ) is parsed */
+ private int offset = 0;
+
+ /** the index used only for ioBuffer */
+ private int bufferIndex = 0;
+
+ /** data length */
+ private int dataLen = 0;
+
+ /**
+ * character buffer, store the characters which are used to compose <br>
+ * the returned Token
+ */
+ private final char[] buffer = new char[MAX_WORD_LEN];
+
+ /**
+ * I/O buffer, used to store the content of the input(one of the <br>
+ * members of Tokenizer)
+ */
+ private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
+
+ /** word type: single=>ASCII double=>non-ASCII word=>default */
+ private int tokenType = WORD_TYPE;
+
+ /**
+ * tag: previous character is a cached double-byte character "C1C2C3C4"
+ * ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
+ * C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
+ */
+ private boolean preIsTokened = false;
+
+ //~ Constructors -----------------------------------------------------------
+
+ /**
+ * Construct a token stream processing the given input.
+ *
+ * @param in I/O reader
+ */
+ public CharStreamAwareCJKTokenizer(CharStream in) {
+ input = in;
+ }
+
+ //~ Methods ----------------------------------------------------------------
+
+ /**
+ * Returns the next token in the stream, or null at EOS.
+ * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
+ * for detail.
+ *
+ * @param reusableToken a reusable token
+ * @return Token
+ *
+ * @throws java.io.IOException - throw IOException when read error <br>
+ * happened in the InputStream
+ *
+ */
+ public final Token next(final Token reusableToken) throws java.io.IOException {
+ /** how many character(s) has been stored in buffer */
+ assert reusableToken != null;
+ int length = 0;
+
+ /** the position used to create Token */
+ int start = offset;
+
+ while (true) {
+ /** current character */
+ char c;
+
+ /** unicode block of current character for detail */
+ Character.UnicodeBlock ub;
+
+ offset++;
+
+ if (bufferIndex >= dataLen) {
+ dataLen = input.read(ioBuffer);
+ bufferIndex = 0;
+ }
+
+ if (dataLen == -1) {
+ if (length > 0) {
+ if (preIsTokened == true) {
+ length = 0;
+ preIsTokened = false;
+ }
+
+ break;
+ } else {
+ return null;
+ }
+ } else {
+ //get current character
+ c = ioBuffer[bufferIndex++];
+
+ //get the UnicodeBlock of the current character
+ ub = Character.UnicodeBlock.of(c);
+ }
+
+ //if the current character is ASCII or Extend ASCII
+ if ((ub == Character.UnicodeBlock.BASIC_LATIN)
+ || (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)
+ ) {
+ if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
+ // convert HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
+ int i = (int) c;
+ i = i - 65248;
+ c = (char) i;
+ }
+
+ // if the current character is a letter or "_" "+" "#"
+ if (Character.isLetterOrDigit(c)
+ || ((c == '_') || (c == '+') || (c == '#'))
+ ) {
+ if (length == 0) {
+ // "javaC1C2C3C4linux" <br>
+ // ^--: the current character begin to token the ASCII
+ // letter
+ start = offset - 1;
+ } else if (tokenType == DOUBLE_TOKEN_TYPE) {
+ // "javaC1C2C3C4linux" <br>
+ // ^--: the previous non-ASCII
+ // : the current character
+ offset--;
+ bufferIndex--;
+
+ if (preIsTokened == true) {
+ // there is only one non-ASCII has been stored
+ length = 0;
+ preIsTokened = false;
+ break;
+ } else {
+ break;
+ }
+ }
+
+ // store the LowerCase(c) in the buffer
+ buffer[length++] = Character.toLowerCase(c);
+ tokenType = SINGLE_TOKEN_TYPE;
+
+ // break the procedure if buffer overflowed!
+ if (length == MAX_WORD_LEN) {
+ break;
+ }
+ } else if (length > 0) {
+ if (preIsTokened == true) {
+ length = 0;
+ preIsTokened = false;
+ } else {
+ break;
+ }
+ }
+ } else {
+ // non-ASCII letter, e.g."C1C2C3C4"
+ if (Character.isLetter(c)) {
+ if (length == 0) {
+ start = offset - 1;
+ buffer[length++] = c;
+ tokenType = DOUBLE_TOKEN_TYPE;
+ } else {
+ if (tokenType == SINGLE_TOKEN_TYPE) {
+ offset--;
+ bufferIndex--;
+
+ //return the previous ASCII characters
+ break;
+ } else {
+ buffer[length++] = c;
+ tokenType = DOUBLE_TOKEN_TYPE;
+
+ if (length == 2) {
+ offset--;
+ bufferIndex--;
+ preIsTokened = true;
+
+ break;
+ }
+ }
+ }
+ } else if (length > 0) {
+ if (preIsTokened == true) {
+ // empty the buffer
+ length = 0;
+ preIsTokened = false;
+ } else {
+ break;
+ }
+ }
+ }
+ }
+
+ if (length > 0) {
+ // Because of "CharStream aware" tokenizer, using correctOffset() to
+ // correct start/end offsets
+ return reusableToken.reinit
+ (buffer, 0, length,
+ ((CharStream)input).correctOffset( start ),
+ ((CharStream)input).correctOffset( start+length ),
+ TOKEN_TYPE_NAMES[tokenType]);
+ } else if (dataLen != -1) {
+ // Don't return an empty string - recurse to get the next token
+ return next(reusableToken);
+ } else {
+ return null;
+ }
+ }
+}
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareCJKTokenizer.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareCJKTokenizer.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Added: lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareCJKTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareCJKTokenizerFactory.java?rev=713902&view=auto
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareCJKTokenizerFactory.java (added)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareCJKTokenizerFactory.java Thu Nov 13 17:56:21 2008
@@ -0,0 +1,35 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Factory that creates {@link CharStreamAwareCJKTokenizer} instances, i.e.
+ * CJK tokenizers whose token offsets are corrected through the CharFilter
+ * chain via {@link CharStream#correctOffset(int)}.
+ *
+ * @version $Id$
+ * @since Solr 1.4
+ *
+ */
+public class CharStreamAwareCJKTokenizerFactory extends BaseTokenizerFactory {
+
+ // NOTE(review): the cast assumes the caller hands us a CharStream.
+ // TokenizerChain.charStream() only wraps the Reader in a CharReader when at
+ // least one charFilter is configured, so using this factory without any
+ // charFilter would throw ClassCastException -- confirm intended usage.
+ public TokenStream create(Reader input) {
+ return new CharStreamAwareCJKTokenizer( (CharStream)input );
+ }
+}
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareCJKTokenizerFactory.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareCJKTokenizerFactory.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Added: lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareCharTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareCharTokenizer.java?rev=713902&view=auto
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareCharTokenizer.java (added)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareCharTokenizer.java Thu Nov 13 17:56:21 2008
@@ -0,0 +1,102 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Tokenizer;
+
+/** An abstract base class for simple, character-oriented tokenizers.
+ * Unlike a plain CharTokenizer it requires a {@link CharStream} input and
+ * maps each token's start/end offsets back to positions in the original,
+ * pre-CharFilter text via {@link CharStream#correctOffset(int)}. */
+public abstract class CharStreamAwareCharTokenizer extends Tokenizer {
+ public CharStreamAwareCharTokenizer(CharStream input) {
+ super(input);
+ }
+
+ // offset: chars consumed from input in previous refills; bufferIndex: read
+ // position within ioBuffer; dataLen: valid chars currently in ioBuffer.
+ private int offset = 0, bufferIndex = 0, dataLen = 0;
+ private static final int MAX_WORD_LEN = 255;
+ private static final int IO_BUFFER_SIZE = 4096;
+ private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
+
+ /** Returns true iff a character should be included in a token. This
+ * tokenizer generates as tokens adjacent sequences of characters which
+ * satisfy this predicate. Characters for which this is false are used to
+ * define token boundaries and are not included in tokens. */
+ protected abstract boolean isTokenChar(char c);
+
+ /** Called on each token character to normalize it before it is added to the
+ * token. The default implementation does nothing. Subclasses may use this
+ * to, e.g., lowercase tokens. */
+ protected char normalize(char c) {
+ return c;
+ }
+
+ /** Accumulates the next maximal run of token chars (up to MAX_WORD_LEN)
+ * into the reusable token and sets corrected start/end offsets.
+ * Returns null at end of input. */
+ public final Token next(final Token reusableToken) throws IOException {
+ assert reusableToken != null;
+ reusableToken.clear();
+ int length = 0;
+ int start = bufferIndex;
+ char[] buffer = reusableToken.termBuffer();
+ while (true) {
+
+ if (bufferIndex >= dataLen) {
+ // ioBuffer exhausted: account for the consumed chars and refill.
+ // On EOF, emit any partially collected token, otherwise signal done.
+ offset += dataLen;
+ dataLen = input.read(ioBuffer);
+ if (dataLen == -1) {
+ if (length > 0)
+ break;
+ else
+ return null;
+ }
+ bufferIndex = 0;
+ }
+
+ final char c = ioBuffer[bufferIndex++];
+
+ if (isTokenChar(c)) { // if it's a token char
+
+ if (length == 0) // start of token
+ start = offset + bufferIndex - 1;
+ else if (length == buffer.length)
+ buffer = reusableToken.resizeTermBuffer(1+length);
+
+ buffer[length++] = normalize(c); // buffer it, normalized
+
+ if (length == MAX_WORD_LEN) // buffer overflow!
+ break;
+
+ } else if (length > 0) // at non-Letter w/ chars
+ break; // return 'em
+ }
+
+ reusableToken.setTermLength(length);
+ // Because of "CharStream aware" tokenizer, using correctOffset() to
+ // correct start/end offsets
+ reusableToken.setStartOffset(((CharStream)input).correctOffset(start));
+ reusableToken.setEndOffset(((CharStream)input).correctOffset(start+length));
+ return reusableToken;
+ }
+
+ // Reuse this tokenizer on a new input; clears all buffered read state.
+ // NOTE(review): the new Reader must itself be a CharStream for the
+ // correctOffset() casts in next() to succeed -- confirm callers honor this.
+ public void reset(Reader input) throws IOException {
+ super.reset(input);
+ bufferIndex = 0;
+ offset = 0;
+ dataLen = 0;
+ }
+}
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareCharTokenizer.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareCharTokenizer.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Added: lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareWhitespaceTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareWhitespaceTokenizer.java?rev=713902&view=auto
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareWhitespaceTokenizer.java (added)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareWhitespaceTokenizer.java Thu Nov 13 17:56:21 2008
@@ -0,0 +1,33 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
+ * Adjacent sequences of non-Whitespace characters form tokens.
+ * This variant requires a {@link CharStream} input so token offsets are
+ * corrected through the CharFilter chain (see CharStreamAwareCharTokenizer). */
+public class CharStreamAwareWhitespaceTokenizer extends CharStreamAwareCharTokenizer {
+ /** Construct a new WhitespaceTokenizer. */
+ public CharStreamAwareWhitespaceTokenizer(CharStream in) {
+ super(in);
+ }
+
+ /** Collects only characters which do not satisfy
+ * {@link Character#isWhitespace(char)}.*/
+ protected boolean isTokenChar(char c) {
+ return !Character.isWhitespace(c);
+ }
+}
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareWhitespaceTokenizer.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareWhitespaceTokenizer.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Added: lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareWhitespaceTokenizerFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareWhitespaceTokenizerFactory.java?rev=713902&view=auto
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareWhitespaceTokenizerFactory.java (added)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareWhitespaceTokenizerFactory.java Thu Nov 13 17:56:21 2008
@@ -0,0 +1,35 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Factory that creates {@link CharStreamAwareWhitespaceTokenizer} instances,
+ * i.e. whitespace tokenizers whose token offsets are corrected through the
+ * CharFilter chain via {@link CharStream#correctOffset(int)}.
+ *
+ * @version $Id$
+ * @since Solr 1.4
+ *
+ */
+public class CharStreamAwareWhitespaceTokenizerFactory extends BaseTokenizerFactory {
+
+ // NOTE(review): the cast assumes the caller hands us a CharStream.
+ // TokenizerChain.charStream() only wraps the Reader in a CharReader when at
+ // least one charFilter is configured, so using this factory without any
+ // charFilter would throw ClassCastException -- confirm intended usage.
+ public TokenStream create(Reader input) {
+ return new CharStreamAwareWhitespaceTokenizer( (CharStream)input );
+ }
+}
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareWhitespaceTokenizerFactory.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/CharStreamAwareWhitespaceTokenizerFactory.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Added: lucene/solr/trunk/src/java/org/apache/solr/analysis/MappingCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/MappingCharFilter.java?rev=713902&view=auto
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/MappingCharFilter.java (added)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/MappingCharFilter.java Thu Nov 13 17:56:21 2008
@@ -0,0 +1,136 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.io.LinkedList;
+
+/**
+ * A CharFilter that rewrites the character stream according to a
+ * {@link NormalizeMap}: a trie of "match string" to "replacement string"
+ * rules. Whenever a rule replaces a run of characters by a string of a
+ * different length, an offset-correction entry is recorded so downstream
+ * "CharStream aware" tokenizers can map token offsets back to the original
+ * text. (pcmList / PosCorrectMap are presumably declared in BaseCharFilter,
+ * which is not visible here -- see that class for the correction mechanism.)
+ *
+ * @version $Id$
+ * @since Solr 1.4
+ *
+ */
+public class MappingCharFilter extends BaseCharFilter {
+
+ private final NormalizeMap normMap; // root of the match/replacement trie
+ private LinkedList<Character> buffer; // pushback of chars read ahead during trie matching
+ private String replacement; // replacement string currently being emitted
+ private int charPointer; // next char of 'replacement' to return
+ private int nextCharCounter; // count of chars consumed from the underlying stream
+
+ public MappingCharFilter( NormalizeMap normMap, CharStream in ){
+ super( in );
+ this.normMap = normMap;
+ }
+
+ // Returns the next mapped character: first drains any pending replacement,
+ // then attempts the longest trie match starting at the next input char.
+ public int read() throws IOException {
+ while( true ){
+ if( replacement != null && charPointer < replacement.length() )
+ return replacement.charAt( charPointer++ );
+
+ int firstChar = nextChar();
+ if( firstChar == -1 ) return -1;
+ NormalizeMap nm = normMap.submap != null ?
+ normMap.submap.get( (char)firstChar ) : null;
+ if( nm == null ) return firstChar;
+ NormalizeMap result = match( nm );
+ if( result == null ) return firstChar;
+ replacement = result.normStr;
+ charPointer = 0;
+ if( result.diff != 0 ){
+ // Record offset corrections. 'diff' is matchLength - replacementLength:
+ // a negative diff (replacement longer than match) adds one entry per
+ // extra output char; a positive diff adds a single entry at the start
+ // of the shrunk region.
+ int prevCumulativeDiff = pcmList.isEmpty() ? 0 :
+ pcmList.get( pcmList.size() - 1 ).cumulativeDiff;
+ if( result.diff < 0 ){
+ for( int i = 0; i < -result.diff ; i++ )
+ pcmList.add( new PosCorrectMap( nextCharCounter + i - prevCumulativeDiff, prevCumulativeDiff - 1 - i ) );
+ }
+ else{
+ pcmList.add( new PosCorrectMap( nextCharCounter - result.diff - prevCumulativeDiff, prevCumulativeDiff + result.diff ) );
+ }
+ }
+ }
+ }
+
+ // Next raw char: pushback buffer first, then the wrapped stream.
+ private int nextChar() throws IOException {
+ nextCharCounter++;
+ if( buffer != null && !buffer.isEmpty() )
+ return buffer.removeFirst();
+ return input.read();
+ }
+
+ // Un-read a char consumed while probing a trie branch that did not match.
+ private void pushChar( int c ){
+ nextCharCounter--;
+ if( buffer == null )
+ buffer = new LinkedList<Character>();
+ buffer.addFirst( (char)c );
+ }
+
+ // Append a raw char to the end of the pushback buffer (used by bulk read).
+ private void pushLastChar( int c ){
+ if( buffer == null )
+ buffer = new LinkedList<Character>();
+ buffer.addLast( (char)c );
+ }
+
+ // Recursively follow the trie for the longest match; chars consumed along
+ // branches that fail are pushed back. Returns the deepest node that has a
+ // replacement (normStr != null), or null if no rule matches.
+ private NormalizeMap match( NormalizeMap map ) throws IOException {
+ NormalizeMap result = null;
+ if( map.submap != null ){
+ int chr = nextChar();
+ if( chr != -1 ){
+ NormalizeMap subMap = map.submap.get( (char)chr );
+ if( subMap != null ){
+ result = match( subMap );
+ }
+ if( result == null )
+ pushChar( chr );
+ }
+ }
+ if( result == null && map.normStr != null )
+ result = map;
+ return result;
+ }
+
+ // Bulk read: pull up to 'len' raw chars from the wrapped stream into the
+ // pushback buffer, then emit mapped chars one at a time via read().
+ // May return fewer than 'len' chars; surplus mapped output (from expanding
+ // rules) stays pending in 'replacement'/'buffer' for the next call.
+ public int read( char[] cbuf, int off, int len ) throws IOException {
+ char[] tmp = new char[len];
+ int l = input.read( tmp, 0, len );
+ if( l != -1 ){
+ for( int i = 0; i < l; i++ )
+ pushLastChar( tmp[i] );
+ }
+ l = 0;
+ for( int i = off; i < off + len; i++ ){
+ int c = read();
+ if( c == -1 ) break;
+ cbuf[i] = (char)c;
+ l++;
+ }
+ return l == 0 ? -1 : l;
+ }
+
+ // Matching state (pushback buffer, pending replacement, position counters)
+ // cannot be rewound, so mark/reset is unsupported.
+ public boolean markSupported(){
+ return false;
+ }
+
+ public void mark( int readAheadLimit ) throws IOException {
+ throw new IOException( "mark/reset not supported" );
+ }
+
+ public void reset() throws IOException {
+ throw new IOException( "mark/reset not supported" );
+ }
+}
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/MappingCharFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/MappingCharFilter.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Added: lucene/solr/trunk/src/java/org/apache/solr/analysis/MappingCharFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/MappingCharFilterFactory.java?rev=713902&view=auto
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/MappingCharFilterFactory.java (added)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/MappingCharFilterFactory.java Thu Nov 13 17:56:21 2008
@@ -0,0 +1,118 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+/**
+ * Factory for {@link MappingCharFilter}. Reads mapping rules of the form
+ * "source" => "target" from the resource(s) named by the "mapping" init arg
+ * and builds the {@link NormalizeMap} used by the filter.
+ *
+ * @version $Id$
+ * @since Solr 1.4
+ *
+ */
+public class MappingCharFilterFactory extends BaseCharFilterFactory implements
+ ResourceLoaderAware {
+
+ protected NormalizeMap normMap;
+ private String mapping; // value of the "mapping" init arg (file or comma-separated list)
+
+ // Loads the mapping rules once the ResourceLoader is available. "mapping"
+ // may name a single existing file, or a comma-separated list of resource
+ // names whose lines are concatenated.
+ public void inform(ResourceLoader loader) {
+ mapping = args.get( "mapping" );
+
+ if( mapping != null ){
+ List<String> wlist = null;
+ try{
+ File mappingFile = new File( mapping );
+ if( mappingFile.exists() ){
+ wlist = loader.getLines( mapping );
+ }
+ else{
+ List<String> files = StrUtils.splitFileNames( mapping );
+ wlist = new ArrayList<String>();
+ for( String file : files ){
+ List<String> lines = loader.getLines( file.trim() );
+ wlist.addAll( lines );
+ }
+ }
+ }
+ catch( IOException e ){
+ throw new RuntimeException( e );
+ }
+ normMap = new NormalizeMap();
+ parseRules( wlist, normMap );
+ }
+ }
+
+ public CharStream create(CharStream input) {
+ return new MappingCharFilter(normMap,input);
+ }
+
+ // "source" => "target"
+ static Pattern p = Pattern.compile( "\"(.*)\"\\s*=>\\s*\"(.*)\"\\s*$" );
+
+ // Parses each rule line against the pattern above and adds the unescaped
+ // source/target pair to the trie; malformed lines abort initialization.
+ protected void parseRules( List<String> rules, NormalizeMap normMap ){
+ for( String rule : rules ){
+ Matcher m = p.matcher( rule );
+ if( !m.find() )
+ throw new RuntimeException( "Invalid Mapping Rule : [" + rule + "], file = " + mapping );
+ normMap.add( parseString( m.group( 1 ) ), parseString( m.group( 2 ) ) );
+ }
+ }
+
+ // NOTE(review): shared scratch buffer makes parseString non-reentrant and
+ // not thread-safe, and any unescaped string longer than 256 chars will
+ // overflow it -- acceptable only if parsing happens single-threaded at
+ // init time with short rules; confirm.
+ char[] out = new char[256];
+
+ // Unescapes \\, \", \n, \t, \r, \b, \f and \uXXXX sequences in a rule
+ // operand; any other backslash escape passes the following char through.
+ protected String parseString( String s ){
+ int readPos = 0;
+ int len = s.length();
+ int writePos = 0;
+ while( readPos < len ){
+ char c = s.charAt( readPos++ );
+ if( c == '\\' ){
+ if( readPos >= len )
+ throw new RuntimeException( "Invalid escaped char in [" + s + "]" );
+ c = s.charAt( readPos++ );
+ switch( c ) {
+ case '\\' : c = '\\'; break;
+ case '"' : c = '"'; break;
+ case 'n' : c = '\n'; break;
+ case 't' : c = '\t'; break;
+ case 'r' : c = '\r'; break;
+ case 'b' : c = '\b'; break;
+ case 'f' : c = '\f'; break;
+ case 'u' :
+ if( readPos + 3 >= len )
+ throw new RuntimeException( "Invalid escaped char in [" + s + "]" );
+ c = (char)Integer.parseInt( s.substring( readPos, readPos + 4 ), 16 );
+ readPos += 4;
+ break;
+ }
+ }
+ out[writePos++] = c;
+ }
+ return new String( out, 0, writePos );
+ }
+}
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/MappingCharFilterFactory.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/MappingCharFilterFactory.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Added: lucene/solr/trunk/src/java/org/apache/solr/analysis/NormalizeMap.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/NormalizeMap.java?rev=713902&view=auto
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/NormalizeMap.java (added)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/NormalizeMap.java Thu Nov 13 17:56:21 2008
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * A node in the character-trie used by {@link MappingCharFilter}. Each node
+ * holds per-character children in {@code submap}; a node whose
+ * {@code normStr} is non-null terminates a rule and stores its replacement
+ * plus the length difference used for offset correction.
+ *
+ * @version $Id$
+ * @since Solr 1.4
+ *
+ */
+public class NormalizeMap {
+
+ Map<Character, NormalizeMap> submap; // children keyed by next match char; null for leaf
+ String normStr; // replacement if a rule ends at this node, else null
+ int diff; // match length minus replacement length (offset correction)
+
+ // Walks/extends the trie one character of 'singleMatch' at a time, then
+ // marks the final node as a rule terminal. Duplicate sources are rejected.
+ public void add( String singleMatch, String replacement ){
+ NormalizeMap currMap = this;
+ for( int i = 0; i < singleMatch.length(); i++ ){
+ char c = singleMatch.charAt( i );
+ if( currMap.submap == null ){
+ currMap.submap = new HashMap<Character, NormalizeMap>( 1 );
+ }
+ NormalizeMap map = currMap.submap.get( c );
+ if( map == null ){
+ map = new NormalizeMap();
+ currMap.submap.put( c, map );
+ }
+ currMap = map;
+ }
+ if( currMap.normStr != null ){
+ throw new RuntimeException( "MappingCharFilter: there is already a mapping for " + singleMatch );
+ }
+ currMap.normStr = replacement;
+ currMap.diff = singleMatch.length() - replacement.length();
+ }
+}
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/NormalizeMap.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/src/java/org/apache/solr/analysis/NormalizeMap.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Modified: lucene/solr/trunk/src/java/org/apache/solr/analysis/TokenizerChain.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/analysis/TokenizerChain.java?rev=713902&r1=713901&r2=713902&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/analysis/TokenizerChain.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/analysis/TokenizerChain.java Thu Nov 13 17:56:21 2008
@@ -31,19 +31,37 @@
// create a TokenStream.
//
public class TokenizerChain extends SolrAnalyzer {
+ final private CharFilterFactory[] charFilters;
final private TokenizerFactory tokenizer;
final private TokenFilterFactory[] filters;
public TokenizerChain(TokenizerFactory tokenizer, TokenFilterFactory[] filters) {
+ this(null,tokenizer,filters);
+ }
+
+ public TokenizerChain(CharFilterFactory[] charFilters, TokenizerFactory tokenizer, TokenFilterFactory[] filters) {
+ this.charFilters = charFilters;
this.tokenizer = tokenizer;
this.filters = filters;
}
+ public CharFilterFactory[] getCharFilterFactories() { return charFilters; }
public TokenizerFactory getTokenizerFactory() { return tokenizer; }
public TokenFilterFactory[] getTokenFilterFactories() { return filters; }
+ public Reader charStream(Reader reader){
+ if( charFilters != null && charFilters.length > 0 ){
+ CharStream cs = new CharReader( reader );
+ for (int i=0; i<charFilters.length; i++) {
+ cs = charFilters[i].create(cs);
+ }
+ reader = cs;
+ }
+ return reader;
+ }
+
public TokenStream tokenStream(String fieldName, Reader reader) {
- TokenStream ts = tokenizer.create(reader);
+ TokenStream ts = tokenizer.create(charStream(reader));
for (int i=0; i<filters.length; i++) {
ts = filters[i].create(ts);
}
@@ -52,6 +70,10 @@
public String toString() {
StringBuilder sb = new StringBuilder("TokenizerChain(");
+ for (CharFilterFactory filter: charFilters) {
+ sb.append(filter);
+ sb.append(", ");
+ }
sb.append(tokenizer);
for (TokenFilterFactory filter: filters) {
sb.append(", ");
Modified: lucene/solr/trunk/src/java/org/apache/solr/core/SolrResourceLoader.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/core/SolrResourceLoader.java?rev=713902&r1=713901&r2=713902&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/core/SolrResourceLoader.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/core/SolrResourceLoader.java Thu Nov 13 17:56:21 2008
@@ -37,6 +37,7 @@
import javax.naming.NamingException;
import javax.naming.NoInitialContextException;
+import org.apache.solr.analysis.CharFilterFactory;
import org.apache.solr.analysis.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerFactory;
import org.apache.solr.common.ResourceLoader;
@@ -394,8 +395,9 @@
}
);
- awareCompatibility.put(
+ awareCompatibility.put(
ResourceLoaderAware.class, new Class[] {
+ CharFilterFactory.class,
TokenFilterFactory.class,
TokenizerFactory.class,
FieldType.class
@@ -427,5 +429,5 @@
}
throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, builder.toString() );
}
-
+
}
\ No newline at end of file
Modified: lucene/solr/trunk/src/java/org/apache/solr/schema/IndexSchema.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/schema/IndexSchema.java?rev=713902&r1=713901&r2=713902&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/schema/IndexSchema.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/schema/IndexSchema.java Thu Nov 13 17:56:21 2008
@@ -29,6 +29,7 @@
import org.apache.solr.core.SolrConfig;
import org.apache.solr.core.Config;
import org.apache.solr.core.SolrResourceLoader;
+import org.apache.solr.analysis.CharFilterFactory;
import org.apache.solr.analysis.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.analysis.TokenizerFactory;
@@ -739,12 +740,33 @@
XPath xpath = XPathFactory.newInstance().newXPath();
+ // Load the CharFilters
+ // --------------------------------------------------------------------------------
+ final ArrayList<CharFilterFactory> charFilters = new ArrayList<CharFilterFactory>();
+ AbstractPluginLoader<CharFilterFactory> charFilterLoader =
+ new AbstractPluginLoader<CharFilterFactory>( "[schema.xml] analyzer/charFilter", false, false )
+ {
+ @Override
+ protected void init(CharFilterFactory plugin, Node node) throws Exception {
+ if( plugin != null ) {
+ plugin.init( DOMUtil.toMapExcept(node.getAttributes(),"class") );
+ charFilters.add( plugin );
+ }
+ }
+
+ @Override
+ protected CharFilterFactory register(String name, CharFilterFactory plugin) throws Exception {
+ return null; // used for map registration
+ }
+ };
+ charFilterLoader.load( solrConfig.getResourceLoader(), (NodeList)xpath.evaluate("./charFilter", node, XPathConstants.NODESET) );
+
// Load the Tokenizer
- // Although an analyzer only allows a single Tokenizer, we load a list to make sure
+ // Although an analyzer only allows a single Tokenizer, we load a list to make sure
// the configuration is ok
// --------------------------------------------------------------------------------
final ArrayList<TokenizerFactory> tokenizers = new ArrayList<TokenizerFactory>(1);
- AbstractPluginLoader<TokenizerFactory> tokenizerLoader =
+ AbstractPluginLoader<TokenizerFactory> tokenizerLoader =
new AbstractPluginLoader<TokenizerFactory>( "[schema.xml] analyzer/tokenizer", false, false )
{
@Override
@@ -790,8 +812,9 @@
}
};
filterLoader.load( loader, (NodeList)xpath.evaluate("./filter", node, XPathConstants.NODESET) );
-
- return new TokenizerChain(tokenizers.get(0), filters.toArray(new TokenFilterFactory[filters.size()]));
+
+ return new TokenizerChain(charFilters.toArray(new CharFilterFactory[charFilters.size()]),
+ tokenizers.get(0), filters.toArray(new TokenFilterFactory[filters.size()]));
};
Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestCharFilter.java?rev=713902&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestCharFilter.java (added)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestCharFilter.java Thu Nov 13 17:56:21 2008
@@ -0,0 +1,52 @@
+package org.apache.solr.analysis;
+
+import java.io.StringReader;
+
+import junit.framework.TestCase;
+
+/**
+ * Verifies that correctOffset() corrections compose through nested
+ * CharFilters: each filter applies its own correctPosition() on top of the
+ * correction of the stream it wraps, so stacking filters sums their shifts.
+ */
+public class TestCharFilter extends TestCase {
+
+ // single filter shifting by +1
+ public void testCharFilter1() throws Exception {
+ CharStream cs = new CharFilter1( new CharReader( new StringReader("") ) );
+ assertEquals( "corrected position is invalid", 1, cs.correctOffset( 0 ) );
+ }
+
+ // single filter shifting by +2
+ public void testCharFilter2() throws Exception {
+ CharStream cs = new CharFilter2( new CharReader( new StringReader("") ) );
+ assertEquals( "corrected position is invalid", 2, cs.correctOffset( 0 ) );
+ }
+
+ // two different filters stacked: corrections add up (+1 then +2 = +3)
+ public void testCharFilter12() throws Exception {
+ CharStream cs = new CharFilter2( new CharFilter1( new CharReader( new StringReader("") ) ) );
+ assertEquals( "corrected position is invalid", 3, cs.correctOffset( 0 ) );
+ }
+
+ // the same filter stacked twice: +1 applied at each level = +2
+ public void testCharFilter11() throws Exception {
+ CharStream cs = new CharFilter1( new CharFilter1( new CharReader( new StringReader("") ) ) );
+ assertEquals( "corrected position is invalid", 2, cs.correctOffset( 0 ) );
+ }
+
+ // identity filter whose only effect is shifting positions by +1
+ static class CharFilter1 extends CharFilter {
+
+ protected CharFilter1(CharStream in) {
+ super(in);
+ }
+
+ @Override
+ protected int correctPosition(int currentPos) {
+ return currentPos + 1;
+ }
+ }
+
+ // identity filter whose only effect is shifting positions by +2
+ static class CharFilter2 extends CharFilter {
+
+ protected CharFilter2(CharStream in) {
+ super(in);
+ }
+
+ @Override
+ protected int correctPosition(int currentPos) {
+ return currentPos + 2;
+ }
+ }
+}
Propchange: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestCharFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestCharFilter.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestMappingCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestMappingCharFilter.java?rev=713902&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestMappingCharFilter.java (added)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestMappingCharFilter.java Thu Nov 13 17:56:21 2008
@@ -0,0 +1,160 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.StringReader;
+import java.util.List;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+public class TestMappingCharFilter extends BaseTokenTestCase { // verifies MappingCharFilter token text and offset correction (SOLR-822)
+
+ NormalizeMap normMap; // mapping rules shared by all tests; rebuilt per test in setUp()
+
+ public void setUp() throws Exception { super.setUp(); // JUnit 3: run superclass fixture setup before ours
+ normMap = new NormalizeMap();
+
+ normMap.add( "aa", "a" ); // shrinking mappings (n chars -> fewer)
+ normMap.add( "bbb", "b" );
+ normMap.add( "cccc", "cc" );
+
+ normMap.add( "h", "i" ); // same-length mapping
+ normMap.add( "j", "jj" ); // growing mappings (n chars -> more)
+ normMap.add( "k", "kkk" );
+ normMap.add( "ll", "llll" );
+
+ normMap.add( "empty", "" ); // mapping to empty string deletes the matched text
+ }
+
+ public void testNothingChange() throws Exception { // input with no matching rule passes through untouched
+ CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "x" ) ) );
+ TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
+ List<Token> real = getTokens( ts );
+ List<Token> expect = tokens( "x" );
+ assertTokEqualOff( expect, real ); // offsets must also be unchanged
+ }
+
+ public void test1to1() throws Exception { // 1 char -> 1 char: text changes, offsets stay
+ CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "h" ) ) );
+ TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
+ List<Token> real = getTokens( ts );
+ List<Token> expect = tokens( "i" );
+ assertTokEqualOff( expect, real );
+ }
+
+ public void test1to2() throws Exception { // 1 char -> 2 chars: offsets still refer to original input (0,1)
+ CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "j" ) ) );
+ TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
+ List<Token> real = getTokens( ts );
+ List<Token> expect = tokens( "jj,1,0,1" ); // token,posInc,startOffset,endOffset
+ assertTokEqualOff( expect, real );
+ }
+
+ public void test1to3() throws Exception { // 1 char -> 3 chars
+ CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "k" ) ) );
+ TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
+ List<Token> real = getTokens( ts );
+ List<Token> expect = tokens( "kkk,1,0,1" );
+ assertTokEqualOff( expect, real );
+ }
+
+ public void test2to4() throws Exception { // 2 chars -> 4 chars
+ CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "ll" ) ) );
+ TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
+ List<Token> real = getTokens( ts );
+ List<Token> expect = tokens( "llll,1,0,2" );
+ assertTokEqualOff( expect, real );
+ }
+
+ public void test2to1() throws Exception { // 2 chars -> 1 char: end offset still spans the original 2 chars
+ CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "aa" ) ) );
+ TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
+ List<Token> real = getTokens( ts );
+ List<Token> expect = tokens( "a,1,0,2" );
+ assertTokEqualOff( expect, real );
+ }
+
+ public void test3to1() throws Exception { // 3 chars -> 1 char
+ CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "bbb" ) ) );
+ TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
+ List<Token> real = getTokens( ts );
+ List<Token> expect = tokens( "b,1,0,3" );
+ assertTokEqualOff( expect, real );
+ }
+
+ public void test4to2() throws Exception { // 4 chars -> 2 chars
+ CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "cccc" ) ) );
+ TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
+ List<Token> real = getTokens( ts );
+ List<Token> expect = tokens( "cc,1,0,4" );
+ assertTokEqualOff( expect, real );
+ }
+
+ public void test5to0() throws Exception { // mapping to "" deletes the text, so no token is produced
+ CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "empty" ) ) );
+ TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
+ List<Token> real = getTokens( ts );
+ assertEquals( 0, real.size() );
+ }
+
+ //
+ // 1111111111222
+ // 01234567890123456789012
+ //(in) h i j k ll cccc bbb aa
+ //
+ // 1111111111222
+ // 01234567890123456789012
+ //(out) i i jj kkk llll cc b a
+ //
+ // h, 0, 1 => i, 0, 1
+ // i, 2, 3 => i, 2, 3
+ // j, 4, 5 => jj, 4, 5
+ // k, 6, 7 => kkk, 6, 7
+ // ll, 8,10 => llll, 8,10
+ // cccc,11,15 => cc,11,15
+ // bbb,16,19 => b,16,19
+ // aa,20,22 => a,20,22
+ //
+ public void testTokenStream() throws Exception { // all mapping kinds in one stream; every offset refers to the input above
+ CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "h i j k ll cccc bbb aa" ) ) );
+ TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
+ List<Token> real = getTokens( ts );
+ List<Token> expect = tokens( "i,1,0,1 i,1,2,3 jj,1,4,5 kkk,1,6,7 llll,1,8,10 cc,1,11,15 b,1,16,19 a,1,20,22" );
+ assertTokEqualOff( expect, real );
+ }
+
+ //
+ //
+ // 0123456789
+ //(in) aaaa ll h
+ //(out-1) aa llll i
+ //(out-2) a llllllll i
+ //
+ // aaaa,0,4 => a,0,4
+ // ll,5,7 => llllllll,5,7
+ // h,8,9 => i,8,9
+ public void testChained() throws Exception { // two stacked MappingCharFilters: rules apply again to the first pass's output
+ CharStream cs = new MappingCharFilter( normMap,
+ new MappingCharFilter( normMap, new CharReader( new StringReader( "aaaa ll h" ) ) ) );
+ TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
+ List<Token> real = getTokens( ts );
+ List<Token> expect = tokens( "a,1,0,4 llllllll,1,5,7 i,1,8,9" ); // offsets still point at the ORIGINAL input
+ assertTokEqualOff( expect, real );
+ }
+}
Propchange: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestMappingCharFilter.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestMappingCharFilter.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Added: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestMappingCharFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestMappingCharFilterFactory.java?rev=713902&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestMappingCharFilterFactory.java (added)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestMappingCharFilterFactory.java Thu Nov 13 17:56:21 2008
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import junit.framework.TestCase;
+
+public class TestMappingCharFilterFactory extends TestCase { // unit tests for escape handling in MappingCharFilterFactory.parseString
+ public void testParseString() throws Exception {
+
+ MappingCharFilterFactory f = new MappingCharFilterFactory();
+
+ try {
+ f.parseString( "\\" ); // a lone backslash has nothing to escape
+ fail( "escape character cannot be alone." );
+ }
+ catch( RuntimeException expected ){} // expected: invalid escape rejected
+
+ assertEquals( "unexpected escaped characters",
+ "\\\"\n\t\r\b\f", f.parseString( "\\\\\\\"\\n\\t\\r\\b\\f" ) ); // standard Java-style escapes: backslash, quote, newline, tab, CR, backspace, formfeed
+ assertEquals( "unexpected escaped characters",
+ "A", f.parseString( "\\u0041" ) ); // 4-hex-digit unicode escape for 'A'
+ assertEquals( "unexpected escaped characters",
+ "AB", f.parseString( "\\u0041\\u0042" ) ); // consecutive unicode escapes
+
+ try {
+ f.parseString( "\\u000" ); // only 3 hex digits after the unicode escape
+ fail( "invalid length check." );
+ }
+ catch( RuntimeException expected ){} // expected: truncated unicode escape rejected
+
+ try {
+ f.parseString( "\\u123x" ); // 'x' is not a hex digit
+ fail( "invalid hex number check." );
+ }
+ catch( NumberFormatException expected ){} // expected: hex parsing fails
+ }
+}
Propchange: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestMappingCharFilterFactory.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestMappingCharFilterFactory.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Propchange: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestSynonymMap.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
Modified: lucene/solr/trunk/src/webapp/web/admin/analysis.jsp
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/webapp/web/admin/analysis.jsp?rev=713902&r1=713901&r2=713902&view=diff
==============================================================================
--- lucene/solr/trunk/src/webapp/web/admin/analysis.jsp (original)
+++ lucene/solr/trunk/src/webapp/web/admin/analysis.jsp Thu Nov 13 17:56:21 2008
@@ -181,9 +181,9 @@
TokenizerFactory tfac = tchain.getTokenizerFactory();
TokenFilterFactory[] filtfacs = tchain.getTokenFilterFactories();
- TokenStream tstream = tfac.create(reader);
+ TokenStream tstream = tfac.create(tchain.charStream(reader));
List<Token> tokens = getTokens(tstream);
- tstream = tfac.create(reader);
+ tstream = tfac.create(tchain.charStream(reader));
if (verbose) {
writeHeader(out, tfac.getClass(), tfac.getArgs());
}