You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2014/11/08 00:12:15 UTC
[11/34] lucenenet git commit: Raw porting of
Lucene.Net.Analysis.Common
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Pattern/PatternReplaceFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Pattern/PatternReplaceFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Pattern/PatternReplaceFilter.cs
new file mode 100644
index 0000000..1876a21
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Pattern/PatternReplaceFilter.cs
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace org.apache.lucene.analysis.pattern
+{
+
+ using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+
+ /// <summary>
+ /// A TokenFilter which applies a Pattern to each token in the stream,
+ /// replacing match occurrences with the specified replacement string.
+ ///
+ /// <para>
+ /// <b>Note:</b> Depending on the pattern used and the input
+ /// TokenStream, this TokenFilter may produce Tokens whose text is the empty
+ /// string.
+ /// </para>
+ /// </summary>
+ /// <seealso cref="Pattern"/>
+ public sealed class PatternReplaceFilter : TokenFilter
+ {
+     private readonly string replacement;
+     private readonly bool all;
+     // Assigned in the constructor: a C# field initializer may not call an
+     // instance method such as addAttribute (compiler error CS0236).
+     private readonly CharTermAttribute termAtt;
+     private readonly Matcher m;
+
+     /// <summary>
+     /// Constructs an instance to replace either the first, or all occurrences
+     /// </summary>
+     /// <param name="in"> the TokenStream to process </param>
+     /// <param name="p"> the pattern to apply to each Token </param>
+     /// <param name="replacement"> the "replacement string" to substitute, if null a
+     ///        blank string will be used. Note that this is not the literal
+     ///        string that will be used, '$' and '\' have special meaning
+     ///        (see Matcher.quoteReplacement). </param>
+     /// <param name="all"> if true, all matches will be replaced otherwise just the first match. </param>
+     /// <seealso cref="Matcher"/>
+     public PatternReplaceFilter(TokenStream @in, Pattern p, string replacement, bool all) : base(@in)
+     {
+         // termAtt must be set before the matcher is built, because the matcher
+         // is created over the term attribute itself.
+         this.termAtt = addAttribute(typeof(CharTermAttribute));
+         this.replacement = (null == replacement) ? "" : replacement;
+         this.all = all;
+         this.m = p.matcher(termAtt);
+     }
+
+     /// <summary>
+     /// Advances the wrapped stream by one token and, when the pattern matches
+     /// the token text, overwrites the term with the replaced text.
+     /// </summary>
+     /// <returns> false when the wrapped stream has no more tokens, true otherwise </returns>
+     public override bool incrementToken()
+     {
+         if (!input.incrementToken())
+         {
+             return false;
+         }
+
+         m.reset();
+         if (m.find())
+         {
+             // replaceAll/replaceFirst will reset() this previous find.
+             string transformed = all ? m.replaceAll(replacement) : m.replaceFirst(replacement);
+             termAtt.setEmpty().append(transformed);
+         }
+
+         return true;
+     }
+
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Pattern/PatternReplaceFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Pattern/PatternReplaceFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Pattern/PatternReplaceFilterFactory.cs
new file mode 100644
index 0000000..d030789
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Pattern/PatternReplaceFilterFactory.cs
@@ -0,0 +1,64 @@
+using System.Collections.Generic;
+using TokenFilterFactory = Lucene.Net.Analysis.Util.TokenFilterFactory;
+
+namespace org.apache.lucene.analysis.pattern
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using TokenFilterFactory = TokenFilterFactory;
+
+
+ /// <summary>
+ /// Factory for <seealso cref="PatternReplaceFilter"/>.
+ /// <pre class="prettyprint">
+ /// <fieldType name="text_ptnreplace" class="solr.TextField" positionIncrementGap="100">
+ /// <analyzer>
+ /// <tokenizer class="solr.KeywordTokenizerFactory"/>
+ /// <filter class="solr.PatternReplaceFilterFactory" pattern="([^a-z])" replacement=""
+ /// replace="all"/>
+ /// </analyzer>
+ /// </fieldType></pre>
+ /// </summary>
+ /// <seealso cref="PatternReplaceFilter"/>
+ public class PatternReplaceFilterFactory : TokenFilterFactory
+ {
+     internal readonly Pattern pattern;
+     internal readonly string replacement;
+     internal readonly bool replaceAll;
+
+     /// <summary>
+     /// Creates a new PatternReplaceFilterFactory from the "pattern",
+     /// "replacement" and "replace" arguments ("replace" is "all" or "first",
+     /// defaulting to "all").
+     /// </summary>
+     /// <param name="args"> the factory arguments; any entry still present after the
+     ///        known ones have been read is reported as unknown </param>
+     /// <exception cref="System.ArgumentException"> if unknown parameters remain in <paramref name="args"/> </exception>
+     public PatternReplaceFilterFactory(IDictionary<string, string> args) : base(args)
+     {
+         pattern = getPattern(args, "pattern");
+         replacement = get(args, "replacement");
+         replaceAll = "all".Equals(get(args, "replace", Arrays.asList("all", "first"), "all"));
+         if (args.Count > 0)
+         {
+             // Concatenating the dictionary itself would only print its type name
+             // in .NET; join the entries (each rendered as "[key, value]") so the
+             // offending parameters are actually visible in the message.
+             throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args));
+         }
+     }
+
+     /// <summary>
+     /// Wraps <paramref name="input"/> in a <see cref="PatternReplaceFilter"/> using the configured settings.
+     /// </summary>
+     public override PatternReplaceFilter create(TokenStream input)
+     {
+         return new PatternReplaceFilter(input, pattern, replacement, replaceAll);
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Pattern/PatternTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Pattern/PatternTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Pattern/PatternTokenizer.cs
new file mode 100644
index 0000000..d403494
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Pattern/PatternTokenizer.cs
@@ -0,0 +1,185 @@
+using System.Text;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace org.apache.lucene.analysis.pattern
+{
+
+ using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+ using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+
+ /// <summary>
+ /// This tokenizer uses regex pattern matching to construct distinct tokens
+ /// for the input stream. It takes two arguments: "pattern" and "group".
+ /// <p/>
+ /// <ul>
+ /// <li>"pattern" is the regular expression.</li>
+ /// <li>"group" says which group to extract into tokens.</li>
+ /// </ul>
+ /// <para>
+ /// group=-1 (the default) is equivalent to "split". In this case, the tokens will
+ /// be equivalent to the output from (without empty tokens):
+ /// Java's <c>String.split(String)</c>
+ /// </para>
+ /// <para>
+ /// Using group >= 0 selects the matching group as the token. For example, if you have:<br/>
+ /// <pre>
+ /// pattern = \'([^\']+)\'
+ /// group = 0
+ /// input = aaa 'bbb' 'ccc'
+ /// </pre>
+ /// the output will be two tokens: 'bbb' and 'ccc' (including the ' marks). With the same input
+ /// but using group=1, the output would be: bbb and ccc (no ' marks)
+ /// </para>
+ /// <para>NOTE: This Tokenizer does not output tokens that are of zero length.</para>
+ /// </summary>
+ /// <seealso cref="Pattern"/>
+ public sealed class PatternTokenizer : Tokenizer
+ {
+
+     // Both attributes are assigned in the constructor: a C# field initializer
+     // may not call an instance method such as addAttribute (compiler error CS0236).
+     private readonly CharTermAttribute termAtt;
+     private readonly OffsetAttribute offsetAtt;
+
+     // Holds the complete input text; filled by reset().
+     private readonly StringBuilder str = new StringBuilder();
+     // Position in str where the next token may start; int.MaxValue marks exhaustion.
+     private int index;
+
+     private readonly int group;
+     private readonly Matcher matcher;
+
+     /// <summary>
+     /// creates a new PatternTokenizer returning tokens from group (-1 for split functionality) </summary>
+     public PatternTokenizer(Reader input, Pattern pattern, int group) : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, pattern, group)
+     {
+     }
+
+     /// <summary>
+     /// creates a new PatternTokenizer returning tokens from group (-1 for split functionality) </summary>
+     /// <exception cref="System.ArgumentException"> if <paramref name="group"/> is greater than the
+     ///         number of capturing groups in <paramref name="pattern"/> </exception>
+     public PatternTokenizer(AttributeFactory factory, Reader input, Pattern pattern, int group) : base(factory, input)
+     {
+         this.termAtt = addAttribute(typeof(CharTermAttribute));
+         this.offsetAtt = addAttribute(typeof(OffsetAttribute));
+         this.group = group;
+
+         // Use "" instead of str so don't consume chars
+         // (fillBuffer) from the input on throwing IAE below:
+         matcher = pattern.matcher("");
+
+         // confusingly group count depends ENTIRELY on the pattern but is only accessible via matcher
+         if (group >= 0 && group > matcher.groupCount())
+         {
+             throw new System.ArgumentException("invalid group specified: pattern only has: " + matcher.groupCount() + " capturing groups");
+         }
+     }
+
+     /// <summary>
+     /// Produces the next non-empty token: the text of the configured group for
+     /// each match when group >= 0, otherwise the text between matches.
+     /// </summary>
+     /// <returns> true if a token was produced, false once the input is exhausted </returns>
+     public override bool incrementToken()
+     {
+         if (index >= str.Length)
+         {
+             return false;
+         }
+         clearAttributes();
+         if (group >= 0)
+         {
+
+             // match a specific group
+             while (matcher.find())
+             {
+                 index = matcher.start(group);
+                 int endIndex = matcher.end(group);
+                 if (index == endIndex)
+                 {
+                     // zero-length group text: skip it, try the next match
+                     continue;
+                 }
+                 termAtt.setEmpty().append(str, index, endIndex);
+                 offsetAtt.setOffset(correctOffset(index), correctOffset(endIndex));
+                 return true;
+             }
+
+             index = int.MaxValue; // mark exhausted
+             return false;
+
+         }
+         else
+         {
+
+             // String.split() functionality
+             while (matcher.find())
+             {
+                 if (matcher.start() - index > 0)
+                 {
+                     // found a non-zero-length token
+                     termAtt.setEmpty().append(str, index, matcher.start());
+                     offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.start()));
+                     index = matcher.end();
+                     return true;
+                 }
+
+                 index = matcher.end();
+             }
+
+             if (str.Length - index == 0)
+             {
+                 index = int.MaxValue; // mark exhausted
+                 return false;
+             }
+
+             // remaining text after the last delimiter match is the final token
+             termAtt.setEmpty().append(str, index, str.Length);
+             offsetAtt.setOffset(correctOffset(index), correctOffset(str.Length));
+             index = int.MaxValue; // mark exhausted
+             return true;
+         }
+     }
+
+     /// <summary>
+     /// Sets the final offset to the corrected length of the buffered input.
+     /// </summary>
+     public override void end()
+     {
+         base.end();
+         int ofs = correctOffset(str.Length);
+         offsetAtt.setOffset(ofs, ofs);
+     }
+
+     /// <summary>
+     /// Reads the whole input into the internal buffer, points the matcher at it
+     /// and rewinds the token position to the start.
+     /// </summary>
+     public override void reset()
+     {
+         base.reset();
+         fillBuffer(str, input);
+         matcher.reset(str);
+         index = 0;
+     }
+
+     // TODO: we should see if we can make this tokenizer work without reading
+     // the entire document into RAM, perhaps with Matcher.hitEnd/requireEnd ?
+     internal readonly char[] buffer = new char[8192];
+
+     /// <summary>
+     /// Clears <paramref name="sb"/> and appends everything read from
+     /// <paramref name="input"/>, one chunk of the scratch buffer at a time,
+     /// until a read returns no characters.
+     /// </summary>
+     private void fillBuffer(StringBuilder sb, Reader input)
+     {
+         int len;
+         sb.Length = 0;
+         while ((len = input.read(buffer)) > 0)
+         {
+             sb.Append(buffer, 0, len);
+         }
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Pattern/PatternTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Pattern/PatternTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Pattern/PatternTokenizerFactory.cs
new file mode 100644
index 0000000..c0b15c9
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Pattern/PatternTokenizerFactory.cs
@@ -0,0 +1,94 @@
+using System.Collections.Generic;
+using TokenizerFactory = Lucene.Net.Analysis.Util.TokenizerFactory;
+
+namespace org.apache.lucene.analysis.pattern
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ using TokenizerFactory = TokenizerFactory;
+ using AttributeFactory = org.apache.lucene.util.AttributeSource.AttributeFactory;
+
+ /// <summary>
+ /// Factory for <seealso cref="PatternTokenizer"/>.
+ /// This tokenizer uses regex pattern matching to construct distinct tokens
+ /// for the input stream. It takes two arguments: "pattern" and "group".
+ /// <p/>
+ /// <ul>
+ /// <li>"pattern" is the regular expression.</li>
+ /// <li>"group" says which group to extract into tokens.</li>
+ /// </ul>
+ /// <para>
+ /// group=-1 (the default) is equivalent to "split". In this case, the tokens will
+ /// be equivalent to the output from (without empty tokens):
+ /// Java's <c>String.split(String)</c>
+ /// </para>
+ /// <para>
+ /// Using group >= 0 selects the matching group as the token. For example, if you have:<br/>
+ /// <pre>
+ /// pattern = \'([^\']+)\'
+ /// group = 0
+ /// input = aaa 'bbb' 'ccc'
+ /// </pre>
+ /// the output will be two tokens: 'bbb' and 'ccc' (including the ' marks). With the same input
+ /// but using group=1, the output would be: bbb and ccc (no ' marks)
+ /// </para>
+ /// <para>NOTE: This Tokenizer does not output tokens that are of zero length.</para>
+ ///
+ /// <pre class="prettyprint">
+ /// <fieldType name="text_ptn" class="solr.TextField" positionIncrementGap="100">
+ /// <analyzer>
+ /// <tokenizer class="solr.PatternTokenizerFactory" pattern="\'([^\']+)\'" group="1"/>
+ /// </analyzer>
+ /// </fieldType></pre>
+ /// (since solr1.2)
+ /// </summary>
+ /// <seealso cref="PatternTokenizer"/>
+ public class PatternTokenizerFactory : TokenizerFactory
+ {
+     public const string PATTERN = "pattern";
+     public const string GROUP = "group";
+
+     protected internal readonly Pattern pattern;
+     protected internal readonly int group;
+
+     /// <summary>
+     /// Creates a new PatternTokenizerFactory from the "pattern" argument and
+     /// the optional "group" argument (defaults to -1).
+     /// </summary>
+     /// <param name="args"> the factory arguments; any entry still present after the
+     ///        known ones have been read is reported as unknown </param>
+     /// <exception cref="System.ArgumentException"> if unknown parameters remain in <paramref name="args"/> </exception>
+     public PatternTokenizerFactory(IDictionary<string, string> args) : base(args)
+     {
+         pattern = getPattern(args, PATTERN);
+         group = getInt(args, GROUP, -1);
+         if (args.Count > 0)
+         {
+             // Concatenating the dictionary itself would only print its type name
+             // in .NET; join the entries (each rendered as "[key, value]") so the
+             // offending parameters are actually visible in the message.
+             throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args));
+         }
+     }
+
+     /// <summary>
+     /// Split the input using configured pattern
+     /// </summary>
+     public override PatternTokenizer create(AttributeFactory factory, Reader @in)
+     {
+         return new PatternTokenizer(factory, @in, pattern, group);
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Payloads/AbstractEncoder.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Payloads/AbstractEncoder.cs b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/AbstractEncoder.cs
new file mode 100644
index 0000000..a336f2b
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/AbstractEncoder.cs
@@ -0,0 +1,39 @@
+namespace org.apache.lucene.analysis.payloads
+{
+
+ using BytesRef = org.apache.lucene.util.BytesRef;
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+
+ /// <summary>
+ /// Common base for payload encoders: supplies the whole-array overload in
+ /// terms of the ranged overload that concrete encoders implement.
+ /// </summary>
+ public abstract class AbstractEncoder : PayloadEncoder
+ {
+     /// <summary>
+     /// Encodes the entire array by delegating to the ranged overload with
+     /// offset 0 and the array's full length.
+     /// </summary>
+     public virtual BytesRef encode(char[] buffer)
+     {
+         int wholeLength = buffer.Length;
+         return encode(buffer, 0, wholeLength);
+     }
+
+     /// <summary>
+     /// Encodes <paramref name="length"/> characters of <paramref name="buffer"/>
+     /// starting at <paramref name="offset"/>.
+     /// </summary>
+     public abstract BytesRef encode(char[] buffer, int offset, int length);
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Payloads/DelimitedPayloadTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Payloads/DelimitedPayloadTokenFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/DelimitedPayloadTokenFilter.cs
new file mode 100644
index 0000000..1c03b4e
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/DelimitedPayloadTokenFilter.cs
@@ -0,0 +1,82 @@
+namespace org.apache.lucene.analysis.payloads
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using PayloadAttribute = org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+ using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+
+ /// <summary>
+ /// Characters before the delimiter are the "token", those after are the payload.
+ /// <p/>
+ /// For example, if the delimiter is '|', then for the string "foo|bar", foo is the token
+ /// and "bar" is a payload.
+ /// <p/>
+ /// Note, you can also include a <seealso cref="PayloadEncoder"/> to convert the payload in an appropriate way (from characters to bytes).
+ /// <p/>
+ /// Note make sure your Tokenizer doesn't split on the delimiter, or this won't work
+ /// </summary>
+ /// <seealso cref="PayloadEncoder"/>
+ public sealed class DelimitedPayloadTokenFilter : TokenFilter
+ {
+     public const char DEFAULT_DELIMITER = '|';
+     private readonly char delimiter;
+     // Both attributes are assigned in the constructor: a C# field initializer
+     // may not call an instance method such as addAttribute (compiler error CS0236).
+     private readonly CharTermAttribute termAtt;
+     private readonly PayloadAttribute payAtt;
+     private readonly PayloadEncoder encoder;
+
+
+     /// <summary>
+     /// Creates a filter that splits each token of <paramref name="input"/> at the
+     /// first <paramref name="delimiter"/> and encodes the trailing part with
+     /// <paramref name="encoder"/>.
+     /// </summary>
+     public DelimitedPayloadTokenFilter(TokenStream input, char delimiter, PayloadEncoder encoder) : base(input)
+     {
+         this.termAtt = addAttribute(typeof(CharTermAttribute));
+         this.payAtt = addAttribute(typeof(PayloadAttribute));
+         this.delimiter = delimiter;
+         this.encoder = encoder;
+     }
+
+     /// <summary>
+     /// Advances the wrapped stream; if the term contains the delimiter, the
+     /// characters after its first occurrence become the payload and the term
+     /// is truncated to the characters before it. Otherwise the payload is cleared.
+     /// </summary>
+     /// <returns> false when the wrapped stream has no more tokens, true otherwise </returns>
+     public override bool incrementToken()
+     {
+         if (!input.incrementToken())
+         {
+             return false;
+         }
+
+         char[] buffer = termAtt.buffer();
+         int length = termAtt.length();
+         for (int i = 0; i < length; i++)
+         {
+             if (buffer[i] == delimiter)
+             {
+                 payAtt.Payload = encoder.encode(buffer, i + 1, (length - (i + 1)));
+                 termAtt.Length = i; // simply set a new length
+                 return true;
+             }
+         }
+         // we have not seen the delimiter
+         payAtt.Payload = null;
+         return true;
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Payloads/DelimitedPayloadTokenFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Payloads/DelimitedPayloadTokenFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/DelimitedPayloadTokenFilterFactory.cs
new file mode 100644
index 0000000..4d5dd75
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/DelimitedPayloadTokenFilterFactory.cs
@@ -0,0 +1,85 @@
+using System.Collections.Generic;
+using TokenFilterFactory = Lucene.Net.Analysis.Util.TokenFilterFactory;
+
+namespace org.apache.lucene.analysis.payloads
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using ResourceLoader = org.apache.lucene.analysis.util.ResourceLoader;
+ using ResourceLoaderAware = org.apache.lucene.analysis.util.ResourceLoaderAware;
+ using TokenFilterFactory = TokenFilterFactory;
+
+ /// <summary>
+ /// Factory for <seealso cref="DelimitedPayloadTokenFilter"/>.
+ /// <pre class="prettyprint">
+ /// <fieldType name="text_dlmtd" class="solr.TextField" positionIncrementGap="100">
+ /// <analyzer>
+ /// <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ /// <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float" delimiter="|"/>
+ /// </analyzer>
+ /// </fieldType></pre>
+ /// </summary>
+ public class DelimitedPayloadTokenFilterFactory : TokenFilterFactory, ResourceLoaderAware
+ {
+     public const string ENCODER_ATTR = "encoder";
+     public const string DELIMITER_ATTR = "delimiter";
+
+     private readonly string encoderClass;
+     private readonly char delimiter;
+
+     // Resolved in inform(); stays null until inform() has been called.
+     private PayloadEncoder encoder;
+
+     /// <summary>
+     /// Creates a new DelimitedPayloadTokenFilterFactory from the required
+     /// "encoder" argument and the optional "delimiter" argument (defaults to '|').
+     /// </summary>
+     /// <param name="args"> the factory arguments; any entry still present after the
+     ///        known ones have been read is reported as unknown </param>
+     /// <exception cref="System.ArgumentException"> if unknown parameters remain in <paramref name="args"/> </exception>
+     public DelimitedPayloadTokenFilterFactory(IDictionary<string, string> args) : base(args)
+     {
+         encoderClass = require(args, ENCODER_ATTR);
+         delimiter = getChar(args, DELIMITER_ATTR, '|');
+         if (args.Count > 0)
+         {
+             // Concatenating the dictionary itself would only print its type name
+             // in .NET; join the entries (each rendered as "[key, value]") so the
+             // offending parameters are actually visible in the message.
+             throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args));
+         }
+     }
+
+     /// <summary>
+     /// Wraps <paramref name="input"/> in a <see cref="DelimitedPayloadTokenFilter"/>
+     /// using the configured delimiter and the encoder resolved by inform().
+     /// </summary>
+     public override DelimitedPayloadTokenFilter create(TokenStream input)
+     {
+         return new DelimitedPayloadTokenFilter(input, delimiter, encoder);
+     }
+
+     /// <summary>
+     /// Resolves the encoder: "float", "integer" and "identity" map to the
+     /// built-in encoders; any other value is instantiated through
+     /// <paramref name="loader"/> as a <see cref="PayloadEncoder"/>.
+     /// </summary>
+     public virtual void inform(ResourceLoader loader)
+     {
+         if (encoderClass.Equals("float"))
+         {
+             encoder = new FloatEncoder();
+         }
+         else if (encoderClass.Equals("integer"))
+         {
+             encoder = new IntegerEncoder();
+         }
+         else if (encoderClass.Equals("identity"))
+         {
+             encoder = new IdentityEncoder();
+         }
+         else
+         {
+             encoder = loader.newInstance(encoderClass, typeof(PayloadEncoder));
+         }
+     }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Payloads/FloatEncoder.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Payloads/FloatEncoder.cs b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/FloatEncoder.cs
new file mode 100644
index 0000000..05cb90d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/FloatEncoder.cs
@@ -0,0 +1,41 @@
+namespace org.apache.lucene.analysis.payloads
+{
+
+ using BytesRef = org.apache.lucene.util.BytesRef;
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Encode a character array Float as a <seealso cref="BytesRef"/>.
+ /// <p/>
+ /// The text is parsed with the invariant culture, so the decimal separator
+ /// is always '.' regardless of the current thread culture.
+ /// </summary>
+ /// <seealso cref="PayloadHelper"/>
+ public class FloatEncoder : AbstractEncoder, PayloadEncoder
+ {
+
+     /// <summary>
+     /// Parses <paramref name="length"/> characters of <paramref name="buffer"/>
+     /// starting at <paramref name="offset"/> as a float and returns its
+     /// encoded bytes.
+     /// </summary>
+     public override BytesRef encode(char[] buffer, int offset, int length)
+     {
+         // Payload text is machine-readable data: parse it culture-invariantly,
+         // otherwise "1.5" fails or is misread under comma-decimal cultures.
+         float payload = float.Parse(new string(buffer, offset, length), System.Globalization.CultureInfo.InvariantCulture); //TODO: improve this so that we don't have to new Strings
+         sbyte[] bytes = PayloadHelper.encodeFloat(payload);
+         BytesRef result = new BytesRef(bytes);
+         return result;
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Payloads/IdentityEncoder.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Payloads/IdentityEncoder.cs b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/IdentityEncoder.cs
new file mode 100644
index 0000000..f204244
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/IdentityEncoder.cs
@@ -0,0 +1,63 @@
+namespace org.apache.lucene.analysis.payloads
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ using BytesRef = org.apache.lucene.util.BytesRef;
+
+
+ /// <summary>
+ /// Passes the payload characters through unchanged apart from converting
+ /// them to bytes with the configured charset (UTF-8 unless one is supplied).
+ /// </summary>
+ public class IdentityEncoder : AbstractEncoder, PayloadEncoder
+ {
+     protected internal Charset charset;
+
+     /// <summary>
+     /// Creates an encoder that uses UTF-8. </summary>
+     public IdentityEncoder() : this(StandardCharsets.UTF_8)
+     {
+     }
+
+     /// <summary>
+     /// Creates an encoder that uses the given charset. </summary>
+     public IdentityEncoder(Charset charset)
+     {
+         this.charset = charset;
+     }
+
+     /// <summary>
+     /// Encodes <paramref name="length"/> characters of <paramref name="buffer"/>
+     /// starting at <paramref name="offset"/> with the configured charset.
+     /// </summary>
+     public override BytesRef encode(char[] buffer, int offset, int length)
+     {
+         ByteBuffer encoded = charset.encode(CharBuffer.wrap(buffer, offset, length));
+         if (!encoded.hasArray())
+         {
+             // normally it should always have an array, but who knows?
+             // Fall back to copying the remaining bytes out of the buffer.
+             sbyte[] copy = new sbyte[encoded.remaining()];
+             encoded.get(copy);
+             return new BytesRef(copy);
+         }
+
+         // Expose the backing array directly, limited to the encoded region.
+         return new BytesRef(encoded.array(), encoded.arrayOffset() + encoded.position(), encoded.remaining());
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Payloads/IntegerEncoder.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Payloads/IntegerEncoder.cs b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/IntegerEncoder.cs
new file mode 100644
index 0000000..1533f9e
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/IntegerEncoder.cs
@@ -0,0 +1,42 @@
+namespace org.apache.lucene.analysis.payloads
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using ArrayUtil = org.apache.lucene.util.ArrayUtil;
+ using BytesRef = org.apache.lucene.util.BytesRef;
+
+
+ /// <summary>
+ /// Encodes an integer, supplied as its character representation, into a <seealso cref="BytesRef"/>.
+ /// <para>
+ /// The characters are parsed with <c>ArrayUtil.parseInt</c> and the resulting value is
+ /// serialized with <c>PayloadHelper.encodeInt(int)</c>.
+ /// </para>
+ /// </summary>
+ public class IntegerEncoder : AbstractEncoder, PayloadEncoder
+ {
+     /// <summary>
+     /// Parses <paramref name="length"/> chars of <paramref name="buffer"/>, starting at
+     /// <paramref name="offset"/>, as an int and wraps its encoded bytes.
+     /// </summary>
+     /// <returns> a <seealso cref="BytesRef"/> over the bytes produced by <c>PayloadHelper.encodeInt</c> </returns>
+     public override BytesRef encode(char[] buffer, int offset, int length)
+     {
+         //TODO: improve this so that we don't have to new Strings
+         return new BytesRef(PayloadHelper.encodeInt(ArrayUtil.parseInt(buffer, offset, length)));
+     }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Payloads/NumericPayloadTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Payloads/NumericPayloadTokenFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/NumericPayloadTokenFilter.cs
new file mode 100644
index 0000000..629fef0
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/NumericPayloadTokenFilter.cs
@@ -0,0 +1,70 @@
+namespace org.apache.lucene.analysis.payloads
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ using PayloadAttribute = org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+ using TypeAttribute = org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+ using BytesRef = org.apache.lucene.util.BytesRef;
+
+
+ /// <summary>
+ /// Assigns a fixed, float-encoded payload to every token whose type equals a configured type name.
+ /// Tokens of any other type are passed through without touching their payload.
+ /// </summary>
+ public class NumericPayloadTokenFilter : TokenFilter
+ {
+     private readonly string typeMatch;
+     private readonly BytesRef thePayload;
+
+     // Assigned in the constructor: a C# field initializer may not call an instance
+     // method such as addAttribute (compiler error CS0236).
+     private readonly PayloadAttribute payloadAtt;
+     private readonly TypeAttribute typeAtt;
+
+     /// <summary>
+     /// Creates a filter that attaches <paramref name="payload"/> to tokens of type <paramref name="typeMatch"/>.
+     /// </summary>
+     /// <param name="input"> the token stream to wrap </param>
+     /// <param name="payload"> the value to attach; it is encoded once with <c>PayloadHelper.encodeFloat</c> </param>
+     /// <param name="typeMatch"> the token type that receives the payload; must not be null </param>
+     /// <exception cref="System.ArgumentException"> if <paramref name="typeMatch"/> is null </exception>
+     public NumericPayloadTokenFilter(TokenStream input, float payload, string typeMatch) : base(input)
+     {
+         // Registered before validation, mirroring the order in which the original
+         // field initializers ran.
+         payloadAtt = addAttribute(typeof(PayloadAttribute));
+         typeAtt = addAttribute(typeof(TypeAttribute));
+
+         if (typeMatch == null)
+         {
+             throw new System.ArgumentException("typeMatch cannot be null");
+         }
+         //Need to encode the payload
+         thePayload = new BytesRef(PayloadHelper.encodeFloat(payload));
+         this.typeMatch = typeMatch;
+     }
+
+     /// <summary>
+     /// Advances the wrapped stream and, when the current token's type equals the configured
+     /// type, sets the shared payload on it.
+     /// </summary>
+     /// <returns> false once the wrapped stream is exhausted, true otherwise </returns>
+     public override bool incrementToken()
+     {
+         if (!input.incrementToken())
+         {
+             return false;
+         }
+         if (typeAtt.type().Equals(typeMatch))
+         {
+             payloadAtt.Payload = thePayload;
+         }
+         return true;
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Payloads/NumericPayloadTokenFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Payloads/NumericPayloadTokenFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/NumericPayloadTokenFilterFactory.cs
new file mode 100644
index 0000000..c6e21d6
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/NumericPayloadTokenFilterFactory.cs
@@ -0,0 +1,60 @@
+using System.Collections.Generic;
+using TokenFilterFactory = Lucene.Net.Analysis.Util.TokenFilterFactory;
+
+namespace org.apache.lucene.analysis.payloads
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using TokenFilterFactory = TokenFilterFactory;
+
+ /// <summary>
+ /// Factory for <seealso cref="NumericPayloadTokenFilter"/>.
+ /// <code>
+ /// &lt;fieldType name="text_numpayload" class="solr.TextField" positionIncrementGap="100"&gt;
+ ///   &lt;analyzer&gt;
+ ///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ ///     &lt;filter class="solr.NumericPayloadTokenFilterFactory" payload="24" typeMatch="word"/&gt;
+ ///   &lt;/analyzer&gt;
+ /// &lt;/fieldType&gt;</code>
+ /// </summary>
+ public class NumericPayloadTokenFilterFactory : TokenFilterFactory
+ {
+     private readonly float payload;
+     private readonly string typeMatch;
+
+     /// <summary>
+     /// Creates a new NumericPayloadTokenFilterFactory from the "payload" and "typeMatch" arguments.
+     /// </summary>
+     /// <param name="args"> factory arguments; any entry still present after the known
+     /// parameters have been read is rejected (this presumes the base-class helpers remove
+     /// what they consume - confirm against TokenFilterFactory) </param>
+     /// <exception cref="System.ArgumentException"> if unknown parameters remain in <paramref name="args"/> </exception>
+     public NumericPayloadTokenFilterFactory(IDictionary<string, string> args) : base(args)
+     {
+         payload = requireFloat(args, "payload");
+         typeMatch = require(args, "typeMatch");
+         if (args.Count > 0)
+         {
+             // The default ToString() of a .NET dictionary yields only its type name,
+             // so the leftover keys are listed explicitly.
+             throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+         }
+     }
+
+     /// <summary>Wraps <paramref name="input"/> in a filter configured with this factory's payload and type.</summary>
+     public override NumericPayloadTokenFilter create(TokenStream input)
+     {
+         return new NumericPayloadTokenFilter(input, payload, typeMatch);
+     }
+ }
+
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Payloads/PayloadEncoder.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Payloads/PayloadEncoder.cs b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/PayloadEncoder.cs
new file mode 100644
index 0000000..312f335
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/PayloadEncoder.cs
@@ -0,0 +1,43 @@
+namespace org.apache.lucene.analysis.payloads
+{
+
+ using BytesRef = org.apache.lucene.util.BytesRef;
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ /// <summary>
+ /// Converts char buffers to <seealso cref="BytesRef"/> payloads; mainly intended for use
+ /// with the DelimitedPayloadTokenFilter.
+ /// <para>
+ /// NOTE: This interface is subject to change.
+ /// </para>
+ /// Implementations in this package include IdentityEncoder and IntegerEncoder.
+ /// </summary>
+ public interface PayloadEncoder
+ {
+ /// <summary> Convert a char array (presumably all of it - confirm against AbstractEncoder) to a <seealso cref="BytesRef"/> </summary>
+ BytesRef encode(char[] buffer);
+
+ /// <summary>
+ /// Convert <c>length</c> chars of a char array, starting at <c>offset</c>, to a <seealso cref="BytesRef"/> </summary>
+ /// <returns> encoded <seealso cref="BytesRef"/> </returns>
+ BytesRef encode(char[] buffer, int offset, int length);
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Payloads/PayloadHelper.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Payloads/PayloadHelper.cs b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/PayloadHelper.cs
new file mode 100644
index 0000000..00eb10e
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/PayloadHelper.cs
@@ -0,0 +1,81 @@
+namespace org.apache.lucene.analysis.payloads
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ /// <summary>
+ /// Utility methods for encoding payloads. Ints, and floats via their IEEE 754 bit
+ /// pattern, are stored as four bytes with the most significant byte first.
+ /// </summary>
+ public class PayloadHelper
+ {
+     /// <summary>Encodes <paramref name="payload"/> into a newly allocated four-byte array.</summary>
+     /// <returns> the new array </returns>
+     public static sbyte[] encodeFloat(float payload)
+     {
+         return encodeFloat(payload, new sbyte[4], 0);
+     }
+
+     /// <summary>
+     /// Writes the bit pattern of <paramref name="payload"/> into <paramref name="data"/>
+     /// at <paramref name="offset"/>. </summary>
+     /// <param name="payload"> The float to encode; every NaN is stored as the canonical NaN </param>
+     /// <param name="data"> The destination; must have at least offset + 4 elements </param>
+     /// <param name="offset"> The index of the first byte to write </param>
+     /// <returns> <paramref name="data"/> </returns>
+     public static sbyte[] encodeFloat(float payload, sbyte[] data, int offset)
+     {
+         return encodeInt(FloatToIntBits(payload), data, offset);
+     }
+
+     /// <summary>Encodes <paramref name="payload"/> into a newly allocated four-byte array.</summary>
+     /// <returns> the new array </returns>
+     public static sbyte[] encodeInt(int payload)
+     {
+         return encodeInt(payload, new sbyte[4], 0);
+     }
+
+     /// <summary>
+     /// Writes <paramref name="payload"/> into <paramref name="data"/> at
+     /// <paramref name="offset"/>, most significant byte first. </summary>
+     /// <param name="payload"> The int to encode </param>
+     /// <param name="data"> The destination; must have at least offset + 4 elements </param>
+     /// <param name="offset"> The index of the first byte to write </param>
+     /// <returns> <paramref name="data"/> </returns>
+     public static sbyte[] encodeInt(int payload, sbyte[] data, int offset)
+     {
+         // The casts deliberately keep only the low eight bits; 'unchecked' makes that
+         // truncation explicit so it also works when compiled with overflow checking on.
+         unchecked
+         {
+             data[offset] = (sbyte)(payload >> 24);
+             data[offset + 1] = (sbyte)(payload >> 16);
+             data[offset + 2] = (sbyte)(payload >> 8);
+             data[offset + 3] = (sbyte)payload;
+         }
+         return data;
+     }
+
+     /// <summary>Decodes a float from the first four bytes of <paramref name="bytes"/>.</summary>
+     /// <seealso cref="decodeFloat(sbyte[], int)"/>
+     /// <seealso cref="encodeFloat(float)"/>
+     /// <returns> the decoded float </returns>
+     public static float decodeFloat(sbyte[] bytes)
+     {
+         return decodeFloat(bytes, 0);
+     }
+
+     /// <summary>
+     /// Decode the payload that was encoded using <seealso cref="encodeFloat(float)"/>.
+     /// NOTE: the length of the array must be at least offset + 4 long. </summary>
+     /// <param name="bytes"> The bytes to decode </param>
+     /// <param name="offset"> The offset into the array. </param>
+     /// <returns> The float that was encoded </returns>
+     /// <seealso cref="encodeFloat(float)"/>
+     public static float decodeFloat(sbyte[] bytes, int offset)
+     {
+         return IntBitsToFloat(decodeInt(bytes, offset));
+     }
+
+     /// <summary>
+     /// Reassembles the int stored most-significant-byte-first in the four bytes
+     /// starting at <paramref name="offset"/>. </summary>
+     /// <param name="bytes"> The bytes to decode; must have at least offset + 4 elements </param>
+     /// <param name="offset"> The offset into the array. </param>
+     /// <returns> The int that was encoded </returns>
+     public static int decodeInt(sbyte[] bytes, int offset)
+     {
+         // Masking with 0xFF undoes the sign extension of each sbyte before shifting.
+         int b0 = bytes[offset] & 0xFF;
+         int b1 = bytes[offset + 1] & 0xFF;
+         int b2 = bytes[offset + 2] & 0xFF;
+         int b3 = bytes[offset + 3] & 0xFF;
+         return (b0 << 24) | (b1 << 16) | (b2 << 8) | b3;
+     }
+
+     // Equivalent of Java's Float.floatToIntBits: the IEEE 754 bit pattern, with every
+     // NaN collapsed to the canonical 0x7fc00000.
+     private static int FloatToIntBits(float value)
+     {
+         if (float.IsNaN(value))
+         {
+             return 0x7fc00000;
+         }
+         // GetBytes and ToInt32 use the same machine byte order, so it cancels out.
+         return System.BitConverter.ToInt32(System.BitConverter.GetBytes(value), 0);
+     }
+
+     // Equivalent of Java's Float.intBitsToFloat.
+     private static float IntBitsToFloat(int bits)
+     {
+         return System.BitConverter.ToSingle(System.BitConverter.GetBytes(bits), 0);
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Payloads/TokenOffsetPayloadTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Payloads/TokenOffsetPayloadTokenFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/TokenOffsetPayloadTokenFilter.cs
new file mode 100644
index 0000000..b08d0a4
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/TokenOffsetPayloadTokenFilter.cs
@@ -0,0 +1,61 @@
+namespace org.apache.lucene.analysis.payloads
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+ using PayloadAttribute = org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+ using BytesRef = org.apache.lucene.util.BytesRef;
+
+
+ /// <summary>
+ /// Stores each token's <c>OffsetAttribute.startOffset()</c> and <c>OffsetAttribute.endOffset()</c>
+ /// as its payload. The payload is eight bytes long: the first 4 bytes are the start offset,
+ /// the last 4 bytes are the end offset, each written by <c>PayloadHelper.encodeInt</c>.
+ /// </summary>
+ public class TokenOffsetPayloadTokenFilter : TokenFilter
+ {
+     // Assigned in the constructor: a C# field initializer may not call an instance
+     // method such as addAttribute (compiler error CS0236).
+     private readonly OffsetAttribute offsetAtt;
+     private readonly PayloadAttribute payAtt;
+
+     /// <summary>Creates a filter over <paramref name="input"/>.</summary>
+     /// <param name="input"> the token stream to wrap </param>
+     public TokenOffsetPayloadTokenFilter(TokenStream input) : base(input)
+     {
+         offsetAtt = addAttribute(typeof(OffsetAttribute));
+         payAtt = addAttribute(typeof(PayloadAttribute));
+     }
+
+     /// <summary>
+     /// Advances the wrapped stream and replaces the current token's payload with its
+     /// encoded start and end offsets.
+     /// </summary>
+     /// <returns> false once the wrapped stream is exhausted, true otherwise </returns>
+     public override bool incrementToken()
+     {
+         if (!input.incrementToken())
+         {
+             return false;
+         }
+
+         // Layout: bytes 0-3 hold the start offset, bytes 4-7 the end offset.
+         sbyte[] data = new sbyte[8];
+         PayloadHelper.encodeInt(offsetAtt.startOffset(), data, 0);
+         PayloadHelper.encodeInt(offsetAtt.endOffset(), data, 4);
+         payAtt.Payload = new BytesRef(data);
+         return true;
+     }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Payloads/TokenOffsetPayloadTokenFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Payloads/TokenOffsetPayloadTokenFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/TokenOffsetPayloadTokenFilterFactory.cs
new file mode 100644
index 0000000..f06a9d8
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/TokenOffsetPayloadTokenFilterFactory.cs
@@ -0,0 +1,56 @@
+using System.Collections.Generic;
+using TokenFilterFactory = Lucene.Net.Analysis.Util.TokenFilterFactory;
+
+namespace org.apache.lucene.analysis.payloads
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using TokenFilterFactory = TokenFilterFactory;
+
+ /// <summary>
+ /// Factory for <seealso cref="TokenOffsetPayloadTokenFilter"/>.
+ /// <code>
+ /// &lt;fieldType name="text_tokenoffset" class="solr.TextField" positionIncrementGap="100"&gt;
+ ///   &lt;analyzer&gt;
+ ///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ ///     &lt;filter class="solr.TokenOffsetPayloadTokenFilterFactory"/&gt;
+ ///   &lt;/analyzer&gt;
+ /// &lt;/fieldType&gt;</code>
+ /// </summary>
+ public class TokenOffsetPayloadTokenFilterFactory : TokenFilterFactory
+ {
+     /// <summary>
+     /// Creates a new TokenOffsetPayloadTokenFilterFactory. The filter takes no parameters of its own.
+     /// </summary>
+     /// <param name="args"> factory arguments; any entry still present after the base
+     /// constructor has run is rejected </param>
+     /// <exception cref="System.ArgumentException"> if unknown parameters remain in <paramref name="args"/> </exception>
+     public TokenOffsetPayloadTokenFilterFactory(IDictionary<string, string> args) : base(args)
+     {
+         if (args.Count > 0)
+         {
+             // The default ToString() of a .NET dictionary yields only its type name,
+             // so the leftover keys are listed explicitly.
+             throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+         }
+     }
+
+     /// <summary>Wraps <paramref name="input"/> in a <seealso cref="TokenOffsetPayloadTokenFilter"/>.</summary>
+     public override TokenOffsetPayloadTokenFilter create(TokenStream input)
+     {
+         return new TokenOffsetPayloadTokenFilter(input);
+     }
+ }
+
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Payloads/TypeAsPayloadTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Payloads/TypeAsPayloadTokenFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/TypeAsPayloadTokenFilter.cs
new file mode 100644
index 0000000..0c1bb7b
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/TypeAsPayloadTokenFilter.cs
@@ -0,0 +1,62 @@
+namespace org.apache.lucene.analysis.payloads
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ using PayloadAttribute = org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+ using TypeAttribute = org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+ using BytesRef = org.apache.lucene.util.BytesRef;
+
+
+ /// <summary>
+ /// Makes the token type a payload.
+ /// <para>
+ /// The type string is handed to the <seealso cref="BytesRef"/> string constructor; the
+ /// original documentation describes the resulting encoding as UTF-8. Tokens whose type
+ /// is null or empty keep whatever payload they already have.
+ /// </para>
+ /// </summary>
+ public class TypeAsPayloadTokenFilter : TokenFilter
+ {
+     // Assigned in the constructor: a C# field initializer may not call an instance
+     // method such as addAttribute (compiler error CS0236).
+     private readonly PayloadAttribute payloadAtt;
+     private readonly TypeAttribute typeAtt;
+
+     /// <summary>Creates a filter over <paramref name="input"/>.</summary>
+     /// <param name="input"> the token stream to wrap </param>
+     public TypeAsPayloadTokenFilter(TokenStream input) : base(input)
+     {
+         payloadAtt = addAttribute(typeof(PayloadAttribute));
+         typeAtt = addAttribute(typeof(TypeAttribute));
+     }
+
+     /// <summary>
+     /// Advances the wrapped stream and, when the current token has a non-empty type,
+     /// sets that type as the token's payload.
+     /// </summary>
+     /// <returns> false once the wrapped stream is exhausted, true otherwise </returns>
+     public override bool incrementToken()
+     {
+         if (!input.incrementToken())
+         {
+             return false;
+         }
+
+         string type = typeAtt.type();
+         if (!string.IsNullOrEmpty(type))
+         {
+             payloadAtt.Payload = new BytesRef(type);
+         }
+         return true;
+     }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Payloads/TypeAsPayloadTokenFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Payloads/TypeAsPayloadTokenFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/TypeAsPayloadTokenFilterFactory.cs
new file mode 100644
index 0000000..88c236c
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Payloads/TypeAsPayloadTokenFilterFactory.cs
@@ -0,0 +1,56 @@
+using System.Collections.Generic;
+using TokenFilterFactory = Lucene.Net.Analysis.Util.TokenFilterFactory;
+
+namespace org.apache.lucene.analysis.payloads
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using TokenFilterFactory = TokenFilterFactory;
+
+ /// <summary>
+ /// Factory for <seealso cref="TypeAsPayloadTokenFilter"/>.
+ /// <code>
+ /// &lt;fieldType name="text_typeaspayload" class="solr.TextField" positionIncrementGap="100"&gt;
+ ///   &lt;analyzer&gt;
+ ///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ ///     &lt;filter class="solr.TypeAsPayloadTokenFilterFactory"/&gt;
+ ///   &lt;/analyzer&gt;
+ /// &lt;/fieldType&gt;</code>
+ /// </summary>
+ public class TypeAsPayloadTokenFilterFactory : TokenFilterFactory
+ {
+     /// <summary>
+     /// Creates a new TypeAsPayloadTokenFilterFactory. The filter takes no parameters of its own.
+     /// </summary>
+     /// <param name="args"> factory arguments; any entry still present after the base
+     /// constructor has run is rejected </param>
+     /// <exception cref="System.ArgumentException"> if unknown parameters remain in <paramref name="args"/> </exception>
+     public TypeAsPayloadTokenFilterFactory(IDictionary<string, string> args) : base(args)
+     {
+         if (args.Count > 0)
+         {
+             // The default ToString() of a .NET dictionary yields only its type name,
+             // so the leftover keys are listed explicitly.
+             throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+         }
+     }
+
+     /// <summary>Wraps <paramref name="input"/> in a <seealso cref="TypeAsPayloadTokenFilter"/>.</summary>
+     public override TypeAsPayloadTokenFilter create(TokenStream input)
+     {
+         return new TypeAsPayloadTokenFilter(input);
+     }
+ }
+
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Position/PositionFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Position/PositionFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Position/PositionFilter.cs
new file mode 100644
index 0000000..92f73bc
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Position/PositionFilter.cs
@@ -0,0 +1,109 @@
+using System;
+
+namespace org.apache.lucene.analysis.position
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using PositionIncrementAttribute = org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+
+ /// <summary>
+ /// Set the positionIncrement of all tokens to the "positionIncrement",
+ /// except the first return token which retains its original positionIncrement value.
+ /// The default positionIncrement value is zero.
+ /// </summary>
+ /// <remarks>
+ /// Deprecated (4.4): PositionFilter makes <seealso cref="TokenStream"/> graphs inconsistent,
+ /// which can cause highlighting bugs. Its main use-case being to make QueryParser
+ /// generate boolean queries instead of phrase queries, it is now advised to use
+ /// <c>QueryParser.setAutoGeneratePhraseQueries(boolean)</c> (for simple cases)
+ /// or to override <c>QueryParser.newFieldQuery</c>.
+ /// </remarks>
+ [Obsolete("(4.4) PositionFilter makes TokenStream graphs inconsistent which can cause highlighting bugs.")]
+ public sealed class PositionFilter : TokenFilter
+ {
+     /// <summary>
+     /// Position increment to assign to all but the first token - default = 0 </summary>
+     private readonly int positionIncrement;
+
+     /// <summary>
+     /// Becomes true once the first token has been returned; that token keeps its own increment. </summary>
+     private bool firstTokenPositioned = false;
+
+     // Assigned in the constructor: a C# field initializer may not call an instance
+     // method such as addAttribute (compiler error CS0236).
+     private readonly PositionIncrementAttribute posIncrAtt;
+
+     /// <summary>
+     /// Constructs a PositionFilter that assigns a position increment of zero to
+     /// all but the first token from the given input stream.
+     /// </summary>
+     /// <param name="input"> the input stream </param>
+     public PositionFilter(TokenStream input) : this(input, 0)
+     {
+     }
+
+     /// <summary>
+     /// Constructs a PositionFilter that assigns the given position increment to
+     /// all but the first token from the given input stream.
+     /// </summary>
+     /// <param name="input"> the input stream </param>
+     /// <param name="positionIncrement"> position increment to assign to all but the first
+     ///  token from the input stream </param>
+     /// <exception cref="System.ArgumentException"> if <paramref name="positionIncrement"/> is negative </exception>
+     public PositionFilter(TokenStream input, int positionIncrement) : base(input)
+     {
+         // Registered before validation, mirroring the order in which the original
+         // field initializer ran.
+         posIncrAtt = addAttribute(typeof(PositionIncrementAttribute));
+
+         if (positionIncrement < 0)
+         {
+             throw new System.ArgumentException("positionIncrement may not be negative");
+         }
+         this.positionIncrement = positionIncrement;
+     }
+
+     /// <summary>
+     /// Advances the wrapped stream; every token after the first has its position
+     /// increment overwritten with the configured value.
+     /// </summary>
+     /// <returns> false once the wrapped stream is exhausted, true otherwise </returns>
+     public override bool incrementToken()
+     {
+         if (!input.incrementToken())
+         {
+             return false;
+         }
+
+         if (firstTokenPositioned)
+         {
+             posIncrAtt.PositionIncrement = positionIncrement;
+         }
+         else
+         {
+             // Leave the first token's increment alone and remember that it has been seen.
+             firstTokenPositioned = true;
+         }
+         return true;
+     }
+
+     /// <summary>
+     /// Resets the wrapped stream and forgets that a first token was seen, so the next
+     /// token keeps its own position increment again.
+     /// </summary>
+     public override void reset()
+     {
+         base.reset();
+         firstTokenPositioned = false;
+     }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Position/PositionFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Position/PositionFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Position/PositionFilterFactory.cs
new file mode 100644
index 0000000..74bf1e4
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Position/PositionFilterFactory.cs
@@ -0,0 +1,70 @@
+using System;
+using System.Collections.Generic;
+using TokenFilterFactory = Lucene.Net.Analysis.Util.TokenFilterFactory;
+
+namespace org.apache.lucene.analysis.position
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using TokenFilterFactory = TokenFilterFactory;
+ using Version = org.apache.lucene.util.Version;
+
+ /// <summary>
+ /// Factory for <seealso cref="PositionFilter"/>.
+ /// Set the positionIncrement of all tokens to the "positionIncrement", except the first return token which retains its
+ /// original positionIncrement value. The default positionIncrement value is zero.
+ /// <code>
+ /// &lt;fieldType name="text_position" class="solr.TextField" positionIncrementGap="100"&gt;
+ ///   &lt;analyzer&gt;
+ ///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ ///     &lt;filter class="solr.PositionFilterFactory" positionIncrement="0"/&gt;
+ ///   &lt;/analyzer&gt;
+ /// &lt;/fieldType&gt;</code>
+ /// </summary>
+ /// <remarks>Since solr 1.4. Deprecated (4.4).</remarks>
+ /// <seealso cref="PositionFilter"/>
+ [Obsolete("(4.4)")]
+ public class PositionFilterFactory : TokenFilterFactory
+ {
+     private readonly int positionIncrement;
+
+     /// <summary>
+     /// Creates a new PositionFilterFactory, reading the optional "positionIncrement" argument (default 0).
+     /// </summary>
+     /// <param name="args"> factory arguments; any entry still present after the known
+     /// parameters have been read is rejected (this presumes the base-class helpers remove
+     /// what they consume - confirm against TokenFilterFactory) </param>
+     /// <exception cref="System.ArgumentException"> if unknown parameters remain in
+     /// <paramref name="args"/>, or if the configured match version is Lucene 4.4 or later </exception>
+     public PositionFilterFactory(IDictionary<string, string> args) : base(args)
+     {
+         positionIncrement = getInt(args, "positionIncrement", 0);
+         if (args.Count > 0)
+         {
+             // The default ToString() of a .NET dictionary yields only its type name,
+             // so the leftover keys are listed explicitly.
+             throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+         }
+         if (luceneMatchVersion != null && luceneMatchVersion.onOrAfter(Version.LUCENE_44))
+         {
+             throw new System.ArgumentException("PositionFilter is deprecated as of Lucene 4.4. You should either fix your code to not use it or use Lucene 4.3 version compatibility");
+         }
+     }
+
+     /// <summary>Wraps <paramref name="input"/> in a <seealso cref="PositionFilter"/> using the configured increment.</summary>
+     public override PositionFilter create(TokenStream input)
+     {
+         return new PositionFilter(input, positionIncrement);
+     }
+ }
+
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Pt/PortugueseAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Pt/PortugueseAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Pt/PortugueseAnalyzer.cs
new file mode 100644
index 0000000..ac178f5
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Pt/PortugueseAnalyzer.cs
@@ -0,0 +1,155 @@
+using System;
+
+namespace org.apache.lucene.analysis.pt
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ using LowerCaseFilter = org.apache.lucene.analysis.core.LowerCaseFilter;
+ using StopFilter = org.apache.lucene.analysis.core.StopFilter;
+ using SetKeywordMarkerFilter = org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+ using SnowballFilter = org.apache.lucene.analysis.snowball.SnowballFilter;
+ using StandardFilter = org.apache.lucene.analysis.standard.StandardFilter;
+ using StandardTokenizer = org.apache.lucene.analysis.standard.StandardTokenizer;
+ using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+ using StopwordAnalyzerBase = org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+ using WordlistLoader = org.apache.lucene.analysis.util.WordlistLoader;
+ using IOUtils = org.apache.lucene.util.IOUtils;
+ using Version = org.apache.lucene.util.Version;
+ using PortugueseStemmer = org.tartarus.snowball.ext.PortugueseStemmer;
+
+    /// <summary>
+    /// <see cref="Analyzer"/> for Portuguese.
+    /// <para>
+    /// <a name="version"/>
+    /// </para>
+    /// <para>You must specify the required <see cref="Version"/>
+    /// compatibility when creating PortugueseAnalyzer:
+    /// <ul>
+    /// <li> As of 3.6, PortugueseLightStemFilter is used for less aggressive stemming.</li>
+    /// </ul>
+    /// </para>
+    /// </summary>
+    public sealed class PortugueseAnalyzer : StopwordAnalyzerBase
+    {
+        // Defensive, unmodifiable copy of the terms that must not be stemmed.
+        private readonly CharArraySet stemExclusionSet;
+
+        /// <summary>
+        /// File containing default Portuguese stopwords. </summary>
+        public const string DEFAULT_STOPWORD_FILE = "portuguese_stop.txt";
+
+        /// <summary>
+        /// Returns an unmodifiable instance of the default stop words set. </summary>
+        /// <returns> default stop words set. </returns>
+        public static CharArraySet DefaultStopSet
+        {
+            get
+            {
+                return DefaultSetHolder.DEFAULT_STOP_SET;
+            }
+        }
+
+        /// <summary>
+        /// Holder that loads DEFAULT_STOP_SET in its static constructor, i.e. the first time
+        /// the outer class touches the set.
+        /// </summary>
+        private class DefaultSetHolder
+        {
+            internal static readonly CharArraySet DEFAULT_STOP_SET;
+
+            static DefaultSetHolder()
+            {
+                try
+                {
+                    DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(typeof(SnowballFilter), DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
+                }
+                catch (System.IO.IOException ex)
+                {
+                    // The default set ships with the distribution, so failing to read it is a
+                    // packaging problem; keep the original I/O failure as the inner exception
+                    // so the root cause is not lost.
+                    throw new InvalidOperationException("Unable to load default stopword set", ex);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Builds an analyzer with the default stop words: <see cref="DEFAULT_STOPWORD_FILE"/>.
+        /// </summary>
+        /// <param name="matchVersion"> lucene compatibility version </param>
+        public PortugueseAnalyzer(Version matchVersion) : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
+        {
+        }
+
+        /// <summary>
+        /// Builds an analyzer with the given stop words.
+        /// </summary>
+        /// <param name="matchVersion"> lucene compatibility version </param>
+        /// <param name="stopwords"> a stopword set </param>
+        public PortugueseAnalyzer(Version matchVersion, CharArraySet stopwords) : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
+        {
+        }
+
+        /// <summary>
+        /// Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+        /// provided this analyzer will add a <see cref="SetKeywordMarkerFilter"/> before
+        /// stemming.
+        /// </summary>
+        /// <param name="matchVersion"> lucene compatibility version </param>
+        /// <param name="stopwords"> a stopword set </param>
+        /// <param name="stemExclusionSet"> a set of terms not to be stemmed </param>
+        public PortugueseAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) : base(matchVersion, stopwords)
+        {
+            // Copy first, then wrap, so later changes to the caller's set cannot leak in.
+            this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
+        }
+
+        /// <summary>
+        /// Creates a
+        /// <see cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
+        /// which tokenizes all the text in the provided <see cref="Reader"/>.
+        /// </summary>
+        /// <param name="fieldName"> name of the field being analyzed (not used by this chain) </param>
+        /// <param name="reader"> source of the text to tokenize </param>
+        /// <returns> A
+        /// <see cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
+        /// built from a <see cref="StandardTokenizer"/> filtered with
+        /// <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>, <see cref="StopFilter"/>,
+        /// <see cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
+        /// provided, and <see cref="PortugueseLightStemFilter"/> (or the Snowball
+        /// Portuguese stemmer when matchVersion is older than 3.6). </returns>
+        protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
+        {
+            Tokenizer source = new StandardTokenizer(matchVersion, reader);
+            TokenStream result = new StandardFilter(matchVersion, source);
+            result = new LowerCaseFilter(matchVersion, result);
+            result = new StopFilter(matchVersion, result, stopwords);
+            if (!stemExclusionSet.Empty)
+            {
+                // Mark excluded terms as keywords so the stemmer below leaves them alone.
+                result = new SetKeywordMarkerFilter(result, stemExclusionSet);
+            }
+            if (matchVersion.onOrAfter(Version.LUCENE_36))
+            {
+                result = new PortugueseLightStemFilter(result);
+            }
+            else
+            {
+                // Pre-3.6 compatibility: keep the more aggressive Snowball stemmer.
+                result = new SnowballFilter(result, new PortugueseStemmer());
+            }
+            return new TokenStreamComponents(source, result);
+        }
+    }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Pt/PortugueseLightStemFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Pt/PortugueseLightStemFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Pt/PortugueseLightStemFilter.cs
new file mode 100644
index 0000000..3a0529b
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Pt/PortugueseLightStemFilter.cs
@@ -0,0 +1,66 @@
+namespace org.apache.lucene.analysis.pt
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using SetKeywordMarkerFilter = org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+ using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+ using KeywordAttribute = org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+    /// <summary>
+    /// A <see cref="TokenFilter"/> that applies <see cref="PortugueseLightStemmer"/> to stem
+    /// Portuguese words.
+    /// <para>
+    /// To prevent terms from being stemmed use an instance of
+    /// <see cref="SetKeywordMarkerFilter"/> or a custom <see cref="TokenFilter"/> that sets
+    /// the <see cref="KeywordAttribute"/> before this <see cref="TokenStream"/>.
+    /// </para>
+    /// </summary>
+    public sealed class PortugueseLightStemFilter : TokenFilter
+    {
+        private readonly PortugueseLightStemmer stemmer = new PortugueseLightStemmer();
+        private readonly CharTermAttribute termAtt;
+        private readonly KeywordAttribute keywordAttr;
+
+        /// <summary>
+        /// Creates a stem filter on top of <paramref name="input"/>.
+        /// </summary>
+        /// <param name="input"> the token stream whose terms are stemmed </param>
+        public PortugueseLightStemFilter(TokenStream input) : base(input)
+        {
+            // C# field initializers cannot call instance members (compiler error CS0236), so
+            // the attributes are registered here, after the base constructor has run - the
+            // same point at which Java evaluated the original field initializers.
+            termAtt = addAttribute(typeof(CharTermAttribute));
+            keywordAttr = addAttribute(typeof(KeywordAttribute));
+        }
+
+        /// <summary>
+        /// Advances the wrapped stream and stems the current term in place unless it is
+        /// flagged as a keyword.
+        /// </summary>
+        /// <returns> true if a token is available, false once the input is exhausted </returns>
+        public override bool incrementToken()
+        {
+            if (!input.incrementToken())
+            {
+                return false;
+            }
+
+            if (!keywordAttr.Keyword)
+            {
+                // The stemmer edits the term buffer directly and reports the new length.
+                int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
+                termAtt.Length = newlen;
+            }
+            return true;
+        }
+    }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Pt/PortugueseLightStemFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Pt/PortugueseLightStemFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Pt/PortugueseLightStemFilterFactory.cs
new file mode 100644
index 0000000..3dd6ebc
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Pt/PortugueseLightStemFilterFactory.cs
@@ -0,0 +1,55 @@
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.pt
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using TokenFilterFactory = org.apache.lucene.analysis.util.TokenFilterFactory;
+
+    /// <summary>
+    /// Factory for <see cref="PortugueseLightStemFilter"/>.
+    /// <pre class="prettyprint">
+    /// <fieldType name="text_ptlgtstem" class="solr.TextField" positionIncrementGap="100">
+    /// <analyzer>
+    /// <tokenizer class="solr.StandardTokenizerFactory"/>
+    /// <filter class="solr.LowerCaseFilterFactory"/>
+    /// <filter class="solr.PortugueseLightStemFilterFactory"/>
+    /// </analyzer>
+    /// </fieldType></pre>
+    /// </summary>
+    public class PortugueseLightStemFilterFactory : TokenFilterFactory
+    {
+
+        /// <summary>
+        /// Creates a new PortugueseLightStemFilterFactory.
+        /// </summary>
+        /// <param name="args"> factory arguments; this factory defines no parameters of its own,
+        /// so any entry still present after the base constructor ran is rejected </param>
+        /// <exception cref="System.ArgumentException"> if <paramref name="args"/> is not empty </exception>
+        public PortugueseLightStemFilterFactory(IDictionary<string, string> args) : base(args)
+        {
+            if (args.Count > 0)
+            {
+                // Concatenating the dictionary itself only yields its type name in .NET
+                // (Java's Map.toString() printed the entries), so name the leftover keys.
+                throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+            }
+        }
+
+        /// <summary>
+        /// Wraps <paramref name="input"/> in a <see cref="PortugueseLightStemFilter"/>.
+        /// </summary>
+        public override TokenStream create(TokenStream input)
+        {
+            return new PortugueseLightStemFilter(input);
+        }
+    }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Pt/PortugueseLightStemmer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Pt/PortugueseLightStemmer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Pt/PortugueseLightStemmer.cs
new file mode 100644
index 0000000..1262d8d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Pt/PortugueseLightStemmer.cs
@@ -0,0 +1,252 @@
+namespace org.apache.lucene.analysis.pt
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /*
+ * This algorithm is updated based on code located at:
+ * http://members.unine.ch/jacques.savoy/clef/
+ *
+ * Full copyright for that code follows:
+ */
+
+ /*
+ * Copyright (c) 2005, Jacques Savoy
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer. Redistributions in binary
+ * form must reproduce the above copyright notice, this list of conditions and
+ * the following disclaimer in the documentation and/or other materials
+ * provided with the distribution. Neither the name of the author nor the names
+ * of its contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+ using org.apache.lucene.analysis.util;
+//JAVA TO C# CONVERTER TODO TASK: This Java 'import static' statement cannot be converted to C#:
+// import static org.apache.lucene.analysis.util.StemmerUtil.*;
+
+    /// <summary>
+    /// Light stemmer for Portuguese.
+    /// <para>
+    /// Implements the "UniNE" algorithm described in
+    /// <i>Light Stemming Approaches for the French, Portuguese, German and Hungarian Languages</i>
+    /// by Jacques Savoy.
+    /// </para>
+    /// </summary>
+    public class PortugueseLightStemmer
+    {
+
+        /// <summary>
+        /// Stems the first <paramref name="len"/> characters of <paramref name="s"/> in place.
+        /// </summary>
+        /// <param name="s"> term buffer; it is modified directly </param>
+        /// <param name="len"> number of valid characters in the buffer </param>
+        /// <returns> length of the stemmed term; inputs shorter than 4 characters are returned untouched </returns>
+        public virtual int stem(char[] s, int len)
+        {
+            if (len < 4)
+            {
+                return len;
+            }
+
+            len = removeSuffix(s, len);
+
+            bool endsInA = len > 3 && s[len - 1] == 'a';
+            if (endsInA)
+            {
+                len = normFeminine(s, len);
+            }
+
+            // Drop one trailing theme vowel from words that are still longer than four characters.
+            if (len > 4)
+            {
+                char last = s[len - 1];
+                if (last == 'e' || last == 'a' || last == 'o')
+                {
+                    len--;
+                }
+            }
+
+            // Strip diacritics from whatever remains of the term.
+            for (int pos = 0; pos < len; pos++)
+            {
+                s[pos] = foldAccent(s[pos]);
+            }
+
+            return len;
+        }
+
+        /// <summary>
+        /// Maps an accented vowel or cedilla to its plain ASCII letter; any other character is returned as-is.
+        /// </summary>
+        private static char foldAccent(char c)
+        {
+            switch (c)
+            {
+                case 'à':
+                case 'á':
+                case 'â':
+                case 'ä':
+                case 'ã':
+                    return 'a';
+                case 'ò':
+                case 'ó':
+                case 'ô':
+                case 'ö':
+                case 'õ':
+                    return 'o';
+                case 'è':
+                case 'é':
+                case 'ê':
+                case 'ë':
+                    return 'e';
+                case 'ù':
+                case 'ú':
+                case 'û':
+                case 'ü':
+                    return 'u';
+                case 'ì':
+                case 'í':
+                case 'î':
+                case 'ï':
+                    return 'i';
+                case 'ç':
+                    return 'c';
+                default:
+                    return c;
+            }
+        }
+
+        /// <summary>
+        /// Returns true as soon as the term ends with one of the given suffixes, testing them in order.
+        /// </summary>
+        private static bool endsWithAny(char[] s, int len, params string[] suffixes)
+        {
+            foreach (string suffix in suffixes)
+            {
+                if (StemmerUtil.EndsWith(s, len, suffix))
+                {
+                    return true;
+                }
+            }
+            return false;
+        }
+
+        /// <summary>
+        /// Applies the first matching plural/adverb suffix rule, rewriting the buffer where the
+        /// rule requires it, and returns the resulting length. The rules are order-sensitive.
+        /// </summary>
+        private int removeSuffix(char[] s, int len)
+        {
+            // "-es" is only removed after r, s, l or z; otherwise later rules get their turn.
+            if (len > 4 && StemmerUtil.EndsWith(s, len, "es"))
+            {
+                char preceding = s[len - 3];
+                if (preceding == 'r' || preceding == 's' || preceding == 'l' || preceding == 'z')
+                {
+                    return len - 2;
+                }
+            }
+
+            // "-ns" -> "-m"
+            if (len > 3 && StemmerUtil.EndsWith(s, len, "ns"))
+            {
+                s[len - 2] = 'm';
+                return len - 1;
+            }
+
+            // The "-is" family shares the same minimum length.
+            if (len > 4)
+            {
+                // "-eis" / "-éis" -> "-el"
+                if (endsWithAny(s, len, "eis", "éis"))
+                {
+                    s[len - 3] = 'e';
+                    s[len - 2] = 'l';
+                    return len - 1;
+                }
+
+                // "-ais" -> "-al"
+                if (StemmerUtil.EndsWith(s, len, "ais"))
+                {
+                    s[len - 2] = 'l';
+                    return len - 1;
+                }
+
+                // "-óis" -> "-ol"
+                if (StemmerUtil.EndsWith(s, len, "óis"))
+                {
+                    s[len - 3] = 'o';
+                    s[len - 2] = 'l';
+                    return len - 1;
+                }
+
+                // any other "-is" -> "-il"
+                if (StemmerUtil.EndsWith(s, len, "is"))
+                {
+                    s[len - 1] = 'l';
+                    return len;
+                }
+            }
+
+            // "-ões" / "-ães" -> "-ão"
+            if (len > 3 && endsWithAny(s, len, "ões", "ães"))
+            {
+                s[len - 3] = 'ã';
+                s[len - 2] = 'o';
+                return len - 1;
+            }
+
+            // adverbial "-mente"
+            if (len > 6 && StemmerUtil.EndsWith(s, len, "mente"))
+            {
+                return len - 5;
+            }
+
+            // plain plural "-s"
+            if (len > 3 && s[len - 1] == 's')
+            {
+                return len - 1;
+            }
+
+            return len;
+        }
+
+        /// <summary>
+        /// Rewrites a feminine ending to its masculine counterpart for sufficiently long terms
+        /// and returns the resulting length; shorter terms are left unchanged.
+        /// </summary>
+        private int normFeminine(char[] s, int len)
+        {
+            // None of the rules below applies to terms of six characters or fewer.
+            if (len <= 6)
+            {
+                return len;
+            }
+
+            if (len > 7 && endsWithAny(s, len, "inha", "iaca", "eira"))
+            {
+                s[len - 1] = 'o';
+                return len;
+            }
+
+            if (endsWithAny(s, len, "osa", "ica", "ida", "ada", "iva", "ama"))
+            {
+                s[len - 1] = 'o';
+                return len;
+            }
+
+            // "-ona" -> "-ão"
+            if (StemmerUtil.EndsWith(s, len, "ona"))
+            {
+                s[len - 3] = 'ã';
+                s[len - 2] = 'o';
+                return len - 1;
+            }
+
+            // "-ora" -> "-or"
+            if (StemmerUtil.EndsWith(s, len, "ora"))
+            {
+                return len - 1;
+            }
+
+            // "-esa" -> "-ês"
+            if (StemmerUtil.EndsWith(s, len, "esa"))
+            {
+                s[len - 3] = 'ê';
+                return len - 1;
+            }
+
+            // "-na" -> "-no"
+            if (StemmerUtil.EndsWith(s, len, "na"))
+            {
+                s[len - 1] = 'o';
+                return len;
+            }
+
+            return len;
+        }
+    }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Pt/PortugueseMinimalStemFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Pt/PortugueseMinimalStemFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Pt/PortugueseMinimalStemFilter.cs
new file mode 100644
index 0000000..e9a0d26
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Pt/PortugueseMinimalStemFilter.cs
@@ -0,0 +1,66 @@
+namespace org.apache.lucene.analysis.pt
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using SetKeywordMarkerFilter = org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+ using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+ using KeywordAttribute = org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+    /// <summary>
+    /// A <see cref="TokenFilter"/> that applies <see cref="PortugueseMinimalStemmer"/> to stem
+    /// Portuguese words.
+    /// <para>
+    /// To prevent terms from being stemmed use an instance of
+    /// <see cref="SetKeywordMarkerFilter"/> or a custom <see cref="TokenFilter"/> that sets
+    /// the <see cref="KeywordAttribute"/> before this <see cref="TokenStream"/>.
+    /// </para>
+    /// </summary>
+    public sealed class PortugueseMinimalStemFilter : TokenFilter
+    {
+        private readonly PortugueseMinimalStemmer stemmer = new PortugueseMinimalStemmer();
+        private readonly CharTermAttribute termAtt;
+        private readonly KeywordAttribute keywordAttr;
+
+        /// <summary>
+        /// Creates a stem filter on top of <paramref name="input"/>.
+        /// </summary>
+        /// <param name="input"> the token stream whose terms are stemmed </param>
+        public PortugueseMinimalStemFilter(TokenStream input) : base(input)
+        {
+            // C# field initializers cannot call instance members (compiler error CS0236), so
+            // the attributes are registered here, after the base constructor has run - the
+            // same point at which Java evaluated the original field initializers.
+            termAtt = addAttribute(typeof(CharTermAttribute));
+            keywordAttr = addAttribute(typeof(KeywordAttribute));
+        }
+
+        /// <summary>
+        /// Advances the wrapped stream and stems the current term in place unless it is
+        /// flagged as a keyword.
+        /// </summary>
+        /// <returns> true if a token is available, false once the input is exhausted </returns>
+        public override bool incrementToken()
+        {
+            if (!input.incrementToken())
+            {
+                return false;
+            }
+
+            if (!keywordAttr.Keyword)
+            {
+                // The stemmer edits the term buffer directly and reports the new length.
+                int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
+                termAtt.Length = newlen;
+            }
+            return true;
+        }
+    }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Pt/PortugueseMinimalStemFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Pt/PortugueseMinimalStemFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Pt/PortugueseMinimalStemFilterFactory.cs
new file mode 100644
index 0000000..e893dad
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Pt/PortugueseMinimalStemFilterFactory.cs
@@ -0,0 +1,55 @@
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.pt
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using TokenFilterFactory = org.apache.lucene.analysis.util.TokenFilterFactory;
+
+    /// <summary>
+    /// Factory for <see cref="PortugueseMinimalStemFilter"/>.
+    /// <pre class="prettyprint">
+    /// <fieldType name="text_ptminstem" class="solr.TextField" positionIncrementGap="100">
+    /// <analyzer>
+    /// <tokenizer class="solr.StandardTokenizerFactory"/>
+    /// <filter class="solr.LowerCaseFilterFactory"/>
+    /// <filter class="solr.PortugueseMinimalStemFilterFactory"/>
+    /// </analyzer>
+    /// </fieldType></pre>
+    /// </summary>
+    public class PortugueseMinimalStemFilterFactory : TokenFilterFactory
+    {
+
+        /// <summary>
+        /// Creates a new PortugueseMinimalStemFilterFactory.
+        /// </summary>
+        /// <param name="args"> factory arguments; this factory defines no parameters of its own,
+        /// so any entry still present after the base constructor ran is rejected </param>
+        /// <exception cref="System.ArgumentException"> if <paramref name="args"/> is not empty </exception>
+        public PortugueseMinimalStemFilterFactory(IDictionary<string, string> args) : base(args)
+        {
+            if (args.Count > 0)
+            {
+                // Concatenating the dictionary itself only yields its type name in .NET
+                // (Java's Map.toString() printed the entries), so name the leftover keys.
+                throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+            }
+        }
+
+        /// <summary>
+        /// Wraps <paramref name="input"/> in a <see cref="PortugueseMinimalStemFilter"/>.
+        /// </summary>
+        public override TokenStream create(TokenStream input)
+        {
+            return new PortugueseMinimalStemFilter(input);
+        }
+    }
+
+}
\ No newline at end of file