You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2017/06/09 21:52:38 UTC
lucene-solr:master: LUCENE-7854: Add a new
DelimitedTermFrequencyTokenFilter that allows to mark tokens with a custom
term frequency
Repository: lucene-solr
Updated Branches:
refs/heads/master c37b37743 -> 5844ed4ac
LUCENE-7854: Add a new DelimitedTermFrequencyTokenFilter that allows to mark tokens with a custom term frequency
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/5844ed4a
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/5844ed4a
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/5844ed4a
Branch: refs/heads/master
Commit: 5844ed4ac95373cbdb512e84b8ad08f78c2baf57
Parents: c37b377
Author: Uwe Schindler <us...@apache.org>
Authored: Fri Jun 9 23:52:19 2017 +0200
Committer: Uwe Schindler <us...@apache.org>
Committed: Fri Jun 9 23:52:19 2017 +0200
----------------------------------------------------------------------
lucene/CHANGES.txt | 6 ++
.../DelimitedTermFrequencyTokenFilter.java | 75 +++++++++++++++++++
...elimitedTermFrequencyTokenFilterFactory.java | 53 ++++++++++++++
...ache.lucene.analysis.util.TokenFilterFactory | 1 +
.../DelimitedTermFrequencyTokenFilterTest.java | 77 ++++++++++++++++++++
5 files changed, 212 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5844ed4a/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 0251243..12e5000 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -18,6 +18,12 @@ New Features
with a custom token stream allows indexing custom term frequencies
(Mike McCandless)
+* LUCENE-7866: Add a new DelimitedTermFrequencyTokenFilter that allows
+ marking tokens with a custom term frequency (LUCENE-7854). It parses a numeric
+ value after a separator char ('|') at the end of each token and changes
+ the term frequency to this value. (Uwe Schindler, Robert Muir,
+ Mike McCandless)
+
API Changes
* LUCENE-2605: Classic QueryParser no longer splits on whitespace by default.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5844ed4a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilter.java
new file mode 100644
index 0000000..e2095ad
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilter.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+
+/**
+ * Characters before the delimiter are the "token"; the textual integer after it is the term frequency.
+ * To use this {@code TokenFilter} the field must be indexed with
+ * {@link IndexOptions#DOCS_AND_FREQS} but no positions or offsets.
+ * <p>
+ * For example, if the delimiter is '|', then for the string "foo|5", "foo" is the token
+ * and "5" is the term frequency. If there is no delimiter, the TokenFilter does not modify
+ * the term frequency. Only the first delimiter in a token is used as the split point.
+ * <p>
+ * Note: make sure your Tokenizer doesn't split on the delimiter, or this won't work.
+ */
+public final class DelimitedTermFrequencyTokenFilter extends TokenFilter {
+ public static final char DEFAULT_DELIMITER = '|'; // delimiter used by the one-argument constructor
+
+ private final char delimiter; // char that separates the term text from its frequency
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final TermFrequencyAttribute tfAtt = addAttribute(TermFrequencyAttribute.class);
+
+ /** Creates a filter over {@code input} that splits tokens on {@link #DEFAULT_DELIMITER}. */
+ public DelimitedTermFrequencyTokenFilter(TokenStream input) {
+ this(input, DEFAULT_DELIMITER);
+ }
+
+ public DelimitedTermFrequencyTokenFilter(TokenStream input, char delimiter) {
+ super(input);
+ this.delimiter = delimiter;
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ final char[] buffer = termAtt.buffer();
+ final int length = termAtt.length(); // original length, still needed after the term is shortened
+ for (int i = 0; i < length; i++) {
+ if (buffer[i] == delimiter) { // first delimiter wins; the scan stops here
+ termAtt.setLength(i); // keep only the chars before the delimiter as the term
+ i++; // step past the delimiter to the first char of the number
+ tfAtt.setTermFrequency(ArrayUtil.parseInt(buffer, i, length - i)); // rest of the token is the frequency
+ return true;
+ }
+ }
+ return true; // no delimiter found: term and frequency pass through untouched
+ }
+ return false; // the wrapped stream has no more tokens
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5844ed4a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilterFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilterFactory.java
new file mode 100644
index 0000000..af5c0fa
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilterFactory.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link DelimitedTermFrequencyTokenFilter}. The field must have {@code omitPositions=true}.
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_tfdl" class="solr.TextField" omitPositions="true"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.DelimitedTermFrequencyTokenFilterFactory" delimiter="|"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ */
+public class DelimitedTermFrequencyTokenFilterFactory extends TokenFilterFactory {
+ public static final String DELIMITER_ATTR = "delimiter"; // name of the argument that carries the delimiter char
+
+ private final char delimiter;
+
+ /** Creates a new DelimitedTermFrequencyTokenFilterFactory */
+ public DelimitedTermFrequencyTokenFilterFactory(Map<String, String> args) {
+ super(args);
+ delimiter = getChar(args, DELIMITER_ATTR, DelimitedTermFrequencyTokenFilter.DEFAULT_DELIMITER); // NOTE(review): presumably removes the consumed key from args - confirm
+ if (!args.isEmpty()) { // any argument still present was not recognized
+ throw new IllegalArgumentException("Unknown parameters: " + args);
+ }
+ }
+
+ @Override
+ public DelimitedTermFrequencyTokenFilter create(TokenStream input) {
+ return new DelimitedTermFrequencyTokenFilter(input, delimiter);
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5844ed4a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
index 4e33006..bc19c4a 100644
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@@ -63,6 +63,7 @@ org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory
org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory
org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory
org.apache.lucene.analysis.miscellaneous.DateRecognizerFilterFactory
+org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilterFactory
org.apache.lucene.analysis.miscellaneous.FingerprintFilterFactory
org.apache.lucene.analysis.miscellaneous.FixBrokenOffsetsFilterFactory
org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/5844ed4a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilterTest.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilterTest.java
new file mode 100644
index 0000000..7609f6e
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/DelimitedTermFrequencyTokenFilterTest.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
+
+public class DelimitedTermFrequencyTokenFilterTest extends BaseTokenStreamTestCase {
+ // Each test feeds whitespace-separated tokens, some carrying a "|frequency" suffix, through the filter.
+ public void testTermFrequency() throws Exception {
+ String test = "The quick|40 red|4 fox|06 jumped|1 over the lazy|2 brown|123 dogs|1024";
+ DelimitedTermFrequencyTokenFilter filter =
+ new DelimitedTermFrequencyTokenFilter(whitespaceMockTokenizer(test));
+ CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
+ TermFrequencyAttribute tfAtt = filter.getAttribute(TermFrequencyAttribute.class);
+ filter.reset();
+ assertTermEquals("The", filter, termAtt, tfAtt, 1); // no delimiter: expected frequency is 1
+ assertTermEquals("quick", filter, termAtt, tfAtt, 40);
+ assertTermEquals("red", filter, termAtt, tfAtt, 4);
+ assertTermEquals("fox", filter, termAtt, tfAtt, 6); // "06": a leading zero is accepted
+ assertTermEquals("jumped", filter, termAtt, tfAtt, 1);
+ assertTermEquals("over", filter, termAtt, tfAtt, 1);
+ assertTermEquals("the", filter, termAtt, tfAtt, 1);
+ assertTermEquals("lazy", filter, termAtt, tfAtt, 2);
+ assertTermEquals("brown", filter, termAtt, tfAtt, 123);
+ assertTermEquals("dogs", filter, termAtt, tfAtt, 1024);
+ assertFalse(filter.incrementToken()); // all ten tokens consumed
+ filter.end();
+ filter.close();
+ }
+
+ public void testInvalidNegativeTf() throws Exception {
+ String test = "foo bar|-20"; // second token carries a negative frequency
+ DelimitedTermFrequencyTokenFilter filter =
+ new DelimitedTermFrequencyTokenFilter(whitespaceMockTokenizer(test));
+ CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
+ TermFrequencyAttribute tfAtt = filter.getAttribute(TermFrequencyAttribute.class);
+ filter.reset();
+ assertTermEquals("foo", filter, termAtt, tfAtt, 1);
+ IllegalArgumentException iae = expectThrows(IllegalArgumentException.class, filter::incrementToken);
+ assertEquals("Term frequency must be 1 or greater; got -20", iae.getMessage()); // message shows -20 was parsed, then rejected
+ }
+
+ public void testInvalidFloatTf() throws Exception {
+ String test = "foo bar|1.2"; // second token carries a non-integer frequency
+ DelimitedTermFrequencyTokenFilter filter =
+ new DelimitedTermFrequencyTokenFilter(whitespaceMockTokenizer(test));
+ CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
+ TermFrequencyAttribute tfAtt = filter.getAttribute(TermFrequencyAttribute.class);
+ filter.reset();
+ assertTermEquals("foo", filter, termAtt, tfAtt, 1);
+ expectThrows(NumberFormatException.class, filter::incrementToken);
+ }
+ // Advances the stream by one token, then checks that token's term text and term frequency.
+ void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt, TermFrequencyAttribute tfAtt, int expectedTf) throws Exception {
+ assertTrue(stream.incrementToken());
+ assertEquals(expected, termAtt.toString());
+ assertEquals(expectedTf, tfAtt.getTermFrequency());
+ }
+}
RE: lucene-solr:master: LUCENE-7854: Add a new DelimitedTermFrequencyTokenFilter that allows to mark tokens with a custom term frequency
Posted by Uwe Schindler <uw...@thetaphi.de>.
Sorry, wrong issue number!
-----
Uwe Schindler
Achterdiek 19, D-28357 Bremen
http://www.thetaphi.de
eMail: uwe@thetaphi.de
> -----Original Message-----
> From: uschindler@apache.org [mailto:uschindler@apache.org]
> Sent: Friday, June 9, 2017 11:53 PM
> To: commits@lucene.apache.org
> Subject: lucene-solr:master: LUCENE-7854: Add a new
> DelimitedTermFrequencyTokenFilter that allows to mark tokens with a
> custom term frequency
>
> Repository: lucene-solr
> Updated Branches:
> refs/heads/master c37b37743 -> 5844ed4ac
>
>
> LUCENE-7854: Add a new DelimitedTermFrequencyTokenFilter that allows to
> mark tokens with a custom term frequency
>
>
> Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
> Commit: http://git-wip-us.apache.org/repos/asf/lucene-
> solr/commit/5844ed4a
> Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/5844ed4a
> Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/5844ed4a
>
> Branch: refs/heads/master
> Commit: 5844ed4ac95373cbdb512e84b8ad08f78c2baf57
> Parents: c37b377
> Author: Uwe Schindler <us...@apache.org>
> Authored: Fri Jun 9 23:52:19 2017 +0200
> Committer: Uwe Schindler <us...@apache.org>
> Committed: Fri Jun 9 23:52:19 2017 +0200
>
> ----------------------------------------------------------------------
> lucene/CHANGES.txt | 6 ++
> .../DelimitedTermFrequencyTokenFilter.java | 75 +++++++++++++++++++
> ...elimitedTermFrequencyTokenFilterFactory.java | 53 ++++++++++++++
> ...ache.lucene.analysis.util.TokenFilterFactory | 1 +
> .../DelimitedTermFrequencyTokenFilterTest.java | 77
> ++++++++++++++++++++
> 5 files changed, 212 insertions(+)
> ----------------------------------------------------------------------
>
>
> http://git-wip-us.apache.org/repos/asf/lucene-
> solr/blob/5844ed4a/lucene/CHANGES.txt
> ----------------------------------------------------------------------
> diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
> index 0251243..12e5000 100644
> --- a/lucene/CHANGES.txt
> +++ b/lucene/CHANGES.txt
> @@ -18,6 +18,12 @@ New Features
> with a custom token stream allows indexing custom term frequencies
> (Mike McCandless)
>
> +* LUCENE-7866: Add a new DelimitedTermFrequencyTokenFilter that allows
> to
> + mark tokens with a custom term frequency (LUCENE-7854). It parses a
> numeric
> + value after a separator char ('|') at the end of each token and changes
> + the term frequency to this value. (Uwe Schindler, Robert Muir,
> + Mike McCandless)
> +
> API Changes
>
> * LUCENE-2605: Classic QueryParser no longer splits on whitespace by
> default.
>
> http://git-wip-us.apache.org/repos/asf/lucene-
> solr/blob/5844ed4a/lucene/analysis/common/src/java/org/apache/lucene/
> analysis/miscellaneous/DelimitedTermFrequencyTokenFilter.java
> ----------------------------------------------------------------------
> diff --git
> a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellane
> ous/DelimitedTermFrequencyTokenFilter.java
> b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellane
> ous/DelimitedTermFrequencyTokenFilter.java
> new file mode 100644
> index 0000000..e2095ad
> --- /dev/null
> +++
> b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellane
> ous/DelimitedTermFrequencyTokenFilter.java
> @@ -0,0 +1,75 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +package org.apache.lucene.analysis.miscellaneous;
> +
> +import java.io.IOException;
> +
> +import org.apache.lucene.analysis.TokenFilter;
> +import org.apache.lucene.analysis.TokenStream;
> +import
> org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
> +import org.apache.lucene.index.IndexOptions;
> +import org.apache.lucene.util.ArrayUtil;
> +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
> +
> +
> +/**
> + * Characters before the delimiter are the "token", the textual integer after
> is the term frequency.
> + * To use this {@code TokenFilter} the field must be indexed with
> + * {@link IndexOptions#DOCS_AND_FREQS} but no positions or offsets.
> + * <p>
> + * For example, if the delimiter is '|', then for the string "foo|5", "foo" is the
> token
> + * and "5" is a term frequency. If there is no delimiter, the TokenFilter does
> not modify
> + * the term frequency.
> + * <p>
> + * Note make sure your Tokenizer doesn't split on the delimiter, or this
> won't work
> + */
> +public final class DelimitedTermFrequencyTokenFilter extends TokenFilter {
> + public static final char DEFAULT_DELIMITER = '|';
> +
> + private final char delimiter;
> + private final CharTermAttribute termAtt =
> addAttribute(CharTermAttribute.class);
> + private final TermFrequencyAttribute tfAtt =
> addAttribute(TermFrequencyAttribute.class);
> +
> +
> + public DelimitedTermFrequencyTokenFilter(TokenStream input) {
> + this(input, DEFAULT_DELIMITER);
> + }
> +
> + public DelimitedTermFrequencyTokenFilter(TokenStream input, char
> delimiter) {
> + super(input);
> + this.delimiter = delimiter;
> + }
> +
> + @Override
> + public boolean incrementToken() throws IOException {
> + if (input.incrementToken()) {
> + final char[] buffer = termAtt.buffer();
> + final int length = termAtt.length();
> + for (int i = 0; i < length; i++) {
> + if (buffer[i] == delimiter) {
> + termAtt.setLength(i); // simply set a new length
> + i++;
> + tfAtt.setTermFrequency(ArrayUtil.parseInt(buffer, i, length - i));
> + return true;
> + }
> + }
> + return true;
> + }
> + return false;
> + }
> +}
>
> http://git-wip-us.apache.org/repos/asf/lucene-
> solr/blob/5844ed4a/lucene/analysis/common/src/java/org/apache/lucene/
> analysis/miscellaneous/DelimitedTermFrequencyTokenFilterFactory.java
> ----------------------------------------------------------------------
> diff --git
> a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellane
> ous/DelimitedTermFrequencyTokenFilterFactory.java
> b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellane
> ous/DelimitedTermFrequencyTokenFilterFactory.java
> new file mode 100644
> index 0000000..af5c0fa
> --- /dev/null
> +++
> b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellane
> ous/DelimitedTermFrequencyTokenFilterFactory.java
> @@ -0,0 +1,53 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +package org.apache.lucene.analysis.miscellaneous;
> +
> +import java.util.Map;
> +
> +import org.apache.lucene.analysis.TokenStream;
> +import org.apache.lucene.analysis.util.TokenFilterFactory;
> +
> +/**
> + * Factory for {@link DelimitedTermFrequencyTokenFilter}. The field must
> have {@code omitPositions=true}.
> + * <pre class="prettyprint">
> + * <fieldType name="text_tfdl" class="solr.TextField"
> omitPositions="true">
> + * <analyzer>
> + * <tokenizer class="solr.WhitespaceTokenizerFactory"/>
> + * <filter class="solr.DelimitedTermFrequencyTokenFilterFactory"
> delimiter="|"/>
> + * </analyzer>
> + * </fieldType></pre>
> + */
> +public class DelimitedTermFrequencyTokenFilterFactory extends
> TokenFilterFactory {
> + public static final String DELIMITER_ATTR = "delimiter";
> +
> + private final char delimiter;
> +
> + /** Creates a new DelimitedPayloadTokenFilterFactory */
> + public DelimitedTermFrequencyTokenFilterFactory(Map<String, String>
> args) {
> + super(args);
> + delimiter = getChar(args, DELIMITER_ATTR,
> DelimitedTermFrequencyTokenFilter.DEFAULT_DELIMITER);
> + if (!args.isEmpty()) {
> + throw new IllegalArgumentException("Unknown parameters: " + args);
> + }
> + }
> +
> + @Override
> + public DelimitedTermFrequencyTokenFilter create(TokenStream input) {
> + return new DelimitedTermFrequencyTokenFilter(input, delimiter);
> + }
> +}
> \ No newline at end of file
>
> http://git-wip-us.apache.org/repos/asf/lucene-
> solr/blob/5844ed4a/lucene/analysis/common/src/resources/META-
> INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
> ----------------------------------------------------------------------
> diff --git a/lucene/analysis/common/src/resources/META-
> INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
> b/lucene/analysis/common/src/resources/META-
> INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
> index 4e33006..bc19c4a 100644
> --- a/lucene/analysis/common/src/resources/META-
> INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
> +++ b/lucene/analysis/common/src/resources/META-
> INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
> @@ -63,6 +63,7 @@
> org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory
> org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory
> org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory
> org.apache.lucene.analysis.miscellaneous.DateRecognizerFilterFactory
> +org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFil
> terFactory
> org.apache.lucene.analysis.miscellaneous.FingerprintFilterFactory
> org.apache.lucene.analysis.miscellaneous.FixBrokenOffsetsFilterFactory
> org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory
>
> http://git-wip-us.apache.org/repos/asf/lucene-
> solr/blob/5844ed4a/lucene/analysis/common/src/test/org/apache/lucene/a
> nalysis/miscellaneous/DelimitedTermFrequencyTokenFilterTest.java
> ----------------------------------------------------------------------
> diff --git
> a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellane
> ous/DelimitedTermFrequencyTokenFilterTest.java
> b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellane
> ous/DelimitedTermFrequencyTokenFilterTest.java
> new file mode 100644
> index 0000000..7609f6e
> --- /dev/null
> +++
> b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellane
> ous/DelimitedTermFrequencyTokenFilterTest.java
> @@ -0,0 +1,77 @@
> +/*
> + * Licensed to the Apache Software Foundation (ASF) under one or more
> + * contributor license agreements. See the NOTICE file distributed with
> + * this work for additional information regarding copyright ownership.
> + * The ASF licenses this file to You under the Apache License, Version 2.0
> + * (the "License"); you may not use this file except in compliance with
> + * the License. You may obtain a copy of the License at
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +package org.apache.lucene.analysis.miscellaneous;
> +
> +import org.apache.lucene.analysis.BaseTokenStreamTestCase;
> +import org.apache.lucene.analysis.TokenStream;
> +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
> +import
> org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
> +
> +public class DelimitedTermFrequencyTokenFilterTest extends
> BaseTokenStreamTestCase {
> +
> + public void testTermFrequency() throws Exception {
> + String test = "The quick|40 red|4 fox|06 jumped|1 over the lazy|2
> brown|123 dogs|1024";
> + DelimitedTermFrequencyTokenFilter filter =
> + new
> DelimitedTermFrequencyTokenFilter(whitespaceMockTokenizer(test));
> + CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
> + TermFrequencyAttribute tfAtt =
> filter.getAttribute(TermFrequencyAttribute.class);
> + filter.reset();
> + assertTermEquals("The", filter, termAtt, tfAtt, 1);
> + assertTermEquals("quick", filter, termAtt, tfAtt, 40);
> + assertTermEquals("red", filter, termAtt, tfAtt, 4);
> + assertTermEquals("fox", filter, termAtt, tfAtt, 6);
> + assertTermEquals("jumped", filter, termAtt, tfAtt, 1);
> + assertTermEquals("over", filter, termAtt, tfAtt, 1);
> + assertTermEquals("the", filter, termAtt, tfAtt, 1);
> + assertTermEquals("lazy", filter, termAtt, tfAtt, 2);
> + assertTermEquals("brown", filter, termAtt, tfAtt, 123);
> + assertTermEquals("dogs", filter, termAtt, tfAtt, 1024);
> + assertFalse(filter.incrementToken());
> + filter.end();
> + filter.close();
> + }
> +
> + public void testInvalidNegativeTf() throws Exception {
> + String test = "foo bar|-20";
> + DelimitedTermFrequencyTokenFilter filter =
> + new
> DelimitedTermFrequencyTokenFilter(whitespaceMockTokenizer(test));
> + CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
> + TermFrequencyAttribute tfAtt =
> filter.getAttribute(TermFrequencyAttribute.class);
> + filter.reset();
> + assertTermEquals("foo", filter, termAtt, tfAtt, 1);
> + IllegalArgumentException iae =
> expectThrows(IllegalArgumentException.class, filter::incrementToken);
> + assertEquals("Term frequency must be 1 or greater; got -20",
> iae.getMessage());
> + }
> +
> + public void testInvalidFloatTf() throws Exception {
> + String test = "foo bar|1.2";
> + DelimitedTermFrequencyTokenFilter filter =
> + new
> DelimitedTermFrequencyTokenFilter(whitespaceMockTokenizer(test));
> + CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
> + TermFrequencyAttribute tfAtt =
> filter.getAttribute(TermFrequencyAttribute.class);
> + filter.reset();
> + assertTermEquals("foo", filter, termAtt, tfAtt, 1);
> + expectThrows(NumberFormatException.class, filter::incrementToken);
> + }
> +
> + void assertTermEquals(String expected, TokenStream stream,
> CharTermAttribute termAtt, TermFrequencyAttribute tfAtt, int expectedTf)
> throws Exception {
> + assertTrue(stream.incrementToken());
> + assertEquals(expected, termAtt.toString());
> + assertEquals(expectedTf, tfAtt.getTermFrequency());
> + }
> +}
---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: dev-help@lucene.apache.org