You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2017/02/13 19:10:01 UTC
lucene-solr:branch_6x: LUCENE-7465: add SimplePatternTokenizer and
SimplePatternSplitTokenizer,
for tokenization using Lucene's regexp/automaton implementation
Repository: lucene-solr
Updated Branches:
refs/heads/branch_6x a986368fd -> c24e03e6b
LUCENE-7465: add SimplePatternTokenizer and SimplePatternSplitTokenizer, for tokenization using Lucene's regexp/automaton implementation
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/c24e03e6
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/c24e03e6
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/c24e03e6
Branch: refs/heads/branch_6x
Commit: c24e03e6bf4d09e6f31eee8192bb6c0c4b2b6d27
Parents: a986368
Author: Mike McCandless <mi...@apache.org>
Authored: Mon Feb 13 12:50:16 2017 -0500
Committer: Mike McCandless <mi...@apache.org>
Committed: Mon Feb 13 12:52:10 2017 -0500
----------------------------------------------------------------------
lucene/CHANGES.txt | 5 +
.../pattern/SimplePatternSplitTokenizer.java | 258 ++++++++++++++++++
.../SimplePatternSplitTokenizerFactory.java | 76 ++++++
.../pattern/SimplePatternTokenizer.java | 242 ++++++++++++++++
.../pattern/SimplePatternTokenizerFactory.java | 76 ++++++
...apache.lucene.analysis.util.TokenizerFactory | 2 +
.../lucene/analysis/core/TestRandomChains.java | 10 +-
.../TestSimplePatternSplitTokenizer.java | 273 +++++++++++++++++++
.../pattern/TestSimplePatternTokenizer.java | 218 +++++++++++++++
.../lucene/util/automaton/ByteRunAutomaton.java | 4 +-
.../util/automaton/CharacterRunAutomaton.java | 2 +-
.../lucene/util/automaton/Operations.java | 44 +--
.../lucene/util/automaton/RunAutomaton.java | 126 +++++----
.../lucene/search/TermAutomatonScorer.java | 2 +-
14 files changed, 1243 insertions(+), 95 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c24e03e6/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 9156a0c..d5dff3c 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -49,6 +49,11 @@ New Features
SortedNumericSelector.Type can give a ValueSource view of a
SortedNumericDocValues field. (Tomás Fernández Löbbe)
+* LUCENE-7465: Add SimplePatternTokenizer and
+ SimplePatternSplitTokenizer, using Lucene's regexp/automaton
+ implementation for analysis/tokenization (Clinton Gormley, Mike
+ McCandless)
+
Bug Fixes
* LUCENE-7630: Fix (Edge)NGramTokenFilter to no longer drop payloads
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c24e03e6/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java
new file mode 100644
index 0000000..d2b10c1
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java
@@ -0,0 +1,258 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.pattern;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.apache.lucene.util.automaton.Operations;
+import org.apache.lucene.util.automaton.RegExp;
+
+/**
+ * This tokenizer uses a Lucene {@link RegExp} or (expert usage) a pre-built determinized {@link Automaton}, to locate tokens.
+ * The regexp syntax is more limited than {@link PatternTokenizer}, but the tokenization is quite a bit faster. This is just
+ * like {@link SimplePatternTokenizer} except that the pattern should match valid token separator characters, like
+ * {@code String.split}. Empty string tokens are never produced.
+ *
+ * @lucene.experimental
+ */
+
+public final class SimplePatternSplitTokenizer extends Tokenizer {
+
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+  /** Compiled form of the separator pattern; it is stepped one Unicode code point at a time. */
+  private final CharacterRunAutomaton runDFA;
+
+  // TODO: this is copied from SimplePatternTokenizer, but there are subtle differences e.g. we track sepUpto and tokenUpto;
+  // find a clean way to share it:
+
+  // TODO: we could likely use a single rolling buffer instead of two separate char buffers here.  We could also use PushBackReader but I
+  // suspect it's slowish:
+
+  /** Pushed-back chars that can no longer be re-read from {@link #buffer}; the unread range is [pendingUpto, pendingLimit). */
+  private char[] pendingChars = new char[8];
+
+  /** Number of chars appended to the term buffer for the current token, including separator chars still being scanned. */
+  private int tokenUpto;
+
+  private int pendingLimit;
+  private int pendingUpto;
+
+  /** Number of UTF-16 code units consumed from the input so far, net of anything pushed back. */
+  private int offset;
+
+  /** Number of chars appended to the term buffer since the current separator scan began. */
+  private int sepUpto;
+
+  private final char[] buffer = new char[1024];
+
+  /** Number of valid chars in {@link #buffer}, or -1 once the reader has reported end of input. */
+  private int bufferLimit;
+  private int bufferNextRead;
+
+  /** See {@link RegExp} for the accepted syntax. */
+  public SimplePatternSplitTokenizer(String regexp) {
+    this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, regexp, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
+  }
+
+  /** Runs a pre-built automaton. */
+  public SimplePatternSplitTokenizer(Automaton dfa) {
+    this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, dfa);
+  }
+
+  /**
+   * See {@link RegExp} for the accepted syntax.
+   *
+   * @param factory attribute factory handed to the {@link Tokenizer} super class
+   * @param regexp pattern matching the token separators
+   * @param maxDeterminizedStates limit passed to {@link RegExp#toAutomaton(int)} when compiling {@code regexp}
+   */
+  public SimplePatternSplitTokenizer(AttributeFactory factory, String regexp, int maxDeterminizedStates) {
+    // honor the caller's limit instead of silently falling back to the default:
+    this(factory, new RegExp(regexp).toAutomaton(maxDeterminizedStates));
+  }
+
+  /**
+   * Runs a pre-built automaton.
+   *
+   * @throws IllegalArgumentException if {@code dfa} is not deterministic
+   */
+  public SimplePatternSplitTokenizer(AttributeFactory factory, Automaton dfa) {
+    super(factory);
+
+    // we require user to do this up front because it is a possibly very costly operation, and user may be creating us frequently, not
+    // realizing this ctor is otherwise trappy
+    if (dfa.isDeterministic() == false) {
+      throw new IllegalArgumentException("please determinize the incoming automaton first");
+    }
+
+    runDFA = new CharacterRunAutomaton(dfa, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
+  }
+
+  /** Publishes the first {@code tokenUpto} chars of the term buffer as the token, with offsets starting at {@code offsetStart}. */
+  private void fillToken(int offsetStart) {
+    termAtt.setLength(tokenUpto);
+    offsetAtt.setOffset(correctOffset(offsetStart), correctOffset(offsetStart+tokenUpto));
+  }
+
+  /** Emits the accumulated token if it is non-empty; returns whether a token was emitted. */
+  private boolean emitToken(int offsetStart) {
+    if (tokenUpto > 0) {
+      fillToken(offsetStart);
+      return true;
+    }
+    return false;
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+
+    int offsetStart = offset;
+
+    clearAttributes();
+
+    tokenUpto = 0;
+
+    while (true) {
+      sepUpto = 0;
+
+      // The runDFA operates in Unicode space, not UTF16 (java's char):
+      int ch = nextCodePoint();
+      if (ch == -1) {
+        // end of input: whatever accumulated since the last separator is the final token
+        return emitToken(offsetStart);
+      }
+      int state = runDFA.step(0, ch);
+
+      if (state != -1) {
+        // a token separator just possibly started; keep scanning to see if the token is accepted:
+        int lastAcceptLength = -1;
+        do {
+
+          if (runDFA.isAccept(state)) {
+            // record that the token separator matches here, but keep scanning in case a longer match also works (greedy):
+            lastAcceptLength = sepUpto;
+          }
+
+          ch = nextCodePoint();
+          if (ch == -1) {
+            break;
+          }
+          state = runDFA.step(state, ch);
+        } while (state != -1);
+
+        if (lastAcceptLength != -1) {
+          // we found a token separator.  First give back whatever we scanned past the end of the longest match; this must happen
+          // while those chars are still the tail of the term buffer, i.e. before the separator itself is stripped:
+          int extra = sepUpto - lastAcceptLength;
+          if (extra != 0) {
+            pushBack(extra);
+          }
+          // now strip the trailing separator we just matched from the token:
+          tokenUpto -= lastAcceptLength;
+          if (emitToken(offsetStart)) {
+            return true;
+          }
+          // we matched one token separator immediately after another
+          offsetStart = offset;
+        } else if (ch == -1) {
+          // input ended in the middle of an unfinished separator match; the scanned chars stay part of the token
+          return emitToken(offsetStart);
+        } else {
+          // false alarm: there was no token separator here; push back all but the first character we scanned
+          pushBack(sepUpto-1);
+        }
+      }
+      // else: ch cannot start a separator, so it simply stays in the token and we keep going
+    }
+  }
+
+  @Override
+  public void end() throws IOException {
+    super.end();
+    // chars still sitting in the pending buffer were subtracted from offset when pushed back, so add them back here:
+    final int ofs = correctOffset(offset + pendingLimit - pendingUpto);
+    offsetAtt.setOffset(ofs, ofs);
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    offset = 0;
+    pendingUpto = 0;
+    pendingLimit = 0;
+    tokenUpto = 0;
+    sepUpto = 0;
+    bufferNextRead = 0;
+    bufferLimit = 0;
+  }
+
+  /**
+   * Pushes back the last {@code count} characters in current token's buffer, so that they are returned again by
+   * {@link #nextCodeUnit}.  On return {@code tokenUpto} and {@code offset} have both been reduced by {@code count}.
+   */
+  private void pushBack(int count) {
+    tokenUpto -= count;
+    assert tokenUpto >= 0;
+    if (pendingLimit == 0) {
+      if (bufferLimit != -1 && bufferNextRead >= count) {
+        // optimize common case when the chars we are pushing back are still in the buffer.  This is not usable once the reader hit
+        // end of input (bufferLimit == -1), because nextCodeUnit never consults the buffer again after that.
+        bufferNextRead -= count;
+      } else {
+        if (count > pendingChars.length) {
+          pendingChars = ArrayUtil.grow(pendingChars, count);
+        }
+        // tokenUpto was already reduced above, so the chars being pushed back start exactly at tokenUpto:
+        System.arraycopy(termAtt.buffer(), tokenUpto, pendingChars, 0, count);
+        pendingLimit = count;
+      }
+    } else {
+      // we are pushing back what is already in our pending buffer
+      pendingUpto -= count;
+      assert pendingUpto >= 0;
+    }
+    offset -= count;
+  }
+
+  /** Appends one char to the term buffer, growing it if needed, and counts it against the current separator scan. */
+  private void appendToToken(char ch) {
+    char[] buffer = termAtt.buffer();
+    if (tokenUpto == buffer.length) {
+      buffer = termAtt.resizeBuffer(tokenUpto + 1);
+    }
+    buffer[tokenUpto++] = ch;
+    sepUpto++;
+  }
+
+  /**
+   * Returns the next UTF-16 code unit, taken from the pending (pushed back) chars first and otherwise from the reader, or -1 at end
+   * of input.  Every char returned is also appended to the term buffer and counted in {@code offset}.
+   */
+  private int nextCodeUnit() throws IOException {
+    int result;
+    if (pendingUpto < pendingLimit) {
+      result = pendingChars[pendingUpto++];
+      if (pendingUpto == pendingLimit) {
+        // We used up the pending buffer
+        pendingUpto = 0;
+        pendingLimit = 0;
+      }
+      appendToToken((char) result);
+      offset++;
+    } else if (bufferLimit == -1) {
+      return -1;
+    } else {
+      assert bufferNextRead <= bufferLimit: "bufferNextRead=" + bufferNextRead + " bufferLimit=" + bufferLimit;
+      if (bufferNextRead == bufferLimit) {
+        bufferLimit = input.read(buffer, 0, buffer.length);
+        if (bufferLimit == -1) {
+          return -1;
+        }
+        bufferNextRead = 0;
+      }
+      result = buffer[bufferNextRead++];
+      offset++;
+      appendToToken((char) result);
+    }
+    return result;
+  }
+
+  /** Returns the next Unicode code point, combining a high surrogate with the code unit that follows it, or -1 at end of input. */
+  private int nextCodePoint() throws IOException {
+
+    int ch = nextCodeUnit();
+    if (ch == -1) {
+      return ch;
+    }
+    if (Character.isHighSurrogate((char) ch)) {
+      // NOTE(review): if the input ends right after a high surrogate, nextCodeUnit() returns -1 here and is cast to a char;
+      // confirm whether unpaired surrogates need explicit handling.
+      return Character.toCodePoint((char) ch, (char) nextCodeUnit());
+    } else {
+      return ch;
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c24e03e6/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizerFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizerFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizerFactory.java
new file mode 100644
index 0000000..4af6286
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizerFactory.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.pattern;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.util.TokenizerFactory;
+import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.Operations;
+import org.apache.lucene.util.automaton.RegExp;
+
+/**
+ * Factory for {@link SimplePatternSplitTokenizer}, for producing tokens by splitting according to the provided regexp.
+ *
+ * <p>This tokenizer uses Lucene {@link RegExp} pattern matching to construct distinct tokens
+ * for the input stream. The syntax is more limited than {@link PatternTokenizer}, but the
+ * tokenization is quite a bit faster. It takes two arguments:
+ * <br>
+ * <ul>
+ * <li>"pattern" (required) is the regular expression, according to the syntax described at {@link RegExp}</li>
+ * <li>"maxDeterminizedStates" (optional, default 10000) the limit on total state count for the determinized automaton computed from the regexp</li>
+ * </ul>
+ * <p>
+ * The pattern matches the characters that should split tokens, like {@code String.split}, and the
+ * matching is greedy such that the longest token separator matching at a given point is matched. Empty
+ * tokens are never created.
+ *
+ * <p>For example, to match tokens delimited by simple whitespace characters:
+ *
+ * <pre class="prettyprint">
+ * <fieldType name="text_ptn" class="solr.TextField" positionIncrementGap="100">
+ * <analyzer>
+ * <tokenizer class="solr.SimplePatternSplitTokenizerFactory" pattern="[ \t\r\n]+"/>
+ * </analyzer>
+ * </fieldType></pre>
+ *
+ * @lucene.experimental
+ *
+ * @see SimplePatternSplitTokenizer
+ */
+public class SimplePatternSplitTokenizerFactory extends TokenizerFactory {
+  public static final String PATTERN = "pattern";
+  private final Automaton dfa;
+  private final int maxDeterminizedStates;
+
+  /**
+   * Creates a new SimplePatternSplitTokenizerFactory.
+   *
+   * @param args factory arguments; "pattern" is required and "maxDeterminizedStates" is optional
+   * @throws IllegalArgumentException if any argument is left over once the known ones have been read
+   */
+  public SimplePatternSplitTokenizerFactory(Map<String,String> args) {
+    super(args);
+    maxDeterminizedStates = getInt(args, "maxDeterminizedStates", Operations.DEFAULT_MAX_DETERMINIZED_STATES);
+
+    // compile the separator pattern once, here, so that every tokenizer we create shares the same determinized automaton
+    final String separatorRegexp = require(args, PATTERN);
+    final Automaton compiled = new RegExp(separatorRegexp).toAutomaton();
+    dfa = Operations.determinize(compiled, maxDeterminizedStates);
+
+    // presumably getInt/require consume the keys they read, so anything still present is unrecognized
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  /** Builds a tokenizer that splits on the automaton compiled in the constructor. */
+  @Override
+  public SimplePatternSplitTokenizer create(final AttributeFactory factory) {
+    return new SimplePatternSplitTokenizer(factory, dfa);
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c24e03e6/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java
new file mode 100644
index 0000000..867b10a
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java
@@ -0,0 +1,242 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.pattern;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.apache.lucene.util.automaton.Operations;
+import org.apache.lucene.util.automaton.RegExp;
+
+/**
+ * This tokenizer uses a Lucene {@link RegExp} or (expert usage) a pre-built determinized {@link Automaton}, to locate tokens.
+ * The regexp syntax is more limited than {@link PatternTokenizer}, but the tokenization is quite a bit faster. The provided
+ * regex should match valid token characters (not token separator characters, like {@code String.split}). The matching is greedy:
+ * the longest match at a given start point will be the next token. Empty string tokens are never produced.
+ *
+ * @lucene.experimental
+ */
+
+// TODO: the matcher here is naive and does have N^2 adversarial cases that are unlikely to arise in practice, e.g. if the pattern is
+// aaaaaaaaaab and the input is aaaaaaaaaaa, the work we do here is N^2 where N is the number of a's. This is because on failing to match
+// a token, we skip one character forward and try again. A better approach would be to compile something like this regexp
+// instead: .* | <pattern>, because that automaton would not "forget" all the as it had already seen, and would be a single pass
+// through the input. I think this is the same thing as Aho/Corasick's algorithm (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm).
+// But we cannot implement this (I think?) until/unless Lucene regexps support sub-group capture, so we could know
+// which specific characters the pattern matched. SynonymFilter has this same limitation.
+
+public final class SimplePatternTokenizer extends Tokenizer {
+
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+  /** Compiled form of the token pattern; it is stepped one Unicode code point at a time. */
+  private final CharacterRunAutomaton runDFA;
+
+  // TODO: we could likely use a single rolling buffer instead of two separate char buffers here.  We could also use PushBackReader but I
+  // suspect it's slowish:
+
+  /** Pushed-back chars that can no longer be re-read from {@link #buffer}; the unread range is [pendingUpto, pendingLimit). */
+  private char[] pendingChars = new char[8];
+  private int pendingLimit;
+  private int pendingUpto;
+
+  /** Number of UTF-16 code units consumed from the input so far, net of anything pushed back. */
+  private int offset;
+
+  /** Number of chars appended to the term buffer during the current match attempt. */
+  private int tokenUpto;
+
+  private final char[] buffer = new char[1024];
+
+  /** Number of valid chars in {@link #buffer}, or -1 once the reader has reported end of input. */
+  private int bufferLimit;
+  private int bufferNextRead;
+
+  /** See {@link RegExp} for the accepted syntax. */
+  public SimplePatternTokenizer(String regexp) {
+    this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, regexp, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
+  }
+
+  /** Runs a pre-built automaton. */
+  public SimplePatternTokenizer(Automaton dfa) {
+    this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, dfa);
+  }
+
+  /**
+   * See {@link RegExp} for the accepted syntax.
+   *
+   * @param factory attribute factory handed to the {@link Tokenizer} super class
+   * @param regexp pattern matching the tokens
+   * @param maxDeterminizedStates limit passed to {@link RegExp#toAutomaton(int)} when compiling {@code regexp}
+   */
+  public SimplePatternTokenizer(AttributeFactory factory, String regexp, int maxDeterminizedStates) {
+    // honor the caller's limit instead of silently falling back to the default:
+    this(factory, new RegExp(regexp).toAutomaton(maxDeterminizedStates));
+  }
+
+  /**
+   * Runs a pre-built automaton.
+   *
+   * @throws IllegalArgumentException if {@code dfa} is not deterministic
+   */
+  public SimplePatternTokenizer(AttributeFactory factory, Automaton dfa) {
+    super(factory);
+
+    // we require user to do this up front because it is a possibly very costly operation, and user may be creating us frequently, not
+    // realizing this ctor is otherwise trappy
+    if (dfa.isDeterministic() == false) {
+      throw new IllegalArgumentException("please determinize the incoming automaton first");
+    }
+
+    runDFA = new CharacterRunAutomaton(dfa, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+
+    clearAttributes();
+
+    while (true) {
+
+      // every match attempt starts with an empty term buffer:
+      tokenUpto = 0;
+
+      int offsetStart = offset;
+
+      // The runDFA operates in Unicode space, not UTF16 (java's char):
+
+      int ch = nextCodePoint();
+      if (ch == -1) {
+        return false;
+      }
+
+      int state = runDFA.step(0, ch);
+      if (state == -1) {
+        // ch cannot start a token; skip it and try again at the next character
+        continue;
+      }
+
+      // a token just possibly started; keep scanning to see if the token is accepted:
+      int lastAcceptLength = -1;
+      do {
+
+        if (runDFA.isAccept(state)) {
+          // record that the token matches here, but keep scanning in case a longer match also works (greedy):
+          lastAcceptLength = tokenUpto;
+        }
+
+        ch = nextCodePoint();
+        if (ch == -1) {
+          break;
+        }
+        state = runDFA.step(state, ch);
+      } while (state != -1);
+
+      if (lastAcceptLength != -1) {
+        // we found a token; give back whatever we scanned past the end of the longest match
+        int extra = tokenUpto - lastAcceptLength;
+        if (extra != 0) {
+          pushBack(extra);
+        }
+        termAtt.setLength(lastAcceptLength);
+        offsetAtt.setOffset(correctOffset(offsetStart), correctOffset(offsetStart+lastAcceptLength));
+        return true;
+      } else if (ch == -1) {
+        // input ended in the middle of an unfinished match
+        return false;
+      } else {
+        // false alarm: there was no token here; push back all but the first character we scanned
+        pushBack(tokenUpto-1);
+      }
+    }
+  }
+
+  @Override
+  public void end() throws IOException {
+    super.end();
+    // chars still sitting in the pending buffer were subtracted from offset when pushed back, so add them back here:
+    final int ofs = correctOffset(offset + pendingLimit - pendingUpto);
+    offsetAtt.setOffset(ofs, ofs);
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    offset = 0;
+    pendingUpto = 0;
+    pendingLimit = 0;
+    tokenUpto = 0;
+    bufferNextRead = 0;
+    bufferLimit = 0;
+  }
+
+  /**
+   * Pushes back the last {@code count} characters in current token's buffer, so that they are returned again by
+   * {@link #nextCodeUnit}.  {@code tokenUpto} is left untouched; {@code offset} is reduced by {@code count}.
+   */
+  private void pushBack(int count) {
+
+    if (pendingLimit == 0) {
+      if (bufferLimit != -1 && bufferNextRead >= count) {
+        // optimize common case when the chars we are pushing back are still in the buffer.  This is not usable once the reader hit
+        // end of input (bufferLimit == -1), because nextCodeUnit never consults the buffer again after that.
+        bufferNextRead -= count;
+      } else {
+        if (count > pendingChars.length) {
+          pendingChars = ArrayUtil.grow(pendingChars, count);
+        }
+        // the chars being pushed back are the last count chars appended to the term buffer:
+        System.arraycopy(termAtt.buffer(), tokenUpto - count, pendingChars, 0, count);
+        pendingLimit = count;
+      }
+    } else {
+      // we are pushing back what is already in our pending buffer
+      pendingUpto -= count;
+      assert pendingUpto >= 0;
+    }
+    offset -= count;
+  }
+
+  /** Appends one char to the term buffer, growing it if needed. */
+  private void appendToToken(char ch) {
+    char[] buffer = termAtt.buffer();
+    if (tokenUpto == buffer.length) {
+      buffer = termAtt.resizeBuffer(tokenUpto + 1);
+    }
+    buffer[tokenUpto++] = ch;
+  }
+
+  /**
+   * Returns the next UTF-16 code unit, taken from the pending (pushed back) chars first and otherwise from the reader, or -1 at end
+   * of input.  Every char returned is also appended to the term buffer and counted in {@code offset}.
+   */
+  private int nextCodeUnit() throws IOException {
+    int result;
+    if (pendingUpto < pendingLimit) {
+      result = pendingChars[pendingUpto++];
+      if (pendingUpto == pendingLimit) {
+        // We used up the pending buffer
+        pendingUpto = 0;
+        pendingLimit = 0;
+      }
+      appendToToken((char) result);
+      offset++;
+    } else if (bufferLimit == -1) {
+      return -1;
+    } else {
+      assert bufferNextRead <= bufferLimit: "bufferNextRead=" + bufferNextRead + " bufferLimit=" + bufferLimit;
+      if (bufferNextRead == bufferLimit) {
+        bufferLimit = input.read(buffer, 0, buffer.length);
+        if (bufferLimit == -1) {
+          return -1;
+        }
+        bufferNextRead = 0;
+      }
+      result = buffer[bufferNextRead++];
+      offset++;
+      appendToToken((char) result);
+    }
+    return result;
+  }
+
+  /** Returns the next Unicode code point, combining a high surrogate with the code unit that follows it, or -1 at end of input. */
+  private int nextCodePoint() throws IOException {
+
+    int ch = nextCodeUnit();
+    if (ch == -1) {
+      return ch;
+    }
+    if (Character.isHighSurrogate((char) ch)) {
+      // NOTE(review): if the input ends right after a high surrogate, nextCodeUnit() returns -1 here and is cast to a char;
+      // confirm whether unpaired surrogates need explicit handling.
+      return Character.toCodePoint((char) ch, (char) nextCodeUnit());
+    } else {
+      return ch;
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c24e03e6/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizerFactory.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizerFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizerFactory.java
new file mode 100644
index 0000000..3e74d02
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizerFactory.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.pattern;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.util.TokenizerFactory;
+import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.Operations;
+import org.apache.lucene.util.automaton.RegExp;
+
+/**
+ * Factory for {@link SimplePatternTokenizer}, for matching tokens based on the provided regexp.
+ *
+ * <p>This tokenizer uses Lucene {@link RegExp} pattern matching to construct distinct tokens
+ * for the input stream. The syntax is more limited than {@link PatternTokenizer}, but the
+ * tokenization is quite a bit faster. It takes two arguments:
+ * <br>
+ * <ul>
+ * <li>"pattern" (required) is the regular expression, according to the syntax described at {@link RegExp}</li>
+ * <li>"maxDeterminizedStates" (optional, default 10000) the limit on total state count for the determinized automaton computed from the regexp</li>
+ * </ul>
+ * <p>
+ * The pattern matches the characters to include in a token (not the split characters), and the
+ * matching is greedy such that the longest token matching at a given point is created. Empty
+ * tokens are never created.
+ *
+ * <p>For example, to match tokens delimited by simple whitespace characters:
+ *
+ * <pre class="prettyprint">
+ * <fieldType name="text_ptn" class="solr.TextField" positionIncrementGap="100">
+ * <analyzer>
+ * <tokenizer class="solr.SimplePatternTokenizerFactory" pattern="[^ \t\r\n]+"/>
+ * </analyzer>
+ * </fieldType></pre>
+ *
+ * @lucene.experimental
+ *
+ * @see SimplePatternTokenizer
+ */
+public class SimplePatternTokenizerFactory extends TokenizerFactory {
+  public static final String PATTERN = "pattern";
+  private final Automaton dfa;
+  private final int maxDeterminizedStates;
+
+  /**
+   * Creates a new SimplePatternTokenizerFactory.
+   *
+   * @param args factory arguments; "pattern" is required and "maxDeterminizedStates" is optional
+   * @throws IllegalArgumentException if any argument is left over once the known ones have been read
+   */
+  public SimplePatternTokenizerFactory(Map<String,String> args) {
+    super(args);
+    maxDeterminizedStates = getInt(args, "maxDeterminizedStates", Operations.DEFAULT_MAX_DETERMINIZED_STATES);
+
+    // compile the token pattern once, here, so that every tokenizer we create shares the same determinized automaton
+    final String tokenRegexp = require(args, PATTERN);
+    final Automaton compiled = new RegExp(tokenRegexp).toAutomaton();
+    dfa = Operations.determinize(compiled, maxDeterminizedStates);
+
+    // presumably getInt/require consume the keys they read, so anything still present is unrecognized
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  /** Builds a tokenizer that matches tokens with the automaton compiled in the constructor. */
+  @Override
+  public SimplePatternTokenizer create(final AttributeFactory factory) {
+    return new SimplePatternTokenizer(factory, dfa);
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c24e03e6/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory
index be0b7d4..4b37eb8 100644
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory
@@ -21,6 +21,8 @@ org.apache.lucene.analysis.ngram.EdgeNGramTokenizerFactory
org.apache.lucene.analysis.ngram.NGramTokenizerFactory
org.apache.lucene.analysis.path.PathHierarchyTokenizerFactory
org.apache.lucene.analysis.pattern.PatternTokenizerFactory
+org.apache.lucene.analysis.pattern.SimplePatternSplitTokenizerFactory
+org.apache.lucene.analysis.pattern.SimplePatternTokenizerFactory
org.apache.lucene.analysis.standard.ClassicTokenizerFactory
org.apache.lucene.analysis.standard.StandardTokenizerFactory
org.apache.lucene.analysis.standard.UAX29URLEmailTokenizerFactory
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c24e03e6/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
index 8953f9f..3a58bdd 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
@@ -96,7 +96,11 @@ import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Rethrow;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.Version;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.AutomatonTestUtil;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.apache.lucene.util.automaton.Operations;
+import org.apache.lucene.util.automaton.RegExp;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.tartarus.snowball.SnowballProgram;
@@ -494,6 +498,9 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
if (random.nextBoolean()) return null;
return DateFormat.getDateInstance(DateFormat.DEFAULT, randomLocale(random));
});
+ put(Automaton.class, random -> {
+ return Operations.determinize(new RegExp(AutomatonTestUtil.randomRegexp(random()), RegExp.NONE).toAutomaton(), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
+ });
}};
static final Set<Class<?>> allowedTokenizerArgs, allowedTokenFilterArgs, allowedCharFilterArgs;
@@ -503,7 +510,8 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
allowedTokenizerArgs.add(Reader.class);
allowedTokenizerArgs.add(AttributeFactory.class);
allowedTokenizerArgs.add(AttributeSource.class);
-
+ allowedTokenizerArgs.add(Automaton.class);
+
allowedTokenFilterArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
allowedTokenFilterArgs.addAll(argProducers.keySet());
allowedTokenFilterArgs.add(TokenStream.class);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c24e03e6/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java
new file mode 100644
index 0000000..5642c2b
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java
@@ -0,0 +1,273 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.pattern;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.charfilter.MappingCharFilter;
+import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.automaton.Automaton;
+
+public class TestSimplePatternSplitTokenizer extends BaseTokenStreamTestCase {
+
+ public void testGreedy() throws Exception {
+ Tokenizer t = new SimplePatternSplitTokenizer("(foo)+");
+ t.setReader(new StringReader("bar foofoo baz"));
+ assertTokenStreamContents(t,
+ new String[] {"bar ", " baz"},
+ new int[] {0, 10},
+ new int[] {4, 14});
+ }
+
+ public void testBackToBack() throws Exception {
+ Tokenizer t = new SimplePatternSplitTokenizer("foo");
+ t.setReader(new StringReader("bar foofoo baz"));
+ assertTokenStreamContents(t,
+ new String[] {"bar ", " baz"},
+ new int[] {0, 10},
+ new int[] {4, 14});
+ }
+
+ public void testBigLookahead() throws Exception {
+ StringBuilder b = new StringBuilder();
+ for(int i=0;i<100;i++) {
+ b.append('a');
+ }
+ b.append('b');
+ Tokenizer t = new SimplePatternSplitTokenizer(b.toString());
+ CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
+
+ b = new StringBuilder();
+ for(int i=0;i<200;i++) {
+ b.append('a');
+ }
+ t.setReader(new StringReader(b.toString()));
+ t.reset();
+ assertTrue(t.incrementToken());
+ assertEquals(b.toString(), termAtt.toString());
+ assertFalse(t.incrementToken());
+ }
+
+  /** When the separator <code>.*</code> is used on a non-empty random string, no token is produced. */
+  public void testNoTokens() throws Exception {
+    Tokenizer t = new SimplePatternSplitTokenizer(".*");
+
+    // draw random strings until we get a non-empty one
+    String s;
+    do {
+      s = TestUtil.randomUnicodeString(random());
+    } while (s.length() == 0);
+
+    t.setReader(new StringReader(s));
+    t.reset();
+    assertFalse(t.incrementToken());
+  }
+
+ public void testEmptyStringPatternNoMatch() throws Exception {
+ Tokenizer t = new SimplePatternSplitTokenizer("a*");
+ CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
+ t.setReader(new StringReader("bbb"));
+ t.reset();
+ assertTrue(t.incrementToken());
+ assertEquals("bbb", termAtt.toString());
+ assertFalse(t.incrementToken());
+ }
+
+  /**
+   * Splits on a single whitespace character; runs of separators must not yield
+   * empty tokens.
+   */
+  public void testSplitSingleCharWhitespace() throws Exception {
+    Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]");
+    // 'a' at 0, 'b' at 3 and 'c' at 7: three separator characters are needed
+    // between "b" and "c" for the expected offsets below to hold.
+    t.setReader(new StringReader("a \tb   c"));
+    assertTokenStreamContents(t,
+                              new String[] {"a", "b", "c"},
+                              new int[] {0, 3, 7},
+                              new int[] {1, 4, 8});
+  }
+
+  /** Splits on runs of whitespace (<code>[ \t\r\n]*</code>). */
+  public void testSplitMultiCharWhitespace() throws Exception {
+    Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
+    // 'a' at 0, 'b' at 3 and 'c' at 7: three separator characters are needed
+    // between "b" and "c" for the expected offsets below to hold.
+    t.setReader(new StringReader("a \tb   c"));
+    assertTokenStreamContents(t,
+                              new String[] {"a", "b", "c"},
+                              new int[] {0, 3, 7},
+                              new int[] {1, 4, 8});
+  }
+
+  /** Leading separator text must be skipped and must not shift the reported offsets. */
+  public void testLeadingNonToken() throws Exception {
+    Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
+    // four leading spaces, so that 'a' sits at offset 4 and 'c' at offset 6
+    t.setReader(new StringReader("    a c"));
+    assertTokenStreamContents(t,
+                              new String[] {"a", "c"},
+                              new int[] {4, 6},
+                              new int[] {5, 7});
+  }
+
+  /** Trailing separator text must not produce an extra token. */
+  public void testTrailingNonToken() throws Exception {
+    Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
+    t.setReader(new StringReader("a c "));
+    assertTokenStreamContents(t,
+                              new String[] {"a", "c"},
+                              new int[] {0, 2},
+                              new int[] {1, 3});
+  }
+
+  /** The separator <code>a*</code> splits "bbab" only at its single non-empty match. */
+  public void testEmptyStringPatternOneMatch() throws Exception {
+    Tokenizer t = new SimplePatternSplitTokenizer("a*");
+    t.setReader(new StringReader("bbab"));
+    assertTokenStreamContents(t,
+                              new String[] {"bb", "b"},
+                              new int[] {0, 3},
+                              new int[] {2, 4});
+  }
+
+  /** After <code>end()</code> the offset attribute must report the full input length. */
+  public void testEndOffset() throws Exception {
+    final String input = "aaabbb";
+    Tokenizer tokenizer = new SimplePatternSplitTokenizer("a+");
+    CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
+    OffsetAttribute offsets = tokenizer.getAttribute(OffsetAttribute.class);
+    tokenizer.setReader(new StringReader(input));
+    tokenizer.reset();
+
+    // the leading "aaa" is a separator, so "bbb" is the only token
+    assertTrue(tokenizer.incrementToken());
+    assertEquals("bbb", term.toString());
+    assertFalse(tokenizer.incrementToken());
+
+    tokenizer.end();
+    assertEquals(6, offsets.endOffset());
+  }
+
+ public void testFixedToken() throws Exception {
+ Tokenizer t = new SimplePatternSplitTokenizer("aaaa");
+
+ t.setReader(new StringReader("aaaaaaaaaaaaaaa"));
+ assertTokenStreamContents(t,
+ new String[] {"aaa"},
+ new int[] {12},
+ new int[] {15});
+ }
+
+  /** Table-driven check: each row is {separator pattern, input, expected space-joined terms}. */
+  public void testBasic() throws Exception {
+    String[][] cases = {
+      // pattern input output
+      { "--", "aaa--bbb--ccc", "aaa bbb ccc" },
+      { ":", "aaa:bbb:ccc", "aaa bbb ccc" },
+      { ":", "boo:and:foo", "boo and foo" },
+      { "o", "boo:and:foo", "b :and:f" },
+    };
+
+    for (int i = 0; i < cases.length; i++) {
+      String pattern = cases[i][0];
+      String input = cases[i][1];
+      String expected = cases[i][2];
+
+      Tokenizer tokenizer = new SimplePatternSplitTokenizer(pattern);
+      tokenizer.setReader(new StringReader(input));
+      String actual = tsToString(tokenizer);
+      assertEquals("pattern: "+pattern+" with input: "+input, expected, actual);
+    }
+  }
+
+  /** The Automaton constructor must reject an automaton that is not deterministic. */
+  public void testNotDeterminized() throws Exception {
+    Automaton nfa = new Automaton();
+    int initial = nfa.createState();
+    int viaFirst = nfa.createState();
+    int viaSecond = nfa.createState();
+    int accepting = nfa.createState();
+    nfa.setAccept(accepting, true);
+
+    // two transitions out of the initial state on the same label range
+    nfa.addTransition(initial, viaFirst, 'a', 'z');
+    nfa.addTransition(initial, viaSecond, 'a', 'z');
+    nfa.addTransition(viaFirst, accepting, 'b');
+    nfa.addTransition(viaSecond, accepting, 'b');
+
+    expectThrows(IllegalArgumentException.class, () -> new SimplePatternSplitTokenizer(nfa));
+  }
+
+  /**
+   * Offsets must be reported against the original input, not the text produced by
+   * a CharFilter. The six-character entity <code>&amp;uuml;</code> is mapped to the
+   * single character U+00FC before tokenization.
+   */
+  public void testOffsetCorrection() throws Exception {
+    // 33 chars: each "G&uuml;nther" occupies 12 chars of the original input
+    final String INPUT = "G&uuml;nther G&uuml;nther is here";
+
+    // create MappingCharFilter
+    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
+    builder.add("&uuml;", "\u00FC");
+    NormalizeCharMap normMap = builder.build();
+    CharFilter charStream = new MappingCharFilter( normMap, new StringReader(INPUT));
+
+    // create SimplePatternSplitTokenizer
+    Tokenizer stream = new SimplePatternSplitTokenizer("G\u00FCnther");
+    stream.setReader(charStream);
+    assertTokenStreamContents(stream,
+        new String[] { " ", " is here" },
+        new int[] { 12, 25 },
+        new int[] { 13, 33 },
+        INPUT.length());
+  }
+
+  /**
+   * Consumes <code>in</code> and returns its terms joined by single spaces.
+   * The stream is always closed, even if consuming it throws.
+   *
+   * TODO: rewrite tests not to use string comparison.
+   */
+  private static String tsToString(TokenStream in) throws IOException {
+    StringBuilder out = new StringBuilder();
+    CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
+    try {
+      // extra safety to enforce, that the state is not preserved and also
+      // assign bogus values
+      in.clearAttributes();
+      termAtt.setEmpty().append("bogusTerm");
+      in.reset();
+      while (in.incrementToken()) {
+        if (out.length() > 0) {
+          out.append(' ');
+        }
+        out.append(termAtt.toString());
+        in.clearAttributes();
+        termAtt.setEmpty().append("bogusTerm");
+      }
+      // complete the consumer workflow before closing
+      in.end();
+    } finally {
+      in.close();
+    }
+    return out.toString();
+  }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    // Two independent passes, each with a freshly built Analyzer; try-with-resources
+    // closes the Analyzer even when checkRandomData fails.
+    for (int pass = 0; pass < 2; pass++) {
+      try (Analyzer analyzer = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName) {
+          Tokenizer tokenizer = new SimplePatternSplitTokenizer("a");
+          return new TokenStreamComponents(tokenizer);
+        }
+      }) {
+        checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
+      }
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c24e03e6/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java
new file mode 100644
index 0000000..b566713
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java
@@ -0,0 +1,218 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.pattern;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.charfilter.MappingCharFilter;
+import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.automaton.Automaton;
+
+public class TestSimplePatternTokenizer extends BaseTokenStreamTestCase {
+
+ public void testGreedy() throws Exception {
+ Tokenizer t = new SimplePatternTokenizer("(foo)+");
+ t.setReader(new StringReader("bar foofoo baz"));
+ assertTokenStreamContents(t,
+ new String[] {"foofoo"},
+ new int[] {4},
+ new int[] {10});
+ }
+
+ public void testBigLookahead() throws Exception {
+ StringBuilder b = new StringBuilder();
+ for(int i=0;i<100;i++) {
+ b.append('a');
+ }
+ b.append('b');
+ Tokenizer t = new SimplePatternTokenizer(b.toString());
+
+ b = new StringBuilder();
+ for(int i=0;i<200;i++) {
+ b.append('a');
+ }
+ t.setReader(new StringReader(b.toString()));
+ t.reset();
+ assertFalse(t.incrementToken());
+ }
+
+  /** With the pattern <code>.*</code>, the first token is the entire non-empty random input. */
+  public void testOneToken() throws Exception {
+    Tokenizer tokenizer = new SimplePatternTokenizer(".*");
+    CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
+
+    // draw random strings until we get a non-empty one
+    String input;
+    do {
+      input = TestUtil.randomUnicodeString(random());
+    } while (input.length() == 0);
+
+    tokenizer.setReader(new StringReader(input));
+    tokenizer.reset();
+    assertTrue(tokenizer.incrementToken());
+    assertEquals(input, term.toString());
+  }
+
+ public void testEmptyStringPatternNoMatch() throws Exception {
+ Tokenizer t = new SimplePatternTokenizer("a*");
+ t.setReader(new StringReader("bbb"));
+ t.reset();
+ assertFalse(t.incrementToken());
+ }
+
+ public void testEmptyStringPatternOneMatch() throws Exception {
+ Tokenizer t = new SimplePatternTokenizer("a*");
+ CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
+ t.setReader(new StringReader("bbab"));
+ t.reset();
+ assertTrue(t.incrementToken());
+ assertEquals("a", termAtt.toString());
+ assertFalse(t.incrementToken());
+ }
+
+  /** After <code>end()</code> the offset attribute must report the full input length. */
+  public void testEndOffset() throws Exception {
+    final String input = "aaabbb";
+    Tokenizer tokenizer = new SimplePatternTokenizer("a+");
+    CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
+    OffsetAttribute offsets = tokenizer.getAttribute(OffsetAttribute.class);
+    tokenizer.setReader(new StringReader(input));
+    tokenizer.reset();
+
+    // "aaa" is the only match; the trailing "bbb" is skipped
+    assertTrue(tokenizer.incrementToken());
+    assertEquals("aaa", term.toString());
+    assertFalse(tokenizer.incrementToken());
+
+    tokenizer.end();
+    assertEquals(6, offsets.endOffset());
+  }
+
+ public void testFixedToken() throws Exception {
+ Tokenizer t = new SimplePatternTokenizer("aaaa");
+
+ t.setReader(new StringReader("aaaaaaaaaaaaaaa"));
+ assertTokenStreamContents(t,
+ new String[] {"aaaa", "aaaa", "aaaa"},
+ new int[] {0, 4, 8},
+ new int[] {4, 8, 12});
+ }
+
+  /** Table-driven check: each row is {token pattern, input, expected space-joined terms}. */
+  public void testBasic() throws Exception {
+    String qpattern = "\\'([^\\']+)\\'"; // get stuff between "'"
+    String[][] cases = {
+      // pattern input output
+      { ":", "boo:and:foo", ": :" },
+      { qpattern, "aaa 'bbb' 'ccc'", "'bbb' 'ccc'" },
+    };
+
+    for (int i = 0; i < cases.length; i++) {
+      String pattern = cases[i][0];
+      String input = cases[i][1];
+      String expected = cases[i][2];
+
+      Tokenizer tokenizer = new SimplePatternTokenizer(pattern);
+      tokenizer.setReader(new StringReader(input));
+      String actual = tsToString(tokenizer);
+
+      assertEquals("pattern: "+pattern+" with input: "+input, expected, actual);
+    }
+  }
+
+  /** The Automaton constructor must reject an automaton that is not deterministic. */
+  public void testNotDeterminized() throws Exception {
+    Automaton nfa = new Automaton();
+    int initial = nfa.createState();
+    int viaFirst = nfa.createState();
+    int viaSecond = nfa.createState();
+    int accepting = nfa.createState();
+    nfa.setAccept(accepting, true);
+
+    // two transitions out of the initial state on the same label range
+    nfa.addTransition(initial, viaFirst, 'a', 'z');
+    nfa.addTransition(initial, viaSecond, 'a', 'z');
+    nfa.addTransition(viaFirst, accepting, 'b');
+    nfa.addTransition(viaSecond, accepting, 'b');
+
+    expectThrows(IllegalArgumentException.class, () -> new SimplePatternTokenizer(nfa));
+  }
+
+  /**
+   * Offsets must be reported against the original input, not the text produced by
+   * a CharFilter. The six-character entity <code>&amp;uuml;</code> is mapped to the
+   * single character U+00FC before tokenization.
+   */
+  public void testOffsetCorrection() throws Exception {
+    // 33 chars: each "G&uuml;nther" occupies 12 chars of the original input
+    final String INPUT = "G&uuml;nther G&uuml;nther is here";
+
+    // create MappingCharFilter
+    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
+    builder.add("&uuml;", "\u00FC");
+    NormalizeCharMap normMap = builder.build();
+    CharFilter charStream = new MappingCharFilter( normMap, new StringReader(INPUT));
+
+    // create SimplePatternTokenizer
+    Tokenizer stream = new SimplePatternTokenizer("G\u00FCnther");
+    stream.setReader(charStream);
+    assertTokenStreamContents(stream,
+        new String[] { "G\u00FCnther", "G\u00FCnther" },
+        new int[] { 0, 13 },
+        new int[] { 12, 25 },
+        INPUT.length());
+  }
+
+  /**
+   * Consumes <code>in</code> and returns its terms joined by single spaces.
+   * The stream is always closed, even if consuming it throws.
+   *
+   * TODO: rewrite tests not to use string comparison.
+   */
+  private static String tsToString(TokenStream in) throws IOException {
+    StringBuilder out = new StringBuilder();
+    CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
+    try {
+      // extra safety to enforce, that the state is not preserved and also
+      // assign bogus values
+      in.clearAttributes();
+      termAtt.setEmpty().append("bogusTerm");
+      in.reset();
+      while (in.incrementToken()) {
+        if (out.length() > 0) {
+          out.append(' ');
+        }
+        out.append(termAtt.toString());
+        in.clearAttributes();
+        termAtt.setEmpty().append("bogusTerm");
+      }
+      // complete the consumer workflow before closing
+      in.end();
+    } finally {
+      in.close();
+    }
+    return out.toString();
+  }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    // Two independent passes, each with a freshly built Analyzer; try-with-resources
+    // closes the Analyzer even when checkRandomData fails.
+    for (int pass = 0; pass < 2; pass++) {
+      try (Analyzer analyzer = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName) {
+          Tokenizer tokenizer = new SimplePatternTokenizer("a");
+          return new TokenStreamComponents(tokenizer);
+        }
+      }) {
+        checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
+      }
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c24e03e6/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java
index ca14bc6..abd5109 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/ByteRunAutomaton.java
@@ -27,9 +27,9 @@ public class ByteRunAutomaton extends RunAutomaton {
this(a, false, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
}
- /** expert: if utf8 is true, the input is already byte-based */
+ /** expert: if isBinary is true, the input is already byte-based */
public ByteRunAutomaton(Automaton a, boolean isBinary, int maxDeterminizedStates) {
- super(isBinary ? a : new UTF32ToUTF8().convert(a), 256, true, maxDeterminizedStates);
+ super(isBinary ? a : new UTF32ToUTF8().convert(a), 256, maxDeterminizedStates);
}
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c24e03e6/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java
index 70ff9aa..1a9c1c9 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/CharacterRunAutomaton.java
@@ -36,7 +36,7 @@ public class CharacterRunAutomaton extends RunAutomaton {
* it then a TooComplexToDeterminizeException is thrown.
*/
public CharacterRunAutomaton(Automaton a, int maxDeterminizedStates) {
- super(a, Character.MAX_CODE_POINT, false, maxDeterminizedStates);
+ super(a, Character.MAX_CODE_POINT+1, maxDeterminizedStates);
}
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c24e03e6/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
index 718a908..b673a82 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java
@@ -29,24 +29,24 @@
package org.apache.lucene.util.automaton;
-import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.BytesRefBuilder;
-import org.apache.lucene.util.IntsRef;
-import org.apache.lucene.util.IntsRefBuilder;
-import org.apache.lucene.util.RamUsageEstimator;
-
+import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
-import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.IntsRefBuilder;
+import org.apache.lucene.util.RamUsageEstimator;
+
/**
* Automata operations.
*
@@ -335,7 +335,7 @@ final public class Operations {
Transition[][] transitions2 = a2.getSortedTransitions();
Automaton c = new Automaton();
c.createState();
- LinkedList<StatePair> worklist = new LinkedList<>();
+ ArrayDeque<StatePair> worklist = new ArrayDeque<>();
HashMap<StatePair,StatePair> newstates = new HashMap<>();
StatePair p = new StatePair(0, 0, 0);
worklist.add(p);
@@ -435,7 +435,7 @@ final public class Operations {
// TODO: cutover to iterators instead
Transition[][] transitions1 = a1.getSortedTransitions();
Transition[][] transitions2 = a2.getSortedTransitions();
- LinkedList<StatePair> worklist = new LinkedList<>();
+ ArrayDeque<StatePair> worklist = new ArrayDeque<>();
HashSet<StatePair> visited = new HashSet<>();
StatePair p = new StatePair(0, 0);
worklist.add(p);
@@ -682,7 +682,7 @@ final public class Operations {
// Create state 0:
b.createState();
- LinkedList<SortedIntSet.FrozenIntSet> worklist = new LinkedList<>();
+ ArrayDeque<SortedIntSet.FrozenIntSet> worklist = new ArrayDeque<>();
Map<SortedIntSet.FrozenIntSet,Integer> newstate = new HashMap<>();
worklist.add(initialset);
@@ -804,7 +804,7 @@ final public class Operations {
return false;
}
- LinkedList<Integer> workList = new LinkedList<>();
+ ArrayDeque<Integer> workList = new ArrayDeque<>();
BitSet seen = new BitSet(a.getNumStates());
workList.add(0);
seen.set(0);
@@ -907,7 +907,7 @@ final public class Operations {
if (numStates == 0) {
return live;
}
- LinkedList<Integer> workList = new LinkedList<>();
+ ArrayDeque<Integer> workList = new ArrayDeque<>();
live.set(0);
workList.add(0);
@@ -946,7 +946,7 @@ final public class Operations {
}
Automaton a2 = builder.finish();
- LinkedList<Integer> workList = new LinkedList<>();
+ ArrayDeque<Integer> workList = new ArrayDeque<>();
BitSet live = new BitSet(numStates);
BitSet acceptBits = a.getAcceptStates();
int s = 0;
@@ -1011,22 +1011,6 @@ final public class Operations {
}
/**
- * Finds the largest entry whose value is less than or equal to c, or 0 if
- * there is no such entry.
- */
- static int findIndex(int c, int[] points) {
- int a = 0;
- int b = points.length;
- while (b - a > 1) {
- int d = (a + b) >>> 1;
- if (points[d] > c) b = d;
- else if (points[d] < c) a = d;
- else return d;
- }
- return a;
- }
-
- /**
* Returns true if the language of this automaton is finite. The
* automaton must not have any dead states.
*/
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c24e03e6/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java
index 1d64095..4f53926 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RunAutomaton.java
@@ -38,13 +38,62 @@ import java.util.Arrays;
*/
public abstract class RunAutomaton {
final Automaton automaton;
- final int maxInterval;
+ final int alphabetSize;
final int size;
final boolean[] accept;
final int[] transitions; // delta(state,c) = transitions[state*points.length +
// getCharClass(c)]
final int[] points; // char interval start points
- final int[] classmap; // map from char number to class class
+ final int[] classmap; // map from char number to class
+
+ /**
+ * Constructs a new <code>RunAutomaton</code> from <code>a</code>, delegating to
+ * the three-argument constructor with
+ * {@link Operations#DEFAULT_MAX_DETERMINIZED_STATES} as the determinization limit.
+ *
+ * @param a an automaton
+ * @param alphabetSize exclusive upper bound on the input symbols
+ * (<code>step</code> asserts <code>c &lt; alphabetSize</code>)
+ */
+ protected RunAutomaton(Automaton a, int alphabetSize) {
+ this(a, alphabetSize, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
+ }
+
+ /**
+ * Constructs a new <code>RunAutomaton</code> from a deterministic
+ * <code>Automaton</code>.
+ *
+ * @param a an automaton
+ * @param maxDeterminizedStates maximum number of states that can be created
+ * while determinizing a
+ */
+ protected RunAutomaton(Automaton a, int alphabetSize, int maxDeterminizedStates) {
+ this.alphabetSize = alphabetSize;
+ a = Operations.determinize(a, maxDeterminizedStates);
+ this.automaton = a;
+ points = a.getStartPoints();
+ size = Math.max(1,a.getNumStates());
+ accept = new boolean[size];
+ transitions = new int[size * points.length];
+ Arrays.fill(transitions, -1);
+ for (int n=0;n<size;n++) {
+ accept[n] = a.isAccept(n);
+ for (int c = 0; c < points.length; c++) {
+ int dest = a.step(n, points[c]);
+ assert dest == -1 || dest < size;
+ transitions[n * points.length + c] = dest;
+ }
+ }
+
+ /*
+ * Set alphabet table for optimal run performance.
+ */
+ classmap = new int[Math.min(256, alphabetSize)];
+ int i = 0;
+ for (int j = 0; j < classmap.length; j++) {
+ if (i + 1 < points.length && j == points[i + 1]) {
+ i++;
+ }
+ classmap[j] = i;
+ }
+ }
/**
* Returns a string representation of this automaton.
@@ -63,7 +112,7 @@ public abstract class RunAutomaton {
int min = points[j];
int max;
if (j + 1 < points.length) max = (points[j + 1] - 1);
- else max = maxInterval;
+ else max = alphabetSize;
b.append(" ");
Automaton.appendCharString(min, b);
if (min != max) {
@@ -103,63 +152,19 @@ public abstract class RunAutomaton {
* Gets character class of given codepoint
*/
final int getCharClass(int c) {
- return Operations.findIndex(c, points);
- }
- /**
- * Constructs a new <code>RunAutomaton</code> from a deterministic
- * <code>Automaton</code>.
- *
- * @param a an automaton
- */
- public RunAutomaton(Automaton a, int maxInterval, boolean tableize) {
- this(a, maxInterval, tableize, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
- }
-
- /**
- * Constructs a new <code>RunAutomaton</code> from a deterministic
- * <code>Automaton</code>.
- *
- * @param a an automaton
- * @param maxDeterminizedStates maximum number of states that can be created
- * while determinizing a
- */
- public RunAutomaton(Automaton a, int maxInterval, boolean tableize,
- int maxDeterminizedStates) {
- this.maxInterval = maxInterval;
- a = Operations.determinize(a, maxDeterminizedStates);
- this.automaton = a;
- points = a.getStartPoints();
- size = Math.max(1,a.getNumStates());
- accept = new boolean[size];
- transitions = new int[size * points.length];
- Arrays.fill(transitions, -1);
- for (int n=0;n<size;n++) {
- accept[n] = a.isAccept(n);
- for (int c = 0; c < points.length; c++) {
- int dest = a.step(n, points[c]);
- assert dest == -1 || dest < size;
- transitions[n * points.length + c] = dest;
- }
- }
-
- /*
- * Set alphabet table for optimal run performance.
- */
- if (tableize) {
- classmap = new int[maxInterval + 1];
- int i = 0;
- for (int j = 0; j <= maxInterval; j++) {
- if (i + 1 < points.length && j == points[i + 1]) {
- i++;
- }
- classmap[j] = i;
- }
- } else {
- classmap = null;
+ // binary search
+ int a = 0;
+ int b = points.length;
+ while (b - a > 1) {
+ int d = (a + b) >>> 1;
+ if (points[d] > c) b = d;
+ else if (points[d] < c) a = d;
+ else return d;
}
+ return a;
}
-
+
/**
* Returns the state obtained by reading the given char from the given state.
* Returns -1 if not obtaining any such state. (If the original
@@ -168,7 +173,8 @@ public abstract class RunAutomaton {
* transition function.)
*/
public final int step(int state, int c) {
- if (classmap == null) {
+ assert c < alphabetSize;
+ if (c >= classmap.length) {
return transitions[state * points.length + getCharClass(c)];
} else {
return transitions[state * points.length + classmap[c]];
@@ -179,7 +185,7 @@ public abstract class RunAutomaton {
public int hashCode() {
final int prime = 31;
int result = 1;
- result = prime * result + maxInterval;
+ result = prime * result + alphabetSize;
result = prime * result + points.length;
result = prime * result + size;
return result;
@@ -191,7 +197,7 @@ public abstract class RunAutomaton {
if (obj == null) return false;
if (getClass() != obj.getClass()) return false;
RunAutomaton other = (RunAutomaton) obj;
- if (maxInterval != other.maxInterval) return false;
+ if (alphabetSize != other.alphabetSize) return false;
if (size != other.size) return false;
if (!Arrays.equals(points, other.points)) return false;
if (!Arrays.equals(accept, other.accept)) return false;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c24e03e6/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonScorer.java
----------------------------------------------------------------------
diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonScorer.java b/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonScorer.java
index 9c5cf7f..c330ae2 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonScorer.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/search/TermAutomatonScorer.java
@@ -367,7 +367,7 @@ class TermAutomatonScorer extends Scorer {
static class TermRunAutomaton extends RunAutomaton {
public TermRunAutomaton(Automaton a, int termCount) {
- super(a, termCount, true);
+ super(a, termCount);
}
}