You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rj...@apache.org on 2014/11/02 01:10:02 UTC
svn commit: r1636074 -
/lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizer40.java
Author: rjernst
Date: Sun Nov 2 00:10:02 2014
New Revision: 1636074
URL: http://svn.apache.org/r1636074
Log:
LUCENE-6043: Forgot to svn add the new file
Added:
lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizer40.java (with props)
Added: lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizer40.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizer40.java?rev=1636074&view=auto
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizer40.java (added)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizer40.java Sun Nov 2 00:10:02 2014
@@ -0,0 +1,144 @@
+package org.apache.lucene.analysis.standard.std40;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.AttributeSource;
+
+/** Backcompat UAX29 URL/email tokenizer for Lucene 4.0-4.6. This supports Unicode 6.1.
+ *
+ * @deprecated Use {@link org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer} instead.
+ */
+@Deprecated
+public final class UAX29URLEmailTokenizer40 extends Tokenizer {
+ /** A private instance of the JFlex-constructed scanner; assigned exactly once, in each constructor. */
+ private final UAX29URLEmailTokenizerImpl40 scanner;
+
+ public static final int ALPHANUM = 0; // these int constants line up with the indices of TOKEN_TYPES below
+ public static final int NUM = 1;
+ public static final int SOUTHEAST_ASIAN = 2;
+ public static final int IDEOGRAPHIC = 3;
+ public static final int HIRAGANA = 4;
+ public static final int KATAKANA = 5;
+ public static final int HANGUL = 6;
+ public static final int URL = 7;
+ public static final int EMAIL = 8;
+
+ /** String token types that correspond to token type int constants; the first seven reuse the StandardTokenizer40 names. */
+ public static final String [] TOKEN_TYPES = new String [] {
+ StandardTokenizer40.TOKEN_TYPES[StandardTokenizer40.ALPHANUM],
+ StandardTokenizer40.TOKEN_TYPES[StandardTokenizer40.NUM],
+ StandardTokenizer40.TOKEN_TYPES[StandardTokenizer40.SOUTHEAST_ASIAN],
+ StandardTokenizer40.TOKEN_TYPES[StandardTokenizer40.IDEOGRAPHIC],
+ StandardTokenizer40.TOKEN_TYPES[StandardTokenizer40.HIRAGANA],
+ StandardTokenizer40.TOKEN_TYPES[StandardTokenizer40.KATAKANA],
+ StandardTokenizer40.TOKEN_TYPES[StandardTokenizer40.HANGUL],
+ "<URL>",
+ "<EMAIL>",
+ };
+
+ private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH; // upper bound checked in incrementToken()
+
+ /** Set the max allowed token length. Any token longer than this is skipped
+ * by incrementToken(). The value is stored as given, with no range check. */
+ public void setMaxTokenLength(int length) {
+ this.maxTokenLength = length;
+ }
+
+ /** Returns the current max allowed token length; see {@link #setMaxTokenLength}. */
+ public int getMaxTokenLength() {
+ return maxTokenLength;
+ }
+
+ /**
+ * Creates a new instance of the UAX29URLEmailTokenizer40. Attaches
+ * the <code>input</code> to the newly created JFlex scanner.
+ */
+ public UAX29URLEmailTokenizer40() {
+ scanner = new UAX29URLEmailTokenizerImpl40(input);
+ }
+
+ /**
+ * Creates a new UAX29URLEmailTokenizer40 with a given {@link AttributeFactory}.
+ */
+ public UAX29URLEmailTokenizer40(AttributeFactory factory) {
+ super(factory);
+ scanner = new UAX29URLEmailTokenizerImpl40(input);
+ }
+
+ // this tokenizer generates four attributes:
+ // term, offset, positionIncrement and type
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+ private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+
+ @Override
+ public final boolean incrementToken() throws IOException {
+ clearAttributes();
+ int posIncr = 1; // raised once for every too-long token skipped before the next emitted one
+
+ while(true) {
+ int tokenType = scanner.getNextToken();
+
+ if (tokenType == UAX29URLEmailTokenizerImpl40.YYEOF) { // YYEOF presumably marks end of input; no token is produced
+ return false;
+ }
+
+ if (scanner.yylength() <= maxTokenLength) { // emit only tokens within the length limit
+ posIncrAtt.setPositionIncrement(posIncr);
+ scanner.getText(termAtt); // presumably copies the matched text into termAtt - confirm against the scanner
+ final int start = scanner.yychar();
+ offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length())); // both ends go through correctOffset
+ typeAtt.setType(TOKEN_TYPES[tokenType]); // int token type -> its string name
+ return true;
+ } else
+ // When we skip a too-long term, we still increment the
+ // position increment, so the next emitted token carries the gap
+ posIncr++;
+ }
+ }
+
+ @Override
+ public final void end() throws IOException {
+ super.end();
+ // set final offset (start and end are the same corrected value)
+ int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
+ offsetAtt.setOffset(finalOffset, finalOffset);
+ }
+
+ @Override
+ public void close() throws IOException {
+ super.close();
+ scanner.yyreset(input); // hands the scanner whatever input refers to after super.close()
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ scanner.yyreset(input); // hands the current input to the scanner after super.reset()
+ }
+}