You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rj...@apache.org on 2014/11/02 01:10:02 UTC

svn commit: r1636074 - /lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizer40.java

Author: rjernst
Date: Sun Nov  2 00:10:02 2014
New Revision: 1636074

URL: http://svn.apache.org/r1636074
Log:
LUCENE-6043: Forgot to svn add the new file

Added:
    lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizer40.java   (with props)

Added: lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizer40.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizer40.java?rev=1636074&view=auto
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizer40.java (added)
+++ lucene/dev/branches/branch_5x/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/std40/UAX29URLEmailTokenizer40.java Sun Nov  2 00:10:02 2014
@@ -0,0 +1,144 @@
+package org.apache.lucene.analysis.standard.std40;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.AttributeSource;
+
+/** Backcompat uax29 tokenizer for Lucene 4.0-4.6. This supports Unicode 6.1.
+ *
+ * @deprecated Use {@link org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer}
+ */
+@Deprecated
+public final class UAX29URLEmailTokenizer40 extends Tokenizer {
+  /** A private instance of the JFlex-constructed scanner */
+  private final UAX29URLEmailTokenizerImpl40 scanner;
+  
+  public static final int ALPHANUM          = 0;
+  public static final int NUM               = 1;
+  public static final int SOUTHEAST_ASIAN   = 2;
+  public static final int IDEOGRAPHIC       = 3;
+  public static final int HIRAGANA          = 4;
+  public static final int KATAKANA          = 5;
+  public static final int HANGUL            = 6;
+  public static final int URL               = 7;
+  public static final int EMAIL             = 8;
+
+  /** String token types that correspond to token type int constants */
+  public static final String [] TOKEN_TYPES = new String [] {
+      StandardTokenizer40.TOKEN_TYPES[StandardTokenizer40.ALPHANUM],
+      StandardTokenizer40.TOKEN_TYPES[StandardTokenizer40.NUM],
+      StandardTokenizer40.TOKEN_TYPES[StandardTokenizer40.SOUTHEAST_ASIAN],
+      StandardTokenizer40.TOKEN_TYPES[StandardTokenizer40.IDEOGRAPHIC],
+      StandardTokenizer40.TOKEN_TYPES[StandardTokenizer40.HIRAGANA],
+      StandardTokenizer40.TOKEN_TYPES[StandardTokenizer40.KATAKANA],
+      StandardTokenizer40.TOKEN_TYPES[StandardTokenizer40.HANGUL],
+    "<URL>",
+    "<EMAIL>",
+  };
+
+  private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
+
+  /** Set the max allowed token length.  Any token longer
+   *  than this is skipped. */
+  public void setMaxTokenLength(int length) {
+    this.maxTokenLength = length;
+  }
+
+  /** @see #setMaxTokenLength */
+  public int getMaxTokenLength() {
+    return maxTokenLength;
+  }
+
+  /**
+   * Creates a new instance of the UAX29URLEmailTokenizer.  Attaches
+   * the <code>input</code> to the newly created JFlex scanner.
+   */
+  public UAX29URLEmailTokenizer40() {
+    scanner = new UAX29URLEmailTokenizerImpl40(input);
+  }
+
+  /**
+   * Creates a new UAX29URLEmailTokenizer with a given {@link AttributeSource}. 
+   */
+  public UAX29URLEmailTokenizer40(AttributeFactory factory) {
+    super(factory);
+    scanner = new UAX29URLEmailTokenizerImpl40(input);
+  }
+
+  // this tokenizer generates three attributes:
+  // term offset, positionIncrement and type
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+
+  @Override
+  public final boolean incrementToken() throws IOException {
+    clearAttributes();
+    int posIncr = 1;
+
+    while(true) {
+      int tokenType = scanner.getNextToken();
+
+      if (tokenType == UAX29URLEmailTokenizerImpl40.YYEOF) {
+        return false;
+      }
+
+      if (scanner.yylength() <= maxTokenLength) {
+        posIncrAtt.setPositionIncrement(posIncr);
+        scanner.getText(termAtt);
+        final int start = scanner.yychar();
+        offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length()));
+        typeAtt.setType(TOKEN_TYPES[tokenType]);
+        return true;
+      } else
+        // When we skip a too-long term, we still increment the
+        // position increment
+        posIncr++;
+    }
+  }
+  
+  @Override
+  public final void end() throws IOException {
+    super.end();
+    // set final offset
+    int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
+    offsetAtt.setOffset(finalOffset, finalOffset);
+  }
+
+  @Override
+  public void close() throws IOException {
+    super.close();
+    scanner.yyreset(input);
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    scanner.yyreset(input);
+  }
+}