You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by td...@apache.org on 2010/09/08 20:02:18 UTC
svn commit: r995171 - /mahout/trunk/core/src/main/java/org/apache/mahout/vectors/LuceneTextValueEncoder.java

Author: tdunning
Date: Wed Sep  8 18:02:18 2010
New Revision: 995171

URL: http://svn.apache.org/viewvc?rev=995171&view=rev
Log:
MAHOUT-498 - Lucene based text encoder with more exposed text encoding methods

Added:
    mahout/trunk/core/src/main/java/org/apache/mahout/vectors/LuceneTextValueEncoder.java

Added: mahout/trunk/core/src/main/java/org/apache/mahout/vectors/LuceneTextValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/LuceneTextValueEncoder.java?rev=995171&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/LuceneTextValueEncoder.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/LuceneTextValueEncoder.java Wed Sep  8 18:02:18 2010
@@ -0,0 +1,196 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.vectors;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.nio.CharBuffer;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+/**
+ * Encodes text by splitting it into terms with a Lucene {@link Analyzer}.
+ * @see TextValueEncoder
+ */
+public class LuceneTextValueEncoder extends TextValueEncoder {
+  private Analyzer analyzer;
+
+  /**
+   * Creates an encoder with the given name.  An analyzer must be supplied through
+   * {@link #setAnalyzer(Analyzer)} before any text is tokenized.
+   *
+   * @param name the encoder name; it is passed to the superclass and the value of
+   *             {@code getName()} is later handed to the analyzer as the field name
+   */
+  public LuceneTextValueEncoder(String name) {
+    super(name);
+  }
+
+  /**
+   * Sets the Lucene analyzer used by {@link #tokenize(CharSequence)}.
+   *
+   * @param analyzer the analyzer that will split text into terms
+   */
+  public void setAnalyzer(Analyzer analyzer) {
+    this.analyzer = analyzer;
+  }
+
+  /**
+   * Tokenizes a string by running it through the configured Lucene analyzer.
+   *
+   * @param originalForm the text to tokenize
+   * @return the terms produced by the analyzer, pulled lazily from its token stream
+   * @throws IllegalStateException if no analyzer has been set
+   */
+  @Override
+  protected Iterable<String> tokenize(CharSequence originalForm) {
+    if (analyzer == null) {
+      // Fail with an explanation instead of an anonymous NullPointerException.
+      throw new IllegalStateException("No analyzer set; call setAnalyzer() before tokenizing");
+    }
+    TokenStream ts = analyzer.tokenStream(getName(), new CharSequenceReader(originalForm));
+    ts.addAttribute(TermAttribute.class);
+    return new LuceneTokenIterable(ts);
+  }
+
+  /**
+   * Minimal {@link Reader} over an in-memory character sequence.
+   */
+  private static class CharSequenceReader extends Reader {
+    private final CharBuffer buf;
+
+    /**
+     * Wraps a snapshot of the input so that reading starts at its first character.
+     *
+     * @param input the characters to expose through this reader
+     */
+    private CharSequenceReader(CharSequence input) {
+      // toString() detaches the reader from mutable inputs.  wrap() leaves the buffer position
+      // at 0 and the limit at the text length, so the buffer is immediately ready for get().
+      buf = CharBuffer.wrap(input.toString());
+    }
+
+    /**
+     * Reads characters into a portion of an array.  This never blocks because all of the input
+     * is already in memory.
+     *
+     * @param cbuf Destination buffer
+     * @param off  Offset at which to start storing characters
+     * @param len  Maximum number of characters to read
+     * @return The number of characters read, or -1 if the end of the stream has been reached
+     * @throws java.io.IOException never thrown by this implementation; declared by {@link Reader}
+     */
+    @Override
+    public int read(char[] cbuf, int off, int len) throws IOException {
+      if (len == 0) {
+        return 0;
+      }
+      // Never ask the buffer for more than it holds; that would raise BufferUnderflowException.
+      int toRead = Math.min(len, buf.remaining());
+      if (toRead == 0) {
+        return -1;
+      }
+      buf.get(cbuf, off, toRead);
+      return toRead;
+    }
+
+    /**
+     * Does nothing; this reader holds no system resources.
+     *
+     * @throws java.io.IOException never thrown by this implementation; declared by {@link Reader}
+     */
+    @Override
+    public void close() throws IOException {
+      // do nothing
+    }
+  }
+
+  /**
+   * Exposes the terms of a {@link TokenStream} as an {@link Iterable} of strings.
+   */
+  private static class LuceneTokenIterable implements Iterable<String> {
+    private boolean firstTime = true;
+    private final TokenStream tokenStream;
+
+    public LuceneTokenIterable(TokenStream ts) {
+      this.tokenStream = ts;
+    }
+
+    /**
+     * Returns an iterator over the terms of the wrapped stream.  The first call iterates the
+     * stream as it is; every later call resets the stream first.
+     * NOTE(review): whether reset() also rewinds the already-consumed reader depends on the
+     * analyzer, so a second pass may come back empty -- confirm before relying on re-iteration.
+     *
+     * @return an Iterator.
+     * @throws IllegalStateException if the stream cannot be reset
+     */
+    @Override
+    public Iterator<String> iterator() {
+      if (firstTime) {
+        firstTime = false;
+      } else {
+        try {
+          tokenStream.reset();
+        } catch (IOException e) {
+          // Keep the cause so the underlying I/O failure is not lost.
+          throw new IllegalStateException("This token stream can't be reset", e);
+        }
+      }
+
+      return new TokenStreamIterator(tokenStream);
+    }
+  }
+
+  /**
+   * Iterator that pulls one term at a time from a {@link TokenStream}.
+   */
+  private static class TokenStreamIterator implements Iterator<String> {
+    private final TokenStream tokenStream;
+    private final TermAttribute termAttribute;
+    private String bufferedToken = null;
+
+    public TokenStreamIterator(TokenStream tokenStream) {
+      this.tokenStream = tokenStream;
+      // tokenize() registers the attribute on the stream before any iterator is created.
+      this.termAttribute = tokenStream.getAttribute(TermAttribute.class);
+    }
+
+    /**
+     * Returns <tt>true</tt> if the iteration has more elements.  When no term is buffered this
+     * advances the stream by one token and buffers its text for the following {@link #next()}.
+     *
+     * @return <tt>true</tt> if the iterator has more elements.
+     */
+    @Override
+    public boolean hasNext() {
+      if (bufferedToken != null) {
+        return true;
+      }
+      boolean advanced;
+      try {
+        advanced = tokenStream.incrementToken();
+      } catch (IOException e) {
+        throw new TokenizationException("IO error while tokenizing", e);
+      }
+      if (advanced) {
+        // term() yields the bare term text; toString() is only a debugging representation.
+        bufferedToken = termAttribute.term();
+      }
+      return advanced;
+    }
+
+    /**
+     * Returns the next element in the iteration.
+     *
+     * @return the next element in the iteration.
+     * @throws java.util.NoSuchElementException
+     *          iteration has no more elements.
+     */
+    @Override
+    public String next() {
+      if (!hasNext()) {
+        throw new NoSuchElementException("Ran off end of token stream");
+      }
+      String r = bufferedToken;
+      bufferedToken = null;
+      return r;
+    }
+
+    /**
+     * Always fails; tokens cannot be removed from a token stream.
+     *
+     * @throws UnsupportedOperationException always
+     */
+    @Override
+    public void remove() {
+      throw new UnsupportedOperationException("Can't remove tokens");
+    }
+  }
+
+  /**
+   * Unchecked wrapper for an {@link IOException} raised while pulling tokens from a stream.
+   */
+  private static class TokenizationException extends RuntimeException {
+    public TokenizationException(String msg, Throwable cause) {
+      super(msg, cause);
+    }
+  }
+}