You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2010/05/03 14:44:23 UTC

svn commit: r940433 [2/3] - in /lucene/dev/trunk/lucene: ./ contrib/ contrib/analyzers/ contrib/analyzers/stempel/ contrib/analyzers/stempel/src/ contrib/analyzers/stempel/src/java/ contrib/analyzers/stempel/src/java/org/ contrib/analyzers/stempel/src/...

Added: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/MultiTrie2.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/MultiTrie2.java?rev=940433&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/MultiTrie2.java (added)
+++ lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/MultiTrie2.java Mon May  3 12:44:22 2010
@@ -0,0 +1,333 @@
+/*
+                    Egothor Software License version 1.00
+                    Copyright (C) 1997-2004 Leo Galambos.
+                 Copyright (C) 2002-2004 "Egothor developers"
+                      on behalf of the Egothor Project.
+                             All rights reserved.
+
+   This  software  is  copyrighted  by  the "Egothor developers". If this
+   license applies to a single file or document, the "Egothor developers"
+   are the people or entities mentioned as copyright holders in that file
+   or  document.  If  this  license  applies  to the Egothor project as a
+   whole,  the  copyright holders are the people or entities mentioned in
+   the  file CREDITS. This file can be found in the same location as this
+   license in the distribution.
+
+   Redistribution  and  use  in  source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    1. Redistributions  of  source  code  must retain the above copyright
+       notice, the list of contributors, this list of conditions, and the
+       following disclaimer.
+    2. Redistributions  in binary form must reproduce the above copyright
+       notice, the list of contributors, this list of conditions, and the
+       disclaimer  that  follows  these  conditions  in the documentation
+       and/or other materials provided with the distribution.
+    3. The name "Egothor" must not be used to endorse or promote products
+       derived  from  this software without prior written permission. For
+       written permission, please contact Leo.G@seznam.cz
+    4. Products  derived  from this software may not be called "Egothor",
+       nor  may  "Egothor"  appear  in  their name, without prior written
+       permission from Leo.G@seznam.cz.
+
+   In addition, we request that you include in the end-user documentation
+   provided  with  the  redistribution  and/or  in the software itself an
+   acknowledgement equivalent to the following:
+   "This product includes software developed by the Egothor Project.
+    http://egothor.sf.net/"
+
+   THIS  SOFTWARE  IS  PROVIDED  ``AS  IS''  AND ANY EXPRESSED OR IMPLIED
+   WARRANTIES,  INCLUDING,  BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+   MERCHANTABILITY  AND  FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+   IN  NO  EVENT  SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
+   FOR   ANY   DIRECT,   INDIRECT,  INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR
+   CONSEQUENTIAL  DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE  GOODS  OR  SERVICES;  LOSS  OF  USE,  DATA, OR PROFITS; OR
+   BUSINESS  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+   WHETHER  IN  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+   OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+   IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   This  software  consists  of  voluntary  contributions  made  by  many
+   individuals  on  behalf  of  the  Egothor  Project  and was originally
+   created by Leo Galambos (Leo.G@seznam.cz).
+ */
+package org.egothor.stemmer;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * The MultiTrie is a Trie of Tries.
+ * <p>
+ * It stores words and their associated patch commands. MultiTrie2 handles
+ * patch commands broken into their constituent parts, as a MultiTrie does, but
+ * the commands are delimited by the skip command.
+ */
+public class MultiTrie2 extends MultiTrie {
+  /**
+   * Constructor for the MultiTrie2 object; delegates to the superclass.
+   * 
+   * @param is the data input handed to the superclass constructor
+   * @exception IOException if an I/O error occurs
+   */
+  public MultiTrie2(DataInput is) throws IOException {
+    super(is);
+  }
+  
+  /**
+   * Constructor for the MultiTrie2 object; delegates to the superclass.
+   * 
+   * @param forward set to <tt>true</tt> if the elements should be read left to
+   *          right; keys are then trimmed at the front, otherwise at the end
+   */
+  public MultiTrie2(boolean forward) {
+    super(forward);
+  }
+  
+  /**
+   * Return the element for the given key: the pieces that the tries return
+   * for it, in order, up to a null, a lone EOM or a piece that cannot follow.
+   * @param key the key to the cell holding the desired element
+   * @return the concatenated pieces (possibly empty), never <tt>null</tt>
+   */
+  @Override
+  public CharSequence getFully(CharSequence key) {
+    StringBuilder result = new StringBuilder(tries.size() * 2);
+    try {
+      CharSequence lastkey = key;
+      CharSequence p[] = new CharSequence[tries.size()];
+      char lastch = ' ';
+      for (int i = 0; i < tries.size(); i++) {
+        CharSequence r = tries.get(i).getFully(lastkey);
+        if (r == null || (r.length() == 1 && r.charAt(0) == EOM)) {
+          return result;
+        }
+        if (cannotFollow(lastch, r.charAt(0))) {
+          return result;
+        } else {
+          lastch = r.charAt(r.length() - 2);
+        }
+        // A piece that starts with '-' shortens the key for the later tries.
+        p[i] = r;
+        if (p[i].charAt(0) == '-') {
+          if (i > 0) {
+            key = skip(key, lengthPP(p[i - 1]));
+          }
+          key = skip(key, lengthPP(p[i]));
+        }
+        // An emptied key is not looked up; the last non-empty one is reused.
+        result.append(r);
+        if (key.length() != 0) {
+          lastkey = key;
+        }
+      }
+    } catch (IndexOutOfBoundsException x) {} // ignored: keep what was collected
+    return result;
+  }
+  
+  /**
+   * Return the element that is stored as last on a path belonging to the given
+   * key: the pieces that the tries return for it via getLastOnPath, in order,
+   * up to a null, a lone EOM or a piece that cannot follow the previous one.
+   * @param key the key associated with the desired element
+   * @return the concatenated pieces (possibly empty), never <tt>null</tt>
+   */
+  @Override
+  public CharSequence getLastOnPath(CharSequence key) {
+    StringBuilder result = new StringBuilder(tries.size() * 2);
+    try {
+      CharSequence lastkey = key;
+      CharSequence p[] = new CharSequence[tries.size()];
+      char lastch = ' ';
+      for (int i = 0; i < tries.size(); i++) {
+        CharSequence r = tries.get(i).getLastOnPath(lastkey);
+        if (r == null || (r.length() == 1 && r.charAt(0) == EOM)) {
+          return result;
+        }
+        // Stop as well when the piece may not come after the previous piece.
+        if (cannotFollow(lastch, r.charAt(0))) {
+          return result;
+        } else {
+          lastch = r.charAt(r.length() - 2);
+        }
+        // A piece that starts with '-' shortens the key for the later tries.
+        p[i] = r;
+        if (p[i].charAt(0) == '-') {
+          if (i > 0) {
+            key = skip(key, lengthPP(p[i - 1]));
+          }
+          key = skip(key, lengthPP(p[i]));
+        }
+        // An emptied key is not looked up; the last non-empty one is reused.
+        result.append(r);
+        if (key.length() != 0) {
+          lastkey = key;
+        }
+      }
+    } catch (IndexOutOfBoundsException x) {} // ignored: keep what was collected
+    return result;
+  }
+  
+  /**
+   * Write this data structure to the given output by delegating to the
+   * superclass; this override adds nothing of its own.
+   * @param os the output to write to
+   * @exception IOException if an I/O error occurs
+   */
+  @Override
+  public void store(DataOutput os) throws IOException {
+    super.store(os);
+  }
+  
+  /**
+   * Add an element to this structure consisting of the given key and patch
+   * command. The command is decomposed into pieces; piece <tt>i</tt> is added
+   * to trie <tt>i</tt> and an EOM node to the trie after the last piece.
+   * <p>
+   * This method will return without executing if the <tt>cmd</tt>
+   * parameter's length is 0.
+   * @param key the key
+   * @param cmd the patch command
+   */
+  @Override
+  public void add(CharSequence key, CharSequence cmd) {
+    if (cmd.length() == 0) {
+      return;
+    }
+    // Split the command into pieces; piece i is stored in trie i.
+    CharSequence p[] = decompose(cmd);
+    int levels = p.length;
+    // Make sure there is one trie per piece plus one more for the EOM node.
+    while (levels >= tries.size()) {
+      tries.add(new Trie(forward));
+    }
+    CharSequence lastkey = key;
+    for (int i = 0; i < levels; i++) {
+      if (key.length() > 0) {
+        tries.get(i).add(key, p[i]);
+        lastkey = key;
+      } else {
+        tries.get(i).add(lastkey, p[i]);
+      }
+      // (An emptied key is never stored; the last non-empty key stands in.)
+      /*
+       * A piece starting with '-' shortens the key used for the next tries.
+       */
+      if (p[i].length() > 0 && p[i].charAt(0) == '-') {
+        if (i > 0) {
+          key = skip(key, lengthPP(p[i - 1]));
+        }
+        key = skip(key, lengthPP(p[i]));
+      }
+      // key now holds what is left for trie i + 1.
+    }
+    if (key.length() > 0) {
+      tries.get(levels).add(key, EOM_NODE); // terminate the entry
+    } else {
+      tries.get(levels).add(lastkey, EOM_NODE);
+    }
+  }
+  
+  /**
+   * Break the given patch command into its constituent pieces. A '-' at the
+   * current position starts a two-character piece; any other position starts
+   * a piece that ends before the next '-' dashEven finds, or at the end.
+   * @param cmd the patch command
+   * @return an array containing the pieces of the command
+   */
+  public CharSequence[] decompose(CharSequence cmd) {
+    final int length = cmd.length();
+    
+    // First pass: count the pieces so the array can be sized exactly.
+    int count = 0;
+    int pos = 0;
+    while (0 <= pos && pos < length) {
+      int dash = dashEven(cmd, pos);
+      pos = (dash == pos) ? dash + 2 : dash;
+      count++;
+    }
+    
+    // Second pass: cut the pieces out in the same order.
+    CharSequence pieces[] = new CharSequence[count];
+    int filled = 0;
+    pos = 0;
+    while (0 <= pos && pos < length) {
+      int dash = dashEven(cmd, pos);
+      if (dash == pos) {
+        pieces[filled++] = cmd.subSequence(pos, pos + 2);
+        pos = dash + 2;
+      } else {
+        int end = (dash < 0) ? length : dash;
+        pieces[filled++] = cmd.subSequence(pos, end);
+        pos = dash;
+      }
+    }
+    return pieces;
+  }
+  
+  /**
+   * Reduce every contained Trie with the given Reduce and return a new
+   * MultiTrie2 that holds the reduced tries in their original order.
+   * @param by the Reduce that is applied to each contained Trie
+   * @return a new MultiTrie2 built from the reduced tries
+   */
+  @Override
+  public Trie reduce(Reduce by) {
+    List<Trie> shrunk = new ArrayList<Trie>();
+    for (Trie level : tries) {
+      shrunk.add(level.reduce(by));
+    }
+    MultiTrie2 reduced = new MultiTrie2(forward);
+    reduced.tries = shrunk;
+    return reduced;
+  }
+  
+  private boolean cannotFollow(char after, char goes) {
+    // Only '-' and 'D' restrict what may come next: neither of them may be
+    // followed by the very same character.
+    if (after != '-' && after != 'D') {
+      return false;
+    }
+    return after == goes;
+  }
+  
+  private CharSequence skip(CharSequence in, int count) {
+    if (forward) {
+      return in.subSequence(count, in.length());
+    } else {
+      return in.subSequence(0, in.length() - count);
+    }
+  }
+  
+  private int dashEven(CharSequence in, int from) {
+    // Probe every second character, starting at from, and report the index
+    // of the first '-' that is found, or -1 when there is none.
+    for (int pos = from; pos < in.length(); pos += 2) {
+      if (in.charAt(pos) == '-') {
+        return pos;
+      }
+    }
+    return -1;
+  }
+  
+  private int lengthPP(CharSequence cmd) {
+    // cmd is read as (command, argument) pairs. '-' and 'D' contribute
+    // argument - 'a' + 1, 'R' contributes 1, and every other command
+    // (including 'I') contributes nothing; its argument is not even read.
+    // The sum is what callers pass to skip() as the count.
+    int total = 0;
+    for (int at = 0; at < cmd.length(); at += 2) {
+      char op = cmd.charAt(at);
+      if (op == '-' || op == 'D') {
+        total += cmd.charAt(at + 1) - 'a' + 1;
+      } else if (op == 'R') {
+        total++;
+      }
+    }
+    return total;
+  }
+}

Propchange: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/MultiTrie2.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Optimizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Optimizer.java?rev=940433&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Optimizer.java (added)
+++ lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Optimizer.java Mon May  3 12:44:22 2010
@@ -0,0 +1,198 @@
+/*
+                    Egothor Software License version 1.00
+                    Copyright (C) 1997-2004 Leo Galambos.
+                 Copyright (C) 2002-2004 "Egothor developers"
+                      on behalf of the Egothor Project.
+                             All rights reserved.
+
+   This  software  is  copyrighted  by  the "Egothor developers". If this
+   license applies to a single file or document, the "Egothor developers"
+   are the people or entities mentioned as copyright holders in that file
+   or  document.  If  this  license  applies  to the Egothor project as a
+   whole,  the  copyright holders are the people or entities mentioned in
+   the  file CREDITS. This file can be found in the same location as this
+   license in the distribution.
+
+   Redistribution  and  use  in  source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    1. Redistributions  of  source  code  must retain the above copyright
+       notice, the list of contributors, this list of conditions, and the
+       following disclaimer.
+    2. Redistributions  in binary form must reproduce the above copyright
+       notice, the list of contributors, this list of conditions, and the
+       disclaimer  that  follows  these  conditions  in the documentation
+       and/or other materials provided with the distribution.
+    3. The name "Egothor" must not be used to endorse or promote products
+       derived  from  this software without prior written permission. For
+       written permission, please contact Leo.G@seznam.cz
+    4. Products  derived  from this software may not be called "Egothor",
+       nor  may  "Egothor"  appear  in  their name, without prior written
+       permission from Leo.G@seznam.cz.
+
+   In addition, we request that you include in the end-user documentation
+   provided  with  the  redistribution  and/or  in the software itself an
+   acknowledgement equivalent to the following:
+   "This product includes software developed by the Egothor Project.
+    http://egothor.sf.net/"
+
+   THIS  SOFTWARE  IS  PROVIDED  ``AS  IS''  AND ANY EXPRESSED OR IMPLIED
+   WARRANTIES,  INCLUDING,  BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+   MERCHANTABILITY  AND  FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+   IN  NO  EVENT  SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
+   FOR   ANY   DIRECT,   INDIRECT,  INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR
+   CONSEQUENTIAL  DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE  GOODS  OR  SERVICES;  LOSS  OF  USE,  DATA, OR PROFITS; OR
+   BUSINESS  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+   WHETHER  IN  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+   OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+   IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   This  software  consists  of  voluntary  contributions  made  by  many
+   individuals  on  behalf  of  the  Egothor  Project  and was originally
+   created by Leo Galambos (Leo.G@seznam.cz).
+ */
+package org.egothor.stemmer;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * The Optimizer class is a Reduce that reduces a Trie (removes its empty rows).
+ * <p>
+ * The reduction will be made by joining two rows where the first is a subset of
+ * the second.
+ */
+public class Optimizer extends Reduce {
+  /**
+   * Constructor for the Optimizer object; there is nothing to initialize.
+   */
+  public Optimizer() {}
+  
+  /**
+   * Merge compatible rows of the given Trie, drop the rows that are no longer
+   * reachable from the root, and return the resulting Trie.
+   * 
+   * @param orig the Trie to consolidate
+   * @return the newly consolidated Trie
+   */
+  @Override
+  public Trie optimize(Trie orig) {
+    List<CharSequence> cmds = orig.cmds;
+    List<Row> rows = new ArrayList<Row>();
+    List<Row> orows = orig.rows;
+    int remap[] = new int[orows.size()];
+    // Visit the rows last to first; remap[j] records where row j ended up.
+    for (int j = orows.size() - 1; j >= 0; j--) {
+      Row now = new Remap(orows.get(j), remap); // refs go through remap so far
+      boolean merged = false;
+      // Fold the row into the first already-kept row it can be merged with.
+      for (int i = 0; i < rows.size(); i++) {
+        Row q = merge(now, rows.get(i));
+        if (q != null) {
+          rows.set(i, q);
+          merged = true;
+          remap[j] = i;
+          break;
+        }
+      }
+      // Nothing was compatible: keep the row as a new entry.
+      if (merged == false) {
+        remap[j] = rows.size();
+        rows.add(now);
+      }
+    }
+    // remap is reused below, so save the root's new index before resetting it.
+    int root = remap[orig.root];
+    Arrays.fill(remap, -1);
+    rows = removeGaps(root, rows, new ArrayList<Row>(), remap);
+    
+    return new Trie(orig.forward, remap[root], cmds, rows);
+  }
+  
+  /**
+   * Merge the given rows into a new Row. A cell of the master is copied when
+   * the existing Row has none for that character and merged cell by cell
+   * otherwise; cells only the existing Row has are taken over as they are.
+   * @param master the master Row
+   * @param existing the existing Row
+   * @return the resulting Row, or <tt>null</tt> if a cell cannot be merged
+   */
+  public Row merge(Row master, Row existing) {
+    Row combined = new Row();
+    // XXX also must handle Cnt and Skip !!
+    for (Character key : master.cells.keySet()) {
+      Cell mine = master.cells.get(key);
+      Cell theirs = existing.cells.get(key);
+      Cell cell;
+      if (theirs == null) {
+        cell = new Cell(mine);
+      } else {
+        cell = merge(mine, theirs);
+      }
+      if (cell == null) {
+        return null;
+      }
+      combined.cells.put(key, cell);
+    }
+    // Whatever only the existing Row has is carried over (not copied).
+    for (Character key : existing.cells.keySet()) {
+      if (master.at(key) == null) {
+        combined.cells.put(key, existing.at(key));
+      }
+    }
+    return combined;
+  }
+  
+  /**
+   * Merge the given Cells. Their skip values must be equal, and cmd and ref
+   * must each either agree or be set (non-negative) in only one of the Cells.
+   * @param m the master Cell
+   * @param e the existing Cell
+   * @return the resulting Cell, whose cnt is the sum of both cnt values, or
+   *         <tt>null</tt> if the operation cannot be realized
+   */
+  public Cell merge(Cell m, Cell e) {
+    // Cells with different skip values can never be merged.
+    if (m.skip != e.skip) {
+      return null;
+    }
+    
+    Cell n = new Cell();
+    // cmd: a negative value counts as unset here; when both Cells have it
+    // set, the values must agree.
+    if (m.cmd >= 0) {
+      if (e.cmd >= 0) {
+        if (m.cmd == e.cmd) {
+          n.cmd = m.cmd;
+        } else {
+          return null;
+        }
+      } else {
+        n.cmd = m.cmd;
+      }
+    } else {
+      n.cmd = e.cmd;
+    }
+    // ref: same rule as for cmd. The skip values were already compared at
+    // the top, so they need no second check here.
+    if (m.ref >= 0) {
+      if (e.ref >= 0) {
+        if (m.ref == e.ref) {
+          n.ref = m.ref;
+        } else {
+          return null;
+        }
+      } else {
+        n.ref = m.ref;
+      }
+    } else {
+      n.ref = e.ref;
+    }
+    n.cnt = m.cnt + e.cnt;
+    n.skip = m.skip;
+    return n;
+  }
+}

Propchange: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Optimizer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Optimizer2.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Optimizer2.java?rev=940433&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Optimizer2.java (added)
+++ lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Optimizer2.java Mon May  3 12:44:22 2010
@@ -0,0 +1,90 @@
+/*
+                    Egothor Software License version 1.00
+                    Copyright (C) 1997-2004 Leo Galambos.
+                 Copyright (C) 2002-2004 "Egothor developers"
+                      on behalf of the Egothor Project.
+                             All rights reserved.
+
+   This  software  is  copyrighted  by  the "Egothor developers". If this
+   license applies to a single file or document, the "Egothor developers"
+   are the people or entities mentioned as copyright holders in that file
+   or  document.  If  this  license  applies  to the Egothor project as a
+   whole,  the  copyright holders are the people or entities mentioned in
+   the  file CREDITS. This file can be found in the same location as this
+   license in the distribution.
+
+   Redistribution  and  use  in  source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    1. Redistributions  of  source  code  must retain the above copyright
+       notice, the list of contributors, this list of conditions, and the
+       following disclaimer.
+    2. Redistributions  in binary form must reproduce the above copyright
+       notice, the list of contributors, this list of conditions, and the
+       disclaimer  that  follows  these  conditions  in the documentation
+       and/or other materials provided with the distribution.
+    3. The name "Egothor" must not be used to endorse or promote products
+       derived  from  this software without prior written permission. For
+       written permission, please contact Leo.G@seznam.cz
+    4. Products  derived  from this software may not be called "Egothor",
+       nor  may  "Egothor"  appear  in  their name, without prior written
+       permission from Leo.G@seznam.cz.
+
+   In addition, we request that you include in the end-user documentation
+   provided  with  the  redistribution  and/or  in the software itself an
+   acknowledgement equivalent to the following:
+   "This product includes software developed by the Egothor Project.
+    http://egothor.sf.net/"
+
+   THIS  SOFTWARE  IS  PROVIDED  ``AS  IS''  AND ANY EXPRESSED OR IMPLIED
+   WARRANTIES,  INCLUDING,  BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+   MERCHANTABILITY  AND  FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+   IN  NO  EVENT  SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
+   FOR   ANY   DIRECT,   INDIRECT,  INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR
+   CONSEQUENTIAL  DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE  GOODS  OR  SERVICES;  LOSS  OF  USE,  DATA, OR PROFITS; OR
+   BUSINESS  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+   WHETHER  IN  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+   OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+   IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   This  software  consists  of  voluntary  contributions  made  by  many
+   individuals  on  behalf  of  the  Egothor  Project  and was originally
+   created by Leo Galambos (Leo.G@seznam.cz).
+ */
+package org.egothor.stemmer;
+
+/**
+ * The Optimizer2 class is an Optimizer that reduces a Trie (removes empty rows).
+ * <p>
+ * This is the result of allowing a joining of rows when there is no collision
+ * between non-<tt>null</tt> values in the rows. Information loss, resulting in
+ * the stemmer not being able to recognize words (as in Optimizer), is
+ * curtailed, allowing the stemmer to recognize words for which the original
+ * trie was built. Use of this class allows the stemmer to be self-teaching.
+ */
+public class Optimizer2 extends Optimizer {
+  /**
+   * Constructor for the Optimizer2 object; there is nothing to initialize.
+   */
+  public Optimizer2() {}
+  
+  /**
+   * Merge the given Cells and return the resulting Cell.
+   * 
+   * @param m the master Cell
+   * @param e the existing Cell
+   * @return the resulting Cell, or <tt>null</tt> if the operation cannot be
+   *         realized
+   */
+  @Override
+  public Cell merge(Cell m, Cell e) {
+    if (m.cmd == e.cmd && m.ref == e.ref && m.skip == e.skip) {
+      Cell c = new Cell(m);
+      c.cnt += e.cnt;
+      return c;
+    } else {
+      return null;
+    }
+  }
+}

Propchange: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Optimizer2.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Reduce.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Reduce.java?rev=940433&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Reduce.java (added)
+++ lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Reduce.java Mon May  3 12:44:22 2010
@@ -0,0 +1,134 @@
+/*
+                    Egothor Software License version 1.00
+                    Copyright (C) 1997-2004 Leo Galambos.
+                 Copyright (C) 2002-2004 "Egothor developers"
+                      on behalf of the Egothor Project.
+                             All rights reserved.
+
+   This  software  is  copyrighted  by  the "Egothor developers". If this
+   license applies to a single file or document, the "Egothor developers"
+   are the people or entities mentioned as copyright holders in that file
+   or  document.  If  this  license  applies  to the Egothor project as a
+   whole,  the  copyright holders are the people or entities mentioned in
+   the  file CREDITS. This file can be found in the same location as this
+   license in the distribution.
+
+   Redistribution  and  use  in  source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    1. Redistributions  of  source  code  must retain the above copyright
+       notice, the list of contributors, this list of conditions, and the
+       following disclaimer.
+    2. Redistributions  in binary form must reproduce the above copyright
+       notice, the list of contributors, this list of conditions, and the
+       disclaimer  that  follows  these  conditions  in the documentation
+       and/or other materials provided with the distribution.
+    3. The name "Egothor" must not be used to endorse or promote products
+       derived  from  this software without prior written permission. For
+       written permission, please contact Leo.G@seznam.cz
+    4. Products  derived  from this software may not be called "Egothor",
+       nor  may  "Egothor"  appear  in  their name, without prior written
+       permission from Leo.G@seznam.cz.
+
+   In addition, we request that you include in the end-user documentation
+   provided  with  the  redistribution  and/or  in the software itself an
+   acknowledgement equivalent to the following:
+   "This product includes software developed by the Egothor Project.
+    http://egothor.sf.net/"
+
+   THIS  SOFTWARE  IS  PROVIDED  ``AS  IS''  AND ANY EXPRESSED OR IMPLIED
+   WARRANTIES,  INCLUDING,  BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+   MERCHANTABILITY  AND  FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+   IN  NO  EVENT  SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
+   FOR   ANY   DIRECT,   INDIRECT,  INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR
+   CONSEQUENTIAL  DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE  GOODS  OR  SERVICES;  LOSS  OF  USE,  DATA, OR PROFITS; OR
+   BUSINESS  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+   WHETHER  IN  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+   OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+   IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   This  software  consists  of  voluntary  contributions  made  by  many
+   individuals  on  behalf  of  the  Egothor  Project  and was originally
+   created by Leo Galambos (Leo.G@seznam.cz).
+ */
+package org.egothor.stemmer;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * The Reduce object is used to remove gaps in a Trie which stores a dictionary.
+ */
+public class Reduce {
+  
+  /**
+   * Constructor for the Reduce object; there is nothing to initialize.
+   */
+  public Reduce() {}
+  
+  /**
+   * Optimize (remove holes in the rows) the given Trie: the rows reachable
+   * from its root are copied into a new list and renumbered without gaps.
+   * 
+   * @param orig the Trie to optimize
+   * @return the restructured Trie
+   */
+  public Trie optimize(Trie orig) {
+    List<CharSequence> cmds = orig.cmds;
+    List<Row> orows = orig.rows;
+    // -1 marks an original row that has not been copied (yet).
+    int remap[] = new int[orows.size()];
+    Arrays.fill(remap, -1);
+    // Walk the ORIGINAL rows from the root; handing removeGaps an empty list
+    // here would make its old.get(ind) fail with IndexOutOfBoundsException.
+    List<Row> rows = removeGaps(orig.root, orows, new ArrayList<Row>(), remap);
+    return new Trie(orig.forward, remap[orig.root], cmds, rows);
+  }
+  
+  List<Row> removeGaps(int ind, List<Row> old, List<Row> to, int remap[]) {
+    // Claim the next slot of the new list for this row before descending,
+    // so that rows referring back to it do not visit it a second time.
+    remap[ind] = to.size();
+    Row row = old.get(ind);
+    to.add(row);
+    for (Cell cell : row.cells.values()) {
+      if (cell.ref >= 0 && remap[cell.ref] < 0) {
+        removeGaps(cell.ref, old, to, remap);
+      }
+    }
+    // Every referenced row has its new index now: store the translated copy.
+    to.set(remap[ind], new Remap(row, remap));
+    return to;
+  }
+  
+  /**
+   * A Row built as a copy of another Row in which every row reference has
+   * been translated through a remap table. The cells of the source Row are
+   * never assigned to.
+   */
+  class Remap extends Row {
+    /**
+     * Copy the cells of the given Row into this Row. In each copy, a
+     * non-negative ref is replaced by the remap entry at that index; cells
+     * with a negative ref are copied as they are.
+     * 
+     * @param old the Row to copy
+     * @param remap the new row index for each old row index
+     */
+    public Remap(Row old, int remap[]) {
+      super();
+      for (Character key : old.cells.keySet()) {
+        // Copy first, then patch the copy; the source cell stays as it was.
+        Cell source = old.at(key);
+        Cell copy = new Cell(source);
+        if (source.ref >= 0) {
+          copy.ref = remap[copy.ref];
+        }
+        cells.put(key, copy);
+      }
+    }
+  }
+}

Propchange: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Reduce.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Row.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Row.java?rev=940433&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Row.java (added)
+++ lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Row.java Mon May  3 12:44:22 2010
@@ -0,0 +1,309 @@
+/*
+                    Egothor Software License version 1.00
+                    Copyright (C) 1997-2004 Leo Galambos.
+                 Copyright (C) 2002-2004 "Egothor developers"
+                      on behalf of the Egothor Project.
+                             All rights reserved.
+
+   This  software  is  copyrighted  by  the "Egothor developers". If this
+   license applies to a single file or document, the "Egothor developers"
+   are the people or entities mentioned as copyright holders in that file
+   or  document.  If  this  license  applies  to the Egothor project as a
+   whole,  the  copyright holders are the people or entities mentioned in
+   the  file CREDITS. This file can be found in the same location as this
+   license in the distribution.
+
+   Redistribution  and  use  in  source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    1. Redistributions  of  source  code  must retain the above copyright
+       notice, the list of contributors, this list of conditions, and the
+       following disclaimer.
+    2. Redistributions  in binary form must reproduce the above copyright
+       notice, the list of contributors, this list of conditions, and the
+       disclaimer  that  follows  these  conditions  in the documentation
+       and/or other materials provided with the distribution.
+    3. The name "Egothor" must not be used to endorse or promote products
+       derived  from  this software without prior written permission. For
+       written permission, please contact Leo.G@seznam.cz
+    4. Products  derived  from this software may not be called "Egothor",
+       nor  may  "Egothor"  appear  in  their name, without prior written
+       permission from Leo.G@seznam.cz.
+
+   In addition, we request that you include in the end-user documentation
+   provided  with  the  redistribution  and/or  in the software itself an
+   acknowledgement equivalent to the following:
+   "This product includes software developed by the Egothor Project.
+    http://egothor.sf.net/"
+
+   THIS  SOFTWARE  IS  PROVIDED  ``AS  IS''  AND ANY EXPRESSED OR IMPLIED
+   WARRANTIES,  INCLUDING,  BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+   MERCHANTABILITY  AND  FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+   IN  NO  EVENT  SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
+   FOR   ANY   DIRECT,   INDIRECT,  INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR
+   CONSEQUENTIAL  DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE  GOODS  OR  SERVICES;  LOSS  OF  USE,  DATA, OR PROFITS; OR
+   BUSINESS  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+   WHETHER  IN  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+   OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+   IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   This  software  consists  of  voluntary  contributions  made  by  many
+   individuals  on  behalf  of  the  Egothor  Project  and was originally
+   created by Leo Galambos (Leo.G@seznam.cz).
+ */
+package org.egothor.stemmer;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.TreeMap;
+
+/**
+ * The Row class represents a row in a matrix representation of a trie.
+ * <p>
+ * A row maps characters to {@link Cell}s. A cell may carry a patch command
+ * index (<code>cmd</code>), a reference to another row (<code>ref</code>), or
+ * both; negative values mean "not set".
+ */
+public class Row {
+  /** The cells of this row, keyed (and ordered) by character. */
+  TreeMap<Character,Cell> cells = new TreeMap<Character,Cell>();
+  /** Number of cells sharing the uniform command, set by {@link #uniformCmd}. */
+  int uniformCnt = 0;
+  /** Skip value of the uniform command, set by {@link #uniformCmd}. */
+  int uniformSkip = 0;
+  
+  /**
+   * Construct a Row object from input carried in via the given input stream.
+   * The expected layout is the one written by {@link #store(DataOutput)}: a
+   * cell count followed by (char, cmd, cnt, ref, skip) for each cell.
+   * 
+   * @param is the input stream
+   * @exception IOException if an I/O error occurs
+   */
+  public Row(DataInput is) throws IOException {
+    for (int remaining = is.readInt(); remaining > 0; remaining--) {
+      char ch = is.readChar();
+      Cell c = new Cell();
+      c.cmd = is.readInt();
+      c.cnt = is.readInt();
+      c.ref = is.readInt();
+      c.skip = is.readInt();
+      cells.put(ch, c);
+    }
+  }
+  
+  /**
+   * The default constructor for the Row object.
+   */
+  public Row() {}
+  
+  /**
+   * Construct a Row using the cells of the given Row. The cell map is shared
+   * with <code>old</code>, not copied, so changes made through either Row are
+   * visible in both.
+   * 
+   * @param old the Row whose cells are reused
+   */
+  public Row(Row old) {
+    cells = old.cells;
+  }
+  
+  /**
+   * Set the command in the Cell of the given Character to the given integer.
+   * The Cell is created if it does not exist yet. Its count is reset to 1 for
+   * a non-negative command and to 0 otherwise.
+   * 
+   * @param way the Character defining the Cell
+   * @param cmd the new command
+   */
+  public void setCmd(Character way, int cmd) {
+    Cell c = at(way);
+    if (c == null) {
+      c = new Cell();
+      cells.put(way, c);
+    }
+    c.cmd = cmd;
+    c.cnt = (cmd >= 0) ? 1 : 0;
+  }
+  
+  /**
+   * Set the reference to the next row in the Cell of the given Character to the
+   * given integer. The Cell is created if it does not exist yet.
+   * 
+   * @param way the Character defining the Cell
+   * @param ref The new ref value
+   */
+  public void setRef(Character way, int ref) {
+    Cell c = at(way);
+    if (c == null) {
+      c = new Cell();
+      cells.put(way, c);
+    }
+    c.ref = ref;
+  }
+  
+  /**
+   * Return the number of cells in use, i.e. cells holding a command, a
+   * reference, or both.
+   * 
+   * @return the number of cells in use
+   */
+  public int getCells() {
+    int size = 0;
+    for (Cell e : cells.values()) {
+      if (e.cmd >= 0 || e.ref >= 0) {
+        size++;
+      }
+    }
+    return size;
+  }
+  
+  /**
+   * Return the number of references (how many transitions) to other rows.
+   * 
+   * @return the number of references
+   */
+  public int getCellsPnt() {
+    int size = 0;
+    for (Cell e : cells.values()) {
+      if (e.ref >= 0) {
+        size++;
+      }
+    }
+    return size;
+  }
+  
+  /**
+   * Return the number of patch commands saved in this Row.
+   * 
+   * @return the number of patch commands
+   */
+  public int getCellsVal() {
+    int size = 0;
+    for (Cell e : cells.values()) {
+      if (e.cmd >= 0) {
+        size++;
+      }
+    }
+    return size;
+  }
+  
+  /**
+   * Return the command in the Cell associated with the given Character.
+   * 
+   * @param way the Character associated with the Cell holding the desired
+   *          command
+   * @return the command, or -1 if there is no Cell for the Character
+   */
+  public int getCmd(Character way) {
+    Cell c = at(way);
+    return (c == null) ? -1 : c.cmd;
+  }
+  
+  /**
+   * Return the number of patch commands that were in the Cell associated with
+   * the given Character before the Trie containing this Row was reduced.
+   * 
+   * @param way the Character associated with the desired Cell
+   * @return the number of patch commands before reduction, or -1 if there is
+   *         no Cell for the Character
+   */
+  public int getCnt(Character way) {
+    Cell c = at(way);
+    return (c == null) ? -1 : c.cnt;
+  }
+  
+  /**
+   * Return the reference to the next Row in the Cell associated with the given
+   * Character.
+   * 
+   * @param way the Character associated with the desired Cell
+   * @return the reference, or -1 if the Cell is <tt>null</tt>
+   */
+  public int getRef(Character way) {
+    Cell c = at(way);
+    return (c == null) ? -1 : c.ref;
+  }
+  
+  /**
+   * Write the contents of this Row to the given output stream. Cells holding
+   * neither a command nor a reference are not written.
+   * 
+   * @param os the output stream
+   * @exception IOException if an I/O error occurs
+   */
+  public void store(DataOutput os) throws IOException {
+    // The count must match the number of records actually written below,
+    // otherwise Row(DataInput) would read past the end of this row. Empty
+    // cells are skipped, so count only the cells in use.
+    os.writeInt(getCells());
+    for (Character c : cells.keySet()) {
+      Cell e = at(c);
+      if (e.cmd < 0 && e.ref < 0) {
+        continue;
+      }
+      
+      os.writeChar(c.charValue());
+      os.writeInt(e.cmd);
+      os.writeInt(e.cnt);
+      os.writeInt(e.ref);
+      os.writeInt(e.skip);
+    }
+  }
+  
+  /**
+   * Return the patch command shared by all command-carrying Cells of this
+   * Row. As a side effect, <code>uniformCnt</code> and
+   * <code>uniformSkip</code> are updated with the number of Cells holding
+   * that command and the skip value of the first such Cell.
+   * 
+   * @param eqSkip when set to <tt>true</tt>, Cells holding the same command
+   *          must also have the same skip value to be considered identical;
+   *          when <tt>false</tt> the skip values are ignored
+   * @return the shared command, or -1 if this Row has no command, contains a
+   *         reference to another Row, or holds (at least) two different cells
+   */
+  public int uniformCmd(boolean eqSkip) {
+    int ret = -1;
+    uniformCnt = 1;
+    uniformSkip = 0;
+    for (Cell c : cells.values()) {
+      if (c.ref >= 0) {
+        return -1;
+      }
+      if (c.cmd < 0) {
+        continue;
+      }
+      if (ret < 0) {
+        // first command seen becomes the candidate
+        ret = c.cmd;
+        uniformSkip = c.skip;
+      } else if (ret != c.cmd || (eqSkip && uniformSkip != c.skip)) {
+        return -1;
+      } else {
+        uniformCnt++;
+      }
+    }
+    return ret;
+  }
+  
+  /**
+   * Write the contents of this Row to stdout.
+   */
+  public void print() {
+    for (Character ch : cells.keySet()) {
+      Cell c = at(ch);
+      System.out.print("[" + ch + ":" + c + "]");
+    }
+    System.out.println();
+  }
+  
+  /**
+   * Return the Cell stored for the given Character.
+   * 
+   * @param index the Character identifying the Cell
+   * @return the Cell, or <tt>null</tt> if there is none
+   */
+  Cell at(Character index) {
+    return cells.get(index);
+  }
+}

Propchange: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Row.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Trie.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Trie.java?rev=940433&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Trie.java (added)
+++ lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Trie.java Mon May  3 12:44:22 2010
@@ -0,0 +1,419 @@
+/*
+                    Egothor Software License version 1.00
+                    Copyright (C) 1997-2004 Leo Galambos.
+                 Copyright (C) 2002-2004 "Egothor developers"
+                      on behalf of the Egothor Project.
+                             All rights reserved.
+
+   This  software  is  copyrighted  by  the "Egothor developers". If this
+   license applies to a single file or document, the "Egothor developers"
+   are the people or entities mentioned as copyright holders in that file
+   or  document.  If  this  license  applies  to the Egothor project as a
+   whole,  the  copyright holders are the people or entities mentioned in
+   the  file CREDITS. This file can be found in the same location as this
+   license in the distribution.
+
+   Redistribution  and  use  in  source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+    1. Redistributions  of  source  code  must retain the above copyright
+       notice, the list of contributors, this list of conditions, and the
+       following disclaimer.
+    2. Redistributions  in binary form must reproduce the above copyright
+       notice, the list of contributors, this list of conditions, and the
+       disclaimer  that  follows  these  conditions  in the documentation
+       and/or other materials provided with the distribution.
+    3. The name "Egothor" must not be used to endorse or promote products
+       derived  from  this software without prior written permission. For
+       written permission, please contact Leo.G@seznam.cz
+    4. Products  derived  from this software may not be called "Egothor",
+       nor  may  "Egothor"  appear  in  their name, without prior written
+       permission from Leo.G@seznam.cz.
+
+   In addition, we request that you include in the end-user documentation
+   provided  with  the  redistribution  and/or  in the software itself an
+   acknowledgement equivalent to the following:
+   "This product includes software developed by the Egothor Project.
+    http://egothor.sf.net/"
+
+   THIS  SOFTWARE  IS  PROVIDED  ``AS  IS''  AND ANY EXPRESSED OR IMPLIED
+   WARRANTIES,  INCLUDING,  BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+   MERCHANTABILITY  AND  FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+   IN  NO  EVENT  SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
+   FOR   ANY   DIRECT,   INDIRECT,  INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR
+   CONSEQUENTIAL  DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+   SUBSTITUTE  GOODS  OR  SERVICES;  LOSS  OF  USE,  DATA, OR PROFITS; OR
+   BUSINESS  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+   WHETHER  IN  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+   OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+   IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   This  software  consists  of  voluntary  contributions  made  by  many
+   individuals  on  behalf  of  the  Egothor  Project  and was originally
+   created by Leo Galambos (Leo.G@seznam.cz).
+ */
+package org.egothor.stemmer;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A Trie is used to store a dictionary of words and their stems.
+ * <p>
+ * Actually, what is stored are words with their respective patch commands. A
+ * trie can be termed forward (keys read from left to right) or backward (keys
+ * read from right to left). This property will vary depending on the language
+ * for which a Trie is constructed.
+ */
+public class Trie {
+  /** The nodes of this Trie; cells refer to rows by their index in this list. */
+  List<Row> rows = new ArrayList<Row>();
+  /** The patch commands; cells refer to commands by their index in this list. */
+  List<CharSequence> cmds = new ArrayList<CharSequence>();
+  /** Index of the root row in {@link #rows}. */
+  int root;
+  
+  /** <tt>true</tt> if keys are read left to right. */
+  boolean forward = false;
+  
+  /**
+   * Constructor for the Trie object. Reads the format written by
+   * {@link #store(DataOutput)}.
+   * 
+   * @param is the input stream
+   * @exception IOException if an I/O error occurs
+   */
+  public Trie(DataInput is) throws IOException {
+    forward = is.readBoolean();
+    root = is.readInt();
+    for (int i = is.readInt(); i > 0; i--) {
+      cmds.add(is.readUTF());
+    }
+    for (int i = is.readInt(); i > 0; i--) {
+      rows.add(new Row(is));
+    }
+  }
+  
+  /**
+   * Constructor for the Trie object. Creates a Trie consisting of a single
+   * empty root row.
+   * 
+   * @param forward <tt>true</tt> if read left to right, <tt>false</tt> if read
+   *          right to left
+   */
+  public Trie(boolean forward) {
+    rows.add(new Row());
+    root = 0;
+    this.forward = forward;
+  }
+  
+  /**
+   * Constructor for the Trie object. The given lists are used directly, not
+   * copied.
+   * 
+   * @param forward <tt>true</tt> if read left to right, <tt>false</tt> if read
+   *          right to left
+   * @param root index of the row that is the root node
+   * @param cmds the patch commands to store
+   * @param rows the rows of this Trie. Each Row is a node of this Trie
+   */
+  public Trie(boolean forward, int root, List<CharSequence> cmds, List<Row> rows) {
+    this.rows = rows;
+    this.cmds = cmds;
+    this.root = root;
+    this.forward = forward;
+  }
+  
+  /**
+   * Return all distinct patch commands found on the path associated with the
+   * given key, in the order in which they are encountered.
+   * 
+   * @param key the key to look up
+   * @return the patch commands on the path, or <tt>null</tt> if there are
+   *         none (in particular when the key is empty)
+   */
+  public CharSequence[] getAll(CharSequence key) {
+    final int length = key.length();
+    if (length == 0) {
+      // Nothing to walk; reading a character would fail.
+      return null;
+    }
+    // At most one command per key character can be collected.
+    int res[] = new int[length];
+    int resc = 0;
+    Row now = getRow(root);
+    StrEnum e = new StrEnum(key, forward);
+    boolean pathEnded = false;
+    
+    for (int i = 0; i < length - 1; i++) {
+      Character ch = Character.valueOf(e.next());
+      resc = addUnique(res, resc, now.getCmd(ch));
+      int next = now.getRef(ch);
+      if (next < 0) {
+        pathEnded = true;
+        break;
+      }
+      now = getRow(next);
+    }
+    if (!pathEnded) {
+      // The last character only contributes a command, no transition.
+      resc = addUnique(res, resc, now.getCmd(Character.valueOf(e.next())));
+    }
+    
+    if (resc < 1) {
+      return null;
+    }
+    CharSequence result[] = new CharSequence[resc];
+    for (int j = 0; j < resc; j++) {
+      result[j] = cmds.get(res[j]);
+    }
+    return result;
+  }
+  
+  /**
+   * Append a command index to the first <code>count</code> entries of
+   * <code>res</code> unless it is negative or already present.
+   * 
+   * @return the new number of used entries
+   */
+  private static int addUnique(int res[], int count, int cmd) {
+    if (cmd < 0) {
+      return count;
+    }
+    for (int j = 0; j < count; j++) {
+      if (res[j] == cmd) {
+        return count;
+      }
+    }
+    res[count] = cmd;
+    return count + 1;
+  }
+  
+  /**
+   * Return the number of cells in this Trie object.
+   * 
+   * @return the number of cells
+   */
+  public int getCells() {
+    int size = 0;
+    for (Row row : rows) {
+      size += row.getCells();
+    }
+    return size;
+  }
+  
+  /**
+   * Return the number of cells in this Trie that reference another row.
+   * 
+   * @return the number of referencing cells
+   */
+  public int getCellsPnt() {
+    int size = 0;
+    for (Row row : rows) {
+      size += row.getCellsPnt();
+    }
+    return size;
+  }
+  
+  /**
+   * Return the number of cells in this Trie that hold a patch command.
+   * 
+   * @return the number of cells holding a patch command
+   */
+  public int getCellsVal() {
+    int size = 0;
+    for (Row row : rows) {
+      size += row.getCellsVal();
+    }
+    return size;
+  }
+  
+  /**
+   * Return the element that is stored in a cell associated with the given key.
+   * The whole key must be consumed by the path (including the characters
+   * skipped according to each cell's skip value) for a result to be returned.
+   * 
+   * @param key the key
+   * @return the associated element, or <tt>null</tt> if there is none
+   */
+  public CharSequence getFully(CharSequence key) {
+    Row now = getRow(root);
+    int cmd = -1;
+    StrEnum e = new StrEnum(key, forward);
+    final int length = key.length();
+    
+    for (int i = 0; i < length;) {
+      Character ch = Character.valueOf(e.next());
+      i++;
+      
+      Cell c = now.at(ch);
+      if (c == null) {
+        return null;
+      }
+      
+      cmd = c.cmd;
+      
+      // Consume the characters this cell tells us to skip; the key must be
+      // long enough to provide them.
+      for (int skip = c.skip; skip > 0; skip--) {
+        if (i >= length) {
+          return null;
+        }
+        e.next();
+        i++;
+      }
+      
+      int next = now.getRef(ch);
+      if (next >= 0) {
+        now = getRow(next);
+      } else if (i < length) {
+        // Path ends before the key does.
+        return null;
+      }
+    }
+    return (cmd == -1) ? null : cmds.get(cmd);
+  }
+  
+  /**
+   * Return the element that is stored as last on a path associated with the
+   * given key.
+   * 
+   * @param key the key associated with the desired element
+   * @return the last on path element, or <tt>null</tt> if there is none (in
+   *         particular when the key is empty)
+   */
+  public CharSequence getLastOnPath(CharSequence key) {
+    final int length = key.length();
+    if (length == 0) {
+      // Nothing to walk; reading a character would fail.
+      return null;
+    }
+    Row now = getRow(root);
+    CharSequence last = null;
+    StrEnum e = new StrEnum(key, forward);
+    
+    for (int i = 0; i < length - 1; i++) {
+      Character ch = Character.valueOf(e.next());
+      int w = now.getCmd(ch);
+      if (w >= 0) {
+        last = cmds.get(w);
+      }
+      w = now.getRef(ch);
+      if (w < 0) {
+        return last;
+      }
+      now = getRow(w);
+    }
+    int w = now.getCmd(Character.valueOf(e.next()));
+    return (w >= 0) ? cmds.get(w) : last;
+  }
+  
+  /**
+   * Return the Row at the given index.
+   * 
+   * @param index the index containing the desired Row
+   * @return the Row, or <tt>null</tt> if the index is out of range
+   */
+  private Row getRow(int index) {
+    if (index < 0 || index >= rows.size()) {
+      return null;
+    }
+    return rows.get(index);
+  }
+  
+  /**
+   * Write this Trie to the given output stream.
+   * 
+   * @param os the output stream
+   * @exception IOException if an I/O error occurs
+   */
+  public void store(DataOutput os) throws IOException {
+    os.writeBoolean(forward);
+    os.writeInt(root);
+    os.writeInt(cmds.size());
+    for (CharSequence cmd : cmds) {
+      os.writeUTF(cmd.toString());
+    }
+    
+    os.writeInt(rows.size());
+    for (Row row : rows) {
+      row.store(os);
+    }
+  }
+  
+  /**
+   * Add the given key associated with the given patch command. If either
+   * parameter is null or empty this method will return without executing.
+   * 
+   * @param key the key
+   * @param cmd the patch command
+   */
+  public void add(CharSequence key, CharSequence cmd) {
+    if (key == null || cmd == null) {
+      return;
+    }
+    // An empty key has no character to attach the command to. Bail out
+    // before the command is registered in cmds.
+    if (cmd.length() == 0 || key.length() == 0) {
+      return;
+    }
+    int id_cmd = cmds.indexOf(cmd);
+    if (id_cmd == -1) {
+      id_cmd = cmds.size();
+      cmds.add(cmd);
+    }
+    
+    Row r = getRow(root);
+    
+    StrEnum e = new StrEnum(key, forward);
+    
+    // Walk (and create where missing) one row per character except the last.
+    for (int i = 0; i < e.length() - 1; i++) {
+      Character ch = Character.valueOf(e.next());
+      int node = r.getRef(ch);
+      if (node >= 0) {
+        r = getRow(node);
+      } else {
+        node = rows.size();
+        Row created = new Row();
+        rows.add(created);
+        r.setRef(ch, node);
+        r = created;
+      }
+    }
+    // The last character's cell carries the command.
+    r.setCmd(Character.valueOf(e.next()), id_cmd);
+  }
+  
+  /**
+   * Reduce this Trie using the given Reduce object and return the newly
+   * reduced Trie.
+   * 
+   * @param by the Reduce object whose <code>optimize</code> method is applied
+   *          to this Trie
+   * @return the newly reduced Trie
+   */
+  public Trie reduce(Reduce by) {
+    return by.optimize(this);
+  }
+  
+  /**
+   * Print the number of rows, commands and cells of this Trie to stdout.
+   * 
+   * @param prefix text printed in front of the statistics
+   */
+  public void printInfo(CharSequence prefix) {
+    System.out.println(prefix + "nds " + rows.size() + " cmds " + cmds.size()
+        + " cells " + getCells() + " valcells " + getCellsVal() + " pntcells "
+        + getCellsPnt());
+  }
+  
+  /**
+   * Enumerates the characters of a CharSequence either from the first to the
+   * last character or the other way round.
+   */
+  class StrEnum {
+    CharSequence s;
+    /** Index of the character returned by the next call to {@link #next()}. */
+    int from;
+    /** Step added to <code>from</code> after each read: 1 or -1. */
+    int by;
+    
+    /**
+     * Constructor for the StrEnum object
+     * 
+     * @param s the sequence to enumerate
+     * @param up <tt>true</tt> to start at the first character and move
+     *          forward, <tt>false</tt> to start at the last character and
+     *          move backward
+     */
+    StrEnum(CharSequence s, boolean up) {
+      this.s = s;
+      if (up) {
+        from = 0;
+        by = 1;
+      } else {
+        from = s.length() - 1;
+        by = -1;
+      }
+    }
+    
+    /**
+     * @return the length of the underlying sequence
+     */
+    int length() {
+      return s.length();
+    }
+    
+    /**
+     * Return the current character and advance in the enumeration direction.
+     * No bounds check is done here; callers must not read more characters
+     * than {@link #length()}.
+     * 
+     * @return the current character
+     */
+    char next() {
+      char ch = s.charAt(from);
+      from += by;
+      return ch;
+    }
+  }
+}

Propchange: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Trie.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/overview.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/overview.html?rev=940433&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/overview.html (added)
+++ lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/overview.html Mon May  3 12:44:22 2010
@@ -0,0 +1,458 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+<head>
+  <meta content="text/html; charset=UTF-8" http-equiv="content-type">
+  <title>Stempel - Algorithmic Stemmer for Polish Language</title>
+  <meta content="Andrzej Bialecki" name="author">
+  <meta name="keywords"
+ content="stemming, stemmer, algorithmic stemmer, Polish stemmer">
+  <meta
+ content="This page describes a software package consisting of high-quality stemming tables for Polish, and a universal algorithmic stemmer, which operates using these tables."
+ name="description">
+</head>
+<body style="font-family: Arial,SansSerif;">
+<h1><i>Stempel</i> - Algorithmic Stemmer for Polish Language</h1>
+<h2>Introduction</h2>
+<p>A method for conflation of different inflected word forms is an
+important component of many Information Retrieval systems. It helps to
+improve the system's recall and can significantly reduce the index
+size. This is especially true for highly-inflectional languages like
+those from the Slavic language family (Czech, Slovak, Polish, Russian,
+Bulgarian, etc).</p>
+<p>This page describes a software package consisting of high-quality
+stemming tables for Polish, and a universal algorithmic stemmer, which
+operates using these tables. The stemmer code is taken virtually
+unchanged from the <a href="http://www.egothor.org">Egothor project</a>.</p>
+<p>The software distribution includes stemmer
+tables prepared using an extensive corpus of Polish language (see
+details below).</p>
+<p>This work is available under an Apache-style Open Source license - the
+stemmer code is covered by the Egothor License, the tables and other
+additions are covered by the Apache License 2.0. Both licenses allow the
+code to be used in Open Source as well as commercial (closed source) projects.</p>
+<h3>Terminology</h3>
+<p>A short explanation is in order about the terminology used in this
+text.</p>
+<p>In the following sections I make a distinction between <b>stem</b>
+and <b>lemma</b>.</p>
+<p>Lemma is a base grammatical form (dictionary form, headword) of a
+word. Lemma is an existing, grammatically correct word in some human
+language.</p>
+<p>Stem on the other hand is just a unique token, not necessarily
+making any sense in any human language, but which can serve as a unique
+label instead of lemma for the same set of inflected forms. Quite often
+stem is referred to as a "root" of the word - which is incorrect and
+misleading (stems sometimes have very little to do with the linguistic
+root of a word, i.e. a pattern found in a word which is common to all
+inflected forms or within a family of languages).</p>
+<p>For an IR system stems are usually sufficient, for a morphological
+analysis system obviously lemmas are a must. In practice, various
+stemmers produce a mix of stems and lemmas, as is the case with the
+stemmer described here. Additionally, for some languages which use
+suffix-based inflection rules, many stemmers based on suffix-stripping
+will produce a large percentage of stems equivalent to lemmas. This is
+however not the case for languages with complex, irregular inflection
+rules (such as Slavic languages) - here simplistic suffix-stripping
+stemmers produce very poor results.</p>
+<h3>Background</h3>
+<p>Lemmatization is a process of finding the base, non-inflected form
+of a word. The result of lemmatization is a correct existing word,
+often in nominative case for nouns and infinitive form for verbs. A
+given inflected form may correspond to several lemmas (e.g. "found"
+-&gt; find, found) - the correct choice depends on the context.<br>
+<br>
+Stemming is concerned mostly with finding a unique "root" of a word,
+which does not necessarily result in any existing word or lemma. The
+quality of stemming is measured by the rate of collisions (overstemming
+- which causes words with different lemmas to be incorrectly conflated
+into one "root"), and the rate of superfluous word "roots"
+(understemming - which assigns several "roots" to words with the same
+lemma). <br>
+<br>
+Both stemmer and lemmatizer can be implemented in various ways. The two
+most common approaches are:<br>
+</p>
+<ul>
+  <li>dictionary-based: where the stemmer uses an extensive dictionary
+of morphological forms in order to find the corresponding stem or lemma</li>
+  <li>algorithmic: where the stemmer uses an algorithm, based on
+general morphological properties of a given language plus a set of
+heuristic rules<br>
+  </li>
+</ul>
+There are many existing and well-known implementations of stemmers for
+English (Porter, Lovins, Krovetz) and other European languages
+(<a href="http://snowball.tartarus.org">Snowball</a>). There are also
+good quality commercial lemmatizers for Polish. However, there is only
+one
+freely available Polish stemmer, implemented by
+<a
+ href="http://www.cs.put.poznan.pl/dweiss/xml/projects/lametyzator/index.xml?lang=en">Dawid
+Weiss</a>, based on the "ispell" dictionary and Jan Daciuk's <a
+ href="http://www.eti.pg.gda.pl/%7Ejandac/">FSA package</a>. That
+stemmer is dictionary-based. This means that even
+though it can achieve
+perfect accuracy for previously known word forms found in its
+dictionary, it
+completely fails in case of all other word forms. This deficiency is
+somewhat mitigated by the comprehensive dictionary distributed with
+this stemmer (so there is a high probability that most of the words in
+the input text will be found in the dictionary), however the problem
+still remains (please see the page above for a more detailed description).<br>
+<br>
+The implementation described here uses an algorithmic method. This
+method
+and particular algorithm implementation are described in detail in
+[1][2].
+The main advantage of algorithmic stemmers is their ability to process
+previously
+unseen word forms with high accuracy. This particular algorithm uses a
+set
+of
+transformation rules (patch commands), which describe how a word with a
+given pattern should be transformed to its stem. These rules are first
+learned from a training corpus. They don't
+cover
+all possible cases, so there is always some loss of precision/recall
+(which
+means that even the words from the training corpus are sometimes
+incorrectly stemmed).<br>
+<h2>Algorithm and implementation</h2>
+The algorithm and its Java implementation are described in detail in the
+publications cited below. Here's just a short excerpt from [2]:<br>
+<br>
+<center>
+<div style="width: 80%;" align="justify">"The aim is separation of the
+stemmer execution code from the data
+structures [...]. In other words, a static algorithm configurable by
+data must be developed. The word transformations that happen in the
+stemmer must be then encoded to the data tables.<br>
+<br>
+The tacit input of our method is a sample set (a so-called dictionary)
+of words (as keys) and their stems. Each record can be equivalently
+stored as a key and the record of key's transformation to its
+respective stem. The transformation record is termed a patch command
+(P-command). It must be ensured that P-commands are universal, and that
+P-commands can transform any word to its stem. Our solution[6,8] is
+based on the Levenstein metric [10], which produces P-command as the
+minimum cost path in a directed graph.<br>
+<br>
+One can imagine the P-command as an algorithm for an operator (editor)
+that rewrites a string to another string. The operator can use these
+instructions (PP-command's): <span style="font-weight: bold;">removal </span>-
+deletes a sequence of characters starting at the current cursor
+position and moves the cursor to the next character. The length of this
+sequence is the parameter; <span style="font-weight: bold;">insertion </span>-
+inserts a character ch, without moving the cursor. The character ch is
+a parameter; <span style="font-weight: bold;">substitution&nbsp;</span>
+- rewrites a character at the current cursor position to the character
+ch and moves the cursor to the next character. The character ch is a
+parameter; <span style="font-weight: bold;">no operation</span> (NOOP)
+- skip a sequence of characters starting at the current cursor
+position. The length of this sequence is the parameter.<br>
+<br>
+The P-commands are applied from the end of a word (right to left). This
+assumption can reduce the set of P-command's, because the last NOOP,
+moving the cursor to the end of a string without any changes, need not
+be stored."</div>
+</center>
+<br>
+The data structure used to keep the dictionary (words and their P-commands)
+is a trie. Several optimization steps are applied in turn to reduce and
+optimize the initial trie, by eliminating useless information and
+shortening the paths in the trie.<br>
+<br>
+Finally, in order to obtain a stem from the input word, the word is
+passed once through a matching path in the trie (applying at each node
+the P-commands stored there). The result is a word stem.<br>
+<h2>Corpus</h2>
+<p><i>(to be completed...)</i></p>
+<p>The following Polish corpora have been used:</p>
+<ul>
+  <li><a
+ href="http://sourceforge.net/project/showfiles.php?group_id=49316&amp;package_id=65354">Polish
+dictionary
+from ispell distribution</a></li>
+  <li><a href="http://www.mimuw.edu.pl/polszczyzna/">Wzbogacony korpus
+słownika frekwencyjnego</a></li>
+<!--<li><a href="http://www.korpus.pl">Korpus IPI PAN</a></li>-->
+<!--<li>The Bible (so called "Warsaw Bible" or "Brytyjka")</li>--><li>The
+Bible (so called "Tysiąclecia") - unauthorized electronic version</li>
+  <li><a
+ href="http://www.mimuw.edu.pl/polszczyzna/Debian/sam34_3.4a.02-1_i386.deb">Analizator
+morfologiczny SAM v. 3.4</a> - this was used to recover lemmas
+missing from other texts</li>
+</ul>
+<p>This step was the most time-consuming - and it would probably be
+even more tedious and difficult if not for the
+help of
+<a href="http://www.python.org/">Python</a>. The source texts had to be
+brought to a common encoding (UTF-8) - some of them used quite ancient
+encodings like Mazovia or DHN - and then scripts were written to
+collect all lemmas and
+inflected forms from the source texts. In cases when the source text
+was not
+tagged,
+I used the SAM analyzer to produce lemmas. In cases of ambiguous
+lemmatization I decided to put references to inflected forms from all
+base forms.<br>
+</p>
+<p>All grammatical categories were allowed to appear in the corpus,
+i.e. nouns, verbs, adjectives, numerals, and pronouns. The resulting
+corpus consisted of roughly 87,000+ inflection sets, i.e. each set
+consisted of one base form (lemma) and many inflected forms. However,
+because of the nature of the training method I restricted these sets to
+include only those where there were at least 4 inflected forms. Sets
+with 3 or fewer inflected forms were removed, so that the final corpus
+consisted of ~69,000 unique sets, which in turn contained ~1.5 million
+inflected forms. <br>
+</p>
+<h2>Testing</h2>
+<p>I tested the stemmer tables produced using the implementation
+described above. The following sections give some details about
+the testing setup.
+</p>
+<h3>Testing procedure</h3>
+<p>The testing procedure was as follows:
+</p>
+<ul>
+  <li>the whole corpus of ~69,000 unique sets was shuffled, so that the
+input sets were in random order.</li>
+  <li>the corpus was split into two parts - one with 30,000 sets (Part
+1), the other with ~39,000 sets (Part 2).</li>
+  <li>Training samples were drawn in sequential order from Part 1.
+Since the sets were already randomized, the training samples were also
+randomized, but this procedure ensured that each larger training sample
+contained all smaller samples.</li>
+  <li>Part 2 was used for testing. Note: this means that the testing
+run used <em>only</em> words previously unseen during the training
+phase. This is the worst-case scenario, because it means that the stemmer must
+extrapolate the learned rules to unknown cases. This also means that in
+a real-life case (where the input is a mix between known and unknown
+words) the F-measure of the stemmer will be even higher than in the
+table below.</li>
+</ul>
+<h3>Test results</h3>
+<p>The following table summarizes test results for varying sizes
+of training samples. The meaning of the table columns is
+described below:
+</p>
+<ul>
+  <li><b>training sets:</b> the number of training sets. One set
+consists of one lemma and at least 4 and up to ~80 inflected forms
+(including pre- and suffixed forms).</li>
+  <li><b>testing forms:</b> the number of testing forms. Only inflected
+forms were used in testing.</li>
+  <li><b>stem OK:</b> the number of cases when produced output was a
+correct (unique) stem. Note: quite often correct stems were also
+correct lemmas.</li>
+  <li><b>lemma OK:</b> the number of cases when produced output was a
+correct lemma.</li>
+  <li><b>missing:</b> the number of cases when the stemmer was unable to
+provide any output.</li>
+  <li><b>stem bad:</b> the number of cases when produced output was a
+stem, but already in use identifying a different set.</li>
+  <li><b>lemma bad:</b> the number of cases when produced output was an
+incorrect lemma. Note: quite often in such case the output was a
+correct stem.</li>
+  <li><b>table size:</b> the size in bytes of the stemmer table.</li>
+</ul>
+<div align="center">
+<table border="1" cellpadding="2" cellspacing="0">
+  <tbody>
+    <tr bgcolor="#a0b0c0">
+      <th>Training sets</th>
+      <th>Testing forms</th>
+      <th>Stem OK</th>
+      <th>Lemma OK</th>
+      <th>Missing</th>
+      <th>Stem Bad</th>
+      <th>Lemma Bad</th>
+      <th>Table size [B]</th>
+    </tr>
+    <tr align="right">
+      <td>100</td>
+      <td>1022985</td>
+      <td>842209</td>
+      <td>593632</td>
+      <td>172711</td>
+      <td>22331</td>
+      <td>256642</td>
+      <td>28438</td>
+    </tr>
+    <tr align="right">
+      <td>200</td>
+      <td>1022985</td>
+      <td>862789</td>
+      <td>646488</td>
+      <td>153288</td>
+      <td>16306</td>
+      <td>223209</td>
+      <td>48660</td>
+    </tr>
+    <tr align="right">
+      <td>500</td>
+      <td>1022985</td>
+      <td>885786</td>
+      <td>685009</td>
+      <td>130772</td>
+      <td>14856</td>
+      <td>207204</td>
+      <td>108798</td>
+    </tr>
+    <tr align="right">
+      <td>700</td>
+      <td>1022985</td>
+      <td>909031</td>
+      <td>704609</td>
+      <td>107084</td>
+      <td>15442</td>
+      <td>211292</td>
+      <td>139291</td>
+    </tr>
+    <tr align="right">
+      <td>1000</td>
+      <td>1022985</td>
+      <td>926079</td>
+      <td>725720</td>
+      <td>90117</td>
+      <td>14941</td>
+      <td>207148</td>
+      <td>183677</td>
+    </tr>
+    <tr align="right">
+      <td>2000</td>
+      <td>1022985</td>
+      <td>942886</td>
+      <td>746641</td>
+      <td>73429</td>
+      <td>14903</td>
+      <td>202915</td>
+      <td>313516</td>
+    </tr>
+    <tr align="right">
+      <td>5000</td>
+      <td>1022985</td>
+      <td>954721</td>
+      <td>759930</td>
+      <td>61476</td>
+      <td>14817</td>
+      <td>201579</td>
+      <td>640969</td>
+    </tr>
+    <tr align="right">
+      <td>7000</td>
+      <td>1022985</td>
+      <td>956165</td>
+      <td>764033</td>
+      <td>60364</td>
+      <td>14620</td>
+      <td>198588</td>
+      <td>839347</td>
+    </tr>
+    <tr align="right">
+      <td>10000</td>
+      <td>1022985</td>
+      <td>965427</td>
+      <td>775507</td>
+      <td>50797</td>
+      <td>14662</td>
+      <td>196681</td>
+      <td>1144537</td>
+    </tr>
+    <tr align="right">
+      <td>12000</td>
+      <td>1022985</td>
+      <td>967664</td>
+      <td>782143</td>
+      <td>48722</td>
+      <td>14284</td>
+      <td>192120</td>
+      <td>1313508</td>
+    </tr>
+    <tr align="right">
+      <td>15000</td>
+      <td>1022985</td>
+      <td>973188</td>
+      <td>788867</td>
+      <td>43247</td>
+      <td>14349</td>
+      <td>190871</td>
+      <td>1567902</td>
+    </tr>
+    <tr align="right">
+      <td>17000</td>
+      <td>1022985</td>
+      <td>974203</td>
+      <td>791804</td>
+      <td>42319</td>
+      <td>14333</td>
+      <td>188862</td>
+      <td>1733957</td>
+    </tr>
+    <tr align="right">
+      <td>20000</td>
+      <td>1022985</td>
+      <td>976234</td>
+      <td>791554</td>
+      <td>40058</td>
+      <td>14601</td>
+      <td>191373</td>
+      <td>1977615</td>
+    </tr>
+  </tbody>
+</table>
+</div>
+<p>I also measured the time to produce a stem (which involves
+traversing a trie,
+retrieving a patch command and applying the patch command to the input
+string).
+On a machine running Windows XP (Pentium 4, 1.7 GHz, JDK 1.4.2_03
+HotSpot),
+for tables ranging in size from 1,000 to 20,000 cells, the time to
+produce a
+single stem varies between 5-10 microseconds.<br>
+</p>
+<p>This means that the stemmer can process up to <span
+ style="font-weight: bold;">200,000 words per second</span>, an
+outstanding result when compared to other stemmers (Morfeusz - ~2,000
+w/s, FormAN (MS Word analyzer) - ~1,000 w/s).<br>
+</p>
+<p>The package contains a class <code>org.getopt.stempel.Benchmark</code>,
+which you can use to produce reports
+like the one below:<br>
+</p>
+<pre>--------- Stemmer benchmark report: -----------<br>Stemmer table:  /res/tables/stemmer_2000.out<br>Input file:     ../test3.txt<br>Number of runs: 3<br><br> RUN NUMBER:            1       2       3<br> Total input words      1378176 1378176 1378176<br> Missed output words    112     112     112<br> Time elapsed [ms]      6989    6940    6640<br> Hit rate percent       99.99%  99.99%  99.99%<br> Miss rate percent      00.01%  00.01%  00.01%<br> Words per second       197192  198584  207557<br> Time per word [us]     5.07    5.04    4.82<br></pre>
+<h2>Summary</h2>
+<p>The results of these tests are very encouraging. It seems that using
+the
+training corpus and the stemming algorithm described above results in a
+high-quality stemmer useful for most applications. Moreover, it can
+also
+be used as a better than average lemmatizer.</p>
+<p>Both the author of the implementation
+(Leo Galambos, &lt;leo.galambos AT egothor DOT org&gt;) and the author
+of this
+compilation (Andrzej Bialecki &lt;ab AT getopt DOT org&gt;) would
+appreciate any
+feedback and suggestions for further improvements.</p>
+<h2>Bibliography</h2>
+<ol>
+  <li>Galambos, L.: Multilingual Stemmer in Web Environment, PhD
+Thesis,
+Faculty of Mathematics and Physics, Charles University in Prague, in
+press.</li>
+  <li>Galambos, L.: Semi-automatic Stemmer Evaluation. International
+Intelligent Information Processing and Web Mining Conference, 2004,
+Zakopane, Poland.</li>
+  <li>Galambos, L.: Lemmatizer for Document Information Retrieval
+Systems in JAVA.<span style="text-decoration: underline;"> </span><a
+ class="moz-txt-link-rfc2396E"
+ href="http://www.informatik.uni-trier.de/%7Eley/db/conf/sofsem/sofsem2001.html#Galambos01">&lt;http://www.informatik.uni-trier.de/%7Eley/db/conf/sofsem/sofsem2001.html#Galambos01&gt;</a>
+SOFSEM 2001, Piestany, Slovakia. <br>
+  </li>
+</ol>
+<br>
+<br>
+</body>
+</html>

Propchange: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/overview.html
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/resources/org/apache/lucene/analysis/pl/stemmer_20000.tbl
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/resources/org/apache/lucene/analysis/pl/stemmer_20000.tbl?rev=940433&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/resources/org/apache/lucene/analysis/pl/stemmer_20000.tbl
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt?rev=940433&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt (added)
+++ lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt Mon May  3 12:44:22 2010
@@ -0,0 +1,186 @@
+# This file was created from the carrot2 project and is distributed under the BSD license.
+# See http://project.carrot2.org/license.html
+# Also see http://www.opensource.org/licenses/bsd-license.html
+# From trunk/core/carrot2-util-text/src-resources/stopwords.pl
+vol
+o.o.
+mgr
+godz
+zł
+www
+pl
+ul
+tel
+hab
+prof
+inż
+dr
+i
+u
+aby
+albo
+ale
+ani
+aż
+bardzo
+bez
+bo
+bowiem
+by
+byli
+bym
+był
+była
+było
+były
+być
+będzie
+będą
+chce
+choć
+co
+coraz
+coś
+czy
+czyli
+często
+dla
+do
+gdy
+gdyby
+gdyż
+gdzie
+go
+ich
+im
+inne
+iż
+ja
+jak
+jakie
+jako
+je
+jednak
+jednym
+jedynie
+jego
+jej
+jest
+jeszcze
+jeśli
+jeżeli
+już
+ją
+kiedy
+kilku
+kto
+która
+które
+którego
+której
+który
+których
+którym
+którzy
+lat
+lecz
+lub
+ma
+mają
+mamy
+mi
+miał
+mimo
+mnie
+mogą
+może
+można
+mu
+musi
+na
+nad
+nam
+nas
+nawet
+nic
+nich
+nie
+niej
+nim
+niż
+no
+nowe
+np
+nr
+o
+od
+ok
+on
+one
+oraz
+pan
+po
+pod
+ponad
+ponieważ
+poza
+przed
+przede
+przez
+przy
+raz
+razie
+roku
+również
+się
+sobie
+sposób
+swoje
+są
+ta
+tak
+takich
+takie
+także
+tam
+te
+tego
+tej
+temu
+ten
+teraz
+też
+to
+trzeba
+tu
+tych
+tylko
+tym
+tys
+tzw
+tę
+w
+we
+wie
+więc
+wszystko
+wśród
+właśnie
+z
+za
+zaś
+ze
+że
+żeby
+ii
+iii
+iv
+vi
+vii
+viii
+ix
+xi
+xii
+xiii
+xiv
+xv

Propchange: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java?rev=940433&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java (added)
+++ lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java Mon May  3 12:44:22 2010
@@ -0,0 +1,53 @@
+package org.apache.lucene.analysis.pl;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+
+public class TestPolishAnalyzer extends BaseTokenStreamTestCase {
+  /**
+   * Constructs a PolishAnalyzer with its default resources. Per the original
+   * author's note, this fails with an NPE when the stopwords file is missing
+   * from the classpath.
+   */
+  public void testResourcesAvailable() {
+    new PolishAnalyzer(TEST_VERSION_CURRENT);
+  }
+
+  /** Checks stemming of two inflected forms and removal of a stopword. */
+  public void testBasics() throws IOException {
+    final Analyzer analyzer = new PolishAnalyzer(TEST_VERSION_CURRENT);
+    final String[] noTokens = new String[0];
+
+    // both inflected forms are expected to come back as the same stem
+    checkOneTermReuse(analyzer, "studenta", "student");
+    checkOneTermReuse(analyzer, "studenci", "student");
+
+    // a stopword is expected to produce no tokens at all
+    assertAnalyzesTo(analyzer, "był", noTokens);
+  }
+
+  /** Checks that a term placed in the exclusion set comes back unstemmed. */
+  public void testExclude() throws IOException {
+    final Set<String> unstemmed = new HashSet<String>();
+    unstemmed.add("studenta");
+
+    final Analyzer analyzer = new PolishAnalyzer(
+        TEST_VERSION_CURRENT, PolishAnalyzer.getDefaultStopSet(), unstemmed);
+
+    // the excluded term is returned unchanged ...
+    checkOneTermReuse(analyzer, "studenta", "studenta");
+    // ... while a form that is not excluded is still stemmed
+    checkOneTermReuse(analyzer, "studenci", "student");
+  }
+}

Propchange: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/test/org/egothor/stemmer/TestCompile.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/test/org/egothor/stemmer/TestCompile.java?rev=940433&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/test/org/egothor/stemmer/TestCompile.java (added)
+++ lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/test/org/egothor/stemmer/TestCompile.java Mon May  3 12:44:22 2010
@@ -0,0 +1,153 @@
+package org.egothor.stemmer;
+
+/*
+ Egothor Software License version 1.00
+ Copyright (C) 1997-2004 Leo Galambos.
+ Copyright (C) 2002-2004 "Egothor developers"
+ on behalf of the Egothor Project.
+ All rights reserved.
+
+ This  software  is  copyrighted  by  the "Egothor developers". If this
+ license applies to a single file or document, the "Egothor developers"
+ are the people or entities mentioned as copyright holders in that file
+ or  document.  If  this  license  applies  to the Egothor project as a
+ whole,  the  copyright holders are the people or entities mentioned in
+ the  file CREDITS. This file can be found in the same location as this
+ license in the distribution.
+
+ Redistribution  and  use  in  source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ 1. Redistributions  of  source  code  must retain the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ following disclaimer.
+ 2. Redistributions  in binary form must reproduce the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ disclaimer  that  follows  these  conditions  in the documentation
+ and/or other materials provided with the distribution.
+ 3. The name "Egothor" must not be used to endorse or promote products
+ derived  from  this software without prior written permission. For
+ written permission, please contact Leo.G@seznam.cz
+ 4. Products  derived  from this software may not be called "Egothor",
+ nor  may  "Egothor"  appear  in  their name, without prior written
+ permission from Leo.G@seznam.cz.
+
+ In addition, we request that you include in the end-user documentation
+ provided  with  the  redistribution  and/or  in the software itself an
+ acknowledgement equivalent to the following:
+ "This product includes software developed by the Egothor Project.
+ http://egothor.sf.net/"
+
+ THIS  SOFTWARE  IS  PROVIDED  ``AS  IS''  AND ANY EXPRESSED OR IMPLIED
+ WARRANTIES,  INCLUDING,  BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ MERCHANTABILITY  AND  FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN  NO  EVENT  SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
+ FOR   ANY   DIRECT,   INDIRECT,  INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR
+ CONSEQUENTIAL  DAMAGES  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE  GOODS  OR  SERVICES;  LOSS  OF  USE,  DATA, OR PROFITS; OR
+ BUSINESS  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ WHETHER  IN  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ This  software  consists  of  voluntary  contributions  made  by  many
+ individuals  on  behalf  of  the  Egothor  Project  and was originally
+ created by Leo Galambos (Leo.G@seznam.cz).
+ */
+
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.LineNumberReader;
+import java.net.URI;
+import java.util.StringTokenizer;
+
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestCompile extends LuceneTestCase {
+  
+  public void testCompile() throws Exception {
+    URI uri = getClass().getResource("testRules.txt").toURI();
+    String path = uri.getPath();
+    Compile.main(new String[] {"test", path});
+    String compiled = path + ".out";
+    Trie trie = loadTrie(compiled);
+    assertTrie(trie, path, true, true);
+    assertTrie(trie, path, false, true);
+    new File(compiled).delete();
+  }
+  
+  public void testCompileBackwards() throws Exception {
+    URI uri = getClass().getResource("testRules.txt").toURI();
+    String path = uri.getPath();
+    Compile.main(new String[] {"-test", path});
+    String compiled = path + ".out";
+    Trie trie = loadTrie(compiled);
+    assertTrie(trie, path, true, true);
+    assertTrie(trie, path, false, true);
+    new File(compiled).delete();
+  }
+  
+  public void testCompileMulti() throws Exception {
+    URI uri = getClass().getResource("testRules.txt").toURI();
+    String path = uri.getPath();
+    Compile.main(new String[] {"Mtest", path});
+    String compiled = path + ".out";
+    Trie trie = loadTrie(compiled);
+    assertTrie(trie, path, true, true);
+    assertTrie(trie, path, false, true);
+    new File(compiled).delete();
+  }
+  
+  static Trie loadTrie(String path) throws IOException {
+    Trie trie;
+    DataInputStream is = new DataInputStream(new BufferedInputStream(
+        new FileInputStream(path)));
+    String method = is.readUTF().toUpperCase();
+    if (method.indexOf('M') < 0) {
+      trie = new Trie(is);
+    } else {
+      trie = new MultiTrie(is);
+    }
+    is.close();
+    return trie;
+  }
+  
+  private static void assertTrie(Trie trie, String file, boolean usefull,
+      boolean storeorig) throws Exception {
+    LineNumberReader in = new LineNumberReader(new BufferedReader(
+        new FileReader(file)));
+    
+    for (String line = in.readLine(); line != null; line = in.readLine()) {
+      try {
+        line = line.toLowerCase();
+        StringTokenizer st = new StringTokenizer(line);
+        String stem = st.nextToken();
+        if (storeorig) {
+          CharSequence cmd = (usefull) ? trie.getFully(stem) : trie
+              .getLastOnPath(stem);
+          StringBuilder stm = new StringBuilder(stem);
+          Diff.apply(stm, cmd);
+          assertEquals(stem.toLowerCase(), stm.toString().toLowerCase());
+        }
+        while (st.hasMoreTokens()) {
+          String token = st.nextToken();
+          if (token.equals(stem)) {
+            continue;
+          }
+          CharSequence cmd = (usefull) ? trie.getFully(token) : trie
+              .getLastOnPath(token);
+          StringBuilder stm = new StringBuilder(token);
+          Diff.apply(stm, cmd);
+          assertEquals(stem.toLowerCase(), stm.toString().toLowerCase());
+        }
+      } catch (java.util.NoSuchElementException x) {
+        // no base token (stem) on a line
+      }
+    }
+  }
+}

Propchange: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/test/org/egothor/stemmer/TestCompile.java
------------------------------------------------------------------------------
    svn:eol-style = native