You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2010/05/03 14:44:23 UTC
svn commit: r940433 [2/3] - in /lucene/dev/trunk/lucene: ./ contrib/
contrib/analyzers/ contrib/analyzers/stempel/
contrib/analyzers/stempel/src/ contrib/analyzers/stempel/src/java/
contrib/analyzers/stempel/src/java/org/ contrib/analyzers/stempel/src/...
Added: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/MultiTrie2.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/MultiTrie2.java?rev=940433&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/MultiTrie2.java (added)
+++ lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/MultiTrie2.java Mon May 3 12:44:22 2010
@@ -0,0 +1,333 @@
+/*
+ Egothor Software License version 1.00
+ Copyright (C) 1997-2004 Leo Galambos.
+ Copyright (C) 2002-2004 "Egothor developers"
+ on behalf of the Egothor Project.
+ All rights reserved.
+
+ This software is copyrighted by the "Egothor developers". If this
+ license applies to a single file or document, the "Egothor developers"
+ are the people or entities mentioned as copyright holders in that file
+ or document. If this license applies to the Egothor project as a
+ whole, the copyright holders are the people or entities mentioned in
+ the file CREDITS. This file can be found in the same location as this
+ license in the distribution.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ 1. Redistributions of source code must retain the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ disclaimer that follows these conditions in the documentation
+ and/or other materials provided with the distribution.
+ 3. The name "Egothor" must not be used to endorse or promote products
+ derived from this software without prior written permission. For
+ written permission, please contact Leo.G@seznam.cz
+ 4. Products derived from this software may not be called "Egothor",
+ nor may "Egothor" appear in their name, without prior written
+ permission from Leo.G@seznam.cz.
+
+ In addition, we request that you include in the end-user documentation
+ provided with the redistribution and/or in the software itself an
+ acknowledgement equivalent to the following:
+ "This product includes software developed by the Egothor Project.
+ http://egothor.sf.net/"
+
+ THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ This software consists of voluntary contributions made by many
+ individuals on behalf of the Egothor Project and was originally
+ created by Leo Galambos (Leo.G@seznam.cz).
+ */
+package org.egothor.stemmer;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * The MultiTrie2 is a Trie of Tries.
+ * <p>
+ * It stores words and their associated patch commands. Like a MultiTrie, it
+ * handles patch commands broken into their constituent parts, but here the
+ * parts are delimited by the skip command (<tt>'-'</tt>).
+ */
+public class MultiTrie2 extends MultiTrie {
+  /**
+   * Constructor for the MultiTrie2 object.
+   *
+   * @param is the input stream
+   * @exception IOException if an I/O error occurs
+   */
+  public MultiTrie2(DataInput is) throws IOException {
+    super(is);
+  }
+
+  /**
+   * Constructor for the MultiTrie2 object.
+   *
+   * @param forward set to <tt>true</tt> if the elements should be read left to
+   *          right
+   */
+  public MultiTrie2(boolean forward) {
+    super(forward);
+  }
+
+  /**
+   * Return the element that is stored in a cell associated with the given key.
+   * <p>
+   * The result is the concatenation of the pieces returned by
+   * <tt>getFully</tt> on each level trie, in order. Collection stops at the
+   * first level that yields <tt>null</tt>, the end-of-message marker, or a
+   * piece that is not allowed to follow the previous one.
+   *
+   * @param key the key to the cell holding the desired element
+   * @return the element (possibly empty, never <tt>null</tt>)
+   */
+  @Override
+  public CharSequence getFully(CharSequence key) {
+    return collect(key, true);
+  }
+
+  /**
+   * Return the element that is stored as last on a path belonging to the given
+   * key.
+   * <p>
+   * Works like {@link #getFully(CharSequence)} but asks each level trie for
+   * <tt>getLastOnPath</tt> instead.
+   *
+   * @param key the key associated with the desired element
+   * @return the element that is stored as last on a path (possibly empty,
+   *         never <tt>null</tt>)
+   */
+  @Override
+  public CharSequence getLastOnPath(CharSequence key) {
+    return collect(key, false);
+  }
+
+  /**
+   * Shared walk behind {@link #getFully(CharSequence)} and
+   * {@link #getLastOnPath(CharSequence)}: query every level trie in turn and
+   * append the pieces they return.
+   *
+   * @param key the key to look up
+   * @param fully <tt>true</tt> to query each level with <tt>getFully</tt>,
+   *          <tt>false</tt> to query it with <tt>getLastOnPath</tt>
+   * @return the pieces collected so far, concatenated
+   */
+  private CharSequence collect(CharSequence key, boolean fully) {
+    StringBuilder result = new StringBuilder(tries.size() * 2);
+    try {
+      CharSequence lastkey = key;
+      CharSequence previous = null;
+      char lastch = ' ';
+      for (int i = 0; i < tries.size(); i++) {
+        Trie level = tries.get(i);
+        CharSequence r = fully ? level.getFully(lastkey) : level.getLastOnPath(lastkey);
+        if (r == null || (r.length() == 1 && r.charAt(0) == EOM)) {
+          return result;
+        }
+        if (cannotFollow(lastch, r.charAt(0))) {
+          return result;
+        }
+        // Remember the opcode of the last (opcode, argument) pair of this piece.
+        lastch = r.charAt(r.length() - 2);
+        if (r.charAt(0) == '-') {
+          // A skip piece consumes the key: first by the length of the
+          // preceding piece (if any), then by its own length.
+          if (i > 0) {
+            key = skip(key, lengthPP(previous));
+          }
+          key = skip(key, lengthPP(r));
+        }
+        previous = r;
+        result.append(r);
+        // Once the key is exhausted, deeper levels keep using the last
+        // non-empty key.
+        if (key.length() != 0) {
+          lastkey = key;
+        }
+      }
+    } catch (IndexOutOfBoundsException ignored) {
+      // Deliberately best-effort: a piece too short to hold an
+      // (opcode, argument) pair, or a skip beyond the bounds of the key, ends
+      // the walk and whatever was collected so far is returned.
+    }
+    return result;
+  }
+
+  /**
+   * Write this data structure to the given output stream.
+   *
+   * @param os the output stream
+   * @exception IOException if an I/O error occurs
+   */
+  @Override
+  public void store(DataOutput os) throws IOException {
+    super.store(os);
+  }
+
+  /**
+   * Add an element to this structure consisting of the given key and patch
+   * command.
+   * <p>
+   * The command is split with {@link #decompose(CharSequence)}; piece
+   * <tt>i</tt> is stored in level trie <tt>i</tt>, and the end-of-message
+   * node is stored in the level after the last piece.
+   * <p>
+   * This method will return without executing if the <tt>cmd</tt>
+   * parameter's length is 0.
+   *
+   * @param key the key
+   * @param cmd the patch command
+   */
+  @Override
+  public void add(CharSequence key, CharSequence cmd) {
+    if (cmd.length() == 0) {
+      return;
+    }
+    CharSequence[] p = decompose(cmd);
+    int levels = p.length;
+    // One level per piece plus one more for the end-of-message node.
+    while (levels >= tries.size()) {
+      tries.add(new Trie(forward));
+    }
+    CharSequence lastkey = key;
+    for (int i = 0; i < levels; i++) {
+      // An exhausted key falls back to the last non-empty key.
+      if (key.length() > 0) {
+        lastkey = key;
+      }
+      tries.get(i).add(lastkey, p[i]);
+      if (p[i].length() > 0 && p[i].charAt(0) == '-') {
+        if (i > 0) {
+          key = skip(key, lengthPP(p[i - 1]));
+        }
+        key = skip(key, lengthPP(p[i]));
+      }
+    }
+    tries.get(levels).add(key.length() > 0 ? key : lastkey, EOM_NODE);
+  }
+
+  /**
+   * Break the given patch command into its constituent pieces. The pieces are
+   * delimited by NOOP (<tt>'-'</tt>) commands: every <tt>'-'</tt> found at an
+   * opcode position becomes a two-character piece of its own, and the
+   * commands between two such delimiters form one piece.
+   *
+   * @param cmd the patch command
+   * @return an array containing the pieces of the command
+   */
+  public CharSequence[] decompose(CharSequence cmd) {
+    List<CharSequence> parts = new ArrayList<CharSequence>();
+
+    // dashEven() returns -1 when no further delimiter exists, which also
+    // terminates the loop through the 0 <= i guard.
+    for (int i = 0; 0 <= i && i < cmd.length();) {
+      int next = dashEven(cmd, i);
+      if (i == next) {
+        // The delimiter itself: opcode plus its argument.
+        parts.add(cmd.subSequence(i, i + 2));
+        i = next + 2;
+      } else {
+        // Everything up to the next delimiter, or to the end of the command.
+        parts.add((next < 0) ? cmd.subSequence(i, cmd.length()) : cmd.subSequence(i, next));
+        i = next;
+      }
+    }
+    return parts.toArray(new CharSequence[parts.size()]);
+  }
+
+  /**
+   * Reduce every level trie with the given strategy and return a new
+   * MultiTrie2 built from the reduced levels.
+   *
+   * @param by the reduction strategy to apply to each level
+   * @return the newly reduced Trie
+   */
+  @Override
+  public Trie reduce(Reduce by) {
+    List<Trie> h = new ArrayList<Trie>();
+    for (Trie trie : tries) {
+      h.add(trie.reduce(by));
+    }
+
+    MultiTrie2 m = new MultiTrie2(forward);
+    m.tries = h;
+    return m;
+  }
+
+  /**
+   * Tell whether a piece starting with opcode <tt>goes</tt> is forbidden
+   * after a piece ending with opcode <tt>after</tt>. Only a <tt>'-'</tt>
+   * directly after a <tt>'-'</tt>, or a <tt>'D'</tt> directly after a
+   * <tt>'D'</tt>, is rejected.
+   */
+  private boolean cannotFollow(char after, char goes) {
+    return (after == '-' || after == 'D') && after == goes;
+  }
+
+  /**
+   * Drop <tt>count</tt> characters from the key: from the front when reading
+   * forward, from the back otherwise.
+   */
+  private CharSequence skip(CharSequence in, int count) {
+    if (forward) {
+      return in.subSequence(count, in.length());
+    }
+    return in.subSequence(0, in.length() - count);
+  }
+
+  /**
+   * Find the first <tt>'-'</tt> at or after <tt>from</tt>, looking only at
+   * every second character (the opcode positions).
+   *
+   * @return the index of that <tt>'-'</tt>, or -1 if there is none
+   */
+  private int dashEven(CharSequence in, int from) {
+    for (int i = from; i < in.length(); i += 2) {
+      if (in.charAt(i) == '-') {
+        return i;
+      }
+    }
+    return -1;
+  }
+
+  /**
+   * Compute how many key characters the given patch piece accounts for. The
+   * piece is read as (opcode, argument) pairs: <tt>'-'</tt> and <tt>'D'</tt>
+   * count <tt>argument - 'a' + 1</tt>, <tt>'R'</tt> counts 1, and every other
+   * opcode (such as <tt>'I'</tt>) counts nothing.
+   */
+  private int lengthPP(CharSequence cmd) {
+    int len = 0;
+    for (int i = 0; i < cmd.length(); i += 2) {
+      switch (cmd.charAt(i)) {
+        case '-':
+        case 'D':
+          len += cmd.charAt(i + 1) - 'a' + 1;
+          break;
+        case 'R':
+          len++;
+          break;
+        default:
+          break;
+      }
+    }
+    return len;
+  }
+}
Propchange: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/MultiTrie2.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Optimizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Optimizer.java?rev=940433&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Optimizer.java (added)
+++ lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Optimizer.java Mon May 3 12:44:22 2010
@@ -0,0 +1,198 @@
+/*
+ Egothor Software License version 1.00
+ Copyright (C) 1997-2004 Leo Galambos.
+ Copyright (C) 2002-2004 "Egothor developers"
+ on behalf of the Egothor Project.
+ All rights reserved.
+
+ This software is copyrighted by the "Egothor developers". If this
+ license applies to a single file or document, the "Egothor developers"
+ are the people or entities mentioned as copyright holders in that file
+ or document. If this license applies to the Egothor project as a
+ whole, the copyright holders are the people or entities mentioned in
+ the file CREDITS. This file can be found in the same location as this
+ license in the distribution.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ 1. Redistributions of source code must retain the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ disclaimer that follows these conditions in the documentation
+ and/or other materials provided with the distribution.
+ 3. The name "Egothor" must not be used to endorse or promote products
+ derived from this software without prior written permission. For
+ written permission, please contact Leo.G@seznam.cz
+ 4. Products derived from this software may not be called "Egothor",
+ nor may "Egothor" appear in their name, without prior written
+ permission from Leo.G@seznam.cz.
+
+ In addition, we request that you include in the end-user documentation
+ provided with the redistribution and/or in the software itself an
+ acknowledgement equivalent to the following:
+ "This product includes software developed by the Egothor Project.
+ http://egothor.sf.net/"
+
+ THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ This software consists of voluntary contributions made by many
+ individuals on behalf of the Egothor Project and was originally
+ created by Leo Galambos (Leo.G@seznam.cz).
+ */
+package org.egothor.stemmer;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * The Optimizer is a {@link Reduce} strategy that shrinks a Trie by merging
+ * rows.
+ * <p>
+ * Two rows are joined when none of their cells conflict; see
+ * {@link #merge(Cell, Cell)} for what counts as a conflict.
+ */
+public class Optimizer extends Reduce {
+  /**
+   * Constructor for the Optimizer object.
+   */
+  public Optimizer() {}
+
+  /**
+   * Optimize the given Trie by merging compatible rows and return the
+   * resulting Trie.
+   * <p>
+   * Rows are visited from last to first. Each one is rewritten through the
+   * remap table built so far and merged into the first already-kept row that
+   * accepts it; if none does, it is kept as a new row. Finally the rows are
+   * renumbered starting from the (remapped) root.
+   *
+   * @param orig the Trie to consolidate
+   * @return the newly consolidated Trie
+   */
+  @Override
+  public Trie optimize(Trie orig) {
+    List<CharSequence> cmds = orig.cmds;
+    List<Row> rows = new ArrayList<Row>();
+    List<Row> orows = orig.rows;
+    int remap[] = new int[orows.size()];
+
+    for (int j = orows.size() - 1; j >= 0; j--) {
+      Row now = new Remap(orows.get(j), remap);
+      boolean merged = false;
+
+      for (int i = 0; i < rows.size(); i++) {
+        Row q = merge(now, rows.get(i));
+        if (q != null) {
+          rows.set(i, q);
+          merged = true;
+          remap[j] = i;
+          break;
+        }
+      }
+
+      if (!merged) {
+        remap[j] = rows.size();
+        rows.add(now);
+      }
+    }
+
+    // Save the new root index before the table is reset for the renumbering.
+    int root = remap[orig.root];
+    Arrays.fill(remap, -1);
+    rows = removeGaps(root, rows, new ArrayList<Row>(), remap);
+
+    return new Trie(orig.forward, remap[root], cmds, rows);
+  }
+
+  /**
+   * Merge the given rows and return the resulting Row.
+   * <p>
+   * Cells present in both rows are combined with {@link #merge(Cell, Cell)};
+   * cells present only in <tt>master</tt> are copied, and cells present only
+   * in <tt>existing</tt> are carried over as they are.
+   *
+   * @param master the master Row
+   * @param existing the existing Row
+   * @return the resulting Row, or <tt>null</tt> if the operation cannot be
+   *         realized
+   */
+  public Row merge(Row master, Row existing) {
+    Row n = new Row();
+    for (Character ch : master.cells.keySet()) {
+      // XXX also must handle Cnt and Skip !!
+      Cell a = master.cells.get(ch);
+      Cell b = existing.cells.get(ch);
+
+      Cell s = (b == null) ? new Cell(a) : merge(a, b);
+      if (s == null) {
+        return null;
+      }
+      n.cells.put(ch, s);
+    }
+    for (Character ch : existing.cells.keySet()) {
+      if (master.at(ch) != null) {
+        continue;
+      }
+      n.cells.put(ch, existing.at(ch));
+    }
+    return n;
+  }
+
+  /**
+   * Merge the given Cells and return the resulting Cell.
+   * <p>
+   * The cells conflict, and <tt>null</tt> is returned, when their
+   * <tt>skip</tt> values differ, or when both carry a non-negative
+   * <tt>cmd</tt> (respectively <tt>ref</tt>) and the two values differ.
+   * Otherwise the result takes whichever <tt>cmd</tt> and <tt>ref</tt> is
+   * set, and the sum of both <tt>cnt</tt> values.
+   *
+   * @param m the master Cell
+   * @param e the existing Cell
+   * @return the resulting Cell, or <tt>null</tt> if the operation cannot be
+   *         realized
+   */
+  public Cell merge(Cell m, Cell e) {
+    if (m.skip != e.skip) {
+      return null;
+    }
+    // From here on both cells have the same skip value, so it needs no
+    // further checking.
+    if (m.cmd >= 0 && e.cmd >= 0 && m.cmd != e.cmd) {
+      return null;
+    }
+    if (m.ref >= 0 && e.ref >= 0 && m.ref != e.ref) {
+      return null;
+    }
+
+    Cell n = new Cell();
+    n.cmd = (m.cmd >= 0) ? m.cmd : e.cmd;
+    n.ref = (m.ref >= 0) ? m.ref : e.ref;
+    n.cnt = m.cnt + e.cnt;
+    n.skip = m.skip;
+    return n;
+  }
+}
Propchange: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Optimizer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Optimizer2.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Optimizer2.java?rev=940433&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Optimizer2.java (added)
+++ lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Optimizer2.java Mon May 3 12:44:22 2010
@@ -0,0 +1,90 @@
+/*
+ Egothor Software License version 1.00
+ Copyright (C) 1997-2004 Leo Galambos.
+ Copyright (C) 2002-2004 "Egothor developers"
+ on behalf of the Egothor Project.
+ All rights reserved.
+
+ This software is copyrighted by the "Egothor developers". If this
+ license applies to a single file or document, the "Egothor developers"
+ are the people or entities mentioned as copyright holders in that file
+ or document. If this license applies to the Egothor project as a
+ whole, the copyright holders are the people or entities mentioned in
+ the file CREDITS. This file can be found in the same location as this
+ license in the distribution.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ 1. Redistributions of source code must retain the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ disclaimer that follows these conditions in the documentation
+ and/or other materials provided with the distribution.
+ 3. The name "Egothor" must not be used to endorse or promote products
+ derived from this software without prior written permission. For
+ written permission, please contact Leo.G@seznam.cz
+ 4. Products derived from this software may not be called "Egothor",
+ nor may "Egothor" appear in their name, without prior written
+ permission from Leo.G@seznam.cz.
+
+ In addition, we request that you include in the end-user documentation
+ provided with the redistribution and/or in the software itself an
+ acknowledgement equivalent to the following:
+ "This product includes software developed by the Egothor Project.
+ http://egothor.sf.net/"
+
+ THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ This software consists of voluntary contributions made by many
+ individuals on behalf of the Egothor Project and was originally
+ created by Leo Galambos (Leo.G@seznam.cz).
+ */
+package org.egothor.stemmer;
+
+/**
+ * The Optimizer class is a Trie that will be reduced (have empty rows removed).
+ * <p>
+ * This is the result of allowing a joining of rows when there is no collision
+ * between non-<tt>null</tt> values in the rows. Information loss, resulting in
+ * the stemmer not being able to recognize words (as in Optimizer), is
+ * curtailed, allowing the stemmer to recognize words for which the original
+ * trie was built. Use of this class allows the stemmer to be self-teaching.
+ */
+public class Optimizer2 extends Optimizer {
+ /**
+ * Constructor for the Optimizer2 object.
+ */
+ public Optimizer2() {}
+
+ /**
+ * Merge the given Cells and return the resulting Cell.
+ *
+ * @param m the master Cell
+ * @param e the existing Cell
+ * @return the resulting Cell, or <tt>null</tt> if the operation cannot be
+ * realized
+ */
+ @Override
+ public Cell merge(Cell m, Cell e) {
+ if (m.cmd == e.cmd && m.ref == e.ref && m.skip == e.skip) {
+ Cell c = new Cell(m);
+ c.cnt += e.cnt;
+ return c;
+ } else {
+ return null;
+ }
+ }
+}
Propchange: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Optimizer2.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Reduce.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Reduce.java?rev=940433&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Reduce.java (added)
+++ lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Reduce.java Mon May 3 12:44:22 2010
@@ -0,0 +1,134 @@
+/*
+ Egothor Software License version 1.00
+ Copyright (C) 1997-2004 Leo Galambos.
+ Copyright (C) 2002-2004 "Egothor developers"
+ on behalf of the Egothor Project.
+ All rights reserved.
+
+ This software is copyrighted by the "Egothor developers". If this
+ license applies to a single file or document, the "Egothor developers"
+ are the people or entities mentioned as copyright holders in that file
+ or document. If this license applies to the Egothor project as a
+ whole, the copyright holders are the people or entities mentioned in
+ the file CREDITS. This file can be found in the same location as this
+ license in the distribution.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ 1. Redistributions of source code must retain the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ disclaimer that follows these conditions in the documentation
+ and/or other materials provided with the distribution.
+ 3. The name "Egothor" must not be used to endorse or promote products
+ derived from this software without prior written permission. For
+ written permission, please contact Leo.G@seznam.cz
+ 4. Products derived from this software may not be called "Egothor",
+ nor may "Egothor" appear in their name, without prior written
+ permission from Leo.G@seznam.cz.
+
+ In addition, we request that you include in the end-user documentation
+ provided with the redistribution and/or in the software itself an
+ acknowledgement equivalent to the following:
+ "This product includes software developed by the Egothor Project.
+ http://egothor.sf.net/"
+
+ THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ This software consists of voluntary contributions made by many
+ individuals on behalf of the Egothor Project and was originally
+ created by Leo Galambos (Leo.G@seznam.cz).
+ */
+package org.egothor.stemmer;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * The Reduce object is used to remove gaps in a Trie which stores a dictionary.
+ */
+public class Reduce {
+
+  /**
+   * Constructor for the Reduce object.
+   */
+  public Reduce() {}
+
+  /**
+   * Optimize (remove holes in the rows) the given Trie and return the
+   * restructured Trie.
+   * <p>
+   * Only rows reachable from the root are kept; they are renumbered in the
+   * order in which {@link #removeGaps} first visits them.
+   *
+   * @param orig the Trie to optimize
+   * @return the restructured Trie
+   */
+  public Trie optimize(Trie orig) {
+    List<CharSequence> cmds = orig.cmds;
+    List<Row> orows = orig.rows;
+    int remap[] = new int[orows.size()];
+
+    // -1 marks a row that has not been visited yet.
+    Arrays.fill(remap, -1);
+    // The traversal reads from the original rows and writes the compacted
+    // rows into a fresh list.
+    List<Row> rows = removeGaps(orig.root, orows, new ArrayList<Row>(), remap);
+
+    return new Trie(orig.forward, remap[orig.root], cmds, rows);
+  }
+
+  /**
+   * Copy the row at <tt>ind</tt>, and recursively every row it references
+   * that has not been visited yet, from <tt>old</tt> into <tt>to</tt>.
+   * <p>
+   * <tt>remap[k]</tt> receives the new index of old row <tt>k</tt>; entries
+   * below zero denote rows not visited so far. Once all referenced rows have
+   * an index, the copied row is replaced by a version whose references go
+   * through <tt>remap</tt>.
+   *
+   * @param ind index in <tt>old</tt> of the row to start from
+   * @param old the rows to read from
+   * @param to the list receiving the compacted rows
+   * @param remap old-index to new-index table, updated in place
+   * @return <tt>to</tt>
+   */
+  List<Row> removeGaps(int ind, List<Row> old, List<Row> to, int remap[]) {
+    remap[ind] = to.size();
+
+    Row now = old.get(ind);
+    // Reserve the slot first so the row keeps its position while the rows it
+    // references are appended after it.
+    to.add(now);
+    for (Cell c : now.cells.values()) {
+      if (c.ref >= 0 && remap[c.ref] < 0) {
+        removeGaps(c.ref, old, to, remap);
+      }
+    }
+    to.set(remap[ind], new Remap(now, remap));
+    return to;
+  }
+
+  /**
+   * A Row that is a copy of another Row with every cell reference translated
+   * through a remap table.
+   */
+  class Remap extends Row {
+    /**
+     * Constructor for the Remap object.
+     *
+     * @param old the Row whose cells are copied
+     * @param remap table giving, for each old row index, the index to store
+     *          instead; applied to every cell with a non-negative
+     *          <tt>ref</tt>
+     */
+    public Remap(Row old, int remap[]) {
+      super();
+      for (Character ch : old.cells.keySet()) {
+        Cell c = old.at(ch);
+        Cell nc = new Cell(c);
+        if (c.ref >= 0) {
+          nc.ref = remap[nc.ref];
+        }
+        cells.put(ch, nc);
+      }
+    }
+  }
+}
Propchange: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Reduce.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Row.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Row.java?rev=940433&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Row.java (added)
+++ lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Row.java Mon May 3 12:44:22 2010
@@ -0,0 +1,309 @@
+/*
+ Egothor Software License version 1.00
+ Copyright (C) 1997-2004 Leo Galambos.
+ Copyright (C) 2002-2004 "Egothor developers"
+ on behalf of the Egothor Project.
+ All rights reserved.
+
+ This software is copyrighted by the "Egothor developers". If this
+ license applies to a single file or document, the "Egothor developers"
+ are the people or entities mentioned as copyright holders in that file
+ or document. If this license applies to the Egothor project as a
+ whole, the copyright holders are the people or entities mentioned in
+ the file CREDITS. This file can be found in the same location as this
+ license in the distribution.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ 1. Redistributions of source code must retain the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ disclaimer that follows these conditions in the documentation
+ and/or other materials provided with the distribution.
+ 3. The name "Egothor" must not be used to endorse or promote products
+ derived from this software without prior written permission. For
+ written permission, please contact Leo.G@seznam.cz
+ 4. Products derived from this software may not be called "Egothor",
+ nor may "Egothor" appear in their name, without prior written
+ permission from Leo.G@seznam.cz.
+
+ In addition, we request that you include in the end-user documentation
+ provided with the redistribution and/or in the software itself an
+ acknowledgement equivalent to the following:
+ "This product includes software developed by the Egothor Project.
+ http://egothor.sf.net/"
+
+ THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ This software consists of voluntary contributions made by many
+ individuals on behalf of the Egothor Project and was originally
+ created by Leo Galambos (Leo.G@seznam.cz).
+ */
+package org.egothor.stemmer;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.TreeMap;
+
+/**
+ * The Row class represents a row in a matrix representation of a trie.
+ */
+public class Row {
+ TreeMap<Character,Cell> cells = new TreeMap<Character,Cell>();
+ int uniformCnt = 0;
+ int uniformSkip = 0;
+
+ /**
+ * Construct a Row object from input carried in via the given input stream.
+ *
+ * @param is the input stream
+ * @exception IOException if an I/O error occurs
+ */
+ public Row(DataInput is) throws IOException {
+ for (int i = is.readInt(); i > 0; i--) {
+ char ch = is.readChar();
+ Cell c = new Cell();
+ c.cmd = is.readInt();
+ c.cnt = is.readInt();
+ c.ref = is.readInt();
+ c.skip = is.readInt();
+ cells.put(ch, c);
+ }
+ }
+
+ /**
+ * The default constructor for the Row object.
+ */
+ public Row() {}
+
+ /**
+ * Construct a Row using the cells of the given Row.
+ *
+ * @param old the Row to copy
+ */
+ public Row(Row old) {
+ cells = old.cells;
+ }
+
+ /**
+ * Set the command in the Cell of the given Character to the given integer.
+ *
+ * @param way the Character defining the Cell
+ * @param cmd the new command
+ */
+ public void setCmd(Character way, int cmd) {
+ Cell c = at(way);
+ if (c == null) {
+ c = new Cell();
+ c.cmd = cmd;
+ cells.put(way, c);
+ } else {
+ c.cmd = cmd;
+ }
+ c.cnt = (cmd >= 0) ? 1 : 0;
+ }
+
+ /**
+ * Set the reference to the next row in the Cell of the given Character to the
+ * given integer.
+ *
+ * @param way the Character defining the Cell
+ * @param ref The new ref value
+ */
+ public void setRef(Character way, int ref) {
+ Cell c = at(way);
+ if (c == null) {
+ c = new Cell();
+ c.ref = ref;
+ cells.put(way, c);
+ } else {
+ c.ref = ref;
+ }
+ }
+
+ /**
+ * Return the number of cells in use.
+ *
+ * @return the number of cells in use
+ */
+ public int getCells() {
+ Iterator<Character> i = cells.keySet().iterator();
+ int size = 0;
+ for (; i.hasNext();) {
+ Character c = i.next();
+ Cell e = at(c);
+ if (e.cmd >= 0 || e.ref >= 0) {
+ size++;
+ }
+ }
+ return size;
+ }
+
+ /**
+ * Return the number of references (how many transitions) to other rows.
+ *
+ * @return the number of references
+ */
+ public int getCellsPnt() {
+ Iterator<Character> i = cells.keySet().iterator();
+ int size = 0;
+ for (; i.hasNext();) {
+ Character c = i.next();
+ Cell e = at(c);
+ if (e.ref >= 0) {
+ size++;
+ }
+ }
+ return size;
+ }
+
+ /**
+ * Return the number of patch commands saved in this Row.
+ *
+ * @return the number of patch commands
+ */
+ public int getCellsVal() {
+ Iterator<Character> i = cells.keySet().iterator();
+ int size = 0;
+ for (; i.hasNext();) {
+ Character c = i.next();
+ Cell e = at(c);
+ if (e.cmd >= 0) {
+ size++;
+ }
+ }
+ return size;
+ }
+
+ /**
+ * Return the command in the Cell associated with the given Character.
+ *
+ * @param way the Character associated with the Cell holding the desired
+ * command
+ * @return the command
+ */
+ public int getCmd(Character way) {
+ Cell c = at(way);
+ return (c == null) ? -1 : c.cmd;
+ }
+
+ /**
+ * Return the number of patch commands that were in the Cell associated with
+ * the given Character before the Trie containing this Row was reduced.
+ *
+ * @param way the Character associated with the desired Cell
+ * @return the number of patch commands before reduction
+ */
+ public int getCnt(Character way) {
+ Cell c = at(way);
+ return (c == null) ? -1 : c.cnt;
+ }
+
+ /**
+ * Return the reference to the next Row in the Cell associated with the given
+ * Character.
+ *
+ * @param way the Character associated with the desired Cell
+ * @return the reference, or -1 if the Cell is <tt>null</tt>
+ */
+ public int getRef(Character way) {
+ Cell c = at(way);
+ return (c == null) ? -1 : c.ref;
+ }
+
+ /**
+ * Write the contents of this Row to the given output stream.
+ *
+ * @param os the output stream
+ * @exception IOException if an I/O error occurs
+ */
+ public void store(DataOutput os) throws IOException {
+ os.writeInt(cells.size());
+ Iterator<Character> i = cells.keySet().iterator();
+ for (; i.hasNext();) {
+ Character c = i.next();
+ Cell e = at(c);
+ if (e.cmd < 0 && e.ref < 0) {
+ continue;
+ }
+
+ os.writeChar(c.charValue());
+ os.writeInt(e.cmd);
+ os.writeInt(e.cnt);
+ os.writeInt(e.ref);
+ os.writeInt(e.skip);
+ }
+ }
+
+ /**
+ * Return the number of identical Cells (containing patch commands) in this
+ * Row.
+ *
+ * @param eqSkip when set to <tt>false</tt> the removed patch commands are
+ * considered
+ * @return the number of identical Cells, or -1 if there are (at least) two
+ * different cells
+ */
+ public int uniformCmd(boolean eqSkip) {
+ Iterator<Cell> i = cells.values().iterator();
+ int ret = -1;
+ uniformCnt = 1;
+ uniformSkip = 0;
+ for (; i.hasNext();) {
+ Cell c = i.next();
+ if (c.ref >= 0) {
+ return -1;
+ }
+ if (c.cmd >= 0) {
+ if (ret < 0) {
+ ret = c.cmd;
+ uniformSkip = c.skip;
+ } else if (ret == c.cmd) {
+ if (eqSkip) {
+ if (uniformSkip == c.skip) {
+ uniformCnt++;
+ } else {
+ return -1;
+ }
+ } else {
+ uniformCnt++;
+ }
+ } else {
+ return -1;
+ }
+ }
+ }
+ return ret;
+ }
+
+ /**
+ * Write the contents of this Row to stdout.
+ */
+ public void print() {
+ for (Iterator<Character> i = cells.keySet().iterator(); i.hasNext();) {
+ Character ch = i.next();
+ Cell c = at(ch);
+ System.out.print("[" + ch + ":" + c + "]");
+ }
+ System.out.println();
+ }
+
+ Cell at(Character index) {
+ return cells.get(index);
+ }
+}
Propchange: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Row.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Trie.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Trie.java?rev=940433&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Trie.java (added)
+++ lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Trie.java Mon May 3 12:44:22 2010
@@ -0,0 +1,419 @@
+/*
+ Egothor Software License version 1.00
+ Copyright (C) 1997-2004 Leo Galambos.
+ Copyright (C) 2002-2004 "Egothor developers"
+ on behalf of the Egothor Project.
+ All rights reserved.
+
+ This software is copyrighted by the "Egothor developers". If this
+ license applies to a single file or document, the "Egothor developers"
+ are the people or entities mentioned as copyright holders in that file
+ or document. If this license applies to the Egothor project as a
+ whole, the copyright holders are the people or entities mentioned in
+ the file CREDITS. This file can be found in the same location as this
+ license in the distribution.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ 1. Redistributions of source code must retain the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ disclaimer that follows these conditions in the documentation
+ and/or other materials provided with the distribution.
+ 3. The name "Egothor" must not be used to endorse or promote products
+ derived from this software without prior written permission. For
+ written permission, please contact Leo.G@seznam.cz
+ 4. Products derived from this software may not be called "Egothor",
+ nor may "Egothor" appear in their name, without prior written
+ permission from Leo.G@seznam.cz.
+
+ In addition, we request that you include in the end-user documentation
+ provided with the redistribution and/or in the software itself an
+ acknowledgement equivalent to the following:
+ "This product includes software developed by the Egothor Project.
+ http://egothor.sf.net/"
+
+ THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ This software consists of voluntary contributions made by many
+ individuals on behalf of the Egothor Project and was originally
+ created by Leo Galambos (Leo.G@seznam.cz).
+ */
+package org.egothor.stemmer;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A Trie is used to store a dictionary of words and their stems.
+ * <p>
+ * Actually, what is stored are words with their respective patch commands. A
+ * trie can be termed forward (keys read from left to right) or backward (keys
+ * read from right to left). This property will vary depending on the language
+ * for which a Trie is constructed.
+ */
+public class Trie {
+ List<Row> rows = new ArrayList<Row>();
+ List<CharSequence> cmds = new ArrayList<CharSequence>();
+ int root;
+
+ boolean forward = false;
+
+ /**
+ * Constructor for the Trie object.
+ *
+ * @param is the input stream
+ * @exception IOException if an I/O error occurs
+ */
+ public Trie(DataInput is) throws IOException {
+ forward = is.readBoolean();
+ root = is.readInt();
+ for (int i = is.readInt(); i > 0; i--) {
+ cmds.add(is.readUTF());
+ }
+ for (int i = is.readInt(); i > 0; i--) {
+ rows.add(new Row(is));
+ }
+ }
+
+ /**
+ * Constructor for the Trie object.
+ *
+ * @param forward <tt>true</tt> if read left to right, <tt>false</tt> if read
+ * right to left
+ */
+ public Trie(boolean forward) {
+ rows.add(new Row());
+ root = 0;
+ this.forward = forward;
+ }
+
+ /**
+ * Constructor for the Trie object.
+ *
+ * @param forward <tt>true</tt> if read left to right, <tt>false</tt> if read
+ * right to left
+ * @param root index of the row that is the root node
+ * @param cmds the patch commands to store
+ * @param rows a List of Rows. Each Row is a node of this Trie
+ */
+ public Trie(boolean forward, int root, List<CharSequence> cmds, List<Row> rows) {
+ this.rows = rows;
+ this.cmds = cmds;
+ this.root = root;
+ this.forward = forward;
+ }
+
+ /**
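+ * Return all distinct patch commands stored on the path associated with the
+ * given key.
+ *
+ * @param key the key
+ * @return the patch commands found on the path, in the order encountered, or
+ * <tt>null</tt> if there are none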
+ */
+ public CharSequence[] getAll(CharSequence key) {
+ int res[] = new int[key.length()];
+ int resc = 0;
+ Row now = getRow(root);
+ int w;
+ StrEnum e = new StrEnum(key, forward);
+ boolean br = false;
+
+ for (int i = 0; i < key.length() - 1; i++) {
+ Character ch = new Character(e.next());
+ w = now.getCmd(ch);
+ if (w >= 0) {
+ int n = w;
+ for (int j = 0; j < resc; j++) {
+ if (n == res[j]) {
+ n = -1;
+ break;
+ }
+ }
+ if (n >= 0) {
+ res[resc++] = n;
+ }
+ }
+ w = now.getRef(ch);
+ if (w >= 0) {
+ now = getRow(w);
+ } else {
+ br = true;
+ break;
+ }
+ }
+ if (br == false) {
+ w = now.getCmd(new Character(e.next()));
+ if (w >= 0) {
+ int n = w;
+ for (int j = 0; j < resc; j++) {
+ if (n == res[j]) {
+ n = -1;
+ break;
+ }
+ }
+ if (n >= 0) {
+ res[resc++] = n;
+ }
+ }
+ }
+
+ if (resc < 1) {
+ return null;
+ }
+ CharSequence R[] = new CharSequence[resc];
+ for (int j = 0; j < resc; j++) {
+ R[j] = cmds.get(res[j]);
+ }
+ return R;
+ }
+
+ /**
+ * Return the number of cells in this Trie object.
+ *
+ * @return the number of cells
+ */
+ public int getCells() {
+ int size = 0;
+ for (Row row : rows)
+ size += row.getCells();
+ return size;
+ }
+
+ /**
+ * Return the number of references (transitions) to other rows in this Trie
+ * object.
+ *
+ * @return the number of references
+ */
+ public int getCellsPnt() {
+ int size = 0;
+ for (Row row : rows)
+ size += row.getCellsPnt();
+ return size;
+ }
+
+ /**
+ * Return the number of cells holding a patch command in this Trie object.
+ *
+ * @return the number of cells holding a patch command
+ */
+ public int getCellsVal() {
+ int size = 0;
+ for (Row row : rows)
+ size += row.getCellsVal();
+ return size;
+ }
+
+ /**
+ * Return the element that is stored in a cell associated with the given key.
+ *
+ * @param key the key
+ * @return the associated element
+ */
+ public CharSequence getFully(CharSequence key) {
+ Row now = getRow(root);
+ int w;
+ Cell c;
+ int cmd = -1;
+ StrEnum e = new StrEnum(key, forward);
+ Character ch = null;
+ Character aux = null;
+
+ for (int i = 0; i < key.length();) {
+ ch = new Character(e.next());
+ i++;
+
+ c = now.at(ch);
+ if (c == null) {
+ return null;
+ }
+
+ cmd = c.cmd;
+
+ for (int skip = c.skip; skip > 0; skip--) {
+ if (i < key.length()) {
+ aux = new Character(e.next());
+ } else {
+ return null;
+ }
+ i++;
+ }
+
+ w = now.getRef(ch);
+ if (w >= 0) {
+ now = getRow(w);
+ } else if (i < key.length()) {
+ return null;
+ }
+ }
+ return (cmd == -1) ? null : cmds.get(cmd);
+ }
+
+ /**
+ * Return the element that is stored as last on a path associated with the
+ * given key.
+ *
+ * @param key the key associated with the desired element
+ * @return the last on path element
+ */
+ public CharSequence getLastOnPath(CharSequence key) {
+ Row now = getRow(root);
+ int w;
+ CharSequence last = null;
+ StrEnum e = new StrEnum(key, forward);
+
+ for (int i = 0; i < key.length() - 1; i++) {
+ Character ch = new Character(e.next());
+ w = now.getCmd(ch);
+ if (w >= 0) {
+ last = cmds.get(w);
+ }
+ w = now.getRef(ch);
+ if (w >= 0) {
+ now = getRow(w);
+ } else {
+ return last;
+ }
+ }
+ w = now.getCmd(new Character(e.next()));
+ return (w >= 0) ? cmds.get(w) : last;
+ }
+
+ /**
+ * Return the Row at the given index.
+ *
+ * @param index the index containing the desired Row
+ * @return the Row
+ */
+ private Row getRow(int index) {
+ if (index < 0 || index >= rows.size()) {
+ return null;
+ }
+ return rows.get(index);
+ }
+
+ /**
+ * Write this Trie to the given output stream.
+ *
+ * @param os the output stream
+ * @exception IOException if an I/O error occurs
+ */
+ public void store(DataOutput os) throws IOException {
+ os.writeBoolean(forward);
+ os.writeInt(root);
+ os.writeInt(cmds.size());
+ for (CharSequence cmd : cmds)
+ os.writeUTF(cmd.toString());
+
+ os.writeInt(rows.size());
+ for (Row row : rows)
+ row.store(os);
+ }
+
+ /**
+ * Add the given key associated with the given patch command. If either
+ * parameter is null this method will return without executing.
+ *
+ * @param key the key
+ * @param cmd the patch command
+ */
+ public void add(CharSequence key, CharSequence cmd) {
+ if (key == null || cmd == null) {
+ return;
+ }
+ if (cmd.length() == 0) {
+ return;
+ }
+ int id_cmd = cmds.indexOf(cmd);
+ if (id_cmd == -1) {
+ id_cmd = cmds.size();
+ cmds.add(cmd);
+ }
+
+ int node = root;
+ Row r = getRow(node);
+
+ StrEnum e = new StrEnum(key, forward);
+
+ for (int i = 0; i < e.length() - 1; i++) {
+ Character ch = new Character(e.next());
+ node = r.getRef(ch);
+ if (node >= 0) {
+ r = getRow(node);
+ } else {
+ node = rows.size();
+ Row n;
+ rows.add(n = new Row());
+ r.setRef(ch, node);
+ r = n;
+ }
+ }
+ r.setCmd(new Character(e.next()), id_cmd);
+ }
+
+ /**
+ * Reduce this Trie using the given Reduce object and return the newly
+ * reduced Trie.
+ *
+ * @param by the Reduce object used to optimize this Trie
+ * @return the newly reduced Trie
+ */
+ public Trie reduce(Reduce by) {
+ return by.optimize(this);
+ }
+
+ public void printInfo(CharSequence prefix) {
+ System.out.println(prefix + "nds " + rows.size() + " cmds " + cmds.size()
+ + " cells " + getCells() + " valcells " + getCellsVal() + " pntcells "
+ + getCellsPnt());
+ }
+
+ /**
+ * This class is part of the Egothor Project
+ */
+ class StrEnum {
+ CharSequence s;
+ int from;
+ int by;
+
+ /**
+ * Constructor for the StrEnum object
+ *
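+ * @param s the character sequence to enumerate
+ * @param up <tt>true</tt> to read from the first character to the last,
+ * <tt>false</tt> to read from the last character to the first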
+ */
+ StrEnum(CharSequence s, boolean up) {
+ this.s = s;
+ if (up) {
+ from = 0;
+ by = 1;
+ } else {
+ from = s.length() - 1;
+ by = -1;
+ }
+ }
+
+ int length() {
+ return s.length();
+ }
+
+ char next() {
+ char ch = s.charAt(from);
+ from += by;
+ return ch;
+ }
+ }
+}
Propchange: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/org/egothor/stemmer/Trie.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/overview.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/overview.html?rev=940433&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/overview.html (added)
+++ lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/overview.html Mon May 3 12:44:22 2010
@@ -0,0 +1,458 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html>
+<head>
+ <meta content="text/html; charset=UTF-8" http-equiv="content-type">
+ <title>Stempel - Algorithmic Stemmer for Polish Language</title>
+ <meta content="Andrzej Bialecki" name="author">
+ <meta name="keywords"
+ content="stemming, stemmer, algorithmic stemmer, Polish stemmer">
+ <meta
+ content="This page describes a software package consisting of high-quality stemming tables for Polish, and a universal algorithmic stemmer, which operates using these tables."
+ name="description">
+</head>
+<body style="font-family: Arial,SansSerif;">
+<h1><i>Stempel</i> - Algorithmic Stemmer for Polish Language</h1>
+<h2>Introduction</h2>
+<p>A method for conflation of different inflected word forms is an
+important component of many Information Retrieval systems. It helps to
+improve the system's recall and can significantly reduce the index
+size. This is especially true for highly-inflectional languages like
+those from the Slavic language family (Czech, Slovak, Polish, Russian,
+Bulgarian, etc).</p>
+<p>This page describes a software package consisting of high-quality
+stemming tables for Polish, and a universal algorithmic stemmer, which
+operates using these tables. The stemmer code is taken virtually
+unchanged from the <a href="http://www.egothor.org">Egothor project</a>.</p>
+<p>The software distribution includes stemmer
+tables prepared using an extensive corpus of Polish language (see
+details below).</p>
+<p>This work is available under Apache-style Open Source license - the
+stemmer code is covered by Egothor License, the tables and other
+additions are covered by Apache License 2.0. Both licenses allow the code to be
+used in Open Source as well as commercial (closed source) projects.</p>
+<h3>Terminology</h3>
+<p>A short explanation is in order about the terminology used in this
+text.</p>
+<p>In the following sections I make a distinction between <b>stem</b>
+and <b>lemma</b>.</p>
+<p>Lemma is a base grammatical form (dictionary form, headword) of a
+word. Lemma is an existing, grammatically correct word in some human
+language.</p>
+<p>Stem on the other hand is just a unique token, not necessarily
+making any sense in any human language, but which can serve as a unique
+label instead of lemma for the same set of inflected forms. Quite often
+stem is referred to as a "root" of the word - which is incorrect and
+misleading (stems sometimes have very little to do with the linguistic
+root of a word, i.e. a pattern found in a word which is common to all
+inflected forms or within a family of languages).</p>
+<p>For an IR system stems are usually sufficient, for a morphological
+analysis system obviously lemmas are a must. In practice, various
+stemmers produce a mix of stems and lemmas, as is the case with the
+stemmer described here. Additionally, for some languages, which use
+suffix-based inflection rules many stemmers based on suffix-stripping
+will produce a large percentage of stems equivalent to lemmas. This is
+however not the case for languages with complex, irregular inflection
+rules (such as Slavic languages) - here simplistic suffix-stripping
+stemmers produce very poor results.</p>
+<h3>Background</h3>
+<p>Lemmatization is a process of finding the base, non-inflected form
+of a word. The result of lemmatization is a correct existing word,
+often in nominative case for nouns and infinitive form for verbs. A
+given inflected form may correspond to several lemmas (e.g. "found"
+-> find, found) - the correct choice depends on the context.<br>
+<br>
+Stemming is concerned mostly with finding a unique "root" of a word,
+which does not necessarily result in any existing word or lemma. The
+quality of stemming is measured by the rate of collisions (overstemming
+- which causes words with different lemmas to be incorrectly conflated
+into one "root"), and the rate of superfluous word "roots"
+(understemming - which assigns several "roots" to words with the same
+lemma). <br>
+<br>
+Both stemmer and lemmatizer can be implemented in various ways. The two
+most common approaches are:<br>
+</p>
+<ul>
+ <li>dictionary-based: where the stemmer uses an extensive dictionary
+of morphological forms in order to find the corresponding stem or lemma</li>
+ <li>algorithmic: where the stemmer uses an algorithm, based on
+general morphological properties of a given language plus a set of
+heuristic rules<br>
+ </li>
+</ul>
+There are many existing and well-known implementations of stemmers for
+English (Porter, Lovins, Krovetz) and other European languages
+(<a href="http://snowball.tartarus.org">Snowball</a>). There are also
+good quality commercial lemmatizers for Polish. However, there is only
+one
+freely available Polish stemmer, implemented by
+<a
+ href="http://www.cs.put.poznan.pl/dweiss/xml/projects/lametyzator/index.xml?lang=en">Dawid
+Weiss</a>, based on the "ispell" dictionary and Jan Daciuk's <a
+ href="http://www.eti.pg.gda.pl/%7Ejandac/">FSA package</a>. That
+stemmer is dictionary-based. This means that even
+though it can achieve
+perfect accuracy for previously known word forms found in its
+dictionary, it
+completely fails in case of all other word forms. This deficiency is
+somewhat mitigated by the comprehensive dictionary distributed with
+this stemmer (so there is a high probability that most of the words in
+the input text will be found in the dictionary), however the problem
+still remains (please see the page above for more detailed description).<br>
+<br>
+The implementation described here uses an algorithmic method. This
+method
+and particular algorithm implementation are described in detail in
+[1][2].
+The main advantage of algorithmic stemmers is their ability to process
+previously
+unseen word forms with high accuracy. This particular algorithm uses a
+set
+of
+transformation rules (patch commands), which describe how a word with a
+given pattern should be transformed to its stem. These rules are first
+learned from a training corpus. They don't
+cover
+all possible cases, so there is always some loss of precision/recall
+(which
+means that even the words from the training corpus are sometimes
+incorrectly stemmed).<br>
+<h2>Algorithm and implementation<span style="font-style: italic;"></span></h2>
+The algorithm and its Java implementation are described in detail in the
+publications cited below. Here's just a short excerpt from [2]:<br>
+<br>
+<center>
+<div style="width: 80%;" align="justify">"The aim is separation of the
+stemmer execution code from the data
+structures [...]. In other words, a static algorithm configurable by
+data must be developed. The word transformations that happen in the
+stemmer must be then encoded to the data tables.<br>
+<br>
+The tacit input of our method is a sample set (a so-called dictionary)
+of words (as keys) and their stems. Each record can be equivalently
+stored as a key and the record of key's transformation to its
+respective stem. The transformation record is termed a patch command
+(P-command). It must be ensured that P-commands are universal, and that
+P-commands can transform any word to its stem. Our solution[6,8] is
+based on the Levenstein metric [10], which produces P-command as the
+minimum cost path in a directed graph.<br>
+<br>
+One can imagine the P-command as an algorithm for an operator (editor)
+that rewrites a string to another string. The operator can use these
+instructions (PP-command's): <span style="font-weight: bold;">removal </span>-
+deletes a sequence of characters starting at the current cursor
+position and moves the cursor to the next character. The length of this
+sequence is the parameter; <span style="font-weight: bold;">insertion </span>-
+inserts a character ch, without moving the cursor. The character ch is
+a parameter; <span style="font-weight: bold;">substitution </span>
+- rewrites a character at the current cursor position to the character
+ch and moves the cursor to the next character. The character ch is a
+parameter; <span style="font-weight: bold;">no operation</span> (NOOP)
+- skip a sequence of characters starting at the current cursor
+position. The length of this sequence is the parameter.<br>
+<br>
+The P-commands are applied from the end of a word (right to left). This
+assumption can reduce the set of P-command's, because the last NOOP,
+moving the cursor to the end of a string without any changes, need not
+be stored."</div>
+</center>
+<br>
+The data structure used to keep the dictionary (words and their P-commands)
+is a trie. Several optimization steps are applied in turn to reduce and
+optimize the initial trie, by eliminating useless information and
+shortening the paths in the trie.<br>
+<br>
+Finally, in order to obtain a stem from the input word, the word is
+passed once through a matching path in the trie (applying at each node
+the P-commands stored there). The result is a word stem.<br>
+<h2>Corpus</h2>
+<p><i>(to be completed...)</i></p>
+<p>The following Polish corpora have been used:</p>
+<ul>
+ <li><a
+ href="http://sourceforge.net/project/showfiles.php?group_id=49316&package_id=65354">Polish
+dictionary
+from ispell distribution</a></li>
+ <li><a href="http://www.mimuw.edu.pl/polszczyzna/">Wzbogacony korpus
+słownika frekwencyjnego</a></li>
+<!--<li><a href="http://www.korpus.pl">Korpus IPI PAN</a></li>-->
+<!--<li>The Bible (so called "Warsaw Bible" or "Brytyjka")</li>--><li>The
+Bible (so called "Tysiąclecia") - unauthorized electronic version</li>
+ <li><a
+ href="http://www.mimuw.edu.pl/polszczyzna/Debian/sam34_3.4a.02-1_i386.deb">Analizator
+morfologiczny SAM v. 3.4</a> - this was used to recover lemmas
+missing from other texts</li>
+</ul>
+<p>This step was the most time-consuming - and it would probably be
+even more tedious and difficult if not for the
+help of
+<a href="http://www.python.org/">Python</a>. The source texts had to be
+brought to a common encoding (UTF-8) - some of them used quite ancient
+encodings like Mazovia or DHN - and then scripts were written to
+collect all lemmas and
+inflected forms from the source texts. In cases when the source text
+was not
+tagged,
+I used the SAM analyzer to produce lemmas. In cases of ambiguous
+lemmatization I decided to put references to inflected forms from all
+base forms.<br>
+</p>
+<p>All grammatical categories were allowed to appear in the corpus,
+i.e. nouns, verbs, adjectives, numerals, and pronouns. The resulting
+corpus consisted of roughly 87,000+ inflection sets, i.e. each set
+consisted of one base form (lemma) and many inflected forms. However,
+because of the nature of the training method I restricted these sets to
+include only those where there were at least 4 inflected forms. Sets
+with 3 or fewer inflected forms were removed, so that the final corpus
+consisted of ~69,000 unique sets, which in turn contained ~1.5 mln
+inflected forms. <br>
+</p>
+<h2>Testing</h2>
+<p>I tested the stemmer tables produced using the implementation
+described above. The following sections give some details about
+the testing setup.
+</p>
+<h3>Testing procedure</h3>
+<p>The testing procedure was as follows:
+</p>
+<ul>
+ <li>the whole corpus of ~69,000 unique sets was shuffled, so that the
+input sets were in random order.</li>
+ <li>the corpus was split into two parts - one with 30,000 sets (Part
+1), the other with ~39,000 sets (Part 2).</li>
+ <li>Training samples were drawn in sequential order from Part 1.
+Since the sets were already randomized, the training samples were also
+randomized, but this procedure ensured that each larger training sample
+contained all smaller samples.</li>
+ <li>Part 2 was used for testing. Note: this means that the testing
+run used <em>only</em> words previously unseen during the training
+phase. This is the worst-case scenario, because it means that the stemmer must
+extrapolate the learned rules to unknown cases. This also means that in
+a real-life case (where the input is a mix between known and unknown
+words) the F-measure of the stemmer will be even higher than in the
+table below.</li>
+</ul>
+<h3>Test results</h3>
+<p>The following table summarizes test results for varying sizes
+of training samples. The meaning of the table columns is
+described below:
+</p>
+<ul>
+ <li><b>training sets:</b> the number of training sets. One set
+consists of one lemma and at least 4 and up to ~80 inflected forms
+(including pre- and suffixed forms).</li>
+ <li><b>testing forms:</b> the number of testing forms. Only inflected
+forms were used in testing.</li>
+ <li><b>stem OK:</b> the number of cases when produced output was a
+correct (unique) stem. Note: quite often correct stems were also
+correct lemmas.</li>
+ <li><b>lemma OK:</b> the number of cases when produced output was a
+correct lemma.</li>
+ <li><b>missing:</b> the number of cases when the stemmer was unable to
+provide any output.</li>
+ <li><b>stem bad:</b> the number of cases when produced output was a
+stem, but already in use identifying a different set.</li>
+ <li><b>lemma bad:</b> the number of cases when produced output was an
+incorrect lemma. Note: quite often in such cases the output was a
+correct stem.</li>
+ <li><b>table size:</b> the size in bytes of the stemmer table.</li>
+</ul>
+<div align="center">
+<table border="1" cellpadding="2" cellspacing="0">
+ <tbody>
+ <tr bgcolor="#a0b0c0">
+ <th>Training sets</th>
+ <th>Testing forms</th>
+ <th>Stem OK</th>
+ <th>Lemma OK</th>
+ <th>Missing</th>
+ <th>Stem Bad</th>
+ <th>Lemma Bad</th>
+ <th>Table size [B]</th>
+ </tr>
+ <tr align="right">
+ <td>100</td>
+ <td>1022985</td>
+ <td>842209</td>
+ <td>593632</td>
+ <td>172711</td>
+ <td>22331</td>
+ <td>256642</td>
+ <td>28438</td>
+ </tr>
+ <tr align="right">
+ <td>200</td>
+ <td>1022985</td>
+ <td>862789</td>
+ <td>646488</td>
+ <td>153288</td>
+ <td>16306</td>
+ <td>223209</td>
+ <td>48660</td>
+ </tr>
+ <tr align="right">
+ <td>500</td>
+ <td>1022985</td>
+ <td>885786</td>
+ <td>685009</td>
+ <td>130772</td>
+ <td>14856</td>
+ <td>207204</td>
+ <td>108798</td>
+ </tr>
+ <tr align="right">
+ <td>700</td>
+ <td>1022985</td>
+ <td>909031</td>
+ <td>704609</td>
+ <td>107084</td>
+ <td>15442</td>
+ <td>211292</td>
+ <td>139291</td>
+ </tr>
+ <tr align="right">
+ <td>1000</td>
+ <td>1022985</td>
+ <td>926079</td>
+ <td>725720</td>
+ <td>90117</td>
+ <td>14941</td>
+ <td>207148</td>
+ <td>183677</td>
+ </tr>
+ <tr align="right">
+ <td>2000</td>
+ <td>1022985</td>
+ <td>942886</td>
+ <td>746641</td>
+ <td>73429</td>
+ <td>14903</td>
+ <td>202915</td>
+ <td>313516</td>
+ </tr>
+ <tr align="right">
+ <td>5000</td>
+ <td>1022985</td>
+ <td>954721</td>
+ <td>759930</td>
+ <td>61476</td>
+ <td>14817</td>
+ <td>201579</td>
+ <td>640969</td>
+ </tr>
+ <tr align="right">
+ <td>7000</td>
+ <td>1022985</td>
+ <td>956165</td>
+ <td>764033</td>
+ <td>60364</td>
+ <td>14620</td>
+ <td>198588</td>
+ <td>839347</td>
+ </tr>
+ <tr align="right">
+ <td>10000</td>
+ <td>1022985</td>
+ <td>965427</td>
+ <td>775507</td>
+ <td>50797</td>
+ <td>14662</td>
+ <td>196681</td>
+ <td>1144537</td>
+ </tr>
+ <tr align="right">
+ <td>12000</td>
+ <td>1022985</td>
+ <td>967664</td>
+ <td>782143</td>
+ <td>48722</td>
+ <td>14284</td>
+ <td>192120</td>
+ <td>1313508</td>
+ </tr>
+ <tr align="right">
+ <td>15000</td>
+ <td>1022985</td>
+ <td>973188</td>
+ <td>788867</td>
+ <td>43247</td>
+ <td>14349</td>
+ <td>190871</td>
+ <td>1567902</td>
+ </tr>
+ <tr align="right">
+ <td>17000</td>
+ <td>1022985</td>
+ <td>974203</td>
+ <td>791804</td>
+ <td>42319</td>
+ <td>14333</td>
+ <td>188862</td>
+ <td>1733957</td>
+ </tr>
+ <tr align="right">
+ <td>20000</td>
+ <td>1022985</td>
+ <td>976234</td>
+ <td>791554</td>
+ <td>40058</td>
+ <td>14601</td>
+ <td>191373</td>
+ <td>1977615</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<p>I also measured the time to produce a stem (which involves
+traversing a trie,
+retrieving a patch command and applying the patch command to the input
+string).
+On a machine running Windows XP (Pentium 4, 1.7 GHz, JDK 1.4.2_03
+HotSpot),
+for tables ranging in size from 1,000 to 20,000 cells, the time to
+produce a
+single stem varies between 5-10 microseconds.<br>
+</p>
+<p>This means that the stemmer can process up to <span
+ style="font-weight: bold;">200,000 words per second</span>, an
+outstanding result when compared to other stemmers (Morfeusz - ~2,000
+w/s, FormAN (MS Word analyzer) - ~1,000 w/s).<br>
+</p>
+<p>The package contains a class <code>org.getopt.stempel.Benchmark</code>,
+which you can use to produce reports
+like the one below:<br>
+</p>
+<pre>--------- Stemmer benchmark report: -----------<br>Stemmer table: /res/tables/stemmer_2000.out<br>Input file: ../test3.txt<br>Number of runs: 3<br><br> RUN NUMBER: 1 2 3<br> Total input words 1378176 1378176 1378176<br> Missed output words 112 112 112<br> Time elapsed [ms] 6989 6940 6640<br> Hit rate percent 99.99% 99.99% 99.99%<br> Miss rate percent 00.01% 00.01% 00.01%<br> Words per second 197192 198584 207557<br> Time per word [us] 5.07 5.04 4.82<br></pre>
+<h2>Summary</h2>
+<p>The results of these tests are very encouraging. It seems that using
+the
+training corpus and the stemming algorithm described above results in a
+high-quality stemmer useful for most applications. Moreover, it can
+also
+be used as a better than average lemmatizer.</p>
+<p>Both the author of the implementation
+(Leo Galambos, &lt;leo.galambos AT egothor DOT org&gt;) and the author
+of this
+compilation (Andrzej Bialecki &lt;ab AT getopt DOT org&gt;) would
+appreciate any
+feedback and suggestions for further improvements.</p>
+<h2>Bibliography</h2>
+<ol>
+ <li>Galambos, L.: Multilingual Stemmer in Web Environment, PhD
+Thesis,
+Faculty of Mathematics and Physics, Charles University in Prague, in
+press.</li>
+ <li>Galambos, L.: Semi-automatic Stemmer Evaluation. International
+Intelligent Information Processing and Web Mining Conference, 2004,
+Zakopane, Poland.</li>
+ <li>Galambos, L.: Lemmatizer for Document Information Retrieval
+Systems in JAVA.<span style="text-decoration: underline;"> </span><a
+ class="moz-txt-link-rfc2396E"
+ href="http://www.informatik.uni-trier.de/%7Eley/db/conf/sofsem/sofsem2001.html#Galambos01">&lt;http://www.informatik.uni-trier.de/%7Eley/db/conf/sofsem/sofsem2001.html#Galambos01&gt;</a>
+SOFSEM 2001, Piestany, Slovakia. <br>
+ </li>
+</ol>
+<br>
+<br>
+</body>
+</html>
Propchange: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/java/overview.html
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/resources/org/apache/lucene/analysis/pl/stemmer_20000.tbl
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/resources/org/apache/lucene/analysis/pl/stemmer_20000.tbl?rev=940433&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/resources/org/apache/lucene/analysis/pl/stemmer_20000.tbl
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt?rev=940433&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt (added)
+++ lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt Mon May 3 12:44:22 2010
@@ -0,0 +1,186 @@
+# This file was created from the carrot2 project and is distributed under the BSD license.
+# See http://project.carrot2.org/license.html
+# Also see http://www.opensource.org/licenses/bsd-license.html
+# From trunk/core/carrot2-util-text/src-resources/stopwords.pl
+vol
+o.o.
+mgr
+godz
+zł
+www
+pl
+ul
+tel
+hab
+prof
+inż
+dr
+i
+u
+aby
+albo
+ale
+ani
+aż
+bardzo
+bez
+bo
+bowiem
+by
+byli
+bym
+był
+była
+było
+były
+być
+będzie
+będą
+chce
+choć
+co
+coraz
+coś
+czy
+czyli
+często
+dla
+do
+gdy
+gdyby
+gdyż
+gdzie
+go
+ich
+im
+inne
+iż
+ja
+jak
+jakie
+jako
+je
+jednak
+jednym
+jedynie
+jego
+jej
+jest
+jeszcze
+jeśli
+jeżeli
+już
+ją
+kiedy
+kilku
+kto
+która
+które
+którego
+której
+który
+których
+którym
+którzy
+lat
+lecz
+lub
+ma
+mają
+mamy
+mi
+miał
+mimo
+mnie
+mogą
+może
+można
+mu
+musi
+na
+nad
+nam
+nas
+nawet
+nic
+nich
+nie
+niej
+nim
+niż
+no
+nowe
+np
+nr
+o
+od
+ok
+on
+one
+oraz
+pan
+po
+pod
+ponad
+ponieważ
+poza
+przed
+przede
+przez
+przy
+raz
+razie
+roku
+również
+się
+sobie
+sposób
+swoje
+są
+ta
+tak
+takich
+takie
+także
+tam
+te
+tego
+tej
+temu
+ten
+teraz
+też
+to
+trzeba
+tu
+tych
+tylko
+tym
+tys
+tzw
+tę
+w
+we
+wie
+więc
+wszystko
+wśród
+właśnie
+z
+za
+zaś
+ze
+że
+żeby
+ii
+iii
+iv
+vi
+vii
+viii
+ix
+xi
+xii
+xiii
+xiv
+xv
Propchange: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java?rev=940433&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java (added)
+++ lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java Mon May 3 12:44:22 2010
@@ -0,0 +1,53 @@
+package org.apache.lucene.analysis.pl;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+
+/**
+ * Tests for {@link PolishAnalyzer}: resource loading, stemming, stopword
+ * removal and the stem exclusion set.
+ */
+public class TestPolishAnalyzer extends BaseTokenStreamTestCase {
+  /**
+   * Constructs a {@link PolishAnalyzer} with its defaults. This test fails
+   * with NPE when the stopwords file is missing in classpath.
+   */
+  public void testResourcesAvailable() {
+    new PolishAnalyzer(TEST_VERSION_CURRENT);
+  }
+
+  /**
+   * Checks that two inflected forms are reduced to the same stem and that a
+   * stopword produces no tokens at all.
+   */
+  public void testBasics() throws IOException {
+    Analyzer a = new PolishAnalyzer(TEST_VERSION_CURRENT);
+    // stemming
+    checkOneTermReuse(a, "studenta", "student");
+    checkOneTermReuse(a, "studenci", "student");
+    // stopword: the Polish word "byl" with a stroked l (U+0142). It is spelled
+    // with a Unicode escape so the literal does not depend on the encoding
+    // used to store or transmit this source file.
+    assertAnalyzesTo(a, "by\u0142", new String[] {});
+  }
+
+  /**
+   * Checks that a term placed in the exclusion set is emitted unchanged
+   * while other terms are still stemmed.
+   */
+  public void testExclude() throws IOException {
+    Set<String> exclusionSet = new HashSet<String>();
+    exclusionSet.add("studenta");
+    Analyzer a = new PolishAnalyzer(TEST_VERSION_CURRENT,
+        PolishAnalyzer.getDefaultStopSet(), exclusionSet);
+    checkOneTermReuse(a, "studenta", "studenta");
+    checkOneTermReuse(a, "studenci", "student");
+  }
+}
Propchange: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/test/org/egothor/stemmer/TestCompile.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/test/org/egothor/stemmer/TestCompile.java?rev=940433&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/test/org/egothor/stemmer/TestCompile.java (added)
+++ lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/test/org/egothor/stemmer/TestCompile.java Mon May 3 12:44:22 2010
@@ -0,0 +1,153 @@
+package org.egothor.stemmer;
+
+/*
+ Egothor Software License version 1.00
+ Copyright (C) 1997-2004 Leo Galambos.
+ Copyright (C) 2002-2004 "Egothor developers"
+ on behalf of the Egothor Project.
+ All rights reserved.
+
+ This software is copyrighted by the "Egothor developers". If this
+ license applies to a single file or document, the "Egothor developers"
+ are the people or entities mentioned as copyright holders in that file
+ or document. If this license applies to the Egothor project as a
+ whole, the copyright holders are the people or entities mentioned in
+ the file CREDITS. This file can be found in the same location as this
+ license in the distribution.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ 1. Redistributions of source code must retain the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, the list of contributors, this list of conditions, and the
+ disclaimer that follows these conditions in the documentation
+ and/or other materials provided with the distribution.
+ 3. The name "Egothor" must not be used to endorse or promote products
+ derived from this software without prior written permission. For
+ written permission, please contact Leo.G@seznam.cz
+ 4. Products derived from this software may not be called "Egothor",
+ nor may "Egothor" appear in their name, without prior written
+ permission from Leo.G@seznam.cz.
+
+ In addition, we request that you include in the end-user documentation
+ provided with the redistribution and/or in the software itself an
+ acknowledgement equivalent to the following:
+ "This product includes software developed by the Egothor Project.
+ http://egothor.sf.net/"
+
+ THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE EGOTHOR PROJECT OR ITS CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ This software consists of voluntary contributions made by many
+ individuals on behalf of the Egothor Project and was originally
+ created by Leo Galambos (Leo.G@seznam.cz).
+ */
+
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.LineNumberReader;
+import java.net.URI;
+import java.util.StringTokenizer;
+
+import org.apache.lucene.util.LuceneTestCase;
+
+/**
+ * Runs {@link Compile} over the {@code testRules.txt} resource with several
+ * option strings, loads the resulting table and checks that every token in
+ * the rules file is mapped back to the first token (the stem) of its line.
+ */
+public class TestCompile extends LuceneTestCase {
+
+  public void testCompile() throws Exception {
+    assertCompiles("test");
+  }
+
+  public void testCompileBackwards() throws Exception {
+    assertCompiles("-test");
+  }
+
+  public void testCompileMulti() throws Exception {
+    assertCompiles("Mtest");
+  }
+
+  /**
+   * Compiles {@code testRules.txt} with the given option string and verifies
+   * the produced table using both lookup strategies.
+   *
+   * @param method the first argument handed to {@link Compile#main}
+   * @throws Exception if compilation, loading or verification fails
+   */
+  private void assertCompiles(String method) throws Exception {
+    URI uri = getClass().getResource("testRules.txt").toURI();
+    String path = uri.getPath();
+    // The table is expected next to the rules file with an ".out" suffix.
+    String compiled = path + ".out";
+    try {
+      Compile.main(new String[] {method, path});
+      Trie trie = loadTrie(compiled);
+      assertTrie(trie, path, true, true);
+      assertTrie(trie, path, false, true);
+    } finally {
+      // Remove the generated table even when an assertion above fails, so a
+      // failing run does not leave stale output for the next one.
+      new File(compiled).delete();
+    }
+  }
+
+  /**
+   * Loads a compiled stemmer table from disk.
+   *
+   * @param path location of the compiled table
+   * @return a {@link MultiTrie} when the option string stored at the start
+   *         of the file contains the letter M (case-insensitive), otherwise
+   *         a plain {@link Trie}
+   * @throws IOException if the file cannot be read
+   */
+  static Trie loadTrie(String path) throws IOException {
+    DataInputStream is = new DataInputStream(new BufferedInputStream(
+        new FileInputStream(path)));
+    try {
+      String method = is.readUTF().toUpperCase();
+      if (method.indexOf('M') < 0) {
+        return new Trie(is);
+      }
+      return new MultiTrie(is);
+    } finally {
+      // Close the stream on every path, including a failed read.
+      is.close();
+    }
+  }
+
+  /**
+   * Reads the rules file line by line and asserts that applying the patch
+   * command found in the trie to each token yields the line's first token.
+   *
+   * @param trie the table under test
+   * @param file path of the rules file; the first token of each line is
+   *        treated as the stem, the remaining tokens as its forms
+   * @param usefull if true, look commands up with {@code getFully},
+   *        otherwise with {@code getLastOnPath}
+   * @param storeorig if true, also check that the stem maps to itself
+   * @throws Exception if reading fails or an assertion does not hold
+   */
+  private static void assertTrie(Trie trie, String file, boolean usefull,
+      boolean storeorig) throws Exception {
+    LineNumberReader in = new LineNumberReader(new BufferedReader(
+        new FileReader(file)));
+    try {
+      for (String line = in.readLine(); line != null; line = in.readLine()) {
+        try {
+          line = line.toLowerCase();
+          StringTokenizer st = new StringTokenizer(line);
+          String stem = st.nextToken();
+          if (storeorig) {
+            CharSequence cmd = (usefull) ? trie.getFully(stem) : trie
+                .getLastOnPath(stem);
+            StringBuilder stm = new StringBuilder(stem);
+            Diff.apply(stm, cmd);
+            assertEquals(stem.toLowerCase(), stm.toString().toLowerCase());
+          }
+          while (st.hasMoreTokens()) {
+            String token = st.nextToken();
+            if (token.equals(stem)) {
+              continue;
+            }
+            CharSequence cmd = (usefull) ? trie.getFully(token) : trie
+                .getLastOnPath(token);
+            StringBuilder stm = new StringBuilder(token);
+            Diff.apply(stm, cmd);
+            assertEquals(stem.toLowerCase(), stm.toString().toLowerCase());
+          }
+        } catch (java.util.NoSuchElementException x) {
+          // no base token (stem) on a line
+        }
+      }
+    } finally {
+      // The reader was previously never closed.
+      in.close();
+    }
+  }
+}
Propchange: lucene/dev/trunk/lucene/contrib/analyzers/stempel/src/test/org/egothor/stemmer/TestCompile.java
------------------------------------------------------------------------------
svn:eol-style = native