Posted to java-commits@lucene.apache.org by gs...@apache.org on 2007/12/21 21:08:25 UTC

svn commit: r606292 - /lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java

Author: gsingers
Date: Fri Dec 21 12:08:24 2007
New Revision: 606292

URL: http://svn.apache.org/viewvc?rev=606292&view=rev
Log:
Checkin of WikipediaTokenizer that extends StandardTokenizer using JFlex.  examples/wikipedia/README contains info on running it.
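
For orientation, a minimal sketch of driving the new tokenizer the same way the test below drives StandardAnalyzer's token stream. The package, class name and Reader constructor are assumptions based on the contrib layout of the time (see examples/wikipedia/README from this checkin), not part of this diff; termText() and type() are the same 2.3-era Token calls used in the test.

    import java.io.StringReader;

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    // assumed contrib location of the tokenizer added in this checkin
    import org.apache.lucene.wikipedia.analysis.WikipediaTokenizer;

    public class WikipediaTokenizerDemo {
      public static void main(String[] args) throws Exception {
        String markup = "[[Main Page|the main page]] and some ''italic'' text";
        // assumed Reader constructor
        TokenStream ts = new WikipediaTokenizer(new StringReader(markup));
        for (Token t = ts.next(); t != null; t = ts.next()) {
          // print each token's text and its type (e.g. internal link vs. plain word)
          System.out.println(t.termText() + "\t" + t.type());
        }
        ts.close();
      }
    }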

Modified:
    lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java

Modified: lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java?rev=606292&r1=606291&r2=606292&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java Fri Dec 21 12:08:24 2007
@@ -1,7 +1,7 @@
 package org.apache.lucene.analysis;
 
-import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.util.LuceneTestCase;
 
 import java.io.StringReader;
 
@@ -23,177 +23,176 @@
 
 public class TestStandardAnalyzer extends LuceneTestCase {
 
-    private Analyzer a = new StandardAnalyzer();
-
-    public void assertAnalyzesTo(Analyzer a, String input, String[] expected) throws Exception {
-	assertAnalyzesTo(a, input, expected, null);
-    }
-
-    public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes) throws Exception {
-	TokenStream ts = a.tokenStream("dummy", new StringReader(input));
-	for (int i = 0; i < expectedImages.length; i++) {
-	    Token t = ts.next();
-	    assertNotNull(t);
-	    assertEquals(expectedImages[i], t.termText());
-	    if (expectedTypes != null)
-	    {
-		assertEquals(expectedTypes[i], t.type());
-	    }
-	}
-	assertNull(ts.next());
-	ts.close();
-    }
-
-
-    public void testAlphanumeric() throws Exception {
-	// alphanumeric tokens
-	assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
-	assertAnalyzesTo(a, "2B", new String[]{"2b"});
-    }
-
-    public void testUnderscores() throws Exception {
-	// underscores are delimiters, but not in email addresses (below)
-	assertAnalyzesTo(a, "word_having_underscore", new String[]{"word", "having", "underscore"});
-	assertAnalyzesTo(a, "word_with_underscore_and_stopwords", new String[]{"word", "underscore", "stopwords"});
-    }
-
-    public void testDelimiters() throws Exception {
-	// other delimiters: "-", "/", ","
-	assertAnalyzesTo(a, "some-dashed-phrase",   new String[]{"some", "dashed", "phrase" });
-	assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
-	assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
-    }
-
-    public void testApostrophes() throws Exception {
-	// internal apostrophes: O'Reilly, you're, O'Reilly's
-	// possessives are actually removed by StardardFilter, not the tokenizer
-	assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
-	assertAnalyzesTo(a, "you're", new String[]{"you're"});
-	assertAnalyzesTo(a, "she's", new String[]{"she"});
-	assertAnalyzesTo(a, "Jim's", new String[]{"jim"});
-	assertAnalyzesTo(a, "don't", new String[]{"don't"});
-	assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly"});
-    }
-
-    public void testTSADash() throws Exception {
-	// t and s had been stopwords in Lucene <= 2.0, which made it impossible
-	// to correctly search for these terms:
-	assertAnalyzesTo(a, "s-class", new String[]{"s", "class"});
-	assertAnalyzesTo(a, "t-com", new String[]{"t", "com"});
-	// 'a' is still a stopword:
-	assertAnalyzesTo(a, "a-class", new String[]{"class"});
-    }
-
-    public void testCompanyNames() throws Exception {
-	// company names
-	assertAnalyzesTo(a, "AT&T", new String[]{"at&t"});
-	assertAnalyzesTo(a, "Excite@Home", new String[]{"excite@home"});
-    }
-
-    public void testDomainNames() throws Exception {
-	// domain names
-	assertAnalyzesTo(a, "www.nutch.org",   new String[]{"www.nutch.org" });
-    }
-
-    public void testEMailAddresses() throws Exception {
-	// email addresses, possibly with underscores, periods, etc
-	assertAnalyzesTo(a, "test@example.com", new String[]{"test@example.com"});
-	assertAnalyzesTo(a, "first.lastname@example.com", new String[]{"first.lastname@example.com"});
-	assertAnalyzesTo(a, "first_lastname@example.com", new String[]{"first_lastname@example.com"});
-    }
-
-    public void testNumeric() throws Exception {
-	// floating point, serial, model numbers, ip addresses, etc.
-	// every other segment must have at least one digit
-	assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
-	assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
-	assertAnalyzesTo(a, "216.239.63.104",   new String[]{"216.239.63.104"});
-	assertAnalyzesTo(a, "1-2-3",   new String[]{"1-2-3"});
-	assertAnalyzesTo(a, "a1-b2-c3",   new String[]{"a1-b2-c3"});
-	assertAnalyzesTo(a, "a1-b-c3",   new String[]{"a1-b-c3"});
-    }
+  private Analyzer a = new StandardAnalyzer();
 
-    public void testTextWithNumbers() throws Exception {
-	// numbers
-	assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"});
-    }
-
-    public void testVariousText() throws Exception {
-	// various
-	assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted" });
-	assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"});
-	assertAnalyzesTo(a, "foo      bar .  FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"});
-	assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
-    }
-
-    public void testAcronyms() throws Exception {
-	// acronyms have their dots stripped
-	assertAnalyzesTo(a, "U.S.A.", new String[]{ "usa" });
-    }
-
-    public void testCPlusPlusHash() throws Exception {
-	// It would be nice to change the grammar in StandardTokenizer.jj to make "C#" and "C++" end up as tokens.
-	assertAnalyzesTo(a, "C++", new String[]{"c"});
-	assertAnalyzesTo(a, "C#", new String[]{"c"});
-    }
-
-    public void testKorean() throws Exception {
-	// Korean words
-	assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
-    }
-
-    // Compliance with the "old" JavaCC-based analyzer, see:
-    // https://issues.apache.org/jira/browse/LUCENE-966#action_12516752
-
-    public void testComplianceFileName() throws Exception {
-	assertAnalyzesTo(a, "2004.jpg",
-		new String[] { "2004.jpg" },
-		new String[] { "<HOST>" });
-    }
-
-    public void testComplianceNumericIncorrect() throws Exception {
-	assertAnalyzesTo(a, "62.46",
-		new String[] { "62.46" },
-		new String[] { "<HOST>" });
-    }
-
-    public void testComplianceNumericLong() throws Exception {
-	assertAnalyzesTo(a, "978-0-94045043-1",
-		new String[] { "978-0-94045043-1" },
-		new String[] { "<NUM>" });
-    }
-
-    public void testComplianceNumericFile() throws Exception {
-	assertAnalyzesTo(
-		a,
-		"78academyawards/rules/rule02.html",
-		new String[] { "78academyawards/rules/rule02.html" },
-		new String[] { "<NUM>" });
-    }
-
-    public void testComplianceNumericWithUnderscores() throws Exception {
-	assertAnalyzesTo(
-		a,
-		"2006-03-11t082958z_01_ban130523_rtridst_0_ozabs",
-		new String[] { "2006-03-11t082958z_01_ban130523_rtridst_0_ozabs" },
-		new String[] { "<NUM>" });
-    }
-
-    public void testComplianceNumericWithDash() throws Exception {
-	assertAnalyzesTo(a, "mid-20th", new String[] { "mid-20th" },
-		new String[] { "<NUM>" });
-    }
-
-    public void testComplianceManyTokens() throws Exception {
-	assertAnalyzesTo(
-		a,
-		"/money.cnn.com/magazines/fortune/fortune_archive/2007/03/19/8402357/index.htm "
-			+ "safari-0-sheikh-zayed-grand-mosque.jpg",
-		new String[] { "money.cnn.com", "magazines", "fortune",
-			"fortune", "archive/2007/03/19/8402357", "index.htm",
-			"safari-0-sheikh", "zayed", "grand", "mosque.jpg" },
-		new String[] { "<HOST>", "<ALPHANUM>", "<ALPHANUM>",
-			"<ALPHANUM>", "<NUM>", "<HOST>", "<NUM>", "<ALPHANUM>",
-			"<ALPHANUM>", "<HOST>" });
-    }
+  public void assertAnalyzesTo(Analyzer a, String input, String[] expected) throws Exception {
+    assertAnalyzesTo(a, input, expected, null);
+  }
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes) throws Exception {
+    TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+    for (int i = 0; i < expectedImages.length; i++) {
+      Token t = ts.next();
+      assertNotNull(t);
+      assertEquals(expectedImages[i], t.termText());
+      if (expectedTypes != null) {
+        assertEquals(expectedTypes[i], t.type());
+      }
+    }
+    assertNull(ts.next());
+    ts.close();
+  }
+
+
+  public void testAlphanumeric() throws Exception {
+    // alphanumeric tokens
+    assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
+    assertAnalyzesTo(a, "2B", new String[]{"2b"});
+  }
+
+  public void testUnderscores() throws Exception {
+    // underscores are delimiters, but not in email addresses (below)
+    assertAnalyzesTo(a, "word_having_underscore", new String[]{"word", "having", "underscore"});
+    assertAnalyzesTo(a, "word_with_underscore_and_stopwords", new String[]{"word", "underscore", "stopwords"});
+  }
+
+  public void testDelimiters() throws Exception {
+    // other delimiters: "-", "/", ","
+    assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"});
+    assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"});
+    assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
+  }
+
+  public void testApostrophes() throws Exception {
+    // internal apostrophes: O'Reilly, you're, O'Reilly's
+    // possessives are actually removed by StandardFilter, not the tokenizer
+    assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
+    assertAnalyzesTo(a, "you're", new String[]{"you're"});
+    assertAnalyzesTo(a, "she's", new String[]{"she"});
+    assertAnalyzesTo(a, "Jim's", new String[]{"jim"});
+    assertAnalyzesTo(a, "don't", new String[]{"don't"});
+    assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly"});
+  }
+
+  public void testTSADash() throws Exception {
+    // t and s had been stopwords in Lucene <= 2.0, which made it impossible
+    // to correctly search for these terms:
+    assertAnalyzesTo(a, "s-class", new String[]{"s", "class"});
+    assertAnalyzesTo(a, "t-com", new String[]{"t", "com"});
+    // 'a' is still a stopword:
+    assertAnalyzesTo(a, "a-class", new String[]{"class"});
+  }
+
+  public void testCompanyNames() throws Exception {
+    // company names
+    assertAnalyzesTo(a, "AT&T", new String[]{"at&t"});
+    assertAnalyzesTo(a, "Excite@Home", new String[]{"excite@home"});
+  }
+
+  public void testDomainNames() throws Exception {
+    // domain names
+    assertAnalyzesTo(a, "www.nutch.org", new String[]{"www.nutch.org"});
+  }
+
+  public void testEMailAddresses() throws Exception {
+    // email addresses, possibly with underscores, periods, etc
+    assertAnalyzesTo(a, "test@example.com", new String[]{"test@example.com"});
+    assertAnalyzesTo(a, "first.lastname@example.com", new String[]{"first.lastname@example.com"});
+    assertAnalyzesTo(a, "first_lastname@example.com", new String[]{"first_lastname@example.com"});
+  }
+
+  public void testNumeric() throws Exception {
+    // floating point, serial, model numbers, ip addresses, etc.
+    // every other segment must have at least one digit
+    assertAnalyzesTo(a, "21.35", new String[]{"21.35"}, new String[]{"<NUM>"});
+    assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
+    assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
+    assertAnalyzesTo(a, "1-2-3", new String[]{"1-2-3"});
+    assertAnalyzesTo(a, "a1-b2-c3", new String[]{"a1-b2-c3"});
+    assertAnalyzesTo(a, "a1-b-c3", new String[]{"a1-b-c3"});
+  }
+
+  public void testTextWithNumbers() throws Exception {
+    // numbers
+    assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"});
+  }
+
+  public void testVariousText() throws Exception {
+    // various
+    assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted"});
+    assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"});
+    assertAnalyzesTo(a, "foo      bar .  FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"});
+    assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
+  }
+
+  public void testAcronyms() throws Exception {
+    // acronyms have their dots stripped
+    assertAnalyzesTo(a, "U.S.A.", new String[]{"usa"});
+  }
+
+  public void testCPlusPlusHash() throws Exception {
+    // It would be nice to change the grammar in StandardTokenizer.jj to make "C#" and "C++" end up as tokens.
+    assertAnalyzesTo(a, "C++", new String[]{"c"});
+    assertAnalyzesTo(a, "C#", new String[]{"c"});
+  }
+
+  public void testKorean() throws Exception {
+    // Korean words
+    assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
+  }
+
+  // Compliance with the "old" JavaCC-based analyzer, see:
+  // https://issues.apache.org/jira/browse/LUCENE-966#action_12516752
+
+  public void testComplianceFileName() throws Exception {
+    assertAnalyzesTo(a, "2004.jpg",
+            new String[]{"2004.jpg"},
+            new String[]{"<HOST>"});
+  }
+
+  public void testComplianceNumericIncorrect() throws Exception {
+    assertAnalyzesTo(a, "62.46",
+            new String[]{"62.46"},
+            new String[]{"<HOST>"});
+  }
+
+  public void testComplianceNumericLong() throws Exception {
+    assertAnalyzesTo(a, "978-0-94045043-1",
+            new String[]{"978-0-94045043-1"},
+            new String[]{"<NUM>"});
+  }
+
+  public void testComplianceNumericFile() throws Exception {
+    assertAnalyzesTo(
+            a,
+            "78academyawards/rules/rule02.html",
+            new String[]{"78academyawards/rules/rule02.html"},
+            new String[]{"<NUM>"});
+  }
+
+  public void testComplianceNumericWithUnderscores() throws Exception {
+    assertAnalyzesTo(
+            a,
+            "2006-03-11t082958z_01_ban130523_rtridst_0_ozabs",
+            new String[]{"2006-03-11t082958z_01_ban130523_rtridst_0_ozabs"},
+            new String[]{"<NUM>"});
+  }
+
+  public void testComplianceNumericWithDash() throws Exception {
+    assertAnalyzesTo(a, "mid-20th", new String[]{"mid-20th"},
+            new String[]{"<NUM>"});
+  }
+
+  public void testComplianceManyTokens() throws Exception {
+    assertAnalyzesTo(
+            a,
+            "/money.cnn.com/magazines/fortune/fortune_archive/2007/03/19/8402357/index.htm "
+                    + "safari-0-sheikh-zayed-grand-mosque.jpg",
+            new String[]{"money.cnn.com", "magazines", "fortune",
+                    "fortune", "archive/2007/03/19/8402357", "index.htm",
+                    "safari-0-sheikh", "zayed", "grand", "mosque.jpg"},
+            new String[]{"<HOST>", "<ALPHANUM>", "<ALPHANUM>",
+                    "<ALPHANUM>", "<NUM>", "<HOST>", "<NUM>", "<ALPHANUM>",
+                    "<ALPHANUM>", "<HOST>"});
+  }
 }