You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by gs...@apache.org on 2008/01/04 15:29:18 UTC
svn commit: r608852 - in /lucene/java/trunk: ./ contrib/wikipedia/
contrib/wikipedia/src/ contrib/wikipedia/src/java/
contrib/wikipedia/src/java/org/ contrib/wikipedia/src/java/org/apache/
contrib/wikipedia/src/java/org/apache/lucene/ contrib/wikipedia...
Author: gsingers
Date: Fri Jan 4 06:29:15 2008
New Revision: 608852
URL: http://svn.apache.org/viewvc?rev=608852&view=rev
Log:
LUCENE-1103
Added:
lucene/java/trunk/contrib/wikipedia/
lucene/java/trunk/contrib/wikipedia/build.xml (with props)
lucene/java/trunk/contrib/wikipedia/pom.xml.template
lucene/java/trunk/contrib/wikipedia/src/
lucene/java/trunk/contrib/wikipedia/src/java/
lucene/java/trunk/contrib/wikipedia/src/java/org/
lucene/java/trunk/contrib/wikipedia/src/java/org/apache/
lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/
lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/
lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/
lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java (with props)
lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.java (with props)
lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex
lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/package.html (with props)
lucene/java/trunk/contrib/wikipedia/src/test/
lucene/java/trunk/contrib/wikipedia/src/test/org/
lucene/java/trunk/contrib/wikipedia/src/test/org/apache/
lucene/java/trunk/contrib/wikipedia/src/test/org/apache/lucene/
lucene/java/trunk/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/
lucene/java/trunk/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/
lucene/java/trunk/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java (with props)
Modified:
lucene/java/trunk/build.xml
lucene/java/trunk/docs/developer-resources.html
lucene/java/trunk/docs/developer-resources.pdf
lucene/java/trunk/src/site/src/documentation/content/xdocs/developer-resources.xml
Modified: lucene/java/trunk/build.xml
URL: http://svn.apache.org/viewvc/lucene/java/trunk/build.xml?rev=608852&r1=608851&r2=608852&view=diff
==============================================================================
--- lucene/java/trunk/build.xml (original)
+++ lucene/java/trunk/build.xml Fri Jan 4 06:29:15 2008
@@ -250,6 +250,7 @@
<packageset dir="contrib/spellchecker/src/java"/>
<packageset dir="contrib/surround/src/java"/>
<packageset dir="contrib/swing/src/java"/>
+ <packageset dir="contrib/wikipedia/src/java"/>
<packageset dir="contrib/wordnet/src/java"/>
<packageset dir="contrib/xml-query-parser/src/java"/>
<!-- end alpha sort -->
@@ -279,6 +280,7 @@
<group title="contrib: SpellChecker" packages="org.apache.lucene.search.spell*"/>
<group title="contrib: Surround Parser" packages="org.apache.lucene.queryParser.surround*"/>
<group title="contrib: Swing" packages="org.apache.lucene.swing*"/>
+ <group title="contrib: Wikipedia" packages="org.apache.lucene.wikipedia*"/>
<group title="contrib: WordNet" packages="org.apache.lucene.wordnet*"/>
<group title="contrib: XML Query Parser" packages="org.apache.lucene.xmlparser*"/>
Added: lucene/java/trunk/contrib/wikipedia/build.xml
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/wikipedia/build.xml?rev=608852&view=auto
==============================================================================
--- lucene/java/trunk/contrib/wikipedia/build.xml (added)
+++ lucene/java/trunk/contrib/wikipedia/build.xml Fri Jan 4 06:29:15 2008
@@ -0,0 +1,49 @@
+<?xml version="1.0"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ -->
+
+<project name="wikipedia" default="default">
+
+ <description>
+ Tools for working with Wikipedia
+ </description>
+
+
+ <import file="../contrib-build.xml"/>
+
+
+ <target name="jflex" depends="clean-jflex,jflex-wiki-tokenizer"/>
+
+ <target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
+ <taskdef classname="JFlex.anttask.JFlexTask" name="jflex">
+ <classpath location="${jflex.home}/lib/JFlex.jar"/>
+ </taskdef>
+
+ <jflex file="src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex"
+ outdir="src/java/org/apache/lucene/wikipedia/analysis"
+ nobak="on"/>
+ </target>
+
+ <target name="clean-jflex"> <!-- deletes JFlex-generated sources from the analysis package (the jflex task's outdir) so they are regenerated -->
+ <delete>
+ <fileset dir="src/java/org/apache/lucene/wikipedia/analysis" includes="*.java">
+ <containsregexp expression="generated.*by.*JFlex"/>
+ </fileset>
+ </delete>
+ </target>
+</project>
Propchange: lucene/java/trunk/contrib/wikipedia/build.xml
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/wikipedia/pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/wikipedia/pom.xml.template?rev=608852&view=auto
==============================================================================
--- lucene/java/trunk/contrib/wikipedia/pom.xml.template (added)
+++ lucene/java/trunk/contrib/wikipedia/pom.xml.template Fri Jan 4 06:29:15 2008
@@ -0,0 +1,43 @@
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+ <!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+ -->
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-contrib</artifactId>
+ <version>@version@</version>
+ </parent>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-wikipedia</artifactId>
+ <name>Lucene Wikipedia Tools</name>
+ <version>@version@</version>
+ <description>Lucene Wikipedia Contributions</description>
+ <packaging>jar</packaging>
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-benchmark</artifactId>
+ <version>@version@</version>
+ </dependency>
+ </dependencies>
+</project>
Added: lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java?rev=608852&view=auto
==============================================================================
--- lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java (added)
+++ lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java Fri Jan 4 06:29:15 2008
@@ -0,0 +1,100 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.wikipedia.analysis;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Tokenizer;
+
+import java.io.Reader;
+import java.io.IOException;
+
+
+/**
+ * Tokenizer (in the style of StandardTokenizer, but not a subclass of it) that is aware of Wikipedia syntax. It is based off of the
+ * Wikipedia tutorial available at http://en.wikipedia.org/wiki/Wikipedia:Tutorial, but it may not be complete.
+ *
+ *
+ **/
+public class WikipediaTokenizer extends Tokenizer {
+ public static final String INTERNAL_LINK = "il";
+ public static final String EXTERNAL_LINK = "el";
+ //The URL part of the link, i.e. the first token
+ public static final String EXTERNAL_LINK_URL = "elu";
+ public static final String CITATION = "ci";
+ public static final String CATEGORY = "c";
+ public static final String BOLD = "b";
+ public static final String ITALICS = "i";
+ public static final String BOLD_ITALICS = "bi";
+ public static final String HEADING = "h";
+ public static final String SUB_HEADING = "sh";
+ /**
+ * A private instance of the JFlex-constructed scanner
+ */
+ private final WikipediaTokenizerImpl scanner;
+
+ void setInput(Reader reader) {
+ this.input = reader;
+ }
+
+ /**
+ * Creates a new instance of the {@link WikipediaTokenizer}. Attaches the
+ * <code>input</code> to a newly created JFlex scanner.
+ * @param input The Input Reader
+ */
+ public WikipediaTokenizer(Reader input) {
+ this.input = input;
+ this.scanner = new WikipediaTokenizerImpl(input);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.apache.lucene.analysis.TokenStream#next(Token)
+ */
+ public Token next(Token result) throws IOException {
+ int tokenType = scanner.getNextToken();
+
+ if (tokenType == WikipediaTokenizerImpl.YYEOF) {
+ return null;
+ }
+
+ scanner.getText(result, tokenType);
+ final int start = scanner.yychar();
+ result.setStartOffset(start);
+ result.setEndOffset(start + result.termLength());
+ result.setPositionIncrement(scanner.getPositionIncrement());
+ result.setType(WikipediaTokenizerImpl.TOKEN_TYPES[tokenType]);
+ return result;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.apache.lucene.analysis.TokenStream#reset()
+ */
+ public void reset() throws IOException {
+ super.reset();
+ scanner.yyreset(input);
+ }
+
+ public void reset(Reader reader) throws IOException {
+ input = reader;
+ reset();
+ }
+
+}
\ No newline at end of file
Propchange: lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.java?rev=608852&view=auto
==============================================================================
--- lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.java (added)
+++ lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.java Fri Jan 4 06:29:15 2008
@@ -0,0 +1,949 @@
+/* The following code was generated by JFlex 1.4.1 on 1/3/08 10:05 PM */
+
+package org.apache.lucene.wikipedia.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Token;
+
+
+/**
+ * This class is a scanner generated by
+ * <a href="http://www.jflex.de/">JFlex</a> 1.4.1
+ * on 1/3/08 10:05 PM from the specification file
+ * <tt>/Volumes/User/grantingersoll/projects/lucene/Lucene-Trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex</tt>
+ */
+class WikipediaTokenizerImpl {
+
+ /** This character denotes the end of file */
+ public static final int YYEOF = -1;
+
+ /** initial size of the lookahead buffer */
+ private static final int ZZ_BUFFERSIZE = 16384;
+
+ /** lexical states */
+ public static final int DOUBLE_BRACE_STATE = 7;
+ public static final int INTERNAL_LINK_STATE = 2;
+ public static final int TWO_SINGLE_QUOTES_STATE = 4;
+ public static final int CATEGORY_STATE = 1;
+ public static final int FIVE_SINGLE_QUOTES_STATE = 5;
+ public static final int STRING = 8;
+ public static final int YYINITIAL = 0;
+ public static final int DOUBLE_EQUALS_STATE = 6;
+ public static final int THREE_SINGLE_QUOTES_STATE = 5;
+ public static final int EXTERNAL_LINK_STATE = 3;
+
+ /**
+ * Translates characters to character classes
+ */
+ private static final String ZZ_CMAP_PACKED =
+ "\11\0\1\24\1\23\1\0\1\24\1\22\22\0\1\24\1\0\1\12"+
+ "\1\52\2\0\1\3\1\1\4\0\1\14\1\5\1\2\1\10\12\16"+
+ "\1\27\1\0\1\7\1\11\1\13\1\52\1\4\2\15\1\30\5\15"+
+ "\1\41\21\15\1\25\1\0\1\26\1\0\1\6\1\0\1\31\1\43"+
+ "\2\15\1\33\1\40\1\34\1\50\1\41\4\15\1\42\1\35\1\51"+
+ "\1\15\1\36\1\15\1\32\3\15\1\44\1\37\1\15\1\45\1\47"+
+ "\1\46\102\0\27\15\1\0\37\15\1\0\u0568\15\12\17\206\15\12\17"+
+ "\u026c\15\12\17\166\15\12\17\166\15\12\17\166\15\12\17\166\15\12\17"+
+ "\167\15\11\17\166\15\12\17\166\15\12\17\166\15\12\17\340\15\12\17"+
+ "\166\15\12\17\u0166\15\12\17\266\15\u0100\15\u0e00\15\u1040\0\u0150\21\140\0"+
+ "\20\21\u0100\0\200\21\200\0\u19c0\21\100\0\u5200\21\u0c00\0\u2bb0\20\u2150\0"+
+ "\u0200\21\u0465\0\73\21\75\15\43\0";
+
+ /**
+ * Translates characters to character classes
+ */
+ private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+ /**
+ * Translates DFA states to action switch labels.
+ */
+ private static final int [] ZZ_ACTION = zzUnpackAction();
+
+ private static final String ZZ_ACTION_PACKED_0 =
+ "\11\0\4\1\4\2\1\3\1\1\1\4\2\1\1\5"+
+ "\1\1\1\6\2\7\1\10\1\11\1\10\1\12\1\13"+
+ "\1\7\1\14\1\15\1\16\1\17\1\7\1\20\1\7"+
+ "\4\21\1\22\1\21\1\23\1\24\1\25\3\0\1\26"+
+ "\14\0\1\27\1\30\1\10\1\0\1\31\1\0\1\32"+
+ "\1\0\1\33\3\0\1\34\1\35\2\36\1\35\2\37"+
+ "\2\0\1\36\1\0\14\36\1\35\3\0\1\10\1\40"+
+ "\3\0\1\41\1\42\5\0\1\43\4\0\1\43\2\0"+
+ "\2\43\2\0\1\10\5\0\1\30\1\35\1\36\1\44"+
+ "\5\0\1\45\30\0\1\46\2\0\1\47\1\50\1\51";
+
+ private static int [] zzUnpackAction() {
+ int [] result = new int[174];
+ int offset = 0;
+ offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackAction(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+
+ /**
+ * Translates a state to a row index in the transition table
+ */
+ private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+ private static final String ZZ_ROWMAP_PACKED_0 =
+ "\0\0\0\53\0\126\0\201\0\254\0\327\0\u0102\0\u012d"+
+ "\0\u0158\0\u0183\0\u01ae\0\u01d9\0\u0204\0\u022f\0\u025a\0\u0285"+
+ "\0\u02b0\0\u0183\0\u02db\0\u0306\0\u0331\0\u035c\0\u0387\0\u03b2"+
+ "\0\u03dd\0\u0183\0\u035c\0\u0408\0\u0183\0\u0433\0\u045e\0\u0489"+
+ "\0\u04b4\0\u04df\0\u050a\0\u0535\0\u0560\0\u058b\0\u05b6\0\u05e1"+
+ "\0\u0183\0\u060c\0\u035c\0\u0637\0\u0662\0\u068d\0\u06b8\0\u0183"+
+ "\0\u0183\0\u06e3\0\u070e\0\u0739\0\u0183\0\u0764\0\u078f\0\u07ba"+
+ "\0\u07e5\0\u0810\0\u083b\0\u0866\0\u0891\0\u08bc\0\u08e7\0\u0912"+
+ "\0\u093d\0\u0968\0\u0993\0\u09be\0\u09e9\0\u0a14\0\u0a3f\0\u0a6a"+
+ "\0\u0a95\0\u0ac0\0\u0aeb\0\u0b16\0\u0b41\0\u0b6c\0\u0b97\0\u0bc2"+
+ "\0\u0bed\0\u0c18\0\u07ba\0\u0c43\0\u0c6e\0\u0c99\0\u0cc4\0\u0cef"+
+ "\0\u0d1a\0\u0d45\0\u0d70\0\u0d9b\0\u0dc6\0\u0df1\0\u0e1c\0\u0e47"+
+ "\0\u0e72\0\u0e9d\0\u0ec8\0\u0ef3\0\u0f1e\0\u0f49\0\u0f74\0\u0f9f"+
+ "\0\u0fca\0\u0183\0\u0ff5\0\u1020\0\u104b\0\u1076\0\u0183\0\u10a1"+
+ "\0\u10cc\0\u10f7\0\u1122\0\u114d\0\u1178\0\u11a3\0\u11ce\0\u11f9"+
+ "\0\u1224\0\u124f\0\u127a\0\u12a5\0\u078f\0\u0912\0\u12d0\0\u12fb"+
+ "\0\u1326\0\u1351\0\u137c\0\u13a7\0\u13d2\0\u13fd\0\u0183\0\u1428"+
+ "\0\u1453\0\u147e\0\u14a9\0\u14d4\0\u14ff\0\u152a\0\u1555\0\u0183"+
+ "\0\u1580\0\u15ab\0\u15d6\0\u1601\0\u162c\0\u1657\0\u1682\0\u16ad"+
+ "\0\u16d8\0\u1703\0\u172e\0\u1759\0\u1784\0\u17af\0\u17da\0\u1805"+
+ "\0\u1830\0\u185b\0\u1886\0\u18b1\0\u18dc\0\u1907\0\u1932\0\u195d"+
+ "\0\u1988\0\u19b3\0\u19de\0\u0183\0\u0183\0\u0183";
+
+ private static int [] zzUnpackRowMap() {
+ int [] result = new int[174];
+ int offset = 0;
+ offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackRowMap(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int high = packed.charAt(i++) << 16;
+ result[j++] = high | packed.charAt(i++);
+ }
+ return j;
+ }
+
+ /**
+ * The transition table of the DFA
+ */
+ private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+ private static final String ZZ_TRANS_PACKED_0 =
+ "\1\12\1\13\5\12\1\14\1\12\1\15\3\12\1\16"+
+ "\1\17\1\20\1\21\1\22\1\23\2\12\1\24\2\12"+
+ "\15\16\1\25\2\12\2\16\10\12\1\26\5\12\4\27"+
+ "\1\12\1\23\3\12\1\30\1\12\15\27\3\12\2\27"+
+ "\10\12\1\26\5\12\4\31\1\12\1\23\3\12\1\30"+
+ "\1\12\15\31\3\12\2\31\1\12\7\32\1\33\5\32"+
+ "\4\34\1\32\1\23\2\12\1\32\1\35\1\32\15\34"+
+ "\3\32\1\36\1\34\2\32\1\37\5\32\1\33\5\32"+
+ "\4\40\1\32\1\41\2\32\1\42\2\32\15\40\3\32"+
+ "\2\40\10\32\1\33\5\32\4\43\1\32\1\41\2\32"+
+ "\1\42\2\32\15\43\3\32\2\43\10\32\1\33\1\32"+
+ "\1\44\3\32\4\45\1\32\1\41\5\32\15\45\3\32"+
+ "\2\45\10\32\1\46\5\32\4\47\1\32\1\41\5\32"+
+ "\15\47\1\32\1\50\1\32\2\47\1\32\1\51\1\52"+
+ "\5\51\1\53\1\51\1\54\3\51\4\55\1\51\1\56"+
+ "\2\51\1\57\2\51\15\55\2\51\1\60\2\55\1\51"+
+ "\54\0\1\61\61\0\1\62\4\0\4\63\7\0\6\63"+
+ "\1\64\6\63\3\0\2\63\12\0\1\65\42\0\1\66"+
+ "\1\67\1\70\1\71\2\72\1\0\1\73\3\0\1\73"+
+ "\1\16\1\17\1\20\1\21\7\0\15\16\3\0\2\16"+
+ "\3\0\1\74\1\0\1\75\2\76\1\0\1\77\3\0"+
+ "\1\77\3\17\1\21\7\0\15\17\3\0\2\17\2\0"+
+ "\1\66\1\100\1\70\1\71\2\76\1\0\1\77\3\0"+
+ "\1\77\1\20\1\17\1\20\1\21\7\0\15\20\3\0"+
+ "\2\20\3\0\1\101\1\0\1\75\2\72\1\0\1\73"+
+ "\3\0\1\73\4\21\7\0\15\21\3\0\2\21\24\0"+
+ "\1\12\54\0\1\102\72\0\1\103\15\0\1\62\4\0"+
+ "\4\63\7\0\15\63\3\0\2\63\16\0\4\27\7\0"+
+ "\15\27\3\0\2\27\27\0\1\35\41\0\4\31\7\0"+
+ "\15\31\3\0\2\31\16\0\4\34\7\0\15\34\3\0"+
+ "\2\34\16\0\4\34\7\0\2\34\1\104\12\34\3\0"+
+ "\2\34\2\0\1\105\66\0\4\40\7\0\15\40\3\0"+
+ "\2\40\24\0\1\32\54\0\1\106\42\0\4\43\7\0"+
+ "\15\43\3\0\2\43\12\0\1\35\56\0\4\45\7\0"+
+ "\15\45\3\0\2\45\11\0\1\107\4\0\4\63\7\0"+
+ "\15\63\3\0\2\63\16\0\4\47\7\0\15\47\3\0"+
+ "\2\47\47\0\1\35\5\0\1\110\62\0\1\111\56\0"+
+ "\4\55\7\0\15\55\3\0\2\55\24\0\1\51\54\0"+
+ "\1\112\42\0\4\63\7\0\15\63\3\0\2\63\14\0"+
+ "\1\32\1\0\4\113\1\0\3\114\3\0\15\113\3\0"+
+ "\2\113\14\0\1\32\1\0\4\113\1\0\3\114\3\0"+
+ "\3\113\1\115\11\113\3\0\2\113\16\0\1\116\1\0"+
+ "\1\116\10\0\15\116\3\0\2\116\16\0\1\117\1\120"+
+ "\1\121\1\122\7\0\15\117\3\0\2\117\16\0\1\123"+
+ "\1\0\1\123\10\0\15\123\3\0\2\123\16\0\1\124"+
+ "\1\125\1\124\1\125\7\0\15\124\3\0\2\124\16\0"+
+ "\1\126\2\127\1\130\7\0\15\126\3\0\2\126\16\0"+
+ "\1\73\2\131\10\0\15\73\3\0\2\73\16\0\1\132"+
+ "\2\133\1\134\7\0\15\132\3\0\2\132\16\0\4\125"+
+ "\7\0\15\125\3\0\2\125\16\0\1\135\2\136\1\137"+
+ "\7\0\15\135\3\0\2\135\16\0\1\140\2\141\1\142"+
+ "\7\0\15\140\3\0\2\140\16\0\1\143\1\133\1\144"+
+ "\1\134\7\0\15\143\3\0\2\143\16\0\1\145\2\120"+
+ "\1\122\7\0\15\145\3\0\2\145\30\0\1\146\1\147"+
+ "\63\0\1\150\26\0\4\34\7\0\2\34\1\151\12\34"+
+ "\3\0\2\34\2\0\1\152\100\0\1\153\1\154\37\0"+
+ "\4\63\7\0\6\63\1\155\6\63\3\0\2\63\2\0"+
+ "\1\156\62\0\1\157\70\0\1\160\1\161\33\0\1\162"+
+ "\1\0\1\32\1\0\4\113\1\0\3\114\3\0\15\113"+
+ "\3\0\2\113\16\0\4\163\1\0\3\114\3\0\15\163"+
+ "\3\0\2\163\12\0\1\162\1\0\1\32\1\0\4\113"+
+ "\1\0\3\114\3\0\10\113\1\164\4\113\3\0\2\113"+
+ "\2\0\1\66\13\0\1\116\1\0\1\116\10\0\15\116"+
+ "\3\0\2\116\3\0\1\165\1\0\1\75\2\166\6\0"+
+ "\1\117\1\120\1\121\1\122\7\0\15\117\3\0\2\117"+
+ "\3\0\1\167\1\0\1\75\2\170\1\0\1\171\3\0"+
+ "\1\171\3\120\1\122\7\0\15\120\3\0\2\120\3\0"+
+ "\1\172\1\0\1\75\2\170\1\0\1\171\3\0\1\171"+
+ "\1\121\1\120\1\121\1\122\7\0\15\121\3\0\2\121"+
+ "\3\0\1\173\1\0\1\75\2\166\6\0\4\122\7\0"+
+ "\15\122\3\0\2\122\3\0\1\174\2\0\1\174\7\0"+
+ "\1\124\1\125\1\124\1\125\7\0\15\124\3\0\2\124"+
+ "\3\0\1\174\2\0\1\174\7\0\4\125\7\0\15\125"+
+ "\3\0\2\125\3\0\1\166\1\0\1\75\2\166\6\0"+
+ "\1\126\2\127\1\130\7\0\15\126\3\0\2\126\3\0"+
+ "\1\170\1\0\1\75\2\170\1\0\1\171\3\0\1\171"+
+ "\3\127\1\130\7\0\15\127\3\0\2\127\3\0\1\166"+
+ "\1\0\1\75\2\166\6\0\4\130\7\0\15\130\3\0"+
+ "\2\130\3\0\1\171\2\0\2\171\1\0\1\171\3\0"+
+ "\1\171\3\131\10\0\15\131\3\0\2\131\3\0\1\101"+
+ "\1\0\1\75\2\72\1\0\1\73\3\0\1\73\1\132"+
+ "\2\133\1\134\7\0\15\132\3\0\2\132\3\0\1\74"+
+ "\1\0\1\75\2\76\1\0\1\77\3\0\1\77\3\133"+
+ "\1\134\7\0\15\133\3\0\2\133\3\0\1\101\1\0"+
+ "\1\75\2\72\1\0\1\73\3\0\1\73\4\134\7\0"+
+ "\15\134\3\0\2\134\3\0\1\72\1\0\1\75\2\72"+
+ "\1\0\1\73\3\0\1\73\1\135\2\136\1\137\7\0"+
+ "\15\135\3\0\2\135\3\0\1\76\1\0\1\75\2\76"+
+ "\1\0\1\77\3\0\1\77\3\136\1\137\7\0\15\136"+
+ "\3\0\2\136\3\0\1\72\1\0\1\75\2\72\1\0"+
+ "\1\73\3\0\1\73\4\137\7\0\15\137\3\0\2\137"+
+ "\3\0\1\73\2\0\2\73\1\0\1\73\3\0\1\73"+
+ "\1\140\2\141\1\142\7\0\15\140\3\0\2\140\3\0"+
+ "\1\77\2\0\2\77\1\0\1\77\3\0\1\77\3\141"+
+ "\1\142\7\0\15\141\3\0\2\141\3\0\1\73\2\0"+
+ "\2\73\1\0\1\73\3\0\1\73\4\142\7\0\15\142"+
+ "\3\0\2\142\3\0\1\175\1\0\1\75\2\72\1\0"+
+ "\1\73\3\0\1\73\1\143\1\133\1\144\1\134\7\0"+
+ "\15\143\3\0\2\143\3\0\1\176\1\0\1\75\2\76"+
+ "\1\0\1\77\3\0\1\77\1\144\1\133\1\144\1\134"+
+ "\7\0\15\144\3\0\2\144\3\0\1\173\1\0\1\75"+
+ "\2\166\6\0\1\145\2\120\1\122\7\0\15\145\3\0"+
+ "\2\145\31\0\1\147\53\0\1\177\63\0\1\200\25\0"+
+ "\4\34\7\0\15\34\3\0\1\34\1\201\31\0\1\154"+
+ "\53\0\1\202\34\0\1\32\1\0\4\113\1\0\3\114"+
+ "\3\0\3\113\1\203\11\113\3\0\2\113\2\0\1\204"+
+ "\101\0\1\161\53\0\1\205\33\0\1\206\51\0\1\162"+
+ "\3\0\4\163\7\0\15\163\3\0\2\163\12\0\1\162"+
+ "\1\0\1\207\1\0\4\113\1\0\3\114\3\0\15\113"+
+ "\3\0\2\113\16\0\1\210\1\122\1\210\1\122\7\0"+
+ "\15\210\3\0\2\210\16\0\4\130\7\0\15\130\3\0"+
+ "\2\130\16\0\4\134\7\0\15\134\3\0\2\134\16\0"+
+ "\4\137\7\0\15\137\3\0\2\137\16\0\4\142\7\0"+
+ "\15\142\3\0\2\142\16\0\1\211\1\134\1\211\1\134"+
+ "\7\0\15\211\3\0\2\211\16\0\4\122\7\0\15\122"+
+ "\3\0\2\122\16\0\4\212\7\0\15\212\3\0\2\212"+
+ "\33\0\1\213\60\0\1\214\27\0\4\34\6\0\1\215"+
+ "\15\34\3\0\2\34\33\0\1\216\31\0\1\162\1\0"+
+ "\1\32\1\0\4\113\1\0\3\114\3\0\10\113\1\217"+
+ "\4\113\3\0\2\113\2\0\1\220\103\0\1\221\35\0"+
+ "\4\222\7\0\15\222\3\0\2\222\3\0\1\165\1\0"+
+ "\1\75\2\166\6\0\1\210\1\122\1\210\1\122\7\0"+
+ "\15\210\3\0\2\210\3\0\1\175\1\0\1\75\2\72"+
+ "\1\0\1\73\3\0\1\73\1\211\1\134\1\211\1\134"+
+ "\7\0\15\211\3\0\2\211\3\0\1\174\2\0\1\174"+
+ "\7\0\4\212\7\0\15\212\3\0\2\212\34\0\1\223"+
+ "\54\0\1\224\25\0\1\225\75\0\1\226\30\0\1\162"+
+ "\1\0\1\35\1\0\4\113\1\0\3\114\3\0\15\113"+
+ "\3\0\2\113\34\0\1\227\31\0\1\230\2\0\4\222"+
+ "\7\0\15\222\3\0\2\222\35\0\1\231\61\0\1\232"+
+ "\17\0\1\233\76\0\1\234\52\0\1\235\31\0\1\32"+
+ "\1\0\4\163\1\0\3\114\3\0\15\163\3\0\2\163"+
+ "\36\0\1\236\52\0\1\237\32\0\4\240\7\0\15\240"+
+ "\3\0\2\240\36\0\1\241\52\0\1\242\53\0\1\243"+
+ "\60\0\1\244\10\0\1\245\12\0\4\240\7\0\15\240"+
+ "\3\0\2\240\37\0\1\246\52\0\1\247\53\0\1\250"+
+ "\21\0\1\12\61\0\4\251\7\0\15\251\3\0\2\251"+
+ "\40\0\1\252\52\0\1\253\42\0\1\254\25\0\2\251"+
+ "\1\0\2\251\1\0\2\251\2\0\5\251\7\0\15\251"+
+ "\3\0\3\251\27\0\1\255\52\0\1\256\23\0";
+
+ private static int [] zzUnpackTrans() {
+ int [] result = new int[6665];
+ int offset = 0;
+ offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackTrans(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ value--;
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+
+ /* error codes */
+ private static final int ZZ_UNKNOWN_ERROR = 0;
+ private static final int ZZ_NO_MATCH = 1;
+ private static final int ZZ_PUSHBACK_2BIG = 2;
+
+ /* error messages for the codes above */
+ private static final String ZZ_ERROR_MSG[] = {
+ "Unkown internal scanner error",
+ "Error: could not match input",
+ "Error: pushback value was too large"
+ };
+
+ /**
+ * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
+ */
+ private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+ private static final String ZZ_ATTRIBUTE_PACKED_0 =
+ "\11\0\1\11\7\1\1\11\7\1\1\11\2\1\1\11"+
+ "\13\1\1\11\6\1\2\11\3\0\1\11\14\0\3\1"+
+ "\1\0\1\1\1\0\1\1\1\0\1\1\3\0\7\1"+
+ "\2\0\1\1\1\0\15\1\3\0\1\1\1\11\3\0"+
+ "\1\1\1\11\5\0\1\1\4\0\1\1\2\0\2\1"+
+ "\2\0\1\1\5\0\1\11\3\1\5\0\1\11\30\0"+
+ "\1\1\2\0\3\11";
+
+ private static int [] zzUnpackAttribute() {
+ int [] result = new int[174];
+ int offset = 0;
+ offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+ return result;
+ }
+
+ private static int zzUnpackAttribute(String packed, int offset, int [] result) {
+ int i = 0; /* index in packed string */
+ int j = offset; /* index in unpacked array */
+ int l = packed.length();
+ while (i < l) {
+ int count = packed.charAt(i++);
+ int value = packed.charAt(i++);
+ do result[j++] = value; while (--count > 0);
+ }
+ return j;
+ }
+
+ /** the input device */
+ private java.io.Reader zzReader;
+
+ /** the current state of the DFA */
+ private int zzState;
+
+ /** the current lexical state */
+ private int zzLexicalState = YYINITIAL;
+
+ /** this buffer contains the current text to be matched and is
+ the source of the yytext() string */
+ private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+ /** the textposition at the last accepting state */
+ private int zzMarkedPos;
+
+ /** the textposition at the last state to be included in yytext */
+ private int zzPushbackPos;
+
+ /** the current text position in the buffer */
+ private int zzCurrentPos;
+
+ /** startRead marks the beginning of the yytext() string in the buffer */
+ private int zzStartRead;
+
+ /** endRead marks the last character in the buffer, that has been read
+ from input */
+ private int zzEndRead;
+
+ /** number of newlines encountered up to the start of the matched text */
+ private int yyline;
+
+ /** the number of characters up to the start of the matched text */
+ private int yychar;
+
+ /**
+ * the number of characters from the last newline up to the start of the
+ * matched text
+ */
+ private int yycolumn;
+
+ /**
+ * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+ */
+ private boolean zzAtBOL = true;
+
+ /** zzAtEOF == true <=> the scanner is at the EOF */
+ private boolean zzAtEOF;
+
+ /* user code: */
+
+public static final int ALPHANUM = 0;
+public static final int APOSTROPHE = 1;
+public static final int ACRONYM = 2;
+public static final int COMPANY = 3;
+public static final int EMAIL = 4;
+public static final int HOST = 5;
+public static final int NUM = 6;
+public static final int CJ = 7;
+public static final int INTERNAL_LINK = 8;
+public static final int EXTERNAL_LINK = 9;
+public static final int CITATION = 10;
+public static final int CATEGORY = 11;
+public static final int BOLD = 12;
+public static final int ITALICS = 13;
+public static final int BOLD_ITALICS = 14;
+public static final int HEADING = 15;
+public static final int SUB_HEADING = 16;
+public static final int EXTERNAL_LINK_URL = 17;
+
+
+private int currentTokType;
+private int numBalanced = 0;
+private int positionInc = 1;
+
+public static final String [] TOKEN_TYPES = new String [] {
+ "<ALPHANUM>",
+ "<APOSTROPHE>",
+ "<ACRONYM>",
+ "<COMPANY>",
+ "<EMAIL>",
+ "<HOST>",
+ "<NUM>",
+ "<CJ>",
+ WikipediaTokenizer.INTERNAL_LINK,
+ WikipediaTokenizer.EXTERNAL_LINK,
+ WikipediaTokenizer.CITATION,
+ WikipediaTokenizer.CATEGORY,
+ WikipediaTokenizer.BOLD,
+ WikipediaTokenizer.ITALICS,
+ WikipediaTokenizer.BOLD_ITALICS,
+ WikipediaTokenizer.HEADING,
+ WikipediaTokenizer.SUB_HEADING,
+ WikipediaTokenizer.EXTERNAL_LINK_URL
+};
+
+public final int yychar()
+{
+ return yychar;
+}
+
+public final int getPositionIncrement(){
+ return positionInc;
+}
+
+/**
+ * Fills Lucene token with the current token text.
+ */
+final void getText(Token t, int tokType) {
+ t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
+}
+
+
+ /**
+ * Creates a new scanner
+ * There is also a java.io.InputStream version of this constructor.
+ *
+ * @param in the java.io.Reader to read input from.
+ */
+ WikipediaTokenizerImpl(java.io.Reader in) {
+ this.zzReader = in;
+ }
+
+ /**
+ * Creates a new scanner.
+ * There is also java.io.Reader version of this constructor.
+ *
+ * @param in the java.io.Inputstream to read input from.
+ */
+ WikipediaTokenizerImpl(java.io.InputStream in) {
+ this(new java.io.InputStreamReader(in));
+ }
+
+ /**
+ * Unpacks the compressed character translation table.
+ *
+ * @param packed the packed character translation table
+ * @return the unpacked character translation table
+ */
+ private static char [] zzUnpackCMap(String packed) {
+ char [] map = new char[0x10000];
+ int i = 0; /* index in packed string */
+ int j = 0; /* index in unpacked array */
+ while (i < 230) {
+ int count = packed.charAt(i++);
+ char value = packed.charAt(i++);
+ do map[j++] = value; while (--count > 0);
+ }
+ return map;
+ }
+
+
+ /**
+ * Refills the input buffer.
+ *
+ * The content from the current token start onwards is first moved to index 0, and the buffer is
+ * doubled when the scan position has reached its end. A reader that delivers 0 characters without
+ * being at end of stream is handled with a single-character read instead of being reported as new input.
+ *
+ * @return <code>false</code>, iff there was new input.
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ private boolean zzRefill() throws java.io.IOException {
+   /* first: make room (if you can) */
+   if (zzStartRead > 0) {
+     System.arraycopy(zzBuffer, zzStartRead, zzBuffer, 0, zzEndRead-zzStartRead);
+     /* translate stored positions */
+     zzEndRead-= zzStartRead;
+     zzCurrentPos-= zzStartRead;
+     zzMarkedPos-= zzStartRead;
+     zzPushbackPos-= zzStartRead;
+     zzStartRead = 0;
+   }
+   /* is the buffer big enough? */
+   if (zzCurrentPos >= zzBuffer.length) {
+     /* if not: blow it up */
+     char newBuffer[] = new char[zzCurrentPos*2];
+     System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+     zzBuffer = newBuffer;
+   }
+   /* finally: fill the buffer with new input */
+   int numRead = zzReader.read(zzBuffer, zzEndRead, zzBuffer.length-zzEndRead);
+   if (numRead > 0) {
+     zzEndRead+= numRead;
+     return false;
+   }
+   /* 0 chars read but not at end of stream: returning false here would make the caller read a stale slot */
+   if (numRead == 0) {
+     int c = zzReader.read();
+     if (c == -1) return true;
+     zzBuffer[zzEndRead++] = (char) c;
+     return false;
+   }
+   return true; /* numRead < 0: end of stream */
+ }
+
+
+ /**
+ * Marks the scanner as being at end of file, drops the buffered input and closes the reader if there is one.
+ */
+ public final void yyclose() throws java.io.IOException {
+ zzAtEOF = true; /* indicate end of file */
+ zzEndRead = zzStartRead; /* invalidate buffer */
+
+ if (zzReader != null)
+ zzReader.close();
+ }
+
+
+ /**
+ * Resets the scanner to read from a new input stream; the old reader is not closed.
+ *
+ * All internal variables are reset, including the Wikipedia markup state
+ * (numBalanced, positionInc, currentTokType). The old input stream <b>cannot</b>
+ * be reused (internal buffer is discarded and lost). Lexical state is set to <tt>YYINITIAL</tt>.
+ * @param reader the new input stream
+ */
+ public final void yyreset(java.io.Reader reader) {
+   zzReader = reader;
+   zzAtBOL = true;
+   zzAtEOF = false;
+   zzEndRead = zzStartRead = 0;
+   zzCurrentPos = zzMarkedPos = zzPushbackPos = 0;
+   yyline = yychar = yycolumn = 0;
+   zzLexicalState = YYINITIAL;
+   /* markup state left over from the previous input must not leak into the new one */
+   numBalanced = 0; positionInc = 1; currentTokType = ALPHANUM;
+ }
+
+
+ /**
+ * Returns the current lexical state, i.e. the value last stored by yybegin(int) or yyreset(Reader).
+ */
+ public final int yystate() {
+ return zzLexicalState;
+ }
+
+
+ /**
+ * Enters a new lexical state; the next scan in getNextToken() starts its automaton from it.
+ *
+ * @param newState the new lexical state
+ */
+ public final void yybegin(int newState) {
+ zzLexicalState = newState;
+ }
+
+
+ /**
+ * Returns the text matched by the current regular expression as a newly created String.
+ */
+ public final String yytext() {
+ return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
+ }
+
+
+ /**
+ * Returns the character at position <tt>pos</tt> from the
+ * matched text.
+ *
+ * It is equivalent to yytext().charAt(pos), but reads the buffer directly instead of building a String.
+ *
+ * @param pos the position of the character to fetch.
+ * A value from 0 to yylength()-1; the range is not checked here.
+ *
+ * @return the character at position pos
+ */
+ public final char yycharat(int pos) {
+ return zzBuffer[zzStartRead+pos];
+ }
+
+
+ /**
+ * Returns the length of the matched text region, computed as zzMarkedPos - zzStartRead.
+ */
+ public final int yylength() {
+ return zzMarkedPos-zzStartRead;
+ }
+
+
+ /**
+ * Aborts scanning by throwing an Error that carries the message for the
+ * given error code.
+ *
+ * A well-formed scanner (pushbacks never larger than the match, plus a
+ * match-all fallback rule) is not expected to get here; reaching this
+ * method points at a defect in the scanner itself, e.g. a JFlex bug that
+ * produced a faulty scanner.
+ *
+ * Ordinary syntax or scanner level errors belong in error fallback rules
+ * of the grammar, not here.
+ *
+ * @param errorCode the code of the error message to report; a code outside
+ *                  the message table is reported as ZZ_UNKNOWN_ERROR
+ * @throws Error always
+ */
+ private void zzScanError(int errorCode) {
+   final int index;
+   if (errorCode >= 0 && errorCode < ZZ_ERROR_MSG.length) {
+     index = errorCode;
+   } else {
+     index = ZZ_UNKNOWN_ERROR; // code outside the message table
+   }
+   throw new Error(ZZ_ERROR_MSG[index]);
+ }
+
+
+ /**
+ * Pushes the specified amount of characters back into the input stream.
+ *
+ * They will be read again by the next call of the scanning method.
+ *
+ * @param number the number of characters to be read again.
+ * This number must not be greater than yylength(); a larger value is reported through zzScanError.
+ */
+ public void yypushback(int number) {
+ if ( number > yylength() )
+ zzScanError(ZZ_PUSHBACK_2BIG);
+
+ zzMarkedPos -= number;
+ }
+
+
+ /**
+ * Resumes scanning until the next regular expression is matched,
+ * the end of input is encountered or an I/O-Error occurs.
+ *
+ * @return the type of the next token (one of the int constants such as ALPHANUM), or YYEOF at end of input
+ * @exception java.io.IOException if any I/O-Error occurs
+ */
+ public int getNextToken() throws java.io.IOException {
+ int zzInput;
+ int zzAction;
+
+ // cached fields (local copies of the scanner fields used in the inner loop):
+ int zzCurrentPosL;
+ int zzMarkedPosL;
+ int zzEndReadL = zzEndRead;
+ char [] zzBufferL = zzBuffer;
+ char [] zzCMapL = ZZ_CMAP;
+
+ int [] zzTransL = ZZ_TRANS;
+ int [] zzRowMapL = ZZ_ROWMAP;
+ int [] zzAttrL = ZZ_ATTRIBUTE;
+
+ while (true) { // one pass per match attempt; actions that do not return bring control back here
+ zzMarkedPosL = zzMarkedPos;
+
+ yychar+= zzMarkedPosL-zzStartRead; // add the length of the previous match to the running character offset
+
+ zzAction = -1;
+
+ zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+ zzState = zzLexicalState; // the automaton starts from the current lexical state
+
+
+ zzForAction: {
+ while (true) {
+
+ if (zzCurrentPosL < zzEndReadL)
+ zzInput = zzBufferL[zzCurrentPosL++];
+ else if (zzAtEOF) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ // store back cached positions, zzRefill() works on the fields
+ zzCurrentPos = zzCurrentPosL;
+ zzMarkedPos = zzMarkedPosL;
+ boolean eof = zzRefill();
+ // get translated positions and possibly new buffer
+ zzCurrentPosL = zzCurrentPos;
+ zzMarkedPosL = zzMarkedPos;
+ zzBufferL = zzBuffer;
+ zzEndReadL = zzEndRead;
+ if (eof) {
+ zzInput = YYEOF;
+ break zzForAction;
+ }
+ else {
+ zzInput = zzBufferL[zzCurrentPosL++];
+ }
+ }
+ int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+ if (zzNext == -1) break zzForAction; // no transition for this input: stop and use the last recorded action
+ zzState = zzNext;
+
+ int zzAttributes = zzAttrL[zzState];
+ if ( (zzAttributes & 1) == 1 ) { // state has an action: record it together with the match end
+ zzAction = zzState;
+ zzMarkedPosL = zzCurrentPosL;
+ if ( (zzAttributes & 8) == 8 ) break zzForAction; // bit 8 set: stop scanning and take this action now
+ }
+
+ }
+ }
+
+ // store back cached position
+ zzMarkedPos = zzMarkedPosL;
+
+ switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { // zzAction < 0 (nothing matched) selects the default branch
+ case 7:
+ { /* ignore */
+ }
+ case 42: break; // a non-returning action falls through to a bare break like this one, then the outer loop rescans
+ case 3:
+ { positionInc = 1; return CJ;
+ }
+ case 43: break;
+ case 26:
+ { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end italics*/
+ }
+ case 44: break;
+ case 37:
+ { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end bold italics*/
+ }
+ case 45: break;
+ case 11:
+ { currentTokType = ITALICS; yybegin(STRING); return currentTokType;/*italics*/
+ }
+ case 46: break;
+ case 5:
+ { yybegin(CATEGORY_STATE); return currentTokType;
+ }
+ case 47: break;
+ case 34:
+ { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end sub header*/
+ }
+ case 48: break;
+ case 24:
+ { positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);
+ }
+ case 49: break;
+ case 22:
+ { positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);
+ }
+ case 50: break;
+ case 39:
+ { positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);
+ }
+ case 51: break;
+ case 18:
+ { yybegin(STRING); return currentTokType;/* STRING ALPHANUM*/
+ }
+ case 52: break;
+ case 21:
+ { positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}
+ }
+ case 53: break;
+ case 1:
+ { positionInc = 1;
+ }
+ case 54: break;
+ case 41:
+ { numBalanced = 0;currentTokType = CATEGORY;yybegin(CATEGORY_STATE);
+ }
+ case 55: break;
+ case 9:
+ { yybegin(YYINITIAL);
+ }
+ case 56: break;
+ case 19:
+ { numBalanced = 0;currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);
+ }
+ case 57: break;
+ case 13:
+ { yybegin(STRING);return currentTokType;
+ }
+ case 58: break;
+ case 36:
+ { positionInc = 1; return EMAIL;
+ }
+ case 59: break;
+ case 35:
+ { positionInc = 1; return ACRONYM;
+ }
+ case 60: break;
+ case 4:
+ { positionInc = 1;currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);
+ }
+ case 61: break;
+ case 17:
+ { /* ignore STRING */
+ }
+ case 62: break;
+ case 40:
+ { currentTokType = CATEGORY;yybegin(CATEGORY_STATE);
+ }
+ case 63: break;
+ case 20:
+ { yybegin(STRING); return currentTokType;/*pipe*/
+ }
+ case 64: break;
+ case 12:
+ { currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);
+ }
+ case 65: break;
+ case 27:
+ { numBalanced = 0;currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);
+ }
+ case 66: break;
+ case 33:
+ { numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL);/*end bold*/
+ }
+ case 67: break;
+ case 16:
+ { yybegin(DOUBLE_BRACE_STATE); return currentTokType;
+ }
+ case 68: break;
+ case 29:
+ { positionInc = 1; return HOST;
+ }
+ case 69: break;
+ case 32:
+ { currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE);
+ }
+ case 70: break;
+ case 25:
+ { currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);
+ }
+ case 71: break;
+ case 23:
+ { positionInc = 0; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);
+ }
+ case 72: break;
+ case 14:
+ { currentTokType = SUB_HEADING; yybegin(STRING);
+ }
+ case 73: break;
+ case 28:
+ { positionInc = 1; return APOSTROPHE;
+ }
+ case 74: break;
+ case 30:
+ { positionInc = 1; return NUM;
+ }
+ case 75: break;
+ case 15:
+ { currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); return currentTokType;
+ }
+ case 76: break;
+ case 6:
+ { yybegin(INTERNAL_LINK_STATE); return currentTokType;
+ }
+ case 77: break;
+ case 2:
+ { positionInc = 1; return ALPHANUM;
+ }
+ case 78: break;
+ case 31:
+ { positionInc = 1; return COMPANY;
+ }
+ case 79: break;
+ case 10:
+ { currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE);
+ }
+ case 80: break;
+ case 8:
+ { positionInc = 1; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE);return currentTokType;
+ }
+ case 81: break;
+ case 38:
+ { positionInc = 0; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
+ }
+ case 82: break;
+ default:
+ if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { // end of input reached with no pending characters
+ zzAtEOF = true;
+ return YYEOF;
+ }
+ else {
+ zzScanError(ZZ_NO_MATCH);
+ }
+ }
+ }
+ }
+
+
+}
Propchange: lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex?rev=608852&view=auto
==============================================================================
--- lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex (added)
+++ lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex Fri Jan 4 06:29:15 2008
@@ -0,0 +1,324 @@
+package org.apache.lucene.wikipedia.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Token;
+
+%%
+
+%class WikipediaTokenizerImpl
+%unicode
+%integer
+%function getNextToken
+%pack
+%char
+
+%{
+
+public static final int ALPHANUM = 0; // token type codes returned by getNextToken(); each is also an index into TOKEN_TYPES
+public static final int APOSTROPHE = 1;
+public static final int ACRONYM = 2;
+public static final int COMPANY = 3;
+public static final int EMAIL = 4;
+public static final int HOST = 5;
+public static final int NUM = 6;
+public static final int CJ = 7;
+public static final int INTERNAL_LINK = 8;
+public static final int EXTERNAL_LINK = 9;
+public static final int CITATION = 10;
+public static final int CATEGORY = 11;
+public static final int BOLD = 12;
+public static final int ITALICS = 13;
+public static final int BOLD_ITALICS = 14;
+public static final int HEADING = 15;
+public static final int SUB_HEADING = 16;
+public static final int EXTERNAL_LINK_URL = 17;
+
+
+private int currentTokType; // type returned for words matched inside the markup construct currently being scanned
+private int numBalanced = 0; // set to 1 when an opening '' is seen in YYINITIAL, back to 0 when the markup is closed
+private int positionInc = 1; // position increment for the next returned token; 0 is set for the first token of a link
+
+public static final String [] TOKEN_TYPES = new String [] { // type names, in the order of the int constants above
+ "<ALPHANUM>",
+ "<APOSTROPHE>",
+ "<ACRONYM>",
+ "<COMPANY>",
+ "<EMAIL>",
+ "<HOST>",
+ "<NUM>",
+ "<CJ>",
+ WikipediaTokenizer.INTERNAL_LINK,
+ WikipediaTokenizer.EXTERNAL_LINK,
+ WikipediaTokenizer.CITATION,
+ WikipediaTokenizer.CATEGORY,
+ WikipediaTokenizer.BOLD,
+ WikipediaTokenizer.ITALICS,
+ WikipediaTokenizer.BOLD_ITALICS,
+ WikipediaTokenizer.HEADING,
+ WikipediaTokenizer.SUB_HEADING,
+ WikipediaTokenizer.EXTERNAL_LINK_URL
+};
+
+public final int yychar() // accessor for the yychar counter that the %char directive above enables
+{
+ return yychar;
+}
+
+public final int getPositionIncrement(){ // value of positionInc as last assigned by a lexer action
+ return positionInc;
+}
+
+/**
+ * Hands the currently matched region, zzBuffer[zzStartRead, zzMarkedPos), to t.setTermBuffer; tokType is not used here.
+ */
+final void getText(Token t, int tokType) {
+ t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
+}
+%}
+
+// basic word: a sequence of digits, letters & Korean characters
+ALPHANUM = ({LETTER}|{DIGIT}|{KOREAN})+
+
+// internal apostrophes: O'Reilly, you're, O'Reilly's
+// use a post-filter to remove possessives
+APOSTROPHE = {ALPHA} ("'" {ALPHA})+
+
+// acronyms: U.S.A., I.B.M., etc.
+// use a post-filter to remove dots
+ACRONYM = {ALPHA} "." ({ALPHA} ".")+
+
+// company names like AT&T and Excite@Home.
+COMPANY = {ALPHA} ("&"|"@") {ALPHA}
+
+// email addresses
+EMAIL = {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+
+
+// hostname: two or more ALPHANUM segments joined by dots
+HOST = {ALPHANUM} ((".") {ALPHANUM})+
+
+// floating point, serial, model numbers, ip addresses, etc.
+// every other segment must have at least one digit
+NUM = ({ALPHANUM} {P} {HAS_DIGIT}
+ | {DIGIT}+ {P} {DIGIT}+
+ | {HAS_DIGIT} {P} {ALPHANUM}
+ | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
+ | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
+ | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
+ | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
+
+TAGS = "<"\/?{ALPHANUM}({WHITESPACE}*{ALPHANUM}=\"{ALPHANUM}\")*">"
+
+// punctuation that may join the segments of a NUM; also accepted in the path of an external link URL
+P = ("_"|"-"|"/"|"."|",")
+
+// at least one digit
+HAS_DIGIT =
+ ({LETTER}|{DIGIT})*
+ {DIGIT}
+ ({LETTER}|{DIGIT})*
+
+ALPHA = ({LETTER})+
+
+
+LETTER = [\u0041-\u005a\u0061-\u007a\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u00ff\u0100-\u1fff\uffa0-\uffdc]
+
+DIGIT = [\u0030-\u0039\u0660-\u0669\u06f0-\u06f9\u0966-\u096f\u09e6-\u09ef\u0a66-\u0a6f\u0ae6-\u0aef\u0b66-\u0b6f\u0be7-\u0bef\u0c66-\u0c6f\u0ce6-\u0cef\u0d66-\u0d6f\u0e50-\u0e59\u0ed0-\u0ed9\u1040-\u1049]
+
+KOREAN = [\uac00-\ud7af\u1100-\u11ff]
+
+// Chinese, Japanese (some of the listed ranges overlap)
+CJ = [\u3040-\u318f\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]
+
+WHITESPACE = \r\n | [ \r\n\t\f]
+
+//Wikipedia markup delimiters
+DOUBLE_BRACKET = "["{2}
+DOUBLE_BRACKET_CLOSE = "]"{2}
+DOUBLE_BRACKET_CAT = "["{2}":"?"Category:"
+EXTERNAL_LINK = "["
+TWO_SINGLE_QUOTES = "'"{2}
+CITATION = "<ref>"
+CITATION_CLOSE = "</ref>"
+INFOBOX = {DOUBLE_BRACE}("I"|"i")nfobox_
+
+DOUBLE_BRACE = "{"{2}
+DOUBLE_BRACE_CLOSE = "}"{2}
+PIPE = "|"
+DOUBLE_EQUALS = "="{2}
+
+%state CATEGORY_STATE
+%state INTERNAL_LINK_STATE
+%state EXTERNAL_LINK_STATE
+
+%state TWO_SINGLE_QUOTES_STATE
+%state THREE_SINGLE_QUOTES_STATE
+%state FIVE_SINGLE_QUOTES_STATE
+%state DOUBLE_EQUALS_STATE
+%state DOUBLE_BRACE_STATE
+%state STRING
+
+%%
+
+<YYINITIAL>{ALPHANUM} {positionInc = 1; return ALPHANUM; }
+<YYINITIAL>{APOSTROPHE} {positionInc = 1; return APOSTROPHE; }
+<YYINITIAL>{ACRONYM} {positionInc = 1; return ACRONYM; }
+<YYINITIAL>{COMPANY} {positionInc = 1; return COMPANY; }
+<YYINITIAL>{EMAIL} {positionInc = 1; return EMAIL; }
+<YYINITIAL>{NUM} {positionInc = 1; return NUM; }
+<YYINITIAL>{HOST} {positionInc = 1; return HOST; }
+<YYINITIAL>{CJ} {positionInc = 1; return CJ; }
+
+//wikipedia markup openers: remember the token type for the words that follow and switch lexical state
+<YYINITIAL>{
+ //First {ALPHANUM} is always the link, set position to 0 for double bracket
+ {DOUBLE_BRACKET} {positionInc = 0; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);}
+ {DOUBLE_BRACKET_CAT} {positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);}
+ {EXTERNAL_LINK} {positionInc = 1;currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);}
+ {TWO_SINGLE_QUOTES} {positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}}
+ {DOUBLE_EQUALS} {positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);}
+ {DOUBLE_BRACE} {positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);}
+ {CITATION} {positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);}
+//ignore everything else, but make the next token advance the position
+ . | {WHITESPACE} |{INFOBOX} { positionInc = 1; }
+}
+
+<INTERNAL_LINK_STATE>{
+//First {ALPHANUM} is always the link; its position increment of 0 was set when the opening [[ was matched
+ {ALPHANUM} {yybegin(INTERNAL_LINK_STATE); return currentTokType;}
+ {DOUBLE_BRACKET_CLOSE} {yybegin(YYINITIAL);}
+ //ignore, but make the next token advance the position
+ . | {WHITESPACE} { positionInc = 1; }
+}
+
+<EXTERNAL_LINK_STATE>{
+ "http://"{HOST}("/"?({ALPHANUM}|{P}|\?|"&"|"="|"#")*)* {positionInc = 0; yybegin(EXTERNAL_LINK_STATE); return currentTokType;}
+ {ALPHANUM} {positionInc = 1; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE);return currentTokType;}
+ "]" {yybegin(YYINITIAL);}
+ {WHITESPACE} { positionInc = 1; }
+}
+
+<CATEGORY_STATE>{
+ {ALPHANUM} {yybegin(CATEGORY_STATE); return currentTokType;}
+ {DOUBLE_BRACKET_CLOSE} {yybegin(YYINITIAL);}
+ //ignore, but make the next token advance the position
+ . | {WHITESPACE} { positionInc = 1; }
+}
+//italics: entered after '' ; further quotes upgrade the type to bold or bold italics
+<TWO_SINGLE_QUOTES_STATE>{
+ "'" {currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE);}
+ "'''" {currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE);}
+ {ALPHANUM} {currentTokType = ITALICS; yybegin(STRING); return currentTokType;/*italics*/}
+ //we can have links inside, let those override
+ {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
+ {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
+ {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
+
+ //ignore
+ . | {WHITESPACE} { /* ignore */ }
+}
+//bold: the first word is returned as BOLD, the rest of the run is handled in STRING
+<THREE_SINGLE_QUOTES_STATE>{
+ {ALPHANUM} {yybegin(STRING);return currentTokType;}
+ //we can have links inside, let those override
+ {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
+ {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
+ {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
+
+ //ignore
+ . | {WHITESPACE} { /* ignore */ }
+
+}
+//bold italics: the first word is returned as BOLD_ITALICS, the rest of the run is handled in STRING
+<FIVE_SINGLE_QUOTES_STATE>{
+ {ALPHANUM} {yybegin(STRING);return currentTokType;}
+ //we can have links inside, let those override
+ {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
+ {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
+ {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
+
+ //ignore
+ . | {WHITESPACE} { /* ignore */ }
+}
+
+<DOUBLE_EQUALS_STATE>{
+ "=" {currentTokType = SUB_HEADING; yybegin(STRING);}
+ {ALPHANUM} {currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); return currentTokType;}
+ {DOUBLE_EQUALS} {yybegin(YYINITIAL);}
+ //ignore
+ . | {WHITESPACE} { /* ignore */ }
+}
+
+<DOUBLE_BRACE_STATE>{
+ {ALPHANUM} {yybegin(DOUBLE_BRACE_STATE); return currentTokType;}
+ {DOUBLE_BRACE_CLOSE} {yybegin(YYINITIAL);}
+ {CITATION_CLOSE} {yybegin(YYINITIAL);}
+ //ignore
+ . | {WHITESPACE} { /* ignore */ }
+}
+
+<STRING> {
+ "'''''" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end bold italics*/}
+ "'''" {numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL);/*end bold*/}
+ "''" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end italics*/}
+ "===" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end sub header*/}
+ {ALPHANUM} {yybegin(STRING); return currentTokType;/* STRING ALPHANUM*/}
+ //we can have links inside, let those override
+ {DOUBLE_BRACKET} {numBalanced = 0;currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
+ {DOUBLE_BRACKET_CAT} {numBalanced = 0;currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
+ {EXTERNAL_LINK} {numBalanced = 0;currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
+
+
+ {PIPE} {yybegin(STRING); return currentTokType;/*pipe*/}
+
+ .|{WHITESPACE} { /* ignore STRING */ }
+}
+
+
+
+
+/*
+{INTERNAL_LINK} { return currentTokType; }
+
+{CITATION} { return currentTokType; }
+{CATEGORY} { return currentTokType; }
+
+{BOLD} { return currentTokType; }
+{ITALICS} { return currentTokType; }
+{BOLD_ITALICS} { return currentTokType; }
+{HEADING} { return currentTokType; }
+{SUB_HEADING} { return currentTokType; }
+
+*/
+//end wikipedia
+
+/** Ignore the rest: this rule names no lexical state */
+. | {WHITESPACE}|{TAGS} { /* ignore */ }
+
+
+//INTERNAL_LINK = "["{2}({ALPHANUM}+{WHITESPACE}*)+"]"{2}
+//EXTERNAL_LINK = "["http://"{HOST}.*?"]"
+//CITATION = "{"{2}({ALPHANUM}+{WHITESPACE}*)+"}"{2}
+//CATEGORY = "["{2}"Category:"({ALPHANUM}+{WHITESPACE}*)+"]"{2}
+//CATEGORY_COLON = "["{2}":Category:"({ALPHANUM}+{WHITESPACE}*)+"]"{2}
+//BOLD = '''({ALPHANUM}+{WHITESPACE}*)+'''
+//ITALICS = ''({ALPHANUM}+{WHITESPACE}*)+''
+//BOLD_ITALICS = '''''({ALPHANUM}+{WHITESPACE}*)+'''''
+//HEADING = "="{2}({ALPHANUM}+{WHITESPACE}*)+"="{2}
+//SUB_HEADING ="="{3}({ALPHANUM}+{WHITESPACE}*)+"="{3}
\ No newline at end of file
Added: lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/package.html?rev=608852&view=auto
==============================================================================
--- lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/package.html (added)
+++ lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/package.html Fri Jan 4 06:29:15 2008
@@ -0,0 +1,35 @@
+<!--
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ -->
+
+<HTML>
+ <!--
+ *
+ --><HEAD>
+ <TITLE>org.apache.lucene.wikipedia</TITLE>
+</HEAD>
+<BODY>
+<DIV>Tools for working with <a href="http://www.wikipedia.org">Wikipedia</a> content.
+</DIV>
+<DIV> </DIV>
+<DIV align="center">
+Copyright © 2007 <A HREF="http://www.apache.org">Apache Software Foundation</A>
+</DIV>
+</BODY>
+</HTML>
\ No newline at end of file
Propchange: lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/package.html
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java?rev=608852&view=auto
==============================================================================
--- lucene/java/trunk/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java (added)
+++ lucene/java/trunk/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java Fri Jan 4 06:29:15 2008
@@ -0,0 +1,213 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.lucene.wikipedia.analysis;
+
+import junit.framework.TestCase;
+import org.apache.lucene.analysis.Token;
+
+import java.io.File;
+import java.io.StringReader;
+import java.util.Map;
+import java.util.HashMap;
+
+
+/**
+ * Tests for {@link WikipediaTokenizer}: the token types assigned to wiki markup,
+ * the position increments around links, and external link URL tokens.
+ **/
+public class WikipediaTokenizerTest extends TestCase {
+
+  public WikipediaTokenizerTest(String s) {
+    super(s);
+  }
+
+  protected void setUp() {
+  }
+
+  protected void tearDown() {
+  }
+
+  /** Returns the term text held in the token's term buffer. */
+  private static String termOf(Token token) {
+    return new String(token.termBuffer(), 0, token.termLength());
+  }
+
+  /**
+   * Pulls the next token from the tokenizer and checks that it exists and has the expected term text.
+   *
+   * @param tf the tokenizer under test
+   * @param reusable the token instance handed to tf.next(Token)
+   * @param expectedText the term text the next token must have
+   * @return the token returned by the tokenizer
+   */
+  private static Token assertNextTerm(WikipediaTokenizer tf, Token reusable, String expectedText) throws Exception {
+    Token token = tf.next(reusable);
+    assertNotNull("token is null and it shouldn't be", token);
+    assertEquals(expectedText, termOf(token));
+    return token;
+  }
+
+  /** Like assertNextTerm, and also checks the token's position increment. */
+  private static Token assertNextTermAt(WikipediaTokenizer tf, Token reusable, String expectedText, int expectedPosInc) throws Exception {
+    Token token = assertNextTerm(tf, reusable, expectedText);
+    assertEquals("position increment of " + expectedText, expectedPosInc, token.getPositionIncrement());
+    return token;
+  }
+
+  /** Like assertNextTerm, and also checks the token's type. */
+  private static Token assertNextTermOfType(WikipediaTokenizer tf, Token reusable, String expectedText, String expectedType) throws Exception {
+    Token token = assertNextTerm(tf, reusable, expectedText);
+    assertEquals("type of " + expectedText, expectedType, token.type());
+    return token;
+  }
+
+  /**
+   * Tokenizes a hand-written sample covering every markup kind and checks that each term gets
+   * exactly the type listed for it, plus the number of tokens seen for selected types.
+   */
+  public void testHandwritten() throws Exception {
+    //make sure all tokens are in only one type
+    String test = "[[link]] This is a [[Category:foo]] Category This is a linked [[:Category:bar none withstanding]] " +
+        "Category This is (parens) This is a [[link]] This is an external URL [http://lucene.apache.org] " +
+        "Here is ''italics'' and ''more italics'', '''bold''' and '''''five quotes''''' " +
+        " This is a [[link|display info]] This is a period. Here is $3.25 and here is 3.50. Here's Johnny. " +
+        "==heading== ===sub head=== followed by some text [[Category:blah| ]] " +
+        "''[[Category:ital_cat]]'' here is some that is ''italics [[Category:foo]] but is never closed." +
+        "'''same [[Category:foo]] goes for this '''''and2 [[Category:foo]] and this" +
+        " [http://foo.boo.com/test/test/ Test Test] [http://foo.boo.com/test/test/test.html Test Test]" +
+        " [http://foo.boo.com/test/test/test.html?g=b&c=d Test Test] <ref>Citation</ref> <sup>martian</sup> <span class=\"glue\">code</span>";
+    Map tcm = new HashMap();//map tokens to types
+    tcm.put("link", WikipediaTokenizer.INTERNAL_LINK);
+    tcm.put("display", WikipediaTokenizer.INTERNAL_LINK);
+    tcm.put("info", WikipediaTokenizer.INTERNAL_LINK);
+
+    tcm.put("http://lucene.apache.org", WikipediaTokenizer.EXTERNAL_LINK_URL);
+    tcm.put("http://foo.boo.com/test/test/", WikipediaTokenizer.EXTERNAL_LINK_URL);
+    tcm.put("http://foo.boo.com/test/test/test.html", WikipediaTokenizer.EXTERNAL_LINK_URL);
+    tcm.put("http://foo.boo.com/test/test/test.html?g=b&c=d", WikipediaTokenizer.EXTERNAL_LINK_URL);
+    tcm.put("Test", WikipediaTokenizer.EXTERNAL_LINK);
+
+    //alphanums
+    tcm.put("This", "<ALPHANUM>");
+    tcm.put("is", "<ALPHANUM>");
+    tcm.put("a", "<ALPHANUM>");
+    tcm.put("Category", "<ALPHANUM>");
+    tcm.put("linked", "<ALPHANUM>");
+    tcm.put("parens", "<ALPHANUM>");
+    tcm.put("external", "<ALPHANUM>");
+    tcm.put("URL", "<ALPHANUM>");
+    tcm.put("and", "<ALPHANUM>");
+    tcm.put("period", "<ALPHANUM>");
+    tcm.put("Here", "<ALPHANUM>");
+    tcm.put("Here's", "<APOSTROPHE>");
+    tcm.put("here", "<ALPHANUM>");
+    tcm.put("Johnny", "<ALPHANUM>");
+    tcm.put("followed", "<ALPHANUM>");
+    tcm.put("by", "<ALPHANUM>");
+    tcm.put("text", "<ALPHANUM>");
+    tcm.put("that", "<ALPHANUM>");
+    tcm.put("but", "<ALPHANUM>");
+    tcm.put("never", "<ALPHANUM>");
+    tcm.put("closed", "<ALPHANUM>");
+    tcm.put("goes", "<ALPHANUM>");
+    tcm.put("for", "<ALPHANUM>");
+    tcm.put("this", "<ALPHANUM>");
+    tcm.put("an", "<ALPHANUM>");
+    tcm.put("some", "<ALPHANUM>");
+    tcm.put("martian", "<ALPHANUM>");
+    tcm.put("code", "<ALPHANUM>");
+
+    tcm.put("foo", WikipediaTokenizer.CATEGORY);
+    tcm.put("bar", WikipediaTokenizer.CATEGORY);
+    tcm.put("none", WikipediaTokenizer.CATEGORY);
+    tcm.put("withstanding", WikipediaTokenizer.CATEGORY);
+    tcm.put("blah", WikipediaTokenizer.CATEGORY);
+    tcm.put("ital", WikipediaTokenizer.CATEGORY);
+    tcm.put("cat", WikipediaTokenizer.CATEGORY);
+
+    tcm.put("italics", WikipediaTokenizer.ITALICS);
+    tcm.put("more", WikipediaTokenizer.ITALICS);
+    tcm.put("bold", WikipediaTokenizer.BOLD);
+    tcm.put("same", WikipediaTokenizer.BOLD);
+    tcm.put("five", WikipediaTokenizer.BOLD_ITALICS);
+    tcm.put("and2", WikipediaTokenizer.BOLD_ITALICS);
+    tcm.put("quotes", WikipediaTokenizer.BOLD_ITALICS);
+
+    tcm.put("heading", WikipediaTokenizer.HEADING);
+    tcm.put("sub", WikipediaTokenizer.SUB_HEADING);
+    tcm.put("head", WikipediaTokenizer.SUB_HEADING);
+
+    tcm.put("Citation", WikipediaTokenizer.CITATION);
+
+    tcm.put("3.25", "<NUM>");
+    tcm.put("3.50", "<NUM>");
+    WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
+    Token token = new Token();
+    int count = 0;
+    int numItalics = 0;
+    int numBoldItalics = 0;
+    int numCategory = 0;
+    int numCitation = 0;
+    while ((token = tf.next(token)) != null) {
+      String tokText = token.termText();
+      String expectedType = (String) tcm.get(tokText);
+      assertNotNull("expectedType is null and it shouldn't be for: " + token, expectedType);
+      assertEquals("unexpected type for " + token, expectedType, token.type());
+      count++;
+      if (token.type().equals(WikipediaTokenizer.ITALICS)) {
+        numItalics++;
+      } else if (token.type().equals(WikipediaTokenizer.BOLD_ITALICS)) {
+        numBoldItalics++;
+      } else if (token.type().equals(WikipediaTokenizer.CATEGORY)) {
+        numCategory++;
+      } else if (token.type().equals(WikipediaTokenizer.CITATION)) {
+        numCitation++;
+      }
+    }
+    assertTrue("We have not seen enough tokens: " + count + " is not >= " + tcm.size(), count >= tcm.size());
+    assertEquals("numItalics", 4, numItalics);
+    assertEquals("numBoldItalics", 3, numBoldItalics);
+    assertEquals("numCategory", 10, numCategory);
+    assertEquals("numCitation", 1, numCitation);
+  }
+
+  /** Checks the term text and position increment of every token around an internal and an external link. */
+  public void testLinkPhrases() throws Exception {
+    String test = "click [[link here]] click [http://lucene.apache.org here]";
+    WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
+    Token token = new Token();
+    token = assertNextTermAt(tf, token, "click", 1);
+    //the first token of a link is expected with position increment 0, the following ones with 1
+    token = assertNextTermAt(tf, token, "link", 0);
+    token = assertNextTermAt(tf, token, "here", 1);
+    token = assertNextTermAt(tf, token, "click", 1);
+    token = assertNextTermAt(tf, token, "http://lucene.apache.org", 0);
+    assertNextTermAt(tf, token, "here", 1);
+  }
+
+  /** Checks that external link URLs with a fragment or a query string come back as one EXTERNAL_LINK_URL token. */
+  public void testLinks() throws Exception {
+    String test = "[http://lucene.apache.org/java/docs/index.html#news here] [http://lucene.apache.org/java/docs/index.html?b=c here]";
+    WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
+    Token token = new Token();
+    token = assertNextTermOfType(tf, token, "http://lucene.apache.org/java/docs/index.html#news", WikipediaTokenizer.EXTERNAL_LINK_URL);
+    token = assertNextTerm(tf, token, "here");//the link text between the two URLs
+    assertNextTermOfType(tf, token, "http://lucene.apache.org/java/docs/index.html?b=c", WikipediaTokenizer.EXTERNAL_LINK_URL);
+  }
+}
Propchange: lucene/java/trunk/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/java/trunk/docs/developer-resources.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/docs/developer-resources.html?rev=608852&r1=608851&r2=608852&view=diff
==============================================================================
--- lucene/java/trunk/docs/developer-resources.html (original)
+++ lucene/java/trunk/docs/developer-resources.html Fri Jan 4 06:29:15 2008
@@ -367,6 +367,10 @@
<li>
<a href="api/contrib-swing/index.html">Swing</a>
</li>
+
+<li>
+<a href="api/contrib-wikipedia/index.html">Wikipedia</a>
+</li>
<li>
<a href="api/contrib-wordnet/index.html">Wordnet</a>
@@ -383,11 +387,11 @@
</p>
</div>
-<a name="N10097"></a><a name="Downloads"></a>
+<a name="N1009C"></a><a name="Downloads"></a>
<h2 class="boxed">Downloads</h2>
<div class="section">
<p>System Requirements are detailed <a href="systemrequirements.html">here</a>.</p>
-<a name="N100A3"></a><a name="Clover"></a>
+<a name="N100A8"></a><a name="Clover"></a>
<h3 class="boxed">Clover Test Coverage Reports</h3>
<p>
@@ -396,7 +400,7 @@
<a href="http://lucene.zones.apache.org:8080/hudson/job/Lucene-Nightly/lastSuccessfulBuild/artifact/trunk/build/test/clover/reports/index.html">here</a>
for the nightly build.
</p>
-<a name="N100B4"></a><a name="Hudson"></a>
+<a name="N100B9"></a><a name="Hudson"></a>
<h3 class="boxed">Hudson</h3>
<p>
@@ -404,13 +408,13 @@
project. It is responsible for running nightly builds, code coverage reports as well as building the nightly version
of the website.
</p>
-<a name="N100C1"></a><a name="Nightly"></a>
+<a name="N100C6"></a><a name="Nightly"></a>
<h3 class="boxed">Nightly Build Download</h3>
<p>Nightly builds are based on the trunk version of the code checked into
<a href="https://svn.apache.org/repos/asf/lucene/java/trunk">SVN</a>
</p>
-<a href="http://lucene.zones.apache.org:8080/hudson/job/Lucene-Nightly/">Download via Hudson</a><a name="N100D3"></a><a name="source"></a>
+<a href="http://lucene.zones.apache.org:8080/hudson/job/Lucene-Nightly/">Download via Hudson</a><a name="N100D8"></a><a name="source"></a>
<h3 class="boxed">Source Code</h3>
<p>The source files are now stored using Subversion (see http://subversion.tigris.org/ and http://svnbook.red-bean.com/)
</p>
Modified: lucene/java/trunk/docs/developer-resources.pdf
URL: http://svn.apache.org/viewvc/lucene/java/trunk/docs/developer-resources.pdf?rev=608852&r1=608851&r2=608852&view=diff
==============================================================================
--- lucene/java/trunk/docs/developer-resources.pdf (original)
+++ lucene/java/trunk/docs/developer-resources.pdf Fri Jan 4 06:29:15 2008
@@ -5,10 +5,10 @@
/Producer (FOP 0.20.5) >>
endobj
5 0 obj
-<< /Length 677 /Filter [ /ASCII85Decode /FlateDecode ]
+<< /Length 680 /Filter [ /ASCII85Decode /FlateDecode ]
>>
stream
-Gaua=966RV&:j6I$6AM$<'>LHA#F%1Bfjds>FL0G11<e;/&7SNq]Z`9S4S?[.)cF,`Nm,5C0/h1%GkAg5Sum3h$(mtAH;j$J-"qb(aN]GA-gU;#3ep&]/ni-lp7ej)2[APh<K'JakDs)h!dWOA>Sut];]T!W(q!2=bAet?M3_@^r3fKo#YAMEQkN1%pIJb1\.LtUi'.39h$=WfkX&D$ptq:,H82VL,c_h[8K(Fc;`WB\og;YJC1VbN(fJ+7PK\#4mZt!5et;#^@iB5UT=jd;Rk\opTD^:Xt<Ye#/_-[k36A0O](e$'q*+_$&kI,a?727rO"0tg?QdGDn<6sSTlFj,$.pR.)3TrnR(3!%/D[jS]CK30@XK;.?Q:,pGt&Qj>:DrCFl?ENl^_7Os^t7,Q2JU%^u-!)Q0lTrMo<nMSe]#CUmNXrPp?BCX+O3BaG&*`)?`o/:*cc:]Fpk2bUXP+.*qSbiC$](/Fa-^RXo>T)MuiO.ELK<fCB"NTEpua[nU3=VS/eE_l^I38"-1cfP12HW0*DWq2ANI2`#%cM-:B`-lBp*b'pA6,J2=?l+1u2We=2VAp`rPDF5I>(AVg8]X1aV_)eep.sBn^,r7-'98VO&.4tmqKIa^GD[?ICEJ9(AhI(bkJEaNd/2&B!(LDdh0?5AUcuj>3qV;OUY?7~>
+Gaua=c#T:-&:j43KoYjMWr@QZm5/Z9D.f;V97'K#-;5k/"ZV^T07n`8N#UP>.6^iaQJML?]95'DnE,T0_B8p%/jT+`%1pS_$rH]9+LZ?C+\5$)P+UZMol`";#IOaP,`K<YX&!T4.Smct_b5,KV;<`_VJ]A8>Ec(5%DIJU\oWo5-A21?T00t%7r7\jhT2^:7Qsk`1:8aXa[c[k4(Uff.&[uA,kA\9j5fpa9(1OpeQ&BE`Wa<^7`Y3VZ;GW\_DiP%@:/m8!q&;I[6MhV7?os$dLE[VG>m;+9qeUFkbm>o:D\YCPhHADa?2YH@JC7!k4(V%01GlePYVd8dhZNW`+sX[R83':q(4dirW40MaIe%I4MI_AW?80=bA:=qe&[3(o8mgmZ%t_-,/[JTSl!LhoXCY\%)"m,EMDa<YPR[`be9%L;1!/KJ".^pk'Z)O0*-Yl4=/q9U)t$+l$mBQHKgap"#Igs1BGO]H^^@ZC#A3AF.V(6T54)(':-atoo$\lgs7)jdBmI^aQCT-rS.<Zo*[ZZs)7:AS[5<@8+Ljqn>ee2qr894W<kYB5$e?u#=a8Mn28kNHNBOTKO@27`k?-gjrDBmK:#D<gfEm9N*UgT:4FK.4,G(*S<WemLmN=C>J4enV=A0#FB`>)Srh=rbh`=hU%_i>ZDs0MW!k`Zh]eWG%f~>
endstream
endobj
6 0 obj
@@ -102,10 +102,10 @@
>>
endobj
22 0 obj
-<< /Length 1268 /Filter [ /ASCII85Decode /FlateDecode ]
+<< /Length 1260 /Filter [ /ASCII85Decode /FlateDecode ]
>>
stream
-Gatm;?'!Gq&:O#Nn0:K*nOK)[hWQR">-S@A]`RH86^kT'Zu0-=UqOIVr_AoXEJf1PLfL90PNI,8S/HZ#e+r(\I.D>FE1Fk"UZOM.[.tF^M%MLYjT"i14dmW!',6<(^...@i>#u!f*Mmh'a\#T.K_u7rrdhGS5:e\^/Icnc7uPESqSo9.`jjh>-1FouchJfCC"B8jluOB)*59PK>=NGX:7fAQJ$4,5l(#F[/^8]$,=qWGEfc7o'=LNP4o,C1D?k/DWQ8]_]h^Vk3$9Z%RGrq5V]FOHgb$`EmPuh1$D>W$\@lq=Zg+jZ2npBItb@+TV65XbH9VP;$l!i)XX(d>('QFQr67N(#]-(;3M\YTVn'\d&<e6/U"#a0Lk/B6]M&leKQn!%9u^T^CUal4=nKjE%bERKQ_pZ%7"K?g>`RTi'IoK(a6="6$Z9HYJH^F'2Q;9F--FCME+=e#)]Z:Z3!(J`nQ"grH"\4b\NY#"d#-P#F6
tPl8J@aKek'ZoC*fP8DONp4Nb1=YgD;3ZmGZNJqn<R>C^(g>ge?H<Hh0pt)t`_0fnUH,=Pd"c?\rau[f)maA0G*N$Cn$5p,Rp%,;dO`_oC;X%JVWq5$O=mF^OE`=:.)bS;C_"R-T,nNBVm&CS^??-drOiJG-iQ"ecolHJtZU\WH8kCNK8Bi=iG'Ga9c#=L,"C$G_DD,8\h9e&N5-5X1F,loG`<ZbTCYgL^1+^C>p.VZi6W\R#S_L"`oWJX8Qr8QVnm,G-KEHHPm1T:,5i+<s~>
+Gatm;9on$e&A@sBn<a2...@P>Or+i0PqRJjqCB1FXVEqm-70e>lTsADZGDU;,)[:SQ"t0G`\&YZ<=8J=4Rrb.^mUii3V/EZ4Zts57-+K!?*^:+7QDVc7sb\%89l^p")%FCX`HX.>Wn3]=,pf1a@]Sc/gl?*2"`&)a.HUdH?/Oe:fS9H8>#RGLZoQq2IU6-RTkHT)A*n7a\41>cn(oB\r(fW3@hX;BF:$lCo=bLe0L*f?t:UKHH>jUq)t:/70IA$7YK7?q4%?l^^']u#0QK.4U,D\4e//5NL)^YC4THNffP>MN#OW'0*nTi[L^WK2u(LRloq"nV"7gOW8u#4cD&"pIZ4o$$gIk^C!`!s@f<B6HE31^Ao9bc=Og7<Ymd1,gs`]^gI!Am%If`^S=Vc>Sc9L9n!2#4#gtP*tU0g!=>(>R1ags7`D/3QD.HBd40i;nCGu;X&#^mlDF)PfTTjM-D-fi[&Gs#E85f&^F-O%9?,IK(&TR5VkE"T&.l'IBZnu"q:FR*mYHg`=Q4Jgf8#9Vf7=/E+9:,X(b@m[n7aXRU,&Peb1\RIqEK#FkCD,qpf>(P`&B3'QIO
A0@?AFfP/40AU)J!*c]>Pi+/M7.rST2/\T8aB6JX<kg3u7m6)gVF4mgS<7$^u&n;[on%s@ZbOTgVS@$^(ikub,0=Pggef>_ql!:@,jmEd]H>dg@aks`gZl0-'NI0ahb%2gqWY_74F-XgD,_Die.M-f'^N@%n4E;rQ(:8q9]&YTCG^*A"c+k)dJpWLIY0:eVmUR77cKceDllk%inK<VlCYgL^2Cuhmp.V[TA!b^=c@'b'pETS>VUT"DI92^DGdJK!C$irj!3?kVBE~>
endstream
endobj
23 0 obj
@@ -117,10 +117,10 @@
>>
endobj
24 0 obj
-<< /Length 1000 /Filter [ /ASCII85Decode /FlateDecode ]
+<< /Length 1033 /Filter [ /ASCII85Decode /FlateDecode ]
>>
stream
-Gatm:997OU&AIm?pp_iZ'0EJji5RB#%j;'l`J*E8Zu4K3Ll:Z-g4,0Q7m/<'!!uYB11P[nS!e1W]q3PVc"B7RN3*!PorOfO8XcOUQ](h><Ql;u1aPVSCA)@brPo"&"2$_,'njPT@D82XmL]%57f%`]L6Q^6s$=A*]P3^52j`B4BBV\8n@gW'X-Tbg#Lu.":,T.kd5KN7\Mg9_fZp5HPa/86;\Eqi)SJAV+M;V/ZB5gPhrV^@d(FL"OPe.Wh,.EK!ITjL8A!9/<E5$[Y-\LeBq&]I>GKa[Y'p_8W`A`?5/Yc$-'l!fP\['LEheU]nWrBe-gUX$9'9ggAWiMbXqMu!WTVA7Z:ACP2:/B[(3d'ib"3FDG2/gR(HG42rlH&456WWZ6Hkmg?i9hj$uD^gn<&]h!u8q2Nu;@tcc8TWU.?G9E?@FI#qG_*j)SDndhK`eHIpZX>iHkQ%3k?_&tY)U&e<T_(7@M33#$u;OF@;ZErsn>TY+=%PmAP9`?bK/F71i>Ip]]1!^=]E,Bi94&N/4sQOcH$U0Ddf<fo\2%'bP??H$((!E3P@`gbu&;+>L/BrK?BO="BY6nsjZ,LYB3egsXTDRtLl,=n>T-k<=QF3+B=+H-BnO`^$S\30K+qb#E0**4&l6O0/?\<*h`>R,Iioc$Eh0<@S(*h:tIO!8r'#u7U^ZCoH'64fUIWI'<q2Cuk07T8'=deXP<YcCB^%4q.4kl1-<WT&#CGU=OP9mI]7G=hsCFmFW+GQL)^r-F-AIuE$Z[&^.nVF'pggm6@71FKE)Z*30FnD&la2C>uLTT7E3SmX)jW:Zh%cnR_:,"lE.QO^$rM=a\`AZ[o>9XLF':B`go(XKe?.AIbk@\=`q=R92")Hk*U!\HCmfmr\S0Q[fm&elXe_q9`,cF?-WT*YNe.S.j5a@";2kfo=Ehf5@9mpj1c'N:G)Bjc9@TQK:Dn#bA_i-;H9eSVYZrYPVrerRXRk^6S[9.S7f0%"A
/"j0OmBE~>
+GatU29lo&3&A@sBE(jj^";9pn[jJ*'Ful0Pnat9%+PGBo;1o=uT)AJ[0=-Q':-'8RD&*9CmI&^tQa=fSg#UJ4U6YW49JV2+MW<@f1>[sC6:Pt91Sb1(-9iiDr<@l&/0tmCrY_QW5:9TkOC"X0MmkXDLZ$BD:cIMK5R=4nU6/ZK/LR?*7bicnDC^Wa7RK"Z6GLei*9)$0Z9@F<;nXlb3_-s4m)q,u,*"IF%'V;!nHb@()Kqc`^bE]sdgg>I(a,[Fj,I)D\WG@a:X)mNAQ>cL4=2R/<][]&Shn\OGp,gpV#ZQ["ib?:NN`"[<+6"/+QaHXij0YhCh#6oMe"dcr6iGu<+TT;JlA".HUHCb2=dl4X0jOOrBsN+EadfjmhZ%/>G:X'R*fB<Zj$Hk@([.aH$lI-),V9jB$]eiAY-l.%PBh$As<D*4E9@U'Ll8WTsENW!'H*XjL+d/&fA7=MR0;&?N(O-q6u[s\SM?/c"j_f8sM&)('s(P%Pp#)MAT)p;]qU8`nug)!JsPd+Hm^3[`FF.$t2%`)6ckaO5>84l<[##/`r**d+'<gZ+HXcqUbpLHG\8f_-tdhl2u@kk#u4LpVpW'HE1^-`SBY>5=@lo7M66+@6cYnWn)5&U8p1[-Nq;l#]iH?!ca+Hf*I&+"B3<%p_cbdH%E\oV)Y<:9'sD`X#*L\D+4unXtC2@Edq[(3,WU#k'[f.rKGmVQ'MV)]a=anC2p>=0NX"^i6lSo[ORMf`/`J=+n>udePgF:^6_fB,!(\CA2q`&iNQhOjf3(]B06fM(PZ.4ff"9."M%Y>AU?'5&,A*oqO0F]4`V@m!-i6K0F^m:f`BjG$&jUes7t#^BGI/t/;Z`=1[V(m7b5!"MGe;E*mNQ!:'tT4/l$aF-m]0pGV;/<qHp#/je+Y\P\B3J;UY17hHr?k\-R=N8cPZ=e$-rHJ)?pS]eIAEV9HL*h\oR2o;nE&kNi%0jO@BW+gJM&[<Df\=5)3]ddDF
@G;E/t=,Ze&!/#.E7G&J0[K3fj6>sgE>Y]bL-t,tW~>
endstream
endobj
25 0 obj
@@ -260,31 +260,31 @@
13 0 obj
<<
/S /GoTo
-/D [23 0 R /XYZ 85.0 251.932 null]
+/D [23 0 R /XYZ 85.0 238.732 null]
>>
endobj
15 0 obj
<<
/S /GoTo
-/D [23 0 R /XYZ 85.0 199.598 null]
+/D [23 0 R /XYZ 85.0 186.398 null]
>>
endobj
17 0 obj
<<
/S /GoTo
-/D [23 0 R /XYZ 85.0 148.345 null]
+/D [25 0 R /XYZ 85.0 659.0 null]
>>
endobj
19 0 obj
<<
/S /GoTo
-/D [25 0 R /XYZ 85.0 611.4 null]
+/D [25 0 R /XYZ 85.0 581.347 null]
>>
endobj
21 0 obj
<<
/S /GoTo
-/D [25 0 R /XYZ 85.0 546.647 null]
+/D [25 0 R /XYZ 85.0 516.594 null]
>>
endobj
26 0 obj
@@ -295,45 +295,45 @@
xref
0 40
0000000000 65535 f
-0000006782 00000 n
-0000006854 00000 n
-0000006946 00000 n
+0000006810 00000 n
+0000006882 00000 n
+0000006974 00000 n
0000000015 00000 n
0000000071 00000 n
-0000000839 00000 n
-0000000959 00000 n
-0000001026 00000 n
-0000007080 00000 n
-0000001161 00000 n
-0000007143 00000 n
-0000001298 00000 n
-0000007209 00000 n
-0000001434 00000 n
-0000007275 00000 n
-0000001571 00000 n
-0000007341 00000 n
-0000001708 00000 n
-0000007407 00000 n
-0000001844 00000 n
-0000007471 00000 n
-0000001981 00000 n
-0000003342 00000 n
-0000003450 00000 n
-0000004543 00000 n
-0000007537 00000 n
-0000004651 00000 n
-0000004866 00000 n
-0000005102 00000 n
-0000005288 00000 n
-0000005555 00000 n
-0000005707 00000 n
-0000005953 00000 n
-0000006120 00000 n
-0000006233 00000 n
-0000006343 00000 n
-0000006451 00000 n
-0000006557 00000 n
-0000006673 00000 n
+0000000842 00000 n
+0000000962 00000 n
+0000001029 00000 n
+0000007108 00000 n
+0000001164 00000 n
+0000007171 00000 n
+0000001301 00000 n
+0000007237 00000 n
+0000001437 00000 n
+0000007303 00000 n
+0000001574 00000 n
+0000007369 00000 n
+0000001711 00000 n
+0000007433 00000 n
+0000001847 00000 n
+0000007499 00000 n
+0000001984 00000 n
+0000003337 00000 n
+0000003445 00000 n
+0000004571 00000 n
+0000007565 00000 n
+0000004679 00000 n
+0000004894 00000 n
+0000005130 00000 n
+0000005316 00000 n
+0000005583 00000 n
+0000005735 00000 n
+0000005981 00000 n
+0000006148 00000 n
+0000006261 00000 n
+0000006371 00000 n
+0000006479 00000 n
+0000006585 00000 n
+0000006701 00000 n
trailer
<<
/Size 40
@@ -341,5 +341,5 @@
/Info 4 0 R
>>
startxref
-7588
+7616
%%EOF
Modified: lucene/java/trunk/src/site/src/documentation/content/xdocs/developer-resources.xml
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/site/src/documentation/content/xdocs/developer-resources.xml?rev=608852&r1=608851&r2=608852&view=diff
==============================================================================
--- lucene/java/trunk/src/site/src/documentation/content/xdocs/developer-resources.xml (original)
+++ lucene/java/trunk/src/site/src/documentation/content/xdocs/developer-resources.xml Fri Jan 4 06:29:15 2008
@@ -35,6 +35,7 @@
<li><a href="api/contrib-spellchecker/index.html">Spellchecker</a></li>
<li><a href="api/contrib-surround/index.html">Surround</a></li>
<li><a href="api/contrib-swing/index.html">Swing</a></li>
+ <li><a href="api/contrib-wikipedia/index.html">Wikipedia</a></li>
<li><a href="api/contrib-wordnet/index.html">Wordnet</a></li>
<li><a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a></li></ul></li>
</ul>