You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/07/19 17:59:32 UTC
svn commit: r1363400 [6/31] - in /lucene/dev/branches/pforcodec_3892: ./
dev-tools/ dev-tools/eclipse/ dev-tools/idea/.idea/
dev-tools/idea/.idea/copyright/ dev-tools/idea/.idea/libraries/
dev-tools/idea/lucene/ dev-tools/maven/ dev-tools/maven/lucene/...
Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java Thu Jul 19 15:58:54 2012
@@ -1,4 +1,3 @@
-// -*- c-basic-offset: 2 -*-
package org.apache.lucene.analysis.morfologik;
/*
@@ -20,10 +19,9 @@ package org.apache.lucene.analysis.morfo
import java.io.IOException;
import java.io.StringReader;
+import java.util.TreeSet;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
@@ -39,8 +37,8 @@ public class TestMorfologikAnalyzer exte
public final void testSingleTokens() throws IOException {
Analyzer a = getTestAnalyzer();
assertAnalyzesToReuse(a, "a", new String[] { "a" });
- assertAnalyzesToReuse(a, "liście", new String[] { "liść", "list", "lista", });
- assertAnalyzesToReuse(a, "danych", new String[] { "dany", "dane", "dać" });
+ assertAnalyzesToReuse(a, "liście", new String[] { "liście", "liść", "list", "lista" });
+ assertAnalyzesToReuse(a, "danych", new String[] { "dany", "dana", "dane", "dać" });
assertAnalyzesToReuse(a, "ęóąśłżźćń", new String[] { "ęóąśłżźćń" });
}
@@ -50,10 +48,10 @@ public class TestMorfologikAnalyzer exte
assertAnalyzesToReuse(
a,
"liście danych",
- new String[] { "liść", "list", "lista", "dany", "dane", "dać" },
- new int[] { 0, 0, 0, 7, 7, 7 },
- new int[] { 6, 6, 6, 13, 13, 13 },
- new int[] { 1, 0, 0, 1, 0, 0 });
+ new String[] { "liście", "liść", "list", "lista", "dany", "dana", "dane", "dać" },
+ new int[] { 0, 0, 0, 0, 7, 7, 7, 7 },
+ new int[] { 6, 6, 6, 6, 13, 13, 13, 13 },
+ new int[] { 1, 0, 0, 0, 1, 0, 0, 0 });
}
/** Test reuse of MorfologikFilter with leftover stems. */
@@ -63,7 +61,7 @@ public class TestMorfologikAnalyzer exte
CharTermAttribute termAtt_1 = ts_1.getAttribute(CharTermAttribute.class);
ts_1.reset();
ts_1.incrementToken();
- assertEquals("first stream", "liść", termAtt_1.toString());
+ assertEquals("first stream", "liście", termAtt_1.toString());
TokenStream ts_2 = a.tokenStream("dummy", new StringReader("danych"));
CharTermAttribute termAtt_2 = ts_2.getAttribute(CharTermAttribute.class);
@@ -76,35 +74,63 @@ public class TestMorfologikAnalyzer exte
public final void testCase() throws IOException {
Analyzer a = getTestAnalyzer();
- assertAnalyzesToReuse(a, "AGD", new String[] { "artykuły gospodarstwa domowego" });
+ assertAnalyzesToReuse(a, "AGD", new String[] { "AGD", "artykuły gospodarstwa domowego" });
assertAnalyzesToReuse(a, "agd", new String[] { "artykuły gospodarstwa domowego" });
assertAnalyzesToReuse(a, "Poznania", new String[] { "Poznań" });
- assertAnalyzesToReuse(a, "poznania", new String[] { "poznać" });
+ assertAnalyzesToReuse(a, "poznania", new String[] { "poznanie", "poznać" });
assertAnalyzesToReuse(a, "Aarona", new String[] { "Aaron" });
assertAnalyzesToReuse(a, "aarona", new String[] { "aarona" });
- assertAnalyzesToReuse(a, "Liście", new String[] { "liść", "list", "lista" });
+ assertAnalyzesToReuse(a, "Liście", new String[] { "liście", "liść", "list", "lista" });
}
- private void assertPOSToken(TokenStream ts, String term, String pos) throws IOException {
+ private void assertPOSToken(TokenStream ts, String term, String... tags) throws IOException {
ts.incrementToken();
assertEquals(term, ts.getAttribute(CharTermAttribute.class).toString());
- assertEquals(pos, ts.getAttribute(MorphosyntacticTagAttribute.class).getTag().toString());
+
+ TreeSet<String> actual = new TreeSet<String>();
+ TreeSet<String> expected = new TreeSet<String>();
+ for (StringBuilder b : ts.getAttribute(MorphosyntacticTagsAttribute.class).getTags()) {
+ actual.add(b.toString());
+ }
+ for (String s : tags) {
+ expected.add(s);
+ }
+
+ if (!expected.equals(actual)) {
+ System.out.println("Expected:\n" + expected);
+ System.out.println("Actual:\n" + actual);
+ assertEquals(expected, actual);
+ }
}
/** Test morphosyntactic annotations. */
public final void testPOSAttribute() throws IOException {
TokenStream ts = getTestAnalyzer().tokenStream("dummy", new StringReader("liście"));
- assertPOSToken(ts, "liść", "subst:pl:acc.nom.voc:m3");
- assertPOSToken(ts, "list", "subst:sg:loc.voc:m3");
- assertPOSToken(ts, "lista", "subst:sg:dat.loc:f");
+ assertPOSToken(ts, "liście",
+ "subst:sg:acc:n2",
+ "subst:sg:nom:n2",
+ "subst:sg:voc:n2");
+
+ assertPOSToken(ts, "liść",
+ "subst:pl:acc:m3",
+ "subst:pl:nom:m3",
+ "subst:pl:voc:m3");
+
+ assertPOSToken(ts, "list",
+ "subst:sg:loc:m3",
+ "subst:sg:voc:m3");
+
+ assertPOSToken(ts, "lista",
+ "subst:sg:dat:f",
+ "subst:sg:loc:f");
}
-
+
/** blast some random strings through the analyzer */
public void testRandom() throws Exception {
- checkRandomData(random(), getTestAnalyzer(), 10000 * RANDOM_MULTIPLIER);
+ checkRandomData(random(), getTestAnalyzer(), 1000 * RANDOM_MULTIPLIER);
}
}
Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java Thu Jul 19 15:58:54 2012
@@ -62,9 +62,13 @@ public final class BeiderMorseFilter ext
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
- /**
- * Calls {@link #BeiderMorseFilter(TokenStream, PhoneticEngine, Languages.LanguageSet)
- * BeiderMorseFilter(input, engine, null)}
+
+ /**
+ * Calls
+ * {@link #BeiderMorseFilter(TokenStream, PhoneticEngine, org.apache.commons.codec.language.bm.Languages.LanguageSet)}
+ *
+ * @param input
+ * @param engine
*/
public BeiderMorseFilter(TokenStream input, PhoneticEngine engine) {
this(input, engine, null);
Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilter.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilter.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilter.java Thu Jul 19 15:58:54 2012
@@ -27,9 +27,8 @@ import java.io.IOException;
/**
* Create tokens for phonetic matches.
- * @see <a href="
- * http://commons.apache.org/codec/api-release/org/apache/commons/codec/language/package-summary.html
- * ">Apache Commons Codec</a>
+ * @see <a href="http://commons.apache.org/codec/api-release/org/apache/commons/codec/language/package-summary.html">
+ * Apache Commons Codec</a>
*/
public final class PhoneticFilter extends TokenFilter
{
Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java Thu Jul 19 15:58:54 2012
@@ -66,12 +66,9 @@ public class AnalyzerProfile {
if (ANALYSIS_DATA_DIR.length() == 0) {
// Dictionary directory cannot be found.
- System.err
- .println("WARNING: Can not find lexical dictionary directory!");
- System.err
- .println("WARNING: This will cause unpredictable exceptions in your application!");
- System.err
- .println("WARNING: Please refer to the manual to download the dictionaries.");
+ throw new RuntimeException("WARNING: Can not find lexical dictionary directory!"
+ + " This will cause unpredictable exceptions in your application!"
+ + " Please refer to the manual to download the dictionaries.");
}
}
Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java Thu Jul 19 15:58:54 2012
@@ -117,13 +117,13 @@ public final class SentenceTokenizer ext
}
@Override
- public void reset(Reader input) throws IOException {
- super.reset(input);
+ public void setReader(Reader input) throws IOException {
+ super.setReader(input);
reset();
}
@Override
- public void end() throws IOException {
+ public void end() {
// set final offset
final int finalOffset = correctOffset(tokenEnd);
offsetAtt.setOffset(finalOffset, finalOffset);
Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java Thu Jul 19 15:58:54 2012
@@ -118,9 +118,8 @@ abstract class AbstractDictionary {
// Therefore, each code page only has 16*6-2=94 characters.
return (short) (b0 * 94 + b1);
} catch (UnsupportedEncodingException e) {
- e.printStackTrace();
+ throw new RuntimeException(e);
}
- return -1;
}
/**
Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java Thu Jul 19 15:58:54 2012
@@ -79,14 +79,9 @@ class BigramDictionary extends AbstractD
try {
loadFromInputStream(new FileInputStream(serialObj));
return true;
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } catch (ClassNotFoundException e) {
- e.printStackTrace();
+ } catch (Exception e) {
+ throw new RuntimeException(e);
}
- return false;
}
private void loadFromInputStream(InputStream serialObjectInputStream)
@@ -148,8 +143,7 @@ class BigramDictionary extends AbstractD
* @throws IOException
* @throws UnsupportedEncodingException
*/
- public void loadFromFile(String dctFilePath) throws FileNotFoundException,
- IOException, UnsupportedEncodingException {
+ public void loadFromFile(String dctFilePath) throws IOException {
int i, cnt, length, total = 0;
// The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java Thu Jul 19 15:58:54 2012
@@ -145,14 +145,9 @@ class WordDictionary extends AbstractDic
try {
loadFromObjectInputStream(new FileInputStream(serialObj));
return true;
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } catch (ClassNotFoundException e) {
- e.printStackTrace();
+ } catch (Exception e) {
+ throw new RuntimeException(e);
}
- return false;
}
private void loadFromObjectInputStream(InputStream serialObjectInputStream)
@@ -190,8 +185,7 @@ class WordDictionary extends AbstractDic
* @throws IOException
* @throws UnsupportedEncodingException
*/
- private int loadMainDataFromFile(String dctFilePath)
- throws FileNotFoundException, IOException, UnsupportedEncodingException {
+ private int loadMainDataFromFile(String dctFilePath) throws IOException {
int i, cnt, length, total = 0;
// The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
// The 3756th is used (as a header) to store information.
Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseAnalyzer.java Thu Jul 19 15:58:54 2012
@@ -224,13 +224,13 @@ public class TestSmartChineseAnalyzer ex
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
- checkRandomData(random(), new SmartChineseAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+ checkRandomData(random(), new SmartChineseAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
}
/** blast some random large strings through the analyzer */
public void testRandomHugeStrings() throws Exception {
Random random = random();
- checkRandomData(random, new SmartChineseAnalyzer(TEST_VERSION_CURRENT), 200*RANDOM_MULTIPLIER, 8192);
+ checkRandomData(random, new SmartChineseAnalyzer(TEST_VERSION_CURRENT), 100*RANDOM_MULTIPLIER, 8192);
}
public void testEmptyTerm() throws IOException {
Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/stempel/StempelStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/stempel/StempelStemmer.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/stempel/StempelStemmer.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/stempel/StempelStemmer.java Thu Jul 19 15:58:54 2012
@@ -65,7 +65,7 @@ public class StempelStemmer {
DataInputStream in = null;
try {
in = new DataInputStream(new BufferedInputStream(stemmerTable));
- String method = in.readUTF().toUpperCase(Locale.ENGLISH);
+ String method = in.readUTF().toUpperCase(Locale.ROOT);
if (method.indexOf('M') < 0) {
return new org.egothor.stemmer.Trie(in);
} else {
Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/Compile.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/Compile.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/Compile.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/Compile.java Thu Jul 19 15:58:54 2012
@@ -63,6 +63,7 @@ import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
+import java.util.Locale;
import java.util.StringTokenizer;
/**
@@ -84,12 +85,12 @@ public class Compile {
*
* @param args the command line arguments
*/
- public static void main(java.lang.String[] args) {
+ public static void main(java.lang.String[] args) throws Exception {
if (args.length < 1) {
return;
}
- args[0].toUpperCase();
+ args[0].toUpperCase(Locale.ROOT);
backward = args[0].charAt(0) == '-';
int qq = (backward) ? 1 : 0;
@@ -116,82 +117,75 @@ public class Compile {
LineNumberReader in;
// System.out.println("[" + args[i] + "]");
Diff diff = new Diff();
- try {
- int stems = 0;
- int words = 0;
-
- allocTrie();
-
- System.out.println(args[i]);
- in = new LineNumberReader(new BufferedReader(new InputStreamReader(
- new FileInputStream(args[i]), charset)));
- for (String line = in.readLine(); line != null; line = in.readLine()) {
- try {
- line = line.toLowerCase();
- StringTokenizer st = new StringTokenizer(line);
- String stem = st.nextToken();
- if (storeorig) {
- trie.add(stem, "-a");
+ int stems = 0;
+ int words = 0;
+
+ allocTrie();
+
+ System.out.println(args[i]);
+ in = new LineNumberReader(new BufferedReader(new InputStreamReader(
+ new FileInputStream(args[i]), charset)));
+ for (String line = in.readLine(); line != null; line = in.readLine()) {
+ try {
+ line = line.toLowerCase(Locale.ROOT);
+ StringTokenizer st = new StringTokenizer(line);
+ String stem = st.nextToken();
+ if (storeorig) {
+ trie.add(stem, "-a");
+ words++;
+ }
+ while (st.hasMoreTokens()) {
+ String token = st.nextToken();
+ if (token.equals(stem) == false) {
+ trie.add(token, diff.exec(token, stem));
words++;
}
- while (st.hasMoreTokens()) {
- String token = st.nextToken();
- if (token.equals(stem) == false) {
- trie.add(token, diff.exec(token, stem));
- words++;
- }
- }
- } catch (java.util.NoSuchElementException x) {
- // no base token (stem) on a line
}
+ } catch (java.util.NoSuchElementException x) {
+ // no base token (stem) on a line
}
-
- Optimizer o = new Optimizer();
- Optimizer2 o2 = new Optimizer2();
- Lift l = new Lift(true);
- Lift e = new Lift(false);
- Gener g = new Gener();
-
- for (int j = 0; j < optimizer.length; j++) {
- String prefix;
- switch (optimizer[j]) {
- case 'G':
- trie = trie.reduce(g);
- prefix = "G: ";
- break;
- case 'L':
- trie = trie.reduce(l);
- prefix = "L: ";
- break;
- case 'E':
- trie = trie.reduce(e);
- prefix = "E: ";
- break;
- case '2':
- trie = trie.reduce(o2);
- prefix = "2: ";
- break;
- case '1':
- trie = trie.reduce(o);
- prefix = "1: ";
- break;
- default:
- continue;
- }
- trie.printInfo(prefix + " ");
+ }
+
+ Optimizer o = new Optimizer();
+ Optimizer2 o2 = new Optimizer2();
+ Lift l = new Lift(true);
+ Lift e = new Lift(false);
+ Gener g = new Gener();
+
+ for (int j = 0; j < optimizer.length; j++) {
+ String prefix;
+ switch (optimizer[j]) {
+ case 'G':
+ trie = trie.reduce(g);
+ prefix = "G: ";
+ break;
+ case 'L':
+ trie = trie.reduce(l);
+ prefix = "L: ";
+ break;
+ case 'E':
+ trie = trie.reduce(e);
+ prefix = "E: ";
+ break;
+ case '2':
+ trie = trie.reduce(o2);
+ prefix = "2: ";
+ break;
+ case '1':
+ trie = trie.reduce(o);
+ prefix = "1: ";
+ break;
+ default:
+ continue;
}
-
- DataOutputStream os = new DataOutputStream(new BufferedOutputStream(
- new FileOutputStream(args[i] + ".out")));
- os.writeUTF(args[0]);
- trie.store(os);
- os.close();
-
- } catch (FileNotFoundException x) {
- x.printStackTrace();
- } catch (IOException x) {
- x.printStackTrace();
+ trie.printInfo(System.out, prefix + " ");
}
+
+ DataOutputStream os = new DataOutputStream(new BufferedOutputStream(
+ new FileOutputStream(args[i] + ".out")));
+ os.writeUTF(args[0]);
+ trie.store(os);
+ os.close();
}
}
Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/DiffIt.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/DiffIt.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/DiffIt.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/DiffIt.java Thu Jul 19 15:58:54 2012
@@ -55,9 +55,10 @@
package org.egothor.stemmer;
import java.io.BufferedReader;
-import java.io.FileReader;
-import java.io.IOException;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
import java.io.LineNumberReader;
+import java.util.Locale;
import java.util.StringTokenizer;
/**
@@ -83,7 +84,7 @@ public class DiffIt {
*
* @param args the path to a file containing a stemmer table
*/
- public static void main(java.lang.String[] args) {
+ public static void main(java.lang.String[] args) throws Exception {
int ins = get(0, args[0]);
int del = get(1, args[0]);
@@ -94,27 +95,23 @@ public class DiffIt {
LineNumberReader in;
// System.out.println("[" + args[i] + "]");
Diff diff = new Diff(ins, del, rep, nop);
- try {
- in = new LineNumberReader(new BufferedReader(new FileReader(args[i])));
- for (String line = in.readLine(); line != null; line = in.readLine()) {
- try {
- line = line.toLowerCase();
- StringTokenizer st = new StringTokenizer(line);
- String stem = st.nextToken();
- System.out.println(stem + " -a");
- while (st.hasMoreTokens()) {
- String token = st.nextToken();
- if (token.equals(stem) == false) {
- System.out.println(stem + " " + diff.exec(token, stem));
- }
+ String charset = System.getProperty("egothor.stemmer.charset", "UTF-8");
+ in = new LineNumberReader(new BufferedReader(new InputStreamReader(new FileInputStream(args[i]), charset)));
+ for (String line = in.readLine(); line != null; line = in.readLine()) {
+ try {
+ line = line.toLowerCase(Locale.ROOT);
+ StringTokenizer st = new StringTokenizer(line);
+ String stem = st.nextToken();
+ System.out.println(stem + " -a");
+ while (st.hasMoreTokens()) {
+ String token = st.nextToken();
+ if (token.equals(stem) == false) {
+ System.out.println(stem + " " + diff.exec(token, stem));
}
- } catch (java.util.NoSuchElementException x) {
- // no base token (stem) on a line
}
+ } catch (java.util.NoSuchElementException x) {
+ // no base token (stem) on a line
}
-
- } catch (IOException x) {
- x.printStackTrace();
}
}
}
Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/MultiTrie.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/MultiTrie.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/MultiTrie.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/MultiTrie.java Thu Jul 19 15:58:54 2012
@@ -57,6 +57,7 @@ package org.egothor.stemmer;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
+import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;
@@ -200,9 +201,9 @@ public class MultiTrie extends Trie {
* @param prefix the desired prefix
*/
@Override
- public void printInfo(CharSequence prefix) {
+ public void printInfo(PrintStream out, CharSequence prefix) {
int c = 0;
for (Trie trie : tries)
- trie.printInfo(prefix + "[" + (++c) + "] ");
+ trie.printInfo(out, prefix + "[" + (++c) + "] ");
}
}
Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/Row.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/Row.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/Row.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/Row.java Thu Jul 19 15:58:54 2012
@@ -57,6 +57,7 @@ package org.egothor.stemmer;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
+import java.io.PrintStream;
import java.util.Iterator;
import java.util.TreeMap;
@@ -292,15 +293,15 @@ public class Row {
}
/**
- * Write the contents of this Row to stdout.
+ * Write the contents of this Row to the printstream.
*/
- public void print() {
+ public void print(PrintStream out) {
for (Iterator<Character> i = cells.keySet().iterator(); i.hasNext();) {
Character ch = i.next();
Cell c = at(ch);
- System.out.print("[" + ch + ":" + c + "]");
+ out.print("[" + ch + ":" + c + "]");
}
- System.out.println();
+ out.println();
}
Cell at(Character index) {
Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/Trie.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/Trie.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/Trie.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/java/org/egothor/stemmer/Trie.java Thu Jul 19 15:58:54 2012
@@ -57,6 +57,7 @@ package org.egothor.stemmer;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
+import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;
@@ -375,8 +376,8 @@ public class Trie {
return by.optimize(this);
}
- public void printInfo(CharSequence prefix) {
- System.out.println(prefix + "nds " + rows.size() + " cmds " + cmds.size()
+ public void printInfo(PrintStream out, CharSequence prefix) {
+ out.println(prefix + "nds " + rows.size() + " cmds " + cmds.size()
+ " cells " + getCells() + " valcells " + getCellsVal() + " pntcells "
+ getCellsPnt());
}
Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/test/org/apache/lucene/analysis/pl/TestPolishAnalyzer.java Thu Jul 19 15:58:54 2012
@@ -51,6 +51,6 @@ public class TestPolishAnalyzer extends
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
- checkRandomData(random(), new PolishAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+ checkRandomData(random(), new PolishAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);
}
}
Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/test/org/egothor/stemmer/TestCompile.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/test/org/egothor/stemmer/TestCompile.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/test/org/egothor/stemmer/TestCompile.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/stempel/src/test/org/egothor/stemmer/TestCompile.java Thu Jul 19 15:58:54 2012
@@ -60,12 +60,14 @@ import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
-import java.io.FileReader;
import java.io.IOException;
+import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.net.URI;
+import java.util.Locale;
import java.util.StringTokenizer;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
public class TestCompile extends LuceneTestCase {
@@ -107,7 +109,7 @@ public class TestCompile extends LuceneT
Trie trie;
DataInputStream is = new DataInputStream(new BufferedInputStream(
new FileInputStream(path)));
- String method = is.readUTF().toUpperCase();
+ String method = is.readUTF().toUpperCase(Locale.ROOT);
if (method.indexOf('M') < 0) {
trie = new Trie(is);
} else {
@@ -120,11 +122,11 @@ public class TestCompile extends LuceneT
private static void assertTrie(Trie trie, String file, boolean usefull,
boolean storeorig) throws Exception {
LineNumberReader in = new LineNumberReader(new BufferedReader(
- new FileReader(file)));
+ new InputStreamReader(new FileInputStream(file), IOUtils.CHARSET_UTF_8)));
for (String line = in.readLine(); line != null; line = in.readLine()) {
try {
- line = line.toLowerCase();
+ line = line.toLowerCase(Locale.ROOT);
StringTokenizer st = new StringTokenizer(line);
String stem = st.nextToken();
if (storeorig) {
@@ -132,7 +134,7 @@ public class TestCompile extends LuceneT
.getLastOnPath(stem);
StringBuilder stm = new StringBuilder(stem);
Diff.apply(stm, cmd);
- assertEquals(stem.toLowerCase(), stm.toString().toLowerCase());
+ assertEquals(stem.toLowerCase(Locale.ROOT), stm.toString().toLowerCase(Locale.ROOT));
}
while (st.hasMoreTokens()) {
String token = st.nextToken();
@@ -143,7 +145,7 @@ public class TestCompile extends LuceneT
.getLastOnPath(token);
StringBuilder stm = new StringBuilder(token);
Diff.apply(stm, cmd);
- assertEquals(stem.toLowerCase(), stm.toString().toLowerCase());
+ assertEquals(stem.toLowerCase(Locale.ROOT), stm.toString().toLowerCase(Locale.ROOT));
}
} catch (java.util.NoSuchElementException x) {
// no base token (stem) on a line
Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java Thu Jul 19 15:58:54 2012
@@ -80,8 +80,8 @@ public abstract class BaseUIMATokenizer
}
@Override
- public void reset(Reader input) throws IOException {
- super.reset(input);
+ public void setReader(Reader input) throws IOException {
+ super.setReader(input);
iterator = null;
}
Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java Thu Jul 19 15:58:54 2012
@@ -30,7 +30,6 @@ import org.apache.lucene.search.MatchAll
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.util.Version;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
@@ -65,7 +64,7 @@ public class UIMABaseAnalyzerTest extend
@Test
public void baseUIMAAnalyzerIntegrationTest() throws Exception {
Directory dir = new RAMDirectory();
- IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
+ IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
// add the first doc
Document doc = new Document();
String dummyTitle = "this is a dummy title ";
@@ -78,7 +77,7 @@ public class UIMABaseAnalyzerTest extend
// try the search over the first doc
DirectoryReader directoryReader = DirectoryReader.open(dir);
IndexSearcher indexSearcher = new IndexSearcher(directoryReader);
- TopDocs result = indexSearcher.search(new MatchAllDocsQuery(), 10);
+ TopDocs result = indexSearcher.search(new MatchAllDocsQuery(), 1);
assertTrue(result.totalHits > 0);
Document d = indexSearcher.doc(result.scoreDocs[0].doc);
assertNotNull(d);
@@ -99,7 +98,7 @@ public class UIMABaseAnalyzerTest extend
directoryReader.close();
directoryReader = DirectoryReader.open(dir);
indexSearcher = new IndexSearcher(directoryReader);
- result = indexSearcher.search(new MatchAllDocsQuery(), 10);
+ result = indexSearcher.search(new MatchAllDocsQuery(), 2);
Document d1 = indexSearcher.doc(result.scoreDocs[1].doc);
assertNotNull(d1);
assertNotNull(d1.getField("title"));
@@ -109,7 +108,7 @@ public class UIMABaseAnalyzerTest extend
// do a matchalldocs query to retrieve both docs
indexSearcher = new IndexSearcher(directoryReader);
- result = indexSearcher.search(new MatchAllDocsQuery(), 10);
+ result = indexSearcher.search(new MatchAllDocsQuery(), 2);
assertEquals(2, result.totalHits);
writer.close();
indexSearcher.getIndexReader().close();
@@ -119,7 +118,7 @@ public class UIMABaseAnalyzerTest extend
@Test
public void testRandomStrings() throws Exception {
checkRandomData(random(), new UIMABaseAnalyzer("/uima/TestAggregateSentenceAE.xml", "org.apache.lucene.uima.ts.TokenAnnotation"),
- 1000 * RANDOM_MULTIPLIER);
+ 100 * RANDOM_MULTIPLIER);
}
}
Modified: lucene/dev/branches/pforcodec_3892/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java Thu Jul 19 15:58:54 2012
@@ -61,7 +61,7 @@ public class UIMATypeAwareAnalyzerTest e
@Test
public void testRandomStrings() throws Exception {
checkRandomData(random(), new UIMATypeAwareAnalyzer("/uima/TestAggregateSentenceAE.xml",
- "org.apache.lucene.uima.ts.TokenAnnotation", "pos"), 1000 * RANDOM_MULTIPLIER);
+ "org.apache.lucene.uima.ts.TokenAnnotation", "pos"), 100 * RANDOM_MULTIPLIER);
}
}
Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/build.xml?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/build.xml (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/build.xml Thu Jul 19 15:58:54 2012
@@ -155,6 +155,7 @@
<fileset dir="lib">
<include name="commons-compress-1.2.jar"/>
<include name="xercesImpl-2.9.1.jar"/>
+ <include name="nekohtml-1.9.15.jar"/>
</fileset>
</path>
<path id="run.classpath">
@@ -261,18 +262,6 @@
<target name="init" depends="module-build.init,resolve-icu,jar-memory,jar-highlighter,jar-analyzers-common,jar-queryparser,jar-facet"/>
- <target name="clean-javacc">
- <fileset dir="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml" includes="*.java">
- <containsregexp expression="Generated.*By.*JavaCC"/>
- </fileset>
- </target>
-
- <target name="javacc" depends="init,javacc-check" if="javacc.present">
- <invoke-javacc target="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml/HTMLParser.jj"
- outputDir="src/java/org/apache/lucene/benchmark/byTask/feeds/demohtml"
- />
- </target>
-
<target name="compile-test" depends="copy-alg-files-for-testing,module-build.compile-test"/>
<target name="copy-alg-files-for-testing" description="copy .alg files as resources for testing">
<copy todir="${build.dir}/classes/test/conf">
Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/ivy.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/ivy.xml?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/ivy.xml (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/ivy.xml Thu Jul 19 15:58:54 2012
@@ -21,6 +21,7 @@
<dependencies>
<dependency org="org.apache.commons" name="commons-compress" rev="1.2" transitive="false"/>
<dependency org="xerces" name="xercesImpl" rev="2.9.1" transitive="false"/>
+ <dependency org="net.sourceforge.nekohtml" name="nekohtml" rev="1.9.15" transitive="false"/>
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
</dependencies>
</ivy-module>
Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java Thu Jul 19 15:58:54 2012
@@ -23,6 +23,7 @@ import java.io.Reader;
import org.apache.lucene.benchmark.byTask.utils.Algorithm;
import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.util.IOUtils;
/**
@@ -106,7 +107,7 @@ public class Benchmark {
Benchmark benchmark = null;
try {
- benchmark = new Benchmark(new FileReader(algFile));
+ benchmark = new Benchmark(IOUtils.getDecodingReader(algFile, IOUtils.CHARSET_UTF_8));
} catch (Exception e) {
e.printStackTrace();
System.exit(1);
Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DemoHTMLParser.java Thu Jul 19 15:58:54 2012
@@ -19,51 +19,203 @@ package org.apache.lucene.benchmark.byTa
import java.io.IOException;
import java.io.Reader;
-import java.text.DateFormat;
-import java.text.ParseException;
+import java.io.StringReader;
+import java.util.Collections;
import java.util.Date;
+import java.util.HashSet;
+import java.util.Locale;
import java.util.Properties;
+import java.util.Set;
+
+import org.cyberneko.html.parsers.SAXParser;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
/**
- * HTML Parser that is based on Lucene's demo HTML parser.
+ * Simple HTML Parser extracting title, meta tags, and body text
+ * that is based on <a href="http://nekohtml.sourceforge.net/">NekoHTML</a>.
*/
-public class DemoHTMLParser implements org.apache.lucene.benchmark.byTask.feeds.HTMLParser {
+public class DemoHTMLParser implements HTMLParser {
+
+ /** The actual parser to read HTML documents */
+ public static final class Parser {
+
+ public final Properties metaTags = new Properties();
+ public final String title, body;
+
+ public Parser(Reader reader) throws IOException, SAXException {
+ this(new InputSource(reader));
+ }
+
+ public Parser(InputSource source) throws IOException, SAXException {
+ final SAXParser parser = new SAXParser();
+ parser.setFeature("http://xml.org/sax/features/namespaces", true);
+ parser.setFeature("http://cyberneko.org/html/features/balance-tags", true);
+ parser.setFeature("http://cyberneko.org/html/features/report-errors", false);
+ parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
+ parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
- public DocData parse(DocData docData, String name, Date date, String title, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException {
- org.apache.lucene.benchmark.byTask.feeds.demohtml.HTMLParser p = new org.apache.lucene.benchmark.byTask.feeds.demohtml.HTMLParser(reader);
+ final StringBuilder title = new StringBuilder(), body = new StringBuilder();
+ final DefaultHandler handler = new DefaultHandler() {
+ private int inBODY = 0, inHEAD = 0, inTITLE = 0, suppressed = 0;
+
+ @Override
+ public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
+ if (inHEAD > 0) {
+ if (equalsIgnoreTurkish("title", localName)) {
+ inTITLE++;
+ } else {
+ if (equalsIgnoreTurkish("meta", localName)) {
+ String name = atts.getValue("name");
+ if (name == null) {
+ name = atts.getValue("http-equiv");
+ }
+ final String val = atts.getValue("content");
+ if (name != null && val != null) {
+ metaTags.setProperty(name.toLowerCase(Locale.ROOT), val);
+ }
+ }
+ }
+ } else if (inBODY > 0) {
+ if (SUPPRESS_ELEMENTS.contains(localName)) {
+ suppressed++;
+ } else if (equalsIgnoreTurkish("img", localName)) {
+ // the original javacc-based parser preserved <IMG alt="..."/>
+ // attribute as body text in [] parenthesis:
+ final String alt = atts.getValue("alt");
+ if (alt != null) {
+ body.append('[').append(alt).append(']');
+ }
+ }
+ } else if (equalsIgnoreTurkish("body", localName)) {
+ inBODY++;
+ } else if (equalsIgnoreTurkish("head", localName)) {
+ inHEAD++;
+ } else if (equalsIgnoreTurkish("frameset", localName)) {
+ throw new SAXException("This parser does not support HTML framesets.");
+ }
+ }
+
+ @Override
+ public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
+ if (inBODY > 0) {
+ if (equalsIgnoreTurkish("body", localName)) {
+ inBODY--;
+ } else if (ENDLINE_ELEMENTS.contains(localName)) {
+ body.append('\n');
+ } else if (SUPPRESS_ELEMENTS.contains(localName)) {
+ suppressed--;
+ }
+ } else if (inHEAD > 0) {
+ if (equalsIgnoreTurkish("head", localName)) {
+ inHEAD--;
+ } else if (inTITLE > 0 && equalsIgnoreTurkish("title", localName)) {
+ inTITLE--;
+ }
+ }
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length) throws SAXException {
+ if (inBODY > 0 && suppressed == 0) {
+ body.append(ch, start, length);
+ } else if (inTITLE > 0) {
+ title.append(ch, start, length);
+ }
+ }
+
+ @Override
+ public InputSource resolveEntity(String publicId, String systemId) {
+ // disable network access caused by DTDs
+ return new InputSource(new StringReader(""));
+ }
+ };
+
+ parser.setContentHandler(handler);
+ parser.setErrorHandler(handler);
+ parser.parse(source);
+
+ // the javacc-based parser trimmed title (which should be done for HTML in all cases):
+ this.title = title.toString().trim();
+
+ // assign body text
+ this.body = body.toString();
+ }
+
+ // TODO: remove the Turkish workaround once this is fixed in NekoHTML:
+ // https://sourceforge.net/tracker/?func=detail&aid=3544334&group_id=195122&atid=952178
- // title
- if (title==null) {
- title = p.getTitle();
+ // BEGIN: workaround
+ static final String convertTurkish(String s) {
+ return s.replace('i', 'ı');
}
- // properties
- Properties props = p.getMetaTags();
- // body
- Reader r = p.getReader();
- char c[] = new char[1024];
- StringBuilder bodyBuf = new StringBuilder();
- int n;
- while ((n = r.read(c)) >= 0) {
- if (n>0) {
- bodyBuf.append(c,0,n);
+ static final boolean equalsIgnoreTurkish(String s1, String s2) {
+ final int len1 = s1.length(), len2 = s2.length();
+ if (len1 != len2)
+ return false;
+ for (int i = 0; i < len1; i++) {
+ char ch1 = s1.charAt(i), ch2 = s2.charAt(i);
+ if (ch1 == 'ı') ch1 = 'i';
+ if (ch2 == 'ı') ch2 = 'i';
+ if (ch1 != ch2)
+ return false;
}
+ return true;
}
- r.close();
- if (date == null && props.getProperty("date")!=null) {
- try {
- date = dateFormat.parse(props.getProperty("date").trim());
- } catch (ParseException e) {
- // do not fail test just because a date could not be parsed
- System.out.println("ignoring date parse exception (assigning 'now') for: "+props.getProperty("date"));
- date = new Date(); // now
+ // END: workaround
+
+ static final Set<String> createElementNameSet(String... names) {
+ final HashSet<String> set = new HashSet<String>();
+ for (final String name : names) {
+ set.add(name);
+ set.add(convertTurkish(name));
+ }
+ return Collections.unmodifiableSet(set);
+ }
+
+ /** HTML elements that cause a line break (they are block-elements) */
+ static final Set<String> ENDLINE_ELEMENTS = createElementNameSet(
+ "p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl",
+ "pre", "hr", "blockquote", "address", "fieldset", "table", "form",
+ "noscript", "li", "dt", "dd", "noframes", "br", "tr", "select", "option"
+ );
+
+ /** HTML elements with contents that are ignored */
+ static final Set<String> SUPPRESS_ELEMENTS = createElementNameSet(
+ "style", "script"
+ );
+ }
+
+ @Override
+ public DocData parse(DocData docData, String name, Date date, Reader reader, TrecContentSource trecSrc) throws IOException {
+ try {
+ return parse(docData, name, date, new InputSource(reader), trecSrc);
+ } catch (SAXException saxe) {
+ throw new IOException("SAX exception occurred while parsing HTML document.", saxe);
+ }
+ }
+
+ public DocData parse(DocData docData, String name, Date date, InputSource source, TrecContentSource trecSrc) throws IOException, SAXException {
+ final Parser p = new Parser(source);
+
+ // properties
+ final Properties props = p.metaTags;
+ String dateStr = props.getProperty("date");
+ if (dateStr != null) {
+ final Date newDate = trecSrc.parseDate(dateStr);
+ if (newDate != null) {
+ date = newDate;
}
}
docData.clear();
docData.setName(name);
- docData.setBody(bodyBuf.toString());
- docData.setTitle(title);
+ docData.setBody(p.body);
+ docData.setTitle(p.title);
docData.setProps(props);
docData.setDate(date);
return docData;
Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirContentSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirContentSource.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirContentSource.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DirContentSource.java Thu Jul 19 15:58:54 2012
@@ -18,12 +18,14 @@ package org.apache.lucene.benchmark.byTa
*/
import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.util.IOUtils;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
-import java.io.FileReader;
+import java.io.FileInputStream;
import java.io.IOException;
+import java.io.InputStreamReader;
import java.text.DateFormat;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
@@ -161,7 +163,7 @@ public class DirContentSource extends Co
dfi = new DateFormatInfo();
dfi.pos = new ParsePosition(0);
// date format: 30-MAR-1987 14:22:36.87
- dfi.df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS", Locale.US);
+ dfi.df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS", Locale.ROOT);
dfi.df.setLenient(true);
dateFormat.set(dfi);
}
@@ -198,7 +200,7 @@ public class DirContentSource extends Co
name = f.getCanonicalPath()+"_"+iteration;
}
- BufferedReader reader = new BufferedReader(new FileReader(f));
+ BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), IOUtils.CHARSET_UTF_8));
String line = null;
//First line is the date, 3rd is the title, rest is body
String dateStr = reader.readLine();
Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java Thu Jul 19 15:58:54 2012
@@ -29,6 +29,7 @@ import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Random;
+import java.util.TimeZone;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.lucene.benchmark.byTask.utils.Config;
@@ -182,8 +183,8 @@ public class DocMaker implements Closeab
private boolean storeBytes = false;
private static class DateUtil {
- public SimpleDateFormat parser = new SimpleDateFormat("dd-MMM-yyyy HH:mm:ss", Locale.US);
- public Calendar cal = Calendar.getInstance();
+ public SimpleDateFormat parser = new SimpleDateFormat("dd-MMM-yyyy HH:mm:ss", Locale.ROOT);
+ public Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
public ParsePosition pos = new ParsePosition(0);
public DateUtil() {
parser.setLenient(true);
Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiContentSource.java Thu Jul 19 15:58:54 2012
@@ -25,6 +25,7 @@ import java.io.InputStreamReader;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.HashMap;
+import java.util.Locale;
import java.util.Map;
import org.apache.lucene.benchmark.byTask.utils.Config;
@@ -146,7 +147,7 @@ public class EnwikiContentSource extends
case BODY:
body = contents.toString();
//workaround that startswith doesn't have an ignore case option, get at least 20 chars.
- String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase();
+ String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase(Locale.ROOT);
if (startsWith.startsWith("#redirect")) {
body = null;
}
Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/FileBasedQueryMaker.java Thu Jul 19 15:58:54 2012
@@ -5,6 +5,7 @@ import org.apache.lucene.queryparser.cla
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.benchmark.byTask.tasks.NewAnalyzerTask;
+import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;
import java.io.*;
@@ -59,13 +60,14 @@ public class FileBasedQueryMaker extends
{
File file = new File(fileName);
Reader reader = null;
+ // note: we use a decoding reader, so if your queries are screwed up you know
if (file.exists()) {
- reader = new FileReader(file);
+ reader = IOUtils.getDecodingReader(file, IOUtils.CHARSET_UTF_8);
} else {
//see if we can find it as a resource
InputStream asStream = FileBasedQueryMaker.class.getClassLoader().getResourceAsStream(fileName);
if (asStream != null) {
- reader = new InputStreamReader(asStream);
+ reader = IOUtils.getDecodingReader(asStream, IOUtils.CHARSET_UTF_8);
}
}
if (reader != null) {
Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/HTMLParser.java Thu Jul 19 15:58:54 2012
@@ -19,7 +19,6 @@ package org.apache.lucene.benchmark.byTa
import java.io.IOException;
import java.io.Reader;
-import java.text.DateFormat;
import java.util.Date;
/**
@@ -34,13 +33,11 @@ public interface HTMLParser {
* @param docData result reused
* @param name name of the result doc data.
* @param date date of the result doc data. If null, attempt to set by parsed data.
- * @param title title of the result doc data. If null, attempt to set by parsed data.
* @param reader reader of html text to parse.
- * @param dateFormat date formatter to use for extracting the date.
+ * @param trecSrc the {@link TrecContentSource} used to parse dates.
* @return Parsed doc data.
* @throws IOException
- * @throws InterruptedException
*/
- public DocData parse(DocData docData, String name, Date date, String title, Reader reader, DateFormat dateFormat) throws IOException, InterruptedException;
+ public DocData parse(DocData docData, String name, Date date, Reader reader, TrecContentSource trecSrc) throws IOException;
}
Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LongToEnglishContentSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LongToEnglishContentSource.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LongToEnglishContentSource.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LongToEnglishContentSource.java Thu Jul 19 15:58:54 2012
@@ -35,7 +35,7 @@ public class LongToEnglishContentSource
}
// TODO: we could take param to specify locale...
- private final RuleBasedNumberFormat rnbf = new RuleBasedNumberFormat(Locale.ENGLISH,
+ private final RuleBasedNumberFormat rnbf = new RuleBasedNumberFormat(Locale.ROOT,
RuleBasedNumberFormat.SPELLOUT);
@Override
public synchronized DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LongToEnglishQueryMaker.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LongToEnglishQueryMaker.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LongToEnglishQueryMaker.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LongToEnglishQueryMaker.java Thu Jul 19 15:58:54 2012
@@ -37,7 +37,7 @@ public class LongToEnglishQueryMaker imp
protected QueryParser parser;
// TODO: we could take param to specify locale...
- private final RuleBasedNumberFormat rnbf = new RuleBasedNumberFormat(Locale.ENGLISH,
+ private final RuleBasedNumberFormat rnbf = new RuleBasedNumberFormat(Locale.ROOT,
RuleBasedNumberFormat.SPELLOUT);
public Query makeQuery(int size) throws Exception {
Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersContentSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersContentSource.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersContentSource.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/ReutersContentSource.java Thu Jul 19 15:58:54 2012
@@ -19,8 +19,9 @@ package org.apache.lucene.benchmark.byTa
import java.io.BufferedReader;
import java.io.File;
-import java.io.FileReader;
+import java.io.FileInputStream;
import java.io.IOException;
+import java.io.InputStreamReader;
import java.text.DateFormat;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
@@ -29,6 +30,7 @@ import java.util.Date;
import java.util.Locale;
import org.apache.lucene.benchmark.byTask.utils.Config;
+import org.apache.lucene.util.IOUtils;
/**
* A {@link ContentSource} reading from the Reuters collection.
@@ -74,7 +76,7 @@ public class ReutersContentSource extend
if (dfi == null) {
dfi = new DateFormatInfo();
// date format: 30-MAR-1987 14:22:36.87
- dfi.df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS",Locale.US);
+ dfi.df = new SimpleDateFormat("dd-MMM-yyyy kk:mm:ss.SSS",Locale.ROOT);
dfi.df.setLenient(true);
dfi.pos = new ParsePosition(0);
dateFormat.set(dfi);
@@ -112,7 +114,7 @@ public class ReutersContentSource extend
name = f.getCanonicalPath() + "_" + iteration;
}
- BufferedReader reader = new BufferedReader(new FileReader(f));
+ BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), IOUtils.CHARSET_UTF_8));
try {
// First line is the date, 3rd is the title, rest is body
String dateStr = reader.readLine();
Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecContentSource.java Thu Jul 19 15:58:54 2012
@@ -22,7 +22,6 @@ import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
-import java.io.Reader;
import java.text.DateFormat;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
@@ -33,8 +32,6 @@ import java.util.Locale;
import org.apache.lucene.benchmark.byTask.feeds.TrecDocParser.ParsePathType;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
-import org.apache.lucene.benchmark.byTask.utils.StringBuilderReader;
-import org.apache.lucene.util.ThreadInterruptedException;
/**
* Implements a {@link ContentSource} over the TREC collection.
@@ -57,7 +54,7 @@ import org.apache.lucene.util.ThreadInte
*/
public class TrecContentSource extends ContentSource {
- private static final class DateFormatInfo {
+ static final class DateFormatInfo {
DateFormat[] dfs;
ParsePosition pos;
}
@@ -83,13 +80,10 @@ public class TrecContentSource extends C
};
private ThreadLocal<DateFormatInfo> dateFormats = new ThreadLocal<DateFormatInfo>();
- private ThreadLocal<StringBuilderReader> trecDocReader = new ThreadLocal<StringBuilderReader>();
private ThreadLocal<StringBuilder> trecDocBuffer = new ThreadLocal<StringBuilder>();
private File dataDir = null;
private ArrayList<File> inputFiles = new ArrayList<File>();
private int nextFile = 0;
- private int rawDocSize = 0;
-
// Use to synchronize threads on reading from the TREC documents.
private Object lock = new Object();
@@ -108,7 +102,7 @@ public class TrecContentSource extends C
dfi = new DateFormatInfo();
dfi.dfs = new SimpleDateFormat[DATE_FORMATS.length];
for (int i = 0; i < dfi.dfs.length; i++) {
- dfi.dfs[i] = new SimpleDateFormat(DATE_FORMATS[i], Locale.US);
+ dfi.dfs[i] = new SimpleDateFormat(DATE_FORMATS[i], Locale.ROOT);
dfi.dfs[i].setLenient(true);
}
dfi.pos = new ParsePosition(0);
@@ -126,17 +120,6 @@ public class TrecContentSource extends C
return sb;
}
- Reader getTrecDocReader(StringBuilder docBuffer) {
- StringBuilderReader r = trecDocReader.get();
- if (r == null) {
- r = new StringBuilderReader(docBuffer);
- trecDocReader.set(r);
- } else {
- r.set(docBuffer);
- }
- return r;
- }
-
HTMLParser getHtmlParser() {
return htmlParser;
}
@@ -161,7 +144,7 @@ public class TrecContentSource extends C
continue;
}
- rawDocSize += line.length();
+ line.length();
if (lineStart!=null && line.startsWith(lineStart)) {
if (collectMatchLine) {
@@ -287,12 +270,8 @@ public class TrecContentSource extends C
// This code segment relies on HtmlParser being thread safe. When we get
// here, everything else is already private to that thread, so we're safe.
- try {
- docData = trecDocParser.parse(docData, name, this, docBuf, parsedPathType);
- addItem();
- } catch (InterruptedException ie) {
- throw new ThreadInterruptedException(ie);
- }
+ docData = trecDocParser.parse(docData, name, this, docBuf, parsedPathType);
+ addItem();
return docData;
}
Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecDocParser.java Thu Jul 19 15:58:54 2012
@@ -47,7 +47,7 @@ public abstract class TrecDocParser {
static final Map<String,ParsePathType> pathName2Type = new HashMap<String,ParsePathType>();
static {
for (ParsePathType ppt : ParsePathType.values()) {
- pathName2Type.put(ppt.name().toUpperCase(Locale.ENGLISH),ppt);
+ pathName2Type.put(ppt.name().toUpperCase(Locale.ROOT),ppt);
}
}
@@ -60,7 +60,7 @@ public abstract class TrecDocParser {
public static ParsePathType pathType(File f) {
int pathLength = 0;
while (f != null && ++pathLength < MAX_PATH_LENGTH) {
- ParsePathType ppt = pathName2Type.get(f.getName().toUpperCase(Locale.ENGLISH));
+ ParsePathType ppt = pathName2Type.get(f.getName().toUpperCase(Locale.ROOT));
if (ppt!=null) {
return ppt;
}
@@ -80,7 +80,7 @@ public abstract class TrecDocParser {
* parsers to alter their behavior according to the file path type.
*/
public abstract DocData parse(DocData docData, String name, TrecContentSource trecSrc,
- StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException;
+ StringBuilder docBuf, ParsePathType pathType) throws IOException;
/**
* strip tags from <code>buf</code>: each tag is replaced by a single blank.
Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFBISParser.java Thu Jul 19 15:58:54 2012
@@ -37,7 +37,7 @@ public class TrecFBISParser extends Trec
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
- StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+ StringBuilder docBuf, ParsePathType pathType) throws IOException {
int mark = 0; // that much is skipped
// optionally skip some of the text, set date, title
Date date = null;
Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFR94Parser.java Thu Jul 19 15:58:54 2012
@@ -41,7 +41,7 @@ public class TrecFR94Parser extends Trec
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
- StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+ StringBuilder docBuf, ParsePathType pathType) throws IOException {
int mark = 0; // that much is skipped
// optionally skip some of the text, set date (no title?)
Date date = null;
Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecFTParser.java Thu Jul 19 15:58:54 2012
@@ -33,7 +33,7 @@ public class TrecFTParser extends TrecDo
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
- StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+ StringBuilder docBuf, ParsePathType pathType) throws IOException {
int mark = 0; // that much is skipped
// date...
Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecGov2Parser.java Thu Jul 19 15:58:54 2012
@@ -18,7 +18,7 @@ package org.apache.lucene.benchmark.byTa
*/
import java.io.IOException;
-import java.io.Reader;
+import java.io.StringReader;
import java.util.Date;
/**
@@ -31,29 +31,24 @@ public class TrecGov2Parser extends Trec
private static final String DOCHDR = "<DOCHDR>";
private static final String TERMINATING_DOCHDR = "</DOCHDR>";
- private static final int TERMINATING_DOCHDR_LENGTH = TERMINATING_DOCHDR.length();
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
- StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
- // Set up a (per-thread) reused Reader over the read content, reset it to re-read from docBuf
- Reader r = trecSrc.getTrecDocReader(docBuf);
-
- // skip some of the text, optionally set date
+ StringBuilder docBuf, ParsePathType pathType) throws IOException {
+ // skip some of the non-html text, optionally set date
Date date = null;
- int h1 = docBuf.indexOf(DOCHDR);
- if (h1>=0) {
- int h2 = docBuf.indexOf(TERMINATING_DOCHDR,h1);
- String dateStr = extract(docBuf, DATE, DATE_END, h2, null);
+ int start = 0;
+ final int h1 = docBuf.indexOf(DOCHDR);
+ if (h1 >= 0) {
+ final int h2 = docBuf.indexOf(TERMINATING_DOCHDR, h1);
+ final String dateStr = extract(docBuf, DATE, DATE_END, h2, null);
if (dateStr != null) {
date = trecSrc.parseDate(dateStr);
}
- r.mark(h2+TERMINATING_DOCHDR_LENGTH);
+ start = h2 + TERMINATING_DOCHDR.length();
}
-
- r.reset();
- HTMLParser htmlParser = trecSrc.getHtmlParser();
- return htmlParser.parse(docData, name, date, null, r, null);
+ final String html = docBuf.substring(start);
+ return trecSrc.getHtmlParser().parse(docData, name, date, new StringReader(html), trecSrc);
}
}
Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecLATimesParser.java Thu Jul 19 15:58:54 2012
@@ -36,7 +36,7 @@ public class TrecLATimesParser extends T
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
- StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+ StringBuilder docBuf, ParsePathType pathType) throws IOException {
int mark = 0; // that much is skipped
// date...
Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/TrecParserByPath.java Thu Jul 19 15:58:54 2012
@@ -26,7 +26,7 @@ public class TrecParserByPath extends Tr
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
- StringBuilder docBuf, ParsePathType pathType) throws IOException, InterruptedException {
+ StringBuilder docBuf, ParsePathType pathType) throws IOException {
return pathType2parser.get(pathType).parse(docData, name, trecSrc, docBuf, pathType);
}
Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java Thu Jul 19 15:58:54 2012
@@ -18,6 +18,7 @@ package org.apache.lucene.benchmark.byTa
*/
import java.text.NumberFormat;
+import java.util.Locale;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
@@ -61,7 +62,7 @@ public class AddDocTask extends PerfTask
@Override
protected String getLogMessage(int recsCount) {
- return String.format("added %9d docs",recsCount);
+ return String.format(Locale.ROOT, "added %9d docs",recsCount);
}
@Override
Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java Thu Jul 19 15:58:54 2012
@@ -20,7 +20,6 @@ package org.apache.lucene.benchmark.byTa
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.codecs.Codec;
-import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexCommit;
import org.apache.lucene.index.IndexDeletionPolicy;
import org.apache.lucene.index.IndexWriter;
@@ -34,7 +33,6 @@ import org.apache.lucene.index.NoDeletio
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.index.NoMergeScheduler;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
-import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
import java.io.BufferedOutputStream;
@@ -42,6 +40,7 @@ import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
+import java.nio.charset.Charset;
/**
* Create an index. <br>
@@ -174,7 +173,7 @@ public class CreateIndexTask extends Per
return iwConf;
}
- public static IndexWriter configureWriter(Config config, PerfRunData runData, OpenMode mode, IndexCommit commit) throws CorruptIndexException, LockObtainFailedException, IOException {
+ public static IndexWriter configureWriter(Config config, PerfRunData runData, OpenMode mode, IndexCommit commit) throws IOException {
IndexWriterConfig iwc = createWriterConfig(config, runData, mode, commit);
String infoStreamVal = config.get("writer.info.stream", null);
if (infoStreamVal != null) {
@@ -184,7 +183,7 @@ public class CreateIndexTask extends Per
iwc.setInfoStream(System.err);
} else {
File f = new File(infoStreamVal).getAbsoluteFile();
- iwc.setInfoStream(new PrintStream(new BufferedOutputStream(new FileOutputStream(f))));
+ iwc.setInfoStream(new PrintStream(new BufferedOutputStream(new FileOutputStream(f)), false, Charset.defaultCharset().name()));
}
}
IndexWriter writer = new IndexWriter(runData.getDirectory(), iwc);
Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java Thu Jul 19 15:58:54 2012
@@ -17,6 +17,8 @@ package org.apache.lucene.benchmark.byTa
* limitations under the License.
*/
+import java.util.Locale;
+
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.stats.Points;
import org.apache.lucene.benchmark.byTask.stats.TaskStats;
@@ -266,7 +268,7 @@ public abstract class PerfTask implement
public void tearDown() throws Exception {
if (++logStepCount % logStep == 0) {
double time = (System.currentTimeMillis() - runData.getStartTimeMillis()) / 1000.0;
- System.out.println(String.format("%7.2f",time) + " sec --> "
+ System.out.println(String.format(Locale.ROOT, "%7.2f",time) + " sec --> "
+ Thread.currentThread().getName() + " " + getLogMessage(logStepCount));
}
}
Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithSortTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithSortTask.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithSortTask.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithSortTask.java Thu Jul 19 15:58:54 2012
@@ -77,7 +77,7 @@ public class SearchWithSortTask extends
} else {
throw new RuntimeException("You must specify the sort type ie page:int,subject:string");
}
- sortField0 = new SortField(fieldName, SortField.Type.valueOf(typeString.toUpperCase(Locale.ENGLISH)));
+ sortField0 = new SortField(fieldName, SortField.Type.valueOf(typeString.toUpperCase(Locale.ROOT)));
}
sortFields[upto++] = sortField0;
}
Modified: lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java?rev=1363400&r1=1363399&r2=1363400&view=diff
==============================================================================
--- lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java (original)
+++ lucene/dev/branches/pforcodec_3892/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java Thu Jul 19 15:58:54 2012
@@ -19,6 +19,7 @@ package org.apache.lucene.benchmark.byTa
import java.util.ArrayList;
import java.util.List;
+import java.util.Locale;
import java.text.NumberFormat;
import org.apache.lucene.benchmark.byTask.PerfRunData;
@@ -428,7 +429,7 @@ public class TaskSequence extends PerfTa
sb.append(padd);
sb.append(!letChildReport ? ">" : (parallel ? "]" : "}"));
if (fixedTime) {
- sb.append(" " + NumberFormat.getNumberInstance().format(runTimeSec) + "s");
+ sb.append(" " + NumberFormat.getNumberInstance(Locale.ROOT).format(runTimeSec) + "s");
} else if (repetitions>1) {
sb.append(" * " + repetitions);
} else if (repetitions==REPEAT_EXHAUST) {
@@ -487,7 +488,7 @@ public class TaskSequence extends PerfTa
if (rate>0) {
seqName += "_" + rate + (perMin?"/min":"/sec");
}
- if (parallel && seqName.toLowerCase().indexOf("par")<0) {
+ if (parallel && seqName.toLowerCase(Locale.ROOT).indexOf("par")<0) {
seqName += "_Par";
}
}