You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2010/06/24 14:22:51 UTC
svn commit: r957522 - in /lucene/dev/branches/branch_3x/lucene/contrib:
CHANGES.txt misc/src/java/org/apache/lucene/misc/GetTermInfo.java
misc/src/java/org/apache/lucene/misc/HighFreqTerms.java
misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java
Author: mikemccand
Date: Thu Jun 24 12:22:51 2010
New Revision: 957522
URL: http://svn.apache.org/viewvc?rev=957522&view=rev
Log:
LUCENE-2393: add total TF tracking to HighFreqTerms tool
Added:
lucene/dev/branches/branch_3x/lucene/contrib/misc/src/java/org/apache/lucene/misc/GetTermInfo.java (with props)
lucene/dev/branches/branch_3x/lucene/contrib/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java (with props)
Modified:
lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
lucene/dev/branches/branch_3x/lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java
Modified: lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt?rev=957522&r1=957521&r2=957522&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt Thu Jun 24 12:22:51 2010
@@ -169,6 +169,9 @@ New features
* LUCENE-1287: Allow usage of HyphenationCompoundWordTokenFilter without dictionary.
(Thomas Peuss via Robert Muir)
+ * LUCENE-2393: The HighFreqTerms tool (in misc) can now optionally
+ also include the total termFreq. (Tom Burton-West via Mike McCandless)
+
Build
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
Added: lucene/dev/branches/branch_3x/lucene/contrib/misc/src/java/org/apache/lucene/misc/GetTermInfo.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/misc/src/java/org/apache/lucene/misc/GetTermInfo.java?rev=957522&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/misc/src/java/org/apache/lucene/misc/GetTermInfo.java (added)
+++ lucene/dev/branches/branch_3x/lucene/contrib/misc/src/java/org/apache/lucene/misc/GetTermInfo.java Thu Jun 24 12:22:51 2010
@@ -0,0 +1,63 @@
+package org.apache.lucene.misc;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.IndexReader;
+
+
+/**
+ * Utility to get document frequency and total number of occurrences
+ * (sum of the tf for each doc) of a term.
+ * <p>
+ * Command line: <code>&lt;index dir&gt; field term</code>. The result is
+ * printed to standard output.
+ */
+public class GetTermInfo {
+
+  /**
+   * Opens the index directory named by <code>args[0]</code> and prints the
+   * statistics for the term <code>args[1]:args[2]</code>. Prints the usage
+   * message and exits with status 1 unless exactly three arguments are given.
+   *
+   * @param args index directory, field name, term text
+   * @throws Exception if the index cannot be opened or read
+   */
+  public static void main(String[] args) throws Exception {
+    if (args.length != 3) {
+      usage();
+      System.exit(1);
+    }
+
+    FSDirectory dir = FSDirectory.open(new File(args[0]));
+    try {
+      getTermInfo(dir, new Term(args[1], args[2]));
+    } finally {
+      // the directory was opened here, so it is released here
+      dir.close();
+    }
+  }
+
+  /**
+   * Prints the total term frequency and the document frequency of
+   * <code>term</code> in the index stored in <code>dir</code>.
+   *
+   * @param dir directory holding the index; it is not closed by this method
+   * @param term term to look up
+   * @throws Exception if the index cannot be opened or read
+   */
+  public static void getTermInfo(Directory dir, Term term) throws Exception {
+    IndexReader reader = IndexReader.open(dir);
+    try {
+      long totalTF = HighFreqTerms.getTotalTermFreq(reader, term);
+      System.out.printf("%s:%s \t total_tf = %,d \t doc freq = %,d \n",
+          term.field(), term.text(), totalTF, reader.docFreq(term));
+    } finally {
+      // the reader is opened by this method, so always release it
+      reader.close();
+    }
+  }
+
+  /** Prints the command line usage. */
+  private static void usage() {
+    System.out.println("\n\nusage:\n\t"
+        + "java org.apache.lucene.misc.GetTermInfo <index dir> field term \n\n");
+  }
+}
Propchange: lucene/dev/branches/branch_3x/lucene/contrib/misc/src/java/org/apache/lucene/misc/GetTermInfo.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/dev/branches/branch_3x/lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java?rev=957522&r1=957521&r2=957522&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/misc/src/java/org/apache/lucene/misc/HighFreqTerms.java Thu Jun 24 12:22:51 2010
@@ -1,99 +1,233 @@
package org.apache.lucene.misc;
/**
- * Copyright 2004 The Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.PriorityQueue;
+import java.util.Arrays;
+import java.util.Comparator;
import java.io.File;
/**
- * <code>HighFreqTerms</code> class extracts terms and their frequencies out
- * of an existing Lucene index.
+ * <code>HighFreqTerms</code> class extracts the top n most frequent terms
+ * (by document frequency) from an existing Lucene index and reports their
+ * document frequency. If used with the -t flag it also reports their
+ * total tf (total number of occurrences) in order of highest total tf.
*/
public class HighFreqTerms {
- // The top numTerms will be displayed
- public static final int numTerms = 100;
-
+ // The top numTerms will be displayed
+ public static final int DEFAULTnumTerms = 100;
+ public static int numTerms = DEFAULTnumTerms;
+
public static void main(String[] args) throws Exception {
IndexReader reader = null;
FSDirectory dir = null;
String field = null;
- if (args.length == 1) {
- dir = FSDirectory.open(new File(args[0]));
- reader = IndexReader.open(dir, true);
- } else if (args.length == 2) {
- dir = FSDirectory.open(new File(args[0]));
- reader = IndexReader.open(dir, true);
- field = args[1];
- } else {
+ boolean IncludeTermFreqs = false;
+
+ if (args.length == 0 || args.length > 4) {
usage();
System.exit(1);
- }
-
- TermInfoQueue tiq = new TermInfoQueue(numTerms);
- TermEnum terms = reader.terms();
+ }
- if (field != null) {
- while (terms.next()) {
- if (terms.term().field().equals(field)) {
- tiq.insertWithOverflow(new TermInfo(terms.term(), terms.docFreq()));
+ if (args.length > 0) {
+ dir = FSDirectory.open(new File(args[0]));
+ }
+
+ for (int i = 1; i < args.length; i++) {
+ if (args[i].equals("-t")) {
+ IncludeTermFreqs = true;
+ }
+ else{
+ try {
+ numTerms = Integer.parseInt(args[i]);
+ } catch (NumberFormatException e) {
+ field=args[i];
}
}
}
- else {
+
+
+ reader = IndexReader.open(dir, true);
+ TermStats[] terms = getHighFreqTerms(reader, numTerms, field);
+ /*
+ * Insert logic so it will only lookup totaltf if right arg
+ * also change names as in flex
+ */
+ if (!IncludeTermFreqs) {
+ //default HighFreqTerms behavior
+ for (int i = 0; i < terms.length; i++) {
+ System.out.printf("%s %,d \n",
+ terms[i].term, terms[i].docFreq);
+ }
+ } else {
+
+ TermStats[] termsWithTF = sortByTotalTermFreq(reader, terms);
+ for (int i = 0; i < termsWithTF.length; i++) {
+ System.out.printf("%s \t total_tf = %,d \t doc freq = %,d \n",
+ termsWithTF[i].term, termsWithTF[i].totalTermFreq, termsWithTF[i].docFreq);
+ }
+ }
+
+ reader.close();
+ }
+
+ private static void usage() {
+ System.out
+ .println("\n\n"
+ + "java org.apache.lucene.misc.HighFreqTerms <index dir> [-t][number_terms] [field]\n\t -t: include totalTermFreq\n\n");
+ }
+
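+ /**
+ * Collects the terms with the highest document frequency.
+ *
+ * @param reader index reader the terms are enumerated from
+ * @param numTerms maximum number of terms to return
+ * @param field field to restrict the enumeration to, or null to examine the terms of all fields
+ * @return TermStats[] ordered by terms with highest docFreq first.
+ * @throws Exception if enumerating the terms of the index fails
+ */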
+ public static TermStats[] getHighFreqTerms(IndexReader reader,
+ int numTerms, String field) throws Exception {
+
+ TermInfoWiTFQueue tiq = new TermInfoWiTFQueue(numTerms);
+ if (field != null) {
+ TermEnum terms = reader.terms(new Term(field));
+ if (terms != null && terms.term() != null) {
+ do {
+ if (!terms.term().field().equals(field)) {
+ break;
+ }
+ tiq.insertWithOverflow(new TermStats(terms.term(), terms.docFreq()));
+ } while (terms.next());
+ } else {
+ System.out.println("No terms for field \"" + field + "\"");
+ }
+ } else {
+ TermEnum terms = reader.terms();
while (terms.next()) {
- tiq.insertWithOverflow(new TermInfo(terms.term(), terms.docFreq()));
+ tiq.insertWithOverflow(new TermStats(terms.term(), terms.docFreq()));
}
}
+
+ TermStats[] result = new TermStats[tiq.size()];
+
+ // we want highest first so we read the queue and populate the array
+ // starting at the end and work backwards
+ int count = tiq.size() - 1;
while (tiq.size() != 0) {
- TermInfo termInfo = tiq.pop();
- System.out.println(termInfo.term + " " + termInfo.docFreq);
+ result[count] = tiq.pop();
+ count--;
}
-
- reader.close();
+ return result;
}
- private static void usage() {
- System.out.println(
- "\n\n"
- + "java org.apache.lucene.misc.HighFreqTerms <index dir> [field]\n\n");
+  /**
+   * Builds a new array of TermStats in which every entry also carries the
+   * total term frequency of its term, i.e. the tf summed over all documents
+   * containing the term. The input array is left untouched; the returned
+   * array is sorted so that the highest total tf comes first.
+   *
+   * @param reader index reader used to look up the term frequencies
+   * @param terms TermStats[] whose terms are looked up
+   * @return TermStats[] with totalTermFreq filled in, highest total tf first
+   * @throws Exception if reading the postings of a term fails
+   */
+  public static TermStats[] sortByTotalTermFreq(IndexReader reader, TermStats[] terms) throws Exception {
+    final TermStats[] sorted = new TermStats[terms.length];
+    int slot = 0;
+    for (TermStats stats : terms) {
+      final long total = getTotalTermFreq(reader, stats.term);
+      sorted[slot++] = new TermStats(stats.term, stats.docFreq, total);
+    }
+
+    Arrays.sort(sorted, new TotalTermFreqComparatorSortDescending());
+    return sorted;
+  }
+
+  /**
+   * Sums the within-document frequency of <code>term</code> over every
+   * document returned by the reader's TermDocs enumeration for that term.
+   *
+   * @param reader index reader to read the postings from
+   * @param term term to look up
+   * @return the summed frequency; 0 if the enumeration yields no documents
+   * @throws Exception if reading the postings fails
+   */
+  public static long getTotalTermFreq(IndexReader reader, Term term) throws Exception {
+    long totalTF = 0;
+    TermDocs td = reader.termDocs(term);
+    try {
+      while (td.next()) {
+        totalTF += td.freq();
+      }
+    } finally {
+      // release the enumeration even if iteration fails
+      td.close();
+    }
+    return totalTF;
   }
}
-final class TermInfo {
- TermInfo(Term t, int df) {
- term = t;
- docFreq = df;
+
+final class TermStats {
+ public Term term;
+ public int docFreq;
+ public long totalTermFreq;
+
+ public TermStats(Term t, int df) {
+ this.term = t;
+ this.docFreq = df;
+ }
+
+ public TermStats(Term t, int df, long tf) {
+ this.term = t;
+ this.docFreq = df;
+ this.totalTermFreq = tf;
}
- int docFreq;
- Term term;
}
-final class TermInfoQueue extends PriorityQueue<TermInfo> {
- TermInfoQueue(int size) {
+
+/**
+ * Priority queue for TermStats objects ordered by TermStats.docFreq
+ **/
+final class TermInfoWiTFQueue extends PriorityQueue<TermStats> {
+ TermInfoWiTFQueue(int size) {
initialize(size);
}
+
@Override
- protected final boolean lessThan(TermInfo termInfoA, TermInfo termInfoB) {
+ protected boolean lessThan(TermStats termInfoA,
+ TermStats termInfoB) {
return termInfoA.docFreq < termInfoB.docFreq;
}
}
+
+/**
+ * Comparator that orders TermStats by totalTermFreq, largest first.
+ *
+ * This is the reverse of the natural ordering: compare returns 1 when
+ * a.totalTermFreq is less than b.totalTermFreq, -1 when it is greater and
+ * 0 when both are equal, so sorting with it yields descending totalTermFreq.
+ */
+final class TotalTermFreqComparatorSortDescending implements Comparator<TermStats> {
+
+  public int compare(TermStats a, TermStats b) {
+    if (a.totalTermFreq == b.totalTermFreq) {
+      return 0;
+    }
+    return a.totalTermFreq < b.totalTermFreq ? 1 : -1;
+  }
+}
+
Added: lucene/dev/branches/branch_3x/lucene/contrib/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java?rev=957522&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java (added)
+++ lucene/dev/branches/branch_3x/lucene/contrib/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java Thu Jun 24 12:22:51 2010
@@ -0,0 +1,271 @@
+package org.apache.lucene.misc;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.store.MockRAMDirectory;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
+/**
+ * Tests for HighFreqTerms.getHighFreqTerms, HighFreqTerms.sortByTotalTermFreq
+ * and HighFreqTerms.getTotalTermFreq against a small in-memory index built
+ * by {@link #indexDocs(IndexWriter)}.
+ */
+public class TestHighFreqTerms extends LuceneTestCase {
+
+  private static IndexWriter writer = null;
+  private static MockRAMDirectory dir = null;
+  private static IndexReader reader = null;
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    dir = new MockRAMDirectory();
+    writer = new IndexWriter(dir, new IndexWriterConfig(
+        TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT))
+        .setMaxBufferedDocs(2));
+    indexDocs(writer);
+    reader = IndexReader.open(dir, true);
+  }
+
+  @Override
+  public void tearDown() throws Exception {
+    // Release the reader and the directory opened in setUp before the base
+    // class cleanup runs. Individual tests must not close the shared reader.
+    reader.close();
+    dir.close();
+    super.tearDown();
+  }
+
+  /******************** Tests for getHighFreqTerms **********************************/
+
+  // test without specifying field (i.e. if we pass in field=null it should examine all fields)
+  // the term "diff" in the field "different_field" occurs in 20 docs and is the highest df term
+  public static void testFirstTermHighestDocFreqAllFields() throws Exception {
+    int numTerms = 12;
+    String field = null;
+    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
+    assertEquals("Term with highest docfreq is first", 20, terms[0].docFreq);
+  }
+
+  public static void testFirstTermHighestDocFreq() throws Exception {
+    int numTerms = 12;
+    String field = "FIELD_1";
+    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
+    assertEquals("Term with highest docfreq is first", 10, terms[0].docFreq);
+  }
+
+  public static void testOrderedByDocFreqDescending() throws Exception {
+    int numTerms = 12;
+    String field = "FIELD_1";
+    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
+    for (int i = 1; i < terms.length; i++) {
+      assertTrue("out of order " + terms[i - 1].docFreq + " should be >= " + terms[i].docFreq,
+          terms[i - 1].docFreq >= terms[i].docFreq);
+    }
+  }
+
+  public static void testNumTerms() throws Exception {
+    int numTerms = 12;
+    String field = null;
+    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
+    assertEquals("length of terms array equals numTerms :" + numTerms, numTerms, terms.length);
+  }
+
+  public static void testGetHighFreqTerms() throws Exception {
+    int numTerms = 12;
+    String field = "FIELD_1";
+    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
+
+    for (int i = 0; i < terms.length; i++) {
+      String termtext = terms[i].term.text();
+      // hardcoded highTF or highTFmedDF
+      if (termtext.contains("highTF")) {
+        if (termtext.contains("medDF")) {
+          assertEquals("doc freq is not as expected", 5, terms[i].docFreq);
+        } else {
+          assertEquals("doc freq is not as expected", 1, terms[i].docFreq);
+        }
+      } else {
+        int n = Integer.parseInt(termtext);
+        assertEquals("doc freq is not as expected", getExpecteddocFreq(n),
+            terms[i].docFreq);
+      }
+    }
+  }
+
+  /******************** Tests for sortByTotalTermFreq **********************************/
+
+  public static void testFirstTermHighestTotalTermFreq() throws Exception {
+    int numTerms = 20;
+    String field = null;
+    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
+    TermStats[] termsWithTotalTermFreq = HighFreqTerms.sortByTotalTermFreq(reader, terms);
+    assertEquals("Term with highest totalTermFreq is first", 200,
+        termsWithTotalTermFreq[0].totalTermFreq);
+  }
+
+  public static void testFirstTermHighestTotalTermFreqDifferentField() throws Exception {
+    int numTerms = 20;
+    String field = "different_field";
+    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
+    TermStats[] termsWithTotalTermFreq = HighFreqTerms.sortByTotalTermFreq(reader, terms);
+    assertEquals("Term with highest totalTermFreq is first: " + termsWithTotalTermFreq[0].term.text(),
+        150, termsWithTotalTermFreq[0].totalTermFreq);
+  }
+
+  public static void testOrderedByTermFreqDescending() throws Exception {
+    int numTerms = 12;
+    String field = "FIELD_1";
+    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
+    TermStats[] termsWithTF = HighFreqTerms.sortByTotalTermFreq(reader, terms);
+
+    // check that they are sorted by descending termfreq order
+    for (int i = 1; i < termsWithTF.length; i++) {
+      assertTrue("out of order " + termsWithTF[i - 1].totalTermFreq + " should be > "
+          + termsWithTF[i].totalTermFreq,
+          termsWithTF[i - 1].totalTermFreq > termsWithTF[i].totalTermFreq);
+    }
+  }
+
+  public static void testGetTermFreqOrdered() throws Exception {
+    int numTerms = 12;
+    String field = "FIELD_1";
+    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field);
+    TermStats[] termsWithTF = HighFreqTerms.sortByTotalTermFreq(reader, terms);
+
+    for (int i = 0; i < termsWithTF.length; i++) {
+      String text = termsWithTF[i].term.text();
+      if (text.contains("highTF")) {
+        if (text.contains("medDF")) {
+          assertEquals("total term freq is not as expected", 125,
+              termsWithTF[i].totalTermFreq);
+        } else {
+          assertEquals("total term freq is not as expected", 200,
+              termsWithTF[i].totalTermFreq);
+        }
+      } else {
+        int n = Integer.parseInt(text);
+        assertEquals("doc freq is not as expected", getExpecteddocFreq(n),
+            termsWithTF[i].docFreq);
+        assertEquals("total term freq is not as expected", getExpectedtotalTermFreq(n),
+            termsWithTF[i].totalTermFreq);
+      }
+    }
+  }
+
+  /******************** Tests for getTotalTermFreq **********************************/
+
+  public static void testGetTotalTermFreq() throws Exception {
+    String termtext = "highTF";
+    String field = "FIELD_1";
+    Term term = new Term(field, termtext);
+    long totalTermFreq = HighFreqTerms.getTotalTermFreq(reader, term);
+    assertEquals("highTf tf should be 200", 200, totalTermFreq);
+  }
+
+  public static void testGetTotalTermFreqBadTerm() throws Exception {
+    String termtext = "foobar";
+    String field = "FIELD_1";
+    Term term = new Term(field, termtext);
+    long totalTermFreq = HighFreqTerms.getTotalTermFreq(reader, term);
+    assertEquals("totalTermFreq should be 0 for term not in index", 0, totalTermFreq);
+  }
+
+  /******************** Testing Utils **********************************/
+
+  /**
+   * Populates the index and closes the writer. The index contains:
+   * <ul>
+   * <li>10 docs whose FIELD_1 holds the numeric terms produced by getContent,
+   * so that term n has a docFreq of n and a totalTermFreq of n*n; each of
+   * these docs also has the term "diff" in different_field</li>
+   * <li>10 docs holding only "diff" in different_field</li>
+   * <li>1 doc with "highTF" 200 times in FIELD_1</li>
+   * <li>5 docs with "highTFmedDF" 25 times each in FIELD_1</li>
+   * <li>1 doc with "TF150" 150 times in different_field</li>
+   * </ul>
+   */
+  private static void indexDocs(IndexWriter writer) throws Exception {
+
+    for (int i = 1; i <= 10; i++) {
+      Document doc = new Document();
+      String content = getContent(i);
+
+      doc.add(new Field("FIELD_1", content, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
+      //add a different field
+      doc.add(new Field("different_field", "diff", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
+      writer.addDocument(doc);
+    }
+
+    //add 10 more docs with the term "diff" this will make it have the highest docFreq if we don't ask for the
+    //highest freq terms for a specific field.
+    for (int i = 1; i <= 10; i++) {
+      Document doc = new Document();
+      doc.add(new Field("different_field", "diff", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
+      writer.addDocument(doc);
+    }
+
+    // add some docs where the doc-freq order and the total-tf order differ, so we can see if sorting works
+    // highTF low df
+    int highTF = 200;
+    Document doc = new Document();
+    doc.add(new Field("FIELD_1", repeat("highTF", highTF), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
+    writer.addDocument(doc);
+
+    // highTF medium df =5
+    int medium_df = 5;
+    int tf = 25;
+    String newcontent = repeat("highTFmedDF", tf);
+    for (int i = 0; i < medium_df; i++) {
+      Document newdoc = new Document();
+      newdoc.add(new Field("FIELD_1", newcontent, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
+      writer.addDocument(newdoc);
+    }
+
+    // add a doc with high tf in field different_field
+    int targetTF = 150;
+    doc = new Document();
+    doc.add(new Field("different_field", repeat("TF150", targetTF), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));
+    writer.addDocument(doc);
+    writer.close();
+  }
+
+  /**
+   * Returns token repeated the given number of times, each occurrence
+   * followed by a single space.
+   */
+  private static String repeat(String token, int times) {
+    StringBuilder sb = new StringBuilder();
+    for (int i = 0; i < times; i++) {
+      sb.append(token).append(' ');
+    }
+    return sb.toString();
+  }
+
+  /**
+   * Returns a string containing the numbers 10 down to i, with each number n
+   * occurring n times and every occurrence followed by a space.
+   * i.e. for input of 9 the string is "10" ten times followed by "9" nine times.
+   */
+  private static String getContent(int i) {
+    StringBuilder sb = new StringBuilder();
+    for (int j = 10; j >= i; j--) {
+      for (int k = 0; k < j; k++) {
+        // if j is 3 we append "3 3 3 "
+        sb.append(j).append(' ');
+      }
+    }
+    return sb.toString();
+  }
+
+  private static int getExpectedtotalTermFreq(int i) {
+    return getExpecteddocFreq(i) * i;
+  }
+
+  private static int getExpecteddocFreq(int i) {
+    return i;
+  }
+}
Propchange: lucene/dev/branches/branch_3x/lucene/contrib/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java
------------------------------------------------------------------------------
svn:eol-style = native