You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ds...@apache.org on 2016/10/07 13:59:01 UTC
lucene-solr:branch_6x: LUCENE-7438: Renovate benchmark module's
support for highlighting
Repository: lucene-solr
Updated Branches:
refs/heads/branch_6x 9ee556946 -> 3497a2902
LUCENE-7438: Renovate benchmark module's support for highlighting
(cherry picked from commit 5ef60af)
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/3497a290
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/3497a290
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/3497a290
Branch: refs/heads/branch_6x
Commit: 3497a2902c198b8092b4b0352650e58543b296b5
Parents: 9ee5569
Author: David Smiley <ds...@apache.org>
Authored: Fri Oct 7 09:57:11 2016 -0400
Committer: David Smiley <ds...@apache.org>
Committed: Fri Oct 7 09:58:56 2016 -0400
----------------------------------------------------------------------
build.xml | 2 +
lucene/CHANGES.txt | 3 +
lucene/benchmark/.gitignore | 4 +-
lucene/benchmark/README.enwiki | 11 +-
lucene/benchmark/conf/highlight-profile.alg | 68 -----
.../conf/highlight-vs-vector-highlight.alg | 80 ------
lucene/benchmark/conf/highlighters-postings.alg | 65 +++++
lucene/benchmark/conf/highlighters-tv.alg | 64 +++++
lucene/benchmark/conf/highlights.alg | 69 +++++
lucene/benchmark/conf/query-phrases.txt | 10 +
lucene/benchmark/conf/query-terms.txt | 10 +
lucene/benchmark/conf/query-wildcards.txt | 7 +
.../benchmark/conf/standard-highlights-notv.alg | 69 -----
.../benchmark/conf/standard-highlights-tv.alg | 69 -----
.../benchmark/conf/vector-highlight-profile.alg | 68 -----
.../lucene/benchmark/byTask/PerfRunData.java | 2 +
.../lucene/benchmark/byTask/feeds/DocMaker.java | 7 +
.../byTask/tasks/BenchmarkHighlighter.java | 30 --
.../lucene/benchmark/byTask/tasks/ReadTask.java | 105 +++----
.../tasks/SearchTravRetHighlightTask.java | 283 ++++++++++++++-----
.../tasks/SearchTravRetVectorHighlightTask.java | 147 ----------
.../benchmark/byTask/TestPerfTasksLogic.java | 106 -------
.../tasks/CountingHighlighterTestTask.java | 68 -----
23 files changed, 499 insertions(+), 848 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3497a290/build.xml
----------------------------------------------------------------------
diff --git a/build.xml b/build.xml
index 247ba60..0d25615 100644
--- a/build.xml
+++ b/build.xml
@@ -192,6 +192,8 @@
// excludes:
exclude(name: '**/build/**')
exclude(name: '**/dist/**')
+ exclude(name: 'lucene/benchmark/work/**')
+ exclude(name: 'lucene/benchmark/temp/**')
exclude(name: '**/CheckLoggingConfiguration.java')
exclude(name: 'build.xml') // ourselves :-)
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3497a290/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 591e3d2..d6372b3 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -38,6 +38,9 @@ Other
* LUCENE-7452: Block join query exception suggests how to find a doc, which
violates orthogonality requirement. (Mikhail Khludnev)
+* LUCENE-7438: Renovate the Benchmark module's support for benchmarking highlighting. All
+ highlighters are supported via SearchTravRetHighlight. (David Smiley)
+
Build
* LUCENE-7292: Fix build to use "--release 8" instead of "-release 8" on
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3497a290/lucene/benchmark/.gitignore
----------------------------------------------------------------------
diff --git a/lucene/benchmark/.gitignore b/lucene/benchmark/.gitignore
index 6cac9b7..a20524a 100644
--- a/lucene/benchmark/.gitignore
+++ b/lucene/benchmark/.gitignore
@@ -1,2 +1,2 @@
-temp/
-work/
\ No newline at end of file
+/temp
+/work
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3497a290/lucene/benchmark/README.enwiki
----------------------------------------------------------------------
diff --git a/lucene/benchmark/README.enwiki b/lucene/benchmark/README.enwiki
index f9d4930..7ad07a8 100644
--- a/lucene/benchmark/README.enwiki
+++ b/lucene/benchmark/README.enwiki
@@ -13,10 +13,13 @@ writing, there is a page file in
http://download.wikimedia.org/enwiki/20070402/. You can download this
file manually and put it in temp. Note that the file you download will
probably have the date in the name, e.g.,
-http://download.wikimedia.org/enwiki/20070402/enwiki-20070402-pages-articles.xml.bz2. When
-you put it in temp, rename it to enwiki-latest-pages-articles.xml.bz2.
+http://download.wikimedia.org/enwiki/20070402/enwiki-20070402-pages-articles.xml.bz2.
+
+If you use the EnwikiContentSource then the data will be decompressed on the fly
+during the benchmark. If you want to benchmark indexing, you should probably decompress
+it beforehand using the "enwiki" Ant target which will produce a work/enwiki.txt, after
+which you can use LineDocSource in your benchmark.
After that, ant enwiki should process the data set and run a load
-test. Ant targets get-enwiki, expand-enwiki, and extract-enwiki can
-also be used to download, decompress, and extract (to individual files
+test. The Ant target enwiki will download, decompress, and extract (to individual files
in work/enwiki) the dataset.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3497a290/lucene/benchmark/conf/highlight-profile.alg
----------------------------------------------------------------------
diff --git a/lucene/benchmark/conf/highlight-profile.alg b/lucene/benchmark/conf/highlight-profile.alg
deleted file mode 100644
index b62644c..0000000
--- a/lucene/benchmark/conf/highlight-profile.alg
+++ /dev/null
@@ -1,68 +0,0 @@
-#/**
-# * Licensed to the Apache Software Foundation (ASF) under one or more
-# * contributor license agreements. See the NOTICE file distributed with
-# * this work for additional information regarding copyright ownership.
-# * The ASF licenses this file to You under the Apache License, Version 2.0
-# * (the "License"); you may not use this file except in compliance with
-# * the License. You may obtain a copy of the License at
-# *
-# * http://www.apache.org/licenses/LICENSE-2.0
-# *
-# * Unless required by applicable law or agreed to in writing, software
-# * distributed under the License is distributed on an "AS IS" BASIS,
-# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# * See the License for the specific language governing permissions and
-# * limitations under the License.
-# */
-# -------------------------------------------------------------------------------------
-# multi val params are iterated by NewRound's, added to reports, start with column name.
-
-ram.flush.mb=flush:32:32
-compound=cmpnd:true:false
-
-analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
-directory=FSDirectory
-
-doc.stored=true
-doc.tokenized=true
-doc.term.vector=true
-doc.term.vector.offsets=true
-doc.term.vector.positions=true
-log.step=2000
-
-docs.dir=reuters-out
-
-content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
-
-query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
-
-# task at this depth or less would print when they start
-task.max.depth.log=2
-
-log.queries=true
-# -------------------------------------------------------------------------------------
-{ "Populate"
- CreateIndex
- { "MAddDocs" AddDoc } : 20000
- ForceMerge(1)
- CloseIndex
- }
-{ "Rounds"
-
- ResetSystemSoft
-
-
- OpenReader
- { "SearchHlgtSameRdr" SearchTravRetHighlight(maxFrags[10],fields[body]) > : 1000
-
- CloseReader
-
- RepSumByPref MAddDocs
-
- NewRound
-
-} : 4
-
-RepSumByNameRound
-RepSumByName
-RepSumByPrefRound MAddDocs
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3497a290/lucene/benchmark/conf/highlight-vs-vector-highlight.alg
----------------------------------------------------------------------
diff --git a/lucene/benchmark/conf/highlight-vs-vector-highlight.alg b/lucene/benchmark/conf/highlight-vs-vector-highlight.alg
deleted file mode 100644
index cc4382d..0000000
--- a/lucene/benchmark/conf/highlight-vs-vector-highlight.alg
+++ /dev/null
@@ -1,80 +0,0 @@
-#/**
-# * Licensed to the Apache Software Foundation (ASF) under one or more
-# * contributor license agreements. See the NOTICE file distributed with
-# * this work for additional information regarding copyright ownership.
-# * The ASF licenses this file to You under the Apache License, Version 2.0
-# * (the "License"); you may not use this file except in compliance with
-# * the License. You may obtain a copy of the License at
-# *
-# * http://www.apache.org/licenses/LICENSE-2.0
-# *
-# * Unless required by applicable law or agreed to in writing, software
-# * distributed under the License is distributed on an "AS IS" BASIS,
-# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# * See the License for the specific language governing permissions and
-# * limitations under the License.
-# */
-# -------------------------------------------------------------------------------------
-
-ram.flush.mb=flush:32:32
-compound=cmpnd:true:false
-
-analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
-directory=FSDirectory
-
-doc.stored=true
-doc.tokenized=true
-doc.term.vector=true
-doc.term.vector.offsets=true
-doc.term.vector.positions=true
-log.step=2000
-
-docs.dir=reuters-out
-
-content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource
-docs.file=temp/enwiki-20070527-pages-articles.xml
-
-query.maker=org.apache.lucene.benchmark.byTask.feeds.EnwikiQueryMaker
-enwikiQueryMaker.disableSpanQueries=true
-
-max.field.length=2147483647
-highlighter.maxDocCharsToAnalyze=2147483647
-
-# task at this depth or less would print when they start
-task.max.depth.log=2
-
-log.queries=true
-# -------------------------------------------------------------------------------------
-{ "Populate"
- CreateIndex
- { "MAddDocs" AddDoc } : 20000
- ForceMerge(1)
- CloseIndex
-}
-{
- OpenReader
- { "WarmTV" SearchTravRetVectorHighlight(maxFrags[3],fields[body]) > : 100
- CloseReader
-}
-{
- "Rounds"
-
- ResetSystemSoft
-
- OpenReader
- { "SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(maxFrags[3],fields[body]) > : 200
- CloseReader
-
- ResetSystemSoft
-
- OpenReader
- { "SearchHlgtSameRdr" SearchTravRetHighlight(maxFrags[3],fields[body]) > : 200
- CloseReader
-
- RepSumByPref Search
-
- NewRound
-} : 4
-
-RepSumByNameRound
-RepSumByName
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3497a290/lucene/benchmark/conf/highlighters-postings.alg
----------------------------------------------------------------------
diff --git a/lucene/benchmark/conf/highlighters-postings.alg b/lucene/benchmark/conf/highlighters-postings.alg
new file mode 100644
index 0000000..cf9df11
--- /dev/null
+++ b/lucene/benchmark/conf/highlighters-postings.alg
@@ -0,0 +1,65 @@
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements. See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License. You may obtain a copy of the License at
+# *
+# * http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+
+# For postings-offsets with light term-vectors
+
+analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
+directory=FSDirectory
+work.dir=work/enwikiPostings
+ram.flush.mb=64
+compound=false
+
+doc.stored=true
+doc.tokenized=true
+# offsets in postings:
+doc.body.offsets=true
+# term vector, but no positions/offsets with it
+doc.term.vector=true
+
+content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource
+docs.file=temp/enwiki-20070527-pages-articles.xml.bz2
+
+query.maker=org.apache.lucene.benchmark.byTask.feeds.FileBasedQueryMaker
+file.query.maker.file=conf/query-phrases.txt
+log.queries=false
+log.step.SearchTravRetHighlight=-1
+
+highlighter=HlImpl:NONE:SH_A:UH_A:PH_P:UH_P:UH_PV
+
+{ "Populate"
+ CreateIndex
+ [{ "MAddDocs" AddDoc > : 50000] : 4
+ CloseIndex
+ } : 0
+
+{
+ "Rounds"
+
+ ResetSystemSoft
+
+ OpenReader
+
+ { "Warm" SearchTravRetHighlight > : 1000
+
+ { "HL" SearchTravRetHighlight > : 500
+
+ CloseReader
+
+ NewRound
+} : 6
+
+RepSumByPrefRound HL
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3497a290/lucene/benchmark/conf/highlighters-tv.alg
----------------------------------------------------------------------
diff --git a/lucene/benchmark/conf/highlighters-tv.alg b/lucene/benchmark/conf/highlighters-tv.alg
new file mode 100644
index 0000000..1e51018
--- /dev/null
+++ b/lucene/benchmark/conf/highlighters-tv.alg
@@ -0,0 +1,64 @@
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements. See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License. You may obtain a copy of the License at
+# *
+# * http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+
+# This is a full-term vector configuration.
+
+analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
+directory=FSDirectory
+work.dir=work/enwikiTermVec
+ram.flush.mb=64
+compound=false
+
+doc.stored=true
+doc.tokenized=true
+doc.term.vector=true
+doc.term.vector.positions=true
+doc.term.vector.offsets=true
+
+content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource
+docs.file=temp/enwiki-20070527-pages-articles.xml.bz2
+
+query.maker=org.apache.lucene.benchmark.byTask.feeds.FileBasedQueryMaker
+file.query.maker.file=conf/query-terms.txt
+log.queries=false
+log.step.SearchTravRetHighlight=-1
+
+highlighter=HlImpl:NONE:SH_V:FVH_V:UH_V
+
+{ "Populate"
+ CreateIndex
+ [{ "MAddDocs" AddDoc > : 50000] : 4
+ CloseIndex
+ } : 0
+
+{
+ "Rounds"
+
+ ResetSystemSoft
+
+ OpenReader
+
+ { "Warm" SearchTravRetHighlight > : 1000
+
+ { "HL" SearchTravRetHighlight > : 500
+
+ CloseReader
+
+ NewRound
+} : 4
+
+RepSumByPrefRound HL
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3497a290/lucene/benchmark/conf/highlights.alg
----------------------------------------------------------------------
diff --git a/lucene/benchmark/conf/highlights.alg b/lucene/benchmark/conf/highlights.alg
new file mode 100644
index 0000000..88b056e
--- /dev/null
+++ b/lucene/benchmark/conf/highlights.alg
@@ -0,0 +1,69 @@
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements. See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License. You may obtain a copy of the License at
+# *
+# * http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+# -------------------------------------------------------------------------------------
+# multi val params are iterated by NewRound's, added to reports, start with column name.
+
+ram.flush.mb=flush:32:32
+compound=cmpnd:true:false
+
+analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
+directory=FSDirectory
+
+doc.stored=true
+doc.tokenized=true
+doc.term.vector=false
+doc.term.vector.offsets=false
+doc.term.vector.positions=false
+log.step=2000
+
+docs.dir=reuters-out
+
+content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
+
+query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
+
+# task at this depth or less would print when they start
+task.max.depth.log=2
+
+log.queries=true
+# -------------------------------------------------------------------------------------
+{ "Populate"
+ CreateIndex
+ { "MAddDocs" AddDoc } : 20000
+ ForceMerge(1)
+ CloseIndex
+}
+{ "Rounds"
+
+ ResetSystemSoft
+ OpenReader
+ { "SrchTrvRetNewRdr" SearchTravRet(10) > : 1000
+ CloseReader
+ OpenReader
+ { "SearchHlgtSameRdr" SearchTravRetHighlight(type[UH]) > : 1000
+
+ CloseReader
+
+ RepSumByPref SearchHlgtSameRdr
+
+ NewRound
+
+} : 2
+
+RepSumByNameRound
+RepSumByName
+RepSumByPrefRound MAddDocs
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3497a290/lucene/benchmark/conf/query-phrases.txt
----------------------------------------------------------------------
diff --git a/lucene/benchmark/conf/query-phrases.txt b/lucene/benchmark/conf/query-phrases.txt
new file mode 100644
index 0000000..b479663
--- /dev/null
+++ b/lucene/benchmark/conf/query-phrases.txt
@@ -0,0 +1,10 @@
+"Abraham Lincoln"
+"Union Wisconsin"
+"court of law"
+"Field Theory" OR "Set Theory"
+"Top 100"
+"red hot chili"
+"greatest guitarists"
+"Planes, Trains & Automobiles" OR ships
+"international airport"
+"Xbox 360"
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3497a290/lucene/benchmark/conf/query-terms.txt
----------------------------------------------------------------------
diff --git a/lucene/benchmark/conf/query-terms.txt b/lucene/benchmark/conf/query-terms.txt
new file mode 100644
index 0000000..c57bace
--- /dev/null
+++ b/lucene/benchmark/conf/query-terms.txt
@@ -0,0 +1,10 @@
+Abraham AND Lincoln
+Union AND Wisconsin
+court AND law
+top AND 100
+(field OR set) AND theory
+red AND hot AND chili
+greatest AND guitarists
+(planes AND trains AND automobiles) OR ships
+international AND airport
+xbox AND 360
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3497a290/lucene/benchmark/conf/query-wildcards.txt
----------------------------------------------------------------------
diff --git a/lucene/benchmark/conf/query-wildcards.txt b/lucene/benchmark/conf/query-wildcards.txt
new file mode 100644
index 0000000..06685c6
--- /dev/null
+++ b/lucene/benchmark/conf/query-wildcards.txt
@@ -0,0 +1,7 @@
+abrah* AND linc*
+court* AND law*
+(field OR set) AND theor*
+red AND hot AND chili*
+great* AND guitar*
+(plan* AND train* AND automob*) OR ship*
+international AND airport*
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3497a290/lucene/benchmark/conf/standard-highlights-notv.alg
----------------------------------------------------------------------
diff --git a/lucene/benchmark/conf/standard-highlights-notv.alg b/lucene/benchmark/conf/standard-highlights-notv.alg
deleted file mode 100644
index 040e1ef..0000000
--- a/lucene/benchmark/conf/standard-highlights-notv.alg
+++ /dev/null
@@ -1,69 +0,0 @@
-#/**
-# * Licensed to the Apache Software Foundation (ASF) under one or more
-# * contributor license agreements. See the NOTICE file distributed with
-# * this work for additional information regarding copyright ownership.
-# * The ASF licenses this file to You under the Apache License, Version 2.0
-# * (the "License"); you may not use this file except in compliance with
-# * the License. You may obtain a copy of the License at
-# *
-# * http://www.apache.org/licenses/LICENSE-2.0
-# *
-# * Unless required by applicable law or agreed to in writing, software
-# * distributed under the License is distributed on an "AS IS" BASIS,
-# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# * See the License for the specific language governing permissions and
-# * limitations under the License.
-# */
-# -------------------------------------------------------------------------------------
-# multi val params are iterated by NewRound's, added to reports, start with column name.
-
-ram.flush.mb=flush:32:32
-compound=cmpnd:true:false
-
-analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
-directory=FSDirectory
-
-doc.stored=true
-doc.tokenized=true
-doc.term.vector=false
-doc.term.vector.offsets=false
-doc.term.vector.positions=false
-log.step=2000
-
-docs.dir=reuters-out
-
-content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
-
-query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
-
-# task at this depth or less would print when they start
-task.max.depth.log=2
-
-log.queries=true
-# -------------------------------------------------------------------------------------
-{ "Populate"
- CreateIndex
- { "MAddDocs" AddDoc } : 20000
- ForceMerge(1)
- CloseIndex
-}
-{ "Rounds"
-
- ResetSystemSoft
- OpenReader
- { "SrchTrvRetNewRdr" SearchTravRet(10) > : 1000
- CloseReader
- OpenReader
- { "SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000
-
- CloseReader
-
- RepSumByPref SearchHlgtSameRdr
-
- NewRound
-
-} : 2
-
-RepSumByNameRound
-RepSumByName
-RepSumByPrefRound MAddDocs
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3497a290/lucene/benchmark/conf/standard-highlights-tv.alg
----------------------------------------------------------------------
diff --git a/lucene/benchmark/conf/standard-highlights-tv.alg b/lucene/benchmark/conf/standard-highlights-tv.alg
deleted file mode 100644
index 3cd18b8..0000000
--- a/lucene/benchmark/conf/standard-highlights-tv.alg
+++ /dev/null
@@ -1,69 +0,0 @@
-#/**
-# * Licensed to the Apache Software Foundation (ASF) under one or more
-# * contributor license agreements. See the NOTICE file distributed with
-# * this work for additional information regarding copyright ownership.
-# * The ASF licenses this file to You under the Apache License, Version 2.0
-# * (the "License"); you may not use this file except in compliance with
-# * the License. You may obtain a copy of the License at
-# *
-# * http://www.apache.org/licenses/LICENSE-2.0
-# *
-# * Unless required by applicable law or agreed to in writing, software
-# * distributed under the License is distributed on an "AS IS" BASIS,
-# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# * See the License for the specific language governing permissions and
-# * limitations under the License.
-# */
-# -------------------------------------------------------------------------------------
-# multi val params are iterated by NewRound's, added to reports, start with column name.
-
-ram.flush.mb=flush:32:32
-compound=cmpnd:true:false
-
-analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
-directory=FSDirectory
-
-doc.stored=true
-doc.tokenized=true
-doc.term.vector=true
-doc.term.vector.offsets=true
-doc.term.vector.positions=true
-log.step=2000
-
-docs.dir=reuters-out
-
-content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
-
-query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
-
-# task at this depth or less would print when they start
-task.max.depth.log=2
-
-log.queries=true
-# -------------------------------------------------------------------------------------
-{ "Populate"
- CreateIndex
- { "MAddDocs" AddDoc } : 20000
- ForceMerge(1)
- CloseIndex
-}
-{ "Rounds"
-
- ResetSystemSoft
- OpenReader
- { "SrchTrvRetNewRdr" SearchTravRet(10) > : 1000
- CloseReader
- OpenReader
- { "SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000
-
- CloseReader
-
- RepSumByPref SearchHlgtSameRdr
-
- NewRound
-
-} : 2
-
-RepSumByNameRound
-RepSumByName
-RepSumByPrefRound MAddDocs
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3497a290/lucene/benchmark/conf/vector-highlight-profile.alg
----------------------------------------------------------------------
diff --git a/lucene/benchmark/conf/vector-highlight-profile.alg b/lucene/benchmark/conf/vector-highlight-profile.alg
deleted file mode 100644
index 4348783..0000000
--- a/lucene/benchmark/conf/vector-highlight-profile.alg
+++ /dev/null
@@ -1,68 +0,0 @@
-#/**
-# * Licensed to the Apache Software Foundation (ASF) under one or more
-# * contributor license agreements. See the NOTICE file distributed with
-# * this work for additional information regarding copyright ownership.
-# * The ASF licenses this file to You under the Apache License, Version 2.0
-# * (the "License"); you may not use this file except in compliance with
-# * the License. You may obtain a copy of the License at
-# *
-# * http://www.apache.org/licenses/LICENSE-2.0
-# *
-# * Unless required by applicable law or agreed to in writing, software
-# * distributed under the License is distributed on an "AS IS" BASIS,
-# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# * See the License for the specific language governing permissions and
-# * limitations under the License.
-# */
-# -------------------------------------------------------------------------------------
-# multi val params are iterated by NewRound's, added to reports, start with column name.
-
-ram.flush.mb=flush:32:32
-compound=cmpnd:true:false
-
-analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
-directory=FSDirectory
-
-doc.stored=true
-doc.tokenized=true
-doc.term.vector=true
-doc.term.vector.offsets=true
-doc.term.vector.positions=true
-log.step=2000
-
-docs.dir=reuters-out
-
-content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
-
-query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker
-
-# task at this depth or less would print when they start
-task.max.depth.log=2
-
-log.queries=true
-# -------------------------------------------------------------------------------------
-{ "Populate"
- CreateIndex
- { "MAddDocs" AddDoc } : 20000
- ForceMerge(1)
- CloseIndex
- }
-{ "Rounds"
-
- ResetSystemSoft
-
-
- OpenReader
- { "SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(maxFrags[10],fields[body]) > : 1000
-
- CloseReader
-
- RepSumByPref MAddDocs
-
- NewRound
-
-} : 4
-
-RepSumByNameRound
-RepSumByName
-RepSumByPrefRound MAddDocs
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3497a290/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java
----------------------------------------------------------------------
diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java
index 1d4b643..a08b79e 100644
--- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java
+++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/PerfRunData.java
@@ -349,6 +349,8 @@ public class PerfRunData implements Closeable {
// Hold reference to new IR
indexReader.incRef();
indexSearcher = new IndexSearcher(indexReader);
+ // TODO Some day we should make the query cache in this module configurable and control clearing the cache
+ indexSearcher.setQueryCache(null);
} else {
indexSearcher = null;
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3497a290/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java
----------------------------------------------------------------------
diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java
index 4afafc3..2c722a7 100644
--- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java
+++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java
@@ -43,6 +43,7 @@ import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexOptions;
/**
* Creates {@link Document} objects. Uses a {@link ContentSource} to generate
@@ -58,6 +59,8 @@ import org.apache.lucene.document.TextField;
* (default <b>true</b>).
* <li><b>doc.body.tokenized</b> - specifies whether the
* body field should be tokenized (default = <b>doc.tokenized</b>).
+ * <li><b>doc.body.offsets</b> - specifies whether to add offsets into the postings index
+ * for the body field. It is useful for highlighting. (default <b>false</b>)
* <li><b>doc.tokenized.norms</b> - specifies whether norms should be stored in
* the index or not. (default <b>false</b>).
* <li><b>doc.body.tokenized.norms</b> - specifies whether norms should be
@@ -424,6 +427,7 @@ public class DocMaker implements Closeable {
boolean bodyTokenized = config.get("doc.body.tokenized", tokenized);
boolean norms = config.get("doc.tokenized.norms", false);
boolean bodyNorms = config.get("doc.body.tokenized.norms", true);
+ boolean bodyOffsets = config.get("doc.body.offsets", false);
boolean termVec = config.get("doc.term.vector", false);
boolean termVecPositions = config.get("doc.term.vector.positions", false);
boolean termVecOffsets = config.get("doc.term.vector.offsets", false);
@@ -441,6 +445,9 @@ public class DocMaker implements Closeable {
bodyValType.setStored(bodyStored);
bodyValType.setTokenized(bodyTokenized);
bodyValType.setOmitNorms(!bodyNorms);
+ if (bodyTokenized && bodyOffsets) {
+ bodyValType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+ }
bodyValType.setStoreTermVectors(termVec);
bodyValType.setStoreTermVectorPositions(termVecPositions);
bodyValType.setStoreTermVectorOffsets(termVecOffsets);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3497a290/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/BenchmarkHighlighter.java
----------------------------------------------------------------------
diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/BenchmarkHighlighter.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/BenchmarkHighlighter.java
deleted file mode 100644
index e00cc38..0000000
--- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/BenchmarkHighlighter.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.benchmark.byTask.tasks;
-
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.index.IndexReader;
-
-/**
- * Abstract class for benchmarking highlighting performance
- */
-public abstract class BenchmarkHighlighter {
- public abstract int doHighlight( IndexReader reader, int doc, String field,
- Document document, Analyzer analyzer, String text ) throws Exception ;
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3497a290/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java
----------------------------------------------------------------------
diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java
index 59ee9f9..b1ae112 100644
--- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java
+++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java
@@ -75,7 +75,7 @@ public abstract class ReadTask extends PerfTask {
int res = 0;
// open reader or use existing one
- IndexSearcher searcher = getRunData().getIndexSearcher();
+ IndexSearcher searcher = getRunData().getIndexSearcher(); // (will incRef the reader)
IndexReader reader;
@@ -132,46 +132,20 @@ public abstract class ReadTask extends PerfTask {
//hits = collector.topDocs();
}
- final String printHitsField = getRunData().getConfig().get("print.hits.field", null);
- if (hits != null && printHitsField != null && printHitsField.length() > 0) {
- System.out.println("totalHits = " + hits.totalHits);
- System.out.println("maxDoc() = " + reader.maxDoc());
- System.out.println("numDocs() = " + reader.numDocs());
- for(int i=0;i<hits.scoreDocs.length;i++) {
- final int docID = hits.scoreDocs[i].doc;
- final Document doc = reader.document(docID);
- System.out.println(" " + i + ": doc=" + docID + " score=" + hits.scoreDocs[i].score + " " + printHitsField + " =" + doc.get(printHitsField));
- }
- }
-
- if (withTraverse()) {
- final ScoreDoc[] scoreDocs = hits.scoreDocs;
- int traversalSize = Math.min(scoreDocs.length, traversalSize());
-
- if (traversalSize > 0) {
- boolean retrieve = withRetrieve();
- int numHighlight = Math.min(numToHighlight(), scoreDocs.length);
- Analyzer analyzer = getRunData().getAnalyzer();
- BenchmarkHighlighter highlighter = null;
- if (numHighlight > 0) {
- highlighter = getBenchmarkHighlighter(q);
- }
- for (int m = 0; m < traversalSize; m++) {
- int id = scoreDocs[m].doc;
- res++;
- if (retrieve) {
- Document document = retrieveDoc(reader, id);
- res += document != null ? 1 : 0;
- if (numHighlight > 0 && m < numHighlight) {
- Collection<String> fieldsToHighlight = getFieldsToHighlight(document);
- for (final String field : fieldsToHighlight) {
- String text = document.get(field);
- res += highlighter.doHighlight(reader, id, field, document, analyzer, text);
- }
- }
- }
+ if (hits != null) {
+ final String printHitsField = getRunData().getConfig().get("print.hits.field", null);
+ if (printHitsField != null && printHitsField.length() > 0) {
+ System.out.println("totalHits = " + hits.totalHits);
+ System.out.println("maxDoc() = " + reader.maxDoc());
+ System.out.println("numDocs() = " + reader.numDocs());
+ for(int i=0;i<hits.scoreDocs.length;i++) {
+ final int docID = hits.scoreDocs[i].doc;
+ final Document doc = reader.document(docID);
+ System.out.println(" " + i + ": doc=" + docID + " score=" + hits.scoreDocs[i].score + " " + printHitsField + " =" + doc.get(printHitsField));
}
}
+
+ res += withTopDocs(searcher, q, hits);
}
}
}
@@ -185,6 +159,28 @@ public abstract class ReadTask extends PerfTask {
return res;
}
+ protected int withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
+ IndexReader reader = searcher.getIndexReader();
+ int res = 0;
+ if (withTraverse()) {
+ final ScoreDoc[] scoreDocs = hits.scoreDocs;
+ int traversalSize = Math.min(scoreDocs.length, traversalSize());
+
+ if (traversalSize > 0) {
+ boolean retrieve = withRetrieve();
+ for (int m = 0; m < traversalSize; m++) {
+ int id = scoreDocs[m].doc;
+ res++;
+ if (retrieve) {
+ Document document = retrieveDoc(reader, id);
+ res += document != null ? 1 : 0;
+ }
+ }
+ }
+ }
+ return res;
+ }
+
protected Collector createCollector() throws Exception {
return TopScoreDocCollector.create(numHits());
}
@@ -267,39 +263,8 @@ public abstract class ReadTask extends PerfTask {
*/
public abstract boolean withRetrieve();
- /**
- * Set to the number of documents to highlight.
- *
- * @return The number of the results to highlight. O means no docs will be highlighted.
- */
- public int numToHighlight() {
- return 0;
- }
-
- /**
- * Return an appropriate highlighter to be used with
- * highlighting tasks
- */
- protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){
- return null;
- }
-
protected Sort getSort() {
return null;
}
- /**
- * Define the fields to highlight. Base implementation returns all fields
- * @param document The Document
- * @return A Collection of Field names (Strings)
- */
- protected Collection<String> getFieldsToHighlight(Document document) {
- List<IndexableField> fields = document.getFields();
- Set<String> result = new HashSet<>(fields.size());
- for (final IndexableField f : fields) {
- result.add(f.name());
- }
- return result;
- }
-
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3497a290/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java
----------------------------------------------------------------------
diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java
index f017177..f36854d 100644
--- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java
+++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java
@@ -14,66 +14,99 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.lucene.benchmark.byTask.tasks;
+package org.apache.lucene.benchmark.byTask.tasks;
-import java.util.Collection;
+import java.text.BreakIterator;
+import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
+import java.util.Locale;
+import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.document.Document;
+import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.highlight.DefaultEncoder;
+import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
-import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.search.highlight.TokenSources;
+import org.apache.lucene.search.postingshighlight.PostingsHighlighter;
+import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
+import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
+import org.apache.lucene.search.vectorhighlight.BreakIteratorBoundaryScanner;
+import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
+import org.apache.lucene.search.vectorhighlight.FieldQuery;
+import org.apache.lucene.search.vectorhighlight.ScoreOrderFragmentsBuilder;
+import org.apache.lucene.search.vectorhighlight.WeightedFragListBuilder;
+import org.apache.lucene.util.ArrayUtil;
/**
* Search and Traverse and Retrieve docs task. Highlight the fields in the retrieved documents.
*
- * Uses the {@link org.apache.lucene.search.highlight.SimpleHTMLFormatter} for formatting.
- *
* <p>Note: This task reuses the reader if it is already open.
* Otherwise a reader is opened at start and closed at the end.
* </p>
*
- * <p>Takes optional multivalued, comma separated param string as: size[<traversal size>],highlight[<int>],maxFrags[<int>],mergeContiguous[<boolean>],fields[name1;name2;...]</p>
+ * <p>Takes optional multivalued, comma separated param string as: type[<enum>],maxFrags[<int>],fields[name1;name2;...]</p>
* <ul>
- * <li>traversal size - The number of hits to traverse, otherwise all will be traversed</li>
- * <li>highlight - The number of the hits to highlight. Will always be less than or equal to traversal size. Default is Integer.MAX_VALUE (i.e. hits.length())</li>
+ * <li>type - the highlighter implementation, e.g. "UH"</li>
* <li>maxFrags - The maximum number of fragments to score by the highlighter</li>
- * <li>mergeContiguous - true if contiguous fragments should be merged.</li>
* <li>fields - The fields to highlight. If not specified all fields will be highlighted (or at least attempted)</li>
* </ul>
* Example:
- * <pre>"SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000
+ * <pre>"SearchHlgtSameRdr" SearchTravRetHighlight(type[UH],maxFrags[3],fields[body]) > : 1000
* </pre>
*
- * Documents must be stored in order for this task to work. Additionally, term vector positions can be used as well.
+ * Documents must be stored in order for this task to work. Additionally, term vector positions can be used as well,
+ * and offsets in postings is another option.
*
* <p>Other side effects: counts additional 1 (record) for each traversed hit,
* and 1 more for each retrieved (non null) document and 1 for each fragment returned.</p>
*/
public class SearchTravRetHighlightTask extends SearchTravTask {
-
- protected int numToHighlight = Integer.MAX_VALUE;
- protected boolean mergeContiguous;
- protected int maxFrags = 2;
- protected Set<String> paramFields = Collections.emptySet();
- protected Highlighter highlighter;
- protected int maxDocCharsToAnalyze;
+ private int maxDocCharsToAnalyze; // max leading content chars to highlight
+ private int maxFrags = 1; // aka passages
+ private Set<String> hlFields = Collections.singleton("body");
+ private String type;
+ private HLImpl hlImpl;
+ private Analyzer analyzer;
public SearchTravRetHighlightTask(PerfRunData runData) {
super(runData);
}
@Override
+ public void setParams(String params) {
+ // can't call super because super doesn't understand our params syntax
+ this.params = params;
+ // TODO consider instead using data.getConfig().get("highlighter.*")?
+ String[] splits = params.split(",");
+ for (String split : splits) {
+ if (split.startsWith("type[") == true) {
+ type = split.substring("type[".length(), split.length() - 1);
+ } else if (split.startsWith("maxFrags[") == true) {
+ maxFrags = (int) Float.parseFloat(split.substring("maxFrags[".length(), split.length() - 1));
+ } else if (split.startsWith("fields[") == true) {
+ String fieldNames = split.substring("fields[".length(), split.length() - 1);
+ String[] fieldSplits = fieldNames.split(";");
+ hlFields = new HashSet<>(Arrays.asList(fieldSplits));
+ }
+ }
+ }
+
+ @Override
public void setup() throws Exception {
super.setup();
//check to make sure either the doc is being stored
@@ -82,72 +115,188 @@ public class SearchTravRetHighlightTask extends SearchTravTask {
throw new Exception("doc.stored must be set to true");
}
maxDocCharsToAnalyze = data.getConfig().get("highlighter.maxDocCharsToAnalyze", Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
+ analyzer = data.getAnalyzer();
+ String type = this.type;
+ if (type == null) {
+ type = data.getConfig().get("highlighter", null);
+ }
+ switch (type) {
+ case "NONE": hlImpl = new NoHLImpl(); break;
+ case "SH_A": hlImpl = new StandardHLImpl(false); break;
+ case "SH_V": hlImpl = new StandardHLImpl(true); break;
+
+ case "FVH_V": hlImpl = new FastVectorHLImpl(); break;
+
+ case "UH": hlImpl = new UnifiedHLImpl(null); break;
+ case "UH_A": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.ANALYSIS); break;
+ case "UH_V": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.TERM_VECTORS); break;
+ case "UH_P": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.POSTINGS); break;
+ case "UH_PV": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.POSTINGS_WITH_TERM_VECTORS); break;
+
+ case "PH_P": hlImpl = new PostingsHLImpl(); break;
+
+ default: throw new Exception("unrecognized highlighter type: " + type + " (try 'UH')");
+ }
}
+ // here is where we intercept ReadTask's logic to do the highlighting, and nothing else (no retrieval of all field vals)
@Override
- public boolean withRetrieve() {
- return true;
+ protected int withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
+ hlImpl.withTopDocs(searcher, q, hits);
+ // note: it'd be nice if we knew the sum kilobytes of text across these hits so we could return that. It'd be a more
+ // useful number to gauge the amount of work. But given "average" document sizes and lots of queries, returning the
+ // number of docs is reasonable.
+ return hits.scoreDocs.length; // always return # scored docs.
}
- @Override
- public int numToHighlight() {
- return numToHighlight;
+ private interface HLImpl {
+ void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception;
}
-
- @Override
- protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){
- highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(q));
- highlighter.setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
- return new BenchmarkHighlighter(){
- @Override
- public int doHighlight(IndexReader reader, int doc, String field,
- Document document, Analyzer analyzer, String text) throws Exception {
- final int maxStartOffset = highlighter.getMaxDocCharsToAnalyze() - 1;
- TokenStream ts = TokenSources.getTokenStream(field, reader.getTermVectors(doc), text, analyzer, maxStartOffset);
- TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFrags);
- return frag != null ? frag.length : 0;
+
+ private volatile int preventOptimizeAway = 0;
+
+ private class StandardHLImpl implements HLImpl {
+ SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<em>", "</em>");
+ DefaultEncoder encoder = new DefaultEncoder();
+ Highlighter highlighter = new Highlighter(formatter, encoder, null);
+ boolean termVecs;
+
+ StandardHLImpl(boolean termVecs) {
+ highlighter.setEncoder(new DefaultEncoder());
+ highlighter.setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
+ this.termVecs = termVecs;
+ }
+
+ @Override
+ public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
+ IndexReader reader = searcher.getIndexReader();
+ highlighter.setFragmentScorer(new QueryScorer(q));
+ // highlighter.setTextFragmenter(); unfortunately no sentence mechanism, not even regex. Default here is trivial
+ for (ScoreDoc scoreDoc : docIdOrder(hits.scoreDocs)) {
+ Document document = reader.document(scoreDoc.doc, hlFields);
+ Fields tvFields = termVecs ? reader.getTermVectors(scoreDoc.doc) : null;
+ for (IndexableField indexableField : document) {
+ TokenStream tokenStream;
+ if (termVecs) {
+ tokenStream = TokenSources.getTokenStream(indexableField.name(), tvFields,
+ indexableField.stringValue(), analyzer, maxDocCharsToAnalyze);
+ } else {
+ tokenStream = analyzer.tokenStream(indexableField.name(), indexableField.stringValue());
+ }
+ // will close TokenStream:
+ String[] fragments = highlighter.getBestFragments(tokenStream, indexableField.stringValue(), maxFrags);
+ preventOptimizeAway = fragments.length;
+ }
}
- };
+ }
}
- @Override
- protected Collection<String> getFieldsToHighlight(Document document) {
- Collection<String> result = super.getFieldsToHighlight(document);
- //if stored is false, then result will be empty, in which case just get all the param fields
- if (paramFields.isEmpty() == false && result.isEmpty() == false) {
- result.retainAll(paramFields);
- } else {
- result = paramFields;
+ private class FastVectorHLImpl implements HLImpl {
+ int fragSize = 100;
+ WeightedFragListBuilder fragListBuilder = new WeightedFragListBuilder();
+ BoundaryScanner bs = new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(Locale.ENGLISH));
+ ScoreOrderFragmentsBuilder fragmentsBuilder = new ScoreOrderFragmentsBuilder(bs);
+ String[] preTags = {"<em>"};
+ String[] postTags = {"</em>"};
+ Encoder encoder = new DefaultEncoder();// new SimpleHTMLEncoder();
+ FastVectorHighlighter highlighter = new FastVectorHighlighter(
+ true, // phraseHighlight
+ false); // requireFieldMatch -- not pertinent to our benchmark
+
+ @Override
+ public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
+ IndexReader reader = searcher.getIndexReader();
+ final FieldQuery fq = highlighter.getFieldQuery( q, reader);
+ for (ScoreDoc scoreDoc : docIdOrder(hits.scoreDocs)) {
+ for (String hlField : hlFields) {
+ String[] fragments = highlighter.getBestFragments(fq, reader, scoreDoc.doc, hlField, fragSize, maxFrags,
+ fragListBuilder, fragmentsBuilder, preTags, postTags, encoder);
+ preventOptimizeAway = fragments.length;
+ }
+ }
}
- return result;
}
- @Override
- public void setParams(String params) {
- // can't call super because super doesn't understand our
- // params syntax
- this.params = params;
- String [] splits = params.split(",");
- for (int i = 0; i < splits.length; i++) {
- if (splits[i].startsWith("size[") == true){
- traversalSize = (int)Float.parseFloat(splits[i].substring("size[".length(),splits[i].length() - 1));
- } else if (splits[i].startsWith("highlight[") == true){
- numToHighlight = (int)Float.parseFloat(splits[i].substring("highlight[".length(),splits[i].length() - 1));
- } else if (splits[i].startsWith("maxFrags[") == true){
- maxFrags = (int)Float.parseFloat(splits[i].substring("maxFrags[".length(),splits[i].length() - 1));
- } else if (splits[i].startsWith("mergeContiguous[") == true){
- mergeContiguous = Boolean.valueOf(splits[i].substring("mergeContiguous[".length(),splits[i].length() - 1)).booleanValue();
- } else if (splits[i].startsWith("fields[") == true){
- paramFields = new HashSet<>();
- String fieldNames = splits[i].substring("fields[".length(), splits[i].length() - 1);
- String [] fieldSplits = fieldNames.split(";");
- for (int j = 0; j < fieldSplits.length; j++) {
- paramFields.add(fieldSplits[j]);
+ private ScoreDoc[] docIdOrder(ScoreDoc[] scoreDocs) {
+ ScoreDoc[] clone = new ScoreDoc[scoreDocs.length];
+ System.arraycopy(scoreDocs, 0, clone, 0, scoreDocs.length);
+ ArrayUtil.introSort(clone, (a, b) -> Integer.compare(a.doc, b.doc));
+ return clone;
+ }
+
+ private class PostingsHLImpl implements HLImpl {
+ PostingsHighlighter highlighter;
+ String[] fields = hlFields.toArray(new String[hlFields.size()]);
+ int[] maxPassages;
+ PostingsHLImpl() {
+ highlighter = new PostingsHighlighter(maxDocCharsToAnalyze) {
+ @Override
+ protected Analyzer getIndexAnalyzer(String field) { // thus support wildcards
+ return analyzer;
}
+ @Override
+ protected BreakIterator getBreakIterator(String field) {
+ return BreakIterator.getSentenceInstance(Locale.ENGLISH);
+ }
+ };
+ maxPassages = new int[hlFields.size()];
+ Arrays.fill(maxPassages, maxFrags);
+ }
+
+ @Override
+ public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
+ Map<String, String[]> result = highlighter.highlightFields(fields, q, searcher, hits, maxPassages);
+ preventOptimizeAway = result.size();
+ }
+ }
+
+ private class UnifiedHLImpl implements HLImpl {
+ UnifiedHighlighter highlighter;
+ IndexSearcher lastSearcher;
+ UnifiedHighlighter.OffsetSource offsetSource; // null means auto select
+ String[] fields = hlFields.toArray(new String[hlFields.size()]);
+ int[] maxPassages;
+
+ UnifiedHLImpl(final UnifiedHighlighter.OffsetSource offsetSource) {
+ this.offsetSource = offsetSource;
+ maxPassages = new int[hlFields.size()];
+ Arrays.fill(maxPassages, maxFrags);
+ }
+
+ private void reset(IndexSearcher searcher) {
+ if (lastSearcher == searcher) {
+ return;
}
+ lastSearcher = searcher;
+ highlighter = new UnifiedHighlighter(searcher, analyzer) {
+ @Override
+ protected OffsetSource getOffsetSource(String field) {
+ return offsetSource != null ? offsetSource : super.getOffsetSource(field);
+ }
+ };
+ highlighter.setBreakIterator(() -> BreakIterator.getSentenceInstance(Locale.ENGLISH));
+ highlighter.setMaxLength(maxDocCharsToAnalyze);
+ highlighter.setHighlightPhrasesStrictly(true);
+ highlighter.setHandleMultiTermQuery(true);
+ }
+
+ @Override
+ public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
+ reset(searcher);
+ Map<String, String[]> result = highlighter.highlightFields(fields, q, hits, maxPassages);
+ preventOptimizeAway = result.size();
}
}
+ private class NoHLImpl implements HLImpl {
+ @Override
+ public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
+ //just retrieve the HL fields
+ for (ScoreDoc scoreDoc : docIdOrder(hits.scoreDocs)) {
+ preventOptimizeAway += searcher.doc(scoreDoc.doc, hlFields).iterator().hasNext() ? 2 : 1;
+ }
+ }
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3497a290/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.java
----------------------------------------------------------------------
diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.java
deleted file mode 100644
index 15a13ca..0000000
--- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.java
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.benchmark.byTask.tasks;
-
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.benchmark.byTask.PerfRunData;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
-import org.apache.lucene.search.vectorhighlight.FieldQuery;
-
-import java.util.Set;
-import java.util.Collection;
-import java.util.HashSet;
-import java.util.Collections;
-
-/**
- * Search and Traverse and Retrieve docs task. Highlight the fields in the retrieved documents by using FastVectorHighlighter.
- *
- * <p>Note: This task reuses the reader if it is already open.
- * Otherwise a reader is opened at start and closed at the end.
- * </p>
- *
- * <p>Takes optional multivalued, comma separated param string as: size[<traversal size>],highlight[<int>],maxFrags[<int>],mergeContiguous[<boolean>],fields[name1;name2;...]</p>
- * <ul>
- * <li>traversal size - The number of hits to traverse, otherwise all will be traversed</li>
- * <li>highlight - The number of the hits to highlight. Will always be less than or equal to traversal size. Default is Integer.MAX_VALUE (i.e. hits.length())</li>
- * <li>maxFrags - The maximum number of fragments to score by the highlighter</li>
- * <li>fragSize - The length of fragments</li>
- * <li>fields - The fields to highlight. If not specified all fields will be highlighted (or at least attempted)</li>
- * </ul>
- * Example:
- * <pre>"SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(size[10],highlight[10],maxFrags[3],fields[body]) > : 1000
- * </pre>
- *
- * Fields must be stored and term vector offsets and positions in order must be true for this task to work.
- *
- * <p>Other side effects: counts additional 1 (record) for each traversed hit,
- * and 1 more for each retrieved (non null) document and 1 for each fragment returned.</p>
- */
-public class SearchTravRetVectorHighlightTask extends SearchTravTask {
-
- protected int numToHighlight = Integer.MAX_VALUE;
- protected int maxFrags = 2;
- protected int fragSize = 100;
- protected Set<String> paramFields = Collections.emptySet();
- protected FastVectorHighlighter highlighter;
-
- public SearchTravRetVectorHighlightTask(PerfRunData runData) {
- super(runData);
- }
-
- @Override
- public void setup() throws Exception {
- super.setup();
- //check to make sure either the doc is being stored
- PerfRunData data = getRunData();
- if (data.getConfig().get("doc.stored", false) == false){
- throw new Exception("doc.stored must be set to true");
- }
- if (data.getConfig().get("doc.term.vector.offsets", false) == false){
- throw new Exception("doc.term.vector.offsets must be set to true");
- }
- if (data.getConfig().get("doc.term.vector.positions", false) == false){
- throw new Exception("doc.term.vector.positions must be set to true");
- }
- }
-
- @Override
- public boolean withRetrieve() {
- return true;
- }
-
- @Override
- public int numToHighlight() {
- return numToHighlight;
- }
-
- @Override
- protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){
- highlighter = new FastVectorHighlighter( false, false );
- final Query myq = q;
- return new BenchmarkHighlighter(){
- @Override
- public int doHighlight(IndexReader reader, int doc, String field,
- Document document, Analyzer analyzer, String text) throws Exception {
- final FieldQuery fq = highlighter.getFieldQuery( myq, reader);
- String[] fragments = highlighter.getBestFragments(fq, reader, doc, field, fragSize, maxFrags);
- return fragments != null ? fragments.length : 0;
- }
- };
- }
-
- @Override
- protected Collection<String> getFieldsToHighlight(Document document) {
- Collection<String> result = super.getFieldsToHighlight(document);
- //if stored is false, then result will be empty, in which case just get all the param fields
- if (paramFields.isEmpty() == false && result.isEmpty() == false) {
- result.retainAll(paramFields);
- } else {
- result = paramFields;
- }
- return result;
- }
-
- @Override
- public void setParams(String params) {
- // can't call super because super doesn't understand our
- // params syntax
- final String [] splits = params.split(",");
- for (int i = 0; i < splits.length; i++) {
- if (splits[i].startsWith("size[") == true){
- traversalSize = (int)Float.parseFloat(splits[i].substring("size[".length(),splits[i].length() - 1));
- } else if (splits[i].startsWith("highlight[") == true){
- numToHighlight = (int)Float.parseFloat(splits[i].substring("highlight[".length(),splits[i].length() - 1));
- } else if (splits[i].startsWith("maxFrags[") == true){
- maxFrags = (int)Float.parseFloat(splits[i].substring("maxFrags[".length(),splits[i].length() - 1));
- } else if (splits[i].startsWith("fragSize[") == true){
- fragSize = (int)Float.parseFloat(splits[i].substring("fragSize[".length(),splits[i].length() - 1));
- } else if (splits[i].startsWith("fields[") == true){
- paramFields = new HashSet<>();
- String fieldNames = splits[i].substring("fields[".length(), splits[i].length() - 1);
- String [] fieldSplits = fieldNames.split(";");
- for (int j = 0; j < fieldSplits.length; j++) {
- paramFields.add(fieldSplits[j]);
- }
-
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3497a290/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
----------------------------------------------------------------------
diff --git a/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java b/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
index 5dbf660..3d483f3 100644
--- a/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
+++ b/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
@@ -31,9 +31,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.benchmark.BenchmarkTestCase;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
-import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker;
import org.apache.lucene.benchmark.byTask.stats.TaskStats;
-import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask;
import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
import org.apache.lucene.collation.CollationKeyAnalyzer;
@@ -159,110 +157,6 @@ public class TestPerfTasksLogic extends BenchmarkTestCase {
//assertTrue(CountingSearchTestTask.numSearches > 0);
}
- public void testHighlighting() throws Exception {
- // 1. alg definition (required in every "logic" test)
- String algLines[] = {
- "doc.stored=true",
- "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
- "docs.file=" + getReuters20LinesFile(),
- "query.maker=" + ReutersQueryMaker.class.getName(),
- "ResetSystemErase",
- "CreateIndex",
- "{ AddDoc } : 100",
- "ForceMerge(1)",
- "CloseIndex",
- "OpenReader",
- "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
- "CloseReader",
- };
-
- // 2. we test this value later
- CountingHighlighterTestTask.numHighlightedResults = 0;
- CountingHighlighterTestTask.numDocsRetrieved = 0;
- // 3. execute the algorithm (required in every "logic" test)
- Benchmark benchmark = execBenchmark(algLines);
-
- // 4. test specific checks after the benchmark run completed.
- assertEquals("TestSearchTask was supposed to be called!",92,CountingHighlighterTestTask.numDocsRetrieved);
- //pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs
- //we probably should use a different doc/query maker, but...
- assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0);
-
- assertTrue("Index does not exist?...!", DirectoryReader.indexExists(benchmark.getRunData().getDirectory()));
- // now we should be able to open the index for write.
- IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND));
- iw.close();
- IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
- assertEquals("100 docs were added to the index, this is what we expect to find!",100,ir.numDocs());
- ir.close();
- }
-
- public void testHighlightingTV() throws Exception {
- // 1. alg definition (required in every "logic" test)
- String algLines[] = {
- "doc.stored=true",//doc storage is required in order to have text to highlight
- "doc.term.vector=true",
- "doc.term.vector.offsets=true",
- "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
- "docs.file=" + getReuters20LinesFile(),
- "query.maker=" + ReutersQueryMaker.class.getName(),
- "ResetSystemErase",
- "CreateIndex",
- "{ AddDoc } : 1000",
- "ForceMerge(1)",
- "CloseIndex",
- "OpenReader",
- "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
- "CloseReader",
- };
-
- // 2. we test this value later
- CountingHighlighterTestTask.numHighlightedResults = 0;
- CountingHighlighterTestTask.numDocsRetrieved = 0;
- // 3. execute the algorithm (required in every "logic" test)
- Benchmark benchmark = execBenchmark(algLines);
-
- // 4. test specific checks after the benchmark run completed.
- assertEquals("TestSearchTask was supposed to be called!",92,CountingHighlighterTestTask.numDocsRetrieved);
- //pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs
- //we probably should use a different doc/query maker, but...
- assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0);
-
- assertTrue("Index does not exist?...!", DirectoryReader.indexExists(benchmark.getRunData().getDirectory()));
- // now we should be able to open the index for write.
- IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND));
- iw.close();
- IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
- assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs());
- ir.close();
- }
-
- public void testHighlightingNoTvNoStore() throws Exception {
- // 1. alg definition (required in every "logic" test)
- String algLines[] = {
- "doc.stored=false",
- "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
- "docs.file=" + getReuters20LinesFile(),
- "query.maker=" + ReutersQueryMaker.class.getName(),
- "ResetSystemErase",
- "CreateIndex",
- "{ AddDoc } : 1000",
- "ForceMerge(1)",
- "CloseIndex",
- "OpenReader",
- "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200",
- "CloseReader",
- };
-
- // 2. we test this value later
- CountingHighlighterTestTask.numHighlightedResults = 0;
- CountingHighlighterTestTask.numDocsRetrieved = 0;
- // 3. execute the algorithm (required in every "logic" test)
- expectThrows(Exception.class, () -> {
- execBenchmark(algLines);
- });
- }
-
/**
* Test Exhasting Doc Maker logic
*/
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/3497a290/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/CountingHighlighterTestTask.java
----------------------------------------------------------------------
diff --git a/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/CountingHighlighterTestTask.java b/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/CountingHighlighterTestTask.java
deleted file mode 100644
index da322df..0000000
--- a/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/CountingHighlighterTestTask.java
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.benchmark.byTask.tasks;
-
-import java.io.IOException;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.benchmark.byTask.PerfRunData;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.highlight.Highlighter;
-import org.apache.lucene.search.highlight.QueryScorer;
-import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
-import org.apache.lucene.search.highlight.TextFragment;
-import org.apache.lucene.search.highlight.TokenSources;
-
-/**
- * Test Search task which counts number of searches.
- */
-public class CountingHighlighterTestTask extends SearchTravRetHighlightTask {
-
- public static int numHighlightedResults = 0;
- public static int numDocsRetrieved = 0;
-
- public CountingHighlighterTestTask(PerfRunData runData) {
- super(runData);
- }
-
- @Override
- protected Document retrieveDoc(IndexReader ir, int id) throws IOException {
- Document document = ir.document(id);
- if (document != null) {
- numDocsRetrieved++;
- }
- return document;
- }
-
- @Override
- public BenchmarkHighlighter getBenchmarkHighlighter(Query q) {
- highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(q));
- return new BenchmarkHighlighter() {
- @Override
- public int doHighlight(IndexReader reader, int doc, String field, Document document, Analyzer analyzer, String text) throws Exception {
- final int maxStartOffset = highlighter.getMaxDocCharsToAnalyze() - 1;
- TokenStream ts = TokenSources.getTokenStream(field, reader.getTermVectors(doc), text, analyzer, maxStartOffset);
- TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFrags);
- numHighlightedResults += frag != null ? frag.length : 0;
- return frag != null ? frag.length : 0;
- }
- };
- }
-}