You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by tf...@apache.org on 2017/05/19 00:13:58 UTC
[37/50] [abbrv] lucene-solr:jira/solr-10233: LUCENE-7730: Better
accuracy for the length normalization factor.
LUCENE-7730: Better accuracy for the length normalization factor.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/06a6034d
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/06a6034d
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/06a6034d
Branch: refs/heads/jira/solr-10233
Commit: 06a6034d9bc8f06ea567c0110b954b35515c2ea0
Parents: c53d19e
Author: Adrien Grand <jp...@gmail.com>
Authored: Thu May 18 16:27:31 2017 +0200
Committer: Adrien Grand <jp...@gmail.com>
Committed: Thu May 18 16:27:31 2017 +0200
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 +
.../lucene/benchmark/quality/trecQRels.txt | 1090 +++++++++---------
.../lucene/index/DefaultIndexingChain.java | 8 +-
.../lucene/index/DocumentsWriterPerThread.java | 4 +
.../apache/lucene/index/FieldInvertState.java | 17 +-
.../search/similarities/BM25Similarity.java | 79 +-
.../search/similarities/ClassicSimilarity.java | 101 +-
.../search/similarities/SimilarityBase.java | 65 +-
.../search/similarities/TFIDFSimilarity.java | 128 +-
.../java/org/apache/lucene/util/SmallFloat.java | 83 +-
.../apache/lucene/index/TestIndexSorting.java | 2 +-
.../lucene/index/TestMaxTermFrequency.java | 47 +-
.../test/org/apache/lucene/index/TestNorms.java | 65 +-
.../org/apache/lucene/index/TestOmitTf.java | 4 +-
.../lucene/search/TestDisjunctionMaxQuery.java | 3 +-
.../lucene/search/TestElevationComparator.java | 9 +-
.../apache/lucene/search/TestPhraseQuery.java | 7 +-
.../apache/lucene/search/TestQueryRescorer.java | 18 +-
.../apache/lucene/search/TestSimilarity.java | 14 +-
.../lucene/search/TestSimilarityProvider.java | 117 +-
.../apache/lucene/search/TestSortRescorer.java | 2 +-
.../similarities/TestAxiomaticSimilarity.java | 13 -
.../search/similarities/TestBM25Similarity.java | 70 +-
.../similarities/TestBooleanSimilarity.java | 5 +-
.../similarities/TestClassicSimilarity.java | 91 +-
.../search/similarities/TestSimilarityBase.java | 99 +-
.../org/apache/lucene/util/TestSmallFloat.java | 54 +-
.../expressions/TestExpressionRescorer.java | 2 +-
.../search/highlight/HighlighterTest.java | 8 +-
.../apache/lucene/index/memory/MemoryIndex.java | 2 +-
.../lucene/index/memory/TestMemoryIndex.java | 34 +-
.../apache/lucene/misc/SweetSpotSimilarity.java | 25 +-
.../lucene/misc/SweetSpotSimilarityTest.java | 116 +-
.../function/valuesource/NormValueSource.java | 36 +-
.../function/TestLongNormValueSource.java | 117 +-
.../queries/function/TestValueSources.java | 2 +-
.../queries/payloads/TestPayloadScoreQuery.java | 9 +-
.../queries/payloads/TestPayloadTermQuery.java | 3 +-
.../search/similarities/RandomSimilarity.java | 2 +-
.../apache/solr/DisMaxRequestHandlerTest.java | 4 +-
.../component/QueryElevationComponentTest.java | 4 +-
.../search/TestPayloadScoreQParserPlugin.java | 2 +-
.../search/function/SortByFunctionTest.java | 6 +-
.../solr/search/function/TestFunctionQuery.java | 7 +-
.../TestSweetSpotSimilarityFactory.java | 91 +-
45 files changed, 1353 insertions(+), 1315 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/06a6034d/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index df8d20d..404e923 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -63,6 +63,9 @@ Improvements
* LUCENE-7489: Better storage of sparse doc-values fields with the default
codec. (Adrien Grand)
+* LUCENE-7730: More accurate encoding of the length normalization factor
+ thanks to the removal of index-time boosts. (Adrien Grand)
+
Optimizations
* LUCENE-7416: BooleanQuery optimizes queries that have queries that occur both
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/06a6034d/lucene/benchmark/src/test/org/apache/lucene/benchmark/quality/trecQRels.txt
----------------------------------------------------------------------
diff --git a/lucene/benchmark/src/test/org/apache/lucene/benchmark/quality/trecQRels.txt b/lucene/benchmark/src/test/org/apache/lucene/benchmark/quality/trecQRels.txt
index 183a7f4..16094e3 100644
--- a/lucene/benchmark/src/test/org/apache/lucene/benchmark/quality/trecQRels.txt
+++ b/lucene/benchmark/src/test/org/apache/lucene/benchmark/quality/trecQRels.txt
@@ -40,64 +40,64 @@
0 0 fakedoc3 1
0 0 fakedoc4 1
-0 0 doc18211 1
0 0 doc20192 1
+0 0 doc18211 1
+0 0 doc12431 1
+0 0 doc5471 1
+0 0 doc3462 1
+0 0 doc3057 1
0 0 doc7401 1
-0 0 doc11285 1
-0 0 doc20647 1
-0 0 doc3057 1
-0 0 doc12431 1
-0 0 doc4989 1
-0 0 doc17324 1
0 0 doc4030 1
-0 0 doc4290 1
-0 0 doc3462 1
+0 0 doc4290 1
+0 0 doc17904 1
+0 0 doc11285 1
+0 0 doc20647 1
+0 0 doc17324 1
+0 0 doc7168 1
+0 0 doc9011 1
+0 0 doc4989 1
0 0 doc15313 1
-0 0 doc10303 1
-0 0 doc1893 1
-0 0 doc5008 1
+0 0 doc10303 1
0 0 doc14634 1
-0 0 doc5471 1
-0 0 doc17904 1
-0 0 doc7168 1
-0 0 doc21275 1
-0 0 doc9011 1
-0 0 doc17546 1
0 0 doc9102 1
-0 0 doc13199 1
+0 0 doc5008 1
+0 0 doc1893 1
+0 0 doc17546 1
+0 0 doc13199 1
+0 0 doc21275 1
# --- m==1: precision_at_n and avg_precision are hurt, by unmarking relevant docs
-1 0 doc9857 0
-1 0 doc16846 1
-1 0 doc4320 1
-1 0 doc9501 0
-1 0 doc10159 1
-1 0 doc16642 1
-1 0 doc17536 0
-1 0 doc17571 1
-1 0 doc18728 1
-1 0 doc18828 1
-1 0 doc19108 0
-1 0 doc9940 1
-1 0 doc11852 1
-1 0 doc7430 0
-1 0 doc19162 1
-1 0 doc1743 1
-1 0 doc2137 1
-1 0 doc7611 1
-1 0 doc8072 1
-1 0 doc12764 1
-1 0 doc2593 1
-1 0 doc11088 1
-1 0 doc931 1
-1 0 doc7673 1
-1 0 doc12941 1
-1 0 doc11797 1
-1 0 doc11831 1
-1 0 doc13162 1
-1 0 doc4423 1
-1 0 doc5217 1
+1 0 doc9857 0
+1 0 doc16846 1
+1 0 doc9940 1
+1 0 doc11852 0
+1 0 doc12764 1
+1 0 doc11088 1
+1 0 doc2137 0
+1 0 doc7673 1
+1 0 doc7611 1
+1 0 doc8072 1
+1 0 doc19162 0
+1 0 doc12941 1
+1 0 doc931 1
+1 0 doc2593 0
+1 0 doc5037 1
+1 0 doc13162 1
+1 0 doc5018 1
+1 0 doc11797 1
+1 0 doc11831 1
+1 0 doc5217 1
+1 0 doc15426 1
+1 0 doc7228 1
+1 0 doc15370 1
+1 0 doc10159 1
+1 0 doc4320 1
+1 0 doc9501 1
+1 0 doc16642 1
+1 0 doc17536 1
+1 0 doc17571 1
+1 0 doc18728 1
# ---- m==2: all precision, precision_at_n and recall are hurt.
@@ -106,200 +106,200 @@
2 0 fakedoc3 1
2 0 fakedoc4 1
-2 0 doc3137 0
-2 0 doc7142 0
-2 0 doc13667 0
-2 0 doc13171 0
-2 0 doc13372 1
-2 0 doc21415 1
-2 0 doc16298 1
-2 0 doc14957 1
-2 0 doc153 1
-2 0 doc16092 1
-2 0 doc16096 1
-2 0 doc21303 1
-2 0 doc18681 1
-2 0 doc20756 1
-2 0 doc355 1
-2 0 doc13395 1
-2 0 doc5009 1
-2 0 doc17164 1
-2 0 doc13162 1
-2 0 doc11757 1
-2 0 doc9637 1
-2 0 doc18087 1
-2 0 doc4593 1
-2 0 doc4677 1
-2 0 doc20865 1
-2 0 doc8556 1
-2 0 doc2578 1
-2 0 doc1163 1
-2 0 doc3797 1
-2 0 doc11094 1
-
-
-3 0 doc19578 1
-3 0 doc14860 1
-3 0 doc7235 1
-3 0 doc20590 1
-3 0 doc17933 1
-3 0 doc9384 1
-3 0 doc10783 1
-3 0 doc1963 1
-3 0 doc18356 1
-3 0 doc13254 1
-3 0 doc18402 1
-3 0 doc15241 1
-3 0 doc3303 1
-3 0 doc8868 1
-3 0 doc18520 1
-3 0 doc4650 1
-3 0 doc4727 1
-3 0 doc21518 1
-3 0 doc5060 1
-3 0 doc7587 1
-3 0 doc2990 1
-3 0 doc8042 1
-3 0 doc6304 1
-3 0 doc13223 1
-3 0 doc1964 1
-3 0 doc10597 1
-3 0 doc21023 1
-3 0 doc19057 1
-3 0 doc14948 1
-3 0 doc9692 1
-
-
-4 0 doc2534 1
-4 0 doc21388 1
-4 0 doc20923 1
-4 0 doc11547 1
-4 0 doc19755 1
-4 0 doc3793 1
-4 0 doc6714 1
-4 0 doc12722 1
-4 0 doc5552 1
-4 0 doc6810 1
-4 0 doc16953 1
-4 0 doc2527 1
-4 0 doc5361 1
-4 0 doc12353 1
-4 0 doc7308 1
-4 0 doc3836 1
-4 0 doc2293 1
-4 0 doc7348 1
-4 0 doc17119 1
-4 0 doc19331 1
-4 0 doc3411 1
-4 0 doc14643 1
-4 0 doc9058 1
-4 0 doc11099 1
-4 0 doc12485 1
-4 0 doc16432 1
-4 0 doc10047 1
-4 0 doc13788 1
-4 0 doc117 1
-4 0 doc638 1
+2 0 doc3137 0
+2 0 doc13667 0
+2 0 doc7142 0
+2 0 doc16298 0
+2 0 doc13171 1
+2 0 doc14957 1
+2 0 doc5009 1
+2 0 doc13372 1
+2 0 doc17164 1
+2 0 doc21303 1
+2 0 doc18681 1
+2 0 doc13162 1
+2 0 doc20756 1
+2 0 doc3797 1
+2 0 doc20865 1
+2 0 doc153 1
+2 0 doc16092 1
+2 0 doc16096 1
+2 0 doc2578 1
+2 0 doc21415 1
+2 0 doc4593 1
+2 0 doc4677 1
+2 0 doc21088 1
+2 0 doc8556 1
+2 0 doc9637 1
+2 0 doc344 1
+2 0 doc355 1
+2 0 doc13395 1
+2 0 doc1163 1
+2 0 doc11757 1
+
+
+3 0 doc7235 1
+3 0 doc19578 1
+3 0 doc17933 1
+3 0 doc20590 1
+3 0 doc14860 1
+3 0 doc10783 1
+3 0 doc15241 1
+3 0 doc13223 1
+3 0 doc1963 1
+3 0 doc10597 1
+3 0 doc6304 1
+3 0 doc3303 1
+3 0 doc13254 1
+3 0 doc9384 1
+3 0 doc18356 1
+3 0 doc18402 1
+3 0 doc18520 1
+3 0 doc14948 1
+3 0 doc5060 1
+3 0 doc4650 1
+3 0 doc4727 1
+3 0 doc19057 1
+3 0 doc8868 1
+3 0 doc2990 1
+3 0 doc21518 1
+3 0 doc21023 1
+3 0 doc7587 1
+3 0 doc8042 1
+3 0 doc1964 1
+3 0 doc7124 1
+
+
+
+4 0 doc2534 1
+4 0 doc6714 1
+4 0 doc6810 1
+4 0 doc21388 1
+4 0 doc5361 1
+4 0 doc7308 1
+4 0 doc20923 1
+4 0 doc12722 1
+4 0 doc2527 1
+4 0 doc7348 1
+4 0 doc10047 1
+4 0 doc5552 1
+4 0 doc19755 1
+4 0 doc13788 1
+4 0 doc14643 1
+4 0 doc11547 1
+4 0 doc2293 1
+4 0 doc3793 1
+4 0 doc19331 1
+4 0 doc3836 1
+4 0 doc12353 1
+4 0 doc11099 1
+4 0 doc16432 1
+4 0 doc117 1
+4 0 doc16953 1
+4 0 doc9058 1
+4 0 doc3411 1
+4 0 doc12485 1
+4 0 doc17119 1
+4 0 doc638 1
5 0 doc13181 1
-5 0 doc169 1
-5 0 doc5389 1
-5 0 doc955 1
-5 0 doc8573 1
+5 0 doc169 1
+5 0 doc8573 1
5 0 doc10242 1
-5 0 doc4350 1
-5 0 doc17417 1
5 0 doc11758 1
-5 0 doc9197 1
+5 0 doc955 1
+5 0 doc9197 1
+5 0 doc17417 1
+5 0 doc5389 1
+5 0 doc4350 1
+5 0 doc3857 1
+5 0 doc3204 1
5 0 doc10639 1
-5 0 doc3857 1
-5 0 doc10478 1
5 0 doc10262 1
-5 0 doc2981 1
-5 0 doc3204 1
+5 0 doc2981 1
+5 0 doc10478 1
5 0 doc17122 1
+5 0 doc4065 1
5 0 doc17864 1
-5 0 doc9298 1
-5 0 doc4065 1
-5 0 doc2492 1
-5 0 doc18879 1
-5 0 doc12199 1
-5 0 doc5180 1
+5 0 doc9298 1
+5 0 doc6918 1
5 0 doc11528 1
+5 0 doc12199 1
+5 0 doc2492 1
+5 0 doc18879 1
5 0 doc20190 1
-5 0 doc6918 1
-5 0 doc4665 1
+5 0 doc4665 1
+5 0 doc5180 1
+5 0 doc9124 1
5 0 doc10195 1
-5 0 doc3062 1
-
-
-
-6 0 doc9507 1
-6 0 doc15630 1
-6 0 doc8469 1
-6 0 doc11918 1
-6 0 doc20482 1
-6 0 doc20158 1
-6 0 doc19831 1
-6 0 doc8296 1
-6 0 doc8930 1
-6 0 doc16460 1
-6 0 doc2577 1
-6 0 doc15476 1
-6 0 doc1767 1
-6 0 doc689 1
-6 0 doc16606 1
-6 0 doc6149 1
-6 0 doc18691 1
-6 0 doc2208 1
-6 0 doc3592 1
-6 0 doc11199 1
-6 0 doc16329 1
-6 0 doc6007 1
-6 0 doc15231 1
-6 0 doc20622 1
-6 0 doc21468 1
-6 0 doc12230 1
-6 0 doc5723 1
-6 0 doc8120 1
-6 0 doc8668 1
-6 0 doc303 1
-
-
-
-
-7 0 doc7728 1
-7 0 doc7693 1
-7 0 doc21088 1
-7 0 doc5017 1
-7 0 doc10807 1
-7 0 doc16204 1
-7 0 doc2233 1
-7 0 doc3632 1
-7 0 doc4719 1
-7 0 doc6477 1
-7 0 doc6502 1
-7 0 doc6709 1
-7 0 doc7710 1
-7 0 doc9193 1
-7 0 doc9309 1
-7 0 doc9789 1
-7 0 doc10971 1
-7 0 doc18059 1
-7 0 doc19906 1
-7 0 doc20089 1
-7 0 doc20102 1
-7 0 doc21040 1
-7 0 doc21153 1
-7 0 doc9147 1
-7 0 doc9930 1
-7 0 doc19763 1
-7 0 doc1559 1
-7 0 doc21248 1
-7 0 doc17945 1
-7 0 doc526 1
+
+
+
+6 0 doc15630 1
+6 0 doc9507 1
+6 0 doc8469 1
+6 0 doc20158 1
+6 0 doc20482 1
+6 0 doc1767 1
+6 0 doc5723 1
+6 0 doc12230 1
+6 0 doc2577 1
+6 0 doc11918 1
+6 0 doc6007 1
+6 0 doc20622 1
+6 0 doc15231 1
+6 0 doc21468 1
+6 0 doc8296 1
+6 0 doc16606 1
+6 0 doc18691 1
+6 0 doc6149 1
+6 0 doc19831 1
+6 0 doc8930 1
+6 0 doc2208 1
+6 0 doc16460 1
+6 0 doc689 1
+6 0 doc303 1
+6 0 doc8120 1
+6 0 doc11199 1
+6 0 doc3592 1
+6 0 doc8668 1
+6 0 doc15476 1
+6 0 doc7693 1
+
+
+
+7 0 doc7693 1
+7 0 doc7728 1
+7 0 doc21088 1
+7 0 doc19763 1
+7 0 doc19906 1
+7 0 doc16204 1
+7 0 doc4719 1
+7 0 doc18059 1
+7 0 doc9147 1
+7 0 doc9930 1
+7 0 doc6477 1
+7 0 doc21040 1
+7 0 doc2233 1
+7 0 doc6709 1
+7 0 doc7710 1
+7 0 doc9789 1
+7 0 doc10971 1
+7 0 doc20102 1
+7 0 doc5017 1
+7 0 doc3632 1
+7 0 doc6502 1
+7 0 doc9193 1
+7 0 doc9309 1
+7 0 doc21153 1
+7 0 doc526 1
+7 0 doc20089 1
+7 0 doc10807 1
+7 0 doc1559 1
+7 0 doc21248 1
+7 0 doc15559 1
# --- m==0: avg_precision and recall are hurt, by marking fake docs as relevant
@@ -309,71 +309,71 @@
8 0 fakedoc3 1
8 0 fakedoc4 1
-8 0 doc16299 1
-8 0 doc1662 1
-8 0 doc4585 1
-8 0 doc12315 1
-8 0 doc16266 1
-8 0 doc13136 1
-8 0 doc19212 1
-8 0 doc7086 1
-8 0 doc7062 1
-8 0 doc6134 1
-8 0 doc13953 1
-8 0 doc16264 1
-8 0 doc2494 1
-8 0 doc10636 1
-8 0 doc10894 1
-8 0 doc6844 1
-8 0 doc674 1
-8 0 doc13520 1
-8 0 doc344 1
-8 0 doc2896 1
-8 0 doc11871 1
-8 0 doc1862 1
-8 0 doc16728 1
-8 0 doc10308 1
-8 0 doc2227 1
-8 0 doc13167 1
-8 0 doc20607 1
-8 0 doc9670 1
-8 0 doc1566 1
-8 0 doc17885 1
+8 0 doc1662 1
+8 0 doc12315 1
+8 0 doc16299 1
+8 0 doc19212 1
+8 0 doc2494 1
+8 0 doc13520 1
+8 0 doc13136 1
+8 0 doc7086 1
+8 0 doc674 1
+8 0 doc16266 1
+8 0 doc10894 1
+8 0 doc4585 1
+8 0 doc6134 1
+8 0 doc7062 1
+8 0 doc13953 1
+8 0 doc2227 1
+8 0 doc20607 1
+8 0 doc344 1
+8 0 doc16264 1
+8 0 doc13167 1
+8 0 doc2896 1
+8 0 doc11871 1
+8 0 doc6844 1
+8 0 doc10636 1
+8 0 doc9670 1
+8 0 doc10180 1
+8 0 doc1862 1
+8 0 doc10308 1
+8 0 doc16728 1
+8 0 doc15794 1
# ---- m==1: precision_at_n and avg_precision are hurt, by unmarking relevant docs
-9 0 doc1990 0
-9 0 doc9342 1
-9 0 doc19427 1
-9 0 doc12432 0
-9 0 doc13480 1
-9 0 doc3322 1
-9 0 doc16044 1
-9 0 doc266 0
-9 0 doc3437 1
-9 0 doc5370 1
-9 0 doc10314 1
-9 0 doc4892 1
-9 0 doc5763 0
-9 0 doc14045 1
-9 0 doc1090 1
-9 0 doc7437 1
-9 0 doc5822 1
-9 0 doc4285 1
-9 0 doc17119 1
-9 0 doc21001 1
-9 0 doc4337 1
-9 0 doc5967 1
-9 0 doc10214 1
-9 0 doc12001 1
-9 0 doc18553 1
-9 0 doc12116 1
-9 0 doc5064 1
-9 0 doc5018 1
-9 0 doc5037 1
-9 0 doc8025 1
+9 0 doc1990 0
+9 0 doc4892 1
+9 0 doc9342 1
+9 0 doc12432 0
+9 0 doc13480 1
+9 0 doc19427 1
+9 0 doc12116 1
+9 0 doc5064 0
+9 0 doc14045 1
+9 0 doc4285 1
+9 0 doc5822 1
+9 0 doc3322 1
+9 0 doc5763 1
+9 0 doc3437 0
+9 0 doc5370 1
+9 0 doc10314 1
+9 0 doc16044 1
+9 0 doc18553 1
+9 0 doc5037 1
+9 0 doc7437 1
+9 0 doc12001 1
+9 0 doc5018 1
+9 0 doc1090 1
+9 0 doc266 1
+9 0 doc17894 1
+9 0 doc17119 1
+9 0 doc4337 1
+9 0 doc5967 1
+9 0 doc10214 1
+9 0 doc20647 1
# ---- m==2: all precision, precision_at_n and recall are hurt.
@@ -384,200 +384,200 @@
10 0 fakedoc4 1
10 0 doc16087 0
-10 0 doc19943 0
-10 0 doc5958 0
-10 0 doc6510 0
-10 0 doc4354 1
-10 0 doc17218 1
-10 0 doc6964 1
-10 0 doc10270 1
+10 0 doc17218 0
+10 0 doc10270 0
+10 0 doc16743 0
+10 0 doc19943 1
+10 0 doc16729 1
+10 0 doc16761 1
+10 0 doc4354 1
10 0 doc18321 1
+10 0 doc5958 1
+10 0 doc6510 1
+10 0 doc7357 1
+10 0 doc2534 1
+10 0 doc6964 1
10 0 doc14893 1
-10 0 doc16743 1
-10 0 doc7357 1
-10 0 doc2534 1
10 0 doc18497 1
-10 0 doc16729 1
-10 0 doc16761 1
-10 0 doc8933 1
-10 0 doc15769 1
10 0 doc14948 1
+10 0 doc8933 1
+10 0 doc14935 1
10 0 doc10818 1
+10 0 doc7891 1
10 0 doc11819 1
-10 0 doc7891 1
-10 0 doc14935 1
+10 0 doc7235 1
+10 0 doc15769 1
10 0 doc14954 1
-10 0 doc9897 1
-10 0 doc6930 1
-10 0 doc7235 1
+10 0 doc9897 1
10 0 doc15559 1
-10 0 doc6621 1
10 0 doc11214 1
+10 0 doc5348 1
+10 0 doc6930 1
+11 0 doc8593 1
11 0 doc11943 1
-11 0 doc9705 1
-11 0 doc286 1
+11 0 doc8800 1
+11 0 doc286 1
11 0 doc17916 1
11 0 doc17918 1
-11 0 doc1574 1
+11 0 doc9705 1
+11 0 doc1574 1
11 0 doc10180 1
-11 0 doc1893 1
+11 0 doc9337 1
+11 0 doc11869 1
+11 0 doc5194 1
11 0 doc11189 1
-11 0 doc8593 1
-11 0 doc3188 1
-11 0 doc8800 1
-11 0 doc9337 1
+11 0 doc1893 1
11 0 doc19213 1
-11 0 doc8735 1
-11 0 doc5194 1
-11 0 doc3552 1
+11 0 doc3188 1
+11 0 doc8735 1
+11 0 doc18580 1
11 0 doc16030 1
+11 0 doc3552 1
11 0 doc10195 1
+11 0 doc209 1
+11 0 doc5792 1
+11 0 doc8715 1
11 0 doc17702 1
-11 0 doc209 1
-11 0 doc11869 1
-11 0 doc5008 1
-11 0 doc5792 1
-11 0 doc1990 1
-11 0 doc3393 1
+11 0 doc3166 1
+11 0 doc1990 1
+11 0 doc3393 1
11 0 doc19027 1
-11 0 doc18580 1
-11 0 doc8715 1
-11 0 doc12753 1
+11 0 doc5008 1
+12 0 doc6544 1
12 0 doc10640 1
-12 0 doc6544 1
-12 0 doc4305 1
-12 0 doc10760 1
12 0 doc18198 1
-12 0 doc10881 1
-12 0 doc128 1
+12 0 doc4305 1
+12 0 doc2444 1
12 0 doc12192 1
-12 0 doc2444 1
-12 0 doc11639 1
-12 0 doc2911 1
-12 0 doc1884 1
-12 0 doc2698 1
-12 0 doc3552 1
+12 0 doc10760 1
+12 0 doc10881 1
+12 0 doc128 1
+12 0 doc1884 1
12 0 doc18704 1
-12 0 doc7652 1
-12 0 doc9187 1
-12 0 doc3131 1
-12 0 doc2277 1
-12 0 doc2589 1
-12 0 doc3747 1
-12 0 doc3813 1
-12 0 doc5222 1
-12 0 doc6023 1
-12 0 doc6624 1
-12 0 doc7655 1
-12 0 doc9205 1
-12 0 doc12062 1
+12 0 doc11639 1
+12 0 doc3131 1
+12 0 doc2698 1
+12 0 doc3552 1
+12 0 doc2911 1
+12 0 doc7652 1
+12 0 doc20524 1
+12 0 doc9187 1
+12 0 doc2277 1
12 0 doc15504 1
-12 0 doc16329 1
+12 0 doc2589 1
+12 0 doc5222 1
+12 0 doc3747 1
+12 0 doc6624 1
+12 0 doc9205 1
+12 0 doc12062 1
+12 0 doc3813 1
+12 0 doc6023 1
+12 0 doc7655 1
13 0 doc16347 1
-13 0 doc1866 1
+13 0 doc8695 1
+13 0 doc4948 1
+13 0 doc8554 1
13 0 doc13431 1
-13 0 doc4948 1
+13 0 doc1866 1
13 0 doc13989 1
+13 0 doc2100 1
13 0 doc21565 1
-13 0 doc8554 1
-13 0 doc8695 1
-13 0 doc6764 1
-13 0 doc2408 1
-13 0 doc5605 1
-13 0 doc42 1
+13 0 doc42 1
+13 0 doc2408 1
13 0 doc15794 1
+13 0 doc6764 1
+13 0 doc3980 1
13 0 doc17135 1
+13 0 doc5605 1
+13 0 doc7783 1
+13 0 doc5967 1
13 0 doc14847 1
-13 0 doc3980 1
-13 0 doc2592 1
-13 0 doc5967 1
-13 0 doc2100 1
13 0 doc10947 1
-13 0 doc4557 1
-13 0 doc2492 1
-13 0 doc7783 1
-13 0 doc8025 1
-13 0 doc355 1
-13 0 doc17170 1
-13 0 doc14595 1
13 0 doc16894 1
-13 0 doc5822 1
+13 0 doc355 1
+13 0 doc14595 1
+13 0 doc8977 1
+13 0 doc2592 1
+13 0 doc4557 1
+13 0 doc8025 1
+13 0 doc2492 1
13 0 doc11088 1
-
-
-
-14 0 doc17172 1
-14 0 doc17210 1
-14 0 doc5044 1
-14 0 doc4627 1
-14 0 doc4683 1
-14 0 doc15126 1
-14 0 doc4538 1
-14 0 doc273 1
-14 0 doc19585 1
-14 0 doc16078 1
-14 0 doc4529 1
-14 0 doc4186 1
-14 0 doc12961 1
-14 0 doc19217 1
-14 0 doc5670 1
-14 0 doc1699 1
-14 0 doc4716 1
-14 0 doc12644 1
-14 0 doc18387 1
-14 0 doc336 1
-14 0 doc16130 1
-14 0 doc18718 1
-14 0 doc12527 1
-14 0 doc11797 1
-14 0 doc11831 1
-14 0 doc7538 1
-14 0 doc17259 1
-14 0 doc18724 1
-14 0 doc19330 1
-14 0 doc19206 1
-
-
-
-15 0 doc12198 1
-15 0 doc20371 1
-15 0 doc2947 1
-15 0 doc10750 1
-15 0 doc7239 1
-15 0 doc14189 1
-15 0 doc19474 1
-15 0 doc14776 1
-15 0 doc21270 1
-15 0 doc6387 1
-15 0 doc12908 1
-15 0 doc9573 1
-15 0 doc17102 1
-15 0 doc21482 1
-15 0 doc6524 1
-15 0 doc18034 1
-15 0 doc1358 1
-15 0 doc13147 1
-15 0 doc17731 1
-15 0 doc12890 1
-15 0 doc20887 1
-15 0 doc19508 1
-15 0 doc18498 1
-15 0 doc20642 1
-15 0 doc19878 1
-15 0 doc6556 1
-15 0 doc10272 1
-15 0 doc5720 1
-15 0 doc17578 1
-15 0 doc17164 1
+13 0 doc1844 1
+
+
+
+14 0 doc17172 1
+14 0 doc17210 1
+14 0 doc4627 1
+14 0 doc4683 1
+14 0 doc15126 1
+14 0 doc273 1
+14 0 doc4716 1
+14 0 doc4538 1
+14 0 doc4529 1
+14 0 doc19206 1
+14 0 doc5044 1
+14 0 doc12961 1
+14 0 doc16078 1
+14 0 doc19585 1
+14 0 doc12527 1
+14 0 doc19217 1
+14 0 doc19330 1
+14 0 doc5670 1
+14 0 doc1699 1
+14 0 doc11797 1
+14 0 doc11831 1
+14 0 doc17259 1
+14 0 doc18387 1
+14 0 doc7538 1
+14 0 doc336 1
+14 0 doc18718 1
+14 0 doc4186 1
+14 0 doc18724 1
+14 0 doc18356 1
+14 0 doc12644 1
+
+
+
+15 0 doc12198 1
+15 0 doc20371 1
+15 0 doc1358 1
+15 0 doc20887 1
+15 0 doc14189 1
+15 0 doc14776 1
+15 0 doc21270 1
+15 0 doc13147 1
+15 0 doc2947 1
+15 0 doc7239 1
+15 0 doc19474 1
+15 0 doc12908 1
+15 0 doc10750 1
+15 0 doc19878 1
+15 0 doc20642 1
+15 0 doc19508 1
+15 0 doc18034 1
+15 0 doc6387 1
+15 0 doc17102 1
+15 0 doc6524 1
+15 0 doc6556 1
+15 0 doc9573 1
+15 0 doc5720 1
+15 0 doc10272 1
+15 0 doc17164 1
+15 0 doc15126 1
+15 0 doc21482 1
+15 0 doc4496 1
+15 0 doc18498 1
+15 0 doc10890 1
# --- m==0: avg_precision and recall are hurt, by marking fake docs as relevant
@@ -587,65 +587,70 @@
16 0 fakedoc3 1
16 0 fakedoc4 1
-16 0 doc4043 1
-16 0 doc15370 1
+16 0 doc4043 1
16 0 doc15426 1
-16 0 doc1702 1
+16 0 doc15370 1
+16 0 doc1702 1
+16 0 doc3446 1
+16 0 doc3062 1
16 0 doc14985 1
-16 0 doc3446 1
+16 0 doc8224 1
16 0 doc16609 1
+16 0 doc19032 1
+16 0 doc7228 1
16 0 doc16134 1
-16 0 doc3062 1
-16 0 doc8224 1
+16 0 doc5044 1
16 0 doc16493 1
-16 0 doc15037 1
+16 0 doc8545 1
16 0 doc12686 1
-16 0 doc1710 1
-16 0 doc19032 1
-16 0 doc8545 1
-16 0 doc5044 1
-16 0 doc17894 1
-16 0 doc7228 1
-16 0 doc7373 1
-16 0 doc9064 1
-16 0 doc13161 1
-16 0 doc3166 1
+16 0 doc1710 1
+16 0 doc15037 1
+16 0 doc9064 1
16 0 doc19297 1
+16 0 doc3281 1
+16 0 doc3166 1
16 0 doc15499 1
+16 0 doc17894 1
+16 0 doc13161 1
+16 0 doc13619 1
+16 0 doc7373 1
+16 0 doc15411 1
+16 0 doc10890 1
+16 0 doc8977 1
# --- m==1: precision_at_n and avg_precision are hurt, by unmarking relevant docs
-17 0 doc3117 0
-17 0 doc7477 0
-17 0 doc7569 0
-17 0 doc20667 0
-17 0 doc20260 1
-17 0 doc17355 1
-17 0 doc11021 1
-17 0 doc20934 1
-17 0 doc552 1
-17 0 doc20856 1
-17 0 doc3524 1
-17 0 doc17343 1
-17 0 doc21055 1
-17 0 doc19032 1
-17 0 doc19786 1
-17 0 doc9281 1
-17 0 doc1695 1
-17 0 doc15940 1
-17 0 doc9215 1
-17 0 doc8335 1
-17 0 doc20936 1
-17 0 doc6914 1
-17 0 doc12122 1
-17 0 doc6618 1
-17 0 doc5049 1
-17 0 doc450 1
-17 0 doc19206 1
-17 0 doc18823 1
-17 0 doc5307 1
-17 0 doc17295 1
+17 0 doc7477 0
+17 0 doc7569 0
+17 0 doc3117 0
+17 0 doc20667 0
+17 0 doc20260 1
+17 0 doc20934 1
+17 0 doc17355 1
+17 0 doc3524 1
+17 0 doc11021 1
+17 0 doc552 1
+17 0 doc21055 1
+17 0 doc19032 1
+17 0 doc1695 1
+17 0 doc12122 1
+17 0 doc20856 1
+17 0 doc9215 1
+17 0 doc15940 1
+17 0 doc5049 1
+17 0 doc19786 1
+17 0 doc9281 1
+17 0 doc450 1
+17 0 doc17343 1
+17 0 doc20936 1
+17 0 doc8335 1
+17 0 doc5307 1
+17 0 doc6618 1
+17 0 doc1168 1
+17 0 doc18823 1
+17 0 doc19206 1
+17 0 doc6914 1
# ---- m==2: all precision, precision_at_n and recall are hurt.
@@ -655,61 +660,66 @@
18 0 fakedoc3 1
18 0 fakedoc4 1
-18 0 doc8064 0
-18 0 doc18142 0
-18 0 doc19383 0
-18 0 doc21151 0
-18 0 doc4665 1
-18 0 doc2897 1
-18 0 doc6878 1
-18 0 doc14507 1
-18 0 doc2976 1
-18 0 doc11757 1
-18 0 doc12625 1
-18 0 doc14908 1
-18 0 doc12790 1
-18 0 doc17915 1
-18 0 doc11804 1
-18 0 doc12935 1
-18 0 doc8225 1
-18 0 doc18011 1
-18 0 doc10493 1
-18 0 doc17922 1
-18 0 doc1902 1
-18 0 doc14049 1
-18 0 doc1334 1
-18 0 doc1168 1
-18 0 doc4859 1
-18 0 doc7124 1
-18 0 doc9692 1
-18 0 doc18402 1
-18 0 doc9089 1
-18 0 doc15375 1
-
-
-
-19 0 doc2310 1
-19 0 doc5267 1
+18 0 doc8064 0
+18 0 doc18142 0
+18 0 doc19383 0
+18 0 doc2897 0
+18 0 doc21151 1
+18 0 doc14507 1
+18 0 doc12935 1
+18 0 doc12790 1
+18 0 doc4665 1
+18 0 doc10493 1
+18 0 doc2976 1
+18 0 doc18011 1
+18 0 doc1334 1
+18 0 doc14908 1
+18 0 doc1168 1
+18 0 doc15375 1
+18 0 doc18402 1
+18 0 doc8225 1
+18 0 doc11757 1
+18 0 doc11804 1
+18 0 doc6878 1
+18 0 doc12625 1
+18 0 doc4859 1
+18 0 doc5348 1
+18 0 doc9089 1
+18 0 doc14049 1
+18 0 doc17922 1
+18 0 doc1902 1
+18 0 doc17915 1
+18 0 doc7124 1
+
+
+
+19 0 doc2310 1
+19 0 doc5267 1
19 0 doc15666 1
-19 0 doc10803 1
-19 0 doc4900 1
+19 0 doc7925 1
+19 0 doc4900 1
19 0 doc11435 1
-19 0 doc7925 1
-19 0 doc7652 1
+19 0 doc10803 1
+19 0 doc7652 1
+19 0 doc19546 1
19 0 doc18561 1
+19 0 doc9163 1
+19 0 doc8869 1
19 0 doc12733 1
+19 0 doc2444 1
+19 0 doc7194 1
19 0 doc10634 1
-19 0 doc19546 1
-19 0 doc7194 1
-19 0 doc529 1
-19 0 doc9163 1
-19 0 doc8869 1
-19 0 doc2444 1
-19 0 doc5605 1
-19 0 doc5051 1
-19 0 doc10881 1
-19 0 doc4496 1
-19 0 doc3979 1
-19 0 doc8419 1
-19 0 doc9431 1
+19 0 doc529 1
+19 0 doc8419 1
19 0 doc16235 1
+19 0 doc4496 1
+19 0 doc5051 1
+19 0 doc5605 1
+19 0 doc3979 1
+19 0 doc9431 1
+19 0 doc10881 1
+19 0 doc12527 1
+19 0 doc4804 1
+19 0 doc4494 1
+19 0 doc8833 1
+19 0 doc732 1
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/06a6034d/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
index ba65629..ca384ae 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DefaultIndexingChain.java
@@ -603,7 +603,7 @@ final class DefaultIndexingChain extends DocConsumer {
// PerField.invert to allow for later downgrading of the index options:
fi.setIndexOptions(fieldType.indexOptions());
- fp = new PerField(fi, invert);
+ fp = new PerField(docWriter.getIndexCreatedVersionMajor(), fi, invert);
fp.next = fieldHash[hashPos];
fieldHash[hashPos] = fp;
totalFieldCount++;
@@ -633,6 +633,7 @@ final class DefaultIndexingChain extends DocConsumer {
/** NOTE: not static: accesses at least docState, termsHash. */
private final class PerField implements Comparable<PerField> {
+ final int indexCreatedVersionMajor;
final FieldInfo fieldInfo;
final Similarity similarity;
@@ -659,7 +660,8 @@ final class DefaultIndexingChain extends DocConsumer {
// reused
TokenStream tokenStream;
- public PerField(FieldInfo fieldInfo, boolean invert) {
+ public PerField(int indexCreatedVersionMajor, FieldInfo fieldInfo, boolean invert) {
+ this.indexCreatedVersionMajor = indexCreatedVersionMajor;
this.fieldInfo = fieldInfo;
similarity = docState.similarity;
if (invert) {
@@ -668,7 +670,7 @@ final class DefaultIndexingChain extends DocConsumer {
}
void setInvertState() {
- invertState = new FieldInvertState(fieldInfo.name);
+ invertState = new FieldInvertState(indexCreatedVersionMajor, fieldInfo.name);
termsHashPerField = termsHash.addField(invertState, fieldInfo);
if (fieldInfo.omitsNorms() == false) {
assert norms == null;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/06a6034d/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java
index ed50650..c929ba2 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java
@@ -193,6 +193,10 @@ class DocumentsWriterPerThread {
return fieldInfos;
}
+ public int getIndexCreatedVersionMajor() {
+ return indexWriter.segmentInfos.getIndexCreatedVersionMajor();
+ }
+
final void testPoint(String message) {
if (enableTestPoints) {
assert infoStream.isEnabled("TP"); // don't enable unless you need them.
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/06a6034d/lucene/core/src/java/org/apache/lucene/index/FieldInvertState.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInvertState.java b/lucene/core/src/java/org/apache/lucene/index/FieldInvertState.java
index 1da02b2..f93edde 100644
--- a/lucene/core/src/java/org/apache/lucene/index/FieldInvertState.java
+++ b/lucene/core/src/java/org/apache/lucene/index/FieldInvertState.java
@@ -31,7 +31,8 @@ import org.apache.lucene.util.AttributeSource;
* @lucene.experimental
*/
public final class FieldInvertState {
- String name;
+ final int indexCreatedVersionMajor;
+ final String name;
int position;
int length;
int numOverlap;
@@ -50,14 +51,15 @@ public final class FieldInvertState {
/** Creates {@code FieldInvertState} for the specified
* field name. */
- public FieldInvertState(String name) {
+ public FieldInvertState(int indexCreatedVersionMajor, String name) {
+ this.indexCreatedVersionMajor = indexCreatedVersionMajor;
this.name = name;
}
/** Creates {@code FieldInvertState} for the specified
* field name and values for all fields. */
- public FieldInvertState(String name, int position, int length, int numOverlap, int offset) {
- this.name = name;
+ public FieldInvertState(int indexCreatedVersionMajor, String name, int position, int length, int numOverlap, int offset) {
+ this(indexCreatedVersionMajor, name);
this.position = position;
this.length = length;
this.numOverlap = numOverlap;
@@ -164,4 +166,11 @@ public final class FieldInvertState {
public String getName() {
return name;
}
+
+ /**
+ * Return the major version that was used to create the index, or 6 if it was created before 7.0.
+ */
+ public int getIndexCreatedVersionMajor() {
+ return indexCreatedVersionMajor;
+ }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/06a6034d/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
index 74978fd..e693b2b 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
@@ -96,20 +96,6 @@ public class BM25Similarity extends Similarity {
}
}
- /** The default implementation encodes <code>1 / sqrt(length)</code>
- * with {@link SmallFloat#floatToByte315(float)}. This is compatible with
- * Lucene's historic implementation: {@link ClassicSimilarity}. If you
- * change this, then you should change {@link #decodeNormValue(byte)} to match. */
- protected byte encodeNormValue(int fieldLength) {
- return SmallFloat.floatToByte315((float) (1 / Math.sqrt(fieldLength)));
- }
-
- /** The default implementation returns <code>1 / f<sup>2</sup></code>
- * where <code>f</code> is {@link SmallFloat#byte315ToFloat(byte)}. */
- protected float decodeNormValue(byte b) {
- return NORM_TABLE[b & 0xFF];
- }
-
/**
* True if overlap tokens (tokens with a position increment of zero) are
* discounted from the document's length.
@@ -132,21 +118,31 @@ public class BM25Similarity extends Similarity {
}
/** Cache of decoded bytes. */
- private static final float[] NORM_TABLE = new float[256];
+ private static final float[] OLD_LENGTH_TABLE = new float[256];
+ private static final float[] LENGTH_TABLE = new float[256];
static {
for (int i = 1; i < 256; i++) {
float f = SmallFloat.byte315ToFloat((byte)i);
- NORM_TABLE[i] = 1.0f / (f*f);
+ OLD_LENGTH_TABLE[i] = 1.0f / (f*f);
+ }
+ OLD_LENGTH_TABLE[0] = 1.0f / OLD_LENGTH_TABLE[255]; // otherwise inf
+
+ for (int i = 0; i < 256; i++) {
+ LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i);
}
- NORM_TABLE[0] = 1.0f / NORM_TABLE[255]; // otherwise inf
}
@Override
public final long computeNorm(FieldInvertState state) {
final int numTerms = discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength();
- return encodeNormValue(numTerms);
+ int indexCreatedVersionMajor = state.getIndexCreatedVersionMajor();
+ if (indexCreatedVersionMajor >= 7) {
+ return SmallFloat.intToByte4(numTerms);
+ } else {
+ return SmallFloat.floatToByte315((float) (1 / Math.sqrt(numTerms)));
+ }
}
/**
@@ -207,34 +203,43 @@ public class BM25Similarity extends Similarity {
@Override
public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
Explanation idf = termStats.length == 1 ? idfExplain(collectionStats, termStats[0]) : idfExplain(collectionStats, termStats);
-
float avgdl = avgFieldLength(collectionStats);
- // compute freq-independent part of bm25 equation across all norm values
- float cache[] = new float[256];
+ float[] oldCache = new float[256];
+ float[] cache = new float[256];
for (int i = 0; i < cache.length; i++) {
- cache[i] = k1 * ((1 - b) + b * decodeNormValue((byte)i) / avgdl);
+ oldCache[i] = k1 * ((1 - b) + b * OLD_LENGTH_TABLE[i] / avgdl);
+ cache[i] = k1 * ((1 - b) + b * LENGTH_TABLE[i] / avgdl);
}
- return new BM25Stats(collectionStats.field(), boost, idf, avgdl, cache);
+ return new BM25Stats(collectionStats.field(), boost, idf, avgdl, oldCache, cache);
}
@Override
public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
BM25Stats bm25stats = (BM25Stats) stats;
- return new BM25DocScorer(bm25stats, context.reader().getNormValues(bm25stats.field));
+ return new BM25DocScorer(bm25stats, context.reader().getMetaData().getCreatedVersionMajor(), context.reader().getNormValues(bm25stats.field));
}
private class BM25DocScorer extends SimScorer {
private final BM25Stats stats;
private final float weightValue; // boost * idf * (k1 + 1)
private final NumericDocValues norms;
+ /** precomputed cache for all length values */
+ private final float[] lengthCache;
+ /** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */
private final float[] cache;
- BM25DocScorer(BM25Stats stats, NumericDocValues norms) throws IOException {
+ BM25DocScorer(BM25Stats stats, int indexCreatedVersionMajor, NumericDocValues norms) throws IOException {
this.stats = stats;
this.weightValue = stats.weight * (k1 + 1);
- this.cache = stats.cache;
this.norms = norms;
+ if (indexCreatedVersionMajor >= 7) {
+ lengthCache = LENGTH_TABLE;
+ cache = stats.cache;
+ } else {
+ lengthCache = OLD_LENGTH_TABLE;
+ cache = stats.oldCache;
+ }
}
@Override
@@ -245,7 +250,7 @@ public class BM25Similarity extends Similarity {
norm = k1;
} else {
if (norms.advanceExact(doc)) {
- norm = cache[(byte)norms.longValue() & 0xFF];
+ norm = cache[((byte) norms.longValue()) & 0xFF];
} else {
norm = cache[0];
}
@@ -255,7 +260,7 @@ public class BM25Similarity extends Similarity {
@Override
public Explanation explain(int doc, Explanation freq) throws IOException {
- return explainScore(doc, freq, stats, norms);
+ return explainScore(doc, freq, stats, norms, lengthCache);
}
@Override
@@ -281,21 +286,23 @@ public class BM25Similarity extends Similarity {
private final float weight;
/** field name, for pulling norms */
private final String field;
- /** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */
- private final float cache[];
+ /** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl)
+ * for both OLD_LENGTH_TABLE and LENGTH_TABLE */
+ private final float[] oldCache, cache;
- BM25Stats(String field, float boost, Explanation idf, float avgdl, float cache[]) {
+ BM25Stats(String field, float boost, Explanation idf, float avgdl, float[] oldCache, float[] cache) {
this.field = field;
this.boost = boost;
this.idf = idf;
this.avgdl = avgdl;
- this.cache = cache;
this.weight = idf.getValue() * boost;
+ this.oldCache = oldCache;
+ this.cache = cache;
}
}
- private Explanation explainTFNorm(int doc, Explanation freq, BM25Stats stats, NumericDocValues norms) throws IOException {
+ private Explanation explainTFNorm(int doc, Explanation freq, BM25Stats stats, NumericDocValues norms, float[] lengthCache) throws IOException {
List<Explanation> subs = new ArrayList<>();
subs.add(freq);
subs.add(Explanation.match(k1, "parameter k1"));
@@ -311,7 +318,7 @@ public class BM25Similarity extends Similarity {
} else {
norm = 0;
}
- float doclen = decodeNormValue(norm);
+ float doclen = lengthCache[norm & 0xff];
subs.add(Explanation.match(b, "parameter b"));
subs.add(Explanation.match(stats.avgdl, "avgFieldLength"));
subs.add(Explanation.match(doclen, "fieldLength"));
@@ -321,13 +328,13 @@ public class BM25Similarity extends Similarity {
}
}
- private Explanation explainScore(int doc, Explanation freq, BM25Stats stats, NumericDocValues norms) throws IOException {
+ private Explanation explainScore(int doc, Explanation freq, BM25Stats stats, NumericDocValues norms, float[] lengthCache) throws IOException {
Explanation boostExpl = Explanation.match(stats.boost, "boost");
List<Explanation> subs = new ArrayList<>();
if (boostExpl.getValue() != 1.0f)
subs.add(boostExpl);
subs.add(stats.idf);
- Explanation tfNormExpl = explainTFNorm(doc, freq, stats, norms);
+ Explanation tfNormExpl = explainTFNorm(doc, freq, stats, norms, lengthCache);
subs.add(tfNormExpl);
return Explanation.match(
boostExpl.getValue() * stats.idf.getValue() * tfNormExpl.getValue(),
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/06a6034d/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java
index f56575f..c3d36c3 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java
@@ -17,91 +17,27 @@
package org.apache.lucene.search.similarities;
-import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.SmallFloat;
/**
- * Expert: Default scoring implementation which {@link #encodeNormValue(float)
- * encodes} norm values as a single byte before being stored. At search time,
- * the norm byte value is read from the index
- * {@link org.apache.lucene.store.Directory directory} and
- * {@link #decodeNormValue(long) decoded} back to a float <i>norm</i> value.
- * This encoding/decoding, while reducing index size, comes with the price of
- * precision loss - it is not guaranteed that <i>decode(encode(x)) = x</i>. For
- * instance, <i>decode(encode(0.89)) = 0.875</i>.
- * <p>
- * Compression of norm values to a single byte saves memory at search time,
- * because once a field is referenced at search time, its norms - for all
- * documents - are maintained in memory.
- * <p>
- * The rationale supporting such lossy compression of norm values is that given
- * the difficulty (and inaccuracy) of users to express their true information
- * need by a query, only big differences matter. <br>
- * <br>
- * Last, note that search time is too late to modify this <i>norm</i> part of
- * scoring, e.g. by using a different {@link Similarity} for search.
+ * Expert: Historical scoring implementation. You might want to consider using
+ * {@link BM25Similarity} instead, which is generally considered superior to
+ * TF-IDF.
*/
public class ClassicSimilarity extends TFIDFSimilarity {
-
- /** Cache of decoded bytes. */
- private static final float[] NORM_TABLE = new float[256];
-
- static {
- for (int i = 0; i < 256; i++) {
- NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
- }
- }
/** Sole constructor: parameter-free */
public ClassicSimilarity() {}
-
- /**
- * Encodes a normalization factor for storage in an index.
- * <p>
- * The encoding uses a three-bit mantissa, a five-bit exponent, and the
- * zero-exponent point at 15, thus representing values from around 7x10^9 to
- * 2x10^-9 with about one significant decimal digit of accuracy. Zero is also
- * represented. Negative numbers are rounded up to zero. Values too large to
- * represent are rounded down to the largest representable value. Positive
- * values too small to represent are rounded up to the smallest positive
- * representable value.
- *
- * @see org.apache.lucene.util.SmallFloat
- */
- @Override
- public final long encodeNormValue(float f) {
- return SmallFloat.floatToByte315(f);
- }
-
- /**
- * Decodes the norm value, assuming it is a single byte.
- *
- * @see #encodeNormValue(float)
- */
- @Override
- public final float decodeNormValue(long norm) {
- return NORM_TABLE[(int) (norm & 0xFF)]; // & 0xFF maps negative bytes to positive above 127
- }
/** Implemented as
- * <code>state.getBoost()*lengthNorm(numTerms)</code>, where
- * <code>numTerms</code> is {@link FieldInvertState#getLength()} if {@link
- * #setDiscountOverlaps} is false, else it's {@link
- * FieldInvertState#getLength()} - {@link
- * FieldInvertState#getNumOverlap()}.
+ * <code>1/sqrt(length)</code>.
*
* @lucene.experimental */
@Override
- public float lengthNorm(FieldInvertState state) {
- final int numTerms;
- if (discountOverlaps)
- numTerms = state.getLength() - state.getNumOverlap();
- else
- numTerms = state.getLength();
+ public float lengthNorm(int numTerms) {
return (float) (1.0 / Math.sqrt(numTerms));
}
@@ -138,33 +74,6 @@ public class ClassicSimilarity extends TFIDFSimilarity {
public float idf(long docFreq, long docCount) {
return (float)(Math.log((docCount+1)/(double)(docFreq+1)) + 1.0);
}
-
- /**
- * True if overlap tokens (tokens with a position of increment of zero) are
- * discounted from the document's length.
- */
- protected boolean discountOverlaps = true;
-
- /** Determines whether overlap tokens (Tokens with
- * 0 position increment) are ignored when computing
- * norm. By default this is true, meaning overlap
- * tokens do not count when computing norms.
- *
- * @lucene.experimental
- *
- * @see #computeNorm
- */
- public void setDiscountOverlaps(boolean v) {
- discountOverlaps = v;
- }
-
- /**
- * Returns true if overlap tokens are discounted from the document's length.
- * @see #setDiscountOverlaps
- */
- public boolean getDiscountOverlaps() {
- return discountOverlaps;
- }
@Override
public String toString() {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/06a6034d/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java b/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
index dbf8d45..fb34f07 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
@@ -190,7 +190,8 @@ public abstract class SimilarityBase extends Similarity {
}
@Override
- public SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
+ public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
+ int indexCreatedVersionMajor = context.reader().getMetaData().getCreatedVersionMajor();
if (stats instanceof MultiSimilarity.MultiStats) {
// a multi term query (e.g. phrase). return the summation,
// scoring almost as if it were boolean query
@@ -198,12 +199,12 @@ public abstract class SimilarityBase extends Similarity {
SimScorer subScorers[] = new SimScorer[subStats.length];
for (int i = 0; i < subScorers.length; i++) {
BasicStats basicstats = (BasicStats) subStats[i];
- subScorers[i] = new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field));
+ subScorers[i] = new BasicSimScorer(basicstats, indexCreatedVersionMajor, context.reader().getNormValues(basicstats.field));
}
return new MultiSimilarity.MultiSimScorer(subScorers);
} else {
BasicStats basicstats = (BasicStats) stats;
- return new BasicSimScorer(basicstats, context.reader().getNormValues(basicstats.field));
+ return new BasicSimScorer(basicstats, indexCreatedVersionMajor, context.reader().getNormValues(basicstats.field));
}
}
@@ -216,40 +217,38 @@ public abstract class SimilarityBase extends Similarity {
// ------------------------------ Norm handling ------------------------------
- /** Norm to document length map. */
- private static final float[] NORM_TABLE = new float[256];
+ /** Cache of decoded bytes. */
+ private static final float[] OLD_LENGTH_TABLE = new float[256];
+ private static final float[] LENGTH_TABLE = new float[256];
static {
for (int i = 1; i < 256; i++) {
- float floatNorm = SmallFloat.byte315ToFloat((byte)i);
- NORM_TABLE[i] = 1.0f / (floatNorm * floatNorm);
+ float f = SmallFloat.byte315ToFloat((byte)i);
+ OLD_LENGTH_TABLE[i] = 1.0f / (f*f);
+ }
+ OLD_LENGTH_TABLE[0] = 1.0f / OLD_LENGTH_TABLE[255]; // otherwise inf
+
+ for (int i = 0; i < 256; i++) {
+ LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i);
}
- NORM_TABLE[0] = 1.0f / NORM_TABLE[255]; // otherwise inf
}
- /** Encodes the document length in the same way as {@link TFIDFSimilarity}. */
+ /** Encodes the document length in the same way as {@link BM25Similarity}. */
@Override
- public long computeNorm(FieldInvertState state) {
- final float numTerms;
+ public final long computeNorm(FieldInvertState state) {
+ final int numTerms;
if (discountOverlaps)
numTerms = state.getLength() - state.getNumOverlap();
else
numTerms = state.getLength();
- return encodeNormValue(numTerms);
- }
-
- /** Decodes a normalization factor (document length) stored in an index.
- * @see #encodeNormValue(float)
- */
- protected float decodeNormValue(byte norm) {
- return NORM_TABLE[norm & 0xFF]; // & 0xFF maps negative bytes to positive above 127
- }
-
- /** Encodes the length to a byte via SmallFloat. */
- protected byte encodeNormValue(float length) {
- return SmallFloat.floatToByte315((float) (1 / Math.sqrt(length)));
+ int indexCreatedVersionMajor = state.getIndexCreatedVersionMajor();
+ if (indexCreatedVersionMajor >= 7) {
+ return SmallFloat.intToByte4(numTerms);
+ } else {
+ return SmallFloat.floatToByte315((float) (1 / Math.sqrt(numTerms)));
+ }
}
-
+
// ----------------------------- Static methods ------------------------------
/** Returns the base two logarithm of {@code x}. */
@@ -266,35 +265,37 @@ public abstract class SimilarityBase extends Similarity {
* {@link SimilarityBase#explain(BasicStats, int, Explanation, float)},
* respectively.
*/
- private class BasicSimScorer extends SimScorer {
+ final class BasicSimScorer extends SimScorer {
private final BasicStats stats;
private final NumericDocValues norms;
+ private final float[] normCache;
- BasicSimScorer(BasicStats stats, NumericDocValues norms) throws IOException {
+ BasicSimScorer(BasicStats stats, int indexCreatedVersionMajor, NumericDocValues norms) throws IOException {
this.stats = stats;
this.norms = norms;
+ this.normCache = indexCreatedVersionMajor >= 7 ? LENGTH_TABLE : OLD_LENGTH_TABLE;
}
- private float getNormValue(int doc) throws IOException {
+ float getLengthValue(int doc) throws IOException {
if (norms == null) {
return 1F;
}
if (norms.advanceExact(doc)) {
- return decodeNormValue((byte) norms.longValue());
+ return normCache[Byte.toUnsignedInt((byte) norms.longValue())];
} else {
- return decodeNormValue((byte) 0);
+ return 0;
}
}
@Override
public float score(int doc, float freq) throws IOException {
// We have to supply something in case norms are omitted
- return SimilarityBase.this.score(stats, freq, getNormValue(doc));
+ return SimilarityBase.this.score(stats, freq, getLengthValue(doc));
}
@Override
public Explanation explain(int doc, Explanation freq) throws IOException {
- return SimilarityBase.this.explain(stats, doc, freq, getNormValue(doc));
+ return SimilarityBase.this.explain(stats, doc, freq, getLengthValue(doc));
}
@Override
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/06a6034d/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
index 2246561..14b3c3f 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
@@ -30,6 +30,7 @@ import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.SmallFloat;
/**
@@ -233,11 +234,6 @@ import org.apache.lucene.util.BytesRef;
* And this is exactly what normalizing the query vector <i>V(q)</i>
* provides: comparability (to a certain extent) of two or more queries.
* </li>
- *
- * <li>Applying query normalization on the scores helps to keep the
- * scores around the unit vector, hence preventing loss of score data
- * because of floating point precision limitations.
- * </li>
* </ul>
* </li>
*
@@ -379,13 +375,49 @@ import org.apache.lucene.util.BytesRef;
* @see IndexSearcher#setSimilarity(Similarity)
*/
public abstract class TFIDFSimilarity extends Similarity {
-
+
+ /** Cache of decoded bytes. */
+ static final float[] OLD_NORM_TABLE = new float[256];
+
+ static {
+ for (int i = 0; i < 256; i++) {
+ OLD_NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
+ }
+ }
+
/**
* Sole constructor. (For invocation by subclass
* constructors, typically implicit.)
*/
public TFIDFSimilarity() {}
-
+
+ /**
+ * True if overlap tokens (tokens with a position increment of zero) are
+ * discounted from the document's length.
+ */
+ protected boolean discountOverlaps = true;
+
+ /** Determines whether overlap tokens (Tokens with
+ * 0 position increment) are ignored when computing
+ * norm. By default this is true, meaning overlap
+ * tokens do not count when computing norms.
+ *
+ * @lucene.experimental
+ *
+ * @see #computeNorm
+ */
+ public void setDiscountOverlaps(boolean v) {
+ discountOverlaps = v;
+ }
+
+ /**
+ * Returns true if overlap tokens are discounted from the document's length.
+ * @see #setDiscountOverlaps
+ */
+ public boolean getDiscountOverlaps() {
+ return discountOverlaps;
+ }
+
/** Computes a score factor based on a term or phrase's frequency in a
* document. This value is multiplied by the {@link #idf(long, long)}
* factor for each term in the query and these products are then summed to
@@ -471,30 +503,25 @@ public abstract class TFIDFSimilarity extends Similarity {
/**
* Compute an index-time normalization value for this field instance.
- * <p>
- * This value will be stored in a single byte lossy representation by
- * {@link #encodeNormValue(float)}.
*
- * @param state statistics of the current field (such as length, boost, etc)
- * @return an index-time normalization value
+ * @param length the number of terms in the field, optionally {@link #setDiscountOverlaps(boolean) discounting overlaps}
+ * @return a length normalization value
*/
- public abstract float lengthNorm(FieldInvertState state);
+ public abstract float lengthNorm(int length);
@Override
public final long computeNorm(FieldInvertState state) {
- float normValue = lengthNorm(state);
- return encodeNormValue(normValue);
+ final int numTerms;
+ if (discountOverlaps)
+ numTerms = state.getLength() - state.getNumOverlap();
+ else
+ numTerms = state.getLength();
+ if (state.getIndexCreatedVersionMajor() >= 7) {
+ return SmallFloat.intToByte4(numTerms);
+ } else {
+ return SmallFloat.floatToByte315(lengthNorm(numTerms));
+ }
}
-
- /**
- * Decodes a normalization factor stored in an index.
- *
- * @see #encodeNormValue(float)
- */
- public abstract float decodeNormValue(long norm);
-
- /** Encodes a normalization factor for storage in an index. */
- public abstract long encodeNormValue(float f);
/** Computes the amount of a sloppy phrase match, based on an edit distance.
* This value is summed for each sloppy phrase match in a document to form
@@ -529,24 +556,41 @@ public abstract class TFIDFSimilarity extends Similarity {
final Explanation idf = termStats.length == 1
? idfExplain(collectionStats, termStats[0])
: idfExplain(collectionStats, termStats);
- return new IDFStats(collectionStats.field(), boost, idf);
+ float[] normTable = new float[256];
+ for (int i = 1; i < 256; ++i) {
+ int length = SmallFloat.byte4ToInt((byte) i);
+ float norm = lengthNorm(length);
+ normTable[i] = norm;
+ }
+ normTable[0] = 1f / normTable[255];
+ return new IDFStats(collectionStats.field(), boost, idf, normTable);
}
@Override
public final SimScorer simScorer(SimWeight stats, LeafReaderContext context) throws IOException {
IDFStats idfstats = (IDFStats) stats;
- return new TFIDFSimScorer(idfstats, context.reader().getNormValues(idfstats.field));
+ final float[] normTable;
+ if (context.reader().getMetaData().getCreatedVersionMajor() >= 7) {
+ // the norms only encode the length, so we need a translation table that depends on how lengthNorm is implemented
+ normTable = idfstats.normTable;
+ } else {
+ // the norm is directly encoded in the index
+ normTable = OLD_NORM_TABLE;
+ }
+ return new TFIDFSimScorer(idfstats, context.reader().getNormValues(idfstats.field), normTable);
}
private final class TFIDFSimScorer extends SimScorer {
private final IDFStats stats;
private final float weightValue;
private final NumericDocValues norms;
+ private final float[] normTable;
- TFIDFSimScorer(IDFStats stats, NumericDocValues norms) throws IOException {
+ TFIDFSimScorer(IDFStats stats, NumericDocValues norms, float[] normTable) throws IOException {
this.stats = stats;
this.weightValue = stats.queryWeight;
this.norms = norms;
+ this.normTable = normTable;
}
@Override
@@ -556,13 +600,13 @@ public abstract class TFIDFSimilarity extends Similarity {
if (norms == null) {
return raw;
} else {
- long normValue;
+ float normValue;
if (norms.advanceExact(doc)) {
- normValue = norms.longValue();
+ normValue = normTable[(int) (norms.longValue() & 0xFF)];
} else {
normValue = 0;
}
- return raw * decodeNormValue(normValue); // normalize for field
+ return raw * normValue; // normalize for field
}
}
@@ -578,35 +622,39 @@ public abstract class TFIDFSimilarity extends Similarity {
@Override
public Explanation explain(int doc, Explanation freq) throws IOException {
- return explainScore(doc, freq, stats, norms);
+ return explainScore(doc, freq, stats, norms, normTable);
}
}
/** Collection statistics for the TF-IDF model. The only statistic of interest
* to this model is idf. */
- private static class IDFStats extends SimWeight {
+ static class IDFStats extends SimWeight {
private final String field;
/** The idf and its explanation */
private final Explanation idf;
private final float boost;
private final float queryWeight;
+ final float[] normTable;
- public IDFStats(String field, float boost, Explanation idf) {
+ public IDFStats(String field, float boost, Explanation idf, float[] normTable) {
// TODO: Validate?
this.field = field;
this.idf = idf;
this.boost = boost;
this.queryWeight = boost * idf.getValue();
+ this.normTable = normTable;
}
}
- private Explanation explainField(int doc, Explanation freq, IDFStats stats, NumericDocValues norms) throws IOException {
+ private Explanation explainField(int doc, Explanation freq, IDFStats stats, NumericDocValues norms, float[] normTable) throws IOException {
Explanation tfExplanation = Explanation.match(tf(freq.getValue()), "tf(freq="+freq.getValue()+"), with freq of:", freq);
float norm;
- if (norms != null && norms.advanceExact(doc)) {
- norm = decodeNormValue(norms.longValue());
- } else {
+ if (norms == null) {
norm = 1f;
+ } else if (norms.advanceExact(doc) == false) {
+ norm = 0f;
+ } else {
+ norm = normTable[(int) (norms.longValue() & 0xFF)];
}
Explanation fieldNormExpl = Explanation.match(
@@ -619,9 +667,9 @@ public abstract class TFIDFSimilarity extends Similarity {
tfExplanation, stats.idf, fieldNormExpl);
}
- private Explanation explainScore(int doc, Explanation freq, IDFStats stats, NumericDocValues norms) throws IOException {
+ private Explanation explainScore(int doc, Explanation freq, IDFStats stats, NumericDocValues norms, float[] normTable) throws IOException {
Explanation queryExpl = Explanation.match(stats.boost, "boost");
- Explanation fieldExpl = explainField(doc, freq, stats, norms);
+ Explanation fieldExpl = explainField(doc, freq, stats, norms, normTable);
if (stats.boost == 1f) {
return fieldExpl;
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/06a6034d/lucene/core/src/java/org/apache/lucene/util/SmallFloat.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/SmallFloat.java b/lucene/core/src/java/org/apache/lucene/util/SmallFloat.java
index 39395ac..317acab 100644
--- a/lucene/core/src/java/org/apache/lucene/util/SmallFloat.java
+++ b/lucene/core/src/java/org/apache/lucene/util/SmallFloat.java
@@ -97,31 +97,74 @@ public class SmallFloat {
return Float.intBitsToFloat(bits);
}
+ /** Float-like encoding for non-negative longs that preserves ordering and 4 significant bits. */
+ public static int longToInt4(long i) {
+ if (i < 0) {
+ throw new IllegalArgumentException("Only supports positive values, got " + i);
+ }
+ int numBits = 64 - Long.numberOfLeadingZeros(i);
+ if (numBits < 4) {
+ // subnormal value
+ return Math.toIntExact(i);
+ } else {
+ // normal value
+ int shift = numBits - 4;
+ // only keep the 4 most significant bits
+ int encoded = Math.toIntExact(i >>> shift);
+ // clear the most significant bit, which is implicit
+ encoded &= 0x07;
+ // encode the shift, adding 1 because 0 is reserved for subnormal values
+ encoded |= (shift + 1) << 3;
+ return encoded;
+ }
+ }
- /** floatToByte(b, mantissaBits=5, zeroExponent=2)
- * <br>smallest nonzero value = 0.033203125
- * <br>largest value = 1984.0
- * <br>epsilon = 0.03125
+ /**
+ * Decode values encoded with {@link #longToInt4(long)}.
*/
- public static byte floatToByte52(float f) {
- int bits = Float.floatToRawIntBits(f);
- int smallfloat = bits >> (24-5);
- if (smallfloat <= (63-2)<<5) {
- return (bits<=0) ? (byte)0 : (byte)1;
+ public static final long int4ToLong(int i) {
+ long bits = i & 0x07;
+ int shift = (i >>> 3) - 1;
+ long decoded;
+ if (shift == -1) {
+ // subnormal value
+ decoded = bits;
+ } else {
+ // normal value
+ decoded = (bits | 0x08) << shift;
}
- if (smallfloat >= ((63-2)<<5) + 0x100) {
- return -1;
+ return decoded;
+ }
+
+ private static final int MAX_INT4 = longToInt4(Integer.MAX_VALUE);
+ private static final int NUM_FREE_VALUES = 255 - MAX_INT4;
+
+ /**
+ * Encode an integer to a byte. It is built upon {@link #longToInt4(long)}
+ * and leverages the fact that {@code longToInt4(Integer.MAX_VALUE)} is
+ * less than 255 to encode low values more accurately.
+ */
+ public static byte intToByte4(int i) {
+ if (i < 0) {
+ throw new IllegalArgumentException("Only supports positive values, got " + i);
+ }
+ if (i < NUM_FREE_VALUES) {
+ return (byte) i;
+ } else {
+ return (byte) (NUM_FREE_VALUES + longToInt4(i - NUM_FREE_VALUES));
}
- return (byte)(smallfloat - ((63-2)<<5));
}
- /** byteToFloat(b, mantissaBits=5, zeroExponent=2) */
- public static float byte52ToFloat(byte b) {
- // on Java1.5 & 1.6 JVMs, prebuilding a decoding array and doing a lookup
- // is only a little bit faster (anywhere from 0% to 7%)
- if (b == 0) return 0.0f;
- int bits = (b&0xff) << (24-5);
- bits += (63-2) << 24;
- return Float.intBitsToFloat(bits);
+ /**
+ * Decode values that have been encoded with {@link #intToByte4(int)}.
+ */
+ public static int byte4ToInt(byte b) {
+ int i = Byte.toUnsignedInt(b);
+ if (i < NUM_FREE_VALUES) {
+ return i;
+ } else {
+ long decoded = NUM_FREE_VALUES + int4ToLong(i - NUM_FREE_VALUES);
+ return Math.toIntExact(decoded);
+ }
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/06a6034d/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java
index be3a2af..bd483d3 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexSorting.java
@@ -2441,7 +2441,7 @@ public class TestIndexSorting extends LuceneTestCase {
assertTrue(sparseValues.advanceExact(docID));
assertTrue(sparseBinaryValues.advanceExact(docID));
assertTrue(normsValues.advanceExact(docID));
- assertEquals(124, normsValues.longValue());
+ assertEquals(1, normsValues.longValue());
assertEquals(127-docID, (int) sparseValues.longValue());
assertEquals(new BytesRef(Integer.toString(127-docID)), sparseBinaryValues.binaryValue());
} else {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/06a6034d/lucene/core/src/test/org/apache/lucene/index/TestMaxTermFrequency.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestMaxTermFrequency.java b/lucene/core/src/test/org/apache/lucene/index/TestMaxTermFrequency.java
index 4f74c30..491660b 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestMaxTermFrequency.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestMaxTermFrequency.java
@@ -17,6 +17,7 @@
package org.apache.lucene.index;
+import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@@ -26,7 +27,9 @@ import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
-import org.apache.lucene.search.similarities.TFIDFSimilarity;
+import org.apache.lucene.search.CollectionStatistics;
+import org.apache.lucene.search.TermStatistics;
+import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
@@ -35,12 +38,12 @@ import org.apache.lucene.util.TestUtil;
/**
* Tests the maxTermFrequency statistic in FieldInvertState
*/
-public class TestMaxTermFrequency extends LuceneTestCase {
+public class TestMaxTermFrequency extends LuceneTestCase {
Directory dir;
IndexReader reader;
/* expected maxTermFrequency values for our documents */
ArrayList<Integer> expected = new ArrayList<>();
-
+
@Override
public void setUp() throws Exception {
super.setUp();
@@ -59,14 +62,14 @@ public class TestMaxTermFrequency extends LuceneTestCase {
reader = writer.getReader();
writer.close();
}
-
+
@Override
public void tearDown() throws Exception {
reader.close();
dir.close();
super.tearDown();
}
-
+
public void test() throws Exception {
NumericDocValues fooNorms = MultiDocValues.getNormValues(reader, "foo");
for (int i = 0; i < reader.maxDoc(); i++) {
@@ -95,30 +98,42 @@ public class TestMaxTermFrequency extends LuceneTestCase {
Collections.shuffle(terms, random());
return Arrays.toString(terms.toArray(new String[terms.size()]));
}
-
+
/**
* Simple similarity that encodes maxTermFrequency directly as a byte
*/
- static class TestSimilarity extends TFIDFSimilarity {
+ static class TestSimilarity extends Similarity {
@Override
- public float lengthNorm(FieldInvertState state) {
+ public long computeNorm(FieldInvertState state) {
return state.getMaxTermFrequency();
}
@Override
- public long encodeNormValue(float f) {
- return (byte) f;
+ public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
+ return new SimWeight() {};
}
@Override
- public float decodeNormValue(long norm) {
- return norm;
+ public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
+ return new SimScorer() {
+
+ @Override
+ public float score(int doc, float freq) throws IOException {
+ return 0;
+ }
+
+ @Override
+ public float computeSlopFactor(int distance) {
+ return 0;
+ }
+
+ @Override
+ public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
+ return 0;
+ }
+ };
}
- @Override public float tf(float freq) { return 0; }
- @Override public float idf(long docFreq, long docCount) { return 0; }
- @Override public float sloppyFreq(int distance) { return 0; }
- @Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; }
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/06a6034d/lucene/core/src/test/org/apache/lucene/index/TestNorms.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestNorms.java b/lucene/core/src/test/org/apache/lucene/index/TestNorms.java
index 64c0649..70c7a32 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestNorms.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestNorms.java
@@ -32,13 +32,11 @@ import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
import org.apache.lucene.search.similarities.Similarity;
-import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LineFileDocs;
+import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
-import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
/**
@@ -49,67 +47,6 @@ import org.apache.lucene.util.TestUtil;
@Slow
public class TestNorms extends LuceneTestCase {
static final String BYTE_TEST_FIELD = "normsTestByte";
-
- static class CustomNormEncodingSimilarity extends TFIDFSimilarity {
-
- @Override
- public long encodeNormValue(float f) {
- return (long) f;
- }
-
- @Override
- public float decodeNormValue(long norm) {
- return norm;
- }
-
- @Override
- public float lengthNorm(FieldInvertState state) {
- return state.getLength();
- }
-
- @Override public float tf(float freq) { return 0; }
- @Override public float idf(long docFreq, long docCount) { return 0; }
- @Override public float sloppyFreq(int distance) { return 0; }
- @Override public float scorePayload(int doc, int start, int end, BytesRef payload) { return 0; }
- }
-
- // LUCENE-1260
- public void testCustomEncoder() throws Exception {
- Directory dir = newDirectory();
- MockAnalyzer analyzer = new MockAnalyzer(random());
-
- IndexWriterConfig config = newIndexWriterConfig(analyzer);
- config.setSimilarity(new CustomNormEncodingSimilarity());
- RandomIndexWriter writer = new RandomIndexWriter(random(), dir, config);
- Document doc = new Document();
- Field foo = newTextField("foo", "", Field.Store.NO);
- Field bar = newTextField("bar", "", Field.Store.NO);
- doc.add(foo);
- doc.add(bar);
-
- for (int i = 0; i < 100; i++) {
- bar.setStringValue("singleton");
- writer.addDocument(doc);
- }
-
- IndexReader reader = writer.getReader();
- writer.close();
-
- NumericDocValues fooNorms = MultiDocValues.getNormValues(reader, "foo");
- for (int i = 0; i < reader.maxDoc(); i++) {
- assertEquals(i, fooNorms.nextDoc());
- assertEquals(0, fooNorms.longValue());
- }
-
- NumericDocValues barNorms = MultiDocValues.getNormValues(reader, "bar");
- for (int i = 0; i < reader.maxDoc(); i++) {
- assertEquals(i, barNorms.nextDoc());
- assertEquals(1, barNorms.longValue());
- }
-
- reader.close();
- dir.close();
- }
public void testMaxByteNorms() throws IOException {
Directory dir = newFSDirectory(createTempDir("TestNorms.testMaxByteNorms"));
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/06a6034d/lucene/core/src/test/org/apache/lucene/index/TestOmitTf.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestOmitTf.java b/lucene/core/src/test/org/apache/lucene/index/TestOmitTf.java
index 0deafdd..8af744f 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestOmitTf.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestOmitTf.java
@@ -44,9 +44,7 @@ import org.apache.lucene.util.LuceneTestCase;
public class TestOmitTf extends LuceneTestCase {
public static class SimpleSimilarity extends TFIDFSimilarity {
- @Override public float decodeNormValue(long norm) { return norm; }
- @Override public long encodeNormValue(float f) { return (long) f; }
- @Override public float lengthNorm(FieldInvertState state) { return 1; }
+ @Override public float lengthNorm(int length) { return 1; }
@Override public float tf(float freq) { return freq; }
@Override public float sloppyFreq(int distance) { return 2.0f; }
@Override public float idf(long docFreq, long docCount) { return 1.0f; }
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/06a6034d/lucene/core/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java
index e20163a..112d892 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestDisjunctionMaxQuery.java
@@ -30,7 +30,6 @@ import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
@@ -72,7 +71,7 @@ public class TestDisjunctionMaxQuery extends LuceneTestCase {
}
@Override
- public float lengthNorm(FieldInvertState state) {
+ public float lengthNorm(int length) {
// Disable length norm
return 1;
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/06a6034d/lucene/core/src/test/org/apache/lucene/search/TestElevationComparator.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestElevationComparator.java b/lucene/core/src/test/org/apache/lucene/search/TestElevationComparator.java
index fb01e1d..bc849e9 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestElevationComparator.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestElevationComparator.java
@@ -33,6 +33,7 @@ import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.FieldValueHitQueue.Entry;
+import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
@@ -63,7 +64,7 @@ public class TestElevationComparator extends LuceneTestCase {
writer.close();
IndexSearcher searcher = newSearcher(r);
- searcher.setSimilarity(new ClassicSimilarity());
+ searcher.setSimilarity(new BM25Similarity());
runTest(searcher, true);
runTest(searcher, false);
@@ -98,11 +99,11 @@ public class TestElevationComparator extends LuceneTestCase {
assertEquals(3, topDocs.scoreDocs[1].doc);
if (reversed) {
- assertEquals(2, topDocs.scoreDocs[2].doc);
- assertEquals(1, topDocs.scoreDocs[3].doc);
- } else {
assertEquals(1, topDocs.scoreDocs[2].doc);
assertEquals(2, topDocs.scoreDocs[3].doc);
+ } else {
+ assertEquals(2, topDocs.scoreDocs[2].doc);
+ assertEquals(1, topDocs.scoreDocs[3].doc);
}
/*