You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/10/23 19:53:03 UTC
svn commit: r1401363 [1/2] - in /lucene/dev/branches/lucene3846: ./
dev-tools/ dev-tools/scripts/ lucene/ lucene/analysis/
lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/
lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/...
Author: mikemccand
Date: Tue Oct 23 17:53:00 2012
New Revision: 1401363
URL: http://svn.apache.org/viewvc?rev=1401363&view=rev
Log:
LUCENE-3846: merge trunk
Added:
lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java
- copied unchanged from r1401358, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java
lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java
- copied unchanged from r1401358, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java
lucene/dev/branches/lucene3846/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java
- copied unchanged from r1401358, lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java
lucene/dev/branches/lucene3846/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java
- copied unchanged from r1401358, lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java
lucene/dev/branches/lucene3846/solr/core/src/java/org/apache/solr/spelling/suggest/fst/AnalyzingLookupFactory.java
- copied unchanged from r1401358, lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/suggest/fst/AnalyzingLookupFactory.java
lucene/dev/branches/lucene3846/solr/core/src/test-files/solr/collection1/conf/jasuggest.txt
- copied unchanged from r1401358, lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/jasuggest.txt
lucene/dev/branches/lucene3846/solr/core/src/test/org/apache/solr/spelling/suggest/TestAnalyzedSuggestions.java
- copied unchanged from r1401358, lucene/dev/trunk/solr/core/src/test/org/apache/solr/spelling/suggest/TestAnalyzedSuggestions.java
Modified:
lucene/dev/branches/lucene3846/ (props changed)
lucene/dev/branches/lucene3846/build.xml
lucene/dev/branches/lucene3846/dev-tools/ (props changed)
lucene/dev/branches/lucene3846/dev-tools/scripts/smokeTestRelease.py
lucene/dev/branches/lucene3846/lucene/ (props changed)
lucene/dev/branches/lucene3846/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/lucene3846/lucene/analysis/ (props changed)
lucene/dev/branches/lucene3846/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java
lucene/dev/branches/lucene3846/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java
lucene/dev/branches/lucene3846/lucene/build.xml (contents, props changed)
lucene/dev/branches/lucene3846/lucene/classification/build.xml
lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java
lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java
lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/package.html
lucene/dev/branches/lucene3846/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java
lucene/dev/branches/lucene3846/lucene/common-build.xml (contents, props changed)
lucene/dev/branches/lucene3846/lucene/core/ (props changed)
lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/ForUtil.java
lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsFormat.java
lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java
lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java
lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterFlushControl.java
lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/LogMergePolicy.java
lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java
lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/NoMergePolicy.java
lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java
lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/UpgradeIndexMergePolicy.java
lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
lucene/dev/branches/lucene3846/lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestBlockPostingsFormat3.java
lucene/dev/branches/lucene3846/lucene/core/src/test/org/apache/lucene/index/Test4GBStoredFields.java
lucene/dev/branches/lucene3846/lucene/core/src/test/org/apache/lucene/index/TestBagOfPostings.java
lucene/dev/branches/lucene3846/lucene/core/src/test/org/apache/lucene/index/TestNoMergePolicy.java
lucene/dev/branches/lucene3846/lucene/core/src/test/org/apache/lucene/index/TestPerSegmentDeletes.java
lucene/dev/branches/lucene3846/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java
lucene/dev/branches/lucene3846/lucene/highlighter/ (props changed)
lucene/dev/branches/lucene3846/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
lucene/dev/branches/lucene3846/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
lucene/dev/branches/lucene3846/lucene/suggest/ (props changed)
lucene/dev/branches/lucene3846/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
lucene/dev/branches/lucene3846/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java
lucene/dev/branches/lucene3846/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
lucene/dev/branches/lucene3846/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
lucene/dev/branches/lucene3846/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/WFSTCompletionTest.java
lucene/dev/branches/lucene3846/lucene/test-framework/ (props changed)
lucene/dev/branches/lucene3846/lucene/test-framework/src/java/org/apache/lucene/index/MockRandomMergePolicy.java
lucene/dev/branches/lucene3846/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java
lucene/dev/branches/lucene3846/solr/ (props changed)
lucene/dev/branches/lucene3846/solr/CHANGES.txt (contents, props changed)
lucene/dev/branches/lucene3846/solr/build.xml (contents, props changed)
lucene/dev/branches/lucene3846/solr/contrib/ (props changed)
lucene/dev/branches/lucene3846/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
lucene/dev/branches/lucene3846/solr/core/ (props changed)
lucene/dev/branches/lucene3846/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
lucene/dev/branches/lucene3846/solr/core/src/java/org/apache/solr/spelling/suggest/Suggester.java
lucene/dev/branches/lucene3846/solr/core/src/test-files/solr/collection1/conf/schema-phrasesuggest.xml
lucene/dev/branches/lucene3846/solr/core/src/test-files/solr/collection1/conf/schema.xml
lucene/dev/branches/lucene3846/solr/core/src/test-files/solr/collection1/conf/solrconfig-phrasesuggest.xml
lucene/dev/branches/lucene3846/solr/core/src/test/org/apache/solr/highlight/HighlighterTest.java
Modified: lucene/dev/branches/lucene3846/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/build.xml?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/build.xml (original)
+++ lucene/dev/branches/lucene3846/build.xml Tue Oct 23 17:53:00 2012
@@ -30,43 +30,37 @@
<subant buildpath="lucene" target="test-help" inheritall="false" failonerror="true"/>
</target>
+ <property name="tests.heap-dump-dir" location="heapdumps"/>
+
<target name="precommit" description="Run basic checks before committing"
depends="check-svn-working-copy,validate,documentation-lint"/>
<target name="test" description="Test both Lucene and Solr">
- <sequential>
- <subant target="test" inheritall="false" failonerror="true">
- <fileset dir="lucene" includes="build.xml" />
- <fileset dir="solr" includes="build.xml" />
- </subant>
- </sequential>
+ <subant target="test" inheritall="false" failonerror="true">
+ <fileset dir="lucene" includes="build.xml" />
+ <fileset dir="solr" includes="build.xml" />
+ </subant>
</target>
<target name="pitest" description="Run PITest on both Lucene and Solr">
- <sequential>
- <subant target="pitest" inheritall="false" failonerror="false">
- <fileset dir="lucene" includes="build.xml" />
- <fileset dir="solr" includes="build.xml" />
- </subant>
- </sequential>
+ <subant target="pitest" inheritall="false" failonerror="false">
+ <fileset dir="lucene" includes="build.xml" />
+ <fileset dir="solr" includes="build.xml" />
+ </subant>
</target>
<target name="documentation" description="Generate Lucene and Solr Documentation">
- <sequential>
- <subant target="documentation" inheritall="false" failonerror="true">
- <fileset dir="lucene" includes="build.xml" />
- <fileset dir="solr" includes="build.xml" />
- </subant>
- </sequential>
+ <subant target="documentation" inheritall="false" failonerror="true">
+ <fileset dir="lucene" includes="build.xml" />
+ <fileset dir="solr" includes="build.xml" />
+ </subant>
</target>
<target name="documentation-lint" description="Validates the generated documentation (HTML errors, broken links,...)">
- <sequential>
- <subant target="documentation-lint" inheritall="false" failonerror="true">
- <fileset dir="lucene" includes="build.xml" />
- <fileset dir="solr" includes="build.xml" />
- </subant>
- </sequential>
+ <subant target="documentation-lint" inheritall="false" failonerror="true">
+ <fileset dir="lucene" includes="build.xml" />
+ <fileset dir="solr" includes="build.xml" />
+ </subant>
</target>
<target name="validate" description="Validate dependencies, licenses, etc." depends="-validate-source-patterns">
@@ -218,13 +212,11 @@
<target name="clean" description="Clean Lucene and Solr build dirs">
<delete dir="dist" />
- <sequential>
- <subant target="clean" inheritall="false" failonerror="true">
- <fileset dir="lucene" includes="build.xml" />
- <fileset dir="solr" includes="build.xml" />
- </subant>
- <delete dir="dist" failonerror="false" />
- </sequential>
+ <delete dir="${tests.heap-dump-dir}" />
+ <subant target="clean" inheritall="false" failonerror="true">
+ <fileset dir="lucene" includes="build.xml" />
+ <fileset dir="solr" includes="build.xml" />
+ </subant>
</target>
<target name="ivy-bootstrap" description="Download and install Ivy in the users ant lib dir">
@@ -258,46 +250,44 @@
</target>
<target name="nightly-smoke" description="Builds an unsigned release and smoke tests it" depends="clean,-env-JAVA6_HOME,-env-JAVA7_HOME">
- <sequential>
- <fail unless="JAVA6_HOME">JAVA6_HOME property or environment variable is not defined.</fail>
- <fail unless="JAVA7_HOME">JAVA7_HOME property or environment variable is not defined.</fail>
- <subant target="prepare-release-no-sign" inheritall="false" failonerror="true">
- <fileset dir="lucene" includes="build.xml" />
- <fileset dir="solr" includes="build.xml" />
- <property name="version" value="${fakeReleaseVersion}" />
- </subant>
- <delete dir="${fakeRelease}"/>
- <delete dir="${fakeReleaseTmp}"/>
- <mkdir dir="${fakeRelease}"/>
- <copy todir="${fakeRelease}/lucene">
- <fileset dir="lucene/dist"/>
- </copy>
- <copy todir="${fakeRelease}/lucene/changes">
- <fileset dir="lucene/build/docs/changes"/>
- </copy>
- <get src="http://people.apache.org/keys/group/lucene.asc"
- dest="${fakeRelease}/lucene/KEYS"/>
- <copy todir="${fakeRelease}/solr">
- <fileset dir="solr/package"/>
- </copy>
- <copy file="${fakeRelease}/lucene/KEYS" todir="${fakeRelease}/solr"/>
- <copy todir="${fakeRelease}/solr/changes">
- <fileset dir="solr/build/docs/changes"/>
- </copy>
- <makeurl file="${fakeRelease}" validate="false" property="fakeRelease.uri"/>
- <exec executable="${python32.exe}" failonerror="true">
- <arg value="-u"/>
- <arg file="dev-tools/scripts/smokeTestRelease.py"/>
- <arg value="${fakeRelease.uri}"/>
- <arg value="${fakeReleaseVersion}"/>
- <arg file="${fakeReleaseTmp}"/>
- <arg value="false"/>
- <env key="JAVA6_HOME" file="${JAVA6_HOME}"/>
- <env key="JAVA7_HOME" file="${JAVA7_HOME}"/>
- </exec>
- <delete dir="${fakeRelease}"/>
- <delete dir="${fakeReleaseTmp}"/>
- </sequential>
+ <fail unless="JAVA6_HOME">JAVA6_HOME property or environment variable is not defined.</fail>
+ <fail unless="JAVA7_HOME">JAVA7_HOME property or environment variable is not defined.</fail>
+ <subant target="prepare-release-no-sign" inheritall="false" failonerror="true">
+ <fileset dir="lucene" includes="build.xml" />
+ <fileset dir="solr" includes="build.xml" />
+ <property name="version" value="${fakeReleaseVersion}" />
+ </subant>
+ <delete dir="${fakeRelease}"/>
+ <delete dir="${fakeReleaseTmp}"/>
+ <mkdir dir="${fakeRelease}"/>
+ <copy todir="${fakeRelease}/lucene">
+ <fileset dir="lucene/dist"/>
+ </copy>
+ <copy todir="${fakeRelease}/lucene/changes">
+ <fileset dir="lucene/build/docs/changes"/>
+ </copy>
+ <get src="http://people.apache.org/keys/group/lucene.asc"
+ dest="${fakeRelease}/lucene/KEYS"/>
+ <copy todir="${fakeRelease}/solr">
+ <fileset dir="solr/package"/>
+ </copy>
+ <copy file="${fakeRelease}/lucene/KEYS" todir="${fakeRelease}/solr"/>
+ <copy todir="${fakeRelease}/solr/changes">
+ <fileset dir="solr/build/docs/changes"/>
+ </copy>
+ <makeurl file="${fakeRelease}" validate="false" property="fakeRelease.uri"/>
+ <exec executable="${python32.exe}" failonerror="true">
+ <arg value="-u"/>
+ <arg file="dev-tools/scripts/smokeTestRelease.py"/>
+ <arg value="${fakeRelease.uri}"/>
+ <arg value="${fakeReleaseVersion}"/>
+ <arg file="${fakeReleaseTmp}"/>
+ <arg value="false"/>
+ <env key="JAVA6_HOME" file="${JAVA6_HOME}"/>
+ <env key="JAVA7_HOME" file="${JAVA7_HOME}"/>
+ </exec>
+ <delete dir="${fakeRelease}"/>
+ <delete dir="${fakeReleaseTmp}"/>
</target>
<target name="check-svn-working-copy" description="Checks the status of the SVN working copy">
@@ -309,7 +299,7 @@
<param name="run.clover" value="true"/>
<!-- must be 1, as clover does not like parallel test runs: -->
<param name="tests.jvms" value="1"/>
- <!-- Also override some other props to be fast, ignoring what's set on command line: -->
+ <!-- Also override some other props to be fast: -->
<param name="tests.multiplier" value="1"/>
<param name="tests.nightly" value="false"/>
<param name="tests.weekly" value="false"/>
@@ -326,19 +316,42 @@
<subant buildpath="." antfile="extra-targets.xml" target="-generate-clover-reports" inheritall="false" failonerror="true"/>
</target>
+ <target name="test-with-heapdumps" depends="-test-with-heapdumps-enabled,-test-with-heapdumps-disabled" description="Runs tests with heap dumps on OOM enabled (if VM supports this)"/>
+
+ <condition property="vm.supports.heapdumps">
+ <or>
+ <contains string="${java.vm.name}" substring="hotspot" casesensitive="false"/>
+ <contains string="${java.vm.name}" substring="openjdk" casesensitive="false"/>
+ <contains string="${java.vm.name}" substring="jrockit" casesensitive="false"/>
+ </or>
+ </condition>
+
+ <target name="-test-with-heapdumps-enabled" if="vm.supports.heapdumps">
+ <echo level="info" message="${java.vm.name}: Enabling heap dumps on OutOfMemoryError to dir '${tests.heap-dump-dir}'."/>
+ <mkdir dir="${tests.heap-dump-dir}"/>
+ <delete includeEmptyDirs="true">
+ <fileset dir="${tests.heap-dump-dir}" includes="**/*"/>
+ </delete>
+ <antcall inheritAll="false" target="test">
+ <param name="tests.heapdump.args" value="-XX:+HeapDumpOnOutOfMemoryError "-XX:HeapDumpPath=${tests.heap-dump-dir}""/>
+ </antcall>
+ <pathconvert property="heapdumps.list" setonempty="false" pathsep="${line.separator}">
+ <fileset dir="${tests.heap-dump-dir}"/>
+ <map from="${tests.heap-dump-dir}${file.separator}" to="* "/>
+ </pathconvert>
+ <fail if="heapdumps.list" message="Some of the tests produced a heap dump, but did not fail. Maybe a suppressed OutOfMemoryError? Dumps created:${line.separator}${heapdumps.list}"/>
+ <delete dir="${tests.heap-dump-dir}"/>
+ </target>
+
+ <target name="-test-with-heapdumps-disabled" unless="vm.supports.heapdumps">
+ <echo level="warning" message="WARN: The used JVM (${java.vm.name}) does not support HPROF heap dumps on OutOfMemoryError."/>
+ <antcall target="test"/>
+ </target>
+
<!-- Jenkins tasks -->
- <target name="jenkins-hourly" depends="clean,test,validate,-jenkins-documentation-lint,jar-checksums,check-svn-working-copy"/>
+ <target name="jenkins-hourly" depends="clean,test-with-heapdumps,validate,documentation-lint,jar-checksums,check-svn-working-copy"/>
<target name="jenkins-maven-nightly" depends="clean,remove-maven-artifacts,run-maven-build,generate-maven-artifacts,validate-maven-dependencies"/>
<target name="jenkins-clover" depends="run-clover"/>
-
- <!-- we need this extra condition, as we want to match only on "true", not solely if property is set: -->
- <property name="disable.documentation-lint" value="false" />
- <condition property="-disable.documentation-lint">
- <istrue value="${disable.documentation-lint}"/>
- </condition>
- <target name="-jenkins-documentation-lint" unless="-disable.documentation-lint">
- <antcall target="documentation-lint"/>
- </target>
</project>
Modified: lucene/dev/branches/lucene3846/dev-tools/scripts/smokeTestRelease.py
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/dev-tools/scripts/smokeTestRelease.py?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/dev-tools/scripts/smokeTestRelease.py (original)
+++ lucene/dev/branches/lucene3846/dev-tools/scripts/smokeTestRelease.py Tue Oct 23 17:53:00 2012
@@ -581,6 +581,9 @@ def verifyUnpacked(project, artifact, un
textFiles.extend(('JRE_VERSION_MIGRATION', 'CHANGES', 'MIGRATE', 'SYSTEM_REQUIREMENTS'))
if isSrc:
textFiles.append('BUILD')
+ elif not isSrc:
+ textFiles.append('SYSTEM_REQUIREMENTS')
+
for fileName in textFiles:
fileName += '.txt'
if fileName not in l:
@@ -629,10 +632,8 @@ def verifyUnpacked(project, artifact, un
if project == 'lucene':
if len(l) > 0:
raise RuntimeError('%s: unexpected files/dirs in artifact %s: %s' % (project, artifact, l))
- else:
- # TODO: re-enable this check
- if False and not os.path.exists('%s/solr/SYSTEM_REQUIREMENTS.txt' % unpackPath):
- raise RuntimeError('%s: solr/SYSTEM_REQUIREMENTS.txt does not exist in artifact %s' % (project, artifact))
+ elif isSrc and not os.path.exists('%s/solr/SYSTEM_REQUIREMENTS.txt' % unpackPath):
+ raise RuntimeError('%s: solr/SYSTEM_REQUIREMENTS.txt does not exist in artifact %s' % (project, artifact))
if isSrc:
print(' make sure no JARs/WARs in src dist...')
Modified: lucene/dev/branches/lucene3846/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/CHANGES.txt?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/lucene3846/lucene/CHANGES.txt Tue Oct 23 17:53:00 2012
@@ -44,18 +44,31 @@ New Features
the suggester to ignore such variations. (Robert Muir, Sudarshan
Gaikaiwari, Mike McCandless)
+* LUCENE-4446: Lucene 4.1 has a new default index format (Lucene41Codec)
+ that incorporates the previously experimental "Block" postings format
+ for better search performance.
+ (Han Jiang, Adrien Grand, Robert Muir, Mike McCandless)
+
API Changes
* LUCENE-4399: Deprecated AppendingCodec. Lucene's term dictionaries
no longer seek when writing. (Adrien Grand, Robert Muir)
+* LUCENE-4479: Rename TokenStream.getTokenStream(IndexReader, int, String)
+ to TokenStream.getTokenStreamWithOffsets, and return null on failure
+ rather than throwing IllegalArgumentException. (Alan Woodward)
+
+* LUCENE-4472: MergePolicy now accepts a MergeTrigger that provides
+ information about the trigger of the merge ie. merge triggered due
+ to a segment merge or a full flush etc. (Simon Willnauer)
+
Bug Fixes
* LUCENE-1822: BaseFragListBuilder hard-coded 6 char margin is too naive.
(Alex Vigdor, Arcadius Ahouansou, Koji Sekiguchi)
-* LUCENE-4468: Fix rareish integer overflows in Block and Lucene40 postings
- formats (Robert Muir)
+* LUCENE-4468: Fix rareish integer overflows in Lucene41 postings
+ format. (Robert Muir)
* LUCENE-4486: Add support for ConstantScoreQuery in Highlighter.
(Simon Willnauer)
@@ -63,18 +76,24 @@ Bug Fixes
* LUCENE-4485: When CheckIndex terms, terms/docs pairs and tokens,
these counts now all exclude deleted documents. (Mike McCandless)
+* LUCENE-4479: Highlighter works correctly for fields with term vector
+ positions, but no offsets. (Alan Woodward)
+
+* SOLR-3906: JapaneseReadingFormFilter in romaji mode will return
+ romaji even for out-of-vocabulary kana cases (e.g. half-width forms).
+ (Robert Muir)
+
Optimizations
-* LUCENE-4443: BlockPostingsFormat no longer writes unnecessary offsets
- into the skipdata. You need to reindex any indexes created with
- this experimental codec. (Robert Muir)
+* LUCENE-4443: Lucene41PostingsFormat no longer writes unnecessary offsets
+ into the skipdata. (Robert Muir)
* LUCENE-4459: Improve WeakIdentityMap.keyIterator() to remove GCed keys
from backing map early instead of waiting for reap(). This makes test
failures in TestWeakIdentityMap disappear, too.
(Uwe Schindler, Mike McCandless, Robert Muir)
-* LUCENE-4473: BlockPostingsFormat encodes offsets more efficiently
+* LUCENE-4473: Lucene41PostingsFormat encodes offsets more efficiently
for low frequency terms (< 128 occurrences). (Robert Muir)
* LUCENE-4462: DocumentsWriter now flushes deletes, segment infos and builds
@@ -82,6 +101,17 @@ Optimizations
was a single threaded process while now all IO and CPU heavy computation is done
concurrently in DocumentsWriterPerThread. (Simon Willnauer)
+* LUCENE-4496: Optimize Lucene41PostingsFormat when requesting a subset of
+ the postings data (via flags to TermsEnum.docs/docsAndPositions) to use
+ ForUtil.skipBlock. (Robert Muir)
+
+* LUCENE-4497: Don't write PosVIntCount to the positions file in
+ Lucene41PostingsFormat, as its always totalTermFreq % BLOCK_SIZE. (Robert Muir)
+
+* LUCENE-4498: In Lucene41PostingsFormat, when a term appears in only one document,
+ Instead of writing a file pointer to a VIntBlock containing the doc id, just
+ write the doc id. (Mike McCandless, Robert Muir)
+
Build
* LUCENE-4451: Memory leak per unique thread caused by
Modified: lucene/dev/branches/lucene3846/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java (original)
+++ lucene/dev/branches/lucene3846/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java Tue Oct 23 17:53:00 2012
@@ -35,6 +35,7 @@ public final class JapaneseReadingFormFi
private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
private final ReadingAttribute readingAttr = addAttribute(ReadingAttribute.class);
+ private StringBuilder buffer = new StringBuilder();
private boolean useRomaji;
public JapaneseReadingFormFilter(TokenStream input, boolean useRomaji) {
@@ -50,10 +51,19 @@ public final class JapaneseReadingFormFi
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
String reading = readingAttr.getReading();
- if (reading != null) {
- if (useRomaji) {
- ToStringUtil.getRomanization(termAttr.setEmpty(), reading);
+
+ if (useRomaji) {
+ if (reading == null) {
+ // if its an OOV term, just try the term text
+ buffer.setLength(0);
+ ToStringUtil.getRomanization(buffer, termAttr);
+ termAttr.setEmpty().append(buffer);
} else {
+ ToStringUtil.getRomanization(termAttr.setEmpty(), reading);
+ }
+ } else {
+ // just replace the term text with the reading, if it exists
+ if (reading != null) {
termAttr.setEmpty().append(reading);
}
}
Modified: lucene/dev/branches/lucene3846/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java (original)
+++ lucene/dev/branches/lucene3846/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java Tue Oct 23 17:53:00 2012
@@ -19,7 +19,9 @@ package org.apache.lucene.analysis.ja;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import java.io.IOException;
@@ -52,12 +54,40 @@ public class TestJapaneseReadingFormFilt
new String[] { "コンヤ", "ハ", "ロバート", "センセイ", "ト", "ハナシ", "タ" }
);
}
+
+ public void testKatakanaReadingsHalfWidth() throws IOException {
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.Mode.SEARCH);
+ TokenStream stream = new CJKWidthFilter(tokenizer);
+ return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(stream, false));
+ }
+ };
+ assertAnalyzesTo(a, "今夜はﾛﾊﾞｰﾄ先生と話した",
+ new String[] { "コンヤ", "ハ", "ロバート", "センセイ", "ト", "ハナシ", "タ" }
+ );
+ }
public void testRomajiReadings() throws IOException {
assertAnalyzesTo(romajiAnalyzer, "今夜はロバート先生と話した",
new String[] { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta" }
);
}
+
+ public void testRomajiReadingsHalfWidth() throws IOException {
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.Mode.SEARCH);
+ TokenStream stream = new CJKWidthFilter(tokenizer);
+ return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(stream, true));
+ }
+ };
+ assertAnalyzesTo(a, "今夜はﾛﾊﾞｰﾄ先生と話した",
+ new String[] { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta" }
+ );
+ }
public void testRandomData() throws IOException {
Random random = random();
Modified: lucene/dev/branches/lucene3846/lucene/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/build.xml?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/build.xml (original)
+++ lucene/dev/branches/lucene3846/lucene/build.xml Tue Oct 23 17:53:00 2012
@@ -226,59 +226,63 @@
<target name="javadoc" depends="javadocs"/>
<target name="javadocs" description="Generate javadoc" depends="javadocs-lucene-core, javadocs-modules, javadocs-test-framework"/>
+ <target name="documentation-lint" depends="-ecj-javadoc-lint,-documentation-lint,-documentation-lint-unsupported"
+ description="Validates the generated documentation (HTML errors, broken links,...)"/>
+
<!-- we check for broken links across all documentation -->
- <target name="documentation-lint" depends="compile-test-framework,documentation,-ecj-resolve">
- <sequential>
- <subant target="-ecj-javadoc-lint" failonerror="true" inheritall="false">
- <propertyset refid="uptodate.and.compiled.properties"/>
- <fileset dir="core" includes="build.xml"/>
- <fileset dir="test-framework" includes="build.xml"/>
- </subant>
- <modules-crawl target="-ecj-javadoc-lint"/>
- <echo message="Checking for broken links..."/>
- <check-broken-links dir="build/docs"/>
- <echo message="Checking for missing docs..."/>
- <!-- TODO: change this level=method -->
- <check-missing-javadocs dir="build/docs" level="class"/>
- <!-- too many classes to fix overall to just enable
- the above to be level="method" right now, but we
- can prevent the modules that don't have problems
- from getting any worse -->
- <!-- analyzers-common: problems -->
- <check-missing-javadocs dir="build/docs/analyzers-icu" level="method"/>
- <!-- analyzers-kuromoji: problems -->
- <check-missing-javadocs dir="build/docs/analyzers-morfologik" level="method"/>
- <check-missing-javadocs dir="build/docs/analyzers-phonetic" level="method"/>
- <!-- analyzers-smartcn: problems -->
- <check-missing-javadocs dir="build/docs/analyzers-stempel" level="method"/>
- <!-- analyzers-uima: problems -->
- <!-- benchmark: problems -->
- <check-missing-javadocs dir="build/docs/classification" level="method"/>
- <!-- codecs: problems -->
- <!-- core: problems -->
- <check-missing-javadocs dir="build/docs/demo" level="method"/>
- <!-- facet: problems -->
- <!-- grouping: problems -->
- <!-- highlighter: problems -->
- <check-missing-javadocs dir="build/docs/join" level="method"/>
- <check-missing-javadocs dir="build/docs/memory" level="method"/>
- <!-- misc: problems -->
- <!-- queries: problems -->
- <!-- queryparser: problems -->
- <!-- sandbox: problems -->
- <!-- spatial: problems -->
- <check-missing-javadocs dir="build/docs/suggest" level="method"/>
- <!-- test-framework: problems -->
-
- <!-- too much to fix core/ for now, but enforce full javadocs for key packages -->
- <check-missing-javadocs dir="build/docs/core/org/apache/lucene/analysis" level="method"/>
- <check-missing-javadocs dir="build/docs/core/org/apache/lucene/document" level="method"/>
- <check-missing-javadocs dir="build/docs/core/org/apache/lucene/search/similarities" level="method"/>
- <check-missing-javadocs dir="build/docs/core/org/apache/lucene/index" level="method"/>
- <check-missing-javadocs dir="build/docs/core/org/apache/lucene/codecs" level="method"/>
- </sequential>
+ <target name="-documentation-lint" if="documentation-lint.supported" depends="documentation">
+ <echo message="Checking for broken links..."/>
+ <check-broken-links dir="build/docs"/>
+ <echo message="Checking for missing docs..."/>
+ <!-- TODO: change this level=method -->
+ <check-missing-javadocs dir="build/docs" level="class"/>
+ <!-- too many classes to fix overall to just enable
+ the above to be level="method" right now, but we
+ can prevent the modules that don't have problems
+ from getting any worse -->
+ <!-- analyzers-common: problems -->
+ <check-missing-javadocs dir="build/docs/analyzers-icu" level="method"/>
+ <!-- analyzers-kuromoji: problems -->
+ <check-missing-javadocs dir="build/docs/analyzers-morfologik" level="method"/>
+ <check-missing-javadocs dir="build/docs/analyzers-phonetic" level="method"/>
+ <!-- analyzers-smartcn: problems -->
+ <check-missing-javadocs dir="build/docs/analyzers-stempel" level="method"/>
+ <!-- analyzers-uima: problems -->
+ <!-- benchmark: problems -->
+ <check-missing-javadocs dir="build/docs/classification" level="method"/>
+ <!-- codecs: problems -->
+ <!-- core: problems -->
+ <check-missing-javadocs dir="build/docs/demo" level="method"/>
+ <!-- facet: problems -->
+ <!-- grouping: problems -->
+ <!-- highlighter: problems -->
+ <check-missing-javadocs dir="build/docs/join" level="method"/>
+ <check-missing-javadocs dir="build/docs/memory" level="method"/>
+ <!-- misc: problems -->
+ <!-- queries: problems -->
+ <!-- queryparser: problems -->
+ <!-- sandbox: problems -->
+ <!-- spatial: problems -->
+ <check-missing-javadocs dir="build/docs/suggest" level="method"/>
+ <!-- test-framework: problems -->
+
+ <!-- too much to fix core/ for now, but enforce full javadocs for key packages -->
+ <check-missing-javadocs dir="build/docs/core/org/apache/lucene/analysis" level="method"/>
+ <check-missing-javadocs dir="build/docs/core/org/apache/lucene/document" level="method"/>
+ <check-missing-javadocs dir="build/docs/core/org/apache/lucene/search/similarities" level="method"/>
+ <check-missing-javadocs dir="build/docs/core/org/apache/lucene/index" level="method"/>
+ <check-missing-javadocs dir="build/docs/core/org/apache/lucene/codecs" level="method"/>
</target>
+ <target name="-ecj-javadoc-lint" depends="documentation,compile-test-framework,-ecj-resolve">
+ <subant target="-ecj-javadoc-lint" failonerror="true" inheritall="false">
+ <propertyset refid="uptodate.and.compiled.properties"/>
+ <fileset dir="core" includes="build.xml"/>
+ <fileset dir="test-framework" includes="build.xml"/>
+ </subant>
+ <modules-crawl target="-ecj-javadoc-lint"/>
+ </target>
+
<target name="process-webpages" depends="resolve-groovy,resolve-pegdown">
<makeurl property="process-webpages.buildfiles" separator="|">
<fileset dir="." includes="**/build.xml" excludes="build.xml,analysis/*,build/**,tools/**,backwards/**,site/**"/>
Modified: lucene/dev/branches/lucene3846/lucene/classification/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/classification/build.xml?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/classification/build.xml (original)
+++ lucene/dev/branches/lucene3846/lucene/classification/build.xml Tue Oct 23 17:53:00 2012
@@ -23,4 +23,30 @@
</description>
<import file="../module-build.xml"/>
+
+ <path id="base.classpath">
+ <pathelement location="${common.dir}/build/core/classes/java"/>
+ <pathelement path="${queries.jar}"/>
+ <pathelement path="${project.classpath}"/>
+ </path>
+
+ <path id="test.classpath">
+ <pathelement path="${analyzers-common.jar}"/>
+ <pathelement location="${common.dir}/build/test-framework/classes/java"/>
+ <pathelement location="${common.dir}/build/codecs/classes/java"/>
+ <path refid="classpath"/>
+ <path refid="junit-path"/>
+ <pathelement location="${build.dir}/classes/java"/>
+ </path>
+
+ <target name="compile-core" depends="jar-queries,jar-analyzers-common,common.compile-core" />
+
+ <target name="javadocs" depends="javadocs-queries,compile-core">
+ <invoke-module-javadoc>
+ <links>
+ <link href="../queries"/>
+ </links>
+ </invoke-module-javadoc>
+ </target>
+
</project>
Modified: lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java (original)
+++ lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java Tue Oct 23 17:53:00 2012
@@ -29,12 +29,12 @@ import java.io.IOException;
public interface Classifier {
/**
- * Assign a class to the given text String
+ * Assign a class (with score) to the given text String
* @param text a String containing text to be classified
- * @return a String representing a class
+ * @return a {@link ClassificationResult} holding assigned class and score
* @throws IOException If there is a low-level I/O error.
*/
- public String assignClass(String text) throws IOException;
+ public ClassificationResult assignClass(String text) throws IOException;
/**
* Train the classifier using the underlying Lucene index
Modified: lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java (original)
+++ lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java Tue Oct 23 17:53:00 2012
@@ -80,7 +80,7 @@ public class SimpleNaiveBayesClassifier
return result.toArray(new String[result.size()]);
}
- public String assignClass(String inputDocument) throws IOException {
+ public ClassificationResult assignClass(String inputDocument) throws IOException {
if (atomicReader == null) {
throw new RuntimeException("need to train the classifier first");
}
@@ -98,7 +98,7 @@ public class SimpleNaiveBayesClassifier
foundClass = next.utf8ToString();
}
}
- return foundClass;
+ return new ClassificationResult(foundClass, max);
}
Modified: lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/package.html?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/package.html (original)
+++ lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/package.html Tue Oct 23 17:53:00 2012
@@ -18,6 +18,6 @@
<body>
Uses already seen data (the indexed documents) to classify new documents.
Currently only contains a (simplistic) Lucene based Naive Bayes classifier
-but more implementations will be added in the future.
+and a k-Nearest Neighbor classifier
</body>
</html>
Modified: lucene/dev/branches/lucene3846/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java (original)
+++ lucene/dev/branches/lucene3846/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java Tue Oct 23 17:53:00 2012
@@ -19,112 +19,32 @@ package org.apache.lucene.classification
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.TextField;
-import org.apache.lucene.index.RandomIndexWriter;
-import org.apache.lucene.index.SlowCompositeReaderWrapper;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.LuceneTestCase;
-import org.junit.After;
-import org.junit.Before;
+import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.junit.Test;
+import java.io.Reader;
+
/**
* Testcase for {@link SimpleNaiveBayesClassifier}
*/
-public class SimpleNaiveBayesClassifierTest extends LuceneTestCase {
-
- private RandomIndexWriter indexWriter;
- private String textFieldName;
- private String classFieldName;
- private Analyzer analyzer;
- private Directory dir;
-
- @Before
- public void setUp() throws Exception {
- super.setUp();
- analyzer = new MockAnalyzer(random());
- dir = newDirectory();
- indexWriter = new RandomIndexWriter(random(), dir);
- textFieldName = "text";
- classFieldName = "cat";
- }
-
- @After
- public void tearDown() throws Exception {
- super.tearDown();
- indexWriter.close();
- dir.close();
- }
+public class SimpleNaiveBayesClassifierTest extends ClassificationTestBase {
@Test
public void testBasicUsage() throws Exception {
- SlowCompositeReaderWrapper compositeReaderWrapper = null;
- try {
- populateIndex();
- SimpleNaiveBayesClassifier simpleNaiveBayesClassifier = new SimpleNaiveBayesClassifier();
- compositeReaderWrapper = new SlowCompositeReaderWrapper(indexWriter.getReader());
- simpleNaiveBayesClassifier.train(compositeReaderWrapper, textFieldName, classFieldName, analyzer);
- String newText = "Much is made of what the likes of Facebook, Google and Apple know about users. Truth is, Amazon may know more. ";
- assertEquals("technology", simpleNaiveBayesClassifier.assignClass(newText));
- } finally {
- if (compositeReaderWrapper != null)
- compositeReaderWrapper.close();
- }
+ checkCorrectClassification(new SimpleNaiveBayesClassifier(), new MockAnalyzer(random()));
}
- private void populateIndex() throws Exception {
-
- Document doc = new Document();
- doc.add(new TextField(textFieldName, "The traveling press secretary for Mitt Romney lost his cool and cursed at reporters " +
- "who attempted to ask questions of the Republican presidential candidate in a public plaza near the Tomb of " +
- "the Unknown Soldier in Warsaw Tuesday.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
-
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "Mitt Romney seeks to assure Israel and Iran, as well as Jewish voters in the United" +
- " States, that he will be tougher against Iran's nuclear ambitions than President Barack Obama.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "And there's a threshold question that he has to answer for the American people and " +
- "that's whether he is prepared to be commander-in-chief,\" she continued. \"As we look to the past events, we " +
- "know that this raises some questions about his preparedness and we'll see how the rest of his trip goes.\"", Field.Store.YES));
- doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "Still, when it comes to gun policy, many congressional Democrats have \"decided to " +
- "keep quiet and not go there,\" said Alan Lizotte, dean and professor at the State University of New York at " +
- "Albany's School of Criminal Justice.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "Standing amongst the thousands of people at the state Capitol, Jorstad, director of " +
- "technology at the University of Wisconsin-La Crosse, documented the historic moment and shared it with the " +
- "world through the Internet.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "So, about all those experts and analysts who've spent the past year or so saying " +
- "Facebook was going to make a phone. A new expert has stepped forward to say it's not going to happen.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "More than 400 million people trust Google with their e-mail, and 50 million store files" +
- " in the cloud using the Dropbox service. People manage their bank accounts, pay bills, trade stocks and " +
- "generally transfer or store huge volumes of personal data online.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
+ @Test
+ public void testNGramUsage() throws Exception {
+ checkCorrectClassification(new SimpleNaiveBayesClassifier(), new NGramAnalyzer());
+ }
- indexWriter.commit();
+ private class NGramAnalyzer extends Analyzer {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ return new TokenStreamComponents(new EdgeNGramTokenizer(reader, EdgeNGramTokenizer.Side.BACK,
+ 10, 20));
+ }
}
}
Modified: lucene/dev/branches/lucene3846/lucene/common-build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/common-build.xml?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/common-build.xml (original)
+++ lucene/dev/branches/lucene3846/lucene/common-build.xml Tue Oct 23 17:53:00 2012
@@ -109,6 +109,8 @@
</condition>
<property name="tests.clover.args" value=""/>
+ <property name="tests.heapdump.args" value=""/>
+
<property name="tests.tempDir" location="${build.dir}/test"/>
<property name="tests.cachefile" location="${common.dir}/tools/junit4/cached-timehints.txt" />
@@ -263,6 +265,25 @@
</condition>
</fail>
+ <condition property="documentation-lint.supported">
+ <and>
+ <or>
+ <contains string="${java.vm.name}" substring="hotspot" casesensitive="false"/>
+ <contains string="${java.vm.name}" substring="openjdk" casesensitive="false"/>
+ <contains string="${java.vm.name}" substring="jrockit" casesensitive="false"/>
+ </or>
+ <or>
+ <equals arg1="${ant.java.version}" arg2="1.6"/>
+ <equals arg1="${ant.java.version}" arg2="1.7"/>
+ <equals arg1="${ant.java.version}" arg2="1.8"/>
+ </or>
+ </and>
+ </condition>
+
+ <target name="-documentation-lint-unsupported" unless="documentation-lint.supported">
+ <echo level="warning" message="WARN: Linting documentation HTML is not supported on this Java version (${ant.java.version}) / JVM (${java.vm.name}). NOTHING DONE!"/>
+ </target>
+
<!-- Import custom ANT tasks. -->
<import file="${common.dir}/tools/custom-tasks.xml" />
@@ -826,6 +847,7 @@
<!-- JVM arguments and system properties. -->
<jvmarg line="${args}"/>
+ <jvmarg line="${tests.heapdump.args}"/>
<jvmarg line="${tests.clover.args}"/>
<!-- set the number of times tests should run -->
Modified: lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/ForUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/ForUtil.java?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/ForUtil.java (original)
+++ lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/ForUtil.java Tue Oct 23 17:53:00 2012
@@ -157,7 +157,7 @@ final class ForUtil {
*/
void writeBlock(int[] data, byte[] encoded, IndexOutput out) throws IOException {
if (isAllEqual(data)) {
- out.writeVInt(ALL_VALUES_EQUAL);
+ out.writeByte((byte) ALL_VALUES_EQUAL);
out.writeVInt(data[0]);
return;
}
@@ -170,7 +170,7 @@ final class ForUtil {
final int encodedSize = encodedSizes[numBits];
assert (iters * encoder.blockCount()) << 3 >= encodedSize;
- out.writeVInt(numBits);
+ out.writeByte((byte) numBits);
encoder.encode(data, 0, encoded, 0, iters);
out.writeBytes(encoded, encodedSize);
@@ -185,7 +185,7 @@ final class ForUtil {
* @throws IOException If there is a low-level I/O error
*/
void readBlock(IndexInput in, byte[] encoded, int[] decoded) throws IOException {
- final int numBits = in.readVInt();
+ final int numBits = in.readByte();
assert numBits <= 32 : numBits;
if (numBits == ALL_VALUES_EQUAL) {
@@ -211,7 +211,7 @@ final class ForUtil {
* @throws IOException If there is a low-level I/O error
*/
void skipBlock(IndexInput in) throws IOException {
- final int numBits = in.readVInt();
+ final int numBits = in.readByte();
if (numBits == ALL_VALUES_EQUAL) {
in.readVInt();
return;
@@ -222,7 +222,7 @@ final class ForUtil {
}
private static boolean isAllEqual(final int[] data) {
- final long v = data[0];
+ final int v = data[0];
for (int i = 1; i < BLOCK_SIZE; ++i) {
if (data[i] != v) {
return false;
Modified: lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsFormat.java?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsFormat.java (original)
+++ lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsFormat.java Tue Oct 23 17:53:00 2012
@@ -127,10 +127,10 @@ import org.apache.lucene.util.packed.Pac
*
* <ul>
* <li>Postings Metadata --> Header, PackedBlockSize</li>
- * <li>Term Metadata --> DocFPDelta, PosFPDelta?, PosVIntBlockFPDelta?, PayFPDelta?,
+ * <li>Term Metadata --> (DocFPDelta|SingletonDocID), PosFPDelta?, PosVIntBlockFPDelta?, PayFPDelta?,
* SkipFPDelta?</li>
* <li>Header, --> {@link CodecUtil#writeHeader CodecHeader}</li>
- * <li>PackedBlockSize --> {@link DataOutput#writeVInt VInt}</li>
+ * <li>PackedBlockSize, SingletonDocID --> {@link DataOutput#writeVInt VInt}</li>
* <li>DocFPDelta, PosFPDelta, PayFPDelta, PosVIntBlockFPDelta, SkipFPDelta --> {@link DataOutput#writeVLong VLong}</li>
* </ul>
* <p>Notes:</p>
@@ -162,6 +162,9 @@ import org.apache.lucene.util.packed.Pac
* file. In particular, it is the length of the TermFreq data.
* SkipDelta is only stored if DocFreq is not smaller than SkipMinimum
* (i.e. 8 in Lucene41PostingsFormat).</li>
+ * <li>SingletonDocID is an optimization when a term only appears in one document. In this case, instead
+ * of writing a file pointer to the .doc file (DocFPDelta), and then a VIntBlock at that location, the
+ * single document ID is written to the term dictionary.</li>
* </ul>
* </dd>
* </dl>
@@ -274,10 +277,10 @@ import org.apache.lucene.util.packed.Pac
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>TermPositions --> <PackedPosDeltaBlock> <sup>PackedPosBlockNum</sup>,
* VIntBlock? </li>
- * <li>VIntBlock --> PosVIntCount, <PositionDelta[, PayloadLength?], PayloadData?,
+ * <li>VIntBlock --> <PositionDelta[, PayloadLength?], PayloadData?,
* OffsetDelta?, OffsetLength?><sup>PosVIntCount</sup>
* <li>PackedPosDeltaBlock --> {@link PackedInts PackedInts}</li>
- * <li>PosVIntCount, PositionDelta, OffsetDelta, OffsetLength -->
+ * <li>PositionDelta, OffsetDelta, OffsetLength -->
* {@link DataOutput#writeVInt VInt}</li>
* <li>PayloadData --> {@link DataOutput#writeByte byte}<sup>PayLength</sup></li>
* </ul>
Modified: lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java (original)
+++ lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java Tue Oct 23 17:53:00 2012
@@ -148,6 +148,9 @@ public final class Lucene41PostingsReade
long payStartFP;
long skipOffset;
long lastPosBlockOffset;
+ // docid when there is a single pulsed posting, otherwise -1
+ // freq is always implicitly totalTermFreq in this case.
+ int singletonDocID;
// Only used by the "primary" TermState -- clones don't
// copy this (basically they are "transient"):
@@ -170,6 +173,7 @@ public final class Lucene41PostingsReade
payStartFP = other.payStartFP;
lastPosBlockOffset = other.lastPosBlockOffset;
skipOffset = other.skipOffset;
+ singletonDocID = other.singletonDocID;
// Do not copy bytes, bytesReader (else TermState is
// very heavy, ie drags around the entire block's
@@ -179,7 +183,7 @@ public final class Lucene41PostingsReade
@Override
public String toString() {
- return super.toString() + " docStartFP=" + docStartFP + " posStartFP=" + posStartFP + " payStartFP=" + payStartFP + " lastPosBlockOffset=" + lastPosBlockOffset;
+ return super.toString() + " docStartFP=" + docStartFP + " posStartFP=" + posStartFP + " payStartFP=" + payStartFP + " lastPosBlockOffset=" + lastPosBlockOffset + " singletonDocID=" + singletonDocID;
}
}
@@ -223,7 +227,13 @@ public final class Lucene41PostingsReade
final DataInput in = termState.bytesReader;
if (isFirstTerm) {
- termState.docStartFP = in.readVLong();
+ if (termState.docFreq == 1) {
+ termState.singletonDocID = in.readVInt();
+ termState.docStartFP = 0;
+ } else {
+ termState.singletonDocID = -1;
+ termState.docStartFP = in.readVLong();
+ }
if (fieldHasPositions) {
termState.posStartFP = in.readVLong();
if (termState.totalTermFreq > BLOCK_SIZE) {
@@ -238,7 +248,12 @@ public final class Lucene41PostingsReade
}
}
} else {
- termState.docStartFP += in.readVLong();
+ if (termState.docFreq == 1) {
+ termState.singletonDocID = in.readVInt();
+ } else {
+ termState.singletonDocID = -1;
+ termState.docStartFP += in.readVLong();
+ }
if (fieldHasPositions) {
termState.posStartFP += in.readVLong();
if (termState.totalTermFreq > BLOCK_SIZE) {
@@ -275,10 +290,10 @@ public final class Lucene41PostingsReade
} else {
docsEnum = new BlockDocsEnum(fieldInfo);
}
- return docsEnum.reset(liveDocs, (IntBlockTermState) termState);
+ return docsEnum.reset(liveDocs, (IntBlockTermState) termState, flags);
}
- // TODO: specialize to liveDocs vs not, and freqs vs not
+ // TODO: specialize to liveDocs vs not
@Override
public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState termState, Bits liveDocs,
@@ -310,7 +325,7 @@ public final class Lucene41PostingsReade
} else {
everythingEnum = new EverythingEnum(fieldInfo);
}
- return everythingEnum.reset(liveDocs, (IntBlockTermState) termState);
+ return everythingEnum.reset(liveDocs, (IntBlockTermState) termState, flags);
}
}
@@ -327,13 +342,14 @@ public final class Lucene41PostingsReade
final IndexInput startDocIn;
- final IndexInput docIn;
+ IndexInput docIn;
final boolean indexHasFreq;
final boolean indexHasPos;
final boolean indexHasOffsets;
final boolean indexHasPayloads;
private int docFreq; // number of docs in this posting list
+ private long totalTermFreq; // sum of freqs in this posting list (or docFreq when omitted)
private int docUpto; // how many docs we've read
private int doc; // doc we last read
private int accum; // accumulator for doc deltas
@@ -352,10 +368,13 @@ public final class Lucene41PostingsReade
private int nextSkipDoc;
private Bits liveDocs;
+
+ private boolean needsFreq; // true if the caller actually needs frequencies
+ private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1
public BlockDocsEnum(FieldInfo fieldInfo) throws IOException {
this.startDocIn = Lucene41PostingsReader.this.docIn;
- this.docIn = startDocIn.clone();
+ this.docIn = null;
indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
indexHasPos = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
@@ -370,17 +389,26 @@ public final class Lucene41PostingsReade
indexHasPayloads == fieldInfo.hasPayloads();
}
- public DocsEnum reset(Bits liveDocs, IntBlockTermState termState) throws IOException {
+ public DocsEnum reset(Bits liveDocs, IntBlockTermState termState, int flags) throws IOException {
this.liveDocs = liveDocs;
// if (DEBUG) {
// System.out.println(" FPR.reset: termState=" + termState);
// }
docFreq = termState.docFreq;
+ totalTermFreq = indexHasFreq ? termState.totalTermFreq : docFreq;
docTermStartFP = termState.docStartFP;
- docIn.seek(docTermStartFP);
skipOffset = termState.skipOffset;
+ singletonDocID = termState.singletonDocID;
+ if (docFreq > 1) {
+ if (docIn == null) {
+ // lazy init
+ docIn = startDocIn.clone();
+ }
+ docIn.seek(docTermStartFP);
+ }
doc = -1;
+ this.needsFreq = (flags & DocsEnum.FLAG_FREQS) != 0;
if (!indexHasFreq) {
Arrays.fill(freqBuffer, 1);
}
@@ -416,8 +444,15 @@ public final class Lucene41PostingsReade
// if (DEBUG) {
// System.out.println(" fill freq block from fp=" + docIn.getFilePointer());
// }
- forUtil.readBlock(docIn, encoded, freqBuffer);
+ if (needsFreq) {
+ forUtil.readBlock(docIn, encoded, freqBuffer);
+ } else {
+ forUtil.skipBlock(docIn); // skip over freqs
+ }
}
+ } else if (docFreq == 1) {
+ docDeltaBuffer[0] = singletonDocID;
+ freqBuffer[0] = (int) totalTermFreq;
} else {
// Read vInts:
// if (DEBUG) {
@@ -583,13 +618,14 @@ public final class Lucene41PostingsReade
final IndexInput startDocIn;
- final IndexInput docIn;
+ IndexInput docIn;
final IndexInput posIn;
final boolean indexHasOffsets;
final boolean indexHasPayloads;
private int docFreq; // number of docs in this posting list
+ private long totalTermFreq; // number of positions in this posting list
private int docUpto; // how many docs we've read
private int doc; // doc we last read
private int accum; // accumulator for doc deltas
@@ -627,10 +663,11 @@ public final class Lucene41PostingsReade
private int nextSkipDoc;
private Bits liveDocs;
+ private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1
public BlockDocsAndPositionsEnum(FieldInfo fieldInfo) throws IOException {
this.startDocIn = Lucene41PostingsReader.this.docIn;
- this.docIn = startDocIn.clone();
+ this.docIn = null;
this.posIn = Lucene41PostingsReader.this.posIn.clone();
encoded = new byte[MAX_ENCODED_SIZE];
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
@@ -652,8 +689,16 @@ public final class Lucene41PostingsReade
docTermStartFP = termState.docStartFP;
posTermStartFP = termState.posStartFP;
payTermStartFP = termState.payStartFP;
- docIn.seek(docTermStartFP);
skipOffset = termState.skipOffset;
+ totalTermFreq = termState.totalTermFreq;
+ singletonDocID = termState.singletonDocID;
+ if (docFreq > 1) {
+ if (docIn == null) {
+ // lazy init
+ docIn = startDocIn.clone();
+ }
+ docIn.seek(docTermStartFP);
+ }
posPendingFP = posTermStartFP;
posPendingCount = 0;
if (termState.totalTermFreq < BLOCK_SIZE) {
@@ -696,6 +741,9 @@ public final class Lucene41PostingsReade
// System.out.println(" fill freq block from fp=" + docIn.getFilePointer());
// }
forUtil.readBlock(docIn, encoded, freqBuffer);
+ } else if (docFreq == 1) {
+ docDeltaBuffer[0] = singletonDocID;
+ freqBuffer[0] = (int) totalTermFreq;
} else {
// Read vInts:
// if (DEBUG) {
@@ -714,7 +762,7 @@ public final class Lucene41PostingsReade
// if (DEBUG) {
// System.out.println(" vInt pos block @ fp=" + posIn.getFilePointer() + " hasPayloads=" + indexHasPayloads + " hasOffsets=" + indexHasOffsets);
// }
- final int count = posIn.readVInt();
+ final int count = (int) (totalTermFreq % BLOCK_SIZE);
int payloadLength = 0;
for(int i=0;i<count;i++) {
int code = posIn.readVInt();
@@ -993,7 +1041,7 @@ public final class Lucene41PostingsReade
final IndexInput startDocIn;
- final IndexInput docIn;
+ IndexInput docIn;
final IndexInput posIn;
final IndexInput payIn;
final BytesRef payload;
@@ -1002,6 +1050,7 @@ public final class Lucene41PostingsReade
final boolean indexHasPayloads;
private int docFreq; // number of docs in this posting list
+ private long totalTermFreq; // number of positions in this posting list
private int docUpto; // how many docs we've read
private int doc; // doc we last read
private int accum; // accumulator for doc deltas
@@ -1044,9 +1093,13 @@ public final class Lucene41PostingsReade
private Bits liveDocs;
+ private boolean needsOffsets; // true if we actually need offsets
+ private boolean needsPayloads; // true if we actually need payloads
+ private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1
+
public EverythingEnum(FieldInfo fieldInfo) throws IOException {
this.startDocIn = Lucene41PostingsReader.this.docIn;
- this.docIn = startDocIn.clone();
+ this.docIn = null;
this.posIn = Lucene41PostingsReader.this.posIn.clone();
this.payIn = Lucene41PostingsReader.this.payIn.clone();
encoded = new byte[MAX_ENCODED_SIZE];
@@ -1079,7 +1132,7 @@ public final class Lucene41PostingsReade
indexHasPayloads == fieldInfo.hasPayloads();
}
- public EverythingEnum reset(Bits liveDocs, IntBlockTermState termState) throws IOException {
+ public EverythingEnum reset(Bits liveDocs, IntBlockTermState termState, int flags) throws IOException {
this.liveDocs = liveDocs;
// if (DEBUG) {
// System.out.println(" FPR.reset: termState=" + termState);
@@ -1088,8 +1141,16 @@ public final class Lucene41PostingsReade
docTermStartFP = termState.docStartFP;
posTermStartFP = termState.posStartFP;
payTermStartFP = termState.payStartFP;
- docIn.seek(docTermStartFP);
skipOffset = termState.skipOffset;
+ totalTermFreq = termState.totalTermFreq;
+ singletonDocID = termState.singletonDocID;
+ if (docFreq > 1) {
+ if (docIn == null) {
+ // lazy init
+ docIn = startDocIn.clone();
+ }
+ docIn.seek(docTermStartFP);
+ }
posPendingFP = posTermStartFP;
payPendingFP = payTermStartFP;
posPendingCount = 0;
@@ -1101,6 +1162,9 @@ public final class Lucene41PostingsReade
lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset;
}
+ this.needsOffsets = (flags & DocsAndPositionsEnum.FLAG_OFFSETS) != 0;
+ this.needsPayloads = (flags & DocsAndPositionsEnum.FLAG_PAYLOADS) != 0;
+
doc = -1;
accum = 0;
docUpto = 0;
@@ -1133,6 +1197,9 @@ public final class Lucene41PostingsReade
// System.out.println(" fill freq block from fp=" + docIn.getFilePointer());
// }
forUtil.readBlock(docIn, encoded, freqBuffer);
+ } else if (docFreq == 1) {
+ docDeltaBuffer[0] = singletonDocID;
+ freqBuffer[0] = (int) totalTermFreq;
} else {
// if (DEBUG) {
// System.out.println(" fill last vInt doc block from fp=" + docIn.getFilePointer());
@@ -1150,7 +1217,7 @@ public final class Lucene41PostingsReade
// if (DEBUG) {
// System.out.println(" vInt pos block @ fp=" + posIn.getFilePointer() + " hasPayloads=" + indexHasPayloads + " hasOffsets=" + indexHasOffsets);
// }
- final int count = posIn.readVInt();
+ final int count = (int) (totalTermFreq % BLOCK_SIZE);
int payloadLength = 0;
int offsetLength = 0;
payloadByteUpto = 0;
@@ -1203,15 +1270,22 @@ public final class Lucene41PostingsReade
// if (DEBUG) {
// System.out.println(" bulk payload block @ pay.fp=" + payIn.getFilePointer());
// }
- forUtil.readBlock(payIn, encoded, payloadLengthBuffer);
- int numBytes = payIn.readVInt();
- // if (DEBUG) {
- // System.out.println(" " + numBytes + " payload bytes @ pay.fp=" + payIn.getFilePointer());
- // }
- if (numBytes > payloadBytes.length) {
- payloadBytes = ArrayUtil.grow(payloadBytes, numBytes);
+ if (needsPayloads) {
+ forUtil.readBlock(payIn, encoded, payloadLengthBuffer);
+ int numBytes = payIn.readVInt();
+ // if (DEBUG) {
+ // System.out.println(" " + numBytes + " payload bytes @ pay.fp=" + payIn.getFilePointer());
+ // }
+ if (numBytes > payloadBytes.length) {
+ payloadBytes = ArrayUtil.grow(payloadBytes, numBytes);
+ }
+ payIn.readBytes(payloadBytes, 0, numBytes);
+ } else {
+ // this works, because when writing a vint block we always force the first length to be written
+ forUtil.skipBlock(payIn); // skip over lengths
+ int numBytes = payIn.readVInt(); // read length of payloadBytes
+ payIn.seek(payIn.getFilePointer() + numBytes); // skip over payloadBytes
}
- payIn.readBytes(payloadBytes, 0, numBytes);
payloadByteUpto = 0;
}
@@ -1219,8 +1293,14 @@ public final class Lucene41PostingsReade
// if (DEBUG) {
// System.out.println(" bulk offset block @ pay.fp=" + payIn.getFilePointer());
// }
- forUtil.readBlock(payIn, encoded, offsetStartDeltaBuffer);
- forUtil.readBlock(payIn, encoded, offsetLengthBuffer);
+ if (needsOffsets) {
+ forUtil.readBlock(payIn, encoded, offsetStartDeltaBuffer);
+ forUtil.readBlock(payIn, encoded, offsetLengthBuffer);
+ } else {
+ // this works, because when writing a vint block we always force the first length to be written
+ forUtil.skipBlock(payIn); // skip over starts
+ forUtil.skipBlock(payIn); // skip over lengths
+ }
}
}
}
Modified: lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java (original)
+++ lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java Tue Oct 23 17:53:00 2012
@@ -354,13 +354,15 @@ public final class Lucene41PostingsWrite
public final long payStartFP;
public final long skipOffset;
public final long lastPosBlockOffset;
+ public final int singletonDocID;
- public PendingTerm(long docStartFP, long posStartFP, long payStartFP, long skipOffset, long lastPosBlockOffset) {
+ public PendingTerm(long docStartFP, long posStartFP, long payStartFP, long skipOffset, long lastPosBlockOffset, int singletonDocID) {
this.docStartFP = docStartFP;
this.posStartFP = posStartFP;
this.payStartFP = payStartFP;
this.skipOffset = skipOffset;
this.lastPosBlockOffset = lastPosBlockOffset;
+ this.singletonDocID = singletonDocID;
}
}
@@ -384,18 +386,26 @@ public final class Lucene41PostingsWrite
// System.out.println(" write doc/freq vInt block (count=" + docBufferUpto + ") at fp=" + docOut.getFilePointer() + " docTermStartFP=" + docTermStartFP);
// }
// }
-
- // vInt encode the remaining doc deltas and freqs:
- for(int i=0;i<docBufferUpto;i++) {
- final int docDelta = docDeltaBuffer[i];
- final int freq = freqBuffer[i];
- if (!fieldHasFreqs) {
- docOut.writeVInt(docDelta);
- } else if (freqBuffer[i] == 1) {
- docOut.writeVInt((docDelta<<1)|1);
- } else {
- docOut.writeVInt(docDelta<<1);
- docOut.writeVInt(freq);
+
+ // docFreq == 1: don't write the single docid/freq to a separate file along with a pointer to it; inline ("pulse") it into the term dictionary instead.
+ final int singletonDocID;
+ if (stats.docFreq == 1) {
+ // pulse the singleton docid into the term dictionary, freq is implicitly totalTermFreq
+ singletonDocID = docDeltaBuffer[0];
+ } else {
+ singletonDocID = -1;
+ // vInt encode the remaining doc deltas and freqs:
+ for(int i=0;i<docBufferUpto;i++) {
+ final int docDelta = docDeltaBuffer[i];
+ final int freq = freqBuffer[i];
+ if (!fieldHasFreqs) {
+ docOut.writeVInt(docDelta);
+ } else if (freqBuffer[i] == 1) {
+ docOut.writeVInt((docDelta<<1)|1);
+ } else {
+ docOut.writeVInt(docDelta<<1);
+ docOut.writeVInt(freq);
+ }
}
}
@@ -417,9 +427,7 @@ public final class Lucene41PostingsWrite
} else {
lastPosBlockOffset = -1;
}
- if (posBufferUpto > 0) {
- posOut.writeVInt(posBufferUpto);
-
+ if (posBufferUpto > 0) {
// TODO: should we send offsets/payloads to
// .pay...? seems wasteful (have to store extra
// vLong for low (< BLOCK_SIZE) DF terms = vast vast
@@ -509,7 +517,7 @@ public final class Lucene41PostingsWrite
// System.out.println(" payStartFP=" + payStartFP);
// }
- pendingTerms.add(new PendingTerm(docTermStartFP, posTermStartFP, payStartFP, skipOffset, lastPosBlockOffset));
+ pendingTerms.add(new PendingTerm(docTermStartFP, posTermStartFP, payStartFP, skipOffset, lastPosBlockOffset, singletonDocID));
docBufferUpto = 0;
posBufferUpto = 0;
lastDocID = 0;
@@ -537,8 +545,12 @@ public final class Lucene41PostingsWrite
for(int idx=limit-count; idx<limit; idx++) {
PendingTerm term = pendingTerms.get(idx);
- bytesWriter.writeVLong(term.docStartFP - lastDocStartFP);
- lastDocStartFP = term.docStartFP;
+ if (term.singletonDocID == -1) {
+ bytesWriter.writeVLong(term.docStartFP - lastDocStartFP);
+ lastDocStartFP = term.docStartFP;
+ } else {
+ bytesWriter.writeVInt(term.singletonDocID);
+ }
if (fieldHasPositions) {
bytesWriter.writeVLong(term.posStartFP - lastPosStartFP);
Modified: lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterFlushControl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterFlushControl.java?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterFlushControl.java (original)
+++ lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterFlushControl.java Tue Oct 23 17:53:00 2012
@@ -99,8 +99,14 @@ final class DocumentsWriterFlushControl
maxConfiguredRamBuffer = Math.max(maxRamMB, maxConfiguredRamBuffer);
final long ram = flushBytes + activeBytes;
final long ramBufferBytes = (long) (maxConfiguredRamBuffer * 1024 * 1024);
- // take peakDelta into account - worst case is that all flushing, pending and blocked DWPT had maxMem and the last doc had the peakDelta
- final long expected = (2 * (ramBufferBytes)) + ((numPending + numFlushingDWPT() + numBlockedFlushes()) * peakDelta);
+ // take peakDelta into account - worst case is that all flushing, pending and blocked DWPT had maxMem and the last doc had the peakDelta
+
+ // 2 * ramBufferBytes -> before we stall we need to cross the 2xRAM Buffer border this is still a valid limit
+ // (numPending + numFlushingDWPT() + numBlockedFlushes()) * peakDelta) -> those are the total number of DWPT that are not active but not yet fully flushed
+ // all of them could theoretically be taken out of the loop once they crossed the RAM buffer and the last document was the peak delta
+ // (perThreadPool.getActiveThreadState() * peakDelta) -> at any given time there could be n threads in flight that crossed the stall control before we reached the limit and each of them could hold a peak document
+ final long expected = (2 * (ramBufferBytes)) + ((numPending + numFlushingDWPT() + numBlockedFlushes()) * peakDelta) + (perThreadPool.getActiveThreadState() * peakDelta);
+ // the expected ram consumption is an upper bound at this point and not really the expected consumption
if (peakDelta < (ramBufferBytes >> 1)) {
/*
* if we are indexing with very low maxRamBuffer like 0.1MB memory can
@@ -111,11 +117,11 @@ final class DocumentsWriterFlushControl
* fail. To prevent this we only assert if the the largest document seen
* is smaller than the 1/2 of the maxRamBufferMB
*/
- assert ram <= expected : "ram was " + ram + " expected: " + expected
- + " flush mem: " + flushBytes + " activeMem: " + activeBytes
- + " pendingMem: " + numPending + " flushingMem: "
- + numFlushingDWPT() + " blockedMem: " + numBlockedFlushes()
- + " peakDeltaMem: " + peakDelta;
+ assert ram <= expected : "actual mem: " + ram + " byte, expected mem: " + expected
+ + " byte, flush mem: " + flushBytes + ", active mem: " + activeBytes
+ + ", pending DWPT: " + numPending + ", flushing DWPT: "
+ + numFlushingDWPT() + ", blocked DWPT: " + numBlockedFlushes()
+ + ", peakDelta mem: " + peakDelta + " byte";
}
}
return true;
Modified: lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java (original)
+++ lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java Tue Oct 23 17:53:00 2012
@@ -36,6 +36,7 @@ import org.apache.lucene.analysis.Analyz
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.FieldInfos.FieldNumbers;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.index.MergePolicy.MergeTrigger;
import org.apache.lucene.index.MergeState.CheckAbort;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.AlreadyClosedException;
@@ -181,6 +182,10 @@ import org.apache.lucene.util.ThreadInte
* keeps track of the last non commit checkpoint.
*/
public class IndexWriter implements Closeable, TwoPhaseCommit {
+
+ private static final int UNBOUNDED_MAX_MERGE_SEGMENTS = -1;
+
+
/**
* Name of the write lock in the index.
*/
@@ -377,7 +382,7 @@ public class IndexWriter implements Clos
}
}
if (anySegmentFlushed) {
- maybeMerge();
+ maybeMerge(MergeTrigger.FULL_FLUSH, UNBOUNDED_MAX_MERGE_SEGMENTS);
}
if (infoStream.isEnabled("IW")) {
infoStream.message("IW", "getReader took " + (System.currentTimeMillis() - tStart) + " msec");
@@ -1226,7 +1231,7 @@ public class IndexWriter implements Clos
}
}
if (anySegmentFlushed) {
- maybeMerge();
+ maybeMerge(MergeTrigger.SEGMENT_FLUSH, UNBOUNDED_MAX_MERGE_SEGMENTS);
}
} catch (OutOfMemoryError oom) {
handleOOM(oom, "updateDocuments");
@@ -1448,7 +1453,7 @@ public class IndexWriter implements Clos
}
if (anySegmentFlushed) {
- maybeMerge();
+ maybeMerge(MergeTrigger.SEGMENT_FLUSH, UNBOUNDED_MAX_MERGE_SEGMENTS);
}
} catch (OutOfMemoryError oom) {
handleOOM(oom, "updateDocument");
@@ -1621,7 +1626,7 @@ public class IndexWriter implements Clos
}
}
- maybeMerge(maxNumSegments);
+ maybeMerge(MergeTrigger.EXPLICIT, maxNumSegments);
if (doWait) {
synchronized(this) {
@@ -1796,25 +1801,28 @@ public class IndexWriter implements Clos
* Explicit calls to maybeMerge() are usually not
* necessary. The most common case is when merge policy
* parameters have changed.
+ *
+ * This method will call the {@link MergePolicy} with
+ * {@link MergeTrigger#EXPLICIT}.
*
* <p><b>NOTE</b>: if this method hits an OutOfMemoryError
* you should immediately close the writer. See <a
* href="#OOME">above</a> for details.</p>
*/
public final void maybeMerge() throws IOException {
- maybeMerge(-1);
+ maybeMerge(MergeTrigger.EXPLICIT, UNBOUNDED_MAX_MERGE_SEGMENTS);
}
- private final void maybeMerge(int maxNumSegments) throws IOException {
+ private final void maybeMerge(MergeTrigger trigger, int maxNumSegments) throws IOException {
ensureOpen(false);
- updatePendingMerges(maxNumSegments);
+ updatePendingMerges(trigger, maxNumSegments);
mergeScheduler.merge(this);
}
- private synchronized void updatePendingMerges(int maxNumSegments)
+ private synchronized void updatePendingMerges(MergeTrigger trigger, int maxNumSegments)
throws IOException {
assert maxNumSegments == -1 || maxNumSegments > 0;
-
+ assert trigger != null;
if (stopMerges) {
return;
}
@@ -1825,7 +1833,9 @@ public class IndexWriter implements Clos
}
final MergePolicy.MergeSpecification spec;
- if (maxNumSegments != -1) {
+ if (maxNumSegments != UNBOUNDED_MAX_MERGE_SEGMENTS) {
+ assert trigger == MergeTrigger.EXPLICIT || trigger == MergeTrigger.MERGE_FINISHED :
+ "Expected EXPLICIT or MERGE_FINISHED as trigger even with maxNumSegments set but was: " + trigger.name();
spec = mergePolicy.findForcedMerges(segmentInfos, maxNumSegments, Collections.unmodifiableMap(segmentsToMerge));
if (spec != null) {
final int numMerges = spec.merges.size();
@@ -1836,7 +1846,7 @@ public class IndexWriter implements Clos
}
} else {
- spec = mergePolicy.findMerges(segmentInfos);
+ spec = mergePolicy.findMerges(trigger, segmentInfos);
}
if (spec != null) {
@@ -2653,7 +2663,7 @@ public class IndexWriter implements Clos
boolean success = false;
try {
if (anySegmentsFlushed) {
- maybeMerge();
+ maybeMerge(MergeTrigger.FULL_FLUSH, UNBOUNDED_MAX_MERGE_SEGMENTS);
}
success = true;
} finally {
@@ -2809,7 +2819,7 @@ public class IndexWriter implements Clos
// We can be called during close, when closing==true, so we must pass false to ensureOpen:
ensureOpen(false);
if (doFlush(applyAllDeletes) && triggerMerge) {
- maybeMerge();
+ maybeMerge(MergeTrigger.FULL_FLUSH, UNBOUNDED_MAX_MERGE_SEGMENTS);
}
}
@@ -3240,7 +3250,7 @@ public class IndexWriter implements Clos
// segments) may now enable new merges, so we call
// merge policy & update pending merges.
if (success && !merge.isAborted() && (merge.maxNumSegments != -1 || (!closed && !closing))) {
- updatePendingMerges(merge.maxNumSegments);
+ updatePendingMerges(MergeTrigger.MERGE_FINISHED, merge.maxNumSegments);
}
}
}
Modified: lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/LogMergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/LogMergePolicy.java?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/LogMergePolicy.java (original)
+++ lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/LogMergePolicy.java Tue Oct 23 17:53:00 2012
@@ -24,6 +24,8 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
+import org.apache.lucene.index.MergePolicy.MergeTrigger;
+
/**
* <p>This class implements a {@link MergePolicy} that tries
@@ -560,7 +562,7 @@ public abstract class LogMergePolicy ext
* will return multiple merges, allowing the {@link
* MergeScheduler} to use concurrency. */
@Override
- public MergeSpecification findMerges(SegmentInfos infos) throws IOException {
+ public MergeSpecification findMerges(MergeTrigger mergeTrigger, SegmentInfos infos) throws IOException {
final int numSegments = infos.size();
if (verbose()) {
Modified: lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java (original)
+++ lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java Tue Oct 23 17:53:00 2012
@@ -19,6 +19,7 @@ package org.apache.lucene.index;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.List;
import java.util.Map;
@@ -57,7 +58,7 @@ import org.apache.lucene.util.SetOnce;
*/
public abstract class MergePolicy implements java.io.Closeable, Cloneable {
-
+
/** OneMerge provides the information necessary to perform
* an individual primitive merge operation, resulting in
* a single new segment. The merge spec includes the
@@ -333,11 +334,11 @@ public abstract class MergePolicy implem
* {@link IndexWriter} calls this whenever there is a change to the segments.
* This call is always synchronized on the {@link IndexWriter} instance so
* only one thread at a time will call this method.
- *
+ * @param mergeTrigger the event that triggered the merge
* @param segmentInfos
* the total set of segments in the index
*/
- public abstract MergeSpecification findMerges(SegmentInfos segmentInfos)
+ public abstract MergeSpecification findMerges(MergeTrigger mergeTrigger, SegmentInfos segmentInfos)
throws IOException;
/**
@@ -379,9 +380,36 @@ public abstract class MergePolicy implem
* Release all resources for the policy.
*/
public abstract void close();
-
+
+
/**
* Returns true if a new segment (regardless of its origin) should use the compound file format.
*/
public abstract boolean useCompoundFile(SegmentInfos segments, SegmentInfoPerCommit newSegment) throws IOException;
+
+ /**
+ * MergeTrigger is passed to
+ * {@link MergePolicy#findMerges(MergeTrigger, SegmentInfos)} to indicate the
+ * event that triggered the merge.
+ */
+ public static enum MergeTrigger {
+ /**
+ * Merge was triggered by a segment flush.
+ */
+ SEGMENT_FLUSH,
+ /**
+ * Merge was triggered by a full flush. Full flushes
+ * can be caused by a commit, NRT reader reopen or a close call on the index writer.
+ */
+ FULL_FLUSH,
+ /**
+ * Merge has been triggered explicitly by the user.
+ */
+ EXPLICIT,
+
+ /**
+ * Merge was triggered by a successfully finished merge.
+ */
+ MERGE_FINISHED,
+ }
}