You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/10/23 19:53:03 UTC
svn commit: r1401363 [1/2] - in /lucene/dev/branches/lucene3846: ./
dev-tools/ dev-tools/scripts/ lucene/ lucene/analysis/
lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/
lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/...
Author: mikemccand
Date: Tue Oct 23 17:53:00 2012
New Revision: 1401363
URL: http://svn.apache.org/viewvc?rev=1401363&view=rev
Log:
LUCENE-3846: merge trunk
Added:
lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java
- copied unchanged from r1401358, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java
lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java
- copied unchanged from r1401358, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java
lucene/dev/branches/lucene3846/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java
- copied unchanged from r1401358, lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java
lucene/dev/branches/lucene3846/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java
- copied unchanged from r1401358, lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java
lucene/dev/branches/lucene3846/solr/core/src/java/org/apache/solr/spelling/suggest/fst/AnalyzingLookupFactory.java
- copied unchanged from r1401358, lucene/dev/trunk/solr/core/src/java/org/apache/solr/spelling/suggest/fst/AnalyzingLookupFactory.java
lucene/dev/branches/lucene3846/solr/core/src/test-files/solr/collection1/conf/jasuggest.txt
- copied unchanged from r1401358, lucene/dev/trunk/solr/core/src/test-files/solr/collection1/conf/jasuggest.txt
lucene/dev/branches/lucene3846/solr/core/src/test/org/apache/solr/spelling/suggest/TestAnalyzedSuggestions.java
- copied unchanged from r1401358, lucene/dev/trunk/solr/core/src/test/org/apache/solr/spelling/suggest/TestAnalyzedSuggestions.java
Modified:
lucene/dev/branches/lucene3846/ (props changed)
lucene/dev/branches/lucene3846/build.xml
lucene/dev/branches/lucene3846/dev-tools/ (props changed)
lucene/dev/branches/lucene3846/dev-tools/scripts/smokeTestRelease.py
lucene/dev/branches/lucene3846/lucene/ (props changed)
lucene/dev/branches/lucene3846/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/lucene3846/lucene/analysis/ (props changed)
lucene/dev/branches/lucene3846/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java
lucene/dev/branches/lucene3846/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java
lucene/dev/branches/lucene3846/lucene/build.xml (contents, props changed)
lucene/dev/branches/lucene3846/lucene/classification/build.xml
lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java
lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java
lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/package.html
lucene/dev/branches/lucene3846/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java
lucene/dev/branches/lucene3846/lucene/common-build.xml (contents, props changed)
lucene/dev/branches/lucene3846/lucene/core/ (props changed)
lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/ForUtil.java
lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsFormat.java
lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java
lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java
lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterFlushControl.java
lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/LogMergePolicy.java
lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java
lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/NoMergePolicy.java
lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java
lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/UpgradeIndexMergePolicy.java
lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/util/fst/Util.java
lucene/dev/branches/lucene3846/lucene/core/src/test/org/apache/lucene/codecs/lucene41/TestBlockPostingsFormat3.java
lucene/dev/branches/lucene3846/lucene/core/src/test/org/apache/lucene/index/Test4GBStoredFields.java
lucene/dev/branches/lucene3846/lucene/core/src/test/org/apache/lucene/index/TestBagOfPostings.java
lucene/dev/branches/lucene3846/lucene/core/src/test/org/apache/lucene/index/TestNoMergePolicy.java
lucene/dev/branches/lucene3846/lucene/core/src/test/org/apache/lucene/index/TestPerSegmentDeletes.java
lucene/dev/branches/lucene3846/lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java
lucene/dev/branches/lucene3846/lucene/highlighter/ (props changed)
lucene/dev/branches/lucene3846/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
lucene/dev/branches/lucene3846/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
lucene/dev/branches/lucene3846/lucene/suggest/ (props changed)
lucene/dev/branches/lucene3846/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
lucene/dev/branches/lucene3846/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java
lucene/dev/branches/lucene3846/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
lucene/dev/branches/lucene3846/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
lucene/dev/branches/lucene3846/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/WFSTCompletionTest.java
lucene/dev/branches/lucene3846/lucene/test-framework/ (props changed)
lucene/dev/branches/lucene3846/lucene/test-framework/src/java/org/apache/lucene/index/MockRandomMergePolicy.java
lucene/dev/branches/lucene3846/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java
lucene/dev/branches/lucene3846/solr/ (props changed)
lucene/dev/branches/lucene3846/solr/CHANGES.txt (contents, props changed)
lucene/dev/branches/lucene3846/solr/build.xml (contents, props changed)
lucene/dev/branches/lucene3846/solr/contrib/ (props changed)
lucene/dev/branches/lucene3846/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
lucene/dev/branches/lucene3846/solr/core/ (props changed)
lucene/dev/branches/lucene3846/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
lucene/dev/branches/lucene3846/solr/core/src/java/org/apache/solr/spelling/suggest/Suggester.java
lucene/dev/branches/lucene3846/solr/core/src/test-files/solr/collection1/conf/schema-phrasesuggest.xml
lucene/dev/branches/lucene3846/solr/core/src/test-files/solr/collection1/conf/schema.xml
lucene/dev/branches/lucene3846/solr/core/src/test-files/solr/collection1/conf/solrconfig-phrasesuggest.xml
lucene/dev/branches/lucene3846/solr/core/src/test/org/apache/solr/highlight/HighlighterTest.java
Modified: lucene/dev/branches/lucene3846/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/build.xml?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/build.xml (original)
+++ lucene/dev/branches/lucene3846/build.xml Tue Oct 23 17:53:00 2012
@@ -30,43 +30,37 @@
<subant buildpath="lucene" target="test-help" inheritall="false" failonerror="true"/>
</target>
+ <property name="tests.heap-dump-dir" location="heapdumps"/>
+
<target name="precommit" description="Run basic checks before committing"
depends="check-svn-working-copy,validate,documentation-lint"/>
<target name="test" description="Test both Lucene and Solr">
- <sequential>
- <subant target="test" inheritall="false" failonerror="true">
- <fileset dir="lucene" includes="build.xml" />
- <fileset dir="solr" includes="build.xml" />
- </subant>
- </sequential>
+ <subant target="test" inheritall="false" failonerror="true">
+ <fileset dir="lucene" includes="build.xml" />
+ <fileset dir="solr" includes="build.xml" />
+ </subant>
</target>
<target name="pitest" description="Run PITest on both Lucene and Solr">
- <sequential>
- <subant target="pitest" inheritall="false" failonerror="false">
- <fileset dir="lucene" includes="build.xml" />
- <fileset dir="solr" includes="build.xml" />
- </subant>
- </sequential>
+ <subant target="pitest" inheritall="false" failonerror="false">
+ <fileset dir="lucene" includes="build.xml" />
+ <fileset dir="solr" includes="build.xml" />
+ </subant>
</target>
<target name="documentation" description="Generate Lucene and Solr Documentation">
- <sequential>
- <subant target="documentation" inheritall="false" failonerror="true">
- <fileset dir="lucene" includes="build.xml" />
- <fileset dir="solr" includes="build.xml" />
- </subant>
- </sequential>
+ <subant target="documentation" inheritall="false" failonerror="true">
+ <fileset dir="lucene" includes="build.xml" />
+ <fileset dir="solr" includes="build.xml" />
+ </subant>
</target>
<target name="documentation-lint" description="Validates the generated documentation (HTML errors, broken links,...)">
- <sequential>
- <subant target="documentation-lint" inheritall="false" failonerror="true">
- <fileset dir="lucene" includes="build.xml" />
- <fileset dir="solr" includes="build.xml" />
- </subant>
- </sequential>
+ <subant target="documentation-lint" inheritall="false" failonerror="true">
+ <fileset dir="lucene" includes="build.xml" />
+ <fileset dir="solr" includes="build.xml" />
+ </subant>
</target>
<target name="validate" description="Validate dependencies, licenses, etc." depends="-validate-source-patterns">
@@ -218,13 +212,11 @@
<target name="clean" description="Clean Lucene and Solr build dirs">
<delete dir="dist" />
- <sequential>
- <subant target="clean" inheritall="false" failonerror="true">
- <fileset dir="lucene" includes="build.xml" />
- <fileset dir="solr" includes="build.xml" />
- </subant>
- <delete dir="dist" failonerror="false" />
- </sequential>
+ <delete dir="${tests.heap-dump-dir}" />
+ <subant target="clean" inheritall="false" failonerror="true">
+ <fileset dir="lucene" includes="build.xml" />
+ <fileset dir="solr" includes="build.xml" />
+ </subant>
</target>
<target name="ivy-bootstrap" description="Download and install Ivy in the users ant lib dir">
@@ -258,46 +250,44 @@
</target>
<target name="nightly-smoke" description="Builds an unsigned release and smoke tests it" depends="clean,-env-JAVA6_HOME,-env-JAVA7_HOME">
- <sequential>
- <fail unless="JAVA6_HOME">JAVA6_HOME property or environment variable is not defined.</fail>
- <fail unless="JAVA7_HOME">JAVA7_HOME property or environment variable is not defined.</fail>
- <subant target="prepare-release-no-sign" inheritall="false" failonerror="true">
- <fileset dir="lucene" includes="build.xml" />
- <fileset dir="solr" includes="build.xml" />
- <property name="version" value="${fakeReleaseVersion}" />
- </subant>
- <delete dir="${fakeRelease}"/>
- <delete dir="${fakeReleaseTmp}"/>
- <mkdir dir="${fakeRelease}"/>
- <copy todir="${fakeRelease}/lucene">
- <fileset dir="lucene/dist"/>
- </copy>
- <copy todir="${fakeRelease}/lucene/changes">
- <fileset dir="lucene/build/docs/changes"/>
- </copy>
- <get src="http://people.apache.org/keys/group/lucene.asc"
- dest="${fakeRelease}/lucene/KEYS"/>
- <copy todir="${fakeRelease}/solr">
- <fileset dir="solr/package"/>
- </copy>
- <copy file="${fakeRelease}/lucene/KEYS" todir="${fakeRelease}/solr"/>
- <copy todir="${fakeRelease}/solr/changes">
- <fileset dir="solr/build/docs/changes"/>
- </copy>
- <makeurl file="${fakeRelease}" validate="false" property="fakeRelease.uri"/>
- <exec executable="${python32.exe}" failonerror="true">
- <arg value="-u"/>
- <arg file="dev-tools/scripts/smokeTestRelease.py"/>
- <arg value="${fakeRelease.uri}"/>
- <arg value="${fakeReleaseVersion}"/>
- <arg file="${fakeReleaseTmp}"/>
- <arg value="false"/>
- <env key="JAVA6_HOME" file="${JAVA6_HOME}"/>
- <env key="JAVA7_HOME" file="${JAVA7_HOME}"/>
- </exec>
- <delete dir="${fakeRelease}"/>
- <delete dir="${fakeReleaseTmp}"/>
- </sequential>
+ <fail unless="JAVA6_HOME">JAVA6_HOME property or environment variable is not defined.</fail>
+ <fail unless="JAVA7_HOME">JAVA7_HOME property or environment variable is not defined.</fail>
+ <subant target="prepare-release-no-sign" inheritall="false" failonerror="true">
+ <fileset dir="lucene" includes="build.xml" />
+ <fileset dir="solr" includes="build.xml" />
+ <property name="version" value="${fakeReleaseVersion}" />
+ </subant>
+ <delete dir="${fakeRelease}"/>
+ <delete dir="${fakeReleaseTmp}"/>
+ <mkdir dir="${fakeRelease}"/>
+ <copy todir="${fakeRelease}/lucene">
+ <fileset dir="lucene/dist"/>
+ </copy>
+ <copy todir="${fakeRelease}/lucene/changes">
+ <fileset dir="lucene/build/docs/changes"/>
+ </copy>
+ <get src="http://people.apache.org/keys/group/lucene.asc"
+ dest="${fakeRelease}/lucene/KEYS"/>
+ <copy todir="${fakeRelease}/solr">
+ <fileset dir="solr/package"/>
+ </copy>
+ <copy file="${fakeRelease}/lucene/KEYS" todir="${fakeRelease}/solr"/>
+ <copy todir="${fakeRelease}/solr/changes">
+ <fileset dir="solr/build/docs/changes"/>
+ </copy>
+ <makeurl file="${fakeRelease}" validate="false" property="fakeRelease.uri"/>
+ <exec executable="${python32.exe}" failonerror="true">
+ <arg value="-u"/>
+ <arg file="dev-tools/scripts/smokeTestRelease.py"/>
+ <arg value="${fakeRelease.uri}"/>
+ <arg value="${fakeReleaseVersion}"/>
+ <arg file="${fakeReleaseTmp}"/>
+ <arg value="false"/>
+ <env key="JAVA6_HOME" file="${JAVA6_HOME}"/>
+ <env key="JAVA7_HOME" file="${JAVA7_HOME}"/>
+ </exec>
+ <delete dir="${fakeRelease}"/>
+ <delete dir="${fakeReleaseTmp}"/>
</target>
<target name="check-svn-working-copy" description="Checks the status of the SVN working copy">
@@ -309,7 +299,7 @@
<param name="run.clover" value="true"/>
<!-- must be 1, as clover does not like parallel test runs: -->
<param name="tests.jvms" value="1"/>
- <!-- Also override some other props to be fast, ignoring what's set on command line: -->
+ <!-- Also override some other props to be fast: -->
<param name="tests.multiplier" value="1"/>
<param name="tests.nightly" value="false"/>
<param name="tests.weekly" value="false"/>
@@ -326,19 +316,42 @@
<subant buildpath="." antfile="extra-targets.xml" target="-generate-clover-reports" inheritall="false" failonerror="true"/>
</target>
+ <target name="test-with-heapdumps" depends="-test-with-heapdumps-enabled,-test-with-heapdumps-disabled" description="Runs tests with heap dumps on OOM enabled (if VM supports this)"/>
+
+ <condition property="vm.supports.heapdumps">
+ <or>
+ <contains string="${java.vm.name}" substring="hotspot" casesensitive="false"/>
+ <contains string="${java.vm.name}" substring="openjdk" casesensitive="false"/>
+ <contains string="${java.vm.name}" substring="jrockit" casesensitive="false"/>
+ </or>
+ </condition>
+
+ <target name="-test-with-heapdumps-enabled" if="vm.supports.heapdumps">
+ <echo level="info" message="${java.vm.name}: Enabling heap dumps on OutOfMemoryError to dir '${tests.heap-dump-dir}'."/>
+ <mkdir dir="${tests.heap-dump-dir}"/>
+ <delete includeEmptyDirs="true">
+ <fileset dir="${tests.heap-dump-dir}" includes="**/*"/>
+ </delete>
+ <antcall inheritAll="false" target="test">
+ <param name="tests.heapdump.args" value="-XX:+HeapDumpOnOutOfMemoryError "-XX:HeapDumpPath=${tests.heap-dump-dir}""/>
+ </antcall>
+ <pathconvert property="heapdumps.list" setonempty="false" pathsep="${line.separator}">
+ <fileset dir="${tests.heap-dump-dir}"/>
+ <map from="${tests.heap-dump-dir}${file.separator}" to="* "/>
+ </pathconvert>
+ <fail if="heapdumps.list" message="Some of the tests produced a heap dump, but did not fail. Maybe a suppressed OutOfMemoryError? Dumps created:${line.separator}${heapdumps.list}"/>
+ <delete dir="${tests.heap-dump-dir}"/>
+ </target>
+
+ <target name="-test-with-heapdumps-disabled" unless="vm.supports.heapdumps">
+ <echo level="warning" message="WARN: The used JVM (${java.vm.name}) does not support HPROF heap dumps on OutOfMemoryError."/>
+ <antcall target="test"/>
+ </target>
+
<!-- Jenkins tasks -->
- <target name="jenkins-hourly" depends="clean,test,validate,-jenkins-documentation-lint,jar-checksums,check-svn-working-copy"/>
+ <target name="jenkins-hourly" depends="clean,test-with-heapdumps,validate,documentation-lint,jar-checksums,check-svn-working-copy"/>
<target name="jenkins-maven-nightly" depends="clean,remove-maven-artifacts,run-maven-build,generate-maven-artifacts,validate-maven-dependencies"/>
<target name="jenkins-clover" depends="run-clover"/>
-
- <!-- we need this extra condition, as we want to match only on "true", not solely if property is set: -->
- <property name="disable.documentation-lint" value="false" />
- <condition property="-disable.documentation-lint">
- <istrue value="${disable.documentation-lint}"/>
- </condition>
- <target name="-jenkins-documentation-lint" unless="-disable.documentation-lint">
- <antcall target="documentation-lint"/>
- </target>
</project>
Modified: lucene/dev/branches/lucene3846/dev-tools/scripts/smokeTestRelease.py
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/dev-tools/scripts/smokeTestRelease.py?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/dev-tools/scripts/smokeTestRelease.py (original)
+++ lucene/dev/branches/lucene3846/dev-tools/scripts/smokeTestRelease.py Tue Oct 23 17:53:00 2012
@@ -581,6 +581,9 @@ def verifyUnpacked(project, artifact, un
textFiles.extend(('JRE_VERSION_MIGRATION', 'CHANGES', 'MIGRATE', 'SYSTEM_REQUIREMENTS'))
if isSrc:
textFiles.append('BUILD')
+ elif not isSrc:
+ textFiles.append('SYSTEM_REQUIREMENTS')
+
for fileName in textFiles:
fileName += '.txt'
if fileName not in l:
@@ -629,10 +632,8 @@ def verifyUnpacked(project, artifact, un
if project == 'lucene':
if len(l) > 0:
raise RuntimeError('%s: unexpected files/dirs in artifact %s: %s' % (project, artifact, l))
- else:
- # TODO: re-enable this check
- if False and not os.path.exists('%s/solr/SYSTEM_REQUIREMENTS.txt' % unpackPath):
- raise RuntimeError('%s: solr/SYSTEM_REQUIREMENTS.txt does not exist in artifact %s' % (project, artifact))
+ elif isSrc and not os.path.exists('%s/solr/SYSTEM_REQUIREMENTS.txt' % unpackPath):
+ raise RuntimeError('%s: solr/SYSTEM_REQUIREMENTS.txt does not exist in artifact %s' % (project, artifact))
if isSrc:
print(' make sure no JARs/WARs in src dist...')
Modified: lucene/dev/branches/lucene3846/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/CHANGES.txt?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/lucene3846/lucene/CHANGES.txt Tue Oct 23 17:53:00 2012
@@ -44,18 +44,31 @@ New Features
the suggester to ignore such variations. (Robert Muir, Sudarshan
Gaikaiwari, Mike McCandless)
+* LUCENE-4446: Lucene 4.1 has a new default index format (Lucene41Codec)
+ that incorporates the previously experimental "Block" postings format
+ for better search performance.
+ (Han Jiang, Adrien Grand, Robert Muir, Mike McCandless)
+
API Changes
* LUCENE-4399: Deprecated AppendingCodec. Lucene's term dictionaries
no longer seek when writing. (Adrien Grand, Robert Muir)
+* LUCENE-4479: Rename TokenStream.getTokenStream(IndexReader, int, String)
+ to TokenStream.getTokenStreamWithOffsets, and return null on failure
+ rather than throwing IllegalArgumentException. (Alan Woodward)
+
+* LUCENE-4472: MergePolicy now accepts a MergeTrigger that provides
+ information about the trigger of the merge ie. merge triggered due
+ to a segment merge or a full flush etc. (Simon Willnauer)
+
Bug Fixes
* LUCENE-1822: BaseFragListBuilder hard-coded 6 char margin is too naive.
(Alex Vigdor, Arcadius Ahouansou, Koji Sekiguchi)
-* LUCENE-4468: Fix rareish integer overflows in Block and Lucene40 postings
- formats (Robert Muir)
+* LUCENE-4468: Fix rareish integer overflows in Lucene41 postings
+ format. (Robert Muir)
* LUCENE-4486: Add support for ConstantScoreQuery in Highlighter.
(Simon Willnauer)
@@ -63,18 +76,24 @@ Bug Fixes
* LUCENE-4485: When CheckIndex terms, terms/docs pairs and tokens,
these counts now all exclude deleted documents. (Mike McCandless)
+* LUCENE-4479: Highlighter works correctly for fields with term vector
+ positions, but no offsets. (Alan Woodward)
+
+* SOLR-3906: JapaneseReadingFormFilter in romaji mode will return
+ romaji even for out-of-vocabulary kana cases (e.g. half-width forms).
+ (Robert Muir)
+
Optimizations
-* LUCENE-4443: BlockPostingsFormat no longer writes unnecessary offsets
- into the skipdata. You need to reindex any indexes created with
- this experimental codec. (Robert Muir)
+* LUCENE-4443: Lucene41PostingsFormat no longer writes unnecessary offsets
+ into the skipdata. (Robert Muir)
* LUCENE-4459: Improve WeakIdentityMap.keyIterator() to remove GCed keys
from backing map early instead of waiting for reap(). This makes test
failures in TestWeakIdentityMap disappear, too.
(Uwe Schindler, Mike McCandless, Robert Muir)
-* LUCENE-4473: BlockPostingsFormat encodes offsets more efficiently
+* LUCENE-4473: Lucene41PostingsFormat encodes offsets more efficiently
for low frequency terms (< 128 occurrences). (Robert Muir)
* LUCENE-4462: DocumentsWriter now flushes deletes, segment infos and builds
@@ -82,6 +101,17 @@ Optimizations
was a single threaded process while now all IO and CPU heavy computation is done
concurrently in DocumentsWriterPerThread. (Simon Willnauer)
+* LUCENE-4496: Optimize Lucene41PostingsFormat when requesting a subset of
+ the postings data (via flags to TermsEnum.docs/docsAndPositions) to use
+ ForUtil.skipBlock. (Robert Muir)
+
+* LUCENE-4497: Don't write PosVIntCount to the positions file in
+ Lucene41PostingsFormat, as its always totalTermFreq % BLOCK_SIZE. (Robert Muir)
+
+* LUCENE-4498: In Lucene41PostingsFormat, when a term appears in only one document,
+ Instead of writing a file pointer to a VIntBlock containing the doc id, just
+ write the doc id. (Mike McCandless, Robert Muir)
+
Build
* LUCENE-4451: Memory leak per unique thread caused by
Modified: lucene/dev/branches/lucene3846/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java (original)
+++ lucene/dev/branches/lucene3846/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilter.java Tue Oct 23 17:53:00 2012
@@ -35,6 +35,7 @@ public final class JapaneseReadingFormFi
private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
private final ReadingAttribute readingAttr = addAttribute(ReadingAttribute.class);
+ private StringBuilder buffer = new StringBuilder();
private boolean useRomaji;
public JapaneseReadingFormFilter(TokenStream input, boolean useRomaji) {
@@ -50,10 +51,19 @@ public final class JapaneseReadingFormFi
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
String reading = readingAttr.getReading();
- if (reading != null) {
- if (useRomaji) {
- ToStringUtil.getRomanization(termAttr.setEmpty(), reading);
+
+ if (useRomaji) {
+ if (reading == null) {
+ // if its an OOV term, just try the term text
+ buffer.setLength(0);
+ ToStringUtil.getRomanization(buffer, termAttr);
+ termAttr.setEmpty().append(buffer);
} else {
+ ToStringUtil.getRomanization(termAttr.setEmpty(), reading);
+ }
+ } else {
+ // just replace the term text with the reading, if it exists
+ if (reading != null) {
termAttr.setEmpty().append(reading);
}
}
Modified: lucene/dev/branches/lucene3846/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java (original)
+++ lucene/dev/branches/lucene3846/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilter.java Tue Oct 23 17:53:00 2012
@@ -19,7 +19,9 @@ package org.apache.lucene.analysis.ja;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import java.io.IOException;
@@ -52,12 +54,40 @@ public class TestJapaneseReadingFormFilt
new String[] { "コンヤ", "ハ", "ロバート", "センセイ", "ト", "ハナシ", "タ" }
);
}
+
+ public void testKatakanaReadingsHalfWidth() throws IOException {
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.Mode.SEARCH);
+ TokenStream stream = new CJKWidthFilter(tokenizer);
+ return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(stream, false));
+ }
+ };
+ assertAnalyzesTo(a, "今夜はﾛﾊﾞｰﾄ先生と話した",
+ new String[] { "コンヤ", "ハ", "ロバート", "センセイ", "ト", "ハナシ", "タ" }
+ );
+ }
public void testRomajiReadings() throws IOException {
assertAnalyzesTo(romajiAnalyzer, "今夜はロバート先生と話した",
new String[] { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta" }
);
}
+
+ public void testRomajiReadingsHalfWidth() throws IOException {
+ Analyzer a = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.Mode.SEARCH);
+ TokenStream stream = new CJKWidthFilter(tokenizer);
+ return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(stream, true));
+ }
+ };
+ assertAnalyzesTo(a, "今夜はﾛﾊﾞｰﾄ先生と話した",
+ new String[] { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta" }
+ );
+ }
public void testRandomData() throws IOException {
Random random = random();
Modified: lucene/dev/branches/lucene3846/lucene/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/build.xml?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/build.xml (original)
+++ lucene/dev/branches/lucene3846/lucene/build.xml Tue Oct 23 17:53:00 2012
@@ -226,59 +226,63 @@
<target name="javadoc" depends="javadocs"/>
<target name="javadocs" description="Generate javadoc" depends="javadocs-lucene-core, javadocs-modules, javadocs-test-framework"/>
+ <target name="documentation-lint" depends="-ecj-javadoc-lint,-documentation-lint,-documentation-lint-unsupported"
+ description="Validates the generated documentation (HTML errors, broken links,...)"/>
+
<!-- we check for broken links across all documentation -->
- <target name="documentation-lint" depends="compile-test-framework,documentation,-ecj-resolve">
- <sequential>
- <subant target="-ecj-javadoc-lint" failonerror="true" inheritall="false">
- <propertyset refid="uptodate.and.compiled.properties"/>
- <fileset dir="core" includes="build.xml"/>
- <fileset dir="test-framework" includes="build.xml"/>
- </subant>
- <modules-crawl target="-ecj-javadoc-lint"/>
- <echo message="Checking for broken links..."/>
- <check-broken-links dir="build/docs"/>
- <echo message="Checking for missing docs..."/>
- <!-- TODO: change this level=method -->
- <check-missing-javadocs dir="build/docs" level="class"/>
- <!-- too many classes to fix overall to just enable
- the above to be level="method" right now, but we
- can prevent the modules that don't have problems
- from getting any worse -->
- <!-- analyzers-common: problems -->
- <check-missing-javadocs dir="build/docs/analyzers-icu" level="method"/>
- <!-- analyzers-kuromoji: problems -->
- <check-missing-javadocs dir="build/docs/analyzers-morfologik" level="method"/>
- <check-missing-javadocs dir="build/docs/analyzers-phonetic" level="method"/>
- <!-- analyzers-smartcn: problems -->
- <check-missing-javadocs dir="build/docs/analyzers-stempel" level="method"/>
- <!-- analyzers-uima: problems -->
- <!-- benchmark: problems -->
- <check-missing-javadocs dir="build/docs/classification" level="method"/>
- <!-- codecs: problems -->
- <!-- core: problems -->
- <check-missing-javadocs dir="build/docs/demo" level="method"/>
- <!-- facet: problems -->
- <!-- grouping: problems -->
- <!-- highlighter: problems -->
- <check-missing-javadocs dir="build/docs/join" level="method"/>
- <check-missing-javadocs dir="build/docs/memory" level="method"/>
- <!-- misc: problems -->
- <!-- queries: problems -->
- <!-- queryparser: problems -->
- <!-- sandbox: problems -->
- <!-- spatial: problems -->
- <check-missing-javadocs dir="build/docs/suggest" level="method"/>
- <!-- test-framework: problems -->
-
- <!-- too much to fix core/ for now, but enforce full javadocs for key packages -->
- <check-missing-javadocs dir="build/docs/core/org/apache/lucene/analysis" level="method"/>
- <check-missing-javadocs dir="build/docs/core/org/apache/lucene/document" level="method"/>
- <check-missing-javadocs dir="build/docs/core/org/apache/lucene/search/similarities" level="method"/>
- <check-missing-javadocs dir="build/docs/core/org/apache/lucene/index" level="method"/>
- <check-missing-javadocs dir="build/docs/core/org/apache/lucene/codecs" level="method"/>
- </sequential>
+ <target name="-documentation-lint" if="documentation-lint.supported" depends="documentation">
+ <echo message="Checking for broken links..."/>
+ <check-broken-links dir="build/docs"/>
+ <echo message="Checking for missing docs..."/>
+ <!-- TODO: change this level=method -->
+ <check-missing-javadocs dir="build/docs" level="class"/>
+ <!-- too many classes to fix overall to just enable
+ the above to be level="method" right now, but we
+ can prevent the modules that don't have problems
+ from getting any worse -->
+ <!-- analyzers-common: problems -->
+ <check-missing-javadocs dir="build/docs/analyzers-icu" level="method"/>
+ <!-- analyzers-kuromoji: problems -->
+ <check-missing-javadocs dir="build/docs/analyzers-morfologik" level="method"/>
+ <check-missing-javadocs dir="build/docs/analyzers-phonetic" level="method"/>
+ <!-- analyzers-smartcn: problems -->
+ <check-missing-javadocs dir="build/docs/analyzers-stempel" level="method"/>
+ <!-- analyzers-uima: problems -->
+ <!-- benchmark: problems -->
+ <check-missing-javadocs dir="build/docs/classification" level="method"/>
+ <!-- codecs: problems -->
+ <!-- core: problems -->
+ <check-missing-javadocs dir="build/docs/demo" level="method"/>
+ <!-- facet: problems -->
+ <!-- grouping: problems -->
+ <!-- highlighter: problems -->
+ <check-missing-javadocs dir="build/docs/join" level="method"/>
+ <check-missing-javadocs dir="build/docs/memory" level="method"/>
+ <!-- misc: problems -->
+ <!-- queries: problems -->
+ <!-- queryparser: problems -->
+ <!-- sandbox: problems -->
+ <!-- spatial: problems -->
+ <check-missing-javadocs dir="build/docs/suggest" level="method"/>
+ <!-- test-framework: problems -->
+
+ <!-- too much to fix core/ for now, but enforce full javadocs for key packages -->
+ <check-missing-javadocs dir="build/docs/core/org/apache/lucene/analysis" level="method"/>
+ <check-missing-javadocs dir="build/docs/core/org/apache/lucene/document" level="method"/>
+ <check-missing-javadocs dir="build/docs/core/org/apache/lucene/search/similarities" level="method"/>
+ <check-missing-javadocs dir="build/docs/core/org/apache/lucene/index" level="method"/>
+ <check-missing-javadocs dir="build/docs/core/org/apache/lucene/codecs" level="method"/>
</target>
+ <target name="-ecj-javadoc-lint" depends="documentation,compile-test-framework,-ecj-resolve">
+ <subant target="-ecj-javadoc-lint" failonerror="true" inheritall="false">
+ <propertyset refid="uptodate.and.compiled.properties"/>
+ <fileset dir="core" includes="build.xml"/>
+ <fileset dir="test-framework" includes="build.xml"/>
+ </subant>
+ <modules-crawl target="-ecj-javadoc-lint"/>
+ </target>
+
<target name="process-webpages" depends="resolve-groovy,resolve-pegdown">
<makeurl property="process-webpages.buildfiles" separator="|">
<fileset dir="." includes="**/build.xml" excludes="build.xml,analysis/*,build/**,tools/**,backwards/**,site/**"/>
Modified: lucene/dev/branches/lucene3846/lucene/classification/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/classification/build.xml?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/classification/build.xml (original)
+++ lucene/dev/branches/lucene3846/lucene/classification/build.xml Tue Oct 23 17:53:00 2012
@@ -23,4 +23,30 @@
</description>
<import file="../module-build.xml"/>
+
+ <path id="base.classpath">
+ <pathelement location="${common.dir}/build/core/classes/java"/>
+ <pathelement path="${queries.jar}"/>
+ <pathelement path="${project.classpath}"/>
+ </path>
+
+ <path id="test.classpath">
+ <pathelement path="${analyzers-common.jar}"/>
+ <pathelement location="${common.dir}/build/test-framework/classes/java"/>
+ <pathelement location="${common.dir}/build/codecs/classes/java"/>
+ <path refid="classpath"/>
+ <path refid="junit-path"/>
+ <pathelement location="${build.dir}/classes/java"/>
+ </path>
+
+ <target name="compile-core" depends="jar-queries,jar-analyzers-common,common.compile-core" />
+
+ <target name="javadocs" depends="javadocs-queries,compile-core">
+ <invoke-module-javadoc>
+ <links>
+ <link href="../queries"/>
+ </links>
+ </invoke-module-javadoc>
+ </target>
+
</project>
Modified: lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java (original)
+++ lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java Tue Oct 23 17:53:00 2012
@@ -29,12 +29,12 @@ import java.io.IOException;
public interface Classifier {
/**
- * Assign a class to the given text String
+ * Assign a class (with score) to the given text String
* @param text a String containing text to be classified
- * @return a String representing a class
+ * @return a {@link ClassificationResult} holding assigned class and score
* @throws IOException If there is a low-level I/O error.
*/
- public String assignClass(String text) throws IOException;
+ public ClassificationResult assignClass(String text) throws IOException;
/**
* Train the classifier using the underlying Lucene index
Modified: lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java (original)
+++ lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java Tue Oct 23 17:53:00 2012
@@ -80,7 +80,7 @@ public class SimpleNaiveBayesClassifier
return result.toArray(new String[result.size()]);
}
- public String assignClass(String inputDocument) throws IOException {
+ public ClassificationResult assignClass(String inputDocument) throws IOException {
if (atomicReader == null) {
throw new RuntimeException("need to train the classifier first");
}
@@ -98,7 +98,7 @@ public class SimpleNaiveBayesClassifier
foundClass = next.utf8ToString();
}
}
- return foundClass;
+ return new ClassificationResult(foundClass, max);
}
Modified: lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/package.html?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/package.html (original)
+++ lucene/dev/branches/lucene3846/lucene/classification/src/java/org/apache/lucene/classification/package.html Tue Oct 23 17:53:00 2012
@@ -18,6 +18,6 @@
<body>
Uses already seen data (the indexed documents) to classify new documents.
Currently only contains a (simplistic) Lucene based Naive Bayes classifier
-but more implementations will be added in the future.
+and a k-Nearest Neighbor classifier
</body>
</html>
Modified: lucene/dev/branches/lucene3846/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java (original)
+++ lucene/dev/branches/lucene3846/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java Tue Oct 23 17:53:00 2012
@@ -19,112 +19,32 @@ package org.apache.lucene.classification
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.TextField;
-import org.apache.lucene.index.RandomIndexWriter;
-import org.apache.lucene.index.SlowCompositeReaderWrapper;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.LuceneTestCase;
-import org.junit.After;
-import org.junit.Before;
+import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.junit.Test;
+import java.io.Reader;
+
/**
* Testcase for {@link SimpleNaiveBayesClassifier}
*/
-public class SimpleNaiveBayesClassifierTest extends LuceneTestCase {
-
- private RandomIndexWriter indexWriter;
- private String textFieldName;
- private String classFieldName;
- private Analyzer analyzer;
- private Directory dir;
-
- @Before
- public void setUp() throws Exception {
- super.setUp();
- analyzer = new MockAnalyzer(random());
- dir = newDirectory();
- indexWriter = new RandomIndexWriter(random(), dir);
- textFieldName = "text";
- classFieldName = "cat";
- }
-
- @After
- public void tearDown() throws Exception {
- super.tearDown();
- indexWriter.close();
- dir.close();
- }
+public class SimpleNaiveBayesClassifierTest extends ClassificationTestBase {
@Test
public void testBasicUsage() throws Exception {
- SlowCompositeReaderWrapper compositeReaderWrapper = null;
- try {
- populateIndex();
- SimpleNaiveBayesClassifier simpleNaiveBayesClassifier = new SimpleNaiveBayesClassifier();
- compositeReaderWrapper = new SlowCompositeReaderWrapper(indexWriter.getReader());
- simpleNaiveBayesClassifier.train(compositeReaderWrapper, textFieldName, classFieldName, analyzer);
- String newText = "Much is made of what the likes of Facebook, Google and Apple know about users. Truth is, Amazon may know more. ";
- assertEquals("technology", simpleNaiveBayesClassifier.assignClass(newText));
- } finally {
- if (compositeReaderWrapper != null)
- compositeReaderWrapper.close();
- }
+ checkCorrectClassification(new SimpleNaiveBayesClassifier(), new MockAnalyzer(random()));
}
- private void populateIndex() throws Exception {
-
- Document doc = new Document();
- doc.add(new TextField(textFieldName, "The traveling press secretary for Mitt Romney lost his cool and cursed at reporters " +
- "who attempted to ask questions of the Republican presidential candidate in a public plaza near the Tomb of " +
- "the Unknown Soldier in Warsaw Tuesday.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
-
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "Mitt Romney seeks to assure Israel and Iran, as well as Jewish voters in the United" +
- " States, that he will be tougher against Iran's nuclear ambitions than President Barack Obama.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "And there's a threshold question that he has to answer for the American people and " +
- "that's whether he is prepared to be commander-in-chief,\" she continued. \"As we look to the past events, we " +
- "know that this raises some questions about his preparedness and we'll see how the rest of his trip goes.\"", Field.Store.YES));
- doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "Still, when it comes to gun policy, many congressional Democrats have \"decided to " +
- "keep quiet and not go there,\" said Alan Lizotte, dean and professor at the State University of New York at " +
- "Albany's School of Criminal Justice.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "Standing amongst the thousands of people at the state Capitol, Jorstad, director of " +
- "technology at the University of Wisconsin-La Crosse, documented the historic moment and shared it with the " +
- "world through the Internet.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "So, about all those experts and analysts who've spent the past year or so saying " +
- "Facebook was going to make a phone. A new expert has stepped forward to say it's not going to happen.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "More than 400 million people trust Google with their e-mail, and 50 million store files" +
- " in the cloud using the Dropbox service. People manage their bank accounts, pay bills, trade stocks and " +
- "generally transfer or store huge volumes of personal data online.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
+ @Test
+ public void testNGramUsage() throws Exception {
+ checkCorrectClassification(new SimpleNaiveBayesClassifier(), new NGramAnalyzer());
+ }
- indexWriter.commit();
+ private class NGramAnalyzer extends Analyzer {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ return new TokenStreamComponents(new EdgeNGramTokenizer(reader, EdgeNGramTokenizer.Side.BACK,
+ 10, 20));
+ }
}
}
Modified: lucene/dev/branches/lucene3846/lucene/common-build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/common-build.xml?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/common-build.xml (original)
+++ lucene/dev/branches/lucene3846/lucene/common-build.xml Tue Oct 23 17:53:00 2012
@@ -109,6 +109,8 @@
</condition>
<property name="tests.clover.args" value=""/>
+ <property name="tests.heapdump.args" value=""/>
+
<property name="tests.tempDir" location="${build.dir}/test"/>
<property name="tests.cachefile" location="${common.dir}/tools/junit4/cached-timehints.txt" />
@@ -263,6 +265,25 @@
</condition>
</fail>
+ <condition property="documentation-lint.supported">
+ <and>
+ <or>
+ <contains string="${java.vm.name}" substring="hotspot" casesensitive="false"/>
+ <contains string="${java.vm.name}" substring="openjdk" casesensitive="false"/>
+ <contains string="${java.vm.name}" substring="jrockit" casesensitive="false"/>
+ </or>
+ <or>
+ <equals arg1="${ant.java.version}" arg2="1.6"/>
+ <equals arg1="${ant.java.version}" arg2="1.7"/>
+ <equals arg1="${ant.java.version}" arg2="1.8"/>
+ </or>
+ </and>
+ </condition>
+
+ <target name="-documentation-lint-unsupported" unless="documentation-lint.supported">
+ <echo level="warning" message="WARN: Linting documentation HTML is not supported on this Java version (${ant.java.version}) / JVM (${java.vm.name}). NOTHING DONE!"/>
+ </target>
+
<!-- Import custom ANT tasks. -->
<import file="${common.dir}/tools/custom-tasks.xml" />
@@ -826,6 +847,7 @@
<!-- JVM arguments and system properties. -->
<jvmarg line="${args}"/>
+ <jvmarg line="${tests.heapdump.args}"/>
<jvmarg line="${tests.clover.args}"/>
<!-- set the number of times tests should run -->
Modified: lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/ForUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/ForUtil.java?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/ForUtil.java (original)
+++ lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/ForUtil.java Tue Oct 23 17:53:00 2012
@@ -157,7 +157,7 @@ final class ForUtil {
*/
void writeBlock(int[] data, byte[] encoded, IndexOutput out) throws IOException {
if (isAllEqual(data)) {
- out.writeVInt(ALL_VALUES_EQUAL);
+ out.writeByte((byte) ALL_VALUES_EQUAL);
out.writeVInt(data[0]);
return;
}
@@ -170,7 +170,7 @@ final class ForUtil {
final int encodedSize = encodedSizes[numBits];
assert (iters * encoder.blockCount()) << 3 >= encodedSize;
- out.writeVInt(numBits);
+ out.writeByte((byte) numBits);
encoder.encode(data, 0, encoded, 0, iters);
out.writeBytes(encoded, encodedSize);
@@ -185,7 +185,7 @@ final class ForUtil {
* @throws IOException If there is a low-level I/O error
*/
void readBlock(IndexInput in, byte[] encoded, int[] decoded) throws IOException {
- final int numBits = in.readVInt();
+ final int numBits = in.readByte();
assert numBits <= 32 : numBits;
if (numBits == ALL_VALUES_EQUAL) {
@@ -211,7 +211,7 @@ final class ForUtil {
* @throws IOException If there is a low-level I/O error
*/
void skipBlock(IndexInput in) throws IOException {
- final int numBits = in.readVInt();
+ final int numBits = in.readByte();
if (numBits == ALL_VALUES_EQUAL) {
in.readVInt();
return;
@@ -222,7 +222,7 @@ final class ForUtil {
}
private static boolean isAllEqual(final int[] data) {
- final long v = data[0];
+ final int v = data[0];
for (int i = 1; i < BLOCK_SIZE; ++i) {
if (data[i] != v) {
return false;
Modified: lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsFormat.java?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsFormat.java (original)
+++ lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsFormat.java Tue Oct 23 17:53:00 2012
@@ -127,10 +127,10 @@ import org.apache.lucene.util.packed.Pac
*
* <ul>
* <li>Postings Metadata --> Header, PackedBlockSize</li>
- * <li>Term Metadata --> DocFPDelta, PosFPDelta?, PosVIntBlockFPDelta?, PayFPDelta?,
+ * <li>Term Metadata --> (DocFPDelta|SingletonDocID), PosFPDelta?, PosVIntBlockFPDelta?, PayFPDelta?,
* SkipFPDelta?</li>
* <li>Header, --> {@link CodecUtil#writeHeader CodecHeader}</li>
- * <li>PackedBlockSize --> {@link DataOutput#writeVInt VInt}</li>
+ * <li>PackedBlockSize, SingletonDocID --> {@link DataOutput#writeVInt VInt}</li>
* <li>DocFPDelta, PosFPDelta, PayFPDelta, PosVIntBlockFPDelta, SkipFPDelta --> {@link DataOutput#writeVLong VLong}</li>
* </ul>
* <p>Notes:</p>
@@ -162,6 +162,9 @@ import org.apache.lucene.util.packed.Pac
* file. In particular, it is the length of the TermFreq data.
* SkipDelta is only stored if DocFreq is not smaller than SkipMinimum
* (i.e. 8 in Lucene41PostingsFormat).</li>
+ * <li>SingletonDocID is an optimization when a term only appears in one document. In this case, instead
+ * of writing a file pointer to the .doc file (DocFPDelta), and then a VIntBlock at that location, the
+ * single document ID is written to the term dictionary.</li>
* </ul>
* </dd>
* </dl>
@@ -274,10 +277,10 @@ import org.apache.lucene.util.packed.Pac
* <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>TermPositions --> <PackedPosDeltaBlock> <sup>PackedPosBlockNum</sup>,
* VIntBlock? </li>
- * <li>VIntBlock --> PosVIntCount, <PositionDelta[, PayloadLength?], PayloadData?,
+ * <li>VIntBlock --> <PositionDelta[, PayloadLength?], PayloadData?,
* OffsetDelta?, OffsetLength?><sup>PosVIntCount</sup>
* <li>PackedPosDeltaBlock --> {@link PackedInts PackedInts}</li>
- * <li>PosVIntCount, PositionDelta, OffsetDelta, OffsetLength -->
+ * <li>PositionDelta, OffsetDelta, OffsetLength -->
* {@link DataOutput#writeVInt VInt}</li>
* <li>PayloadData --> {@link DataOutput#writeByte byte}<sup>PayLength</sup></li>
* </ul>
Modified: lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java (original)
+++ lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java Tue Oct 23 17:53:00 2012
@@ -148,6 +148,9 @@ public final class Lucene41PostingsReade
long payStartFP;
long skipOffset;
long lastPosBlockOffset;
+ // docid when there is a single pulsed posting, otherwise -1
+ // freq is always implicitly totalTermFreq in this case.
+ int singletonDocID;
// Only used by the "primary" TermState -- clones don't
// copy this (basically they are "transient"):
@@ -170,6 +173,7 @@ public final class Lucene41PostingsReade
payStartFP = other.payStartFP;
lastPosBlockOffset = other.lastPosBlockOffset;
skipOffset = other.skipOffset;
+ singletonDocID = other.singletonDocID;
// Do not copy bytes, bytesReader (else TermState is
// very heavy, ie drags around the entire block's
@@ -179,7 +183,7 @@ public final class Lucene41PostingsReade
@Override
public String toString() {
- return super.toString() + " docStartFP=" + docStartFP + " posStartFP=" + posStartFP + " payStartFP=" + payStartFP + " lastPosBlockOffset=" + lastPosBlockOffset;
+ return super.toString() + " docStartFP=" + docStartFP + " posStartFP=" + posStartFP + " payStartFP=" + payStartFP + " lastPosBlockOffset=" + lastPosBlockOffset + " singletonDocID=" + singletonDocID;
}
}
@@ -223,7 +227,13 @@ public final class Lucene41PostingsReade
final DataInput in = termState.bytesReader;
if (isFirstTerm) {
- termState.docStartFP = in.readVLong();
+ if (termState.docFreq == 1) {
+ termState.singletonDocID = in.readVInt();
+ termState.docStartFP = 0;
+ } else {
+ termState.singletonDocID = -1;
+ termState.docStartFP = in.readVLong();
+ }
if (fieldHasPositions) {
termState.posStartFP = in.readVLong();
if (termState.totalTermFreq > BLOCK_SIZE) {
@@ -238,7 +248,12 @@ public final class Lucene41PostingsReade
}
}
} else {
- termState.docStartFP += in.readVLong();
+ if (termState.docFreq == 1) {
+ termState.singletonDocID = in.readVInt();
+ } else {
+ termState.singletonDocID = -1;
+ termState.docStartFP += in.readVLong();
+ }
if (fieldHasPositions) {
termState.posStartFP += in.readVLong();
if (termState.totalTermFreq > BLOCK_SIZE) {
@@ -275,10 +290,10 @@ public final class Lucene41PostingsReade
} else {
docsEnum = new BlockDocsEnum(fieldInfo);
}
- return docsEnum.reset(liveDocs, (IntBlockTermState) termState);
+ return docsEnum.reset(liveDocs, (IntBlockTermState) termState, flags);
}
- // TODO: specialize to liveDocs vs not, and freqs vs not
+ // TODO: specialize to liveDocs vs not
@Override
public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState termState, Bits liveDocs,
@@ -310,7 +325,7 @@ public final class Lucene41PostingsReade
} else {
everythingEnum = new EverythingEnum(fieldInfo);
}
- return everythingEnum.reset(liveDocs, (IntBlockTermState) termState);
+ return everythingEnum.reset(liveDocs, (IntBlockTermState) termState, flags);
}
}
@@ -327,13 +342,14 @@ public final class Lucene41PostingsReade
final IndexInput startDocIn;
- final IndexInput docIn;
+ IndexInput docIn;
final boolean indexHasFreq;
final boolean indexHasPos;
final boolean indexHasOffsets;
final boolean indexHasPayloads;
private int docFreq; // number of docs in this posting list
+ private long totalTermFreq; // sum of freqs in this posting list (or docFreq when omitted)
private int docUpto; // how many docs we've read
private int doc; // doc we last read
private int accum; // accumulator for doc deltas
@@ -352,10 +368,13 @@ public final class Lucene41PostingsReade
private int nextSkipDoc;
private Bits liveDocs;
+
+ private boolean needsFreq; // true if the caller actually needs frequencies
+ private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1
public BlockDocsEnum(FieldInfo fieldInfo) throws IOException {
this.startDocIn = Lucene41PostingsReader.this.docIn;
- this.docIn = startDocIn.clone();
+ this.docIn = null;
indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
indexHasPos = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
@@ -370,17 +389,26 @@ public final class Lucene41PostingsReade
indexHasPayloads == fieldInfo.hasPayloads();
}
- public DocsEnum reset(Bits liveDocs, IntBlockTermState termState) throws IOException {
+ public DocsEnum reset(Bits liveDocs, IntBlockTermState termState, int flags) throws IOException {
this.liveDocs = liveDocs;
// if (DEBUG) {
// System.out.println(" FPR.reset: termState=" + termState);
// }
docFreq = termState.docFreq;
+ totalTermFreq = indexHasFreq ? termState.totalTermFreq : docFreq;
docTermStartFP = termState.docStartFP;
- docIn.seek(docTermStartFP);
skipOffset = termState.skipOffset;
+ singletonDocID = termState.singletonDocID;
+ if (docFreq > 1) {
+ if (docIn == null) {
+ // lazy init
+ docIn = startDocIn.clone();
+ }
+ docIn.seek(docTermStartFP);
+ }
doc = -1;
+ this.needsFreq = (flags & DocsEnum.FLAG_FREQS) != 0;
if (!indexHasFreq) {
Arrays.fill(freqBuffer, 1);
}
@@ -416,8 +444,15 @@ public final class Lucene41PostingsReade
// if (DEBUG) {
// System.out.println(" fill freq block from fp=" + docIn.getFilePointer());
// }
- forUtil.readBlock(docIn, encoded, freqBuffer);
+ if (needsFreq) {
+ forUtil.readBlock(docIn, encoded, freqBuffer);
+ } else {
+ forUtil.skipBlock(docIn); // skip over freqs
+ }
}
+ } else if (docFreq == 1) {
+ docDeltaBuffer[0] = singletonDocID;
+ freqBuffer[0] = (int) totalTermFreq;
} else {
// Read vInts:
// if (DEBUG) {
@@ -583,13 +618,14 @@ public final class Lucene41PostingsReade
final IndexInput startDocIn;
- final IndexInput docIn;
+ IndexInput docIn;
final IndexInput posIn;
final boolean indexHasOffsets;
final boolean indexHasPayloads;
private int docFreq; // number of docs in this posting list
+ private long totalTermFreq; // number of positions in this posting list
private int docUpto; // how many docs we've read
private int doc; // doc we last read
private int accum; // accumulator for doc deltas
@@ -627,10 +663,11 @@ public final class Lucene41PostingsReade
private int nextSkipDoc;
private Bits liveDocs;
+ private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1
public BlockDocsAndPositionsEnum(FieldInfo fieldInfo) throws IOException {
this.startDocIn = Lucene41PostingsReader.this.docIn;
- this.docIn = startDocIn.clone();
+ this.docIn = null;
this.posIn = Lucene41PostingsReader.this.posIn.clone();
encoded = new byte[MAX_ENCODED_SIZE];
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
@@ -652,8 +689,16 @@ public final class Lucene41PostingsReade
docTermStartFP = termState.docStartFP;
posTermStartFP = termState.posStartFP;
payTermStartFP = termState.payStartFP;
- docIn.seek(docTermStartFP);
skipOffset = termState.skipOffset;
+ totalTermFreq = termState.totalTermFreq;
+ singletonDocID = termState.singletonDocID;
+ if (docFreq > 1) {
+ if (docIn == null) {
+ // lazy init
+ docIn = startDocIn.clone();
+ }
+ docIn.seek(docTermStartFP);
+ }
posPendingFP = posTermStartFP;
posPendingCount = 0;
if (termState.totalTermFreq < BLOCK_SIZE) {
@@ -696,6 +741,9 @@ public final class Lucene41PostingsReade
// System.out.println(" fill freq block from fp=" + docIn.getFilePointer());
// }
forUtil.readBlock(docIn, encoded, freqBuffer);
+ } else if (docFreq == 1) {
+ docDeltaBuffer[0] = singletonDocID;
+ freqBuffer[0] = (int) totalTermFreq;
} else {
// Read vInts:
// if (DEBUG) {
@@ -714,7 +762,7 @@ public final class Lucene41PostingsReade
// if (DEBUG) {
// System.out.println(" vInt pos block @ fp=" + posIn.getFilePointer() + " hasPayloads=" + indexHasPayloads + " hasOffsets=" + indexHasOffsets);
// }
- final int count = posIn.readVInt();
+ final int count = (int) (totalTermFreq % BLOCK_SIZE);
int payloadLength = 0;
for(int i=0;i<count;i++) {
int code = posIn.readVInt();
@@ -993,7 +1041,7 @@ public final class Lucene41PostingsReade
final IndexInput startDocIn;
- final IndexInput docIn;
+ IndexInput docIn;
final IndexInput posIn;
final IndexInput payIn;
final BytesRef payload;
@@ -1002,6 +1050,7 @@ public final class Lucene41PostingsReade
final boolean indexHasPayloads;
private int docFreq; // number of docs in this posting list
+ private long totalTermFreq; // number of positions in this posting list
private int docUpto; // how many docs we've read
private int doc; // doc we last read
private int accum; // accumulator for doc deltas
@@ -1044,9 +1093,13 @@ public final class Lucene41PostingsReade
private Bits liveDocs;
+ private boolean needsOffsets; // true if we actually need offsets
+ private boolean needsPayloads; // true if we actually need payloads
+ private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1
+
public EverythingEnum(FieldInfo fieldInfo) throws IOException {
this.startDocIn = Lucene41PostingsReader.this.docIn;
- this.docIn = startDocIn.clone();
+ this.docIn = null;
this.posIn = Lucene41PostingsReader.this.posIn.clone();
this.payIn = Lucene41PostingsReader.this.payIn.clone();
encoded = new byte[MAX_ENCODED_SIZE];
@@ -1079,7 +1132,7 @@ public final class Lucene41PostingsReade
indexHasPayloads == fieldInfo.hasPayloads();
}
- public EverythingEnum reset(Bits liveDocs, IntBlockTermState termState) throws IOException {
+ public EverythingEnum reset(Bits liveDocs, IntBlockTermState termState, int flags) throws IOException {
this.liveDocs = liveDocs;
// if (DEBUG) {
// System.out.println(" FPR.reset: termState=" + termState);
@@ -1088,8 +1141,16 @@ public final class Lucene41PostingsReade
docTermStartFP = termState.docStartFP;
posTermStartFP = termState.posStartFP;
payTermStartFP = termState.payStartFP;
- docIn.seek(docTermStartFP);
skipOffset = termState.skipOffset;
+ totalTermFreq = termState.totalTermFreq;
+ singletonDocID = termState.singletonDocID;
+ if (docFreq > 1) {
+ if (docIn == null) {
+ // lazy init
+ docIn = startDocIn.clone();
+ }
+ docIn.seek(docTermStartFP);
+ }
posPendingFP = posTermStartFP;
payPendingFP = payTermStartFP;
posPendingCount = 0;
@@ -1101,6 +1162,9 @@ public final class Lucene41PostingsReade
lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset;
}
+ this.needsOffsets = (flags & DocsAndPositionsEnum.FLAG_OFFSETS) != 0;
+ this.needsPayloads = (flags & DocsAndPositionsEnum.FLAG_PAYLOADS) != 0;
+
doc = -1;
accum = 0;
docUpto = 0;
@@ -1133,6 +1197,9 @@ public final class Lucene41PostingsReade
// System.out.println(" fill freq block from fp=" + docIn.getFilePointer());
// }
forUtil.readBlock(docIn, encoded, freqBuffer);
+ } else if (docFreq == 1) {
+ docDeltaBuffer[0] = singletonDocID;
+ freqBuffer[0] = (int) totalTermFreq;
} else {
// if (DEBUG) {
// System.out.println(" fill last vInt doc block from fp=" + docIn.getFilePointer());
@@ -1150,7 +1217,7 @@ public final class Lucene41PostingsReade
// if (DEBUG) {
// System.out.println(" vInt pos block @ fp=" + posIn.getFilePointer() + " hasPayloads=" + indexHasPayloads + " hasOffsets=" + indexHasOffsets);
// }
- final int count = posIn.readVInt();
+ final int count = (int) (totalTermFreq % BLOCK_SIZE);
int payloadLength = 0;
int offsetLength = 0;
payloadByteUpto = 0;
@@ -1203,15 +1270,22 @@ public final class Lucene41PostingsReade
// if (DEBUG) {
// System.out.println(" bulk payload block @ pay.fp=" + payIn.getFilePointer());
// }
- forUtil.readBlock(payIn, encoded, payloadLengthBuffer);
- int numBytes = payIn.readVInt();
- // if (DEBUG) {
- // System.out.println(" " + numBytes + " payload bytes @ pay.fp=" + payIn.getFilePointer());
- // }
- if (numBytes > payloadBytes.length) {
- payloadBytes = ArrayUtil.grow(payloadBytes, numBytes);
+ if (needsPayloads) {
+ forUtil.readBlock(payIn, encoded, payloadLengthBuffer);
+ int numBytes = payIn.readVInt();
+ // if (DEBUG) {
+ // System.out.println(" " + numBytes + " payload bytes @ pay.fp=" + payIn.getFilePointer());
+ // }
+ if (numBytes > payloadBytes.length) {
+ payloadBytes = ArrayUtil.grow(payloadBytes, numBytes);
+ }
+ payIn.readBytes(payloadBytes, 0, numBytes);
+ } else {
+ // this works, because when writing a vint block we always force the first length to be written
+ forUtil.skipBlock(payIn); // skip over lengths
+ int numBytes = payIn.readVInt(); // read length of payloadBytes
+ payIn.seek(payIn.getFilePointer() + numBytes); // skip over payloadBytes
}
- payIn.readBytes(payloadBytes, 0, numBytes);
payloadByteUpto = 0;
}
@@ -1219,8 +1293,14 @@ public final class Lucene41PostingsReade
// if (DEBUG) {
// System.out.println(" bulk offset block @ pay.fp=" + payIn.getFilePointer());
// }
- forUtil.readBlock(payIn, encoded, offsetStartDeltaBuffer);
- forUtil.readBlock(payIn, encoded, offsetLengthBuffer);
+ if (needsOffsets) {
+ forUtil.readBlock(payIn, encoded, offsetStartDeltaBuffer);
+ forUtil.readBlock(payIn, encoded, offsetLengthBuffer);
+ } else {
+ // this works, because when writing a vint block we always force the first length to be written
+ forUtil.skipBlock(payIn); // skip over starts
+ forUtil.skipBlock(payIn); // skip over lengths
+ }
}
}
}
Modified: lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java (original)
+++ lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java Tue Oct 23 17:53:00 2012
@@ -354,13 +354,15 @@ public final class Lucene41PostingsWrite
public final long payStartFP;
public final long skipOffset;
public final long lastPosBlockOffset;
+ public final int singletonDocID;
- public PendingTerm(long docStartFP, long posStartFP, long payStartFP, long skipOffset, long lastPosBlockOffset) {
+ public PendingTerm(long docStartFP, long posStartFP, long payStartFP, long skipOffset, long lastPosBlockOffset, int singletonDocID) {
this.docStartFP = docStartFP;
this.posStartFP = posStartFP;
this.payStartFP = payStartFP;
this.skipOffset = skipOffset;
this.lastPosBlockOffset = lastPosBlockOffset;
+ this.singletonDocID = singletonDocID;
}
}
@@ -384,18 +386,26 @@ public final class Lucene41PostingsWrite
// System.out.println(" write doc/freq vInt block (count=" + docBufferUpto + ") at fp=" + docOut.getFilePointer() + " docTermStartFP=" + docTermStartFP);
// }
// }
-
- // vInt encode the remaining doc deltas and freqs:
- for(int i=0;i<docBufferUpto;i++) {
- final int docDelta = docDeltaBuffer[i];
- final int freq = freqBuffer[i];
- if (!fieldHasFreqs) {
- docOut.writeVInt(docDelta);
- } else if (freqBuffer[i] == 1) {
- docOut.writeVInt((docDelta<<1)|1);
- } else {
- docOut.writeVInt(docDelta<<1);
- docOut.writeVInt(freq);
+
+ // docFreq == 1: don't write the single docid/freq to a separate file along with a pointer to it; inline ("pulse") it into the term dictionary instead.
+ final int singletonDocID;
+ if (stats.docFreq == 1) {
+ // pulse the singleton docid into the term dictionary, freq is implicitly totalTermFreq
+ singletonDocID = docDeltaBuffer[0];
+ } else {
+ singletonDocID = -1;
+ // vInt encode the remaining doc deltas and freqs:
+ for(int i=0;i<docBufferUpto;i++) {
+ final int docDelta = docDeltaBuffer[i];
+ final int freq = freqBuffer[i];
+ if (!fieldHasFreqs) {
+ docOut.writeVInt(docDelta);
+ } else if (freqBuffer[i] == 1) {
+ docOut.writeVInt((docDelta<<1)|1);
+ } else {
+ docOut.writeVInt(docDelta<<1);
+ docOut.writeVInt(freq);
+ }
}
}
@@ -417,9 +427,7 @@ public final class Lucene41PostingsWrite
} else {
lastPosBlockOffset = -1;
}
- if (posBufferUpto > 0) {
- posOut.writeVInt(posBufferUpto);
-
+ if (posBufferUpto > 0) {
// TODO: should we send offsets/payloads to
// .pay...? seems wasteful (have to store extra
// vLong for low (< BLOCK_SIZE) DF terms = vast vast
@@ -509,7 +517,7 @@ public final class Lucene41PostingsWrite
// System.out.println(" payStartFP=" + payStartFP);
// }
- pendingTerms.add(new PendingTerm(docTermStartFP, posTermStartFP, payStartFP, skipOffset, lastPosBlockOffset));
+ pendingTerms.add(new PendingTerm(docTermStartFP, posTermStartFP, payStartFP, skipOffset, lastPosBlockOffset, singletonDocID));
docBufferUpto = 0;
posBufferUpto = 0;
lastDocID = 0;
@@ -537,8 +545,12 @@ public final class Lucene41PostingsWrite
for(int idx=limit-count; idx<limit; idx++) {
PendingTerm term = pendingTerms.get(idx);
- bytesWriter.writeVLong(term.docStartFP - lastDocStartFP);
- lastDocStartFP = term.docStartFP;
+ if (term.singletonDocID == -1) {
+ bytesWriter.writeVLong(term.docStartFP - lastDocStartFP);
+ lastDocStartFP = term.docStartFP;
+ } else {
+ bytesWriter.writeVInt(term.singletonDocID);
+ }
if (fieldHasPositions) {
bytesWriter.writeVLong(term.posStartFP - lastPosStartFP);
Modified: lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterFlushControl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterFlushControl.java?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterFlushControl.java (original)
+++ lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/DocumentsWriterFlushControl.java Tue Oct 23 17:53:00 2012
@@ -99,8 +99,14 @@ final class DocumentsWriterFlushControl
maxConfiguredRamBuffer = Math.max(maxRamMB, maxConfiguredRamBuffer);
final long ram = flushBytes + activeBytes;
final long ramBufferBytes = (long) (maxConfiguredRamBuffer * 1024 * 1024);
- // take peakDelta into account - worst case is that all flushing, pending and blocked DWPT had maxMem and the last doc had the peakDelta
- final long expected = (2 * (ramBufferBytes)) + ((numPending + numFlushingDWPT() + numBlockedFlushes()) * peakDelta);
+ // take peakDelta into account - worst case is that all flushing, pending and blocked DWPT had maxMem and the last doc had the peakDelta
+
+ // 2 * ramBufferBytes -> before we stall we need to cross the 2xRAM Buffer border this is still a valid limit
+ // (numPending + numFlushingDWPT() + numBlockedFlushes()) * peakDelta) -> those are the total number of DWPT that are not active but not yet fully flushed
+ // all of them could theoretically be taken out of the loop once they crossed the RAM buffer and the last document was the peak delta
+ // (perThreadPool.getActiveThreadState() * peakDelta) -> at any given time there could be n threads in flight that crossed the stall control before we reached the limit and each of them could hold a peak document
+ final long expected = (2 * (ramBufferBytes)) + ((numPending + numFlushingDWPT() + numBlockedFlushes()) * peakDelta) + (perThreadPool.getActiveThreadState() * peakDelta);
+ // the expected ram consumption is an upper bound at this point and not really the expected consumption
if (peakDelta < (ramBufferBytes >> 1)) {
/*
* if we are indexing with very low maxRamBuffer like 0.1MB memory can
@@ -111,11 +117,11 @@ final class DocumentsWriterFlushControl
* fail. To prevent this we only assert if the the largest document seen
* is smaller than the 1/2 of the maxRamBufferMB
*/
- assert ram <= expected : "ram was " + ram + " expected: " + expected
- + " flush mem: " + flushBytes + " activeMem: " + activeBytes
- + " pendingMem: " + numPending + " flushingMem: "
- + numFlushingDWPT() + " blockedMem: " + numBlockedFlushes()
- + " peakDeltaMem: " + peakDelta;
+ assert ram <= expected : "actual mem: " + ram + " byte, expected mem: " + expected
+ + " byte, flush mem: " + flushBytes + ", active mem: " + activeBytes
+ + ", pending DWPT: " + numPending + ", flushing DWPT: "
+ + numFlushingDWPT() + ", blocked DWPT: " + numBlockedFlushes()
+ + ", peakDelta mem: " + peakDelta + " byte";
}
}
return true;
Modified: lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java (original)
+++ lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java Tue Oct 23 17:53:00 2012
@@ -36,6 +36,7 @@ import org.apache.lucene.analysis.Analyz
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.index.FieldInfos.FieldNumbers;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.index.MergePolicy.MergeTrigger;
import org.apache.lucene.index.MergeState.CheckAbort;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.AlreadyClosedException;
@@ -181,6 +182,10 @@ import org.apache.lucene.util.ThreadInte
* keeps track of the last non commit checkpoint.
*/
public class IndexWriter implements Closeable, TwoPhaseCommit {
+
+ private static final int UNBOUNDED_MAX_MERGE_SEGMENTS = -1;
+
+
/**
* Name of the write lock in the index.
*/
@@ -377,7 +382,7 @@ public class IndexWriter implements Clos
}
}
if (anySegmentFlushed) {
- maybeMerge();
+ maybeMerge(MergeTrigger.FULL_FLUSH, UNBOUNDED_MAX_MERGE_SEGMENTS);
}
if (infoStream.isEnabled("IW")) {
infoStream.message("IW", "getReader took " + (System.currentTimeMillis() - tStart) + " msec");
@@ -1226,7 +1231,7 @@ public class IndexWriter implements Clos
}
}
if (anySegmentFlushed) {
- maybeMerge();
+ maybeMerge(MergeTrigger.SEGMENT_FLUSH, UNBOUNDED_MAX_MERGE_SEGMENTS);
}
} catch (OutOfMemoryError oom) {
handleOOM(oom, "updateDocuments");
@@ -1448,7 +1453,7 @@ public class IndexWriter implements Clos
}
if (anySegmentFlushed) {
- maybeMerge();
+ maybeMerge(MergeTrigger.SEGMENT_FLUSH, UNBOUNDED_MAX_MERGE_SEGMENTS);
}
} catch (OutOfMemoryError oom) {
handleOOM(oom, "updateDocument");
@@ -1621,7 +1626,7 @@ public class IndexWriter implements Clos
}
}
- maybeMerge(maxNumSegments);
+ maybeMerge(MergeTrigger.EXPLICIT, maxNumSegments);
if (doWait) {
synchronized(this) {
@@ -1796,25 +1801,28 @@ public class IndexWriter implements Clos
* Explicit calls to maybeMerge() are usually not
* necessary. The most common case is when merge policy
* parameters have changed.
+ *
+ * This method will call the {@link MergePolicy} with
+ * {@link MergeTrigger#EXPLICIT}.
*
* <p><b>NOTE</b>: if this method hits an OutOfMemoryError
* you should immediately close the writer. See <a
* href="#OOME">above</a> for details.</p>
*/
public final void maybeMerge() throws IOException {
- maybeMerge(-1);
+ maybeMerge(MergeTrigger.EXPLICIT, UNBOUNDED_MAX_MERGE_SEGMENTS);
}
- private final void maybeMerge(int maxNumSegments) throws IOException {
+ private final void maybeMerge(MergeTrigger trigger, int maxNumSegments) throws IOException {
ensureOpen(false);
- updatePendingMerges(maxNumSegments);
+ updatePendingMerges(trigger, maxNumSegments);
mergeScheduler.merge(this);
}
- private synchronized void updatePendingMerges(int maxNumSegments)
+ private synchronized void updatePendingMerges(MergeTrigger trigger, int maxNumSegments)
throws IOException {
assert maxNumSegments == -1 || maxNumSegments > 0;
-
+ assert trigger != null;
if (stopMerges) {
return;
}
@@ -1825,7 +1833,9 @@ public class IndexWriter implements Clos
}
final MergePolicy.MergeSpecification spec;
- if (maxNumSegments != -1) {
+ if (maxNumSegments != UNBOUNDED_MAX_MERGE_SEGMENTS) {
+ assert trigger == MergeTrigger.EXPLICIT || trigger == MergeTrigger.MERGE_FINISHED :
+ "Expected EXPLICIT or MERGE_FINISHED as trigger even with maxNumSegments set but was: " + trigger.name();
spec = mergePolicy.findForcedMerges(segmentInfos, maxNumSegments, Collections.unmodifiableMap(segmentsToMerge));
if (spec != null) {
final int numMerges = spec.merges.size();
@@ -1836,7 +1846,7 @@ public class IndexWriter implements Clos
}
} else {
- spec = mergePolicy.findMerges(segmentInfos);
+ spec = mergePolicy.findMerges(trigger, segmentInfos);
}
if (spec != null) {
@@ -2653,7 +2663,7 @@ public class IndexWriter implements Clos
boolean success = false;
try {
if (anySegmentsFlushed) {
- maybeMerge();
+ maybeMerge(MergeTrigger.FULL_FLUSH, UNBOUNDED_MAX_MERGE_SEGMENTS);
}
success = true;
} finally {
@@ -2809,7 +2819,7 @@ public class IndexWriter implements Clos
// We can be called during close, when closing==true, so we must pass false to ensureOpen:
ensureOpen(false);
if (doFlush(applyAllDeletes) && triggerMerge) {
- maybeMerge();
+ maybeMerge(MergeTrigger.FULL_FLUSH, UNBOUNDED_MAX_MERGE_SEGMENTS);
}
}
@@ -3240,7 +3250,7 @@ public class IndexWriter implements Clos
// segments) may now enable new merges, so we call
// merge policy & update pending merges.
if (success && !merge.isAborted() && (merge.maxNumSegments != -1 || (!closed && !closing))) {
- updatePendingMerges(merge.maxNumSegments);
+ updatePendingMerges(MergeTrigger.MERGE_FINISHED, merge.maxNumSegments);
}
}
}
Modified: lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/LogMergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/LogMergePolicy.java?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/LogMergePolicy.java (original)
+++ lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/LogMergePolicy.java Tue Oct 23 17:53:00 2012
@@ -24,6 +24,8 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
+import org.apache.lucene.index.MergePolicy.MergeTrigger;
+
/**
* <p>This class implements a {@link MergePolicy} that tries
@@ -560,7 +562,7 @@ public abstract class LogMergePolicy ext
* will return multiple merges, allowing the {@link
* MergeScheduler} to use concurrency. */
@Override
- public MergeSpecification findMerges(SegmentInfos infos) throws IOException {
+ public MergeSpecification findMerges(MergeTrigger mergeTrigger, SegmentInfos infos) throws IOException {
final int numSegments = infos.size();
if (verbose()) {
Modified: lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java?rev=1401363&r1=1401362&r2=1401363&view=diff
==============================================================================
--- lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java (original)
+++ lucene/dev/branches/lucene3846/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java Tue Oct 23 17:53:00 2012
@@ -19,6 +19,7 @@ package org.apache.lucene.index;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.List;
import java.util.Map;
@@ -57,7 +58,7 @@ import org.apache.lucene.util.SetOnce;
*/
public abstract class MergePolicy implements java.io.Closeable, Cloneable {
-
+
/** OneMerge provides the information necessary to perform
* an individual primitive merge operation, resulting in
* a single new segment. The merge spec includes the
@@ -333,11 +334,11 @@ public abstract class MergePolicy implem
* {@link IndexWriter} calls this whenever there is a change to the segments.
* This call is always synchronized on the {@link IndexWriter} instance so
* only one thread at a time will call this method.
- *
+ * @param mergeTrigger the event that triggered the merge
* @param segmentInfos
* the total set of segments in the index
*/
- public abstract MergeSpecification findMerges(SegmentInfos segmentInfos)
+ public abstract MergeSpecification findMerges(MergeTrigger mergeTrigger, SegmentInfos segmentInfos)
throws IOException;
/**
@@ -379,9 +380,36 @@ public abstract class MergePolicy implem
* Release all resources for the policy.
*/
public abstract void close();
-
+
+
/**
* Returns true if a new segment (regardless of its origin) should use the compound file format.
*/
public abstract boolean useCompoundFile(SegmentInfos segments, SegmentInfoPerCommit newSegment) throws IOException;
+
+ /**
+ * MergeTrigger is passed to
+ * {@link MergePolicy#findMerges(MergeTrigger, SegmentInfos)} to indicate the
+ * event that triggered the merge.
+ */
+ public static enum MergeTrigger {
+ /**
+ * Merge was triggered by a segment flush.
+ */
+ SEGMENT_FLUSH,
+ /**
+ * Merge was triggered by a full flush. Full flushes
+ * can be caused by a commit, NRT reader reopen or a close call on the index writer.
+ */
+ FULL_FLUSH,
+ /**
+ * Merge has been triggered explicitly by the user.
+ */
+ EXPLICIT,
+
+ /**
+ * Merge was triggered by a successfully finished merge.
+ */
+ MERGE_FINISHED,
+ }
}