You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ja...@apache.org on 2013/07/04 01:26:45 UTC
svn commit: r1499601 [2/20] - in /lucene/dev/branches/security: ./
dev-tools/ dev-tools/eclipse/ dev-tools/idea/.idea/libraries/
dev-tools/idea/solr/core/src/test/ dev-tools/maven/ dev-tools/maven/lucene/
dev-tools/maven/lucene/analysis/stempel/ dev-to...
Modified: lucene/dev/branches/security/.gitignore
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/.gitignore?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/.gitignore (original)
+++ lucene/dev/branches/security/.gitignore Wed Jul 3 23:26:32 2013
@@ -1,4 +1,5 @@
-
+# hdfs
+/solr/example/hdfs
*.jar
# .
Modified: lucene/dev/branches/security/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/build.xml?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/build.xml (original)
+++ lucene/dev/branches/security/build.xml Wed Jul 3 23:26:32 2013
@@ -90,6 +90,7 @@
<target name="rat-sources" description="Runs rat across all sources and tests">
<subant target="rat-sources" inheritall="false" failonerror="true">
+ <fileset dir="." includes="extra-targets.xml" /><!-- run rat-sources also for root directory -->
<fileset dir="lucene" includes="build.xml" />
<fileset dir="solr" includes="build.xml" />
</subant>
@@ -130,7 +131,7 @@
<target name="get-maven-poms"
description="Copy Maven POMs from dev-tools/maven/ to maven-build/">
- <copy todir="${maven-build-dir}" overwrite="true">
+ <copy todir="${maven-build-dir}" overwrite="true" encoding="UTF-8">
<fileset dir="${basedir}/dev-tools/maven"/>
<filterset begintoken="@" endtoken="@">
<filter token="version" value="${version}"/>
@@ -185,7 +186,12 @@
</target>
<target name="eclipse" depends="resolve" description="Setup Eclipse configuration">
- <copy file="dev-tools/eclipse/dot.project" tofile=".project" overwrite="false"/>
+ <basename file="${basedir}" property="eclipseprojectname"/>
+ <copy file="dev-tools/eclipse/dot.project" tofile=".project" overwrite="false" encoding="UTF-8">
+ <filterset>
+ <filter token="ECLIPSEPROJECTNAME" value="${eclipseprojectname}"/>
+ </filterset>
+ </copy>
<mkdir dir=".settings"/>
<copy todir=".settings/" overwrite="true">
<fileset dir="dev-tools/eclipse/dot.settings" includes="*.prefs" />
@@ -199,7 +205,7 @@
<!-- TODO: find a better way to exclude duplicate JAR files & fix the servlet-api mess! -->
<pathconvert property="eclipse.fileset.libs" pathsep="|" dirsep="/">
<fileset dir="${basedir}/lucene" includes="**/lib/*.jar" excludes="**/*servlet-api*.jar, analysis/uima/**, tools/**, build/**"/>
- <fileset dir="${basedir}/solr" includes="**/lib/*.jar" excludes="core/lib/*servlet-api*.jar, contrib/analysis-extras/**, test-framework/**, build/**, dist/**, package/**" />
+ <fileset dir="${basedir}/solr" includes="**/lib/*.jar" excludes="core/lib/*servlet-api*.jar, contrib/analysis-extras/**, test-framework/lib/junit*, test-framework/lib/ant*, test-framework/lib/randomizedtesting*, build/**, dist/**, package/**" />
<map from="${basedir}/" to=""/>
</pathconvert>
<xslt in="${ant.file}" out=".classpath" style="dev-tools/eclipse/dot.classpath.xsl" force="true">
Modified: lucene/dev/branches/security/dev-tools/eclipse/dot.project
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/dev-tools/eclipse/dot.project?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/dev-tools/eclipse/dot.project (original)
+++ lucene/dev/branches/security/dev-tools/eclipse/dot.project Wed Jul 3 23:26:32 2013
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
- <name>lucene_solr_trunk</name>
+ <name>@ECLIPSEPROJECTNAME@</name>
<comment></comment>
<projects>
</projects>
Modified: lucene/dev/branches/security/dev-tools/idea/solr/core/src/test/solr-core-tests.iml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/dev-tools/idea/solr/core/src/test/solr-core-tests.iml?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/dev-tools/idea/solr/core/src/test/solr-core-tests.iml (original)
+++ lucene/dev/branches/security/dev-tools/idea/solr/core/src/test/solr-core-tests.iml Wed Jul 3 23:26:32 2013
@@ -13,6 +13,7 @@
<orderEntry type="library" scope="TEST" name="Solr core library" level="project" />
<orderEntry type="library" scope="TEST" name="Solrj library" level="project" />
<orderEntry type="library" scope="TEST" name="Solr example library" level="project" />
+ <orderEntry type="library" scope="TEST" name="Solr test framework library" level="project" />
<orderEntry type="module" scope="TEST" module-name="lucene-test-framework" />
<orderEntry type="module" scope="TEST" module-name="solr-test-framework" />
<orderEntry type="module" scope="TEST" module-name="solr-core-test-files" />
Modified: lucene/dev/branches/security/dev-tools/maven/lucene/analysis/stempel/pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/dev-tools/maven/lucene/analysis/stempel/pom.xml.template?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/dev-tools/maven/lucene/analysis/stempel/pom.xml.template (original)
+++ lucene/dev/branches/security/dev-tools/maven/lucene/analysis/stempel/pom.xml.template Wed Jul 3 23:26:32 2013
@@ -96,6 +96,7 @@
</bundledSignatures>
<signaturesFiles>
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/executors.txt</signaturesFile>
+ <signaturesFile>${top-level}/lucene/tools/forbiddenApis/chars.txt</signaturesFile>
</signaturesFiles>
</configuration>
<goals>
Modified: lucene/dev/branches/security/dev-tools/maven/lucene/benchmark/pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/dev-tools/maven/lucene/benchmark/pom.xml.template?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/dev-tools/maven/lucene/benchmark/pom.xml.template (original)
+++ lucene/dev/branches/security/dev-tools/maven/lucene/benchmark/pom.xml.template Wed Jul 3 23:26:32 2013
@@ -142,6 +142,7 @@
</bundledSignatures>
<signaturesFiles>
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/executors.txt</signaturesFile>
+ <signaturesFile>${top-level}/lucene/tools/forbiddenApis/chars.txt</signaturesFile>
</signaturesFiles>
</configuration>
<goals>
Modified: lucene/dev/branches/security/dev-tools/maven/lucene/core/src/java/pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/dev-tools/maven/lucene/core/src/java/pom.xml.template?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/dev-tools/maven/lucene/core/src/java/pom.xml.template (original)
+++ lucene/dev/branches/security/dev-tools/maven/lucene/core/src/java/pom.xml.template Wed Jul 3 23:26:32 2013
@@ -87,6 +87,7 @@
</bundledSignatures>
<signaturesFiles>
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/executors.txt</signaturesFile>
+ <signaturesFile>${top-level}/lucene/tools/forbiddenApis/chars.txt</signaturesFile>
</signaturesFiles>
</configuration>
<goals>
Modified: lucene/dev/branches/security/dev-tools/maven/lucene/demo/pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/dev-tools/maven/lucene/demo/pom.xml.template?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/dev-tools/maven/lucene/demo/pom.xml.template (original)
+++ lucene/dev/branches/security/dev-tools/maven/lucene/demo/pom.xml.template Wed Jul 3 23:26:32 2013
@@ -113,6 +113,7 @@
</bundledSignatures>
<signaturesFiles>
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/executors.txt</signaturesFile>
+ <signaturesFile>${top-level}/lucene/tools/forbiddenApis/chars.txt</signaturesFile>
</signaturesFiles>
</configuration>
<goals>
Modified: lucene/dev/branches/security/dev-tools/maven/lucene/facet/pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/dev-tools/maven/lucene/facet/pom.xml.template?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/dev-tools/maven/lucene/facet/pom.xml.template (original)
+++ lucene/dev/branches/security/dev-tools/maven/lucene/facet/pom.xml.template Wed Jul 3 23:26:32 2013
@@ -98,6 +98,7 @@
</bundledSignatures>
<signaturesFiles>
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/executors.txt</signaturesFile>
+ <signaturesFile>${top-level}/lucene/tools/forbiddenApis/chars.txt</signaturesFile>
</signaturesFiles>
</configuration>
<goals>
Modified: lucene/dev/branches/security/dev-tools/maven/lucene/misc/pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/dev-tools/maven/lucene/misc/pom.xml.template?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/dev-tools/maven/lucene/misc/pom.xml.template (original)
+++ lucene/dev/branches/security/dev-tools/maven/lucene/misc/pom.xml.template Wed Jul 3 23:26:32 2013
@@ -86,6 +86,7 @@
</bundledSignatures>
<signaturesFiles>
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/executors.txt</signaturesFile>
+ <signaturesFile>${top-level}/lucene/tools/forbiddenApis/chars.txt</signaturesFile>
</signaturesFiles>
</configuration>
<goals>
Modified: lucene/dev/branches/security/dev-tools/maven/lucene/pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/dev-tools/maven/lucene/pom.xml.template?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/dev-tools/maven/lucene/pom.xml.template (original)
+++ lucene/dev/branches/security/dev-tools/maven/lucene/pom.xml.template Wed Jul 3 23:26:32 2013
@@ -78,6 +78,7 @@
</bundledSignatures>
<signaturesFiles>
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/executors.txt</signaturesFile>
+ <signaturesFile>${top-level}/lucene/tools/forbiddenApis/chars.txt</signaturesFile>
</signaturesFiles>
</configuration>
<goals>
@@ -96,6 +97,7 @@
<signaturesFiles>
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/tests.txt</signaturesFile>
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/executors.txt</signaturesFile>
+ <signaturesFile>${top-level}/lucene/tools/forbiddenApis/chars.txt</signaturesFile>
</signaturesFiles>
</configuration>
<goals>
Modified: lucene/dev/branches/security/dev-tools/maven/lucene/queryparser/pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/dev-tools/maven/lucene/queryparser/pom.xml.template?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/dev-tools/maven/lucene/queryparser/pom.xml.template (original)
+++ lucene/dev/branches/security/dev-tools/maven/lucene/queryparser/pom.xml.template Wed Jul 3 23:26:32 2013
@@ -101,6 +101,7 @@
</bundledSignatures>
<signaturesFiles>
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/executors.txt</signaturesFile>
+ <signaturesFile>${top-level}/lucene/tools/forbiddenApis/chars.txt</signaturesFile>
</signaturesFiles>
</configuration>
<goals>
Modified: lucene/dev/branches/security/dev-tools/maven/lucene/test-framework/pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/dev-tools/maven/lucene/test-framework/pom.xml.template?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/dev-tools/maven/lucene/test-framework/pom.xml.template (original)
+++ lucene/dev/branches/security/dev-tools/maven/lucene/test-framework/pom.xml.template Wed Jul 3 23:26:32 2013
@@ -106,6 +106,7 @@
<signaturesFiles>
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/tests.txt</signaturesFile>
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/executors.txt</signaturesFile>
+ <signaturesFile>${top-level}/lucene/tools/forbiddenApis/chars.txt</signaturesFile>
</signaturesFiles>
</configuration>
<goals>
Modified: lucene/dev/branches/security/dev-tools/maven/pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/dev-tools/maven/pom.xml.template?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/dev-tools/maven/pom.xml.template (original)
+++ lucene/dev/branches/security/dev-tools/maven/pom.xml.template Wed Jul 3 23:26:32 2013
@@ -47,10 +47,11 @@
<jetty.version>8.1.10.v20130312</jetty.version>
<slf4j.version>1.6.6</slf4j.version>
<log4j.version>1.2.16</log4j.version>
- <tika.version>1.3</tika.version>
+ <tika.version>1.4</tika.version>
<httpcomponents.version>4.2.3</httpcomponents.version>
<commons-io.version>2.1</commons-io.version>
<restlet.version>2.1.1</restlet.version>
+ <hadoop.version>2.0.5-alpha</hadoop.version>
<!-- RandomizedTesting library system properties -->
<tests.iters>1</tests.iters>
@@ -183,6 +184,11 @@
<version>${commons-io.version}</version>
</dependency>
<dependency>
+ <groupId>joda-time</groupId>
+ <artifactId>joda-time</artifactId>
+ <version>2.2</version>
+ </dependency>
+ <dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>${httpcomponents.version}</version>
@@ -204,6 +210,16 @@
<version>0.5</version>
</dependency>
<dependency>
+ <groupId>com.googlecode.concurrentlinkedhashmap</groupId>
+ <artifactId>concurrentlinkedhashmap-lru</artifactId>
+ <version>1.2</version>
+ </dependency>
+ <dependency>
+ <groupId>com.sun.jersey</groupId>
+ <artifactId>jersey-core</artifactId>
+ <version>1.16</version>
+ </dependency>
+ <dependency>
<groupId>commons-lang</groupId>
<artifactId>commons-lang</artifactId>
<version>2.6</version>
@@ -281,6 +297,38 @@
<version>10.9.1.0</version>
</dependency>
<dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-annotations</artifactId>
+ <version>${hadoop.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-auth</artifactId>
+ <version>${hadoop.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-common</artifactId>
+ <version>${hadoop.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-common</artifactId>
+ <version>${hadoop.version}</version>
+ <classifier>tests</classifier>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-hdfs</artifactId>
+ <version>${hadoop.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-hdfs</artifactId>
+ <version>${hadoop.version}</version>
+ <classifier>tests</classifier>
+ </dependency>
+ <dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>${tika.version}</version>
@@ -307,8 +355,8 @@
<artifactId>vorbis-java-core</artifactId>
</exclusion>
<exclusion>
- <groupId>asm</groupId>
- <artifactId>asm</artifactId>
+ <groupId>org.ow2.asm</groupId>
+ <artifactId>asm-debug-all</artifactId>
</exclusion>
<exclusion>
<groupId>org.aspectj</groupId>
@@ -364,7 +412,7 @@
<dependency>
<groupId>org.carrot2</groupId>
<artifactId>morfologik-polish</artifactId>
- <version>1.5.5</version>
+ <version>1.6.0</version>
</dependency>
<dependency>
<groupId>org.codehaus.woodstox</groupId>
@@ -398,6 +446,16 @@
<version>${jetty.version}</version>
</dependency>
<dependency>
+ <groupId>org.mortbay.jetty</groupId>
+ <artifactId>jetty</artifactId>
+ <version>6.1.26</version>
+ </dependency>
+ <dependency>
+ <groupId>org.mortbay.jetty</groupId>
+ <artifactId>jetty-util</artifactId>
+ <version>6.1.26</version>
+ </dependency>
+ <dependency>
<groupId>org.restlet.jee</groupId>
<artifactId>org.restlet</artifactId>
<version>${restlet.version}</version>
@@ -510,7 +568,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
- <version>3.0</version>
+ <version>3.1</version>
<configuration>
<source>${java.compat.version}</source>
<target>${java.compat.version}</target>
@@ -519,7 +577,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
- <version>2.6</version>
+ <version>2.8</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
@@ -529,7 +587,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-enforcer-plugin</artifactId>
- <version>1.2</version>
+ <version>1.3</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
@@ -565,7 +623,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
- <version>2.9</version>
+ <version>2.9.1</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
@@ -574,8 +632,13 @@
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-site-plugin</artifactId>
+ <version>3.3</version>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
- <version>2.13</version>
+ <version>2.15</version>
<configuration>
<runOrder>random</runOrder>
<reportFormat>plain</reportFormat>
@@ -640,7 +703,7 @@
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
- <version>1.7</version>
+ <version>1.8</version>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
@@ -739,7 +802,7 @@
<plugin>
<groupId>org.apache.felix</groupId>
<artifactId>maven-bundle-plugin</artifactId>
- <version>2.3.7</version>
+ <version>2.4.0</version>
<configuration>
<instructions>
<Export-Package>*;-split-package:=merge-first</Export-Package>
Modified: lucene/dev/branches/security/dev-tools/maven/solr/core/src/java/pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/dev-tools/maven/solr/core/src/java/pom.xml.template?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/dev-tools/maven/solr/core/src/java/pom.xml.template (original)
+++ lucene/dev/branches/security/dev-tools/maven/solr/core/src/java/pom.xml.template Wed Jul 3 23:26:32 2013
@@ -125,6 +125,10 @@
<version>${project.version}</version>
</dependency>
<dependency>
+ <groupId>com.googlecode.concurrentlinkedhashmap</groupId>
+ <artifactId>concurrentlinkedhashmap-lru</artifactId>
+ </dependency>
+ <dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
</dependency>
@@ -137,6 +141,146 @@
<artifactId>commons-fileupload</artifactId>
</dependency>
<dependency>
+ <groupId>joda-time</groupId>
+ <artifactId>joda-time</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-annotations</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-auth</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-common</artifactId>
+ <exclusions>
+ <exclusion>
+ <groupId>javax.servlet</groupId>
+ <artifactId>servlet-api</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-math</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>xmlenc</groupId>
+ <artifactId>xmlenc</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>commons-httpclient</groupId>
+ <artifactId>commons-httpclient</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>commons-net</groupId>
+ <artifactId>commons-net</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>com.sun.jersey</groupId>
+ <artifactId>jersey-core</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>com.sun.jersey</groupId>
+ <artifactId>jersey-json</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>com.sun.jersey</groupId>
+ <artifactId>jersey-server</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>tomcat</groupId>
+ <artifactId>jasper-compiler</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>tomcat</groupId>
+ <artifactId>jasper-runtime</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>commons-el</groupId>
+ <artifactId>commons-el</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>commons-logging</groupId>
+ <artifactId>commons-logging</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>net.java.dev.jets3t</groupId>
+ <artifactId>jets3t</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>commons-digester</groupId>
+ <artifactId>commons-digester</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>commons-beanutils</groupId>
+ <artifactId>commons-beanutils-core</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.codehaus.jackson</groupId>
+ <artifactId>jackson-core-asl</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.codehaus.jackson</groupId>
+ <artifactId>jackson-mapper-asl</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.avro</groupId>
+ <artifactId>avro</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>net.sf.kosmosfs</groupId>
+ <artifactId>kfs</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>com.jcraft</groupId>
+ <artifactId>jsch</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-hdfs</artifactId>
+ <exclusions>
+ <exclusion>
+ <groupId>javax.servlet</groupId>
+ <artifactId>servlet-api</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>com.sun.jersey</groupId>
+ <artifactId>jersey-core</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>com.sun.jersey</groupId>
+ <artifactId>jersey-server</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>commons-logging</groupId>
+ <artifactId>commons-logging</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.codehaus.jackson</groupId>
+ <artifactId>jackson-core-asl</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.codehaus.jackson</groupId>
+ <artifactId>jackson-mapper-asl</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>tomcat</groupId>
+ <artifactId>jasper-runtime</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>xmlenc</groupId>
+ <artifactId>xmlenc</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>commons-daemon</groupId>
+ <artifactId>commons-daemon</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
<groupId>org.restlet.jee</groupId>
<artifactId>org.restlet</artifactId>
<version>${restlet.version}</version>
@@ -249,6 +393,7 @@
<signaturesFiles>
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/executors.txt</signaturesFile>
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/servlet-api.txt</signaturesFile>
+ <signaturesFile>${top-level}/lucene/tools/forbiddenApis/chars.txt</signaturesFile>
</signaturesFiles>
</configuration>
<goals>
Modified: lucene/dev/branches/security/dev-tools/maven/solr/core/src/test/pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/dev-tools/maven/solr/core/src/test/pom.xml.template?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/dev-tools/maven/solr/core/src/test/pom.xml.template (original)
+++ lucene/dev/branches/security/dev-tools/maven/solr/core/src/test/pom.xml.template Wed Jul 3 23:26:32 2013
@@ -138,6 +138,7 @@
<signaturesFiles>
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/servlet-api.txt</signaturesFile>
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/executors.txt</signaturesFile>
+ <signaturesFile>${top-level}/lucene/tools/forbiddenApis/chars.txt</signaturesFile>
</signaturesFiles>
</configuration>
<goals>
Modified: lucene/dev/branches/security/dev-tools/maven/solr/pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/dev-tools/maven/solr/pom.xml.template?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/dev-tools/maven/solr/pom.xml.template (original)
+++ lucene/dev/branches/security/dev-tools/maven/solr/pom.xml.template Wed Jul 3 23:26:32 2013
@@ -34,7 +34,6 @@
<modules>
<module>core</module>
<module>solrj</module>
- <module>webapp</module>
<module>test-framework</module>
<module>contrib</module>
</modules>
@@ -149,6 +148,7 @@
</bundledSignatures>
<signaturesFiles>
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/executors.txt</signaturesFile>
+ <signaturesFile>${top-level}/lucene/tools/forbiddenApis/chars.txt</signaturesFile>
</signaturesFiles>
</configuration>
<goals>
@@ -169,6 +169,7 @@
<signaturesFiles>
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/tests.txt</signaturesFile>
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/executors.txt</signaturesFile>
+ <signaturesFile>${top-level}/lucene/tools/forbiddenApis/chars.txt</signaturesFile>
</signaturesFiles>
</configuration>
<goals>
Modified: lucene/dev/branches/security/dev-tools/maven/solr/solrj/src/java/pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/dev-tools/maven/solr/solrj/src/java/pom.xml.template?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/dev-tools/maven/solr/solrj/src/java/pom.xml.template (original)
+++ lucene/dev/branches/security/dev-tools/maven/solr/solrj/src/java/pom.xml.template Wed Jul 3 23:26:32 2013
@@ -155,6 +155,7 @@
</bundledSignatures>
<signaturesFiles>
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/executors.txt</signaturesFile>
+ <signaturesFile>${top-level}/lucene/tools/forbiddenApis/chars.txt</signaturesFile>
<!-- Solr-J does NOT depend on servlet-api -->
<!-- <signaturesFile>${top-level}/lucene/tools/forbiddenApis/servlet-api.txt</signaturesFile> -->
</signaturesFiles>
Modified: lucene/dev/branches/security/dev-tools/maven/solr/solrj/src/test/pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/dev-tools/maven/solr/solrj/src/test/pom.xml.template?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/dev-tools/maven/solr/solrj/src/test/pom.xml.template (original)
+++ lucene/dev/branches/security/dev-tools/maven/solr/solrj/src/test/pom.xml.template Wed Jul 3 23:26:32 2013
@@ -142,6 +142,7 @@
<signaturesFiles>
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/tests.txt</signaturesFile>
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/executors.txt</signaturesFile>
+ <signaturesFile>${top-level}/lucene/tools/forbiddenApis/chars.txt</signaturesFile>
</signaturesFiles>
</configuration>
<goals>
Modified: lucene/dev/branches/security/dev-tools/maven/solr/test-framework/pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/dev-tools/maven/solr/test-framework/pom.xml.template?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/dev-tools/maven/solr/test-framework/pom.xml.template (original)
+++ lucene/dev/branches/security/dev-tools/maven/solr/test-framework/pom.xml.template Wed Jul 3 23:26:32 2013
@@ -65,6 +65,128 @@
<artifactId>junit</artifactId>
</dependency>
<dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-common</artifactId>
+ <classifier>tests</classifier>
+ <exclusions>
+ <exclusion>
+ <groupId>javax.servlet</groupId>
+ <artifactId>servlet-api</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-math</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>xmlenc</groupId>
+ <artifactId>xmlenc</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>commons-httpclient</groupId>
+ <artifactId>commons-httpclient</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>commons-net</groupId>
+ <artifactId>commons-net</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>com.sun.jersey</groupId>
+ <artifactId>jersey-json</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>com.sun.jersey</groupId>
+ <artifactId>jersey-server</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>tomcat</groupId>
+ <artifactId>jasper-compiler</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>tomcat</groupId>
+ <artifactId>jasper-runtime</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>commons-el</groupId>
+ <artifactId>commons-el</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>commons-logging</groupId>
+ <artifactId>commons-logging</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>net.java.dev.jets3t</groupId>
+ <artifactId>jets3t</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>commons-digester</groupId>
+ <artifactId>commons-digester</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.codehaus.jackson</groupId>
+ <artifactId>jackson-core-asl</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.codehaus.jackson</groupId>
+ <artifactId>jackson-mapper-asl</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.apache.avro</groupId>
+ <artifactId>avro</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>net.sf.kosmosfs</groupId>
+ <artifactId>kfs</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>com.jcraft</groupId>
+ <artifactId>jsch</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>commons-beanutils</groupId>
+ <artifactId>commons-beanutils-core</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-hdfs</artifactId>
+ <classifier>tests</classifier>
+ <exclusions>
+ <exclusion>
+ <groupId>javax.servlet</groupId>
+ <artifactId>servlet-api</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>com.sun.jersey</groupId>
+ <artifactId>jersey-server</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>commons-logging</groupId>
+ <artifactId>commons-logging</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.codehaus.jackson</groupId>
+ <artifactId>jackson-core-asl</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.codehaus.jackson</groupId>
+ <artifactId>jackson-mapper-asl</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>tomcat</groupId>
+ <artifactId>jasper-runtime</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>xmlenc</groupId>
+ <artifactId>xmlenc</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>commons-daemon</groupId>
+ <artifactId>commons-daemon</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-servlet</artifactId>
</dependency>
@@ -72,6 +194,23 @@
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-util</artifactId>
</dependency>
+
+ <!-- Jetty 6 required for Hadoop DfsMiniCluster -->
+ <dependency>
+ <groupId>org.mortbay.jetty</groupId>
+ <artifactId>jetty</artifactId>
+ <exclusions>
+ <exclusion>
+ <groupId>org.mortbay.jetty</groupId>
+ <artifactId>servlet-api</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.mortbay.jetty</groupId>
+ <artifactId>jetty-util</artifactId>
+ </dependency>
+
<!-- If your tests don't use BaseDistributedSearchTestCase or SolrJettyTestBase,
you can exclude the two Jetty dependencies below. -->
<dependency>
@@ -130,6 +269,7 @@
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/servlet-api.txt</signaturesFile>
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/tests.txt</signaturesFile>
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/executors.txt</signaturesFile>
+ <signaturesFile>${top-level}/lucene/tools/forbiddenApis/chars.txt</signaturesFile>
</signaturesFiles>
</configuration>
<goals>
Modified: lucene/dev/branches/security/dev-tools/scripts/buildAndPushRelease.py
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/dev-tools/scripts/buildAndPushRelease.py?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/dev-tools/scripts/buildAndPushRelease.py (original)
+++ lucene/dev/branches/security/dev-tools/scripts/buildAndPushRelease.py Wed Jul 3 23:26:32 2013
@@ -46,9 +46,10 @@ def run(command):
raise RuntimeError(msg)
def runAndSendGPGPassword(command, password):
- p = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, stdin=subprocess.PIPE)
+ p = subprocess.Popen(command, shell=True, bufsize=0, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, stdin=subprocess.PIPE)
f = open(LOG, 'ab')
while True:
+ p.stdout.flush()
line = p.stdout.readline()
if len(line) == 0:
break
Modified: lucene/dev/branches/security/dev-tools/scripts/smokeTestRelease.py
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/dev-tools/scripts/smokeTestRelease.py?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/dev-tools/scripts/smokeTestRelease.py (original)
+++ lucene/dev/branches/security/dev-tools/scripts/smokeTestRelease.py Wed Jul 3 23:26:32 2013
@@ -970,10 +970,6 @@ def getDistributionsForMavenChecks(tmpDi
print(' unpack %s...' % distribution)
unpackLogFile = '%s/unpack-%s-maven-checks.log' % (tmpDir, distribution)
run('tar xzf %s/%s' % (tmpDir, distribution), unpackLogFile)
- if project == 'solr': # unpack the Solr war
- unpackLogFile = '%s/unpack-solr-war-maven-checks.log' % tmpDir
- print(' unpack Solr war...')
- run('jar xvf */dist/*.war', unpackLogFile)
distributionFiles[project] = []
for root, dirs, files in os.walk(destDir):
distributionFiles[project].extend([os.path.join(root, file) for file in files])
@@ -1309,7 +1305,7 @@ def main():
if len(sys.argv) < 5:
print()
- print('Usage python -u %s BaseURL SvnRevision version tmpDir [ isSigned ] [ -testArgs "-Dwhat=ever [ ... ]" ]'
+ print('Usage python -u %s BaseURL SvnRevision version tmpDir [ isSigned(True|False) ] [ -testArgs "-Dwhat=ever [ ... ]" ]'
% sys.argv[0])
print()
print(' example: python3.2 -u dev-tools/scripts/smokeTestRelease.py http://people.apache.org/~whoever/staging_area/lucene-solr-4.3.0-RC1-rev1469340 1469340 4.3.0 /path/to/a/tmp/dir')
Modified: lucene/dev/branches/security/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/CHANGES.txt?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/security/lucene/CHANGES.txt Wed Jul 3 23:26:32 2013
@@ -23,11 +23,19 @@ Changes in backwards compatibility polic
not positioned. This change affects all classes that inherit from
DocIdSetIterator, including DocsEnum and DocsAndPositionsEnum. (Adrien Grand)
+* LUCENE-5089: Update to Morfologik 1.6.0. MorfologikAnalyzer and MorfologikFilter
+ no longer support multiple "dictionaries" as there is only one dictionary available.
+ (Dawid Weiss)
+
New Features
* LUCENE-4747: Move to Java 7 as minimum Java version.
(Robert Muir, Uwe Schindler)
+* LUCENE-5089: Update to Morfologik 1.6.0. MorfologikAnalyzer and MorfologikFilter
+ no longer support multiple "dictionaries" as there is only one dictionary available.
+ (Dawid Weiss)
+
Optimizations
* LUCENE-4848: Use Java 7 NIO2-FileChannel instead of RandomAccessFile
@@ -39,6 +47,9 @@ Optimizations
Changes in backwards compatibility policy
+* LUCENE-5085: MorfologikFilter will no longer stem words marked as keywords
+ (Dawid Weiss, Grzegorz Sobczyk)
+
* LUCENE-4955: NGramTokenFilter now emits all n-grams for the same token at the
same position and preserves the position length and the offsets of the
original token. (Simon Willnauer, Adrien Grand)
@@ -47,6 +58,10 @@ Changes in backwards compatibility polic
(a, ab, b, bc, c) instead of (a, b, c, ab, bc) and doesn't trim trailing
whitespaces. (Adrien Grand)
+* LUCENE-5042: The n-gram and edge n-gram tokenizers and filters now correctly
+ handle supplementary characters, and the tokenizers have the ability to
+ pre-tokenize the input stream similarly to CharTokenizer. (Adrien Grand)
+
* LUCENE-4967: NRTManager is replaced by
ControlledRealTimeReopenThread, for controlling which requests must
see which indexing changes, so that it can work with any
@@ -90,8 +105,31 @@ Changes in backwards compatibility polic
categories. You should set TakmiSampleFixer on SamplingParams if required (but
notice that this means slower search). (Rob Audenaerde, Gilad Barkai, Shai Erera)
+* LUCENE-4933: Replace ExactSimScorer/SloppySimScorer with just SimScorer. Previously
+ there were 2 implementations as a performance hack to support tableization of
+ sqrt(), but this caching is removed, as sqrt is implemented in hardware with modern
+ jvms and it's faster not to cache. (Robert Muir)
+
+* LUCENE-5038: MergePolicy now has a default implementation for useCompoundFile based
+ on segment size and noCFSRatio. The default implementation was pulled up from
+ TieredMergePolicy. (Simon Willnauer)
+
+* LUCENE-5063: FieldCache.get(Bytes|Shorts), SortField.Type.(BYTE|SHORT) and
+ FieldCache.DEFAULT_(BYTE|SHORT|INT|LONG|FLOAT|DOUBLE)_PARSER are now
+ deprecated. These methods/types assume that data is stored as strings although
+ Lucene has much better support for numeric data through (Int|Long)Field,
+ NumericRangeQuery and FieldCache.get(Int|Long)s. (Adrien Grand)
+
+* LUCENE-5078: TfIDFSimilarity lets you encode the norm value as any arbitrary long.
+ As a result, encode/decodeNormValue were made abstract with their signatures changed.
+ The default implementation was moved to DefaultSimilarity, which encodes the norm as
+ a single-byte value. (Shai Erera)
+
Bug Fixes
+* LUCENE-4890: QueryTreeBuilder.getBuilder() only finds interfaces on the
+ most derived class. (Adriano Crestani)
+
* LUCENE-4997: Internal test framework's tests are sensitive to previous
test failures and tests.failfast. (Dawid Weiss, Shai Erera)
@@ -123,6 +161,48 @@ Bug Fixes
some readers did not have the requested numeric DV field.
(Rob Audenaerde, Shai Erera)
+* LUCENE-5028: Remove pointless and confusing doShare option in FST's
+ PositiveIntOutputs (Han Jiang via Mike McCandless)
+
+* LUCENE-5032: Fix IndexOutOfBoundsExc in PostingsHighlighter when
+ multi-valued fields exceed maxLength (Tomás Fernández Löbbe
+ via Mike McCandless)
+
+* LUCENE-4933: SweetSpotSimilarity didn't apply its tf function to some
+ queries (SloppyPhraseQuery, SpanQueries). (Robert Muir)
+
+* LUCENE-5033: SlowFuzzyQuery was accepting too many terms (documents) when
+ provided minSimilarity is an int > 1 (Tim Allison via Mike McCandless)
+
+* LUCENE-5045: DrillSideways.search did not work on an empty index. (Shai Erera)
+
+* LUCENE-4995: CompressingStoredFieldsReader now only reuses an internal buffer
+ when there is no more than 32kb to decompress. This prevents from running
+ into out-of-memory errors when working with large stored fields.
+ (Adrien Grand)
+
+* LUCENE-5048: CategoryPath with a long path could result in hitting
+ NegativeArraySizeException, categories being added multiple times to the
+ taxonomy or drill-down terms silently discarded by the indexer. CategoryPath
+ is now limited to MAX_CATEGORY_PATH_LENGTH characters.
+ (Colton Jamieson, Mike McCandless, Shai Erera)
+
+* LUCENE-5062: If the spatial data for a document was comprised of multiple
+ overlapping or adjacent parts then a CONTAINS predicate query might not match
+ when the sum of those shapes contain the query shape but none do individually.
+ A flag was added to use the original faster algorithm. (David Smiley)
+
+* LUCENE-4971: Fixed NPE in AnalyzingSuggester when there are too many
+ graph expansions. (Alexey Kudinov via Mike McCandless)
+
+* LUCENE-5080: Combined setMaxMergeCount and setMaxThreadCount into one
+ setter in ConcurrentMergePolicy: setMaxMergesAndThreads. Previously these
+ setters would not work unless you invoked them very carefully.
+ (Robert Muir, Shai Erera)
+
+* LUCENE-5068: QueryParserUtil.escape() does not escape forward slash.
+ (Matias Holte via Steve Rowe)
+
Optimizations
* LUCENE-4936: Improve numeric doc values compression in case all values share
@@ -137,8 +217,23 @@ Optimizations
single snapshots_N file, and no longer requires closing (Mike
McCandless, Shai Erera)
+* LUCENE-5035: Compress addresses in FieldCacheImpl.SortedDocValuesImpl more
+ efficiently. (Adrien Grand, Robert Muir)
+
+* LUCENE-4941: Sort "from" terms only once when using JoinUtil.
+ (Martijn van Groningen)
+
+* LUCENE-5050: Close the stored fields and term vectors index files as soon as
+ the index has been loaded into memory to save file descriptors. (Adrien Grand)
+
New Features
+* LUCENE-5085: MorfologikFilter will no longer stem words marked as keywords
+ (Dawid Weiss, Grzegorz Sobczyk)
+
+* LUCENE-5064: Added PagedMutable (internal), a paged extension of
+ PackedInts.Mutable which allows for storing more than 2B values. (Adrien Grand)
+
* LUCENE-4766: Added a PatternCaptureGroupTokenFilter that uses Java regexes to
emit multiple tokens one for each capture group in one or more patterns.
(Simon Willnauer, Clinton Gormley)
@@ -169,6 +264,37 @@ New Features
* LUCENE-5022: Added FacetResult.mergeHierarchies to merge multiple
FacetResult of the same dimension into a single one with the reconstructed
hierarchy. (Shai Erera)
+
+* LUCENE-5026: Added PagedGrowableWriter, a new internal packed-ints structure
+ that grows the number of bits per value on demand, can store more than 2B
+ values and supports random write and read access. (Adrien Grand)
+
+* LUCENE-5025: FST's Builder can now handle more than 2.1 billion
+ "tail nodes" while building a minimal FST. (Aaron Binns, Adrien
+ Grand, Mike McCandless)
+
+* LUCENE-5063: FieldCache.DEFAULT.get(Ints|Longs) now uses bit-packing to save
+ memory. (Adrien Grand)
+
+* LUCENE-5079: IndexWriter.hasUncommittedChanges() returns true if there are
+ changes that have not been committed. (yonik, Mike McCandless, Uwe Schindler)
+
+* SOLR-4565: Extend NorwegianLightStemFilter and NorwegianMinimalStemFilter
+ to handle "nynorsk" (Erlend Garåsen, janhoy via Robert Muir)
+
+* LUCENE-5087: Add getMultiValuedSeparator to PostingsHighlighter, for cases
+ where you want a different logical separator between field values. This can
+ be set to e.g. U+2029 PARAGRAPH SEPARATOR if you never want passages to span
+ values. (Mike McCandless, Robert Muir)
+
+* LUCENE-5013: Added ScandinavianFoldingFilterFactory and
+ ScandinavianNormalizationFilterFactory (Karl Wettin via janhoy)
+
+API Changes
+
+* LUCENE-5077: Make it easier to use compressed norms. Lucene42NormsFormat takes
+ an overhead parameter, so you can easily pass a different value other than
+ PackedInts.FASTEST from your own codec. (Robert Muir)
Build
@@ -176,12 +302,31 @@ Build
Test framework may fail internally due to overly aggressive J9 optimizations.
(Dawid Weiss, Shai Erera)
+* LUCENE-5043: The eclipse target now uses the containing directory for the
+ project name. This also enforces UTF-8 encoding when files are copied with
+ filtering.
+
+* LUCENE-5055: "rat-sources" target now checks also build.xml, ivy.xml,
+ forbidden-api signatures, and parts of resources folders. (Ryan Ernst,
+ Uwe Schindler)
+
+* LUCENE-5072: Automatically patch javadocs generated by JDK versions
+ before 7u25 to work around the frame injection vulnerability (CVE-2013-1571,
+ VU#225657). (Uwe Schindler)
+
Tests
* LUCENE-4901: TestIndexWriterOnJRECrash should work on any
JRE vendor via Runtime.halt().
(Mike McCandless, Robert Muir, Uwe Schindler, Rodrigo Trujillo, Dawid Weiss)
+Changes in runtime behavior
+
+* LUCENE-5038: New segments written by IndexWriter are now wrapped into CFS
+ by default. DocumentsWriterPerThread doesn't consult MergePolicy anymore
+ to decide if a CFS must be written, instead IndexWriterConfig now has a
+ property to enable / disable CFS for newly created segments. (Simon Willnauer)
+
======================= Lucene 4.3.1 =======================
Bug Fixes
Modified: lucene/dev/branches/security/lucene/analysis/common/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/build.xml?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/build.xml (original)
+++ lucene/dev/branches/security/lucene/analysis/common/build.xml Wed Jul 3 23:26:32 2013
@@ -25,6 +25,7 @@
<!-- some files for testing that do not have license headers -->
<property name="rat.excludes" value="**/*.aff,**/*.dic,**/*.txt,**/charfilter/*.htm*,**/*LuceneResourcesWikiPage.html"/>
+ <property name="rat.additional-includes" value="src/tools/**"/>
<import file="../analysis-module-build.xml"/>
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java Wed Jul 3 23:26:32 2013
@@ -57,7 +57,7 @@ public final class GreekLowerCaseFilter
int chLen = termAtt.length();
for (int i = 0; i < chLen;) {
i += Character.toChars(
- lowerCase(charUtils.codePointAt(chArray, i)), chArray, i);
+ lowerCase(charUtils.codePointAt(chArray, i, chLen)), chArray, i);
}
return true;
} else {
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java Wed Jul 3 23:26:32 2013
@@ -378,17 +378,14 @@ public class HunspellDictionary {
wordForm = new HunspellWord(flagParsingStrategy.parseFlags(flagPart));
Arrays.sort(wordForm.getFlags());
entry = line.substring(0, flagSep);
- if(ignoreCase) {
- entry = entry.toLowerCase(Locale.ROOT);
- }
}
-
- List<HunspellWord> entries = words.get(entry);
- if (entries == null) {
- entries = new ArrayList<HunspellWord>();
- words.put(entry, entries);
+ if(ignoreCase) {
+ entry = entry.toLowerCase(Locale.ROOT);
}
+
+ List<HunspellWord> entries = new ArrayList<HunspellWord>();
entries.add(wordForm);
+ words.put(entry, entries);
}
}
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java Wed Jul 3 23:26:32 2013
@@ -25,21 +25,26 @@ import org.apache.lucene.analysis.tokena
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;
/**
* Tokenizes the given token into n-grams of given size(s).
* <p>
* This {@link TokenFilter} create n-grams from the beginning edge of a input token.
+ * <p><a name="match_version" />As of Lucene 4.4, this filter handles correctly
+ * supplementary characters.
*/
public final class EdgeNGramTokenFilter extends TokenFilter {
public static final int DEFAULT_MAX_GRAM_SIZE = 1;
public static final int DEFAULT_MIN_GRAM_SIZE = 1;
+ private final CharacterUtils charUtils;
private final int minGram;
private final int maxGram;
private char[] curTermBuffer;
private int curTermLength;
+ private int curCodePointCount;
private int curGramSize;
private int tokStart;
private int tokEnd; // only used if the length changed before this filter
@@ -74,6 +79,9 @@ public final class EdgeNGramTokenFilter
throw new IllegalArgumentException("minGram must not be greater than maxGram");
}
+ this.charUtils = version.onOrAfter(Version.LUCENE_44)
+ ? CharacterUtils.getInstance(version)
+ : CharacterUtils.getJava4Instance();
this.minGram = minGram;
this.maxGram = maxGram;
}
@@ -87,6 +95,7 @@ public final class EdgeNGramTokenFilter
} else {
curTermBuffer = termAtt.buffer().clone();
curTermLength = termAtt.length();
+ curCodePointCount = charUtils.codePointCount(termAtt);
curGramSize = minGram;
tokStart = offsetAtt.startOffset();
tokEnd = offsetAtt.endOffset();
@@ -95,7 +104,7 @@ public final class EdgeNGramTokenFilter
}
}
if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
- if (curGramSize <= curTermLength) { // if the remaining input is too short, we can't generate any n-grams
+ if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
// grab gramSize chars from front or back
clearAttributes();
offsetAtt.setOffset(tokStart, tokEnd);
@@ -107,7 +116,8 @@ public final class EdgeNGramTokenFilter
posIncrAtt.setPositionIncrement(0);
}
posLenAtt.setPositionLength(savePosLen);
- termAtt.copyBuffer(curTermBuffer, 0, curGramSize);
+ final int charLength = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
+ termAtt.copyBuffer(curTermBuffer, 0, charLength);
curGramSize++;
return true;
}
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizer.java Wed Jul 3 23:26:32 2013
@@ -17,37 +17,23 @@ package org.apache.lucene.analysis.ngram
* limitations under the License.
*/
-import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Version;
/**
* Tokenizes the input from an edge into n-grams of given size(s).
* <p>
* This {@link Tokenizer} create n-grams from the beginning edge of a input token.
+ * <p><a name="match_version" />As of Lucene 4.4, this class supports
+ * {@link #isTokenChar(int) pre-tokenization} and correctly handles
+ * supplementary characters.
*/
-public final class EdgeNGramTokenizer extends Tokenizer {
+public class EdgeNGramTokenizer extends NGramTokenizer {
public static final int DEFAULT_MAX_GRAM_SIZE = 1;
public static final int DEFAULT_MIN_GRAM_SIZE = 1;
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
- private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-
- private int minGram;
- private int maxGram;
- private int gramSize;
- private boolean started;
- private int inLen; // length of the input AFTER trim()
- private int charsRead; // length of the input
- private String inStr;
-
/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
@@ -57,8 +43,7 @@ public final class EdgeNGramTokenizer ex
* @param maxGram the largest n-gram to generate
*/
public EdgeNGramTokenizer(Version version, Reader input, int minGram, int maxGram) {
- super(input);
- init(version, minGram, maxGram);
+ super(version, input, minGram, maxGram, true);
}
/**
@@ -71,102 +56,7 @@ public final class EdgeNGramTokenizer ex
* @param maxGram the largest n-gram to generate
*/
public EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) {
- super(factory, input);
- init(version, minGram, maxGram);
- }
-
- private void init(Version version, int minGram, int maxGram) {
- if (version == null) {
- throw new IllegalArgumentException("version must not be null");
- }
-
- if (minGram < 1) {
- throw new IllegalArgumentException("minGram must be greater than zero");
- }
-
- if (minGram > maxGram) {
- throw new IllegalArgumentException("minGram must not be greater than maxGram");
- }
-
- this.minGram = minGram;
- this.maxGram = maxGram;
+ super(version, factory, input, minGram, maxGram, true);
}
- /** Returns the next token in the stream, or null at EOS. */
- @Override
- public boolean incrementToken() throws IOException {
- clearAttributes();
- // if we are just starting, read the whole input
- if (!started) {
- started = true;
- gramSize = minGram;
- char[] chars = new char[Math.min(1024, maxGram)];
- charsRead = 0;
- // TODO: refactor to a shared readFully somewhere:
- boolean exhausted = false;
- while (charsRead < maxGram) {
- final int inc = input.read(chars, charsRead, chars.length-charsRead);
- if (inc == -1) {
- exhausted = true;
- break;
- }
- charsRead += inc;
- if (charsRead == chars.length && charsRead < maxGram) {
- chars = ArrayUtil.grow(chars);
- }
- }
-
- inStr = new String(chars, 0, charsRead);
-
- if (!exhausted) {
- // Read extra throwaway chars so that on end() we
- // report the correct offset:
- char[] throwaway = new char[1024];
- while(true) {
- final int inc = input.read(throwaway, 0, throwaway.length);
- if (inc == -1) {
- break;
- }
- charsRead += inc;
- }
- }
-
- inLen = inStr.length();
- if (inLen == 0) {
- return false;
- }
- posIncrAtt.setPositionIncrement(1);
- } else {
- posIncrAtt.setPositionIncrement(1);
- }
-
- // if the remaining input is too short, we can't generate any n-grams
- if (gramSize > inLen) {
- return false;
- }
-
- // if we have hit the end of our n-gram size range, quit
- if (gramSize > maxGram || gramSize > inLen) {
- return false;
- }
-
- // grab gramSize chars from front or back
- termAtt.setEmpty().append(inStr, 0, gramSize);
- offsetAtt.setOffset(correctOffset(0), correctOffset(gramSize));
- gramSize++;
- return true;
- }
-
- @Override
- public void end() {
- // set final offset
- final int finalOffset = correctOffset(charsRead);
- this.offsetAtt.setOffset(finalOffset, finalOffset);
- }
-
- @Override
- public void reset() throws IOException {
- super.reset();
- started = false;
- }
}
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java Wed Jul 3 23:26:32 2013
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.tokena
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;
/**
@@ -33,6 +34,7 @@ import org.apache.lucene.util.Version;
* <a name="version"/>
* <p>You must specify the required {@link Version} compatibility when
* creating a {@link NGramTokenFilter}. As of Lucene 4.4, this token filters:<ul>
+ * <li>handles supplementary characters correctly,</li>
* <li>emits all n-grams for the same token at the same position,</li>
* <li>does not modify offsets,</li>
* <li>sorts n-grams by their offset in the original token first, then
@@ -42,6 +44,10 @@ import org.apache.lucene.util.Version;
* {@link Version#LUCENE_44} in the constructor but this is not recommended as
* it will lead to broken {@link TokenStream}s that will cause highlighting
* bugs.
+ * <p>If you were using this {@link TokenFilter} to perform partial highlighting,
+ * this won't work anymore since this filter doesn't update offsets. You should
+ * modify your analysis chain to use {@link NGramTokenizer}, and potentially
+ * override {@link NGramTokenizer#isTokenChar(int)} to perform pre-tokenization.
*/
public final class NGramTokenFilter extends TokenFilter {
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
@@ -51,6 +57,7 @@ public final class NGramTokenFilter exte
private char[] curTermBuffer;
private int curTermLength;
+ private int curCodePointCount;
private int curGramSize;
private int curPos;
private int curPosInc, curPosLen;
@@ -59,6 +66,7 @@ public final class NGramTokenFilter exte
private boolean hasIllegalOffsets; // only if the length changed before this filter
private final Version version;
+ private final CharacterUtils charUtils;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt;
private final PositionLengthAttribute posLenAtt;
@@ -75,6 +83,9 @@ public final class NGramTokenFilter exte
public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
super(new LengthFilter(version, input, minGram, Integer.MAX_VALUE));
this.version = version;
+ this.charUtils = version.onOrAfter(Version.LUCENE_44)
+ ? CharacterUtils.getInstance(version)
+ : CharacterUtils.getJava4Instance();
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
@@ -126,6 +137,7 @@ public final class NGramTokenFilter exte
} else {
curTermBuffer = termAtt.buffer().clone();
curTermLength = termAtt.length();
+ curCodePointCount = charUtils.codePointCount(termAtt);
curGramSize = minGram;
curPos = 0;
curPosInc = posIncAtt.getPositionIncrement();
@@ -138,13 +150,15 @@ public final class NGramTokenFilter exte
}
}
if (version.onOrAfter(Version.LUCENE_44)) {
- if (curGramSize > maxGram || curPos + curGramSize > curTermLength) {
+ if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount) {
++curPos;
curGramSize = minGram;
}
- if (curPos + curGramSize <= curTermLength) {
+ if ((curPos + curGramSize) <= curCodePointCount) {
clearAttributes();
- termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
+ final int start = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
+ final int end = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
+ termAtt.copyBuffer(curTermBuffer, start, end - start);
posIncAtt.setPositionIncrement(curPosInc);
curPosInc = 0;
posLenAtt.setPositionLength(curPosLen);
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java Wed Jul 3 23:26:32 2013
@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.tokena
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;
/**
@@ -40,29 +41,47 @@ import org.apache.lucene.util.Version;
* <tr><th>Offsets</th><td>[0,2[</td><td>[0,3[</td><td>[1,3[</td><td>[1,4[</td><td>[2,4[</td><td>[2,5[</td><td>[3,5[</td></tr>
* </table>
* <a name="version"/>
- * <p>Before Lucene 4.4, this class had a different behavior:<ul>
- * <li>It didn't support more than 1024 chars of input, the rest was trashed.</li>
- * <li>The last whitespaces of the 1024 chars block were trimmed.</li>
- * <li>Tokens were emitted in a different order (by increasing lengths).</li></ul>
- * <p>Although highly discouraged, it is still possible to use the old behavior
- * through {@link Lucene43NGramTokenizer}.
+ * <p>This tokenizer changed a lot in Lucene 4.4 in order to:<ul>
+ * <li>tokenize in a streaming fashion to support streams which are larger
+ * than 1024 chars (limit of the previous version),
+ * <li>count grams based on unicode code points instead of java chars (and
+ * never split in the middle of surrogate pairs),
+ * <li>give the ability to {@link #isTokenChar(int) pre-tokenize} the stream
+ * before computing n-grams.</ul>
+ * <p>Additionally, this class doesn't trim trailing whitespaces and emits
+ * tokens in a different order, tokens are now emitted by increasing start
+ * offsets while they used to be emitted by increasing lengths (which prevented
+ * from supporting large input streams).
+ * <p>Although <b style="color:red">highly</b> discouraged, it is still possible
+ * to use the old behavior through {@link Lucene43NGramTokenizer}.
*/
-public final class NGramTokenizer extends Tokenizer {
+// non-final to allow for overriding isTokenChar, but all other methods should be final
+public class NGramTokenizer extends Tokenizer {
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
- private char[] buffer;
- private int bufferStart, bufferEnd; // remaining slice of the buffer
+ private CharacterUtils charUtils;
+ private CharacterUtils.CharacterBuffer charBuffer;
+ private int[] buffer; // like charBuffer, but converted to code points
+ private int bufferStart, bufferEnd; // remaining slice in buffer
private int offset;
private int gramSize;
private int minGram, maxGram;
private boolean exhausted;
+ private int lastCheckedChar; // last offset in the buffer that we checked
+ private int lastNonTokenChar; // last offset that we found to not be a token char
+ private boolean edgesOnly; // leading edges n-grams only
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+ NGramTokenizer(Version version, Reader input, int minGram, int maxGram, boolean edgesOnly) {
+ super(input);
+ init(version, minGram, maxGram, edgesOnly);
+ }
+
/**
* Creates NGramTokenizer with given min and max n-grams.
* @param version the lucene compatibility <a href="#version">version</a>
@@ -71,8 +90,12 @@ public final class NGramTokenizer extend
* @param maxGram the largest n-gram to generate
*/
public NGramTokenizer(Version version, Reader input, int minGram, int maxGram) {
- super(input);
- init(version, minGram, maxGram);
+ this(version, input, minGram, maxGram, false);
+ }
+
+ NGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram, boolean edgesOnly) {
+ super(factory, input);
+ init(version, minGram, maxGram, edgesOnly);
}
/**
@@ -84,8 +107,7 @@ public final class NGramTokenizer extend
* @param maxGram the largest n-gram to generate
*/
public NGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) {
- super(factory, input);
- init(version, minGram, maxGram);
+ this(version, factory, input, minGram, maxGram, false);
}
/**
@@ -97,10 +119,13 @@ public final class NGramTokenizer extend
this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
}
- private void init(Version version, int minGram, int maxGram) {
- if (!version.onOrAfter(Version.LUCENE_44)) {
+ private void init(Version version, int minGram, int maxGram, boolean edgesOnly) {
+ if (!edgesOnly && !version.onOrAfter(Version.LUCENE_44)) {
throw new IllegalArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer");
}
+ charUtils = version.onOrAfter(Version.LUCENE_44)
+ ? CharacterUtils.getInstance(version)
+ : CharacterUtils.getJava4Instance();
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
@@ -109,66 +134,107 @@ public final class NGramTokenizer extend
}
this.minGram = minGram;
this.maxGram = maxGram;
- buffer = new char[maxGram + 1024];
+ this.edgesOnly = edgesOnly;
+ charBuffer = CharacterUtils.newCharacterBuffer(2 * maxGram + 1024); // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering to not keep polling the Reader
+ buffer = new int[charBuffer.getBuffer().length];
+ // Make the term att large enough
+ termAtt.resizeBuffer(2 * maxGram);
}
- /** Returns the next token in the stream, or null at EOS. */
@Override
- public boolean incrementToken() throws IOException {
+ public final boolean incrementToken() throws IOException {
clearAttributes();
- // compact
- if (bufferStart >= buffer.length - maxGram) {
- System.arraycopy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
- bufferEnd -= bufferStart;
- bufferStart = 0;
-
- // fill in remaining space
- if (!exhausted) {
- // TODO: refactor to a shared readFully
- while (bufferEnd < buffer.length) {
- final int read = input.read(buffer, bufferEnd, buffer.length - bufferEnd);
- if (read == -1) {
- exhausted = true;
- break;
- }
- bufferEnd += read;
+ // termination of this loop is guaranteed by the fact that every iteration
+ // either advances the buffer (calls consumes()) or increases gramSize
+ while (true) {
+ // compact
+ if (bufferStart >= bufferEnd - maxGram - 1 && !exhausted) {
+ System.arraycopy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
+ bufferEnd -= bufferStart;
+ lastCheckedChar -= bufferStart;
+ lastNonTokenChar -= bufferStart;
+ bufferStart = 0;
+
+ // fill in remaining space
+ exhausted = !charUtils.fill(charBuffer, input, buffer.length - bufferEnd);
+ // convert to code points
+ bufferEnd += charUtils.toCodePoints(charBuffer.getBuffer(), 0, charBuffer.getLength(), buffer, bufferEnd);
+ }
+
+ // should we go to the next offset?
+ if (gramSize > maxGram || (bufferStart + gramSize) > bufferEnd) {
+ if (bufferStart + 1 + minGram > bufferEnd) {
+ assert exhausted;
+ return false;
}
+ consume();
+ gramSize = minGram;
}
+
+ updateLastNonTokenChar();
+
+ // retry if the token to be emitted was going to not only contain token chars
+ final boolean termContainsNonTokenChar = lastNonTokenChar >= bufferStart && lastNonTokenChar < (bufferStart + gramSize);
+ final boolean isEdgeAndPreviousCharIsTokenChar = edgesOnly && lastNonTokenChar != bufferStart - 1;
+ if (termContainsNonTokenChar || isEdgeAndPreviousCharIsTokenChar) {
+ consume();
+ gramSize = minGram;
+ continue;
+ }
+
+ final int length = charUtils.toChars(buffer, bufferStart, gramSize, termAtt.buffer(), 0);
+ termAtt.setLength(length);
+ posIncAtt.setPositionIncrement(1);
+ posLenAtt.setPositionLength(1);
+ offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + length));
+ ++gramSize;
+ return true;
}
+ }
- // should we go to the next offset?
- if (gramSize > maxGram || bufferStart + gramSize > bufferEnd) {
- bufferStart++;
- offset++;
- gramSize = minGram;
- }
-
- // are there enough chars remaining?
- if (bufferStart + gramSize > bufferEnd) {
- return false;
- }
-
- termAtt.copyBuffer(buffer, bufferStart, gramSize);
- posIncAtt.setPositionIncrement(1);
- posLenAtt.setPositionLength(1);
- offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + gramSize));
- ++gramSize;
+ private void updateLastNonTokenChar() {
+ final int termEnd = bufferStart + gramSize - 1;
+ if (termEnd > lastCheckedChar) {
+ for (int i = termEnd; i > lastCheckedChar; --i) {
+ if (!isTokenChar(buffer[i])) {
+ lastNonTokenChar = i;
+ break;
+ }
+ }
+ lastCheckedChar = termEnd;
+ }
+ }
+
+ /** Consume one code point. */
+ private void consume() {
+ offset += Character.charCount(buffer[bufferStart++]);
+ }
+
+ /** Only collect characters which satisfy this condition. */
+ protected boolean isTokenChar(int chr) {
return true;
}
@Override
- public void end() {
- final int endOffset = correctOffset(offset + bufferEnd - bufferStart);
+ public final void end() {
+ assert bufferStart <= bufferEnd;
+ int endOffset = offset;
+ for (int i = bufferStart; i < bufferEnd; ++i) {
+ endOffset += Character.charCount(buffer[i]);
+ }
+ endOffset = correctOffset(endOffset);
offsetAtt.setOffset(endOffset, endOffset);
}
@Override
- public void reset() throws IOException {
+ public final void reset() throws IOException {
super.reset();
bufferStart = bufferEnd = buffer.length;
+ lastNonTokenChar = lastCheckedChar = bufferStart - 1;
offset = 0;
gramSize = minGram;
exhausted = false;
+ charBuffer.reset();
}
}
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilter.java?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilter.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilter.java Wed Jul 3 23:26:32 2013
@@ -35,12 +35,26 @@ import org.apache.lucene.analysis.tokena
* </p>
*/
public final class NorwegianLightStemFilter extends TokenFilter {
- private final NorwegianLightStemmer stemmer = new NorwegianLightStemmer();
+ private final NorwegianLightStemmer stemmer;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
-
+
+ /**
+ * Calls {@link #NorwegianLightStemFilter(TokenStream, int)
+ * NorwegianLightStemFilter(input, BOKMAAL)}
+ */
public NorwegianLightStemFilter(TokenStream input) {
+ this(input, NorwegianLightStemmer.BOKMAAL);
+ }
+
+ /**
+ * Creates a new NorwegianLightStemFilter
+ * @param flags set to {@link NorwegianLightStemmer#BOKMAAL},
+ * {@link NorwegianLightStemmer#NYNORSK}, or both.
+ */
+ public NorwegianLightStemFilter(TokenStream input, int flags) {
super(input);
+ stemmer = new NorwegianLightStemmer(flags);
}
@Override
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilterFactory.java?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilterFactory.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilterFactory.java Wed Jul 3 23:26:32 2013
@@ -23,6 +23,9 @@ import org.apache.lucene.analysis.TokenS
import org.apache.lucene.analysis.no.NorwegianLightStemFilter;
import org.apache.lucene.analysis.util.TokenFilterFactory;
+import static org.apache.lucene.analysis.no.NorwegianLightStemmer.BOKMAAL;
+import static org.apache.lucene.analysis.no.NorwegianLightStemmer.NYNORSK;
+
/**
* Factory for {@link NorwegianLightStemFilter}.
* <pre class="prettyprint">
@@ -30,15 +33,27 @@ import org.apache.lucene.analysis.util.T
* <analyzer>
* <tokenizer class="solr.StandardTokenizerFactory"/>
* <filter class="solr.LowerCaseFilterFactory"/>
- * <filter class="solr.NorwegianLightStemFilterFactory"/>
+ * <filter class="solr.NorwegianLightStemFilterFactory" variant="nb"/>
* </analyzer>
* </fieldType></pre>
*/
public class NorwegianLightStemFilterFactory extends TokenFilterFactory {
+ private final int flags;
+
/** Creates a new NorwegianLightStemFilterFactory */
public NorwegianLightStemFilterFactory(Map<String,String> args) {
super(args);
+ String variant = get(args, "variant");
+ if (variant == null || "nb".equals(variant)) {
+ flags = BOKMAAL;
+ } else if ("nn".equals(variant)) {
+ flags = NYNORSK;
+ } else if ("no".equals(variant)) {
+ flags = BOKMAAL | NYNORSK;
+ } else {
+ throw new IllegalArgumentException("invalid variant: " + variant);
+ }
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@@ -46,6 +61,6 @@ public class NorwegianLightStemFilterFac
@Override
public TokenStream create(TokenStream input) {
- return new NorwegianLightStemFilter(input);
+ return new NorwegianLightStemFilter(input, flags);
}
}
Modified: lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemmer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemmer.java?rev=1499601&r1=1499600&r2=1499601&view=diff
==============================================================================
--- lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemmer.java (original)
+++ lucene/dev/branches/security/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemmer.java Wed Jul 3 23:26:32 2013
@@ -62,50 +62,106 @@ import static org.apache.lucene.analysis
* corpus to validate against whereas the Norwegian one is hand crafted.
*/
public class NorwegianLightStemmer {
+ /** Constant to remove Bokmål-specific endings */
+ public static final int BOKMAAL = 1;
+ /** Constant to remove Nynorsk-specific endings */
+ public static final int NYNORSK = 2;
+ final boolean useBokmaal;
+ final boolean useNynorsk;
+
+ /**
+ * Creates a new NorwegianLightStemmer
+ * @param flags set to {@link #BOKMAAL}, {@link #NYNORSK}, or both.
+ */
+ public NorwegianLightStemmer(int flags) {
+ if (flags <= 0 || flags > BOKMAAL + NYNORSK) {
+ throw new IllegalArgumentException("invalid flags");
+ }
+ useBokmaal = (flags & BOKMAAL) != 0;
+ useNynorsk = (flags & NYNORSK) != 0;
+ }
+
public int stem(char s[], int len) {
// Remove posessive -s (bilens -> bilen) and continue checking
if (len > 4 && s[len-1] == 's')
len--;
// Remove common endings, single-pass
- if (len > 7 &&
- (endsWith(s, len, "heter") || // general ending (hemmelig-heter -> hemmelig)
- endsWith(s, len, "heten"))) // general ending (hemmelig-heten -> hemmelig)
+ if (len > 7 &&
+ ((endsWith(s, len, "heter") &&
+ useBokmaal) || // general ending (hemmelig-heter -> hemmelig)
+ (endsWith(s, len, "heten") &&
+ useBokmaal) || // general ending (hemmelig-heten -> hemmelig)
+ (endsWith(s, len, "heita") &&
+ useNynorsk))) // general ending (hemmeleg-heita -> hemmeleg)
return len - 5;
+
+ // Remove Nynorsk common endings, single-pass
+ if (len > 8 && useNynorsk &&
+ (endsWith(s, len, "heiter") || // general ending (hemmeleg-heiter -> hemmeleg)
+ endsWith(s, len, "leiken") || // general ending (trygg-leiken -> trygg)
+ endsWith(s, len, "leikar"))) // general ending (trygg-leikar -> trygg)
+ return len - 6;
if (len > 5 &&
- (endsWith(s, len, "dom") || // general ending (kristen-dom -> kristen)
- endsWith(s, len, "het"))) // general ending (hemmelig-het -> hemmelig)
+ (endsWith(s, len, "dom") || // general ending (kristen-dom -> kristen)
+ (endsWith(s, len, "het") &&
+ useBokmaal))) // general ending (hemmelig-het -> hemmelig)
return len - 3;
+ if (len > 6 && useNynorsk &&
+ (endsWith(s, len, "heit") || // general ending (hemmeleg-heit -> hemmeleg)
+ endsWith(s, len, "semd") || // general ending (verk-semd -> verk)
+ endsWith(s, len, "leik"))) // general ending (trygg-leik -> trygg)
+ return len - 4;
+
if (len > 7 &&
(endsWith(s, len, "elser") || // general ending (føl-elser -> føl)
endsWith(s, len, "elsen"))) // general ending (føl-elsen -> føl)
return len - 5;
if (len > 6 &&
- (endsWith(s, len, "ende") || // (sov-ende -> sov)
+ ((endsWith(s, len, "ende") &&
+ useBokmaal) || // (sov-ende -> sov)
+ (endsWith(s, len, "ande") &&
+ useNynorsk) || // (sov-ande -> sov)
endsWith(s, len, "else") || // general ending (føl-else -> føl)
- endsWith(s, len, "este") || // adj (fin-este -> fin)
- endsWith(s, len, "eren"))) // masc
+ (endsWith(s, len, "este") &&
+ useBokmaal) || // adj (fin-este -> fin)
+ (endsWith(s, len, "aste") &&
+ useNynorsk) || // adj (fin-aste -> fin)
+ (endsWith(s, len, "eren") &&
+ useBokmaal) || // masc
+ (endsWith(s, len, "aren") &&
+ useNynorsk))) // masc
return len - 4;
if (len > 5 &&
- (endsWith(s, len, "ere") || // adj (fin-ere -> fin)
- endsWith(s, len, "est") || // adj (fin-est -> fin)
- endsWith(s, len, "ene") // masc/fem/neutr pl definite (hus-ene)
- ))
+ ((endsWith(s, len, "ere") &&
+ useBokmaal) || // adj (fin-ere -> fin)
+ (endsWith(s, len, "are") &&
+ useNynorsk) || // adj (fin-are -> fin)
+ (endsWith(s, len, "est") &&
+ useBokmaal) || // adj (fin-est -> fin)
+ (endsWith(s, len, "ast") &&
+ useNynorsk) || // adj (fin-ast -> fin)
+ endsWith(s, len, "ene") || // masc/fem/neutr pl definite (hus-ene)
+ (endsWith(s, len, "ane") &&
+ useNynorsk))) // masc pl definite (gut-ane)
return len - 3;
if (len > 4 &&
(endsWith(s, len, "er") || // masc/fem indefinite
endsWith(s, len, "en") || // masc/fem definite
endsWith(s, len, "et") || // neutr definite
- endsWith(s, len, "st") || // adj (billig-st -> billig)
+ (endsWith(s, len, "ar") &&
+ useNynorsk) || // masc pl indefinite
+ (endsWith(s, len, "st") &&
+ useBokmaal) || // adj (billig-st -> billig)
endsWith(s, len, "te")))
return len - 2;
-
+
if (len > 3)
switch(s[len-1]) {
case 'a': // fem definite