You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lenya.apache.org by mi...@apache.org on 2005/11/16 16:14:18 UTC
svn commit: r345031 [2/2] - in /lenya/trunk/src/java/org/apache/cocoon: ./
components/ components/search/ components/search/analyzer/
components/search/components/ components/search/components/impl/
components/search/fieldmodel/ components/search/utils...
Added: lenya/trunk/src/java/org/apache/cocoon/components/search/components/impl/ParallelIndexerImpl.java
URL: http://svn.apache.org/viewcvs/lenya/trunk/src/java/org/apache/cocoon/components/search/components/impl/ParallelIndexerImpl.java?rev=345031&view=auto
==============================================================================
--- lenya/trunk/src/java/org/apache/cocoon/components/search/components/impl/ParallelIndexerImpl.java (added)
+++ lenya/trunk/src/java/org/apache/cocoon/components/search/components/impl/ParallelIndexerImpl.java Wed Nov 16 07:14:03 2005
@@ -0,0 +1,246 @@
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cocoon.components.search.components.impl;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Stack;
+
+import org.apache.avalon.framework.context.Context;
+import org.apache.avalon.framework.context.ContextException;
+import org.apache.avalon.framework.context.Contextualizable;
+import org.apache.cocoon.Constants;
+import org.apache.cocoon.components.search.IndexException;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+
+/**
+ * Parrallel Indexer Class
+ *
+ * @author Nicolas Maisonneuve
+ */
+
+public class ParallelIndexerImpl extends AbstractIndexer implements
+ Contextualizable {
+
+ // Parallel specific variables
+ private Stack queue;
+
+ private boolean releaseSession, first_writing;
+
+ /**
+ * Number of threads (number of writers)
+ */
+ private int numThread;
+
+ /**
+ * temp dir where are stored the temporared index
+ */
+ private File tempDir;
+
+ /**
+ * multi-thread writer
+ */
+ private WriterThread[] writers;
+
+ public ParallelIndexerImpl() {
+ super();
+ this.queue = new Stack();
+
+ /**
+ * @TODO see how many processor there are automatically
+ */
+ this.setNumThread(2);
+ first_writing = true;
+ }
+
+ /**
+ * Set the number of thread writer
+ *
+ * @param num
+ * the number of thread
+ */
+ public void setNumThread(int num) {
+ numThread = num;
+ writers = new WriterThread[num];
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.apache.avalon.framework.context.Contextualizable#contextualize(org.apache.avalon.framework.context.Context)
+ */
+ public void contextualize(Context context) throws ContextException {
+ tempDir = (File) context.get(Constants.CONTEXT_WORK_DIR);
+ }
+
+ protected void release() throws IndexException {
+
+ // ok this is the end of indexation (information for the threads)
+ releaseSession = true;
+
+ // wait for the end of writer threads
+ boolean isindexing = true;
+ while (isindexing) {
+
+ // check if all the thread are died
+ isindexing = false;
+ for (int i = 0; i < writers.length; i++) {
+ isindexing |= writers[i].alive;
+ }
+
+ // no, so sleep
+ if (isindexing) {
+ try {
+ Thread.sleep(50);
+ } catch (InterruptedException ex) {
+ ex.printStackTrace();
+ }
+ } else {
+ break;
+ }
+ }
+
+ // merge index
+ if (getLogger().isDebugEnabled()) {
+ getLogger().debug("Merging....");
+ }
+ this.switchToADD_MODE(false);
+ Directory[] dirs = new Directory[numThread];
+ for (int i = 0; i < numThread; i++) {
+ dirs[i] = writers[i].dir;
+ }
+ try {
+ this.add_writer.addIndexes(dirs);
+ } catch (IOException ex1) {
+ throw new IndexException("merge error ", ex1);
+ }
+
+ releaseSession = false;
+ first_writing = true;
+ super.release();
+ }
+
+ final protected void addDocument(Document document) throws IndexException {
+ startThread();
+ // put the document in the queue
+ this.queue.add(document);
+ }
+
+ final protected void updateDocument(Document document)
+ throws IndexException {
+ del(document.get(DOCUMENT_UID_FIELD));
+ addDocument(document);
+ }
+
+ /**
+ * start the threads if it's not already done
+ *
+ * @throws IndexException
+ */
+ private void startThread() throws IndexException {
+ if (first_writing) {
+ for (int i = 0; i < writers.length; i++) {
+ writers[i] = new WriterThread();
+ writers[i].start();
+ }
+ first_writing = false;
+ }
+ }
+
+ /**
+ * Writer Thread
+ */
+ final class WriterThread extends Thread {
+ boolean alive = true;
+
+ private IndexWriter mywriter;
+
+ Directory dir;
+
+ public void run() {
+ // create a temp directory to store a subindex
+ File file = new File(tempDir + File.separator + this.getName());
+ file.mkdirs();
+
+ // open a writer
+ try {
+ dir = FSDirectory.getDirectory(file, true);
+ mywriter = new IndexWriter(dir, analyzer, true);
+ mywriter.mergeFactor = mergeFactor;
+ mywriter.minMergeDocs = mergeFactor * 2;
+ } catch (IOException e) {
+ e.printStackTrace();
+ getLogger().error("Thread " + getName() + ": opening error", e);
+ }
+
+ if (getLogger().isDebugEnabled()) {
+ getLogger().debug(
+ "WriterThread " + this.getName() + " is ready....");
+ }
+ while (alive) {
+ if (!queue.isEmpty()) {
+ try {
+ // add document
+ Document doc = (Document) queue.pop();
+ addDocument(mywriter, doc);
+ } catch (IndexException ex) {
+ ex.printStackTrace();
+ getLogger().error(
+ "Thread " + getName() + ": indexation error",
+ ex);
+ }
+ } else {
+ // end session ?
+ if (releaseSession) {
+
+ // stop thread
+ alive = false;
+
+ // close writer
+ try {
+ mywriter.close();
+ } catch (IOException ex) {
+ ex.printStackTrace();
+ getLogger()
+ .error(
+ "Thread " + getName()
+ + ": close error", ex);
+ }
+ } else {
+ // wait new documents
+ try {
+ Thread.sleep(20);
+ } catch (InterruptedException e2) {
+ getLogger()
+ .error(
+ "Thread " + getName()
+ + ": sleep error", e2);
+ }
+ }
+ }
+
+ }
+ if (getLogger().isDebugEnabled()) {
+ getLogger().debug(
+ "WriterThread " + getName() + " is stoping...");
+
+ }
+ }
+ }
+}
Added: lenya/trunk/src/java/org/apache/cocoon/components/search/components/impl/ParallelSearcherImpl.java
URL: http://svn.apache.org/viewcvs/lenya/trunk/src/java/org/apache/cocoon/components/search/components/impl/ParallelSearcherImpl.java?rev=345031&view=auto
==============================================================================
--- lenya/trunk/src/java/org/apache/cocoon/components/search/components/impl/ParallelSearcherImpl.java (added)
+++ lenya/trunk/src/java/org/apache/cocoon/components/search/components/impl/ParallelSearcherImpl.java Wed Nov 16 07:14:03 2005
@@ -0,0 +1,48 @@
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cocoon.components.search.components.impl;
+
+import java.io.IOException;
+
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.ParallelMultiSearcher;
+import org.apache.lucene.store.Directory;
+
+/**
+ * Searcher based on the Lucene ParallelMultiSearcher class.
+ *
+ * @author Nicolas Maisonneuve
+ */
+public class ParallelSearcherImpl extends AbstractSearcher {
+
+    /**
+     * Creates the Lucene searcher for the configured directories and stores
+     * it in <code>luceneSearcher</code>: a ParallelMultiSearcher over one
+     * IndexSearcher per directory when there is more than one directory,
+     * otherwise a plain IndexSearcher on the first directory.
+     *
+     * @throws IOException
+     *             declared for the IndexSearcher / ParallelMultiSearcher
+     *             constructors
+     * @see org.apache.cocoon.components.search.components.impl.AbstractSearcher#getLuceneSearcher()
+     */
+    protected void getLuceneSearcher() throws IOException {
+        if (directories.size() <= 1) {
+            // single index: no multi searcher needed
+            Directory only = (Directory) directories.get(0);
+            luceneSearcher = new IndexSearcher(only);
+            return;
+        }
+
+        IndexSearcher[] perDirectory = new IndexSearcher[directories.size()];
+        for (int pos = 0; pos < perDirectory.length; pos++) {
+            Directory current = (Directory) directories.get(pos);
+            perDirectory[pos] = new IndexSearcher(current);
+        }
+        luceneSearcher = new ParallelMultiSearcher(perDirectory);
+    }
+
+}
Added: lenya/trunk/src/java/org/apache/cocoon/components/search/fieldmodel/DateFieldDefinition.java
URL: http://svn.apache.org/viewcvs/lenya/trunk/src/java/org/apache/cocoon/components/search/fieldmodel/DateFieldDefinition.java?rev=345031&view=auto
==============================================================================
--- lenya/trunk/src/java/org/apache/cocoon/components/search/fieldmodel/DateFieldDefinition.java (added)
+++ lenya/trunk/src/java/org/apache/cocoon/components/search/fieldmodel/DateFieldDefinition.java Wed Nov 16 07:14:03 2005
@@ -0,0 +1,94 @@
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cocoon.components.search.fieldmodel;
+
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+
+import org.apache.lucene.document.DateField;
+import org.apache.lucene.document.Field;
+
+/**
+ * Field Definition for Date type
+ *
+ * <p>
+ * The date format used to parse string values is kept in this definition.
+ * SimpleDateFormat is not thread-safe, so a definition that is used by
+ * several threads at once needs external synchronization.
+ * </p>
+ *
+ * @author Nicolas Maisonneuve
+ */
+public class DateFieldDefinition extends FieldDefinition {
+
+    /** Format used by {@link #createLField(String)}; null until set. */
+    private SimpleDateFormat df;
+
+    /**
+     * @param name
+     *            name of the field
+     */
+    public DateFieldDefinition(String name) {
+        super(name, DATE);
+    }
+
+    /**
+     * Set the date format used by {@link #createLField(String)} to parse
+     * date strings.
+     *
+     * @param df
+     *            the date format
+     */
+    public void setDateFormat(SimpleDateFormat df) {
+        this.df = df;
+    }
+
+    /**
+     * @return the date format, or null if none was set
+     */
+    public SimpleDateFormat getDateFormat() {
+        return df;
+    }
+
+    /**
+     * Create a Lucene Field from a date given as string.
+     *
+     * @param dateString
+     *            the date, written in the format set with
+     *            {@link #setDateFormat(SimpleDateFormat)}
+     * @throws IllegalArgumentException
+     *             if dateString is null or cannot be parsed with the date
+     *             format (the ParseException is kept as cause)
+     * @throws IllegalStateException
+     *             if no date format was set
+     * @return the Lucene field
+     * @see org.apache.lucene.document.Field
+     */
+    public final Field createLField(String dateString)
+            throws IllegalArgumentException {
+        if (df == null) {
+            throw new IllegalStateException(
+                    "no date format set for the field " + name);
+        }
+        if (dateString == null) {
+            throw new IllegalArgumentException(
+                    "date of the field " + name + " cannot be null");
+        }
+        Date date;
+        try {
+            date = df.parse(dateString);
+        } catch (ParseException ex) {
+            IllegalArgumentException iae = new IllegalArgumentException(ex
+                    .getMessage());
+            // initCause keeps the original exception (available since 1.4)
+            iae.initCause(ex);
+            throw iae;
+        }
+        return createLField(date);
+    }
+
+    /**
+     * Create a Lucene Field from a date. The date is converted with
+     * DateField.dateToString before it is put in the field.
+     *
+     * @param date
+     *            the date
+     * @return the Lucene field
+     * @see org.apache.lucene.document.Field
+     */
+    public final Field createLField(Date date) {
+        return new Field(name, DateField.dateToString(date), store, true, index);
+    }
+
+}
Added: lenya/trunk/src/java/org/apache/cocoon/components/search/fieldmodel/FieldDefinition.java
URL: http://svn.apache.org/viewcvs/lenya/trunk/src/java/org/apache/cocoon/components/search/fieldmodel/FieldDefinition.java?rev=345031&view=auto
==============================================================================
--- lenya/trunk/src/java/org/apache/cocoon/components/search/fieldmodel/FieldDefinition.java (added)
+++ lenya/trunk/src/java/org/apache/cocoon/components/search/fieldmodel/FieldDefinition.java Wed Nov 16 07:14:03 2005
@@ -0,0 +1,211 @@
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cocoon.components.search.fieldmodel;
+
+import org.apache.lucene.document.Field;
+
+/**
+ * Field Definition class: describes a field of an index (name, type and
+ * storage) and creates the matching Lucene fields.
+ *
+ * @author Nicolas Maisonneuve
+ *
+ */
+public abstract class FieldDefinition {
+    /**
+     * Text type
+     */
+    public static final int TEXT = 0;
+
+    /**
+     * Keyword type
+     */
+    public static final int KEYWORD = 1;
+
+    /**
+     * Date type
+     */
+    public static final int DATE = 2;
+
+    /**
+     * Names of the types; the position of a name is the value of its type
+     * constant.
+     */
+    public static final String[] STRING_TYPE = { "text", "keyword", "date" };
+
+    /**
+     * Name of the field (interned)
+     */
+    protected String name;
+
+    /**
+     * type of the field (text, keyword, date)
+     */
+    protected int type;
+
+    /**
+     * Lucene Field specification: store the value in the index
+     */
+    protected boolean store;
+
+    /**
+     * Lucene Field specification derived from the type: true for TEXT and
+     * DATE, false for KEYWORD. NOTE(review): the subclasses pass it as the
+     * last argument of the Field constructor - confirm against the Lucene
+     * version in use which flag that argument is.
+     */
+    protected boolean index;
+
+    // futur lucene 1.9: protected Field.Store store;
+    // futur lucene 1.9: protected Field.Index index;
+
+    /**
+     * @param name
+     *            name of the field
+     * @param type
+     *            name of the type (see {@link #STRING_TYPE})
+     * @throws IllegalArgumentException
+     *             if the name is null or the type is unknown
+     */
+    protected FieldDefinition(String name, String type)
+            throws IllegalArgumentException {
+        this(name, stringTotype(type));
+    }
+
+    /**
+     * @param name
+     *            name of the field
+     * @param type
+     *            type of the field (TEXT, KEYWORD or DATE)
+     * @throws IllegalArgumentException
+     *             if the name is null or the type is unknown
+     */
+    protected FieldDefinition(String name, int type)
+            throws IllegalArgumentException {
+        this(name, type, false);
+    }
+
+    /**
+     * Create the field definition matching the type.
+     *
+     * @param name
+     *            name of the field, must not be null or empty
+     * @param type
+     *            type of the field (TEXT, KEYWORD or DATE)
+     * @return a StringFieldDefinition for TEXT and KEYWORD, a
+     *         DateFieldDefinition for DATE
+     * @throws IllegalArgumentException
+     *             if the name is null or empty, or the type is unknown
+     */
+    public static FieldDefinition create(String name, int type) {
+        // compare the content: == on strings only compares the references
+        if (name == null || name.length() == 0) {
+            throw new IllegalArgumentException("name cannot be empty");
+        }
+        switch (type) {
+        case TEXT:
+        case KEYWORD:
+            return new StringFieldDefinition(name, type);
+        case DATE:
+            return new DateFieldDefinition(name);
+        default:
+            throw new IllegalArgumentException("type not allowed");
+        }
+    }
+
+    /**
+     *
+     * @param name
+     *            String field's name
+     * @param type
+     *            int indexation type
+     * @param store
+     *            boolean store value in the index
+     * @throws IllegalArgumentException
+     *             if the name is null or the type is unknown
+     */
+    private FieldDefinition(String name, int type, boolean store)
+            throws IllegalArgumentException {
+
+        if (name == null) {
+            throw new IllegalArgumentException("name cannot be null");
+        }
+        this.name = name.intern();
+        setType(type);
+        setStore(store);
+    }
+
+    /**
+     * Hash code built from the name and the type, consistent with
+     * {@link #equals(Object)}.
+     */
+    public int hashCode() {
+        // a product with the type would give 0 for every TEXT field
+        return 31 * name.hashCode() + this.type;
+    }
+
+    /**
+     * @param store
+     *            true to store the value of the field in the index
+     */
+    public void setStore(boolean store) {
+        // for futur lucene1.9
+        // this.store=(store)?Field.Store.YES:Field.Store.NO;
+        this.store = store;
+    }
+
+    /**
+     * @return true if the value of the field is stored in the index
+     */
+    public boolean getStore() {
+        // for futur lucene1.9 return this.store==Field.Store.YES;
+        return store;
+    }
+
+    /**
+     * Two field definitions are equal if they have the same name and the
+     * same type.
+     *
+     * @param fielddef
+     *            the definition to compare with, may be null
+     * @return true if name and type are the same
+     */
+    public boolean equals(FieldDefinition fielddef) {
+        return fielddef != null && name.equals(fielddef.name())
+                && getType() == fielddef.getType();
+    }
+
+    public boolean equals(Object object) {
+        if (object instanceof FieldDefinition) {
+            return equals((FieldDefinition) object);
+        } else {
+            return false;
+        }
+    }
+
+    /**
+     * @return the name of the field
+     */
+    public String name() {
+        return name;
+    }
+
+    /**
+     * Create Lucene Field
+     *
+     * @param value
+     *            String value to store in the lucene field
+     * @return Field
+     */
+    public abstract Field createLField(String value);
+
+    /**
+     * @return the type of the field (TEXT, KEYWORD or DATE)
+     */
+    public int getType() {
+        return type;
+    }
+
+    /**
+     * Set the type of the FieldDefinition (DATE,TEXT,KEYWORD) and the index
+     * flag that goes with it.
+     *
+     * @param type
+     *            int
+     * @throws IllegalArgumentException
+     *             if the type is unknown
+     */
+    private void setType(int type) throws IllegalArgumentException {
+        switch (type) {
+        case FieldDefinition.TEXT:
+        case FieldDefinition.DATE:
+            index = true;
+            break;
+        case FieldDefinition.KEYWORD:
+            index = false;
+            break;
+        default:
+            throw new IllegalArgumentException("type not allowed");
+        }
+        this.type = type;
+    }
+
+    public final String toString() {
+        StringBuffer b = new StringBuffer();
+        b.append("name: ").append(name);
+        b.append(", type: ").append(FieldDefinition.STRING_TYPE[type]);
+        b.append(", store: ").append(getStore());
+        return b.toString();
+    }
+
+    /**
+     * Convert String to type. The comparison ignores the case.
+     *
+     * @param typename
+     *            name of the type (see {@link #STRING_TYPE})
+     * @throws IllegalArgumentException
+     *             if the name is null or is not the name of a type
+     * @return the type constant
+     */
+    static final public int stringTotype(String typename)
+            throws IllegalArgumentException {
+        for (int i = 0; i < STRING_TYPE.length; i++) {
+            // null-safe, and independent of the default locale
+            if (STRING_TYPE[i].equalsIgnoreCase(typename)) {
+                return i;
+            }
+        }
+        throw new IllegalArgumentException("type " + typename
+                + " is not allowed");
+    }
+
+}
Added: lenya/trunk/src/java/org/apache/cocoon/components/search/fieldmodel/StringFieldDefinition.java
URL: http://svn.apache.org/viewcvs/lenya/trunk/src/java/org/apache/cocoon/components/search/fieldmodel/StringFieldDefinition.java?rev=345031&view=auto
==============================================================================
--- lenya/trunk/src/java/org/apache/cocoon/components/search/fieldmodel/StringFieldDefinition.java (added)
+++ lenya/trunk/src/java/org/apache/cocoon/components/search/fieldmodel/StringFieldDefinition.java Wed Nov 16 07:14:03 2005
@@ -0,0 +1,43 @@
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cocoon.components.search.fieldmodel;
+
+import org.apache.lucene.document.Field;
+
+/**
+ * String Field Definition (used for Text and Keyword type)
+ *
+ * @author Nicolas Maisonneuve
+ */
+public final class StringFieldDefinition extends FieldDefinition {
+
+ public StringFieldDefinition(String name, int type) {
+ super(name, type);
+ }
+
+ /**
+ * Create a Lucene Field
+ *
+ * @param value
+ * value to index
+ * @return
+ * @see org.apache.lucene.document.Field
+ */
+ public final Field createLField(String value) {
+ return new Field(name, value, store, true, index);
+ }
+
+}
Added: lenya/trunk/src/java/org/apache/cocoon/components/search/lucene2.roles
URL: http://svn.apache.org/viewcvs/lenya/trunk/src/java/org/apache/cocoon/components/search/lucene2.roles?rev=345031&view=auto
==============================================================================
--- lenya/trunk/src/java/org/apache/cocoon/components/search/lucene2.roles (added)
+++ lenya/trunk/src/java/org/apache/cocoon/components/search/lucene2.roles Wed Nov 16 07:14:03 2005
@@ -0,0 +1,36 @@
+<?xml version="1.0"?>
+<!--
+ Copyright 1999-2004 The Apache Software Foundation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- $Id$ -->
+<role-list>
+
+ <!-- default indexer -->
+ <role name="org.apache.cocoon.components.search.components.Indexer/default" shorthand="default_indexer"
+default-class="org.apache.cocoon.components.search.components.impl.DefaultIndexerImpl"/>
+
+ <!-- parallel indexer -->
+ <role name="org.apache.cocoon.components.search.components.Indexer/parallel" shorthand="parallel_indexer"
+default-class="org.apache.cocoon.components.search.components.impl.ParallelIndexerImpl"/>
+
+ <!-- analyzer manager -->
+ <role name="org.apache.cocoon.components.search.components.AnalyzerManager" shorthand="analyzer_manager"
+default-class="org.apache.cocoon.components.search.components.impl.AnalyzerManagerImpl"/>
+
+ <!-- index manager -->
+ <role name="org.apache.cocoon.components.search.components.IndexManager" shorthand="index_manager"
+default-class="org.apache.cocoon.components.search.components.impl.IndexManagerImpl"/>
+
+</role-list>
Added: lenya/trunk/src/java/org/apache/cocoon/components/search/utils/SourceHelper.java
URL: http://svn.apache.org/viewcvs/lenya/trunk/src/java/org/apache/cocoon/components/search/utils/SourceHelper.java?rev=345031&view=auto
==============================================================================
--- lenya/trunk/src/java/org/apache/cocoon/components/search/utils/SourceHelper.java (added)
+++ lenya/trunk/src/java/org/apache/cocoon/components/search/utils/SourceHelper.java Wed Nov 16 07:14:03 2005
@@ -0,0 +1,89 @@
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cocoon.components.search.utils;
+
+/**
+ * Utility class
+ *
+ * @author Maisonneuve Nicolas
+ *
+ */
+import java.io.IOException;
+import java.util.HashMap;
+
+import org.apache.avalon.framework.configuration.Configuration;
+import org.apache.avalon.framework.configuration.ConfigurationException;
+import org.apache.avalon.framework.configuration.DefaultConfigurationBuilder;
+import org.apache.excalibur.source.Source;
+import org.apache.excalibur.source.SourceValidity;
+import org.xml.sax.SAXException;
+
+/**
+ * Utility class: remembers the validity of registered sources, checks
+ * whether a source changed since its registration and builds Avalon
+ * configurations from sources.
+ *
+ * <p>
+ * The registry is a plain static HashMap without synchronization.
+ * NOTE(review): confirm that callers never register and check sources from
+ * several threads at the same time.
+ * </p>
+ */
+public class SourceHelper {
+
+    /** Validity of each registered source at registration time. */
+    static final private HashMap sources = new HashMap();
+
+    static final private DefaultConfigurationBuilder confBuilder = new DefaultConfigurationBuilder();
+
+    /**
+     * Register the source with its current validity. A source that is
+     * already registered keeps the validity of its first registration.
+     *
+     * @param source
+     *            the source to register
+     */
+    static final public void registerSource(Source source) {
+        if (!sources.containsKey(source)) {
+            sources.put(source, source.getValidity());
+        }
+    }
+
+    /**
+     * Check the validity of the source with the registered source
+     *
+     * @param source
+     *            the source to check
+     * @return true if the source didn't change; false if it changed, if it
+     *         was never registered or if its validity is not known
+     */
+    static final public boolean checkSourceValidity(Source source) {
+        SourceValidity newValidity = source.getValidity();
+        SourceValidity refValidity = (SourceValidity) sources.get(source);
+        return checkSourceValidity(newValidity, refValidity);
+    }
+
+    /**
+     * Compare two validities
+     *
+     * @param s1Validity
+     *            the new validity; only used when the reference validity
+     *            cannot decide on its own
+     * @param s2Validity
+     *            the reference validity
+     * @return true if the reference validity reports the source as valid
+     *         (result 1), on its own or compared with the new validity;
+     *         false otherwise, also when a needed validity is null
+     */
+    static final public boolean checkSourceValidity(SourceValidity s1Validity,
+            SourceValidity s2Validity) {
+
+        // no reference (e.g. source never registered): cannot confirm that
+        // nothing changed
+        if (s2Validity == null) {
+            return false;
+        }
+
+        int valid = s2Validity.isValid();
+        if (valid == 0) {
+            // the reference cannot decide alone, it needs the new validity
+            if (s1Validity == null) {
+                return false;
+            }
+            valid = s2Validity.isValid(s1Validity);
+        }
+        return valid == 1;
+    }
+
+    /**
+     * Build the configuration stored in the source. The input stream of the
+     * source is closed afterwards.
+     *
+     * @param source
+     *            the source containing the configuration
+     * @return the configuration
+     * @throws ConfigurationException
+     *             if the source cannot be read or parsed
+     */
+    static final public Configuration build(Source source)
+            throws ConfigurationException {
+        java.io.InputStream in = null;
+        try {
+            in = source.getInputStream();
+            return confBuilder.build(in);
+        } catch (IOException ex) {
+            throw new ConfigurationException("File " + source.getURI(), ex);
+        } catch (SAXException ex) {
+            throw new ConfigurationException(
+                    "SAX Error in the configuration File " + source.getURI(),
+                    ex);
+        } finally {
+            if (in != null) {
+                try {
+                    in.close();
+                } catch (IOException ignored) {
+                    // nothing left to do: the configuration is already
+                    // built, or a more relevant exception is on its way
+                }
+            }
+        }
+    }
+
+}
Added: lenya/trunk/src/java/org/apache/cocoon/transformation/LuceneIndexTransformer2.java
URL: http://svn.apache.org/viewcvs/lenya/trunk/src/java/org/apache/cocoon/transformation/LuceneIndexTransformer2.java?rev=345031&view=auto
==============================================================================
--- lenya/trunk/src/java/org/apache/cocoon/transformation/LuceneIndexTransformer2.java (added)
+++ lenya/trunk/src/java/org/apache/cocoon/transformation/LuceneIndexTransformer2.java Wed Nov 16 07:14:03 2005
@@ -0,0 +1,607 @@
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cocoon.transformation;
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.avalon.excalibur.pool.Recyclable;
+import org.apache.avalon.framework.configuration.ConfigurationException;
+import org.apache.avalon.framework.parameters.Parameters;
+import org.apache.avalon.framework.service.ServiceException;
+import org.apache.avalon.framework.service.ServiceManager;
+import org.apache.avalon.framework.service.Serviceable;
+import org.apache.cocoon.ProcessingException;
+import org.apache.cocoon.components.search.Index;
+import org.apache.cocoon.components.search.IndexException;
+import org.apache.cocoon.components.search.components.AnalyzerManager;
+import org.apache.cocoon.components.search.components.IndexManager;
+import org.apache.cocoon.components.search.components.Indexer;
+import org.apache.cocoon.environment.SourceResolver;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Another lucene index transformer.</br> allow
+ * <ul>
+ * <li>index function (update indexing or add indexing if clear attribute is
+ * true)</li>
+ * <li>lucene field boosting</li>
+ * <li>delete function</li>
+ * </ul>
+ *
+ * <p>
+ * This transformer uses several Avalon components, which can also be used
+ * separately:
+ * </p>
+ * <ul>
+ * <li>AnalyzerManager: you can set up a (configurable) analyzer in the
+ * analyzer_manager tag in the cocoon.xconf file</li>
+ * <li>IndexManager: you can set up an index in /WEB-INF/index.xml (default
+ * location, but you can specify the location in the IndexManager component
+ * configuration in the cocoon.xconf file)</li>
+ * <li>Indexer (2 implementations: default (with update optimization) and
+ * parallel implementation for multiple CPUs)</li>
+ * </ul>
+ * <p>
+ * <strong>Example of input source: </strong>
+ * </p>
+ * <p>
+ * <ul>
+ * <li>to Index <br>
+ * <lucene:index xmlns:lucene="http://apache.org/cocoon/lucene/1.0"
+ * <br/>indexid="myindex" <br>
+ * clear="true" (optional attribute: clear index) <br/>mergefactor="100">
+ * (optional attribute: see lucene doc) <br>
+ * <br/><lucene:document uid="http://myhost/myfile1.data"> <br/>
+ * <lucene:field name="tile" > sqdqsdq </lucene:field> <br>
+ * <lucene:field name="description" > a text bla bal blalael
+ * balbal</lucene:field> <br>
+ * <lucene:field name="date" >10/12/2002</lucene:field> <br/>
+ * </lucene:document> <br>
+ *
+ * <p>
+ * <lucene:document uid="http://myhost/myfile2.data" > <br>
+ * <lucene:field name="author" boost="2" >Mr Author </lucene:field>
+ * <em>(boost the field for the search (see Lucene documentation))</em> <br/>
+ * <lucene:field name="langage" >french</lucene:field> <br>
+ * </lucene:document> <br>
+ * < /lucene:index>
+ * </p>
+ * </li>
+ *
+ * <li>To delete <br/>
+ * <p>
+ * <lucene:delete indexid="myindex" > <br>
+ * <lucene:document uid="http://myhost/myfile.data" > <br>
+ * <lucene:document uid="EODOED-EFE" <br>
+ * </lucene:delete>
+ * </p>
+ *
+ * <p>
+ * <strong>Example of Output Source </strong>
+ * </p>
+ * <p>
+ * <page xmlns:lucene="http://apache.org/cocoon/lucene/1.0">
+ * <br>
+ * < lucene:index > <br>
+ * <lucene:document uid="http://myhost/myfile1.data"/> <br/>
+ * <lucene:document uid="http://myhost/myfile2.data"/> <br/>
+ * </lucene:index>
+ * </p>
+ * <p>
+ * <lucene:delete > <lucene:document
+ * uid="http://myhost/myfile1.data"/> <br/><lucene:document
+ * uid="EODOED-EFE"/> <br/></lucene:delete ></br></li>
+ * </ul>
+ *
+ * @author Nicolas Maisonneuve
+ */
+
+public class LuceneIndexTransformer2 extends AbstractTransformer implements
+ Recyclable, Serviceable {
+
+ public static final String DIRECTORY_DEFAULT = "index";
+
+ public static final String LUCENE_URI = "http://apache.org/cocoon/lucene/1.0";
+
+ public static final String LUCENE_PREXIF = "lucene";
+
+ /**
+ * action element : index doc
+ */
+ public static final String LUCENE_INDEXING_ELEMENT = "index";
+
+ /**
+ * action element: delete doc
+ */
+ public static final String LUCENE_DELETING_ELEMENT = "delete";
+
+ /**
+ * index identity (see index definition file)
+ */
+ public static final String LUCENE_INDEXING_INDEXID_ATTRIBUTE = "indexid";
+
+ /**
+ * Optional attribute: Clear index: true/false (default: false)
+ */
+ public static final String LUCENE_INDEXING_CREATE_ATTRIBUTE = "clear";
+
+ /**
+ * Optional attribute: Analyzer identity: see analyzerManager Component
+ * (default: the analyzer of the index declared in the index definition)
+ */
+ public static final String LUCENE_INDEXING_ANALYZER_ATTRIBUTE = "analyzer";
+
+ /**
+ * Optional attribute: MergeFactor number (default 10): improve the indexing
+ * speed for large indexing (see Lucene docs)
+ */
+ public static final String LUCENE_INDEXING_MERGE_FACTOR_ATTRIBUTE = "mergefactor";
+
+ /**
+ * Lucene document element
+ */
+ public static final String LUCENE_DOCUMENT_ELEMENT = "document";
+
+ /**
+ * Lucene document uid field
+ */
+ public static final String LUCENE_DOCUMENT_UID_ATTRIBUTE = "uid";
+
+ /**
+ * lucene field element
+ */
+ public static final String LUCENE_FIELD_ELEMENT = "field";
+
+ /**
+ * lucene field name
+ */
+ public static final String LUCENE_FIELD_NAME_ATTRIBUTE = "name";
+
+ /**
+ * Optional attribute: lucene field boost (see lucene docs)
+ */
+ public static final String LUCENE_FIELD_BOOST_ATTRIBUTE = "boost";
+
+ // The 6 states of the state machine
+ private int processing;
+
+ public static final int NO_PROCESSING = 0;
+
+ public static final int INDEX_PROCESS = 1;
+
+ public static final int IN_DOCUMENT_PROCESS = 2;
+
+ public static final int IN_FIELD_PROCESS = 4;
+
+ public static final int DELETE_PROCESS = 5;
+
+ public static final int DELETING_PROCESS = 6;
+
+ // Runtime variables
+ private int mergeFactor;
+
+ private AttributesImpl attrs = new AttributesImpl();
+
+ private Index index;
+
+ private Indexer indexer;
+
+ private ServiceManager manager;
+
+ private Document bodyDocument;
+
+ private String uid;
+
+ private String fieldname;
+
+ private float fieldboost;
+
+ private StringBuffer fieldvalue;
+
+ /**
+ * Setup the transformer. This implementation does nothing: none of the
+ * arguments is read.
+ *
+ * @param resolver
+ * unused
+ * @param objectModel
+ * unused
+ * @param src
+ * unused
+ * @param parameters
+ * unused
+ */
+ public void setup(SourceResolver resolver, Map objectModel, String src,
+ Parameters parameters) throws ProcessingException, SAXException,
+ IOException {
+ }
+
+ /**
+ * Puts the state machine back in the NO_PROCESSING state. Only the
+ * processing state is reset here. NOTE(review): the other runtime fields
+ * (index, indexer, bodyDocument, uid, field values) keep their last value -
+ * confirm that they are released / reset elsewhere.
+ */
+ public void recycle() {
+ this.processing = NO_PROCESSING;
+ }
+
+ /**
+ * Keeps the service manager for later use.
+ *
+ * @param manager
+ * the service manager
+ */
+ public void service(ServiceManager manager) throws ServiceException {
+ this.manager = manager;
+ }
+
+ /**
+ * Forwards the start of the document unchanged to the parent class.
+ */
+ public void startDocument() throws SAXException {
+ super.startDocument();
+ }
+
+ /**
+ * Forwards the end of the document unchanged to the parent class.
+ */
+ public void endDocument() throws SAXException {
+ super.endDocument();
+ }
+
+ /**
+ * Begin the scope of a prefix-URI Namespace mapping. The event is only
+ * forwarded while the transformer is in the NO_PROCESSING state; in any
+ * other state it is dropped.
+ *
+ * @param prefix
+ * The Namespace prefix being declared.
+ * @param uri
+ * The Namespace URI the prefix is mapped to.
+ */
+ public void startPrefixMapping(String prefix, String uri)
+ throws SAXException {
+ if (processing != NO_PROCESSING) {
+ // inside a lucene element: the mapping is not passed on
+ return;
+ }
+ super.startPrefixMapping(prefix, uri);
+ }
+
+ /**
+  * End the scope of a prefix-URI mapping. The event is only forwarded while
+  * the transformer is not inside an indexing or deleting element.
+  *
+  * @param prefix
+  *            The prefix that was being mapped.
+  */
+ public void endPrefixMapping(String prefix) throws SAXException {
+     if (processing != NO_PROCESSING) {
+         // swallowed while Lucene elements are being processed
+         return;
+     }
+     super.endPrefixMapping(prefix);
+ }
+
+ public void startElement(String namespaceURI, String localName,
+ String qName, Attributes atts) throws SAXException {
+
+ // System.out.println("START processing: "+processing+" "+localName);
+
+ if (LUCENE_URI.equals(namespaceURI)) {
+ switch (processing) {
+
+ case NO_PROCESSING:
+
+ // index action
+ if (LUCENE_INDEXING_ELEMENT.equals(localName)) {
+ this.initIndexer(atts);
+ processing = INDEX_PROCESS;
+
+ super.startElement(namespaceURI, localName, qName, attrs);
+ }
+ // delete action
+ else if (LUCENE_DELETING_ELEMENT.equals(localName)) {
+ this.initIndexer(atts);
+ processing = DELETE_PROCESS;
+ super.startElement(namespaceURI, localName, qName, attrs);
+ } else {
+ handleError("element " + localName + " unknown ");
+ }
+ break;
+
+ case INDEX_PROCESS:
+
+ // new document to index
+ if (LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
+
+ uid = atts.getValue(LUCENE_DOCUMENT_UID_ATTRIBUTE);
+ if (uid == null) {
+ handleError("<" + LUCENE_PREXIF + ":"
+ + LUCENE_DOCUMENT_ELEMENT
+ + "> element must contain "
+ + LUCENE_DOCUMENT_UID_ATTRIBUTE + " attribute");
+ }
+ bodyDocument = index.createDocument(uid);
+ processing = IN_DOCUMENT_PROCESS;
+ } else {
+ handleError("element " + localName
+ + " is not allowed in <" + LUCENE_PREXIF + ":"
+ + LUCENE_DOCUMENT_ELEMENT + "> element ");
+ }
+ break;
+
+ case DELETE_PROCESS:
+
+ if (LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
+ uid = atts.getValue(LUCENE_DOCUMENT_UID_ATTRIBUTE);
+ if (uid == null) {
+ handleError("<" + LUCENE_PREXIF + ":"
+ + LUCENE_DOCUMENT_ELEMENT
+ + "> element must contain "
+ + LUCENE_DOCUMENT_UID_ATTRIBUTE + " attribute");
+ }
+ processing = DELETING_PROCESS;
+ } else {
+ handleError("element " + localName
+ + " is not a <lucene:document> element ");
+ }
+ break;
+
+ case IN_DOCUMENT_PROCESS:
+ if (LUCENE_FIELD_ELEMENT.equals(localName)) {
+
+ // set the field name
+ this.fieldname = atts.getValue(LUCENE_FIELD_NAME_ATTRIBUTE);
+ if (this.fieldname == null || this.fieldname.equals("")) {
+ handleError("<lucene:field> element must contain name attribut");
+ }
+
+ // clear the text buffer
+ this.fieldvalue = new StringBuffer();
+
+ // set boost value
+ String fieldboostS = atts
+ .getValue(LUCENE_FIELD_BOOST_ATTRIBUTE);
+ if (fieldboostS == null) {
+ fieldboost = 1.0f;
+ } else {
+ fieldboost = Float.parseFloat(fieldboostS);
+ }
+ processing = IN_FIELD_PROCESS;
+ } else {
+ handleError("<" + LUCENE_PREXIF + ":"
+ + LUCENE_FIELD_ELEMENT + " was expected!");
+ }
+ break;
+ }
+ } else {
+ // bypass
+ super.startElement(namespaceURI, localName, qName, atts);
+ }
+ }
+
+ /**
+  * Handles the end of an element.
+  * <p>
+  * Elements outside the Lucene namespace are forwarded unchanged. For
+  * Lucene elements the action depends on the current state:
+  * </p>
+  * <ul>
+  * <li>INDEX_PROCESS / DELETE_PROCESS: the indexing / deleting element ends;
+  * the indexer is released and the end tag is forwarded.</li>
+  * <li>IN_DOCUMENT_PROCESS: the document is handed to the indexer and an
+  * empty document element carrying only the uid is emitted.</li>
+  * <li>DELETING_PROCESS: the document with the current uid is deleted and an
+  * empty document element carrying only the uid is emitted.</li>
+  * <li>IN_FIELD_PROCESS: the collected text becomes a field of the current
+  * document.</li>
+  * </ul>
+  *
+  * @throws SAXException
+  *             if the end tag does not match the current state or the
+  *             indexer reports an error; the indexer is released before the
+  *             exception is thrown
+  */
+ public void endElement(String namespaceURI, String localName, String qName)
+         throws SAXException {
+
+     if (!LUCENE_URI.equals(namespaceURI)) {
+         super.endElement(namespaceURI, localName, qName);
+         return;
+     }
+
+     switch (processing) {
+
+     case INDEX_PROCESS:
+         if (LUCENE_INDEXING_ELEMENT.equals(localName)) {
+             // end of the indexing -> close the indexer
+             this.closeIndexer();
+             this.processing = NO_PROCESSING;
+             super.endElement(namespaceURI, localName, qName);
+         } else {
+             // the closing tag expected here is the indexing element
+             handleError(expectedEndTag(LUCENE_INDEXING_ELEMENT));
+         }
+         break;
+
+     case DELETE_PROCESS:
+         if (LUCENE_DELETING_ELEMENT.equals(localName)) {
+             // end of the deleting -> close the indexer
+             this.closeIndexer();
+             this.processing = NO_PROCESSING;
+             super.endElement(namespaceURI, localName, qName);
+         } else {
+             handleError(expectedEndTag(LUCENE_DELETING_ELEMENT));
+         }
+         break;
+
+     case IN_DOCUMENT_PROCESS:
+         if (LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
+             // index the document
+             try {
+                 this.indexer.index(bodyDocument);
+             } catch (IndexException ex1) {
+                 handleError(ex1);
+             }
+             if (this.getLogger().isDebugEnabled()) {
+                 this.getLogger().debug(
+                         " lucene document: " + this.bodyDocument);
+             }
+             bodyDocument = null;
+
+             // report the processed document downstream: uid only
+             attrs.clear();
+             attrs.addAttribute(namespaceURI, "uid", "uid", "CDATA", uid);
+             super.startElement(namespaceURI, localName, qName, attrs);
+             super.endElement(namespaceURI, localName, qName);
+             this.processing = INDEX_PROCESS;
+         } else {
+             handleError(expectedEndTag(LUCENE_DOCUMENT_ELEMENT));
+         }
+         break;
+
+     case DELETING_PROCESS:
+         if (LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
+             // delete a document
+             try {
+                 indexer.del(uid);
+             } catch (IndexException ex2) {
+                 handleError(ex2);
+             }
+
+             // report the deleted document downstream: uid only
+             attrs.clear();
+             attrs.addAttribute(namespaceURI, "uid", "uid", "CDATA", uid);
+             super.startElement(namespaceURI, localName, qName, attrs);
+             super.endElement(namespaceURI, localName, qName);
+             this.processing = DELETE_PROCESS;
+         } else {
+             handleError(expectedEndTag(LUCENE_DOCUMENT_ELEMENT));
+         }
+         break;
+
+     case IN_FIELD_PROCESS:
+         if (LUCENE_FIELD_ELEMENT.equals(localName)) {
+             // create lucene field
+             Field f;
+             try {
+                 f = index.createField(fieldname, fieldvalue.toString());
+             } catch (IndexException ex) {
+                 handleError(ex);
+                 // not reached: handleError() always throws
+                 return;
+             }
+             f.setBoost(fieldboost);
+
+             // add field to the lucene document
+             bodyDocument.add(f);
+             processing = IN_DOCUMENT_PROCESS;
+         } else {
+             handleError(expectedEndTag(LUCENE_FIELD_ELEMENT));
+         }
+         break;
+
+     default:
+         // a Lucene end tag outside of any indexing/deleting element
+         handleError("unexpected end of element " + localName + " !");
+     }
+ }
+
+ /**
+  * Builds the error message used when a different end tag was expected.
+  */
+ private String expectedEndTag(String element) {
+     return "</" + LUCENE_PREXIF + ":" + element + "> was expected!";
+ }
+
+ /**
+  * Receives character data. Inside a field element the characters are
+  * appended to the value of the current field; in every other state they
+  * are forwarded unchanged.
+  */
+ public void characters(char[] ch, int start, int length) throws SAXException {
+     if (processing != IN_FIELD_PROCESS) {
+         super.characters(ch, start, length);
+         return;
+     }
+     this.fieldvalue.append(ch, start, length);
+ }
+
+ /**
+  * Configures the indexer from the attributes of an indexing or deleting
+  * element.
+  * <p>
+  * The index id selects the index through the IndexManager. The optional
+  * analyzer, merge-factor and create attributes override the analyzer, set
+  * the merge factor and clear the index before use. The attributes are also
+  * copied to {@link #attrs} so that the caller can forward them; when no
+  * analyzer is given, the id of the index' default analyzer is added to the
+  * copy.
+  * </p>
+  *
+  * @param atts
+  *            the attributes of the indexing or deleting element
+  * @throws SAXException
+  *             if the index or analyzer cannot be obtained, the index
+  *             cannot be cleared or the merge factor is not a number
+  */
+ private void initIndexer(Attributes atts) throws SAXException {
+
+     String id = atts.getValue(LUCENE_INDEXING_INDEXID_ATTRIBUTE);
+     String analyzerid = atts.getValue(LUCENE_URI,
+             LUCENE_INDEXING_ANALYZER_ATTRIBUTE);
+     String mergeF = atts.getValue(LUCENE_URI,
+             LUCENE_INDEXING_MERGE_FACTOR_ATTRIBUTE);
+     String clear = atts.getValue(LUCENE_URI,
+             LUCENE_INDEXING_CREATE_ATTRIBUTE);
+     attrs = new AttributesImpl(atts);
+
+     // forget whatever a previous element used, so that an error below
+     // cannot release a stale indexer
+     index = null;
+     indexer = null;
+
+     // set the indexer
+     IndexManager indexM = null;
+     try {
+         indexM = (IndexManager) manager.lookup(IndexManager.ROLE);
+         index = indexM.getIndex(id);
+         if (index == null) {
+             handleError("index id: " + id
+                     + " no found in the index definition");
+         }
+         indexer = index.getIndexer();
+     } catch (ServiceException ex1) {
+         handleError("service Exception", ex1);
+     } catch (IndexException ex3) {
+         handleError(" get Indexer error for index " + id, ex3);
+     } finally {
+         // hand the manager back on the error paths as well
+         if (indexM != null) {
+             manager.release(indexM);
+         }
+     }
+
+     // set a custom analyzer (default: the analyzer of the index)
+     if (analyzerid != null) {
+         AnalyzerManager analyzerM = null;
+         try {
+             // the analyzer manager has its own role; looking it up under
+             // IndexManager.ROLE can only end in a ClassCastException
+             analyzerM = (AnalyzerManager) manager
+                     .lookup(AnalyzerManager.ROLE);
+             Analyzer analyzer = analyzerM.getAnalyzer(analyzerid);
+             indexer.setAnalyzer(analyzer);
+         } catch (ServiceException ex1) {
+             handleError("service Exception", ex1);
+         } catch (ConfigurationException ex2) {
+             this.handleError("set analyzer error for index " + id, ex2);
+         } finally {
+             if (analyzerM != null) {
+                 manager.release(analyzerM);
+             }
+         }
+     } else {
+         // tell the next pipeline stage which analyzer is actually used
+         attrs.addAttribute(LUCENE_URI, LUCENE_INDEXING_ANALYZER_ATTRIBUTE,
+                 LUCENE_INDEXING_ANALYZER_ATTRIBUTE, "CDATA",
+                 index.getDefaultAnalyzerID());
+     }
+
+     // set clear mode
+     boolean new_index = "true".equalsIgnoreCase(clear);
+     if (new_index) {
+         try {
+             indexer.clearIndex();
+         } catch (IndexException ex3) {
+             handleError(" clear index error ", ex3);
+         }
+     }
+
+     // set the mergeFactor
+     if (mergeF != null) {
+         try {
+             indexer.setMergeFactor(Integer.parseInt(mergeF));
+         } catch (NumberFormatException ex4) {
+             handleError("invalid merge factor '" + mergeF
+                     + "' for index " + id, ex4);
+         }
+     }
+
+     if (this.getLogger().isDebugEnabled()) {
+         this.getLogger().debug(
+                 "index " + id + " clear: " + new_index + " analyzerid: "
+                         + analyzerid + " mergefactor: " + mergeF);
+     }
+ }
+
+ void handleError(String msg) throws SAXException {
+ this.handleError(msg, null);
+ }
+
+ void handleError(Exception ex) throws SAXException {
+ this.handleError("", ex);
+ }
+
+ /**
+  * Central error handler: releases the indexer, then throws a SAXException
+  * built from the message and, when given, the underlying exception.
+  *
+  * @param msg
+  *            the error message
+  * @param ex
+  *            the underlying exception, or <code>null</code>
+  * @throws SAXException
+  *             always
+  */
+ void handleError(String msg, Exception ex) throws SAXException {
+     // release the indexer first, whatever the error is
+     closeIndexer();
+
+     SAXException failure = (ex != null)
+             ? new SAXException(msg, ex)
+             : new SAXException(msg);
+     throw failure;
+ }
+
+ /**
+  * Hands the indexer back to its index.
+  * <p>
+  * The call is a no-op when there is no index or no indexer, and the
+  * indexer reference is dropped after the release. This keeps the error
+  * path (handleError) from releasing the same indexer a second time after
+  * it was already released at the end of an indexing or deleting element,
+  * and from passing <code>null</code> to the index when the indexer could
+  * not be obtained.
+  * </p>
+  *
+  * @throws SAXException
+  *             declared for callers; not thrown by the code in this method
+  */
+ void closeIndexer() throws SAXException {
+     if (index != null && indexer != null) {
+         index.releaseIndexer(indexer);
+         indexer = null;
+     }
+ }
+
+}
Added: lenya/trunk/src/java/org/apache/cocoon/transformation/LuceneIndexTransformerOptimized.java
URL: http://svn.apache.org/viewcvs/lenya/trunk/src/java/org/apache/cocoon/transformation/LuceneIndexTransformerOptimized.java?rev=345031&view=auto
==============================================================================
--- lenya/trunk/src/java/org/apache/cocoon/transformation/LuceneIndexTransformerOptimized.java (added)
+++ lenya/trunk/src/java/org/apache/cocoon/transformation/LuceneIndexTransformerOptimized.java Wed Nov 16 07:14:03 2005
@@ -0,0 +1,546 @@
+/*
+ * Copyright 1999-2004 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cocoon.transformation;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.Map;
+import java.util.Stack;
+
+import org.apache.avalon.framework.configuration.Configurable;
+import org.apache.avalon.framework.configuration.Configuration;
+import org.apache.avalon.framework.configuration.ConfigurationException;
+import org.apache.avalon.framework.context.Context;
+import org.apache.avalon.framework.context.ContextException;
+import org.apache.avalon.framework.context.Contextualizable;
+import org.apache.avalon.framework.parameters.Parameters;
+import org.apache.avalon.framework.service.ServiceException;
+import org.apache.avalon.framework.service.ServiceManager;
+import org.apache.avalon.framework.service.Serviceable;
+import org.apache.cocoon.Constants;
+import org.apache.cocoon.ProcessingException;
+import org.apache.cocoon.caching.CacheableProcessingComponent;
+import org.apache.cocoon.components.search.IndexException;
+import org.apache.cocoon.components.search.LuceneCocoonHelper;
+import org.apache.cocoon.components.search.LuceneXMLIndexer;
+import org.apache.cocoon.components.search.components.Indexer;
+import org.apache.cocoon.environment.SourceResolver;
+import org.apache.commons.lang.BooleanUtils;
+import org.apache.excalibur.source.SourceValidity;
+import org.apache.excalibur.source.impl.validity.NOPValidity;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.store.Directory;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * A lucene index creation transformer.
+ * <p>
+ * See <a
+ * href="http://wiki.cocoondev.org/Wiki.jsp?page=LuceneIndexTransformer">LuceneIndexTransformer
+ * </a> documentation on the Cocoon Wiki.
+ * </p>
+ * <p>
+ * TODO: Write more documentation.
+ * </p>
+ *
+ * @author <a href="mailto:vgritsenko@apache.org">Vadim Gritsenko </a>
+ * @author <a href="mailto:conal@nzetc.org">Conal Tuohy </a>
+ * @author Nicolas Maisonneuve
+ */
+public class LuceneIndexTransformerOptimized extends AbstractTransformer implements
+ CacheableProcessingComponent, Configurable, Contextualizable,
+ Serviceable {
+
+ // Naming scheme used below: *_CONFIG is the child element read from the
+ // component configuration in configure(), *_PARAMETER is the sitemap
+ // parameter read in setup(), *_DEFAULT is the value used by configure()
+ // when the configuration does not provide one.
+
+ public static final String ANALYZER_CLASSNAME_CONFIG = "analyzer-classname";
+
+ public static final String ANALYZER_CLASSNAME_PARAMETER = "analyzer-classname";
+
+ public static final String ANALYZER_CLASSNAME_DEFAULT = "org.apache.lucene.analysis.standard.StandardAnalyzer";
+
+ public static final String DIRECTORY_CONFIG = "directory";
+
+ public static final String DIRECTORY_PARAMETER = "directory";
+
+ public static final String DIRECTORY_DEFAULT = "index";
+
+ public static final String MERGE_FACTOR_CONFIG = "merge-factor";
+
+ public static final String MERGE_FACTOR_PARAMETER = "merge-factor";
+
+ public static final int MERGE_FACTOR_DEFAULT = 20;
+
+ // Namespace of the elements and attributes handled by this transformer.
+ public static final String LUCENE_URI = "http://apache.org/cocoon/lucene/1.0";
+
+ // Local name of the element that wraps the documents to index.
+ public static final String LUCENE_QUERY_ELEMENT = "index";
+
+ // Attributes of the index element; each one overrides the corresponding
+ // setup()/configure() value for the documents inside that element.
+ public static final String LUCENE_QUERY_ANALYZER_ATTRIBUTE = "analyzer";
+
+ public static final String LUCENE_QUERY_DIRECTORY_ATTRIBUTE = "directory";
+
+ // Boolean attribute of the index element, stored in createIndex.
+ public static final String LUCENE_QUERY_CREATE_ATTRIBUTE = "create";
+
+ public static final String LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE = "merge-factor";
+
+ // Local name of the element that wraps one document to index.
+ public static final String LUCENE_DOCUMENT_ELEMENT = "document";
+
+ // Mandatory attribute of the document element.
+ public static final String LUCENE_DOCUMENT_URL_ATTRIBUTE = "url";
+
+ // Lucene-namespace attribute on a content element: when present, the
+ // values of the element's other attributes are appended to its text.
+ public static final String LUCENE_ELEMENT_ATTR_TO_TEXT_ATTRIBUTE = "text-attr";
+
+ // Lucene-namespace attribute on a content element: when present, the
+ // element text is added with Field.Text instead of Field.UnStored.
+ public static final String LUCENE_ELEMENT_ATTR_STORE_VALUE = "store";
+
+ // Attribute added to the emitted document element: milliseconds between
+ // the start and the end of the document element.
+ public static final String LUCENE_ELAPSED_TIME_ATTRIBUTE = "elapsed-time";
+
+ // SAX attribute type used for the attributes this transformer adds.
+ public static final String CDATA = "CDATA";
+
+ // The 3 states of the state machine
+ private static final int STATE_GROUND = 0; // initial or "ground" state
+
+ private static final int STATE_QUERY = 1; // inside a lucene:index
+ // element, expecting lucene:document elements
+
+ private static final int STATE_DOCUMENT = 2; // inside a
+ // lucene:document element, collecting its content
+
+ // Initialization time variables
+ // Work directory taken from the context; relative index directories are
+ // resolved against it in openWriter().
+ protected File workDir = null;
+
+ // service manager
+ private ServiceManager manager;
+
+ // Indexer component; looked up when the first document is indexed and
+ // released in recycle().
+ private Indexer indexer;
+
+ // Declaration time parameters values (specified in sitemap component
+ // config)
+ private IndexerConfiguration configureConfiguration;
+
+ // Invocation time parameters values (specified in sitemap transform
+ // parameters)
+ private IndexerConfiguration setupConfiguration;
+
+ // Parameters specified in the input document
+ private IndexerConfiguration queryConfiguration;
+
+ // Runtime variables
+ // Current state: STATE_GROUND, STATE_QUERY or STATE_DOCUMENT.
+ private int processing;
+
+ // Value of the create attribute of the index element; forced to true by
+ // openWriter() when no index exists yet in the index directory.
+ private boolean createIndex = false;
+
+ // Text of the whole document, indexed as the body field.
+ private StringBuffer bodyText;
+
+ // Lucene document being assembled for the current lucene:document.
+ private Document bodyDocument;
+
+ // url attribute of the current lucene:document.
+ private String bodyDocumentURL;
+
+ // Open content elements (IndexHelperField) inside the current document.
+ private Stack elementStack = new Stack();
+
+ /**
+ * Storage for the document element's attributes until the document has been
+ * indexed, so that they can be copied to the output along with the
+ * <code>elapsed-time</code> attribute added by endElement().
+ */
+ private AttributesImpl documentAttributes;
+
+ // System.currentTimeMillis() at the start of the current lucene:document.
+ private long documentStartTime;
+
+ private static String uid(String url) {
+ return url.replace('/', '\u0000'); // + "\u0000" +
+ // DateField.timeToString(urlConnection.getLastModified());
+ }
+
+ /**
+  * Receives the service manager; it is used to look up and release the
+  * indexer component.
+  *
+  * @param manager
+  *            the service manager to keep
+  */
+ public void service(ServiceManager manager) throws ServiceException {
+     this.manager = manager;
+ }
+
+ /**
+ * Configure the transformer. The configuration parameters are stored as
+ * general defaults, which may be over-ridden by parameters specified as
+ * parameters in the sitemap pipeline, or by attributes of the query
+ * element(s) in the XML input document.
+ */
+ public void configure(Configuration conf) throws ConfigurationException {
+ this.configureConfiguration = new IndexerConfiguration(
+ conf.getChild(ANALYZER_CLASSNAME_CONFIG).getValue(
+ ANALYZER_CLASSNAME_DEFAULT), conf.getChild(
+ DIRECTORY_CONFIG).getValue(DIRECTORY_DEFAULT), conf
+ .getChild(MERGE_FACTOR_CONFIG).getValueAsInteger(
+ MERGE_FACTOR_DEFAULT));
+ }
+
+ /**
+ * Setup the transformer. Called when the pipeline is assembled. The
+ * parameters are those specified as child elements of the
+ * <code><map:transform></code> element in the sitemap. These
+ * parameters are optional: If no parameters are specified here then the
+ * defaults are supplied by the component configuration. Any parameters
+ * specified here may be over-ridden by attributes of the lucene:index
+ * element in the input document.
+ */
+ public void setup(SourceResolver resolver, Map objectModel, String src,
+ Parameters parameters) throws ProcessingException, SAXException,
+ IOException {
+ setupConfiguration = new IndexerConfiguration(parameters.getParameter(
+ ANALYZER_CLASSNAME_PARAMETER,
+ configureConfiguration.analyzerClassname), parameters
+ .getParameter(DIRECTORY_PARAMETER,
+ configureConfiguration.indexDirectory), parameters
+ .getParameterAsInteger(MERGE_FACTOR_PARAMETER,
+ configureConfiguration.mergeFactor));
+ }
+
+ /**
+ * Contextualize this class
+ */
+ public void contextualize(Context context) throws ContextException {
+ this.workDir = (File) context.get(Constants.CONTEXT_WORK_DIR);
+ }
+
+ /**
+  * Resets the transformer for reuse: releases the indexer component (if one
+  * was looked up), returns to the ground state and drops the data of the
+  * document that was being processed.
+  */
+ public void recycle() {
+     this.processing = STATE_GROUND;
+
+     // hand the indexer component back to the service manager
+     if (this.indexer != null) {
+         manager.release(indexer);
+         indexer = null;
+     }
+
+     // forget the document in progress
+     this.elementStack.clear();
+     this.bodyDocumentURL = null;
+     this.bodyDocument = null;
+     this.bodyText = null;
+
+     super.recycle();
+ }
+
+ /**
+  * Generate the unique key. This key must be unique inside the space of
+  * this component; a constant is returned.
+  *
+  * @return The generated key
+  */
+ public Serializable getKey() {
+     final String constantKey = "1";
+     return constantKey;
+ }
+
+ /**
+  * Generate the validity object. The shared NOPValidity instance is always
+  * returned.
+  *
+  * @return The generated validity object or <code>null</code> if the
+  *         component is currently not cacheable.
+  */
+ public SourceValidity getValidity() {
+     final SourceValidity validity = NOPValidity.SHARED_INSTANCE;
+     return validity;
+ }
+
+ /**
+  * Forwards the start of the document unchanged.
+  */
+ public void startDocument() throws SAXException {
+     // nothing to prepare here
+     super.startDocument();
+ }
+
+ /**
+  * Forwards the end of the document unchanged.
+  */
+ public void endDocument() throws SAXException {
+     // nothing to finish here
+     super.endDocument();
+ }
+
+ /**
+  * Begin the scope of a prefix-URI Namespace mapping. The mapping is only
+  * forwarded in the ground state, i.e. outside of a lucene:index element.
+  *
+  * @param prefix
+  *            The Namespace prefix being declared.
+  * @param uri
+  *            The Namespace URI the prefix is mapped to.
+  */
+ public void startPrefixMapping(String prefix, String uri) throws SAXException {
+     if (processing != STATE_GROUND) {
+         // swallowed while Lucene elements are being processed
+         return;
+     }
+     super.startPrefixMapping(prefix, uri);
+ }
+
+ /**
+  * End the scope of a prefix-URI mapping. The event is only forwarded in
+  * the ground state, i.e. outside of a lucene:index element.
+  *
+  * @param prefix
+  *            The prefix that was being mapped.
+  */
+ public void endPrefixMapping(String prefix) throws SAXException {
+     if (processing != STATE_GROUND) {
+         // swallowed while Lucene elements are being processed
+         return;
+     }
+     super.endPrefixMapping(prefix);
+ }
+
+ public void startElement(String namespaceURI, String localName,
+ String qName, Attributes atts) throws SAXException {
+
+ if (processing == STATE_GROUND) {
+ if (LUCENE_URI.equals(namespaceURI)
+ && LUCENE_QUERY_ELEMENT.equals(localName)) {
+ String sCreate = atts.getValue(LUCENE_QUERY_CREATE_ATTRIBUTE);
+ createIndex = BooleanUtils.toBoolean(sCreate);
+
+ String analyzerClassname = atts
+ .getValue(LUCENE_QUERY_ANALYZER_ATTRIBUTE);
+ String indexDirectory = atts
+ .getValue(LUCENE_QUERY_DIRECTORY_ATTRIBUTE);
+ String mergeFactor = atts
+ .getValue(LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE);
+
+ queryConfiguration = new IndexerConfiguration(
+ analyzerClassname != null ? analyzerClassname
+ : setupConfiguration.analyzerClassname,
+ indexDirectory != null ? indexDirectory
+ : setupConfiguration.indexDirectory,
+ mergeFactor != null ? Integer.parseInt(mergeFactor)
+ : setupConfiguration.mergeFactor);
+
+ // propagate the lucene:index to the next stage in the pipeline
+ super.startElement(namespaceURI, localName, qName, atts);
+ processing = STATE_QUERY;
+ } else {
+ super.startElement(namespaceURI, localName, qName, atts);
+ }
+ } else if (processing == STATE_QUERY) {
+ // processing a lucene:index - expecting a lucene:document
+ if (LUCENE_URI.equals(namespaceURI)
+ && LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
+ this.bodyDocumentURL = atts
+ .getValue(LUCENE_DOCUMENT_URL_ATTRIBUTE);
+ if (this.bodyDocumentURL == null) {
+ throw new SAXException(
+ "<lucene:document> must have @url attribute");
+ }
+
+ // Remember the time the document indexing began
+ this.documentStartTime = System.currentTimeMillis();
+ // remember these attributes so they can be passed on to the
+ // next stage in the pipeline,
+ // when this document element is ended.
+ this.documentAttributes = new AttributesImpl(atts);
+ this.bodyText = new StringBuffer();
+ this.bodyDocument = new Document();
+ this.elementStack.clear();
+ processing = STATE_DOCUMENT;
+ } else {
+ throw new SAXException(
+ "<lucene:index> element can contain only <lucene:document> elements!");
+ }
+ } else if (processing == STATE_DOCUMENT) {
+ elementStack.push(new IndexHelperField(localName,
+ new AttributesImpl(atts)));
+ }
+ }
+
+ /**
+  * Handles the end of an element, depending on the current state:
+  * <ul>
+  * <li>inside lucene:index: only the end of lucene:index is accepted; it is
+  * forwarded and the transformer returns to the ground state;</li>
+  * <li>inside lucene:document, end of lucene:document: the body, url and uid
+  * fields are added, the document is indexed and an empty lucene:document
+  * element is emitted with the original attributes plus the elapsed
+  * time;</li>
+  * <li>inside lucene:document, any other end tag: the element on top of the
+  * stack is indexed - one field per attribute outside the Lucene namespace
+  * and, if the element has text, one field for the text;</li>
+  * <li>ground state: the end tag is forwarded unchanged.</li>
+  * </ul>
+  */
+ public void endElement(String namespaceURI, String localName, String qName)
+         throws SAXException {
+
+     if (processing != STATE_QUERY && processing != STATE_DOCUMENT) {
+         // All other tags
+         super.endElement(namespaceURI, localName, qName);
+         return;
+     }
+
+     if (processing == STATE_QUERY) {
+         boolean endOfIndex = LUCENE_URI.equals(namespaceURI)
+                 && LUCENE_QUERY_ELEMENT.equals(localName);
+         if (!endOfIndex) {
+             throw new SAXException("</lucene:index> was expected!");
+         }
+         // propagate the query element to the next stage in the pipeline
+         super.endElement(namespaceURI, localName, qName);
+         this.processing = STATE_GROUND;
+         return;
+     }
+
+     // from here on: STATE_DOCUMENT
+     if (LUCENE_URI.equals(namespaceURI)
+             && LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
+         // End document processing: body, url and uid fields
+         String body = this.bodyText.toString();
+         this.bodyDocument.add(Field.UnStored(LuceneXMLIndexer.BODY_FIELD, body));
+         this.bodyText = null;
+
+         this.bodyDocument.add(Field.UnIndexed(LuceneXMLIndexer.URL_FIELD,
+                 this.bodyDocumentURL));
+         // store: false, index: true, tokenize: false
+         Field uidField = new Field(LuceneXMLIndexer.UID_FIELD,
+                 uid(this.bodyDocumentURL), false, true, false);
+         this.bodyDocument.add(uidField);
+
+         try {
+             reindexDocument();
+         } catch (IndexException e) {
+             throw new SAXException(e);
+         }
+         this.bodyDocumentURL = null;
+
+         // propagate the lucene:document element to the next stage in
+         // the pipeline, together with the time the indexing took
+         long elapsedTime = System.currentTimeMillis() - this.documentStartTime;
+         this.documentAttributes.addAttribute("",
+                 LUCENE_ELAPSED_TIME_ATTRIBUTE,
+                 LUCENE_ELAPSED_TIME_ATTRIBUTE, CDATA,
+                 String.valueOf(elapsedTime));
+         super.startElement(namespaceURI, localName, qName,
+                 this.documentAttributes);
+         super.endElement(namespaceURI, localName, qName);
+         this.processing = STATE_QUERY;
+         return;
+     }
+
+     // End element processing: index the element on top of the stack
+     IndexHelperField current = (IndexHelperField) elementStack.pop();
+     StringBuffer elementText = current.getText();
+     Attributes elementAtts = current.getAttributes();
+
+     boolean attributesToText = elementAtts.getIndex(LUCENE_URI,
+             LUCENE_ELEMENT_ATTR_TO_TEXT_ATTRIBUTE) != -1;
+     int count = elementAtts.getLength();
+     for (int i = 0; i < count; i++) {
+         // Only attributes outside the Lucene namespace are indexed
+         if (!LUCENE_URI.equals(elementAtts.getURI(i))) {
+             String attName = elementAtts.getLocalName(i);
+             String attValue = elementAtts.getValue(i);
+             bodyDocument.add(Field.UnStored(localName + "@" + attName,
+                     attValue));
+             if (attributesToText) {
+                 elementText.append(attValue);
+                 elementText.append(' ');
+                 bodyText.append(attValue);
+                 bodyText.append(' ');
+             }
+         }
+     }
+
+     boolean store = elementAtts.getIndex(LUCENE_URI,
+             LUCENE_ELEMENT_ATTR_STORE_VALUE) != -1;
+     if (elementText == null || elementText.length() == 0) {
+         // no text: nothing more to add for this element
+         return;
+     }
+     String value = elementText.toString();
+     if (store) {
+         bodyDocument.add(Field.Text(localName, value));
+     } else {
+         bodyDocument.add(Field.UnStored(localName, value));
+     }
+ }
+
+ /**
+  * Receives character data.
+  * <p>
+  * Inside a document the characters are appended to the text of the
+  * innermost open element and to the body text of the document (followed by
+  * a blank). In the ground state they are forwarded unchanged. Characters
+  * that are direct children of lucene:index or of lucene:document are
+  * dropped.
+  * </p>
+  * <p>
+  * Chunks of a single character are indexed as well. A SAX parser may split
+  * character data at arbitrary points (typically around entity references),
+  * so skipping one-character chunks would silently lose content.
+  * </p>
+  */
+ public void characters(char[] ch, int start, int length)
+         throws SAXException {
+
+     if (processing == STATE_DOCUMENT && ch.length > 0 && start >= 0
+             && length > 0 && elementStack.size() > 0) {
+         String text = new String(ch, start, length);
+         ((IndexHelperField) elementStack.peek()).append(text);
+         bodyText.append(text);
+         bodyText.append(' ');
+     } else if (processing == STATE_GROUND) {
+         super.characters(ch, start, length);
+     }
+ }
+
+ /**
+  * Looks up the indexer component and configures it from the configuration
+  * of the current lucene:index element: index directory (relative paths are
+  * resolved against the work directory), analyzer and merge factor. The
+  * index is created when the create flag is set or when no index exists in
+  * the directory yet; in both cases it is cleared before use.
+  *
+  * @throws IndexException
+  *             if the indexer cannot be looked up or the index directory
+  *             cannot be opened
+  */
+ private void openWriter() throws IndexException {
+     // diagnostics go to the component logger, not to stdout
+     if (getLogger().isDebugEnabled()) {
+         getLogger().debug("use luceneIndexTransformer with indexer component");
+     }
+
+     // lookup the indexer
+     try {
+         indexer = (Indexer) this.manager.lookup(Indexer.ROLE + "/default");
+     } catch (ServiceException e) {
+         throw new IndexException(e);
+     }
+
+     File indexDirectory = new File(queryConfiguration.indexDirectory);
+     if (!indexDirectory.isAbsolute()) {
+         indexDirectory = new File(workDir,
+                 queryConfiguration.indexDirectory);
+     }
+
+     // If the index directory doesn't exist, then always create it.
+     if (!IndexReader.indexExists(indexDirectory)) {
+         createIndex = true;
+     }
+
+     // Get the index directory, creating it if necessary
+     try {
+         Directory directory = LuceneCocoonHelper.getDirectory(
+                 indexDirectory, createIndex);
+         indexer.setIndex(directory);
+     } catch (IOException e) {
+         throw new IndexException("set directory " + indexDirectory
+                 + " error", e);
+     }
+
+     // Get the analyzer
+     Analyzer analyzer = LuceneCocoonHelper
+             .getAnalyzer(queryConfiguration.analyzerClassname);
+     indexer.setAnalyzer(analyzer);
+
+     this.indexer.setMergeFactor(queryConfiguration.mergeFactor);
+     if (this.createIndex) {
+         this.indexer.clearIndex();
+     }
+ }
+
+ /**
+  * Hands the current document to the indexer and then forgets it. The
+  * indexer is opened on first use, so one indexer serves all documents
+  * until the transformer is recycled.
+  *
+  * @throws IndexException
+  *             if the indexer cannot be opened or rejects the document
+  */
+ private void reindexDocument() throws IndexException {
+     boolean indexerOpen = (this.indexer != null);
+     if (!indexerOpen) {
+         openWriter();
+     }
+     this.indexer.index(this.bodyDocument);
+     this.bodyDocument = null;
+ }
+
+ /**
+  * One open content element of the document being indexed: its local name,
+  * a copy of its attributes and the text collected for it so far.
+  */
+ class IndexHelperField {
+     String localName;
+
+     StringBuffer text;
+
+     Attributes attributes;
+
+     /**
+      * Creates the entry for an element that was just opened; the text
+      * starts out empty.
+      */
+     IndexHelperField(String elementName, Attributes elementAttributes) {
+         this.text = new StringBuffer();
+         this.localName = elementName;
+         this.attributes = elementAttributes;
+     }
+
+     /** @return the attributes given to the constructor */
+     public Attributes getAttributes() {
+         return this.attributes;
+     }
+
+     /** @return the buffer holding the text collected so far */
+     public StringBuffer getText() {
+         return this.text;
+     }
+
+     /** Appends a piece of text to the element text. */
+     public void append(String chunk) {
+         this.text.append(chunk);
+     }
+
+     /** Appends a range of characters to the element text. */
+     public void append(char[] chars, int offset, int length) {
+         this.text.append(chars, offset, length);
+     }
+ }
+
+ /**
+  * Plain holder for the three indexer settings: analyzer class name, index
+  * directory and merge factor.
+  */
+ class IndexerConfiguration {
+     String analyzerClassname;
+
+     String indexDirectory;
+
+     int mergeFactor;
+
+     /**
+      * Stores the three settings as given.
+      */
+     public IndexerConfiguration(String analyzer, String directory, int merge) {
+         this.mergeFactor = merge;
+         this.indexDirectory = directory;
+         this.analyzerClassname = analyzer;
+     }
+ }
+
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@lenya.apache.org
For additional commands, e-mail: commits-help@lenya.apache.org