You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lenya.apache.org by an...@apache.org on 2008/01/28 01:05:34 UTC
svn commit: r615682 - in /lenya/trunk/src/modules/resource: ./
java/src/org/apache/lenya/modules/
java/src/org/apache/lenya/modules/resource/ resources/xml/ xslt/
Author: andreas
Date: Sun Jan 27 16:05:32 2008
New Revision: 615682
URL: http://svn.apache.org/viewvc?rev=615682&view=rev
Log:
Added indexing for PDF documents
Added:
lenya/trunk/src/modules/resource/java/src/org/apache/lenya/modules/
lenya/trunk/src/modules/resource/java/src/org/apache/lenya/modules/resource/
lenya/trunk/src/modules/resource/java/src/org/apache/lenya/modules/resource/PdfToTextGenerator.java
lenya/trunk/src/modules/resource/resources/xml/
lenya/trunk/src/modules/resource/resources/xml/emptyLuceneIndex.xml
lenya/trunk/src/modules/resource/xslt/pdf2xhtml.xsl
Modified:
lenya/trunk/src/modules/resource/sitemap.xmap
Added: lenya/trunk/src/modules/resource/java/src/org/apache/lenya/modules/resource/PdfToTextGenerator.java
URL: http://svn.apache.org/viewvc/lenya/trunk/src/modules/resource/java/src/org/apache/lenya/modules/resource/PdfToTextGenerator.java?rev=615682&view=auto
==============================================================================
--- lenya/trunk/src/modules/resource/java/src/org/apache/lenya/modules/resource/PdfToTextGenerator.java (added)
+++ lenya/trunk/src/modules/resource/java/src/org/apache/lenya/modules/resource/PdfToTextGenerator.java Sun Jan 27 16:05:32 2008
@@ -0,0 +1,82 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lenya.modules.resource;
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.avalon.framework.parameters.Parameters;
+import org.apache.cocoon.ProcessingException;
+import org.apache.cocoon.environment.SourceResolver;
+import org.apache.cocoon.generation.AbstractGenerator;
+import org.apache.lenya.cms.cocoon.source.RepositorySource;
+import org.apache.lenya.cms.repository.ContentHolder;
+import org.pdfbox.pdfparser.PDFParser;
+import org.pdfbox.pdmodel.PDDocument;
+import org.pdfbox.util.PDFTextStripper;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Cocoon generator which extracts the plain text of a PDF document and emits
+ * it as the character content of a single <code>pdf:document</code> element
+ * in the namespace <code>http://apache.org/lenya/pdf/1.0</code>.
+ */
+public class PdfToTextGenerator extends AbstractGenerator {
+
+    private static final String PREFIX = "pdf";
+    private static final String NAMESPACE = "http://apache.org/lenya/pdf/1.0";
+    private static final String ELEMENT_DOCUMENT = "document";
+
+    /** Content of the resolved source; assigned in setup(), read in generate(). */
+    private ContentHolder content;
+
+    /**
+     * Resolves <code>src</code> to a repository source and remembers its
+     * content for the subsequent call to {@link #generate()}.
+     *
+     * @param resolver The source resolver used to resolve <code>src</code>.
+     * @param objectModel The Cocoon object model, passed on to the super class.
+     * @param src The URI of the PDF source; it has to resolve to a
+     *        {@link RepositorySource}.
+     * @param par The sitemap parameters, passed on to the super class.
+     * @throws ProcessingException if the source cannot be resolved or its
+     *         content cannot be obtained.
+     */
+    public void setup(SourceResolver resolver, Map objectModel, String src, Parameters par)
+            throws ProcessingException, SAXException, IOException {
+
+        super.setup(resolver, objectModel, src, par);
+
+        RepositorySource source = null;
+        try {
+            source = (RepositorySource) resolver.resolveURI(src);
+            this.content = source.getContent();
+        } catch (Exception e) {
+            // Report the failure through the declared exception type and name
+            // the offending source instead of throwing a bare RuntimeException.
+            throw new ProcessingException("Could not access the content of source [" + src
+                    + "]", e);
+        } finally {
+            if (source != null) {
+                resolver.release(source);
+            }
+        }
+    }
+
+    /**
+     * Drops the reference to the content so that an instance which is kept
+     * for re-use does not hold on to the content of the last request.
+     */
+    public void recycle() {
+        this.content = null;
+        super.recycle();
+    }
+
+    /**
+     * Parses the PDF content and sends its text to the content handler,
+     * wrapped in a <code>pdf:document</code> element.
+     *
+     * @throws ProcessingException if reading or parsing the PDF fails, or if
+     *         the content handler rejects the extracted text.
+     */
+    public void generate() throws IOException, SAXException, ProcessingException {
+        final String qName = PREFIX + ":" + ELEMENT_DOCUMENT;
+
+        this.contentHandler.startDocument();
+        this.contentHandler.startPrefixMapping(PREFIX, NAMESPACE);
+        this.contentHandler.startElement(NAMESPACE, ELEMENT_DOCUMENT, qName,
+                new AttributesImpl());
+
+        try {
+            String text;
+            java.io.InputStream in = this.content.getInputStream();
+            try {
+                PDFTextStripper stripper = new PDFTextStripper();
+                PDFParser parser = new PDFParser(in);
+                parser.parse();
+                PDDocument doc = parser.getPDDocument();
+                try {
+                    text = stripper.getText(doc);
+                } finally {
+                    // Close the document even if the text extraction fails.
+                    doc.close();
+                }
+            } finally {
+                // The parser may already have closed the stream; closing an
+                // already closed stream has no effect.
+                in.close();
+            }
+            char[] chars = text.toCharArray();
+            this.contentHandler.characters(chars, 0, chars.length);
+        } catch (Exception e) {
+            throw new ProcessingException(e);
+        }
+
+        this.contentHandler.endElement(NAMESPACE, ELEMENT_DOCUMENT, qName);
+        // Every startPrefixMapping event needs a matching endPrefixMapping
+        // event after the end of the element the mapping applies to.
+        this.contentHandler.endPrefixMapping(PREFIX);
+        this.contentHandler.endDocument();
+
+    }
+
+}
Added: lenya/trunk/src/modules/resource/resources/xml/emptyLuceneIndex.xml
URL: http://svn.apache.org/viewvc/lenya/trunk/src/modules/resource/resources/xml/emptyLuceneIndex.xml?rev=615682&view=auto
==============================================================================
--- lenya/trunk/src/modules/resource/resources/xml/emptyLuceneIndex.xml (added)
+++ lenya/trunk/src/modules/resource/resources/xml/emptyLuceneIndex.xml Sun Jan 27 16:05:32 2008
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Lucene index fragment containing a single empty document. The resource
+  module sitemap serves it for the pattern lucene-index-content/*/*/*/*.*,
+  i.e. for source extensions which have no dedicated content pipeline.
+-->
+<lucene:index xmlns:lucene="http://apache.org/cocoon/lucene/1.0">
+ <lucene:document/>
+</lucene:index>
\ No newline at end of file
Modified: lenya/trunk/src/modules/resource/sitemap.xmap
URL: http://svn.apache.org/viewvc/lenya/trunk/src/modules/resource/sitemap.xmap?rev=615682&r1=615681&r2=615682&view=diff
==============================================================================
--- lenya/trunk/src/modules/resource/sitemap.xmap (original)
+++ lenya/trunk/src/modules/resource/sitemap.xmap Sun Jan 27 16:05:32 2008
@@ -24,6 +24,7 @@
<map:generator label="content" logger="sitemap.generator.directory"
name="lenyaMetaData" pool-max="16"
src="org.apache.lenya.cms.cocoon.generation.LenyaMetaDataGenerator"/>
+ <map:generator name="pdf" pool-max="16" src="org.apache.lenya.modules.resource.PdfToTextGenerator"/>
</map:generators>
<map:serializers default="xhtml">
<map:serializer logger="sitemap.serializer.links" name="links" src="org.apache.lenya.cms.cocoon.serialization.LinkSerializer"/>
@@ -41,10 +42,22 @@
<map:serialize type="xml"/>
</map:match>
+ <!-- {1:pubId}/{2:area}/{3:uuid}/{4:language}/{5:extension} -->
+ <map:match pattern="lucene-index-content/*/*/*/*.pdf">
+ <map:generate type="pdf" src="lenya-document:{3},pub={1},area={2},lang={4}"/>
+ <map:transform src="fallback://lenya/modules/resource/xslt/pdf2xhtml.xsl"/>
+ <map:serialize type="xml"/>
+ </map:match>
+ <map:match pattern="lucene-index-content/*/*/*/*.*">
+ <map:generate src="fallback://lenya/modules/resource/resources/xml/emptyLuceneIndex.xml"/>
+ <map:serialize type="xml"/>
+ </map:match>
+
<!-- {pub-id}/{area}/{uuid}/{language} -->
<map:match pattern="lucene-index/*/*/*/*">
<map:aggregate element="cmsbody">
<map:part src="cocoon:/lenyametadata.xml/{1}/{2}/{3}/{4}/-1"/>
+ <map:part src="cocoon:/lucene-index-content/{1}/{2}/{3}/{4}.{doc-info:{1}:{2}:{3}:{4}:sourceExtension}"/>
</map:aggregate>
<map:transform src="fallback://lenya/modules/xhtml/xslt/xhtml2index.xsl">
<map:parameter name="url" value="{request:requestURI}"/>
Added: lenya/trunk/src/modules/resource/xslt/pdf2xhtml.xsl
URL: http://svn.apache.org/viewvc/lenya/trunk/src/modules/resource/xslt/pdf2xhtml.xsl?rev=615682&view=auto
==============================================================================
--- lenya/trunk/src/modules/resource/xslt/pdf2xhtml.xsl (added)
+++ lenya/trunk/src/modules/resource/xslt/pdf2xhtml.xsl Sun Jan 27 16:05:32 2008
@@ -0,0 +1,16 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Wraps the text of a pdf:document root element (namespace
+  http://apache.org/lenya/pdf/1.0) in a minimal XHTML document:
+  html/body/p, with the string value of the element as the paragraph text.
+-->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
+ xmlns:pdf="http://apache.org/lenya/pdf/1.0"
+ xmlns="http://www.w3.org/1999/xhtml">
+
+ <!-- Only the pdf:document root element is matched explicitly. -->
+ <xsl:template match="/pdf:document">
+ <html>
+ <body>
+ <p>
+ <!-- String value of pdf:document, i.e. all of its text content. -->
+ <xsl:value-of select="."/>
+ </p>
+ </body>
+ </html>
+ </xsl:template>
+
+</xsl:stylesheet>
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@lenya.apache.org
For additional commands, e-mail: commits-help@lenya.apache.org