You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lenya.apache.org by an...@apache.org on 2008/01/28 01:05:34 UTC

svn commit: r615682 - in /lenya/trunk/src/modules/resource: ./ java/src/org/apache/lenya/modules/ java/src/org/apache/lenya/modules/resource/ resources/xml/ xslt/

Author: andreas
Date: Sun Jan 27 16:05:32 2008
New Revision: 615682

URL: http://svn.apache.org/viewvc?rev=615682&view=rev
Log:
Added indexing for PDF documents

Added:
    lenya/trunk/src/modules/resource/java/src/org/apache/lenya/modules/
    lenya/trunk/src/modules/resource/java/src/org/apache/lenya/modules/resource/
    lenya/trunk/src/modules/resource/java/src/org/apache/lenya/modules/resource/PdfToTextGenerator.java
    lenya/trunk/src/modules/resource/resources/xml/
    lenya/trunk/src/modules/resource/resources/xml/emptyLuceneIndex.xml
    lenya/trunk/src/modules/resource/xslt/pdf2xhtml.xsl
Modified:
    lenya/trunk/src/modules/resource/sitemap.xmap

Added: lenya/trunk/src/modules/resource/java/src/org/apache/lenya/modules/resource/PdfToTextGenerator.java
URL: http://svn.apache.org/viewvc/lenya/trunk/src/modules/resource/java/src/org/apache/lenya/modules/resource/PdfToTextGenerator.java?rev=615682&view=auto
==============================================================================
--- lenya/trunk/src/modules/resource/java/src/org/apache/lenya/modules/resource/PdfToTextGenerator.java (added)
+++ lenya/trunk/src/modules/resource/java/src/org/apache/lenya/modules/resource/PdfToTextGenerator.java Sun Jan 27 16:05:32 2008
@@ -0,0 +1,82 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lenya.modules.resource;
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.avalon.framework.parameters.Parameters;
+import org.apache.cocoon.ProcessingException;
+import org.apache.cocoon.environment.SourceResolver;
+import org.apache.cocoon.generation.AbstractGenerator;
+import org.apache.lenya.cms.cocoon.source.RepositorySource;
+import org.apache.lenya.cms.repository.ContentHolder;
+import org.pdfbox.pdfparser.PDFParser;
+import org.pdfbox.pdmodel.PDDocument;
+import org.pdfbox.util.PDFTextStripper;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+public class PdfToTextGenerator extends AbstractGenerator {
+
+    private static final String PREFIX = "pdf";
+    private static final String NAMESPACE = "http://apache.org/lenya/pdf/1.0";
+    private ContentHolder content;
+
+    public void setup(SourceResolver resolver, Map objectModel, String src, Parameters par)
+            throws ProcessingException, SAXException, IOException {
+
+        super.setup(resolver, objectModel, src, par);
+
+        RepositorySource source = null;
+        try {
+            source = (RepositorySource) resolver.resolveURI(src);
+            this.content = source.getContent();
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        } finally {
+            if (source != null) {
+                resolver.release(source);
+            }
+        }
+    }
+
+    public void generate() throws IOException, SAXException, ProcessingException {
+        this.contentHandler.startDocument();
+        this.contentHandler.startPrefixMapping(PREFIX, NAMESPACE);
+        this.contentHandler.startElement(NAMESPACE, "document", PREFIX + ":document",
+                new AttributesImpl());
+
+        try {
+            PDFTextStripper stripper = new PDFTextStripper();
+            PDFParser parser = new PDFParser(this.content.getInputStream());
+            parser.parse();
+            PDDocument doc = parser.getPDDocument();
+            String text = stripper.getText(doc);
+            doc.close();
+            char[] chars = text.toCharArray();
+            this.contentHandler.characters(chars, 0, chars.length);
+        } catch (Exception e) {
+            throw new ProcessingException(e);
+        }
+
+        this.contentHandler.endElement(NAMESPACE, "document", PREFIX + ":document");
+        this.contentHandler.endDocument();
+
+    }
+
+}

Added: lenya/trunk/src/modules/resource/resources/xml/emptyLuceneIndex.xml
URL: http://svn.apache.org/viewvc/lenya/trunk/src/modules/resource/resources/xml/emptyLuceneIndex.xml?rev=615682&view=auto
==============================================================================
--- lenya/trunk/src/modules/resource/resources/xml/emptyLuceneIndex.xml (added)
+++ lenya/trunk/src/modules/resource/resources/xml/emptyLuceneIndex.xml Sun Jan 27 16:05:32 2008
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<lucene:index xmlns:lucene="http://apache.org/cocoon/lucene/1.0">
+  <lucene:document/>
+</lucene:index>
\ No newline at end of file

Modified: lenya/trunk/src/modules/resource/sitemap.xmap
URL: http://svn.apache.org/viewvc/lenya/trunk/src/modules/resource/sitemap.xmap?rev=615682&r1=615681&r2=615682&view=diff
==============================================================================
--- lenya/trunk/src/modules/resource/sitemap.xmap (original)
+++ lenya/trunk/src/modules/resource/sitemap.xmap Sun Jan 27 16:05:32 2008
@@ -24,6 +24,7 @@
       <map:generator label="content" logger="sitemap.generator.directory"
         name="lenyaMetaData" pool-max="16"
         src="org.apache.lenya.cms.cocoon.generation.LenyaMetaDataGenerator"/>
+      <map:generator name="pdf" pool-max="16" src="org.apache.lenya.modules.resource.PdfToTextGenerator"/>
     </map:generators>
     <map:serializers default="xhtml">
       <map:serializer logger="sitemap.serializer.links" name="links" src="org.apache.lenya.cms.cocoon.serialization.LinkSerializer"/>
@@ -41,10 +42,22 @@
         <map:serialize type="xml"/>
       </map:match>
       
+      <!-- {1:pubId}/{2:area}/{3:uuid}/{4:language}/{5:extension} -->
+      <map:match pattern="lucene-index-content/*/*/*/*.pdf">
+        <map:generate type="pdf" src="lenya-document:{3},pub={1},area={2},lang={4}"/>
+        <map:transform src="fallback://lenya/modules/resource/xslt/pdf2xhtml.xsl"/>
+        <map:serialize type="xml"/>
+      </map:match>
+      <map:match pattern="lucene-index-content/*/*/*/*.*">
+        <map:generate src="fallback://lenya/modules/resource/resources/xml/emptyLuceneIndex.xml"/>
+        <map:serialize type="xml"/>
+      </map:match>
+      
       <!-- {pub-id}/{area}/{uuid}/{language} -->
       <map:match pattern="lucene-index/*/*/*/*">
         <map:aggregate element="cmsbody">
           <map:part src="cocoon:/lenyametadata.xml/{1}/{2}/{3}/{4}/-1"/>
+          <map:part src="cocoon:/lucene-index-content/{1}/{2}/{3}/{4}.{doc-info:{1}:{2}:{3}:{4}:sourceExtension}"/>
         </map:aggregate>
         <map:transform src="fallback://lenya/modules/xhtml/xslt/xhtml2index.xsl">
           <map:parameter name="url" value="{request:requestURI}"/>

Added: lenya/trunk/src/modules/resource/xslt/pdf2xhtml.xsl
URL: http://svn.apache.org/viewvc/lenya/trunk/src/modules/resource/xslt/pdf2xhtml.xsl?rev=615682&view=auto
==============================================================================
--- lenya/trunk/src/modules/resource/xslt/pdf2xhtml.xsl (added)
+++ lenya/trunk/src/modules/resource/xslt/pdf2xhtml.xsl Sun Jan 27 16:05:32 2008
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Wraps the text of a pdf:document element in a minimal XHTML document
+  (html/body/p).
+-->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
+  xmlns:pdf="http://apache.org/lenya/pdf/1.0"
+  xmlns="http://www.w3.org/1999/xhtml"
+  exclude-result-prefixes="pdf">
+  
+  <!-- The whole text content of the root element becomes one paragraph. -->
+  <xsl:template match="/pdf:document">
+    <html>
+      <body>
+        <p>
+          <xsl:value-of select="."/>
+        </p>
+      </body>
+    </html>
+  </xsl:template>
+
+</xsl:stylesheet>
\ No newline at end of file



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@lenya.apache.org
For additional commands, e-mail: commits-help@lenya.apache.org