You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:48:28 UTC
[12/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build
for nutch-core and nutch-plugins
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/build.xml b/nutch-plugins/parse-swf/build.xml
new file mode 100644
index 0000000..f4fb20f
--- /dev/null
+++ b/nutch-plugins/parse-swf/build.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-swf" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+ <!-- Deploy Unit test dependencies -->
+ <target name="deps-test">
+ <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+ <ant target="deploy" inheritall="false" dir="../protocol-file"/>
+ </target>
+
+
+ <!-- for junit test -->
+ <mkdir dir="${build.test}/data"/>
+ <copy file="sample/test1.swf" todir="${build.test}/data"/>
+ <copy file="sample/test2.swf" todir="${build.test}/data"/>
+ <copy file="sample/test3.swf" todir="${build.test}/data"/>
+ <copy file="sample/test1.txt" todir="${build.test}/data"/>
+ <copy file="sample/test2.txt" todir="${build.test}/data"/>
+ <copy file="sample/test3.txt" todir="${build.test}/data"/>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/ivy.xml b/nutch-plugins/parse-swf/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/parse-swf/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/lib/javaswf-LICENSE.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/lib/javaswf-LICENSE.txt b/nutch-plugins/parse-swf/lib/javaswf-LICENSE.txt
new file mode 100644
index 0000000..4138a66
--- /dev/null
+++ b/nutch-plugins/parse-swf/lib/javaswf-LICENSE.txt
@@ -0,0 +1,33 @@
+
+ Copyright (c) 2001-2005, David N. Main, All rights reserved.
+
+ Redistribution and use in source and binary forms, with or
+ without modification, are permitted provided that the
+ following conditions are met:
+
+ 1. Redistributions of source code must retain the above
+ copyright notice, this list of conditions and the following
+ disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials
+ provided with the distribution.
+
+ 3. The name of the author may not be used to endorse or
+ promote products derived from this software without specific
+ prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY
+ EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/lib/javaswf.jar
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/lib/javaswf.jar b/nutch-plugins/parse-swf/lib/javaswf.jar
new file mode 100644
index 0000000..78f9b0b
Binary files /dev/null and b/nutch-plugins/parse-swf/lib/javaswf.jar differ
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/plugin.xml b/nutch-plugins/parse-swf/plugin.xml
new file mode 100644
index 0000000..8cc72c0
--- /dev/null
+++ b/nutch-plugins/parse-swf/plugin.xml
@@ -0,0 +1,44 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="parse-swf"
+ name="SWF Parse Plug-in"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+
+ <runtime>
+ <library name="parse-swf.jar">
+ <export name="*"/>
+ </library>
+ <library name="javaswf.jar"/>
+ </runtime>
+
+ <extension id="org.apache.nutch.parse.swf"
+ name="SWFParse"
+ point="org.apache.nutch.parse.Parser">
+
+ <implementation id="org.apache.nutch.parse.swf.SWFParser"
+ class="org.apache.nutch.parse.swf.SWFParser">
+ <parameter name="contentType" value="application/x-shockwave-flash"/>
+ <parameter name="pathSuffix" value="swf"/>
+ </implementation>
+
+ </extension>
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/pom.xml b/nutch-plugins/parse-swf/pom.xml
new file mode 100644
index 0000000..743511e
--- /dev/null
+++ b/nutch-plugins/parse-swf/pom.xml
@@ -0,0 +1,46 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>parse-swf</artifactId>
+ <packaging>jar</packaging>
+
+ <name>parse-swf</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+ <dependencies>
+ <dependency>
+ <groupId>com.google.gwt</groupId>
+ <artifactId>gwt-incubator</artifactId>
+ <version>2.0.1</version>
+ </dependency>
+
+ </dependencies>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/SWFParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/SWFParser.java b/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/SWFParser.java
new file mode 100644
index 0000000..9251366
--- /dev/null
+++ b/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/SWFParser.java
@@ -0,0 +1,685 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.swf;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.*;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.*;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.apache.hadoop.conf.Configuration;
+
+import com.anotherbigidea.flash.interfaces.*;
+import com.anotherbigidea.flash.readers.*;
+import com.anotherbigidea.flash.structs.*;
+import com.anotherbigidea.flash.writers.SWFActionBlockImpl;
+import com.anotherbigidea.flash.writers.SWFTagTypesImpl;
+import com.anotherbigidea.io.InStream;
+
+/**
+ * Parser for Flash SWF files. Loosely based on the sample in JavaSWF
+ * distribution.
+ */
+public class SWFParser implements Parser {
+ public static final Logger LOG = LoggerFactory
+ .getLogger("org.apache.nutch.parse.swf");
+
+ private Configuration conf = null;
+
+ public SWFParser() {
+ }
+
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ }
+
+ public Configuration getConf() {
+ return conf;
+ }
+
+ public ParseResult getParse(Content content) {
+ // Parses the raw SWF bytes of content into plain text plus outlinks.
+ String text = null;
+ Vector<Outlink> outlinks = new Vector<Outlink>();
+
+ try {
+
+ byte[] raw = content.getContent();
+ // Reject the document when its size disagrees with the declared length.
+ String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
+ if (contentLength != null
+ && raw.length != Integer.parseInt(contentLength)) {
+ return new ParseStatus(ParseStatus.FAILED,
+ ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length
+ + " bytes. Parser can't handle incomplete files.")
+ .getEmptyParseResult(content.getUrl(), getConf());
+ }
+ ExtractText extractor = new ExtractText();
+
+ // TagParser implements SWFTags and drives a SWFTagTypes interface
+ TagParser parser = new TagParser(extractor);
+ // use this instead to debug the file
+ // TagParser parser = new TagParser( new SWFTagDumper(true, true) );
+
+ // SWFReader reads an input file and drives a SWFTags interface
+ SWFReader reader = new SWFReader(parser, new InStream(raw));
+
+ // read the input SWF file and pass it through the interface pipeline
+ reader.readFile();
+ text = extractor.getText();
+ String atext = extractor.getActionText();
+ if (atext != null && atext.length() > 0)
+ text += "\n--------\n" + atext;
+ // harvest potential outlinks, part 1: URLs the extractor collected
+ String[] links = extractor.getUrls();
+ for (int i = 0; i < links.length; i++) {
+ Outlink out = new Outlink(links[i], "");
+ outlinks.add(out);
+ }
+ Outlink[] olinks = OutlinkExtractor.getOutlinks(text, conf); // part 2: presumably URLs inside the text - confirm
+ if (olinks != null)
+ for (int i = 0; i < olinks.length; i++) {
+ outlinks.add(olinks[i]);
+ }
+ } catch (Exception e) { // any failure above, e.g. a non-numeric length value
+ LOG.error("Error, runtime exception: ", e);
+ return new ParseStatus(ParseStatus.FAILED,
+ "Can't be handled as SWF document. " + e).getEmptyParseResult(
+ content.getUrl(), getConf());
+ }
+ if (text == null)
+ text = "";
+
+ Outlink[] links = (Outlink[]) outlinks
+ .toArray(new Outlink[outlinks.size()]);
+ ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", links,
+ content.getMetadata());
+ return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text,
+ parseData));
+ }
+
+ /**
+ * Parses the SWF file named by args[0] and prints its text and parse data.
+ */
+ public static void main(String[] args) throws IOException {
+ // Read fully and close reliably; available()/read() could stop short or leak.
+ byte[] buf = java.nio.file.Files.readAllBytes(
+ java.nio.file.Paths.get(args[0]));
+ String url = "file:" + args[0];
+ Configuration conf = NutchConfiguration.create();
+ SWFParser parser = new SWFParser();
+ parser.setConf(conf); // getParse() passes this.conf on to its helpers
+ ParseResult parseResult = parser.getParse(new Content(url, url, buf,
+ "application/x-shockwave-flash", new Metadata(), conf));
+ Parse p = parseResult.get(url);
+ System.out.println("Parse Text:");
+ System.out.println(p.getText());
+ System.out.println("Parse Data:");
+ System.out.println(p.getData());
+ }
+}
+
+/**
+ * Extracts all the text in Text symbols and the initial text in Edit Fields of
+ * a Flash movie. Results are collected in memory (getText, getUrls), not printed.
+ *
+ * A "pipeline" is set up in SWFParser.getParse(Content):
+ *
+ * SWFReader-->TagParser-->ExtractText
+ *
+ * SWFReader reads the input SWF file and separates out the header and the tags.
+ * The separated contents are passed to TagParser which parses out the
+ * individual tag types and passes them to ExtractText.
+ *
+ * ExtractText extends SWFTagTypesImpl and overrides some methods.
+ */
+class ExtractText extends SWFTagTypesImpl {
+ /**
+ * Store font info keyed by the font symbol id. Each entry is an int[] of
+ * character codes for the corresponding font glyphs (An empty array denotes a
+ * System Font).
+ */
+ protected HashMap<Integer, int[]> fontCodes = new HashMap<Integer, int[]>();
+
+ public ArrayList<String> strings = new ArrayList<String>();
+
+ public HashSet<String> actionStrings = new HashSet<String>();
+
+ public ArrayList<String> urls = new ArrayList<String>();
+
+ public ExtractText() {
+ super(null);
+ }
+
+ public String getText() { // concatenates the collected text fragments
+ StringBuilder joined = new StringBuilder();
+ for (String fragment : strings) {
+ if (joined.length() != 0) { // no separator while the result is still empty
+ joined.append(' ');
+ }
+ joined.append(fragment);
+ }
+ return joined.toString();
+ }
+
+ public String getActionText() { // action strings, sorted, one per line
+ String[] sorted = actionStrings.toArray(new String[actionStrings.size()]);
+ Arrays.sort(sorted); // the backing HashSet has no defined order
+ StringBuilder lines = new StringBuilder();
+ // Empty before the first entry, a newline before every later one.
+ String separator = "";
+ for (String line : sorted) {
+ lines.append(separator).append(line);
+ separator = "\n";
+ }
+ return lines.toString();
+ }
+
+ /**
+ * Returns the URLs collected so far, in the order they were recorded.
+ *
+ * @return a newly allocated array holding a copy of the URL list; never
+ * null, empty when nothing was collected
+ */
+ public String[] getUrls() {
+ // toArray copies the ArrayList in iteration order into a fresh array.
+ return urls.toArray(new String[urls.size()]);
+ }
+
+ public void tagDefineFontInfo2(int arg0, String arg1, int arg2, int[] arg3,
+ int arg4) throws IOException { // arg4 is not forwarded
+ tagDefineFontInfo(arg0, arg1, arg2, arg3); // same handling as tagDefineFontInfo
+ }
+
+ /**
+ * SWFTagTypes interface. Saves the character codes of a text font, keyed by
+ * the font id; fontName and flags are not used.
+ */
+ public void tagDefineFontInfo(int fontId, String fontName, int flags,
+ int[] codes) throws IOException {
+ // codes is stored as given; TextDumper.text() treats a null entry as unknown.
+ fontCodes.put(Integer.valueOf(fontId), codes);
+ }
+
+ // XXX too much hassle for too little return ... we cannot guess character
+ // XXX codes anyway, so we just give up.
+ /*
+ * public SWFVectors tagDefineFont(int arg0, int arg1) throws IOException {
+ * return null; }
+ */
+
+ /**
+ * SWFTagTypes interface. Saves the character codes of the font under its id.
+ */
+ public SWFVectors tagDefineFont2(int id, int flags, String name,
+ int numGlyphs, int ascent, int descent, int leading, int[] codes,
+ int[] advances, Rect[] bounds, int[] kernCodes1, int[] kernCodes2,
+ int[] kernAdjustments) throws IOException {
+ // A missing code table is stored as an empty array (a System Font).
+ int[] known = (codes != null) ? codes : new int[0];
+ fontCodes.put(Integer.valueOf(id), known);
+ return null;
+ }
+
+ /**
+ * SWFTagTypes interface. Dump any initial text in the field.
+ */
+ public void tagDefineTextField(int fieldId, String fieldName,
+ String initialText, Rect boundary, int flags, AlphaColor textColor,
+ int alignment, int fontId, int fontSize, int charLimit, int leftMargin,
+ int rightMargin, int indentation, int lineSpacing) throws IOException {
+ if (initialText != null) {
+ strings.add(initialText);
+ }
+ }
+
+ /**
+ * SWFTagTypes interface
+ */
+ public SWFText tagDefineText(int id, Rect bounds, Matrix matrix)
+ throws IOException {
+ lastBounds = curBounds;
+ curBounds = bounds;
+ return new TextDumper();
+ }
+
+ Rect lastBounds = null;
+ Rect curBounds = null;
+
+ /**
+ * SWFTagTypes interface
+ */
+ public SWFText tagDefineText2(int id, Rect bounds, Matrix matrix)
+ throws IOException {
+ lastBounds = curBounds;
+ curBounds = bounds;
+ return new TextDumper();
+ }
+
+ public class TextDumper implements SWFText {
+ protected Integer fontId;
+
+ protected boolean firstY = true;
+
+ public void font(int fontId, int textHeight) { // textHeight is not used
+ this.fontId = Integer.valueOf(fontId); // boxed: it is the fontCodes lookup key
+ }
+
+ public void setY(int y) {
+ if (firstY)
+ firstY = false;
+ else
+ strings.add("\n"); // Change in Y - dump a new line
+ }
+
+ /*
+ * There are some issues with this method: sometimes SWF files define their
+ * own font, so short of OCR we cannot guess what is the glyph code ->
+ * character mapping. Additionally, some files don't use literal space
+ * character, instead they adjust glyphAdvances. We don't handle it at all -
+ * in such cases the text will be all glued together.
+ */
+ public void text(int[] glyphIndices, int[] glyphAdvances) {
+ // System.out.println("-text id=" + fontId);
+ int[] codes = (int[]) fontCodes.get(fontId);
+ if (codes == null) {
+ // unknown font, better not guess
+ strings.add("\n**** ?????????????? ****\n");
+ return;
+ }
+
+ // --Translate the glyph indices to character codes
+ char[] chars = new char[glyphIndices.length];
+
+ for (int i = 0; i < chars.length; i++) {
+ int index = glyphIndices[i];
+
+ if (index >= codes.length) // System Font ?
+ {
+ chars[i] = (char) index;
+ } else {
+ chars[i] = (char) (codes[index]);
+ }
+ // System.out.println("-ch[" + i + "]='" + chars[i] + "'(" +
+ // (int)chars[i] + ") +" + glyphAdvances[i]);
+ }
+ strings.add(new String(chars));
+ }
+
+ public void color(Color color) {
+ }
+
+ public void setX(int x) {
+ }
+
+ public void done() {
+ strings.add("\n");
+ }
+ }
+
+ public SWFActions tagDoAction() throws IOException {
+ // ActionTextWriter actions = new ActionTextWriter(new
+ // PrintWriter(System.out));
+ NutchSWFActions actions = new NutchSWFActions(actionStrings, urls);
+ return actions;
+ }
+
+ public SWFActions tagDoInitAction(int arg0) throws IOException {
+ // ActionTextWriter actions = new ActionTextWriter(new
+ // PrintWriter(System.out));
+ NutchSWFActions actions = new NutchSWFActions(actionStrings, urls);
+ return actions;
+ }
+
+ public void tagGeneratorFont(byte[] arg0) throws IOException {
+ // TODO Auto-generated method stub
+ super.tagGeneratorFont(arg0);
+ }
+
+ public void tagGeneratorText(byte[] arg0) throws IOException {
+ // TODO Auto-generated method stub
+ super.tagGeneratorText(arg0);
+ }
+
+}
+
+/**
+ * ActionScript parser. This parser tries to extract free text embedded inside
+ * the script, but without polluting it too much with names of variables,
+ * methods, etc. Not ideal, but it works.
+ */
+class NutchSWFActions extends SWFActionBlockImpl implements SWFActions {
+ private HashSet<String> strings = null;
+
+ private ArrayList<String> urls = null;
+
+ String[] dict = null;
+
+ Stack<Object> stack = null;
+
+ public NutchSWFActions(HashSet<String> strings, ArrayList<String> urls) {
+ this.strings = strings;
+ this.urls = urls;
+ stack = new SmallStack(100, strings);
+ }
+
+ public void lookupTable(String[] values) throws IOException {
+ // Every entry is candidate text; the set itself ignores duplicates.
+ for (String entry : values) {
+ strings.add(entry);
+ }
+ super.lookupTable(values);
+ dict = values; // kept so lookup(int) and push(int) can resolve indices
+ }
+
+ public void defineLocal() throws IOException {
+ stack.pop();
+ super.defineLocal();
+ }
+
+ public void getURL(int vars, int mode) {
+ // System.out.println("-getURL: vars=" + vars + ", mode=" + mode);
+ }
+
+ public void getURL(String url, String target) throws IOException {
+ // System.out.println("-getURL: url=" + url + ", target=" + target);
+ stack.push(url);
+ stack.push(target);
+ strings.remove(url);
+ strings.remove(target);
+ urls.add(url);
+ super.getURL(url, target);
+ }
+
+ public SWFActionBlock.TryCatchFinally _try(String var) throws IOException {
+ // stack.push(var);
+ strings.remove(var);
+ return super._try(var);
+ }
+
+ public void comment(String var) throws IOException {
+ // stack.push(var);
+ strings.remove(var);
+ super.comment(var);
+ }
+
+ public void goToFrame(String var) throws IOException { // sic: capital "T"
+ stack.push(var);
+ strings.remove(var);
+ super.gotoFrame(var); // NOTE(review): class also declares gotoFrame(String); confirm callers
+ }
+
+ public void ifJump(String var) throws IOException {
+ strings.remove(var);
+ super.ifJump(var);
+ }
+
+ public void jump(String var) throws IOException {
+ strings.remove(var);
+ super.jump(var);
+ }
+
+ public void jumpLabel(String var) throws IOException {
+ strings.remove(var);
+ super.jumpLabel(var);
+ }
+
+ public void lookup(int var) throws IOException {
+ if (dict != null && var >= 0 && var < dict.length) {
+ stack.push(dict[var]);
+ }
+ super.lookup(var);
+ }
+
+ public void push(String var) throws IOException {
+ stack.push(var);
+ strings.remove(var);
+ super.push(var);
+ }
+
+ public void setTarget(String var) throws IOException {
+ stack.push(var);
+ strings.remove(var);
+ super.setTarget(var);
+ }
+
+ public SWFActionBlock startFunction(String var, String[] params)
+ throws IOException {
+ stack.push(var);
+ strings.remove(var);
+ if (params != null) {
+ for (int i = 0; i < params.length; i++) {
+ strings.remove(params[i]);
+ }
+ }
+ return this;
+ }
+
+ public SWFActionBlock startFunction2(String var, int arg1, int arg2,
+ String[] params, int[] arg3) throws IOException {
+ stack.push(var);
+ strings.remove(var);
+ if (params != null) {
+ for (int i = 0; i < params.length; i++) {
+ strings.remove(params[i]);
+ }
+ }
+ return this;
+ }
+
+ public void waitForFrame(int num, String var) throws IOException {
+ stack.push(var);
+ strings.remove(var);
+ super.waitForFrame(num, var);
+ }
+
+ public void waitForFrame(String var) throws IOException {
+ stack.push(var);
+ strings.remove(var);
+ super.waitForFrame(var);
+ }
+
+ public void done() throws IOException {
+ while (stack.size() > 0) {
+ strings.remove(stack.pop());
+ }
+ }
+
+ public SWFActionBlock start(int arg0, int arg1) throws IOException {
+ return this;
+ }
+
+ public SWFActionBlock start(int arg0) throws IOException {
+ return this;
+ }
+
+ public void add() throws IOException {
+ super.add();
+ }
+
+ public void asciiToChar() throws IOException {
+ super.asciiToChar();
+ }
+
+ public void asciiToCharMB() throws IOException {
+ super.asciiToCharMB();
+ }
+
+ public void push(int var) throws IOException {
+ if (dict != null && var >= 0 && var < dict.length) {
+ stack.push(dict[var]);
+ }
+ super.push(var);
+ }
+
+ public void callFunction() throws IOException {
+ strings.remove(stack.pop());
+ super.callFunction();
+ }
+
+ public void callMethod() throws IOException {
+ strings.remove(stack.pop());
+ super.callMethod();
+ }
+
+ public void getMember() throws IOException {
+ // 0: name
+ String val = (String) stack.pop();
+ strings.remove(val);
+ super.getMember();
+ }
+
+ public void setMember() throws IOException {
+ // 0: value -1: name
+ stack.pop(); // value
+ String name = (String) stack.pop();
+ strings.remove(name);
+ super.setMember();
+ }
+
+ public void setProperty() throws IOException {
+ super.setProperty();
+ }
+
+ public void setVariable() throws IOException {
+ super.setVariable();
+ }
+
+ public void call() throws IOException {
+ strings.remove(stack.pop());
+ super.call();
+ }
+
+ public void setTarget() throws IOException {
+ strings.remove(stack.pop());
+ super.setTarget();
+ }
+
+ public void pop() throws IOException {
+ strings.remove(stack.pop());
+ super.pop();
+ }
+
+ public void push(boolean arg0) throws IOException {
+ stack.push("" + arg0);
+ super.push(arg0);
+ }
+
+ public void push(double arg0) throws IOException {
+ stack.push("" + arg0);
+ super.push(arg0);
+ }
+
+ public void push(float arg0) throws IOException {
+ stack.push("" + arg0);
+ super.push(arg0);
+ }
+
+ public void pushNull() throws IOException {
+ stack.push("");
+ super.pushNull();
+ }
+
+ public void pushRegister(int arg0) throws IOException {
+ stack.push("" + arg0);
+ super.pushRegister(arg0);
+ }
+
+ public void pushUndefined() throws IOException {
+ stack.push("???");
+ super.pushUndefined();
+ }
+
+ public void getProperty() throws IOException {
+ stack.pop();
+ super.getProperty();
+ }
+
+ public void getVariable() throws IOException {
+ strings.remove(stack.pop());
+ super.getVariable();
+ }
+
+ public void gotoFrame(boolean arg0) throws IOException {
+ stack.push("" + arg0);
+ super.gotoFrame(arg0);
+ }
+
+ public void gotoFrame(int arg0) throws IOException {
+ stack.push("" + arg0);
+ super.gotoFrame(arg0);
+ }
+
+ public void gotoFrame(String arg0) throws IOException {
+ stack.push("" + arg0);
+ strings.remove(arg0);
+ super.gotoFrame(arg0);
+ }
+
+ public void newObject() throws IOException {
+ stack.pop();
+ super.newObject();
+ }
+
+ public SWFActionBlock startWith() throws IOException {
+ return this;
+ }
+
+}
+
+/*
+ * Small bottom-less stack: overflow silently drops the oldest (bottom) entry
+ * and popping an empty stack returns null.
+ */
+class SmallStack extends Stack<Object> {
+
+ private static final long serialVersionUID = 1L;
+
+ private int maxSize;
+
+ private HashSet<String> strings = null;
+
+ public SmallStack(int maxSize, HashSet<String> strings) {
+ this.maxSize = maxSize;
+ this.strings = strings;
+ }
+
+ /** Pushes o; when over capacity, first evicts the bottom entry. */
+ public Object push(Object o) {
+ // The check precedes the insert, so up to maxSize + 1 entries are kept.
+ if (this.size() > maxSize) {
+ // No String cast: entries are Objects and Set.remove accepts any Object.
+ Object evicted = remove(0);
+ strings.remove(evicted); // an evicted value also leaves the string set
+ }
+ return super.push(o);
+ }
+
+ /** Pops the top entry, or returns null when the stack is empty. */
+ public Object pop() {
+ return isEmpty() ? null : super.pop();
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/package-info.java b/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/package-info.java
new file mode 100644
index 0000000..5942e64
--- /dev/null
+++ b/nutch-plugins/parse-swf/src/main/java/org/apache/nutch/parse/swf/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Parse Flash SWF files.
+ */
+package org.apache.nutch.parse.swf;
+
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/java/org/apache/nutch/parse/swf/TestSWFParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/test/java/org/apache/nutch/parse/swf/TestSWFParser.java b/nutch-plugins/parse-swf/src/test/java/org/apache/nutch/parse/swf/TestSWFParser.java
new file mode 100644
index 0000000..129b85f
--- /dev/null
+++ b/nutch-plugins/parse-swf/src/test/java/org/apache/nutch/parse/swf/TestSWFParser.java
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.swf;
+
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Unit tests for SWFParser.
+ */
+public class TestSWFParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+
+  /** SWF sample files that are fetched and parsed by {@link #testIt()}. */
+  private String[] sampleFiles = new String[] { "test1.swf", "test2.swf",
+      "test3.swf" };
+
+  /**
+   * Initially the names of the files holding the expected text, one per
+   * entry of {@link #sampleFiles}. The constructor replaces each name with
+   * the whitespace-normalized content of that file.
+   */
+  private String[] sampleTexts = new String[] { "test1.txt", "test2.txt",
+      "test3.txt" };
+
+  /**
+   * Fetches each sample SWF file through the protocol resolved for its
+   * "file:" URL, parses the content and compares the whitespace-normalized
+   * parse text with the expected text loaded by the constructor.
+   *
+   * @throws ProtocolException
+   *           declared for the protocol lookup/fetch calls
+   * @throws ParseException
+   *           declared for the parse call
+   */
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    String urlString;
+    Protocol protocol;
+    Content content;
+    Parse parse;
+    Configuration conf = NutchConfiguration.create();
+
+    for (int i = 0; i < sampleFiles.length; i++) {
+      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+
+      parse = new ParseUtil(conf).parse(content).get(content.getUrl());
+
+      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
+      // assertEquals reports both the expected and the actual text when the
+      // comparison fails; assertTrue(expected.equals(actual)) reports nothing.
+      Assert.assertEquals("Unexpected text parsed from " + sampleFiles[i],
+          sampleTexts[i], text);
+    }
+  }
+
+  /**
+   * Loads the expected text for every sample: each entry of
+   * {@link #sampleTexts} is read as UTF-8 from the sample directory,
+   * whitespace runs are collapsed to a single space, and the result replaces
+   * the file name in the array.
+   */
+  public TestSWFParser() {
+    for (int i = 0; i < sampleTexts.length; i++) {
+      try {
+        // read the test string
+        FileInputStream fis = new FileInputStream(sampleDir + fileSeparator
+            + sampleTexts[i]);
+        try {
+          StringBuilder sb = new StringBuilder();
+          int len = 0;
+          InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
+          char[] buf = new char[1024];
+          while ((len = isr.read(buf)) > 0) {
+            sb.append(buf, 0, len);
+          }
+          sampleTexts[i] = sb.toString().replaceAll("[ \t\r\n]+", " ").trim();
+        } finally {
+          // Release the file handle even when reading fails part-way.
+          fis.close();
+        }
+      } catch (Exception e) {
+        // Best effort: the entry keeps its file name, so testIt() fails on
+        // the comparison for this sample instead of the constructor throwing.
+        e.printStackTrace();
+      }
+    }
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/resources/test1.swf
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/test/resources/test1.swf b/nutch-plugins/parse-swf/src/test/resources/test1.swf
new file mode 100644
index 0000000..cd2019b
Binary files /dev/null and b/nutch-plugins/parse-swf/src/test/resources/test1.swf differ
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/resources/test1.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/test/resources/test1.txt b/nutch-plugins/parse-swf/src/test/resources/test1.txt
new file mode 100644
index 0000000..68505d5
--- /dev/null
+++ b/nutch-plugins/parse-swf/src/test/resources/test1.txt
@@ -0,0 +1,60 @@
+
+--------
+/go/gnav_cart
+/go/gnav_company
+/go/gnav_devnet
+/go/gnav_downloads
+/go/gnav_fl_minmessage
+/go/gnav_help
+/go/gnav_mm_home
+/go/gnav_products
+/go/gnav_search?loc=en_us
+/go/gnav_showcase
+/go/gnav_solutions
+/go/gnav_store
+/go/gnav_support
+/go/gnav_your_account
+Acquisition Info
+Adobe Home
+AppleGothic
+Array
+Company
+Developers
+Downloads
+Help
+Home
+International
+LocaleManager
+Macromedia Flash Player
+Macromedia Home
+MovieClip
+Products
+Showcase
+Solutions
+Store
+String
+Support
+TextFormat
+To ensure the best possible Internet Experience, please download the latest version of the free
+Verdana
+_sans
+active
+bluePill
+button
+color
+company
+devnet
+downloads
+en_us
+home
+javascript:openCrosslinkWindow('/go/adobeacquisition')
+javascript:openCrosslinkWindow('/go/gnav_adobe_home')
+products
+rollOut
+rollOver
+selected
+showcase
+solutions
+support
+tabHolder
+textColor
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/resources/test2.swf
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/test/resources/test2.swf b/nutch-plugins/parse-swf/src/test/resources/test2.swf
new file mode 100644
index 0000000..eb9b03d
Binary files /dev/null and b/nutch-plugins/parse-swf/src/test/resources/test2.swf differ
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/resources/test2.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/test/resources/test2.txt b/nutch-plugins/parse-swf/src/test/resources/test2.txt
new file mode 100644
index 0000000..f77b78a
--- /dev/null
+++ b/nutch-plugins/parse-swf/src/test/resources/test2.txt
@@ -0,0 +1,5 @@
+Impact Impact Impact Arial Arial Arial Webdings Webdings Webdings Verdana Verdana Verdana CourierNew CourierNew CourierNew Bimini Bimini Bimini
+--------
+TextFormat
+color
+font
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/resources/test3.swf
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/test/resources/test3.swf b/nutch-plugins/parse-swf/src/test/resources/test3.swf
new file mode 100644
index 0000000..4df9f1e
Binary files /dev/null and b/nutch-plugins/parse-swf/src/test/resources/test3.swf differ
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-swf/src/test/resources/test3.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-swf/src/test/resources/test3.txt b/nutch-plugins/parse-swf/src/test/resources/test3.txt
new file mode 100644
index 0000000..66ae3d8
--- /dev/null
+++ b/nutch-plugins/parse-swf/src/test/resources/test3.txt
@@ -0,0 +1,11 @@
+Mix.
+ Edit.
+ Master.
+ Compose.
+ Animate.
+ With a single suite of powerful tools
+ that work together as one.
+ World-class video and audio tools that bring
+ new power and efficiency to your film, video,
+ DVD, and web workflows.
+ Learn more.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/build-ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/build-ivy.xml b/nutch-plugins/parse-tika/build-ivy.xml
new file mode 100644
index 0000000..e4984d8
--- /dev/null
+++ b/nutch-plugins/parse-tika/build-ivy.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-tika" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+ <property name="ivy.install.version" value="2.1.0" />
+ <condition property="ivy.home" value="${env.IVY_HOME}">
+ <isset property="env.IVY_HOME" />
+ </condition>
+ <property name="ivy.home" value="${user.home}/.ant" />
+ <property name="ivy.checksums" value="" />
+ <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+ <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+ <target name="download-ivy" unless="offline">
+
+   <mkdir dir="${ivy.jar.dir}"/>
+   <!-- download Ivy from Maven Central so that it can be used even without any
+        special installation; HTTPS is used because Maven Central no longer
+        serves artifacts over plain HTTP -->
+   <get src="https://repo1.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar"
+        dest="${ivy.jar.file}" usetimestamp="true"/>
+ </target>
+
+ <target name="init-ivy" depends="download-ivy">
+ <!-- try to load ivy here from ivy home, in case the user has not already dropped
+ it into ant's lib dir (note that the latter copy will always take precedence).
+ We will not fail as long as local lib dir exists (it may be empty) and
+ ivy is in at least one of ant's lib dir or the local lib dir. -->
+ <path id="ivy.lib.path">
+ <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+ </path>
+ <taskdef resource="org/apache/ivy/ant/antlib.xml"
+ uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+ </target>
+
+ <target name="deps-jar" depends="init-ivy">
+ <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]" sync="true"/>
+ </target>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/build.xml b/nutch-plugins/parse-tika/build.xml
new file mode 100644
index 0000000..4ecb3f8
--- /dev/null
+++ b/nutch-plugins/parse-tika/build.xml
@@ -0,0 +1,55 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-tika" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+ <!-- Build compilation dependencies -->
+ <target name="deps-jar">
+ <ant target="jar" inheritall="false" dir="../lib-nekohtml" />
+ </target>
+
+ <!-- Add compilation dependencies to classpath -->
+ <path id="plugin.deps">
+ <fileset dir="${nutch.root}/build">
+ <include name="**/lib-nekohtml/*.jar" />
+ </fileset>
+ </path>
+
+ <!-- Deploy Unit test dependencies -->
+ <target name="deps-test">
+ <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+ <ant target="deploy" inheritall="false" dir="../protocol-file"/>
+ <ant target="deploy" inheritall="false" dir="../lib-nekohtml" />
+ </target>
+
+ <!-- for junit test -->
+ <mkdir dir="${build.test}/data"/>
+ <copy todir="${build.test}/data">
+ <fileset dir="sample">
+ <include name="*.rss"/>
+ <include name="*.rtf"/>
+ <include name="*.pdf"/>
+ <include name="ootest.*"/>
+ <include name="*.doc"/>
+ <include name="*.gif"/>
+ </fileset>
+ </copy>
+
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/howto_upgrade_tika.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/howto_upgrade_tika.txt b/nutch-plugins/parse-tika/howto_upgrade_tika.txt
new file mode 100644
index 0000000..63a05a4
--- /dev/null
+++ b/nutch-plugins/parse-tika/howto_upgrade_tika.txt
@@ -0,0 +1,8 @@
+1. Upgrade Tika dependency in trunk/ivy/ivy.xml
+
+2. Upgrade Tika dependency in src/plugin/parse-tika/ivy.xml
+
+3. Upgrade Tika's own dependencies in src/plugin/parse-tika/plugin.xml
+ To get the list of dependencies and their versions execute:
+ $ ant -f ./build-ivy.xml
+ $ ls lib | sed 's/^/ <library name="/g' | sed 's/$/"\/>/g'
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/ivy.xml b/nutch-plugins/parse-tika/ivy.xml
new file mode 100644
index 0000000..7a9e959
--- /dev/null
+++ b/nutch-plugins/parse-tika/ivy.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../../ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ <dependency org="org.apache.tika" name="tika-parsers" rev="1.12" conf="*->default">
+ <exclude org="org.apache.tika" name="tika-core" />
+ <exclude org="org.apache.httpcomponents" name="httpclient" />
+ <exclude org="org.apache.httpcomponents" name="httpcore" />
+ </dependency>
+ </dependencies>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/plugin.xml b/nutch-plugins/parse-tika/plugin.xml
new file mode 100644
index 0000000..04fcd2e
--- /dev/null
+++ b/nutch-plugins/parse-tika/plugin.xml
@@ -0,0 +1,136 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="parse-tika"
+ name="Tika Parser Plug-in"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="parse-tika.jar">
+ <export name="*"/>
+ </library>
+ <library name="apache-mime4j-core-0.7.2.jar"/>
+ <library name="apache-mime4j-dom-0.7.2.jar"/>
+ <library name="asm-5.0.4.jar"/>
+ <library name="aspectjrt-1.8.0.jar"/>
+ <library name="bcmail-jdk15on-1.52.jar"/>
+ <library name="bcpkix-jdk15on-1.52.jar"/>
+ <library name="bcprov-jdk15on-1.52.jar"/>
+ <library name="boilerpipe-1.1.0.jar"/>
+ <library name="bzip2-0.9.1.jar"/>
+ <library name="c3p0-0.9.1.1.jar"/>
+ <library name="cdm-4.5.5.jar"/>
+ <library name="commons-codec-1.6.jar"/>
+ <library name="commons-compress-1.10.jar"/>
+ <library name="commons-csv-1.0.jar"/>
+ <library name="commons-exec-1.3.jar"/>
+ <library name="commons-io-2.4.jar"/>
+ <library name="commons-lang-2.6.jar"/>
+ <library name="commons-logging-1.1.3.jar"/>
+ <library name="commons-logging-api-1.1.jar"/>
+ <library name="commons-vfs2-2.0.jar"/>
+ <library name="cxf-core-3.0.3.jar"/>
+ <library name="cxf-rt-frontend-jaxrs-3.0.3.jar"/>
+ <library name="cxf-rt-rs-client-3.0.3.jar"/>
+ <library name="cxf-rt-transports-http-3.0.3.jar"/>
+ <library name="ehcache-core-2.6.2.jar"/>
+ <library name="fontbox-1.8.10.jar"/>
+ <library name="geoapi-3.0.0.jar"/>
+ <library name="grib-4.5.5.jar"/>
+ <library name="gson-2.2.4.jar"/>
+ <library name="guava-17.0.jar"/>
+ <library name="httpmime-4.2.6.jar"/>
+ <library name="httpservices-4.5.5.jar"/>
+ <library name="isoparser-1.0.2.jar"/>
+ <library name="jackcess-2.1.2.jar"/>
+ <library name="jackcess-encrypt-2.1.1.jar"/>
+ <library name="java-libpst-0.8.1.jar"/>
+ <library name="javax.annotation-api-1.2.jar"/>
+ <library name="javax.ws.rs-api-2.0.1.jar"/>
+ <library name="jcip-annotations-1.0.jar"/>
+ <library name="jcommander-1.35.jar"/>
+ <library name="jdom-2.0.2.jar"/>
+ <library name="jdom2-2.0.4.jar"/>
+ <library name="jempbox-1.8.10.jar"/>
+ <library name="jhighlight-1.0.2.jar"/>
+ <library name="jj2000-5.2.jar"/>
+ <library name="jmatio-1.0.jar"/>
+ <library name="jna-4.1.0.jar"/>
+ <library name="joda-time-2.2.jar"/>
+ <library name="json-20140107.jar"/>
+ <library name="json-simple-1.1.1.jar"/>
+ <library name="jsoup-1.7.2.jar"/>
+ <library name="jsr-275-0.9.3.jar"/>
+ <library name="juniversalchardet-1.0.3.jar"/>
+ <library name="junrar-0.7.jar"/>
+ <library name="jwnl-1.3.3.jar"/>
+ <library name="maven-scm-api-1.4.jar"/>
+ <library name="maven-scm-provider-svn-commons-1.4.jar"/>
+ <library name="maven-scm-provider-svnexe-1.4.jar"/>
+ <library name="metadata-extractor-2.8.0.jar"/>
+ <library name="netcdf4-4.5.5.jar"/>
+ <library name="opennlp-maxent-3.0.3.jar"/>
+ <library name="opennlp-tools-1.5.3.jar"/>
+ <library name="pdfbox-1.8.10.jar"/>
+ <library name="plexus-utils-1.5.6.jar"/>
+ <library name="poi-3.13.jar"/>
+ <library name="poi-ooxml-3.13.jar"/>
+ <library name="poi-ooxml-schemas-3.13.jar"/>
+ <library name="poi-scratchpad-3.13.jar"/>
+ <library name="protobuf-java-2.5.0.jar"/>
+ <library name="quartz-2.2.0.jar"/>
+ <library name="regexp-1.3.jar"/>
+ <library name="rome-1.5.1.jar"/>
+ <library name="rome-utils-1.5.1.jar"/>
+ <library name="sis-metadata-0.5.jar"/>
+ <library name="sis-netcdf-0.5.jar"/>
+ <library name="sis-referencing-0.5.jar"/>
+ <library name="sis-storage-0.5.jar"/>
+ <library name="sis-utility-0.5.jar"/>
+ <library name="slf4j-api-1.7.12.jar"/>
+ <library name="stax2-api-3.1.4.jar"/>
+ <library name="tagsoup-1.2.1.jar"/>
+ <library name="tika-parsers-1.12.jar"/>
+ <library name="udunits-4.5.5.jar"/>
+ <library name="vorbis-java-core-0.6.jar"/>
+ <library name="vorbis-java-tika-0.6.jar"/>
+ <library name="woodstox-core-asl-4.4.1.jar"/>
+ <library name="xmlbeans-2.6.0.jar"/>
+ <library name="xmlschema-core-2.1.0.jar"/>
+ <library name="xmpcore-5.1.2.jar"/>
+ <library name="xz-1.5.jar"/>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ <import plugin="lib-nekohtml"/>
+ </requires>
+
+ <extension point="org.apache.nutch.parse.Parser"
+ id="org.apache.nutch.parse.tika"
+ name="TikaParser">
+
+ <implementation id="org.apache.nutch.parse.tika.TikaParser"
+ class="org.apache.nutch.parse.tika.TikaParser">
+ <parameter name="contentType" value="*"/>
+ </implementation>
+
+ </extension>
+
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/pom.xml b/nutch-plugins/parse-tika/pom.xml
new file mode 100644
index 0000000..0cf2340
--- /dev/null
+++ b/nutch-plugins/parse-tika/pom.xml
@@ -0,0 +1,54 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>parse-tika</artifactId>
+ <packaging>jar</packaging>
+
+ <name>parse-tika</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parsers</artifactId>
+ <version>1.13</version>
+ <exclusions>
+ <!-- TODO -->
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>lib-nekohtml</artifactId>
+ <version>${project.parent.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
new file mode 100644
index 0000000..7c0d71b
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.tika;
+
+import java.lang.ClassLoader;
+import java.lang.InstantiationException;
+import java.util.HashMap;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+import de.l3s.boilerpipe.extractors.*;
+
+class BoilerpipeExtractorRepository {
+
+ public static final Log LOG = LogFactory.getLog(BoilerpipeExtractorRepository.class);
+ public static final HashMap<String, BoilerpipeExtractor> extractorRepository = new HashMap<String, BoilerpipeExtractor>();
+
+ /**
+ * Returns an instance of the specified extractor
+ */
+ public static synchronized BoilerpipeExtractor getExtractor(String boilerpipeExtractorName) {
+ // Check if there's no instance of this extractor
+ if (!extractorRepository.containsKey(boilerpipeExtractorName)) {
+ // FQCN
+ boilerpipeExtractorName = "de.l3s.boilerpipe.extractors." + boilerpipeExtractorName;
+
+ // Attempt to load the class
+ try {
+ ClassLoader loader = BoilerpipeExtractor.class.getClassLoader();
+ Class extractorClass = loader.loadClass(boilerpipeExtractorName);
+
+ // Add an instance to the repository
+ extractorRepository.put(boilerpipeExtractorName, (BoilerpipeExtractor)extractorClass.newInstance());
+
+ } catch (ClassNotFoundException e) {
+ LOG.error("BoilerpipeExtractor " + boilerpipeExtractorName + " not found!");
+ } catch (InstantiationException e) {
+ LOG.error("Could not instantiate " + boilerpipeExtractorName);
+ } catch (Exception e) {
+ LOG.error(e);
+ }
+ }
+
+ return extractorRepository.get(boilerpipeExtractorName);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMBuilder.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMBuilder.java b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMBuilder.java
new file mode 100644
index 0000000..77a1044
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/main/java/org/apache/nutch/parse/tika/DOMBuilder.java
@@ -0,0 +1,794 @@
+/*
+ * XXX ab@apache.org: This class is copied verbatim from Xalan-J 2.6.0
+ * XXX distribution, org.apache.xml.utils.DOMBuilder, in order to
+ * avoid dependency on Xalan.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * $Id: DOMBuilder.java 823614 2009-10-09 17:02:32Z ab $
+ */
+package org.apache.nutch.parse.tika;
+
+import java.util.Stack;
+
+import org.w3c.dom.Comment;
+import org.w3c.dom.Document;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.Text;
+import org.w3c.dom.CDATASection;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.ext.LexicalHandler;
+
+/**
+ * This class takes SAX events (in addition to some extra events that SAX
+ * doesn't handle yet) and adds the result to a document or document fragment.
+ */
+class DOMBuilder implements ContentHandler, LexicalHandler {
+ private boolean upperCaseElementNames = true;
+
+ /** Root document */
+ public Document m_doc;
+
+ /** Current node */
+ protected Node m_currentNode = null;
+
+ /** First node of document fragment or null if not a DocumentFragment */
+ public DocumentFragment m_docFrag = null;
+
+ /** Vector of element nodes */
+ protected Stack<Element> m_elemStack = new Stack<Element>();
+
+ /**
+ * Element recorded with this namespace will be converted to Node without a
+ * namespace
+ */
+ private String defaultNamespaceURI = null;
+
+ /**
+ * DOMBuilder instance constructor... it will add the DOM nodes beneath the
+ * given node.
+ *
+ * @param doc
+ * Root document
+ * @param node
+ * Current node; while the current node is non-null,
+ * {@link #append(Node)} attaches new nodes to it as children
+ */
+ DOMBuilder(Document doc, Node node) {
+ m_doc = doc;
+ m_currentNode = node;
+ }
+
+ /**
+ * DOMBuilder instance constructor... it will add the DOM nodes to the
+ * document fragment.
+ *
+ * <p>
+ * The current node is not set by this constructor, so while it remains null
+ * {@link #append(Node)} attaches new nodes to the fragment.
+ * </p>
+ *
+ * @param doc
+ * Root document
+ * @param docFrag
+ * Document fragment
+ */
+ DOMBuilder(Document doc, DocumentFragment docFrag) {
+ m_doc = doc;
+ m_docFrag = docFrag;
+ }
+
+ /**
+ * DOMBuilder instance constructor... it will add the DOM nodes to the
+ * document.
+ *
+ * <p>
+ * Neither a current node nor a document fragment is set by this constructor,
+ * so while both remain null {@link #append(Node)} attaches new nodes
+ * directly to the document.
+ * </p>
+ *
+ * @param doc
+ * Root document
+ */
+ DOMBuilder(Document doc) {
+ m_doc = doc;
+ }
+
+ /**
+  * Returns the root of the DOM under construction: the document fragment when
+  * one has been set, otherwise the document.
+  *
+  * @return the document fragment if it is not null, else the root document
+  */
+ Node getRootNode() {
+   if (m_docFrag != null) {
+     return m_docFrag;
+   }
+   return m_doc;
+ }
+
+ /**
+ * Get the node currently being processed.
+ *
+ * @return the current node being processed; null when no current node has
+ * been set (the field is initialized to null)
+ */
+ Node getCurrentNode() {
+ return m_currentNode;
+ }
+
+ /**
+ * Return null since there is no Writer for this class.
+ *
+ * @return always null; the method body returns null unconditionally
+ */
+ java.io.Writer getWriter() {
+ return null;
+ }
+
+ /**
+  * Attaches a node to whichever container is active: the current node if one
+  * is set, otherwise the document fragment if one is set, otherwise the
+  * document itself.
+  *
+  * @param newNode
+  *          New node to append
+  * @throws org.xml.sax.SAXException
+  *           if non-whitespace text is appended directly to the document, or
+  *           if an element is appended directly to a document that already
+  *           has a document element
+  */
+ protected void append(Node newNode) throws org.xml.sax.SAXException {
+
+   Node parent = m_currentNode;
+
+   if (parent != null) {
+     parent.appendChild(newNode);
+     return;
+   }
+
+   if (m_docFrag != null) {
+     m_docFrag.appendChild(newNode);
+     return;
+   }
+
+   // From here on the node would become a direct child of the document.
+   short nodeType = newNode.getNodeType();
+
+   if (nodeType == Node.TEXT_NODE) {
+     String text = newNode.getNodeValue();
+
+     if (text != null && text.trim().length() > 0) {
+       throw new org.xml.sax.SAXException(
+           "Warning: can't output text before document element! Ignoring...");
+     }
+
+     // Null or whitespace-only text at document level is dropped.
+     return;
+   }
+
+   if (nodeType == Node.ELEMENT_NODE && m_doc.getDocumentElement() != null) {
+     throw new org.xml.sax.SAXException(
+         "Can't have more than one root on a DOM!");
+   }
+
+   m_doc.appendChild(newNode);
+ }
+
+ /**
+ * Receive an object for locating the origin of SAX document events.
+ *
+ * <p>
+ * SAX parsers are strongly encouraged (though not absolutely required) to
+ * supply a locator: if it does so, it must supply the locator to the
+ * application by invoking this method before invoking any of the other
+ * methods in the ContentHandler interface.
+ * </p>
+ *
+ * <p>
+ * The locator allows the application to determine the end position of any
+ * document-related event, even if the parser is not reporting an error.
+ * Typically, the application will use this information for reporting its own
+ * errors (such as character content that does not match an application's
+ * business rules). The information returned by the locator is probably not
+ * sufficient for use with a search engine.
+ * </p>
+ *
+ * <p>
+ * Note that the locator will return correct information only during the
+ * invocation of the events in this interface. The application should not
+ * attempt to use it at any other time.
+ * </p>
+ *
+ * @param locator
+ * An object that can return the location of any SAX document event.
+ * @see org.xml.sax.Locator
+ */
+ public void setDocumentLocator(Locator locator) {
+
+ // No action for the moment: this method neither stores nor uses the locator.
+ }
+
+ /**
+ * Receive notification of the beginning of a document.
+ *
+ * <p>
+ * The SAX parser will invoke this method only once, before any other methods
+ * in this interface or in DTDHandler (except for setDocumentLocator).
+ * </p>
+ */
+ public void startDocument() throws org.xml.sax.SAXException {
+
+ // Intentionally empty: no per-document setup is performed here.
+ }
+
+ /**
+ * Receive notification of the end of a document.
+ *
+ * <p>
+ * The SAX parser will invoke this method only once, and it will be the last
+ * method invoked during the parse. The parser shall not invoke this method
+ * until it has either abandoned parsing (because of an unrecoverable error)
+ * or reached the end of input.
+ * </p>
+ */
+ public void endDocument() throws org.xml.sax.SAXException {
+
+ // Intentionally empty: nothing is flushed or finalized when the document
+ // ends; nodes are attached to the tree as the individual events arrive.
+ }
+
+ /**
+ * Receive notification of the beginning of an element.
+ *
+ * <p>
+ * The Parser will invoke this method at the beginning of every element in the
+ * XML document; there will be a corresponding endElement() event for every
+ * startElement() event (even when the element is empty). All of the element's
+ * content will be reported, in order, before the corresponding endElement()
+ * event.
+ * </p>
+ *
+ * <p>
+ * If the element name has a namespace prefix, the prefix will still be
+ * attached. Note that the attribute list provided will contain only
+ * attributes with explicit values (specified or defaulted): #IMPLIED
+ * attributes will be omitted.
+ * </p>
+ *
+ *
+ * @param ns
+ * The namespace of the node
+ * @param localName
+ * The local part of the qualified name
+ * @param name
+ * The element name.
+ * @param atts
+ * The attributes attached to the element, if any.
+ * @see #endElement
+ * @see org.xml.sax.Attributes
+ */
+ public void startElement(String ns, String localName, String name,
+     Attributes atts) throws org.xml.sax.SAXException {
+
+   // Upper-case with a fixed locale so that element names do not depend on
+   // the JVM default locale (a Turkish locale would map "i" to a dotted
+   // capital I and break lookups such as "TITLE").
+   if (upperCaseElementNames)
+     name = name.toUpperCase(java.util.Locale.ROOT);
+
+   // Note that the namespace-aware call must be used to correctly
+   // construct a Level 2 DOM, even for non-namespaced nodes. Elements in the
+   // configured default namespace are deliberately created without one.
+   Element elem;
+   if ((null == ns) || (ns.length() == 0) || ns.equals(defaultNamespaceURI))
+     elem = m_doc.createElementNS(null, name);
+   else
+     elem = m_doc.createElementNS(ns, name);
+
+   // Attach the element before copying its attributes.
+   append(elem);
+
+   try {
+     int nAtts = atts.getLength();
+
+     for (int i = 0; i < nAtts; i++) {
+
+       // First handle a possible ID attribute
+       if (atts.getType(i).equalsIgnoreCase("ID"))
+         setIDAttribute(atts.getValue(i), elem);
+
+       String attrNS = atts.getURI(i);
+
+       if ("".equals(attrNS))
+         attrNS = null; // DOM represents no-namespace as null
+
+       String attrQName = atts.getQName(i);
+
+       // In SAX, namespace declarations have an empty namespace, while in DOM
+       // they must be in the xmlns namespace. This applies both to prefixed
+       // declarations ("xmlns:foo") and to the default declaration ("xmlns");
+       // setAttributeNS raises NAMESPACE_ERR for either one otherwise.
+       if (attrQName.startsWith("xmlns:") || attrQName.equals("xmlns"))
+         attrNS = "http://www.w3.org/2000/xmlns/";
+
+       // ALWAYS use the DOM Level 2 call!
+       elem.setAttributeNS(attrNS, attrQName, atts.getValue(i));
+     }
+
+     // The new element becomes the insertion point for subsequent events.
+     m_elemStack.push(elem);
+     m_currentNode = elem;
+   } catch (java.lang.Exception de) {
+     // Surface DOM/runtime failures to the parser as a SAX error, keeping
+     // the original exception as the cause.
+     throw new org.xml.sax.SAXException(de);
+   }
+
+ }
+
+ /**
+ *
+ *
+ *
+ * Receive notification of the end of an element.
+ *
+ * <p>
+ * The SAX parser will invoke this method at the end of every element in the
+ * XML document; there will be a corresponding startElement() event for every
+ * endElement() event (even when the element is empty).
+ * </p>
+ *
+ * <p>
+ * If the element name has a namespace prefix, the prefix will still be
+ * attached to the name.
+ * </p>
+ *
+ *
+ * @param ns
+ * the namespace of the element
+ * @param localName
+ * The local part of the qualified name of the element
+ * @param name
+ * The element name
+ */
+ public void endElement(String ns, String localName, String name)
+     throws org.xml.sax.SAXException {
+   // Discard the element that is closing; tolerate unbalanced end events.
+   if (!m_elemStack.isEmpty())
+     m_elemStack.pop();
+
+   // The enclosing element, if any, becomes the insertion point again.
+   if (m_elemStack.isEmpty()) {
+     m_currentNode = null;
+   } else {
+     m_currentNode = (Node) m_elemStack.peek();
+   }
+ }
+
+ /**
+ * Set an ID string to node association in the ID table.
+ *
+ * @param id
+ * The ID string.
+ * @param elem
+ * The element associated with the ID string.
+ */
+ public void setIDAttribute(String id, Element elem) {
+
+ // Do nothing. This method is meant to be overridden; startElement() calls it
+ // for every attribute whose declared type is "ID".
+ }
+
+ /**
+ * Receive notification of character data.
+ *
+ * <p>
+ * The Parser will call this method to report each chunk of character data.
+ * SAX parsers may return all contiguous character data in a single chunk, or
+ * they may split it into several chunks; however, all of the characters in
+ * any single event must come from the same external entity, so that the
+ * Locator provides useful information.
+ * </p>
+ *
+ * <p>
+ * The application must not attempt to read from the array outside of the
+ * specified range.
+ * </p>
+ *
+ * <p>
+ * Note that some parsers will report whitespace using the
+ * ignorableWhitespace() method rather than this one (validating parsers must
+ * do so).
+ * </p>
+ *
+ * @param ch
+ * The characters from the XML document.
+ * @param start
+ * The start position in the array.
+ * @param length
+ * The number of characters to read from the array.
+ * @see #ignorableWhitespace
+ * @see org.xml.sax.Locator
+ */
+ public void characters(char ch[], int start, int length)
+     throws org.xml.sax.SAXException {
+   // Whitespace outside the document element cannot be stored in the DOM.
+   if (isOutsideDocElem()
+       && XMLCharacterRecognizer.isWhiteSpace(ch, start, length)) {
+     return; // avoid DOM006 Hierarchy request error
+   }
+
+   // Inside a CDATA section the data belongs to the open section node.
+   if (m_inCData) {
+     cdata(ch, start, length);
+     return;
+   }
+
+   String chunk = new String(ch, start, length);
+
+   Node last = null;
+   if (m_currentNode != null) {
+     last = m_currentNode.getLastChild();
+   }
+
+   // Start a new text node unless the previous sibling is already text.
+   if (last == null || last.getNodeType() != Node.TEXT_NODE) {
+     Text fresh = m_doc.createTextNode(chunk);
+     append(fresh);
+     return;
+   }
+
+   // Merge contiguous character events into the existing text node.
+   ((Text) last).appendData(chunk);
+ }
+
+ /**
+ * If available, when the disable-output-escaping attribute is used, output
+ * raw text without escaping. A PI will be inserted in front of the node with
+ * the name "xslt-next-is-raw" and a value of "formatter-to-dom".
+ *
+ * @param ch
+ * Array containing the characters
+ * @param start
+ * Index to start of characters in the array
+ * @param length
+ * Number of characters in the array
+ */
+ public void charactersRaw(char ch[], int start, int length)
+ throws org.xml.sax.SAXException {
+ if (isOutsideDocElem()
+ && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+ return; // avoid DOM006 Hierarchy request error
+
+ String s = new String(ch, start, length);
+
+ append(m_doc.createProcessingInstruction("xslt-next-is-raw",
+ "formatter-to-dom"));
+ append(m_doc.createTextNode(s));
+ }
+
+ /**
+ * Report the beginning of an entity.
+ *
+ * The start and end of the document entity are not reported. The start and
+ * end of the external DTD subset are reported using the pseudo-name "[dtd]".
+ * All other events must be properly nested within start/end entity events.
+ *
+ * @param name
+ * The name of the entity. If it is a parameter entity, the name will
+ * begin with '%'.
+ * @see #endEntity
+ * @see org.xml.sax.ext.DeclHandler#internalEntityDecl
+ * @see org.xml.sax.ext.DeclHandler#externalEntityDecl
+ */
+ public void startEntity(String name) throws org.xml.sax.SAXException {
+
+ // Intentionally empty. Forwarding to entityReference(name) was tried and
+ // disabled as "almost certainly the wrong behavior".
+ }
+
+ /**
+ * Report the end of an entity.
+ *
+ * @param name
+ * The name of the entity that is ending.
+ * @see #startEntity
+ */
+ public void endEntity(String name) throws org.xml.sax.SAXException {
+ // Intentionally empty: entity boundaries are not represented in the DOM.
+ }
+
+ /**
+ * Receive notification of an entity reference.
+ *
+ * @param name
+ * name of the entity reference
+ */
+ public void entityReference(String name) throws org.xml.sax.SAXException {
+ append(m_doc.createEntityReference(name));
+ }
+
+ /**
+ * Receive notification of ignorable whitespace in element content.
+ *
+ * <p>
+ * Validating Parsers must use this method to report each chunk of ignorable
+ * whitespace (see the W3C XML 1.0 recommendation, section 2.10):
+ * non-validating parsers may also use this method if they are capable of
+ * parsing and using content models.
+ * </p>
+ *
+ * <p>
+ * SAX parsers may return all contiguous whitespace in a single chunk, or they
+ * may split it into several chunks; however, all of the characters in any
+ * single event must come from the same external entity, so that the Locator
+ * provides useful information.
+ * </p>
+ *
+ * <p>
+ * The application must not attempt to read from the array outside of the
+ * specified range.
+ * </p>
+ *
+ * @param ch
+ * The characters from the XML document.
+ * @param start
+ * The start position in the array.
+ * @param length
+ * The number of characters to read from the array.
+ * @see #characters
+ */
+ public void ignorableWhitespace(char ch[], int start, int length)
+     throws org.xml.sax.SAXException {
+   // Outside the document element the whitespace is dropped to
+   // avoid DOM006 Hierarchy request error; otherwise keep it as text.
+   if (!isOutsideDocElem()) {
+     String blanks = new String(ch, start, length);
+     append(m_doc.createTextNode(blanks));
+   }
+ }
+
+ /**
+ * Tell if the current node is outside the document element.
+ *
+ * @return true if the current node is outside the document element.
+ */
+ private boolean isOutsideDocElem() {
+   // Building into a fragment: there is no document element to be outside of.
+   if (null != m_docFrag) {
+     return false;
+   }
+   // An open element means we are inside the tree.
+   if (!m_elemStack.isEmpty()) {
+     return false;
+   }
+   // No insertion point at all, or the insertion point is the document itself.
+   if (null == m_currentNode) {
+     return true;
+   }
+   return m_currentNode.getNodeType() == Node.DOCUMENT_NODE;
+ }
+
+ /**
+ * Receive notification of a processing instruction.
+ *
+ * <p>
+ * The Parser will invoke this method once for each processing instruction
+ * found: note that processing instructions may occur before or after the main
+ * document element.
+ * </p>
+ *
+ * <p>
+ * A SAX parser should never report an XML declaration (XML 1.0, section 2.8)
+ * or a text declaration (XML 1.0, section 4.3.1) using this method.
+ * </p>
+ *
+ * @param target
+ * The processing instruction target.
+ * @param data
+ * The processing instruction data, or null if none was supplied.
+ */
+ public void processingInstruction(String target, String data)
+     throws org.xml.sax.SAXException {
+   // Create the PI node, then attach it at the current position.
+   Node instruction = m_doc.createProcessingInstruction(target, data);
+   append(instruction);
+ }
+
+ /**
+ * Report an XML comment anywhere in the document.
+ *
+ * This callback will be used for comments inside or outside the document
+ * element, including comments in the external DTD subset (if read).
+ *
+ * @param ch
+ * An array holding the characters in the comment.
+ * @param start
+ * The starting position in the array.
+ * @param length
+ * The number of characters to use from the array.
+ */
+ public void comment(char ch[], int start, int length)
+     throws org.xml.sax.SAXException {
+   // tagsoup sometimes submits invalid values here; such events are silently
+   // dropped rather than failing the whole parse. The range is valid when
+   // 0 <= start, 0 <= length and start + length <= ch.length, so a comment
+   // that ends exactly at the end of the array is accepted (the previous
+   // ">=" check wrongly discarded it). "ch.length - start" cannot overflow
+   // because start is already known to be non-negative at that point.
+   if (ch == null || start < 0 || length < 0 || length > (ch.length - start))
+     return;
+   append(m_doc.createComment(new String(ch, start, length)));
+ }
+
+ /**
+ * Flag indicating that we are processing a CData section. Set by
+ * startCDATA() and cleared by endCDATA(); while it is set, characters()
+ * forwards its data to cdata().
+ */
+ protected boolean m_inCData = false;
+
+ /**
+ * Report the start of a CDATA section.
+ *
+ * @see #endCDATA
+ */
+ public void startCDATA() throws org.xml.sax.SAXException {
+ m_inCData = true;
+ append(m_doc.createCDATASection(""));
+ }
+
+ /**
+ * Report the end of a CDATA section.
+ *
+ * @see #startCDATA
+ */
+ public void endCDATA() throws org.xml.sax.SAXException {
+ // Character events go back to being handled as ordinary text.
+ m_inCData = false;
+ }
+
+ /**
+ * Receive notification of cdata.
+ *
+ * <p>
+ * The Parser will call this method to report each chunk of character data.
+ * SAX parsers may return all contiguous character data in a single chunk, or
+ * they may split it into several chunks; however, all of the characters in
+ * any single event must come from the same external entity, so that the
+ * Locator provides useful information.
+ * </p>
+ *
+ * <p>
+ * The application must not attempt to read from the array outside of the
+ * specified range.
+ * </p>
+ *
+ * <p>
+ * Note that some parsers will report whitespace using the
+ * ignorableWhitespace() method rather than this one (validating parsers must
+ * do so).
+ * </p>
+ *
+ * @param ch
+ * The characters from the XML document.
+ * @param start
+ * The start position in the array.
+ * @param length
+ * The number of characters to read from the array.
+ * @see #ignorableWhitespace
+ * @see org.xml.sax.Locator
+ */
+ public void cdata(char ch[], int start, int length)
+     throws org.xml.sax.SAXException {
+   if (isOutsideDocElem()
+       && XMLCharacterRecognizer.isWhiteSpace(ch, start, length))
+     return; // avoid DOM006 Hierarchy request error
+
+   // The node to extend is the last child of wherever append() placed it:
+   // the current element when there is one, otherwise the document fragment.
+   // Dereferencing m_currentNode unconditionally threw a
+   // NullPointerException when no element was open.
+   Node parent = m_currentNode;
+   if (null == parent)
+     parent = m_docFrag;
+   if (null == parent)
+     return; // nothing to attach the data to; drop it like other unmatched data
+
+   String s = new String(ch, start, length);
+
+   // XXX ab@apache.org: modified from the original, to accommodate TagSoup.
+   // Data is appended only when the last child is a CDATA section or a
+   // comment; for any other node (or none) it is discarded.
+   Node n = parent.getLastChild();
+   if (n instanceof CDATASection)
+     ((CDATASection) n).appendData(s);
+   else if (n instanceof Comment)
+     ((Comment) n).appendData(s);
+ }
+
+ /**
+ * Report the start of DTD declarations, if any.
+ *
+ * Any declarations are assumed to be in the internal subset unless otherwise
+ * indicated.
+ *
+ * @param name
+ * The document type name.
+ * @param publicId
+ * The declared public identifier for the external DTD subset, or
+ * null if none was declared.
+ * @param systemId
+ * The declared system identifier for the external DTD subset, or
+ * null if none was declared.
+ * @see #endDTD
+ * @see #startEntity
+ */
+ public void startDTD(String name, String publicId, String systemId)
+ throws org.xml.sax.SAXException {
+
+ // Do nothing for now: the name and identifiers are ignored and no
+ // document type node is created.
+ }
+
+ /**
+ * Report the end of DTD declarations.
+ *
+ * @see #startDTD
+ */
+ public void endDTD() throws org.xml.sax.SAXException {
+
+ // Do nothing for now: startDTD() keeps no state that would need closing.
+ }
+
+ /**
+ * Begin the scope of a prefix-URI Namespace mapping.
+ *
+ * <p>
+ * The information from this event is not necessary for normal Namespace
+ * processing: the SAX XML reader will automatically replace prefixes for
+ * element and attribute names when the http://xml.org/sax/features/namespaces
+ * feature is true (the default).
+ * </p>
+ *
+ * <p>
+ * There are cases, however, when applications need to use prefixes in
+ * character data or in attribute values, where they cannot safely be expanded
+ * automatically; the start/endPrefixMapping event supplies the information to
+ * the application to expand prefixes in those contexts itself, if necessary.
+ * </p>
+ *
+ * <p>
+ * Note that start/endPrefixMapping events are not guaranteed to be properly
+ * nested relative to each-other: all startPrefixMapping events will occur
+ * before the corresponding startElement event, and all endPrefixMapping
+ * events will occur after the corresponding endElement event, but their order
+ * is not guaranteed.
+ * </p>
+ *
+ * @param prefix
+ * The Namespace prefix being declared.
+ * @param uri
+ * The Namespace URI the prefix is mapped to.
+ * @see #endPrefixMapping
+ * @see #startElement
+ */
+ public void startPrefixMapping(String prefix, String uri)
+ throws org.xml.sax.SAXException {
+
+ /*
+ * Intentionally a no-op. A disabled draft copied the mapping onto the
+ * current element as an "xmlns" / "xmlns:prefix" attribute when the element
+ * did not already carry one; it was left out with the notes "Not sure if
+ * this is needed or wanted" and "it fails in the stree".
+ */
+ }
+
+ /**
+ * End the scope of a prefix-URI mapping.
+ *
+ * <p>
+ * See startPrefixMapping for details. This event will always occur after the
+ * corresponding endElement event, but the order of endPrefixMapping events is
+ * not otherwise guaranteed.
+ * </p>
+ *
+ * @param prefix
+ * The prefix that was being mapped.
+ * @see #startPrefixMapping
+ * @see #endElement
+ */
+ public void endPrefixMapping(String prefix) throws org.xml.sax.SAXException {
+ // Intentionally empty: startPrefixMapping() records nothing to undo.
+ }
+
+ /**
+ * Receive notification of a skipped entity.
+ *
+ * <p>
+ * The Parser will invoke this method once for each entity skipped.
+ * Non-validating processors may skip entities if they have not seen the
+ * declarations (because, for example, the entity was declared in an external
+ * DTD subset). All processors may skip external entities, depending on the
+ * values of the http://xml.org/sax/features/external-general-entities and the
+ * http://xml.org/sax/features/external-parameter-entities properties.
+ * </p>
+ *
+ * @param name
+ * The name of the skipped entity. If it is a parameter entity, the
+ * name will begin with '%'.
+ */
+ public void skippedEntity(String name) throws org.xml.sax.SAXException {
+ // Intentionally empty: skipped entities leave no trace in the DOM.
+ }
+
+ /**
+ * Tell whether startElement() converts element names to upper case before
+ * creating the DOM element.
+ *
+ * @return the current value of the upper-casing flag.
+ */
+ public boolean isUpperCaseElementNames() {
+ return upperCaseElementNames;
+ }
+
+ /**
+ * Choose whether startElement() converts element names to upper case before
+ * creating the DOM element.
+ *
+ * @param upperCaseElementNames
+ * true to upper-case element names, false to keep them as reported.
+ */
+ public void setUpperCaseElementNames(boolean upperCaseElementNames) {
+ this.upperCaseElementNames = upperCaseElementNames;
+ }
+
+ /**
+ * Return the namespace URI that startElement() treats as "no namespace":
+ * elements reported in this namespace are created with a null namespace.
+ *
+ * @return the configured default namespace URI.
+ */
+ public String getDefaultNamespaceURI() {
+ return defaultNamespaceURI;
+ }
+
+ /**
+ * Set the namespace URI that startElement() treats as "no namespace":
+ * elements reported in this namespace are created with a null namespace.
+ *
+ * @param defaultNamespaceURI
+ * the namespace URI to strip from created elements.
+ */
+ public void setDefaultNamespaceURI(String defaultNamespaceURI) {
+ this.defaultNamespaceURI = defaultNamespaceURI;
+ }
+}