You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:48:53 UTC

[09/69] [abbrv] [partial] nutch git commit: Re arranged the source code as per maven conventions for build

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java b/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java
deleted file mode 100644
index 9251366..0000000
--- a/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java
+++ /dev/null
@@ -1,685 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.swf;
-
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.util.*;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.parse.*;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.NutchConfiguration;
-
-import org.apache.hadoop.conf.Configuration;
-
-import com.anotherbigidea.flash.interfaces.*;
-import com.anotherbigidea.flash.readers.*;
-import com.anotherbigidea.flash.structs.*;
-import com.anotherbigidea.flash.writers.SWFActionBlockImpl;
-import com.anotherbigidea.flash.writers.SWFTagTypesImpl;
-import com.anotherbigidea.io.InStream;
-
-/**
- * Parser for Flash SWF files. Loosely based on the sample in JavaSWF
- * distribution.
- */
-public class SWFParser implements Parser {
-  public static final Logger LOG = LoggerFactory
-      .getLogger("org.apache.nutch.parse.swf");
-
-  private Configuration conf = null;
-
-  public SWFParser() {
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-  }
-
-  public Configuration getConf() {
-    return conf;
-  }
-
-  public ParseResult getParse(Content content) {
-
-    String text = null;
-    Vector<Outlink> outlinks = new Vector<Outlink>();
-
-    try {
-
-      byte[] raw = content.getContent();
-
-      String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH);
-      if (contentLength != null
-          && raw.length != Integer.parseInt(contentLength)) {
-        return new ParseStatus(ParseStatus.FAILED,
-            ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length
-                + " bytes. Parser can't handle incomplete files.")
-            .getEmptyParseResult(content.getUrl(), getConf());
-      }
-      ExtractText extractor = new ExtractText();
-
-      // TagParser implements SWFTags and drives a SWFTagTypes interface
-      TagParser parser = new TagParser(extractor);
-      // use this instead to debug the file
-      // TagParser parser = new TagParser( new SWFTagDumper(true, true) );
-
-      // SWFReader reads an input file and drives a SWFTags interface
-      SWFReader reader = new SWFReader(parser, new InStream(raw));
-
-      // read the input SWF file and pass it through the interface pipeline
-      reader.readFile();
-      text = extractor.getText();
-      String atext = extractor.getActionText();
-      if (atext != null && atext.length() > 0)
-        text += "\n--------\n" + atext;
-      // harvest potential outlinks
-      String[] links = extractor.getUrls();
-      for (int i = 0; i < links.length; i++) {
-        Outlink out = new Outlink(links[i], "");
-        outlinks.add(out);
-      }
-      Outlink[] olinks = OutlinkExtractor.getOutlinks(text, conf);
-      if (olinks != null)
-        for (int i = 0; i < olinks.length; i++) {
-          outlinks.add(olinks[i]);
-        }
-    } catch (Exception e) { // run time exception
-      LOG.error("Error, runtime exception: ", e);
-      return new ParseStatus(ParseStatus.FAILED,
-          "Can't be handled as SWF document. " + e).getEmptyParseResult(
-          content.getUrl(), getConf());
-    }
-    if (text == null)
-      text = "";
-
-    Outlink[] links = (Outlink[]) outlinks
-        .toArray(new Outlink[outlinks.size()]);
-    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "", links,
-        content.getMetadata());
-    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text,
-        parseData));
-  }
-
-  /**
-   * Arguments are: 0. Name of input SWF file.
-   */
-  public static void main(String[] args) throws IOException {
-    FileInputStream in = new FileInputStream(args[0]);
-
-    byte[] buf = new byte[in.available()];
-    in.read(buf);
-    in.close();
-    SWFParser parser = new SWFParser();
-    ParseResult parseResult = parser.getParse(new Content("file:" + args[0],
-        "file:" + args[0], buf, "application/x-shockwave-flash",
-        new Metadata(), NutchConfiguration.create()));
-    Parse p = parseResult.get("file:" + args[0]);
-    System.out.println("Parse Text:");
-    System.out.println(p.getText());
-    System.out.println("Parse Data:");
-    System.out.println(p.getData());
-  }
-}
-
-/**
- * Shows how to parse a Flash movie and extract all the text in Text symbols and
- * the initial text in Edit Fields. Output is to System.out.
- * 
- * A "pipeline" is set up in the main method:
- * 
- * SWFReader-->TagParser-->ExtractText
- * 
- * SWFReader reads the input SWF file and separates out the header and the tags.
- * The separated contents are passed to TagParser which parses out the
- * individual tag types and passes them to ExtractText.
- * 
- * ExtractText extends SWFTagTypesImpl and overrides some methods.
- */
-class ExtractText extends SWFTagTypesImpl {
-  /**
-   * Store font info keyed by the font symbol id. Each entry is an int[] of
-   * character codes for the corresponding font glyphs (An empty array denotes a
-   * System Font).
-   */
-  protected HashMap<Integer, int[]> fontCodes = new HashMap<Integer, int[]>();
-
-  public ArrayList<String> strings = new ArrayList<String>();
-
-  public HashSet<String> actionStrings = new HashSet<String>();
-
-  public ArrayList<String> urls = new ArrayList<String>();
-
-  public ExtractText() {
-    super(null);
-  }
-
-  public String getText() {
-    StringBuffer res = new StringBuffer();
-    Iterator<String> it = strings.iterator();
-    while (it.hasNext()) {
-      if (res.length() > 0)
-        res.append(' ');
-      res.append(it.next());
-    }
-    return res.toString();
-  }
-
-  public String getActionText() {
-    StringBuffer res = new StringBuffer();
-    String[] strings = (String[]) actionStrings
-        .toArray(new String[actionStrings.size()]);
-    Arrays.sort(strings);
-    for (int i = 0; i < strings.length; i++) {
-      if (i > 0)
-        res.append('\n');
-      res.append(strings[i]);
-    }
-    return res.toString();
-  }
-
-  public String[] getUrls() {
-    String[] res = new String[urls.size()];
-    int i = 0;
-    Iterator<String> it = urls.iterator();
-    while (it.hasNext()) {
-      res[i] = (String) it.next();
-      i++;
-    }
-    return res;
-  }
-
-  public void tagDefineFontInfo2(int arg0, String arg1, int arg2, int[] arg3,
-      int arg4) throws IOException {
-    tagDefineFontInfo(arg0, arg1, arg2, arg3);
-  }
-
-  /**
-   * SWFTagTypes interface. Save the Text Font character code info.
-   */
-  public void tagDefineFontInfo(int fontId, String fontName, int flags,
-      int[] codes) throws IOException {
-    // System.out.println("-defineFontInfo id=" + fontId + ", name=" +
-    // fontName);
-    fontCodes.put(new Integer(fontId), codes);
-  }
-
-  // XXX too much hassle for too little return ... we cannot guess character
-  // XXX codes anyway, so we just give up.
-  /*
-   * public SWFVectors tagDefineFont(int arg0, int arg1) throws IOException {
-   * return null; }
-   */
-
-  /**
-   * SWFTagTypes interface. Save the character code info.
-   */
-  public SWFVectors tagDefineFont2(int id, int flags, String name,
-      int numGlyphs, int ascent, int descent, int leading, int[] codes,
-      int[] advances, Rect[] bounds, int[] kernCodes1, int[] kernCodes2,
-      int[] kernAdjustments) throws IOException {
-    // System.out.println("-defineFontInfo id=" + id + ", name=" + name);
-    fontCodes.put(new Integer(id), (codes != null) ? codes : new int[0]);
-
-    return null;
-  }
-
-  /**
-   * SWFTagTypes interface. Dump any initial text in the field.
-   */
-  public void tagDefineTextField(int fieldId, String fieldName,
-      String initialText, Rect boundary, int flags, AlphaColor textColor,
-      int alignment, int fontId, int fontSize, int charLimit, int leftMargin,
-      int rightMargin, int indentation, int lineSpacing) throws IOException {
-    if (initialText != null) {
-      strings.add(initialText);
-    }
-  }
-
-  /**
-   * SWFTagTypes interface
-   */
-  public SWFText tagDefineText(int id, Rect bounds, Matrix matrix)
-      throws IOException {
-    lastBounds = curBounds;
-    curBounds = bounds;
-    return new TextDumper();
-  }
-
-  Rect lastBounds = null;
-  Rect curBounds = null;
-
-  /**
-   * SWFTagTypes interface
-   */
-  public SWFText tagDefineText2(int id, Rect bounds, Matrix matrix)
-      throws IOException {
-    lastBounds = curBounds;
-    curBounds = bounds;
-    return new TextDumper();
-  }
-
-  public class TextDumper implements SWFText {
-    protected Integer fontId;
-
-    protected boolean firstY = true;
-
-    public void font(int fontId, int textHeight) {
-      this.fontId = new Integer(fontId);
-    }
-
-    public void setY(int y) {
-      if (firstY)
-        firstY = false;
-      else
-        strings.add("\n"); // Change in Y - dump a new line
-    }
-
-    /*
-     * There are some issues with this method: sometimes SWF files define their
-     * own font, so short of OCR we cannot guess what is the glyph code ->
-     * character mapping. Additionally, some files don't use literal space
-     * character, instead they adjust glyphAdvances. We don't handle it at all -
-     * in such cases the text will be all glued together.
-     */
-    public void text(int[] glyphIndices, int[] glyphAdvances) {
-      // System.out.println("-text id=" + fontId);
-      int[] codes = (int[]) fontCodes.get(fontId);
-      if (codes == null) {
-        // unknown font, better not guess
-        strings.add("\n**** ?????????????? ****\n");
-        return;
-      }
-
-      // --Translate the glyph indices to character codes
-      char[] chars = new char[glyphIndices.length];
-
-      for (int i = 0; i < chars.length; i++) {
-        int index = glyphIndices[i];
-
-        if (index >= codes.length) // System Font ?
-        {
-          chars[i] = (char) index;
-        } else {
-          chars[i] = (char) (codes[index]);
-        }
-        // System.out.println("-ch[" + i + "]='" + chars[i] + "'(" +
-        // (int)chars[i] + ") +" + glyphAdvances[i]);
-      }
-      strings.add(new String(chars));
-    }
-
-    public void color(Color color) {
-    }
-
-    public void setX(int x) {
-    }
-
-    public void done() {
-      strings.add("\n");
-    }
-  }
-
-  public SWFActions tagDoAction() throws IOException {
-    // ActionTextWriter actions = new ActionTextWriter(new
-    // PrintWriter(System.out));
-    NutchSWFActions actions = new NutchSWFActions(actionStrings, urls);
-    return actions;
-  }
-
-  public SWFActions tagDoInitAction(int arg0) throws IOException {
-    // ActionTextWriter actions = new ActionTextWriter(new
-    // PrintWriter(System.out));
-    NutchSWFActions actions = new NutchSWFActions(actionStrings, urls);
-    return actions;
-  }
-
-  public void tagGeneratorFont(byte[] arg0) throws IOException {
-    // TODO Auto-generated method stub
-    super.tagGeneratorFont(arg0);
-  }
-
-  public void tagGeneratorText(byte[] arg0) throws IOException {
-    // TODO Auto-generated method stub
-    super.tagGeneratorText(arg0);
-  }
-
-}
-
-/**
- * ActionScript parser. This parser tries to extract free text embedded inside
- * the script, but without polluting it too much with names of variables,
- * methods, etc. Not ideal, but it works.
- */
-class NutchSWFActions extends SWFActionBlockImpl implements SWFActions {
-  private HashSet<String> strings = null;
-
-  private ArrayList<String> urls = null;
-
-  String[] dict = null;
-
-  Stack<Object> stack = null;
-
-  public NutchSWFActions(HashSet<String> strings, ArrayList<String> urls) {
-    this.strings = strings;
-    this.urls = urls;
-    stack = new SmallStack(100, strings);
-  }
-
-  public void lookupTable(String[] values) throws IOException {
-    for (int i = 0; i < values.length; i++) {
-      if (!strings.contains(values[i]))
-        strings.add(values[i]);
-    }
-    super.lookupTable(values);
-    dict = values;
-  }
-
-  public void defineLocal() throws IOException {
-    stack.pop();
-    super.defineLocal();
-  }
-
-  public void getURL(int vars, int mode) {
-    // System.out.println("-getURL: vars=" + vars + ", mode=" + mode);
-  }
-
-  public void getURL(String url, String target) throws IOException {
-    // System.out.println("-getURL: url=" + url + ", target=" + target);
-    stack.push(url);
-    stack.push(target);
-    strings.remove(url);
-    strings.remove(target);
-    urls.add(url);
-    super.getURL(url, target);
-  }
-
-  public SWFActionBlock.TryCatchFinally _try(String var) throws IOException {
-    // stack.push(var);
-    strings.remove(var);
-    return super._try(var);
-  }
-
-  public void comment(String var) throws IOException {
-    // stack.push(var);
-    strings.remove(var);
-    super.comment(var);
-  }
-
-  public void goToFrame(String var) throws IOException {
-    stack.push(var);
-    strings.remove(var);
-    super.gotoFrame(var);
-  }
-
-  public void ifJump(String var) throws IOException {
-    strings.remove(var);
-    super.ifJump(var);
-  }
-
-  public void jump(String var) throws IOException {
-    strings.remove(var);
-    super.jump(var);
-  }
-
-  public void jumpLabel(String var) throws IOException {
-    strings.remove(var);
-    super.jumpLabel(var);
-  }
-
-  public void lookup(int var) throws IOException {
-    if (dict != null && var >= 0 && var < dict.length) {
-      stack.push(dict[var]);
-    }
-    super.lookup(var);
-  }
-
-  public void push(String var) throws IOException {
-    stack.push(var);
-    strings.remove(var);
-    super.push(var);
-  }
-
-  public void setTarget(String var) throws IOException {
-    stack.push(var);
-    strings.remove(var);
-    super.setTarget(var);
-  }
-
-  public SWFActionBlock startFunction(String var, String[] params)
-      throws IOException {
-    stack.push(var);
-    strings.remove(var);
-    if (params != null) {
-      for (int i = 0; i < params.length; i++) {
-        strings.remove(params[i]);
-      }
-    }
-    return this;
-  }
-
-  public SWFActionBlock startFunction2(String var, int arg1, int arg2,
-      String[] params, int[] arg3) throws IOException {
-    stack.push(var);
-    strings.remove(var);
-    if (params != null) {
-      for (int i = 0; i < params.length; i++) {
-        strings.remove(params[i]);
-      }
-    }
-    return this;
-  }
-
-  public void waitForFrame(int num, String var) throws IOException {
-    stack.push(var);
-    strings.remove(var);
-    super.waitForFrame(num, var);
-  }
-
-  public void waitForFrame(String var) throws IOException {
-    stack.push(var);
-    strings.remove(var);
-    super.waitForFrame(var);
-  }
-
-  public void done() throws IOException {
-    while (stack.size() > 0) {
-      strings.remove(stack.pop());
-    }
-  }
-
-  public SWFActionBlock start(int arg0, int arg1) throws IOException {
-    return this;
-  }
-
-  public SWFActionBlock start(int arg0) throws IOException {
-    return this;
-  }
-
-  public void add() throws IOException {
-    super.add();
-  }
-
-  public void asciiToChar() throws IOException {
-    super.asciiToChar();
-  }
-
-  public void asciiToCharMB() throws IOException {
-    super.asciiToCharMB();
-  }
-
-  public void push(int var) throws IOException {
-    if (dict != null && var >= 0 && var < dict.length) {
-      stack.push(dict[var]);
-    }
-    super.push(var);
-  }
-
-  public void callFunction() throws IOException {
-    strings.remove(stack.pop());
-    super.callFunction();
-  }
-
-  public void callMethod() throws IOException {
-    strings.remove(stack.pop());
-    super.callMethod();
-  }
-
-  public void getMember() throws IOException {
-    // 0: name
-    String val = (String) stack.pop();
-    strings.remove(val);
-    super.getMember();
-  }
-
-  public void setMember() throws IOException {
-    // 0: value -1: name
-    stack.pop(); // value
-    String name = (String) stack.pop();
-    strings.remove(name);
-    super.setMember();
-  }
-
-  public void setProperty() throws IOException {
-    super.setProperty();
-  }
-
-  public void setVariable() throws IOException {
-    super.setVariable();
-  }
-
-  public void call() throws IOException {
-    strings.remove(stack.pop());
-    super.call();
-  }
-
-  public void setTarget() throws IOException {
-    strings.remove(stack.pop());
-    super.setTarget();
-  }
-
-  public void pop() throws IOException {
-    strings.remove(stack.pop());
-    super.pop();
-  }
-
-  public void push(boolean arg0) throws IOException {
-    stack.push("" + arg0);
-    super.push(arg0);
-  }
-
-  public void push(double arg0) throws IOException {
-    stack.push("" + arg0);
-    super.push(arg0);
-  }
-
-  public void push(float arg0) throws IOException {
-    stack.push("" + arg0);
-    super.push(arg0);
-  }
-
-  public void pushNull() throws IOException {
-    stack.push("");
-    super.pushNull();
-  }
-
-  public void pushRegister(int arg0) throws IOException {
-    stack.push("" + arg0);
-    super.pushRegister(arg0);
-  }
-
-  public void pushUndefined() throws IOException {
-    stack.push("???");
-    super.pushUndefined();
-  }
-
-  public void getProperty() throws IOException {
-    stack.pop();
-    super.getProperty();
-  }
-
-  public void getVariable() throws IOException {
-    strings.remove(stack.pop());
-    super.getVariable();
-  }
-
-  public void gotoFrame(boolean arg0) throws IOException {
-    stack.push("" + arg0);
-    super.gotoFrame(arg0);
-  }
-
-  public void gotoFrame(int arg0) throws IOException {
-    stack.push("" + arg0);
-    super.gotoFrame(arg0);
-  }
-
-  public void gotoFrame(String arg0) throws IOException {
-    stack.push("" + arg0);
-    strings.remove(arg0);
-    super.gotoFrame(arg0);
-  }
-
-  public void newObject() throws IOException {
-    stack.pop();
-    super.newObject();
-  }
-
-  public SWFActionBlock startWith() throws IOException {
-    return this;
-  }
-
-}
-
-/*
- * Small bottom-less stack.
- */
-class SmallStack extends Stack<Object> {
-
-  private static final long serialVersionUID = 1L;
-
-  private int maxSize;
-
-  private HashSet<String> strings = null;
-
-  public SmallStack(int maxSize, HashSet<String> strings) {
-    this.maxSize = maxSize;
-    this.strings = strings;
-  }
-
-  public Object push(Object o) {
-    // limit max size
-    if (this.size() > maxSize) {
-      String val = (String) remove(0);
-      strings.remove(val);
-    }
-    return super.push(o);
-  }
-
-  public Object pop() {
-    // tolerate underruns
-    if (this.size() == 0)
-      return null;
-    else
-      return super.pop();
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/package-info.java b/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/package-info.java
deleted file mode 100644
index 5942e64..0000000
--- a/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Parse Flash SWF files.
- */
-package org.apache.nutch.parse.swf;
-

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java b/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java
deleted file mode 100644
index 129b85f..0000000
--- a/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java
+++ /dev/null
@@ -1,94 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.swf;
-
-import java.io.FileInputStream;
-import java.io.InputStreamReader;
-
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-/**
- * Unit tests for SWFParser.
- */
-public class TestSWFParser {
-
-  private String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
-
-  private String[] sampleFiles = new String[] { "test1.swf", "test2.swf",
-      "test3.swf" };
-  private String[] sampleTexts = new String[] { "test1.txt", "test2.txt",
-      "test3.txt" };
-
-  @Test
-  public void testIt() throws ProtocolException, ParseException {
-    String urlString;
-    Protocol protocol;
-    Content content;
-    Parse parse;
-    Configuration conf = NutchConfiguration.create();
-
-    for (int i = 0; i < sampleFiles.length; i++) {
-      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-
-      protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      content = protocol.getProtocolOutput(new Text(urlString),
-          new CrawlDatum()).getContent();
-
-      parse = new ParseUtil(conf).parse(content).get(content.getUrl());
-
-      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
-      Assert.assertTrue(sampleTexts[i].equals(text));
-    }
-  }
-
-  public TestSWFParser() {
-    for (int i = 0; i < sampleFiles.length; i++) {
-      try {
-        // read the test string
-        FileInputStream fis = new FileInputStream(sampleDir + fileSeparator
-            + sampleTexts[i]);
-        StringBuffer sb = new StringBuffer();
-        int len = 0;
-        InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
-        char[] buf = new char[1024];
-        while ((len = isr.read(buf)) > 0) {
-          sb.append(buf, 0, len);
-        }
-        isr.close();
-        sampleTexts[i] = sb.toString().replaceAll("[ \t\r\n]+", " ").trim();
-      } catch (Exception e) {
-        e.printStackTrace();
-      }
-    }
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/build-ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/build-ivy.xml b/src/plugin/parse-tika/build-ivy.xml
deleted file mode 100644
index e4984d8..0000000
--- a/src/plugin/parse-tika/build-ivy.xml
+++ /dev/null
@@ -1,54 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="parse-tika" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
-
-    <property name="ivy.install.version" value="2.1.0" />
-    <condition property="ivy.home" value="${env.IVY_HOME}">
-      <isset property="env.IVY_HOME" />
-    </condition>
-    <property name="ivy.home" value="${user.home}/.ant" />
-    <property name="ivy.checksums" value="" />
-    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
-    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
-
-    <target name="download-ivy" unless="offline">
-
-        <mkdir dir="${ivy.jar.dir}"/>
-        <!-- download Ivy from web site so that it can be used even without any special installation -->
-        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" 
-             dest="${ivy.jar.file}" usetimestamp="true"/>
-    </target>
-
-    <target name="init-ivy" depends="download-ivy">
-      <!-- try to load ivy here from ivy home, in case the user has not already dropped
-              it into ant's lib dir (note that the latter copy will always take precedence).
-              We will not fail as long as local lib dir exists (it may be empty) and
-              ivy is in at least one of ant's lib dir or the local lib dir. -->
-        <path id="ivy.lib.path">
-            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
-
-        </path>
-        <taskdef resource="org/apache/ivy/ant/antlib.xml"
-                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
-    </target>
-
-  <target name="deps-jar" depends="init-ivy">
-    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]" sync="true"/>
-  </target>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/build.xml b/src/plugin/parse-tika/build.xml
deleted file mode 100644
index 4ecb3f8..0000000
--- a/src/plugin/parse-tika/build.xml
+++ /dev/null
@@ -1,55 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="parse-tika" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-  
-  <!-- Build compilation dependencies -->
-  <target name="deps-jar">
-    <ant target="jar" inheritall="false" dir="../lib-nekohtml" />
-  </target>
-
-  <!-- Add compilation dependencies to classpath -->
-  <path id="plugin.deps">
-    <fileset dir="${nutch.root}/build">
-      <include name="**/lib-nekohtml/*.jar" />
-    </fileset>
-  </path>
-  
-    <!-- Deploy Unit test dependencies -->
-  <target name="deps-test">
-    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
-    <ant target="deploy" inheritall="false" dir="../protocol-file"/>
-    <ant target="deploy" inheritall="false" dir="../lib-nekohtml" />
-  </target>
-
-  <!-- for junit test -->
-  <mkdir dir="${build.test}/data"/>
-  <copy todir="${build.test}/data">
-    <fileset dir="sample">
-      <include name="*.rss"/>
-      <include name="*.rtf"/>
-      <include name="*.pdf"/>
-      <include name="ootest.*"/>
-      <include name="*.doc"/>
-      <include name="*.gif"/>
-    </fileset>
-  </copy>
-  
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/howto_upgrade_tika.txt
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/howto_upgrade_tika.txt b/src/plugin/parse-tika/howto_upgrade_tika.txt
deleted file mode 100644
index 63a05a4..0000000
--- a/src/plugin/parse-tika/howto_upgrade_tika.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-1. Upgrade Tika dependency in trunk/ivy/ivy.xml
-
-2. Upgrade Tika dependency in src/plugin/parse-tika/ivy.xml
-
-3. Upgrade Tika's own dependencies in src/plugin/parse-tika/plugin.xml
-   To get the list of dependencies and their versions execute:
-   $ ant -f ./build-ivy.xml
-   $ ls lib | sed 's/^/      <library name="/g' | sed 's/$/"\/>/g'

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/ivy.xml b/src/plugin/parse-tika/ivy.xml
deleted file mode 100644
index 7a9e959..0000000
--- a/src/plugin/parse-tika/ivy.xml
+++ /dev/null
@@ -1,46 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../../ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-    <dependency org="org.apache.tika" name="tika-parsers" rev="1.12" conf="*->default">
-     <exclude org="org.apache.tika" name="tika-core" />
-     <exclude org="org.apache.httpcomponents" name="httpclient" />
-     <exclude org="org.apache.httpcomponents" name="httpcore" />
-    </dependency>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/plugin.xml b/src/plugin/parse-tika/plugin.xml
deleted file mode 100644
index 04fcd2e..0000000
--- a/src/plugin/parse-tika/plugin.xml
+++ /dev/null
@@ -1,136 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="parse-tika"
-   name="Tika Parser Plug-in"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-   <runtime>
-      <library name="parse-tika.jar">
-         <export name="*"/>
-      </library>
-      <library name="apache-mime4j-core-0.7.2.jar"/>
-      <library name="apache-mime4j-dom-0.7.2.jar"/>
-      <library name="asm-5.0.4.jar"/>
-      <library name="aspectjrt-1.8.0.jar"/>
-      <library name="bcmail-jdk15on-1.52.jar"/>
-      <library name="bcpkix-jdk15on-1.52.jar"/>
-      <library name="bcprov-jdk15on-1.52.jar"/>
-      <library name="boilerpipe-1.1.0.jar"/>
-      <library name="bzip2-0.9.1.jar"/>
-      <library name="c3p0-0.9.1.1.jar"/>
-      <library name="cdm-4.5.5.jar"/>
-      <library name="commons-codec-1.6.jar"/>
-      <library name="commons-compress-1.10.jar"/>
-      <library name="commons-csv-1.0.jar"/>
-      <library name="commons-exec-1.3.jar"/>
-      <library name="commons-io-2.4.jar"/>
-      <library name="commons-lang-2.6.jar"/>
-      <library name="commons-logging-1.1.3.jar"/>
-      <library name="commons-logging-api-1.1.jar"/>
-      <library name="commons-vfs2-2.0.jar"/>
-      <library name="cxf-core-3.0.3.jar"/>
-      <library name="cxf-rt-frontend-jaxrs-3.0.3.jar"/>
-      <library name="cxf-rt-rs-client-3.0.3.jar"/>
-      <library name="cxf-rt-transports-http-3.0.3.jar"/>
-      <library name="ehcache-core-2.6.2.jar"/>
-      <library name="fontbox-1.8.10.jar"/>
-      <library name="geoapi-3.0.0.jar"/>
-      <library name="grib-4.5.5.jar"/>
-      <library name="gson-2.2.4.jar"/>
-      <library name="guava-17.0.jar"/>
-      <library name="httpmime-4.2.6.jar"/>
-      <library name="httpservices-4.5.5.jar"/>
-      <library name="isoparser-1.0.2.jar"/>
-      <library name="jackcess-2.1.2.jar"/>
-      <library name="jackcess-encrypt-2.1.1.jar"/>
-      <library name="java-libpst-0.8.1.jar"/>
-      <library name="javax.annotation-api-1.2.jar"/>
-      <library name="javax.ws.rs-api-2.0.1.jar"/>
-      <library name="jcip-annotations-1.0.jar"/>
-      <library name="jcommander-1.35.jar"/>
-      <library name="jdom-2.0.2.jar"/>
-      <library name="jdom2-2.0.4.jar"/>
-      <library name="jempbox-1.8.10.jar"/>
-      <library name="jhighlight-1.0.2.jar"/>
-      <library name="jj2000-5.2.jar"/>
-      <library name="jmatio-1.0.jar"/>
-      <library name="jna-4.1.0.jar"/>
-      <library name="joda-time-2.2.jar"/>
-      <library name="json-20140107.jar"/>
-      <library name="json-simple-1.1.1.jar"/>
-      <library name="jsoup-1.7.2.jar"/>
-      <library name="jsr-275-0.9.3.jar"/>
-      <library name="juniversalchardet-1.0.3.jar"/>
-      <library name="junrar-0.7.jar"/>
-      <library name="jwnl-1.3.3.jar"/>
-      <library name="maven-scm-api-1.4.jar"/>
-      <library name="maven-scm-provider-svn-commons-1.4.jar"/>
-      <library name="maven-scm-provider-svnexe-1.4.jar"/>
-      <library name="metadata-extractor-2.8.0.jar"/>
-      <library name="netcdf4-4.5.5.jar"/>
-      <library name="opennlp-maxent-3.0.3.jar"/>
-      <library name="opennlp-tools-1.5.3.jar"/>
-      <library name="pdfbox-1.8.10.jar"/>
-      <library name="plexus-utils-1.5.6.jar"/>
-      <library name="poi-3.13.jar"/>
-      <library name="poi-ooxml-3.13.jar"/>
-      <library name="poi-ooxml-schemas-3.13.jar"/>
-      <library name="poi-scratchpad-3.13.jar"/>
-      <library name="protobuf-java-2.5.0.jar"/>
-      <library name="quartz-2.2.0.jar"/>
-      <library name="regexp-1.3.jar"/>
-      <library name="rome-1.5.1.jar"/>
-      <library name="rome-utils-1.5.1.jar"/>
-      <library name="sis-metadata-0.5.jar"/>
-      <library name="sis-netcdf-0.5.jar"/>
-      <library name="sis-referencing-0.5.jar"/>
-      <library name="sis-storage-0.5.jar"/>
-      <library name="sis-utility-0.5.jar"/>
-      <library name="slf4j-api-1.7.12.jar"/>
-      <library name="stax2-api-3.1.4.jar"/>
-      <library name="tagsoup-1.2.1.jar"/>
-      <library name="tika-parsers-1.12.jar"/>
-      <library name="udunits-4.5.5.jar"/>
-      <library name="vorbis-java-core-0.6.jar"/>
-      <library name="vorbis-java-tika-0.6.jar"/>
-      <library name="woodstox-core-asl-4.4.1.jar"/>
-      <library name="xmlbeans-2.6.0.jar"/>
-      <library name="xmlschema-core-2.1.0.jar"/>
-      <library name="xmpcore-5.1.2.jar"/>
-      <library name="xz-1.5.jar"/>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-      <import plugin="lib-nekohtml"/>
-   </requires>
-
-   <extension point="org.apache.nutch.parse.Parser"
-              id="org.apache.nutch.parse.tika"
-              name="TikaParser">
-
-      <implementation id="org.apache.nutch.parse.tika.TikaParser"
-                      class="org.apache.nutch.parse.tika.TikaParser">
-       <parameter name="contentType" value="*"/>
-      </implementation>
-
-   </extension>
-
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/sample/encrypted.pdf
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/sample/encrypted.pdf b/src/plugin/parse-tika/sample/encrypted.pdf
deleted file mode 100644
index 383cebb..0000000
Binary files a/src/plugin/parse-tika/sample/encrypted.pdf and /dev/null differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/sample/nutch.html
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/sample/nutch.html b/src/plugin/parse-tika/sample/nutch.html
deleted file mode 100644
index 0aa7c98..0000000
--- a/src/plugin/parse-tika/sample/nutch.html
+++ /dev/null
@@ -1,519 +0,0 @@
-<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
-<html>
-<head>
-<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
-<meta content="Apache Forrest" name="Generator">
-<meta name="Forrest-version" content="0.8">
-<meta name="Forrest-skin-name" content="lucene">
-<title>Welcome to Nutch!</title>
-<link type="text/css" href="skin/basic.css" rel="stylesheet">
-<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
-<link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
-<link type="text/css" href="skin/profile.css" rel="stylesheet">
-<script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
-<link rel="shortcut icon" href="images/favicon.ico">
-</head>
-<body onload="init()">
-<script type="text/javascript">ndeSetTextSize();</script>
-<div id="top">
-<!--+
-    |breadtrail
-    +-->
-<div class="breadtrail">
-<a href="http://www.apache.org/">Apache</a> &gt; <a href="http://lucene.apache.org/">Lucene</a> &gt; <a href="http://lucene.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
-</div>
-<!--+
-    |header
-    +-->
-<div class="header">
-<!--+
-    |start group logo
-    +-->
-<div class="grouplogo">
-<a href="http://lucene.apache.org/"><img class="logoImage" alt="Lucene" src="images/lucene_green_150.gif" title="Apache Lucene"></a>
-</div>
-<!--+
-    |end group logo
-    +-->
-<!--+
-    |start Project Logo
-    +-->
-<div class="projectlogo">
-<a href="http://lucene.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a>
-</div>
-<!--+
-    |end Project Logo
-    +-->
-<!--+
-    |start Search
-    +-->
-<div class="searchbox">
-<form action="http://search.lucidimagination.com/p:nutch" method="get" class="roundtopsmall">
-<input onFocus="getBlank (this, 'Search the site with Solr');" size="25" name="q" id="query" type="text" value="Search the site with Solr">&nbsp; 
-                    <input name="Search" value="Search" type="submit">
-</form>
-<div style="position: relative; top: -5px; left: -10px">Powered by <a href="http://www.lucidimagination.com" style="color: #033268">Lucid Imagination</a>
-</div>
-</div>
-<!--+
-    |end search
-    +-->
-<!--+
-    |start Tabs
-    +-->
-<ul id="tabs">
-<li class="current">
-<a class="selected" href="index.html">Main</a>
-</li>
-<li>
-<a class="unselected" href="http://wiki.apache.org/nutch/">Wiki</a>
-</li>
-<li>
-<a class="unselected" href="http://issues.apache.org/jira/browse/Nutch">Jira</a>
-</li>
-</ul>
-<!--+
-    |end Tabs
-    +-->
-</div>
-</div>
-<div id="main">
-<div id="publishedStrip">
-<!--+
-    |start Subtabs
-    +-->
-<div id="level2tabs"></div>
-<!--+
-    |end Endtabs
-    +-->
-<script type="text/javascript"><!--
-document.write("Last Published: " + document.lastModified);
-//  --></script>
-</div>
-<!--+
-    |breadtrail
-    +-->
-<div class="breadtrail">
-
-             &nbsp;
-           </div>
-<!--+
-    |start Menu, mainarea
-    +-->
-<!--+
-    |start Menu
-    +-->
-<div id="menu">
-<div onclick="SwitchMenu('menu_selected_1.1', 'skin/')" id="menu_selected_1.1Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">Project</div>
-<div id="menu_selected_1.1" class="selectedmenuitemgroup" style="display: block;">
-<div class="menupage">
-<div class="menupagetitle">News</div>
-</div>
-<div class="menuitem">
-<a href="about.html">About</a>
-</div>
-<div class="menuitem">
-<a href="credits.html">Credits</a>
-</div>
-<div class="menuitem">
-<a href="http://www.cafepress.com/nutch/">Buy Stuff</a>
-</div>
-</div>
-<div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle">Documentation</div>
-<div id="menu_1.2" class="menuitemgroup">
-<div class="menuitem">
-<a href="http://wiki.apache.org/nutch/FAQ">FAQ</a>
-</div>
-<div class="menuitem">
-<a href="http://wiki.apache.org/nutch/">Wiki</a>
-</div>
-<div class="menuitem">
-<a href="tutorial.html">Tutorial (0.7.2)</a>
-</div>
-<div class="menuitem">
-<a href="tutorial8.html">Tutorial (0.8.x)</a>
-</div>
-<div class="menuitem">
-<a href="bot.html">Robot     </a>
-</div>
-<div class="menuitem">
-<a href="i18n.html">i18n</a>
-</div>
-<div class="menuitem">
-<a href="apidocs-1.0/index.html">API Docs (1.0)</a>
-</div>
-<div class="menuitem">
-<a href="apidocs-0.9/index.html">API Docs (0.9)</a>
-</div>
-<div class="menuitem">
-<a href="apidocs-0.8.x/index.html">API Docs (0.8.x)</a>
-</div>
-<div class="menuitem">
-<a href="apidocs/index.html">API Docs (0.7.2)</a>
-</div>
-<div class="menuitem">
-<a href="http://lucene.zones.apache.org:8080/hudson/job/Nutch-Nightly/ws/trunk/build/docs/api/index.html">API Docs (nightly)</a>
-</div>
-</div>
-<div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle">Resources</div>
-<div id="menu_1.3" class="menuitemgroup">
-<div class="menuitem">
-<a href="release/">Download</a>
-</div>
-<div class="menuitem">
-<a href="nightly.html">Nightly builds</a>
-</div>
-<div class="menuitem">
-<a href="mailing_lists.html">Mailing Lists</a>
-</div>
-<div class="menuitem">
-<a href="issue_tracking.html">Issue Tracking</a>
-</div>
-<div class="menuitem">
-<a href="version_control.html">Version Control</a>
-</div>
-</div>
-<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Related Projects</div>
-<div id="menu_1.4" class="menuitemgroup">
-<div class="menuitem">
-<a href="http://lucene.apache.org/java/">Lucene Java</a>
-</div>
-<div class="menuitem">
-<a href="http://lucene.apache.org/hadoop/">Hadoop</a>
-</div>
-<div class="menuitem">
-<a href="http://incubator.apache.org/solr/">Solr</a>
-</div>
-</div>
-<div id="credit">
-<hr>
-<a href="http://forrest.apache.org/"><img border="0" title="Built with Apache Forrest" alt="Built with Apache Forrest - logo" src="images/built-with-forrest-button.png" style="width: 88px;height: 31px;"></a>
-</div>
-<div id="roundbottom">
-<img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
-<!--+
-  |alternative credits
-  +-->
-<div id="credit2"></div>
-</div>
-<!--+
-    |end Menu
-    +-->
-<!--+
-    |start content
-    +-->
-<div id="content">
-<div title="Portable Document Format" class="pdflink">
-<a class="dida" href="index.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
-        PDF</a>
-</div>
-<h1>Welcome to Nutch!</h1>
-<div id="minitoc-area">
-<ul class="minitoc">
-<li>
-<a href="#News">News</a>
-<ul class="minitoc">
-<li>
-<a href="#14+August+2009+-+Lucene+at+US+ApacheCon">14 August 2009 - Lucene at US ApacheCon</a>
-</li>
-<li>
-<a href="#23+March+2009+-+Apache+Nutch+1.0+Released">23 March 2009 - Apache Nutch 1.0 Released</a>
-</li>
-<li>
-<a href="#09+February+2009+-+Lucene+at+ApacheCon+Europe+2009+in%0A%09%09%09Amsterdam">09 February 2009 - Lucene at ApacheCon Europe 2009 in
-			Amsterdam</a>
-</li>
-<li>
-<a href="#2+April+2007%3A+Nutch+0.9+Released">2 April 2007: Nutch 0.9 Released</a>
-</li>
-<li>
-<a href="#24+September+2006%3A+Nutch+0.8.1+Released">24 September 2006: Nutch 0.8.1 Released</a>
-</li>
-<li>
-<a href="#25+July+2006%3A+Nutch+0.8+Released">25 July 2006: Nutch 0.8 Released</a>
-</li>
-<li>
-<a href="#31+March+2006%3A+Nutch+0.7.2+Released">31 March 2006: Nutch 0.7.2 Released</a>
-</li>
-<li>
-<a href="#1+October+2005%3A+Nutch+0.7.1+Released">1 October 2005: Nutch 0.7.1 Released</a>
-</li>
-<li>
-<a href="#17+August+2005%3A+Nutch+0.7+Released">17 August 2005: Nutch 0.7 Released</a>
-</li>
-<li>
-<a href="#June+2005%3A+Nutch+graduates+from+Incubator">June 2005: Nutch graduates from Incubator</a>
-</li>
-<li>
-<a href="#January+2005%3A+Nutch+Joins+Apache+Incubator">January 2005: Nutch Joins Apache Incubator</a>
-</li>
-<li>
-<a href="#September+2004%3A+Creative+Commons+launches+Nutch-based+Search">September 2004: Creative Commons launches Nutch-based Search</a>
-</li>
-<li>
-<a href="#September+2004%3A+Oregon+State+University+switches+to+Nutch">September 2004: Oregon State University switches to Nutch</a>
-</li>
-</ul>
-</li>
-</ul>
-</div> 
-
-    
-<a name="N1000D"></a><a name="News"></a>
-<h2 class="h3">News</h2>
-<div class="section">
-<a name="N10013"></a><a name="14+August+2009+-+Lucene+at+US+ApacheCon"></a>
-<h3 class="h4">14 August 2009 - Lucene at US ApacheCon</h3>
-<p>
-        
-<a href="http://www.us.apachecon.com/c/acus2009/" title="ApacheCon US 2009">
-            <img alt="ApacheCon Logo" class="float-right" src="http://www.apache.org/events/current-event-125x125.png">
-        </a>
-        ApacheCon US is once again in the Bay Area and Lucene is coming
-        along for the ride! The Lucene community has planned two full
-        days of talks, plus a meetup and the usual bevy of training.
-        With a well-balanced mix of first time and veteran ApacheCon
-        speakers, the
-        <a href="http://www.us.apachecon.com/c/acus2009/schedule#lucene">Lucene track</a>
-        at ApacheCon US promises to have something for everyone. Be sure
-        not to miss:
-    </p>
-<p> Training:</p>
-<ul>
-        
-<li>
-            
-<a href="http://www.us.apachecon.com/c/acus2009/sessions/437">Lucene Boot Camp</a>
-            - A two day training session, Nov. 2nd &amp; 3rd
-        </li>
-        
-<li>
-            
-<a href="http://www.us.apachecon.com/c/acus2009/sessions/375">Solr Day</a>
-            - A one day training session, Nov. 2nd
-        </li>
-    
-</ul>
-<p>Thursday, Nov. 5th</p>
-<ul>
-        
-<li>
-            
-<a href="http://www.us.apachecon.com/c/acus2009/sessions/428">Introduction to the Lucene Ecosystem
-            </a>
-            - Grant Ingersoll @ 9:00
-        </li>
-        
-<li>
-            
-<a href="http://www.us.apachecon.com/c/acus2009/sessions/461">Lucene Basics and New Features</a>
-            - Michael Busch @ 10:00
-        </li>
-        
-<li>
-            
-<a href="http://www.us.apachecon.com/c/acus2009/sessions/331">Apache Solr: Out of the Box</a>
-            - Chris Hostetter @ 14:00
-        </li>
-        
-<li>
-            
-<a href="http://www.us.apachecon.com/c/acus2009/sessions/427">Introduction to Nutch</a>
-            - Andrzej Bialecki @ 15:00
-        </li>
-        
-<li>
-            
-<a href="http://www.us.apachecon.com/c/acus2009/sessions/430">Lucene and Solr Performance Tuning</a>
-            - Mark Miller @ 16:30
-        </li>
-    
-</ul>
-<p>Friday, Nov. 6th</p>
-<ul>
-        
-<li>
-            
-<a href="http://www.us.apachecon.com/c/acus2009/sessions/332">Implementing an Information Retrieval
-                Framework for an Organizational Repository</a>
-            - Sithu D Sudarsan @ 9:00
-        </li>
-        
-<li>
-            
-<a href="http://www.us.apachecon.com/c/acus2009/sessions/333">Apache Mahout - Going from raw data to
-                Information</a>
-            - Isabel Drost @ 10:00
-        </li>
-        
-<li>
-            
-<a href="http://www.us.apachecon.com/c/acus2009/sessions/334">MIME Magic with Apache Tika</a>
-            - Jukka Zitting @ 11:30
-        </li>
-        
-<li>
-            
-<a href="http://www.us.apachecon.com/c/acus2009/sessions/335">Building Intelligent Search Applications
-                with the Lucene Ecosystem</a>
-            - Ted Dunning @ 14:00
-        </li>
-        
-<li>
-            
-<a href="http://www.us.apachecon.com/c/acus2009/sessions/462">Realtime Search</a>
-            - Jason Rutherglen @ 15:00
-        </li>
-    
-</ul>
-<a name="N10091"></a><a name="23+March+2009+-+Apache+Nutch+1.0+Released"></a>
-<h3 class="h4">23 March 2009 - Apache Nutch 1.0 Released</h3>
-<p>The 1.0 release of Nutch is now available. This release includes several major feature improvements
-      such as new indexing framework, new scoring framework, Apache Solr integration just to mention a few.
-      See <a href="http://www.apache.org/dist/lucene/nutch/CHANGES-1.0.txt">
-      list of changes</a>  made in this version. The release is available
-      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
-<a name="N100A3"></a><a name="09+February+2009+-+Lucene+at+ApacheCon+Europe+2009+in%0A%09%09%09Amsterdam"></a>
-<h3 class="h4">09 February 2009 - Lucene at ApacheCon Europe 2009 in
-			Amsterdam</h3>
-<p>
-			
-<a href="http://www.eu.apachecon.com/c/aceu2009/" title="ApacheCon EU 2009">
-				<img alt="ApacheCon EU 2009 Logo" class="float-right" src="http://www.eu.apachecon.com/page_attachments/0000/0115/125x125_basic.gif">
-			</a>
-
-			Lucene will be extremely well represented at
-			<a href="http://www.eu.apachecon.com/c/aceu2009/">ApacheCon EU 2009</a>
-			in Amsterdam, Netherlands this March 23-27, 2009:
-		</p>
-<ul>
-			
-<li>
-				
-<a href="http://eu.apachecon.com/c/aceu2009/sessions/197">Lucene Boot Camp</a>
-				- A two day training session, March 23 &amp; 24th</li>
-                
-<li>
-<a href="http://eu.apachecon.com/c/aceu2009/sessions/201">Solr Boot Camp</a> - A one day training session, March 24th</li>
-                
-<li>
-<a href="http://eu.apachecon.com/c/aceu2009/sessions/136">Introducing Apache Mahout</a> - Grant Ingersoll. March 25th @ 10:30</li>
-                
-<li>
-<a href="http://eu.apachecon.com/c/aceu2009/sessions/137">Lucene/Solr Case Studies</a> - Erik Hatcher. March 25th @ 11:30</li>
-                
-<li>
-<a href="http://eu.apachecon.com/c/aceu2009/sessions/138">Advanced Indexing Techniques with Apache Lucene</a> - Michael Busch. March 25th @ 14:00</li>  
-                   
-<li>
-<a href="http://eu.apachecon.com/c/aceu2009/sessions/251">Apache Solr - A Case Study</a> - Uri Boness. March 26th @ 17:30</li>
-           
-<li>
-<a href="http://eu.apachecon.com/c/aceu2009/sessions/250">Best of breed - httpd, forrest, solr and droids</a> - Thorsten Scherler. March 27th @ 17:30</li>
-           
-<li>
-<a href="http://eu.apachecon.com/c/aceu2009/sessions/165">Apache Droids - an intelligent standalone robot framework</a> - Thorsten Scherler. March 26th @ 15:00</li>
-
-               
-</ul>
-<a name="N100EF"></a><a name="2+April+2007%3A+Nutch+0.9+Released"></a>
-<h3 class="h4">2 April 2007: Nutch 0.9 Released</h3>
-<p>The 0.9 release of Nutch is now available. This is the second release of Nutch
-      based entirely on the underlying Hadoop platform. This release includes several critical
-      bug fixes, as well as key speedups described in more detail at 
-      <a href="http://blog.foofactory.fi/2007/03/twice-speed-half-size.html">Sami Siren's blog</a>.
-      See <a href="http://www.apache.org/dist/lucene/nutch/CHANGES-0.9.txt">
-      list of changes</a>  made in this version. The release is available
-      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
-<a name="N10105"></a><a name="24+September+2006%3A+Nutch+0.8.1+Released"></a>
-<h3 class="h4">24 September 2006: Nutch 0.8.1 Released</h3>
-<p>The 0.8.1 release of Nutch is now available. This is a maintenance release to 0.8 branch fixing many serious bugs found in version 0.8.
-      See <a href="http://www.apache.org/dist/lucene/nutch/CHANGES-0.8.1.txt">
-      list of changes</a>  made in this version. The release is available
-      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
-<a name="N10117"></a><a name="25+July+2006%3A+Nutch+0.8+Released"></a>
-<h3 class="h4">25 July 2006: Nutch 0.8 Released</h3>
-<p>The 0.8 release of Nutch is now available. This is the first release of Nutch based on
-      hadoop architecture. See <a href="http://svn.apache.org/viewvc/lucene/nutch/tags/release-0.8/CHANGES.txt?view=markup">
-      CHANGES.txt</a> for list of changes made in this version. The release is available
-      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
-<a name="N10129"></a><a name="31+March+2006%3A+Nutch+0.7.2+Released"></a>
-<h3 class="h4">31 March 2006: Nutch 0.7.2 Released</h3>
-<p>The 0.7.2 release of Nutch is now available. This is a bug fix release for 0.7 branch. See
-      <a href="http://svn.apache.org/viewcvs.cgi/lucene/nutch/branches/branch-0.7/CHANGES.txt?rev=390158">
-      CHANGES.txt</a> for details. The release is available
-      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
-<a name="N1013B"></a><a name="1+October+2005%3A+Nutch+0.7.1+Released"></a>
-<h3 class="h4">1 October 2005: Nutch 0.7.1 Released</h3>
-<p>The 0.7.1 release of Nutch is now available. This is a bug fix release. See
-      <a href="http://svn.apache.org/viewcvs.cgi/lucene/nutch/branches/branch-0.7/CHANGES.txt?rev=292986">
-      CHANGES.txt</a> for details. The release is available
-      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
-<a name="N1014D"></a><a name="17+August+2005%3A+Nutch+0.7+Released"></a>
-<h3 class="h4">17 August 2005: Nutch 0.7 Released</h3>
-<p>This is the first Nutch release as an Apache Lucene sub-project. See 
-      <a href="http://svn.apache.org/viewcvs.cgi/lucene/nutch/trunk/CHANGES.txt?rev=233150">
-      CHANGES.txt</a> for details. The release is available 
-      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
-<a name="N1015F"></a><a name="June+2005%3A+Nutch+graduates+from+Incubator"></a>
-<h3 class="h4">June 2005: Nutch graduates from Incubator</h3>
-<p>Nutch has now graduated from the Apache incubator, and is now
-      a Subproject of Lucene.</p>
-<a name="N10169"></a><a name="January+2005%3A+Nutch+Joins+Apache+Incubator"></a>
-<h3 class="h4">January 2005: Nutch Joins Apache Incubator</h3>
-<p>Nutch is a two-year-old open source project, previously
-        hosted at Sourceforge and backed by its own non-profit
-        organization. The non-profit was founded in order to assign
-        copyright, so that we could retain the right to change the
-        license. We have now determined that the Apache license is the
-        appropriate license for Nutch and no longer require the
-        overhead of an independent non-profit organization. Nutch's
-        board of directors and its developers were both polled and
-        supported the move to the Apache foundation.</p>
-<a name="N10173"></a><a name="September+2004%3A+Creative+Commons+launches+Nutch-based+Search"></a>
-<h3 class="h4">September 2004: Creative Commons launches Nutch-based Search</h3>
-<p>Creative Commons unveiled a beta version of its search
-      engine, which scours the web for text, images, audio, and video
-      free to re-use on certain terms &mdash; a search refinement offered by
-      no other company or organization.</p>
-<p>See the <a href="http://creativecommons.org/press-releases/entry/5064">Creative
-      Commons Press Release</a> for more details.</p>
-<a name="N10184"></a><a name="September+2004%3A+Oregon+State+University+switches+to+Nutch"></a>
-<h3 class="h4">September 2004: Oregon State University switches to Nutch</h3>
-<p>Oregon State University is converting its searching
-      infrastructure from Google&trade; to the open source project
-      Nutch. The effort to replace Google&trade; will realize
-      significant cost savings for Oregon State University, while
-      promoting both the Nutch Search Engine and transparency in
-      search engine use and management.</p>
-<p>For more details see the announcement by OSU's <a href="http://osuosl.org/news_folder/nutch">Open Source
-      Lab</a>.</p>
-</div>
-
-  
-</div>
-<!--+
-    |end content
-    +-->
-<div class="clearboth">&nbsp;</div>
-</div>
-<div id="footer">
-<!--+
-    |start bottomstrip
-    +-->
-<div class="lastmodified">
-<script type="text/javascript"><!--
-document.write("Last Published: " + document.lastModified);
-//  --></script>
-</div>
-<div class="copyright">
-        Copyright &copy;
-         2006 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
-</div>
-<div id="logos"></div>
-<!--+
-    |end bottomstrip
-    +-->
-</div>
-</body>
-</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/sample/nutch_logo_tm.gif
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/sample/nutch_logo_tm.gif b/src/plugin/parse-tika/sample/nutch_logo_tm.gif
deleted file mode 100644
index 0545a60..0000000
Binary files a/src/plugin/parse-tika/sample/nutch_logo_tm.gif and /dev/null differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/sample/ootest.odt
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/sample/ootest.odt b/src/plugin/parse-tika/sample/ootest.odt
deleted file mode 100644
index e36e389..0000000
Binary files a/src/plugin/parse-tika/sample/ootest.odt and /dev/null differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/sample/ootest.sxw
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/sample/ootest.sxw b/src/plugin/parse-tika/sample/ootest.sxw
deleted file mode 100644
index 260b1c2..0000000
Binary files a/src/plugin/parse-tika/sample/ootest.sxw and /dev/null differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/sample/ootest.txt
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/sample/ootest.txt b/src/plugin/parse-tika/sample/ootest.txt
deleted file mode 100644
index 685f89a..0000000
--- a/src/plugin/parse-tika/sample/ootest.txt
+++ /dev/null
@@ -1,30 +0,0 @@
-\ufeffAbcedfg				?????
-Abcdefg
-Abcdefg
-abcdefg
-
-
-
-
-
-
-
-
-
-
- http://www.openoffice.org
-
-Title
-Col1
-Col2
-Col3
-head
-Cell1
-Cell2
-Cel3
-total
-TOTAL
-TOTAL
-TOTAL
-
-Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Integer a leo in lacus malesuada ornare. Mauris sagittis. Nam vestibulum. Nunc gravida vestibulum augue. Praesent sed lectus quis lectus adipiscing bibendum. Sed nulla. Duis posuere justo eget urna. Proin lorem orci, vestibulum ut, consequat molestie, eleifend a, nibh. Mauris sed lacus. Etiam blandit tincidunt neque. Cras ac sapien. Duis erat. 

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/sample/pdftest.pdf
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/sample/pdftest.pdf b/src/plugin/parse-tika/sample/pdftest.pdf
deleted file mode 100644
index e7c6e62..0000000
--- a/src/plugin/parse-tika/sample/pdftest.pdf
+++ /dev/null
@@ -1,157 +0,0 @@
-%PDF-1.2 
-%\ufffd\ufffd\ufffd\ufffd
- 
-9 0 obj
-<<
-/Length 10 0 R
-/Filter /FlateDecode 
->>
-stream
-H\ufffd\u0350\ufffdJ\ufffd0\ufffd\ufffd \ufffd\ufffd{\ufffd\ufffdf\ufffd$M\ufffd\ufffdn\ufffd-\ufffd\ufffd\ufffd[&je\ufffd\ufffd\ufffd\u06e4\ufffd~\ufffd$\ufffd\ufffd\ufffd}\ufffd\u0245\ufffdIj\ufffd\ufffd\ufffds\ufffd\ufffd\ufffd\ufffd~\ufffdX\ufffd-],\ufffd\ufffd$Y\ufffd\ufffd\ufffd)\ufffd'N\ufffdu\ufffd1!\ufffd\ufffd\ufffdV\ufffd?\ufffd\ufffd?
-\ufffdb1Rbb\ufffd\u0489\ufffdH\ufffd[\ufffd\ufffdTD:#\ufffd&\u062d\ufffd\ufffdX\ufffd\ufffd\ufffdi\ufffd$qnf\ufffd\ufffd\ufffd\ufffd\ufffd]\ufffd\ufffd\ufffd\ufffd\ufffd\ufffda\ufffd\ufffd{\ufffd\ufffd\u0623\ufffd\ufffd\ufffdq|J\ufffdLs]\ufffdQ\ufffdI\ufffd\ufffdj\ufffd%\ufffd\ufffd9\ufffd\ufffd`\ufffd\u09ba\ufffd\ufffdU\ufffdite\ufffdz\ufffd$\ufffd\ufffd\ufffd\ufffdOeB\ufffd\u0112\u04af\ufffdR\ufffd\ufffd@z\u0717\ufffd\ufffd\ufffdg\ufffd\ufffd\ufffd<\ufffd\ufffd\ufffd
-endstream
-endobj
-10 0 obj
-246
-endobj
-4 0 obj
-<<
-/Type /Page
-/Parent 5 0 R
-/Resources <<
-/Font <<
-/F0 6 0 R 
-/F1 7 0 R 
->>
-/ProcSet 2 0 R
->>
-/Contents 9 0 R
->>
-endobj
-6 0 obj
-<<
-/Type /Font
-/Subtype /TrueType
-/Name /F0
-/BaseFont /Arial
-/Encoding /WinAnsiEncoding
->>
-endobj
-7 0 obj
-<<
-/Type /Font
-/Subtype /TrueType
-/Name /F1
-/BaseFont /BookAntiqua,Bold
-/FirstChar 31
-/LastChar 255
-/Widths [ 750 250 278 402 606 500 889 833 227 333 333 444 606 250 333 250 
-296 500 500 500 500 500 500 500 500 500 500 250 250 606 606 606 
-444 747 778 667 722 833 611 556 833 833 389 389 778 611 1000 833 
-833 611 833 722 611 667 778 778 1000 667 667 667 333 606 333 606 
-500 333 500 611 444 611 500 389 556 611 333 333 611 333 889 611 
-556 611 611 389 444 333 611 556 833 500 556 500 310 606 310 606 
-750 500 750 333 500 500 1000 500 500 333 1000 611 389 1000 750 750 
-750 750 278 278 500 500 606 500 1000 333 998 444 389 833 750 750 
-667 250 278 500 500 606 500 606 500 333 747 438 500 606 333 747 
-500 400 549 361 361 333 576 641 250 333 361 488 500 889 890 889 
-444 778 778 778 778 778 778 1000 722 611 611 611 611 389 389 389 
-389 833 833 833 833 833 833 833 606 833 778 778 778 778 667 611 
-611 500 500 500 500 500 500 778 444 500 500 500 500 333 333 333 
-333 556 611 556 556 556 556 556 549 556 611 611 611 611 556 611 
-556 ]
-/Encoding /WinAnsiEncoding
-/FontDescriptor 8 0 R
->>
-endobj
-8 0 obj
-<<
-/Type /FontDescriptor
-/FontName /BookAntiqua,Bold
-/Flags 16418
-/FontBBox [ -250 -260 1236 930 ]
-/MissingWidth 750
-/StemV 146
-/StemH 146
-/ItalicAngle 0
-/CapHeight 930
-/XHeight 651
-/Ascent 930
-/Descent 260
-/Leading 210
-/MaxWidth 1030
-/AvgWidth 460
->>
-endobj
-2 0 obj
-[ /PDF /Text  ]
-endobj
-5 0 obj
-<<
-/Kids [4 0 R ]
-/Count 1
-/Type /Pages
-/MediaBox [ 0 0 612 792 ]
->>
-endobj
-1 0 obj
-<<
-/Creator (1725.fm)
-/CreationDate (1-Jan-3 18:15PM)
-/Title (1725.PDF)
-/Author (Unknown)
-/Producer (Acrobat PDFWriter 3.02 for Windows)
-/Keywords ()
-/Subject ()
->>
-endobj
-3 0 obj
-<<
-/Pages 5 0 R
-/Type /Catalog
-/DefaultGray 11 0 R
-/DefaultRGB  12 0 R
->>
-endobj
-11 0 obj
-[/CalGray
-<<
-/WhitePoint [0.9505 1 1.0891 ]
-/Gamma 0.2468 
->>
-]
-endobj
-12 0 obj
-[/CalRGB
-<<
-/WhitePoint [0.9505 1 1.0891 ]
-/Gamma [0.2468 0.2468 0.2468 ]
-/Matrix [0.4361 0.2225 0.0139 0.3851 0.7169 0.0971 0.1431 0.0606 0.7141 ]
->>
-]
-endobj
-xref
-0 13
-0000000000 65535 f
-0000002172 00000 n
-0000002046 00000 n
-0000002363 00000 n
-0000000375 00000 n
-0000002080 00000 n
-0000000518 00000 n
-0000000633 00000 n
-0000001760 00000 n
-0000000021 00000 n
-0000000352 00000 n
-0000002460 00000 n
-0000002548 00000 n
-trailer
-<<
-/Size 13
-/Root 3 0 R
-/Info 1 0 R
-/ID [<47149510433dd4882f05f8c124223734><47149510433dd4882f05f8c124223734>]
->>
-startxref
-2726
-%%EOF

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/sample/rsstest.rss
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/sample/rsstest.rss b/src/plugin/parse-tika/sample/rsstest.rss
deleted file mode 100644
index 6c4ae48..0000000
--- a/src/plugin/parse-tika/sample/rsstest.rss
+++ /dev/null
@@ -1,37 +0,0 @@
-<?xml version="1.0" encoding="ISO-8859-1" ?>
-<!--
-	Licensed to the Apache Software Foundation (ASF) under one or more
-	contributor license agreements.  See the NOTICE file distributed with
-	this work for additional information regarding copyright ownership.
-	The ASF licenses this file to You under the Apache License, Version 2.0
-	(the "License"); you may not use this file except in compliance with
-	the License.  You may obtain a copy of the License at
-	
-	http://www.apache.org/licenses/LICENSE-2.0
-	
-	Unless required by applicable law or agreed to in writing, software
-	distributed under the License is distributed on an "AS IS" BASIS,
-	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	See the License for the specific language governing permissions and
-	limitations under the License.
--->
-<rss version="0.91">
-    <channel>
-      <title>TestChannel</title>
-      <link>http://test.channel.com/</link> 
-      <description>Sample RSS File for Junit test</description> 
-      <language>en-us</language>
-      
-      <item>
-        <title>Home Page of Chris Mattmann</title>
-        <link>http://www-scf.usc.edu/~mattmann/</link>
-        <description>Chris Mattmann's home page</description>
-      </item>
-
-      <item>
-        <title>Awesome Open Source Search Engine</title> 
-        <link>http://www.nutch.org/</link> 
-        <description>Yup, that's what it is</description> 
-      </item>
-   </channel>
-</rss>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/sample/test.rtf
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/sample/test.rtf b/src/plugin/parse-tika/sample/test.rtf
deleted file mode 100644
index c67a6c8..0000000
--- a/src/plugin/parse-tika/sample/test.rtf
+++ /dev/null
@@ -1,17 +0,0 @@
-{\rtf1\ansi\deff1\adeflang1025
-{\fonttbl{\f0\froman\fprq2\fcharset0 Times;}{\f1\froman\fprq2\fcharset0 Times New Roman;}{\f2\fmodern\fprq1\fcharset0 Courier New;}{\f3\froman\fprq2\fcharset0 Times New Roman;}{\f4\fnil\fprq2\fcharset0 Interface User;}{\f5\fnil\fprq2\fcharset0 Lucidasans;}{\f6\fnil\fprq0\fcharset0 Lucidasans;}}
-{\colortbl;\red0\green0\blue0;\red0\green0\blue128;\red128\green128\blue128;}
-{\stylesheet{\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af5\afs24\lang255\ltrch\dbch\af4\afs24\langfe255\loch\f1\fs24\lang1033\snext1 Default;}
-{\s2\sa120\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\afs24\lang255\ltrch\dbch\afs24\langfe255\loch\f1\fs24\lang1033\sbasedon1\snext2 Text body;}
-{\s3\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af6\afs24\lang255\ltrch\dbch\af1\afs24\langfe255\loch\f1\fs24\lang1033\sbasedon2\snext3 List;}
-{\s4\sb120\sa120\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af6\afs20\lang255\ai\ltrch\dbch\afs20\langfe255\ai\loch\f1\fs20\lang1033\i\sbasedon1\snext4 Caption;}
-{\s5\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af6\afs24\lang255\ltrch\dbch\afs24\langfe255\loch\f1\fs24\lang1033\sbasedon1\snext5 Index;}
-{\*\cs7\cf0\rtlch\af2\afs24\lang255\ltrch\dbch\af2\afs24\langfe255\loch\f2\fs24\lang1033 Teletype;}
-{\*\cs8\cf2\ul\rtlch\afs24\lang255\ltrch\dbch\afs24\langfe255\loch\fs24\lang1033 Internet Link;}
-}
-{\info{\title test rft document}{\subject tests}{\creatim\yr2004\mo9\dy20\hr19\min36}{\revtim\yr1601\mo1\dy1\hr0\min0}{\printim\yr1601\mo1\dy1\hr0\min0}{\comment StarWriter}{\vern6450}}\deftab709
-{\*\pgdsctbl
-{\pgdsc0\pgdscuse195\pgwsxn11905\pghsxn16837\marglsxn1800\margrsxn1800\margtsxn1440\margbsxn1440\pgdscnxt0 Default;}}
-{\*\pgdscno0}\paperh16837\paperw11905\margl1800\margr1800\margt1440\margb1440\sectd\sbknone\pgwsxn11905\pghsxn16837\marglsxn1800\margrsxn1800\margtsxn1440\margbsxn1440\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc
-\pard\plain \ltrpar\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\ql\rtlch\af5\afs24\lang255\ltrch\dbch\af4\afs24\langfe255\loch\f1\fs24\lang1033{\loch\f2\fs24\lang1033\i0\b0\*\cs7\cf0\rtlch\ltrch\dbch\loch\f2\fs24\lang1033 The quick brown fox jumps over the lazy dog}
-\par }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/sample/word97.doc
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/sample/word97.doc b/src/plugin/parse-tika/sample/word97.doc
deleted file mode 100644
index 4d012da..0000000
Binary files a/src/plugin/parse-tika/sample/word97.doc and /dev/null differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
----------------------------------------------------------------------
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
deleted file mode 100644
index 7c0d71b..0000000
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.parse.tika;
-
-import java.lang.ClassLoader;
-import java.lang.InstantiationException;
-import java.util.HashMap;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.tika.parser.html.BoilerpipeContentHandler;
-import de.l3s.boilerpipe.BoilerpipeExtractor;
-import de.l3s.boilerpipe.extractors.*;
-
-class BoilerpipeExtractorRepository {
-
-    public static final Log LOG = LogFactory.getLog(BoilerpipeExtractorRepository.class);
-    public static final HashMap<String, BoilerpipeExtractor> extractorRepository = new HashMap<String, BoilerpipeExtractor>();
- 
-    /**
-     * Returns an instance of the specified extractor
-     */
-    public static synchronized BoilerpipeExtractor getExtractor(String boilerpipeExtractorName) {
-      // Check if there's no instance of this extractor
-      if (!extractorRepository.containsKey(boilerpipeExtractorName)) {
-        // FQCN
-        boilerpipeExtractorName = "de.l3s.boilerpipe.extractors." + boilerpipeExtractorName;
-
-        // Attempt to load the class
-        try {
-          ClassLoader loader = BoilerpipeExtractor.class.getClassLoader();
-          Class extractorClass = loader.loadClass(boilerpipeExtractorName);
-
-          // Add an instance to the repository
-          extractorRepository.put(boilerpipeExtractorName, (BoilerpipeExtractor)extractorClass.newInstance());
-
-        } catch (ClassNotFoundException e) {
-          LOG.error("BoilerpipeExtractor " + boilerpipeExtractorName + " not found!");
-        } catch (InstantiationException e) {
-          LOG.error("Could not instantiate " + boilerpipeExtractorName);
-        } catch (Exception e) {
-          LOG.error(e);
-        }
-      }
-
-      return extractorRepository.get(boilerpipeExtractorName);
-    }
-
-}