You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by gr...@apache.org on 2015/09/21 19:19:42 UTC
svn commit: r1704368 [2/2] - in /tika/trunk/tika-example/src:
main/java/org/apache/tika/example/ test/java/org/apache/tika/example/
Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/MetadataAwareLuceneIndexer.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/MetadataAwareLuceneIndexer.java?rev=1704368&r1=1704367&r2=1704368&view=diff
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/MetadataAwareLuceneIndexer.java (original)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/MetadataAwareLuceneIndexer.java Mon Sep 21 17:19:26 2015
@@ -1,9 +1,12 @@
-/**
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -34,54 +37,52 @@ import org.apache.tika.metadata.Property
*/
@SuppressWarnings("deprecation")
public class MetadataAwareLuceneIndexer {
+ private Tika tika;
- private Tika tika;
-
- private IndexWriter writer;
-
- public MetadataAwareLuceneIndexer(IndexWriter writer, Tika tika) {
- this.writer = writer;
- this.tika = tika;
- }
-
- public void indexContentSpecificMet(File file) throws Exception {
- Metadata met = new Metadata();
- try (InputStream is = new FileInputStream(file)) {
- tika.parse(is, met);
- Document document = new Document();
- for (String key : met.names()) {
- String[] values = met.getValues(key);
- for (String val : values) {
- document.add(new Field(key, val, Store.YES, Index.ANALYZED));
- }
- writer.addDocument(document);
- }
- }
- }
-
- public void indexWithDublinCore(File file) throws Exception {
- Metadata met = new Metadata();
- met.add(Metadata.CREATOR, "Manning");
- met.add(Metadata.CREATOR, "Tika in Action");
- met.set(Metadata.DATE, new Date());
- met.set(Metadata.FORMAT, tika.detect(file));
- met.set(DublinCore.SOURCE, file.toURI().toURL().toString());
- met.add(Metadata.SUBJECT, "File");
- met.add(Metadata.SUBJECT, "Indexing");
- met.add(Metadata.SUBJECT, "Metadata");
- met.set(Property.externalClosedChoise(Metadata.RIGHTS, "public",
- "private"), "public");
- try (InputStream is = new FileInputStream(file)) {
- tika.parse(is, met);
- Document document = new Document();
- for (String key : met.names()) {
- String[] values = met.getValues(key);
- for (String val : values) {
- document.add(new Field(key, val, Store.YES, Index.ANALYZED));
- }
- writer.addDocument(document);
- }
- }
- }
+ private IndexWriter writer;
+ public MetadataAwareLuceneIndexer(IndexWriter writer, Tika tika) {
+ this.writer = writer;
+ this.tika = tika;
+ }
+
+ public void indexContentSpecificMet(File file) throws Exception {
+ Metadata met = new Metadata();
+ try (InputStream is = new FileInputStream(file)) {
+ tika.parse(is, met);
+ Document document = new Document();
+ for (String key : met.names()) {
+ String[] values = met.getValues(key);
+ for (String val : values) {
+ document.add(new Field(key, val, Store.YES, Index.ANALYZED));
+ }
+ writer.addDocument(document);
+ }
+ }
+ }
+
+ public void indexWithDublinCore(File file) throws Exception {
+ Metadata met = new Metadata();
+ met.add(Metadata.CREATOR, "Manning");
+ met.add(Metadata.CREATOR, "Tika in Action");
+ met.set(Metadata.DATE, new Date());
+ met.set(Metadata.FORMAT, tika.detect(file));
+ met.set(DublinCore.SOURCE, file.toURI().toURL().toString());
+ met.add(Metadata.SUBJECT, "File");
+ met.add(Metadata.SUBJECT, "Indexing");
+ met.add(Metadata.SUBJECT, "Metadata");
+ met.set(Property.externalClosedChoise(Metadata.RIGHTS, "public",
+ "private"), "public");
+ try (InputStream is = new FileInputStream(file)) {
+ tika.parse(is, met);
+ Document document = new Document();
+ for (String key : met.names()) {
+ String[] values = met.getValues(key);
+ for (String val : values) {
+ document.add(new Field(key, val, Store.YES, Index.ANALYZED));
+ }
+ writer.addDocument(document);
+ }
+ }
+ }
}
Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java?rev=1704368&r1=1704367&r2=1704368&view=diff
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java (original)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/MyFirstTika.java Mon Sep 21 17:19:26 2015
@@ -1,9 +1,12 @@
-/**
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -38,23 +41,23 @@ import static java.nio.charset.StandardC
* Demonstrates how to call the different components within Tika: its
* {@link Detector} framework (aka MIME identification and repository), its
* {@link Parser} interface, its {@link LanguageIdentifier} and other goodies.
- *
+ * <p>
* It also shows the "easy way" via {@link AutoDetectParser}
*/
public class MyFirstTika {
public static void main(String[] args) throws Exception {
String filename = args[0];
TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
-
+
Metadata metadata = new Metadata();
String text = parseUsingComponents(filename, tikaConfig, metadata);
System.out.println("Parsed Metadata: ");
System.out.println(metadata);
System.out.println("Parsed Text: ");
System.out.println(text);
-
+
System.out.println("-------------------------");
-
+
metadata = new Metadata();
text = parseUsingAutoDetect(filename, tikaConfig, metadata);
System.out.println("Parsed Metadata: ");
@@ -62,19 +65,20 @@ public class MyFirstTika {
System.out.println("Parsed Text: ");
System.out.println(text);
}
-
- public static String parseUsingAutoDetect(String filename, TikaConfig tikaConfig,
- Metadata metadata) throws Exception {
+
+ public static String parseUsingAutoDetect(String filename, TikaConfig tikaConfig,
+ Metadata metadata) throws Exception {
System.out.println("Handling using AutoDetectParser: [" + filename + "]");
-
+
AutoDetectParser parser = new AutoDetectParser(tikaConfig);
ContentHandler handler = new BodyContentHandler();
TikaInputStream stream = TikaInputStream.get(new File(filename), metadata);
parser.parse(stream, handler, metadata, new ParseContext());
return handler.toString();
}
- public static String parseUsingComponents(String filename, TikaConfig tikaConfig,
- Metadata metadata) throws Exception {
+
+ public static String parseUsingComponents(String filename, TikaConfig tikaConfig,
+ Metadata metadata) throws Exception {
MimeTypes mimeRegistry = tikaConfig.getMimeRepository();
System.out.println("Examining: [" + filename + "]");
@@ -106,7 +110,7 @@ public class MyFirstTika {
// Have the file parsed to get the content and metadata
ContentHandler handler = new BodyContentHandler();
parser.parse(stream, handler, metadata, new ParseContext());
-
+
return handler.toString();
}
}
Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java?rev=1704368&r1=1704367&r2=1704368&view=diff
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java (original)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/ParsingExample.java Mon Sep 21 17:19:26 2015
@@ -14,6 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
package org.apache.tika.example;
import java.io.IOException;
@@ -44,7 +45,7 @@ public class ParsingExample {
/**
* Example of how to use Tika's parseToString method to parse the content of a file,
* and return any text found.
- *
+ * <p>
* Note: Tika.parseToString() will extract content from the outer container
* document and any embedded/attached documents.
*
@@ -60,20 +61,20 @@ public class ParsingExample {
/**
* Example of how to use Tika to parse a file when you do not know its file type
* ahead of time.
- *
+ * <p>
* AutoDetectParser attempts to discover the file's type automatically, then call
* the exact Parser built for that file type.
- *
+ * <p>
* The stream to be parsed by the Parser. In this case, we get a file from the
* resources folder of this project.
- *
+ * <p>
* Handlers are used to get the exact information you want out of the host of
* information gathered by Parsers. The body content handler, intuitively, extracts
* everything that would go between HTML body tags.
- *
+ * <p>
* The Metadata object will be filled by the Parser with Metadata discovered about
* the file being parsed.
- *
+ * <p>
* Note: This example will extract content from the outer document and all
* embedded documents. However, if you choose to use a {@link ParseContext},
* make sure to set a {@link Parser} or else embedded content will not be
@@ -154,7 +155,6 @@ public class ParsingExample {
*/
public List<Metadata> recursiveParserWrapperExample() throws IOException,
SAXException, TikaException {
-
Parser p = new AutoDetectParser();
ContentHandlerFactory factory = new BasicContentHandlerFactory(
BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1);
@@ -187,7 +187,7 @@ public class ParsingExample {
*/
public String serializedRecursiveParserWrapperExample() throws IOException,
SAXException, TikaException {
- List metadataList = recursiveParserWrapperExample();
+ List<Metadata> metadataList = recursiveParserWrapperExample();
StringWriter writer = new StringWriter();
JsonMetadataList.toJson(metadataList, writer);
return writer.toString();
@@ -195,7 +195,6 @@ public class ParsingExample {
/**
- *
* @param outputPath -- output directory to place files
* @return list of files created
* @throws IOException
@@ -207,9 +206,9 @@ public class ParsingExample {
InputStream stream = ParsingExample.class.getResourceAsStream("test_recursive_embedded.docx");
ExtractEmbeddedFiles ex = new ExtractEmbeddedFiles();
ex.extract(stream, outputPath);
- List<Path> ret = new ArrayList<Path>();
+ List<Path> ret = new ArrayList<>();
try (DirectoryStream<Path> dirStream = Files.newDirectoryStream(outputPath)) {
- for (Path entry: dirStream) {
+ for (Path entry : dirStream) {
ret.add(entry);
}
}
Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/Pharmacy.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/Pharmacy.java?rev=1704368&r1=1704367&r2=1704368&view=diff
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/Pharmacy.java (original)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/Pharmacy.java Mon Sep 21 17:19:26 2015
@@ -1,9 +1,12 @@
-/**
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -17,15 +20,13 @@ package org.apache.tika.example;
import java.security.Key;
public class Pharmacy {
+ private static Key key = null;
- private static Key key = null;
-
- public static Key getKey() {
- return key;
- }
-
- public static void setKey(Key key) {
- Pharmacy.key = key;
- }
-
+ public static Key getKey() {
+ return key;
+ }
+
+ public static void setKey(Key key) {
+ Pharmacy.key = key;
+ }
}
Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/PrescriptionParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/PrescriptionParser.java?rev=1704368&r1=1704367&r2=1704368&view=diff
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/PrescriptionParser.java (original)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/PrescriptionParser.java Mon Sep 21 17:19:26 2015
@@ -1,9 +1,12 @@
-/**
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -26,27 +29,24 @@ import org.apache.tika.sax.TeeContentHan
import org.xml.sax.ContentHandler;
public class PrescriptionParser extends XMLParser {
+ private static final long serialVersionUID = 7690682277511967388L;
- private static final long serialVersionUID = 7690682277511967388L;
-
- @Override
- protected ContentHandler getContentHandler(ContentHandler handler,
- Metadata metadata, ParseContext context) {
- String xpd = "http://example.com/2011/xpd";
-
- ContentHandler doctor = new ElementMetadataHandler(xpd, "doctor",
- metadata, "xpd:doctor");
- ContentHandler patient = new ElementMetadataHandler(xpd, "patient",
- metadata, "xpd:patient");
-
- return new TeeContentHandler(super.getContentHandler(handler, metadata,
- context), doctor, patient);
- }
-
- @Override
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return Collections.singleton(MediaType
- .application("x-prescription+xml"));
- }
-
+ @Override
+ protected ContentHandler getContentHandler(ContentHandler handler,
+ Metadata metadata, ParseContext context) {
+ String xpd = "http://example.com/2011/xpd";
+
+ ContentHandler doctor = new ElementMetadataHandler(xpd, "doctor",
+ metadata, "xpd:doctor");
+ ContentHandler patient = new ElementMetadataHandler(xpd, "patient",
+ metadata, "xpd:patient");
+
+ return new TeeContentHandler(super.getContentHandler(handler, metadata,
+ context), doctor, patient);
+ }
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return Collections.singleton(MediaType.application("x-prescription+xml"));
+ }
}
Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/RecentFiles.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/RecentFiles.java?rev=1704368&r1=1704367&r2=1704368&view=diff
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/RecentFiles.java (original)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/RecentFiles.java Mon Sep 21 17:19:26 2015
@@ -1,9 +1,12 @@
-/**
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -35,114 +38,108 @@ import org.apache.tika.metadata.DublinCo
import org.apache.tika.metadata.Metadata;
/**
- *
* Builds on top of the LuceneIndexer and the Metadata discussions in Chapter 6
* to output an RSS (or RDF) feed of files crawled by the LuceneIndexer within
* the last N minutes.
*/
@SuppressWarnings("deprecation")
public class RecentFiles {
+ private IndexReader reader;
- private IndexReader reader;
-
- private SimpleDateFormat rssDateFormat = new SimpleDateFormat(
- "E, dd MMM yyyy HH:mm:ss z", Locale.getDefault());
-
- public String generateRSS(File indexFile) throws CorruptIndexException,
- IOException {
- StringBuffer output = new StringBuffer();
- output.append(getRSSHeaders());
- IndexSearcher searcher = null;
- try {
- reader = IndexReader.open(new SimpleFSDirectory(indexFile));
- searcher = new IndexSearcher(reader);
- GregorianCalendar gc = new java.util.GregorianCalendar(TimeZone.getDefault(), Locale.getDefault());
- gc.setTime(new Date());
- String nowDateTime = ISO8601.format(gc);
- gc.add(java.util.GregorianCalendar.MINUTE, -5);
- String fiveMinsAgo = ISO8601.format(gc);
- TermRangeQuery query = new TermRangeQuery(Metadata.DATE.toString(),
- fiveMinsAgo, nowDateTime, true, true);
- TopScoreDocCollector collector = TopScoreDocCollector.create(20,
- true);
- searcher.search(query, collector);
- ScoreDoc[] hits = collector.topDocs().scoreDocs;
- for (int i = 0; i < hits.length; i++) {
- Document doc = searcher.doc(hits[i].doc);
- output.append(getRSSItem(doc));
- }
-
- } finally {
- if (reader != null) reader.close();
- if (searcher != null) searcher.close();
- }
-
- output.append(getRSSFooters());
- return output.toString();
- }
-
- public String getRSSItem(Document doc) {
- StringBuffer output = new StringBuffer();
- output.append("<item>");
- output.append(emitTag("guid", doc.get(DublinCore.SOURCE.getName()),
- "isPermalink", "true"));
- output.append(emitTag("title", doc.get(Metadata.TITLE), null, null));
- output.append(emitTag("link", doc.get(DublinCore.SOURCE.getName()),
- null, null));
- output.append(emitTag("author", doc.get(Metadata.CREATOR), null, null));
- for (String topic : doc.getValues(Metadata.SUBJECT)) {
- output.append(emitTag("category", topic, null, null));
- }
- output.append(emitTag("pubDate", rssDateFormat.format(ISO8601.parse(doc
- .get(Metadata.DATE.toString()))), null, null));
- output.append(emitTag("description", doc.get(Metadata.TITLE), null,
- null));
- output.append("</item>");
- return output.toString();
- }
-
- public String getRSSHeaders() {
- StringBuffer output = new StringBuffer();
- output.append("<?xml version=\"1.0\" encoding=\"utf-8\">");
- output.append("<rss version=\"2.0\">");
- output.append(" <channel>");
- output.append(" <title>Tika in Action: Recent Files Feed."
- + "</title>");
- output.append(" <description>Chapter 6 Examples demonstrating "
- + "use of Tika Metadata for RSS.</description>");
- output.append(" <link>tikainaction.rss</link>");
- output.append(" <lastBuildDate>" + rssDateFormat.format(new Date())
- + "</lastBuildDate>");
- output.append(" <generator>Manning Publications: Tika in Action"
- + "</generator>");
- output.append(" <copyright>All Rights Reserved</copyright>");
- return output.toString();
- }
-
- public String getRSSFooters() {
- StringBuffer output = new StringBuffer();
- output.append(" </channel>");
- return output.toString();
- }
-
- private String emitTag(String tagName, String value, String attributeName,
- String attributeValue) {
- StringBuffer output = new StringBuffer();
- output.append("<");
- output.append(tagName);
- if (attributeName != null) {
- output.append(" ");
- output.append(attributeName);
- output.append("=\"");
- output.append(attributeValue);
- output.append("\"");
- }
- output.append(">");
- output.append(value);
- output.append("</");
- output.append(tagName);
- output.append(">");
- return output.toString();
- }
+ private SimpleDateFormat rssDateFormat = new SimpleDateFormat(
+ "E, dd MMM yyyy HH:mm:ss z", Locale.getDefault());
+ public String generateRSS(File indexFile) throws CorruptIndexException,
+ IOException {
+ StringBuffer output = new StringBuffer();
+ output.append(getRSSHeaders());
+ IndexSearcher searcher = null;
+ try {
+ reader = IndexReader.open(new SimpleFSDirectory(indexFile));
+ searcher = new IndexSearcher(reader);
+ GregorianCalendar gc = new java.util.GregorianCalendar(TimeZone.getDefault(), Locale.getDefault());
+ gc.setTime(new Date());
+ String nowDateTime = ISO8601.format(gc);
+ gc.add(java.util.GregorianCalendar.MINUTE, -5);
+ String fiveMinsAgo = ISO8601.format(gc);
+ TermRangeQuery query = new TermRangeQuery(Metadata.DATE.toString(),
+ fiveMinsAgo, nowDateTime, true, true);
+ TopScoreDocCollector collector = TopScoreDocCollector.create(20,
+ true);
+ searcher.search(query, collector);
+ ScoreDoc[] hits = collector.topDocs().scoreDocs;
+ for (int i = 0; i < hits.length; i++) {
+ Document doc = searcher.doc(hits[i].doc);
+ output.append(getRSSItem(doc));
+ }
+
+ } finally {
+ if (reader != null) reader.close();
+ if (searcher != null) searcher.close();
+ }
+
+ output.append(getRSSFooters());
+ return output.toString();
+ }
+
+ public String getRSSItem(Document doc) {
+ StringBuilder output = new StringBuilder();
+ output.append("<item>");
+ output.append(emitTag("guid", doc.get(DublinCore.SOURCE.getName()),
+ "isPermalink", "true"));
+ output.append(emitTag("title", doc.get(Metadata.TITLE), null, null));
+ output.append(emitTag("link", doc.get(DublinCore.SOURCE.getName()),
+ null, null));
+ output.append(emitTag("author", doc.get(Metadata.CREATOR), null, null));
+ for (String topic : doc.getValues(Metadata.SUBJECT)) {
+ output.append(emitTag("category", topic, null, null));
+ }
+ output.append(emitTag("pubDate", rssDateFormat.format(ISO8601.parse(doc
+ .get(Metadata.DATE.toString()))), null, null));
+ output.append(emitTag("description", doc.get(Metadata.TITLE), null,
+ null));
+ output.append("</item>");
+ return output.toString();
+ }
+
+ public String getRSSHeaders() {
+ StringBuilder output = new StringBuilder();
+ output.append("<?xml version=\"1.0\" encoding=\"utf-8\">");
+ output.append("<rss version=\"2.0\">");
+ output.append(" <channel>");
+ output.append(" <title>Tika in Action: Recent Files Feed.</title>");
+ output.append(" <description>Chapter 6 Examples demonstrating "
+ + "use of Tika Metadata for RSS.</description>");
+ output.append(" <link>tikainaction.rss</link>");
+ output.append(" <lastBuildDate>");
+ output.append(rssDateFormat.format(new Date()));
+ output.append("</lastBuildDate>");
+ output.append(" <generator>Manning Publications: Tika in Action</generator>");
+ output.append(" <copyright>All Rights Reserved</copyright>");
+ return output.toString();
+ }
+
+ public String getRSSFooters() {
+ return " </channel>";
+ }
+
+ private String emitTag(String tagName, String value, String attributeName,
+ String attributeValue) {
+ StringBuilder output = new StringBuilder();
+ output.append("<");
+ output.append(tagName);
+ if (attributeName != null) {
+ output.append(" ");
+ output.append(attributeName);
+ output.append("=\"");
+ output.append(attributeValue);
+ output.append("\"");
+ }
+ output.append(">");
+ output.append(value);
+ output.append("</");
+ output.append(tagName);
+ output.append(">");
+ return output.toString();
+ }
}
Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/RollbackSoftware.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/RollbackSoftware.java?rev=1704368&r1=1704367&r2=1704368&view=diff
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/RollbackSoftware.java (original)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/RollbackSoftware.java Mon Sep 21 17:19:26 2015
@@ -1,9 +1,12 @@
-/**
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -43,100 +46,92 @@ import static java.nio.charset.StandardC
* Demonstrates Tika and its ability to sense symlinks.
*/
public class RollbackSoftware {
-
- public static void main(String[] args) throws Exception {
- RollbackSoftware r = new RollbackSoftware();
- r.rollback(new File(args[0]));
- }
-
- public void rollback(File deployArea) throws IOException, SAXException,
- TikaException {
- LinkContentHandler handler = new LinkContentHandler();
- Metadata met = new Metadata();
- DeploymentAreaParser parser = new DeploymentAreaParser();
- parser.parse(IOUtils.toInputStream(deployArea.getAbsolutePath(), UTF_8),
- handler, met);
- List<Link> links = handler.getLinks();
- if (links.size() < 2)
- throw new IOException("Must have installed at least 2 versions!");
- Collections.sort(links, new Comparator<Link>() {
- public int compare(Link o1, Link o2) {
- return o1.getText().compareTo(o2.getText());
- }
- });
-
- this.updateVersion(links.get(links.size() - 2).getText());
-
- }
-
- private void updateVersion(String version) {
- System.out.println("Rolling back to version: [" + version + "]");
- }
-
- class DeploymentAreaParser implements Parser {
-
- private static final long serialVersionUID = -2356647405087933468L;
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.tika.parser.Parser#getSupportedTypes(
- * org.apache.tika.parser.ParseContext)
- */
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return Collections.unmodifiableSet(new HashSet<MediaType>(Arrays
- .asList(MediaType.TEXT_PLAIN)));
- }
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.tika.parser.Parser#parse(java.io.InputStream,
- * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata)
- */
- public void parse(InputStream is, ContentHandler handler,
- Metadata metadata) throws IOException, SAXException,
- TikaException {
- parse(is, handler, metadata, new ParseContext());
- }
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.tika.parser.Parser#parse(java.io.InputStream,
- * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata,
- * org.apache.tika.parser.ParseContext)
- */
-
- public void parse(InputStream is, ContentHandler handler,
- Metadata metadata, ParseContext context) throws IOException,
- SAXException, TikaException {
-
- File deployArea = new File(IOUtils.toString(is, UTF_8));
- File[] versions = deployArea.listFiles(new FileFilter() {
-
- public boolean accept(File pathname) {
- return !pathname.getName().startsWith("current");
- }
- });
-
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler,
- metadata);
- xhtml.startDocument();
- for (File v : versions) {
- if (isSymlink(v))
- continue;
- xhtml.startElement("a", "href", v.toURI().toURL().toExternalForm());
- xhtml.characters(v.getName());
- xhtml.endElement("a");
- }
-
- }
-
- }
-
- private boolean isSymlink(File f) throws IOException {
- return !f.getAbsolutePath().equals(f.getCanonicalPath());
- }
-
+ public static void main(String[] args) throws Exception {
+ RollbackSoftware r = new RollbackSoftware();
+ r.rollback(new File(args[0]));
+ }
+
+ public void rollback(File deployArea) throws IOException, SAXException,
+ TikaException {
+ LinkContentHandler handler = new LinkContentHandler();
+ Metadata met = new Metadata();
+ DeploymentAreaParser parser = new DeploymentAreaParser();
+ parser.parse(IOUtils.toInputStream(deployArea.getAbsolutePath(), UTF_8),
+ handler, met);
+ List<Link> links = handler.getLinks();
+ if (links.size() < 2)
+ throw new IOException("Must have installed at least 2 versions!");
+ Collections.sort(links, new Comparator<Link>() {
+ public int compare(Link o1, Link o2) {
+ return o1.getText().compareTo(o2.getText());
+ }
+ });
+
+ this.updateVersion(links.get(links.size() - 2).getText());
+ }
+
+ private void updateVersion(String version) {
+ System.out.println("Rolling back to version: [" + version + "]");
+ }
+
+ class DeploymentAreaParser implements Parser {
+ private static final long serialVersionUID = -2356647405087933468L;
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.apache.tika.parser.Parser#getSupportedTypes(
+ * org.apache.tika.parser.ParseContext)
+ */
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return Collections.unmodifiableSet(new HashSet<MediaType>(Arrays
+ .asList(MediaType.TEXT_PLAIN)));
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.apache.tika.parser.Parser#parse(java.io.InputStream,
+ * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata)
+ */
+ public void parse(InputStream is, ContentHandler handler,
+ Metadata metadata) throws IOException, SAXException,
+ TikaException {
+ parse(is, handler, metadata, new ParseContext());
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.apache.tika.parser.Parser#parse(java.io.InputStream,
+ * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata,
+ * org.apache.tika.parser.ParseContext)
+ */
+ public void parse(InputStream is, ContentHandler handler,
+ Metadata metadata, ParseContext context) throws IOException,
+ SAXException, TikaException {
+
+ File deployArea = new File(IOUtils.toString(is, UTF_8));
+ File[] versions = deployArea.listFiles(new FileFilter() {
+ public boolean accept(File pathname) {
+ return !pathname.getName().startsWith("current");
+ }
+ });
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler,
+ metadata);
+ xhtml.startDocument();
+ for (File v : versions) {
+ if (isSymlink(v))
+ continue;
+ xhtml.startElement("a", "href", v.toURI().toURL().toExternalForm());
+ xhtml.characters(v.getName());
+ xhtml.endElement("a");
+ }
+ }
+ }
+
+ private boolean isSymlink(File f) throws IOException {
+ return !f.getAbsolutePath().equals(f.getCanonicalPath());
+ }
}
Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/SimpleTextExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/SimpleTextExtractor.java?rev=1704368&r1=1704367&r2=1704368&view=diff
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/SimpleTextExtractor.java (original)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/SimpleTextExtractor.java Mon Sep 21 17:19:26 2015
@@ -1,9 +1,12 @@
-/**
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -15,20 +18,19 @@
package org.apache.tika.example;
import java.io.File;
+
import org.apache.tika.Tika;
public class SimpleTextExtractor {
-
- public static void main(String[] args) throws Exception {
- // Create a Tika instance with the default configuration
- Tika tika = new Tika();
-
- // Parse all given files and print out the extracted
- // text content
- for (String file : args) {
- String text = tika.parseToString(new File(file));
- System.out.print(text);
- }
- }
-
+ public static void main(String[] args) throws Exception {
+ // Create a Tika instance with the default configuration
+ Tika tika = new Tika();
+
+ // Parse all given files and print out the extracted
+ // text content
+ for (String file : args) {
+ String text = tika.parseToString(new File(file));
+ System.out.print(text);
+ }
+ }
}
Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/SimpleTypeDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/SimpleTypeDetector.java?rev=1704368&r1=1704367&r2=1704368&view=diff
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/SimpleTypeDetector.java (original)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/SimpleTypeDetector.java Mon Sep 21 17:19:26 2015
@@ -1,9 +1,12 @@
-/**
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -19,14 +22,12 @@ import java.io.File;
import org.apache.tika.Tika;
public class SimpleTypeDetector {
+ public static void main(String[] args) throws Exception {
+ Tika tika = new Tika();
- public static void main(String[] args) throws Exception {
- Tika tika = new Tika();
-
- for (String file : args) {
- String type = tika.detect(new File(file));
- System.out.println(file + ": " + type);
- }
- }
-
+ for (String file : args) {
+ String type = tika.detect(new File(file));
+ System.out.println(file + ": " + type);
+ }
+ }
}
Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/SpringExample.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/SpringExample.java?rev=1704368&r1=1704367&r2=1704368&view=diff
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/SpringExample.java (original)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/SpringExample.java Mon Sep 21 17:19:26 2015
@@ -1,9 +1,12 @@
-/**
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -26,14 +29,12 @@ import org.springframework.context.suppo
import static java.nio.charset.StandardCharsets.UTF_8;
public class SpringExample {
-
- public static void main(String[] args) throws Exception {
- ApplicationContext context = new ClassPathXmlApplicationContext(
- new String[] { "org/apache/tika/example/spring.xml" });
- Parser parser = context.getBean("tika", Parser.class);
- parser.parse(new ByteArrayInputStream("Hello, World!".getBytes(UTF_8)),
- new WriteOutContentHandler(System.out), new Metadata(),
- new ParseContext());
- }
-
+ public static void main(String[] args) throws Exception {
+ ApplicationContext context = new ClassPathXmlApplicationContext(
+ new String[]{"org/apache/tika/example/spring.xml"});
+ Parser parser = context.getBean("tika", Parser.class);
+ parser.parse(new ByteArrayInputStream("Hello, World!".getBytes(UTF_8)),
+ new WriteOutContentHandler(System.out), new Metadata(),
+ new ParseContext());
+ }
}
Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/TIAParsingExample.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/TIAParsingExample.java?rev=1704368&r1=1704367&r2=1704368&view=diff
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/TIAParsingExample.java (original)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/TIAParsingExample.java Mon Sep 21 17:19:26 2015
@@ -1,9 +1,12 @@
-/**
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -52,149 +55,147 @@ import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
public class TIAParsingExample {
-
- public static String parseToStringExample() throws Exception {
- File document = new File("example.doc");
- String content = new Tika().parseToString(document);
- System.out.print(content);
- return content;
- }
-
- public static void parseToReaderExample() throws Exception {
- File document = new File("example.doc");
- try (Reader reader = new Tika().parse(document)) {
- char[] buffer = new char[1000];
- int n = reader.read(buffer);
- while (n != -1) {
- System.out.append(CharBuffer.wrap(buffer, 0, n));
- n = reader.read(buffer);
- }
- }
- }
-
- public static void parseFileInputStream(String filename) throws Exception {
- Parser parser = new AutoDetectParser();
- ContentHandler handler = new DefaultHandler();
- Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
- try (InputStream stream = new FileInputStream(new File(filename))) {
- parser.parse(stream, handler, metadata, context);
- }
- }
-
- public static void parseURLStream(String address) throws Exception {
- Parser parser = new AutoDetectParser();
- ContentHandler handler = new DefaultHandler();
- Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
- try (InputStream stream = new GZIPInputStream(new URL(address).openStream())) {
- parser.parse(stream, handler, metadata, context);
- }
- }
-
- public static void parseTikaInputStream(String filename) throws Exception {
- Parser parser = new AutoDetectParser();
- ContentHandler handler = new DefaultHandler();
- Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
- try (InputStream stream = TikaInputStream.get(new File(filename))) {
- parser.parse(stream, handler, metadata, context);
- }
- }
-
- public static File tikaInputStreamGetFile(String filename) throws Exception {
- try (InputStream stream = TikaInputStream.get(new File(filename))) {
- TikaInputStream tikaInputStream = TikaInputStream.get(stream);
- File file = tikaInputStream.getFile();
- return file;
- }
- }
-
- public static void useHtmlParser() throws Exception {
- InputStream stream = new ByteArrayInputStream(new byte[0]);
- ContentHandler handler = new DefaultHandler();
- Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
- Parser parser = new HtmlParser();
- parser.parse(stream, handler, metadata, context);
- }
-
- public static void useCompositeParser() throws Exception {
- InputStream stream = new ByteArrayInputStream(new byte[0]);
- ContentHandler handler = new DefaultHandler();
- ParseContext context = new ParseContext();
- Map<MediaType, Parser> parsersByType = new HashMap<MediaType, Parser>();
- parsersByType.put(MediaType.parse("text/html"), new HtmlParser());
- parsersByType.put(MediaType.parse("application/xml"), new XMLParser());
-
- CompositeParser parser = new CompositeParser();
- parser.setParsers(parsersByType);
- parser.setFallback(new TXTParser());
-
- Metadata metadata = new Metadata();
- metadata.set(Metadata.CONTENT_TYPE, "text/html");
- parser.parse(stream, handler, metadata, context);
- }
-
- public static void useAutoDetectParser() throws Exception {
- InputStream stream = new ByteArrayInputStream(new byte[0]);
- ContentHandler handler = new DefaultHandler();
- Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
- Parser parser = new AutoDetectParser();
- parser.parse(stream, handler, metadata, context);
- }
-
- public static void testTeeContentHandler(String filename) throws Exception {
- InputStream stream = new ByteArrayInputStream(new byte[0]);
- Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
- Parser parser = new AutoDetectParser();
- LinkContentHandler linkCollector = new LinkContentHandler();
- try (OutputStream output = new FileOutputStream(new File(filename))) {
- ContentHandler handler = new TeeContentHandler(
- new BodyContentHandler(output), linkCollector);
- parser.parse(stream, handler, metadata, context);
- }
- }
-
- public static void testLocale() throws Exception {
- InputStream stream = new ByteArrayInputStream(new byte[0]);
- ContentHandler handler = new DefaultHandler();
- Metadata metadata = new Metadata();
- Parser parser = new AutoDetectParser();
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.ENGLISH);
- parser.parse(stream, handler, metadata, context);
- }
-
- public static void testHtmlMapper() throws Exception {
- InputStream stream = new ByteArrayInputStream(new byte[0]);
- ContentHandler handler = new DefaultHandler();
- Metadata metadata = new Metadata();
- Parser parser = new AutoDetectParser();
- ParseContext context = new ParseContext();
- context.set(HtmlMapper.class, new IdentityHtmlMapper());
- parser.parse(stream, handler, metadata, context);
- }
-
- public static void testCompositeDocument() throws Exception {
- InputStream stream = new ByteArrayInputStream(new byte[0]);
- ContentHandler handler = new DefaultHandler();
- Metadata metadata = new Metadata();
- Parser parser = new AutoDetectParser();
- ParseContext context = new ParseContext();
- context.set(Parser.class, new ParserDecorator(parser) {
- private static final long serialVersionUID = 4424210691523343833L;
-
- @Override
- public void parse(InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- // custom processing of the component document
- }
- });
- parser.parse(stream, handler, metadata, context);
- }
-
+ public static String parseToStringExample() throws Exception {
+ File document = new File("example.doc");
+ String content = new Tika().parseToString(document);
+ System.out.print(content);
+ return content;
+ }
+
+ public static void parseToReaderExample() throws Exception {
+ File document = new File("example.doc");
+ try (Reader reader = new Tika().parse(document)) {
+ char[] buffer = new char[1000];
+ int n = reader.read(buffer);
+ while (n != -1) {
+ System.out.append(CharBuffer.wrap(buffer, 0, n));
+ n = reader.read(buffer);
+ }
+ }
+ }
+
+ public static void parseFileInputStream(String filename) throws Exception {
+ Parser parser = new AutoDetectParser();
+ ContentHandler handler = new DefaultHandler();
+ Metadata metadata = new Metadata();
+ ParseContext context = new ParseContext();
+ try (InputStream stream = new FileInputStream(new File(filename))) {
+ parser.parse(stream, handler, metadata, context);
+ }
+ }
+
+ public static void parseURLStream(String address) throws Exception {
+ Parser parser = new AutoDetectParser();
+ ContentHandler handler = new DefaultHandler();
+ Metadata metadata = new Metadata();
+ ParseContext context = new ParseContext();
+ try (InputStream stream = new GZIPInputStream(new URL(address).openStream())) {
+ parser.parse(stream, handler, metadata, context);
+ }
+ }
+
+ public static void parseTikaInputStream(String filename) throws Exception {
+ Parser parser = new AutoDetectParser();
+ ContentHandler handler = new DefaultHandler();
+ Metadata metadata = new Metadata();
+ ParseContext context = new ParseContext();
+ try (InputStream stream = TikaInputStream.get(new File(filename))) {
+ parser.parse(stream, handler, metadata, context);
+ }
+ }
+
+ public static File tikaInputStreamGetFile(String filename) throws Exception {
+ try (InputStream stream = TikaInputStream.get(new File(filename))) {
+ TikaInputStream tikaInputStream = TikaInputStream.get(stream);
+ File file = tikaInputStream.getFile();
+ return file;
+ }
+ }
+
+ public static void useHtmlParser() throws Exception {
+ InputStream stream = new ByteArrayInputStream(new byte[0]);
+ ContentHandler handler = new DefaultHandler();
+ Metadata metadata = new Metadata();
+ ParseContext context = new ParseContext();
+ Parser parser = new HtmlParser();
+ parser.parse(stream, handler, metadata, context);
+ }
+
+ public static void useCompositeParser() throws Exception {
+ InputStream stream = new ByteArrayInputStream(new byte[0]);
+ ContentHandler handler = new DefaultHandler();
+ ParseContext context = new ParseContext();
+ Map<MediaType, Parser> parsersByType = new HashMap<MediaType, Parser>();
+ parsersByType.put(MediaType.parse("text/html"), new HtmlParser());
+ parsersByType.put(MediaType.parse("application/xml"), new XMLParser());
+
+ CompositeParser parser = new CompositeParser();
+ parser.setParsers(parsersByType);
+ parser.setFallback(new TXTParser());
+
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "text/html");
+ parser.parse(stream, handler, metadata, context);
+ }
+
+ public static void useAutoDetectParser() throws Exception {
+ InputStream stream = new ByteArrayInputStream(new byte[0]);
+ ContentHandler handler = new DefaultHandler();
+ Metadata metadata = new Metadata();
+ ParseContext context = new ParseContext();
+ Parser parser = new AutoDetectParser();
+ parser.parse(stream, handler, metadata, context);
+ }
+
+ public static void testTeeContentHandler(String filename) throws Exception {
+ InputStream stream = new ByteArrayInputStream(new byte[0]);
+ Metadata metadata = new Metadata();
+ ParseContext context = new ParseContext();
+ Parser parser = new AutoDetectParser();
+ LinkContentHandler linkCollector = new LinkContentHandler();
+ try (OutputStream output = new FileOutputStream(new File(filename))) {
+ ContentHandler handler = new TeeContentHandler(
+ new BodyContentHandler(output), linkCollector);
+ parser.parse(stream, handler, metadata, context);
+ }
+ }
+
+ public static void testLocale() throws Exception {
+ InputStream stream = new ByteArrayInputStream(new byte[0]);
+ ContentHandler handler = new DefaultHandler();
+ Metadata metadata = new Metadata();
+ Parser parser = new AutoDetectParser();
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.ENGLISH);
+ parser.parse(stream, handler, metadata, context);
+ }
+
+ public static void testHtmlMapper() throws Exception {
+ InputStream stream = new ByteArrayInputStream(new byte[0]);
+ ContentHandler handler = new DefaultHandler();
+ Metadata metadata = new Metadata();
+ Parser parser = new AutoDetectParser();
+ ParseContext context = new ParseContext();
+ context.set(HtmlMapper.class, new IdentityHtmlMapper());
+ parser.parse(stream, handler, metadata, context);
+ }
+
+ public static void testCompositeDocument() throws Exception {
+ InputStream stream = new ByteArrayInputStream(new byte[0]);
+ ContentHandler handler = new DefaultHandler();
+ Metadata metadata = new Metadata();
+ Parser parser = new AutoDetectParser();
+ ParseContext context = new ParseContext();
+ context.set(Parser.class, new ParserDecorator(parser) {
+ private static final long serialVersionUID = 4424210691523343833L;
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ // custom processing of the component document
+ }
+ });
+ parser.parse(stream, handler, metadata, context);
+ }
}
Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/TrecDocumentGenerator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/TrecDocumentGenerator.java?rev=1704368&r1=1704367&r2=1704368&view=diff
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/TrecDocumentGenerator.java (original)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/TrecDocumentGenerator.java Mon Sep 21 17:19:26 2015
@@ -1,9 +1,12 @@
-/**
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -25,87 +28,80 @@ import org.apache.tika.exception.TikaExc
import org.apache.tika.metadata.Metadata;
/**
- *
* Generates document summaries for corpus analysis in the Open Relevance
* project.
- *
*/
@SuppressWarnings("deprecation")
public class TrecDocumentGenerator {
-
- public TrecDocument summarize(File file) throws FileNotFoundException,
- IOException, TikaException {
- Tika tika = new Tika();
- Metadata met = new Metadata();
-
- String contents = tika.parseToString(new FileInputStream(file), met);
- return new TrecDocument(met.get(Metadata.RESOURCE_NAME_KEY), contents,
- met.getDate(Metadata.DATE));
-
- }
-
- // copied from
- // http://svn.apache.org/repos/asf/lucene/openrelevance/trunk/src/java/org/
- // apache/orp/util/TrecDocument.java
- // since the ORP jars aren't published anywhere
- class TrecDocument {
- private CharSequence docname;
- private CharSequence body;
- private Date date;
-
- public TrecDocument(CharSequence docname, CharSequence body, Date date) {
- this.docname = docname;
- this.body = body;
- this.date = date;
- }
-
- public TrecDocument() {
- }
-
- /**
- * @return the docname
- */
- public CharSequence getDocname() {
- return docname;
- }
-
- /**
- * @param docname
- * the docname to set
- */
- public void setDocname(CharSequence docname) {
- this.docname = docname;
- }
-
- /**
- * @return the body
- */
- public CharSequence getBody() {
- return body;
- }
-
- /**
- * @param body
- * the body to set
- */
- public void setBody(CharSequence body) {
- this.body = body;
- }
-
- /**
- * @return the date
- */
- public Date getDate() {
- return date;
- }
-
- /**
- * @param date
- * the date to set
- */
- public void setDate(Date date) {
- this.date = date;
- }
- }
-
+ public TrecDocument summarize(File file) throws FileNotFoundException,
+ IOException, TikaException {
+ Tika tika = new Tika();
+ Metadata met = new Metadata();
+
+ String contents = tika.parseToString(new FileInputStream(file), met);
+ return new TrecDocument(met.get(Metadata.RESOURCE_NAME_KEY), contents,
+ met.getDate(Metadata.DATE));
+
+ }
+
+ // copied from
+ // http://svn.apache.org/repos/asf/lucene/openrelevance/trunk/src/java/org/
+ // apache/orp/util/TrecDocument.java
+ // since the ORP jars aren't published anywhere
+ class TrecDocument {
+ private CharSequence docname;
+ private CharSequence body;
+ private Date date;
+
+ public TrecDocument(CharSequence docname, CharSequence body, Date date) {
+ this.docname = docname;
+ this.body = body;
+ this.date = date;
+ }
+
+ public TrecDocument() {
+ }
+
+ /**
+ * @return the docname
+ */
+ public CharSequence getDocname() {
+ return docname;
+ }
+
+ /**
+ * @param docname the docname to set
+ */
+ public void setDocname(CharSequence docname) {
+ this.docname = docname;
+ }
+
+ /**
+ * @return the body
+ */
+ public CharSequence getBody() {
+ return body;
+ }
+
+ /**
+ * @param body the body to set
+ */
+ public void setBody(CharSequence body) {
+ this.body = body;
+ }
+
+ /**
+ * @return the date
+ */
+ public Date getDate() {
+ return date;
+ }
+
+ /**
+ * @param date the date to set
+ */
+ public void setDate(Date date) {
+ this.date = date;
+ }
+ }
}
Modified: tika/trunk/tika-example/src/main/java/org/apache/tika/example/ZipListFiles.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/ZipListFiles.java?rev=1704368&r1=1704367&r2=1704368&view=diff
==============================================================================
--- tika/trunk/tika-example/src/main/java/org/apache/tika/example/ZipListFiles.java (original)
+++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/ZipListFiles.java Mon Sep 21 17:19:26 2015
@@ -1,9 +1,12 @@
-/**
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -14,34 +17,29 @@
package org.apache.tika.example;
-//JDK imports
import java.io.IOException;
import java.util.Collections;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
/**
- *
- *
* Example code listing from Chapter 1. Lists a zip file's entries using JDK's
* standard APIs.
- *
*/
public class ZipListFiles {
- public static void main(String[] args) throws Exception {
- if (args.length > 0) {
- for (String file : args) {
- System.out.println("Files in " + file + " file:");
- listZipEntries(file);
- }
- }
- }
-
- public static void listZipEntries(String path) throws IOException {
- ZipFile zip = new ZipFile(path);
- for (ZipEntry entry : Collections.list(zip.entries())) {
- System.out.println(entry.getName());
- }
- }
+ public static void main(String[] args) throws Exception {
+ if (args.length > 0) {
+ for (String file : args) {
+ System.out.println("Files in " + file + " file:");
+ listZipEntries(file);
+ }
+ }
+ }
-}
\ No newline at end of file
+ public static void listZipEntries(String path) throws IOException {
+ ZipFile zip = new ZipFile(path);
+ for (ZipEntry entry : Collections.list(zip.entries())) {
+ System.out.println(entry.getName());
+ }
+ }
+}
Modified: tika/trunk/tika-example/src/test/java/org/apache/tika/example/AdvancedTypeDetectorTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/java/org/apache/tika/example/AdvancedTypeDetectorTest.java?rev=1704368&r1=1704367&r2=1704368&view=diff
==============================================================================
--- tika/trunk/tika-example/src/test/java/org/apache/tika/example/AdvancedTypeDetectorTest.java (original)
+++ tika/trunk/tika-example/src/test/java/org/apache/tika/example/AdvancedTypeDetectorTest.java Mon Sep 21 17:19:26 2015
@@ -1,9 +1,12 @@
-/**
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -14,18 +17,14 @@
package org.apache.tika.example;
+import org.junit.Test;
import static org.junit.Assert.assertEquals;
-import org.junit.Test;
-
@SuppressWarnings("deprecation")
public class AdvancedTypeDetectorTest {
-
- @Test
- public void testDetectWithCustomConfig() throws Exception {
- assertEquals("application/xml",
- AdvancedTypeDetector.detectWithCustomConfig("pom.xml"));
- }
-
+ @Test
+ public void testDetectWithCustomConfig() throws Exception {
+ assertEquals("application/xml", AdvancedTypeDetector.detectWithCustomConfig("pom.xml"));
+ }
}