You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@streams.apache.org by sb...@apache.org on 2014/04/02 19:25:31 UTC
git commit: initial version of boiler pipe processor (originally
authored by @smashew)
Repository: incubator-streams
Updated Branches:
refs/heads/springcleaning f1518b3dd -> da2d80c74
initial version of boiler pipe processor (originally authored by @smashew)
Project: http://git-wip-us.apache.org/repos/asf/incubator-streams/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-streams/commit/da2d80c7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-streams/tree/da2d80c7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-streams/diff/da2d80c7
Branch: refs/heads/springcleaning
Commit: da2d80c74991dd86d45aed50edd2252a1697cb12
Parents: f1518b3
Author: sblackmon <sb...@w2odigital.com>
Authored: Wed Apr 2 12:23:49 2014 -0500
Committer: sblackmon <sb...@w2odigital.com>
Committed: Wed Apr 2 12:23:49 2014 -0500
----------------------------------------------------------------------
streams-contrib/streams-processor-tika/pom.xml | 139 ++++++++++
.../org/apache/streams/tika/CategoryParser.java | 95 +++++++
.../org/apache/streams/tika/LinkExpander.java | 251 +++++++++++++++++++
.../org/apache/streams/tika/TikaProcessor.java | 104 ++++++++
.../apache/streams/tika/BoilerPipeArticle.json | 80 ++++++
.../java/org/apache/streams/util/DateUtil.java | 174 +++++++++++++
6 files changed, 843 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/da2d80c7/streams-contrib/streams-processor-tika/pom.xml
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-tika/pom.xml b/streams-contrib/streams-processor-tika/pom.xml
new file mode 100644
index 0000000..b320d38
--- /dev/null
+++ b/streams-contrib/streams-processor-tika/pom.xml
@@ -0,0 +1,139 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+
+ <modelVersion>4.0.0</modelVersion>
+ <artifactId>streams-processor-tika</artifactId>
+ <version>0.1-SNAPSHOT</version>
+
+ <parent>
+ <groupId>org.apache.streams</groupId>
+ <artifactId>streams-contrib</artifactId>
+ <version>0.1-SNAPSHOT</version>
+ </parent>
+
+ <properties>
+ <tika.version>1.5</tika.version>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.streams</groupId>
+ <artifactId>streams-config</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.streams</groupId>
+ <artifactId>streams-core</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.streams</groupId>
+ <artifactId>streams-pojo</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.streams</groupId>
+ <artifactId>streams-processor-urls</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-core</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-annotations</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.jsonschema2pojo</groupId>
+ <artifactId>jsonschema2pojo-core</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${tika.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parsers</artifactId>
+ <version>${tika.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <sourceDirectory>src/main/java</sourceDirectory>
+ <testSourceDirectory>src/test/java</testSourceDirectory>
+ <resources>
+ <resource>
+ <directory>src/main/resources</directory>
+ </resource>
+ </resources>
+ <testResources>
+ <testResource>
+ <directory>src/test/resources</directory>
+ </testResource>
+ </testResources>
+ <plugins>
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>build-helper-maven-plugin</artifactId>
+ <version>1.8</version>
+ <executions>
+ <execution>
+ <id>add-source</id>
+ <phase>generate-sources</phase>
+ <goals>
+ <goal>add-source</goal>
+ </goals>
+ <configuration>
+ <sources>
+ <source>target/generated-sources/jsonschema2pojo</source>
+ </sources>
+ </configuration>
+ </execution>
+ <execution>
+ <id>add-source-jaxb2</id>
+ <phase>generate-sources</phase>
+ <goals>
+ <goal>add-source</goal>
+ </goals>
+ <configuration>
+ <sources>
+ <source>target/generated-sources/jaxb2</source>
+ </sources>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.jsonschema2pojo</groupId>
+ <artifactId>jsonschema2pojo-maven-plugin</artifactId>
+ <configuration>
+ <addCompileSourceRoot>true</addCompileSourceRoot>
+ <generateBuilders>true</generateBuilders>
+ <sourcePaths>
+ <sourcePath>src/main/jsonschema/org/apache/streams/tika/BoilerPipeArticle.json</sourcePath>
+ </sourcePaths>
+ <outputDirectory>target/generated-sources/jsonschema2pojo</outputDirectory>
+ <targetPackage>org.apache.streams.tika</targetPackage>
+ <useLongIntegers>true</useLongIntegers>
+ <useJodaDates>true</useJodaDates>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>generate</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+</project>
http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/da2d80c7/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/CategoryParser.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/CategoryParser.java b/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/CategoryParser.java
new file mode 100644
index 0000000..36ca2de
--- /dev/null
+++ b/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/CategoryParser.java
@@ -0,0 +1,95 @@
+package org.apache.streams.tika;
+
+import java.io.UnsupportedEncodingException;
+import java.net.URLDecoder;
+import java.util.List;
+
public class CategoryParser
{
    /**
     * This method takes a URL and from that text alone determines what categories that URL belongs in.
     *
     * The URL is normalized (decoded, protocol stripped), tokenized, and the tokens
     * that look like real keywords (purely alphanumeric, at least two characters)
     * are returned as the categories.
     *
     * @param url - String URL to categorize
     * @return categories - A List&lt;String&gt; of categories the URL seemingly belongs in (never null)
     */
    public static List<String> getCategoriesFromUrl(String url) {

        // Clean the URL to remove useless bits and encoding artifacts
        String normalizedUrl = normalizeURL(url);

        // Break the url apart and get the good stuff
        String[] keywords = tokenizeURL(normalizedUrl);

        // Keep only tokens that are plausible keywords: alphanumeric and at
        // least 2 characters long.  (The original implementation computed the
        // tokens and then returned null, which breaks any caller that
        // iterates the result.)
        List<String> categories = new ArrayList<String>();
        for (String keyword : keywords) {
            if (keyword.matches("[a-zA-Z0-9]{2,}")) {
                categories.add(keyword);
            }
        }
        return categories;
    }

    /**
     * Removes the protocol, if it exists, from the front and
     * removes any random encoding characters
     * Extend this to do other url cleaning/pre-processing
     * @param url - The String URL to normalize
     * @return normalizedUrl - The String URL that has no junk or surprises
     */
    private static String normalizeURL(String url)
    {
        // Decode URL to remove any %20 type stuff
        String normalizedUrl = url;
        try {
            // I've used a URLDecoder that's part of Java here,
            // but this functionality exists in most modern languages
            // and is universally called url decoding
            normalizedUrl = URLDecoder.decode(url, "UTF-8");
        }
        catch (UnsupportedEncodingException uee)
        {
            // UTF-8 is guaranteed by the JVM spec, so this cannot happen in
            // practice; if it somehow does, fall back to the raw URL.
            System.err.println("Unable to Decode URL. Decoding skipped.");
            uee.printStackTrace();
        }

        // Remove the protocol, http:// ftp:// or similar from the front
        if (normalizedUrl.contains("://"))
            normalizedUrl = normalizedUrl.split(":\\/\\/")[1];

        // Room here to do more pre-processing

        return normalizedUrl;
    }

    /**
     * Takes apart the url into the pieces that make at least some sense
     * This doesn't guarantee that each token is a potentially valid keyword, however
     * because that would require actually iterating over them again, which might be
     * seen as a waste.
     * @param url - Url to be tokenized
     * @return tokens - A String array of all the tokens
     */
    private static String[] tokenizeURL(String url)
    {
        // I assume that we're going to use the whole URL to find tokens in
        // If you want to just look in the GET parameters, or you want to ignore the domain
        // or you want to use the domain as a token itself, that would have to be
        // processed above the next line, and only the remaining parts split
        String[] tokens = url.split("\\b|_");

        // One could alternatively use a more complex regex to remove more invalid matches
        // but this is subject to your (?:in)?ability to actually write the regex you want

        // These next two get rid of tokens that are too short, also.

        // Destroys anything that's not alphanumeric and things that are
        // alphanumeric but only 1 character long
        //String[] tokens = url.split("(?:[\\W_]+\\w)*[\\W_]+");

        // Destroys anything that's not alphanumeric and things that are
        // alphanumeric but only 1 or 2 characters long
        //String[] tokens = url.split("(?:[\\W_]+\\w{1,2})*[\\W_]+");

        return tokens;
    }

    // How this would be used
    public static void main(String[] args)
    {
        List<String> soQuestionUrlClassifications = getCategoriesFromUrl("http://stackoverflow.com/questions/10046178/pattern-matching-for-url-classification");
        List<String> googleQueryURLClassifications = getCategoriesFromUrl("https://www.google.com/search?sugexp=chrome,mod=18&sourceid=chrome&ie=UTF-8&q=spring+is+a+new+service+instance+created#hl=en&sugexp=ciatsh&gs_nf=1&gs_mss=spring%20is%20a%20new%20bean%20instance%20created&tok=lnAt2g0iy8CWkY65Te75sg&pq=spring%20is%20a%20new%20bean%20instance%20created&cp=6&gs_id=1l&xhr=t&q=urlencode&pf=p&safe=off&sclient=psy-ab&oq=url+en&gs_l=&pbx=1&bav=on.2,or.r_gc.r_pw.r_cp.r_qf.,cf.osb&fp=2176d1af1be1f17d&biw=1680&bih=965");
        System.out.println(soQuestionUrlClassifications);
        System.out.println(googleQueryURLClassifications);
    }
}
http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/da2d80c7/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/LinkExpander.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/LinkExpander.java b/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/LinkExpander.java
new file mode 100644
index 0000000..fe0e898
--- /dev/null
+++ b/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/LinkExpander.java
@@ -0,0 +1,251 @@
+package org.apache.streams.tika;
+
+import org.apache.streams.urls.LinkUnwinder;
+import org.apache.streams.util.DateUtil;
+import org.apache.streams.tika.BoilerPipeArticle;
+import org.apache.streams.tika.LanguageDetected;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.language.LanguageIdentifier;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.SAXException;
+
+import de.l3s.boilerpipe.document.TextBlock;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.net.URL;
+import java.net.URLConnection;
+import java.text.ParseException;
+import java.util.*;
+
+
+/**
+ * Helpful resources for this class:
+ *
+ * // TODO: This needs to be rethought.
+ *
+ * URL:
+ * Tika UI: http://www.apache.org/dyn/closer.cgi/tika/tika-app-1.4.jar
+ * Tika: http://tika.apache.org/
+ * Dublin Core: http://dublincore.org/documents/dces/
+ */
+
+public class LinkExpander extends LinkUnwinder
+{
+ private final static Logger LOGGER = LoggerFactory.getLogger(LinkExpander.class);
+
+ private static final AutoDetectParser AUTO_DETECT_PARSER = new AutoDetectParser();
+
+ private final Map<String, String> metaData = new HashMap<String, String>();
+
+ private final Set<String> keywords = new HashSet<String>();
+
+ private BoilerPipeArticle article = new BoilerPipeArticle();
+
+ // sblackmon: I put this here so I wouldn't get NullPointerExceptions when serializing results
+ public TextBlock getContentTextBlock() {
+ for(TextBlock textBlock : article.getTextBlocks())
+ if(textBlock.isContent())
+ return textBlock;
+ return null;
+ }
+
+ private static final Collection<String> AUTHOR_SEARCH = new ArrayList<String>() {{
+ add("og:author");
+ add("dc:author");
+ add("author");
+ }};
+
+ private static final Collection<String> DESCRIPTION_SEARCH = new ArrayList<String>() {{
+ add("og:description");
+ add("dc:description");
+ add("description");
+ }};
+
+ private static final Collection<String> MEDIUM_SEARCH = new ArrayList<String>() {{
+ add("og:medium");
+ add("dc:medium");
+ add("medium");
+ }};
+
+ private static final Collection<String> IMAGE_SEARCH = new ArrayList<String>() {{
+ add("og:image");
+ add("twitter:image");
+ add("image");
+ }};
+
+ private static final Collection<String> KEYWORDS_SEARCH = new ArrayList<String>() {{
+ add("keywords");
+ add("news_keywords");
+ }};
+
+ private static final Collection<String> PUB_DATE_SEARCH = new ArrayList<String>() {{
+ add("pubdate");
+ add("os:pubdate");
+ add("dc:pubdate");
+ }};
+
+ private static final Collection<String> MODIFIED_DATE_SEARCH = new ArrayList<String>() {{
+ add("lastmod");
+ add("last-modified");
+ }};
+
+ private static final Collection<String> LOCALE_SEARCH = new ArrayList<String>() {{
+ add("locale");
+ add("os:locale");
+ add("dc:local");
+ }};
+
+ // Social Searchers
+ private static final Collection<String> FACEBOOK_PAGE_SEARCH = new ArrayList<String>() {{
+ add("fb:page_id");
+ }};
+
+ private static final Collection<String> FACEBOOK_APP_SEARCH = new ArrayList<String>() {{
+ add("fb:app_id");
+ }};
+
+ private static final Collection<String> TWITTER_SITE_SEARCH = new ArrayList<String>() {{
+ add("twitter:site:id");
+ add("twitter:site");
+ }};
+
+ private static final Collection<String> TWITTER_CREATOR_SEARCH = new ArrayList<String>() {{
+ add("twitter:creator:id");
+ add("twitter:creator");
+ }};
+
+
+ public LinkExpander(String url) {
+ super(url);
+ }
+
+ public void run() {
+ super.run();
+ expandLink();
+ }
+
+
+ private void expandLink()
+ {
+ InputStream is = null;
+
+ try
+ {
+ URL url = new URL(this.getFinalURL());
+ URLConnection con = url.openConnection();
+ con.setConnectTimeout(10000);
+ is = con.getInputStream();
+
+ parseMainContent(is);
+ parsePlainText(is);
+ detectLanguage(article.getPlainText());
+
+ }
+ // Handle all Exceptions by just reporting that the site status was an error.
+ catch (IOException e) {
+ article.setSiteStatus(BoilerPipeArticle.SiteStatus.ERROR);
+ }
+ catch (TikaException e) {
+ article.setSiteStatus(BoilerPipeArticle.SiteStatus.ERROR);
+ }
+ catch (SAXException e) {
+ article.setSiteStatus(BoilerPipeArticle.SiteStatus.ERROR);
+ }
+ catch (Exception e) {
+ article.setSiteStatus(BoilerPipeArticle.SiteStatus.ERROR);
+ }
+ finally {
+ if (!(is == null)) {
+ try {
+ is.close();
+ }
+ catch(IOException e) {
+ LOGGER.warn("Problem closing the input stream: {}", e.getMessage());
+ }
+ }
+ }
+ }
+
+ private void parseMainContent(InputStream is) throws IOException, SAXException, TikaException, ParseException
+ {
+ Metadata rawMetaData = new Metadata();
+ StringWriter stringWriter = new StringWriter();
+
+ BoilerpipeContentHandler boilerpipeContentHandler = new BoilerpipeContentHandler(stringWriter);
+
+ AUTO_DETECT_PARSER.parse(is,
+ boilerpipeContentHandler,
+ rawMetaData);
+
+ article.setTextBlocks(boilerpipeContentHandler.getTextDocument().getTextBlocks());
+ article.setBody(boilerpipeContentHandler.getTextDocument().getContent());
+ article.setTitle(boilerpipeContentHandler.getTextDocument().getTitle());
+
+ // this map is for ourselves so we convert it to lower-case to make it easier to search.
+ // the meta data that is going to be returned will be unmodified meta data.
+ for(String name : rawMetaData.names())
+ if(rawMetaData.get(name) != null) {
+ this.metaData.put(name.toLowerCase(), rawMetaData.get(name));
+ article.setAdditionalProperty(name.toLowerCase(), rawMetaData.get(name));
+ }
+
+ article.setAuthor(metaDataSearcher(LinkExpander.AUTHOR_SEARCH));
+ article.setDescription(metaDataSearcher(LinkExpander.DESCRIPTION_SEARCH));
+ article.setMedium(metaDataSearcher(LinkExpander.MEDIUM_SEARCH));
+ article.setImageURL(metaDataSearcher(LinkExpander.IMAGE_SEARCH));
+ article.setLocale(metaDataSearcher(LinkExpander.LOCALE_SEARCH));
+
+ article.setFacebookApp(metaDataSearcher(LinkExpander.FACEBOOK_APP_SEARCH));
+ article.setFacebookPage(metaDataSearcher(LinkExpander.FACEBOOK_PAGE_SEARCH));
+
+ article.setTwitterCreator(metaDataSearcher(LinkExpander.TWITTER_CREATOR_SEARCH));
+ article.setTwitterSite(metaDataSearcher(LinkExpander.TWITTER_SITE_SEARCH));
+
+ mergeSet(LinkExpander.KEYWORDS_SEARCH, this.keywords);
+
+ article.setPublishedDate(DateUtil.determineDate(metaDataSearcher(LinkExpander.PUB_DATE_SEARCH)));
+ article.setLastModifiedDate(DateUtil.determineDate(metaDataSearcher(LinkExpander.MODIFIED_DATE_SEARCH)));
+
+ if(article.getBody().length() > 50)
+ article.setSiteStatus(BoilerPipeArticle.SiteStatus.SUCCESS);
+ }
+
+ private void parsePlainText(InputStream is) throws Exception {
+ BodyContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ new HtmlParser().parse(is, handler, metadata, new ParseContext());
+ article.setPlainText(handler.toString());
+ }
+
+ private void detectLanguage(String plainText) throws Exception {
+ LanguageDetected languageDetected = new LanguageDetected();
+ LanguageIdentifier languageIdentifier = new LanguageIdentifier(plainText);
+ languageDetected.setLanguageCode(languageIdentifier.getLanguage());
+ languageDetected.setIsLanguageReasonablyCertain(languageIdentifier.isReasonablyCertain());
+ article.setLanguageDetected(languageDetected);
+ }
+
+ private String metaDataSearcher(Collection<String> itemsToSearch) {
+ for(String s : itemsToSearch)
+ if(this.metaData.containsKey(s))
+ return this.metaData.get(s);
+
+ // the meta searcher returned nothing.
+ return null;
+ }
+
+ private void mergeSet(Collection<String> itemsToSearch, Set<String> set) {
+ for(String s : itemsToSearch)
+ Collections.addAll(set, s == null || s.equals("") ? new String[]{} : s.split(","));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/da2d80c7/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/TikaProcessor.java
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/TikaProcessor.java b/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/TikaProcessor.java
new file mode 100644
index 0000000..b2f337d
--- /dev/null
+++ b/streams-contrib/streams-processor-tika/src/main/java/org/apache/streams/tika/TikaProcessor.java
@@ -0,0 +1,104 @@
+package org.apache.streams.tika;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.datatype.jsonorg.JsonOrgModule;
+import com.google.common.collect.Lists;
+import org.apache.commons.lang.NotImplementedException;
+import org.apache.streams.core.StreamsDatum;
+import org.apache.streams.core.StreamsProcessor;
+import org.apache.streams.jackson.StreamsJacksonMapper;
+import org.apache.streams.pojo.json.Activity;
+import org.json.JSONObject;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.List;
+
+/**
+ * References:
+ * Some helpful references to help
+ * Purpose URL
+ * ------------- ----------------------------------------------------------------
+ * [Status Codes] http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
+ * [Test Cases] http://greenbytes.de/tech/tc/httpredirects/
+ * [t.co behavior] https://dev.twitter.com/docs/tco-redirection-behavior
+ */
+
+public class TikaProcessor implements StreamsProcessor
+{
+ private final static String STREAMS_ID = "LinkExpanderProcessor";
+
+ private final static Logger LOGGER = LoggerFactory.getLogger(TikaProcessor.class);
+
+ private ObjectMapper mapper;
+
+ @Override
+ public List<StreamsDatum> process(StreamsDatum entry) {
+
+ List<StreamsDatum> result = Lists.newArrayList();
+
+ LOGGER.debug("{} processing {}", STREAMS_ID, entry.getDocument().getClass());
+
+ // get list of shared urls
+ if( entry.getDocument() instanceof Activity) {
+
+ Activity input = (Activity) entry.getDocument();
+
+ List<String> outputLinks = input.getLinks();
+ // for each
+ for( String link : outputLinks ) {
+ if( link instanceof String ) {
+ // expand
+ try {
+ StreamsDatum outputDatum = expandLink((String) link, entry);
+ result.add(outputDatum);
+ } catch (Exception e) {
+ //drop unexpandable links
+ LOGGER.debug("Failed to expand link : {}", link);
+ LOGGER.debug("Excpetion expanding link : {}", e);
+ }
+ }
+ else {
+ LOGGER.warn("Expected Links to be of type java.lang.String, but received {}", link.getClass().toString());
+ }
+ }
+
+
+ }
+ else if(entry.getDocument() instanceof String) {
+ StreamsDatum outputDatum = expandLink((String) entry.getDocument(), entry);
+ result.add(outputDatum);
+ }
+ else throw new NotImplementedException();
+
+ return result;
+ }
+
+ private StreamsDatum expandLink(String link, StreamsDatum input) {
+
+ LinkExpander expander = new LinkExpander((String)link);
+ expander.run();
+ StreamsDatum datum = null;
+ if(input.getId() == null)
+ datum = new StreamsDatum(this.mapper.convertValue(expander, JSONObject.class).toString(), expander.getFinalURL());
+ else
+ datum = new StreamsDatum(this.mapper.convertValue(expander, JSONObject.class).toString(), input.getId());
+ datum.setSequenceid(input.getSequenceid());
+ datum.setMetadata(input.getMetadata());
+ datum.setTimestamp(input.getTimestamp());
+ return datum;
+
+ }
+
+ @Override
+ public void prepare(Object o) {
+ this.mapper = StreamsJacksonMapper.getInstance();
+ this.mapper.registerModule(new JsonOrgModule());
+ }
+
+ @Override
+ public void cleanUp() {
+
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/da2d80c7/streams-contrib/streams-processor-tika/src/main/jsonschema/org/apache/streams/tika/BoilerPipeArticle.json
----------------------------------------------------------------------
diff --git a/streams-contrib/streams-processor-tika/src/main/jsonschema/org/apache/streams/tika/BoilerPipeArticle.json b/streams-contrib/streams-processor-tika/src/main/jsonschema/org/apache/streams/tika/BoilerPipeArticle.json
new file mode 100644
index 0000000..a23b13e
--- /dev/null
+++ b/streams-contrib/streams-processor-tika/src/main/jsonschema/org/apache/streams/tika/BoilerPipeArticle.json
@@ -0,0 +1,80 @@
+{
+ "type": "object",
+ "$schema": "http://json-schema.org/draft-03/schema",
+ "id": "#",
+ "properties": {
+ "siteStatus" : {
+ "type" : "string",
+ "enum" : ["SUCCESS", "ERROR"]
+ },
+ "title": {
+ "type": "string"
+ },
+ "description": {
+ "type": "string"
+ },
+ "body": {
+ "type": "string"
+ },
+ "plainText": {
+ "type": "string"
+ },
+ "medium": {
+ "type": "string"
+ },
+ "author": {
+ "type": "string"
+ },
+ "locale": {
+ "type": "string"
+ },
+ "publishedDate": {
+ "type": "string",
+ "format" : "date-time"
+ },
+ "lastModifiedDate": {
+ "type": "string",
+ "format" : "date-time"
+ },
+ "imageURL": {
+ "type": "string"
+ },
+ "languageDetected": {
+ "type": "object",
+ "properties": {
+ "languageCode": {
+ "type": "string"
+ },
+ "isLanguageReasonablyCertain": {
+ "type": "boolean"
+ }
+ }
+ },
+ "textBlocks": {
+ "type": "array",
+ "items": {
+ "javaType": "de.l3s.boilerpipe.document.TextBlock",
+ "type": "object"
+ }
+ },
+ "keywords": {
+ "type": "array",
+ "uniqueItems": true,
+ "items": {
+ "type": "string"
+ }
+ },
+ "twitterCreator": {
+ "type": "string"
+ },
+ "twitterSite": {
+ "type": "string"
+ },
+ "facebookPage": {
+ "type": "string"
+ },
+ "facebookApp": {
+ "type": "string"
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-streams/blob/da2d80c7/streams-util/src/main/java/org/apache/streams/util/DateUtil.java
----------------------------------------------------------------------
diff --git a/streams-util/src/main/java/org/apache/streams/util/DateUtil.java b/streams-util/src/main/java/org/apache/streams/util/DateUtil.java
new file mode 100644
index 0000000..e3201bc
--- /dev/null
+++ b/streams-util/src/main/java/org/apache/streams/util/DateUtil.java
@@ -0,0 +1,174 @@
+package org.apache.streams.util;
+
+import org.joda.time.DateTime;
+import org.joda.time.DateTimeZone;
+import org.joda.time.format.DateTimeFormatter;
+import org.joda.time.format.ISODateTimeFormat;
+
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.*;
+
+
+/*
+ *
+ * If you can think of a better way, feel free to implement. This was a great class that I found that
+ * solves the majority of the issue I was dealing with.
+ *
+ * smashew 11=13=2012
+ *
+ * Site:
+ * http://stackoverflow.com/questions/3389348/parse-any-date-in-java
+ */
+
+public class DateUtil
+{
+
+ private static final String REGEX_ONLY_NUMBERS = "[0-9]+";
+
+ private static final Map<String, String> DATE_FORMAT_REGEXPS = new HashMap<String, String>()
+ {
+ private static final long serialVersionUID = 1L;
+ {
+ put("^\\d{8}$", "yyyyMMdd");
+ put("^\\d{1,2}-\\d{1,2}-\\d{4}$", "dd-MM-yyyy");
+ put("^\\d{4}-\\d{1,2}-\\d{1,2}$", "yyyy-MM-dd");
+ put("^\\d{1,2}/\\d{1,2}/\\d{4}$", "MM/dd/yyyy");
+ put("^\\d{4}/\\d{1,2}/\\d{1,2}$", "yyyy/MM/dd");
+ put("^\\d{1,2}\\s[a-z]{3}\\s\\d{4}$", "dd MMM yyyy");
+ put("^\\d{1,2}\\s[a-z]{4,}\\s\\d{4}$", "dd MMMM yyyy");
+ put("^\\d{12}$", "yyyyMMddHHmm");
+ put("^\\d{8}\\s\\d{4}$", "yyyyMMdd HHmm");
+ put("^\\d{1,2}-\\d{1,2}-\\d{4}\\s\\d{1,2}:\\d{2}$", "dd-MM-yyyy HH:mm");
+ put("^\\d{4}-\\d{1,2}-\\d{1,2}\\s\\d{1,2}:\\d{2}$", "yyyy-MM-dd HH:mm");
+ put("^\\d{1,2}/\\d{1,2}/\\d{4}\\s\\d{1,2}:\\d{2}$", "MM/dd/yyyy HH:mm");
+ put("^\\d{4}/\\d{1,2}/\\d{1,2}\\s\\d{1,2}:\\d{2}$", "yyyy/MM/dd HH:mm");
+ put("^\\d{1,2}\\s[a-z]{3}\\s\\d{4}\\s\\d{1,2}:\\d{2}$", "dd MMM yyyy HH:mm");
+ put("^\\d{1,2}\\s[a-z]{4,}\\s\\d{4}\\s\\d{1,2}:\\d{2}$", "dd MMMM yyyy HH:mm");
+ put("^\\d{14}$", "yyyyMMddHHmmss");
+ put("^\\d{8}\\s\\d{6}$", "yyyyMMdd HHmmss");
+ put("^\\d{1,2}-\\d{1,2}-\\d{4}\\s\\d{1,2}:\\d{2}:\\d{2}$", "dd-MM-yyyy HH:mm:ss");
+ put("^\\d{4}-\\d{1,2}-\\d{1,2}\\s\\d{1,2}:\\d{2}:\\d{2}$", "yyyy-MM-dd HH:mm:ss");
+ put("^\\d{1,2}/\\d{1,2}/\\d{4}\\s\\d{1,2}:\\d{2}:\\d{2}$", "MM/dd/yyyy HH:mm:ss");
+ put("^\\d{4}/\\d{1,2}/\\d{1,2}\\s\\d{1,2}:\\d{2}:\\d{2}$", "yyyy/MM/dd HH:mm:ss");
+ put("^\\d{1,2}\\s[a-z]{3}\\s\\d{4}\\s\\d{1,2}:\\d{2}:\\d{2}$", "dd MMM yyyy HH:mm:ss");
+ put("^\\d{1,2}\\s[a-z]{4,}\\s\\d{4}\\s\\d{1,2}:\\d{2}:\\d{2}$", "dd MMMM yyyy HH:mm:ss");
+ }
+ };
+
+ /**
+ * Determine SimpleDateFormat pattern matching with the given date string. Returns null if format is unknown. You
+ * can simply extend DateUtil with more formats if needed.
+ *
+ * @param dateString
+ * The date string to determine the SimpleDateFormat pattern for.
+ * @return The matching SimpleDateFormat pattern, or null if format is unknown.
+ * @see java.text.SimpleDateFormat
+ */
+ public static String determineDateFormat(String dateString)
+ throws ParseException
+ {
+ for (String regexp : DATE_FORMAT_REGEXPS.keySet())
+ if (dateString.toLowerCase().matches(regexp))
+ return DATE_FORMAT_REGEXPS.get(regexp);
+
+ throw new ParseException("unable to parse date",0);
+ }
+
+ public static DateTime determineDate(String dateString)
+ throws ParseException
+ {
+ // Trim the string just in case it is dirty.
+ dateString = dateString.trim();
+
+ // check to see if it looks like it is millis. If so, parse as millis and return.
+ if(dateString.matches(REGEX_ONLY_NUMBERS))
+ return new DateTime(new Date(Long.parseLong(dateString)));
+
+ try
+ {
+ // try to parse the string into a java.date object, if possible.
+ SimpleDateFormat dateFormat = new SimpleDateFormat(determineDateFormat(dateString));
+ dateFormat.setLenient(false);
+ return new DateTime(dateFormat.parse(dateString));
+ }
+ catch(Exception e)
+ {
+
+ }
+
+ return new DateTime(DateTime.parse(dateString));
+ }
+
+ public static DateTime determineDateTime(String dateString)
+ throws ParseException
+ {
+ return new DateTime(determineDate(dateString));
+ }
+
+ public static DateTime determineDateTime(String dateString, DateTimeZone theTimeZone)
+ throws ParseException
+ {
+ DateTime beforeTimeZone = determineDateTime(dateString);
+ return new DateTime(beforeTimeZone.getYear(),beforeTimeZone.getMonthOfYear(), beforeTimeZone.getDayOfMonth(), beforeTimeZone.getHourOfDay(), beforeTimeZone.getMinuteOfHour(), beforeTimeZone.getSecondOfMinute(), beforeTimeZone.getMillisOfSecond(), theTimeZone);
+ }
+
+
+ public static String getAliasForDate(String date, String prefix) throws ParseException {
+ return getAliasesForDateRange(date, null, prefix).iterator().next();
+ }
+
+ public static String getAliasForDate(DateTime date, String prefix) throws ParseException {
+ return getAliasesForDateRange(date, null, prefix).iterator().next();
+ }
+
+ public static Set<String> getAliasesForDateRange(String starDate, String endDate, String prefix)
+ throws ParseException
+ {
+ DateTime start = null;
+ DateTime end = null;
+ DateTimeFormatter df = ISODateTimeFormat.dateTimeNoMillis();
+ try {
+ start = df.parseDateTime(starDate);
+ } catch (Exception e) {
+ //do nothing. try to parse with other parsers
+ }
+ if(start == null) {
+ start = determineDateTime(starDate);
+ }
+ if(endDate != null) {
+ try {
+ end = df.parseDateTime(endDate);
+ } catch (Exception e) {
+ //do nothing. try to parse with other parsers
+ }
+ if( end == null)
+ end = determineDateTime(endDate);
+ }
+ return getAliasesForDateRange(start, end, prefix);
+ }
+
+ public static Set<String> getAliasesForDateRange(DateTime startDate, DateTime endDate, String prefix) {
+ Set<String> aliases = new HashSet<String>();
+ aliases.add(prefix+"_"+getDateAbbreviation(startDate.getYear(), startDate.getMonthOfYear()));
+ if(endDate == null) {
+ return aliases;
+ }
+ while(endDate.isAfter(startDate)) {
+ aliases.add(prefix+"_"+getDateAbbreviation(endDate.getYear(), endDate.getMonthOfYear()));
+ endDate = endDate.minusMonths(1);
+ }
+ return aliases;
+ }
+
+ private static String getDateAbbreviation(int year, int month) {
+ if(month > 9) {
+ return Integer.toString(year)+Integer.toString(month);
+ }
+ else {
+ return Integer.toString(year)+"0"+Integer.toString(month);
+ }
+ }
+
+
+}