You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/16 19:23:06 UTC
svn commit: r1725014 [2/28] - in /tika/branches/2.x:
tika-parser-bundles/tika-multimedia-bundle/ tika-parser-modules/
tika-parser-modules/tika-advanced-module/
tika-parser-modules/tika-advanced-parser-module/
tika-parser-modules/tika-advanced-parser-mo...
Added: tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/main/java/org/apache/tika/parser/ner/corenlp/CoreNLPNERecogniser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner.corenlp;
+
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.parser.ner.NERecogniser;
+import org.json.JSONObject;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.lang.reflect.Field;
+import java.lang.reflect.Method;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+
+/**
+ * This class offers an implementation of {@link NERecogniser} based on
+ * CRF classifiers from Stanford CoreNLP. This NER requires additional setup,
+ * due to runtime binding to Stanford CoreNLP.
+ * See <a href="http://wiki.apache.org/tika/TikaAndNER#CoreNLP">
+ * Tika NER Wiki</a> for configuring this recogniser.
+ * @see NERecogniser
+ *
+ */
+public class CoreNLPNERecogniser implements NERecogniser {
+
+    private static final Logger LOG = LoggerFactory.getLogger(CoreNLPNERecogniser.class);
+
+    //default model paths
+    public static final String NER_3CLASS_MODEL = "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz";
+    public static final String NER_4CLASS_MODEL = "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz";
+    public static final String NER_7CLASS_MODEL = "edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz";
+    /**
+     * default Model path
+     */
+    public static final String DEFAULT_MODEL_PATH = NER_7CLASS_MODEL;
+    /**
+     * Name of the system property read by the no-argument constructor to
+     * locate the model; {@link #DEFAULT_MODEL_PATH} is used when it is unset.
+     */
+    public static final String MODEL_PROP_NAME = "ner.corenlp.model";
+
+    /**
+     * Entity types returned by {@link #getEntityTypes()}.
+     */
+    public static final Set<String> ENTITY_TYPES = new HashSet<String>(){{
+        add(PERSON);
+        add(TIME);
+        add(LOCATION);
+        add(ORGANIZATION);
+        add(MONEY);
+        add(PERCENT);
+        add(DATE);
+    }};
+    private static final String CLASSIFIER_CLASS_NAME = "edu.stanford.nlp.ie.crf.CRFClassifier";
+
+    // All CoreNLP access goes through reflection so that this class loads
+    // even when the CoreNLP jars are absent from the classpath.
+    private boolean available = false;
+    private Field firstField;       // Triple.first  : read as the entity type
+    private Field secondField;      // Triple.second : read as the start offset
+    private Field thirdField;       // Triple.third  : read as the end offset
+    private Object classifierInstance;
+    private Method classifyMethod;
+
+    /**
+     * Creates a NERecogniser using the model path given by the system property
+     * {@value #MODEL_PROP_NAME}, falling back to {@link #DEFAULT_MODEL_PATH}.
+     */
+    public CoreNLPNERecogniser(){
+        this(System.getProperty(MODEL_PROP_NAME, DEFAULT_MODEL_PATH));
+    }
+
+    /**
+     * Creates a NERecogniser by loading model from given path.
+     * Any failure is logged and leaves this instance unavailable
+     * (see {@link #isAvailable()}); the constructor itself does not throw.
+     * @param modelPath path to NER model file
+     */
+    public CoreNLPNERecogniser(String modelPath) {
+        try {
+            Properties props = new Properties();
+            Class<?> classifierClass = Class.forName(CLASSIFIER_CLASS_NAME);
+            Method loadMethod = classifierClass.getMethod("getClassifier", String.class, Properties.class);
+            classifierInstance = loadMethod.invoke(classifierClass, modelPath, props);
+            classifyMethod = classifierClass.getMethod("classifyToCharacterOffsets", String.class);
+
+            //these fields are for accessing result
+            Class<?> tripleClass = Class.forName("edu.stanford.nlp.util.Triple");
+            this.firstField = tripleClass.getField("first");
+            this.secondField = tripleClass.getField("second");
+            this.thirdField = tripleClass.getField("third");
+            this.available = true;
+        } catch (Exception e) {
+            // A failure inside the reflective call arrives wrapped in an
+            // InvocationTargetException whose own message is null, so report
+            // the underlying cause instead of printing "null while trying ...".
+            Throwable cause = e.getCause() != null ? e.getCause() : e;
+            LOG.warn("{} while trying to load the model from {}", cause.toString(), modelPath);
+            LOG.debug("Failed to initialise the CoreNLP classifier", e);
+        }
+        LOG.info("Available for service ? {}", available);
+    }
+
+    /**
+     *
+     * @return {@code true} if model was available, valid and was able to initialise the classifier.
+     * returns {@code false} when this recogniser is not available for service.
+     */
+    @Override
+    public boolean isAvailable() {
+        return available;
+    }
+
+    /**
+     * Gets set of entity types recognised by this recogniser
+     * @return set of entity classes/types
+     */
+    @Override
+    public Set<String> getEntityTypes() {
+        return ENTITY_TYPES;
+    }
+
+    /**
+     * recognises names of entities in the text
+     * @param text text which possibly contains names
+     * @return map of entity type -> set of names; empty when this recogniser
+     * is not available, when {@code text} is {@code null}, or when
+     * classification fails
+     */
+    @Override
+    public Map<String, Set<String>> recognise(String text) {
+        Map<String, Set<String>> names = new HashMap<>();
+        if (!available || text == null) {
+            // Nothing to classify with (or nothing to classify): answer
+            // explicitly rather than by catching a NullPointerException below.
+            return names;
+        }
+        try {
+            List<?> entries = (List<?>) classifyMethod.invoke(classifierInstance, text);
+            for (Object entry : entries) {
+                String entityType = (String) firstField.get(entry);
+                Set<String> namesOfType = names.get(entityType);
+                if (namesOfType == null) {
+                    namesOfType = new HashSet<>();
+                    names.put(entityType, namesOfType);
+                }
+                int start = (Integer) secondField.get(entry);
+                int end = (Integer) thirdField.get(entry);
+                String name = text.substring(start, end);
+                //Clean repeating spaces, replace line breaks and tabs with single space
+                name = name.trim().replaceAll("(\\s\\s+)|\n|\t", " ");
+                if (!name.isEmpty()) {
+                    namesOfType.add(name);
+                }
+            }
+        } catch (Exception e) {
+            // Best effort: a failed classification yields whatever was collected so far.
+            LOG.debug(e.getMessage(), e);
+        }
+        return names;
+    }
+
+    /**
+     * Command line entry point: reads the text file given as the single
+     * argument, runs the recogniser on it and prints the result as JSON.
+     * @param args exactly one element, the path to a text file
+     * @throws IOException when the file cannot be read
+     */
+    public static void main(String[] args) throws IOException {
+        if (args.length != 1) {
+            System.err.println("Error: Invalid Args");
+            System.err.println("This tool finds names inside text");
+            System.err.println("Usage: <path/to/text/file>");
+            return;
+        }
+
+        try (FileInputStream stream = new FileInputStream(args[0])) {
+            String text = IOUtils.toString(stream);
+            CoreNLPNERecogniser ner = new CoreNLPNERecogniser();
+            Map<String, Set<String>> names = ner.recognise(text);
+            JSONObject jNames = new JSONObject(names);
+            System.out.println(jNames.toString(2));
+        }
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNERecogniser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNERecogniser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNERecogniser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNERecogniser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.ner.opennlp;
+
+import org.apache.tika.parser.ner.NERecogniser;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+
+/**
+ *
+ * This implementation of {@link NERecogniser} chains an array of
+ * {@link OpenNLPNameFinder}s for which NER models are
+ * available in classpath.
+ *
+ * The following models are scanned during initialization via the class loader:
+ *
+ * <table>
+ * <tr>
+ * <th>Entity Type</th><th>Path</th>
+ * </tr>
+ * <tr>
+ * <td>{@value PERSON}</td><td> {@value PERSON_FILE}</td>
+ * </tr>
+ * <tr>
+ * <td>{@value LOCATION}</td><td>{@value LOCATION_FILE}</td>
+ * </tr>
+ * <tr>
+ * <td>{@value ORGANIZATION}</td><td>{@value ORGANIZATION_FILE}</td>
+ * </tr>
+ * <tr>
+ * <td>{@value TIME}</td><td>{@value TIME_FILE}</td>
+ * </tr>
+ * <tr>
+ * <td>{@value DATE}</td><td>{@value DATE_FILE}</td>
+ * </tr>
+ * <tr>
+ * <td>{@value PERCENT}</td><td>{@value PERCENT_FILE}</td>
+ * </tr>
+ * <tr>
+ * <td>{@value MONEY}</td><td>{@value MONEY_FILE}</td>
+ * </tr>
+ * </table>
+ *
+ * @see org.apache.tika.parser.ner.NamedEntityParser#DEFAULT_NER_IMPL
+ */
+public class OpenNLPNERecogniser implements NERecogniser {
+
+    /** Classpath directory of the default models: this class's package name with dots turned into slashes. */
+    public static final String MODELS_DIR = OpenNLPNERecogniser.class
+            .getPackage().getName().replace(".", "/");
+
+    // File names of the default model of each entity type
+    public static final String PERSON_FILE = "ner-person.bin";
+    public static final String LOCATION_FILE = "ner-location.bin";
+    public static final String ORGANIZATION_FILE = "ner-organization.bin";
+    public static final String TIME_FILE = "ner-time.bin";
+    public static final String DATE_FILE = "ner-date.bin";
+    public static final String PERCENT_FILE = "ner-percentage.bin";
+    public static final String MONEY_FILE = "ner-money.bin";
+
+    // Classpath locations of the default (English) models, one per entity type
+    public static final String NER_PERSON_MODEL = MODELS_DIR + "/" + PERSON_FILE;
+    public static final String NER_LOCATION_MODEL = MODELS_DIR + "/" + LOCATION_FILE;
+    public static final String NER_ORGANIZATION_MODEL = MODELS_DIR + "/" + ORGANIZATION_FILE;
+    public static final String NER_TIME_MODEL = MODELS_DIR + "/" + TIME_FILE;
+    public static final String NER_DATE_MODEL = MODELS_DIR + "/" + DATE_FILE;
+    public static final String NER_PERCENT_MODEL = MODELS_DIR + "/" + PERCENT_FILE;
+    public static final String NER_MONEY_MODEL = MODELS_DIR + "/" + MONEY_FILE;
+
+    /** Entity type -> model path mapping used by the no-argument constructor. */
+    public static final Map<String, String> DEFAULT_MODELS =
+            new HashMap<String, String>(){{
+                put(PERSON, NER_PERSON_MODEL);
+                put(LOCATION, NER_LOCATION_MODEL);
+                put(ORGANIZATION, NER_ORGANIZATION_MODEL);
+                put(TIME, NER_TIME_MODEL);
+                put(DATE, NER_DATE_MODEL);
+                put(PERCENT, NER_PERCENT_MODEL);
+                put(MONEY, NER_MONEY_MODEL);
+            }};
+
+    private Set<String> entityTypes;
+    private List<OpenNLPNameFinder> nameFinders;
+    private boolean available;
+
+    /**
+     * Builds the chain from {@link #DEFAULT_MODELS}.
+     */
+    public OpenNLPNERecogniser(){
+        this(DEFAULT_MODELS);
+    }
+
+    /**
+     * Builds a chain holding one name finder per model that could be loaded;
+     * models whose finder reports itself unavailable are left out.
+     * @param models map of entityType -> model path
+     * NOTE: the model path should be known to class loader.
+     */
+    public OpenNLPNERecogniser(Map<String, String> models){
+        List<OpenNLPNameFinder> usableFinders = new ArrayList<>();
+        Set<String> usableTypes = new HashSet<>();
+        for (Map.Entry<String, String> model : models.entrySet()) {
+            String type = model.getKey();
+            OpenNLPNameFinder candidate = new OpenNLPNameFinder(type, model.getValue());
+            if (!candidate.isAvailable()) {
+                continue;
+            }
+            usableFinders.add(candidate);
+            usableTypes.add(type);
+        }
+        this.nameFinders = usableFinders;
+        this.entityTypes = Collections.unmodifiableSet(usableTypes);
+        //in service as soon as one finder could be created
+        this.available = !usableFinders.isEmpty();
+    }
+
+    /**
+     * @return {@code true} when at least one name finder of the chain is available
+     */
+    @Override
+    public boolean isAvailable() {
+        return available;
+    }
+
+    /**
+     * @return unmodifiable set of the entity types whose name finder is available
+     */
+    @Override
+    public Set<String> getEntityTypes() {
+        return entityTypes;
+    }
+
+    /**
+     * Tokenizes the text once and passes the same tokens to every name finder
+     * of the chain, merging their results.
+     * @param text text which possibly contains names
+     * @return map of entity type -> set of names
+     */
+    @Override
+    public Map<String, Set<String>> recognise(String text) {
+        Map<String, Set<String>> found = new HashMap<>();
+        String[] tokens = OpenNLPNameFinder.tokenize(text);
+        for (OpenNLPNameFinder finder : nameFinders) {
+            Map<String, Set<String>> partial = finder.findNames(tokens);
+            found.putAll(partial);
+        }
+        return found;
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNameFinder.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNameFinder.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNameFinder.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/main/java/org/apache/tika/parser/ner/opennlp/OpenNLPNameFinder.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.ner.opennlp;
+
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.util.Span;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.parser.ner.NERecogniser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * An implementation of {@link NERecogniser} that finds names in text using Open NLP Model.
+ * This implementation works with only one entity type. To chain several instances of this name finder,
+ * see {@link OpenNLPNERecogniser}
+ */
+public class OpenNLPNameFinder implements NERecogniser {
+
+ private static final Logger LOG = LoggerFactory.getLogger(OpenNLPNameFinder.class);
+ private final String nameType;
+ private final Set<String> nameTypes;
+ private NameFinderME nameFinder;
+ private boolean available;
+
+ /**
+ * Creates OpenNLP name finder
+ * @param nameType the entity type recognised by the given NER model
+ * @param nerModelPath path to ner model
+ */
+ public OpenNLPNameFinder(String nameType, String nerModelPath) {
+ this.nameTypes = Collections.singleton(nameType);
+ this.nameType = nameType;
+ InputStream nerModelStream = getClass().getClassLoader().getResourceAsStream(nerModelPath);
+ try {
+ if (nerModelStream != null){
+ TokenNameFinderModel model = new TokenNameFinderModel(nerModelStream);
+ this.nameFinder = new NameFinderME(model);
+ this.available = true;
+ } else {
+ LOG.warn("Couldn't find model from {} using class loader", nerModelPath);
+ }
+ } catch (IOException e) {
+ LOG.error(e.getMessage(), e);
+ } finally {
+ IOUtils.closeQuietly(nerModelStream);
+ }
+ LOG.info("{} NER : Available for service ? {}", nameType, available);
+ }
+
+ @Override
+ public boolean isAvailable() {
+ return available;
+ }
+
+ @Override
+ public Set<String> getEntityTypes() {
+ return nameTypes;
+ }
+
+ public static String[] tokenize(String text){
+ //NOTE: replace this with a NLP tokenizer tool
+ //clean + split
+ return text.trim().replaceAll("(\\s\\s+)", " ").split("\\s");
+ }
+
+ @Override
+ public synchronized Map<String, Set<String>> recognise(String text) {
+ String[] tokens = tokenize(text);
+ return findNames(tokens);
+ }
+
+ /**
+ * finds names from given array of tokens
+ * @param tokens the tokens array
+ * @return map of EntityType -> set of entity names
+ */
+ public Map<String, Set<String>> findNames(String[] tokens) {
+ Span[] nameSpans = nameFinder.find(tokens);
+ String[] names = Span.spansToStrings(nameSpans, tokens);
+ Map<String, Set<String>> result = new HashMap<>();
+ if (names != null && names.length > 0) {
+ result.put(nameType, new HashSet<>(Arrays.asList(names)));
+ }
+ nameFinder.clearAdaptiveData();
+ return result;
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/main/java/org/apache/tika/parser/ner/regex/RegexNERecogniser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/main/java/org/apache/tika/parser/ner/regex/RegexNERecogniser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/main/java/org/apache/tika/parser/ner/regex/RegexNERecogniser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/main/java/org/apache/tika/parser/ner/regex/RegexNERecogniser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.ner.regex;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.parser.ner.NERecogniser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * This class offers an implementation of {@link NERecogniser} based on
+ * Regular Expressions.
+ *<p>
+ * The default configuration file {@value NER_REGEX_FILE} is used when the
+ * no-argument constructor is used to instantiate this class. The regex file is
+ * loaded via {@link Class#getResourceAsStream(String)}, so the file should be
+ * placed in the same package path as of this class.
+ * </p>
+ * The format of regex configuration as follows:
+ * <pre>
+ * ENTITY_TYPE1=REGEX1
+ * ENTITY_TYPE2=REGEX2
+ * </pre>
+ *
+ * <i>For example, to extract week day from text:</i>
+ * <pre>WEEK_DAY=(?i)((sun)|(mon)|(tues)|(thurs)|(fri)|((sat)(ur)?))(day)?
+ * </pre>
+ * @since Nov. 7, 2015
+ */
+public class RegexNERecogniser implements NERecogniser {
+
+ public static final String NER_REGEX_FILE = "ner-regex.txt";
+ private static Logger LOG = LoggerFactory.getLogger(RegexNERecogniser.class);
+
+ public Set<String> entityTypes = new HashSet<>();
+ public Map<String, Pattern> patterns;
+ private boolean available = false;
+
+ private static RegexNERecogniser INSTANCE;
+
+ public RegexNERecogniser(){
+ this(RegexNERecogniser.class.getResourceAsStream(NER_REGEX_FILE));
+ }
+
+ public RegexNERecogniser(InputStream stream){
+ try {
+ patterns = new HashMap<>();
+ List<String> lines = IOUtils.readLines(stream, StandardCharsets.UTF_8);
+ IOUtils.closeQuietly(stream);
+ for (String line : lines) {
+ line = line.trim();
+ if (line.isEmpty() || line.startsWith("#")){ //empty or comment
+ continue; //skip
+ }
+
+ int delim = line.indexOf('=');
+ if (delim < 0) { //delim not found
+ //skip
+ LOG.error("Skipped : Invalid config : {} ", line);
+ continue;
+ }
+ String type = line.substring(0, delim).trim();
+ String patternStr = line.substring(delim+1, line.length()).trim();
+ patterns.put(type, Pattern.compile(patternStr));
+ entityTypes.add(type);
+ }
+ } catch (Exception e) {
+ LOG.error(e.getMessage(), e);
+ }
+ available = !entityTypes.isEmpty();
+ }
+
+ public synchronized static RegexNERecogniser getInstance() {
+ if (INSTANCE == null) {
+ INSTANCE = new RegexNERecogniser();
+ }
+ return INSTANCE;
+ }
+
+ @Override
+ public boolean isAvailable() {
+ return available;
+ }
+
+ @Override
+ public Set<String> getEntityTypes() {
+ return entityTypes;
+ }
+
+ /**
+ * finds matching sub groups in text
+ * @param text text containing interesting sub strings
+ * @param pattern pattern to find sub strings
+ * @return set of sub strings if any found, or null if none found
+ */
+ public Set<String> findMatches(String text, Pattern pattern){
+ Set<String> results = null;
+ Matcher matcher = pattern.matcher(text);
+ if (matcher.find()) {
+ results = new HashSet<>();
+ results.add(matcher.group(0));
+ while (matcher.find()) {
+ results.add(matcher.group(0));
+ }
+ }
+ return results;
+ }
+
+ @Override
+ public Map<String, Set<String>> recognise(String text) {
+ Map<String, Set<String>> result = new HashMap<>();
+ for (Map.Entry<String, Pattern> entry : patterns.entrySet()) {
+ Set<String> names = findMatches(text, entry.getValue());
+ if (names != null) {
+ result.put(entry.getKey(), names);
+ }
+ }
+ return result;
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added)
+++ tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Sat Jan 16 18:23:01 2016
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+org.apache.tika.parser.crypto.Pkcs7Parser
+#org.apache.tika.parser.ner.NamedEntityParser
Added: tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/main/resources/org/apache/tika/parser/ner/regex/ner-regex.txt
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/main/resources/org/apache/tika/parser/ner/regex/ner-regex.txt?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/main/resources/org/apache/tika/parser/ner/regex/ner-regex.txt (added)
+++ tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/main/resources/org/apache/tika/parser/ner/regex/ner-regex.txt Sat Jan 16 18:23:01 2016
@@ -0,0 +1,22 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# The pattern is as follows
+# type = regex
+# the first occurrence of '=' separates type from its regex
+
+# WEEK_DAY=(?i)((sun)|(mon)|(tues)|(thurs)|(fri)|((sat)(ur)?))(day)?
\ No newline at end of file
Added: tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/test/java/org/apache/tika/parser/crypto/Pkcs7ParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/test/java/org/apache/tika/parser/crypto/Pkcs7ParserTest.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/test/java/org/apache/tika/parser/crypto/Pkcs7ParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/test/java/org/apache/tika/parser/crypto/Pkcs7ParserTest.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.crypto;
+
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+public class Pkcs7ParserTest extends TikaTest {
+    /**
+     * Parses the detached signature test document with {@link Pkcs7Parser}.
+     * The test fails when the parser throws a NullPointerException; when a
+     * TikaException is thrown instead, its text must mention that a detached
+     * pkcs7 signature cannot be parsed.
+     */
+    // The annotation is what makes a JUnit 4 runner execute this method.
+    // NOTE(review): assumes TikaTest is not a JUnit 3 TestCase - the
+    // annotation is harmless if it is. Written fully qualified because this
+    // file does not import org.junit.Test.
+    @org.junit.Test
+    public void testDetachedSignature() throws Exception {
+        try (InputStream input = Pkcs7ParserTest.class.getResourceAsStream(
+                "/test-documents/testDetached.p7s")) {
+            ContentHandler handler = new BodyContentHandler();
+            Metadata metadata = new Metadata();
+            new Pkcs7Parser().parse(input, handler, metadata, new ParseContext());
+        } catch (NullPointerException npe) {
+            fail("should not get NPE");
+        } catch (TikaException te) {
+            assertTrue(te.toString().contains("cannot parse detached pkcs7 signature"));
+        }
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner;
+
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser;
+import org.apache.tika.parser.ner.regex.RegexNERecogniser;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.HashSet;
+
+import static org.junit.Assert.assertTrue;
+
+/**
+ *Test case for {@link NamedEntityParser}
+ */
+public class NamedEntityParserTest {
+
+    public static final String CONFIG_FILE = "tika-config.xml";
+
+    /**
+     * Parses a short text with the parser set up by {@value #CONFIG_FILE} and
+     * checks that the expected person, location, organization and date names
+     * show up in the metadata.
+     */
+    @Test
+    public void testParse() throws Exception {
+
+        //test config is added to resources directory
+        TikaConfig config = new TikaConfig(getClass().getResourceAsStream(CONFIG_FILE));
+        Tika tika = new Tika(config);
+        String text = "I am student at University of Southern California (USC)," +
+                " located in Los Angeles . USC's football team is called by name Trojans." +
+                " Mr. John McKay was a head coach of the team from 1960 - 1975";
+        Metadata md = new Metadata();
+        tika.parse(new ByteArrayInputStream(text.getBytes(Charset.defaultCharset())), md);
+
+        HashSet<String> set = new HashSet<String>();
+        set.addAll(Arrays.asList(md.getValues("X-Parsed-By")));
+        assertTrue(set.contains(NamedEntityParser.class.getName()));
+
+        set.clear();
+        set.addAll(Arrays.asList(md.getValues("NER_PERSON")));
+        assertTrue(set.contains("John McKay"));
+
+        set.clear();
+        set.addAll(Arrays.asList(md.getValues("NER_LOCATION")));
+        assertTrue(set.contains("Los Angeles"));
+
+        set.clear();
+        set.addAll(Arrays.asList(md.getValues("NER_ORGANIZATION")));
+        assertTrue(set.contains("University of Southern California"));
+
+        set.clear();
+        set.addAll(Arrays.asList(md.getValues("NER_DATE")));
+        assertTrue(set.contains("1960 - 1975"));
+
+    }
+
+    /**
+     * Selects the OpenNLP and the regex recognisers through the system property
+     * {@link NamedEntityParser#SYS_PROP_NER_IMPL} and checks that the metadata
+     * holds keys produced by both of them.
+     */
+    @Test
+    public void testNerChain() throws Exception {
+        String classNames = OpenNLPNERecogniser.class.getName()
+                + "," + RegexNERecogniser.class.getName();
+        //the property is JVM wide: remember its value so it can be put back
+        String previous = System.getProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
+        System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, classNames);
+        try {
+            TikaConfig config = new TikaConfig(getClass().getResourceAsStream(CONFIG_FILE));
+            Tika tika = new Tika(config);
+            String text = "University of Southern California (USC), is located in Los Angeles ." +
+                    " Campus is busy from monday to saturday";
+            Metadata md = new Metadata();
+            tika.parse(new ByteArrayInputStream(text.getBytes(Charset.defaultCharset())), md);
+            HashSet<String> keys = new HashSet<>(Arrays.asList(md.names()));
+            assertTrue(keys.contains("NER_WEEK_DAY"));
+            assertTrue(keys.contains("NER_LOCATION"));
+        } finally {
+            //do not leak this test's recogniser selection into other tests
+            if (previous == null) {
+                System.clearProperty(NamedEntityParser.SYS_PROP_NER_IMPL);
+            } else {
+                System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, previous);
+            }
+        }
+
+    }
+}
\ No newline at end of file
Added: tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-advanced-parser-module/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ner.regex;
+
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ner.NamedEntityParser;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.junit.Assert.assertTrue;
+
+public class RegexNERecogniserTest {
+
+ @Test
+ public void testGetEntityTypes() throws Exception {
+
+ String text = "Hey, Lets meet on this Sunday or MONDAY because i am busy on Saturday";
+ System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, RegexNERecogniser.class.getName());
+
+ Tika tika = new Tika(new TikaConfig(NamedEntityParser.class.getResourceAsStream("tika-config.xml")));
+ Metadata md = new Metadata();
+ tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);
+
+ Set<String> days = new HashSet<>(Arrays.asList(md.getValues("NER_WEEK_DAY")));
+ assertTrue(days.contains("Sunday"));
+ assertTrue(days.contains("MONDAY"));
+ assertTrue(days.contains("Saturday"));
+ assertTrue(days.size() == 3); //and nothing else
+
+
+ }
+}
\ No newline at end of file
Added: tika/branches/2.x/tika-parser-modules/tika-cad-parser-module/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-cad-parser-module/pom.xml?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-cad-parser-module/pom.xml (added)
+++ tika/branches/2.x/tika-parser-modules/tika-cad-parser-module/pom.xml Sat Jan 16 18:23:01 2016
@@ -0,0 +1,62 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-cad-parser-module</artifactId>
+ <name>Apache Tika CAD Parser Module</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.poi</groupId>
+ <artifactId>poi</artifactId>
+ <version>${poi.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-text-parser-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
+</project>
\ No newline at end of file
Added: tika/branches/2.x/tika-parser-modules/tika-cad-parser-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-cad-parser-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-cad-parser-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-cad-parser-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,356 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dwg;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.poi.util.IOUtils;
+import org.apache.poi.util.StringUtil;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.EndianUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * DWG (CAD Drawing) parser. This is a very basic parser, which just
+ * looks for bits of the headers.
+ * Note that we use Apache POI for various parts of the processing, as
+ * lots of the low level string/int/short concepts are the same.
+ */
+public class DWGParser extends AbstractParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = -7744232583079169119L;
+
+ private static MediaType TYPE = MediaType.image("vnd.dwg");
+
+ /**
+  * Returns the media types handled by this parser.
+  *
+  * @param context the parse context; not consulted by this implementation
+  * @return a single-element set holding the DWG media type
+  */
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return Collections.singleton(TYPE);
+ }
+
+ /** The order of the fields in the header */
+ private static final Property[] HEADER_PROPERTIES_ENTRIES = {
+ TikaCoreProperties.TITLE,
+ TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION,
+ TikaCoreProperties.CREATOR,
+ TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT,
+ TikaCoreProperties.COMMENTS,
+ TikaCoreProperties.MODIFIER,
+ null, // Unknown?
+ TikaCoreProperties.RELATION, // Hyperlink
+ };
+
+ /** For the 2000 file, they're indexed */
+ private static final Property[] HEADER_2000_PROPERTIES_ENTRIES = {
+ null,
+ TikaCoreProperties.RELATION, // 0x01
+ TikaCoreProperties.TITLE, // 0x02
+ TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, // 0x03
+ TikaCoreProperties.CREATOR, // 0x04
+ null,
+ TikaCoreProperties.COMMENTS,// 0x06
+ TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, // 0x07
+ TikaCoreProperties.MODIFIER, // 0x08
+ };
+
+ private static final String HEADER_2000_PROPERTIES_MARKER_STR =
+ "DWGPROPS COOKIE";
+
+ private static final byte[] HEADER_2000_PROPERTIES_MARKER =
+ new byte[HEADER_2000_PROPERTIES_MARKER_STR.length()];
+
+ static {
+ StringUtil.putCompressedUnicode(
+ HEADER_2000_PROPERTIES_MARKER_STR,
+ HEADER_2000_PROPERTIES_MARKER, 0);
+ }
+
+ /**
+ * How far to skip after the last standard property, before
+ * we find any custom properties that might be there.
+ */
+ private static final int CUSTOM_PROPERTIES_SKIP = 20;
+
+ /**
+ * The value of padding bytes other than 0 in some DWG files.
+ */
+ private static final int[] CUSTOM_PROPERTIES_ALT_PADDING_VALUES = new int[] {0x2, 0, 0, 0};
+
+ /**
+  * Reads the 128 byte file header, uses the six character version tag at its start
+  * to pick the matching property reader, and emits the properties it finds as
+  * metadata and XHTML paragraphs.
+  *
+  * @throws TikaException if the version tag is not one of AC1015, AC1018, AC1021 or AC1024
+  */
+ public void parse(
+         InputStream stream, ContentHandler handler,
+         Metadata metadata, ParseContext context)
+         throws IOException, TikaException, SAXException {
+     // The version tag is the first 6 bytes of the header
+     byte[] header = new byte[128];
+     IOUtils.readFully(stream, header);
+     String version = new String(header, 0, 6, "US-ASCII");
+
+     XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+     xhtml.startDocument();
+
+     boolean is2000 = "AC1015".equals(version);
+     boolean is2004 = "AC1018".equals(version);
+     boolean is2007or2010 = "AC1021".equals(version) || "AC1024".equals(version);
+     if (!is2000 && !is2004 && !is2007or2010) {
+         throw new TikaException(
+                 "Unsupported AutoCAD drawing version: " + version);
+     }
+
+     // The content type is only recorded for versions we recognise
+     metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+     if (is2000) {
+         // This version has no header offset; the section is located by scanning
+         if (skipTo2000PropertyInfoSection(stream, header)) {
+             get2000Props(stream, metadata, xhtml);
+         }
+     } else if (skipToPropertyInfoSection(stream, header)) {
+         if (is2004) {
+             get2004Props(stream, metadata, xhtml);
+         } else {
+             get2007and2010Props(stream, metadata, xhtml);
+         }
+     }
+
+     xhtml.endDocument();
+ }
+
+ /**
+  * Reads the standard and custom properties of a file whose strings are
+  * stored as US-ASCII, one byte per character.
+  */
+ private void get2004Props(
+         InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
+         throws IOException, TikaException, SAXException {
+     // Standard properties, in the fixed order of HEADER_PROPERTIES_ENTRIES
+     int headerIndex = 0;
+     while (headerIndex < HEADER_PROPERTIES_ENTRIES.length) {
+         handleHeader(headerIndex, read2004String(stream), metadata, xhtml);
+         headerIndex++;
+     }
+
+     // Custom properties are name/value string pairs; pairs with an empty side are dropped
+     for (int remaining = skipToCustomProperties(stream); remaining > 0; remaining--) {
+         String propName = read2004String(stream);
+         String propValue = read2004String(stream);
+         if (propName.isEmpty() || propValue.isEmpty()) {
+             continue;
+         }
+         metadata.add(propName, propValue);
+     }
+ }
+
+ /**
+  * Reads one string stored as a little-endian unsigned 16 bit length followed by
+  * that many single-byte characters. A trailing null byte, when present, is dropped.
+  *
+  * @param stream the stream positioned at the length prefix
+  * @return the decoded string, or the empty string when the length prefix is zero
+  */
+ private String read2004String(InputStream stream) throws IOException, TikaException {
+     int stringLen = EndianUtils.readUShortLE(stream);
+     if (stringLen == 0) {
+         // No bytes follow, so there is no last byte to test for a terminator
+         return "";
+     }
+
+     byte[] stringData = new byte[stringLen];
+     IOUtils.readFully(stream, stringData);
+
+     // Often but not always null terminated
+     if (stringData[stringLen - 1] == 0) {
+         stringLen--;
+     }
+     return StringUtil.getFromCompressedUnicode(stringData, 0, stringLen);
+ }
+
+ /**
+  * Reads the standard and custom properties of a file whose strings are
+  * stored as UCS2, i.e. 16 bit "unicode".
+  */
+ private void get2007and2010Props(
+         InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
+         throws IOException, TikaException, SAXException {
+     // Standard properties, in the fixed order of HEADER_PROPERTIES_ENTRIES
+     int headerIndex = 0;
+     while (headerIndex < HEADER_PROPERTIES_ENTRIES.length) {
+         handleHeader(headerIndex, read2007and2010String(stream), metadata, xhtml);
+         headerIndex++;
+     }
+
+     // Custom properties are name/value string pairs; pairs with an empty side are dropped
+     for (int remaining = skipToCustomProperties(stream); remaining > 0; remaining--) {
+         String propName = read2007and2010String(stream);
+         String propValue = read2007and2010String(stream);
+         if (propName.isEmpty() || propValue.isEmpty()) {
+             continue;
+         }
+         metadata.add(propName, propValue);
+     }
+ }
+
+ /**
+  * Reads one string stored as a little-endian unsigned 16 bit character count
+  * followed by that many two-byte little-endian characters. A trailing null
+  * character, when present, is dropped.
+  *
+  * @param stream the stream positioned at the length prefix
+  * @return the decoded string, or the empty string when the character count is zero
+  */
+ private String read2007and2010String(InputStream stream) throws IOException, TikaException {
+     int stringLen = EndianUtils.readUShortLE(stream);
+     if (stringLen == 0) {
+         // Nothing to decode, and no last character to test for a terminator
+         return "";
+     }
+
+     byte[] stringData = new byte[stringLen * 2];
+     IOUtils.readFully(stream, stringData);
+     String value = StringUtil.getFromUnicodeLE(stringData);
+
+     // Some strings are null terminated
+     if (!value.isEmpty() && value.charAt(value.length() - 1) == 0) {
+         value = value.substring(0, value.length() - 1);
+     }
+
+     return value;
+ }
+
+ /**
+  * Reads up to 30 property records. Each record is a 16 bit index, a 16 bit
+  * length, a one byte value type and then the value bytes. String values
+  * (type 0x1e) are mapped to metadata via HEADER_2000_PROPERTIES_ENTRIES, or
+  * split at '=' into a custom name/value pair when the index is 0x012c.
+  * Reading stops early at index 90.
+  */
+ private void get2000Props(
+         InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
+         throws IOException, TikaException, SAXException {
+     int propCount = 0;
+     while (propCount < 30) {
+         int propIdx = EndianUtils.readUShortLE(stream);
+         int length = EndianUtils.readUShortLE(stream);
+         int valueType = stream.read();
+
+         if (propIdx == 0x28) {
+             // This one seems not to follow the pattern
+             length = 0x19;
+         } else if (propIdx == 90) {
+             // We think this means the end of properties
+             break;
+         }
+
+         byte[] value = new byte[length];
+         IOUtils.readFully(stream, value);
+         if (valueType == 0x1e) {
+             // Normal string, good
+             String val = StringUtil.getFromCompressedUnicode(value, 0, length);
+
+             // Is it one we can look up by index?
+             if (propIdx < HEADER_2000_PROPERTIES_ENTRIES.length) {
+                 // Some table slots are null (no known property); skip the metadata
+                 // for those instead of passing null on, but still emit the text,
+                 // the same way handleHeader treats an unmapped header.
+                 Property headerProp = HEADER_2000_PROPERTIES_ENTRIES[propIdx];
+                 if (headerProp != null) {
+                     metadata.add(headerProp, val);
+                 }
+                 xhtml.element("p", val);
+             } else if (propIdx == 0x012c) {
+                 // Custom property, stored as "name=value"
+                 int splitAt = val.indexOf('=');
+                 if (splitAt > -1) {
+                     String propName = val.substring(0, splitAt);
+                     String propVal = val.substring(splitAt + 1);
+                     metadata.add(propName, propVal);
+                 }
+             }
+         }
+         // Other value types are not understood and are skipped over
+
+         propCount++;
+     }
+ }
+
+ /**
+  * Records one standard header value: sets it on the metadata when the slot
+  * at {@code headerNumber} has a mapped property, and always writes it out as
+  * a paragraph. Null or empty values are ignored entirely.
+  */
+ private void handleHeader(
+         int headerNumber, String value, Metadata metadata,
+         XHTMLContentHandler xhtml) throws SAXException {
+     boolean hasValue = value != null && !value.isEmpty();
+     if (hasValue) {
+         Property mapped = HEADER_PROPERTIES_ENTRIES[headerNumber];
+         if (mapped != null) {
+             metadata.set(mapped, value);
+         }
+         xhtml.element("p", value);
+     }
+ }
+
+ /**
+  * Reads the property section offset from the header at 0x20 and consumes
+  * the stream up to that offset.
+  *
+  * @param stream the stream, positioned just after {@code header}
+  * @param header the file header already read from the stream
+  * @return true if the stream was advanced to the section, false if the
+  *         offset is zero, negative or more than 10mb and so cannot be trusted
+  */
+ private boolean skipToPropertyInfoSection(InputStream stream, byte[] header)
+         throws IOException, TikaException {
+     // The offset is stored in the header from 0x20 onwards
+     long offsetToSection = EndianUtils.getLongLE(header, 0x20);
+
+     // Sanity check the offset. Some files seem to use a different format,
+     // and the offset isn't available at 0x20. Until we can work out how
+     // to find the offset in those files, skip them if detected.
+     // Header should never be more than 10mb into the file, and a negative
+     // or zero offset can never be valid either.
+     if (offsetToSection <= 0 || offsetToSection > 0xa00000L) {
+         return false;
+     }
+
+     // Consume the bytes between the end of the header and the section,
+     // reusing one buffer for every chunk
+     long toSkip = offsetToSection - header.length;
+     byte[] skip = new byte[0x4000];
+     while (toSkip > 0) {
+         int chunk = (int) Math.min(toSkip, skip.length);
+         IOUtils.readFully(stream, skip, 0, chunk);
+         toSkip -= chunk;
+     }
+     return true;
+ }
+
+ /**
+  * Scans forward through the stream for the property marker bytes, since we
+  * think the section can be anywhere in the file.
+  *
+  * @param stream the stream to scan; left positioned just after the marker on success
+  * @param header the file header; not used by the scan
+  * @return true if the marker was found, false if the stream ended first
+  */
+ private boolean skipTo2000PropertyInfoSection(InputStream stream, byte[] header)
+         throws IOException {
+     int matched = 0;
+     int val;
+     while ((val = stream.read()) != -1) {
+         if (val == HEADER_2000_PROPERTIES_MARKER[matched]) {
+             matched++;
+             if (matched == HEADER_2000_PROPERTIES_MARKER.length) {
+                 // Bingo, found it
+                 return true;
+             }
+         } else {
+             // The byte that broke a partial match may itself start the marker.
+             // The marker's first byte does not occur again inside it, so the
+             // only possible restart points are "one byte matched" or "none".
+             matched = (val == HEADER_2000_PROPERTIES_MARKER[0]) ? 1 : 0;
+         }
+     }
+     return false;
+ }
+
+ /**
+  * Checks for the 4 padding bytes that precede the custom properties, skips
+  * the fixed gap after them and reads the custom property count.
+  *
+  * @return the number of custom properties, or 0 when the padding is not
+  *         recognised or the count is outside 1..0x7e
+  */
+ private int skipToCustomProperties(InputStream stream)
+         throws IOException, TikaException {
+     // There should be 4 zero bytes or CUSTOM_PROPERTIES_ALT_PADDING_VALUES next
+     byte[] padding = new byte[4];
+     IOUtils.readFully(stream, padding);
+
+     boolean allZero = true;
+     boolean altPadding = true;
+     for (int i = 0; i < padding.length; i++) {
+         allZero &= padding[i] == 0;
+         altPadding &= padding[i] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[i];
+     }
+     if (!allZero && !altPadding) {
+         // No padding. That probably means no custom props
+         return 0;
+     }
+
+     // Looks hopeful, skip on
+     IOUtils.readFully(stream, new byte[CUSTOM_PROPERTIES_SKIP]);
+
+     // We should now have the count; anything implausible is treated as none
+     int count = EndianUtils.readUShortLE(stream);
+     return (count > 0 && count < 0x7f) ? count : 0;
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-cad-parser-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-cad-parser-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-cad-parser-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-cad-parser-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,275 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.prt;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.poi.util.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.EndianUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+
+/**
+ * A basic text extracting parser for the CADKey PRT (CAD Drawing)
+ * format. It outputs text from note entries.
+ */
+
+public class PRTParser extends AbstractParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = 4659638314375035178L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("x-prt"));
+ public static final String PRT_MIME_TYPE = "application/x-prt";
+
+ /**
+  * Returns the media types handled by this parser.
+  *
+  * @param context the parse context; not consulted by this implementation
+  * @return the fixed single-element set holding application/x-prt
+  */
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ /**
+ * How long do we allow a text run to claim to be, before we
+ * decide we're confused and it's not really text after all?
+ */
+ private static final int MAX_SANE_TEXT_LENGTH = 0x0800;
+
+ /*
+ * Text types:
+ * 00 00 00 00 f0 [3b]f sz sz TEXT *view name*
+ * 00 00 00 00 f0 3f 00 00 00 00 00 00 00 00 sz sz TEXT *view name*
+ * (anything) e0 3f sz sz TEXT *view name*
+ * 3x 33 33 33 33 33 e3 3f 0x 00 00 0x 00 00 0x 0x 1f sz sz TEXT *note entries*
+ *
+ * Note - all text is null terminated
+ */
+
+ /**
+  * Extracts the creation date, the description and any note / view name text.
+  * The first 30 bytes are read and not used; the next 12 bytes are treated as
+  * a YYYYMMDDhhmm date and the 500 bytes after that as the description. The
+  * rest of the stream is then scanned byte by byte for the marker sequences
+  * listed in the "Text types" comment of this class.
+  */
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ Last5 l5 = new Last5();
+ int read;
+
+ // Try to get the creation date, which is YYYYMMDDhhmm
+ // (the 30 header bytes in front of it are consumed but not inspected)
+ byte[] header = new byte[30];
+ IOUtils.readFully(stream, header);
+ byte[] date = new byte[12];
+ IOUtils.readFully(stream, date);
+
+ String dateStr = new String(date, US_ASCII);
+ // Only values starting with a plausible century are taken to be a date
+ if(dateStr.startsWith("19") || dateStr.startsWith("20")) {
+ // Reformat as YYYY-MM-DDThh:mm:00; the file carries no seconds
+ String formattedDate = dateStr.substring(0, 4) + "-" + dateStr.substring(4,6) +
+ "-" + dateStr.substring(6,8) + "T" + dateStr.substring(8,10) + ":" +
+ dateStr.substring(10, 12) + ":00";
+ metadata.set(TikaCoreProperties.CREATED, formattedDate);
+ // TODO Metadata.DATE is used as modified, should it be here?
+ metadata.set(Metadata.DATE, formattedDate);
+ }
+ metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE);
+
+ // The description, if set, is the next up-to-500 bytes
+ byte[] desc = new byte[500];
+ IOUtils.readFully(stream, desc);
+ // trim=true cuts the text at the first null byte
+ String description = extractText(desc, true);
+ if(description.length() > 0) {
+ metadata.set(TikaCoreProperties.DESCRIPTION, description);
+ }
+
+ // Now look for text
+ while( (read = stream.read()) > -1) {
+ // A candidate marker is e0/e3/f0 followed by 3f/bf. Neither of those two
+ // bytes is recorded in l5, so l5 holds the bytes seen before the marker.
+ if(read == 0xe0 || read == 0xe3 || read == 0xf0) {
+ int nread = stream.read();
+ if(nread == 0x3f || nread == 0xbf) {
+ // Looks promising, check back for a suitable value
+ if(read == 0xe3 && nread == 0x3f) {
+ // e3 3f is only accepted when preceded by five 0x33 bytes
+ if(l5.is33()) {
+ // Bingo, note text
+ handleNoteText(stream, xhtml);
+ }
+ } else if(l5.is00()) {
+ // Likely view name
+ handleViewName(read, nread, stream, xhtml, l5);
+ }
+ }
+ } else {
+ l5.record(read);
+ }
+ }
+ }
+
+ /**
+  * Validates the bytes that follow a note marker (ten bytes in 0x00..0x0f,
+  * then 0x1f) and, if they match and the 16 bit length that follows is not
+  * larger than MAX_SANE_TEXT_LENGTH, hands the text over to handleText.
+  */
+ private void handleNoteText(InputStream stream, XHTMLContentHandler xhtml)
+         throws IOException, SAXException, TikaException {
+     // Ensure we have the right padding text
+     for (int i = 0; i < 10; i++) {
+         int pad = stream.read();
+         if (pad < 0 || pad > 0x0f) {
+             // Wrong, false detection
+             return;
+         }
+     }
+     if (stream.read() != 0x1f) {
+         // Wrong, false detection
+         return;
+     }
+
+     int length = EndianUtils.readUShortLE(stream);
+     if (length > MAX_SANE_TEXT_LENGTH) {
+         // Length sanity check failed
+         return;
+     }
+     handleText(length, stream, xhtml);
+ }
+
+ /**
+  * Handles a possible view name. The 16 bit value after the marker is either
+  * the text length itself, or zero when the marker is followed by zero
+  * padding, in which case six more padding bytes and then the real length
+  * are read. Bytes of a rejected length are recorded back into {@code l5}.
+  */
+ private void handleViewName(int typeA, int typeB, InputStream stream,
+         XHTMLContentHandler xhtml, Last5 l5)
+         throws IOException, SAXException, TikaException {
+     int maybeLength = EndianUtils.readUShortLE(stream);
+     if (maybeLength != 0) {
+         // Looks like it's straight into the text
+         if (maybeLength > 0 && maybeLength < MAX_SANE_TEXT_LENGTH) {
+             handleText(maybeLength, stream, xhtml);
+         }
+         return;
+     }
+
+     // 8 byte zero padded form: check the next 6 bytes too
+     for (int remaining = 6; remaining > 0; remaining--) {
+         int pad = stream.read();
+         if (pad < 0 || pad > 0x0f) {
+             // Wrong, false detection
+             return;
+         }
+     }
+
+     byte[] lengthBytes = new byte[2];
+     IOUtils.readFully(stream, lengthBytes);
+     int length = EndianUtils.getUShortLE(lengthBytes);
+     if (length > 1 && length <= MAX_SANE_TEXT_LENGTH) {
+         // Length sanity check passed
+         handleText(length, stream, xhtml);
+     } else {
+         // Was probably something else
+         l5.record(lengthBytes[0]);
+         l5.record(lengthBytes[1]);
+     }
+ }
+
+ /**
+  * Reads {@code length} bytes of text and writes them out as a paragraph,
+  * provided the last byte is the expected null terminator.
+  *
+  * @param length the claimed length of the text run, including its terminator
+  * @param stream the stream positioned at the first text byte
+  * @param xhtml  the handler the paragraph is written to
+  */
+ private void handleText(int length, InputStream stream, XHTMLContentHandler xhtml)
+         throws IOException, SAXException, TikaException {
+     if (length < 1) {
+         // A zero length run has no terminator byte to check, so it cannot be text
+         return;
+     }
+
+     byte[] str = new byte[length];
+     IOUtils.readFully(stream, str);
+     if (str[length - 1] != 0) {
+         // Not properly null terminated, must be wrong
+         return;
+     }
+
+     String text = extractText(str, false);
+
+     xhtml.startElement("p");
+     xhtml.characters(text);
+     xhtml.endElement("p");
+ }
+
+ /**
+  * Does our best to turn the bytes into text.
+  *
+  * @param data the raw bytes, expected to end with a null terminator
+  * @param trim if true the text is cut at the first null byte; if false only
+  *             the final byte is dropped
+  * @return the decoded text with the known character issue fixed up
+  * @throws TikaException if the JVM does not provide the cp437 charset
+  */
+ private String extractText(byte[] data, boolean trim) throws TikaException {
+     // The text is always stored null terminated, but sometimes
+     // may have extra null padding too.
+     // Clamp at zero so an empty array decodes to "" rather than failing.
+     int length = Math.max(0, data.length - 1);
+     if (trim) {
+         for (int i = 0; i < data.length; i++) {
+             if (data[i] == 0) {
+                 length = i;
+                 break;
+             }
+         }
+     }
+
+     // We believe that the text is basically stored as CP437
+     // That said, there are a few characters slightly wrong for that...
+     String text;
+     try {
+         text = new String(data, 0, length, "cp437");
+     } catch (UnsupportedEncodingException e) {
+         // Keep the original exception as the cause for diagnosis
+         throw new TikaException("JVM Broken, core codepage CP437 missing!", e);
+     }
+
+     // Fix up the known character issues
+     text = text.replace("\u03C6", "\u00D8");
+
+     // All done, as best as we can!
+     return text;
+ }
+
+ /**
+ * Provides a view on the previous 5 bytes
+ */
+ private static class Last5 {
+     // Ring buffer of the five most recently recorded bytes
+     byte[] data = new byte[5];
+     // Index of the slot the next recorded byte will be written to
+     int pos = 0;
+
+     /** Stores the low 8 bits of {@code b}, overwriting the oldest entry. */
+     private void record(int b) {
+         data[pos] = (byte) b;
+         pos = (pos + 1) % data.length;
+     }
+
+     /** Returns a copy of the five stored bytes, walking backwards from {@code pos}. */
+     private byte[] get() {
+         byte[] ret = new byte[5];
+         for (int i = 0; i < ret.length; i++) {
+             ret[i] = data[(pos - i + ret.length) % ret.length];
+         }
+         return ret;
+     }
+
+     /** True when every stored byte is 0x33. */
+     private boolean is33() {
+         return allEqualTo(0x33);
+     }
+
+     /** True when every stored byte is 0x00. */
+     private boolean is00() {
+         return allEqualTo(0x00);
+     }
+
+     /** Checks whether all five stored bytes equal {@code expected}. */
+     private boolean allEqualTo(int expected) {
+         for (byte b : get()) {
+             if (b != expected) {
+                 return false;
+             }
+         }
+         return true;
+     }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-cad-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-cad-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-cad-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added)
+++ tika/branches/2.x/tika-parser-modules/tika-cad-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Sat Jan 16 18:23:01 2016
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+org.apache.tika.parser.dwg.DWGParser
+#org.apache.tika.parser.prt.PRTParser
Added: tika/branches/2.x/tika-parser-modules/tika-cad-parser-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-cad-parser-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-cad-parser-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-cad-parser-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,202 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dwg;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.apache.tika.TikaTest.assertContains;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class DWGParserTest {
+
+ /** Runs the "alt" expectations against the 2000 format sample file. */
+ @Test
+ public void testDWG2000Parser() throws Exception {
+     String resource = "/test-documents/testDWG2000.dwg";
+     testParserAlt(DWGParserTest.class.getResourceAsStream(resource));
+ }
+
+ /** Runs the standard expectations against the 2004 format sample file. */
+ @Test
+ public void testDWG2004Parser() throws Exception {
+     String resource = "/test-documents/testDWG2004.dwg";
+     testParser(DWGParserTest.class.getResourceAsStream(resource));
+ }
+
+ /** Runs the "no header" expectations against the 2004 sample without a header address. */
+ @Test
+ public void testDWG2004ParserNoHeaderAddress() throws Exception {
+     String resource = "/test-documents/testDWG2004_no_header.dwg";
+     testParserNoHeader(DWGParserTest.class.getResourceAsStream(resource));
+ }
+
+ /** Runs the standard expectations against the 2007 format sample file. */
+ @Test
+ public void testDWG2007Parser() throws Exception {
+     String resource = "/test-documents/testDWG2007.dwg";
+     testParser(DWGParserTest.class.getResourceAsStream(resource));
+ }
+
+ /** Runs the standard expectations against the 2010 format sample file. */
+ @Test
+ public void testDWG2010Parser() throws Exception {
+     String resource = "/test-documents/testDWG2010.dwg";
+     testParser(DWGParserTest.class.getResourceAsStream(resource));
+ }
+
+ /**
+  * Parses the 2010 sample with custom properties twice: once through the
+  * standard expectations, once to check the two custom property values.
+  */
+ @Test
+ public void testDWG2010CustomPropertiesParser() throws Exception {
+     // Check that standard parsing works
+     testParser(DWGParserTest.class.getResourceAsStream(
+             "/test-documents/testDWG2010_custom_props.dwg"));
+
+     // Check that custom properties with alternate padding work
+     try (InputStream input = DWGParserTest.class.getResourceAsStream(
+             "/test-documents/testDWG2010_custom_props.dwg")) {
+         Metadata md = new Metadata();
+         ContentHandler body = new BodyContentHandler();
+         new DWGParser().parse(input, body, md, null);
+
+         String firstValue = md.get("customprop1");
+         String secondValue = md.get("customprop2");
+         assertEquals("valueforcustomprop1", firstValue);
+         assertEquals("valueforcustomprop2", secondValue);
+     }
+ }
+
+ /** Runs the "alt" expectations against each of the testDWGmech sample files. */
+ @Test
+ public void testDWGMechParser() throws Exception {
+     String[] types = {
+         "6", "2004", "2004DX", "2005", "2006",
+         "2007", "2008", "2009", "2010", "2011"
+     };
+     for (int i = 0; i < types.length; i++) {
+         String resource = "/test-documents/testDWGmech" + types[i] + ".dwg";
+         // testParserAlt closes the stream it is given
+         testParserAlt(DWGParserTest.class.getResourceAsStream(resource));
+     }
+ }
+
+ @SuppressWarnings("deprecation")
+ private void testParser(InputStream input) throws Exception {
+ try {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new DWGParser().parse(input, handler, metadata);
+
+ assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
+
+ assertEquals("The quick brown fox jumps over the lazy dog",
+ metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Gym class featuring a brown fox and lazy dog",
+ metadata.get(TikaCoreProperties.DESCRIPTION));
+ assertEquals("Gym class featuring a brown fox and lazy dog",
+ metadata.get(Metadata.SUBJECT));
+ assertEquals("Nevin Nollop",
+ metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Pangram, fox, dog",
+ metadata.get(TikaCoreProperties.KEYWORDS));
+ assertEquals("Lorem ipsum",
+ metadata.get(TikaCoreProperties.COMMENTS).substring(0,11));
+ assertEquals("http://www.alfresco.com",
+ metadata.get(TikaCoreProperties.RELATION));
+
+ // Check some of the old style metadata too
+ assertEquals("The quick brown fox jumps over the lazy dog",
+ metadata.get(Metadata.TITLE));
+ assertEquals("Gym class featuring a brown fox and lazy dog",
+ metadata.get(Metadata.SUBJECT));
+
+ String content = handler.toString();
+ assertContains("The quick brown fox jumps over the lazy dog", content);
+ assertContains("Gym class", content);
+ assertContains("www.alfresco.com", content);
+ } finally {
+ input.close();
+ }
+ }
+
+ /**
+  * Parses the stream and checks that only the content type is set: none of
+  * the standard properties may be present and the body text must be empty.
+  * The stream is always closed afterwards.
+  */
+ @SuppressWarnings("deprecation")
+ private void testParserNoHeader(InputStream input) throws Exception {
+     try {
+         Metadata md = new Metadata();
+         ContentHandler body = new BodyContentHandler();
+         new DWGParser().parse(input, body, md);
+
+         assertEquals("image/vnd.dwg", md.get(Metadata.CONTENT_TYPE));
+
+         assertNull(md.get(TikaCoreProperties.TITLE));
+         assertNull(md.get(TikaCoreProperties.DESCRIPTION));
+         assertNull(md.get(Metadata.SUBJECT));
+         assertNull(md.get(TikaCoreProperties.CREATOR));
+         assertNull(md.get(TikaCoreProperties.KEYWORDS));
+         assertNull(md.get(TikaCoreProperties.COMMENTS));
+         assertNull(md.get(TikaCoreProperties.RELATION));
+
+         assertEquals("", body.toString());
+     } finally {
+         input.close();
+     }
+ }
+
+ @SuppressWarnings("deprecation")
+ private void testParserAlt(InputStream input) throws Exception {
+ try {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new DWGParser().parse(input, handler, metadata);
+
+ assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
+
+ assertEquals("Test Title",
+ metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Test Subject",
+ metadata.get(TikaCoreProperties.DESCRIPTION));
+ assertEquals("Test Subject",
+ metadata.get(Metadata.SUBJECT));
+ assertEquals("My Author",
+ metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("My keyword1, MyKeyword2",
+ metadata.get(TikaCoreProperties.KEYWORDS));
+ assertEquals("This is a comment",
+ metadata.get(TikaCoreProperties.COMMENTS));
+ assertEquals("bejanpol",
+ metadata.get(TikaCoreProperties.MODIFIER));
+ assertEquals("bejanpol",
+ metadata.get(Metadata.LAST_AUTHOR));
+ assertEquals("http://mycompany/drawings",
+ metadata.get(TikaCoreProperties.RELATION));
+ assertEquals("MyCustomPropertyValue",
+ metadata.get("MyCustomProperty"));
+
+ String content = handler.toString();
+ assertContains("This is a comment", content);
+ assertContains("mycompany", content);
+ } finally {
+ input.close();
+ }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-cad-parser-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-cad-parser-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-cad-parser-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-cad-parser-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.prt;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class PRTParserTest extends TikaTest {
+ /**
+ * Try with a simple file
+ */
+ @Test
+ public void testPRTParserBasics() throws Exception {
+ try (InputStream input = getResourceAsStream("/test-documents/testCADKEY.prt")) {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new PRTParser().parse(input, handler, metadata);
+
+ assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE));
+
+ // This file has a date
+ assertEquals("2011-06-20T16:54:00",
+ metadata.get(TikaCoreProperties.CREATED));
+ assertEquals("2011-06-20T16:54:00",
+ metadata.get(Metadata.CREATION_DATE));
+ // But no description
+ assertEquals(null, metadata.get(TikaCoreProperties.DESCRIPTION));
+
+ String contents = handler.toString();
+
+ assertContains("Front View", contents);
+ assertContains("Back View", contents);
+ assertContains("Bottom View", contents);
+ assertContains("Right View", contents);
+ assertContains("Left View", contents);
+ //assertContains("Isometric View", contents); // Can't detect yet
+ assertContains("Axonometric View", contents);
+
+ assertContains("You've managed to extract all the text!", contents);
+ assertContains("This is more text", contents);
+ assertContains("Text Inside a PRT file", contents);
+ }
+ }
+
+ /**
+ * Now a more complex one
+ */
+ @Test
+ public void testPRTParserComplex() throws Exception {
+ try (InputStream input = getResourceAsStream("/test-documents/testCADKEY2.prt")) {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new PRTParser().parse(input, handler, metadata);
+
+ assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE));
+
+ // File has both a date and a description
+ assertEquals("1997-04-01T08:59:00",
+ metadata.get(Metadata.DATE));
+ assertEquals("1997-04-01T08:59:00",
+ metadata.get(Metadata.CREATION_DATE));
+ assertEquals("TIKA TEST PART DESCRIPTION INFORMATION\r\n",
+ metadata.get(TikaCoreProperties.DESCRIPTION));
+
+ String contents = handler.toString();
+
+ assertContains("ITEM", contents);
+ assertContains("REQ.", contents);
+ assertContains("DESCRIPTION", contents);
+ assertContains("MAT'L", contents);
+ assertContains("TOLERANCES UNLESS", contents);
+ assertContains("FRACTIONS", contents);
+ assertContains("ANGLES", contents);
+ assertContains("Acme Corporation", contents);
+
+ assertContains("DATE", contents);
+ assertContains("CHANGE", contents);
+ assertContains("DRAWN BY", contents);
+ assertContains("SCALE", contents);
+ assertContains("TIKA TEST DRAWING", contents);
+ assertContains("TIKA LETTERS", contents);
+ assertContains("5.82", contents);
+ assertContains("112" + '\u00b0', contents); // Degrees
+ assertContains("TIKA TEST LETTER", contents);
+ assertContains("17.11", contents);
+ assertContains('\u00d8' + "\ufffd2.000", contents); // Diameter
+ assertContains("Diameter", contents);
+ assertContains("The Apache Tika toolkit", contents);
+ }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-code-parser-module/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-parser-module/pom.xml?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-code-parser-module/pom.xml (added)
+++ tika/branches/2.x/tika-parser-modules/tika-code-parser-module/pom.xml Sat Jan 16 18:23:01 2016
@@ -0,0 +1,82 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <!-- Maven build descriptor for the tika-code-parser-module artifact -->
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent> <!-- Inherits from the tika-parser-modules parent POM -->
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-code-parser-module</artifactId>
+ <name>Apache Tika Code Parser Module</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency> <!-- tika-core at this project's own group and version -->
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+
+ <dependency> <!-- tika-core test-jar, on the test classpath only -->
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency> <!-- ASM; NOTE(review): presumably backs the org.apache.tika.parser.asm class-file parser, confirm -->
+ <groupId>org.ow2.asm</groupId>
+ <artifactId>asm</artifactId>
+ <version>5.0.4</version>
+ </dependency>
+ <dependency> <!-- JHighlight; NOTE(review): usage is not visible from this POM, confirm against module sources -->
+ <groupId>org.codelibs</groupId>
+ <artifactId>jhighlight</artifactId>
+ <version>1.0.2</version>
+ </dependency>
+ <dependency> <!-- TagSoup; NOTE(review): usage is not visible from this POM, confirm against module sources -->
+ <groupId>org.ccil.cowan.tagsoup</groupId>
+ <artifactId>tagsoup</artifactId>
+ <version>1.2.1</version>
+ </dependency>
+ <dependency> <!-- version comes from the commons.io.version property, which is not defined in this file -->
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>${commons.io.version}</version>
+ </dependency>
+ <dependency> <!-- version comes from the poi.version property, which is not defined in this file -->
+ <groupId>org.apache.poi</groupId>
+ <artifactId>poi</artifactId>
+ <version>${poi.version}</version>
+ </dependency>
+ <dependency> <!-- sibling text parser module, on the test classpath only -->
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-text-parser-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin> <!-- no version or configuration given here; NOTE(review): presumably supplied by the parent POM, confirm -->
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
+</project>
\ No newline at end of file
Added: tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-code-parser-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.asm;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for Java .class files.
+ */
+public class ClassParser extends AbstractParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = -3531388963354454357L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.singleton(MediaType.application("java-vm"));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ new XHTMLClassVisitor(handler, metadata).parse(stream);
+ }
+
+}